|
|
@@ -0,0 +1,138 @@
|
|
|
+package com.uas.search.sort;
|
|
|
+
|
|
|
+import com.uas.search.analyzer.IKAnalyzer;
|
|
|
+import org.apache.lucene.analysis.Analyzer;
|
|
|
+import org.apache.lucene.analysis.TokenStream;
|
|
|
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
|
+import org.apache.lucene.search.FieldComparator;
|
|
|
+import org.apache.lucene.search.FieldComparatorSource;
|
|
|
+import org.springframework.util.StringUtils;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.List;
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
+
|
|
|
+/**
|
|
|
+ * 用于 StringField 的排序
|
|
|
+ *
|
|
|
+ * @author sunyj
|
|
|
+ * @since 2017/11/23 17:27
|
|
|
+ */
|
|
|
+public class StringFieldComparatorSource extends FieldComparatorSource {
|
|
|
+
|
|
|
+ private String keyword;
|
|
|
+
|
|
|
+ public StringFieldComparatorSource(String keyword) {
|
|
|
+ this.keyword = keyword;
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public FieldComparator<?> newComparator(final String fieldname, int numHits, int sortPos, boolean reversed)
|
|
|
+ throws IOException {
|
|
|
+ return new DefaultFieldComparator(fieldname, numHits) {
|
|
|
+ @Override
|
|
|
+ protected int compare(String str1, String str2) {
|
|
|
+ int superCompare = super.compare(str1, str2);
|
|
|
+ if (superCompare != UNKOWN || StringUtils.isEmpty(keyword)) {
|
|
|
+ return superCompare;
|
|
|
+ }
|
|
|
+
|
|
|
+ str1 = str1.toLowerCase();
|
|
|
+ str2 = str2.toLowerCase();
|
|
|
+ keyword = keyword.toLowerCase();
|
|
|
+
|
|
|
+ // 一方包含关键词,另一方不包含时,该方排序靠前
|
|
|
+ if (str1.contains(keyword)) {
|
|
|
+ if (str2.contains(keyword)) {
|
|
|
+ // 均包含关键词时,比较关键词在字符串中的位置,位置越靠前,最终排序越靠前
|
|
|
+ int index1 = str1.indexOf(keyword);
|
|
|
+ int index2 = str2.indexOf(keyword);
|
|
|
+ if (index1 < index2) {
|
|
|
+ return AHEAD;
|
|
|
+ } else if (index1 > index2) {
|
|
|
+ return BEHIND;
|
|
|
+ } else {
|
|
|
+ // 关键词在字符串中的位置相同时,比较字符串长度,长度较短,则排序靠前
|
|
|
+ int length1 = str1.length();
|
|
|
+ int length2 = str2.length();
|
|
|
+ if (length1 < length2) {
|
|
|
+ return AHEAD;
|
|
|
+ } else if (length1 > length2) {
|
|
|
+ return BEHIND;
|
|
|
+ } else {
|
|
|
+ // 字符串长度也相同时,截取含有的第一个关键词后,再递归比较
|
|
|
+ return compare(str1.substring(index1 + keyword.length()), str2.substring(index2 + keyword.length()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ return AHEAD;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if (str2.contains(keyword)) {
|
|
|
+ return BEHIND;
|
|
|
+ } else {
|
|
|
+ // 均不包含关键词时,统计分词后的词在字符串中的分布情况,比例较高,则排序靠前
|
|
|
+ List<String> tokenizedWords = tokenize();
|
|
|
+ int count1 = 0;
|
|
|
+ int count2 = 0;
|
|
|
+ for (String tokenizedWord : tokenizedWords) {
|
|
|
+ count1 += count(str1, tokenizedWord);
|
|
|
+ count2 += count(str2, tokenizedWord);
|
|
|
+ }
|
|
|
+ double percent1 = count1 / (1.0 * str1.length());
|
|
|
+ double percent2 = count2 / (1.0 * str2.length());
|
|
|
+ if (percent1 > percent2) {
|
|
|
+ return AHEAD;
|
|
|
+ } else if (percent1 < percent2) {
|
|
|
+ return BEHIND;
|
|
|
+ } else {
|
|
|
+ // 分词后的词在字符串中的比例相同时,比较字符串长度,长度较短,则排序靠前
|
|
|
+ int length1 = str1.length();
|
|
|
+ int length2 = str2.length();
|
|
|
+ if (length1 < length2) {
|
|
|
+ return AHEAD;
|
|
|
+ } else if (length1 > length2) {
|
|
|
+ return BEHIND;
|
|
|
+ } else {
|
|
|
+ return PARALLEL;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * @return 对关键词进行分词
|
|
|
+ */
|
|
|
+ private List<String> tokenize() {
|
|
|
+ List<String> keywords = new ArrayList<>();
|
|
|
+ try (Analyzer analyzer = new IKAnalyzer(true);
|
|
|
+ TokenStream tokenStream = analyzer.tokenStream(fieldname, keyword)) {
|
|
|
+ tokenStream.reset();
|
|
|
+ CharTermAttribute cta = tokenStream.addAttribute(CharTermAttribute.class);
|
|
|
+ while (tokenStream.incrementToken()) {
|
|
|
+ keywords.add(cta.toString());
|
|
|
+ }
|
|
|
+ } catch (IOException e) {
|
|
|
+ throw new IllegalStateException("排序时分词错误:fieldname=" + fieldname + ", keyword=" + keyword, e);
|
|
|
+ }
|
|
|
+ return keywords;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 统计 str 中 sub 的数目
|
|
|
+ */
|
|
|
+ private int count(String str, String sub) {
|
|
|
+ int count = 0;
|
|
|
+ Pattern pattern = Pattern.compile("[\\s\\S]*?" + sub + "[\\s\\S]*?");
|
|
|
+ Matcher matcher = pattern.matcher(str);
|
|
|
+ while (matcher.find()) {
|
|
|
+ count++;
|
|
|
+ }
|
|
|
+ return count;
|
|
|
+ }
|
|
|
+ };
|
|
|
+ }
|
|
|
+}
|