Quellcode durchsuchen

simplify StringFieldComparatorSource to improve search speed

sunyj vor 8 Jahren
Ursprung
Commit
a76dfce64a
1 geänderte Datei mit 17 neuen und 77 gelöschten Zeilen
  1. 17 77
      src/main/java/com/uas/search/sort/StringFieldComparatorSource.java

+ 17 - 77
src/main/java/com/uas/search/sort/StringFieldComparatorSource.java

@@ -1,18 +1,10 @@
 package com.uas.search.sort;
 
-import com.uas.search.analyzer.IKAnalyzer;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.search.FieldComparator;
 import org.apache.lucene.search.FieldComparatorSource;
 import org.springframework.util.StringUtils;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 /**
  * 用于 StringField 的排序
@@ -43,95 +35,43 @@ public class StringFieldComparatorSource extends FieldComparatorSource {
                 str2 = str2.toLowerCase();
                 keyword = keyword.toLowerCase();
 
+                int index1 = str1.indexOf(keyword);
+                int index2 = str2.indexOf(keyword);
                 // 一方包含关键词,另一方不包含时,该方排序靠前
-                if (str1.contains(keyword)) {
-                    if (str2.contains(keyword)) {
+                if (index1 > 0) {
+                    if (index2 > 0) {
                         // 均包含关键词时,比较关键词在字符串中的位置,位置越靠前,最终排序越靠前
-                        int index1 = str1.indexOf(keyword);
-                        int index2 = str2.indexOf(keyword);
                         if (index1 < index2) {
                             return AHEAD;
                         } else if (index1 > index2) {
                             return BEHIND;
                         } else {
                             // 关键词在字符串中的位置相同时,比较字符串长度,长度较短,则排序靠前
-                            int length1 = str1.length();
-                            int length2 = str2.length();
-                            if (length1 < length2) {
-                                return AHEAD;
-                            } else if (length1 > length2) {
-                                return BEHIND;
-                            } else {
-                                // 字符串长度也相同时,截取含有的第一个关键词后,再递归比较
-                                return compare(str1.substring(index1 + keyword.length()), str2.substring(index2 + keyword.length()));
-                            }
+                            return compareLength(str1, str2);
                         }
                     } else {
                         return AHEAD;
                     }
                 } else {
-                    if (str2.contains(keyword)) {
+                    if (index2 > 0) {
                         return BEHIND;
                     } else {
-                        // 均不包含关键词时,统计分词后的词在字符串中的分布情况,比例较高,则排序靠前
-                        List<String> tokenizedWords = tokenize();
-                        int count1 = 0;
-                        int count2 = 0;
-                        for (String tokenizedWord : tokenizedWords) {
-                            count1 += count(str1, tokenizedWord);
-                            count2 += count(str2, tokenizedWord);
-                        }
-                        double percent1 = count1 / (1.0 * str1.length());
-                        double percent2 = count2 / (1.0 * str2.length());
-                        if (percent1 > percent2) {
-                            return AHEAD;
-                        } else if (percent1 < percent2) {
-                            return BEHIND;
-                        } else {
-                            // 分词后的词在字符串中的比例相同时,比较字符串长度,长度较短,则排序靠前
-                            int length1 = str1.length();
-                            int length2 = str2.length();
-                            if (length1 < length2) {
-                                return AHEAD;
-                            } else if (length1 > length2) {
-                                return BEHIND;
-                            } else {
-                                return PARALLEL;
-                            }
-                        }
+                        // 均不包含关键词时,比较字符串长度,长度较短,则排序靠前
+                        return compareLength(str1, str2);
                     }
                 }
             }
 
-            /**
-             * @return 对关键词进行分词
-             */
-            private List<String> tokenize() {
-                List<String> keywords = new ArrayList<>();
-                try (Analyzer analyzer = new IKAnalyzer(true);
-                     TokenStream tokenStream = analyzer.tokenStream(fieldname, keyword)) {
-                    tokenStream.reset();
-                    CharTermAttribute cta = tokenStream.addAttribute(CharTermAttribute.class);
-                    while (tokenStream.incrementToken()) {
-                        keywords.add(cta.toString());
-                    }
-                } catch (IOException e) {
-                    throw new IllegalStateException("排序时分词错误:fieldname=" + fieldname + ", keyword=" + keyword, e);
-                }
-                return keywords;
-            }
-
-            /**
-             * 统计 str 中 sub 的数目
-             */
-            private int count(String str, String sub) {
-                int count = 0;
-                Pattern pattern = Pattern.compile("[\\s\\S]*?" + sub + "[\\s\\S]*?");
-                Matcher matcher = pattern.matcher(str);
-                while (matcher.find()) {
-                    count++;
+            private int compareLength(String str1, String str2) {
+                int length1 = str1.length();
+                int length2 = str2.length();
+                if (length1 < length2) {
+                    return AHEAD;
+                } else if (length1 > length2) {
+                    return BEHIND;
+                } else {
+                    return PARALLEL;
                 }
-                return count;
             }
         };
     }