Browse Source

add StringFieldComparatorSource to sort StringField

sunyj 8 years ago
parent
commit
1a3da6a9e6

+ 2 - 0
src/main/java/com/uas/search/constant/SearchConstants.java

@@ -75,6 +75,8 @@ public class SearchConstants {
 	public static final String BRAND_ID_FIELD = "br_id";
 	public static final String BRAND_NAMECN_FIELD = "br_name_cn";
 	public static final String BRAND_NAMEEN_FIELD = "br_name_en";
+    public static final String BRAND_NAMECN_UNTOKENIZED_FIELD = "br_name_cn_untokenized";
+    public static final String BRAND_NAMEEN_UNTOKENIZED_FIELD = "br_name_en_untokenized";
 	public static final String BRAND_UUID_FIELD = "br_uuid";
     public static final String BRAND_VISIT_COUNT_FIELD = "br_visit_count";
     public static final String BRAND_SEARCH_COUNT_FIELD = "br_search_count";

+ 8 - 5
src/main/java/com/uas/search/service/impl/SearchServiceImpl.java

@@ -16,6 +16,7 @@ import com.uas.search.model.Goods;
 import com.uas.search.model.Kind;
 import com.uas.search.service.SearchService;
 import com.uas.search.sort.SimilarValuesFieldComparatorSource;
+import com.uas.search.sort.StringFieldComparatorSource;
 import com.uas.search.util.DocumentToObjectUtils;
 import com.uas.search.util.SearchUtils;
 import org.apache.lucene.document.Document;
@@ -154,22 +155,24 @@ public class SearchServiceImpl implements SearchService {
 			throw new SearchException("搜索关键词无效:" + keyword);
 		}
 		BooleanQuery booleanQuery = new BooleanQuery();
-		booleanQuery.add(SearchUtils.getBooleanQuery(SearchConstants.BRAND_NAMECN_FIELD, keyword),
+		booleanQuery.add(SearchUtils.getBooleanQuery(SearchConstants.BRAND_NAMECN_UNTOKENIZED_FIELD, keyword, true),
 				BooleanClause.Occur.SHOULD);
-		booleanQuery.add(SearchUtils.getBooleanQuery(SearchConstants.BRAND_NAMEEN_FIELD, keyword),
+		booleanQuery.add(SearchUtils.getBooleanQuery(SearchConstants.BRAND_NAMEEN_UNTOKENIZED_FIELD, keyword, true),
 				BooleanClause.Occur.SHOULD);
 		logger.info(booleanQuery.toString());
-		return SearchUtils.getDocuments(SearchConstants.BRAND_TABLE_NAME, booleanQuery, new Sort(sortBrand()), page,
+		return SearchUtils.getDocuments(SearchConstants.BRAND_TABLE_NAME, booleanQuery, new Sort(sortBrand(keyword)), page,
 				size);
 	}
 
 	/**
 	 * @return 品牌排序规则
 	 */
-	private SortField[] sortBrand() {
-		// 分数 > 权重 > 访问量 > 搜索次数
+	private SortField[] sortBrand(String keyword) {
+		// 分数 > 自定义排序 > 权重 > 访问量 > 搜索次数
         return new SortField[]{
                 SortField.FIELD_SCORE,
+				new SortField(SearchConstants.BRAND_NAMECN_UNTOKENIZED_FIELD, new StringFieldComparatorSource(keyword)),
+				new SortField(SearchConstants.BRAND_NAMEEN_UNTOKENIZED_FIELD, new StringFieldComparatorSource(keyword)),
                 sortField(SearchConstants.BRAND_WEIGHT_FIELD, Type.DOUBLE, true, Double.MIN_VALUE),
                 sortField(SearchConstants.BRAND_VISIT_COUNT_FIELD, Type.LONG, true, Long.MIN_VALUE),
                 sortField(SearchConstants.BRAND_SEARCH_COUNT_FIELD, Type.LONG, true, Long.MIN_VALUE)

+ 4 - 3
src/main/java/com/uas/search/sort/DefaultFieldComparator.java

@@ -3,6 +3,7 @@ package com.uas.search.sort;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.SimpleFieldComparator;
+import org.springframework.util.StringUtils;
 
 import java.io.IOException;
 
@@ -42,11 +43,11 @@ abstract class DefaultFieldComparator extends SimpleFieldComparator<String> {
      * @return {@link #PARALLEL} 表示并列,{@link #BEHIND} 表示 str1 排在 str2 后面,{@link #AHEAD} 表示排在前面,{@link #UNKOWN} 表示需要进一步判断
      */
     protected int compare(String str1, String str2) {
-        if (str1 == null && str2 == null) {
+        // Null and "" are treated alike: two empty values tie, and a single empty value ranks behind the other.
+        if (StringUtils.isEmpty(str1) && StringUtils.isEmpty(str2)) {
             return PARALLEL;
-        } else if (str1 == null) {
+        } else if (StringUtils.isEmpty(str1)) {
             return BEHIND;
-        } else if (str2 == null) {
+        } else if (StringUtils.isEmpty(str2)) {
             return AHEAD;
         }
         return UNKOWN;

+ 138 - 0
src/main/java/com/uas/search/sort/StringFieldComparatorSource.java

@@ -0,0 +1,138 @@
+package com.uas.search.sort;
+
+import com.uas.search.analyzer.IKAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.FieldComparatorSource;
+import org.springframework.util.StringUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Sorts a {@code StringField} by how closely its value matches a search keyword.
+ * <p>
+ * Values and keyword are compared in lower case. A value containing the keyword ranks ahead of one that
+ * does not; among values that both contain it, the earlier first occurrence and then the shorter value
+ * rank higher. When neither contains the keyword, the value with the higher ratio of keyword-token
+ * occurrences to its length ranks higher.
+ *
+ * @author sunyj
+ * @since 2017/11/23 17:27
+ */
+public class StringFieldComparatorSource extends FieldComparatorSource {
+
+    /**
+     * Search keyword in lower case; may be null or empty, in which case only the base comparison is used.
+     */
+    private final String keyword;
+
+    /**
+     * @param keyword search keyword the field values are ranked against; may be null or empty
+     */
+    public StringFieldComparatorSource(String keyword) {
+        // Lower-case once here instead of re-assigning the field during every comparison.
+        this.keyword = StringUtils.isEmpty(keyword) ? keyword : keyword.toLowerCase();
+    }
+
+    @Override
+    public FieldComparator<?> newComparator(final String fieldname, int numHits, int sortPos, boolean reversed)
+            throws IOException {
+        return new DefaultFieldComparator(fieldname, numHits) {
+
+            /**
+             * Tokens of the keyword, produced on first use and then reused, because every call to
+             * {@link #tokenize()} creates a new analyzer.
+             */
+            private List<String> tokenizedWords;
+
+            /**
+             * Ranks two field values against the keyword.
+             *
+             * @return the result of the base comparison when it is decisive or when there is no keyword,
+             *         otherwise AHEAD, BEHIND or PARALLEL according to the rules described on the class
+             */
+            @Override
+            protected int compare(String str1, String str2) {
+                int superCompare = super.compare(str1, str2);
+                if (superCompare != UNKOWN || StringUtils.isEmpty(keyword)) {
+                    return superCompare;
+                }
+
+                str1 = str1.toLowerCase();
+                str2 = str2.toLowerCase();
+
+                int index1 = str1.indexOf(keyword);
+                int index2 = str2.indexOf(keyword);
+                boolean contains1 = index1 >= 0;
+                boolean contains2 = index2 >= 0;
+
+                // Only one side contains the keyword: that side ranks first.
+                if (contains1 != contains2) {
+                    return contains1 ? AHEAD : BEHIND;
+                }
+
+                if (contains1) {
+                    // Both contain the keyword: the earlier first occurrence ranks first.
+                    if (index1 != index2) {
+                        return index1 < index2 ? AHEAD : BEHIND;
+                    }
+                    // Same position: the shorter string ranks first.
+                    if (str1.length() != str2.length()) {
+                        return str1.length() < str2.length() ? AHEAD : BEHIND;
+                    }
+                    // Same length as well: cut off everything up to the end of the first occurrence
+                    // and compare the remainders recursively.
+                    int end = index1 + keyword.length();
+                    return compare(str1.substring(end), str2.substring(end));
+                }
+
+                // Neither contains the keyword: count how often the keyword's tokens occur in each
+                // string, relative to the string length; the higher ratio ranks first.
+                int count1 = 0;
+                int count2 = 0;
+                for (String tokenizedWord : tokenizedWords()) {
+                    count1 += count(str1, tokenizedWord);
+                    count2 += count(str2, tokenizedWord);
+                }
+                double percent1 = count1 / (1.0 * str1.length());
+                double percent2 = count2 / (1.0 * str2.length());
+                if (percent1 > percent2) {
+                    return AHEAD;
+                } else if (percent1 < percent2) {
+                    return BEHIND;
+                }
+
+                // Same ratio: the shorter string ranks first, equal lengths tie.
+                int length1 = str1.length();
+                int length2 = str2.length();
+                if (length1 < length2) {
+                    return AHEAD;
+                } else if (length1 > length2) {
+                    return BEHIND;
+                }
+                return PARALLEL;
+            }
+
+            /**
+             * @return the keyword's tokens, tokenizing on the first call only
+             */
+            private List<String> tokenizedWords() {
+                if (tokenizedWords == null) {
+                    tokenizedWords = tokenize();
+                }
+                return tokenizedWords;
+            }
+
+            /**
+             * Splits the keyword into tokens with {@code IKAnalyzer}.
+             *
+             * @return the tokens in the order the analyzer emits them
+             * @throws IllegalStateException if the analyzer fails with an {@link IOException}
+             */
+            private List<String> tokenize() {
+                List<String> keywords = new ArrayList<>();
+                try (Analyzer analyzer = new IKAnalyzer(true);
+                     TokenStream tokenStream = analyzer.tokenStream(fieldname, keyword)) {
+                    tokenStream.reset();
+                    CharTermAttribute cta = tokenStream.addAttribute(CharTermAttribute.class);
+                    while (tokenStream.incrementToken()) {
+                        keywords.add(cta.toString());
+                    }
+                    // Finish the consumer workflow (reset, incrementToken, end, close).
+                    tokenStream.end();
+                } catch (IOException e) {
+                    throw new IllegalStateException("排序时分词错误:fieldname=" + fieldname + ", keyword=" + keyword, e);
+                }
+                return keywords;
+            }
+
+            /**
+             * Counts the non-overlapping occurrences of {@code sub} in {@code str}.
+             * <p>
+             * {@code sub} is matched literally, so tokens containing regex metacharacters are safe.
+             *
+             * @return the number of occurrences; 0 when {@code sub} is empty
+             */
+            private int count(String str, String sub) {
+                if (sub.isEmpty()) {
+                    return 0;
+                }
+                int count = 0;
+                Matcher matcher = Pattern.compile(sub, Pattern.LITERAL).matcher(str);
+                while (matcher.find()) {
+                    count++;
+                }
+                return count;
+            }
+        };
+    }
+}

+ 4 - 0
src/main/java/com/uas/search/util/ObjectToDocumentUtils.java

@@ -99,10 +99,14 @@ public class ObjectToDocumentUtils {
 		document.add(new StringField(SearchConstants.BRAND_ID_FIELD, String.valueOf(brand.getId()), Store.YES));
 		document.add(new TextField(SearchConstants.BRAND_NAMECN_FIELD, brand.getNameCn(), Store.YES));
 		document.add(new BinaryDocValuesField(SearchConstants.BRAND_NAMECN_FIELD, new BytesRef(brand.getNameCn())));
+        document.add(new StringField(SearchConstants.BRAND_NAMECN_UNTOKENIZED_FIELD, brand.getNameCn().toLowerCase(), Store.YES));
+        document.add(new BinaryDocValuesField(SearchConstants.BRAND_NAMECN_UNTOKENIZED_FIELD, new BytesRef(brand.getNameCn())));
 		document.add(new StringField(SearchConstants.BRAND_UUID_FIELD, brand.getUuid(), Store.YES));
 		if (brand.getNameEn() != null) {
 			document.add(new TextField(SearchConstants.BRAND_NAMEEN_FIELD, brand.getNameEn(), Store.YES));
 			document.add(new BinaryDocValuesField(SearchConstants.BRAND_NAMEEN_FIELD, new BytesRef(brand.getNameEn())));
+            document.add(new StringField(SearchConstants.BRAND_NAMEEN_UNTOKENIZED_FIELD, brand.getNameEn().toLowerCase(), Store.YES));
+            document.add(new BinaryDocValuesField(SearchConstants.BRAND_NAMEEN_UNTOKENIZED_FIELD, new BytesRef(brand.getNameEn())));
 		}
         if (brand.getVisitCount() != null) {
             document.add(new DoubleDocValuesField(SearchConstants.BRAND_VISIT_COUNT_FIELD, brand.getVisitCount()));

+ 41 - 1
src/main/java/com/uas/search/util/SearchUtils.java

@@ -78,6 +78,21 @@ public class SearchUtils {
 		return getBooleanQuery(field, keyword, true, true, Occur.MUST);
 	}
 
+	/**
+	 * Tokenizes the keyword and combines the per-token queries into a BooleanQuery,
+	 * with smart tokenization and the custom dictionary enabled and every token required (Occur.MUST).
+	 *
+	 * @param field
+	 *            name of the field to search
+	 * @param keyword
+	 *            search keyword
+	 * @param useRegexpQuery
+	 *            whether to build a RegexpQuery per token instead of a PrefixQuery
+	 * @return the combined query, or null when field or keyword is empty
+	 */
+	public static BooleanQuery getBooleanQuery(String field, String keyword, boolean useRegexpQuery) {
+		final boolean useSmart = true;
+		final boolean loadExtDic = true;
+		return getBooleanQuery(field, keyword, useSmart, loadExtDic, Occur.MUST, useRegexpQuery);
+	}
+
 	/**
 	 * 对搜索词进行分词后组合得到BooleanQuery
 	 * 
@@ -94,6 +109,27 @@ public class SearchUtils {
 	 * @return
 	 */
 	public static BooleanQuery getBooleanQuery(String field, String keyword, boolean useSmart, boolean loadExtDic, Occur occur) {
+		// This overload keeps PrefixQuery matching, so RegexpQuery is switched off.
+		final boolean useRegexpQuery = false;
+		return getBooleanQuery(field, keyword, useSmart, loadExtDic, occur, useRegexpQuery);
+	}
+
+	/**
+	 * 对搜索词进行分词后组合得到BooleanQuery
+	 *
+	 * @param field
+	 *            搜索的域名
+	 * @param keyword
+	 *            搜索关键词
+	 * @param useSmart
+	 *            是否以最大粒度进行分词
+	 * @param loadExtDic
+	 *            是否加载自定义词典
+	 * @param occur
+	 *            多个Query之间的关系
+	 * @param useRegexpQuery
+	 *            是否使用 RegexpQuery
+	 * @return
+	 */
+	public static BooleanQuery getBooleanQuery(String field, String keyword, boolean useSmart, boolean loadExtDic, Occur occur, boolean useRegexpQuery) {
 		if (StringUtils.isEmpty(field) || StringUtils.isEmpty(keyword)) {
 			return null;
 		}
@@ -104,7 +140,11 @@ public class SearchUtils {
 			tokenStream.reset();
 			CharTermAttribute cta = tokenStream.addAttribute(CharTermAttribute.class);
 			while (tokenStream.incrementToken()) {
-				booleanQuery.add(new PrefixQuery(new Term(field, cta.toString())), occur);
+				if(!useRegexpQuery){
+					booleanQuery.add(new PrefixQuery(new Term(field, cta.toString())), occur);
+				} else{
+					booleanQuery.add(new RegexpQuery(new Term(field, ".*" + cta.toString().toLowerCase() + ".*")), occur);
+				}
 			}
 			tokenStream.close();
 			analyzer.close();