Browse Source

Allow specifying whether to load the extension dictionary when constructing IKAnalyzer

sunyj 8 năm trước cách đây
mục cha
commit
7c46c07461

+ 116 - 0
src/main/java/com/uas/search/analyzer/DefaultConfig.java

@@ -0,0 +1,116 @@
+package com.uas.search.analyzer;
+
+import org.wltea.analyzer.cfg.Configuration;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.*;
+
+/**
+ * Replacement for {@link org.wltea.analyzer.cfg.DefaultConfig} that adds a
+ * {@code loadExtDic} switch.
+ *
+ * <p>When {@code loadExtDic} is {@code false} the XML configuration file is never
+ * read, so both {@link #getExtDictionarys()} and {@link #getExtStopWordDictionarys()}
+ * return empty lists.
+ *
+ * <p>Two instances are equal when their {@code useSmart} and {@code loadExtDic}
+ * flags are equal. Note that {@code useSmart} is mutable, so the hash code of an
+ * instance changes if {@link #setUseSmart(boolean)} is called.
+ *
+ * @author sunyj
+ * @since 2017/11/21 17:00
+ */
+public class DefaultConfig implements Configuration {
+    private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
+    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";
+    private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
+    private static final String EXT_DICT = "ext_dict";
+    private static final String EXT_STOP = "ext_stopwords";
+    /** Separator between dictionary paths inside a single property value. */
+    private static final String PATH_SEPARATOR = ";";
+
+    private final Properties props = new Properties();
+    private boolean useSmart;
+    private final boolean loadExtDic;
+
+    /**
+     * Creates a configuration.
+     *
+     * @param useSmart   whether to segment at the coarsest granularity
+     * @param loadExtDic whether to read the custom dictionary settings from
+     *                   {@code IKAnalyzer.cfg.xml} on the class path
+     */
+    public DefaultConfig(boolean useSmart, boolean loadExtDic) {
+        this.useSmart = useSmart;
+        this.loadExtDic = loadExtDic;
+        // Only read the configuration file when custom dictionaries were requested.
+        if (loadExtDic) {
+            loadProperties();
+        }
+    }
+
+    /**
+     * Reads {@code IKAnalyzer.cfg.xml} into {@link #props}. A missing file is ignored,
+     * and a read or parse failure is reported on stderr and otherwise ignored
+     * (best effort, as before).
+     */
+    private void loadProperties() {
+        // try-with-resources guarantees the stream is closed even when parsing fails;
+        // a null resource (file not found) is skipped by the construct.
+        try (InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME)) {
+            if (input != null) {
+                this.props.loadFromXML(input);
+            }
+        } catch (IOException e) {
+            // InvalidPropertiesFormatException is an IOException, so one handler covers both.
+            e.printStackTrace();
+        }
+    }
+
+    public boolean useSmart() {
+        return this.useSmart;
+    }
+
+    public void setUseSmart(boolean useSmart) {
+        this.useSmart = useSmart;
+    }
+
+    /**
+     * @return whether custom dictionary settings were requested at construction time
+     */
+    public boolean loadExtDic() {
+        return loadExtDic;
+    }
+
+    /**
+     * @return class-path location of the main dictionary
+     */
+    public String getMainDictionary() {
+        return PATH_DIC_MAIN;
+    }
+
+    /**
+     * @return class-path location of the quantifier dictionary
+     */
+    public String getQuantifierDicionary() {
+        return PATH_DIC_QUANTIFIER;
+    }
+
+    /**
+     * @return the trimmed, non-blank entries of the {@code ext_dict} property;
+     *         empty (never {@code null}) when the property is absent
+     */
+    public List<String> getExtDictionarys() {
+        return readPathList(EXT_DICT);
+    }
+
+    /**
+     * @return the trimmed, non-blank entries of the {@code ext_stopwords} property;
+     *         empty (never {@code null}) when the property is absent
+     */
+    public List<String> getExtStopWordDictionarys() {
+        return readPathList(EXT_STOP);
+    }
+
+    /**
+     * Splits the {@code ;}-separated value of the given property into trimmed,
+     * non-blank paths. Returns a new mutable list on every call.
+     */
+    private List<String> readPathList(String key) {
+        List<String> paths = new ArrayList<>(2);
+        String value = this.props.getProperty(key);
+        if (value != null) {
+            for (String rawPath : value.split(PATH_SEPARATOR)) {
+                String path = rawPath.trim();
+                if (!path.isEmpty()) {
+                    paths.add(path);
+                }
+            }
+        }
+        return paths;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+        DefaultConfig other = (DefaultConfig) obj;
+        return useSmart == other.useSmart && loadExtDic == other.loadExtDic;
+    }
+
+    /**
+     * Hashes the same two flags that {@link #equals(Object)} compares, so equal
+     * configurations always produce equal hash codes.
+     */
+    @Override
+    public int hashCode() {
+        return Objects.hash(useSmart, loadExtDic);
+    }
+}

+ 56 - 0
src/main/java/com/uas/search/analyzer/IKAnalyzer.java

@@ -0,0 +1,56 @@
+package com.uas.search.analyzer;
+
+import org.apache.lucene.analysis.Analyzer;
+
+/**
+ * Replacement for {@link org.wltea.analyzer.lucene.IKAnalyzer} that adds a
+ * {@code loadExtDic} property.
+ *
+ * @author sunyj
+ * @since 2017/11/21 16:46
+ */
+public final class IKAnalyzer extends Analyzer {
+
+    /** Whether to segment at the coarsest granularity. */
+    private boolean useSmart;
+
+    /** Whether the custom dictionary is loaded. */
+    private boolean loadExtDic;
+
+    /**
+     * Creates an analyzer with {@code useSmart = false} and {@code loadExtDic = true}.
+     */
+    public IKAnalyzer() {
+        this(false, true);
+    }
+
+    /**
+     * Creates an analyzer with the given granularity and {@code loadExtDic = true}.
+     *
+     * @param useSmart whether to segment at the coarsest granularity
+     */
+    public IKAnalyzer(boolean useSmart) {
+        this(useSmart, true);
+    }
+
+    /**
+     * Creates an analyzer with both settings given explicitly.
+     *
+     * @param useSmart   whether to segment at the coarsest granularity
+     * @param loadExtDic whether the custom dictionary is loaded
+     */
+    public IKAnalyzer(boolean useSmart, boolean loadExtDic) {
+        this.loadExtDic = loadExtDic;
+        this.useSmart = useSmart;
+    }
+
+    public boolean useSmart() {
+        return useSmart;
+    }
+
+    public void setUseSmart(boolean useSmart) {
+        this.useSmart = useSmart;
+    }
+
+    public boolean loadExtDic() {
+        return this.loadExtDic;
+    }
+
+    public void setLoadExtDic(boolean loadExtDic) {
+        this.loadExtDic = loadExtDic;
+    }
+
+    /**
+     * Builds the token stream components for a field from a new {@link IKTokenizer}
+     * configured with this analyzer's current {@code useSmart} and {@code loadExtDic}
+     * values. The field name is not used.
+     */
+    protected TokenStreamComponents createComponents(String fieldName) {
+        return new TokenStreamComponents(new IKTokenizer(useSmart, loadExtDic));
+    }
+}

+ 60 - 0
src/main/java/com/uas/search/analyzer/IKTokenizer.java

@@ -0,0 +1,60 @@
+package com.uas.search.analyzer;
+
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.core.IKSegmenter;
+import org.wltea.analyzer.core.Lexeme;
+
+import java.io.IOException;
+
+/**
+ * Replacement for {@link org.wltea.analyzer.lucene.IKTokenizer} that adds a
+ * {@code loadExtDic} parameter.
+ *
+ * <p>Each token produced by the underlying {@link IKSegmenter} is exposed through
+ * the term, offset and type attributes.
+ *
+ * @author sunyj
+ * @since 2017/11/21 16:48
+ */
+public final class IKTokenizer extends Tokenizer {
+    private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class);
+    private final TypeAttribute typeAtt = this.addAttribute(TypeAttribute.class);
+    private final IKSegmenter _IKImplement;
+    /** End position of the most recently emitted lexeme; reported as the final offset by {@link #end()}. */
+    private int endPosition;
+
+    /**
+     * Creates a tokenizer that loads the custom dictionary.
+     *
+     * @param useSmart whether to segment at the coarsest granularity
+     */
+    public IKTokenizer(boolean useSmart) {
+        this(useSmart, true);
+    }
+
+    /**
+     * Creates a tokenizer.
+     *
+     * @param useSmart   whether to segment at the coarsest granularity
+     * @param loadExtDic whether the custom dictionary is loaded
+     */
+    public IKTokenizer(boolean useSmart, boolean loadExtDic) {
+        Configuration configuration = new DefaultConfig(useSmart, loadExtDic);
+        this._IKImplement = new IKSegmenter(this.input, configuration);
+    }
+
+    /**
+     * Advances to the next lexeme and copies its text, offsets and type into the
+     * attributes.
+     *
+     * @return {@code true} if a token was produced, {@code false} when the segmenter is exhausted
+     * @throws IOException if the segmenter fails to read the input
+     */
+    public boolean incrementToken() throws IOException {
+        this.clearAttributes();
+        Lexeme nextLexeme = this._IKImplement.next();
+        if (nextLexeme == null) {
+            return false;
+        }
+        this.termAtt.append(nextLexeme.getLexemeText());
+        this.termAtt.setLength(nextLexeme.getLength());
+        this.offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
+        this.endPosition = nextLexeme.getEndPosition();
+        this.typeAtt.setType(nextLexeme.getLexemeTypeString());
+        return true;
+    }
+
+    /**
+     * Resets the tokenizer and points the segmenter at the current input.
+     *
+     * @throws IOException if the superclass reset fails
+     */
+    public void reset() throws IOException {
+        super.reset();
+        // Forget the previous input's last position so that end() cannot report a
+        // stale offset when the new input yields no tokens.
+        this.endPosition = 0;
+        this._IKImplement.reset(this.input);
+    }
+
+    /**
+     * Finishes the stream and sets the final offset to the corrected end position
+     * of the last emitted lexeme.
+     *
+     * @throws IllegalStateException if the superclass {@code end()} fails with an
+     *                               {@link IOException}; wrapped so this method's
+     *                               signature stays free of checked exceptions
+     */
+    public final void end() {
+        // The Lucene TokenStream contract asks overriding classes to always call
+        // super.end() before setting their own end-of-stream attribute values.
+        try {
+            super.end();
+        } catch (IOException e) {
+            throw new IllegalStateException("Failed to finish the token stream", e);
+        }
+        int finalOffset = this.correctOffset(this.endPosition);
+        this.offsetAtt.setOffset(finalOffset, finalOffset);
+    }
+}

+ 4 - 5
src/main/java/com/uas/search/service/impl/IndexServiceImpl.java

@@ -863,10 +863,9 @@ public class IndexServiceImpl implements IndexService {
     @Override
     public List<Object> updateIndexByNewWords(List<String> newWords) {
         List<Object> updatedObjects = new ArrayList<>();
-        updatedObjects.add(updateIndexByNewWords(newWords, SearchConstants.KIND_TABLE_NAME, SearchConstants.KIND_ID_FIELD, SearchConstants.KIND_NAMECN_FIELD));
-        updatedObjects.add(updateIndexByNewWords(newWords, SearchConstants.BRAND_TABLE_NAME, SearchConstants.BRAND_ID_FIELD, SearchConstants.BRAND_NAMECN_FIELD, SearchConstants.BRAND_NAMEEN_FIELD));
-        updatedObjects.add(updateIndexByNewWords(newWords, SearchConstants.COMPONENT_TABLE_NAME, SearchConstants.COMPONENT_ID_FIELD, SearchConstants.COMPONENT_KINDNAME_FIELD));
-        updatedObjects.add(updateIndexByNewWords(newWords, SearchConstants.COMPONENT_TABLE_NAME, SearchConstants.COMPONENT_ID_FIELD, SearchConstants.COMPONENT_BRANDNAMECN_FIELD, SearchConstants.COMPONENT_BRANDNAMEEN_FIELD));
+        updatedObjects.addAll(updateIndexByNewWords(newWords, SearchConstants.KIND_TABLE_NAME, SearchConstants.KIND_ID_FIELD, SearchConstants.KIND_NAMECN_FIELD));
+        updatedObjects.addAll(updateIndexByNewWords(newWords, SearchConstants.BRAND_TABLE_NAME, SearchConstants.BRAND_ID_FIELD, SearchConstants.BRAND_NAMECN_FIELD, SearchConstants.BRAND_NAMEEN_FIELD));
+        updatedObjects.addAll(updateIndexByNewWords(newWords, SearchConstants.COMPONENT_TABLE_NAME, SearchConstants.COMPONENT_ID_FIELD, SearchConstants.COMPONENT_KINDNAME_FIELD, SearchConstants.COMPONENT_BRANDNAMECN_FIELD, SearchConstants.COMPONENT_BRANDNAMEEN_FIELD));
         return updatedObjects;
     }
 
@@ -885,7 +884,7 @@ public class IndexServiceImpl implements IndexService {
             BooleanQuery booleanQuery = new BooleanQuery();
             for (String newWord : newWords) {
                 for (String field : fields) {
-                    booleanQuery.add(SearchUtils.getBooleanQuery(field, newWord, false, BooleanClause.Occur.MUST), BooleanClause.Occur.SHOULD);
+                    booleanQuery.add(SearchUtils.getBooleanQuery(field, newWord, false, false, BooleanClause.Occur.MUST), BooleanClause.Occur.SHOULD);
                 }
             }
             logger.info(booleanQuery.toString());

+ 8 - 9
src/main/java/com/uas/search/support/IndexWriterManager.java

@@ -1,21 +1,20 @@
 package com.uas.search.support;
 
-import java.io.IOException;
-import java.nio.file.Paths;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
+import com.uas.search.analyzer.IKAnalyzer;
+import com.uas.search.util.SearchUtils;
+import com.uas.search.util.StringUtils;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.NIOFSDirectory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.wltea.analyzer.lucene.IKAnalyzer;
 
-import com.uas.search.util.SearchUtils;
-import com.uas.search.util.StringUtils;
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 /**
  * 对IndexWriter进行管理,防止同时有多个方法对索引进行修改,抛出LockObtainFailedException异常

+ 6 - 4
src/main/java/com/uas/search/util/SearchUtils.java

@@ -1,6 +1,7 @@
 package com.uas.search.util;
 
 import com.uas.search.LuceneProperties;
+import com.uas.search.analyzer.IKAnalyzer;
 import com.uas.search.constant.SearchConstants;
 import com.uas.search.constant.model.SPage;
 import com.uas.search.exception.SearchException;
@@ -17,7 +18,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.util.CollectionUtils;
 import org.springframework.util.StringUtils;
-import org.wltea.analyzer.lucene.IKAnalyzer;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -75,7 +75,7 @@ public class SearchUtils {
 	 * @return
 	 */
 	public static BooleanQuery getBooleanQuery(String field, String keyword) {
-		return getBooleanQuery(field, keyword, true, Occur.MUST);
+		return getBooleanQuery(field, keyword, true, true, Occur.MUST);
 	}
 
 	/**
@@ -87,16 +87,18 @@ public class SearchUtils {
 	 *            搜索关键词
      * @param useSmart
      *            是否以最大粒度进行分词
+	 * @param loadExtDic
+	 *            是否加载自定义词典
 	 * @param occur
 	 *            多个Query之间的关系
 	 * @return
 	 */
-	public static BooleanQuery getBooleanQuery(String field, String keyword, boolean useSmart, Occur occur) {
+	public static BooleanQuery getBooleanQuery(String field, String keyword, boolean useSmart, boolean loadExtDic, Occur occur) {
 		if (StringUtils.isEmpty(field) || StringUtils.isEmpty(keyword)) {
 			return null;
 		}
 		BooleanQuery booleanQuery = new BooleanQuery();
-		Analyzer analyzer = new IKAnalyzer(useSmart);
+		Analyzer analyzer = new IKAnalyzer(useSmart, loadExtDic);
 		try {
 			TokenStream tokenStream = analyzer.tokenStream(field, keyword);
 			tokenStream.reset();

+ 282 - 0
src/main/java/org/wltea/analyzer/dic/Dictionary.java

@@ -0,0 +1,282 @@
+//
+// Source code recreated from a .class file by IntelliJ IDEA
+// (powered by Fernflower decompiler)
+//
+
+package org.wltea.analyzer.dic;
+
+import org.wltea.analyzer.cfg.Configuration;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Overrides the library's {@link Dictionary} class in place, changing how the
+ * shared instance is managed: {@link #initial(Configuration)} rebuilds the
+ * dictionary whenever it is called with a configuration that is not equal to the
+ * one the current instance was built from.
+ *
+ * <p>Because configurations are compared with {@code equals}, a
+ * {@link Configuration} implementation that does not override {@code equals}
+ * causes a full reload on every {@code initial} call made with a new instance.
+ *
+ * <p>There is a single shared instance per class loader; callers that use
+ * different configurations concurrently replace each other's dictionary.
+ *
+ * @author sunyj
+ * @since 2017/11/21 17:00
+ */
+public class Dictionary {
+    /** Buffer size used when reading dictionary files. */
+    private static final int READ_BUFFER_SIZE = 512;
+
+    /**
+     * The shared instance. Declared {@code volatile} so that the unsynchronized
+     * first check in {@link #initial(Configuration)} never observes a partially
+     * constructed dictionary.
+     */
+    private static volatile Dictionary singleton;
+    private DictSegment _MainDict;
+    private DictSegment _StopWordDict;
+    private DictSegment _QuantifierDict;
+    private final Configuration cfg;
+
+    private Dictionary(Configuration cfg) {
+        this.cfg = cfg;
+        this.loadMainDict();
+        this.loadStopWordDict();
+        this.loadQuantifierDict();
+    }
+
+    /**
+     * Returns the shared dictionary for {@code cfg}, building it first if no
+     * instance exists yet or the existing one was built from a different
+     * configuration.
+     *
+     * @param cfg configuration to load the dictionaries from
+     * @return a dictionary whose configuration equals {@code cfg}
+     */
+    public static Dictionary initial(Configuration cfg) {
+        Dictionary current = singleton;
+        // Rebuild the instance if the configuration differs from the previous one.
+        if (current == null || !Objects.equals(cfg, current.getCfg())) {
+            synchronized (Dictionary.class) {
+                // Re-check under the lock: another thread may have rebuilt it already.
+                current = singleton;
+                if (current == null || !Objects.equals(cfg, current.getCfg())) {
+                    current = new Dictionary(cfg);
+                    singleton = current;
+                }
+            }
+        }
+        // Return the instance that was verified against cfg, not a fresh read of the
+        // static field, which another thread could have replaced in the meantime.
+        return current;
+    }
+
+    /**
+     * @return the shared dictionary
+     * @throws IllegalStateException if {@link #initial(Configuration)} has not been called yet
+     */
+    public static Dictionary getSingleton() {
+        Dictionary current = singleton;
+        if (current == null) {
+            throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
+        }
+        return current;
+    }
+
+    /**
+     * @return the configuration this instance was built from
+     */
+    public Configuration getCfg() {
+        return cfg;
+    }
+
+    /**
+     * Adds the given words (trimmed and lower-cased) to the main dictionary.
+     * {@code null} collections and {@code null} elements are ignored.
+     *
+     * <p>Like the other lookup methods below, this goes through the current
+     * {@code singleton} rather than {@code this}, so after a rebuild a stale
+     * reference operates on the newest dictionary.
+     */
+    public void addWords(Collection<String> words) {
+        if (words == null) {
+            return;
+        }
+        for (String word : words) {
+            if (word != null) {
+                singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
+            }
+        }
+    }
+
+    /**
+     * Disables the given words (trimmed and lower-cased) in the main dictionary.
+     * {@code null} collections and {@code null} elements are ignored.
+     */
+    public void disableWords(Collection<String> words) {
+        if (words == null) {
+            return;
+        }
+        for (String word : words) {
+            if (word != null) {
+                singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
+            }
+        }
+    }
+
+    /** Matches the whole array against the main dictionary. */
+    public Hit matchInMainDict(char[] charArray) {
+        return singleton._MainDict.match(charArray);
+    }
+
+    /** Matches {@code length} characters starting at {@code begin} against the main dictionary. */
+    public Hit matchInMainDict(char[] charArray, int begin, int length) {
+        return singleton._MainDict.match(charArray, begin, length);
+    }
+
+    /** Matches {@code length} characters starting at {@code begin} against the quantifier dictionary. */
+    public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
+        return singleton._QuantifierDict.match(charArray, begin, length);
+    }
+
+    /**
+     * Continues a previous match by one character, starting from the dictionary
+     * segment recorded in {@code matchedHit}.
+     */
+    public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
+        DictSegment ds = matchedHit.getMatchedDictSegment();
+        return ds.match(charArray, currentIndex, 1, matchedHit);
+    }
+
+    /** @return whether the given character range is a full match in the stop-word dictionary */
+    public boolean isStopWord(char[] charArray, int begin, int length) {
+        return singleton._StopWordDict.match(charArray, begin, length).isMatch();
+    }
+
+    /**
+     * Loads the main dictionary, then the extension dictionaries on top of it.
+     *
+     * @throws RuntimeException if the main dictionary resource cannot be found
+     */
+    private void loadMainDict() {
+        this._MainDict = new DictSegment(Character.valueOf('\u0000'));
+        InputStream is = this.getClass().getClassLoader().getResourceAsStream(this.cfg.getMainDictionary());
+        if (is == null) {
+            throw new RuntimeException("Main Dictionary not found!!!");
+        }
+        fillFrom(is, this._MainDict, "Main Dictionary loading exception.");
+        // Extension dictionaries are loaded even if reading the main one failed part-way.
+        this.loadExtDict();
+    }
+
+    /**
+     * Adds every configured extension dictionary to the main dictionary.
+     * Resources that cannot be found are skipped.
+     */
+    private void loadExtDict() {
+        List<String> extDictFiles = this.cfg.getExtDictionarys();
+        if (extDictFiles == null) {
+            return;
+        }
+        for (String extDictName : extDictFiles) {
+            System.out.println("加载扩展词典:" + extDictName);
+            InputStream is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
+            if (is != null) {
+                fillFrom(is, this._MainDict, "Extension Dictionary loading exception.");
+            }
+        }
+    }
+
+    /**
+     * Builds the stop-word dictionary from every configured extension stop-word
+     * file. Resources that cannot be found are skipped.
+     */
+    private void loadStopWordDict() {
+        this._StopWordDict = new DictSegment(Character.valueOf('\u0000'));
+        List<String> extStopWordDictFiles = this.cfg.getExtStopWordDictionarys();
+        if (extStopWordDictFiles == null) {
+            return;
+        }
+        for (String extStopWordDictName : extStopWordDictFiles) {
+            System.out.println("加载扩展停止词典:" + extStopWordDictName);
+            InputStream is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
+            if (is != null) {
+                fillFrom(is, this._StopWordDict, "Extension Stop word Dictionary loading exception.");
+            }
+        }
+    }
+
+    /**
+     * Loads the quantifier dictionary.
+     *
+     * @throws RuntimeException if the quantifier dictionary resource cannot be found
+     */
+    private void loadQuantifierDict() {
+        this._QuantifierDict = new DictSegment(Character.valueOf('\u0000'));
+        InputStream is = this.getClass().getClassLoader().getResourceAsStream(this.cfg.getQuantifierDicionary());
+        if (is == null) {
+            throw new RuntimeException("Quantifier Dictionary not found!!!");
+        }
+        fillFrom(is, this._QuantifierDict, "Quantifier Dictionary loading exception.");
+    }
+
+    /**
+     * Reads {@code source} line by line as UTF-8 and adds every non-blank line,
+     * trimmed and lower-cased, to {@code target}. The stream is always closed.
+     * An {@link IOException} is reported on stderr with {@code errorMessage} and
+     * otherwise ignored, so a broken file never aborts dictionary construction.
+     */
+    private static void fillFrom(InputStream source, DictSegment target, String errorMessage) {
+        try (InputStream in = source;
+             BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"), READ_BUFFER_SIZE)) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                String word = line.trim();
+                if (!word.isEmpty()) {
+                    target.fillSegment(word.toLowerCase().toCharArray());
+                }
+            }
+        } catch (IOException e) {
+            System.err.println(errorMessage);
+            e.printStackTrace();
+        }
+    }
+}