sunyj 8 лет назад
Родитель
Сommit
20bbc86a2a
2 измененных файлов с 91 добавлено и 7 удалено
  1. 6 0
      pom.xml
  2. 85 7
      src/main/java/com/uas/search/DictionaryHelper.java

+ 6 - 0
pom.xml

@@ -102,6 +102,12 @@
 			<artifactId>IKAnalyzer</artifactId>
 			<version>${ikanalyzer.version}</version>
 		</dependency>
+
+		<dependency> <!-- HanLP; DictionaryHelper imports its Segment and Suggester classes -->
+			<groupId>com.hankcs</groupId>
+			<artifactId>hanlp</artifactId>
+			<version>portable-1.5.2</version>
+		</dependency>
 	</dependencies>
 
 	<build>

+ 85 - 7
src/main/java/com/uas/search/DictionaryHelper.java

@@ -1,8 +1,17 @@
 package com.uas.search;
 
+import com.hankcs.hanlp.HanLP;
+import com.hankcs.hanlp.seg.Segment;
+import com.hankcs.hanlp.seg.common.Term;
+import com.hankcs.hanlp.suggest.Suggester;
+import com.uas.search.util.CollectionUtils;
 import com.uas.search.util.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.wltea.analyzer.lucene.IKAnalyzer;
 
 import java.io.BufferedReader;
 import java.io.File;
@@ -20,15 +29,34 @@ import java.util.List;
 public class DictionaryHelper {
 
+    /**
+     * Ad-hoc entry point: reads a dictionary file from a hard-coded local path,
+     * segments its entries with {@code hanlp} and prints the resulting terms.
+     * The commented-out lines are alternative inputs and alternative experiments
+     * ({@code ik} segmentation, {@code suggest}).
+     *
+     * @param args command-line arguments (not read)
+     * @throws IOException propagated from {@code readDict} or {@code hanlp}
+     */
     public static void main(String[] args) throws IOException {
-//        readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\kind.txt"));
-        readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\brand.txt"));
+        List<String> words = readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\kind.txt"));
+//         List<String> words=readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\brand.txt"));
+//        List<String> words= Arrays.asList("签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。",
+//                "王国强、高峰、汪洋、张朝阳光着头、韩寒、小四",
+//                "张浩和胡健康复员回家了",
+//                "王总和小丽结婚了",
+//                "编剧邵钧林和稽道青说",
+//                "这里有关天培的有关事迹",
+//                "龚学平等领导,邓颖超生前");
+        print(hanlp(words));
+//        print(ik(words));
+//        suggest();
     }
 
-    public static void readDict(File file) throws IOException {
+    /**
+     * Writes every element of the list to standard output, one per line.
+     * Nothing is printed when {@code CollectionUtils.isEmpty} reports the list as empty.
+     *
+     * @param list the strings to print
+     */
+    private static void print(List<String> list) {
+        if (!CollectionUtils.isEmpty(list)) {
+            for (String item : list) {
+                System.out.println(item);
+            }
+        }
+    }
+
+    private static List<String> readDict(File file) throws IOException {
+        List<String> words = new ArrayList<>();
         try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) {
             Logger logger = LoggerFactory.getLogger(DictionaryHelper.class);
             String line;
-            List<String> words = new ArrayList<>();
             while ((line = bufferedReader.readLine()) != null) {
                 if (StringUtils.isEmpty(line)) {
                     logger.error("line 为空 <" + line + ">");
@@ -48,10 +76,60 @@ public class DictionaryHelper {
                     words.add(str);
                 }
             }
-            System.out.println("------------------------------------------------");
-            for (String word : words) {
-                System.out.println(word);
+        }
+        return words;
+    }
+
+    /**
+     * Segments every sentence with a HanLP {@link Segment} that has name
+     * recognition enabled and collects the resulting terms, leaving out terms
+     * that consist solely of the punctuation and whitespace characters listed
+     * in the pattern below.
+     *
+     * @param words the sentences (or dictionary entries) to segment; {@code null} yields an empty result
+     * @return the terms in order of appearance, without punctuation-only terms
+     * @throws IOException declared in the signature; kept so existing callers are unaffected
+     */
+    private static List<String> hanlp(List<String> words) throws IOException {
+        List<String> result = new ArrayList<>();
+        if (words == null) {
+            return result;
+        }
+        // Compiled once per call; String.matches would recompile the expression for every term.
+        java.util.regex.Pattern punctuation = java.util.regex.Pattern.compile(
+                "[`~!@#$^&*()=|{}':;',\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“'。,、?\\s]+");
+        Segment segment = HanLP.newSegment().enableNameRecognize(true);
+        for (String sentence : words) {
+            for (Term term : segment.seg(sentence)) {
+                // Filter while collecting so nothing has to be removed from the list afterwards.
+                if (!punctuation.matcher(term.word).matches()) {
+                    result.add(term.word);
+                }
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Tokenizes every sentence with an {@link IKAnalyzer} and collects the
+     * token texts in order of appearance.
+     * The {@code true} constructor flag is passed through unchanged
+     * (presumably selects IK's smart segmentation mode; confirm against the IK docs).
+     *
+     * @param words the sentences to tokenize; {@code null} yields an empty result
+     * @return the text of every token produced, in order
+     * @throws IOException if the token stream fails while being reset, advanced, ended or closed
+     */
+    private static List<String> ik(List<String> words) throws IOException {
+        List<String> result = new ArrayList<>();
+        if (words == null) {
+            return result;
+        }
+        // A single analyzer serves all sentences; try-with-resources closes it even when tokenizing fails.
+        try (Analyzer analyzer = new IKAnalyzer(true)) {
+            for (String sentence : words) {
+                // Each stream is closed before the next one is requested from the analyzer.
+                try (TokenStream tokenStream = analyzer.tokenStream("", sentence)) {
+                    CharTermAttribute cta = tokenStream.addAttribute(CharTermAttribute.class);
+                    tokenStream.reset();
+                    while (tokenStream.incrementToken()) {
+                        result.add(cta.toString());
+                    }
+                    // Completes the consumer workflow: reset, incrementToken..., end, close.
+                    tokenStream.end();
+                }
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Demonstrates the HanLP {@link Suggester}: registers five sample headlines
+     * and prints the suggestions returned for three example queries.
+     */
+    private static void suggest() {
+        String headlines =
+                "威廉王子发表演说 呼吁保护野生动物\n" +
+                        "《时代》年度人物最终入围名单出炉 普京马云入选\n" +
+                        "“黑格比”横扫菲:菲吸取“海燕”经验及早疏散\n" +
+                        "日本保密法将正式生效 日媒指其损害国民知情权\n" +
+                        "英报告说空气污染带来“公共健康危机”";
+
+        Suggester suggester = new Suggester();
+        // One headline per line of the text above.
+        for (String headline : headlines.split("\\n")) {
+            suggester.addSentence(headline);
+        }
+
+        System.out.println(suggester.suggest("发言", 2));       // semantics
+        System.out.println(suggester.suggest("危机公共", 2));   // characters
+        System.out.println(suggester.suggest("mayun", 1));      // pinyin
+    }
 }