|
|
@@ -1,8 +1,17 @@
|
|
|
package com.uas.search;
|
|
|
|
|
|
+import com.hankcs.hanlp.HanLP;
|
|
|
+import com.hankcs.hanlp.seg.Segment;
|
|
|
+import com.hankcs.hanlp.seg.common.Term;
|
|
|
+import com.hankcs.hanlp.suggest.Suggester;
|
|
|
+import com.uas.search.util.CollectionUtils;
|
|
|
import com.uas.search.util.StringUtils;
|
|
|
+import org.apache.lucene.analysis.Analyzer;
|
|
|
+import org.apache.lucene.analysis.TokenStream;
|
|
|
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
|
import org.slf4j.Logger;
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
+import org.wltea.analyzer.lucene.IKAnalyzer;
|
|
|
|
|
|
import java.io.BufferedReader;
|
|
|
import java.io.File;
|
|
|
@@ -20,15 +29,34 @@ import java.util.List;
|
|
|
public class DictionaryHelper {
|
|
|
|
|
|
public static void main(String[] args) throws IOException {
|
|
|
-// readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\kind.txt"));
|
|
|
- readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\brand.txt"));
|
|
|
+ List<String> words = readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\kind.txt"));
|
|
|
+// List<String> words=readDict(new File("C:\\Users\\sunyj-pc\\Desktop\\brand.txt"));
|
|
|
+// List<String> words= Arrays.asList("签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。",
|
|
|
+// "王国强、高峰、汪洋、张朝阳光着头、韩寒、小四",
|
|
|
+// "张浩和胡健康复员回家了",
|
|
|
+// "王总和小丽结婚了",
|
|
|
+// "编剧邵钧林和稽道青说",
|
|
|
+// "这里有关天培的有关事迹",
|
|
|
+// "龚学平等领导,邓颖超生前");
|
|
|
+ print(hanlp(words));
|
|
|
+// print(ik(words));
|
|
|
+// suggest();
|
|
|
}
|
|
|
|
|
|
- public static void readDict(File file) throws IOException {
|
|
|
+ private static void print(List<String> list) {
|
|
|
+ if (CollectionUtils.isEmpty(list)) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ for (String str : list) {
|
|
|
+ System.out.println(str);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static List<String> readDict(File file) throws IOException {
|
|
|
+ List<String> words = new ArrayList<>();
|
|
|
try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) {
|
|
|
Logger logger = LoggerFactory.getLogger(DictionaryHelper.class);
|
|
|
String line;
|
|
|
- List<String> words = new ArrayList<>();
|
|
|
while ((line = bufferedReader.readLine()) != null) {
|
|
|
if (StringUtils.isEmpty(line)) {
|
|
|
logger.error("line 为空 <" + line + ">");
|
|
|
@@ -48,10 +76,60 @@ public class DictionaryHelper {
|
|
|
words.add(str);
|
|
|
}
|
|
|
}
|
|
|
- System.out.println("------------------------------------------------");
|
|
|
- for (String word : words) {
|
|
|
- System.out.println(word);
|
|
|
+ }
|
|
|
+ return words;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static List<String> hanlp(List<String> words) throws IOException {
|
|
|
+ List<String> result = new ArrayList<>();
|
|
|
+ Segment segment = HanLP.newSegment().enableNameRecognize(true);
|
|
|
+ for (String sentence : words) {
|
|
|
+ List<Term> termList = segment.seg(sentence);
|
|
|
+ for (Term term : termList) {
|
|
|
+ result.add(term.word);
|
|
|
+ }
|
|
|
+// List<String> keywords = HanLP.extractKeyword(sentence, 5);
|
|
|
+ }
|
|
|
+ for(int i = result.size() - 1; i >= 0; i--){
|
|
|
+ if(result.get(i).matches("[`~!@#$^&*()=|{}':;',\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“'。,、?\\s]+")){
|
|
|
+ result.remove(i);
|
|
|
}
|
|
|
}
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static List<String> ik(List<String> words) throws IOException {
|
|
|
+ List<String> result = new ArrayList<>();
|
|
|
+ for (String sentence : words) {
|
|
|
+ Analyzer analyzer = new IKAnalyzer(true);
|
|
|
+ TokenStream tokenStream = analyzer.tokenStream("", sentence);
|
|
|
+ tokenStream.reset();
|
|
|
+ CharTermAttribute cta = tokenStream.addAttribute(CharTermAttribute.class);
|
|
|
+ while (tokenStream.incrementToken()) {
|
|
|
+ result.add(cta.toString());
|
|
|
+ }
|
|
|
+ tokenStream.close();
|
|
|
+ analyzer.close();
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void suggest() {
|
|
|
+ Suggester suggester = new Suggester();
|
|
|
+ String[] titleArray =
|
|
|
+ (
|
|
|
+ "威廉王子发表演说 呼吁保护野生动物\n" +
|
|
|
+ "《时代》年度人物最终入围名单出炉 普京马云入选\n" +
|
|
|
+ "“黑格比”横扫菲:菲吸取“海燕”经验及早疏散\n" +
|
|
|
+ "日本保密法将正式生效 日媒指其损害国民知情权\n" +
|
|
|
+ "英报告说空气污染带来“公共健康危机”"
|
|
|
+ ).split("\\n");
|
|
|
+ for (String title : titleArray) {
|
|
|
+ suggester.addSentence(title);
|
|
|
+ }
|
|
|
+
|
|
|
+ System.out.println(suggester.suggest("发言", 2)); // 语义
|
|
|
+ System.out.println(suggester.suggest("危机公共", 2)); // 字符
|
|
|
+ System.out.println(suggester.suggest("mayun", 1)); // 拼音
|
|
|
}
|
|
|
}
|