├── python
    ├── bilstmcrf
    │   ├── test
    │   │   ├── urllibtest.py
    │   │   ├── printtest.py
    │   │   ├── yieldtest.py
    │   │   ├── classtest.py
    │   │   ├── everythingtest.py
    │   │   ├── tfidftest.py
    │   │   ├── tornadotest.py
    │   │   ├── paddingtest.py
    │   │   └── crftest.py
    │   ├── config.py
    │   ├── embedding.py
    │   ├── util.py
    │   ├── modelserver.py
    │   ├── main.py
    │   └── data.py
    └── bert
    │   └── server.py
├── src
    └── main
    │   ├── resources
    │       ├── lda
    │       │   ├── LdaParameters.txt
    │       │   └── doc
    │       │   │   ├── 1
    │       │   │   ├── 2
    │       │   │   ├── 3
    │       │   │   └── 4
    │       ├── xincrf.properties
    │       └── logback.xml
    │   └── java
    │       ├── segment
    │           ├── crf
    │           │   ├── tcp
    │           │   │   ├── package-info.java
    │           │   │   ├── XinCRFSegmentClient.java
    │           │   │   └── XinCRFSegmentServer.java
    │           │   ├── app
    │           │   │   └── XinCRFApp.java
    │           │   ├── XinCRFConfig.java
    │           │   ├── XinTable.java
    │           │   └── XinCRFSegment.java
    │           ├── Segment.java
    │           ├── bilstmcrf
    │           │   └── BLCSegment.java
    │           └── hmm
    │           │   └── XinHmmSegment.java
    │       ├── mining
    │           ├── data
    │           │   ├── PreProcess.java
    │           │   └── PreProcess20News.java
    │           ├── tfidf
    │           │   ├── Word.java
    │           │   ├── Vocabulary.java
    │           │   ├── OneDocTfDf.java
    │           │   ├── LSICal.java
    │           │   └── AllDocTfIdf.java
    │           ├── cluster
    │           │   ├── ClusterApp.java
    │           │   └── KmeansCluster.java
    │           └── config
    │           │   └── Config.java
    │       ├── lucene
    │           ├── Atom.java
    │           ├── XinAnalyzerApp.java
    │           ├── CharType.java
    │           ├── XinAnalyzer.java
    │           ├── XinTokenizer.java
    │           └── SegmentWrapper.java
    │       ├── test
    │           ├── proxy
    │           │   └── ProxyTest.java
    │           ├── newton
    │           │   ├── TestNewton.java
    │           │   ├── NewtonMethod.java
    │           │   ├── Derivative.java
    │           │   └── GlobalNewtonMethod.java
    │           ├── crf
    │           │   ├── TreeMapTest.java
    │           │   ├── TestTable.java
    │           │   └── DatTest.java
    │           ├── lucene
    │           │   ├── rewriteTokenize
    │           │   │   ├── IKAnalyzer4Lucene7.java
    │           │   │   ├── IKAnalyzerTest.java
    │           │   │   └── IKTokenizer4Lucene7.java
    │           │   ├── EveryThingTest.java
    │           │   └── LuceneTest.java
    │           ├── dl4j
    │           │   ├── DiagTest.java
    │           │   └── MatrixTest.java
    │           ├── socket
    │           │   └── SocketTest.java
    │           ├── hmm
    │           │   ├── baumwelch
    │           │   │   ├── SegmentationUtils.java
    │           │   │   └── IOUtils.java
    │           │   ├── MatricMethodTest.java
    │           │   ├── TestViterbi.java
    │           │   └── HmmTest.java
    │           ├── em
    │           │   └── EmTest.java
    │           └── gmm
    │           │   └── GmmTest.java
    │       ├── lda
    │           ├── LDAConfig.java
    │           ├── LdaGibbsSampling.java
    │           ├── Parameter.java
    │           ├── Documents.java
    │           └── LdaModel.java
    │       └── tools
    │           └── PathUtils.java
├── .settings
    ├── org.eclipse.m2e.core.prefs
    ├── org.eclipse.jdt.apt.core.prefs
    └── org.eclipse.jdt.core.prefs
├── .idea
    ├── markdown-navigator
    │   └── profiles_settings.xml
    ├── encodings.xml
    ├── vcs.xml
    ├── markdown-exported-files.xml
    ├── compiler.xml
    ├── checkstyle-idea.xml
    ├── misc.xml
    ├── inspectionProfiles
    │   └── Project_Default.xml
    └── markdown-navigator.xml
├── .vscode
    ├── settings.json
    └── launch.json
├── .gitignore
├── .project
├── README.md
├── .classpath
└── pom.xml


/python/bilstmcrf/test/urllibtest.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/main/resources/lda/LdaParameters.txt:
--------------------------------------------------------------------------------
1 | alpha	0.5
2 | beta	0.1
3 | topicNum	5
4 | iteration	100
5 | saveStep	10
6 | beginSaveIters	80


--------------------------------------------------------------------------------
/src/main/java/segment/crf/tcp/package-info.java:
--------------------------------------------------------------------------------
1 | package segment.crf.tcp;
2 | /**
3 |  * 为了支持分词，不必每次都启动，做成TCP通信的模式
4 |  */
5 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="MarkdownNavigator.ProfileManager">
2 |   <settings default="" pdf-export="" />
3 | </component>


--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="Encoding" addBOMForNewFiles="with NO BOM" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/python/bilstmcrf/config.py:
--------------------------------------------------------------------------------
1 | rootpath = '/Users/unclewang/.xinlp/python/'
2 | chars_embedding = rootpath + 'w2v2.txt'
3 | split_data = rootpath + 'train.txt'
4 | output_path = rootpath + 'output20181209'
5 | 


--------------------------------------------------------------------------------
/src/main/java/mining/data/PreProcess.java:
--------------------------------------------------------------------------------
1 | package mining.data;
2 | 
3 | import java.io.IOException;
4 | 
5 | public interface PreProcess {
6 |     void preProcess(String dir) throws IOException;
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/java/mining/tfidf/Word.java:
--------------------------------------------------------------------------------
 1 | package mining.tfidf;
 2 | 
 3 | import lombok.Data;
 4 | 
 5 | @Data
 6 | public class Word {
 7 |     private String string;
 8 |     private String stemString;
 9 | }
10 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/test/printtest.py:
--------------------------------------------------------------------------------
1 | d = "sfds " + str(23)
2 | print(d)
3 | 
4 | print("{}".format(23))
5 | print("sadjlsajf:{}:sd{}".format(23, 34))
6 | print(14832.2/4)
7 | print("测试的字数为{}，其中分词正确的字数为{}".format(100, 23))


--------------------------------------------------------------------------------
/.idea/markdown-exported-files.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="MarkdownExportedFiles">
4 |     <htmlFiles />
5 |     <imageFiles />
6 |     <otherFiles />
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.apt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.apt.aptEnabled=true
3 | org.eclipse.jdt.apt.genSrcDir=target/generated-sources/annotations
4 | org.eclipse.jdt.apt.genTestSrcDir=target/generated-test-sources/test-annotations
5 | 


--------------------------------------------------------------------------------
/src/main/java/lucene/Atom.java:
--------------------------------------------------------------------------------
 1 | package lucene;
 2 | 
 3 | import lombok.Data;
 4 | 
 5 | @Data
 6 | public class Atom {
 7 |     private String content; // text of this word
 8 |     // Lombok's @Data generates the getters/setters (setContent, setOffe, ...) for the fields of this class.
 9 | 
10 |     // start position of the current word
11 |     private int offe;
12 |     private int len; // Segment.strings2AtomList stores content.length() here
13 |     private char[] chars; // Segment.strings2AtomList stores content.toCharArray() here
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/resources/xincrf.properties:
--------------------------------------------------------------------------------
1 | crf.model.filePath=/Users/unclewang/Idea_Projects/xinlp/src/main/resources/segment/crf/modelc1.5.txt
2 | crf.model.modelPath=/Users/unclewang/Idea_Projects/xinlp/src/main/resources/segment/crf/xincrfc1.5.model
3 | crf.model.binModelPath=xincrf.model.bin


--------------------------------------------------------------------------------
/python/bilstmcrf/test/yieldtest.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: UTF-8 -*-
 3 | 
 4 | 
 5 | def fab(max):
 6 |     n, a, b = 0, 0, 1  # n counts the terms produced so far; a and b are consecutive Fibonacci numbers
 7 |     while n < max:
 8 |         yield b  # hand the current term to the caller and pause here until the next one is requested
 9 |         # print b
10 |         a, b = b, a + b
11 |         n = n + 1
12 | 
13 | 
14 | for n in fab(5):
15 |     print(n)
16 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/test/classtest.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | class MyClass:
 4 |     """A simple example class."""
 5 |     i = 12345
 6 | 
 7 |     def f(self):
 8 |         return 'hello world'
 9 | 
10 | 
11 | # Instantiate the class
12 | x = MyClass()
13 | 
14 | # Access the class's attribute and method
15 | print("MyClass 类的属性 i 为：", x.i)
16 | print("MyClass 类的方法 f 输出为：", x.f())
17 | 


--------------------------------------------------------------------------------
/src/main/java/segment/crf/app/XinCRFApp.java:
--------------------------------------------------------------------------------
 1 | package segment.crf.app;
 2 | 
 3 | import segment.crf.XinCRFSegment;
 4 | 
 5 | public class XinCRFApp {
 6 |     public static void main(String[] args) {
 7 |         XinCRFSegment xinCRFSegment = new XinCRFSegment();
 8 |         xinCRFSegment.viterbi("今天天气很好");
 9 |     }
10 | }
11 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "files.exclude": {
 3 |         "**/.classpath": true,
 4 |         "**/.project": true,
 5 |         "**/.settings": true,
 6 |         "**/.factorypath": true
 7 |     },
 8 |     "java.configuration.updateBuildConfiguration": "automatic",
 9 |     "java.codeGeneration.generateComments": true,
10 |     "java.test.config": {}
11 |     
12 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea/
 2 | logs/
 3 | src/main/python/bilstmcrf/__pycache__/
 4 | src/main/python/bilstmcrf/data/
 5 | src/main/python/bilstmcrf/output/
 6 | src/test/
 7 | target/
 8 | venv/
 9 | python/bilstmcrf/__pycache__/
10 | python/bilstmcrf/data/
11 | python/bilstmcrf/output/
12 | python/bilstmcrf/output20181209/
13 | src/main/resources/mining/
14 | src/main/resources/segment/crf/
15 | /.idea/misc.xml
16 | 


--------------------------------------------------------------------------------
/src/main/java/test/proxy/ProxyTest.java:
--------------------------------------------------------------------------------
 1 | package test.proxy;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | 
 5 | import java.util.ArrayList;
 6 | import java.util.List;
 7 | 
 8 | public class ProxyTest {
 9 |     @Test
10 |     public void test() {
11 |         String s = "hello";
12 |         List a = new ArrayList<>();
13 | 
14 | 
15 |         System.out.println(":");
16 |         System.out.println("sdfksld");
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/test/everythingtest.py:
--------------------------------------------------------------------------------
 1 | import gensim
 2 | import numpy as np
 3 | 
 4 | import config
 5 | 
 6 | model = gensim.models.Word2Vec.load(config.embedding_char)
 7 | 
 8 | d = {1: model["人"], 2: model["武"]}
 9 | e = list(d.values())
10 | x = np.array(e)
11 | print(x.shape)
12 | print(x)
13 | 
14 | embedding_mat = np.random.uniform(-0.25, 0.25, (2, 300))
15 | embedding_mat = np.float32(embedding_mat)
16 | print(embedding_mat.shape)
17 | print(embedding_mat)
18 | 


--------------------------------------------------------------------------------
/src/main/java/mining/cluster/ClusterApp.java:
--------------------------------------------------------------------------------
 1 | package mining.cluster;
 2 | 
 3 | import mining.tfidf.AllDocTfIdf;
 4 | 
 5 | import java.util.HashMap;
 6 | 
 7 | public class ClusterApp {
 8 |     public static void main(String[] args) {
 9 |         AllDocTfIdf allDocTfIdf = new AllDocTfIdf();
10 |         HashMap<Integer, HashMap<Integer, Double>> idTfIDf = allDocTfIdf.loadAllDocTfIdf();
11 |         KmeansCluster kmeansCluster = new KmeansCluster();
12 |         System.out.println(kmeansCluster.cluster(idTfIDf, 10));
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
 1 | eclipse.preferences.version=1
 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
 3 | org.eclipse.jdt.core.compiler.compliance=1.8
 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
 7 | org.eclipse.jdt.core.compiler.processAnnotations=enabled
 8 | org.eclipse.jdt.core.compiler.release=disabled
 9 | org.eclipse.jdt.core.compiler.source=1.8
10 | 


--------------------------------------------------------------------------------
/src/main/java/lda/LDAConfig.java:
--------------------------------------------------------------------------------
 1 | package lda;
 2 | 
 3 | 
 4 | public class LDAConfig {
 5 |     public static final String RESPATH = "/Users/unclewang/Idea_Projects/xinlp/src/main/resources/lda/result/";
 6 |     public static final String DOCPATH = "/Users/unclewang/Idea_Projects/xinlp/src/main/resources/lda/doc";
 7 |     public static final String PARAMPATH = "/Users/unclewang/Idea_Projects/xinlp/src/main/resources/lda/LdaParameters.txt";
 8 |     public static void main(String[] args) {
 9 |         System.out.println("x");
10 |     }
11 |     
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/java/test/newton/TestNewton.java:
--------------------------------------------------------------------------------
 1 | package test.newton;
 2 | /**
 3 |  * @Author unclewang
 4 |  * @Date 2018-11-27 00:45
 5 |  * https://blog.csdn.net/google19890102/article/details/41087931
 6 |  */
 7 | public class TestNewton {
 8 | 	public static void main(String args[]) {
 9 | 		NewtonMethod newton = new NewtonMethod(0, 0.00001, 100);
10 | 		System.out.println("基本牛顿法求解：" + newton.getNewtonMin());
11 |  
12 | 		GlobalNewtonMethod gNewton = new GlobalNewtonMethod(0, 0.55, 0.4,
13 | 				0.00001, 100);
14 | 		System.out.println("全局牛顿法求解：" + gNewton.getGlobalNewtonMin());
15 | 	}
16 | }


--------------------------------------------------------------------------------
/src/main/java/test/crf/TreeMapTest.java:
--------------------------------------------------------------------------------
 1 | package test.crf;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | 
 5 | import java.util.TreeMap;
 6 | 
 7 | /**
 8 |  * @Author unclewang
 9 |  * @Date 2018-11-27 14:31
10 |  */
11 | public class TreeMapTest {
12 |     @Test
13 |     public void test() {
14 |         TreeMap<Integer, String> treeMap = new TreeMap<>(); // diamond instead of the raw type: no unchecked conversion
15 |         treeMap.put(11, "safs");
16 |         treeMap.put(31, "safs");
17 |         treeMap.put(211, "safs");
18 |         treeMap.put(12, "safs");
19 |         System.out.println(treeMap); // a TreeMap prints its entries in ascending key order
20 |     }
21 | 
22 |     @Test
23 |     public void test1() {
24 |         // empty placeholder test
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/test/crf/TestTable.java:
--------------------------------------------------------------------------------
 1 | package test.crf;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | import segment.crf.XinTable;
 5 | 
 6 | 
 7 | public class TestTable {
 8 |     @Test
 9 |     public void test() {
10 |         XinTable table = new XinTable();
11 |         table.v = new String[][]{
12 |                 {"商", "?"},
13 |                 {"品", "?"},
14 |                 {"和", "?"},
15 |                 {"服", "?"},
16 |                 {"务", "?"},
17 |         };
18 |         System.out.println(table.get(9, 0));
19 |     }
20 | 
21 |     @Test
22 |     public void test1() {
23 |         System.out.println(-7 & 1);
24 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="CompilerConfiguration">
 4 |     <annotationProcessing>
 5 |       <profile default="true" name="Default" enabled="true" />
 6 |       <profile name="Maven default annotation processors profile" enabled="true">
 7 |         <sourceOutputDir name="target/generated-sources/annotations" />
 8 |         <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
 9 |         <outputRelativeToContentRoot value="true" />
10 |         <module name="xinlp" />
11 |       </profile>
12 |     </annotationProcessing>
13 |   </component>
14 | </project>


--------------------------------------------------------------------------------
/python/bilstmcrf/test/tfidftest.py:
--------------------------------------------------------------------------------
 1 | from sklearn.feature_extraction.text import CountVectorizer
 2 | from sklearn.feature_extraction.text import TfidfTransformer
 3 | from sklearn.feature_extraction.text import TfidfVectorizer
 4 | 
 5 | corpus = ["I come to China to travel",
 6 |           "This is a car polupar in China",
 7 |           "I love tea and Apple ",
 8 |           "The work is to write some papers in science"]
 9 | 
10 | vectorizer = CountVectorizer()
11 | 
12 | transformer = TfidfTransformer()
13 | tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
14 | print(tfidf)
15 | 
16 | tfidf2 = TfidfVectorizer()
17 | re = tfidf2.fit_transform(corpus)
18 | print(re)
19 | 


--------------------------------------------------------------------------------
/.idea/checkstyle-idea.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="CheckStyle-IDEA">
 4 |     <option name="configuration">
 5 |       <map>
 6 |         <entry key="checkstyle-version" value="8.14" />
 7 |         <entry key="copy-libs" value="false" />
 8 |         <entry key="location-0" value="BUNDLED:(bundled):Sun Checks" />
 9 |         <entry key="location-1" value="BUNDLED:(bundled):Google Checks" />
10 |         <entry key="scan-before-checkin" value="false" />
11 |         <entry key="scanscope" value="JavaOnly" />
12 |         <entry key="suppress-errors" value="false" />
13 |       </map>
14 |     </option>
15 |   </component>
16 | </project>


--------------------------------------------------------------------------------
/src/main/java/segment/crf/XinCRFConfig.java:
--------------------------------------------------------------------------------
 1 | package segment.crf;
 2 | 
 3 | import tools.PathUtils;
 4 | 
 5 | public class XinCRFConfig {
 6 |     public static String filePath;
 7 |     public static String modelPath;
 8 |     public static String binModelPath;
 9 |     // All three paths live under PathUtils.getDataPath(), which is returned without a trailing separator.
10 |     static {
11 |         filePath = PathUtils.getDataPath() + "/segment/crf/modelc1.5.txt";
12 |         modelPath = PathUtils.getDataPath() + "/segment/crf/xincrfc1.5.model";
13 |         binModelPath = PathUtils.getDataPath() + "/xincrf.model.bin"; // leading "/" was missing, which produced ".../dataxincrf.model.bin"
14 |     }
15 | 
16 |     // Accessors for the public static fields above.
17 |     public static String getFilePath() {
18 |         return filePath;
19 |     }
20 | 
21 |     public static String getModelPath() {
22 |         return modelPath;
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/segment/Segment.java:
--------------------------------------------------------------------------------
 1 | package segment;
 2 | 
 3 | import lucene.Atom;
 4 | 
 5 | import java.util.ArrayList;
 6 | import java.util.List;
 7 | 
 8 | public interface Segment {
 9 |     List<Atom> seg(String text);
10 |     // Wraps each string in an Atom, in order; offsets accumulate as if the strings were consecutive pieces of one text.
11 |     default List<Atom> strings2AtomList(String[] strings) {
12 |         List<Atom> atoms = new ArrayList<>();
13 |         int d = 0; // offset of the current piece = total length of the pieces before it
14 |         for (String s : strings) {
15 |             Atom atom = new Atom();
16 |             atom.setContent(s);
17 |             atom.setOffe(d);
18 |             atom.setLen(s.length());
19 |             atom.setChars(s.toCharArray());
20 |             d += s.length(); // advance past this piece
21 |             atoms.add(atom);
22 |         }
23 |         return atoms;
24 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/test/tornadotest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | 
 3 | from tornado.web import Application, RequestHandler
 4 | from tornado.ioloop import IOLoop
 5 | from tornado.httpserver import HTTPServer
 6 | 
 7 | 
 8 | class IndexHandler(RequestHandler):
 9 | 
10 |     def get(self):
11 |         # Read the parameter passed with a GET request (query string)
12 |         username = self.get_query_argument("username")
13 |         print(username)
14 | 
15 |     def post(self):
16 |         # Read the parameter passed with a POST request (request body)
17 |         username = self.get_body_argument("username")
18 |         print(username)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     app = Application([(r"/", IndexHandler)])
23 | 
24 |     app.listen(8000)
25 | 
26 |     IOLoop.current().start()
27 | 
28 | # When opening the page in a browser, the parameter must be supplied, e.g.:
29 | # 192.168.11.79:8000/?username=123
30 | 


--------------------------------------------------------------------------------
/src/main/java/lda/LdaGibbsSampling.java:
--------------------------------------------------------------------------------
 1 | package lda;
 2 | 
 3 | import lombok.extern.slf4j.Slf4j;
 4 | 
 5 | @Slf4j
 6 | public class LdaGibbsSampling {
 7 |     public static void main(String[] args) {
 8 |         Documents docs = new Documents();
 9 |         docs.readDocs(LDAConfig.DOCPATH);
10 |         log.info("文章数量：" + docs.getDocs().size());
11 |         log.info("单词数量：" + docs.getTermToIndexMap().size());
12 |         Parameter parameter = Parameter.create(LDAConfig.PARAMPATH);
13 |         LdaModel ldaModel = new LdaModel(parameter);
14 |         log.info("模型初始化中");
15 |         ldaModel.init(docs);
16 |         log.info("模型训练中");
17 |         ldaModel.inference(docs);
18 |         log.info("模型打印");
19 |         ldaModel.saveIteratedModel(100);
20 |         log.info("大功告成");
21 |         
22 |     }
23 |     
24 | }
25 | 


--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>xinlp</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 	</buildSpec>
19 | 	<natures>
20 | 		<nature>org.eclipse.jdt.core.javanature</nature>
21 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
22 | 	</natures>
23 | 	<filteredResources>
24 | 		<filter>
25 | 			<id>1605244666038</id>
26 | 			<name></name>
27 | 			<type>30</type>
28 | 			<matcher>
29 | 				<id>org.eclipse.core.resources.regexFilterMatcher</id>
30 | 				<arguments>node_modules|.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
31 | 			</matcher>
32 | 		</filter>
33 | 	</filteredResources>
34 | </projectDescription>
35 | 


--------------------------------------------------------------------------------
/src/main/java/test/lucene/rewriteTokenize/IKAnalyzer4Lucene7.java:
--------------------------------------------------------------------------------
 1 | package test.lucene.rewriteTokenize;
 2 | 
 3 | import org.apache.lucene.analysis.Analyzer;
 4 | 
 5 | /**
 6 |  * The API of Analyzer's createComponents method changed, so the analyzer has to be re-implemented.
 7 |  *
 8 |  * @author THINKPAD
 9 |  */
10 | public class IKAnalyzer4Lucene7 extends Analyzer {
11 | 
12 |     private boolean useSmart = false;
13 | 
14 |     public IKAnalyzer4Lucene7() {
15 |         this(false);
16 |     }
17 | 
18 |     public IKAnalyzer4Lucene7(boolean useSmart) {
19 |         super();
20 |         this.useSmart = useSmart;
21 |     }
22 | 
23 |     public boolean isUseSmart() {
24 |         return useSmart;
25 |     }
26 | 
27 |     public void setUseSmart(boolean useSmart) {
28 |         this.useSmart = useSmart;
29 |     }
30 | 
31 |     @Override
32 |     protected TokenStreamComponents createComponents(String fieldName) {
33 |         IKTokenizer4Lucene7 tk = new IKTokenizer4Lucene7(this.useSmart);
34 |         return new TokenStreamComponents(tk);
35 |     }
36 | 
37 | }


--------------------------------------------------------------------------------
/src/main/java/tools/PathUtils.java:
--------------------------------------------------------------------------------
 1 | package tools;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | 
 5 | import java.io.File;
 6 | 
 7 | public class PathUtils {
 8 |     private static String rootPath = null; // the user's home directory (system property user.home)
 9 |     private static String xinlpPath = null; // <home>/.xinlp
10 |     private static String dataPath = null; // <home>/.xinlp/data
11 |     // Runs once on class load: resolves the three paths and creates the two directories if they are missing.
12 |     static {
13 |         rootPath = System.getProperty("user.home");
14 |         xinlpPath = rootPath + "/.xinlp";
15 |         File file = new File(xinlpPath);
16 |         if (!file.exists()) {
17 |             file.mkdir(); // return value is not checked, so a failed creation goes unnoticed here
18 |         }
19 |         dataPath = xinlpPath + "/data";
20 |         file = new File(dataPath);
21 |         if (!file.exists()) {
22 |             file.mkdir(); // same as above: result ignored
23 |         }
24 |     }
25 | 
26 |     public static String getRootPath() {
27 |         return rootPath;
28 |     }
29 | 
30 |     public static String getXinlpPath() {
31 |         return xinlpPath;
32 |     }
33 | 
34 |     public static String getDataPath() {
35 |         return dataPath; // no trailing separator; callers must prepend "/" to what they append
36 |     }
37 |     // Prints the resolved data directory.
38 |     @Test
39 |     public void test() {
40 |         System.out.println(dataPath);
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/java/lucene/XinAnalyzerApp.java:
--------------------------------------------------------------------------------
 1 | package lucene;
 2 | 
 3 | import org.apache.lucene.analysis.Analyzer;
 4 | import org.apache.lucene.analysis.TokenStream;
 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 6 | import org.junit.jupiter.api.Test;
 7 | 
 8 | import java.io.IOException;
 9 | 
10 | public class XinAnalyzerApp {
11 |     /**
12 |      * Tokenizes a sample sentence with the HMM_XIN analyzer and prints one term per line.
13 |      * Follows the TokenStream consumer workflow: reset(), incrementToken() until false, end(), close().
14 |      */
15 |     @Test
16 |     public void test() {
17 |         Analyzer analyzer = new XinAnalyzer(XinAnalyzer.TYPE.HMM_XIN);
18 |         String text = "今天天气很不错/今天可以出去玩/你喜欢什么颜色";
19 |         // One try block around the whole consume loop: an IOException now stops the loop instead of being
20 |         // swallowed inside it (which could spin forever), and try-with-resources always closes the stream.
21 |         try (TokenStream tokenStream = analyzer.tokenStream("content", text)) {
22 |             CharTermAttribute attribute = tokenStream.addAttribute(CharTermAttribute.class);
23 |             tokenStream.reset();
24 |             while (tokenStream.incrementToken()) {
25 |                 System.out.println(attribute.toString());
26 |             }
27 |             // end() lets the stream perform its end-of-stream work before it is closed
28 |             tokenStream.end();
29 |         } catch (IOException e) {
30 |             e.printStackTrace();
31 |         }
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/test/dl4j/DiagTest.java:
--------------------------------------------------------------------------------
 1 | package test.dl4j;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | import org.nd4j.linalg.api.ndarray.INDArray;
 5 | import org.nd4j.linalg.factory.Nd4j;
 6 | 
 7 | public class DiagTest {
 8 |     @Test
 9 |     public void test() {
10 |         INDArray A = Nd4j.create(new double[]{1, 2, 3});
11 | 
12 |         System.out.println(A.shapeInfoToString());
13 |         INDArray B = Nd4j.diag(A);
14 |         System.out.println(B.shapeInfoToString());
15 |         System.out.println(B);
16 | 
17 |         System.err.println(B.getDouble(0, 0));
18 |         INDArray Si = Nd4j.zeros(A.shape());
19 |         System.out.println(Si.shapeInfoToString());
20 |     }
21 | 
22 |     @Test
23 |     public void test1() {
24 |         INDArray A = Nd4j.create(new double[]{1, 2, 3, 4, 5});
25 |         double[] doubles = A.toDoubleVector();
26 |         for (int i = 0; i < doubles.length; i++) {
27 |             if (i <= 2) {
28 |                 doubles[i] = doubles[i] * doubles[i];
29 |             } else {
30 |                 doubles[i] = 0;
31 |             }
32 |         }
33 |         INDArray Si = Nd4j.diag(Nd4j.create(doubles));
34 |         INDArray S1 = Si.getColumns(0, 1, 2);
35 | 
36 |         System.out.println(Si);
37 |         System.out.println(S1);
38 |     }
39 | 
40 |     @Test
41 |     public void test111() {
42 |         System.out.println(Math.pow(3, 2));
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/segment/crf/tcp/XinCRFSegmentClient.java:
--------------------------------------------------------------------------------
 1 | package segment.crf.tcp;
 2 | 
 3 | import com.alibaba.fastjson.JSONObject;
 4 | import lucene.Atom;
 5 | import org.junit.jupiter.api.Test;
 6 | import segment.Segment;
 7 | 
 8 | import java.io.BufferedReader;
 9 | import java.io.IOException;
10 | import java.io.InputStreamReader;
11 | import java.io.PrintStream;
12 | import java.net.Socket;
13 | import java.util.List;
14 | 
15 | /**
16 |  * @Author unclewang
17 |  * @Date 2018-12-12 14:24
18 |  */
19 | public class XinCRFSegmentClient implements Segment {
20 |     /**
21 |      * Sends {@code text} to the server listening on localhost:9428 and parses its one-line JSON reply.
22 |      * @return the parsed atoms, or null when the exchange fails (the IOException is only printed)
23 |      */
24 |     @Override
25 |     public List<Atom> seg(String text) {
26 |         List<Atom> list = null;
27 |         // try-with-resources: closing the socket also closes both of its streams, even when a call below throws
28 |         try (Socket socket = new Socket("localhost", 9428)) {
29 |             PrintStream ps = new PrintStream(socket.getOutputStream());
30 |             ps.println(text);
31 |             ps.flush();
32 |             socket.shutdownOutput(); // half-close: marks the end of the request for the peer
33 |             BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream()));
34 |             list = JSONObject.parseArray(br.readLine(), Atom.class);
35 |         } catch (IOException e) {
36 |             e.printStackTrace();
37 |         }
38 |         return list;
39 |     }
40 | 
41 |     @Test
42 |     public void test() {
43 |         System.out.println(seg("你好"));
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/test/lucene/EveryThingTest.java:
--------------------------------------------------------------------------------
 1 | package test.lucene;
 2 | 
 3 | import lombok.Data;
 4 | import org.ansj.domain.Term;
 5 | import org.ansj.splitWord.analysis.NlpAnalysis;
 6 | import org.junit.jupiter.api.Test;
 7 | 
 8 | import java.util.HashMap;
 9 | import java.util.List;
10 | 
11 | /**
12 |  * @Author unclewang
13 |  * @Date 2018-12-11 12:15
14 |  */
15 | public class EveryThingTest {
16 |     @Test
17 |     public void test() {
18 |         List<Term> terms = NlpAnalysis.parse(
19 |                 "本文对半导体氧化物气体敏感材料的电导振荡特性加以研究分析，通过试验与理论分析得出气敏电导振荡的必要条件，并对电导振荡型气体敏感元件的原理、工艺技术和结构等进行分析说明。同时，对半导体氧化物气体敏感材料的常温气体敏感特性进行归纳总结，指出其优缺点和需要解决的问题。"
20 |         ).getTerms();
21 |         for (Term term : terms) {
22 |             String word = term.getName(); //拿到词
23 |             String natureStr = term.getNatureStr(); //拿到词性
24 |             //if (expectedNature.contains(natureStr)) {
25 |             System.out.print(word + " ");
26 |             //}
27 |         }
28 |     }
29 | 
30 |     @Test
31 |     public void testHashMap() {
32 |         HashMap<String, String> stringStringHashMap = new HashMap<>();
33 |         stringStringHashMap.put("1", "sadas");
34 |         stringStringHashMap.put("1", "sas");
35 |         System.out.println(stringStringHashMap.get("1"));
36 |         stringStringHashMap.remove("1");
37 |         System.out.println(stringStringHashMap.get("1"));
38 |         System.out.println(stringStringHashMap.containsKey("1"));
39 |     }
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/src/main/java/segment/crf/XinTable.java:
--------------------------------------------------------------------------------
 1 | package segment.crf;
 2 | 
 3 | /**
 4 |  * @Author unclewang
 5 |  * @Date 2018-11-27 19:39
 6 |  * 句子转换成二维数组，方便存token
 7 |  */
 8 | public class XinTable {
 9 |     public String[][] v;
10 |     static final String HEAD = "_B";
11 | 
12 | 
13 |     /**
14 |      * 获取表中某一个元素
15 |      *
16 |      * @param x
17 |      * @param y
18 |      * @return
19 |      */
20 |     public String get(int x, int y) {
21 |         if (x < 0) {
22 |             return HEAD + x;
23 |         }
24 |         if (x >= v.length) {
25 |             return HEAD + "+" + (x - v.length + 1);
26 |         }
27 | 
28 |         return v[x][y];
29 |     }
30 | 
31 |     public void setLast(int x, String t) {
32 |         v[x][v[x].length - 1] = t;
33 |     }
34 | 
35 |     public int size() {
36 |         return v.length;
37 |     }
38 | 
39 |     public String[][] getV() {
40 |         return v;
41 |     }
42 | 
43 |     public void setV(String[][] v) {
44 |         this.v = v;
45 |     }
46 | 
47 |     @Override
48 |     public String toString() {
49 |         if (v == null) {
50 |             return "null";
51 |         }
52 |         final StringBuilder sb = new StringBuilder(v.length * v[0].length * 2);
53 |         for (String[] line : v) {
54 |             for (String element : line) {
55 |                 sb.append(element).append('\t');
56 |             }
57 |             sb.append('\n');
58 |         }
59 |         return sb.toString();
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/test/socket/SocketTest.java:
--------------------------------------------------------------------------------
 1 | package test.socket;
 2 | 
 3 | import lombok.extern.slf4j.Slf4j;
 4 | import org.junit.jupiter.api.Test;
 5 | 
 6 | import java.io.BufferedReader;
 7 | import java.io.IOException;
 8 | import java.io.InputStreamReader;
 9 | import java.io.PrintStream;
10 | import java.net.ServerSocket;
11 | import java.net.Socket;
12 | 
13 | @Slf4j
14 | public class SocketTest {
15 | 
16 |     @Test
17 |     public void simpleClient() throws IOException {
18 |         Socket socket = new Socket("localhost", 10000);
19 |         PrintStream ps = new PrintStream(socket.getOutputStream());
20 |         ps.println("你好吗");
21 |         BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream()));
22 |         String line = br.readLine();
23 |         System.out.println("来自服务器：" + line);
24 |         ps.close();
25 |         br.close();
26 |         socket.close();
27 |     }
28 | 
29 |     @Test
30 |     public void simpleServer() throws IOException {
31 |         ServerSocket serverSocket = new ServerSocket(10000);
32 |         while (true) {
33 |             Socket s = serverSocket.accept();
34 |             BufferedReader br = new BufferedReader(new InputStreamReader(s.getInputStream()));
35 |             System.out.println("来自客户端：" + br.readLine());
36 |             log.info("来自客户端访问:" + s.getInetAddress());
37 |             PrintStream ps = new PrintStream(s.getOutputStream());
38 |             ps.println("零零落落");
39 |             ps.close();
40 |             s.close();
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/test/hmm/baumwelch/SegmentationUtils.java:
--------------------------------------------------------------------------------
 1 | package test.hmm.baumwelch;
 2 |  
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | public class SegmentationUtils {
 7 | 	/**
 8 | 	 * 将字符串数组的每一个字符串中的字符直接转换为Unicode码
 9 | 	 * @param strs 字符串数组
10 | 	 * @return Unicode值
11 | 	 */
12 | 	public static List<int[]> strs2int(String[] strs) {
13 | 		List<int[]> res = new ArrayList<>(strs.length);
14 | 		for(int i = 0; i < strs.length;i++) {
15 | 			int[] O = new int[strs[i].length()];
16 | 			for(int j = 0; j < strs[i].length();j++) {
17 | 				O[j] = strs[i].charAt(j);
18 | 			}
19 | 			res.add(O);
20 | 		}
21 | 		return res;
22 | 	}
23 | 	
24 | 	public static int[] str2int(String str) {
25 | 		return strs2int(new String[] {str}).get(0);
26 | 	}
27 | 	/**
28 | 	 * 根据预测结果解码
29 | 	 * BEMS 0123
30 | 	 * @param predict 预测结果
31 | 	 * @param sentence 句子
32 | 	 * @return
33 | 	 */
34 | 	public static String[] decode(int[] predict, String sentence) {
35 | 		List<String> res = new ArrayList<>();
36 | 		char[] chars = sentence.toCharArray();
37 | 		for(int i = 0; i < predict.length;i++) {
38 | 			if(predict[i] == 0 || predict[i] == 1) {
39 | 				int a = i;
40 | 				while(predict[i] != 2) {
41 | 					i++;
42 | 					if(i == predict.length) {
43 | 						break;
44 | 					}
45 | 				}
46 | 				int b = i;
47 | 				if(b == predict.length) {
48 | 					b--;
49 | 				}
50 | 				res.add(new String(chars,a,b-a+1));
51 | 			} else {
52 | 				res.add(new String(chars,i,1));
53 | 			}
54 | 		}
55 | 		String[] s = new String[res.size()];
56 | 		return res.toArray(s);
57 | 	}
58 | }
59 | 
60 | 


--------------------------------------------------------------------------------
/src/main/java/lucene/CharType.java:
--------------------------------------------------------------------------------
 1 | package lucene;
 2 | 
 3 | import tools.PathUtils;
 4 | 
 5 | import java.io.FileInputStream;
 6 | import java.io.IOException;
 7 | import java.io.ObjectInputStream;
 8 | import java.io.Serializable;
 9 | 
10 | public class CharType implements Serializable {
11 |     /**
12 |      * 单字节
13 |      */
14 |     public static final byte CT_SINGLE = 0;
15 | 
16 |     /**
17 |      * 分隔符"!,.?()[]{}+= 各种稀奇古怪的符号
18 |      */
19 |     public static final byte CT_DELIMITER = 1;
20 | 
21 |     /**
22 |      * 中文字符
23 |      */
24 |     public static final byte CT_CHINESE = 2;
25 | 
26 |     /**
27 |      * 字母
28 |      */
29 |     public static final byte CT_LETTER = 3;
30 | 
31 |     /**
32 |      * 数字
33 |      */
34 |     public static final byte CT_NUM = 4;
35 | 
36 |     /**
37 |      * 序号
38 |      */
39 |     public static final byte CT_INDEX = 5;
40 | 
41 |     /**
42 |      * 中文数字
43 |      */
44 |     public static final byte CT_CNUM = 6;
45 | 
46 |     /**
47 |      * 其他
48 |      */
49 |     public static final byte CT_OTHER = 12;
50 |     private static byte[] charType;
51 | 
52 |     static {
53 |         charType = new byte[65536];
54 |         try {
55 |             ObjectInputStream ois = new ObjectInputStream(new FileInputStream(PathUtils.getDataPath() + "/chartype/chartype.bin"));
56 |             ois.read(charType);
57 |             ois.close();
58 |         } catch (IOException e) {
59 |             e.printStackTrace();
60 |         }
61 |     }
62 | 
63 |     public static byte get(char c) {
64 |         return charType[(int) c];
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/test/crf/DatTest.java:
--------------------------------------------------------------------------------
 1 | package test.crf;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | import segment.crf.DoubleArrayTrie;
 5 | 
 6 | import java.util.ArrayList;
 7 | import java.util.HashSet;
 8 | import java.util.List;
 9 | import java.util.Set;
10 | 
11 | /**
12 |  * @Author unclewang
13 |  * @Date 2018-12-10 15:21
14 |  */
15 | public class DatTest {
16 |     @Test
17 |     public void test() {
18 |         List<String> strings = new ArrayList<>();
19 |         strings.add("一举一动");
20 |         strings.add("一举成名");
21 |         strings.add("一举成名天下知");
22 |         strings.add("万能");
23 |         strings.add("万能胶");
24 |         Set<Character> charset = new HashSet<Character>();
25 |         for (String s : strings) {
26 |             for (Character c : s.toCharArray()) {
27 |                 charset.add(c);
28 |             }
29 |         }
30 |         String infoCharsetValue = "";
31 |         String infoCharsetCode = "";
32 |         for (Character c : charset) {
33 |             infoCharsetValue += c + "\t\t";
34 |             infoCharsetCode += (int) c + "\t";
35 |         }
36 |         infoCharsetValue += '\n';
37 |         infoCharsetCode += '\n';
38 |         System.out.print(infoCharsetValue);
39 |         System.out.print(infoCharsetCode);
40 | 
41 |         DoubleArrayTrie dat = new DoubleArrayTrie();
42 |         dat.build(strings);
43 |         int i = dat.exactMatchSearch("一举成名天下知");
44 |         System.out.println(i);
45 |         System.out.println(strings.get(i));
46 |         List<Integer> integerList = dat.commonPrefixSearch("一举成名天下知");
47 |         for (int index : integerList) {
48 |             System.out.println(strings.get(index));
49 |         }
50 | 
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # xinlp
 2 | 学习《统计学习方法》，从第八章的EM算法到第十一章的CRF都基本实现了一遍，还结合现在深度学习热潮，实现了Bi-LSTM+CRF分词
 3 | 
 4 | ## 2019.03.21
 5 | 实现了一个简单的LDA模型，Gibbs采样迭代更新
 6 | 
 7 | ## EM和GMM
 8 | 先是学习了EM算法，实现了GMM高斯混合模型 \
 9 | 高斯混合模型和kmeans很像，亲身测试男女身高这种事情GMM很难训练出来的 
10 | 
11 | ### 相关博客
12 | https://www.unclewang.info/learn/machine-learning/730/ \
13 | https://www.unclewang.info/learn/machine-learning/735/
14 | 
15 | 
16 | ## 自己实现HMM分词
17 | HMM 盒子与球问题 三种问题（概率，学习，预测）都实现了 \
18 | 主要思想就是参数训练好的情况下（jieba分词的参数），viterbi算法实现就好。 \
19 | HMM参数使用的python jieba分词的参数 \
20 | 也尝试用Baum-Welch算法进行参数训练学习，发现效果贼差。。。。
21 | ### 相关博客
22 | https://www.unclewang.info/learn/machine-learning/745/ \
23 | https://www.unclewang.info/learn/machine-learning/749/
24 | 
25 | ## 自己实现CRF分词
26 | CRF参照了Ansj和Hanlp两个的写法。 \
27 | CRF参数来自于CRF++训练得到，利用训练的参数进行分词 \
28 | CRF 人工定义特征函数太费劲了，其实就是特征工程，参数学习要用的方法也没实现。其实就是特征函数难定义。使用viterbi算法进行分词，参数学习借助CRF++，
29 | 概率计算和hmm类似，没有实现。
30 | ### 相关博客
31 | https://www.unclewang.info/learn/machine-learning/753/
32 | 
33 | ## 自己实现Bi-LSTM+CRF分词
34 | 实现的有两个版本： \
35 | ugly版本是第一遍直接实现的，因为以前也没怎么好好写过python，所以就随便命名、结构也很乱，做的时候不知道的东西就百度+bing去搜，反正遇山修路，过河修桥那样的实现的....,不过代码很精简，没有任何封装，看起来其实很流畅 \
36 | 非ugly版本是从github上找了一个很厉害的项目[guillaumegenthial/sequence_tagging](https://github.com/guillaumegenthial/sequence_tagging),仿照这种python代码完整度非常高的项目去重新写了一遍代码（有很多地方直接抄的😊），代码很清晰，几个文件各司其职，也算没有辜负python（一个面向对象的动态解释型强类型语言）
37 | 
38 | ### 相关博客
39 | https://www.unclewang.info/learn/machine-learning/756/
40 | 
41 | ## 自己实现一个支持lucene的分词器——XinAnalyzer
42 | 用lucene的时候，看见了一个叫SmartChineseAnalyzer的支持中文分词，效果不咋的，发现竟然用的HMM分词，当时一句"我的天"，于是就想自己也写一个。。。 \
43 | 2018.12.11  自己的HMM分词器已经支持了 \
44 | 2018.12.13  支持CRF分词（tcp通信），支持BiLSTM+CRF分词（http通信）
45 | ### 相关博客
46 | https://www.unclewang.info/learn/java/760/
47 | ## 使用到的各种数据
48 | 链接:https://pan.baidu.com/s/1toe-0h4k9Ck_yGs-RwMqAA  密码:sn7o
49 | 


--------------------------------------------------------------------------------
/src/main/java/mining/tfidf/Vocabulary.java:
--------------------------------------------------------------------------------
 1 | package mining.tfidf;
 2 | 
 3 | import com.google.common.collect.BiMap;
 4 | import com.google.common.collect.HashBiMap;
 5 | import com.google.common.collect.HashMultimap;
 6 | import com.google.common.collect.Multimap;
 7 | import com.google.common.io.Files;
 8 | import mining.config.Config;
 9 | import org.junit.jupiter.api.Test;
10 | 
11 | import java.io.File;
12 | import java.io.IOException;
13 | import java.nio.charset.Charset;
14 | import java.util.List;
15 | 
16 | public class Vocabulary {
17 |     private static BiMap<String, Integer> wordIds = HashBiMap.create();
18 |     private static Multimap<String, String> stemWords = HashMultimap.create();
19 |     private static int wordSize = 0;
20 | 
21 |     static {
22 |         try {
23 |             List<String> words = Files.readLines(new File(Config.getVocabularyPath()), Charset.defaultCharset());
24 |             int id = 0;
25 |             for (String s : words) {
26 |                 String[] split = s.split("\t");
27 |                 if (!wordIds.containsKey(split[2])) {
28 |                     wordIds.put(split[2], id++);
29 |                 }
30 |                 stemWords.put(split[2], split[1]);
31 |             }
32 |             wordSize = wordIds.size();
33 |         } catch (IOException e) {
34 |             e.printStackTrace();
35 |         }
36 |     }
37 | 
38 |     public static BiMap<String, Integer> getWordIds() {
39 |         return wordIds;
40 |     }
41 | 
42 |     public static Multimap<String, String> getStemWords() {
43 |         return stemWords;
44 |     }
45 | 
46 |     public static int getWordSize() {
47 |         return wordSize;
48 |     }
49 | 
50 |     @Test
51 |     public void test() {
52 |         System.out.println(wordIds.inverse().get(0));
53 |         System.out.println(wordIds.inverse().get(2477));
54 |         System.out.println(stemWords.keySet().size());
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
<configuration scan="true">
    <!-- Alternative (disabled): store the logs in the mobileLog folder under Tomcat's logs directory; logback creates the folder automatically, so with this setting log files can be written
        <substitutionProperty name="logbase" value="${catalina.base}/logs/mobileLog/"
        /> -->
    <property name="log.base" value="${user.dir}/logs/"/>
    <!-- Needed for configuring the output file -->
    <jmxConfigurator/>
    <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>%date [%thread] %-5level %logger{80} - %msg%n</pattern>
        </encoder>
    </appender>
    <!-- File log output (intended to roll by file size: back the file up once it exceeds the configured size) -->
    <!-- NOTE(review): a TimeBasedRollingPolicy is combined with a separate SizeBasedTriggeringPolicy below; verify against the logback documentation that the size trigger actually takes effect in this combination -->
    <appender name="logfile"
              class="ch.qos.logback.core.rolling.RollingFileAppender">
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <FileNamePattern>${log.base}%d{yyyy-MM-dd}.log.html.zip
            </FileNamePattern>

            <!-- keep 30 days' worth of history capped at 3GB total size -->
            <maxHistory>30</maxHistory>
            <totalSizeCap>3GB</totalSizeCap>
        </rollingPolicy>
        <triggeringPolicy
                class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
            <MaxFileSize>5MB</MaxFileSize>
        </triggeringPolicy>
        <encoder class="ch.qos.logback.core.encoder.LayoutWrappingEncoder">
            <layout class="ch.qos.logback.classic.html.HTMLLayout">
                <pattern>%date%level%thread%10logger%file%line%msg</pattern>
            </layout>
        </encoder>
    </appender>


    <root>
        <level value="info"/>
        <appender-ref ref="stdout"/>
        <appender-ref ref="logfile"/>
    </root>

</configuration>


--------------------------------------------------------------------------------
/src/main/java/lucene/XinAnalyzer.java:
--------------------------------------------------------------------------------
 1 | package lucene;
 2 | 
 3 | import lombok.extern.java.Log;
 4 | import org.apache.lucene.analysis.Analyzer;
 5 | import org.apache.lucene.analysis.Tokenizer;
 6 | import segment.Segment;
 7 | import segment.bilstmcrf.BLCSegment;
 8 | import segment.crf.tcp.XinCRFSegmentClient;
 9 | import segment.hmm.XinHmmSegment;
10 | 
11 | import java.util.HashMap;
12 | import java.util.Map;
13 | 
14 | /**
15 |  * @Author unclewang
16 |  * @Date 2018-12-11 11:17
17 |  */
18 | @Log
19 | public class XinAnalyzer extends Analyzer {
20 |     public static enum TYPE {
21 |         HMM_XIN,
22 |         CRF_XIN,
23 |         BILSTMCRF_XIN
24 |     }
25 | 
26 |     /**
27 |      * 分词类型
28 |      */
29 |     private Map<String, String> args;
30 | 
31 |     public XinAnalyzer(TYPE type) {
32 |         this.args = new HashMap<>();
33 |         args.put("type", type.name());
34 |     }
35 | 
36 |     @Override
37 |     protected TokenStreamComponents createComponents(String fieldName) {
38 |         Tokenizer tokenizer = getTokenizer(this.args);
39 |         return new TokenStreamComponents(tokenizer);
40 |     }
41 | 
42 | 
43 |     private Tokenizer getTokenizer(Map<String, String> args) {
44 |         log.info("to create tokenizer " + args);
45 |         String type = args.get("type");
46 |         if (type == null) {
47 |             type = TYPE.HMM_XIN.name();
48 |         }
49 | 
50 |         Segment segment = null;
51 | 
52 |         switch (TYPE.valueOf(type)) {
53 |             case CRF_XIN:
54 |                 segment = new XinCRFSegmentClient();
55 |                 break;
56 |             case HMM_XIN:
57 |                 segment = new XinHmmSegment();
58 |                 break;
59 |             case BILSTMCRF_XIN:
60 |                 segment = new BLCSegment();
61 |                 break;
62 |             default:
63 |                 break;
64 |         }
65 |         return new XinTokenizer(segment);
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/resources/lda/doc/3:
--------------------------------------------------------------------------------
 1 | 北京	举行	新年	音乐会
 2 | 
 3 | 江泽民	李鹏	乔石	朱镕基	李瑞环	刘华清	尉健行	李岚清	与	万	名	首都	各界	群众	和	劳动模范	代表	一起	辞旧迎新	（	附	图片	1	张	）
 4 | 
 5 | 党	和	国家	领导人	江泽民	、	李鹏	、	乔石	、	朱镕基	、	李瑞环	、	刘华清	、	尉健行	、	李岚清	等	与	万	名	首都	各界	群众	和	劳动模范	代表	一起	欣赏	了	’	98	北京	新年	音乐会	的	精彩	节目	。
 6 | 这	是	江泽民	等	在	演出	结束	后	同	演出	人员	合影	。
 7 | 
 8 | （	新华社	记者	樊如钧	摄	）
 9 | 
10 | 本报	北京	12月	31日	讯	新华社	记者	陈雁	、	本报	记者	何加正	报道	：
11 | 在	度过	了	非凡	而	辉煌	的	1997年	，
12 | 迈向	充满	希望	的	1998年	之际	，
13 | ’	98	北京	新年	音乐会	今晚	在	人民	大会堂	举行	。
14 | 党	和	国家	领导人	江泽民	、	李鹏	、	乔石	、	朱镕基	、	李瑞环	、	刘华清	、	尉健行	、	李岚清	与	万	名	首都	各界	群众	和	劳动模范	代表	一起	，
15 | 在	激昂	奋进	的	音乐声	中	辞旧迎新	。
16 | 
17 | 今晚	的	长安街	流光溢彩	，
18 | 火树银花	；
19 | 人民	大会堂	里	灯火辉煌	，
20 | 充满	欢乐	祥和	的	喜庆	气氛	。
21 | 在	这场	由	中共	北京	市委	宣传部	、	市	政府	办公厅	等	单位	主办	的	题	为	“	世纪	携手	、	共	奏	华章	”	的	新年	音乐会	上	，
22 | 中国	三	个	著名	交响乐团	———	中国	交响乐团	、	上海	交响乐团	、	北京	交响乐团	首	次	联袂	演出	。
23 | 著名	指挥家	陈佐湟	、	陈燮阳	、	谭利华	分别	指挥	演奏	了	一	批	中外	名曲	，
24 | 京	沪	两地	200	多	位	音乐家	组成	的	大型	乐队	以	饱满	的	激情	和	精湛	的	技艺	为	观众	奉献	了	一	台	高	水准	的	交响音乐会	。
25 | 
26 | 音乐会	在	雄壮	的	管弦乐	《	红旗	颂	》	中	拉开	帷幕	，
27 | 舒展	、	优美	的	乐曲声	使	人们	仿佛	看到	：
28 | 五星红旗	在	天安门	城楼	上	冉冉	升起	；
29 | 仿佛	听到	：
30 | 在	红旗	的	指引	下	中国	人民	向	现代化	新	征程	迈进	的	脚步声	。
31 | 钢琴	与	管弦乐队	作品	《	东方	之	珠	》	，
32 | 把	广大	听众	耳熟能详	的	歌曲	改编	为	器乐曲	，
33 | 以	其	优美	感人	的	旋律	抒发	了	洗雪	百年	耻辱	的	香港	明天	会	更	好	的	情感	。
34 | 专程	回国	参加	音乐会	的	著名	女高音	歌唱家	迪里拜尔	演唱	的	《	春	之	声	》	，
35 | 把	人们	带	到	了	万象更新	的	田野	和	山谷	；
36 | 享誉	国际	乐坛	的	男高音	歌唱家	莫华伦	演唱	了	著名	歌剧	《	图兰朵	》	选段	“	今夜	无	人	入睡	”	，
37 | 把	人们	带入	迷人	的	艺术	境地	。
38 | 音乐会	上	还	演奏	了	小提琴	协奏曲	《	梁山伯	与	祝英台	》	、	柴可夫斯基	的	《	第四	交响曲	———	第四	乐章	》	、	交响诗	《	罗马	的	松树	》	等	中外	著名	交响曲	。
39 | 
40 | 万	人	大会堂	今晚	座无虚席	，
41 | 观众	被	艺术家	们	精湛	的	表演	深深	打动	，
42 | 不断	报	以	经久不息	的	热烈	掌声	。
43 | 艺术家	们	频频	谢幕	，
44 | 指挥家	依次	指挥	演出	返	场	曲目	，
45 | 最后	音乐会	在	《	红色	娘子军	》	选曲	、	《	白毛女	》	选曲	、	《	北京	喜讯	到	边寨	》	等	乐曲声	中	达到	高潮	。
46 | 
47 | 演出	结束	后	，
48 | 江泽民	等	党	和	国家	领导人	走	上	舞台	，
49 | 亲切	会见	了	参加	演出	的	全体	人员	，
50 | 祝贺	演出	成功	，
51 | 并	与	他们	合影	留念	。
52 | 
53 | 李铁映	、	贾庆林	、	曾庆红	等	领导	同志	也	出席	了	今晚	音乐会	。
54 | 
55 | 
56 | 李鹏	在	北京	考察	企业
57 | 
58 | 向	广大	职工	祝贺	新年	，
59 | 对	节日	坚守	岗位	的	同志	们	表示	慰问
60 | 


--------------------------------------------------------------------------------
/src/main/java/test/em/EmTest.java:
--------------------------------------------------------------------------------
 1 | package test.em;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | 
 5 | /**
 6 |  * @author unclewang
 7 |  */
 8 | public class EmTest {
 9 |     @Test
10 |     public void test() {
11 |         //每个硬币初始一次为正的概率
12 |         double[] yita = m(0.2, 0.5);
13 |         for (int i = 0; i < 100; i++) {
14 |             yita = m(yita);
15 |             System.out.println(yita[0] + "\t" + yita[1]);
16 |         }
17 |     }
18 | 
19 |     public double[] m(double... yita) {
20 |         int[] nums = {5, 5, 9, 1, 8, 2, 4, 6, 7, 3};
21 |         double[] e = new double[5];
22 |         double[] m = new double[5];
23 |         double[] m_ = new double[5];
24 |         double[] n = new double[5];
25 |         double[] n_ = new double[5];
26 |         for (int i = 0; i < e.length; i++) {
27 |             //e步
28 |             e[i] = e(yita[0], nums[i * 2], yita[1]);
29 |             m[i] = e[i] * nums[2 * i];
30 |             m_[i] = e[i] * nums[2 * i + 1];
31 |             n[i] = (1 - e[i]) * nums[2 * i];
32 |             n_[i] = (1 - e[i]) * nums[2 * i + 1];
33 |         }
34 |         double yita1 = sum(m) / (sum(m) + sum(m_));
35 |         double yita2 = sum(n) / (sum(n) + sum(n_));
36 |         System.out.println("开始迭代");
37 |         print(e);
38 |         print(m);
39 |         print(m_);
40 |         print(n);
41 |         print(n_);
42 |         return new double[]{yita1, yita2};
43 |     }
44 | 
45 |     public void print(double[] nums) {
46 |         for (double a : nums) {
47 |             System.out.print(a + "\t");
48 |         }
49 |         System.out.println();
50 |     }
51 | 
52 |     public double sum(double[] nums) {
53 |         double sum = 0;
54 |         for (double a : nums) {
55 |             sum += a;
56 |         }
57 |         return sum;
58 |     }
59 | 
60 |     public double e(double a, double b, double c) {
61 |         double e1 = Math.pow(a, b) * Math.pow(1 - a, 10 - b);
62 |         double e2 = Math.pow(c, 10 - b) * Math.pow(1 - c, b);
63 |         return e1 / (e1 + e2);
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ExternalStorageConfigurationManager" enabled="true" />
 4 |   <component name="JavaScriptSettings">
 5 |     <option name="languageLevel" value="ES6" />
 6 |   </component>
 7 |   <component name="Kotlin2JsCompilerArguments">
 8 |     <option name="sourceMapEmbedSources" />
 9 |     <option name="sourceMapPrefix" />
10 |   </component>
11 |   <component name="MavenProjectsManager">
12 |     <option name="originalFiles">
13 |       <list>
14 |         <option value="$PROJECT_DIR$/pom.xml" />
15 |       </list>
16 |     </option>
17 |   </component>
18 |   <component name="NodePackageJsonFileManager">
19 |     <packageJsonPaths />
20 |   </component>
21 |   <component name="ProjectInspectionProfilesVisibleTreeState">
22 |     <entry key="Project Default">
23 |       <profile-state>
24 |         <expanded-state>
25 |           <State />
26 |           <State>
27 |             <id>AOP</id>
28 |           </State>
29 |           <State>
30 |             <id>Android</id>
31 |           </State>
32 |           <State>
33 |             <id>Code style issuesJava</id>
34 |           </State>
35 |           <State>
36 |             <id>CorrectnessLintAndroid</id>
37 |           </State>
38 |           <State>
39 |             <id>Java</id>
40 |           </State>
41 |           <State>
42 |             <id>LintAndroid</id>
43 |           </State>
44 |           <State>
45 |             <id>MessagesCorrectnessLintAndroid</id>
46 |           </State>
47 |           <State>
48 |             <id>SecurityLintAndroid</id>
49 |           </State>
50 |         </expanded-state>
51 |       </profile-state>
52 |     </entry>
53 |   </component>
54 |   <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8.161" project-jdk-type="JavaSDK">
55 |     <output url="file://$PROJECT_DIR$/out" />
56 |   </component>
57 |   <component name="PythonCompatibilityInspectionAdvertiser">
58 |     <option name="version" value="3" />
59 |   </component>
60 | </project>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
 1 | <component name="InspectionProjectProfileManager">
 2 |   <profile version="1.0">
 3 |     <option name="myName" value="Project Default" />
 4 |     <inspection_tool class="JavaDoc" enabled="true" level="WARNING" enabled_by_default="true">
 5 |       <option name="TOP_LEVEL_CLASS_OPTIONS">
 6 |         <value>
 7 |           <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
 8 |           <option name="REQUIRED_TAGS" value="" />
 9 |         </value>
10 |       </option>
11 |       <option name="INNER_CLASS_OPTIONS">
12 |         <value>
13 |           <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
14 |           <option name="REQUIRED_TAGS" value="" />
15 |         </value>
16 |       </option>
17 |       <option name="METHOD_OPTIONS">
18 |         <value>
19 |           <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
20 |           <option name="REQUIRED_TAGS" value="@return@param@throws or @exception" />
21 |         </value>
22 |       </option>
23 |       <option name="FIELD_OPTIONS">
24 |         <value>
25 |           <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
26 |           <option name="REQUIRED_TAGS" value="" />
27 |         </value>
28 |       </option>
29 |       <option name="IGNORE_DEPRECATED" value="false" />
30 |       <option name="IGNORE_JAVADOC_PERIOD" value="true" />
31 |       <option name="IGNORE_DUPLICATED_THROWS" value="false" />
32 |       <option name="IGNORE_POINT_TO_ITSELF" value="false" />
33 |       <option name="myAdditionalJavadocTags" value="date,Author,Date" />
34 |     </inspection_tool>
35 |     <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
36 |       <option name="ourVersions">
37 |         <value>
38 |           <list size="2">
39 |             <item index="0" class="java.lang.String" itemvalue="2.7" />
40 |             <item index="1" class="java.lang.String" itemvalue="3.7" />
41 |           </list>
42 |         </value>
43 |       </option>
44 |     </inspection_tool>
45 |   </profile>
46 | </component>


--------------------------------------------------------------------------------
/src/main/java/mining/tfidf/OneDocTfDf.java:
--------------------------------------------------------------------------------
 1 | package mining.tfidf;
 2 | 
 3 | import com.google.common.collect.BiMap;
 4 | import lombok.Data;
 5 | import lombok.extern.slf4j.Slf4j;
 6 | import org.junit.jupiter.api.Test;
 7 | 
 8 | import java.io.IOException;
 9 | import java.nio.file.Files;
10 | import java.nio.file.Paths;
11 | import java.util.HashMap;
12 | import java.util.HashSet;
13 | import java.util.List;
14 | import java.util.Map;
15 | 
16 | /**
17 |  * @Author unclewang
18 |  * @Date 2018-12-16 15:56
19 |  */
20 | @Data
21 | @Slf4j
22 | public class OneDocTfDf {
23 |     private HashMap<Integer, Double> idTf = new HashMap<>();
24 |     private HashSet<Integer> idDf = new HashSet<>();
25 |     private static BiMap<String, Integer> wordIds = Vocabulary.getWordIds();
26 | 
27 | 
28 |     /**
29 |      * @throws IOException 词频（TF） = 某个词在文章中的出现次数 / 拥有最高词频的词的次数
30 |      */
31 |     public void calOneFileTf(String filepath) {
32 |         List<String> words = null;
33 |         try {
34 |             words = Files.readAllLines(Paths.get(filepath));
35 |         } catch (IOException e) {
36 |             e.printStackTrace();
37 |         }
38 |         assert words != null;
39 |         double maxValue = 0;
40 |         for (String word : words) {
41 |             int id = wordIds.get(word.trim());
42 |             if (!idTf.containsKey(id)) {
43 |                 idTf.put(id, 0.0);
44 | 
45 |             }
46 |             double cur = idTf.get(id) + 1.0;
47 |             idTf.put(id, cur);
48 |             idDf.add(id);
49 |             if (cur > maxValue) {
50 |                 maxValue = cur;
51 |             }
52 |         }
53 |         for (Map.Entry<Integer, Double> entry : idTf.entrySet()) {
54 |             idTf.put(entry.getKey(), entry.getValue() / maxValue);
55 |         }
56 |     }
57 | 
58 |     @Test
59 |     public void test() {
60 |         calOneFileTf("/Users/unclewang/.xinlp/data/post.20news-18828/comp.windows.x/66410");
61 |         System.out.println(idDf);
62 |         calOneFileTf("/Users/unclewang/.xinlp/data/post.20news-18828/comp.windows.x/66411");
63 |         System.out.println(idDf);
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/src/main/java/test/hmm/MatricMethodTest.java:
--------------------------------------------------------------------------------
 1 | package test.hmm;
 2 | 
 3 | import org.apache.commons.math3.linear.Array2DRowRealMatrix;
 4 | import org.apache.commons.math3.linear.LUDecomposition;
 5 | import org.apache.commons.math3.linear.RealMatrix;
 6 | import org.junit.jupiter.api.Test;
 7 | 
 8 | /**
 9 |  * @Author unclewang
10 |  * @Date 2018/11/16 20:39
11 |  */
12 | public class MatricMethodTest {
13 |     @Test
14 |     public void test() {
15 |         double b[][] = new double[5][5];
16 |         for (int i = 0; i < b.length; i++) {
17 |             b[i][i] = i;
18 |         }
19 |         //将数组转化为矩阵
20 |         RealMatrix matrix = new Array2DRowRealMatrix(b);
21 |         System.out.println("创建的数组为：\t" + matrix);
22 |         //获取矩阵的列数 getColumnDimension() 
23 |         System.out.println("矩阵的列数为:\t" + matrix.getColumnDimension());
24 |         //获取矩阵的行数
25 |         System.out.println("矩阵的行数为:\t" + matrix.getRowDimension());
26 |         //获取矩阵的某一行,返回,仍然为矩阵
27 |         System.out.println("矩阵的第一行为:\t" + matrix.getRowMatrix(0));
28 |         //获取矩阵的某一行,返回,转化为向量
29 |         System.out.println("矩阵的第一行向量表示为:\t" + matrix.getRowVector(1));
30 |         //矩阵的乘法
31 |         double testmatrix[][] = new double[2][2];
32 |         testmatrix[0][0] = 1;
33 |         testmatrix[0][1] = 2;
34 |         testmatrix[1][0] = 3;
35 |         testmatrix[1][1] = 4;
36 |         RealMatrix testmatrix1 = new Array2DRowRealMatrix(testmatrix);
37 |         System.out.println("两个矩阵相乘后的结果为：\t" + testmatrix1.multiply(testmatrix1));
38 |         //矩阵的转置
39 |         System.out.println("转置后的矩阵为：\t" + testmatrix1.transpose());
40 |         //矩阵求逆
41 |         RealMatrix inversetestMatrix = inverseMatrix(testmatrix1);
42 |         System.out.println("逆矩阵为：\t" + inversetestMatrix);
43 |         //矩阵转化为数组 getdata
44 |         double matrixtoarray[][] = inversetestMatrix.getData();
45 |         System.out.println("数组中的某一个数字为：\t" + matrixtoarray[0][1]);
46 |     }
47 | 
48 |     //求逆函数
49 |     public static RealMatrix inverseMatrix(RealMatrix A) {
50 |         RealMatrix result = new LUDecomposition(A).getSolver().getInverse();
51 |         return result;
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/test/paddingtest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from hanziconv import HanziConv
 4 | from jieba import cut
 5 | from tflearn.data_utils import VocabularyProcessor
 6 | 
# Sample sentences used below to fit the vocabulary and to demonstrate
# the text -> id -> text round trip.
DOCUMENTS = [
    '这是一条测试1',
    '这是一条测试2',
    '这是一条测试3',
    '这是其他测试',
]
13 | 
14 | 
def chinese_tokenizer(documents):
    """Yield every document as a list of word tokens.

    Each document is passed through ``HanziConv.toSimplified``, lower-cased,
    and then segmented with jieba's ``cut``; one token list is yielded per
    input document, in input order.
    """
    for raw in documents:
        normalized = HanziConv.toSimplified(raw).lower()
        tokens = list(cut(normalized))
        yield tokens
27 | 
28 | 
# Build the processor with max document length 100 and min frequency 2,
# using the custom tokenizer. (Original note: sequences are padded or
# truncated to 100 and words with frequency <= 2 are dropped.)
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# Build the vocabulary (original note: it cannot be changed once created).
vocab.fit(DOCUMENTS)

# Save the vocabulary to disk and load it back.
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# Convert the texts to word-id sequences (original note: id 0 is used for
# unknown words and for padding).
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# Sample output recorded by the author:
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

# Convert the word-id sequences back to text.
for document in vocab.reverse(id_documents):
    print(document)
# Sample output recorded by the author:
# 这是 一条 测试 <UNK> <UNK> <UNK> ...
# 这是 一条 测试 <UNK> <UNK> <UNK> ...
# 这是 一条 测试 <UNK> <UNK> <UNK> ...
# 这是 <UNK> 测试 <UNK> <UNK> <UNK> ...
63 | 


--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" output="target/classes" path="src/main/java">
 4 | 		<attributes>
 5 | 			<attribute name="optional" value="true"/>
 6 | 			<attribute name="maven.pomderived" value="true"/>
 7 | 		</attributes>
 8 | 	</classpathentry>
 9 | 	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
10 | 		<attributes>
11 | 			<attribute name="maven.pomderived" value="true"/>
12 | 		</attributes>
13 | 	</classpathentry>
14 | 	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
15 | 		<attributes>
16 | 			<attribute name="optional" value="true"/>
17 | 			<attribute name="maven.pomderived" value="true"/>
18 | 			<attribute name="test" value="true"/>
19 | 		</attributes>
20 | 	</classpathentry>
21 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
22 | 		<attributes>
23 | 			<attribute name="maven.pomderived" value="true"/>
24 | 		</attributes>
25 | 	</classpathentry>
26 | 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
27 | 		<attributes>
28 | 			<attribute name="maven.pomderived" value="true"/>
29 | 		</attributes>
30 | 	</classpathentry>
31 | 	<classpathentry kind="src" path="target/generated-sources/annotations">
32 | 		<attributes>
33 | 			<attribute name="optional" value="true"/>
34 | 			<attribute name="maven.pomderived" value="true"/>
35 | 			<attribute name="ignore_optional_problems" value="true"/>
36 | 			<attribute name="m2e-apt" value="true"/>
37 | 		</attributes>
38 | 	</classpathentry>
39 | 	<classpathentry kind="src" output="target/test-classes" path="target/generated-test-sources/test-annotations">
40 | 		<attributes>
41 | 			<attribute name="optional" value="true"/>
42 | 			<attribute name="maven.pomderived" value="true"/>
43 | 			<attribute name="ignore_optional_problems" value="true"/>
44 | 			<attribute name="m2e-apt" value="true"/>
45 | 			<attribute name="test" value="true"/>
46 | 		</attributes>
47 | 	</classpathentry>
48 | 	<classpathentry kind="output" path="target/classes"/>
49 | </classpath>
50 | 


--------------------------------------------------------------------------------
/src/main/java/lucene/XinTokenizer.java:
--------------------------------------------------------------------------------
 1 | package lucene;
 2 | 
 3 | import org.apache.lucene.analysis.Tokenizer;
 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 6 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 7 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 8 | import segment.Segment;
 9 | 
10 | import java.io.BufferedReader;
11 | import java.io.IOException;
12 | 
13 | /**
14 |  * @Author unclewang
15 |  * @Date 2018-12-11 11:22
16 |  */
17 | public class XinTokenizer extends Tokenizer {
18 |     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
19 |     private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
20 |     private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
21 |     private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
22 |     private SegmentWrapper segment;
23 |     private int totalOffset = 0;
24 | 
25 |     public XinTokenizer(Segment segment) {
26 |         this.segment = new SegmentWrapper(input, segment);
27 |     }
28 | 
29 |     @Override
30 |     final public boolean incrementToken() throws IOException {
31 |         clearAttributes();
32 |         Atom atom;
33 |         atom = segment.next();
34 | 
35 |         if (atom != null) {
36 |             //每一个词都是1个，因为不支持智能分词（今天天气==>今天，天气，天天）
37 |             positionAttr.setPositionIncrement(1);
38 |             termAtt.setEmpty().append(atom.getContent());
39 |             termAtt.setLength(atom.getLen());
40 |             offsetAtt.setOffset(totalOffset + atom.getOffe(), totalOffset + atom.getOffe() + atom.getLen());
41 |             System.out.println(totalOffset + atom.getOffe());
42 |             typeAtt.setType("word");
43 |             return true;
44 |         } else {
45 |             totalOffset += segment.getOffset();
46 |             return false;
47 |         }
48 |     }
49 | 
50 | 
51 |     @Override
52 |     public void reset() throws IOException {
53 |         super.reset();
54 |         segment.reset(new BufferedReader(this.input));
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/test/lucene/rewriteTokenize/IKAnalyzerTest.java:
--------------------------------------------------------------------------------
 1 | package test.lucene.rewriteTokenize;
 2 | 
 3 | import org.apache.lucene.analysis.Analyzer;
 4 | import org.apache.lucene.analysis.TokenStream;
 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 6 | 
 7 | import java.io.IOException;
 8 | 
 9 | 
10 | /**
11 |  * IKAnalyzer分詞器集成測試:
12 |  * 細粒度切分：把詞分到最細
13 |  * 智能切分：根據詞庫進行拆分符合我們的語言習慣
14 |  *
15 |  * @author THINKPAD
16 |  */
17 | public class IKAnalyzerTest {
18 |     private static void doToken(TokenStream ts) throws IOException {
19 |         ts.reset();
20 |         CharTermAttribute cta = ts.getAttribute(CharTermAttribute.class);
21 |         while (ts.incrementToken()) {
22 |             System.out.print(cta.toString() + "|");
23 |         }
24 |         System.out.println();
25 |         ts.end();
26 |         ts.close();
27 |     }
28 | 
29 |     public static void main(String[] args) throws IOException {
30 | 
31 |         String etext = "Analysis is one of the main causes of slow indexing. Simply put, the more you analyze the slower analyze the indexing (in most cases).";
32 |         String chineseText = "张三说的确实在理";
33 |         /**
34 |          * ikanalyzer 中文分詞器 因為Analyzer的createComponents方法API改變了 需要我們自己實現
35 |          * 分析器IKAnalyzer4Lucene7和分詞器IKTokenizer4Lucene7
36 |          */
37 |         // IKAnalyzer 細粒度切分
38 |         try (Analyzer ik = new IKAnalyzer4Lucene7();) {
39 |             TokenStream ts = ik.tokenStream("content", etext);
40 |             System.out.println("IKAnalyzer中文分詞器 細粒度切分，英文分詞效果：");
41 |             doToken(ts);
42 |             ts = ik.tokenStream("content", chineseText);
43 |             System.out.println("IKAnalyzer中文分詞器 細粒度切分，中文分詞效果：");
44 |             doToken(ts);
45 |         }
46 | 
47 |         // IKAnalyzer 智能切分
48 |         try (Analyzer ik = new IKAnalyzer4Lucene7(true);) {
49 |             TokenStream ts = ik.tokenStream("content", etext);
50 |             System.out.println("IKAnalyzer中文分詞器 智能切分，英文分詞效果：");
51 |             doToken(ts);
52 |             ts = ik.tokenStream("content", chineseText);
53 |             System.out.println("IKAnalyzer中文分詞器 智能切分，中文分詞效果：");
54 |             doToken(ts);
55 |         }
56 |     }
57 | }


--------------------------------------------------------------------------------
/src/main/java/lda/Parameter.java:
--------------------------------------------------------------------------------
 1 | package lda;
 2 | 
 3 | 
 4 | import lombok.Data;
 5 | import org.apache.commons.io.FileUtils;
 6 | 
 7 | import java.io.File;
 8 | import java.io.IOException;
 9 | import java.util.List;
10 | import java.util.Objects;
11 | 
12 | @Data
13 | public class Parameter {
14 |     private float alpha = 0.5f;
15 |     private float beta = 0.1f;
16 |     private int topicNum = 100;
17 |     private int iteration = 100;
18 |     private int saveStep = 10;
19 |     private int beginSaveIters = 50;
20 |     
21 |     public static Parameter create(String parameterFile) {
22 |         // TODO Auto-generated method stub
23 |         List<String> paramLines = null;
24 |         try {
25 |             paramLines = FileUtils.readLines(new File(parameterFile), "UTF8");
26 |         } catch (IOException e) {
27 |             e.printStackTrace();
28 |         }
29 |         Parameter parameter = new Parameter();
30 |         
31 |         for (String line : Objects.requireNonNull(paramLines)) {
32 |             String[] lineParts = line.split("\t");
33 |             switch (parameters.valueOf(lineParts[0])) {
34 |                 case alpha:
35 |                     parameter.alpha = Float.valueOf(lineParts[1]);
36 |                     break;
37 |                 case beta:
38 |                     parameter.beta = Float.valueOf(lineParts[1]);
39 |                     break;
40 |                 case topicNum:
41 |                     parameter.topicNum = Integer.valueOf(lineParts[1]);
42 |                     break;
43 |                 case iteration:
44 |                     parameter.iteration = Integer.valueOf(lineParts[1]);
45 |                     break;
46 |                 case saveStep:
47 |                     parameter.saveStep = Integer.valueOf(lineParts[1]);
48 |                     break;
49 |                 case beginSaveIters:
50 |                     parameter.beginSaveIters = Integer.valueOf(lineParts[1]);
51 |                     break;
52 |                 default:
53 |                     break;
54 |             }
55 |         }
56 |         return parameter;
57 |     }
58 |     
59 |     public enum parameters {
60 |         alpha, beta, topicNum, iteration, saveStep, beginSaveIters;
61 |     }
62 | }


--------------------------------------------------------------------------------
/src/main/java/test/newton/NewtonMethod.java:
--------------------------------------------------------------------------------
  1 | package test.newton;
  2 |  
  3 | /**
  4 |  * @Author unclewang
  5 |  * @Date 2018-11-26 14:44
  6 |  */
  7 | public class NewtonMethod {
  8 | 	private double originalX;// 初始点
  9 | 	private double e;// 误差阈值
 10 | 	private double maxCycle;// 最大循环次数
 11 |  
 12 | 	/**
 13 | 	 * 构造方法
 14 | 	 * 
 15 | 	 * @param originalX 初始值
 16 | 	 * @param e 误差阈值
 17 | 	 * @param maxCycle 最大循环次数
 18 | 	 */
 19 | 	public NewtonMethod(double originalX, double e, double maxCycle) {
 20 | 		this.setOriginalX(originalX);
 21 | 		this.setE(e);
 22 | 		this.setMaxCycle(maxCycle);
 23 | 	}
 24 |  
 25 | 	// 一系列get和set方法
 26 | 	public double getOriginalX() {
 27 | 		return originalX;
 28 | 	}
 29 |  
 30 | 	public void setOriginalX(double originalX) {
 31 | 		this.originalX = originalX;
 32 | 	}
 33 |  
 34 | 	public double getE() {
 35 | 		return e;
 36 | 	}
 37 |  
 38 | 	public void setE(double e) {
 39 | 		this.e = e;
 40 | 	}
 41 |  
 42 | 	public double getMaxCycle() {
 43 | 		return maxCycle;
 44 | 	}
 45 |  
 46 | 	public void setMaxCycle(double maxCycle) {
 47 | 		this.maxCycle = maxCycle;
 48 | 	}
 49 |  
 50 | 	/**
 51 | 	 * 原始函数
 52 | 	 * 
 53 | 	 * @param x 变量
 54 | 	 * @return 原始函数的值
 55 | 	 */
 56 | 	public double getOriginal(double x) {
 57 | 		return x * x - 3 * x + 2;
 58 | 	}
 59 |  
 60 | 	/**
 61 | 	 * 一次导函数
 62 | 	 * 
 63 | 	 * @param x 变量
 64 | 	 * @return 一次导函数的值
 65 | 	 */
 66 | 	public double getOneDerivative(double x) {
 67 | 		return 2 * x - 3;
 68 | 	}
 69 |  
 70 | 	/**
 71 | 	 * 二次导函数
 72 | 	 * 
 73 | 	 * @param x 变量
 74 | 	 * @return 二次导函数的值
 75 | 	 */
 76 | 	public double getTwoDerivative(double x) {
 77 | 		return 2;
 78 | 	}
 79 |  
 80 | 	/**
 81 | 	 * 利用牛顿法求解
 82 | 	 * 
 83 | 	 * @return
 84 | 	 */
 85 | 	public double getNewtonMin() {
 86 | 		double x = this.getOriginalX();
 87 | 		double y = 0;
 88 | 		double k = 1;
 89 | 		// 更新公式
 90 | 		while (k <= this.getMaxCycle()) {
 91 | 			y = this.getOriginal(x);
 92 | 			double one = this.getOneDerivative(x);
 93 | 			if (Math.abs(one) <= e) {
 94 | 				break;
 95 | 			}
 96 | 			double two = this.getTwoDerivative(x);
 97 | 			x = x - one / two;
 98 | 			k++;
 99 | 		}
100 | 		return y;
101 | 	}
102 | 
103 | 
104 | 
105 | 
106 | }
107 | 
108 | 


--------------------------------------------------------------------------------
/src/main/java/test/newton/Derivative.java:
--------------------------------------------------------------------------------
 1 | package test.newton;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | public class Derivative {
 6 |     private static final double Error = 1.1 * Math.pow(1.1, -16);
 7 |     /**
 8 |      * 在用计算机解决问题时，需要注意的是计算机浮点数本身就会有误差，例如对于double类型，该误差为u=1.1*10^(-16)。
 9 |      * https://blog.csdn.net/fangqingan_java/article/details/48685093
10 |      */
11 |     private static final double DELTA_X = Math.pow(Error, -1.0 / 2);
12 |     private static final double DELTA_X_Center = Math.pow(Error, -1.0 / 3);
13 |     private double e = 0.001;
14 |     private double maxCycle = 100;
15 | 
16 |     public double f(double x1, double x2) {
17 |         return x1 * x1 + x1 * x2 - 3 * x1 + 4 * x2;
18 |     }
19 | 
20 |     public double f(double x) {
21 |         return x * x - 3 * x + 2;
22 |     }
23 | 
24 |     public double d1(double x) {
25 |         return (f(x + DELTA_X_Center) - f(x - DELTA_X_Center)) / (2 * DELTA_X_Center);
26 |     }
27 | 
28 |     public double pd1(double x1, double x2) {
29 |         return ((f(x1 + DELTA_X_Center, x2) - f(x1 - DELTA_X_Center, x2)) / (2 * DELTA_X_Center)) * ((f(x1, x2 + DELTA_X_Center) - f(x1, x2 - DELTA_X_Center)) / (2 * DELTA_X_Center));
30 |     }
31 | 
32 |     public double d2(double d1) {
33 |         return (d1(d1 + DELTA_X_Center) - d1(d1 - DELTA_X_Center)) / (2 * DELTA_X_Center);
34 |     }
35 | 
36 | 
37 |     public double newton(double x) {
38 | 
39 |         for (int i = 0; i < this.maxCycle; i++) {
40 |             double gx = d1(x);
41 |             double hx = d2(x);
42 |             if (Math.abs(gx) < this.e) {
43 |                 break;
44 |             }
45 |             x = x - gx / hx;
46 |         }
47 |         return x;
48 |     }
49 | 
50 |     public double globalNewton(double x) {
51 |         for (int i = 0; i < this.maxCycle; i++) {
52 |             double gx = d1(x);
53 |             double hx = d2(x);
54 |             double dx = -gx / hx;
55 |             if (Math.abs(gx) < this.e) {
56 |                 break;
57 |             }
58 |             x = x - gx / hx;
59 |         }
60 |         return x;
61 |     }
62 | 
63 | 
64 |     @Test
65 |     public void testD1() {
66 |         System.out.println(d1(2));
67 |         System.out.println(pd1(2, 2));
68 |         System.out.println(d2(2));
69 |         System.out.println("牛顿法得到的最小x为：" + newton(2) + "，此时函数值为：" + f(newton(2)));
70 |     }
71 | 
72 | }
73 | 


--------------------------------------------------------------------------------
/src/main/java/test/dl4j/MatrixTest.java:
--------------------------------------------------------------------------------
 1 | package test.dl4j;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | import org.nd4j.linalg.api.ndarray.INDArray;
 5 | import org.nd4j.linalg.factory.Nd4j;
 6 | 
 7 | public class MatrixTest {
 8 |     @Test
 9 |     public void testMatrix() {
10 |         int nRows = 2;
11 |         int nColumns = 2;
12 | // Create INDArray of zeros
13 |         INDArray zeros = Nd4j.zeros(nRows, nColumns);
14 | // Create one of all ones
15 |         INDArray ones = Nd4j.ones(nRows, nColumns);
16 | //hstack
17 |         INDArray hstack = Nd4j.hstack(ones, zeros);
18 |         System.out.println("### HSTACK ####");
19 |         System.out.println(hstack);
20 |     }
21 | 
22 |     @Test
23 |     public void testSVD() {
24 |         int nRows = 1;
25 |         int nColumns = 1;
26 | 
27 |         double[][] vals = {{1, 1, 1, 0, 0}, {2, 2, 2, 0, 0}, {1, 1, 1, 0, 0}, {5, 5, 5, 0, 0}, {0, 0, 0, 2, 2}, {0, 0, 0, 3, 3}, {0, 0, 0, 1, 1}};
28 |         INDArray A = Nd4j.create(vals);
29 |         nRows = A.rows();
30 |         nColumns = A.columns();
31 |         System.out.println("A: " + A);
32 | 
33 | 
34 |         INDArray S = Nd4j.zeros(1, nRows);
35 |         INDArray U = Nd4j.zeros(nRows, nRows);
36 |         INDArray V = Nd4j.zeros(nColumns, nColumns);
37 |         Nd4j.getBlasWrapper().lapack().gesvd(A, S, U, V);
38 | 
39 |         System.out.println("\n S:" + S);
40 |         System.out.println("\n U:" + U);
41 |         System.out.println("\n V:" + V);
42 |     }
43 | 
44 |     @Test
45 |     public void testmm() {
46 |         double[][] vals = {{1, 1, 1, 0, 0}, {2, 2, 2, 0, 0}, {1, 1, 1, 0, 0}, {5, 5, 5, 0, 0}, {0, 0, 0, 2, 2}, {0, 0, 0, 3, 3}, {0, 0, 0, 1, 1}};
47 |         INDArray A = Nd4j.create(vals);
48 |         long m = A.rows();
49 |         long n = A.columns();
50 |         INDArray mean = A.mean(0);
51 |         A.subiRowVector(mean);
52 |         System.out.println(A);
53 |         // The prepare SVD results, we'll decomp A to UxSxV'
54 |         INDArray s = Nd4j.create(m < n ? m : n);
55 |         INDArray VT = Nd4j.create(n, n, 'f');
56 | 
57 |         // Note - we don't care about U
58 |         Nd4j.getBlasWrapper().lapack().gesvd(A, s, null, VT);
59 |         System.out.println("\n S:" + s);
60 |         System.out.println("\n V:" + VT);
61 |     }
62 | 
63 |     @Test
64 |     public void test() {
65 |         System.out.println(System.getProperty("java.io.tmpdir"));
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/embedding.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import pickle
 4 | 
 5 | import numpy as np
 6 | 
 7 | import config
 8 | 
 9 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
10 | 
11 | 
def get_embedding(random, embedding_dim):
    """Return ``(vocab, word2id, embedding)`` for the character vocabulary.

    When ``random`` is truthy the matrix comes from ``random_embedding``,
    otherwise from ``pre_train_embedding``; ``embedding_dim`` is forwarded
    unchanged. A log line is written once the matrix is available.
    """
    builder = random_embedding if random else pre_train_embedding
    result = builder(embedding_dim)
    vocab, word2id, embedding = result
    logging.info("向量已经生成结束")
    # Persisting word2id is currently disabled:
    # with open(os.path.join('.', 'data/word2id.pkl'), 'wb') as fw:
    #     pickle.dump(word2id, fw)
    return vocab, word2id, embedding
21 | 
22 | 
def random_embedding(embedding_dim):
    """Return ``(vocab, word2id, embedding)`` with a random float32 matrix.

    The matrix has one row per vocabulary entry and ``embedding_dim``
    columns, drawn uniformly from [-0.1, 0.1).
    """
    vocab, word2id = get_word2id()
    shape = (len(vocab), embedding_dim)
    matrix = np.random.uniform(-0.1, 0.1, shape).astype(np.float32)
    return vocab, word2id, matrix
28 | 
29 | 
def get_word2id():
    """Read the vocabulary from ``config.chars_embedding``.

    The first line of the file is a header (logged, otherwise unused). Every
    following line contributes its first space-separated field as a word.
    Ids 0, 1 and 2 are reserved for the special tokens ``unk``, ``num`` and
    ``en``; file words are numbered from 3 in file order.

    Returns:
        tuple: ``(vocab, word2id)`` where ``vocab`` is the list of words in
        id order and ``word2id`` maps each word to its id.
    """
    vocab = []
    word2id = {}
    # The context manager closes the file even if parsing raises.
    # NOTE(review): the platform default encoding is used - confirm the
    # embedding file matches it.
    with open(config.chars_embedding, 'r') as file:
        # Header line; the real vocabulary size is this count plus 3.
        line = file.readline().strip()
        logging.info("随机生成[字数，维度]：" + line)
        for token in ("unk", "num", "en"):
            word2id[token] = len(vocab)
            vocab.append(token)
        for line in file:
            word = line.strip().split(' ')[0]
            # len(vocab) is the next free id because every line appends once.
            word2id[word] = len(vocab)
            vocab.append(word)
    return vocab, word2id
53 | 
54 | 
def pre_train_embedding(embedding_dim):
    """Load the vocabulary and vectors from ``config.chars_embedding``.

    The first line of the file is a header (logged, otherwise unused). Every
    following line holds a word followed by its space-separated vector
    components. Ids 0, 1 and 2 are reserved for ``unk``, ``num`` and ``en``,
    whose vectors are drawn uniformly from [-0.25, 0.25); file words are
    numbered from 3 in file order.

    Args:
        embedding_dim: length of the random vectors for the special tokens.
            NOTE(review): presumably must equal the vector length stored in
            the file, otherwise the rows cannot form a matrix - confirm.

    Returns:
        tuple: ``(vocab, word2id, embedding)`` where ``embedding`` is a
        float32 array with one row per vocabulary entry.

    Raises:
        ValueError: if a vector component is not a number.
    """
    vocab = []
    embedding = []
    word2id = {}
    # The context manager closes the file even if parsing raises.
    with open(config.chars_embedding, 'r') as file:
        # Header line; the real vocabulary size is this count plus 3.
        line = file.readline().strip()
        logging.info("预训练词向量信息：" + line)
        for token in ("unk", "num", "en"):
            word2id[token] = len(vocab)
            vocab.append(token)
            embedding.append(np.random.uniform(-0.25, 0.25, embedding_dim))
        for line in file:
            row = line.strip().split(' ')
            word2id[row[0]] = len(vocab)
            vocab.append(row[0])
            # Parse the components; leaving them as strings would turn the
            # whole matrix into a string array in np.asarray below.
            embedding.append([float(value) for value in row[1:]])
    return vocab, word2id, np.asarray(embedding, dtype=np.float32)
82 | 


--------------------------------------------------------------------------------
/src/main/java/mining/config/Config.java:
--------------------------------------------------------------------------------
 1 | package mining.config;
 2 | 
 3 | import com.google.common.collect.BiMap;
 4 | import com.google.common.collect.HashBiMap;
 5 | import lombok.Data;
 6 | import org.junit.jupiter.api.Test;
 7 | import tools.PathUtils;
 8 | 
 9 | import java.io.File;
10 | import java.nio.file.Path;
11 | import java.nio.file.Paths;
12 | import java.util.Objects;
13 | import java.util.concurrent.atomic.AtomicInteger;
14 | 
15 | @Data
16 | public class Config {
17 |     private static String vocabularyPath = PathUtils.getDataPath() + "/vocabulary.txt";
18 |     private static String termIdPath = PathUtils.getDataPath() + "/termid.txt";
19 |     private static String prePath = PathUtils.getDataPath() + "/20news-18828";
20 |     private static String postPath = PathUtils.getDataPath() + "/post.20news-18828";
21 |     private static String stopwordsPath = PathUtils.getDataPath() + "/stopwords.txt";
22 |     private static String tfidfsPath = PathUtils.getDataPath() + "/tfidfs.txt";
23 |     private static BiMap<Integer, String> idPostFiles = HashBiMap.create();
24 |     private static AtomicInteger id;
25 | 
26 |     static {
27 |         id = new AtomicInteger(0);
28 |     }
29 | 
30 |     public static String getVocabularyPath() {
31 |         return vocabularyPath;
32 |     }
33 | 
34 |     public static String getPrePath() {
35 |         return prePath;
36 |     }
37 | 
38 |     public static String getPostPath() {
39 |         return postPath;
40 |     }
41 | 
42 |     public static String getStopwordsPath() {
43 |         return stopwordsPath;
44 |     }
45 | 
46 |     public static String getTfidfsPath() {
47 |         return tfidfsPath;
48 |     }
49 | 
50 |     public static String getTermIdPath() {
51 |         return termIdPath;
52 |     }
53 | 
54 |     public static BiMap<Integer, String> getIdPostFiles(String filepath) {
55 |         Path path = Paths.get(filepath);
56 |         for (File file : Objects.requireNonNull(path.toFile().listFiles())) {
57 |             if (file.isDirectory()) {
58 |                 getIdPostFiles(file.getAbsolutePath());
59 |             } else {
60 |                 idPostFiles.put(id.getAndIncrement(), file.getAbsolutePath());
61 |             }
62 |         }
63 |         return idPostFiles;
64 |     }
65 | 
66 |     @Test
67 |     public void test() {
68 |         getIdPostFiles(postPath);
69 | 
70 |         System.out.println(idPostFiles.size());
71 |         for (int i = 0; i < 10; i++) {
72 |             System.out.println(idPostFiles.get(i));
73 |         }
74 | 
75 |     }
76 | }
77 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/test/crftest.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | 
# Parameters of the toy problem
num_examples = 10
num_words = 20
num_features = 100
num_tags = 5

# Build random features
x = np.random.rand(num_examples, num_words, num_features).astype(np.float32)

# Build random tags
y = np.random.randint(
    num_tags, size=[num_examples, num_words]).astype(np.int32)

# Per-example sequence lengths (real samples may contain different numbers
# of words). Here every example gets the same length: num_words minus one
# random draw from {1, 2}; set this according to the data in real use.
sequence_lengths = np.full(num_examples, num_words - np.random.randint(1, 3), dtype=np.int32)

# Train and evaluate the model
with tf.Graph().as_default():
    with tf.Session() as session:
        x_t = tf.constant(x)
        y_t = tf.constant(y)
        sequence_lengths_t = tf.constant(sequence_lengths)
        print(sequence_lengths_t)
        # A linear layer without bias
        weights = tf.get_variable("weights", [num_features, num_tags])
        matricized_x_t = tf.reshape(x_t, [-1, num_features])
        matricized_unary_scores = tf.matmul(matricized_x_t, weights)
        unary_scores = tf.reshape(matricized_unary_scores,
                                  [num_examples, num_words, num_tags])

        # Compute the log-likelihood and obtain transition_params
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            unary_scores, y_t, sequence_lengths_t)

        # Decode (Viterbi algorithm) to get the decoded sequence
        # viterbi_sequence and its score viterbi_score
        viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(
            unary_scores, transition_params, sequence_lengths_t)

        loss = tf.reduce_mean(-log_likelihood)

        train_op = tf.train.AdamOptimizer(0.01).minimize(loss)

        session.run(tf.global_variables_initializer())

        mask = (np.expand_dims(np.arange(num_words), axis=0) <  # np.arange() builds 0..num_words-1
                np.expand_dims(sequence_lengths, axis=1))  # np.expand_dims() adds an axis

        # mask is a num_examples x num_words boolean array; it is used to
        # ignore positions beyond each sequence length

        # Sum the sequence_lengths of all examples to get the total label count
        total_labels = np.sum(sequence_lengths)

        # Training loop
        for i in range(1000):
            tf_viterbi_sequence, _ = session.run([viterbi_sequence, train_op])
            if i % 100 == 0:
                correct_labels = np.sum((y == tf_viterbi_sequence) * mask)
                accuracy = 100.0 * correct_labels / float(total_labels)
                print("Accuracy: %.2f%%" % accuracy)
63 | 


--------------------------------------------------------------------------------
/src/main/java/segment/crf/tcp/XinCRFSegmentServer.java:
--------------------------------------------------------------------------------
 1 | package segment.crf.tcp;
 2 | 
 3 | import com.alibaba.fastjson.JSON;
 4 | import lombok.extern.slf4j.Slf4j;
 5 | import lucene.Atom;
 6 | import segment.crf.XinCRFSegment;
 7 | 
 8 | import java.io.BufferedReader;
 9 | import java.io.IOException;
10 | import java.io.InputStreamReader;
11 | import java.io.PrintStream;
12 | import java.net.ServerSocket;
13 | import java.net.Socket;
14 | import java.util.List;
15 | 
16 | /**
17 |  * @Author unclewang
18 |  * @Date 2018-12-12 14:15
19 |  */
20 | @Slf4j
21 | public class XinCRFSegmentServer {
22 | 
23 |     static XinCRFSegment segment = new XinCRFSegment();
24 | 
25 | 
26 |     public static void main(String[] args) throws IOException {
27 | 
28 |         ServerSocket ss = new ServerSocket(9428);
29 |         while (true) {
30 |             Socket s = ss.accept();
31 |             new Thread(new ServerThread(s)).start();
32 |         }
33 |     }
34 | 
35 | 
36 |     private static class ServerThread implements Runnable {
37 |         Socket s = null;
38 |         BufferedReader br = null;
39 |         PrintStream ps = null;
40 | 
41 |         public ServerThread(Socket s) throws IOException {
42 |             this.s = s;
43 |             this.br = new BufferedReader(new InputStreamReader(s.getInputStream()));
44 |             this.ps = new PrintStream(s.getOutputStream());
45 |         }
46 | 
47 |         @Override
48 |         public void run() {
49 |             String content = null;
50 |             while ((content = read()) != null) {
51 |                 List<Atom> atoms = segment.seg(content);
52 |                 try {
53 |                     System.out.println(JSON.toJSONString(atoms));
54 |                     ps.write(JSON.toJSONString(atoms).getBytes());
55 |                     ps.flush();
56 |                 } catch (IOException e) {
57 |                     e.printStackTrace();
58 |                 }
59 |             }
60 |             try {
61 |                 br.close();
62 |                 ps.close();
63 |                 s.close();
64 |             } catch (IOException e) {
65 |                 e.printStackTrace();
66 |             }
67 | 
68 |         }
69 | 
70 |         public String read() {
71 |             try {
72 |                 String readContent = br.readLine();
73 |                 log.info("client请求，数据为：" + readContent);
74 |                 return readContent;
75 |             } catch (IOException e) {
76 |                 e.printStackTrace();
77 |             }
78 |             return null;
79 |         }
80 | 
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/resources/lda/doc/4:
--------------------------------------------------------------------------------
  1 | 新华社	北京	十二月	三十一日	电	（	中央	人民	广播	电台	记者	刘振英	、	新华社	记者	张宿堂	）	今天	是	一九九七年	的	最后	一	天	。
  2 | 辞旧迎新	之际	，
  3 | 国务院	总理	李鹏	今天	上午	来到	北京	石景山	发电	总厂	考察	，
  4 | 向	广大	企业	职工	表示	节日	的	祝贺	，
  5 | 向	将要	在	节日	期间	坚守	工作	岗位	的	同志	们	表示	慰问	。
  6 | 
  7 | 上午	九时	二十分	，
  8 | 李鹏	总理	在	北京	市委	书记	、	市长	贾庆林	的	陪同	下	，
  9 | 来到	位于	北京	西郊	的	北京	石景山	发电	总厂	。
 10 | 始建	于	一九一九年	的	北京	石景山	发电	总厂	是	华北	电力	集团公司	骨干	发电	企业	，
 11 | 承担	着	向	首都	供电	、	供热	任务	，
 12 | 装机	总	容量	一百一十六点六万	千瓦	。
 13 | 总厂	年发电量	四十五亿	千瓦时	，
 14 | 供热	能力	八百	百万	大卡／小时	，
 15 | 现	供热	面积	已	达	八百	多	万	平方米	。
 16 | 早	在	担任	华北	电管局	领导	时	，
 17 | 李鹏	就	曾	多次	到	发电	总厂	检查	指导	工作	。
 18 | 
 19 | 在	总厂	所属	的	石景山	热电厂	，
 20 | 李鹏	首先	向	华北	电管局	、	电厂	负责人	详细	询问	了	目前	电厂	生产	、	职工	生活	和	华北	电网	向	首都	供电	、	供热	的	有关	情况	。
 21 | 随后	，
 22 | 他	又	实地	察看	了	发电机组	的	运行	情况	和	电厂	一号机	、	二号机	控制室	。
 23 | 在	控制室	，
 24 | 李鹏	与	职工	们	一一	握手	，
 25 | 向	大家	表示	慰问	。
 26 | 他	说	，
 27 | 在	一九九八年	即将	到来	之际	，
 28 | 有	机会	再次	回到	石景山	发电	总厂	，
 29 | 感到	十分	高兴	。
 30 | 李鹏	亲切	地	说	：
 31 | 『	今天	我	看到	了	许多	新	的	、	年轻	的	面孔	，
 32 | 这	说明	在	老	同志	们	作出	贡献	退	下来	后	，
 33 | 新	一	代	的	年轻人	成长	起来	了	、	成熟	起来	了	，
 34 | 我	感到	十分	欣慰	。	』
 35 | 
 36 | （	A	、	B	）
 37 | 
 38 | 李鹏	说	：
 39 | “	作为	首都	的	电力	工作者	，
 40 | 你们	为	首都	的	各	项	重大	活动	的	顺利	进行	，
 41 | 为	保障	人民	群众	的	工作	、	生活	和	学习	，
 42 | 为	促进	首都	经济	的	发展	作出	了	自己	的	贡献	。
 43 | 明天	就	是	元旦	，
 44 | 你们	还	有	许多	同志	要	坚守	岗位	，
 45 | 我	向	你们	、	向	全体	电力	工作者	表示	感谢	。
 46 | 现在	，
 47 | 我们	的	首都	已经	结束	了	拉	闸	限	电	的	历史	，
 48 | 希望	依靠	大家	，
 49 | 使	拉	闸	限	电	的	历史	永远	不	再	重演	。
 50 | 同时	，
 51 | 也	希望	你们	安全	生产	、	经济	调度	，
 52 | 实现	经济	增长	方式	的	转变	。	”
 53 | 李鹏	最后	向	电业	职工	，
 54 | 向	全	北京市	的	人民	拜年	，
 55 | 向	大家	致以	新春	的	问候	，
 56 | 祝愿	电力	事业	取得	新	的	成绩	，
 57 | 祝愿	北京市	在	改革	、	发展	和	稳定	的	各	项	工作	中	取得	新	的	成就	。
 58 | 
 59 | 参观	工厂	结束	后	，
 60 | 李鹏	又	来到	工厂	退休	职工	郭树范	和	闫戌麟	家	看望	慰问	，
 61 | 向	他们	拜年	。
 62 | 曾经	是	高级	工程师	的	郭树范	退休	前	一直	在	发电厂	从事	土建工程	建设	，
 63 | 退休	后	，
 64 | 与	老伴	一起	抚养	着	身体	欠佳	的	孙子	。
 65 | 李鹏	对	他们	倾心	照顾	下一代	表示	肯定	。
 66 | 他	说	：
 67 | “	人	老	了	，
 68 | 照顾	照顾	后代	也	是	一	件	可以	带	来	快乐	的	事	，
 69 | 当然	，
 70 | 对	孩子	们	不	能	溺爱	，
 71 | 要	让	他们	健康	成长	。	”
 72 | 在	老工人	闫戌麟	家	，
 73 | 当	李鹏	了解	到	老闫	退休	前	一直	都	是	厂里	的	先进	工作者	、	曾经	被	评为	北京市	“	五好	职工	”	，
 74 | 退休	后	仍然	为	改善	职工	的	住房	而	奔波	时	，
 75 | 十分	高兴	，
 76 | 对	他	为	工厂	建设	作出	的	贡献	表示	感谢	。
 77 | 在	郭	家	和	闫	家	，
 78 | 李鹏	都	具体	地	了解	了	他们	退休	后	的	生活	保障	问题	，
 79 | 并	与	一些	老	职工	一起	回忆	起	了	当年	建设	电厂	的	情景	。
 80 | 李鹏	说	：
 81 | “	当年	搞	建设	，
 82 | 条件	比	现在	差	多	了	，
 83 | 大家	也	很	少	计较	什么	，
 84 | 只	是	一心	想	着	把	电厂	建	好	。
 85 | 现在	条件	好	了	，
 86 | 但	艰苦奋斗	、	无私奉献	的	精神	可	不	能	丢	。	”
 87 | 李鹏	最后	祝	他们	新春	快乐	，
 88 | 身体	健康	，
 89 | 家庭	幸福	。
 90 | 
 91 | 陪同	考察	企业	并	看望	慰问	职工	的	国务院	有关	部门	和	北京市	负责人	还	有	：
 92 | 史大桢	、	高严	、	石秀诗	、	阳安江	等	。
 93 | 
 94 | 
 95 | 挂	起	红灯	迎	新年	（	图片	）
 96 | 
 97 | 元旦	来临	，
 98 | 安徽省	合肥市	长江路	悬挂	起	3300	盏	大	红	灯笼	，
 99 | 为	节日	营造	出	“	千	盏	灯笼	凌空	舞	，
100 | 十	里	长街	别样	红	”	的	欢乐	祥和	气氛	。


--------------------------------------------------------------------------------
/src/main/java/test/newton/GlobalNewtonMethod.java:
--------------------------------------------------------------------------------
  1 | package test.newton;
  2 |  
  3 | /**
  4 |  * @Author unclewang
  5 |  * @Date 2018-11-26 16:21
  6 |  */
  7 | public class GlobalNewtonMethod {
  8 | 	private double originalX;
  9 | 	private double delta;
 10 | 	private double sigma;
 11 | 	private double e;
 12 | 	private double maxCycle;
 13 |  
 14 | 	public GlobalNewtonMethod(double originalX, double delta, double sigma,
 15 | 			double e, double maxCycle) {
 16 | 		this.setOriginalX(originalX);
 17 | 		this.setDelta(delta);
 18 | 		this.setSigma(sigma);
 19 | 		this.setE(e);
 20 | 		this.setMaxCycle(maxCycle);
 21 | 	}
 22 |  
 23 | 	public double getOriginalX() {
 24 | 		return originalX;
 25 | 	}
 26 |  
 27 | 	public void setOriginalX(double originalX) {
 28 | 		this.originalX = originalX;
 29 | 	}
 30 |  
 31 | 	public double getDelta() {
 32 | 		return delta;
 33 | 	}
 34 |  
 35 | 	public void setDelta(double delta) {
 36 | 		this.delta = delta;
 37 | 	}
 38 |  
 39 | 	public double getSigma() {
 40 | 		return sigma;
 41 | 	}
 42 |  
 43 | 	public void setSigma(double sigma) {
 44 | 		this.sigma = sigma;
 45 | 	}
 46 |  
 47 | 	public double getE() {
 48 | 		return e;
 49 | 	}
 50 |  
 51 | 	public void setE(double e) {
 52 | 		this.e = e;
 53 | 	}
 54 |  
 55 | 	public double getMaxCycle() {
 56 | 		return maxCycle;
 57 | 	}
 58 |  
 59 | 	public void setMaxCycle(double maxCycle) {
 60 | 		this.maxCycle = maxCycle;
 61 | 	}
 62 |  
 63 | 	/**
 64 | 	 * 原始函数
 65 | 	 * 
 66 | 	 * @param x 变量
 67 | 	 * @return 原始函数的值
 68 | 	 */
 69 | 	public double getOriginal(double x) {
 70 | 		return x * x - 3 * x + 2;
 71 | 	}
 72 |  
 73 | 	/**
 74 | 	 * 一次导函数
 75 | 	 * 
 76 | 	 * @param x 变量
 77 | 	 * @return 一次导函数的值
 78 | 	 */
 79 | 	public double getOneDerivative(double x) {
 80 | 		return 2 * x - 3;
 81 | 	}
 82 |  
 83 | 	/**
 84 | 	 * 二次导函数
 85 | 	 * 
 86 | 	 * @param x 变量
 87 | 	 * @return 二次导函数的值
 88 | 	 */
 89 | 	public double getTwoDerivative(double x) {
 90 | 		return 2;
 91 | 	}
 92 |  
 93 | 	/**
 94 | 	 * 利用牛顿法求解
 95 | 	 * 
 96 | 	 * @return
 97 | 	 */
 98 | 	public double getGlobalNewtonMin() {
 99 | 		double x = this.getOriginalX();
100 | 		double y = 0;
101 | 		double k = 1;
102 | 		// 更新公式
103 | 		while (k <= this.getMaxCycle()) {
104 | 			y = this.getOriginal(x);
105 | 			double one = this.getOneDerivative(x);
106 | 			if (Math.abs(one) <= e) {
107 | 				break;
108 | 			}
109 | 			double two = this.getTwoDerivative(x);
110 | 			double dk = -one / two;// 搜索的方向
111 | 			double m = 0;
112 | 			double mk = 0;
113 | 			while (m < 20) {
114 | 				double left = this.getOriginal(x + Math.pow(this.getDelta(), m)
115 | 						* dk);
116 | 				double right = this.getOriginal(x) + this.getSigma()
117 | 						* Math.pow(this.getDelta(), m)
118 | 						* this.getOneDerivative(x) * dk;
119 | 				if (left <= right) {
120 | 					mk = m;
121 | 					break;
122 | 				}
123 | 				m++;
124 | 			}
125 | 			x = x + Math.pow(this.getDelta(), mk)*dk;
126 | 			k++;
127 | 		}
128 | 		return y;
129 | 	}
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/src/main/java/test/lucene/rewriteTokenize/IKTokenizer4Lucene7.java:
--------------------------------------------------------------------------------
 1 | package test.lucene.rewriteTokenize;
 2 | 
 3 | import org.apache.lucene.analysis.Tokenizer;
 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 6 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 7 | import org.wltea.analyzer.core.IKSegmenter;
 8 | import org.wltea.analyzer.core.Lexeme;
 9 | 
10 | import java.io.IOException;
11 | 
12 | /**
13 |  * 因為Analyzer的createComponents方法API改變了需要重新實現分詞器
14 |  *
15 |  * @author THINKPAD
16 |  */
17 | public class IKTokenizer4Lucene7 extends Tokenizer {
18 | 
19 |     // IK分詞器實現
20 |     private IKSegmenter _IKImplement;
21 | 
22 |     // 詞元文本屬性
23 |     private final CharTermAttribute termAtt;
24 |     // 詞元位移屬性
25 |     private final OffsetAttribute offsetAtt;
26 |     // 詞元分類屬性（該屬性分類參考org.wltea.analyzer.core.Lexeme中的分類常量）
27 |     private final TypeAttribute typeAtt;
28 |     // 記錄最後一個詞元的結束位置
29 |     private int endPosition;
30 | 
31 |     /**
32 |      * @param in
33 |      * @param useSmart
34 |      */
35 |     public IKTokenizer4Lucene7(boolean useSmart) {
36 |         super();
37 |         offsetAtt = addAttribute(OffsetAttribute.class);
38 |         termAtt = addAttribute(CharTermAttribute.class);
39 |         typeAtt = addAttribute(TypeAttribute.class);
40 |         _IKImplement = new IKSegmenter(input, useSmart);
41 |     }
42 | 
43 |     /*
44 |      * (non-Javadoc)
45 |      *
46 |      * @see org.apache.lucene.analysis.TokenStream#incrementToken()
47 |      */
48 |     @Override
49 |     final public boolean incrementToken() throws IOException {
50 |         // 清除所有的詞元屬性
51 |         clearAttributes();
52 |         Lexeme nextLexeme = _IKImplement.next();
53 |         if (nextLexeme != null) {
54 |             // 將Lexeme轉成Attributes
55 |             // 設置詞元文本
56 |             termAtt.append(nextLexeme.getLexemeText());
57 |             // 設置詞元長度
58 |             termAtt.setLength(nextLexeme.getLength());
59 |             // 設置詞元位移
60 |             offsetAtt.setOffset(nextLexeme.getBeginPosition(),
61 |                                 nextLexeme.getEndPosition());
62 |             // 記錄分詞的最後位置
63 |             endPosition = nextLexeme.getEndPosition();
64 |             System.out.println(endPosition);
65 |             // 記錄詞元分類
66 |             typeAtt.setType(nextLexeme.getLexemeTypeString());
67 |             // 返會true告知還有下個詞元
68 |             return true;
69 |         }
70 |         // 返會false告知詞元輸出完畢
71 |         return false;
72 |     }
73 | 
74 |     /*
75 |      * (non-Javadoc)
76 |      *
77 |      * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
78 |      */
79 |     @Override
80 |     public void reset() throws IOException {
81 |         super.reset();
82 |         _IKImplement.reset(input);
83 |     }
84 | 
85 |     @Override
86 |     public final void end() {
87 |         // set final offset
88 |         int finalOffset = correctOffset(this.endPosition);
89 |         offsetAtt.setOffset(finalOffset, finalOffset);
90 |     }
91 | }


--------------------------------------------------------------------------------
/python/bilstmcrf/util.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import logging
 3 | 
 4 | 
 5 | def get_logger(filename):
 6 |     logger = logging.getLogger('logger')
 7 |     logger.setLevel(logging.DEBUG)
 8 |     logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 9 |     handler = logging.FileHandler(filename)
10 |     handler.setLevel(logging.DEBUG)
11 |     handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
12 |     logging.getLogger().addHandler(handler)
13 |     return logger
14 | 
15 | 
def str2bool(v):
    """Convert a command-line style truth value to ``bool``.

    Intended as an ``argparse`` ``type=`` callable. Matching is case-insensitive
    and ignores surrounding whitespace. A value that is already a ``bool`` is
    returned unchanged.

    Args:
        v: A ``bool``, or one of ``yes/true/t/y/1`` or ``no/false/f/n/0``.

    Returns:
        The corresponding boolean.

    Raises:
        argparse.ArgumentTypeError: If *v* is not a recognised truth value.
    """
    if isinstance(v, bool):
        return v
    value = v.strip().lower()
    if value in ('yes', 'true', 't', 'y', '1'):
        return True
    if value in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
24 | 
25 | 
# Originally meant to produce one sequence length per batch item, all equal to max_len.
def generate_sequence_len(max_len, batch_size):
    """Return a list containing *max_len* repeated *batch_size* times."""
    return [max_len] * batch_size
32 | 
33 | 
def conlleval(label_predict, label_path, metric_path):
    """Score tag predictions per class and write the results to disk.

    Tags 0, 1, 2 and 3 are reported as B, E, M and S respectively. Triples whose
    ``char`` equals ``0`` are skipped entirely. Triples with any other tag value
    are written to the label file but not counted.

    Args:
        label_predict: Iterable of sentences, each an iterable of
            ``(char, tag, tag_)`` triples. The counts are grouped by ``tag`` and a
            triple counts as correct when ``tag_ == tag``.
        label_path: Output file receiving one ``"char tag tag_"`` line per counted
            or uncounted (non-skipped) triple, with a blank line after every
            sentence.
        metric_path: Output file receiving the metric lines.

    Returns:
        The list of five metric lines (overall, B, E, M, S). A ratio whose
        denominator is zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    tag_names = {0: "B", 1: "E", 2: "M", 3: "S"}
    totals = dict.fromkeys(tag_names, 0)
    corrects = dict.fromkeys(tag_names, 0)

    def _ratio(hit, count):
        # A class that never occurs has no meaningful accuracy; report 0.0.
        return hit / count if count else 0.0

    lines = []
    for sent_result in label_predict:
        for char, tag, tag_ in sent_result:
            if char == 0:
                continue
            if tag in totals:
                totals[tag] += 1
                if tag_ == tag:
                    corrects[tag] += 1
            lines.append("{} {} {}\n".format(char, tag, tag_))
        lines.append("\n")

    # Explicit encoding: both files may contain non-ASCII text.
    with open(label_path, "w", encoding="utf-8") as fw:
        fw.writelines(lines)

    total = sum(totals.values())
    correct = sum(corrects.values())
    metrics = ["测试的字数为{}，其中分词正确的字数为{}，准确率为{}\n".format(total, correct, _ratio(correct, total))]
    for tag, name in tag_names.items():
        metrics.append("{0}的字数为{1}，其中{0}被正确预测的字数为{2}，准确率为{3}\n".format(
            name, totals[tag], corrects[tag], _ratio(corrects[tag], totals[tag])))

    with open(metric_path, "w", encoding="utf-8") as fw:
        fw.writelines(metrics)
    return metrics
79 | 


--------------------------------------------------------------------------------
/src/main/java/lucene/SegmentWrapper.java:
--------------------------------------------------------------------------------
  1 | package lucene;
  2 | 
  3 | import lombok.Data;
  4 | import segment.Segment;
  5 | 
  6 | import java.io.IOException;
  7 | import java.io.Reader;
  8 | import java.util.Iterator;
  9 | import java.util.List;
 10 | 
 11 | @Data
 12 | public class SegmentWrapper {
 13 |     /**
 14 |      * 输入
 15 |      */
 16 |     private Reader input;
 17 |     /**
 18 |      * 分词器
 19 |      */
 20 |     private Segment segment;
 21 |     /**
 22 |      * 分词结果
 23 |      */
 24 |     private Iterator<Atom> iterator;
 25 |     /**
 26 |      * term的偏移量，由于wrapper是按行读取的，必须对term.offset做一个校正
 27 |      */
 28 |     int offset;
 29 |     /**
 30 |      * 缓冲区大小
 31 |      */
 32 |     private static final int BUFFER_SIZE = 512;
 33 |     /**
 34 |      * 缓冲区
 35 |      */
 36 |     private char[] buffer = new char[BUFFER_SIZE];
 37 |     /**
 38 |      * 缓冲区未处理的下标
 39 |      */
 40 |     private int remainSize = 0;
 41 | 
 42 |     public SegmentWrapper(Reader reader, Segment segment) {
 43 |         this.input = reader;
 44 |         this.segment = segment;
 45 |     }
 46 | 
 47 |     /**
 48 |      * 重置分词器
 49 |      *
 50 |      * @param reader
 51 |      */
 52 |     public void reset(Reader reader) {
 53 |         input = reader;
 54 |         offset = 0;
 55 |         iterator = null;
 56 |     }
 57 | 
 58 |     public Atom next() throws IOException {
 59 |         if (iterator != null && iterator.hasNext()) {
 60 |             return iterator.next();
 61 |         }
 62 |         System.out.println("------------");
 63 |         String line = readLine();
 64 |         if (line == null) {
 65 |             System.out.println("-******");
 66 |             return null;
 67 |         }
 68 |         List<Atom> atomList = segment.seg(line);
 69 |         if (atomList.size() == 0) {
 70 |             return null;
 71 |         }
 72 |         offset += line.length();
 73 |         iterator = atomList.iterator();
 74 |         return iterator.next();
 75 |     }
 76 | 
 77 |     private String readLine() throws IOException {
 78 |         int offset = 0;
 79 |         int length = BUFFER_SIZE;
 80 |         if (remainSize > 0) {
 81 |             offset = remainSize;
 82 |             length -= remainSize;
 83 |         }
 84 |         int n = input.read(buffer, offset, length);
 85 |         if (n < 0) {
 86 |             if (remainSize != 0) {
 87 |                 String lastLine = new String(buffer, 0, remainSize);
 88 |                 remainSize = 0;
 89 |                 return lastLine;
 90 |             }
 91 |             return null;
 92 |         }
 93 |         n += offset;
 94 | 
 95 |         int eos = lastIndexOfEos(buffer, n);
 96 |         String line = new String(buffer, 0, eos);
 97 |         remainSize = n - eos;
 98 |         System.out.println("remainSize=" + remainSize);
 99 |         System.arraycopy(buffer, eos, buffer, 0, remainSize);
100 |         return line;
101 |     }
102 | 
103 |     private int lastIndexOfEos(char[] buffer, int length) {
104 |         for (int i = length - 1; i > 0; i--) {
105 |             if (buffer[i] == '\n' || CharType.get(buffer[i]) == CharType.CT_DELIMITER) {
106 |                 return i + 1;
107 |             }
108 |         }
109 |         return length;
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/src/main/resources/lda/doc/1:
--------------------------------------------------------------------------------
  1 | 迈向	充满	希望	的	新	世纪	——	一九九八年	新年	讲话	（	附	图片	1	张	）
  2 | 
  3 | 中共中央	总书记	、	国家	主席	江泽民
  4 | 
  5 | （	一九九七年	十二月	三十一日	）
  6 | 
  7 | 12月	31日	，
  8 | 中共中央	总书记	、	国家	主席	江泽民	发表	1998年	新年	讲话	《	迈向	充满	希望	的	新	世纪	》	。
  9 | （	新华社	记者	兰红光	摄	）
 10 | 
 11 | 同胞	们	、	朋友	们	、	女士	们	、	先生	们	：
 12 | 
 13 | 在	1998年	来临	之际	，
 14 | 我	十分	高兴	地	通过	中央	人民	广播	电台	、	中国	国际	广播	电台	和	中央	电视台	，
 15 | 向	全国	各	族	人民	，
 16 | 向	香港	特别	行政区	同胞	、	澳门	和	台湾	同胞	、	海外	侨胞	，
 17 | 向	世界	各	国	 的	朋友	们	，
 18 | 致以	诚挚	的	问候	和	良好	的	祝愿	！
 19 | 
 20 | 1997年	，
 21 | 是	中国	发展	历史	上	非常	重要	的	很	不	平凡	的	一	年	。
 22 | 中国	人民	决心	继承	邓小平	同志	的	遗志	，
 23 | 继续	把	建设	有	中国	特色	社会主义	事业	推向	前进	。
 24 | 中国	政府	顺利	恢复	对	香港	行使	主权	，
 25 | 并	按照	“	一国两制	”	、	“	港人治港	”	、	高度	自治	的	方针	保持	香港	的	繁荣	稳定	。
 26 | 中国	共产党	成功	地	召开	了	第十五	次	全国	代表大会	，
 27 | 高举	邓小平理论	伟大	旗帜	，
 28 | 总结	百年	历史	，
 29 | 展望	新	的	世纪	，
 30 | 制定	了	中国	跨	世纪	发展	的	行动	纲领	。
 31 | 
 32 | 在	这	一	年	中	，
 33 | 中国	的	改革	开放	和	现代化	建设	继续	向前	迈进	。
 34 | 国民经济	保持	了	“	高	增长	、	低	通胀	”	的	良好	发展	态势	。
 35 | 农业	生产	再次	获得	好	的	收成	，
 36 | 企业	改革	继续	深化	，
 37 | 人民	生活	进一步	改善	。
 38 | 对外	经济	技术	合作	与	交流	不断	扩大	。
 39 | 民主	法制	建设	、	精神文明	建设	和	其他	各	项	事业	都	有	新	的	进展	。
 40 | 我们	十分	关注	最近	一个	时期	一些	国家	和	地区	发生	的	金融	风波	，
 41 | 我们	相信	通过	这些	国家	和	地区	的	努力	以及	有关	的	国际	合作	，
 42 | 情况	会	逐步	得到	缓解	。
 43 | 总的来说	，
 44 | 中国	改革	和	发展	的	全局	继续	保持	了	稳定	。
 45 | 
 46 | 在	这	一	年	中	，
 47 | 中国	的	外交	工作	取得	了	重要	成果	。
 48 | 通过	高层	互访	，
 49 | 中国	与	美国	、	俄罗斯	、	法国	、	日本	等	大国	确定	了	双方	关系	未来	发展	的	目标	和	指导	方针	。
 50 | 中国	与	周边	国家	和	广大	发展中国家	的	友好	合作	进一步	加强	。
 51 | 中国	积极	参与	亚	太	经合	组织	的	活动	，
 52 | 参加	了	东盟	—	中	日	韩	和	中国	—	东盟	首脑	非正式	会晤	。
 53 | 这些	外交	活动	，
 54 | 符合	和平	与	发展	的	时代	主题	，
 55 | 顺应	世界	走向	多极化	的	趋势	，
 56 | 对于	促进	国际	社会	的	友好	合作	和	共同	发展	作出	了	积极	的	贡献	。
 57 | 
 58 | 1998年	，
 59 | 中国	人民	将	满怀信心	地	开创	新	的	业绩	。
 60 | 尽管	我们	在	经济	社会	发展	中	还	面临	不少	困难	，
 61 | 但	我们	有	邓小平理论	的	指引	，
 62 | 有	改革	开放	近	20	年	来	取得	的	伟大	成就	和	积累	的	丰富	经验	，
 63 | 还	有	其他	的	各	种	有利	条件	，
 64 | 我们	一定	能够	克服	这些	困难	，
 65 | 继续	稳步	前进	。
 66 | 只要	我们	进一步	解放思想	，
 67 | 实事求是	，
 68 | 抓住	机遇	，
 69 | 开拓进取	，
 70 | 建设	有	中国	特色	社会主义	的	道路	就	会	越	走	越	宽广	。
 71 | 
 72 | 实现	祖国	的	完全	统一	，
 73 | 是	海内外	全体	中国	人	的	共同	心愿	。
 74 | 通过	中	葡	双方	的	合作	和	努力	，
 75 | 按照	“	一国两制	”	方针	和	澳门	《	基本法	》	，
 76 | 1999年	12月	澳门	的	回归	一定	能够	顺利	实现	。
 77 | 
 78 | 台湾	是	中国	领土	不	可	分割	的	一	部分	。
 79 | 完成	祖国	统一	，
 80 | 是	大势所趋	，
 81 | 民心所向	。
 82 | 任何	企图	制造	“	两	个	中国	”	、	“	一中一台	”	、	“	台湾	独立	”	的	图谋	，
 83 | 都	注定	要	失败	。
 84 | 希望	台湾	当局	以	民族	大义	为	重	，
 85 | 拿	出	诚意	，
 86 | 采取	实际	的	行动	，
 87 | 推动	两岸	经济	文化	交流	和	人员	往来	，
 88 | 促进	两岸	直接	通邮	、	通航	、	通商	的	早日	实现	，
 89 | 并	尽早	回应	我们	发出	的	在	一个	中国	的	原则	下	两岸	进行	谈判	的	郑重	呼吁	。
 90 | 
 91 | 环顾	全球	，
 92 | 日益	密切	的	世界	经济	联系	，
 93 | 日新月异	的	科技	进步	，
 94 | 正在	为	各	国	 经济	的	发展	提供	历史	机遇	。
 95 | 但是	，
 96 | 世界	还	不	安宁	。
 97 | 南北	之间	的	贫富	差距	继续	扩大	；
 98 | 局部	冲突	时有发生	；
 99 | 不	公正	不	合理	的	旧	的	国际	政治	经济	秩序	还	没有	根本	改变	；
100 | 发展中国家	在	激烈	的	国际	经济	竞争	中	仍	处于	弱势	地位	；
101 | 人类	的	生存	与	发展	还	面临	种种	威胁	和	挑战	。
102 | 和平	与	发展	的	前景	是	光明	的	，
103 | 21	世纪	将	是	充满	希望	的	世纪	。
104 | 但	前进	的	道路	不	会	也	不	可能	一帆风顺	，
105 | 关键	是	世界	各	国	 人民	要	进一步	团结	起来	，
106 | 共同	推动	早日	建立	公正	合理	的	国际	政治	经济	新	秩序	。
107 | 
108 | 中国	政府	将	继续	坚持	奉行	独立自主	的	和平	外交	政策	，
109 | 在	和平共处	五	项	原则	的	基础	上	努力	发展	同	世界	各	国	 的	友好	关系	。
110 | 中国	愿意	加强	同	联合国	和	其他	国际	组织	的	协调	，
111 | 促进	在	扩大	经贸	科技	交流	、	保护	环境	、	消除	贫困	、	打击	国际	犯罪	等	方面	的	国际	合作	。
112 | 中国	永远	是	维护	世界	和平	与	稳定	的	重要	力量	。
113 | 中国	人民	愿	与	世界	各	国	 人民	一道	，
114 | 为	开创	持久	和平	、	共同	发展	的	新	世纪	而	不懈努力	！
115 | 
116 | 在	这	辞旧迎新	的	美好	时刻	，
117 | 我	祝	大家	新年	快乐	，
118 | 家庭	幸福	！
119 | 


--------------------------------------------------------------------------------
/src/main/resources/lda/doc/2:
--------------------------------------------------------------------------------
  1 | 在	十五大	精神	指引	下	胜利	前进	——	元旦	献辞
  2 | 
  3 | 我们	即将	以	丰收	的	喜悦	送	走	牛年	，
  4 | 以	昂扬	的	斗志	迎来	虎年	。
  5 | 我们	伟大	祖国	在	新	的	一	年	，
  6 | 将	是	充满	生机	、	充满	希望	的	一	年	。
  7 | 
  8 | 刚刚	过去	的	一	年	，
  9 | 大气磅礴	，
 10 | 波澜壮阔	。
 11 | 在	这	一	年	，
 12 | 以	江泽民	同志	为	核心	的	党中央	，
 13 | 继承	邓小平	同志	的	遗志	，
 14 | 高举	邓小平理论	的	伟大	旗帜	，
 15 | 领导	全党	和	全国	各	族	人民	坚定不移	地	沿着	建设	有	中国	特色	社会主义	道路	阔步	前进	，
 16 | 写	下	了	改革	开放	和	社会主义	现代化	建设	的	辉煌	篇章	。
 17 | 顺利	地	恢复	对	香港	行使	主权	，
 18 | 胜利	地	召开	党	的	第十五	次	全国	代表大会	———	两	件	大事	办	得	圆满	成功	。
 19 | 国民经济	稳中求进	，
 20 | 国家	经济	实力	进一步	增强	，
 21 | 人民	生活	继续	改善	，
 22 | 对外	经济	技术	交流	日益	扩大	。
 23 | 在	国际	金融	危机	的	风浪	波及	许多	国家	的	情况	下	，
 24 | 我国	保持	了	金融	形势	和	整个	经济	形势	的	稳定	发展	。
 25 | 社会主义	精神文明	建设	和	民主	法制	建设	取得	新	的	成绩	，
 26 | 各	项	社会	事业	全面	进步	。
 27 | 外交	工作	取得	可喜	的	突破	，
 28 | 我国	的	国际	地位	和	国际	威望	进一步	提高	。
 29 | 实践	使	亿万	人民	对	邓小平理论	更加	信仰	，
 30 | 对	以	江泽民	同志	为	核心	的	党中央	更加	信赖	，
 31 | 对	伟大	祖国	的	光辉	前景	更加	充满	信心	。
 32 | 
 33 | 1998年	，
 34 | 是	全面	贯彻	落实	党	的	十五大	提出	的	任务	的	第一	年	，
 35 | 各	条	战线	改革	和	发展	的	任务	都	十分	繁重	，
 36 | 有	许多	深	层次	的	矛盾	和	问题	有待	克服	和	解决	，
 37 | 特别	是	国有	企业	改革	已经	进入	攻坚	阶段	。
 38 | 我们	必须	进一步	深入	学习	和	掌握	党	的	十五大	精神	，
 39 | 统揽全局	，
 40 | 精心	部署	，
 41 | 狠抓	落实	，
 42 | 团结	一致	，
 43 | 艰苦奋斗	，
 44 | 开拓	前进	，
 45 | 为	夺取	今年	改革	开放	和	社会主义	现代化	建设	的	新	胜利	而	奋斗	。
 46 | 
 47 | 今年	是	党	的	十一	届	三中全会	召开	20	周年	，
 48 | 是	我们	党	和	国家	实现	伟大	的	历史	转折	、	进入	改革	开放	历史	新	时期	的	20	周年	。
 49 | 在	新	的	一	年	里	，
 50 | 大力	发扬	十一	届	三中全会	以来	我们	党	所	恢复	的	优良	传统	和	在	新	的	历史	条件	下	形成	的	优良	作风	，
 51 | 对于	完成	好	今年	的	各	项	任务	具有	十分	重要	的	意义	。
 52 | 
 53 | 我们	要	更	好	地	坚持	解放思想	、	实事求是	的	思想	路线	。
 54 | 解放思想	、	实事求是	，
 55 | 是	邓小平理论	的	精髓	。
 56 | 实践	证明	，
 57 | 只有	解放思想	、	实事求是	，
 58 | 才	能	冲破	各	种	不	切合	实际	的	或者	过时	的	观念	的	束缚	，
 59 | 真正	做到	尊重	、	认识	和	掌握	客观	规律	，
 60 | 勇于	突破	，
 61 | 勇于	创新	，
 62 | 不断	开创	社会主义	现代化	建设	的	新	局面	。
 63 | 党	的	十五大	是	我们	党	解放思想	、	实事求是	的	新	的	里程碑	。
 64 | 进一步	认真	学习	和	掌握	十五大	精神	，
 65 | 解放思想	、	实事求是	，
 66 | 我们	的	各	项	事业	就	能	结	出	更加	丰硕	的	成果	。
 67 | 
 68 | 我们	要	更	好	地	坚持	以	经济	建设	为	中心	。
 69 | 各	项	工作	必须	以	经济	建设	为	中心	，
 70 | 是	邓小平理论	的	基本	观点	，
 71 | 是	党	的	基本	路线	的	核心	内容	，
 72 | 近	20	年	来	的	实践	证明	，
 73 | 坚持	这个	中心	，
 74 | 是	完全	正确	的	。
 75 | 今后	，
 76 | 我们	能否	把	建设	有	中国	特色	社会主义	伟大	事业	全面	推向	21	世纪	，
 77 | 关键	仍然	要	看	能否	把	经济	工作	搞	上去	。
 78 | 各级	领导	干部	要	切实	把	精力	集中	到	贯彻	落实	好	中央	关于	今年	经济	工作	的	总体	要求	和	各	项	重要	任务	上	来	，
 79 | 不断	提高	领导	经济	建设	的	能力	和	水平	。
 80 | 
 81 | 我们	要	更	好	地	坚持	“	两手抓	、	两手	都	要	硬	”	的	方针	。
 82 | 在	坚持	以	经济	建设	为	中心	的	同时	，
 83 | 积极	推进	社会主义	精神文明	建设	和	民主	法制	建设	，
 84 | 是	建设	富强	、	民主	、	文明	的	社会主义	现代化	国家	的	重要	内容	。
 85 | 实践	证明	，
 86 | 经济	建设	的	顺利	进行	，
 87 | 离	不	开	精神文明	建设	和	民主	法制	建设	的	保证	。
 88 | 党	的	十五大	依据	邓小平理论	和	党	的	基本	路线	提出	的	党	在	社会主义	初级	阶段	经济	、	政治	、	文化	的	基本	纲领	，
 89 | 为	“	两手抓	、	两手	都	要	硬	”	提供	了	新	的	理论	根据	，
 90 | 提出	了	更	高	要求	，
 91 | 现在	的	关键	是	认真	抓好	落实	。
 92 | 
 93 | 我们	要	更	好	地	发扬	求真务实	、	密切	联系	群众	的	作风	。
 94 | 这	是	把	党	的	方针	、	政策	落到实处	，
 95 | 使	改革	和	建设	取得	胜利	的	重要	保证	。
 96 | 在	当前	改革	进一步	深化	，
 97 | 经济	不断	发展	，
 98 | 同时	又	出现	一些	新	情况	、	新	问题	和	新	困难	的	形势	下	，
 99 | 更	要	发扬	这样	的	好	作风	。
100 | 要	尊重	群众	的	意愿	，
101 | 重视	群众	的	首创	精神	，
102 | 关心	群众	的	生活	疾苦	。
103 | 江泽民	同志	最近	强调	指出	，
104 | 要	大力	倡导	说	实话	、	办	实事	、	鼓	实劲	、	讲	实效	的	作风	，
105 | 坚决	制止	追求	表面文章	，
106 | 搞	花架子	等	形式主义	，
107 | 坚决	杜绝	脱离	群众	、	脱离	实际	、	浮躁	虚夸	等	官僚主义	。
108 | 这	是	非常	重要	的	。
109 | 因此	，
110 | 各级	领导	干部	务必	牢记	全心全意	为	人民	服务	的	宗旨	，
111 | 在	勤政廉政	、	艰苦奋斗	方面	以身作则	，
112 | 当	好	表率	。
113 | 
114 | 1998	，
115 | 瞩目	中华	。
116 | 新	的	机遇	和	挑战	，
117 | 催	人	进取	；
118 | 新	的	目标	和	征途	，
119 | 催	人	奋发	。
120 | 英雄	的	中国	人民	在	以	江泽民	同志	为	核心	的	党中央	坚强	领导	和	党	的	十五大	精神	指引	下	，
121 | 更	高	地	举起	邓小平理论	的	伟大	旗帜	，
122 | 团结	一致	，
123 | 扎实	工作	，
124 | 奋勇	前进	，
125 | 一定	能够	创造	出	更加	辉煌	的	业绩	！


--------------------------------------------------------------------------------
/src/main/java/segment/bilstmcrf/BLCSegment.java:
--------------------------------------------------------------------------------
 1 | package segment.bilstmcrf;
 2 | 
 3 | import com.alibaba.fastjson.JSON;
 4 | import com.alibaba.fastjson.JSONArray;
 5 | import lombok.extern.slf4j.Slf4j;
 6 | import lucene.Atom;
 7 | import org.apache.http.HttpEntity;
 8 | import org.apache.http.client.ClientProtocolException;
 9 | import org.apache.http.client.ResponseHandler;
10 | import org.apache.http.client.methods.HttpGet;
11 | import org.apache.http.client.utils.URIBuilder;
12 | import org.apache.http.impl.client.CloseableHttpClient;
13 | import org.apache.http.impl.client.HttpClients;
14 | import org.apache.http.util.EntityUtils;
15 | import org.junit.jupiter.api.Test;
16 | import segment.Segment;
17 | 
18 | import java.io.IOException;
19 | import java.net.URI;
20 | import java.net.URISyntaxException;
21 | import java.util.List;
22 | 
23 | /**
24 |  * @Author unclewang
25 |  * @Date 2018-12-12 12:58
26 |  * BiLSTM+CRF分词，http访问
27 |  */
28 | @Slf4j
29 | public class BLCSegment implements Segment {
30 |     //根据 /Users/unclewang/Idea_Projects/xinlp/python/bilstmcrf/modelserver.py 这个文件配置决定
31 |     public static final String IP = "192.168.1.104";
32 |     public static final Integer PORT = 9006;
33 | 
34 |     @Override
35 |     public List<Atom> seg(String text) {
36 |         try {
37 |             JSONArray jsonArray = getTokens(text);
38 |             char[] chars = text.toCharArray();
39 |             assert jsonArray.size() == chars.length;
40 |             StringBuilder sb = new StringBuilder();
41 |             for (int i = 0; i < jsonArray.size(); i++) {
42 |                 sb.append(chars[i]);
43 |                 if ("S".equals(jsonArray.get(i).toString()) || "E".equals(jsonArray.get(i).toString())) {
44 |                     sb.append("\t");
45 |                 }
46 |             }
47 |             String[] strings = sb.toString().trim().split("[\t\n]");
48 |             return strings2AtomList(strings);
49 |         } catch (URISyntaxException | IOException e) {
50 |             e.printStackTrace();
51 |         }
52 |         log.error("BiLSTM+CRF分词出问题，赶快过来看看");
53 |         return null;
54 |     }
55 | 
56 |     public JSONArray getTokens(String sent) throws URISyntaxException, IOException {
57 |         CloseableHttpClient httpClient = HttpClients.createDefault();
58 |         ResponseHandler<String> responseHandler = (response) -> {
59 |             int status = response.getStatusLine().getStatusCode();
60 |             if (status >= 200 && status < 300) {
61 |                 HttpEntity entity = response.getEntity();
62 |                 return entity != null ? EntityUtils.toString(entity, "UTF-8")
63 |                         : null;
64 |             } else if (status == 404) {
65 |                 return "404";
66 |             } else {
67 |                 System.out.println("Error occured, statusLine : " + status);
68 |                 throw new ClientProtocolException(
69 |                         "Unexpected response status: " + status);
70 |             }
71 |         };
72 | 
73 |         URI uri = new URIBuilder()
74 |                 .setScheme("http")
75 |                 .setHost(IP)
76 |                 .setPort(PORT)
77 |                 .setPath("/predict")
78 |                 .setParameter("sent", sent)
79 |                 .build();
80 |         HttpGet get = new HttpGet(uri);
81 |         get.setHeader("Accept-Encoding", "gzip,deflate,sdch");
82 |         String tokens = httpClient.execute(get, responseHandler);
83 |         System.out.println(tokens);
84 |         return JSON.parseArray(tokens);
85 |     }
86 | 
87 |     @Test
88 |     public void test() {
89 |         seg("碰到的一个问题 - 小橙子宝贝 - 博客园");
90 |     }
91 | 
92 | }
93 | 


--------------------------------------------------------------------------------
/src/main/java/lda/Documents.java:
--------------------------------------------------------------------------------
  1 | package lda;
  2 | 
  3 | import lombok.Data;
  4 | import org.apache.commons.io.FileUtils;
  5 | import org.junit.jupiter.api.Test;
  6 | 
  7 | import java.io.File;
  8 | import java.io.IOException;
  9 | import java.util.*;
 10 | import java.util.regex.Matcher;
 11 | import java.util.regex.Pattern;
 12 | import java.util.stream.Collectors;
 13 | 
 14 | @Data
 15 | public class Documents {
 16 |     private ArrayList<Doc> docs;
 17 |     private static ArrayList<String> indexToTermList;
 18 |     private Map<String, Integer> termToIndexMap;
 19 |     private Map<String, Integer> termCountMap;
 20 |     
 21 |     public Documents() {
 22 |         docs = new ArrayList<>();
 23 |         termToIndexMap = new HashMap<>();
 24 |         indexToTermList = new ArrayList<>();
 25 |         termCountMap = new HashMap<>();
 26 |     }
 27 |     
 28 |     public static ArrayList<String> getIndexToTermList() {
 29 |         return indexToTermList;
 30 |     }
 31 |     
 32 |     public void readDocs(String docsPath) {
 33 |         for (File docFile : Objects.requireNonNull(new File(docsPath).listFiles())) {
 34 |             Doc doc = null;
 35 |             try {
 36 |                 doc = Doc.create(docFile.getAbsolutePath(), termToIndexMap, indexToTermList, termCountMap);
 37 |             } catch (IOException e) {
 38 |                 e.printStackTrace();
 39 |             }
 40 |             docs.add(doc);
 41 |         }
 42 |     }
 43 | }
 44 | 
 45 | @Data
 46 | class Doc {
 47 |     private String docName;
 48 |     private int[] docWords;
 49 |     private static final Pattern PATTERN = Pattern.compile(".*[a-zA-Z]+.*");
 50 |     private static final Pattern CHINESE_PATTERN = Pattern.compile(".*[a-zA-Z]+.*");
 51 |     
 52 |     public static Doc create(String absolutePath, Map<String, Integer> termToIndexMap, ArrayList<String> indexToTermList, Map<String, Integer> termCountMap) throws IOException {
 53 |         Doc doc = new Doc();
 54 |         
 55 |         doc.docName = absolutePath;
 56 |         List<String> docLines = FileUtils.readLines(new File(absolutePath), "UTF8");
 57 |         ArrayList<String> words = new ArrayList<>();
 58 |         docLines.forEach(line -> {
 59 |             StringTokenizer strTok = new StringTokenizer(line);
 60 |             while (strTok.hasMoreTokens()) {
 61 |                 String token = strTok.nextToken().replace(".", "").replace(">", "").replace("<", "");
 62 |                 words.add(token.toLowerCase().trim());
 63 |             }
 64 |         });
 65 |         List<String> collect = words.stream().filter(s -> isNoiseWord(s, CHINESE_PATTERN)).collect(Collectors.toList());
 66 |         doc.docWords = new int[collect.size()];
 67 |         for (int i = 0; i < collect.size(); i++) {
 68 |             String word = collect.get(i);
 69 |             if (!termToIndexMap.containsKey(word)) {
 70 |                 int newIndex = termToIndexMap.size();
 71 |                 termToIndexMap.put(word, newIndex);
 72 |                 indexToTermList.add(word);
 73 |                 termCountMap.put(word, 1);
 74 |                 doc.docWords[i] = newIndex;
 75 |             } else {
 76 |                 doc.docWords[i] = termToIndexMap.get(word);
 77 |                 termCountMap.put(word, termCountMap.get(word) + 1);
 78 |             }
 79 |         }
 80 |         collect.clear();
 81 |         return doc;
 82 |     }
 83 |     
 84 |     
 85 |     public static boolean isNoiseWord(String string, Pattern pattern) {
 86 |         string = string.toLowerCase().trim();
 87 |         Matcher m = pattern.matcher(string);
 88 |         // filter @xxx and URL
 89 |         if (string.matches(".*www\\..*") || string.matches(".*\\.com.*") ||
 90 |                 string.matches(".*http:.*")) {
 91 |             return true;
 92 |         }
 93 |         return !m.matches();
 94 |     }
 95 |     
 96 |     @Test
 97 |     public void test() {
 98 |         String path = "/Users/unclewang/Idea_Projects/xinlp/src/main/resources/lda/doc";
 99 |         Documents docSet = new Documents();
100 |         docSet.readDocs(path);
101 |     }
102 | }
103 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/modelserver.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | 
  5 | import tensorflow as tf
  6 | from tornado.ioloop import IOLoop
  7 | from tornado.web import Application, RequestHandler
  8 | 
  9 | import config
 10 | from data import get_train_test_data, tag2label, sentence2id
 11 | from embedding import get_embedding
 12 | from model import BiLSTM_CRF
 13 | from util import str2bool
 14 | 
 15 | parser = argparse.ArgumentParser(description='利用Bilstm+crf进行中文分词')
 16 | parser.add_argument('--batch_size', type=int, default=64, help='#minibatch的数量')
 17 | parser.add_argument('--epoch', type=int, default=40, help='#训练次数')
 18 | parser.add_argument('--hidden_dim', type=int, default=128, help='#Lstm里隐藏状态的维度')
 19 | parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
 20 | parser.add_argument('--lr', type=float, default=0.001, help='学习率')
 21 | parser.add_argument('--embedding_dim', type=int, default=300, help='字嵌入的维度')
 22 | parser.add_argument('--dropout', type=float, default=0.5, help='dropout保留比例')
 23 | parser.add_argument('--useCRF', type=str2bool, default=True, help='是否使用CRF训练损失函数，默认是CRF，false是使用softmax')
 24 | parser.add_argument('--max_len', type=int, default=50, help='句子最长个数')
 25 | parser.add_argument('--mode', type=str, default='predict', help='三种模式：train/test/predict')
 26 | parser.add_argument('--embedding_random', type=str, default=True,
 27 |                     help='使用随机的字嵌入（True）还是已经预训练好的（False），默认使用随机')
 28 | parser.add_argument('--update_embedding', type=str2bool, default=True, help='默认训练')
 29 | 
 30 | args = parser.parse_args()
 31 | train_data, test_data = get_train_test_data(args.embedding_random, args.max_len)
 32 | vocab, word2id, embeddings = get_embedding(args.embedding_random, args.embedding_dim)
 33 | 
 34 | configs = tf.ConfigProto()
 35 | configs.gpu_options.allow_growth = True
 36 | configs.gpu_options.per_process_gpu_memory_fraction = 0.2
 37 | # paths setting
 38 | paths = {}
 39 | output_path = config.output_path
 40 | if not os.path.exists(output_path):
 41 |     os.makedirs(output_path)
 42 | summary_path = os.path.join(output_path, "summaries")
 43 | paths['summary_path'] = summary_path
 44 | if not os.path.exists(summary_path):
 45 |     os.makedirs(summary_path)
 46 | model_path = os.path.join(output_path, "checkpoints/")
 47 | if not os.path.exists(model_path):
 48 |     os.makedirs(model_path)
 49 | ckpt_prefix = os.path.join(model_path, "model")
 50 | paths['model_path'] = ckpt_prefix
 51 | result_path = os.path.join(output_path, "results")
 52 | paths['result_path'] = result_path
 53 | if not os.path.exists(result_path):
 54 |     os.makedirs(result_path)
 55 | log_path = os.path.join(result_path, "log.txt")
 56 | paths['log_path'] = log_path
 57 | ckpt_file = tf.train.latest_checkpoint(model_path)
 58 | print(ckpt_file)
 59 | paths['model_path'] = ckpt_file
 60 | model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=configs)
 61 | model.build_graph()
 62 | 
 63 | saver = tf.train.Saver()
 64 | sess = tf.Session(config=configs)
 65 | saver.restore(sess, ckpt_file)
 66 | 
 67 | 
 68 | def predict(demo_sent):
 69 |     print('============= 开始预测 =============')
 70 |     demo_id = sentence2id(demo_sent, word2id)
 71 |     length = len(demo_id)
 72 |     if length > args.max_len:
 73 |         print('Inputs is too long ')
 74 |     demo_data = [(demo_id, [0] * length)]
 75 | 
 76 |     print(demo_sent)
 77 |     tags = model.predict_sentence(sess, demo_data)
 78 |     print(tags[:length])
 79 |     return json.dumps(tags[:length])
 80 | 
 81 | 
 82 | class IndexHandler(RequestHandler):
 83 | 
 84 |     def data_received(self, chunk):
 85 |         pass
 86 | 
 87 |     def get(self):
 88 |         # 获取get方式传递的参数
 89 |         sent = self.get_query_argument("sent")
 90 |         self.write(predict(str(sent)))
 91 | 
 92 |     def post(self):
 93 |         # 获取post方式传递的参数
 94 |         sent = self.get_body_argument("sent")
 95 |         self.write(predict(sent))
 96 | 
 97 | 
 98 | if __name__ == "__main__":
 99 |     app = Application([(r"/predict", IndexHandler)])
100 |     app.listen(9006)
101 |     IOLoop.current().start()
102 | 


--------------------------------------------------------------------------------
/src/main/java/test/hmm/TestViterbi.java:
--------------------------------------------------------------------------------
  1 | package test.hmm;
  2 | 
  3 | import junit.framework.TestCase;
  4 | 
  5 | import static test.hmm.TestViterbi.Activity.*;
  6 | import static test.hmm.TestViterbi.Weather.Rainy;
  7 | import static test.hmm.TestViterbi.Weather.Sunny;
  8 | 
  9 | //这个测试天气和运动
 10 | public class TestViterbi extends TestCase {
 11 |     static enum Weather {
 12 |         Rainy,
 13 |         Sunny,
 14 |     }
 15 | 
 16 |     static enum Activity {
 17 |         Walk,
 18 |         Shop,
 19 |         Clean,
 20 |     }
 21 | 
 22 |     static int[] states = new int[]{Rainy.ordinal(), Sunny.ordinal()};
 23 |     static int[] observations = new int[]{Walk.ordinal(), Shop.ordinal(), Clean.ordinal()};
 24 |     double[] start_probability = new double[]{0.6, 0.4};
 25 |     double[][] transititon_probability = new double[][]{
 26 |             {0.7, 0.3},
 27 |             {0.4, 0.6},
 28 |     };
 29 |     double[][] emission_probability = new double[][]{
 30 |             {0.1, 0.4, 0.5},
 31 |             {0.6, 0.3, 0.1},
 32 |     };
 33 | 
 34 |     public void testCompute() throws Exception {
 35 |         for (int i = 0; i < start_probability.length; ++i) {
 36 |             start_probability[i] = -Math.log(start_probability[i]);
 37 |         }
 38 |         for (int i = 0; i < transititon_probability.length; ++i) {
 39 |             for (int j = 0; j < transititon_probability[i].length; ++j) {
 40 |                 transititon_probability[i][j] = -Math.log(transititon_probability[i][j]);
 41 |             }
 42 |         }
 43 |         for (int i = 0; i < emission_probability.length; ++i) {
 44 |             for (int j = 0; j < emission_probability[i].length; ++j) {
 45 |                 emission_probability[i][j] = -Math.log(emission_probability[i][j]);
 46 |             }
 47 |         }
 48 |         int[] result = compute(observations, states, start_probability, transititon_probability, emission_probability);
 49 |         for (int r : result) {
 50 |             System.out.print(Weather.values()[r] + " ");
 51 |         }
 52 |         System.out.println();
 53 |     }
 54 | 
 55 |     /**
 56 |      * 求解HMM模型，所有概率请提前取对数
 57 |      *
 58 |      * @param obs     观测序列
 59 |      * @param states  隐状态
 60 |      * @param start_p 初始概率（隐状态）
 61 |      * @param trans_p 转移概率（隐状态）
 62 |      * @param emit_p  发射概率 （隐状态表现为显状态的概率）
 63 |      * @return 最可能的序列
 64 |      */
 65 |     public static int[] compute(int[] obs, int[] states, double[] start_p, double[][] trans_p, double[][] emit_p) {
 66 |         int _max_states_value = 0;
 67 |         for (int s : states) {
 68 |             _max_states_value = Math.max(_max_states_value, s);
 69 |         }
 70 |         ++_max_states_value;
 71 |         double[][] V = new double[obs.length][_max_states_value];
 72 |         int[][] path = new int[_max_states_value][obs.length];
 73 | 
 74 |         for (int y : states) {
 75 |             V[0][y] = start_p[y] + emit_p[y][obs[0]];
 76 |             path[y][0] = y;
 77 |         }
 78 | 
 79 |         for (int t = 1; t < obs.length; ++t) {
 80 |             int[][] newpath = new int[_max_states_value][obs.length];
 81 | 
 82 |             for (int y : states) {
 83 |                 double prob = Double.MAX_VALUE;
 84 |                 int state;
 85 |                 for (int y0 : states) {
 86 |                     double nprob = V[t - 1][y0] + trans_p[y0][y] + emit_p[y][obs[t]];
 87 |                     if (nprob < prob) {
 88 |                         prob = nprob;
 89 |                         state = y0;
 90 |                         // 记录最大概率
 91 |                         V[t][y] = prob;
 92 |                         // 记录路径
 93 |                         System.arraycopy(path[state], 0, newpath[y], 0, t);
 94 |                         newpath[y][t] = y;
 95 |                     }
 96 |                 }
 97 |             }
 98 | 
 99 |             path = newpath;
100 |         }
101 | 
102 |         double prob = Double.MAX_VALUE;
103 |         int state = 0;
104 |         for (int y : states) {
105 |             if (V[obs.length - 1][y] < prob) {
106 |                 prob = V[obs.length - 1][y];
107 |                 state = y;
108 |             }
109 |         }
110 | 
111 |         return path[state];
112 |     }
113 | }
114 | 


--------------------------------------------------------------------------------
/src/main/java/mining/tfidf/LSICal.java:
--------------------------------------------------------------------------------
 1 | package mining.tfidf;
 2 | 
 3 | import com.google.common.collect.BiMap;
 4 | import lombok.extern.slf4j.Slf4j;
 5 | import org.nd4j.linalg.api.ndarray.INDArray;
 6 | import org.nd4j.linalg.factory.Nd4j;
 7 | 
 8 | import java.io.FileNotFoundException;
 9 | import java.util.HashMap;
10 | import java.util.Map;
11 | 
12 | @Slf4j
13 | public class LSICal {
14 |     private BiMap<Integer, Integer> termIdVocabularyId;
15 |     //18288数量太大，电脑吃不消
16 |     private int docNum = 10;
17 |     private double featurePercent = 10;
18 | 
19 |     public double[][] transformMatrix(HashMap<Integer, HashMap<Integer, Double>> idTfIDf) {
20 |         BiMap<Integer, Integer> termIdVocabularyId = null;
21 |         try {
22 |             termIdVocabularyId = AllDocTfIdf.loadTermid();
23 |         } catch (FileNotFoundException e) {
24 |             e.printStackTrace();
25 |         }
26 |         BiMap<Integer, Integer> vocabularyIdTermId = termIdVocabularyId.inverse();
27 |         log.info("正在生成" + docNum + "*" + termIdVocabularyId.size() + "的矩阵");
28 |         double[][] docTermMatrix = new double[docNum][termIdVocabularyId.size()];
29 |         for (Map.Entry<Integer, HashMap<Integer, Double>> entry : idTfIDf.entrySet()) {
30 |             int docId = entry.getKey();
31 |             if (docId >= docNum) {
32 |                 break;
33 |             }
34 |             for (Map.Entry<Integer, Double> termEntry : entry.getValue().entrySet()) {
35 |                 int vocaId = termEntry.getKey();
36 |                 double value = termEntry.getValue();
37 |                 int termId = vocabularyIdTermId.get(vocaId);
38 |                 docTermMatrix[docId][termId] = value;
39 |             }
40 | 
41 |         }
42 |         return docTermMatrix;
43 |     }
44 | 
45 | 
46 |     /**
47 |      * @param docTermMatrix
48 |      * @return 对于奇异值, 它跟我们特征分解中的特征值类似，在奇异值矩阵中也是按照从大到小排列，而且奇异值的减少特别的快，在很多情况下，前10%甚至1%的奇异值的和就占了全部的奇异值之和的99%以上的比例。也就是说，我们也可以用最大的k个的奇异值和对应的左右奇异向量来近似描述矩阵。也就是说：
49 |      * Am×n=Um×mΣm×nVTn×n≈Um×kΣk×kVTk×n
50 |      */
51 |     private double[][] svd(double[][] docTermMatrix) {
52 | //        docTermMatrix = new double[][]{{1, 1, 1, 0, 0}, {2, 2, 2, 0, 0}, {1, 1, 1, 0, 0}, {5, 5, 5, 0, 0}, {0, 0, 0, 2, 2}, {0, 0, 0, 3, 3}, {0, 0, 0, 1, 1}};
53 |         log.info("开始进行SVD分解，得到Doc与Doc之间的关系");
54 |         INDArray A = Nd4j.create(docTermMatrix);
55 |         if (A.rows() < A.columns()) {
56 |             A = A.transpose();
57 |         }
58 |         int nRows = A.rows();
59 |         int nColumns = A.columns();
60 |         System.out.println(A + "\n");
61 |         //左奇异矩阵U可以用于行数的压缩。相对的，右奇异矩阵V可以用于列数即特征维度的压缩,PCA降维
62 |         INDArray vt = Nd4j.zeros(nColumns, nColumns);
63 |         INDArray s = Nd4j.zeros(1, nColumns);
64 |         INDArray u = Nd4j.zeros(nRows, nRows);
65 |         Nd4j.getBlasWrapper().lapack().gesvd(A, s, u, vt);
66 |         System.out.println(A);
67 |         INDArray sigma = Nd4j.diag(s);
68 |         INDArray temp = Nd4j.zeros(nRows - nColumns, nColumns);
69 |         INDArray vsigam = Nd4j.vstack(sigma, temp);
70 |         System.out.println("--------------------");
71 | //        System.out.println("E:" + u.mmul(u.transpose()));
72 |         System.out.println("E:" + vt.transpose().mmul(vt));
73 |         System.out.println("--------------------");
74 |         System.out.println("A" + A);
75 |         System.out.println(u.mmul(vsigam.mmul(vt)));
76 |         System.out.println("--------------------");
77 | 
78 |         INDArray docSim = vt.transpose().mmul(vsigam.transpose().mmul(vsigam.mmul(vt)));
79 |         log.info("SVD计算结束");
80 |         System.out.println("\n S:" + s);
81 |         System.out.println("\n VT:" + vt);
82 |         System.out.println("\n docSim:\n" + docSim);
83 |         System.out.println("\n original:\n" + A.transpose().mmul(A));
84 |         System.out.println("\n original:\n" + A.mmul(A.transpose()));
85 |         return docTermMatrix;
86 |     }
87 | 
88 |     public static void main(String[] args) {
89 |         AllDocTfIdf allDocTfIdf = new AllDocTfIdf();
90 |         HashMap<Integer, HashMap<Integer, Double>> idTfIDf = allDocTfIdf.loadAllDocTfIdf();
91 |         LSICal lsiCal = new LSICal();
92 |         lsiCal.svd(lsiCal.transformMatrix(idTfIDf));
93 | 
94 |     }
95 | }
96 | 


--------------------------------------------------------------------------------
/python/bilstmcrf/main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | 
 4 | import tensorflow as tf
 5 | 
 6 | import config
 7 | from data import get_train_test_data, tag2label, sentence2id
 8 | from embedding import get_embedding
 9 | from model import BiLSTM_CRF
10 | from util import str2bool
11 | 
# Command-line options for training, testing and the interactive demo.
parser = argparse.ArgumentParser(description='利用Bilstm+crf进行中文分词')
parser.add_argument('--batch_size', type=int, default=64, help='#minibatch的数量')
parser.add_argument('--epoch', type=int, default=40, help='#训练次数')
parser.add_argument('--hidden_dim', type=int, default=128, help='#Lstm里隐藏状态的维度')
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--lr', type=float, default=0.001, help='学习率')
parser.add_argument('--embedding_dim', type=int, default=300, help='字嵌入的维度')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout保留比例')
parser.add_argument('--useCRF', type=str2bool, default=True, help='是否使用CRF训练损失函数，默认是CRF，false是使用softmax')
parser.add_argument('--max_len', type=int, default=50, help='句子最长个数')
parser.add_argument('--mode', type=str, default='predict', help='三种模式：train/test/predict')
# Parsed with str2bool like the other boolean flags: with type=str any supplied value,
# including "False", is a non-empty and therefore truthy string.
parser.add_argument('--embedding_random', type=str2bool, default=True,
                    help='使用随机的字嵌入（True）还是已经预训练好的（False），默认使用随机')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='默认训练')

args = parser.parse_args()
28 | 
# Load the corpus split and the embeddings once, up front, for every mode.
train_data, test_data = get_train_test_data(args.embedding_random, args.max_len)
vocab, word2id, embeddings = get_embedding(args.embedding_random, args.embedding_dim)

# GPU options: allocate memory on demand, with a per-process memory fraction of 0.2.
configs = tf.ConfigProto()
configs.gpu_options.allow_growth = True
configs.gpu_options.per_process_gpu_memory_fraction = 0.2

# paths setting
# exist_ok=True replaces the separate "exists?" check, which could race with another
# process creating the same directory.
paths = {}
output_path = config.output_path
os.makedirs(output_path, exist_ok=True)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
os.makedirs(summary_path, exist_ok=True)
model_path = os.path.join(output_path, "checkpoints/")
os.makedirs(model_path, exist_ok=True)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
os.makedirs(result_path, exist_ok=True)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
55 | 
# 'test' and 'predict' both run from the most recent checkpoint; 'train' keeps the
# checkpoint prefix already stored in paths['model_path'].
if args.mode in ('test', 'predict'):
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file

if args.mode == 'train':
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, configs)
    model.build_graph()
    model.train(train=train_data, test=test_data)
elif args.mode == 'test':
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, configs)
    model.build_graph()
    print("test data: {}".format(len(test_data)))
    model.test(test_data)
elif args.mode == 'predict':
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=configs)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=configs) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        # Interactive loop: an empty or whitespace-only line ends the demo.
        while True:
            print('Please input your sentence:')
            demo_sent = input()
            if not demo_sent or demo_sent.isspace():
                print('See you next time!')
                break
            demo_id = sentence2id(demo_sent, word2id)
            length = len(demo_id)
            if length > args.max_len:
                # Only a warning: the sentence is still passed to the model.
                print('Inputs is too long ')
            demo_data = [(demo_id, [0] * length)]
            print(demo_id)
            tags = model.predict_sentence(sess, demo_data)
            print(tags[:length])


--------------------------------------------------------------------------------
/python/bilstmcrf/data.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import random
  3 | 
  4 | import tflearn
  5 | 
  6 | import config
  7 | from embedding import get_embedding
  8 | 
# Configure root logging (timestamp, level, message) at INFO level when this module is imported.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Maps each tag letter read from the corpus to its integer label id.
# Presumably B/E/M/S mark begin/end/middle/single-character words - TODO confirm.
tag2label = {"B": 0, "E": 1, "M": 2, "S": 3}
 12 | 
 13 | 
 14 | def read_corpus(random, max_len):
 15 |     vocab, word2id, embedding = get_embedding(random, 300)
 16 |     sentsid_, sents_, tags_ = [], [], []
 17 |     logging.info("开始读取数据集")
 18 |     with open(config.split_data, encoding='utf-8') as fr:
 19 |         lines = fr.readlines()
 20 |     sentid_, sent_, tag_ = [], [], []
 21 |     for line in lines:
 22 |         if line != '\n':
 23 |             char, label = line.strip().split()
 24 |             tag_.append(tag2label[label])
 25 |             if char.startswith("num"):
 26 |                 sent_.append("num")
 27 |                 sentid_.append(1)
 28 |             elif char.startswith("en"):
 29 |                 sent_.append("en")
 30 |                 sentid_.append(2)
 31 |             elif '\u4e00' <= char <= '\u9fa5' and char in vocab:
 32 |                 sent_.append(char)
 33 |                 sentid_.append(word2id[char])
 34 |             else:
 35 |                 sent_.append("unk")
 36 |                 sentid_.append(0)
 37 |         else:
 38 |             if 3 < len(sent_) <= max_len:
 39 |                 sents_.append(sent_)
 40 |                 tags_.append(tag_)
 41 |                 sentsid_.append(sentid_)
 42 |                 sentid_, sent_, tag_ = [], [], []
 43 |             else:
 44 |                 sentid_, sent_, tag_ = [], [], []
 45 |     # 在get_feed_dict去padding，不事先padding好了
 46 |     # padding_tags = tflearn.data_utils.pad_sequences(tags_, maxlen=max_len, value=3)
 47 |     # padding_sentsid = tflearn.data_utils.pad_sequences(sentsid_, maxlen=max_len, value=0)
 48 |     # print(sents_[0])
 49 |     # print(padding_sentsid[0])
 50 |     # print(padding_tags[0])
 51 |     return sentsid_, sents_, tags_
 52 | 
 53 | 
 54 | def pad_sequences(seqs, pad_mark):
 55 |     batch_max_len = max(map(lambda x: len(x), seqs))
 56 |     # batch_max_len = 0
 57 |     # # print(seqs)
 58 |     # for seq in seqs:
 59 |     #     if seq is not None:
 60 |     #         if len(seq) > batch_max_len:
 61 |     #             batch_max_len = len(seq)
 62 | 
 63 |     seq_len_list = []
 64 |     for seq in seqs:
 65 |         seq_len_list.append(min(len(seq), batch_max_len))
 66 |     padding_seqs = tflearn.data_utils.pad_sequences(seqs, maxlen=batch_max_len, value=pad_mark)
 67 |     return padding_seqs, seq_len_list
 68 | 
 69 | 
 70 | def sentence2id(sent, word2id):
 71 |     sentid_ = []
 72 |     for char in sent:
 73 |         if char.startswith("num"):
 74 |             sentid_.append(1)
 75 |         elif char.startswith("en"):
 76 |             sentid_.append(2)
 77 |         elif '\u4e00' <= char <= '\u9fa5' and char in word2id.keys():
 78 |             sentid_.append(word2id[char])
 79 |         else:
 80 |             sentid_.append(0)
 81 |     return sentid_
 82 | 
 83 | 
 84 | # 主要是为了打乱顺序
 85 | def get_train_test_data(embedding_random, max_len):
 86 |     data = []
 87 |     train_data = []
 88 |     test_data = []
 89 |     sentids_, sents_, tags_ = read_corpus(embedding_random, max_len)
 90 | 
 91 |     l = len(tags_)
 92 |     for i in range(l):
 93 |         data.append((sentids_[i], tags_[i]))
 94 |     random.shuffle(data)
 95 | 
 96 |     for i in range(l):
 97 |         (sentid_, tag_) = data[i]
 98 |         if i < 0.95 * l:
 99 |             train_data.append((sentid_, tag_))
100 |         else:
101 |             test_data.append((sentid_, tag_))
102 |     return train_data, test_data
103 | 
104 | 
def batch_yield(data, batch_size, is_train=True):
    """Shuffle ``data`` in place and yield it as ``(seqs, labels)`` batches.

    A batch is yielded once it holds ``batch_size`` samples and another sample follows.
    The samples left over at the end are yielded only when ``is_train is False``, so that
    a single-sentence prediction still produces a batch.

    NOTE(review): when ``is_train`` is True the last batch is therefore never yielded,
    even if it is full - confirm this is intended.
    """
    random.shuffle(data)
    batch_seqs, batch_labels = [], []
    for sample in data:
        if len(batch_seqs) == batch_size:
            yield batch_seqs, batch_labels
            batch_seqs, batch_labels = [], []
        seq_ids, seq_tags = sample
        batch_seqs.append(seq_ids)
        batch_labels.append(seq_tags)
    if is_train is False:
        yield batch_seqs, batch_labels
117 | 
118 | # train_x, train_y, test_x, test_y = get_train_test_data(50)
119 | # print(train_y[0])
120 | # print(train_x[0])
121 | # print(test_y[0])
122 | # print(test_x[0])
123 | 


--------------------------------------------------------------------------------
/python/bert/server.py:
--------------------------------------------------------------------------------
  1 | import collections
  2 | import os
  3 | 
  4 | import six
  5 | import tensorflow as tf
  6 | from bert import modeling
  7 | from gevent import monkey
  8 | 
  9 | monkey.patch_all()
 10 | from flask import Flask, request
 11 | from gevent import pywsgi
 12 | import numpy as np
 13 | import json
 14 | 
# Command-line flags for the BERT resources; each default points into bert_path.
flags = tf.flags

FLAGS = flags.FLAGS

# NOTE(review): developer-specific absolute path; override the flags below on other machines.
bert_path = '/Users/unclewang/.xinlp/data/chinese_L-12_H-768_A-12'

flags.DEFINE_string(
    "bert_config_file", os.path.join(bert_path, 'bert_config.json'),
    "The config json file corresponding to the pre-trained BERT model."
)

flags.DEFINE_string(
    "bert_vocab_file", os.path.join(bert_path, 'vocab.txt'),
    "The config vocab file"
)

flags.DEFINE_string(
    "init_checkpoint", os.path.join(bert_path, 'bert_model.ckpt'),
    "Initial checkpoint (usually from a pre-trained BERT model)."
)

# Flask application that serves the /bert route defined further down.
app = Flask(__name__)
 37 | 
 38 | 
 39 | def convert_to_unicode(text):
 40 |     """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
 41 |     if six.PY3:
 42 |         if isinstance(text, str):
 43 |             return text
 44 |         elif isinstance(text, bytes):
 45 |             return text.decode("utf-8", "ignore")
 46 |         else:
 47 |             raise ValueError("Unsupported string type: %s" % (type(text)))
 48 |     elif six.PY2:
 49 |         if isinstance(text, str):
 50 |             return text.decode("utf-8", "ignore")
 51 |         elif isinstance(text, unicode):
 52 |             return text
 53 |         else:
 54 |             raise ValueError("Unsupported string type: %s" % (type(text)))
 55 |     else:
 56 |         raise ValueError("Not running on Python2 or Python 3?")
 57 | 
 58 | 
 59 | def load_vocab(vocab_file):
 60 |     vocab = collections.OrderedDict()
 61 |     vocab.setdefault("blank", 2)
 62 |     index = 0
 63 |     with tf.gfile.GFile(vocab_file, "r") as reader:
 64 |         while True:
 65 |             token = convert_to_unicode(reader.readline())
 66 |             if not token:
 67 |                 break
 68 |             token = token.strip()
 69 |             vocab[token] = index
 70 |             index += 1
 71 |     return vocab
 72 | 
 73 | 
# Module-level state shared by all requests, created at import time.
# Token -> id lookup table built from the BERT vocabulary file.
di = load_vocab(vocab_file=FLAGS.bert_vocab_file)
init_checkpoint = FLAGS.init_checkpoint
use_tpu = False

# Single session reused for every request.
sess = tf.Session()

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

print(init_checkpoint)

# Settings passed to BertModel below.
is_training = False
use_one_hot_embeddings = False
 86 | 
 87 | 
 88 | def inputs(vectors, maxlen=10):
 89 |     length = len(vectors)
 90 |     if length >= maxlen:
 91 |         return vectors[0:maxlen], [1] * maxlen, [0] * maxlen
 92 |     else:
 93 |         input = vectors + [0] * (maxlen - length)
 94 |         mask = [1] * length + [0] * (maxlen - length)
 95 |         segment = [0] * maxlen
 96 |         return input, mask, segment
 97 | 
 98 | 
# Graph inputs. Both dimensions are left unspecified; the request handler feeds them by
# tensor name ("input_ids_p:0" etc.), so these names must not change.
input_ids_p = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids_p")
input_mask_p = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask_p")
segment_ids_p = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids_p")

# Build the BERT graph once at import time, in inference mode (is_training is False).
model = modeling.BertModel(
    config=bert_config,
    is_training=is_training,
    input_ids=input_ids_p,
    input_mask=input_mask_p,
    token_type_ids=segment_ids_p,
    use_one_hot_embeddings=use_one_hot_embeddings
)

# Load the weights from the checkpoint into the shared session.
restore_saver = tf.train.Saver()
restore_saver.restore(sess, init_checkpoint)
114 | 
115 | 
@app.route('/bert')
def response_request():
    """Return the BERT sequence output for the ``text`` query parameter as JSON.

    The text is split into characters, wrapped in [CLS]/[SEP], mapped to vocabulary ids
    (unknown characters become [UNK]) and fitted to the fixed length used by ``inputs``.
    NOTE(review): when the sequence is longer than that length, truncation drops the
    trailing [SEP] - confirm this is acceptable.

    Returns:
        A JSON array with the squeezed sequence output, or a JSON error object with
        status 400 when ``text`` is missing.
    """
    text = request.args.get('text')
    if text is None:
        # Previously list(None) raised TypeError and surfaced as an opaque 500.
        return json.dumps({"error": "missing required query parameter 'text'"}), 400

    unk_id = di.get("[UNK]")
    vectors = [di.get("[CLS]")] + [di.get(ch, unk_id) for ch in text] + [di.get("[SEP]")]

    token_ids, mask, segment = inputs(vectors)

    input_ids = np.reshape(np.array(token_ids), [1, -1])
    input_mask = np.reshape(np.array(mask), [1, -1])
    segment_ids = np.reshape(np.array(segment), [1, -1])

    # Fetch the model's output tensor directly and squeeze the result in NumPy. The previous
    # tf.squeeze call created a new graph op on every request, so the graph kept growing
    # for as long as the server ran.
    ret = sess.run(model.get_sequence_output(),
                   feed_dict={"input_ids_p:0": input_ids, "input_mask_p:0": input_mask, "segment_ids_p:0": segment_ids})
    return json.dumps(np.squeeze(ret).tolist(), ensure_ascii=False)
133 | 
134 | 
if __name__ == "__main__":
    # Serve the Flask app through gevent's WSGI server on all interfaces, port 19877.
    listen_address = ('0.0.0.0', 19877)
    server = pywsgi.WSGIServer(listen_address, app)
    server.serve_forever()
138 | 


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     // 使用 IntelliSense 了解相关属性。 
  3 |     // 悬停以查看现有属性的描述。
  4 |     // 欲了解更多信息，请访问: https://go.microsoft.com/fwlink/?linkid=830387
  5 |     "version": "0.2.0",
  6 |     "configurations": [
  7 |         {
  8 |             "type": "java",
  9 |             "name": "CodeLens (Launch) - LDAConfig",
 10 |             "request": "launch",
 11 |             "mainClass": "lda.LDAConfig",
 12 |             "projectName": "xinlp"
 13 |         },
 14 |         {
 15 |             "type": "java",
 16 |             "name": "Debug (Launch) - Current File",
 17 |             "request": "launch",
 18 |             "mainClass": "${file}"
 19 |         },
 20 |         {
 21 |             "type": "java",
 22 |             "name": "Debug (Launch)-LdaGibbsSampling<xinlp>",
 23 |             "request": "launch",
 24 |             "mainClass": "lda.LdaGibbsSampling",
 25 |             "projectName": "xinlp"
 26 |         },
 27 |         {
 28 |             "type": "java",
 29 |             "name": "Debug (Launch)-ClusterApp<xinlp>",
 30 |             "request": "launch",
 31 |             "mainClass": "mining.cluster.ClusterApp",
 32 |             "projectName": "xinlp"
 33 |         },
 34 |         {
 35 |             "type": "java",
 36 |             "name": "Debug (Launch)-AllDocTfIdf<xinlp>",
 37 |             "request": "launch",
 38 |             "mainClass": "mining.tfidf.AllDocTfIdf",
 39 |             "projectName": "xinlp"
 40 |         },
 41 |         {
 42 |             "type": "java",
 43 |             "name": "Debug (Launch)-LSICal<xinlp>",
 44 |             "request": "launch",
 45 |             "mainClass": "mining.tfidf.LSICal",
 46 |             "projectName": "xinlp"
 47 |         },
 48 |         {
 49 |             "type": "java",
 50 |             "name": "Debug (Launch)-XinCRFModel<xinlp>",
 51 |             "request": "launch",
 52 |             "mainClass": "segment.crf.XinCRFModel",
 53 |             "projectName": "xinlp"
 54 |         },
 55 |         {
 56 |             "type": "java",
 57 |             "name": "Debug (Launch)-XinCRFSegment<xinlp>",
 58 |             "request": "launch",
 59 |             "mainClass": "segment.crf.XinCRFSegment",
 60 |             "projectName": "xinlp"
 61 |         },
 62 |         {
 63 |             "type": "java",
 64 |             "name": "Debug (Launch)-XinCRFApp<xinlp>",
 65 |             "request": "launch",
 66 |             "mainClass": "segment.crf.app.XinCRFApp",
 67 |             "projectName": "xinlp"
 68 |         },
 69 |         {
 70 |             "type": "java",
 71 |             "name": "Debug (Launch)-XinCRFSegmentServer<xinlp>",
 72 |             "request": "launch",
 73 |             "mainClass": "segment.crf.tcp.XinCRFSegmentServer",
 74 |             "projectName": "xinlp"
 75 |         },
 76 |         {
 77 |             "type": "java",
 78 |             "name": "Debug (Launch)-EmHmm<xinlp>",
 79 |             "request": "launch",
 80 |             "mainClass": "segment.hmm.EmHmm",
 81 |             "projectName": "xinlp"
 82 |         },
 83 |         {
 84 |             "type": "java",
 85 |             "name": "Debug (Launch)-BaumWelchHmm<xinlp>",
 86 |             "request": "launch",
 87 |             "mainClass": "test.hmm.BaumWelchHmm",
 88 |             "projectName": "xinlp"
 89 |         },
 90 |         {
 91 |             "type": "java",
 92 |             "name": "Debug (Launch)-UnsupervisedFirstOrderGeneralHmm<xinlp>",
 93 |             "request": "launch",
 94 |             "mainClass": "test.hmm.baumwelch.UnsupervisedFirstOrderGeneralHmm",
 95 |             "projectName": "xinlp"
 96 |         },
 97 |         {
 98 |             "type": "java",
 99 |             "name": "Debug (Launch)-LuceneTest<xinlp>",
100 |             "request": "launch",
101 |             "mainClass": "test.lucene.LuceneTest",
102 |             "projectName": "xinlp"
103 |         },
104 |         {
105 |             "type": "java",
106 |             "name": "Debug (Launch)-IKAnalyzerTest<xinlp>",
107 |             "request": "launch",
108 |             "mainClass": "test.lucene.rewriteTokenize.IKAnalyzerTest",
109 |             "projectName": "xinlp"
110 |         },
111 |         {
112 |             "type": "java",
113 |             "name": "Debug (Launch)-TestNewton<xinlp>",
114 |             "request": "launch",
115 |             "mainClass": "test.newton.TestNewton",
116 |             "projectName": "xinlp"
117 |         },
118 |         {
119 |             "type": "java",
120 |             "name": "Debug (Launch)-Stemmer<xinlp>",
121 |             "request": "launch",
122 |             "mainClass": "tools.Stemmer",
123 |             "projectName": "xinlp"
124 |         }
125 |     ]
126 | }


--------------------------------------------------------------------------------
/src/main/java/mining/data/PreProcess20News.java:
--------------------------------------------------------------------------------
  1 | package mining.data;
  2 | 
  3 | import lombok.extern.slf4j.Slf4j;
  4 | import mining.config.Config;
  5 | import mining.tfidf.Word;
  6 | import org.junit.jupiter.api.Test;
  7 | import tools.Stemmer;
  8 | 
  9 | import java.io.*;
 10 | import java.util.HashMap;
 11 | import java.util.HashSet;
 12 | import java.util.Map;
 13 | 
 14 | /**
 15 |  * @Author unclewang
 16 |  * @Date 2018-12-15 21:04
 17 |  */
 18 | @Slf4j
 19 | public class PreProcess20News implements PreProcess {
 20 |     private String prePath = Config.getPrePath();
 21 |     private String postPath = Config.getPostPath();
 22 |     private static HashSet<String> stopWordsSet = new HashSet<>();
 23 |     private static Stemmer stemmer = null;
 24 |     private static Map<String, Word> vocabulary = new HashMap<>();
 25 | 
 26 |     static {
 27 |         try {
 28 |             BufferedReader br = new BufferedReader(new FileReader(new File(Config.getStopwordsPath())));
 29 |             String s = null;
 30 |             while ((s = br.readLine()) != null) {
 31 |                 stopWordsSet.add(s);
 32 |             }
 33 |         } catch (IOException e) {
 34 |             e.printStackTrace();
 35 |         }
 36 |     }
 37 | 
 38 |     @Override
 39 |     public void preProcess(String currentPath) throws IOException {
 40 |         File currentDir = new File(currentPath);
 41 |         if (!currentDir.exists()) {
 42 |             throw new FileNotFoundException(currentPath + "，没有这个文件夹");
 43 |         }
 44 |         String subStrDir = currentPath.replaceAll(prePath, "");
 45 |         String postDir = postPath + subStrDir;
 46 | 
 47 |         File postFile = new File(postDir);
 48 |         if (!postFile.exists()) {
 49 |             postFile.mkdir();
 50 |         }
 51 |         File[] preFiles = currentDir.listFiles();
 52 |         assert preFiles != null;
 53 |         for (int i = 0; i < preFiles.length; i++) {
 54 |             String preFilePath = preFiles[i].getCanonicalPath();
 55 |             String preFileName = preFiles[i].getName();
 56 |             String stemPath = postDir + "/" + preFileName;
 57 |             if (new File(preFilePath).isDirectory()) {
 58 |                 preProcess(preFilePath);
 59 |             } else {
 60 |                 log.info("正在处理：" + preFilePath);
 61 |                 createStemFile(preFilePath, stemPath);
 62 |                 log.info(stemPath);
 63 |             }
 64 |         }
 65 |     }
 66 | 
 67 |     private void createStemFile(String preFilePath, String stemPath) throws IOException {
 68 |         try (BufferedWriter bw = new BufferedWriter(new FileWriter(stemPath))) {
 69 |             try (BufferedReader br = new BufferedReader(new FileReader(preFilePath))) {
 70 |                 String line;
 71 |                 while ((line = br.readLine()) != null) {
 72 |                     String stemLine = processLine(line);
 73 |                     bw.write(stemLine);
 74 |                 }
 75 |                 bw.flush();
 76 |             }
 77 |         }
 78 |     }
 79 | 
 80 |     private String processLine(String line) {
 81 |         line = line.toLowerCase();
 82 |         String[] sp = line.split("[^a-z]");
 83 |         StringBuilder resLine = new StringBuilder();
 84 |         for (String s : sp) {
 85 |             if (!"".equals(s) && !stopWordsSet.contains(s)) {
 86 |                 resLine.append(stem(s)).append("\n");
 87 |                 if (!vocabulary.containsKey(s)) {
 88 |                     Word word = new Word();
 89 |                     String stem = stem(s);
 90 |                     word.setString(s);
 91 |                     word.setStemString(stem);
 92 |                     vocabulary.put(s, word);
 93 |                 }
 94 |             }
 95 |         }
 96 |         return resLine.toString();
 97 |     }
 98 | 
 99 |     private String stem(String word) {
100 |         stemmer = new Stemmer();
101 |         stemmer.add(word.toCharArray(), word.length());
102 |         stemmer.stem();
103 |         return stemmer.toString();
104 |     }
105 | 
106 |     private void vocabulary2file() {
107 |         try (FileWriter fw = new FileWriter(new File(Config.getVocabularyPath()))) {
108 |             int id = 0;
109 |             for (Map.Entry<String, Word> entry : vocabulary.entrySet()) {
110 |                 fw.write(id + "\t" + entry.getKey() + "\t" + entry.getValue().getStemString() + "\n");
111 |                 id++;
112 |             }
113 |             fw.flush();
114 |         } catch (IOException e) {
115 |             e.printStackTrace();
116 |         }
117 |     }
118 | 
119 | 
120 |     @Test
121 |     public void test() throws IOException {
122 | //        stem("beautiful");
123 | //        stem("wonderful");
124 |         preProcess(prePath);
125 |         vocabulary2file();
126 |     }
127 | }
128 | 


--------------------------------------------------------------------------------
/src/main/java/test/hmm/baumwelch/IOUtils.java:
--------------------------------------------------------------------------------
  1 | package test.hmm.baumwelch;
  2 | 
  3 | import java.io.*;
  4 | import java.util.ArrayList;
  5 | import java.util.List;
  6 | 
  7 | public class IOUtils {
  8 | 	public static String readTextWithLineCheckBreak(String path, String encoding) {
  9 | 		return readText(path, encoding, "\n");
 10 | 	}
 11 | 	/**
 12 | 	 * 读取文本文件，返回整个字符串，不包括换行符号
 13 | 	 * @param path 文件路径
 14 | 	 * @param encoding 编码，传入null或者空串使用默认编码
 15 | 	 * @return
 16 | 	 */
 17 | 	public static String readText(String path, String encoding) {
 18 | 		return readText(path, encoding, null);
 19 | 	}
 20 | 	/**
 21 | 	 * 读取文本，指定每一行末尾符号
 22 | 	 * @param path
 23 | 	 * @param encoding
 24 | 	 * @param lineEndStr
 25 | 	 * @return
 26 | 	 */
 27 | 	public static String readText(String path, String encoding, String lineEndStr) {
 28 | 		try {
 29 | 			if(lineEndStr == null) {
 30 | 				lineEndStr = "";
 31 | 			}
 32 | 			BufferedReader reader = null;
 33 | 			if((!encoding.trim().equals(""))&&encoding!=null) {
 34 | 				reader = new BufferedReader(new InputStreamReader(new FileInputStream(path),encoding));
 35 | 			} else {
 36 | 				reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
 37 | 			}
 38 | 			String s="";
 39 | 			StringBuilder sb  = new StringBuilder();
 40 | 			while((s=reader.readLine())!=null) {
 41 | 				sb.append(s+lineEndStr);
 42 | 			}
 43 | 			reader.close();
 44 | 			return sb.toString();
 45 | 		} catch (UnsupportedEncodingException e) {
 46 | 			e.printStackTrace();
 47 | 		} catch (FileNotFoundException e) {
 48 | 			e.printStackTrace();
 49 | 		} catch (IOException e) {
 50 | 			e.printStackTrace();
 51 | 		}
 52 | 		return null;
 53 | 	}
 54 | 	/**
 55 | 	 * 读取文本文件，返回整个字符串，不包括换行符号
 56 | 	 * @param path 文件路径
 57 | 	 * @param encoding 编码，传入null或者空串使用默认编码
 58 | 	 * @param addNewLine 是否加换行符
 59 | 	 * @return
 60 | 	 */
 61 | 	public static List<String> readTextAndReturnLinesCheckLineBreak(String path, String encoding, boolean addNewLine) {
 62 | 		try {
 63 | 			String lineBreak;
 64 | 			if(addNewLine) {
 65 | 				lineBreak = "\n";
 66 | 			} else {
 67 | 				lineBreak = "";
 68 | 			}
 69 | 			BufferedReader reader = null;
 70 | 			if((!encoding.trim().equals(""))&&encoding!=null) {
 71 | 				reader = new BufferedReader(new InputStreamReader(new FileInputStream(path),encoding));
 72 | 			} else {
 73 | 				reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
 74 | 			}
 75 | 			String s="";
 76 | 			List<String> list = new ArrayList<>();
 77 | 			while((s=reader.readLine())!=null) {
 78 | 				list.add(s+lineBreak);
 79 | 			}
 80 | 			reader.close();
 81 | 			return list;
 82 | 		} catch (UnsupportedEncodingException e) {
 83 | 			e.printStackTrace();
 84 | 		} catch (FileNotFoundException e) {
 85 | 			e.printStackTrace();
 86 | 		} catch (IOException e) {
 87 | 			e.printStackTrace();
 88 | 		}
 89 | 		return null;
 90 | 	}
 91 | 	
 92 | 	public static List<String> readTextAndReturnLines(String path, String encoding){
 93 | 		return readTextAndReturnLinesCheckLineBreak(path, encoding, false);
 94 | 	}
 95 | 	/**
 96 | 	 * 读取文本的每一行
 97 | 	 * 并且返回数组形式
 98 | 	 * @param path
 99 | 	 * @param encoding
100 | 	 * @return
101 | 	 */
102 | 	public static String[] readTextAndReturnLinesOfArray(String path, String encoding){
103 | 		List<String> lines = readTextAndReturnLines(path, encoding);
104 | 		String[] arr = new String[lines.size()];
105 | 		lines.toArray(arr);
106 | 		return arr;
107 | 	}
108 | 	/**
109 | 	 * 写入文本文件
110 | 	 * @param data
111 | 	 * @param path
112 | 	 * @param encoding
113 | 	 */
114 | 	public static void writeTextData2File(String data,String path,String encoding) {
115 | 		try {
116 | 			BufferedWriter writer = null;
117 | 			if((!encoding.trim().equals(""))&&encoding!=null) {
118 | 				writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path),encoding));
119 | 			} else {
120 | 				writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path)));
121 | 			}
122 | 			writer.write(data);
123 | 			writer.close();
124 | 		} catch (UnsupportedEncodingException e) {
125 | 			e.printStackTrace();
126 | 		} catch (FileNotFoundException e) {
127 | 			e.printStackTrace();
128 | 		} catch (IOException e) {
129 | 			e.printStackTrace();
130 | 		}
131 | 	}
132 |  
133 | 	
134 | 	/**
135 | 	 * 把对象写入文件
136 | 	 * @param path
137 | 	 * @param object
138 | 	 */
139 | 	public static void writeObject2File(String path, Object object) {
140 | 		try {
141 | 			ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(path));
142 | 			out.writeObject(object);
143 | 			out.close();
144 | 		} catch (Exception e) {
145 | 			e.printStackTrace();
146 | 		} 
147 | 	}
148 | 	/**
149 | 	 * 读取对象
150 | 	 * @param path
151 | 	 * @return
152 | 	 */
153 | 	public static Object readObject(String path) {
154 | 		try {
155 | 			ObjectInputStream in = new ObjectInputStream(new FileInputStream(path));
156 | 			return in.readObject();
157 | 		} catch (Exception e) {
158 | 			e.printStackTrace();
159 | 		}
160 | 		return null; 
161 | 	}
162 | 	
163 | }
164 | 
165 | 


--------------------------------------------------------------------------------
/.idea/markdown-navigator.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="MarkdownProjectSettings" wasCopied="false">
 4 |     <PreviewSettings splitEditorLayout="SPLIT" splitEditorPreview="PREVIEW" useGrayscaleRendering="false" zoomFactor="1.0" maxImageWidth="0" showGitHubPageIfSynced="false" allowBrowsingInPreview="false" synchronizePreviewPosition="true" highlightPreviewType="NONE" highlightFadeOut="5" highlightOnTyping="true" synchronizeSourcePosition="true" verticallyAlignSourceAndPreviewSyncPosition="true" showSearchHighlightsInPreview="false" showSelectionInPreview="true" openRemoteLinks="true" replaceUnicodeEmoji="false" lastLayoutSetsDefault="false">
 5 |       <PanelProvider>
 6 |         <provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.panel" providerName="Default - Swing" />
 7 |       </PanelProvider>
 8 |     </PreviewSettings>
 9 |     <ParserSettings gitHubSyntaxChange="false" emojiShortcuts="1" emojiImages="0">
10 |       <PegdownExtensions>
11 |         <option name="ABBREVIATIONS" value="false" />
12 |         <option name="ANCHORLINKS" value="true" />
13 |         <option name="ASIDE" value="false" />
14 |         <option name="ATXHEADERSPACE" value="true" />
15 |         <option name="AUTOLINKS" value="true" />
16 |         <option name="DEFINITIONS" value="false" />
17 |         <option name="DEFINITION_BREAK_DOUBLE_BLANK_LINE" value="false" />
18 |         <option name="FENCED_CODE_BLOCKS" value="true" />
19 |         <option name="FOOTNOTES" value="false" />
20 |         <option name="HARDWRAPS" value="false" />
21 |         <option name="HTML_DEEP_PARSER" value="false" />
22 |         <option name="INSERTED" value="false" />
23 |         <option name="QUOTES" value="false" />
24 |         <option name="RELAXEDHRULES" value="true" />
25 |         <option name="SMARTS" value="false" />
26 |         <option name="STRIKETHROUGH" value="true" />
27 |         <option name="SUBSCRIPT" value="false" />
28 |         <option name="SUPERSCRIPT" value="false" />
29 |         <option name="SUPPRESS_HTML_BLOCKS" value="false" />
30 |         <option name="SUPPRESS_INLINE_HTML" value="false" />
31 |         <option name="TABLES" value="true" />
32 |         <option name="TASKLISTITEMS" value="true" />
33 |         <option name="TOC" value="false" />
34 |         <option name="WIKILINKS" value="false" />
35 |       </PegdownExtensions>
36 |       <ParserOptions>
37 |         <option name="ADMONITION_EXT" value="false" />
38 |         <option name="ATTRIBUTES_EXT" value="false" />
39 |         <option name="COMMONMARK_LISTS" value="true" />
40 |         <option name="DUMMY" value="false" />
41 |         <option name="EMOJI_SHORTCUTS" value="true" />
42 |         <option name="ENUMERATED_REFERENCES_EXT" value="false" />
43 |         <option name="FLEXMARK_FRONT_MATTER" value="false" />
44 |         <option name="GFM_LOOSE_BLANK_LINE_AFTER_ITEM_PARA" value="false" />
45 |         <option name="GFM_TABLE_RENDERING" value="true" />
46 |         <option name="GITBOOK_URL_ENCODING" value="false" />
47 |         <option name="GITHUB_LISTS" value="false" />
48 |         <option name="GITHUB_WIKI_LINKS" value="false" />
49 |         <option name="GITLAB_EXT" value="false" />
50 |         <option name="GITLAB_MATH_EXT" value="false" />
51 |         <option name="GITLAB_MERMAID_EXT" value="false" />
52 |         <option name="HEADER_ID_NON_ASCII_TO_LOWERCASE" value="false" />
53 |         <option name="HEADER_ID_NO_DUPED_DASHES" value="false" />
54 |         <option name="JEKYLL_FRONT_MATTER" value="false" />
55 |         <option name="MACROS_EXT" value="false" />
56 |         <option name="NO_TEXT_ATTRIBUTES" value="false" />
57 |         <option name="PARSE_HTML_ANCHOR_ID" value="false" />
58 |         <option name="SIM_TOC_BLANK_LINE_SPACER" value="true" />
59 |       </ParserOptions>
60 |     </ParserSettings>
61 |     <HtmlSettings headerTopEnabled="false" headerBottomEnabled="false" bodyTopEnabled="false" bodyBottomEnabled="false" embedUrlContent="false" addPageHeader="true" embedImages="false" embedHttpImages="false" imageUriSerials="false" addDocTypeHtml="true" noParaTags="false">
62 |       <GeneratorProvider>
63 |         <provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.generator" providerName="Default Swing HTML Generator" />
64 |       </GeneratorProvider>
65 |       <headerTop />
66 |       <headerBottom />
67 |       <bodyTop />
68 |       <bodyBottom />
69 |     </HtmlSettings>
70 |     <CssSettings previewScheme="UI_SCHEME" cssUri="" isCssUriEnabled="false" isCssUriSerial="true" isCssTextEnabled="false" isDynamicPageWidth="true">
71 |       <StylesheetProvider>
72 |         <provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.css" providerName="Default Swing Stylesheet" />
73 |       </StylesheetProvider>
74 |       <ScriptProviders />
75 |       <cssText />
76 |       <cssUriHistory />
77 |     </CssSettings>
78 |     <HtmlExportSettings updateOnSave="false" parentDir="" targetDir="" cssDir="" scriptDir="" plainHtml="false" imageDir="" copyLinkedImages="false" imageUniquifyType="0" targetPathType="2" targetExt="" useTargetExt="false" noCssNoScripts="false" useElementStyleAttribute="false" linkToExportedHtml="true" exportOnSettingsChange="true" regenerateOnProjectOpen="false" linkFormatType="HTTP_ABSOLUTE" />
79 |     <LinkMapSettings>
80 |       <textMaps />
81 |     </LinkMapSettings>
82 |   </component>
83 | </project>


--------------------------------------------------------------------------------
/src/main/java/test/lucene/LuceneTest.java:
--------------------------------------------------------------------------------
  1 | package test.lucene;
  2 | 
  3 | import com.hankcs.lucene.HanLPAnalyzer;
  4 | import test.lucene.rewriteTokenize.IKAnalyzer4Lucene7;
  5 | import lucene.XinAnalyzer;
  6 | import org.ansj.lucene7.AnsjAnalyzer;
  7 | import org.apache.lucene.analysis.Analyzer;
  8 | import org.apache.lucene.analysis.CharArraySet;
  9 | import org.apache.lucene.analysis.TokenStream;
 10 | import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
 11 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
 12 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 13 | import org.apache.lucene.document.Document;
 14 | import org.apache.lucene.document.Field;
 15 | import org.apache.lucene.document.TextField;
 16 | import org.apache.lucene.index.DirectoryReader;
 17 | import org.apache.lucene.index.IndexWriter;
 18 | import org.apache.lucene.index.IndexWriterConfig;
 19 | import org.apache.lucene.queryparser.classic.ParseException;
 20 | import org.apache.lucene.queryparser.classic.QueryParser;
 21 | import org.apache.lucene.search.IndexSearcher;
 22 | import org.apache.lucene.search.Query;
 23 | import org.apache.lucene.search.ScoreDoc;
 24 | import org.apache.lucene.store.Directory;
 25 | import org.apache.lucene.store.RAMDirectory;
 26 | import org.apache.lucene.util.Version;
 27 | import org.junit.jupiter.api.Test;
 28 | 
 29 | import java.io.IOException;
 30 | import java.util.Iterator;
 31 | 
 32 | public class LuceneTest {
 33 | 
 34 |     public static void main(String[] a) throws IOException, ParseException {
 35 |         Analyzer analyzer = new XinAnalyzer(XinAnalyzer.TYPE.HMM_XIN);
 36 |         analyzer.setVersion(Version.LUCENE_7_4_0);
 37 | 
 38 |         IndexWriterConfig config = new IndexWriterConfig(analyzer);
 39 |         // Store the index in memory:
 40 |         Directory directory = new RAMDirectory();
 41 |         // To store an index on disk, use this instead:
 42 |         //Directory directory = FSDirectory.open("/tmp/testindex");
 43 | 
 44 |         IndexWriter iwriter = new IndexWriter(directory, config);
 45 |         Document doc = new Document();
 46 |         String text = "今天天气很不错";
 47 |         doc.add(new Field("fieldname", text, TextField.TYPE_STORED));
 48 |         iwriter.addDocument(doc);
 49 |         Document doc1 = new Document();
 50 |         String text1 = "今天很不错";
 51 |         doc1.add(new Field("fieldname", text1, TextField.TYPE_STORED));
 52 | 
 53 |         iwriter.addDocument(doc1);
 54 |         iwriter.close();
 55 | 
 56 |         // Now search the index:
 57 |         DirectoryReader ireader = DirectoryReader.open(directory);
 58 |         IndexSearcher isearcher = new IndexSearcher(ireader);
 59 |         // Parse a simple query that searches for "text":
 60 |         QueryParser parser = new QueryParser("fieldname", analyzer);
 61 |         Query query = parser.parse("很");
 62 |         ScoreDoc[] hits = isearcher.search(query, 10).scoreDocs;
 63 | //        Assertions.assertEquals(1, hits.length);
 64 |         // Iterate through the results:
 65 |         for (int i = 0; i < hits.length; i++) {
 66 |             Document hitDoc = isearcher.doc(hits[i].doc);
 67 |             System.out.println(hitDoc.get("fieldname"));
 68 | //            Assertions.assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
 69 |         }
 70 |         ireader.close();
 71 |         directory.close();
 72 |     }
 73 | 
 74 | 
 75 |     @Test
 76 |     public void testStandardAnalyzer() throws Exception {
 77 |         StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
 78 |         print(standardAnalyzer);
 79 |     }
 80 | 
 81 | 
 82 |     @Test
 83 |     public void testSmartChineseAnalyzer() throws Exception {
 84 |         SmartChineseAnalyzer smartChineseAnalyzer = new SmartChineseAnalyzer();
 85 |         print(smartChineseAnalyzer);
 86 |     }
 87 | 
 88 |     /**
 89 |      * @throws Exception
 90 |      * @Description: 测试自定义停用词
 91 |      * 里面用的char数组
 92 |      */
 93 |     @Test
 94 |     public void testMySmartChineseAnalyzer() throws Exception {
 95 |         CharArraySet charArraySet = new CharArraySet(0, true);
 96 | 
 97 |         // 系统默认停用词
 98 |         Iterator<Object> iterator = SmartChineseAnalyzer.getDefaultStopSet().iterator();
 99 |         while (iterator.hasNext()) {
100 |             char[] chars = (char[]) iterator.next();
101 | //            for (char a : chars) {
102 | //                System.out.println(a);
103 | //            }
104 |             charArraySet.add(iterator.next());
105 |         }
106 | 
107 |         // 自定义停用词
108 |         String[] myStopWords = {"对", "的", "是", "其中"};
109 |         for (String stopWord : myStopWords) {
110 |             charArraySet.add(stopWord);
111 |         }
112 |         SmartChineseAnalyzer smartChineseAnalyzer = new SmartChineseAnalyzer(charArraySet);
113 |         print(smartChineseAnalyzer);
114 |     }
115 | 
116 |     @Test
117 |     public void testIKAnalyzer() throws Exception {
118 |         Analyzer analyzer = new IKAnalyzer4Lucene7();
119 |         print(analyzer);
120 |     }
121 | 
122 |     @Test
123 |     public void testAnsjAnalyzer() throws Exception {
124 |         Analyzer analyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.base_ansj);
125 |         print(analyzer);
126 |     }
127 | 
128 |     @Test
129 |     public void testXinAnalyzer() throws Exception {
130 |         Analyzer xanalyxer = new XinAnalyzer(XinAnalyzer.TYPE.BILSTMCRF_XIN);
131 |         print(xanalyxer);
132 |     }
133 | 
134 |     @Test
135 |     public void testHanAnalyzer() throws Exception {
136 |         Analyzer xanalyxer = new HanLPAnalyzer();
137 |         print(xanalyxer);
138 |     }
139 | 
140 |     private void print(Analyzer analyzer) throws Exception {
141 |         String text = "今天天气很不错/今天可以出去玩/你喜欢什么颜色";
142 |         TokenStream tokenStream = analyzer.tokenStream("content", text);
143 |         CharTermAttribute attribute = tokenStream.addAttribute(CharTermAttribute.class);
144 |         tokenStream.reset();
145 |         while (tokenStream.incrementToken()) {
146 |             System.out.println(attribute.toString());
147 |         }
148 |     }
149 | }
150 | 


--------------------------------------------------------------------------------
/src/main/java/test/gmm/GmmTest.java:
--------------------------------------------------------------------------------
  1 | package test.gmm;
  2 | 
  3 | import lombok.extern.slf4j.Slf4j;
  4 | import org.apache.commons.math3.distribution.NormalDistribution;
  5 | import org.apache.commons.math3.random.MersenneTwister;
  6 | import org.apache.commons.math3.random.RandomGenerator;
  7 | import org.apache.commons.math3.util.FastMath;
  8 | import org.junit.jupiter.api.Test;
  9 | 
 10 | import java.util.Arrays;
 11 | import java.util.Collections;
 12 | import java.util.List;
 13 | import java.util.Random;
 14 | 
 15 | /**
 16 |  * @Author unclewang
 17 |  * @Date 2018/11/15 14:15
 18 |  */
 19 | @Slf4j
 20 | public class GmmTest {
 21 |     //生成数据
 22 |     private final static int N = 100;
 23 |     private static Double[] men = new Double[(int) (N * 3.5)];
 24 |     private static Double[] women = new Double[N * 4];
 25 |     private static Double[] children = new Double[(int) (N * 2.5)];
 26 |     private static Double[] people = new Double[men.length + women.length + children.length];
 27 |     private static Double[] m = new Double[people.length];
 28 |     private static Double[] w = new Double[people.length];
 29 |     private static Double[] c = new Double[people.length];
 30 |     private static Double[] e = new Double[people.length];
 31 |     private RandomGenerator rg = new MersenneTwister(100);
 32 | 
 33 |     /**
 34 |      * EM算法参数定义
 35 |      */
 36 |     private Double[] mean;
 37 |     private Double[] sd;
 38 |     //k的和应该等于1,k[0]对应men，k[1]对应women，k[2]对应children
 39 |     private Double[] k;
 40 | 
 41 |     /**
 42 |      * TODO
 43 |      */
 44 |     @Test
 45 |     public void test() {
 46 |         init();
 47 |         for (int i = 0; i < 400; i++) {
 48 |             e();
 49 |             m();
 50 |         }
 51 |     }
 52 | 
 53 |     @Test
 54 |     public void m() {
 55 |         mean = new Double[]{reCountMean(m), reCountMean(w), reCountMean(c)};
 56 |         sd = new Double[]{reCountSd(m), reCountSd(w), reCountSd(c)};
 57 |         k = new Double[]{reCountK(m), reCountK(w), reCountK(c)};
 58 |         System.err.println(k[0] + "\t" + k[1] + "\t" + k[2]);
 59 |     }
 60 | 
 61 |     @Test
 62 |     public void e() {
 63 |         for (int i = 0; i < e.length; i++) {
 64 |             m[i] = k[0] * getP(people[i], mean[0], sd[0]);
 65 |             w[i] = k[1] * getP(people[i], mean[1], sd[1]);
 66 |             c[i] = k[2] * getP(people[i], mean[2], sd[2]);
 67 |             e[i] = m[i] + w[i] + c[i];
 68 |             m[i] /= e[i];
 69 |             w[i] /= e[i];
 70 |             c[i] /= e[i];
 71 |         }
 72 |         log.info("迭代结果：" + reCountMean(m) + "\t" + reCountMean(w) + "\t" + reCountMean(c));
 73 |     }
 74 | 
 75 | 
 76 |     public double reCountMean(Double[] d) {
 77 |         double sum = 0;
 78 |         double meanSum = 0;
 79 |         for (int i = 0; i < e.length; i++) {
 80 |             meanSum += d[i];
 81 |             sum += d[i] * people[i];
 82 |         }
 83 |         return sum / meanSum;
 84 |     }
 85 | 
 86 |     public double reCountSd(Double[] d) {
 87 |         double newMean = reCountMean(d);
 88 |         double sdSum = 0;
 89 |         double meanSum = 0;
 90 |         for (int i = 0; i < e.length; i++) {
 91 |             sdSum += d[i] * FastMath.pow(people[i] - newMean, 2);
 92 |             meanSum += d[i];
 93 |         }
 94 |         return sdSum / meanSum;
 95 |     }
 96 | 
 97 |     public double reCountK(Double[] d) {
 98 |         double meanSum = 0;
 99 |         for (int i = 0; i < d.length; i++) {
100 |             meanSum += d[i];
101 |         }
102 |         return meanSum / people.length;
103 |     }
104 | 
105 |     @Test
106 |     public void testLength() {
107 |         System.out.println(men.length);
108 |         System.out.println(women.length);
109 |         System.out.println(children.length);
110 |         System.out.println(men.length + women.length + children.length);
111 |         System.out.println(people.length);
112 |         System.out.println(e.length);
113 |     }
114 | 
115 |     @Test
116 |     public void init() {
117 |         generate();
118 |         //初始化参数，因为猜测来自三种人的分布，所以数组的长度都是3
119 |         mean = new Double[]{-170.0, 1600.3, 103.5};
120 |         sd = new Double[]{10.0, 10.2, 23.5};
121 |         k = new Double[]{0.3, 0.3, 0.4};
122 |         log.info("(1)正态分布的均值初始值设定：" + mean[0] + "\t" + mean[1] + "\t" + mean[2]);
123 |     }
124 | 
125 | 
126 |     @Test
127 |     public void testGetP() {
128 |         System.out.println(getP(0, 0, 1));
129 |         System.out.println(getP(3, 0, 1));
130 |         System.out.println(getP(-3, 0, 1));
131 |         System.out.println(getP(1, 162, 13));
132 |     }
133 | 
134 | 
135 |     public double getP(double x, double mean, double sd) {
136 |         NormalDistribution nd = new NormalDistribution(mean, sd);
137 |         double p = Math.abs(nd.cumulativeProbability(x));
138 |         return p > 0.5 ? 1 - p : p;
139 |     }
140 | 
141 |     public Double[] generatePeople(Double[] people, double mean, double sd) {
142 |         for (int i = 0; i < people.length; i++) {
143 |             people[i] = normal(mean, sd);
144 |         }
145 |         return people;
146 |     }
147 | 
148 |     public Double[] generate() {
149 |         log.info("正在生成1000个人的数据");
150 |         men = generatePeople(men, -178, 5);
151 |         women = generatePeople(women, 1630, 5);
152 |         children = generatePeople(children, 100, 4);
153 |         log.info("数据分布情况介绍：\n" + "平均值\t-178\t1630\t100\n标准差\t5\t5\t4");
154 |         for (int i = 0; i < people.length; i++) {
155 |             if (i < men.length) {
156 |                 people[i] = men[i];
157 |             } else if (i < men.length + women.length) {
158 |                 people[i] = women[i - men.length];
159 |             } else {
160 |                 people[i] = children[i - men.length - women.length];
161 |             }
162 |         }
163 |         List<Double> peopleList = Arrays.asList(people);
164 |         Collections.shuffle(peopleList, new Random(10));
165 |         people = peopleList.toArray(new Double[]{});
166 | 
167 | //        print(people);
168 |         return people;
169 |     }
170 | 
171 |     public double normal(double mean, double sd) {
172 |         NormalDistribution nd = new NormalDistribution(rg, mean, sd);
173 |         return nd.sample();
174 |     }
175 | 
176 |     public <T extends Object> void print(T[] nums) {
177 |         for (T a : nums) {
178 |             System.out.print(a + "\t");
179 |         }
180 |         System.out.println();
181 |     }
182 | }
183 | 


--------------------------------------------------------------------------------
/src/main/java/mining/cluster/KmeansCluster.java:
--------------------------------------------------------------------------------
  1 | package mining.cluster;
  2 | 
  3 | import com.google.common.collect.HashMultimap;
  4 | import mining.tfidf.AllDocTfIdf;
  5 | import org.junit.jupiter.api.Test;
  6 | 
  7 | import java.util.*;
  8 | import java.util.concurrent.atomic.AtomicReference;
  9 | 
 10 | public class KmeansCluster {
 11 |     private HashMultimap<Integer, Integer> clusterMember = HashMultimap.create();
 12 | 
 13 |     public Map<Integer, Integer> cluster(HashMap<Integer, HashMap<Integer, Double>> idTfidfs, int k) {
 14 |         int fileLen = idTfidfs.size();
 15 |         //记录每个聚类的成员点序号
 16 | 
 17 |         HashMap<Integer, HashMap<Integer, Double>> meansMap = getInitPoint(idTfidfs, k);
 18 |         System.out.println(meansMap);
 19 |         //distance[i][j]记录点i到聚类中心j的距离
 20 |         double[][] distance = new double[fileLen][k];
 21 |         //记录所有点属于的聚类序号，初始化全部为0
 22 |         int[] assignMeans = new int[fileLen];
 23 | 
 24 | 
 25 |         int iterNum = 0;
 26 |         while (true) {
 27 |             System.out.println("Iteration No." + (iterNum++) + "----------------------");
 28 |             //计算每个点和每个聚类中心的距离
 29 |             for (int i = 0; i < fileLen; i++) {
 30 |                 for (int j = 0; j < k; j++) {
 31 |                     distance[i][j] = getDistance(idTfidfs.get(i), meansMap.get(j));
 32 |                 }
 33 |             }
 34 |             //找出每个点最近的聚类中心
 35 |             int[] nearestMeans = new int[fileLen];
 36 |             for (int i = 0; i < fileLen; i++) {
 37 |                 nearestMeans[i] = findNearestMeans(distance, i);
 38 |             }
 39 |             //判断当前所有点属于的聚类序号是否已经全部是其离得最近的聚类，如果是或者达到最大的迭代次数，那么结束算法
 40 |             int okCount = 0;
 41 |             for (int i = 0; i < fileLen; i++) {
 42 |                 if (nearestMeans[i] == assignMeans[i]) {
 43 |                     okCount++;
 44 |                 }
 45 |             }
 46 |             if (okCount == fileLen || iterNum >= 10) {
 47 |                 break;
 48 |             }
 49 |             System.out.println("okCount = " + okCount);
 50 |             //如果前面条件不满足，那么需要重新聚类再进行一次迭代，需要修改每个聚类的成员和每个点属于的聚类信息
 51 |             clusterMember.clear();
 52 |             for (int i = 0; i < fileLen; i++) {
 53 |                 assignMeans[i] = nearestMeans[i];
 54 |                 clusterMember.put(nearestMeans[i], i);
 55 |             }
 56 |             for (int i = 0; i < k; i++) {
 57 |                 if (!clusterMember.containsKey(i)) {
 58 |                     continue;
 59 |                 }
 60 |                 HashMap<Integer, Double> newMean = computeNewMean(clusterMember.get(i), idTfidfs);
 61 |                 meansMap.put(i, newMean);
 62 |             }
 63 |         }
 64 |         //8、形成聚类结果并且返回
 65 |         Map<Integer, Integer> resMap = new TreeMap<>();
 66 |         for (int i = 0; i < fileLen; i++) {
 67 |             resMap.put(i, assignMeans[i]);
 68 |         }
 69 |         return resMap;
 70 |     }
 71 | 
 72 |     private HashMap<Integer, Double> computeNewMean(Set<Integer> integers, HashMap<Integer, HashMap<Integer, Double>> idTfidfs) {
 73 |         int size = integers.size();
 74 |         HashMap<Integer, Double> oneMean = new HashMap<>();
 75 |         for (Integer i : integers) {
 76 |             HashMap<Integer, Double> oneFile = idTfidfs.get(i);
 77 |             for (Map.Entry<Integer, Double> oneTerm : oneFile.entrySet()) {
 78 |                 int termKey = oneTerm.getKey();
 79 |                 double termValue = oneTerm.getValue();
 80 |                 if (!oneMean.containsKey(oneTerm.getKey())) {
 81 |                     oneMean.put(termKey, 0.0);
 82 |                 }
 83 |                 oneMean.put(termKey, oneMean.get(termKey) + (termValue / size));
 84 |             }
 85 |         }
 86 |         return oneMean;
 87 |     }
 88 | 
 89 |     private int findNearestMeans(double[][] distance, int m) {
 90 |         double minDist = Double.MAX_VALUE;
 91 |         int j = 0;
 92 |         for (int i = 0; i < distance[m].length; i++) {
 93 |             if (distance[m][i] < minDist) {
 94 |                 minDist = distance[m][i];
 95 |                 j = i;
 96 |             }
 97 |         }
 98 |         return j;
 99 |     }
100 | 
101 |     private double getDistance(HashMap<Integer, Double> map1, HashMap<Integer, Double> map2) {
102 |         return computeEuclidean(map1, map2);
103 | //        return computeCos(map1, map2);
104 |     }
105 | 
106 |     private double computeEuclidean(HashMap<Integer, Double> map1, HashMap<Integer, Double> map2) {
107 |         Set<Integer> keys = new HashSet<>();
108 |         keys.addAll(map1.keySet());
109 |         keys.addAll(map2.keySet());
110 |         double sum = 0;
111 |         for (Integer i : keys) {
112 |             sum += Math.pow(map1.getOrDefault(i, 0.0) - map2.getOrDefault(i, 0.0), 2);
113 |         }
114 |         return Math.sqrt(sum);
115 |     }
116 | 
117 |     private double computeCos(HashMap<Integer, Double> map1, HashMap<Integer, Double> map2) {
118 |         double norm1 = getMapNorm(map1);
119 |         double norm2 = getMapNorm(map2);
120 |         AtomicReference<Double> mul = new AtomicReference<>(0.0);
121 |         map1.forEach((k, v) -> {
122 |             if (map2.containsKey(k)) {
123 |                 mul.updateAndGet(v1 -> v1 + v * map2.get(k));
124 |             }
125 |         });
126 |         return mul.get() / (norm1 * norm2);
127 |     }
128 | 
129 |     private double getMapNorm(HashMap<Integer, Double> map) {
130 |         AtomicReference<Double> sum = new AtomicReference<>(0.0);
131 |         map.forEach((k, v) -> sum.updateAndGet(v1 -> v1 + Math.pow(v, 2)));
132 |         return Math.sqrt(sum.get());
133 |     }
134 | 
135 |     private HashMap<Integer, HashMap<Integer, Double>> getInitPoint(HashMap<Integer, HashMap<Integer, Double>> idTfidfs, int k) {
136 |         int count = 0;
137 |         int i = 0;
138 |         HashMap<Integer, HashMap<Integer, Double>> meansMap = new HashMap<>();
139 |         for (Map.Entry<Integer, HashMap<Integer, Double>> entry : idTfidfs.entrySet()) {
140 |             if (count == i * idTfidfs.size() / k) {
141 |                 meansMap.put(i, entry.getValue());
142 |                 clusterMember.put(count, i);
143 |                 i++;
144 |             }
145 |             count++;
146 |         }
147 |         return meansMap;
148 |     }
149 | 
150 |     @Test
151 |     public void test() {
152 |         AllDocTfIdf allDocTfIdf = new AllDocTfIdf();
153 |         HashMap<Integer, HashMap<Integer, Double>> idTfIDf = allDocTfIdf.loadAllDocTfIdf();
154 |         KmeansCluster kmeansCluster = new KmeansCluster();
155 |         kmeansCluster.cluster(idTfIDf, 20);
156 |     }
157 | }
158 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |     <modelVersion>4.0.0</modelVersion>
  4 | 
  5 |     <groupId>info.unclewang</groupId>
  6 |     <artifactId>xinlp</artifactId>
  7 |     <version>1.0-SNAPSHOT</version>
  8 | 
  9 |     <properties>
 10 |         <lucene.version>7.4.0</lucene.version>
 11 |         <maven.compiler.source>1.8</maven.compiler.source>
 12 |         <maven.compiler.target>1.8</maven.compiler.target>
 13 |     </properties>
 14 | 
 15 | 
 16 |     <dependencies>
 17 |         <dependency>
 18 |             <groupId>ch.qos.logback</groupId>
 19 |             <artifactId>logback-classic</artifactId>
 20 |             <version>1.2.3</version>
 21 |         </dependency>
 22 |         <dependency>
 23 |             <groupId>org.projectlombok</groupId>
 24 |             <artifactId>lombok</artifactId>
 25 |             <version>1.18.16</version>
 26 |         </dependency>
 27 |         <dependency>
 28 |             <groupId>org.junit.jupiter</groupId>
 29 |             <artifactId>junit-jupiter-api</artifactId>
 30 |             <version>5.3.2</version>
 31 |             <scope>compile</scope>
 32 |         </dependency>
 33 |         <dependency>
 34 |             <groupId>junit</groupId>
 35 |             <artifactId>junit</artifactId>
 36 |             <version>4.13.1</version>
 37 |         </dependency>
 38 |         <dependency>
 39 |             <groupId>com.google.guava</groupId>
 40 |             <artifactId>guava</artifactId>
 41 |             <version>27.0-jre</version>
 42 |         </dependency>
 43 |         <dependency>
 44 |             <groupId>commons-io</groupId>
 45 |             <artifactId>commons-io</artifactId>
 46 |             <version>2.6</version>
 47 |         </dependency>
 48 |         <dependency>
 49 |             <groupId>org.apache.commons</groupId>
 50 |             <artifactId>commons-math3</artifactId>
 51 |             <version>3.6.1</version>
 52 |         </dependency>
 53 |         <dependency>
 54 |             <groupId>org.apache.lucene</groupId>
 55 |             <artifactId>lucene-core</artifactId>
 56 |             <version>${lucene.version}</version>
 57 |         </dependency>
 58 |         <dependency>
 59 |             <groupId>org.apache.lucene</groupId>
 60 |             <artifactId>lucene-suggest</artifactId>
 61 |             <version>7.4.0</version>
 62 |         </dependency>
 63 |         <dependency>
 64 |             <groupId>org.apache.lucene</groupId>
 65 |             <artifactId>lucene-analyzers-common</artifactId>
 66 |             <version>7.4.0</version>
 67 |         </dependency>
 68 |         <dependency>
 69 |             <groupId>org.apache.lucene</groupId>
 70 |             <artifactId>lucene-queryparser</artifactId>
 71 |             <version>${lucene.version}</version>
 72 |         </dependency>
 73 | 
 74 |         <dependency>
 75 |             <groupId>org.apache.lucene</groupId>
 76 |             <artifactId>lucene-analyzers-smartcn</artifactId>
 77 |             <version>${lucene.version}</version>
 78 |         </dependency>
 79 |         <!-- ikanalyzer 中文分词器  -->
 80 |         <dependency>
 81 |             <groupId>com.janeluo</groupId>
 82 |             <artifactId>ikanalyzer</artifactId>
 83 |             <version>2012_u6</version>
 84 |             <!--排除掉里面旧的lucene包，因为我们要重写里面的分析器和分词器  -->
 85 |             <exclusions>
 86 |                 <exclusion>
 87 |                     <groupId>org.apache.lucene</groupId>
 88 |                     <artifactId>lucene-core</artifactId>
 89 |                 </exclusion>
 90 |                 <exclusion>
 91 |                     <groupId>org.apache.lucene</groupId>
 92 |                     <artifactId>lucene-queryparser</artifactId>
 93 |                 </exclusion>
 94 |                 <exclusion>
 95 |                     <groupId>org.apache.lucene</groupId>
 96 |                     <artifactId>lucene-analyzers-common</artifactId>
 97 |                 </exclusion>
 98 |             </exclusions>
 99 |         </dependency>
100 |         <dependency>
101 |             <groupId>org.ansj</groupId>
102 |             <artifactId>ansj_seg</artifactId>
103 |             <version>5.1.6</version>
104 |         </dependency>
105 |         <!-- https://mvnrepository.com/artifact/org.ansj/ansj_lucene7_plug -->
106 |         <dependency>
107 |             <groupId>org.ansj</groupId>
108 |             <artifactId>ansj_lucene7_plug</artifactId>
109 |             <version>5.1.5.1</version>
110 |         </dependency>
111 |         <!-- https://mvnrepository.com/artifact/com.hankcs.nlp/hanlp-lucene-plugin -->
112 |         <dependency>
113 |             <groupId>com.hankcs.nlp</groupId>
114 |             <artifactId>hanlp-lucene-plugin</artifactId>
115 |             <version>1.1.6</version>
116 |         </dependency>
117 |         <!-- https://mvnrepository.com/artifact/com.hankcs/hanlp -->
118 |         <dependency>
119 |             <groupId>com.hankcs</groupId>
120 |             <artifactId>hanlp</artifactId>
121 |             <version>portable-1.7.0</version>
122 |         </dependency>
123 |         <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
124 |         <dependency>
125 |             <groupId>org.apache.httpcomponents</groupId>
126 |             <artifactId>httpclient</artifactId>
127 |             <version>4.5.6</version>
128 |         </dependency>
129 |         <dependency>
130 |             <groupId>com.alibaba</groupId>
131 |             <artifactId>fastjson</artifactId>
132 |             <version>1.2.53</version>
133 |         </dependency>
134 |         <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
135 |         <!--<dependency>-->
136 |         <!--<groupId>org.apache.spark</groupId>-->
137 |         <!--<artifactId>spark-mllib_2.12</artifactId>-->
138 |         <!--<version>2.4.0</version>-->
139 |         <!--<scope>runtime</scope>-->
140 |         <!--</dependency>-->
141 | 
142 | 
143 |         <!-- https://mvnrepository.com/artifact/org.deeplearning4j/deeplearning4j-core -->
144 |         <dependency>
145 |             <groupId>org.deeplearning4j</groupId>
146 |             <artifactId>deeplearning4j-core</artifactId>
147 |             <version>1.0.0-beta3</version>
148 |         </dependency>
149 |         <dependency>
150 |             <groupId>org.nd4j</groupId>
151 |             <artifactId>nd4j-native-platform</artifactId>
152 |             <version>1.0.0-beta3</version>
153 |         </dependency>
154 |         <!-- https://mvnrepository.com/artifact/org.deeplearning4j/deeplearning4j-datasets -->
155 |         <dependency>
156 |             <groupId>org.deeplearning4j</groupId>
157 |             <artifactId>deeplearning4j-datasets</artifactId>
158 |             <version>1.0.0-beta3</version>
159 |         </dependency>
160 | 
161 |         <dependency>
162 |             <groupId>gov.nist.math</groupId>
163 |             <artifactId>jama</artifactId>
164 |             <version>1.0.3</version>
165 |         </dependency>
166 | 
167 |     </dependencies>
168 | </project>


--------------------------------------------------------------------------------
/src/main/java/segment/crf/XinCRFSegment.java:
--------------------------------------------------------------------------------
  1 | package segment.crf;
  2 | 
  3 | import lombok.Data;
  4 | import lombok.extern.java.Log;
  5 | import lucene.Atom;
  6 | import segment.Segment;
  7 | 
  8 | import java.util.LinkedList;
  9 | import java.util.List;
 10 | import java.util.Map;
 11 | 
 12 | /**
 13 |  * @Author unclewang
 14 |  * @Date 2018-11-27 18:59
 15 |  */
 16 | @Log
 17 | @Data
 18 | public class XinCRFSegment implements Segment {
 19 |     private static XinCRFModel xinCRFModel;
 20 |     private static DoubleArrayTrie<FeatureFunction> featureFunctionTrie;
 21 |     private static List<FeatureTemplate> featureTemplateList;
 22 |     private static Double[][] matrix;
 23 |     private static String[] id2tag;
 24 |     private static Map<String, Integer> tag2id;
 25 | 
 26 |     static {
 27 |         log.info("模型正在加载中");
 28 |         log.info("尝试直接读取模型文件");
 29 |         xinCRFModel = XinCRFModel.load(XinCRFConfig.getModelPath());
 30 |         if (xinCRFModel == null) {
 31 |             log.info("直接读取模型文件失败，重新构建XinCRFModel");
 32 |             long start = System.currentTimeMillis();
 33 |             xinCRFModel = XinCRFModel.getInstance();
 34 |             log.info("重新创建模型成功，共耗时" + (System.currentTimeMillis() - start) + "ms");
 35 |         }
 36 |         featureFunctionTrie = xinCRFModel.getFeatureFunctionTrie();
 37 |         featureTemplateList = xinCRFModel.getFeatureTemplateList();
 38 |         matrix = xinCRFModel.getMatrix();
 39 |         id2tag = xinCRFModel.getId2tag();
 40 |         tag2id = xinCRFModel.getTag2id();
 41 |         assert xinCRFModel != null;
 42 | //        XinCRFConfig.print(matrix);
 43 |     }
 44 | 
 45 |     public String viterbi(String sentence) {
 46 |         XinTable table = sentence2XinTable(sentence);
 47 |         return viterbi(table);
 48 |     }
 49 | 
 50 |     private String viterbi(XinTable table) {
 51 |         int observationNum = table.size();
 52 |         if (observationNum == 0) {
 53 |             return "";
 54 |         }
 55 |         int stateNum = xinCRFModel.getId2tag().length;
 56 | 
 57 |         Double[][] emissionProbability = new Double[observationNum][stateNum];
 58 |         for (int t = 0; t < observationNum; t++) {
 59 |             //找到t时刻的观测值所有相关的模版以及模版的得分
 60 |             LinkedList<double[]> scoreList = findAllTemplateOfTableI(table, t);
 61 |             for (int i = 0; i < stateNum; i++) {
 62 |                 //找到t时刻的观测值为各种状态的得分
 63 |                 emissionProbability[t][i] = computeScore(scoreList, i);
 64 |             }
 65 |         }
 66 | 
 67 | //        XinCRFConfig.print(emissionProbability);
 68 |         if (observationNum == 1) {
 69 |             double maxScore = -1e10;
 70 |             int bestTag = 0;
 71 |             for (int tag = 0; tag < emissionProbability[0].length; ++tag) {
 72 |                 if (emissionProbability[0][tag] > maxScore) {
 73 |                     maxScore = emissionProbability[0][tag];
 74 |                     bestTag = tag;
 75 |                 }
 76 |             }
 77 |             table.setLast(0, id2tag[bestTag]);
 78 |             return "";
 79 |         }
 80 |         Integer[][] path = new Integer[observationNum][stateNum];
 81 |         Double[][] deltas = new Double[observationNum][stateNum];
 82 |         for (int i = 0; i < stateNum; i++) {
 83 |             deltas[0][i] = emissionProbability[0][i];
 84 |             path[0][i] = i;
 85 |         }
 86 |         for (int t = 1; t < observationNum; t++) {
 87 |             for (int i = 0; i < stateNum; i++) {
 88 |                 double maxScore = -1e10;
 89 |                 for (int j = 0; j < stateNum; j++) {
 90 |                     double tmp = deltas[t - 1][j] + matrix[j][i] + emissionProbability[t][i];
 91 |                     if (tmp > maxScore) {
 92 |                         maxScore = tmp;
 93 |                         deltas[t][i] = tmp;
 94 |                         path[t][i] = j;
 95 |                     }
 96 |                 }
 97 |             }
 98 |         }
 99 | 
100 | 
101 | //        XinCRFConfig.print(path);
102 | //        XinCRFConfig.print(deltas);
103 | 
104 | 
105 |         if (deltas[observationNum - 1][1] > deltas[observationNum - 1][3]) {
106 |             table.v[observationNum - 1][1] = id2tag[1];
107 |         } else {
108 |             table.v[observationNum - 1][1] = id2tag[3];
109 |         }
110 | 
111 |         //找最优路径，注意最后一个字不是所有状态的最大值，而是E(1)和S(3)的最大值
112 |         for (int i = observationNum - 2; i >= 0; i--) {
113 |             table.v[i][1] = id2tag[path[i + 1][tag2id.get(table.v[i + 1][1])]];
114 | //            table.setLast(i, id2tag[path[i + 1][tag2id.get(table.get(i + 1, 1))]]);
115 |         }
116 |         StringBuilder sb = new StringBuilder();
117 |         for (int i = 0; i < table.v.length; i++) {
118 |             sb.append(table.v[i][0]);
119 |             if ("S".equals(table.v[i][1]) || "E".equals(table.v[i][1])) {
120 |                 sb.append("\t");
121 |             }
122 |         }
123 |         System.out.println(sb.toString());
124 |         return sb.toString();
125 |     }
126 | 
127 |     //计算所有模版在某个隐藏状态值的得分
128 |     private Double computeScore(LinkedList<double[]> scoreList, int i) {
129 |         double score = 0.0;
130 |         for (double[] w : scoreList) {
131 |             score += w[i];
132 |         }
133 |         return score;
134 |     }
135 | 
136 | 
137 |     /**
138 |      * 找到所有具体实例的模版函数
139 |      *
140 |      * @param table
141 |      * @param current
142 |      */
143 |     private LinkedList<double[]> findAllTemplateOfTableI(XinTable table, int current) {
144 |         LinkedList<double[]> scoreList = new LinkedList<>();
145 |         for (FeatureTemplate ft : featureTemplateList) {
146 |             //找到所有具体实例的模版函数
147 |             char[] o = ft.generateParameter(table, current);
148 |             //找到模版函数的参数
149 |             FeatureFunction featureFunction = featureFunctionTrie.getFunction(o);
150 |             //存在函数
151 |             if (featureFunction != null) {
152 |                 scoreList.add(featureFunction.getW());
153 |             }
154 |         }
155 |         return scoreList;
156 |     }
157 | 
158 | 
159 |     private XinTable sentence2XinTable(String sentence) {
160 |         char[] chars = sentence.toCharArray();
161 |         int size = chars.length;
162 |         String[][] v = new String[size][2];
163 |         for (int i = 0; i < size; i++) {
164 |             v[i][0] = String.valueOf(chars[i]);
165 |             v[i][1] = "?";
166 |         }
167 |         XinTable xinTable = new XinTable();
168 |         xinTable.setV(v);
169 |         return xinTable;
170 |     }
171 | 
172 | 
173 |     public static void main(String[] args) {
174 |         XinCRFSegment xinCRFSegment = new XinCRFSegment();
175 |         xinCRFSegment.viterbi("今天天气很好");
176 |         xinCRFSegment.viterbi("你好");
177 |         xinCRFSegment.viterbi("商品和服务");
178 |         xinCRFSegment.viterbi("武汉大学非常美");
179 |         xinCRFSegment.viterbi("我是中国人");
180 |         xinCRFSegment.viterbi("迈向充满希望的新司机");
181 |         xinCRFSegment.viterbi("香港特别行政区");
182 |     }
183 | 
184 |     @Override
185 |     public List<Atom> seg(String text) {
186 |         String[] strings = viterbi(text).trim().split("[\t\n]");
187 |         return strings2AtomList(strings);
188 |     }
189 | }
190 | 


--------------------------------------------------------------------------------
/src/main/java/test/hmm/HmmTest.java:
--------------------------------------------------------------------------------
  1 | package test.hmm;
  2 | 
  3 | import org.apache.commons.math3.linear.Array2DRowRealMatrix;
  4 | import org.apache.commons.math3.linear.RealMatrix;
  5 | import org.junit.jupiter.api.Test;
  6 | 
  7 | /**
  8 |  * @Author unclewang
  9 |  * @Date 2018/11/16 20:44
 10 |  */
 11 | public class HmmTest {
 12 |     Double[] pi = new Double[]{0.2, 0.25, 0.25, 0.25};
 13 |     double[][] stateTransfer = new double[][]{{0, 1, 0, 0}, {0.4, 0, 0.6, 0}, {0, 0.4, 0, 0.6}, {0, 0, 0.5, 0.5}};
 14 |     double[][] observeProbability = new double[][]{{0.5, 0.5}, {0.3, 0.7}, {0.6, 0.4}, {0.8, 0.2}};
 15 | 
 16 |     RealMatrix stateTransferMatrix = new Array2DRowRealMatrix(stateTransfer);
 17 |     RealMatrix observeProbabilityMatrix = new Array2DRowRealMatrix(observeProbability);
 18 |     Integer[] observeSequence = new Integer[]{0, 1, 0, 1, 0};
 19 | 
 20 | 
 21 |     //观测序列长度
 22 |     Integer T = observeSequence.length;
 23 |     //状态长度
 24 |     Integer N = pi.length;
 25 |     //viterbi算法
 26 |     Integer[][] path = new Integer[T][N];
 27 |     Double[][] dp = new Double[T][N];
 28 |     //前向算法
 29 |     Double[] alpha = new Double[N];
 30 |     Double[][] fp = new Double[T][N];
 31 |     //后向算法
 32 |     Double[] beta = new Double[N];
 33 |     Double[][] bp = new Double[T][N];
 34 | 
 35 | 
 36 |     /**
 37 |      * 书上的例子的参数
 38 |      */
 39 |     public void initExample() {
 40 |         pi = new Double[]{0.2, 0.4, 0.4};
 41 |         stateTransfer = new double[][]{{0.5, 0.2, 0.3}, {0.3, 0.5, 0.2}, {0.2, 0.3, 0.5}};
 42 |         observeProbability = new double[][]{{0.5, 0.5}, {0.4, 0.6}, {0.7, 0.3}};
 43 | 
 44 |         stateTransferMatrix = new Array2DRowRealMatrix(stateTransfer);
 45 | 
 46 |         observeProbabilityMatrix = new Array2DRowRealMatrix(observeProbability);
 47 |         observeSequence = new Integer[]{0, 1, 0};
 48 | 
 49 | 
 50 |         T = observeSequence.length;
 51 |         N = pi.length;
 52 | 
 53 |         path = new Integer[T][N];
 54 |         dp = new Double[T][N];
 55 | 
 56 |         alpha = new Double[N];
 57 |         fp = new Double[T][N];
 58 | 
 59 |         beta = new Double[N];
 60 |         bp = new Double[T][N];
 61 | 
 62 |     }
 63 | 
 64 |     /**
 65 |      * 概率预测问题
 66 |      * 前向算法
 67 |      */
 68 |     @Test
 69 |     public void forwardProbability() {
 70 | //        initExample();
 71 |         /**
 72 |          * 计算初始值
 73 |          */
 74 |         double[] s = observeProbabilityMatrix.getColumn(observeSequence[0]);
 75 |         for (int i = 0; i < N; i++) {
 76 |             alpha[i] = pi[i] * s[i];
 77 |         }
 78 |         fp[0] = alpha;
 79 | 
 80 |         for (int i = 1; i < T; i++) {
 81 |             alpha = reCountAlpha(alpha, observeSequence[i]);
 82 |             fp[i] = alpha;
 83 |         }
 84 |         double probability = 0.0;
 85 |         for (int i = 0; i < N; i++) {
 86 |             probability += fp[T - 1][i];
 87 |         }
 88 |         System.err.println(probability);
 89 |     }
 90 | 
 91 | 
 92 |     /**
 93 |      * 重新计算序列出现概率
 94 |      * 前向算法
 95 |      *
 96 |      * @param d
 97 |      * @param state
 98 |      * @return
 99 |      */
100 |     public Double[] reCountAlpha(Double[] d, int state) {
101 |         Double[] result = new Double[d.length];
102 |         for (int i = 0; i < d.length; i++) {
103 |             double[] iTransfer = stateTransferMatrix.getColumn(i);
104 |             double sum = 0;
105 |             for (int j = 0; j < iTransfer.length; j++) {
106 |                 sum += iTransfer[j] * d[j];
107 |             }
108 |             double[] s = observeProbabilityMatrix.getColumn(state);
109 |             result[i] = sum * s[i];
110 |         }
111 |         return result;
112 |     }
113 | 
114 | 
115 |     /**
116 |      * 参数学习问题，假定上面的pi和状态转移矩阵和观测概率矩阵都是刚被初始化的，并不是最能生成观测序列的
117 |      * 利用EM算法实现的baum-welch算法
118 |      */
119 |     @Test
120 |     public void learning() {
121 |         initExample();
122 |         forwardProbability();
123 |         backwardProbability();
124 |         Double[][] gamma = dot(fp, bp);
125 |         print(fp);
126 |         print(bp);
127 |         print(gamma);
128 |     }
129 | 
130 |     private Double[][] dot(Double[][] fp, Double[][] bp) {
131 |         Double[][] fpBp = new Double[fp.length][fp[0].length];
132 |         for (int i = 0; i < fp.length; i++) {
133 |             for (int j = 0; j < fp[0].length; j++) {
134 |                 fpBp[i][j] = fp[i][j] * bp[i][j];
135 |             }
136 |         }
137 |         return fpBp;
138 |     }
139 | 
140 | 
141 |     @Test
142 |     public void backwardProbability() {
143 |         Double[] beta = new Double[N];
144 |         for (int i = 0; i < N; i++) {
145 |             beta[i] = 1.0;
146 |         }
147 |         bp[T - 1] = beta;
148 |         for (int i = T - 2; i >= 0; i--) {
149 |             beta = reCountBeta(beta, observeSequence[i + 1]);
150 |             bp[i] = beta;
151 |         }
152 |     }
153 | 
154 |     public Double[] reCountBeta(Double[] beta, int state) {
155 |         Double[] result = new Double[N];
156 |         for (int i = 0; i < N; i++) {
157 |             double[] iTransfer = stateTransferMatrix.getRow(i);
158 |             double[] b = observeProbabilityMatrix.getColumn(state);
159 |             double sum = 0;
160 |             for (int j = 0; j < beta.length; j++) {
161 |                 //就是b[j]乘的地方不一样
162 |                 sum += beta[j] * iTransfer[j] * b[j];
163 |             }
164 |             result[i] = sum;
165 |         }
166 |         return result;
167 |     }
168 | 
169 | 
170 |     /**
171 |      * 序列预测问题，viterbi算法
172 |      */
173 |     @Test
174 |     public void prediction() {
175 |         initExample();
176 |         //第一步，初始化
177 |         double[] s = observeProbabilityMatrix.getColumn(observeSequence[0]);
178 |         for (int i = 0; i < N; i++) {
179 |             dp[0][i] = pi[i] * s[i];
180 |             path[0][i] = i;
181 |         }
182 |         for (int i = 1; i < T; i++) {
183 |             //每一个t的概率都是前面N个t-1的概率乘以各种转移之后的最大的;
184 |             for (int j = 0; j < N; j++) {
185 |                 double[] observeProbabilityMatrixColumn = observeProbabilityMatrix.getColumn(observeSequence[i]);
186 |                 updateDP(i, j, observeProbabilityMatrixColumn[j]);
187 |             }
188 |         }
189 |         Integer[] mostLikelyStateSequence = new Integer[T];
190 |         mostLikelyStateSequence[T - 1] = max(dp[T - 1]);
191 | 
192 |         for (int i = mostLikelyStateSequence.length - 2; i >= 0; i--) {
193 |             mostLikelyStateSequence[i] = path[i][mostLikelyStateSequence[i + 1]];
194 |         }
195 |         print(mostLikelyStateSequence);
196 |     }
197 | 
198 |     public void updateDP(int row, int col, double observeP) {
199 |         Double[] ijState = new Double[N];
200 |         for (int i = 0; i < N; i++) {
201 |             ijState[i] = dp[row - 1][i] * stateTransfer[i][col];
202 |         }
203 |         int index = max(ijState);
204 |         path[row][col] = index;
205 |         dp[row][col] = ijState[index] * observeP;
206 |     }
207 | 
208 |     public int max(Double[] d) {
209 |         double max = Double.MIN_VALUE;
210 |         int index = 0;
211 |         for (int i = 0; i < d.length; i++) {
212 |             if (d[i] > max) {
213 |                 index = i;
214 |                 max = d[i];
215 |             }
216 |         }
217 |         return index;
218 |     }
219 | 
220 | 
221 |     public <T extends Object> void print(T[] nums) {
222 |         for (T a : nums) {
223 |             System.out.print(a + "\t");
224 |         }
225 |         System.out.println();
226 |     }
227 | 
228 |     public <T extends Object> void print(T[][] nums) {
229 |         for (int i = 0; i < nums.length; i++) {
230 |             for (int j = 0; j < nums[0].length; j++) {
231 |                 System.out.print(nums[i][j] + "\t");
232 |             }
233 |             System.out.println();
234 |         }
235 |         System.out.println();
236 |     }
237 | }
238 | 


--------------------------------------------------------------------------------
/src/main/java/lda/LdaModel.java:
--------------------------------------------------------------------------------
  1 | package lda;
  2 | 
  3 | import lombok.Data;
  4 | import lombok.extern.slf4j.Slf4j;
  5 | import org.apache.commons.io.FileUtils;
  6 | 
  7 | import java.io.BufferedWriter;
  8 | import java.io.File;
  9 | import java.io.FileWriter;
 10 | import java.io.IOException;
 11 | import java.util.ArrayList;
 12 | import java.util.List;
 13 | 
 14 | @Data
 15 | @Slf4j
 16 | public class LdaModel {
 17 |     int V, K, M;//词表长度, 主题数量, 文档数量
 18 |     int[][] docs;//第m个文档第n个单词的词表索引
 19 |     int[][] z;// 第m个文档第n个单词的所在主题
 20 |     float alpha; //文档-主题 狄利克雷先验参数
 21 |     float beta; //主题-单词 狄利克雷先验参数
 22 |     int[][] nmk;//每个文档下不同主题的个数, 文档长度*主题个数
 23 |     int[][] nkt;//每个主题下不同词的个数，主题个数*词表长度
 24 |     int[] nmkSum;//第k个文档单词的个数
 25 |     int[] nktSum;//第k个主题单词的个数
 26 |     double[][] phi;//每个主题下所有单词的概率分布，主题个数*词表长度
 27 |     double[][] theta;//每个文档下所有主题的概率分布，文档长度*主题个数
 28 |     int iterations;//迭代次数
 29 |     int saveStep;//每隔几步保存一次
 30 |     int beginSaveIters;//从哪一次开始保存
 31 |     
 32 |     public LdaModel(Parameter parameter) {
 33 |         this.alpha = parameter.getAlpha();
 34 |         this.beta = parameter.getBeta();
 35 |         this.iterations = parameter.getIteration();
 36 |         this.K = parameter.getTopicNum();
 37 |         this.saveStep = parameter.getSaveStep();
 38 |         this.beginSaveIters = parameter.getBeginSaveIters();
 39 |     }
 40 |     
 41 |     public void init(Documents docSet) {
 42 |         this.M = docSet.getDocs().size();
 43 |         this.V = docSet.getTermToIndexMap().size();
 44 |         this.nmk = new int[M][K];
 45 |         this.nkt = new int[K][V];
 46 |         this.nmkSum = new int[M];
 47 |         this.nktSum = new int[K];
 48 |         this.phi = new double[K][V];
 49 |         this.theta = new double[M][K];
 50 |         this.docs = new int[M][];
 51 |         
 52 |         for (int m = 0; m < M; m++) {
 53 |             int n = docSet.getDocs().get(m).getDocWords().length;
 54 |             docs[m] = new int[n];
 55 |             System.arraycopy(docSet.getDocs().get(m).getDocWords(), 0, docs[m], 0, n);
 56 |         }
 57 |         
 58 |         this.z = new int[M][];
 59 |         for (int i = 0; i < M; i++) {
 60 |             int n = docSet.getDocs().get(i).getDocWords().length;
 61 |             z[i] = new int[n];
 62 |             for (int j = 0; j < n; j++) {
 63 |                 int k = (int) (Math.random() * K);
 64 |                 z[i][j] = k;
 65 |                 nmk[i][k]++;
 66 |                 nkt[k][docs[i][j]]++;
 67 |                 nktSum[k]++;
 68 |             }
 69 |             nmkSum[i] = n;
 70 |         }
 71 |     }
 72 |     
 73 |     public void inference(Documents docSet) {
 74 |         if (iterations < saveStep + beginSaveIters) {
 75 |             System.err.println("Error: the number of iterations should be larger than " + (saveStep + beginSaveIters));
 76 |             System.exit(0);
 77 |         }
 78 |         
 79 |         for (int i = 0; i < iterations; i++) {
 80 |             log.info("迭代次数：" + i);
 81 |             if ((i >= beginSaveIters) && (((i - beginSaveIters) % saveStep) == 0)) {
 82 |                 log.info("当前迭代次数为{},保存模型", i);
 83 |                 updateEstimatedParameters();
 84 |                 saveIteratedModel(i);
 85 |             }
 86 |             
 87 |             for (int m = 0; m < M; m++) {
 88 |                 int N = docSet.getDocs().get(m).getDocWords().length;
 89 |                 for (int n = 0; n < N; n++) {
 90 |                     // Sample from p(z_i|z_-i, w)
 91 |                     int newTopic = sampleTopicZ(m, n);
 92 |                     z[m][n] = newTopic;
 93 |                 }
 94 |             }
 95 |         }
 96 |     }
 97 |     
 98 |     /**
 99 |      * 吉布斯采样
100 |      *
101 |      * @param m
102 |      * @param n
103 |      * @return
104 |      */
105 |     private int sampleTopicZ(int m, int n) {
106 |         //删除 w_{m,n}
107 |         int oldTopic = z[m][n];
108 |         nmk[m][oldTopic]--;
109 |         nkt[oldTopic][docs[m][n]]--;
110 |         nmkSum[m]--;
111 |         nktSum[oldTopic]--;
112 |         
113 |         //计算 p(z_i = k|z_-i, w)
114 |         double[] p = new double[K];
115 |         for (int k = 0; k < K; k++) {
116 |             p[k] = (nkt[k][docs[m][n]] + beta) / (nktSum[k] + V * beta) * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
117 |         }
118 |         
119 |         for (int k = 1; k < K; k++) {
120 |             p[k] += p[k - 1];
121 |         }
122 |         double u = Math.random() * p[K - 1];
123 |         int newTopic;
124 |         for (newTopic = 0; newTopic < K; newTopic++) {
125 |             if (u < p[newTopic]) {
126 |                 break;
127 |             }
128 |         }
129 |         
130 |         // w_{m, n} 换成新topic
131 |         nmk[m][newTopic]++;
132 |         nkt[newTopic][docs[m][n]]++;
133 |         nmkSum[m]++;
134 |         nktSum[newTopic]++;
135 |         return newTopic;
136 |     }
137 |     
138 |     public void saveIteratedModel(int i) {
139 |         String resPath = LDAConfig.RESPATH;
140 |         String modelPath = resPath + "lda_" + i;
141 |         ArrayList<String> lines = new ArrayList<String>();
142 |         lines.add("alpha = " + alpha);
143 |         lines.add("beta = " + beta);
144 |         lines.add("topicNum = " + K);
145 |         lines.add("docNum = " + M);
146 |         lines.add("termNum = " + V);
147 |         lines.add("iterations = " + iterations);
148 |         lines.add("saveStep = " + saveStep);
149 |         lines.add("beginSaveIters = " + beginSaveIters);
150 |         try {
151 |             FileUtils.writeLines(new File(modelPath + ".params"), lines);
152 |         } catch (IOException e) {
153 |             e.printStackTrace();
154 |         }
155 |         
156 |         writeMatrix(phi, modelPath + ".phi");
157 |         writeMatrix(theta, modelPath + ".theta");
158 |         writeDocTopic(docs, z, modelPath + ".all");
159 |         writeTopWords(modelPath + ".topicNwords");
160 |     }
161 |     
162 |     private void writeTopWords(String path) {
163 |         int topNum = 20;
164 |         try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
165 |             for (int i = 0; i < K; i++) {
166 |                 List<Integer> words = new ArrayList<>();
167 |                 for (int j = 0; j < V; j++) {
168 |                     words.add(j);
169 |                 }
170 |                 double[] phii = phi[i];
171 |                 words.sort((o1, o2) -> {
172 |                     double minus = phii[o2] - phii[o1];
173 |                     if (minus == 0) {
174 |                         return 0;
175 |                     }
176 |                     return minus > 0 ? 1 : -1;
177 |                 });
178 |                 writer.write("topic " + i + "\t:\t");
179 |                 for (int t = 0; t < topNum; t++) {
180 |                     writer.write(Documents.getIndexToTermList().get(words.get(t)) + "=" + phi[i][words.get(t)] + "\t");
181 |                 }
182 |                 writer.write("\n");
183 |             }
184 |         } catch (IOException e) {
185 |             e.printStackTrace();
186 |         }
187 |     }
188 |     
189 |     
190 |     private void writeMatrix(double[][] matrix, String path) {
191 |         try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
192 |             for (int i = 0; i < matrix.length; i++) {
193 |                 for (int j = 0; j < matrix[0].length; j++) {
194 |                     writer.write(matrix[i][j] + "\t");
195 |                 }
196 |                 writer.write("\n");
197 |             }
198 |         } catch (IOException e) {
199 |             e.printStackTrace();
200 |         }
201 |     }
202 |     
203 |     private void writeDocTopic(int[][] doc, int[][] topic, String path) {
204 |         try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
205 |             for (int i = 0; i < doc.length; i++) {
206 |                 for (int j = 0; j < doc[i].length; j++) {
207 |                     writer.write(doc[i][j] + ":" + topic[i][j] + ":" + Documents.getIndexToTermList().get(j) + "\t");
208 |                 }
209 |                 writer.write("\n");
210 |             }
211 |         } catch (IOException e) {
212 |             e.printStackTrace();
213 |         }
214 |     }
215 |     
216 |     
217 |     private void updateEstimatedParameters() {
218 |         for (int m = 0; m < M; m++) {
219 |             for (int k = 0; k < K; k++) {
220 |                 theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
221 |             }
222 |         }
223 |         
224 |         for (int k = 0; k < K; k++) {
225 |             for (int t = 0; t < V; t++) {
226 |                 phi[k][t] = (nkt[k][t] + beta) / (nktSum[k] + V * beta);
227 |             }
228 |         }
229 |     }
230 | }
231 | 


--------------------------------------------------------------------------------
/src/main/java/mining/tfidf/AllDocTfIdf.java:
--------------------------------------------------------------------------------
  1 | package mining.tfidf;
  2 | 
  3 | import com.alibaba.fastjson.JSON;
  4 | import com.alibaba.fastjson.JSONObject;
  5 | import com.google.common.collect.BiMap;
  6 | import com.google.common.collect.HashBiMap;
  7 | import com.google.common.io.Files;
  8 | import lombok.extern.slf4j.Slf4j;
  9 | import mining.config.Config;
 10 | import org.apache.commons.math3.util.FastMath;
 11 | 
 12 | import java.io.File;
 13 | import java.io.FileNotFoundException;
 14 | import java.io.IOException;
 15 | import java.nio.charset.Charset;
 16 | import java.util.*;
 17 | 
 18 | /**
 19 |  * @Author unclewang
 20 |  * @Date 2018-12-16 15:55
 21 |  */
 22 | @Slf4j
 23 | public class AllDocTfIdf {
 24 |     private static String postFiles = Config.getPostPath();
 25 |     private static HashMap<Integer, HashMap<Integer, Double>> idTf = new HashMap<>();
 26 |     private static HashMap<Integer, HashMap<Integer, Double>> idTfIDf = new HashMap<>();
 27 |     private static HashMap<Integer, HashSet<Integer>> idDf = new HashMap<>();
 28 |     private static double[] idf;
 29 |     private static BiMap<Integer, String> idFiles = Config.getIdPostFiles(postFiles);
 30 |     private static HashSet<Integer> noFeatureWordId = new HashSet<>();
 31 |     private static BiMap<Integer, Integer> termIdVocabularyId = HashBiMap.create();
 32 | 
 33 | 
 34 |     public static BiMap<Integer, String> getIdFiles() {
 35 |         return idFiles;
 36 |     }
 37 | 
 38 |     private void generateAllTfDf() {
 39 |         log.info("正在计算文档频率");
 40 |         for (int i = 0; i < idFiles.size(); i++) {
 41 |             OneDocTfDf oneDocTfDf = new OneDocTfDf();
 42 |             log.info("正在计算文档:" + i);
 43 |             oneDocTfDf.calOneFileTf(idFiles.get(i));
 44 |             idTf.put(i, oneDocTfDf.getIdTf());
 45 |             idDf.put(i, oneDocTfDf.getIdDf());
 46 |         }
 47 |     }
 48 | 
 49 |     private void generateIdNums() {
 50 |         log.info("正在生成word的文档频率");
 51 |         Set<Integer> ids = Vocabulary.getWordIds().values();
 52 |         int fileSize = idFiles.size();
 53 |         int size = ids.size();
 54 |         int[] idNums = new int[size];
 55 | 
 56 |         for (Map.Entry<Integer, HashSet<Integer>> entry : idDf.entrySet()) {
 57 |             for (Integer i : entry.getValue()) {
 58 |                 idNums[i] += 1;
 59 |             }
 60 |         }
 61 |         int sum = 0;
 62 |         int max = 0;
 63 |         idf = new double[size];
 64 |         for (int i = 0; i < size; i++) {
 65 |             idf[i] = FastMath.log(((fileSize + 1) * 1.0) / (idNums[i] + 1)) + 1;
 66 |             sum += idNums[i];
 67 |             if (idNums[i] > max) {
 68 |                 max = idNums[i];
 69 |             }
 70 |         }
 71 |         System.out.println(size + "个词一共出现了" + sum + "次");
 72 |         System.out.println(size + "个词，平均一个词出现了" + sum * 1.0 / size + "次");
 73 |         System.out.println(size + "个词中出现的最高次数为" + max + "次");
 74 |     }
 75 | 
 76 |     private HashMap<Integer, HashMap<Integer, Double>> calTfIdf() {
 77 |         HashMap<Integer, HashMap<Integer, Double>> idTfIDf = new HashMap<>();
 78 |         setNoFeatureWordId();
 79 |         for (Map.Entry<Integer, HashMap<Integer, Double>> entry : idTf.entrySet()) {
 80 |             HashMap<Integer, Double> tfIdf = new HashMap<>();
 81 |             int key = entry.getKey();
 82 |             HashMap<Integer, Double> value = entry.getValue();
 83 |             for (Map.Entry<Integer, Double> tfEntry : value.entrySet()) {
 84 |                 if (!noFeatureWordId.contains(tfEntry.getKey())) {
 85 |                     tfIdf.put(tfEntry.getKey(), tfEntry.getValue() * idf[tfEntry.getKey()]);
 86 |                 }
 87 |             }
 88 |             idTfIDf.put(key, tfIdf);
 89 |         }
 90 |         return idTfIDf;
 91 |     }
 92 | 
 93 |     private void printAllDocTfIdf(HashMap<Integer, HashMap<Integer, Double>> idTfIDf) {
 94 |         String path = Config.getTfidfsPath();
 95 |         StringBuilder sb = new StringBuilder();
 96 | 
 97 |         for (Map.Entry<Integer, HashMap<Integer, Double>> entry : idTfIDf.entrySet()) {
 98 |             String json = JSON.toJSONString(entry.getValue());
 99 |             sb.append(entry.getKey() + "\t\t\t" + json + "\n");
100 |         }
101 |         try {
102 |             Files.write(sb.toString().getBytes(), new File(path));
103 |         } catch (IOException e) {
104 |             e.printStackTrace();
105 |         }
106 |     }
107 | 
108 |     /**
109 |      * 这个写的有点麻烦了，不过就这样吧，
110 |      * termid和vocabulary的id不一样，本来想着简化了
111 |      * 后来发现vocabulary太大了，没法矩阵运算，所以利用DF选取特征词以后，在进行SVD
112 |      */
113 |     private void generateTermID() {
114 |         log.info("重新生成特征词term的id");
115 |         BiMap<String, Integer> wordIds = Vocabulary.getWordIds();
116 |         for (int i = 0; i < Vocabulary.getWordSize() - noFeatureWordId.size(); ) {
117 |             for (Integer v : wordIds.values()) {
118 |                 if (!noFeatureWordId.contains(v)) {
119 |                     termIdVocabularyId.put(i, v);
120 |                     i++;
121 |                 }
122 |             }
123 |         }
124 |         StringBuilder sb = new StringBuilder();
125 |         termIdVocabularyId.forEach((integer, integer2) -> sb.append(integer + "\t\t\t" + integer2 + "\n"));
126 |         try {
127 |             Files.write(sb.toString().getBytes(), new File(Config.getTermIdPath()));
128 |         } catch (IOException e) {
129 |             e.printStackTrace();
130 |         }
131 |         log.info("生成特征词term的id结束");
132 |     }
133 | 
134 | 
135 |     /**
136 |      * DF法选取特征词，最大为1000，最小为3
137 |      */
138 |     private void setNoFeatureWordId() {
139 |         int maxDF = 100;
140 |         int minDF = 5;
141 | 
142 |         double minIdf = FastMath.log(((idFiles.size() + 1) * 1.0) / (maxDF + 1)) + 1;
143 |         double maxIdf = FastMath.log(((idFiles.size() + 1) * 1.0) / (minDF + 1)) + 1;
144 |         for (int i = 0; i < idf.length; i++) {
145 |             if (idf[i] < minIdf || idf[i] > maxIdf) {
146 |                 noFeatureWordId.add(i);
147 |             }
148 |         }
149 |         log.info("一共剔除了" + noFeatureWordId.size() + "个单词");
150 |         System.out.println(maxIdf + "\t" + minIdf);
151 |     }
152 | 
153 |     public static BiMap<Integer, Integer> loadTermid() throws FileNotFoundException {
154 |         BiMap<Integer, Integer> termId = HashBiMap.create();
155 |         if (new File(Config.getTermIdPath()).exists()) {
156 |             log.info("文件已存在，直接读取");
157 |             try {
158 |                 List<String> stringList = Files.readLines(new File(Config.getTermIdPath()), Charset.defaultCharset());
159 |                 for (String s : stringList) {
160 |                     String[] sp = s.split("\t\t\t");
161 |                     termId.put(Integer.parseInt(sp[0]), Integer.parseInt(sp[1]));
162 |                 }
163 |             } catch (IOException e) {
164 |                 e.printStackTrace();
165 |             }
166 |         } else {
167 |             throw new FileNotFoundException("先运行loadAllDocTfIdf方法就好了");
168 |         }
169 |         return termId;
170 |     }
171 | 
172 |     public HashMap<Integer, HashMap<Integer, Double>> loadAllDocTfIdf() {
173 |         HashMap<Integer, HashMap<Integer, Double>> idTfIDf = new HashMap<>();
174 |         if (new File(Config.getTfidfsPath()).exists()) {
175 |             log.info("文件已存在，直接读取");
176 |             try {
177 |                 List<String> stringList = Files.readLines(new File(Config.getTfidfsPath()), Charset.defaultCharset());
178 |                 for (String s : stringList) {
179 |                     String[] split = s.split("\t\t\t");
180 |                     HashMap map = JSONObject.parseObject(split[1], HashMap.class);
181 |                     HashMap<Integer, Double> doubleHashMap = new HashMap<>();
182 |                     for (Object entry : map.keySet()) {
183 |                         int key = Integer.parseInt(entry.toString());
184 |                         doubleHashMap.put(key, Double.parseDouble(map.get(entry).toString()));
185 |                     }
186 |                     idTfIDf.put(Integer.parseInt(split[0]), doubleHashMap);
187 |                 }
188 |             } catch (IOException e) {
189 |                 e.printStackTrace();
190 |             }
191 |         } else {
192 |             log.info("文件不存在，开始生成");
193 |             generateAllTfDf();
194 |             generateIdNums();
195 |             idTfIDf = calTfIdf();
196 |             printAllDocTfIdf(idTfIDf);
197 |             generateTermID();
198 |         }
199 |         log.info("文件已经读取结束");
200 |         return idTfIDf;
201 |     }
202 | 
203 |     public static void main(String[] args) {
204 |         AllDocTfIdf allDocTfIdf = new AllDocTfIdf();
205 |         HashMap<Integer, HashMap<Integer, Double>> idTfIDf = allDocTfIdf.loadAllDocTfIdf();
206 |         System.out.println(idTfIDf.size());
207 |     }
208 | }
209 | 


--------------------------------------------------------------------------------
/src/main/java/segment/hmm/XinHmmSegment.java:
--------------------------------------------------------------------------------
  1 | package segment.hmm;
  2 | 
  3 | import com.alibaba.fastjson.JSON;
  4 | import com.alibaba.fastjson.JSONObject;
  5 | import com.google.common.collect.BiMap;
  6 | import com.google.common.collect.HashBiMap;
  7 | import lombok.Data;
  8 | import lombok.extern.slf4j.Slf4j;
  9 | import lucene.Atom;
 10 | import org.apache.commons.io.FileUtils;
 11 | import org.junit.jupiter.api.Test;
 12 | import segment.Segment;
 13 | import tools.PathUtils;
 14 | 
 15 | import java.io.File;
 16 | import java.io.IOException;
 17 | import java.math.BigDecimal;
 18 | import java.util.HashMap;
 19 | import java.util.HashSet;
 20 | import java.util.List;
 21 | import java.util.Map;
 22 | 
 23 | /**
 24 |  * viterbi算法
 25 |  * 已知状态转移矩阵A、概率观测矩阵B、初始状态概率向量Pi和观测序列O
 26 |  * 求可能性最大的状态序列
 27 |  * 在分词问题上，状态集合是BEMS
 28 |  * O是"武汉大学真美"
 29 |  * I是"BMMESS"
 30 |  * 下面是Jieba分词的参数
 31 |  * Pi是
 32 |  * {'B': -0.26268660809250016,
 33 |  * 'E': -3.14e+100,
 34 |  * 'M': -3.14e+100,
 35 |  * 'S': -1.4652633398537678}
 36 |  * A是
 37 |  * {'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
 38 |  * 'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
 39 |  * 'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
 40 |  * 'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
 41 |  * B是
 42 |  * {'B': {'\u4e00': -3.6544978750449433,
 43 |  * '\u4e01': -8.125041941842026,
 44 |  * '\u4e03': -7.817392401429855,
 45 |  * '\u4e07': -6.3096425804013165,
 46 |  * '\u4e08': -8.866689067453933,
 47 |  * '\u4e09': -5.932085850549891,
 48 |  * '\u4e0a': -5.739552583325728,
 49 |  * '\u4e0b': -5.997089097239644,
 50 |  * '\u4e0d': -4.274262055936421,
 51 |  * '\u4e0e': -8.355569307500769,
 52 |  * ...},
 53 |  * 'E': {'\u4e00': -6.044987536255073,
 54 |  * '\u4e01': -9.075800412310807,
 55 |  * '\u4e03': -9.198842005220659,
 56 |  * '\u4e07': -7.655326112989935,
 57 |  * '\u4e08': -9.02382100266782,
 58 |  * '\u4e09': -7.978829805438807,
 59 |  * '\u4e0a': -5.323135439997585,
 60 |  * '\u4e0b': -5.739644714409899,
 61 |  * ...},
 62 |  * 'M': {...},
 63 |  * 'S': {...}
 64 |  * }
 65 |  *
 66 |  * @author unclewang
 67 |  */
 68 | @Data
 69 | @Slf4j
 70 | public class XinHmmSegment implements Segment {
 71 | 
 72 |     private static char[] state = new char[]{'B', 'E', 'M', 'S'};
 73 |     /**
 74 |      * 状态值集合的大小 N
 75 |      **/
 76 |     protected static int stateNum = state.length;
 77 | 
 78 |     protected static final Double MIN = -3.14e+100;
 79 |     /**
 80 |      * 初始状态概率Pi
 81 |      **/
 82 |     protected Double[] pi;
 83 |     /**
 84 |      * 转移概率A
 85 |      **/
 86 |     protected Double[][] transferProbability;
 87 |     /**
 88 |      * 发射概率B
 89 |      **/
 90 |     protected Double[][] emissionProbability;
 91 | 
 92 |     /**
 93 |      * 观测值集合的大小 出现了几种可能性，红白球的话就是2,分词的话机会是词表的长度
 94 |      **/
 95 |     protected int observationNum;
 96 | 
 97 |     /**
 98 |      * 观测序列O，比如 武汉大学真美
 99 |      */
100 |     private Integer[] observeSequence;
101 |     /**
102 |      * 词典和id双向对应map
103 |      */
104 |     private BiMap<String, Integer> wordId;
105 | 
106 | 
107 |     public XinHmmSegment() {
108 |         initLambda();
109 | 
110 |     }
111 | 
112 |     /**
113 |      * 使用jieba分词使用的概率
114 |      */
115 |     public void initLambda() {
116 |         initPi();
117 |         initA();
118 |         initB();
119 |     }
120 | 
121 |     @Override
122 |     public List<Atom> seg(String text) {
123 |         String segResult = viterbi(text);
124 |         String[] strings = segResult.split("[\t\n]");
125 |         return strings2AtomList(strings);
126 |     }
127 | 
128 |     /**
129 |      * 维特比算法
130 |      */
131 |     public String viterbi(String s) {
132 |         initLambda();
133 |         String[] sentences = s.split("[,.?;。，]");
134 |         StringBuilder sb = new StringBuilder();
135 |         for (String sentence : sentences) {
136 |             sb.append(viterbi(str2int(sentence)) + "\n");
137 |         }
138 |         return sb.toString();
139 |     }
140 | 
141 | 
142 |     public String viterbi(Integer[] observeSequence) {
143 |         observationNum = observeSequence.length;
144 |         Integer[][] path = new Integer[observationNum][stateNum];
145 |         Double[][] deltas = new Double[observationNum][stateNum];
146 | 
147 |         for (int i = 0; i < stateNum; i++) {
148 |             deltas[0][i] = pi[i] + emissionProbability[i][observeSequence[0]];
149 |             path[0][i] = i;
150 |         }
151 | 
152 |         for (int t = 1; t < observationNum; t++) {
153 |             for (int i = 0; i < stateNum; i++) {
154 |                 deltas[t][i] = deltas[t - 1][0] + transferProbability[0][i];
155 |                 path[t][i] = 0;
156 |                 for (int j = 1; j < stateNum; j++) {
157 |                     double tmp = deltas[t - 1][j] + transferProbability[j][i];
158 |                     if (tmp > deltas[t][i]) {
159 |                         deltas[t][i] = tmp;
160 |                         path[t][i] = j;
161 |                     }
162 |                 }
163 |                 deltas[t][i] += emissionProbability[i][observeSequence[t]];
164 |             }
165 |         }
166 | 
167 |         //找最优路径，注意最后一个字不是所有状态的最大值，而是E(1)和S(3)的最大值
168 |         Integer[] bestStateSequence = new Integer[observationNum];
169 |         bestStateSequence[observationNum - 1] = deltas[observationNum - 1][1] >= deltas[observationNum - 1][3] ? 1 : 3;
170 | 
171 |         for (int i = bestStateSequence.length - 2; i >= 0; i--) {
172 |             bestStateSequence[i] = path[i + 1][bestStateSequence[i + 1]];
173 |         }
174 |         StringBuilder sb = new StringBuilder();
175 |         for (int i = 0; i < observationNum; i++) {
176 |             //妈的，这一段希望没人看到，不然就搞笑了
177 |             if (observeSequence[i] == 9999) {
178 |                 sb.append("/");
179 |             } else {
180 |                 sb.append(wordId.inverse().get(observeSequence[i]));
181 |             }
182 |             if (bestStateSequence[i] == 1 || bestStateSequence[i] == 3) {
183 |                 sb.append("\t");
184 |             }
185 |         }
186 |         return sb.toString();
187 | 
188 |     }
189 | 
190 |     public Integer[] str2int(String s) {
191 |         char[] chars = s.toCharArray();
192 |         Integer[] res = new Integer[chars.length];
193 |         for (int i = 0; i < chars.length; i++) {
194 |             res[i] = wordId.getOrDefault(String.valueOf(chars[i]), 9999);
195 |         }
196 |         return res;
197 |     }
198 | 
199 | 
200 |     private void initB() {
201 |         try {
202 |             String list = FileUtils.readFileToString(new File(PathUtils.getDataPath() + "/segment/hmm/B.json"), "UTF8");
203 |             JSONObject jsonObject = JSON.parseObject(list);
204 |             Map<String, Double> bMap = toDouble(JSON.parseObject(jsonObject.get("B").toString()).getInnerMap());
205 |             Map<String, Double> eMap = toDouble(JSON.parseObject(jsonObject.get("E").toString()).getInnerMap());
206 |             Map<String, Double> mMap = toDouble(JSON.parseObject(jsonObject.get("M").toString()).getInnerMap());
207 |             Map<String, Double> sMap = toDouble(JSON.parseObject(jsonObject.get("S").toString()).getInnerMap());
208 |             HashSet<String> wordSet = new HashSet<>(bMap.keySet());
209 |             wordSet.addAll(eMap.keySet());
210 |             wordSet.addAll(mMap.keySet());
211 |             wordSet.addAll(sMap.keySet());
212 |             emissionProbability = new Double[stateNum][wordSet.size()];
213 |             wordId = HashBiMap.create();
214 |             int i = 0;
215 |             for (String s : wordSet) {
216 |                 wordId.put(s, i);
217 |                 emissionProbability[0][i] = bMap.getOrDefault(s, MIN);
218 |                 emissionProbability[1][i] = eMap.getOrDefault(s, MIN);
219 |                 emissionProbability[2][i] = mMap.getOrDefault(s, MIN);
220 |                 emissionProbability[3][i] = sMap.getOrDefault(s, MIN);
221 |                 i++;
222 |             }
223 |         } catch (IOException e) {
224 |             e.printStackTrace();
225 |         }
226 |     }
227 | 
228 |     public static Map<String, Double> toDouble(Map<String, Object> map) {
229 |         Map<String, Double> res = new HashMap<>();
230 |         map.forEach((key, value) -> res.put(key, ((BigDecimal) value).doubleValue()));
231 |         return res;
232 |     }
233 | 
234 | 
235 |     private void initA() {
236 |         transferProbability = new Double[][]{
237 |                 {MIN, -0.510825623765990, -0.916290731874155, MIN},
238 |                 {-0.5897149736854513, MIN, MIN, -0.8085250474669937},
239 |                 {MIN, -0.33344856811948514, -1.2603623820268226, MIN},
240 |                 {-0.7211965654669841, MIN, MIN, -0.6658631448798212}};
241 |     }
242 | 
243 |     private void initPi() {
244 |         pi = new Double[]{-0.26268660809250016, -3.14e+100, -3.14e+100, -1.4652633398537678};
245 |     }
246 | 
247 |     public <T extends Object> void print(T[] nums) {
248 |         for (T a : nums) {
249 |             System.out.print(a + "\t");
250 |         }
251 |         System.out.println();
252 |     }
253 | 
254 |     public <T extends Object> void print(T[][] nums) {
255 |         for (int i = 0; i < nums.length; i++) {
256 |             for (int j = 0; j < nums[0].length; j++) {
257 |                 System.out.print(nums[i][j] + "\t");
258 |             }
259 |             System.out.println();
260 |         }
261 |         System.out.println();
262 |     }
263 | 
264 |     @Test
265 |     public void segmentTest() {
266 |         initLambda();
267 |         System.out.println(viterbi("今天的天气很好，出来散心挺不错，武汉大学特别好，提高人民的生活水平"));
268 |     }
269 | }
270 | 


--------------------------------------------------------------------------------