├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── jiayan
│   ├── __init__.py
│   ├── __main__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── char_pos_dict.json
│   │   └── dict.txt
│   ├── examples.py
│   ├── globals.py
│   ├── lexicon
│   │   ├── __init__.py
│   │   └── pmi_entropy_constructor.py
│   ├── linguistic_unit.py
│   ├── postagger
│   │   ├── README.md
│   │   ├── __init__.py
│   │   └── crf_pos_tagger.py
│   ├── sentencizer
│   │   ├── __init__.py
│   │   ├── crf_punctuator.py
│   │   ├── crf_sent_tagger.py
│   │   └── crf_sentencizer.py
│   ├── tokenizer
│   │   ├── __init__.py
│   │   ├── hmm_tokenizer.py
│   │   └── ngram_tokenizer.py
│   ├── translator
│   │   └── __init__.py
│   └── utils.py
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | #################
2 | ## Jiayan
3 | #################
4 | test.py
5 | jiayan.klm
6 | tokenizer.cache
7 | cut_model
8 | punc_model
9 | pos_model
10 |
11 |
12 | #################
13 | ## Eclipse
14 | #################
15 |
16 | *.pydevproject
17 | .project
18 | .metadata
19 | bin/
20 | tmp/
21 | *.tmp
22 | *.bak
23 | *.swp
24 | *~.nib
25 | local.properties
26 | .classpath
27 | .settings/
28 | .loadpath
29 |
30 | # External tool builders
31 | .externalToolBuilders/
32 |
33 | # Locally stored "Eclipse launch configurations"
34 | *.launch
35 |
36 | # CDT-specific
37 | .cproject
38 |
39 | # PDT-specific
40 | .buildpath
41 |
42 |
43 | #################
44 | ## Visual Studio
45 | #################
46 |
47 | ## Ignore Visual Studio temporary files, build results, and
48 | ## files generated by popular Visual Studio add-ons.
49 |
50 | # User-specific files
51 | *.suo
52 | *.user
53 | *.sln.docstates
54 |
55 | # Build results
56 | [Dd]ebug/
57 | [Rr]elease/
58 | *_i.c
59 | *_p.c
60 | *.ilk
61 | *.meta
62 | *.obj
63 | *.pch
64 | *.pdb
65 | *.pgc
66 | *.pgd
67 | *.rsp
68 | *.sbr
69 | *.tlb
70 | *.tli
71 | *.tlh
72 | *.tmp
73 | *.vspscc
74 | .builds
75 | *.dotCover
76 |
77 | ## TODO: If you have NuGet Package Restore enabled, uncomment this
78 | #packages/
79 |
80 | # Visual C++ cache files
81 | ipch/
82 | *.aps
83 | *.ncb
84 | *.opensdf
85 | *.sdf
86 |
87 | # Visual Studio profiler
88 | *.psess
89 | *.vsp
90 |
91 | # ReSharper is a .NET coding add-in
92 | _ReSharper*
93 |
94 | # Installshield output folder
95 | [Ee]xpress
96 |
97 | # DocProject is a documentation generator add-in
98 | DocProject/buildhelp/
99 | DocProject/Help/*.HxT
100 | DocProject/Help/*.HxC
101 | DocProject/Help/*.hhc
102 | DocProject/Help/*.hhk
103 | DocProject/Help/*.hhp
104 | DocProject/Help/Html2
105 | DocProject/Help/html
106 |
107 | # Click-Once directory
108 | publish
109 |
110 | # Others
111 | [Bb]in
112 | [Oo]bj
113 | sql
114 | TestResults
115 | *.Cache
116 | ClientBin
117 | stylecop.*
118 | ~$*
119 | *.dbmdl
120 | Generated_Code #added for RIA/Silverlight projects
121 |
122 | # Backup & report files from converting an old project file to a newer
123 | # Visual Studio version. Backup files are not needed, because we have git ;-)
124 | _UpgradeReport_Files/
125 | Backup*/
126 | UpgradeLog*.XML
127 | ############
128 | ## pycharm
129 | ############
130 | .idea
131 |
132 | ############
133 | ## Windows
134 | ############
135 |
136 | # Windows image file caches
137 | Thumbs.db
138 |
139 | # Folder config file
140 | Desktop.ini
141 |
142 |
143 | #############
144 | ## Python
145 | #############
146 |
147 | *.py[co]
148 |
149 | # Packages
150 | *.egg
151 | *.egg-info
152 | dist
153 | build
154 | eggs
155 | parts
156 | bin
157 | var
158 | sdist
159 | develop-eggs
160 | .installed.cfg
161 |
162 | # Installer logs
163 | pip-log.txt
164 |
165 | # Unit test / coverage reports
166 | .coverage
167 | .tox
168 |
169 | #Translations
170 | *.mo
171 |
172 | #Mr Developer
173 | .mr.developer.cfg
174 |
175 | # Mac crap
176 | .DS_Store
177 | *.log
178 | test/tmp/*
179 |
180 | #jython
181 | *.class
182 |
183 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Jiajie Yan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include jiayan/data/dict.txt
2 | include jiayan/data/char_pos_dict.json
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 甲言Jiayan
2 | [](https://pypi.org/project/jiayan/)
3 | 
4 |
5 | [中文](#简介)
6 | [English](#introduction)
7 |
8 | ## 简介
9 | 甲言,取「甲骨文言」之意,是一款专注于古汉语处理的NLP工具包。
10 | 目前通用的汉语NLP工具多以现代汉语为核心语料,对古代汉语的处理效果并不如人意(详见[分词](#2))。本项目的初衷,便是辅助古汉语信息处理,帮助有志于挖掘古文化矿藏的古汉语学者、爱好者等更好地分析和利用文言资料,从「文化遗产」中创造出「文化新产」。
11 | 当前版本支持[词库构建](#1)、[自动分词](#2)、[词性标注](#3)、[文言句读](#4)和[标点](#5)五项功能,更多功能正在开发中。
12 |
13 | ## 功能
14 | * [__词库构建__](#1)
15 | * 利用无监督的双[字典树](https://baike.baidu.com/item/Trie树)、[点互信息](https://www.jianshu.com/p/79de56cbb2c7)以及左右邻接[熵](https://baike.baidu.com/item/信息熵/7302318?fr=aladdin)进行文言词库自动构建。
16 | * [__分词__](#2)
17 | * 利用无监督、无词典的[N元语法](https://baike.baidu.com/item/n元语法)和[隐马尔可夫模型](https://baike.baidu.com/item/隐马尔可夫模型)进行古汉语自动分词。
18 | * 利用词库构建功能产生的文言词典,基于有向无环词图、句子最大概率路径和动态规划算法进行分词。
19 | * [__词性标注__](#3)
20 | * 基于词的[条件随机场](https://baike.baidu.com/item/条件随机场)的序列标注,词性详见[词性表](jiayan/postagger/README.md)。
21 | * [__断句__](#4)
22 | * 基于字符的条件随机场的序列标注,引入点互信息及[t-测试值](https://baike.baidu.com/item/t检验/9910799?fr=aladdin)为特征,对文言段落进行自动断句。
23 | * [__标点__](#5)
24 | * 基于字符的层叠式条件随机场的序列标注,在断句的基础上对文言段落进行自动标点。
25 | * 文白翻译
26 | * 开发中,目前处于文白平行语料收集、清洗阶段。
27 | * 基于[双向长短时记忆循环网络](https://baike.baidu.com/item/长短期记忆人工神经网络/17541107?fromtitle=LSTM&fromid=17541102&fr=aladdin)和[注意力机制](https://baike.baidu.com/item/注意力机制)的神经网络生成模型,对古文进行自动翻译。
28 | * 注意:受语料影响,目前不支持繁体。如需处理繁体,可先用[OpenCC](https://github.com/yichen0831/opencc-python)将输入转换为简体,再将结果转化为相应繁体(如港澳台等)。
29 |
30 | ## 安装
31 | $ pip install jiayan
32 | $ pip install https://github.com/kpu/kenlm/archive/master.zip
33 |
34 | ## 使用
35 | 以下各模块的使用方法均来自[examples.py](jiayan/examples.py)。
36 | 1. 下载模型并解压:[百度网盘](https://pan.baidu.com/s/1PXP0eSQWWcNmAb6lkuB5sw),提取码:`p0sc`
37 | * jiayan.klm:语言模型,主要用来分词,以及句读标点任务中的特征提取;
38 | * pos_model:CRF词性标注模型;
39 | * cut_model:CRF句读模型;
40 | * punc_model:CRF标点模型;
41 | * 庄子.txt:用来测试词库构建的庄子全文。
42 |
43 | 2. __词库构建__
44 | ```
45 | from jiayan import PMIEntropyLexiconConstructor
46 |
47 | constructor = PMIEntropyLexiconConstructor()
48 | lexicon = constructor.construct_lexicon('庄子.txt')
49 | constructor.save(lexicon, '庄子词库.csv')
50 | ```
51 |
52 | 结果:
53 | ```
54 | Word,Frequency,PMI,R_Entropy,L_Entropy
55 | 之,2999,80,7.944909328101839,8.279435615456894
56 | 而,2089,80,7.354575005231323,8.615211168836439
57 | 不,1941,80,7.244331150611089,6.362131306822925
58 | ...
59 | 天下,280,195.23602384978196,5.158574399464853,5.24731990592901
60 | 圣人,111,150.0620531154239,4.622606551534004,4.6853474419338585
61 | 万物,94,377.59805590304126,4.5959107835319895,4.538837960294887
62 | 天地,92,186.73504238078462,3.1492586603863617,4.894533538722486
63 | 孔子,80,176.2550051738876,4.284638190120882,2.4056390622295662
64 | 庄子,76,169.26227942514097,2.328252899085616,2.1920058354921066
65 | 仁义,58,882.3468468468468,3.501609497059026,4.96900162987599
66 | 老聃,45,2281.2228260869565,2.384853500510039,2.4331958387289765
67 | ...
68 | ```
69 | 3. __分词__
70 | 1. 字符级隐马尔可夫模型分词,效果符合语感,建议使用,需加载语言模型 `jiayan.klm`
71 | ```
72 | from jiayan import load_lm
73 | from jiayan import CharHMMTokenizer
74 |
75 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。'
76 |
77 | lm = load_lm('jiayan.klm')
78 | tokenizer = CharHMMTokenizer(lm)
79 | print(list(tokenizer.tokenize(text)))
80 | ```
81 | 结果:
82 | `['是', '故', '内圣外王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']`
83 |
84 | 由于古汉语没有公开分词数据,无法做效果评估,但我们可以通过不同NLP工具对相同句子的处理结果来直观感受本项目的优势:
85 |
86 | 试比较 [LTP](https://github.com/HIT-SCIR/ltp) (3.4.0) 模型分词结果:
87 | `['是', '故内', '圣外王', '之', '道', ',', '暗而不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉以自为方', '。']`
88 |
89 | 再试比较 [HanLP](http://hanlp.com) 分词结果:
90 | `['是故', '内', '圣', '外', '王之道', ',', '暗', '而', '不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各为其所欲焉', '以', '自为', '方', '。']`
91 |
92 | 可见本工具对古汉语的分词效果明显优于通用汉语NLP工具。
93 |
94 | *更新*:感谢HanLP的作者hankcs告知——从2021年初,HanLP发布了深度学习驱动的2.x。由于使用了大规模语料上预训练的语言模型,这些语料已经包括了互联网上几乎所有的古汉语和现代汉语,所以在古汉语上的效果已经得到了质的提升。不仅仅是分词,就连词性标注和语义分析也有一定zero-shot learning的效果。相应的具体分词效果请参见该[Issue](https://github.com/jiaeyan/Jiayan/issues/15)。
95 |
96 | 2. 词级最大概率路径分词,基本以字为单位,颗粒度较粗
97 | ```
98 | from jiayan import WordNgramTokenizer
99 |
100 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。'
101 | tokenizer = WordNgramTokenizer()
102 | print(list(tokenizer.tokenize(text)))
103 | ```
104 | 结果:
105 | `['是', '故', '内', '圣', '外', '王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']`
106 |
107 | 4. __词性标注__
108 | ```
109 | from jiayan import CRFPOSTagger
110 |
111 | words = ['天下', '大乱', ',', '贤圣', '不', '明', ',', '道德', '不', '一', ',', '天下', '多', '得', '一', '察', '焉', '以', '自', '好', '。']
112 |
113 | postagger = CRFPOSTagger()
114 | postagger.load('pos_model')
115 | print(postagger.postag(words))
116 | ```
117 | 结果:
118 | `['n', 'a', 'wp', 'n', 'd', 'a', 'wp', 'n', 'd', 'm', 'wp', 'n', 'a', 'u', 'm', 'v', 'r', 'p', 'r', 'a', 'wp']`
119 |
120 | 5. __断句__
121 | ```
122 | from jiayan import load_lm
123 | from jiayan import CRFSentencizer
124 |
125 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂'
126 |
127 | lm = load_lm('jiayan.klm')
128 | sentencizer = CRFSentencizer(lm)
129 | sentencizer.load('cut_model')
130 | print(sentencizer.sentencize(text))
131 | ```
132 | 结果:
133 | `['天下大乱', '贤圣不明', '道德不一', '天下多得一察焉以自好', '譬如耳目', '皆有所明', '不能相通', '犹百家众技也', '皆有所长', '时有所用', '虽然', '不该不遍', '一之士也', '判天地之美', '析万物之理', '察古人之全', '寡能备于天地之美', '称神之容', '是故内圣外王之道', '暗而不明', '郁而不发', '天下之人各为其所欲焉以自为方', '悲夫', '百家往而不反', '必不合矣', '后世之学者', '不幸不见天地之纯', '古之大体', '道术将为天下裂']`
134 |
135 | 6. __标点__
136 | ```
137 | from jiayan import load_lm
138 | from jiayan import CRFPunctuator
139 |
140 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂'
141 |
142 | lm = load_lm('jiayan.klm')
143 | punctuator = CRFPunctuator(lm, 'cut_model')
144 | punctuator.load('punc_model')
145 | print(punctuator.punctuate(text))
146 | ```
147 | 结果:
148 | `天下大乱,贤圣不明,道德不一,天下多得一察焉以自好,譬如耳目,皆有所明,不能相通,犹百家众技也,皆有所长,时有所用,虽然,不该不遍,一之士也,判天地之美,析万物之理,察古人之全,寡能备于天地之美,称神之容,是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方,悲夫!百家往而不反,必不合矣,后世之学者,不幸不见天地之纯,古之大体,道术将为天下裂。`
149 |
150 |
151 | ## 版本
152 | * v0.0.21
153 | * 将安装过程分为两步,确保得到最新的kenlm版本。
154 | * v0.0.2
155 | * 增加词性标注功能。
156 | * v0.0.1
157 | * 词库构建、自动分词、文言句读、标点功能开放。
158 |
159 |
160 | ---
161 |
162 | ## Introduction
163 | Jiayan, which means Chinese characters engraved on oracle bones, is a professional Python NLP tool for Classical Chinese.
164 | Prevailing Chinese NLP tools are mainly trained on modern Chinese data, which leads to poor performance on Classical Chinese (see [__Tokenizing__](#7)). The purpose of this project is to assist Classical Chinese information processing, helping scholars and enthusiasts better analyze and exploit Classical Chinese texts.
165 | The current version supports [lexicon construction](#6), [tokenizing](#7), [POS tagging](#8), [sentence segmentation](#9) and [automatic punctuation](#10); more features are in development.
166 |
167 | ## Features
168 | * [__Lexicon Construction__](#6)
169 |     * Constructs a Classical Chinese lexicon with an unsupervised approach that combines a [Trie](https://en.wikipedia.org/wiki/Trie) tree, [PMI](https://en.wikipedia.org/wiki/Pointwise_mutual_information) (_point-wise mutual information_) and the neighboring [entropy](https://en.wikipedia.org/wiki/Entropy_\(information_theory\)) of left and right characters (see the sketch after this list).
170 | * [__Tokenizing__](#7)
171 |     * Tokenizes Classical Chinese sentences with an unsupervised, dictionary-free approach based on an [N-gram](https://en.wikipedia.org/wiki/N-gram) language model and an [HMM](https://en.wikipedia.org/wiki/Hidden_Markov_model) (_Hidden Markov Model_).
172 |     * Tokenizes Classical Chinese sentences with the dictionary produced by lexicon construction, using a directed acyclic word graph, the max probability path and [Dynamic Programming](https://en.wikipedia.org/wiki/Dynamic_programming).
173 | * [__POS Tagging__](#8)
174 |     * Word-level sequence tagging with a [CRF](https://en.wikipedia.org/wiki/Conditional_random_field) (_Conditional Random Field_). See the POS tag categories [here](jiayan/postagger/README.md).
175 | * [__Sentence Segmentation__](#9)
176 |     * Character-level sequence tagging with a CRF, using PMI and [T-test](https://en.wikipedia.org/wiki/Student%27s_t-test) values as features.
177 | * [__Punctuation__](#10)
178 |     * Character-level sequence tagging with layered CRFs; punctuates Classical Chinese texts based on the sentence segmentation results.
179 |     * Note: due to the data we used, traditional Chinese is not supported for now. If you need to process traditional Chinese, please use [OpenCC](https://github.com/yichen0831/opencc-python) to convert the input to simplified Chinese first, then convert the results back to the desired traditional variant.
180 |
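To make the lexicon-construction idea concrete, here is a toy scorer for a single candidate word: it computes the PMI of the candidate's weakest internal split and the entropy of the characters seen to its left and right. This is only a rough sketch of the idea, not Jiayan's implementation; the actual `PMIEntropyLexiconConstructor` counts segments with two trie trees and applies its own ratio-style PMI and frequency/entropy thresholds.

```
from collections import Counter
from math import log2


def score_candidate(text, word):
    """Toy scorer: PMI of the weakest internal split, plus left/right neighbor entropy.
    Assumes `word` has at least two characters and occurs in `text`."""
    total = len(text)

    def prob(s):
        return text.count(s) / total

    def entropy(chars):
        counts = Counter(chars)
        return -sum(c / len(chars) * log2(c / len(chars)) for c in counts.values())

    # PMI: how much more often the word occurs than its two halves would suggest;
    # keep the weakest split, since one loose boundary is enough to reject a candidate.
    pmi = min(log2(prob(word) / (prob(word[:i]) * prob(word[i:])))
              for i in range(1, len(word)))

    # Neighbor entropy: the more varied the characters next to the candidate,
    # the more independently it is used, hence the more word-like it is.
    starts = [i for i in range(total) if text.startswith(word, i)]
    l_entropy = entropy([text[i - 1] for i in starts if i > 0])
    r_entropy = entropy([text[i + len(word)] for i in starts if i + len(word) < total])

    return pmi, l_entropy, r_entropy


print(score_candidate('天下大乱贤圣不明天下多得一察焉以自好天下之人各为其所欲', '天下'))
```

In the real constructor, candidates whose frequency, PMI, or left/right entropy fall below the configured thresholds (`MIN_WORD_FREQ`, `MIN_PMI`, `MIN_ENTROPY`) are filtered out, and candidates containing stop characters are dropped.
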
181 | ## Installation
182 | $ pip install jiayan
183 | $ pip install https://github.com/kpu/kenlm/archive/master.zip
184 |
185 | ## Usage
186 | The usage examples below are all taken from [examples.py](jiayan/examples.py).
187 | 1. Download the models and unzip them:[Google Drive](https://drive.google.com/open?id=1piZQBO8OXQ5Cpi17vAcZsrbJLPABnKzp)
188 | * jiayan.klm:the language model used for tokenizing and feature extraction for sentence segmentation and punctuation;
189 | * pos_model:the CRF model for POS tagging;
190 | * cut_model:the CRF model for sentence segmentation;
191 | * punc_model:the CRF model for punctuation;
192 | * 庄子.txt:the full text of 《Zhuangzi》 used for testing lexicon construction.
193 |
194 | 2. __Lexicon Construction__
195 | ```
196 | from jiayan import PMIEntropyLexiconConstructor
197 |
198 | constructor = PMIEntropyLexiconConstructor()
199 | lexicon = constructor.construct_lexicon('庄子.txt')
200 | constructor.save(lexicon, 'Zhuangzi_Lexicon.csv')
201 | ```
202 |
203 | Result:
204 | ```
205 | Word,Frequency,PMI,R_Entropy,L_Entropy
206 | 之,2999,80,7.944909328101839,8.279435615456894
207 | 而,2089,80,7.354575005231323,8.615211168836439
208 | 不,1941,80,7.244331150611089,6.362131306822925
209 | ...
210 | 天下,280,195.23602384978196,5.158574399464853,5.24731990592901
211 | 圣人,111,150.0620531154239,4.622606551534004,4.6853474419338585
212 | 万物,94,377.59805590304126,4.5959107835319895,4.538837960294887
213 | 天地,92,186.73504238078462,3.1492586603863617,4.894533538722486
214 | 孔子,80,176.2550051738876,4.284638190120882,2.4056390622295662
215 | 庄子,76,169.26227942514097,2.328252899085616,2.1920058354921066
216 | 仁义,58,882.3468468468468,3.501609497059026,4.96900162987599
217 | 老聃,45,2281.2228260869565,2.384853500510039,2.4331958387289765
218 | ...
219 | ```
220 | 3. __Tokenizing__
221 | 1. The character-based HMM tokenizer, recommended (its output matches language intuition well); requires the language model `jiayan.klm`
222 | ```
223 | from jiayan import load_lm
224 | from jiayan import CharHMMTokenizer
225 |
226 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。'
227 |
228 | lm = load_lm('jiayan.klm')
229 | tokenizer = CharHMMTokenizer(lm)
230 | print(list(tokenizer.tokenize(text)))
231 | ```
232 | Result:
233 | `['是', '故', '内圣外王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']`
234 |
235 | Since there is no public tokenizing dataset for Classical Chinese, it is hard to evaluate performance directly; however, we can compare results on the same sentence with other popular (modern) Chinese NLP tools to get an intuitive sense of Jiayan's advantage:
236 |
237 | Compare the tokenizing result of [LTP](https://github.com/HIT-SCIR/ltp) (3.4.0):
238 | `['是', '故内', '圣外王', '之', '道', ',', '暗而不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉以自为方', '。']`
239 |
240 | Also, compare the tokenizing result of [HanLP](http://hanlp.com):
241 | `['是故', '内', '圣', '外', '王之道', ',', '暗', '而', '不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各为其所欲焉', '以', '自为', '方', '。']`
242 |
243 | It's apparent that Jiayan has much better tokenizing performance than general Chinese NLP tools.
244 |
245 | 2. The word-level max-probability-path tokenizer; it mostly falls back to single characters, so its granularity is coarser
246 | ```
247 | from jiayan import WordNgramTokenizer
248 |
249 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。'
250 | tokenizer = WordNgramTokenizer()
251 | print(list(tokenizer.tokenize(text)))
252 | ```
253 | Result:
254 | `['是', '故', '内', '圣', '外', '王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']`
255 |
256 | 4. __POS Tagging__
257 | ```
258 | from jiayan import CRFPOSTagger
259 |
260 | words = ['天下', '大乱', ',', '贤圣', '不', '明', ',', '道德', '不', '一', ',', '天下', '多', '得', '一', '察', '焉', '以', '自', '好', '。']
261 |
262 | postagger = CRFPOSTagger()
263 | postagger.load('pos_model')
264 | print(postagger.postag(words))
265 | ```
266 | Result:
267 | `['n', 'a', 'wp', 'n', 'd', 'a', 'wp', 'n', 'd', 'm', 'wp', 'n', 'a', 'u', 'm', 'v', 'r', 'p', 'r', 'a', 'wp']`
268 |
269 | 5. __Sentence Segmentation__
270 | ```
271 | from jiayan import load_lm
272 | from jiayan import CRFSentencizer
273 |
274 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂'
275 |
276 | lm = load_lm('jiayan.klm')
277 | sentencizer = CRFSentencizer(lm)
278 | sentencizer.load('cut_model')
279 | print(sentencizer.sentencize(text))
280 | ```
281 | Result:
282 | `['天下大乱', '贤圣不明', '道德不一', '天下多得一察焉以自好', '譬如耳目', '皆有所明', '不能相通', '犹百家众技也', '皆有所长', '时有所用', '虽然', '不该不遍', '一之士也', '判天地之美', '析万物之理', '察古人之全', '寡能备于天地之美', '称神之容', '是故内圣外王之道', '暗而不明', '郁而不发', '天下之人各为其所欲焉以自为方', '悲夫', '百家往而不反', '必不合矣', '后世之学者', '不幸不见天地之纯', '古之大体', '道术将为天下裂']`
283 |
284 | 6. __Punctuation__
285 | ```
286 | from jiayan import load_lm
287 | from jiayan import CRFPunctuator
288 |
289 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂'
290 |
291 | lm = load_lm('jiayan.klm')
292 | punctuator = CRFPunctuator(lm, 'cut_model')
293 | punctuator.load('punc_model')
294 | print(punctuator.punctuate(text))
295 | ```
296 | Result:
297 | `天下大乱,贤圣不明,道德不一,天下多得一察焉以自好,譬如耳目,皆有所明,不能相通,犹百家众技也,皆有所长,时有所用,虽然,不该不遍,一之士也,判天地之美,析万物之理,察古人之全,寡能备于天地之美,称神之容,是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方,悲夫!百家往而不反,必不合矣,后世之学者,不幸不见天地之纯,古之大体,道术将为天下裂。`
298 |
299 |
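The modules above compose naturally: the tokenizer's output is exactly the word list the POS tagger expects. Below is a minimal end-to-end sketch built only from the calls shown above; it assumes the downloaded `jiayan.klm` and `pos_model` files sit in the working directory.

```
from jiayan import load_lm
from jiayan import CharHMMTokenizer
from jiayan import CRFPOSTagger

# Tokenize a raw Classical Chinese sentence with the character-level HMM.
lm = load_lm('jiayan.klm')
tokenizer = CharHMMTokenizer(lm)
words = list(tokenizer.tokenize('天下大乱,贤圣不明,道德不一。'))

# Feed the word list straight into the CRF POS tagger.
postagger = CRFPOSTagger()
postagger.load('pos_model')
print(list(zip(words, postagger.postag(words))))
```

When starting from unpunctuated text, the punctuator from step 6 can be run first and its output fed to the tokenizer in the same way.
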
300 | ## Versions
301 | * v0.0.21
302 |     * Split the installation into two steps to ensure the latest version of kenlm gets installed.
303 | * v0.0.2
304 |     * Added the POS tagging feature.
305 | * v0.0.1
306 |     * Initial release: lexicon construction, tokenizing, sentence segmentation and automatic punctuation.
--------------------------------------------------------------------------------
/jiayan/__init__.py:
--------------------------------------------------------------------------------
1 | import kenlm
2 |
3 | from jiayan.lexicon.pmi_entropy_constructor import PMIEntropyLexiconConstructor
4 | from jiayan.tokenizer.hmm_tokenizer import CharHMMTokenizer
5 | from jiayan.tokenizer.ngram_tokenizer import WordNgramTokenizer
6 | from jiayan.sentencizer.crf_sentencizer import CRFSentencizer
7 | from jiayan.sentencizer.crf_punctuator import CRFPunctuator
8 | from jiayan.postagger.crf_pos_tagger import CRFPOSTagger
9 |
10 |
11 | def load_lm(lm):
12 | return kenlm.LanguageModel(lm)
13 |
14 |
--------------------------------------------------------------------------------
/jiayan/__main__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/__main__.py
--------------------------------------------------------------------------------
/jiayan/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/data/__init__.py
--------------------------------------------------------------------------------
/jiayan/examples.py:
--------------------------------------------------------------------------------
1 | from jiayan import PMIEntropyLexiconConstructor
2 | from jiayan import CharHMMTokenizer
3 | from jiayan import WordNgramTokenizer
4 | from jiayan import CRFSentencizer
5 | from jiayan import CRFPunctuator
6 | from jiayan import CRFPOSTagger
7 | from jiayan import load_lm
8 |
9 |
10 | def construct_lexicon(data_file: str, out_f: str):
11 | constructor = PMIEntropyLexiconConstructor()
12 | lexicon = constructor.construct_lexicon(data_file)
13 | constructor.save(lexicon, out_f)
14 |
15 |
16 | def hmm_tokenize(lm_path: str, text: str):
17 | lm = load_lm(lm_path)
18 | tokenizer = CharHMMTokenizer(lm)
19 | print(list(tokenizer.tokenize(text)))
20 |
21 |
22 | def ngram_tokenize(text: str):
23 | tokenizer = WordNgramTokenizer()
24 | print(list(tokenizer.tokenize(text)))
25 |
26 |
27 | def crf_pos_tag(pos_model, words):
28 | postagger = CRFPOSTagger()
29 | postagger.load(pos_model)
30 | print(postagger.postag(words))
31 |
32 |
33 | def crf_sentencize(lm_path: str, cut_model, text):
34 | lm = load_lm(lm_path)
35 | sentencizer = CRFSentencizer(lm)
36 | sentencizer.load(cut_model)
37 | print(sentencizer.sentencize(text))
38 |
39 |
40 | def crf_punctuate(lm_path, cut_model, punc_model, text):
41 | lm = load_lm(lm_path)
42 | punctuator = CRFPunctuator(lm, cut_model)
43 | punctuator.load(punc_model)
44 | print(punctuator.punctuate(text))
45 |
46 |
47 | def train_sentencizer(lm_path, data_file, out_model):
48 | lm = load_lm(lm_path)
49 | sentencizer = CRFSentencizer(lm)
50 | print('Building data...')
51 | X, Y = sentencizer.build_data(data_file)
52 | train_x, train_y, test_x, test_y = sentencizer.split_data(X, Y)
53 | X[:] = []
54 | Y[:] = []
55 | print('Training...')
56 | sentencizer.train(train_x, train_y, out_model)
57 | sentencizer.eval(test_x, test_y, out_model)
58 |
59 |
60 | def train_punctuator(lm_path, data_file, cut_model, out_model):
61 | lm = load_lm(lm_path)
62 | punctuator = CRFPunctuator(lm, cut_model)
63 | print('Building data...')
64 | X, Y = punctuator.build_data(data_file)
65 | train_x, train_y, test_x, test_y = punctuator.split_data(X, Y)
66 | X[:] = []
67 | Y[:] = []
68 | print('Training...')
69 | punctuator.train(train_x, train_y, out_model)
70 | punctuator.eval(test_x, test_y, out_model)
71 |
72 |
73 | def train_postagger(data_file, pos_model):
74 | postagger = CRFPOSTagger()
75 | print('Building data...')
76 | X, Y = postagger.build_data(data_file)
77 | train_x, train_y, test_x, test_y = postagger.split_data(X, Y)
78 | X[:] = []
79 | Y[:] = []
80 | print('Training...')
81 | postagger.train(train_x, train_y, pos_model)
82 | postagger.eval(test_x, test_y, pos_model)
83 |
84 | if __name__ == '__main__':
85 | test_f = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂'
86 | test_f1 = '圣人之治民也先治者强先战者胜夫国事务先而一民心专举公而私不从赏告而奸不生明法而治不烦能用四者强不能用四者弱夫国之所以强者政也主之所以尊者权也故明君有权有政乱君亦有权有政积而不同其所以立异也故明君操权而上重一政而国治故法者王之本也刑者爱之自也'
87 | test_f2 = '公曰善吾不食谄人以言也以鱼五十乘赐弦章章归鱼车塞途抚其御之手曰昔者晏子辞党当作赏以正君故过失不掩之今诸臣谀以干利吾若受鱼是反晏子之义而顺谄谀之欲固辞鱼不受君子曰弦章之廉晏子之遗行也'
88 | test_f3 = '景公游于菑闻晏子死公乘侈舆服繁驵驱之而因为迟下车而趋知不若车之速则又乘比至于国者四下而趋行哭而往伏尸而号'
89 | test_f4 = '有足游浮云背凌苍天尾偃天间跃啄北海颈尾咳于天地乎然而漻漻不知六翮之所在'
90 | test_f5 = '谁知林栖者闻风坐相悦草木有本心何求美人折'
91 | test_f6 = '能说诸心能研诸侯之虑定天下之吉凶成天下之亹亹者是故变化云为吉事有祥象事知器占事知来天地设位圣人成能人谋鬼谋百姓与能八卦以象告爻彖以情言刚柔杂居而吉凶可见矣'
92 | test_f7 = '至哉坤元万物资生乃顺承天坤厚载物德合无疆含弘光大品物咸亨牝马地类行地无疆柔顺利贞君子攸行先迷失道后顺得常'
93 | test_f8 = '天下熙熙一盈一虚一治一乱所以然者何也其君贤不肖不等乎其天时变化自然乎'
94 | test_f9 = '先生之言悖龙之所以为名者乃以白马之论尔今使龙去之则无以教焉且欲师之者以智与学不如也今使龙去之此先教而后师之也先教而后师之者悖且白马非马乃仲尼之所取龙闻楚王张繁弱之弓载忘归之矢以射蛟兕于云梦之圃而丧其弓左右请求之'
95 | test_f10 = '伪学伪才揣摩以逢主意从前洋务穆彰阿倾排异己殊堪痛恨若一旦置之重法实有不忍着从宽革职永不叙用于是主战主和之功罪是非千秋论定而枋政之臣欲以掩天下后世之耳目不可得矣'
96 | test_f11 = '传字世文至圣四十七代孙建炎初随孔端友南渡遂流寓衢州'
97 | test_f12 = '若乃厯代褒崇之典累朝班赉之恩宠数便蕃固可以枚陈而列数以至验祖壁之遗书访阙里之陈迹荒墟废址沦没于春芜秋草之中者阙有之故老世传之将使闻见之所未尝者如接于耳目之近'
98 | test_f13 = '颂曰元始二妃帝尧之女嫔列有虞承舜于下以尊事卑终能劳苦瞽叟和宁卒享福祜'
99 | test_f14 = '弃母姜嫄者邰侯之女也当尧之时行见巨人迹好而履之归而有娠浸以益大心怪恶之卜筮禋祀以求无子终生子'
100 | test_f15 = '颂曰契母简狄敦仁励翼吞卵产子遂自修饰教以事理推恩有德契为帝辅盖母有力'
101 | test_f16 = '堂之下则有大冶长老桃花茶巢元脩菜何氏丛橘种秔稌莳枣栗有松期为可斫种麦以为奇事作陂塘植黄桑皆足以供先生之岁用而为雪堂之胜景云耳'
102 | test_f17 = '占者乡塾里闾亦各有史所以纪善恶而垂劝戒后世惟天于有太史而庶民之有德业者非附贤士大夫为之纪其闻者蔑焉'
103 | test_f18 = '东家杂记孔子四十七代孙孔传所述杂记曰周灵王二十一年已酉岁即鲁襄公二十二年也当襄公二十二年冬十月庚子日先圣生又曰周敬王四十一年辛酉岁即鲁哀公十六年也当哀公十六年夏四月乙丑日先圣薨先儒以为已丑者误也'
104 | test_f19 = '周灵王二十一年已酉岁即鲁襄公二十二年也当襄公二十二年冬十月庚子日先圣生是夕有二龙绕室五老降庭五老者五星之精也又颜氏之房闻奏钧天之乐空中有声云天感生圣子故降以和乐笙镛之音'
105 | test_f20 = '河山大地未尝可以法空也佛必欲空之而屹然沛然卒不能空兵刑灾祸未尝可以度也佛必欲度之而伏尸百万'
106 | test_f21 = '朱子曰心之虚灵知觉一而已矣而以为有心人道心之异者以其或生于形气之私或原于性命之正而所以为知觉者不同是以或危殆而不安或微妙而难见尔'
107 | test_f22 = '真西山读书记曰此武王伐纣之事诗意虽主伐纣而言然学者平居讽咏其辞凛然如上帝之实临其上则所以为闲邪存诚之助顾不大哉'
108 | test_f23 = '述叙既讫乃为主客发其例曰客问主人曰伪经何以名之新学也汉艺文志号为古经五经异义称为古说诸书所述古文尤繁'
109 | test_f24 = '取胡氏传一句两句为旨而以经事之相类者合以为题传为主经为客有以彼经证此经之题有用彼经而隐此经之题于是此一经者为射覆之书而春秋亡矣'
110 | test_f25 = '谁非黄帝尧舜之子孙而至于今日其不幸而为臧获为婢妾为舆台皂隶窘穷迫逼无可奈何非其数十代以前即自臧获婢妾舆台皂隶来也一旦奋发有为精勤不倦有及身而富贵者矣及其子孙而富贵者矣'
111 | test_f26 = '人器有德人和伦常社器有德族谐国安灵器有德则天伦如仪器无德人怨族乱国沸天地失道也'
112 | test_f27 = '先圣没逮今一千五百余年传世五十或问其姓则内求而不得或审其家则舌举而不下为之后者得无愧乎'
113 | test_f28 = '高辛父曰蟜极蟜极父曰玄嚣玄嚣父曰黄帝'
114 | test_f29 = '以为锦绣文采靡曼之衣'
115 | test_f30 = '通玄理而不通禅必受固执之病通禅理而不通儒多成狂慧之流求其禅儒皆通而又能贯之以道不但今鲜其人即古之紫衣黄冠下除紫阳莲池外恒不多觏'
116 | tests = [
117 | test_f, test_f1, test_f2, test_f3, test_f4, test_f5, test_f6, test_f7, test_f8,
118 | test_f9, test_f10,
119 | test_f11, test_f12, test_f13, test_f14, test_f15, test_f16, test_f17, test_f18, test_f19, test_f20,
120 | test_f21, test_f22,
121 | test_f23, test_f24, test_f25, test_f26, test_f27, test_f28, test_f29, test_f30
122 | ]
123 |
124 |
125 |
126 |
127 |
128 | # train_sentencizer('data/jiayan.klm', '/Users/jiaeyan/Desktop/chn_data/all.txt', 'cut_model_60')
129 | # train_punctuator('data/jiayan.klm', '/Users/jiaeyan/Desktop/chn_data/all.txt', 'cut_model', 'punc_model')
130 | # train_postagger('/Users/jiaeyan/Desktop/chn_data/pos_all.txt', 'pos_model')
131 |
132 | # lm = load_lm('data/jiayan.klm')
133 | #
134 | # sentcizer = CRFSentencizer(lm)
135 | # sentcizer.load("/Users/jiaeyan/Desktop/cut_model_70")
136 | # for test in tests:
137 | # print(sentcizer.sentencize(test))
138 |
139 |
140 | # punctuator = CRFPunctuator(lm, '/Users/jiaeyan/Desktop/cut_model')
141 | # punctuator.load('/Users/jiaeyan/Desktop/punc_model')
142 | # for test in tests:
143 | # print(punctuator.punctuate(test))
144 |
145 | # tokenizer = CharHMMTokenizer(lm)
146 | # for test in tests:
147 | # print(list(tokenizer.tokenize(test)))
148 |
149 | postagger = CRFPOSTagger()
150 | postagger.load('/Users/jiaeyan/Desktop/pos_model_50')
151 | # words = ['是', '故', '内圣外王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']
152 | words = ['天下', '大乱', ',', '贤圣', '不', '明', ',', '道德', '不', '一', ',', '天下', '多', '得', '一', '察', '焉', '以', '自', '好', '。']
153 | print(postagger.postag(words))
154 | # for test in tests:
155 | # words = list(tokenizer.tokenize(test))
156 | # print(words)
157 | # print(postagger.postag(words))
158 |
159 |
160 |
161 |
162 |
163 | # test = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也' \
164 | # '判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方' \
165 | # '悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂'
166 | #
167 | # lm_path = 'data/jiayan.klm'
168 |
169 | # print('Constructing lexicon...')
170 | # construct_lexicon('data/庄子.txt', '庄子1.csv')
171 | #
172 | # print('\nTokenizing test text with HMM...')
173 | # hmm_tokenize(lm_path, test)
174 | #
175 | # print('\nTokenizing test text with N-grams...')
176 | # for test in tests:
177 | # ngram_tokenize(test)
178 | #
179 | # print('\nSentencizing test text with CRF...')
180 | # crf_sentencize(lm_path, 'cut_model', test_f1)
181 |
182 | # print('\nPunctuating test text with CRF...')
183 | # crf_punctuate(lm_path, 'cut_model_60', 'punc_model', test_f1)
184 | # crf_punctuate(lm_path, 'cut_model_60', 'punc_model', test)
185 |
--------------------------------------------------------------------------------
/jiayan/globals.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 |
4 | re_zh_include = re.compile("([\u4E00-\u9FA5]+)", re.U)
5 | re_zh_exclude = re.compile("[\u4E00-\u9FA5]+", re.U)
6 |
7 | re_puncs_include = re.compile(r'([。?!,、:;])')
8 | re_puncs_exclude = re.compile(r'[。?!,、:;]')
9 |
10 | re_invalid_chars = re.compile(r'[^\u4E00-\u9FA5。?!,、:;]+', re.U)
11 |
12 | stopchars = {'之', '兹', '此', '彼',
13 | '谁', '孰', '何', '奚', '曷', '焉',
14 | '或', '有', '某',
15 | '莫', '弗', '毋', '勿', '不', '得',
16 | '亦', '乃',
17 | '於', '于', '乎', '在', '而', '以', '为',
18 | '其', '唯', '惟', '焉', '者', '所',
19 | '也', '矣', '已',
20 | '欤', '耶', '哉', '兮',
21 | '必', '又', '每', '皆', '仅', '只',
22 | '甚', '颇', '岂',
23 | '曰'}
24 |
25 |
26 | def get_char_pos_dict():
27 | with open('data/char_pos_dict.json', 'r') as f:
28 | char_pos_dict = json.load(f)
29 | return char_pos_dict
30 |
31 |
32 | re_num = re.compile(r'[第一二三四五六七八九十百千万]{2,}')
33 | re_calender = re.compile(r'[甲乙丙丁戊己庚辛壬癸子丑寅卯辰巳午未申酉戌亥]{2,}')
34 |
35 |
36 | """
37 | http://www.ziyexing.com/files-5/guhanyu/guhanyu_index.html
38 | """
39 | pron_single = {'我', '吾', '你', '而', '乃', '若', '其', '之', '他'}
40 | pron_plural = {'吾侪', '吾曹', '吾属', '我属', '我辈', '若曹', '而属', '尔辈', '公等', '卿等'}
41 | pron_demonstrate = {'之', '斯', '兹', '此', '是', '彼', '夫', '伊'}
42 | pron_interrogative = {'谁', '孰', '何', '奚', '胡', '曷', '恶', '焉', '安', '几'}
43 | pron_indefinite = {'或', '有', '某'}
44 | pron_negative = {'莫', '罔', '无', '靡', '蔑', '末'}
45 | auxiliary_verb = {'克', '能', '堪', '可', '可以', '可得', '得', '足', '足以',
46 | '欲', '肯', '将', '宁', '敢', '忍', '愿',
47 | '当', '如', '宜', '任', '合', '应',
48 | '见', '被', '为'}
49 | preposition = {'於', '于', '乎', '以', '在', '即', '及', '自', '从', '由',
50 |                '当', '到', '迨', '逮', '至', '比', '竟', '向', '临', '先',
51 | '因', '用', '缘', '为', '乎', '从', '与'}
52 | conjunction = {'夫', '若', '如', '且', '至', '若夫', '且夫', '至于', '至如',
53 | '既', '终', '已',
54 | '如', '苟', '使', '令', '即', '抑', '向', '诚', '果', '设', '假',
55 | '若苟', '向使', '若使', '如若', '如使', '如令', '如果', '苟使', '假设',
56 |               '假使', '假如', '假若', '假令', '设使', '设若', '设令', '倘若', '倘使', '诚使', '诚令',
57 | '虽', '则', '且', '而', '尚', '犹', '且犹', '尚犹', '纵', '虽则', '虽使', '与其', '与', '以', '为', '由',
58 | '与', '及', '暨', '之以', '而', '且又', '亦', '而且', '而又',
59 | '况', '而况', '况于', '何况',
60 | '故', '乃', '是以', '是用', '是故', '以故', '因是',
61 | '然则', '然而', '但'}
62 | particle = {'其', '之', '斯', '思', # rhyme
63 | '唯', '惟', # syntax
64 | '有', '畴', '丕', '薄', '言', # word prefix
65 | '然', '焉', '尔', '而', '斯', '若', '如', '乎', # word suffix
66 | '者', '攸', '所', # phrase prefix & suffix
67 | '载', '有', '式', '于',
68 | }
69 | modal_particle = {'也', '矣', '已', '而已', '耳', '尔', '而', '焉', '然', '夫', '者', '云', # affirmative
70 | '与', '欤', '邪', '耶', '乎', '为', '则', # question
71 | '哉', '乎', '夫', '为', '兮', '邪', # exclamation
72 | }
73 |
--------------------------------------------------------------------------------
/jiayan/lexicon/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/lexicon/__init__.py
--------------------------------------------------------------------------------
/jiayan/lexicon/pmi_entropy_constructor.py:
--------------------------------------------------------------------------------
1 | from math import log2
2 | import time
3 | from jiayan.globals import stopchars
4 | from jiayan.utils import text_iterator
5 |
6 | """
7 | A precise way to discover new words in sentence corpus, consider PMI and entropy.
8 |
9 | 1. PMI is used to evaluate how tight the two segments of the word;
10 | 2. Right and left entropy are used to evaluate how independent the word is in various contexts.
11 | """
12 |
13 |
14 | class Trie:
15 |
16 | class TrieNode:
17 | def __init__(self):
18 | self.freq = 0
19 | self.pmi = 0
20 | self.r_entropy = 0
21 | self.l_entropy = 0
22 | self.children = {}
23 |
24 | def __init__(self):
25 | self.root = self.TrieNode()
26 |
27 | def add(self, word):
28 | node = self.root
29 | for char in word:
30 | if char not in node.children:
31 | node.children[char] = self.TrieNode()
32 | node = node.children[char]
33 | node.freq += 1
34 |
35 | def find(self, word):
36 | node = self.root
37 | for char in word:
38 | if char not in node.children:
39 | return None
40 | node = node.children[char]
41 | return node
42 |
43 |
44 | class PMIEntropyLexiconConstructor:
45 |
46 | MIN_WORD_LEN = 1
47 | MAX_WORD_LEN = 4
48 |
49 | # TODO: Different PMI and Entropy thresholds for different lengths
50 | MIN_WORD_FREQ = 10
51 | MIN_PMI = 80
52 | MIN_ENTROPY = 2
53 |
54 | def __init__(self):
55 | self.trie = Trie()
56 | self.r_trie = Trie()
57 | self.total = 0
58 |
59 | def construct_lexicon(self, data_file):
60 | self.build_trie_trees(data_file)
61 | self.compute()
62 | lexicon = self.filter()
63 | return lexicon
64 |
65 | def build_trie_trees(self, data_file):
66 | """ Counts frequency of segments of data, also records their left and right char sets.
67 | """
68 | max_seg_len = self.MAX_WORD_LEN + 1
69 |
70 | start = time.time()
71 | for text in text_iterator(data_file):
72 | length = len(text)
73 | for i in range(length):
74 | for j in range(1, min(length - i + 1, max_seg_len + 1)):
75 | seg = text[i: i + j]
76 | self.trie.add(seg)
77 |
78 | r_seg = seg[::-1]
79 | self.r_trie.add(r_seg)
80 |
81 | self.total += 1
82 | end = time.time()
83 |
84 | print('Trie building time:', end - start)
85 |
86 | def compute(self):
87 | start = time.time()
88 | node = self.trie.root
89 | word = ''
90 | self.compute_help(node, word)
91 | end = time.time()
92 | print('Computation time:', end - start)
93 |
94 | def compute_help(self, node, word):
95 | if node.children:
96 | for char, child in node.children.items():
97 | word += char
98 | if len(word) <= self.MAX_WORD_LEN:
99 | self.calculate_pmi(child, word)
100 | self.calculate_rl_entropy(child, word)
101 | self.compute_help(child, word)
102 | word = word[:-1]
103 |
104 | def calculate_pmi(self, node, word):
105 | length = len(word)
106 | if length == 1:
107 | node.pmi = self.MIN_PMI
108 | else:
109 | constant = node.freq * self.total
110 | mutuals = (constant / (self.trie.find(word[:i + 1]).freq * self.trie.find(word[i + 1:]).freq)
111 | for i in range(length - 1))
112 | node.pmi = min(mutuals)
113 |
114 | def calculate_rl_entropy(self, node, word):
115 | # right entropy
116 | if node.children:
117 | node.r_entropy = self.calculate_entropy(node)
118 |
119 | # left entropy
120 | r_word = word[::-1]
121 | r_node = self.r_trie.find(r_word)
122 | if r_node.children:
123 | node.l_entropy = self.calculate_entropy(r_node)
124 |
125 | def calculate_entropy(self, node):
126 | freqs = [child.freq for child in node.children.values()]
127 | sum_freqs = sum(freqs)
128 | entropy = sum([- (x / sum_freqs) * log2(x / sum_freqs) for x in freqs])
129 | return entropy
130 |
131 | def filter(self):
132 | """ Filters the PMI and entropy calculation result dict, removes words that do not
133 | reach the thresholds.
134 | TODO: test use max of r/l entropy to filter.
135 | """
136 | start = time.time()
137 | node = self.trie.root
138 | word = ''
139 | word_dict = {}
140 | self.filter_help(node, word, word_dict)
141 | end = time.time()
142 | print('Word filtering:', end - start)
143 | return word_dict
144 |
145 | def filter_help(self, node, word, word_dict):
146 | if node.children:
147 | for char, child in node.children.items():
148 | word += char
149 | if self.valid_word(child, word):
150 | word_dict[word] = [child.freq, child.pmi, child.r_entropy, child.l_entropy]
151 | self.filter_help(child, word, word_dict)
152 | word = word[:-1]
153 |
154 | def valid_word(self, node, word):
155 | if self.MIN_WORD_LEN <= len(word) <= self.MAX_WORD_LEN \
156 | and node.freq >= self.MIN_WORD_FREQ \
157 | and node.pmi >= self.MIN_PMI \
158 | and node.r_entropy >= self.MIN_ENTROPY \
159 | and node.l_entropy >= self.MIN_ENTROPY \
160 | and not self.has_stopword(word):
161 | return True
162 | return False
163 |
164 | def has_stopword(self, word):
165 | """ Checks if a word contains stopwords, which are not able to construct words.
166 | """
167 | if len(word) == 1:
168 | return False
169 | for char in word:
170 | if char in stopchars:
171 | return True
172 | return False
173 |
174 | @staticmethod
175 | def save(lexicon, out_f):
176 | """ Saves the word detection result in a csv file.
177 | """
178 | words = sorted(lexicon, key=lambda x: (len(x), -lexicon[x][0], -lexicon[x][1], -lexicon[x][2], -lexicon[x][3]))
179 | with open(out_f, 'w') as f:
180 | f.write('Word,Frequency,PMI,R_Entropy,L_Entropy\n')
181 | for word in words:
182 | f.write('{},{},{},{},{}\n'.format(
183 | word, lexicon[word][0], lexicon[word][1],
184 | lexicon[word][2], lexicon[word][3]))
185 |
186 |
187 |
188 |
--------------------------------------------------------------------------------
/jiayan/linguistic_unit.py:
--------------------------------------------------------------------------------
1 | class Paragraph:
2 | def __init__(self):
3 | pass
4 |
5 |
6 | class Sentence:
7 | def __init__(self):
8 | pass
9 |
10 |
11 | class Word:
12 | def __init__(self):
13 | pass
14 |
15 |
16 | class Character:
17 | def __init__(self):
18 | pass
19 |
--------------------------------------------------------------------------------
/jiayan/postagger/README.md:
--------------------------------------------------------------------------------
1 | Tag | Description | Example
2 | --- | --- | ---
3 | a | adjective | 幽明
4 | b | other noun-modifier | 男,女
5 | c | conjunction | 与,而
6 | d | adverb | 皆
7 | e | exclamation | 呜呼
8 | g | morpheme | 甥
9 | h | prefix | 非
10 | i | idiom | 发愤忘食
11 | j | abbreviation | 五帝
12 | k | suffix | 者
13 | m | number | 一,百
14 | n | general noun | 鬼神,山川
15 | nd | direction noun | 东,西,南,北
16 | nh | person name | 轩辕
17 | ni | organization name | 辽队
18 | nl | location noun | 城北
19 | ns | geographical name | 襄平县
20 | nt | temporal noun | 春,夏,秋,冬
21 | nz | other proper noun | 山海经
22 | o | onomatopoeia | 呜呜
23 | p | preposition | 以,为
24 | q | quantity | 年,岁
25 | r | pronoun | 其,斯
26 | u | auxiliary | 之,所
27 | v | verb | 赐
28 | wp | punctuation | ,。!
29 | ws | foreign words | CPU
30 | x | non-lexeme | 萄, 翱
31 | z | descriptive words | 默然,区区
--------------------------------------------------------------------------------
/jiayan/postagger/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/postagger/__init__.py
--------------------------------------------------------------------------------
/jiayan/postagger/crf_pos_tagger.py:
--------------------------------------------------------------------------------
1 | import random
2 | from itertools import chain
3 | from string import ascii_uppercase
4 |
5 | import pycrfsuite
6 | from sklearn.metrics import classification_report
7 | from sklearn.preprocessing import LabelBinarizer
8 |
9 | from jiayan.globals import re_zh_exclude
10 |
11 |
12 | class CRFPOSTagger:
13 |
14 | def __init__(self):
15 | self.tagger = None
16 |
17 | def load(self, crf_model):
18 | self.tagger = pycrfsuite.Tagger()
19 | self.tagger.open(crf_model)
20 |
21 | def sent2features(self, sent):
22 | length = len(sent)
23 | feat_list = []
24 | for i, word in enumerate(sent):
25 | # pattern = self.get_word_pattern(word)
26 | # is_zh = '1' if re_zh_exclude.match(word) else '0'
27 | features = [
28 | 'bias',
29 | '0:word=' + word,
30 | # '0:pattern=' + pattern,
31 | # '0:type=' + is_zh,
32 | ]
33 |
34 | if i > 0:
35 | features.extend([
36 | '-1:word=' + sent[i - 1],
37 | '-10:words=' + '|'.join(sent[i - 1: i + 1]),
38 | ])
39 | else:
40 | features.append('BOS')
41 |
42 | if i > 1:
43 | features.extend([
44 | '-2:word=' + sent[i - 2],
45 | '-21:words=' + '|'.join(sent[i - 2: i]),
46 | '-210:words=' + '|'.join(sent[i - 2: i + 1]),
47 | ])
48 |
49 | if i < length - 1:
50 | features.extend([
51 | '+1:word=' + sent[i + 1],
52 | '+01:words=' + '|'.join(sent[i: i + 2]),
53 | ])
54 | else:
55 | features.append('EOS')
56 |
57 | if i < length - 2:
58 | features.extend([
59 | '+2:word=' + sent[i + 2],
60 | '+12:words=' + '|'.join(sent[i + 1: i + 3]),
61 | '+012:chars=' + '|'.join(sent[i: i + 3]),
62 | ])
63 |
64 | if 0 < i < length - 1:
65 | features.extend([
66 | '-11:words=' + sent[i - 1] + '|' + sent[i + 1],
67 | '-101:words=' + '|'.join(sent[i - 1: i + 2]),
68 | ])
69 |
70 | feat_list.append(features)
71 |
72 | return feat_list
73 |
74 | @staticmethod
75 | def get_word_pattern(word):
76 | pattern = ''
77 | char = ''
78 | i = -1
79 | for ch in word:
80 | if ch != char:
81 | i += 1
82 | pattern += ascii_uppercase[i]
83 | char = ch
84 | return pattern
85 |
86 | def sent2tags(self, sent):
87 | pass
88 |
89 | def train(self, train_x, train_y, out_model):
90 | trainer = pycrfsuite.Trainer(verbose=False)
91 | for x, y in zip(train_x, train_y):
92 | if x and y:
93 | trainer.append(x, y)
94 |
95 | trainer.set_params({
96 | 'c1': 1.0, # coefficient for L1 penalty
97 | 'c2': 1e-3, # coefficient for L2 penalty
98 | 'max_iterations': 50, # stop earlier
99 | 'feature.possible_transitions': True # include transitions that are possible, but not observed
100 | })
101 |
102 | trainer.train(out_model)
103 | print(trainer.logparser.last_iteration)
104 |
105 | def build_data(self, data_file):
106 | X = []
107 | Y = []
108 |
109 | with open(data_file, 'r') as f:
110 | for line in f:
111 | line = line.strip()
112 | if line:
113 | x, y = line.split('\t')
114 | feat_list = self.sent2features(x.split())
115 | tag_list = y.split()
116 | X.append(feat_list)
117 | Y.append(tag_list)
118 |
119 | return X, Y
120 |
121 |     def split_data(self, X, Y):
122 |         # Shuffle X and Y with the same permutation; the two-argument
123 |         # random.shuffle(x, func) form used previously was removed in Python 3.11.
124 |         random.seed(42)
125 |         indices = list(range(len(X)))
126 |         random.shuffle(indices)
127 | 
128 |         X[:] = [X[i] for i in indices]
129 |         Y[:] = [Y[i] for i in indices]
130 | 
131 |         ratio = round(len(X) * 0.9)
132 |         return X[:ratio], Y[:ratio], X[ratio:], Y[ratio:]
133 |
134 | def eval(self, test_x, test_y, crf_model):
135 | tagger = pycrfsuite.Tagger()
136 | tagger.open(crf_model)
137 |
138 | y_pred = []
139 | for feat_list in test_x:
140 | preds = tagger.tag(feat_list)
141 | y_pred.append(preds)
142 |
143 | lb = LabelBinarizer()
144 | y_true_all = lb.fit_transform(list(chain.from_iterable(test_y)))
145 | y_pred_all = lb.transform(list(chain.from_iterable(y_pred)))
146 |
147 | tagset = sorted(set(lb.classes_))
148 | class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
149 |
150 | print(classification_report(
151 | y_true_all,
152 | y_pred_all,
153 | labels=[class_indices[cls] for cls in tagset],
154 | target_names=tagset,
155 | digits=5
156 | ))
157 |
158 | def postag(self, sent):
159 | feat_list = self.sent2features(sent)
160 | tags = self.tagger.tag(feat_list)
161 | return tags
162 |
163 |
164 |
--------------------------------------------------------------------------------
/jiayan/sentencizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/sentencizer/__init__.py
--------------------------------------------------------------------------------
/jiayan/sentencizer/crf_punctuator.py:
--------------------------------------------------------------------------------
1 | from itertools import chain
2 |
3 | import pycrfsuite
4 | from sklearn.metrics import classification_report
5 | from sklearn.preprocessing import LabelBinarizer
6 |
7 | from jiayan.globals import re_puncs_include, re_zh_exclude
8 | from jiayan.utils import text_iterator
9 | from jiayan.sentencizer.crf_sent_tagger import CRFSentTagger
10 | from jiayan.sentencizer.crf_sentencizer import CRFSentencizer
11 |
12 |
13 | class CRFPunctuator(CRFSentTagger):
14 |
15 | def __init__(self, lm, cut_model):
16 | super(CRFPunctuator, self).__init__(lm)
17 | self.sentencizer = CRFSentencizer(lm)
18 | self.sentencizer.load(cut_model)
19 |
20 | def sent2features(self, sent: str, tags=None):
21 | length = len(sent)
22 | feat_list = []
23 | for i, char in enumerate(sent):
24 | features = [
25 | 'bias',
26 | '0:char=' + char,
27 | '0:tag=' + tags[i],
28 | ]
29 |
30 | if i > 0:
31 | features.extend([
32 | '-1:char=' + sent[i - 1],
33 | '-10:chars=' + sent[i - 1: i + 1],
34 | # '-10:pmi=' + self.get_pmi(sent[i - 1: i + 1]),
35 |
36 | # '-1:tag=' + tags[i - 1],
37 | # '-10:tags=' + tags[i - 1: i + 1],
38 | ])
39 | else:
40 | features.append('BOS')
41 |
42 | if i > 1:
43 | features.extend([
44 | '-2:char=' + sent[i - 2],
45 | '-21:chars=' + sent[i - 2: i],
46 | '-210:chars=' + sent[i - 2: i + 1],
47 |
48 | # '-21:tags=' + tags[i - 2: i],
49 | # '-210:tags=' + tags[i - 2: i + 1],
50 | ])
51 |
52 | if i > 2:
53 | features.extend([
54 | '-3:char=' + sent[i - 3],
55 | '-321:chars=' + sent[i - 3: i],
56 | '-3210:chars=' + sent[i - 3: i + 1],
57 | ])
58 |
59 | if i < length - 1:
60 | features.extend([
61 | '+1:char=' + sent[i + 1],
62 | '+01:chars=' + sent[i: i + 2],
63 | # '+01:pmi=' + self.get_pmi(sent[i: i + 2]),
64 |
65 | # '+1:tag=' + tags[i + 1],
66 | # '+01:tags=' + tags[i: i + 2],
67 | ])
68 | else:
69 | features.append('EOS')
70 |
71 | if i < length - 2:
72 | features.extend([
73 | '+2:char=' + sent[i + 2],
74 | '+12:chars=' + sent[i + 1: i + 3],
75 | '+012:chars=' + sent[i: i + 3],
76 |
77 | # '+12:tags=' + tags[i + 1: i + 3],
78 | # '+012:tags=' + tags[i: i + 3],
79 | ])
80 |
81 | if i < length - 3:
82 | features.extend([
83 | '+3:char=' + sent[i + 3],
84 | '+123:chars=' + sent[i + 1: i + 4],
85 | '+0123:chars=' + sent[i: i + 4],
86 | ])
87 |
88 | if 0 < i < length - 1:
89 | features.extend([
90 | '-11:chars=' + sent[i - 1] + sent[i + 1],
91 | '-101:chars=' + sent[i - 1: i + 2],
92 | '-101:ttest=' + self.get_ttest(sent[i - 1: i + 2]),
93 | ])
94 |
95 | feat_list.append(features)
96 |
97 | return feat_list
98 |
99 | def punctuate(self, text):
100 | cut_feat_list = self.sentencizer.sent2features(text)
101 | cut_tags = self.sentencizer.tagger.tag(cut_feat_list)
102 | punc_feat_list = self.sent2features(text, cut_tags)
103 | punc_tags = self.tagger.tag(punc_feat_list)
104 |
105 | sents = []
106 | sent = ''
107 | for i, tag in enumerate(punc_tags):
108 | if tag in self.tag2punc:
109 | if sent:
110 | sents.append(sent)
111 | sent = ''
112 | sents.append(text[i])
113 | sents.append(self.tag2punc[tag])
114 | elif tag == 'B':
115 | if sent:
116 | sents.append(sent)
117 | sent = text[i]
118 | elif tag in {'M', 'E3', 'E2'}:
119 | sent += text[i]
120 | if sent:
121 | sents.append(sent)
122 |
123 | return ''.join(sents)
124 |
125 | def build_data(self, data_file):
126 | X = []
127 | Y = []
128 | for line in text_iterator(data_file, keep_punc=True):
129 | texts = [text for text in re_puncs_include.split(line) if text]
130 | texts = self.process_texts(texts)
131 |
132 | feat_list = []
133 | punc_tags = []
134 | for i in range(len(texts) - 1):
135 | if re_zh_exclude.match(texts[i]) and texts[i + 1] in self.punc2tag:
136 | cut_tags = self.sent2tags(texts[i])
137 | feat_list.extend(self.sent2features(texts[i], cut_tags))
138 | punc_tags.extend(self.sent2tags(texts[i], texts[i + 1]))
139 |
140 | X.append(feat_list)
141 | Y.append(punc_tags)
142 |
143 | return X, Y
144 |
145 | def process_texts(self, texts):
146 | while texts and texts[0] in self.punc2tag:
147 | texts = texts[1:]
148 |
149 |         if len(texts) % 2 != 0:  # an odd count means the last sentence has no following punctuation
150 | texts.append('。')
151 |
152 | return texts
153 |
154 | def eval(self, test_x, test_y, crf_model):
155 | tagger = pycrfsuite.Tagger()
156 | tagger.open(crf_model)
157 |
158 | pred_y = []
159 | for feat_list in test_x:
160 | preds = tagger.tag(feat_list)
161 | pred_y.append(preds)
162 |
163 | y_trues = [tag for tag in list(chain.from_iterable(test_y)) if tag not in {'B', 'M', 'E3', 'E2'}]
164 | y_preds = [tag for tag in list(chain.from_iterable(pred_y)) if tag not in {'B', 'M', 'E3', 'E2'}]
165 |
166 | lb = LabelBinarizer()
167 | y_true_all = lb.fit_transform(y_trues)
168 | y_pred_all = lb.transform(y_preds)
169 |
170 | tagset = sorted(set(lb.classes_))
171 | class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
172 |
173 | print(classification_report(
174 | y_true_all,
175 | y_pred_all,
176 | labels=[class_indices[cls] for cls in tagset],
177 | target_names=tagset,
178 | digits=5
179 | ))
180 |
--------------------------------------------------------------------------------
/jiayan/sentencizer/crf_sent_tagger.py:
--------------------------------------------------------------------------------
1 | import random
2 | from itertools import chain
3 |
4 | import pycrfsuite
5 | from sklearn.metrics import classification_report
6 | from sklearn.preprocessing import LabelBinarizer
7 |
8 |
9 | class CRFSentTagger:
10 |
11 | def __init__(self, lm):
12 | self.lm = lm
13 | self.tagger = None
14 |
15 | # for feature extraction of punctuator
16 | self.punc2tag = {
17 | '。': 'J',
18 | '!': 'G',
19 | '?': 'W',
20 | ',': 'D',
21 | '、': 'U',
22 | ':': 'A',
23 | ';': 'F',
24 | }
25 |
26 | # for decoding of punctuator
27 | self.tag2punc = {
28 | 'J': '。',
29 | 'G': '!',
30 | 'W': '?',
31 | 'D': ',',
32 | 'U': '、',
33 | 'A': ':',
34 | 'F': ';',
35 | }
36 |
37 | def load(self, crf_model):
38 | self.tagger = pycrfsuite.Tagger()
39 | self.tagger.open(crf_model)
40 |
41 | def sent2features(self, sent: str, tags=None):
42 | pass
43 |
44 | def sent2tags(self, sent: str, punc=''):
45 | single_tag = 'S'
46 | end_tag = 'E'
47 |
48 | if punc:
49 | single_tag = self.punc2tag[punc]
50 | end_tag = self.punc2tag[punc]
51 |
52 | length = len(sent)
53 | if length == 1:
54 | tags = [single_tag]
55 | elif length == 2:
56 | tags = ['B', end_tag]
57 | elif length == 3:
58 | tags = ['B', 'E2', end_tag]
59 | elif length == 4:
60 | tags = ['B', 'E3', 'E2', end_tag]
61 | elif length == 5:
62 | tags = ['B', 'M', 'E3', 'E2', end_tag]
63 | else:
64 | tags = ['B'] + ['M'] * (length - 4) + ['E3', 'E2', end_tag]
65 |
66 | return tags
67 |
68 | def get_pmi(self, seg):
69 | pmi = self.lm.score(' '.join(seg), eos=False, bos=False) - \
70 | (self.lm.score(seg[0], eos=False, bos=False) + self.lm.score(seg[1], eos=False, bos=False))
71 | if pmi >= 2:
72 | return '2'
73 | elif pmi >= 1.5:
74 | return '1.5'
75 | elif pmi >= 1:
76 | return '1'
77 | elif pmi >= 0.5:
78 | return '0.5'
79 | return '0'
80 |
81 | def get_ttest(self, seg):
82 | former = self.lm.score(' '.join(seg[:2]), eos=False, bos=False) - self.lm.score(seg[0], eos=False, bos=False)
83 | latter = self.lm.score(' '.join(seg[1:]), eos=False, bos=False) - self.lm.score(seg[1], eos=False, bos=False)
84 | diff = former - latter
85 | if diff > 0:
86 | return 'l'
87 | elif diff == 0:
88 | return 'u'
89 | else:
90 | return 'r'
91 |
92 | def train(self, train_x, train_y, out_model):
93 | trainer = pycrfsuite.Trainer(verbose=False)
94 | for x, y in zip(train_x, train_y):
95 | if x and y:
96 | trainer.append(x, y)
97 |
98 | trainer.set_params({
99 | 'c1': 1.0, # coefficient for L1 penalty
100 | 'c2': 1e-3, # coefficient for L2 penalty
101 | 'max_iterations': 50, # stop earlier
102 | 'feature.possible_transitions': True # include transitions that are possible, but not observed
103 | })
104 |
105 | trainer.train(out_model)
106 | print(trainer.logparser.last_iteration)
107 |
108 | def build_data(self, data_file):
109 | pass
110 |
111 |     def split_data(self, X, Y):
112 |         # Shuffle X and Y with the same permutation; the two-argument
113 |         # random.shuffle(x, func) form used previously was removed in Python 3.11.
114 |         random.seed(42)
115 |         indices = list(range(len(X)))
116 |         random.shuffle(indices)
117 | 
118 |         X[:] = [X[i] for i in indices]
119 |         Y[:] = [Y[i] for i in indices]
120 | 
121 |         ratio = round(len(X) * 0.9)
122 |         return X[:ratio], Y[:ratio], X[ratio:], Y[ratio:]
123 |
124 | def eval(self, test_x, test_y, crf_model):
125 | tagger = pycrfsuite.Tagger()
126 | tagger.open(crf_model)
127 |
128 | y_pred = []
129 | for feat_list in test_x:
130 | preds = tagger.tag(feat_list)
131 | y_pred.append(preds)
132 |
133 | lb = LabelBinarizer()
134 | y_true_all = lb.fit_transform(list(chain.from_iterable(test_y)))
135 | y_pred_all = lb.transform(list(chain.from_iterable(y_pred)))
136 |
137 | tagset = sorted(set(lb.classes_))
138 | class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
139 |
140 | print(classification_report(
141 | y_true_all,
142 | y_pred_all,
143 | labels=[class_indices[cls] for cls in tagset],
144 | target_names=tagset,
145 | digits=5
146 | ))
147 |
148 |
149 |
--------------------------------------------------------------------------------
/jiayan/sentencizer/crf_sentencizer.py:
--------------------------------------------------------------------------------
1 | from itertools import chain
2 |
3 | from jiayan.globals import re_puncs_exclude
4 | from jiayan.utils import text_iterator
5 | from jiayan.sentencizer.crf_sent_tagger import CRFSentTagger
6 |
7 |
8 | class CRFSentencizer(CRFSentTagger):
9 |
10 | def __init__(self, lm):
11 | super(CRFSentencizer, self).__init__(lm)
12 |
13 | def sent2features(self, sent: str, tags=None):
14 | length = len(sent)
15 | feat_list = []
16 | for i, char in enumerate(sent):
17 | features = [
18 | 'bias',
19 | '0:char=' + char,
20 | ]
21 |
22 | if i > 0:
23 | features.extend([
24 | '-1:char=' + sent[i - 1],
25 | '-10:chars=' + sent[i - 1: i + 1],
26 | '-10:pmi=' + self.get_pmi(sent[i - 1: i + 1]),
27 | ])
28 | else:
29 | features.append('BOS')
30 |
31 | if i > 1:
32 | features.extend([
33 | # '-2:char=' + sent[i - 2],
34 | '-21:chars=' + sent[i - 2: i],
35 | '-210:chars=' + sent[i - 2: i + 1],
36 | ])
37 |
38 | if i < length - 1:
39 | features.extend([
40 | '+1:char=' + sent[i + 1],
41 | '+01:chars=' + sent[i: i + 2],
42 | '+01:pmi=' + self.get_pmi(sent[i: i + 2]),
43 | ])
44 | else:
45 | features.append('EOS')
46 |
47 | if i < length - 2:
48 | features.extend([
49 | # '+2:char=' + sent[i + 2],
50 | '+12:chars=' + sent[i + 1: i + 3],
51 | '+012:chars=' + sent[i: i + 3],
52 | ])
53 |
54 | if 0 < i < length - 1:
55 | features.extend([
56 | '-11:chars=' + sent[i - 1] + sent[i + 1],
57 | '-101:chars=' + sent[i - 1: i + 2],
58 | '-101:ttest=' + self.get_ttest(sent[i - 1: i + 2]),
59 | ])
60 |
61 | feat_list.append(features)
62 |
63 | return feat_list
64 |
65 | def sentencize(self, text):
66 | feat_list = self.sent2features(text)
67 | tags = self.tagger.tag(feat_list)
68 |
69 | sents = []
70 | sent = ''
71 | for i, tag in enumerate(tags):
72 | if tag == 'S':
73 | if sent:
74 | sents.append(sent)
75 | sent = ''
76 | sents.append(text[i])
77 | elif tag == 'B':
78 | if sent:
79 | sents.append(sent)
80 | sent = text[i]
81 | elif tag in {'M', 'E3', 'E2', 'E'}:
82 | sent += text[i]
83 | if sent:
84 | sents.append(sent)
85 |
86 | return sents
87 |
88 | def build_data(self, data_file):
89 | X = []
90 | Y = []
91 |
92 | for line in text_iterator(data_file, keep_punc=True):
93 | sents = [sent for sent in re_puncs_exclude.split(line) if sent]
94 | feat_list = self.sent2features(''.join(sents))
95 | tag_list = list(chain.from_iterable([self.sent2tags(sent) for sent in sents]))
96 | X.append(feat_list)
97 | Y.append(tag_list)
98 |
99 | return X, Y
100 |
101 |
102 |
--------------------------------------------------------------------------------
/jiayan/tokenizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/tokenizer/__init__.py
--------------------------------------------------------------------------------
/jiayan/tokenizer/hmm_tokenizer.py:
--------------------------------------------------------------------------------
1 | from math import log10
2 |
3 | from jiayan.globals import re_zh_include, stopchars
4 |
5 | """
6 | Use HMM to consider word detection as a char sequence tagging problem.
7 |
8 | With a word dict and a char sequence there are many possible tokenizations, and the best one maximizes
9 | the product of its word probabilities:
10 | (see Max Probability Tokenizing: [https://blog.csdn.net/u010189459/article/details/37956689])
11 | p(S) = p(w1) * p(w2) * p(w3)...p(wn)
12 |
13 | However, without a word dict we don't know where the word boundaries are. Instead, we can use a char-level
14 | language model to compute the probability of a candidate word first:
15 | p(w) = p(c1, c2, c3, c4) = p(c1) * p(c2|c1) * p(c3|c1, c2) * p(c4|c1, c2, c3)
16 |
17 | Here the word "w" is a 4-char word made of c1, c2, c3 and c4, and the probability of each char occurring at
18 | its position can be computed with an N-gram model.
19 |
20 | Assume the longest word we care about has 4 chars. Then in a sentence of length L (a sequence of L chars), each
21 | char can occupy one of 4 possible positions within a word, each associated with the probability of being at that
22 | position (k indicates the kth char in the sequence):
23 |
24 | 1. the beginning of the word (b): p(ck)
25 | 2. the second char of the word (c): p(ck|ck-1)
26 | 3. the third char of the word (d): p(ck|ck-2, ck-1)
27 | 4. the fourth char of the word (e): p(ck|ck-3, ck-2, ck-1)
28 |
29 | So a char sequence can first be tagged at the char level with labels {b, c, d, e}, and then chunked into words
30 | based on those tags. The word-level problem is thus reduced to a char-level problem with hidden states, which is
31 | exactly the decoding problem of an HMM: we can use the Viterbi algorithm to find the best tag/state sequence for
32 | the char/observation sequence.
33 |
34 | For Viterbi we need (a) initial probabilities of each state, (b) transition probabilities between states, and
35 | (c) emission probabilities of states emitting different observations. Let's draw a table to see what they should
36 | be in this problem:
37 |
38 | ----------------------------------------------------
39 | start -> b b b
40 | c c c
41 | d d d
42 | e e e
43 |
44 | char sequence: char1 char2 char3 ...
45 | -----------------------------------------------------
46 |
47 | So for each char in the sequence, there are 4 possible states.
48 | For (a), only "b" can start a sequence, so p(b|start) = 1, and p(c|start) = p(d|start) = p(e|start) = 0.
49 | For (b), consider the longest word "bcde"; the possible state transitions are limited to:
50 | i. b -> b, b -> c: the beginning of a word either goes to a new word beginning, or the 2nd char;
51 | ii. c -> b, c -> d: the 2nd char either goes to a new word beginning, or the 3rd char;
52 | iii. d -> b, d -> e: the 3rd char either goes to a new word beginning, or the 4th char;
53 | iv. e -> b, e -> e: the 4th char either goes to a new word beginning, or the 5th char ...
54 | For (c), the emission probability of one char at a certain state can be computed with the N-gram model, e.g.,
55 | p(ck|d) = p(ck|ck-1, ck-2)
56 |
57 | The only parameters that we cannot compute here are the transition probabilities, which we set manually.
58 |
59 | Differences from regular HMM tokenizing:
60 | (a) regular HMM tokenizing uses the label set {B, M, E, S} to tag the char sequence, which is too vague to
61 | indicate the exact char position within a word (especially "M"), making emission probabilities hard to compute;
62 | (b) regular HMM tokenizing requires large amounts of segmented data to estimate transition and emission
63 | probabilities, but here our goal is the opposite: to generate that word corpus;
64 | (c) regular HMM tokenizing computes transition probabilities from data, but here we set them manually;
65 | (d) regular HMM tokenizing computes emission probabilities from data, but here we use a char-level N-gram
66 | language model.
67 |
68 | Disadvantages:
69 | (a) slow: the data must be read once to build ngrams from the min to the max word length, and read again to
70 | tokenize the whole corpus and build the word list, running Viterbi on every sentence;
71 | (b) bad at long words: the transition probabilities must be fine-tuned to control word lengths, which is time
72 | consuming, and the detected long words are not as good as the short ones;
73 | (c) fake word frequency: since the word corpus is built by tokenizing, which can produce inaccurate splits, the
74 | word count doesn't reflect the true frequency, e.g., "天下" in "于天下"; so we use the true frequency count
75 | from the ngrams dict when filtering.
76 | """
77 |
78 |
79 | class CharHMMTokenizer:
80 |
81 | def __init__(self, lm):
82 | self.lm = lm
83 | self.inits = {'b': 0.0, 'c': -3.14e100, 'd': -3.14e100, 'e': -3.14e100}
84 |
85 | # the transition probabilities are manually fine-tuned;
86 | # in principle, we prefer shorter words over longer ones;
87 | # low to-b and high to-next-char-in-word transition probs lead to longer words;
88 | # high to-b and low to-next-char-in-word transition probs lead to shorter words.
89 | trans = {'bb': 0.85, 'bc': 0.15,
90 | 'cb': 0.9925, 'cd': 0.0075,
91 | 'db': 0.999, 'de': 0.001,
92 | 'eb': 0.9999, 'ee': 0.0001}
93 | # trans = {'bb': 0.8, 'bc': 0.2,
94 | # 'cb': 0.9925, 'cd': 0.0075,
95 | # 'db': 0.999, 'de': 0.001,
96 | # 'eb': 0.9999, 'ee': 0.0001}
97 |
98 | # convert the decimal probabilities to logs to avoid overflow
99 | self.trans = {states: log10(trans_prob) for states, trans_prob in trans.items()}
100 |
101 | def tokenize(self, text: str):
102 | """ Gets the tags of given sentence, and tokenizes sentence based on the tag sequence.
103 | """
104 | # split text by whitespaces first, then split each segment into char chunks by zh chars
105 | for seg in text.strip().split():
106 | if seg:
107 | for chunk in re_zh_include.split(seg):
108 | # if zh chars, tokenize them
109 | if re_zh_include.match(chunk):
110 | tags = self.viterbi(chunk)
111 |
112 | word = chunk[0]
113 | for i in range(1, len(chunk)):
114 | if tags[i] == 'b':
115 | if not self.valid_word(word):
116 | for char in word:
117 | yield char
118 | else:
119 | yield word
120 | word = chunk[i]
121 | else:
122 | word += chunk[i]
123 | if word:
124 | if not self.valid_word(word):
125 | for char in word:
126 | yield char
127 | else:
128 | yield word
129 |
130 | # if not zh chars, we assume they are all punctuations, split them
131 | else:
132 | for char in chunk:
133 | yield char
134 |
135 | def viterbi(self, sent):
136 | """ Chooses the most likely char tag sequence of given char sentence.
137 | """
138 | emits = self.get_emission_probs(sent)
139 |
140 | # record the best path to each state of the current char, as {path1: path_prob, path2: path_prob, ...};
141 | # paths grow at each decoding step, and eventually hold the best path ending in each state of the last char;
142 | # the initial path probs combine the initial state probs with the 1st char's emission probs
143 | paths = {state: prob + self.inits[state] for state, prob in emits[0].items()}
144 |
145 | # for each char
146 | for i in range(1, len(sent)):
147 | # print(paths)
148 |
149 | # record best paths and their probs to all states of current char
150 | cur_char_paths = {}
151 |
152 | # for each state of current char
153 | for state, emit_prob in emits[i].items():
154 |
155 | # record all possible paths and their probs to current state
156 | cur_state_paths = {}
157 |
158 | # for each state of previous char
159 | for path, path_prob in paths.items():
160 | trans_states = path[-1] + state
161 |
162 | # compute the path prob from a previous state to current state
163 | if trans_states in self.trans:
164 | cur_state_paths[path + state] = path_prob + emit_prob + self.trans[trans_states]
165 |
166 | # choose the best path from all previous paths to current state
167 | best_path = sorted(cur_state_paths, key=lambda x: cur_state_paths[x])[-1]
168 |
169 | # for current state of current char, we found its best path
170 | cur_char_paths[best_path] = cur_state_paths[best_path]
171 |
172 | # the paths grow by one char/state
173 | paths = cur_char_paths
174 |
175 | return sorted(paths, key=lambda x: paths[x])[-1]
176 |
177 | def get_emission_probs(self, sent):
178 | """ Computes emission probability of each state emitting relative char in the given char sequence. """
179 | return [
180 |
181 | {'b': self.seg_prob(sent[i]),
182 | 'c': self.seg_prob(sent[i - 1:i + 1]),
183 | 'd': self.seg_prob(sent[i - 2:i + 1]),
184 | 'e': self.seg_prob(sent[i - 3:i + 1])
185 | }
186 |
187 | for i in range(len(sent))
188 | ]
189 |
190 | def seg_prob(self, seg):
191 | """ Computes the segment probability based on ngrams model.
192 | If given an empty segment, it means it's impossible for current char to be at current position of a word,
193 | thus return default low log prob -100.
194 | """
195 | return (self.lm.score(' '.join(seg), bos=False, eos=False) -
196 | self.lm.score(' '.join(seg[:-1]), bos=False, eos=False)) \
197 | or -100.0
198 |
199 | def valid_word(self, word):
200 | """ Checks if a word contains stopchars, if yes, it's not a valid word. """
201 | for char in word:
202 | if char in stopchars:
203 | return False
204 | return True
205 |
206 |
207 |
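A hedged usage sketch for the tokenizer above, plus a tiny illustration of how the {b, c, d, e} tags from viterbi() are chunked into words. It assumes a KenLM character model trained on space-separated chars (e.g. 'jiayan.klm'); kenlm.LanguageModel provides the score() calls that seg_prob() relies on.

# Minimal sketch, assuming a KenLM char model file named 'jiayan.klm'.
import kenlm
from jiayan.tokenizer.hmm_tokenizer import CharHMMTokenizer

lm = kenlm.LanguageModel('jiayan.klm')       # char-level n-gram LM
tokenizer = CharHMMTokenizer(lm)
print(list(tokenizer.tokenize('是故内圣外王之道暗而不明郁而不发')))

# How tags become words: a tag of 'b' starts a new word, any other tag extends
# the current one, e.g.
#   chars: 天 下 大 势      tags: b c b c      ->      ['天下', '大势']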
--------------------------------------------------------------------------------
/jiayan/tokenizer/ngram_tokenizer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import marshal
3 | from math import log
4 |
5 | from jiayan.globals import re_zh_include
6 |
7 | """
8 | References:
9 | [https://github.com/fxsjy/jieba]
10 | """
11 |
12 | dir_path = os.path.dirname(os.path.realpath(__file__))
13 | root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
14 |
15 | dict_path = os.path.join(root, 'data/dict.txt')
16 | cache_path = os.path.join(dir_path, 'tokenizer.cache')
17 |
18 |
19 | class WordNgramTokenizer:
20 |
21 | def __init__(self, dict_f=None):
22 | if not dict_f:
23 | dict_f = dict_path
24 | self.cache = cache_path
25 | self.PREFIX, self.total = self.check_cache(dict_f)
26 |
27 | def check_cache(self, dict_f):
28 | """ Loads frequency dict and total word counts from cache.
29 | """
30 | if os.path.isfile(self.cache):
31 | with open(self.cache, 'rb') as cf:
32 | return marshal.load(cf)
33 | else:
34 | # if no cache, generate freq dict and dump the cache
35 | PREFIX, total = self.gen_prefix_dict(dict_f)
36 | with open(self.cache, 'wb') as temp_cache_file:
37 | marshal.dump((PREFIX, total), temp_cache_file)
38 | return PREFIX, total
39 |
40 | def clear_cache(self):
41 | if os.path.isfile(self.cache):
42 | os.remove(self.cache)
43 |
44 | @staticmethod
45 | def gen_prefix_dict(dict_f):
46 | """ Reads a dict file and generates the prefix dictionary with total word counts.
47 | """
48 | word_counts = {}
49 | with open(dict_f, 'rb') as f:
50 | for line in f:
51 | line = line.strip().decode('utf-8')
52 | word, freq = line.split(',')
53 | word_counts[word] = int(freq)
54 |
55 | # enumerate all prefixes of a word to enrich the vocab
56 | for i in range(len(word)):
57 | prefix = word[:i + 1]
58 | if prefix not in word_counts:
59 | word_counts[prefix] = 0
60 |
61 | return word_counts, sum(word_counts.values())
62 |
63 | def tokenize(self, text):
64 | # split zh chars and non-zh chars into chunks
65 | chunks = re_zh_include.split(text)
66 |
67 | for chk in chunks:
68 | if chk:
69 | # if the chunk is zh, tokenize it
70 | if re_zh_include.match(chk):
71 | for word in self.cut_DAG(chk):
72 | yield word
73 | # if the chunk is not zh, treat it as a single word
74 | else:
75 | yield chk
76 |
77 | def cut_DAG(self, sentence):
78 | """ Cuts the DAG according to max route probabilities.
79 | """
80 | DAG = self.gen_DAG(sentence)
81 | route = {}
82 | self.calculate_route_prob(sentence, DAG, route)
83 |
84 | start = 0
85 | N = len(sentence)
86 |
87 | while start < N:
88 | end = route[start][1]
89 | word = sentence[start:end + 1]
90 | yield word
91 | start = end + 1
92 |
93 | def gen_DAG(self, sentence):
94 | """ Generates DAG based on given sentence and prefix dict.
95 | """
96 | DAG = {}
97 | N = len(sentence)
98 |
99 | for start in range(N):
100 | ends = []
101 | end = start
102 | prefix = sentence[start]
103 | while end < N and prefix in self.PREFIX:
104 | if self.PREFIX[prefix]:
105 | ends.append(end)
106 | end += 1
107 |
108 | # extend prefix
109 | prefix = sentence[start:end + 1]
110 |
111 | # if no words formed starting from current char, OOV, it ends with itself
112 | if not ends:
113 | ends.append(start)
114 |
115 | DAG[start] = ends
116 |
117 | return DAG
118 |
119 | def calculate_route_prob(self, sentence, DAG, route):
120 | """ Uses dynamic programming to compute the tokenizing solution with highest probability.
121 | """
122 | N = len(sentence)
123 |
124 | # each position in the route is stored as "position: (prob, end)", where the value
125 | # tuple contains the highest log prob of tokenizing the rest of the sentence starting
126 | # at this position, and the ending position of the word that starts here;
127 | # in other words, sentence[position: end + 1] is the first word of the best
128 | # tokenization of sentence[position:]
129 | route[N] = (0, 0)
130 | log_total = log(self.total)
131 |
132 | # compute backwards, because route[i] depends on route[end + 1], which must be computed first
133 | for i in range(N - 1, -1, -1):
134 |
135 | # for each word start position, list all its possible word ending positions,
136 | # compute their word probabilities, add the probability of the best remaining path,
137 | # and choose the end position that makes the whole path probability highest
138 |
139 | # the value from the PREFIX dict could be either None or 0; we assume each word
140 | # appears at least once, like add-1 Laplace smoothing
141 | route[i] = max((log(self.PREFIX.get(sentence[i:end + 1]) or 1) - log_total
142 | + route[end + 1][0], end) for end in DAG[i])
143 |
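A hedged usage sketch for the tokenizer above, followed by an illustration of the DAG that gen_DAG builds (the two-word dictionary in the comment is hypothetical; the shipped data/dict.txt is the real default).

# Minimal sketch: the default constructor falls back to data/dict.txt
# ("word,freq" per line) and caches the prefix dict in tokenizer.cache.
from jiayan.tokenizer.ngram_tokenizer import WordNgramTokenizer

tokenizer = WordNgramTokenizer()
print(list(tokenizer.tokenize('天下大乱贤圣不明')))

# DAG illustration with a hypothetical dict containing only 天下 and 大乱:
# for '天下大乱', gen_DAG() maps each start index to candidate word end indices,
#   {0: [1], 1: [1], 2: [3], 3: [3]}
# (single chars fall back to themselves when no dict word starts there), and
# calculate_route_prob() then picks, for each start, the end that maximizes the
# log prob of the word plus the best remaining path.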
--------------------------------------------------------------------------------
/jiayan/translator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/translator/__init__.py
--------------------------------------------------------------------------------
/jiayan/utils.py:
--------------------------------------------------------------------------------
1 | from jiayan.globals import re_invalid_chars, re_zh_exclude
2 |
3 |
4 | def process_line(line: str):
5 | """ A standard approach to process input line, by
6 | 1. retain and replace valid punctuations;
7 | 2. removing non-zh and invalid punctuation chars;
8 | """
9 | line = line.strip().replace(',', ',').replace('.', '。').replace(':', ':').\
10 | replace('!', '!').replace('?', '?').replace(';', ';')
11 | line = re_invalid_chars.sub('', line)
12 | return line
13 |
14 |
15 | def text_iterator(data_file, keep_punc=False):
16 | """ A help function to provide clean zh char lines of a given file. """
17 | with open(data_file, 'r', encoding='utf-8') as f:
18 | for line in f:
19 | for seg in line.strip().split():
20 | if seg:
21 | seg = process_line(seg)
22 | if keep_punc:
23 | if seg:
24 | yield seg
25 | else:
26 | for text in re_zh_exclude.findall(seg):
27 | if text:
28 | yield text
29 |
30 |
31 | def make_kenlm(data_file):
32 | for text in text_iterator(data_file):
33 | print(' '.join(text))
34 |
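A hedged sketch of how these helpers fit together: text_iterator yields cleaned zh segments, and make_kenlm prints them one space-separated char sequence per line, which is the plain-text input format KenLM's training tools expect. The corpus file name and the KenLM commands in the comments are illustrative, not part of the repo.

# Minimal sketch, assuming a raw corpus file named 'corpus.txt'. Redirect the
# printed output to a text file and train KenLM on it, e.g. (KenLM CLI, run separately):
#   lmplz -o 3 < chars.txt > jiayan.arpa
#   build_binary jiayan.arpa jiayan.klm
from jiayan.utils import make_kenlm, process_line

print(process_line('学而时习之,不亦说乎?'))    # half-width punctuation becomes full-width
make_kenlm('corpus.txt')                         # prints one space-joined char line per segment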
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | https://github.com/kpu/kenlm/archive/master.zip
2 | scikit-learn
3 | python-crfsuite
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 |
4 | import sys
5 | from setuptools import setup, find_packages
6 |
7 |
8 | requirements = ["scikit-learn", "python-crfsuite"]
9 |
10 | if sys.version_info[:2] < (2, 7):
11 | requirements.append('argparse')
12 | if sys.version_info[:2] < (3, 4):
13 | requirements.append('enum34')
14 | if sys.version_info[:2] < (3, 5):
15 | requirements.append('typing')
16 |
17 | extras_require = {
18 | ':python_version<"2.7"': ['argparse'],
19 | ':python_version<"3.4"': ['enum34'],
20 | ':python_version<"3.5"': ['typing'],
21 | }
22 |
23 | setup(
24 | name="jiayan",
25 | version="0.0.21",
26 | author="Jiajie Yan",
27 | author_email="jiaeyan@gmail.com",
28 | description="The NLP toolkit designed for Classical Chinese.",
29 | long_description=open("README.md", encoding="utf-8").read(),
30 | long_description_content_type='text/markdown',
31 | license="MIT",
32 | url="https://github.com/jiaeyan/Jiayan",
33 | keywords=['classical-chinese', 'ancient-chinese', 'nlp'],
34 | packages=find_packages(),
35 | install_requires=requirements,
36 | extras_require=extras_require,
37 | python_requires='>=2.6, >=3',
38 | include_package_data=True,
39 | classifiers=[
40 | 'Programming Language :: Python',
41 | 'Programming Language :: Python :: 2',
42 | 'Programming Language :: Python :: 2.6',
43 | 'Programming Language :: Python :: 2.7',
44 | 'Programming Language :: Python :: 3',
45 | 'Programming Language :: Python :: 3.3',
46 | 'Programming Language :: Python :: 3.4',
47 | 'Programming Language :: Python :: 3.5',
48 | 'Programming Language :: Python :: 3.6',
49 | 'Topic :: Utilities',
50 | 'Topic :: Text Processing',
51 | ]
52 | )
53 |
--------------------------------------------------------------------------------