├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── jiayan ├── __init__.py ├── __main__.py ├── data │ ├── __init__.py │ ├── char_pos_dict.json │ └── dict.txt ├── examples.py ├── globals.py ├── lexicon │ ├── __init__.py │ └── pmi_entropy_constructor.py ├── linguistic_unit.py ├── postagger │ ├── README.md │ ├── __init__.py │ └── crf_pos_tagger.py ├── sentencizer │ ├── __init__.py │ ├── crf_punctuator.py │ ├── crf_sent_tagger.py │ └── crf_sentencizer.py ├── tokenizer │ ├── __init__.py │ ├── hmm_tokenizer.py │ └── ngram_tokenizer.py ├── translator │ └── __init__.py └── utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Jiayan 3 | ################# 4 | test.py 5 | jiayan.klm 6 | tokenizer.cache 7 | cut_model 8 | punc_model 9 | pos_model 10 | 11 | 12 | ################# 13 | ## Eclipse 14 | ################# 15 | 16 | *.pydevproject 17 | .project 18 | .metadata 19 | bin/ 20 | tmp/ 21 | *.tmp 22 | *.bak 23 | *.swp 24 | *~.nib 25 | local.properties 26 | .classpath 27 | .settings/ 28 | .loadpath 29 | 30 | # External tool builders 31 | .externalToolBuilders/ 32 | 33 | # Locally stored "Eclipse launch configurations" 34 | *.launch 35 | 36 | # CDT-specific 37 | .cproject 38 | 39 | # PDT-specific 40 | .buildpath 41 | 42 | 43 | ################# 44 | ## Visual Studio 45 | ################# 46 | 47 | ## Ignore Visual Studio temporary files, build results, and 48 | ## files generated by popular Visual Studio add-ons. 49 | 50 | # User-specific files 51 | *.suo 52 | *.user 53 | *.sln.docstates 54 | 55 | # Build results 56 | [Dd]ebug/ 57 | [Rr]elease/ 58 | *_i.c 59 | *_p.c 60 | *.ilk 61 | *.meta 62 | *.obj 63 | *.pch 64 | *.pdb 65 | *.pgc 66 | *.pgd 67 | *.rsp 68 | *.sbr 69 | *.tlb 70 | *.tli 71 | *.tlh 72 | *.tmp 73 | *.vspscc 74 | .builds 75 | *.dotCover 76 | 77 | ## TODO: If you have NuGet Package Restore enabled, uncomment this 78 | #packages/ 79 | 80 | # Visual C++ cache files 81 | ipch/ 82 | *.aps 83 | *.ncb 84 | *.opensdf 85 | *.sdf 86 | 87 | # Visual Studio profiler 88 | *.psess 89 | *.vsp 90 | 91 | # ReSharper is a .NET coding add-in 92 | _ReSharper* 93 | 94 | # Installshield output folder 95 | [Ee]xpress 96 | 97 | # DocProject is a documentation generator add-in 98 | DocProject/buildhelp/ 99 | DocProject/Help/*.HxT 100 | DocProject/Help/*.HxC 101 | DocProject/Help/*.hhc 102 | DocProject/Help/*.hhk 103 | DocProject/Help/*.hhp 104 | DocProject/Help/Html2 105 | DocProject/Help/html 106 | 107 | # Click-Once directory 108 | publish 109 | 110 | # Others 111 | [Bb]in 112 | [Oo]bj 113 | sql 114 | TestResults 115 | *.Cache 116 | ClientBin 117 | stylecop.* 118 | ~$* 119 | *.dbmdl 120 | Generated_Code #added for RIA/Silverlight projects 121 | 122 | # Backup & report files from converting an old project file to a newer 123 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 124 | _UpgradeReport_Files/ 125 | Backup*/ 126 | UpgradeLog*.XML 127 | ############ 128 | ## pycharm 129 | ############ 130 | .idea 131 | 132 | ############ 133 | ## Windows 134 | ############ 135 | 136 | # Windows image file caches 137 | Thumbs.db 138 | 139 | # Folder config file 140 | Desktop.ini 141 | 142 | 143 | ############# 144 | ## Python 145 | ############# 146 | 147 | *.py[co] 148 | 149 | # Packages 150 | *.egg 151 | *.egg-info 152 | dist 153 | build 154 | eggs 155 | parts 156 | bin 157 | var 158 | sdist 159 | develop-eggs 160 | .installed.cfg 161 | 162 | # Installer logs 163 | pip-log.txt 164 | 165 | # Unit test / coverage reports 166 | .coverage 167 | .tox 168 | 169 | #Translations 170 | *.mo 171 | 172 | #Mr Developer 173 | .mr.developer.cfg 174 | 175 | # Mac crap 176 | .DS_Store 177 | *.log 178 | test/tmp/* 179 | 180 | #jython 181 | *.class 182 | 183 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jiajie Yan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include jiayan/data/dict.txt 2 | include jiayan/data/char_pos_dict.json -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 甲言Jiayan 2 | [![PyPI](https://img.shields.io/badge/pypi-v0.0.21-blue.svg)](https://pypi.org/project/jiayan/) 3 | ![License](https://img.shields.io/badge/license-MIT-yellow.svg) 4 | 5 | [中文](#简介) 6 | [English](#introduction) 7 | 8 | ## 简介 9 | 甲言,取「甲骨文言」之意,是一款专注于古汉语处理的NLP工具包。 10 | 目前通用的汉语NLP工具多以现代汉语为核心语料,对古代汉语的处理效果并不如人意(详见[分词](#2))。本项目的初衷,便是辅助古汉语信息处理,帮助有志于挖掘古文化矿藏的古汉语学者、爱好者等更好地分析和利用文言资料,从「文化遗产」中创造出「文化新产」。 11 | 当前版本支持[词库构建](#1)、[自动分词](#2)、[词性标注](#3)、[文言句读](#4)和[标点](#5)五项功能,更多功能正在开发中。 12 | 13 | ## 功能 14 | * [__词库构建__](#1) 15 | * 利用无监督的双[字典树](https://baike.baidu.com/item/Trie树)、[点互信息](https://www.jianshu.com/p/79de56cbb2c7)以及左右邻接[熵](https://baike.baidu.com/item/信息熵/7302318?fr=aladdin)进行文言词库自动构建。 16 | * [__分词__](#2) 17 | * 利用无监督、无词典的[N元语法](https://baike.baidu.com/item/n元语法)和[隐马尔可夫模型](https://baike.baidu.com/item/隐马尔可夫模型)进行古汉语自动分词。 18 | * 利用词库构建功能产生的文言词典,基于有向无环词图、句子最大概率路径和动态规划算法进行分词。 19 | * [__词性标注__](#3) 20 | * 基于词的[条件随机场](https://baike.baidu.com/item/条件随机场)的序列标注,词性详见[词性表](jiayan/postagger/README.md)。 21 | * [__断句__](#4) 22 | * 基于字符的条件随机场的序列标注,引入点互信息及[t-测试值](https://baike.baidu.com/item/t检验/9910799?fr=aladdin)为特征,对文言段落进行自动断句。 23 | * [__标点__](#5) 24 | * 基于字符的层叠式条件随机场的序列标注,在断句的基础上对文言段落进行自动标点。 25 | * 文白翻译 26 | * 开发中,目前处于文白平行语料收集、清洗阶段。 27 | * 基于[双向长短时记忆循环网络](https://baike.baidu.com/item/长短期记忆人工神经网络/17541107?fromtitle=LSTM&fromid=17541102&fr=aladdin)和[注意力机制](https://baike.baidu.com/item/注意力机制)的神经网络生成模型,对古文进行自动翻译。 28 | * 注意:受语料影响,目前不支持繁体。如需处理繁体,可先用[OpenCC](https://github.com/yichen0831/opencc-python)将输入转换为简体,再将结果转化为相应繁体(如港澳台等)。 29 | 30 | ## 安装 31 | $ pip install jiayan 32 | $ pip install https://github.com/kpu/kenlm/archive/master.zip 33 | 34 | ## 使用 35 | 以下各模块的使用方法均来自[examples.py](jiayan/examples.py)。 36 | 1. 下载模型并解压:[百度网盘](https://pan.baidu.com/s/1PXP0eSQWWcNmAb6lkuB5sw),提取码:`p0sc` 37 | * jiayan.klm:语言模型,主要用来分词,以及句读标点任务中的特征提取; 38 | * pos_model:CRF词性标注模型; 39 | * cut_model:CRF句读模型; 40 | * punc_model:CRF标点模型; 41 | * 庄子.txt:用来测试词库构建的庄子全文。 42 | 43 | 2. __词库构建__ 44 | ``` 45 | from jiayan import PMIEntropyLexiconConstructor 46 | 47 | constructor = PMIEntropyLexiconConstructor() 48 | lexicon = constructor.construct_lexicon('庄子.txt') 49 | constructor.save(lexicon, '庄子词库.csv') 50 | ``` 51 | 52 | 结果: 53 | ``` 54 | Word,Frequency,PMI,R_Entropy,L_Entropy 55 | 之,2999,80,7.944909328101839,8.279435615456894 56 | 而,2089,80,7.354575005231323,8.615211168836439 57 | 不,1941,80,7.244331150611089,6.362131306822925 58 | ... 59 | 天下,280,195.23602384978196,5.158574399464853,5.24731990592901 60 | 圣人,111,150.0620531154239,4.622606551534004,4.6853474419338585 61 | 万物,94,377.59805590304126,4.5959107835319895,4.538837960294887 62 | 天地,92,186.73504238078462,3.1492586603863617,4.894533538722486 63 | 孔子,80,176.2550051738876,4.284638190120882,2.4056390622295662 64 | 庄子,76,169.26227942514097,2.328252899085616,2.1920058354921066 65 | 仁义,58,882.3468468468468,3.501609497059026,4.96900162987599 66 | 老聃,45,2281.2228260869565,2.384853500510039,2.4331958387289765 67 | ... 68 | ``` 69 | 3. __分词__ 70 | 1. 
字符级隐马尔可夫模型分词,效果符合语感,建议使用,需加载语言模型 `jiayan.klm` 71 | ``` 72 | from jiayan import load_lm 73 | from jiayan import CharHMMTokenizer 74 | 75 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。' 76 | 77 | lm = load_lm('jiayan.klm') 78 | tokenizer = CharHMMTokenizer(lm) 79 | print(list(tokenizer.tokenize(text))) 80 | ``` 81 | 结果: 82 | `['是', '故', '内圣外王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']` 83 | 84 | 由于古汉语没有公开分词数据,无法做效果评估,但我们可以通过不同NLP工具对相同句子的处理结果来直观感受本项目的优势: 85 | 86 | 试比较 [LTP](https://github.com/HIT-SCIR/ltp) (3.4.0) 模型分词结果: 87 | `['是', '故内', '圣外王', '之', '道', ',', '暗而不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉以自为方', '。']` 88 | 89 | 再试比较 [HanLP](http://hanlp.com) 分词结果: 90 | `['是故', '内', '圣', '外', '王之道', ',', '暗', '而', '不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各为其所欲焉', '以', '自为', '方', '。']` 91 | 92 | 可见本工具对古汉语的分词效果明显优于通用汉语NLP工具。 93 | 94 | *更新:感谢HanLP的作者hankc告知——从2021年初,HanLP发布了深度学习驱动的2.x。由于使用了大规模语料上预训练的语言模型,这些语料已经包括了互联网上几乎所有的古汉语和现代汉语,所以在古汉语上的效果已经得到了质的提升。不仅仅是分词,就连词性标注和语义分析也有一定zero-shot learning的效果。相应的具体分词效果请参见该[Issue](https://github.com/jiaeyan/Jiayan/issues/15)。 95 | 96 | 2. 词级最大概率路径分词,基本以字为单位,颗粒度较粗 97 | ``` 98 | from jiayan import WordNgramTokenizer 99 | 100 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。' 101 | tokenizer = WordNgramTokenizer() 102 | print(list(tokenizer.tokenize(text))) 103 | ``` 104 | 结果: 105 | `['是', '故', '内', '圣', '外', '王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']` 106 | 107 | 4. __词性标注__ 108 | ``` 109 | from jiayan import CRFPOSTagger 110 | 111 | words = ['天下', '大乱', ',', '贤圣', '不', '明', ',', '道德', '不', '一', ',', '天下', '多', '得', '一', '察', '焉', '以', '自', '好', '。'] 112 | 113 | postagger = CRFPOSTagger() 114 | postagger.load('pos_model') 115 | print(postagger.postag(words)) 116 | ``` 117 | 结果: 118 | `['n', 'a', 'wp', 'n', 'd', 'a', 'wp', 'n', 'd', 'm', 'wp', 'n', 'a', 'u', 'm', 'v', 'r', 'p', 'r', 'a', 'wp']` 119 | 120 | 5. __断句__ 121 | ``` 122 | from jiayan import load_lm 123 | from jiayan import CRFSentencizer 124 | 125 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂' 126 | 127 | lm = load_lm('jiayan.klm') 128 | sentencizer = CRFSentencizer(lm) 129 | sentencizer.load('cut_model') 130 | print(sentencizer.sentencize(text)) 131 | ``` 132 | 结果: 133 | `['天下大乱', '贤圣不明', '道德不一', '天下多得一察焉以自好', '譬如耳目', '皆有所明', '不能相通', '犹百家众技也', '皆有所长', '时有所用', '虽然', '不该不遍', '一之士也', '判天地之美', '析万物之理', '察古人之全', '寡能备于天地之美', '称神之容', '是故内圣外王之道', '暗而不明', '郁而不发', '天下之人各为其所欲焉以自为方', '悲夫', '百家往而不反', '必不合矣', '后世之学者', '不幸不见天地之纯', '古之大体', '道术将为天下裂']` 134 | 135 | 6. 
__标点__ 136 | ``` 137 | from jiayan import load_lm 138 | from jiayan import CRFPunctuator 139 | 140 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂' 141 | 142 | lm = load_lm('jiayan.klm') 143 | punctuator = CRFPunctuator(lm, 'cut_model') 144 | punctuator.load('punc_model') 145 | print(punctuator.punctuate(text)) 146 | ``` 147 | 结果: 148 | `天下大乱,贤圣不明,道德不一,天下多得一察焉以自好,譬如耳目,皆有所明,不能相通,犹百家众技也,皆有所长,时有所用,虽然,不该不遍,一之士也,判天地之美,析万物之理,察古人之全,寡能备于天地之美,称神之容,是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方,悲夫!百家往而不反,必不合矣,后世之学者,不幸不见天地之纯,古之大体,道术将为天下裂。` 149 | 150 | 151 | ## 版本 152 | * v0.0.21 153 | * 将安装过程分为两步,确保得到最新的kenlm版本。 154 | * v0.0.2 155 | * 增加词性标注功能。 156 | * v0.0.1 157 | * 词库构建、自动分词、文言句读、标点功能开放。 158 | 159 | 160 | --- 161 | 162 | ## Introduction 163 | Jiayan, which means Chinese characters engraved on oracle bones, is a professional Python NLP tool for Classical Chinese. 164 | Prevailing Chinese NLP tools are mainly trained on modern Chinese data, which leads to bad performance on Classical Chinese (See [__Tokenizing__](#6)). The purpose of this project is to assist Classical Chinese information processing. 165 | Current version supports [lexicon construction](#6), [tokenizing](#7), [POS tagging](#8), [sentence segmentation](#9) and [automatic punctuation](#10), more features are in development. 166 | 167 | ## Features 168 | * [__Lexicon Construction__](#6) 169 | * With an unsupervised approach, construct lexicon with [Trie](https://en.wikipedia.org/wiki/Trie) -tree, [PMI](https://en.wikipedia.org/wiki/Pointwise_mutual_information) (_point-wise mutual information_) and neighboring [entropy](https://en.wikipedia.org/wiki/Entropy_\(information_theory\)) of left and right characters. 170 | * [__Tokenizing__](#7) 171 | * With an unsupervised, no dictionary approach to tokenize a Classical Chinese sentence with [N-gram](https://en.wikipedia.org/wiki/N-gram) language model and [HMM](https://en.wikipedia.org/wiki/Hidden_Markov_model) (_Hidden Markov Model_). 172 | * With the dictionary produced from lexicon construction, tokenize a Classical Chinese sentence with Directed Acyclic Word Graph, Max Probability Path and [Dynamic Programming](https://en.wikipedia.org/wiki/Dynamic_programming). 173 | * [__POS Tagging__](#8) 174 | * Word level sequence tagging with [CRF](https://en.wikipedia.org/wiki/Conditional_random_field) (_Conditional Random Field_). See POS tag categories [here](jiayan/postagger/README.md). 175 | * [__Sentence Segmentation__](#9) 176 | * Character level sequence tagging with CRF, introduces PMI and [T-test](https://en.wikipedia.org/wiki/Student%27s_t-test) values as features. 177 | * [__Punctuation__](#10) 178 | * Character level sequence tagging with layered CRFs, punctuate given Classical Chinese texts based on results of sentence segmentation. 179 | * Note: Due to data we used, we don't support traditional Chinese for now. If you have to process traditional one, please use [OpenCC](https://github.com/yichen0831/opencc-python) to convert traditional input to simplified, then you could convert the results back. 180 | 181 | ## Installation 182 | $ pip install jiayan 183 | $ pip install https://github.com/kpu/kenlm/archive/master.zip 184 | 185 | ## Usages 186 | The usage codes below are all from [examples.py](jiayan/examples.py). 187 | 1. 
Download the models and unzip them:[Google Drive](https://drive.google.com/open?id=1piZQBO8OXQ5Cpi17vAcZsrbJLPABnKzp) 188 | * jiayan.klm:the language model used for tokenizing and feature extraction for sentence segmentation and punctuation; 189 | * pos_model:the CRF model for POS tagging; 190 | * cut_model:the CRF model for sentence segmentation; 191 | * punc_model:the CRF model for punctuation; 192 | * 庄子.txt:the full text of 《Zhuangzi》 used for testing lexicon construction. 193 | 194 | 2. __Lexicon Construction__ 195 | ``` 196 | from jiayan import PMIEntropyLexiconConstructor 197 | 198 | constructor = PMIEntropyLexiconConstructor() 199 | lexicon = constructor.construct_lexicon('庄子.txt') 200 | constructor.save(lexicon, 'Zhuangzi_Lexicon.csv') 201 | ``` 202 | 203 | Result: 204 | ``` 205 | Word,Frequency,PMI,R_Entropy,L_Entropy 206 | 之,2999,80,7.944909328101839,8.279435615456894 207 | 而,2089,80,7.354575005231323,8.615211168836439 208 | 不,1941,80,7.244331150611089,6.362131306822925 209 | ... 210 | 天下,280,195.23602384978196,5.158574399464853,5.24731990592901 211 | 圣人,111,150.0620531154239,4.622606551534004,4.6853474419338585 212 | 万物,94,377.59805590304126,4.5959107835319895,4.538837960294887 213 | 天地,92,186.73504238078462,3.1492586603863617,4.894533538722486 214 | 孔子,80,176.2550051738876,4.284638190120882,2.4056390622295662 215 | 庄子,76,169.26227942514097,2.328252899085616,2.1920058354921066 216 | 仁义,58,882.3468468468468,3.501609497059026,4.96900162987599 217 | 老聃,45,2281.2228260869565,2.384853500510039,2.4331958387289765 218 | ... 219 | ``` 220 | 3. __Tokenizing__ 221 | 1. The character based HMM, recommended, needs language model: `jiayan.klm` 222 | ``` 223 | from jiayan import load_lm 224 | from jiayan import CharHMMTokenizer 225 | 226 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。' 227 | 228 | lm = load_lm('jiayan.klm') 229 | tokenizer = CharHMMTokenizer(lm) 230 | print(list(tokenizer.tokenize(text))) 231 | ``` 232 | Result: 233 | `['是', '故', '内圣外王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']` 234 | 235 | Since there is no public tokenizing data for Classical Chinese, it's hard to do performance evaluation directly; However, we can compare the results with other popular modern Chinese NLP tools to check the performance: 236 | 237 | Compare the tokenizing result of [LTP](https://github.com/HIT-SCIR/ltp) (3.4.0): 238 | `['是', '故内', '圣外王', '之', '道', ',', '暗而不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉以自为方', '。']` 239 | 240 | Also, compare the tokenizing result of [HanLP](http://hanlp.com): 241 | `['是故', '内', '圣', '外', '王之道', ',', '暗', '而', '不明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各为其所欲焉', '以', '自为', '方', '。']` 242 | 243 | It's apparent that Jiayan has much better tokenizing performance than general Chinese NLP tools. 244 | 245 | 2. Max probability path approach tokenizing based on words 246 | ``` 247 | from jiayan import WordNgramTokenizer 248 | 249 | text = '是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方。' 250 | tokenizer = WordNgramTokenizer() 251 | print(list(tokenizer.tokenize(text))) 252 | ``` 253 | Result: 254 | `['是', '故', '内', '圣', '外', '王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。']` 255 | 256 | 4. 
__POS Tagging__ 257 | ``` 258 | from jiayan import CRFPOSTagger 259 | 260 | words = ['天下', '大乱', ',', '贤圣', '不', '明', ',', '道德', '不', '一', ',', '天下', '多', '得', '一', '察', '焉', '以', '自', '好', '。'] 261 | 262 | postagger = CRFPOSTagger() 263 | postagger.load('pos_model') 264 | print(postagger.postag(words)) 265 | ``` 266 | Result: 267 | `['n', 'a', 'wp', 'n', 'd', 'a', 'wp', 'n', 'd', 'm', 'wp', 'n', 'a', 'u', 'm', 'v', 'r', 'p', 'r', 'a', 'wp']` 268 | 269 | 4. __Sentence Segmentation__ 270 | ``` 271 | from jiayan import load_lm 272 | from jiayan import CRFSentencizer 273 | 274 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂' 275 | 276 | lm = load_lm('jiayan.klm') 277 | sentencizer = CRFSentencizer(lm) 278 | sentencizer.load('cut_model') 279 | print(sentencizer.sentencize(text)) 280 | ``` 281 | Result: 282 | `['天下大乱', '贤圣不明', '道德不一', '天下多得一察焉以自好', '譬如耳目', '皆有所明', '不能相通', '犹百家众技也', '皆有所长', '时有所用', '虽然', '不该不遍', '一之士也', '判天地之美', '析万物之理', '察古人之全', '寡能备于天地之美', '称神之容', '是故内圣外王之道', '暗而不明', '郁而不发', '天下之人各为其所欲焉以自为方', '悲夫', '百家往而不反', '必不合矣', '后世之学者', '不幸不见天地之纯', '古之大体', '道术将为天下裂']` 283 | 284 | 5. __Punctuation__ 285 | ``` 286 | from jiayan import load_lm 287 | from jiayan import CRFPunctuator 288 | 289 | text = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂' 290 | 291 | lm = load_lm('jiayan.klm') 292 | punctuator = CRFPunctuator(lm, 'cut_model') 293 | punctuator.load('punc_model') 294 | print(punctuator.punctuate(text)) 295 | ``` 296 | Result: 297 | `天下大乱,贤圣不明,道德不一,天下多得一察焉以自好,譬如耳目,皆有所明,不能相通,犹百家众技也,皆有所长,时有所用,虽然,不该不遍,一之士也,判天地之美,析万物之理,察古人之全,寡能备于天地之美,称神之容,是故内圣外王之道,暗而不明,郁而不发,天下之人各为其所欲焉以自为方,悲夫!百家往而不反,必不合矣,后世之学者,不幸不见天地之纯,古之大体,道术将为天下裂。` 298 | 299 | 300 | ## Versions 301 | * v0.0.21 302 | * Divide the installation into two steps to ensure to get the latest version of kenlm. 303 | * v0.0.2 304 | * POS tagging feature is open. 305 | * v0.0.1 306 | * Add features of lexicon construction, tokenizing, sentence segmentation and automatic punctuation. 
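## Traditional Chinese Input

As noted in Features, the bundled models only cover simplified Chinese. Below is a minimal, illustrative sketch of the round trip with [OpenCC](https://github.com/yichen0831/opencc-python); the conversion profiles (`t2s`, `s2t`) are standard OpenCC names, but check the installed package for the exact set it ships, and note the jiayan calls in the middle are elided here.

```
from opencc import OpenCC

to_simp = OpenCC('t2s')   # traditional -> simplified, before calling jiayan
to_trad = OpenCC('s2t')   # simplified -> traditional, to restore the output

text = to_simp.convert('是故內聖外王之道，暗而不明，鬱而不發。')
# ... run jiayan tokenizing / tagging / punctuation on `text` here ...
restored = to_trad.convert(text)
```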
-------------------------------------------------------------------------------- /jiayan/__init__.py: -------------------------------------------------------------------------------- 1 | import kenlm 2 | 3 | from jiayan.lexicon.pmi_entropy_constructor import PMIEntropyLexiconConstructor 4 | from jiayan.tokenizer.hmm_tokenizer import CharHMMTokenizer 5 | from jiayan.tokenizer.ngram_tokenizer import WordNgramTokenizer 6 | from jiayan.sentencizer.crf_sentencizer import CRFSentencizer 7 | from jiayan.sentencizer.crf_punctuator import CRFPunctuator 8 | from jiayan.postagger.crf_pos_tagger import CRFPOSTagger 9 | 10 | 11 | def load_lm(lm): 12 | return kenlm.LanguageModel(lm) 13 | 14 | -------------------------------------------------------------------------------- /jiayan/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/__main__.py -------------------------------------------------------------------------------- /jiayan/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/data/__init__.py -------------------------------------------------------------------------------- /jiayan/examples.py: -------------------------------------------------------------------------------- 1 | from jiayan import PMIEntropyLexiconConstructor 2 | from jiayan import CharHMMTokenizer 3 | from jiayan import WordNgramTokenizer 4 | from jiayan import CRFSentencizer 5 | from jiayan import CRFPunctuator 6 | from jiayan import CRFPOSTagger 7 | from jiayan import load_lm 8 | 9 | 10 | def construct_lexicon(data_file: str, out_f: str): 11 | constructor = PMIEntropyLexiconConstructor() 12 | lexicon = constructor.construct_lexicon(data_file) 13 | constructor.save(lexicon, out_f) 14 | 15 | 16 | def hmm_tokenize(lm_path: str, text: str): 17 | lm = load_lm(lm_path) 18 | tokenizer = CharHMMTokenizer(lm) 19 | print(list(tokenizer.tokenize(text))) 20 | 21 | 22 | def ngram_tokenize(text: str): 23 | tokenizer = WordNgramTokenizer() 24 | print(list(tokenizer.tokenize(text))) 25 | 26 | 27 | def crf_pos_tag(pos_model, words): 28 | postagger = CRFPOSTagger() 29 | postagger.load(pos_model) 30 | print(postagger.postag(words)) 31 | 32 | 33 | def crf_sentencize(lm_path: str, cut_model, text): 34 | lm = load_lm(lm_path) 35 | sentencizer = CRFSentencizer(lm) 36 | sentencizer.load(cut_model) 37 | print(sentencizer.sentencize(text)) 38 | 39 | 40 | def crf_punctuate(lm_path, cut_model, punc_model, text): 41 | lm = load_lm(lm_path) 42 | punctuator = CRFPunctuator(lm, cut_model) 43 | punctuator.load(punc_model) 44 | print(punctuator.punctuate(text)) 45 | 46 | 47 | def train_sentencizer(lm_path, data_file, out_model): 48 | lm = load_lm(lm_path) 49 | sentencizer = CRFSentencizer(lm) 50 | print('Building data...') 51 | X, Y = sentencizer.build_data(data_file) 52 | train_x, train_y, test_x, test_y = sentencizer.split_data(X, Y) 53 | X[:] = [] 54 | Y[:] = [] 55 | print('Training...') 56 | sentencizer.train(train_x, train_y, out_model) 57 | sentencizer.eval(test_x, test_y, out_model) 58 | 59 | 60 | def train_punctuator(lm_path, data_file, cut_model, out_model): 61 | lm = load_lm(lm_path) 62 | punctuator = CRFPunctuator(lm, cut_model) 63 | print('Building data...') 64 | X, Y = punctuator.build_data(data_file) 65 | train_x, train_y, test_x, test_y = punctuator.split_data(X, Y) 66 | 
X[:] = [] 67 | Y[:] = [] 68 | print('Training...') 69 | punctuator.train(train_x, train_y, out_model) 70 | punctuator.eval(test_x, test_y, out_model) 71 | 72 | 73 | def train_postagger(data_file, pos_model): 74 | postagger = CRFPOSTagger() 75 | print('Building data...') 76 | X, Y = postagger.build_data(data_file) 77 | train_x, train_y, test_x, test_y = postagger.split_data(X, Y) 78 | X[:] = [] 79 | Y[:] = [] 80 | print('Training...') 81 | postagger.train(train_x, train_y, pos_model) 82 | postagger.eval(test_x, test_y, pos_model) 83 | 84 | if __name__ == '__main__': 85 | test_f = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂' 86 | test_f1 = '圣人之治民也先治者强先战者胜夫国事务先而一民心专举公而私不从赏告而奸不生明法而治不烦能用四者强不能用四者弱夫国之所以强者政也主之所以尊者权也故明君有权有政乱君亦有权有政积而不同其所以立异也故明君操权而上重一政而国治故法者王之本也刑者爱之自也' 87 | test_f2 = '公曰善吾不食谄人以言也以鱼五十乘赐弦章章归鱼车塞途抚其御之手曰昔者晏子辞党当作赏以正君故过失不掩之今诸臣谀以干利吾若受鱼是反晏子之义而顺谄谀之欲固辞鱼不受君子曰弦章之廉晏子之遗行也' 88 | test_f3 = '景公游于菑闻晏子死公乘侈舆服繁驵驱之而因为迟下车而趋知不若车之速则又乘比至于国者四下而趋行哭而往伏尸而号' 89 | test_f4 = '有足游浮云背凌苍天尾偃天间跃啄北海颈尾咳于天地乎然而漻漻不知六翮之所在' 90 | test_f5 = '谁知林栖者闻风坐相悦草木有本心何求美人折' 91 | test_f6 = '能说诸心能研诸侯之虑定天下之吉凶成天下之亹亹者是故变化云为吉事有祥象事知器占事知来天地设位圣人成能人谋鬼谋百姓与能八卦以象告爻彖以情言刚柔杂居而吉凶可见矣' 92 | test_f7 = '至哉坤元万物资生乃顺承天坤厚载物德合无疆含弘光大品物咸亨牝马地类行地无疆柔顺利贞君子攸行先迷失道后顺得常' 93 | test_f8 = '天下熙熙一盈一虚一治一乱所以然者何也其君贤不肖不等乎其天时变化自然乎' 94 | test_f9 = '先生之言悖龙之所以为名者乃以白马之论尔今使龙去之则无以教焉且欲师之者以智与学不如也今使龙去之此先教而后师之也先教而后师之者悖且白马非马乃仲尼之所取龙闻楚王张繁弱之弓载忘归之矢以射蛟兕于云梦之圃而丧其弓左右请求之' 95 | test_f10 = '伪学伪才揣摩以逢主意从前洋务穆彰阿倾排异己殊堪痛恨若一旦置之重法实有不忍着从宽革职永不叙用于是主战主和之功罪是非千秋论定而枋政之臣欲以掩天下后世之耳目不可得矣' 96 | test_f11 = '传字世文至圣四十七代孙建炎初随孔端友南渡遂流寓衢州' 97 | test_f12 = '若乃厯代褒崇之典累朝班赉之恩宠数便蕃固可以枚陈而列数以至验祖壁之遗书访阙里之陈迹荒墟废址沦没于春芜秋草之中者阙有之故老世传之将使闻见之所未尝者如接于耳目之近' 98 | test_f13 = '颂曰元始二妃帝尧之女嫔列有虞承舜于下以尊事卑终能劳苦瞽叟和宁卒享福祜' 99 | test_f14 = '弃母姜嫄者邰侯之女也当尧之时行见巨人迹好而履之归而有娠浸以益大心怪恶之卜筮禋祀以求无子终生子' 100 | test_f15 = '颂曰契母简狄敦仁励翼吞卵产子遂自修饰教以事理推恩有德契为帝辅盖母有力' 101 | test_f16 = '堂之下则有大冶长老桃花茶巢元脩菜何氏丛橘种秔稌莳枣栗有松期为可斫种麦以为奇事作陂塘植黄桑皆足以供先生之岁用而为雪堂之胜景云耳' 102 | test_f17 = '占者乡塾里闾亦各有史所以纪善恶而垂劝戒后世惟天于有太史而庶民之有德业者非附贤士大夫为之纪其闻者蔑焉' 103 | test_f18 = '东家杂记孔子四十七代孙孔传所述杂记曰周灵王二十一年已酉岁即鲁襄公二十二年也当襄公二十二年冬十月庚子日先圣生又曰周敬王四十一年辛酉岁即鲁哀公十六年也当哀公十六年夏四月乙丑日先圣薨先儒以为已丑者误也' 104 | test_f19 = '周灵王二十一年已酉岁即鲁襄公二十二年也当襄公二十二年冬十月庚子日先圣生是夕有二龙绕室五老降庭五老者五星之精也又颜氏之房闻奏钧天之乐空中有声云天感生圣子故降以和乐笙镛之音' 105 | test_f20 = '河山大地未尝可以法空也佛必欲空之而屹然沛然卒不能空兵刑灾祸未尝可以度也佛必欲度之而伏尸百万' 106 | test_f21 = '朱子曰心之虚灵知觉一而已矣而以为有心人道心之异者以其或生于形气之私或原于性命之正而所以为知觉者不同是以或危殆而不安或微妙而难见尔' 107 | test_f22 = '真西山读书记曰此武王伐纣之事诗意虽主伐纣而言然学者平居讽咏其辞凛然如上帝之实临其上则所以为闲邪存诚之助顾不大哉' 108 | test_f23 = '述叙既讫乃为主客发其例曰客问主人曰伪经何以名之新学也汉艺文志号为古经五经异义称为古说诸书所述古文尤繁' 109 | test_f24 = '取胡氏传一句两句为旨而以经事之相类者合以为题传为主经为客有以彼经证此经之题有用彼经而隐此经之题于是此一经者为射覆之书而春秋亡矣' 110 | test_f25 = '谁非黄帝尧舜之子孙而至于今日其不幸而为臧获为婢妾为舆台皂隶窘穷迫逼无可奈何非其数十代以前即自臧获婢妾舆台皂隶来也一旦奋发有为精勤不倦有及身而富贵者矣及其子孙而富贵者矣' 111 | test_f26 = '人器有德人和伦常社器有德族谐国安灵器有德则天伦如仪器无德人怨族乱国沸天地失道也' 112 | test_f27 = '先圣没逮今一千五百余年传世五十或问其姓则内求而不得或审其家则舌举而不下为之后者得无愧乎' 113 | test_f28 = '高辛父曰蟜极蟜极父曰玄嚣玄嚣父曰黄帝' 114 | test_f29 = '以为锦绣文采靡曼之衣' 115 | test_f30 = '通玄理而不通禅必受固执之病通禅理而不通儒多成狂慧之流求其禅儒皆通而又能贯之以道不但今鲜其人即古之紫衣黄冠下除紫阳莲池外恒不多觏' 116 | tests = [ 117 | test_f, test_f1, test_f2, test_f3, test_f4, test_f5, test_f6, test_f7, test_f8, 118 | test_f9, test_f10, 119 | test_f11, test_f12, test_f13, test_f14, test_f15, test_f16, test_f17, test_f18, test_f19, test_f20, 120 | test_f21, test_f22, 121 | test_f23, test_f24, test_f25, test_f26, test_f27, test_f28, test_f29, test_f30 122 | ] 123 | 124 | 125 | 126 | 127 | 128 | # train_sentencizer('data/jiayan.klm', '/Users/jiaeyan/Desktop/chn_data/all.txt', 'cut_model_60') 129 | # train_punctuator('data/jiayan.klm', 
'/Users/jiaeyan/Desktop/chn_data/all.txt', 'cut_model', 'punc_model') 130 | # train_postagger('/Users/jiaeyan/Desktop/chn_data/pos_all.txt', 'pos_model') 131 | 132 | # lm = load_lm('data/jiayan.klm') 133 | # 134 | # sentcizer = CRFSentencizer(lm) 135 | # sentcizer.load("/Users/jiaeyan/Desktop/cut_model_70") 136 | # for test in tests: 137 | # print(sentcizer.sentencize(test)) 138 | 139 | 140 | # punctuator = CRFPunctuator(lm, '/Users/jiaeyan/Desktop/cut_model') 141 | # punctuator.load('/Users/jiaeyan/Desktop/punc_model') 142 | # for test in tests: 143 | # print(punctuator.punctuate(test)) 144 | 145 | # tokenizer = CharHMMTokenizer(lm) 146 | # for test in tests: 147 | # print(list(tokenizer.tokenize(test))) 148 | 149 | postagger = CRFPOSTagger() 150 | postagger.load('/Users/jiaeyan/Desktop/pos_model_50') 151 | # words = ['是', '故', '内圣外王', '之', '道', ',', '暗', '而', '不', '明', ',', '郁', '而', '不', '发', ',', '天下', '之', '人', '各', '为', '其', '所', '欲', '焉', '以', '自', '为', '方', '。'] 152 | words = ['天下', '大乱', ',', '贤圣', '不', '明', ',', '道德', '不', '一', ',', '天下', '多', '得', '一', '察', '焉', '以', '自', '好', '。'] 153 | print(postagger.postag(words)) 154 | # for test in tests: 155 | # words = list(tokenizer.tokenize(test)) 156 | # print(words) 157 | # print(postagger.postag(words)) 158 | 159 | 160 | 161 | 162 | 163 | # test = '天下大乱贤圣不明道德不一天下多得一察焉以自好譬如耳目皆有所明不能相通犹百家众技也皆有所长时有所用虽然不该不遍一之士也' \ 164 | # '判天地之美析万物之理察古人之全寡能备于天地之美称神之容是故内圣外王之道暗而不明郁而不发天下之人各为其所欲焉以自为方' \ 165 | # '悲夫百家往而不反必不合矣后世之学者不幸不见天地之纯古之大体道术将为天下裂' 166 | # 167 | # lm_path = 'data/jiayan.klm' 168 | 169 | # print('Constructing lexicon...') 170 | # construct_lexicon('data/庄子.txt', '庄子1.csv') 171 | # 172 | # print('\nTokenizing test text with HMM...') 173 | # hmm_tokenize(lm_path, test) 174 | # 175 | # print('\nTokenizing test text with N-grams...') 176 | # for test in tests: 177 | # ngram_tokenize(test) 178 | # 179 | # print('\nSentencizing test text with CRF...') 180 | # crf_sentencize(lm_path, 'cut_model', test_f1) 181 | 182 | # print('\nPunctuating test text with CRF...') 183 | # crf_punctuate(lm_path, 'cut_model_60', 'punc_model', test_f1) 184 | # crf_punctuate(lm_path, 'cut_model_60', 'punc_model', test) 185 | -------------------------------------------------------------------------------- /jiayan/globals.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | re_zh_include = re.compile("([\u4E00-\u9FA5]+)", re.U) 5 | re_zh_exclude = re.compile("[\u4E00-\u9FA5]+", re.U) 6 | 7 | re_puncs_include = re.compile(r'([。?!,、:;])') 8 | re_puncs_exclude = re.compile(r'[。?!,、:;]') 9 | 10 | re_invalid_chars = re.compile(r'[^\u4E00-\u9FA5。?!,、:;]+', re.U) 11 | 12 | stopchars = {'之', '兹', '此', '彼', 13 | '谁', '孰', '何', '奚', '曷', '焉', 14 | '或', '有', '某', 15 | '莫', '弗', '毋', '勿', '不', '得', 16 | '亦', '乃', 17 | '於', '于', '乎', '在', '而', '以', '为', 18 | '其', '唯', '惟', '焉', '者', '所', 19 | '也', '矣', '已', 20 | '欤', '耶', '哉', '兮', 21 | '必', '又', '每', '皆', '仅', '只', 22 | '甚', '颇', '岂', 23 | '曰'} 24 | 25 | 26 | def get_char_pos_dict(): 27 | with open('data/char_pos_dict.json', 'r') as f: 28 | char_pos_dict = json.load(f) 29 | return char_pos_dict 30 | 31 | 32 | re_num = re.compile(r'[第一二三四五六七八九十百千万]{2,}') 33 | re_calender = re.compile(r'[甲乙丙丁戊己庚辛壬癸子丑寅卯辰巳午未申酉戌亥]{2,}') 34 | 35 | 36 | """ 37 | http://www.ziyexing.com/files-5/guhanyu/guhanyu_index.html 38 | """ 39 | pron_single = {'我', '吾', '你', '而', '乃', '若', '其', '之', '他'} 40 | pron_plural = {'吾侪', '吾曹', '吾属', '我属', '我辈', '若曹', '而属', '尔辈', '公等', '卿等'} 41 | 
pron_demonstrate = {'之', '斯', '兹', '此', '是', '彼', '夫', '伊'} 42 | pron_interrogative = {'谁', '孰', '何', '奚', '胡', '曷', '恶', '焉', '安', '几'} 43 | pron_indefinite = {'或', '有', '某'} 44 | pron_negative = {'莫', '罔', '无', '靡', '蔑', '末'} 45 | auxiliary_verb = {'克', '能', '堪', '可', '可以', '可得', '得', '足', '足以', 46 | '欲', '肯', '将', '宁', '敢', '忍', '愿', 47 | '当', '如', '宜', '任', '合', '应', 48 | '见', '被', '为'} 49 | preposition = {'於', '于', '乎', '以', '在', '即', '及', '自', '从', '由', 50 | '当', '到', '迨', '逮', '至', '比', '竟', '向', '临', '先', 51 | '因', '用', '缘', '为', '乎', '从', '与'} 52 | conjunction = {'夫', '若', '如', '且', '至', '若夫', '且夫', '至于', '至如', 53 | '既', '终', '已', 54 | '如', '苟', '使', '令', '即', '抑', '向', '诚', '果', '设', '假', 55 | '若苟', '向使', '若使', '如若', '如使', '如令', '如果', '苟使', '假设', 56 | '假使', '假如', '假若', '假令', '设使', '设若', '设令', '倘若', '倘使', '诚使', '诚令', 57 | '虽', '则', '且', '而', '尚', '犹', '且犹', '尚犹', '纵', '虽则', '虽使', '与其', '与', '以', '为', '由', 58 | '与', '及', '暨', '之以', '而', '且又', '亦', '而且', '而又', 59 | '况', '而况', '况于', '何况', 60 | '故', '乃', '是以', '是用', '是故', '以故', '因是', 61 | '然则', '然而', '但'} 62 | particle = {'其', '之', '斯', '思', # rhyme 63 | '唯', '惟', # syntax 64 | '有', '畴', '丕', '薄', '言', # word prefix 65 | '然', '焉', '尔', '而', '斯', '若', '如', '乎', # word suffix 66 | '者', '攸', '所', # phrase prefix & suffix 67 | '载', '有', '式', '于', 68 | } 69 | modal_particle = {'也', '矣', '已', '而已', '耳', '尔', '而', '焉', '然', '夫', '者', '云', # affirmative 70 | '与', '欤', '邪', '耶', '乎', '为', '则', # question 71 | '哉', '乎', '夫', '为', '兮', '邪', # exclamation 72 | } 73 | -------------------------------------------------------------------------------- /jiayan/lexicon/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/lexicon/__init__.py -------------------------------------------------------------------------------- /jiayan/lexicon/pmi_entropy_constructor.py: -------------------------------------------------------------------------------- 1 | from math import log2 2 | import time 3 | from jiayan.globals import stopchars 4 | from jiayan.utils import text_iterator 5 | 6 | """ 7 | A precise way to discover new words in a sentence corpus, considering both PMI and entropy. 8 | 9 | 1. PMI is used to evaluate how tightly the segments of a word bind together; 10 | 2. Right and left entropy are used to evaluate how independently the word is used in various contexts.
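Concretely, as implemented in calculate_pmi / calculate_entropy below: for a candidate word w, PMI(w) = min over all split points of freq(w) * total / (freq(prefix) * freq(suffix)), i.e. the strength of w's weakest internal bond, and the right (left) entropy of w is the Shannon entropy -sum(p(x) * log2(p(x))) of the frequency distribution p(x) of the characters x observed immediately after (before) w.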
11 | """ 12 | 13 | 14 | class Trie: 15 | 16 | class TrieNode: 17 | def __init__(self): 18 | self.freq = 0 19 | self.pmi = 0 20 | self.r_entropy = 0 21 | self.l_entropy = 0 22 | self.children = {} 23 | 24 | def __init__(self): 25 | self.root = self.TrieNode() 26 | 27 | def add(self, word): 28 | node = self.root 29 | for char in word: 30 | if char not in node.children: 31 | node.children[char] = self.TrieNode() 32 | node = node.children[char] 33 | node.freq += 1 34 | 35 | def find(self, word): 36 | node = self.root 37 | for char in word: 38 | if char not in node.children: 39 | return None 40 | node = node.children[char] 41 | return node 42 | 43 | 44 | class PMIEntropyLexiconConstructor: 45 | 46 | MIN_WORD_LEN = 1 47 | MAX_WORD_LEN = 4 48 | 49 | # TODO: Different PMI and Entropy thresholds for different lengths 50 | MIN_WORD_FREQ = 10 51 | MIN_PMI = 80 52 | MIN_ENTROPY = 2 53 | 54 | def __init__(self): 55 | self.trie = Trie() 56 | self.r_trie = Trie() 57 | self.total = 0 58 | 59 | def construct_lexicon(self, data_file): 60 | self.build_trie_trees(data_file) 61 | self.compute() 62 | lexicon = self.filter() 63 | return lexicon 64 | 65 | def build_trie_trees(self, data_file): 66 | """ Counts frequency of segments of data, also records their left and right char sets. 67 | """ 68 | max_seg_len = self.MAX_WORD_LEN + 1 69 | 70 | start = time.time() 71 | for text in text_iterator(data_file): 72 | length = len(text) 73 | for i in range(length): 74 | for j in range(1, min(length - i + 1, max_seg_len + 1)): 75 | seg = text[i: i + j] 76 | self.trie.add(seg) 77 | 78 | r_seg = seg[::-1] 79 | self.r_trie.add(r_seg) 80 | 81 | self.total += 1 82 | end = time.time() 83 | 84 | print('Trie building time:', end - start) 85 | 86 | def compute(self): 87 | start = time.time() 88 | node = self.trie.root 89 | word = '' 90 | self.compute_help(node, word) 91 | end = time.time() 92 | print('Computation time:', end - start) 93 | 94 | def compute_help(self, node, word): 95 | if node.children: 96 | for char, child in node.children.items(): 97 | word += char 98 | if len(word) <= self.MAX_WORD_LEN: 99 | self.calculate_pmi(child, word) 100 | self.calculate_rl_entropy(child, word) 101 | self.compute_help(child, word) 102 | word = word[:-1] 103 | 104 | def calculate_pmi(self, node, word): 105 | length = len(word) 106 | if length == 1: 107 | node.pmi = self.MIN_PMI 108 | else: 109 | constant = node.freq * self.total 110 | mutuals = (constant / (self.trie.find(word[:i + 1]).freq * self.trie.find(word[i + 1:]).freq) 111 | for i in range(length - 1)) 112 | node.pmi = min(mutuals) 113 | 114 | def calculate_rl_entropy(self, node, word): 115 | # right entropy 116 | if node.children: 117 | node.r_entropy = self.calculate_entropy(node) 118 | 119 | # left entropy 120 | r_word = word[::-1] 121 | r_node = self.r_trie.find(r_word) 122 | if r_node.children: 123 | node.l_entropy = self.calculate_entropy(r_node) 124 | 125 | def calculate_entropy(self, node): 126 | freqs = [child.freq for child in node.children.values()] 127 | sum_freqs = sum(freqs) 128 | entropy = sum([- (x / sum_freqs) * log2(x / sum_freqs) for x in freqs]) 129 | return entropy 130 | 131 | def filter(self): 132 | """ Filters the PMI and entropy calculation result dict, removes words that do not 133 | reach the thresholds. 134 | TODO: test use max of r/l entropy to filter. 
135 | """ 136 | start = time.time() 137 | node = self.trie.root 138 | word = '' 139 | word_dict = {} 140 | self.filter_help(node, word, word_dict) 141 | end = time.time() 142 | print('Word filtering:', end - start) 143 | return word_dict 144 | 145 | def filter_help(self, node, word, word_dict): 146 | if node.children: 147 | for char, child in node.children.items(): 148 | word += char 149 | if self.valid_word(child, word): 150 | word_dict[word] = [child.freq, child.pmi, child.r_entropy, child.l_entropy] 151 | self.filter_help(child, word, word_dict) 152 | word = word[:-1] 153 | 154 | def valid_word(self, node, word): 155 | if self.MIN_WORD_LEN <= len(word) <= self.MAX_WORD_LEN \ 156 | and node.freq >= self.MIN_WORD_FREQ \ 157 | and node.pmi >= self.MIN_PMI \ 158 | and node.r_entropy >= self.MIN_ENTROPY \ 159 | and node.l_entropy >= self.MIN_ENTROPY \ 160 | and not self.has_stopword(word): 161 | return True 162 | return False 163 | 164 | def has_stopword(self, word): 165 | """ Checks if a word contains stopwords, which are not able to construct words. 166 | """ 167 | if len(word) == 1: 168 | return False 169 | for char in word: 170 | if char in stopchars: 171 | return True 172 | return False 173 | 174 | @staticmethod 175 | def save(lexicon, out_f): 176 | """ Saves the word detection result in a csv file. 177 | """ 178 | words = sorted(lexicon, key=lambda x: (len(x), -lexicon[x][0], -lexicon[x][1], -lexicon[x][2], -lexicon[x][3])) 179 | with open(out_f, 'w') as f: 180 | f.write('Word,Frequency,PMI,R_Entropy,L_Entropy\n') 181 | for word in words: 182 | f.write('{},{},{},{},{}\n'.format( 183 | word, lexicon[word][0], lexicon[word][1], 184 | lexicon[word][2], lexicon[word][3])) 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /jiayan/linguistic_unit.py: -------------------------------------------------------------------------------- 1 | class Paragraph: 2 | def __init__(self): 3 | pass 4 | 5 | 6 | class Sentence: 7 | def __init__(self): 8 | pass 9 | 10 | 11 | class Word: 12 | def __init__(self): 13 | pass 14 | 15 | 16 | class Character: 17 | def __init__(self): 18 | pass 19 | -------------------------------------------------------------------------------- /jiayan/postagger/README.md: -------------------------------------------------------------------------------- 1 | Tag | Description | Example 2 | --- | --- | --- 3 | a | adjective | 幽明 4 | b | other noun-modifier | 男,女 5 | c | conjunction | 与,而 6 | d | adverb | 皆 7 | e | exclamation | 呜呼 8 | g | morpheme | 甥 9 | h | prefix | 非 10 | i | idiom | 发愤忘食 11 | j | abbreviation | 五帝 12 | k | suffix | 者 13 | m | number | 一,百 14 | n | general noun | 鬼神,山川 15 | nd | direction noun | 东,西,南,北 16 | nh | person name | 轩辕 17 | ni | organization name | 辽队 18 | nl | location noun | 城北 19 | ns | geographical name | 襄平县 20 | nt | temporal noun | 春,夏,秋,冬 21 | nz | other proper noun | 山海经 22 | o | onomatopoeia | 呜呜 23 | p | preposition | 以,为 24 | q | quantity | 年,岁 25 | r | pronoun | 其,斯 26 | u | auxiliary | 之,所 27 | v | verb | 赐 28 | wp | punctuation | ,。! 
29 | ws | foreign words | CPU 30 | x | non-lexeme | 萄, 翱 31 | z | descriptive words | 默然,区区 -------------------------------------------------------------------------------- /jiayan/postagger/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/postagger/__init__.py -------------------------------------------------------------------------------- /jiayan/postagger/crf_pos_tagger.py: -------------------------------------------------------------------------------- 1 | import random 2 | from itertools import chain 3 | from string import ascii_uppercase 4 | 5 | import pycrfsuite 6 | from sklearn.metrics import classification_report 7 | from sklearn.preprocessing import LabelBinarizer 8 | 9 | from jiayan.globals import re_zh_exclude 10 | 11 | 12 | class CRFPOSTagger: 13 | 14 | def __init__(self): 15 | self.tagger = None 16 | 17 | def load(self, crf_model): 18 | self.tagger = pycrfsuite.Tagger() 19 | self.tagger.open(crf_model) 20 | 21 | def sent2features(self, sent): 22 | length = len(sent) 23 | feat_list = [] 24 | for i, word in enumerate(sent): 25 | # pattern = self.get_word_pattern(word) 26 | # is_zh = '1' if re_zh_exclude.match(word) else '0' 27 | features = [ 28 | 'bias', 29 | '0:word=' + word, 30 | # '0:pattern=' + pattern, 31 | # '0:type=' + is_zh, 32 | ] 33 | 34 | if i > 0: 35 | features.extend([ 36 | '-1:word=' + sent[i - 1], 37 | '-10:words=' + '|'.join(sent[i - 1: i + 1]), 38 | ]) 39 | else: 40 | features.append('BOS') 41 | 42 | if i > 1: 43 | features.extend([ 44 | '-2:word=' + sent[i - 2], 45 | '-21:words=' + '|'.join(sent[i - 2: i]), 46 | '-210:words=' + '|'.join(sent[i - 2: i + 1]), 47 | ]) 48 | 49 | if i < length - 1: 50 | features.extend([ 51 | '+1:word=' + sent[i + 1], 52 | '+01:words=' + '|'.join(sent[i: i + 2]), 53 | ]) 54 | else: 55 | features.append('EOS') 56 | 57 | if i < length - 2: 58 | features.extend([ 59 | '+2:word=' + sent[i + 2], 60 | '+12:words=' + '|'.join(sent[i + 1: i + 3]), 61 | '+012:chars=' + '|'.join(sent[i: i + 3]), 62 | ]) 63 | 64 | if 0 < i < length - 1: 65 | features.extend([ 66 | '-11:words=' + sent[i - 1] + '|' + sent[i + 1], 67 | '-101:words=' + '|'.join(sent[i - 1: i + 2]), 68 | ]) 69 | 70 | feat_list.append(features) 71 | 72 | return feat_list 73 | 74 | @staticmethod 75 | def get_word_pattern(word): 76 | pattern = '' 77 | char = '' 78 | i = -1 79 | for ch in word: 80 | if ch != char: 81 | i += 1 82 | pattern += ascii_uppercase[i] 83 | char = ch 84 | return pattern 85 | 86 | def sent2tags(self, sent): 87 | pass 88 | 89 | def train(self, train_x, train_y, out_model): 90 | trainer = pycrfsuite.Trainer(verbose=False) 91 | for x, y in zip(train_x, train_y): 92 | if x and y: 93 | trainer.append(x, y) 94 | 95 | trainer.set_params({ 96 | 'c1': 1.0, # coefficient for L1 penalty 97 | 'c2': 1e-3, # coefficient for L2 penalty 98 | 'max_iterations': 50, # stop earlier 99 | 'feature.possible_transitions': True # include transitions that are possible, but not observed 100 | }) 101 | 102 | trainer.train(out_model) 103 | print(trainer.logparser.last_iteration) 104 | 105 | def build_data(self, data_file): 106 | X = [] 107 | Y = [] 108 | 109 | with open(data_file, 'r') as f: 110 | for line in f: 111 | line = line.strip() 112 | if line: 113 | x, y = line.split('\t') 114 | feat_list = self.sent2features(x.split()) 115 | tag_list = y.split() 116 | X.append(feat_list) 117 | Y.append(tag_list) 118 | 119 | return X, Y 120 | 121 | def 
split_data(self, X, Y): 122 | random.seed(42) 123 | rd_num = random.random() 124 | 125 | def _rd(): 126 | return rd_num 127 | 128 | random.shuffle(X, _rd) 129 | random.shuffle(Y, _rd) 130 | 131 | ratio = round(len(X) * 0.9) 132 | return X[:ratio], Y[:ratio], X[ratio:], Y[ratio:] 133 | 134 | def eval(self, test_x, test_y, crf_model): 135 | tagger = pycrfsuite.Tagger() 136 | tagger.open(crf_model) 137 | 138 | y_pred = [] 139 | for feat_list in test_x: 140 | preds = tagger.tag(feat_list) 141 | y_pred.append(preds) 142 | 143 | lb = LabelBinarizer() 144 | y_true_all = lb.fit_transform(list(chain.from_iterable(test_y))) 145 | y_pred_all = lb.transform(list(chain.from_iterable(y_pred))) 146 | 147 | tagset = sorted(set(lb.classes_)) 148 | class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)} 149 | 150 | print(classification_report( 151 | y_true_all, 152 | y_pred_all, 153 | labels=[class_indices[cls] for cls in tagset], 154 | target_names=tagset, 155 | digits=5 156 | )) 157 | 158 | def postag(self, sent): 159 | feat_list = self.sent2features(sent) 160 | tags = self.tagger.tag(feat_list) 161 | return tags 162 | 163 | 164 | -------------------------------------------------------------------------------- /jiayan/sentencizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/sentencizer/__init__.py -------------------------------------------------------------------------------- /jiayan/sentencizer/crf_punctuator.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | import pycrfsuite 4 | from sklearn.metrics import classification_report 5 | from sklearn.preprocessing import LabelBinarizer 6 | 7 | from jiayan.globals import re_puncs_include, re_zh_exclude 8 | from jiayan.utils import text_iterator 9 | from jiayan.sentencizer.crf_sent_tagger import CRFSentTagger 10 | from jiayan.sentencizer.crf_sentencizer import CRFSentencizer 11 | 12 | 13 | class CRFPunctuator(CRFSentTagger): 14 | 15 | def __init__(self, lm, cut_model): 16 | super(CRFPunctuator, self).__init__(lm) 17 | self.sentencizer = CRFSentencizer(lm) 18 | self.sentencizer.load(cut_model) 19 | 20 | def sent2features(self, sent: str, tags=None): 21 | length = len(sent) 22 | feat_list = [] 23 | for i, char in enumerate(sent): 24 | features = [ 25 | 'bias', 26 | '0:char=' + char, 27 | '0:tag=' + tags[i], 28 | ] 29 | 30 | if i > 0: 31 | features.extend([ 32 | '-1:char=' + sent[i - 1], 33 | '-10:chars=' + sent[i - 1: i + 1], 34 | # '-10:pmi=' + self.get_pmi(sent[i - 1: i + 1]), 35 | 36 | # '-1:tag=' + tags[i - 1], 37 | # '-10:tags=' + tags[i - 1: i + 1], 38 | ]) 39 | else: 40 | features.append('BOS') 41 | 42 | if i > 1: 43 | features.extend([ 44 | '-2:char=' + sent[i - 2], 45 | '-21:chars=' + sent[i - 2: i], 46 | '-210:chars=' + sent[i - 2: i + 1], 47 | 48 | # '-21:tags=' + tags[i - 2: i], 49 | # '-210:tags=' + tags[i - 2: i + 1], 50 | ]) 51 | 52 | if i > 2: 53 | features.extend([ 54 | '-3:char=' + sent[i - 3], 55 | '-321:chars=' + sent[i - 3: i], 56 | '-3210:chars=' + sent[i - 3: i + 1], 57 | ]) 58 | 59 | if i < length - 1: 60 | features.extend([ 61 | '+1:char=' + sent[i + 1], 62 | '+01:chars=' + sent[i: i + 2], 63 | # '+01:pmi=' + self.get_pmi(sent[i: i + 2]), 64 | 65 | # '+1:tag=' + tags[i + 1], 66 | # '+01:tags=' + tags[i: i + 2], 67 | ]) 68 | else: 69 | features.append('EOS') 70 | 71 | if i < length - 2: 72 | features.extend([ 73 
| '+2:char=' + sent[i + 2], 74 | '+12:chars=' + sent[i + 1: i + 3], 75 | '+012:chars=' + sent[i: i + 3], 76 | 77 | # '+12:tags=' + tags[i + 1: i + 3], 78 | # '+012:tags=' + tags[i: i + 3], 79 | ]) 80 | 81 | if i < length - 3: 82 | features.extend([ 83 | '+3:char=' + sent[i + 3], 84 | '+123:chars=' + sent[i + 1: i + 4], 85 | '+0123:chars=' + sent[i: i + 4], 86 | ]) 87 | 88 | if 0 < i < length - 1: 89 | features.extend([ 90 | '-11:chars=' + sent[i - 1] + sent[i + 1], 91 | '-101:chars=' + sent[i - 1: i + 2], 92 | '-101:ttest=' + self.get_ttest(sent[i - 1: i + 2]), 93 | ]) 94 | 95 | feat_list.append(features) 96 | 97 | return feat_list 98 | 99 | def punctuate(self, text): 100 | cut_feat_list = self.sentencizer.sent2features(text) 101 | cut_tags = self.sentencizer.tagger.tag(cut_feat_list) 102 | punc_feat_list = self.sent2features(text, cut_tags) 103 | punc_tags = self.tagger.tag(punc_feat_list) 104 | 105 | sents = [] 106 | sent = '' 107 | for i, tag in enumerate(punc_tags): 108 | if tag in self.tag2punc: 109 | if sent: 110 | sents.append(sent) 111 | sent = '' 112 | sents.append(text[i]) 113 | sents.append(self.tag2punc[tag]) 114 | elif tag == 'B': 115 | if sent: 116 | sents.append(sent) 117 | sent = text[i] 118 | elif tag in {'M', 'E3', 'E2'}: 119 | sent += text[i] 120 | if sent: 121 | sents.append(sent) 122 | 123 | return ''.join(sents) 124 | 125 | def build_data(self, data_file): 126 | X = [] 127 | Y = [] 128 | for line in text_iterator(data_file, keep_punc=True): 129 | texts = [text for text in re_puncs_include.split(line) if text] 130 | texts = self.process_texts(texts) 131 | 132 | feat_list = [] 133 | punc_tags = [] 134 | for i in range(len(texts) - 1): 135 | if re_zh_exclude.match(texts[i]) and texts[i + 1] in self.punc2tag: 136 | cut_tags = self.sent2tags(texts[i]) 137 | feat_list.extend(self.sent2features(texts[i], cut_tags)) 138 | punc_tags.extend(self.sent2tags(texts[i], texts[i + 1])) 139 | 140 | X.append(feat_list) 141 | Y.append(punc_tags) 142 | 143 | return X, Y 144 | 145 | def process_texts(self, texts): 146 | while texts and texts[0] in self.punc2tag: 147 | texts = texts[1:] 148 | 149 | if len(texts) // 2 != 0: 150 | texts.append('。') 151 | 152 | return texts 153 | 154 | def eval(self, test_x, test_y, crf_model): 155 | tagger = pycrfsuite.Tagger() 156 | tagger.open(crf_model) 157 | 158 | pred_y = [] 159 | for feat_list in test_x: 160 | preds = tagger.tag(feat_list) 161 | pred_y.append(preds) 162 | 163 | y_trues = [tag for tag in list(chain.from_iterable(test_y)) if tag not in {'B', 'M', 'E3', 'E2'}] 164 | y_preds = [tag for tag in list(chain.from_iterable(pred_y)) if tag not in {'B', 'M', 'E3', 'E2'}] 165 | 166 | lb = LabelBinarizer() 167 | y_true_all = lb.fit_transform(y_trues) 168 | y_pred_all = lb.transform(y_preds) 169 | 170 | tagset = sorted(set(lb.classes_)) 171 | class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)} 172 | 173 | print(classification_report( 174 | y_true_all, 175 | y_pred_all, 176 | labels=[class_indices[cls] for cls in tagset], 177 | target_names=tagset, 178 | digits=5 179 | )) 180 | -------------------------------------------------------------------------------- /jiayan/sentencizer/crf_sent_tagger.py: -------------------------------------------------------------------------------- 1 | import random 2 | from itertools import chain 3 | 4 | import pycrfsuite 5 | from sklearn.metrics import classification_report 6 | from sklearn.preprocessing import LabelBinarizer 7 | 8 | 9 | class CRFSentTagger: 10 | 11 | def __init__(self, lm): 12 | self.lm = 
lm 13 | self.tagger = None 14 | 15 | # for feature extraction of punctuator 16 | self.punc2tag = { 17 | '。': 'J', 18 | '!': 'G', 19 | '?': 'W', 20 | ',': 'D', 21 | '、': 'U', 22 | ':': 'A', 23 | ';': 'F', 24 | } 25 | 26 | # for decoding of punctuator 27 | self.tag2punc = { 28 | 'J': '。', 29 | 'G': '!', 30 | 'W': '?', 31 | 'D': ',', 32 | 'U': '、', 33 | 'A': ':', 34 | 'F': ';', 35 | } 36 | 37 | def load(self, crf_model): 38 | self.tagger = pycrfsuite.Tagger() 39 | self.tagger.open(crf_model) 40 | 41 | def sent2features(self, sent: str, tags=None): 42 | pass 43 | 44 | def sent2tags(self, sent: str, punc=''): 45 | single_tag = 'S' 46 | end_tag = 'E' 47 | 48 | if punc: 49 | single_tag = self.punc2tag[punc] 50 | end_tag = self.punc2tag[punc] 51 | 52 | length = len(sent) 53 | if length == 1: 54 | tags = [single_tag] 55 | elif length == 2: 56 | tags = ['B', end_tag] 57 | elif length == 3: 58 | tags = ['B', 'E2', end_tag] 59 | elif length == 4: 60 | tags = ['B', 'E3', 'E2', end_tag] 61 | elif length == 5: 62 | tags = ['B', 'M', 'E3', 'E2', end_tag] 63 | else: 64 | tags = ['B'] + ['M'] * (length - 4) + ['E3', 'E2', end_tag] 65 | 66 | return tags 67 | 68 | def get_pmi(self, seg): 69 | pmi = self.lm.score(' '.join(seg), eos=False, bos=False) - \ 70 | (self.lm.score(seg[0], eos=False, bos=False) + self.lm.score(seg[1], eos=False, bos=False)) 71 | if pmi >= 2: 72 | return '2' 73 | elif pmi >= 1.5: 74 | return '1.5' 75 | elif pmi >= 1: 76 | return '1' 77 | elif pmi >= 0.5: 78 | return '0.5' 79 | return '0' 80 | 81 | def get_ttest(self, seg): 82 | former = self.lm.score(' '.join(seg[:2]), eos=False, bos=False) - self.lm.score(seg[0], eos=False, bos=False) 83 | latter = self.lm.score(' '.join(seg[1:]), eos=False, bos=False) - self.lm.score(seg[1], eos=False, bos=False) 84 | diff = former - latter 85 | if diff > 0: 86 | return 'l' 87 | elif diff == 0: 88 | return 'u' 89 | else: 90 | return 'r' 91 | 92 | def train(self, train_x, train_y, out_model): 93 | trainer = pycrfsuite.Trainer(verbose=False) 94 | for x, y in zip(train_x, train_y): 95 | if x and y: 96 | trainer.append(x, y) 97 | 98 | trainer.set_params({ 99 | 'c1': 1.0, # coefficient for L1 penalty 100 | 'c2': 1e-3, # coefficient for L2 penalty 101 | 'max_iterations': 50, # stop earlier 102 | 'feature.possible_transitions': True # include transitions that are possible, but not observed 103 | }) 104 | 105 | trainer.train(out_model) 106 | print(trainer.logparser.last_iteration) 107 | 108 | def build_data(self, data_file): 109 | pass 110 | 111 | def split_data(self, X, Y): 112 | random.seed(42) 113 | rd_num = random.random() 114 | 115 | def _rd(): 116 | return rd_num 117 | 118 | random.shuffle(X, _rd) 119 | random.shuffle(Y, _rd) 120 | 121 | ratio = round(len(X) * 0.9) 122 | return X[:ratio], Y[:ratio], X[ratio:], Y[ratio:] 123 | 124 | def eval(self, test_x, test_y, crf_model): 125 | tagger = pycrfsuite.Tagger() 126 | tagger.open(crf_model) 127 | 128 | y_pred = [] 129 | for feat_list in test_x: 130 | preds = tagger.tag(feat_list) 131 | y_pred.append(preds) 132 | 133 | lb = LabelBinarizer() 134 | y_true_all = lb.fit_transform(list(chain.from_iterable(test_y))) 135 | y_pred_all = lb.transform(list(chain.from_iterable(y_pred))) 136 | 137 | tagset = sorted(set(lb.classes_)) 138 | class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)} 139 | 140 | print(classification_report( 141 | y_true_all, 142 | y_pred_all, 143 | labels=[class_indices[cls] for cls in tagset], 144 | target_names=tagset, 145 | digits=5 146 | )) 147 | 148 | 149 | 
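# Illustration of the tag scheme produced by sent2tags above: a four-character
# clause such as '天下大乱' yields ['B', 'E3', 'E2', 'E'] when no punctuation is
# given (sentence segmentation), and ['B', 'E3', 'E2', 'J'] when it is followed by
# '。' (punctuation model). 'B' opens a clause, 'M' fills the middle of longer
# clauses, 'E3'/'E2' mark the two characters before the final one, and the final
# character carries 'E'/'S' or the punc2tag code of the trailing punctuation mark.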
-------------------------------------------------------------------------------- /jiayan/sentencizer/crf_sentencizer.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | from jiayan.globals import re_puncs_exclude 4 | from jiayan.utils import text_iterator 5 | from jiayan.sentencizer.crf_sent_tagger import CRFSentTagger 6 | 7 | 8 | class CRFSentencizer(CRFSentTagger): 9 | 10 | def __init__(self, lm): 11 | super(CRFSentencizer, self).__init__(lm) 12 | 13 | def sent2features(self, sent: str, tags=None): 14 | length = len(sent) 15 | feat_list = [] 16 | for i, char in enumerate(sent): 17 | features = [ 18 | 'bias', 19 | '0:char=' + char, 20 | ] 21 | 22 | if i > 0: 23 | features.extend([ 24 | '-1:char=' + sent[i - 1], 25 | '-10:chars=' + sent[i - 1: i + 1], 26 | '-10:pmi=' + self.get_pmi(sent[i - 1: i + 1]), 27 | ]) 28 | else: 29 | features.append('BOS') 30 | 31 | if i > 1: 32 | features.extend([ 33 | # '-2:char=' + sent[i - 2], 34 | '-21:chars=' + sent[i - 2: i], 35 | '-210:chars=' + sent[i - 2: i + 1], 36 | ]) 37 | 38 | if i < length - 1: 39 | features.extend([ 40 | '+1:char=' + sent[i + 1], 41 | '+01:chars=' + sent[i: i + 2], 42 | '+01:pmi=' + self.get_pmi(sent[i: i + 2]), 43 | ]) 44 | else: 45 | features.append('EOS') 46 | 47 | if i < length - 2: 48 | features.extend([ 49 | # '+2:char=' + sent[i + 2], 50 | '+12:chars=' + sent[i + 1: i + 3], 51 | '+012:chars=' + sent[i: i + 3], 52 | ]) 53 | 54 | if 0 < i < length - 1: 55 | features.extend([ 56 | '-11:chars=' + sent[i - 1] + sent[i + 1], 57 | '-101:chars=' + sent[i - 1: i + 2], 58 | '-101:ttest=' + self.get_ttest(sent[i - 1: i + 2]), 59 | ]) 60 | 61 | feat_list.append(features) 62 | 63 | return feat_list 64 | 65 | def sentencize(self, text): 66 | feat_list = self.sent2features(text) 67 | tags = self.tagger.tag(feat_list) 68 | 69 | sents = [] 70 | sent = '' 71 | for i, tag in enumerate(tags): 72 | if tag == 'S': 73 | if sent: 74 | sents.append(sent) 75 | sent = '' 76 | sents.append(text[i]) 77 | elif tag == 'B': 78 | if sent: 79 | sents.append(sent) 80 | sent = text[i] 81 | elif tag in {'M', 'E3', 'E2', 'E'}: 82 | sent += text[i] 83 | if sent: 84 | sents.append(sent) 85 | 86 | return sents 87 | 88 | def build_data(self, data_file): 89 | X = [] 90 | Y = [] 91 | 92 | for line in text_iterator(data_file, keep_punc=True): 93 | sents = [sent for sent in re_puncs_exclude.split(line) if sent] 94 | feat_list = self.sent2features(''.join(sents)) 95 | tag_list = list(chain.from_iterable([self.sent2tags(sent) for sent in sents])) 96 | X.append(feat_list) 97 | Y.append(tag_list) 98 | 99 | return X, Y 100 | 101 | 102 | -------------------------------------------------------------------------------- /jiayan/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/tokenizer/__init__.py -------------------------------------------------------------------------------- /jiayan/tokenizer/hmm_tokenizer.py: -------------------------------------------------------------------------------- 1 | from math import log10 2 | 3 | from jiayan.globals import re_zh_include, stopchars 4 | 5 | """ 6 | Use HMM to consider word detection as a char sequence tagging problem. 
7 | 8 | With a word dict and a char sequence, there could be many tokenizing solutions, and the best one will have 9 | the largest product of word probabilities: 10 | (see Max Probability Tokenizing: [https://blog.csdn.net/u010189459/article/details/37956689]) 11 | p(S) = p(w1) * p(w2) * p(w3)...p(wn) 12 | 13 | However, without a word dict we don't know how to tokenize the sentence by word. But here we can use a 14 | language model to compute a possible word probability first: 15 | p(w) = p(c1, c2, c3, c4) = p(c1) * p(c2|c1) * p(c3|c1, c2) * p(c4|c1, c2, c3) 16 | 17 | Here the word "w" is a 4-char word consisting of c1, c2, c3 and c4, and the probability of each char occurring at its relative 18 | position can be computed with an N-gram model. 19 | 20 | So assume the longest word we want is a 4-char word; then in a sentence of length L (an L-char sequence), each char 21 | can occupy one of 4 possible positions within a word, each associated with the probability of being at that position 22 | (k indicates the kth char in the sequence): 23 | 24 | 1. the beginning of the word (b): p(ck) 25 | 2. the second char of the word (c): p(ck|ck-1) 26 | 3. the third char of the word (d): p(ck|ck-2, ck-1) 27 | 4. the fourth char of the word (e): p(ck|ck-3, ck-2, ck-1) 28 | 29 | So a char sequence can first be tagged at the char level with labels {b, c, d, e}, and then chunked based on the 30 | tags. The word-level problem is thus broken down into a char-level problem with hidden states, which is the 31 | decoding problem of an HMM, and we can use the Viterbi algorithm to get the best tag/state sequence for the char/observation 32 | sequence. 33 | 34 | For Viterbi, we need (a) initial starting probabilities of each state, (b) transition probabilities between states, and 35 | (c) emission probabilities of states emitting different observations. Let's draw a table to see what they should be in 36 | this problem: 37 | 38 | ---------------------------------------------------- 39 | start -> b b b 40 | c c c 41 | d d d 42 | e e e 43 | 44 | char sequence: char1 char2 char3 ... 45 | ----------------------------------------------------- 46 | 47 | So for each char in the sequence, there are 4 possible states. 48 | For (a), only "b" can start a sequence, so p(b|start) = 1, and p(c|start) = p(d|start) = p(e|start) = 0. 49 | For (b), consider the longest word "bcde"; the state transitions are limited to: 50 | i. b -> b, b -> c: the beginning of a word either goes to a new word beginning, or to the 2nd char; 51 | ii. c -> b, c -> d: the 2nd char either goes to a new word beginning, or to the 3rd char; 52 | iii. d -> b, d -> e: the 3rd char either goes to a new word beginning, or to the 4th char; 53 | iv. e -> b, e -> e: the 4th char either goes to a new word beginning, or to the 5th char ... 54 | For (c), the emission probability of one char at a certain state can be computed with the N-gram model, e.g., 55 | p(ck|d) = p(ck|ck-1, ck-2) 56 | 57 | The only parameters that we cannot compute here are transition probabilities, which we can set manually.
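    To make the chunking step concrete: suppose the decoded state sequence for a 6-char chunk is

        chars: c1 c2 c3 c4 c5 c6
        tags:  b  c  b  b  c  d

    Every 'b' opens a new word and the following chars extend it, giving the words [c1 c2], [c3], [c4 c5 c6];
    this is exactly what tokenize() below does after viterbi(). The tag sequence also respects the allowed
    transitions listed above: b->c, c->b, b->b, b->c, c->d.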
58 | 59 | Differences from regular HMM tokenizing: 60 | (a) regular HMM tokenizing uses label set {B, M, E, S} to tag char sequence, which is very vague to indicate 61 | exact char position within a word, especially "M", thus hard to compute emission probabilities; 62 | (b) regular HMM tokenizing requires large data to compute transition and emission probabilities, but here our 63 | goal is the opposite, to generate that word corpus; 64 | (c) regular HMM tokenizing computes transition probabilities from data, but here we set them manually; 65 | (d) regular HMM tokenizing computes emission probabilities from data, but here we use char level N-grams 66 | language model. 67 | 68 | Disadvantages: 69 | (a) slow: read the sentence data to build ngrams from min word length to max word length, and read again to tokenize 70 | the whole data, and by this to build word corpus; viterbi on each sentence in data 71 | (b) bad at long word: need to fine tune transition probabilities to control the word lengths, which is time consuming, 72 | and the detected long words are not as good as short words. 73 | (c) fake word frequency: since word corpus is built by tokenizing, which can lead to inaccurate sentence splits, the 74 | word count doesn't reflect true frequency, e.g., "天下" in "于天下". So we use its true frequency count in 75 | the ngrams dict when filtering. 76 | """ 77 | 78 | 79 | class CharHMMTokenizer: 80 | 81 | def __init__(self, lm): 82 | self.lm = lm 83 | self.inits = {'b': 0.0, 'c': -3.14e100, 'd': -3.14e100, 'e': -3.14e100} 84 | 85 | # the transition probabilities are manually fine tuned; 86 | # in principle, we would like the word length the shorter the better; 87 | # low to-b and high to-next-char-in-word transition probs lead to long words; 88 | # high to-b and low to-next-char-in-word transition probs lead to short words. 89 | trans = {'bb': 0.85, 'bc': 0.15, 90 | 'cb': 0.9925, 'cd': 0.0075, 91 | 'db': 0.999, 'de': 0.001, 92 | 'eb': 0.9999, 'ee': 0.0001} 93 | # trans = {'bb': 0.8, 'bc': 0.2, 94 | # 'cb': 0.9925, 'cd': 0.0075, 95 | # 'db': 0.999, 'de': 0.001, 96 | # 'eb': 0.9999, 'ee': 0.0001} 97 | 98 | # convert the decimal probabilities to logs to avoid overflow 99 | self.trans = {states: log10(trans_prob) for states, trans_prob in trans.items()} 100 | 101 | def tokenize(self, text: str): 102 | """ Gets the tags of given sentence, and tokenizes sentence based on the tag sequence. 103 | """ 104 | # split text by whitespaces first, then split each segment into char chunks by zh chars 105 | for seg in text.strip().split(): 106 | if seg: 107 | for chunk in re_zh_include.split(seg): 108 | # if zh chars, tokenize them 109 | if re_zh_include.match(chunk): 110 | tags = self.viterbi(chunk) 111 | 112 | word = chunk[0] 113 | for i in range(1, len(chunk)): 114 | if tags[i] == 'b': 115 | if not self.valid_word(word): 116 | for char in word: 117 | yield char 118 | else: 119 | yield word 120 | word = chunk[i] 121 | else: 122 | word += chunk[i] 123 | if word: 124 | if not self.valid_word(word): 125 | for char in word: 126 | yield char 127 | else: 128 | yield word 129 | 130 | # if not zh chars, we assume they are all punctuations, split them 131 | else: 132 | for char in chunk: 133 | yield char 134 | 135 | def viterbi(self, sent): 136 | """ Chooses the most likely char tag sequence of given char sentence. 
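        In short, the loops below implement the standard Viterbi recurrence in log10 space,
        where best(i, s) denotes the score of the best path ending in state s at char i:

            best(0, s) = emit[0][s] + inits[s]
            best(i, s) = max over s' of [ best(i-1, s') + trans[s' + s] + emit[i][s] ]

        State pairs s'+s that are missing from self.trans are impossible transitions and are
        simply skipped; the best full path is the highest-scoring entry in paths after the last char.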
137 | """ 138 | emits = self.get_emission_probs(sent) 139 | 140 | # record the best path for each state for each char, {path1: path_prob, path2: path_prob, ...}; 141 | # paths grow at each decoding step, eventually contains the best paths for each state of last char; 142 | # we assume the initial state probs = 1st char's emission probs 143 | paths = {state: prob + self.inits[state] for state, prob in emits[0].items()} 144 | 145 | # for each char 146 | for i in range(1, len(sent)): 147 | # print(paths) 148 | 149 | # record best paths and their probs to all states of current char 150 | cur_char_paths = {} 151 | 152 | # for each state of current char 153 | for state, emit_prob in emits[i].items(): 154 | 155 | # record all possible paths and their probs to current state 156 | cur_state_paths = {} 157 | 158 | # for each state of previous char 159 | for path, path_prob in paths.items(): 160 | trans_states = path[-1] + state 161 | 162 | # compute the path prob from a previous state to current state 163 | if trans_states in self.trans: 164 | cur_state_paths[path + state] = path_prob + emit_prob + self.trans[trans_states] 165 | 166 | # choose the best path from all previous paths to current state 167 | best_path = sorted(cur_state_paths, key=lambda x: cur_state_paths[x])[-1] 168 | 169 | # for current state of current char, we found its best path 170 | cur_char_paths[best_path] = cur_state_paths[best_path] 171 | 172 | # the paths grow by one char/state 173 | paths = cur_char_paths 174 | 175 | return sorted(paths, key=lambda x: paths[x])[-1] 176 | 177 | def get_emission_probs(self, sent): 178 | """ Computes emission probability of each state emitting relative char in the given char sequence. """ 179 | return [ 180 | 181 | {'b': self.seg_prob(sent[i]), 182 | 'c': self.seg_prob(sent[i - 1:i + 1]), 183 | 'd': self.seg_prob(sent[i - 2:i + 1]), 184 | 'e': self.seg_prob(sent[i - 3:i + 1]) 185 | } 186 | 187 | for i in range(len(sent)) 188 | ] 189 | 190 | def seg_prob(self, seg): 191 | """ Computes the segment probability based on ngrams model. 192 | If given an empty segment, it means it's impossible for current char to be at current position of a word, 193 | thus return default low log prob -100. 194 | """ 195 | return (self.lm.score(' '.join(seg), bos=False, eos=False) - 196 | self.lm.score(' '.join(seg[:-1]), bos=False, eos=False)) \ 197 | or -100.0 198 | 199 | def valid_word(self, word): 200 | """ Checks if a word contains stopchars, if yes, it's not a valid word. 
""" 201 | for char in word: 202 | if char in stopchars: 203 | return False 204 | return True 205 | 206 | 207 | -------------------------------------------------------------------------------- /jiayan/tokenizer/ngram_tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import marshal 3 | from math import log 4 | 5 | from jiayan.globals import re_zh_include 6 | 7 | """ 8 | References: 9 | [https://github.com/fxsjy/jieba] 10 | """ 11 | 12 | dir_path = os.path.dirname(os.path.realpath(__file__)) 13 | root = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 14 | 15 | dict_path = os.path.join(root, 'data/dict.txt') 16 | cache_path = os.path.join(dir_path, 'tokenizer.cache') 17 | 18 | 19 | class WordNgramTokenizer: 20 | 21 | def __init__(self, dict_f=None): 22 | if not dict_f: 23 | dict_f = dict_path 24 | self.cache = cache_path 25 | self.PREFIX, self.total = self.check_cache(dict_f) 26 | 27 | def check_cache(self, dict_f): 28 | """ Loads frequency dict and total word counts from cache. 29 | """ 30 | if os.path.isfile(self.cache): 31 | with open(self.cache, 'rb') as cf: 32 | return marshal.load(cf) 33 | else: 34 | # if no cache, generate freq dict and dump the cache 35 | PREFIX, total = self.gen_prefix_dict(dict_f) 36 | with open(self.cache, 'wb') as temp_cache_file: 37 | marshal.dump((PREFIX, total), temp_cache_file) 38 | return PREFIX, total 39 | 40 | def clear_cache(self): 41 | if os.path.isfile(self.cache): 42 | os.remove(self.cache) 43 | 44 | @staticmethod 45 | def gen_prefix_dict(dict_f): 46 | """ Reads a dict file and generates the prefix dictionary with total word counts. 47 | """ 48 | word_counts = {} 49 | with open(dict_f, 'rb') as f: 50 | for line in f: 51 | line = line.strip().decode('utf-8') 52 | word, freq = line.split(',') 53 | word_counts[word] = int(freq) 54 | 55 | # enumerate all prefixes of a word to enrich the vocab 56 | for i in range(len(word)): 57 | prefix = word[:i + 1] 58 | if prefix not in word_counts: 59 | word_counts[prefix] = 0 60 | 61 | return word_counts, sum(word_counts.values()) 62 | 63 | def tokenize(self, text): 64 | # split zh chars and non-zh chars into chunks 65 | chunks = re_zh_include.split(text) 66 | 67 | for chk in chunks: 68 | if chk: 69 | # if the chunk is zh, tokenize it 70 | if re_zh_include.match(chk): 71 | for word in self.cut_DAG(chk): 72 | yield word 73 | # if the chunk is not zh, treat it as a single word 74 | else: 75 | yield chk 76 | 77 | def cut_DAG(self, sentence): 78 | """ Cuts the DAG according to max route probabilities. 79 | """ 80 | DAG = self.gen_DAG(sentence) 81 | route = {} 82 | self.calculate_route_prob(sentence, DAG, route) 83 | 84 | start = 0 85 | N = len(sentence) 86 | 87 | while start < N: 88 | end = route[start][1] 89 | word = sentence[start:end + 1] 90 | yield word 91 | start = end + 1 92 | 93 | def gen_DAG(self, sentence): 94 | """ Generates DAG based on given sentence and prefix dict. 
95 | """ 96 | DAG = {} 97 | N = len(sentence) 98 | 99 | for start in range(N): 100 | ends = [] 101 | end = start 102 | prefix = sentence[start] 103 | while end < N and prefix in self.PREFIX: 104 | if self.PREFIX[prefix]: 105 | ends.append(end) 106 | end += 1 107 | 108 | # extend prefix 109 | prefix = sentence[start:end + 1] 110 | 111 | # if no words formed starting from current char, OOV, it ends with itself 112 | if not ends: 113 | ends.append(start) 114 | 115 | DAG[start] = ends 116 | 117 | return DAG 118 | 119 | def calculate_route_prob(self, sentence, DAG, route): 120 | """ Uses dynamic programming to compute the tokenizing solution with highest probability. 121 | """ 122 | N = len(sentence) 123 | 124 | # each position in the route will be stored as "position: (prob, end)", where the value 125 | # tuple contains the highest path prob to current position, and the most recent word 126 | # ending position from current route position; 127 | # in other words, sentence[position: end + 1] forms the word and together with which 128 | # the rest of the path that makes the tokenizing solution with highest probability 129 | route[N] = (0, 0) 130 | log_total = log(self.total) 131 | 132 | # compute from backwards to forwards, because ... 133 | for i in range(N - 1, -1, -1): 134 | 135 | # for each word start position, lists all its possible word ending positions, 136 | # compute their word probabilities, and add relative rest path probabilities, 137 | # then choose the end position that makes the whole path probability highest 138 | 139 | # the value got from PREFIX dict could be either None or 0, we assume each word 140 | # appears at least once, like add-1 laplace smoothing 141 | route[i] = max((log(self.PREFIX.get(sentence[i:end + 1]) or 1) - log_total 142 | + route[end + 1][0], end) for end in DAG[i]) 143 | -------------------------------------------------------------------------------- /jiayan/translator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiaeyan/Jiayan/28c9638a071f1f0ab69d0ee971081147aa682a5b/jiayan/translator/__init__.py -------------------------------------------------------------------------------- /jiayan/utils.py: -------------------------------------------------------------------------------- 1 | from jiayan.globals import re_invalid_chars, re_zh_exclude 2 | 3 | 4 | def process_line(line: str): 5 | """ A standard approach to process input line, by 6 | 1. retain and replace valid punctuations; 7 | 2. removing non-zh and invalid punctuation chars; 8 | """ 9 | line = line.strip().replace(',', ',').replace('.', '。').replace(':', ':').\ 10 | replace('!', '!').replace('?', '?').replace(';', ';') 11 | line = re_invalid_chars.sub('', line) 12 | return line 13 | 14 | 15 | def text_iterator(data_file, keep_punc=False): 16 | """ A help function to provide clean zh char lines of a given file. 
""" 17 | with open(data_file, 'r', encoding='utf-8') as f: 18 | for line in f: 19 | for seg in line.strip().split(): 20 | if seg: 21 | seg = process_line(seg) 22 | if keep_punc: 23 | if seg: 24 | yield seg 25 | else: 26 | for text in re_zh_exclude.findall(seg): 27 | if text: 28 | yield text 29 | 30 | 31 | def make_kenlm(data_file): 32 | for text in text_iterator(data_file): 33 | print(' '.join(text)) 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | https://github.com/kpu/kenlm/archive/master.zip 2 | scikit-learn 3 | python-crfsuite 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import sys 5 | from setuptools import setup, find_packages 6 | 7 | 8 | requirements = ["scikit-learn", "python-crfsuite"] 9 | 10 | if sys.version_info[:2] < (2, 7): 11 | requirements.append('argparse') 12 | if sys.version_info[:2] < (3, 4): 13 | requirements.append('enum34') 14 | if sys.version_info[:2] < (3, 5): 15 | requirements.append('typing') 16 | 17 | extras_require = { 18 | ':python_version<"2.7"': ['argparse'], 19 | ':python_version<"3.4"': ['enum34'], 20 | ':python_version<"3.5"': ['typing'], 21 | } 22 | 23 | setup( 24 | name="jiayan", 25 | version="0.0.21", 26 | author="Jiajie Yan", 27 | author_email="jiaeyan@gmail.com", 28 | description="The NLP toolkit designed for classical chinese.", 29 | long_description=open("README.md", encoding="utf-8").read(), 30 | long_description_content_type='text/markdown', 31 | license="MIT", 32 | url="https://github.com/jiaeyan/Jiayan", 33 | keywords=['classical-chinese', 'ancient-chinese', 'nlp'], 34 | packages=find_packages(), 35 | install_requires=requirements, 36 | extras_require=extras_require, 37 | python_requires='>=2.6, >=3', 38 | include_package_data=True, 39 | classifiers=[ 40 | 'Programming Language :: Python', 41 | 'Programming Language :: Python :: 2', 42 | 'Programming Language :: Python :: 2.6', 43 | 'Programming Language :: Python :: 2.7', 44 | 'Programming Language :: Python :: 3', 45 | 'Programming Language :: Python :: 3.3', 46 | 'Programming Language :: Python :: 3.4', 47 | 'Programming Language :: Python :: 3.5', 48 | 'Programming Language :: Python :: 3.6', 49 | 'Topic :: Utilities', 50 | 'Topic :: Text Processing', 51 | ] 52 | ) 53 | --------------------------------------------------------------------------------