├── .gitignore ├── .mocharc.yml ├── .npmignore ├── .nvmrc ├── .travis.yml ├── CHANGELOG.md ├── MIT-License ├── README.md ├── __root.d.ts ├── __root.js ├── __root.ts ├── demo ├── README.md └── sample │ └── 果体.txt ├── dicts └── .gitkeep ├── index.d.ts ├── index.js ├── index.ts ├── jest.config.js ├── jetbrains.svg ├── lib ├── POSTAG.d.ts ├── POSTAG.js ├── POSTAG.ts ├── Segment.d.ts ├── Segment.js ├── Segment.ts ├── const.d.ts ├── const.js ├── const.ts ├── defaults │ ├── dict.d.ts │ ├── dict.js │ ├── dict.ts │ ├── index.d.ts │ ├── index.js │ ├── index.ts │ ├── mods.d.ts │ ├── mods.js │ └── mods.ts ├── fs │ ├── get.d.ts │ ├── get.js │ └── get.ts ├── index.d.ts ├── index.js ├── index.ts ├── loader.d.ts ├── loader.js ├── loader.ts ├── mod │ ├── CHS_NAMES.d.ts │ ├── CHS_NAMES.js │ ├── CHS_NAMES.ts │ ├── COLORS.d.ts │ ├── COLORS.js │ ├── COLORS.ts │ ├── Optimizer.d.ts │ ├── Optimizer.js │ ├── Optimizer.ts │ ├── Tokenizer.d.ts │ ├── Tokenizer.js │ ├── Tokenizer.ts │ ├── const.d.ts │ ├── const.js │ ├── const.ts │ ├── data │ │ ├── STOPWORD.d.ts │ │ ├── STOPWORD.js │ │ └── STOPWORD.ts │ ├── index.d.ts │ ├── index.js │ ├── index.ts │ ├── mod.d.ts │ ├── mod.js │ └── mod.ts ├── segment │ ├── core.d.ts │ ├── core.js │ ├── core.ts │ ├── defaults.d.ts │ ├── defaults.js │ ├── defaults.ts │ ├── index.d.ts │ ├── index.js │ ├── index.ts │ ├── method.d.ts │ ├── method.js │ ├── method.ts │ ├── methods │ │ ├── _get_text.d.ts │ │ ├── _get_text.js │ │ ├── _get_text.ts │ │ ├── convertSynonym.d.ts │ │ ├── convertSynonym.js │ │ ├── convertSynonym.ts │ │ ├── doSegment.d.ts │ │ ├── doSegment.js │ │ ├── doSegment.ts │ │ ├── getOptionsDoSegment.d.ts │ │ ├── getOptionsDoSegment.js │ │ ├── getOptionsDoSegment.ts │ │ ├── indexOf.d.ts │ │ ├── indexOf.js │ │ ├── indexOf.ts │ │ ├── listModules.d.ts │ │ ├── listModules.js │ │ ├── listModules.ts │ │ ├── split.d.ts │ │ ├── split.js │ │ ├── split.ts │ │ ├── stringify.d.ts │ │ ├── stringify.js │ │ ├── stringify.ts │ │ ├── useModules.d.ts │ │ ├── useModules.js 
│ │ ├── useModules.ts │ │ ├── useModules2.d.ts │ │ ├── useModules2.js │ │ └── useModules2.ts │ ├── types.d.ts │ ├── types.js │ └── types.ts ├── submod.d.ts ├── submod.js ├── submod.ts ├── submod │ ├── AdjectiveOptimizer.d.ts │ ├── AdjectiveOptimizer.js │ ├── AdjectiveOptimizer.ts │ ├── ChsNameOptimizer.d.ts │ ├── ChsNameOptimizer.js │ ├── ChsNameOptimizer.ts │ ├── ChsNameTokenizer.d.ts │ ├── ChsNameTokenizer.js │ ├── ChsNameTokenizer.ts │ ├── DatetimeOptimizer.d.ts │ ├── DatetimeOptimizer.js │ ├── DatetimeOptimizer.ts │ ├── DictOptimizer.d.ts │ ├── DictOptimizer.js │ ├── DictOptimizer.ts │ ├── DictTokenizer.d.ts │ ├── DictTokenizer.js │ ├── DictTokenizer.ts │ ├── EmailOptimizer.d.ts │ ├── EmailOptimizer.js │ ├── EmailOptimizer.ts │ ├── ForeignOptimizer.d.ts │ ├── ForeignOptimizer.js │ ├── ForeignOptimizer.ts │ ├── ForeignTokenizer.d.ts │ ├── ForeignTokenizer.js │ ├── ForeignTokenizer.ts │ ├── JpSimpleTokenizer.d.ts │ ├── JpSimpleTokenizer.js │ ├── JpSimpleTokenizer.ts │ ├── PunctuationTokenizer.d.ts │ ├── PunctuationTokenizer.js │ ├── PunctuationTokenizer.ts │ ├── SingleTokenizer.d.ts │ ├── SingleTokenizer.js │ ├── SingleTokenizer.ts │ ├── URLTokenizer.d.ts │ ├── URLTokenizer.js │ ├── URLTokenizer.ts │ ├── WildcardTokenizer.d.ts │ ├── WildcardTokenizer.js │ ├── WildcardTokenizer.ts │ ├── ZhRadicalTokenizer.d.ts │ ├── ZhRadicalTokenizer.js │ ├── ZhRadicalTokenizer.ts │ ├── ZhtSynonymOptimizer.d.ts │ ├── ZhtSynonymOptimizer.js │ ├── ZhtSynonymOptimizer.ts │ ├── ZhuyinTokenizer.d.ts │ ├── ZhuyinTokenizer.js │ └── ZhuyinTokenizer.ts └── util │ ├── debug.d.ts │ ├── debug.js │ ├── debug.ts │ ├── index.d.ts │ ├── index.js │ ├── index.ts │ ├── isUnset.d.ts │ ├── isUnset.js │ └── isUnset.ts ├── package.json ├── project.config.d.ts ├── project.config.js ├── project.config.ts ├── repl ├── script ├── publish-after.d.ts ├── publish-after.js ├── publish-after.ts ├── publish-after2.d.ts ├── publish-after2.js ├── publish-after2.ts ├── sort-stringify-cache.d.ts ├── 
sort-stringify-cache.js └── sort-stringify-cache.ts ├── test ├── __snapshots__ │ └── bug.spec.ts.snap ├── _local-dev.ts ├── bug.spec.ts ├── chk-fixme.ts ├── demo.cache.ts ├── demo.glob.ts ├── demo.ts ├── lazy.fixme.ts ├── lib │ ├── delete-cache.ts │ ├── index.ts │ └── util.ts ├── res │ ├── default.ts │ ├── fixme.data.ts │ ├── gc.data.ts │ ├── gc.not │ │ └── 666962621.txt │ ├── lazy.index.ts │ ├── lazy.index │ │ ├── tests_lazy_array.ts │ │ ├── tests_lazy_base.ts │ │ ├── tests_lazy_base_not.ts │ │ ├── tests_lazy_indexof.ts │ │ └── tests_lazy_indexof_not.ts │ ├── lazy.novel.ts │ └── ウォルテニア戦記 │ │ ├── 第11話【西へ】其2.txt │ │ ├── 第11話【西へ】其2_cjk2zht.txt │ │ ├── 第11話【西へ】其2_cn2tw.txt │ │ ├── 第11話【西へ】其2_opencc.txt │ │ ├── 第11話【西へ】其2_out.txt │ │ └── 第11話【西へ】其2_zh2jp.txt ├── script │ └── build-submod.ts ├── sleep.ts ├── submod.spec.ts ├── temp │ ├── .gitignore │ ├── cache │ │ ├── 0 │ │ │ ├── char.txt │ │ │ ├── eng.txt │ │ │ └── other.txt │ │ ├── a.txt │ │ ├── b.txt │ │ ├── c.txt │ │ ├── d.txt │ │ ├── e.txt │ │ ├── f.txt │ │ ├── g.txt │ │ ├── h.txt │ │ ├── i.txt │ │ ├── j.txt │ │ ├── k.txt │ │ ├── l.txt │ │ ├── m.txt │ │ ├── n.txt │ │ ├── o.txt │ │ ├── p.txt │ │ ├── q.txt │ │ ├── r.txt │ │ ├── s.txt │ │ ├── t.txt │ │ ├── u.txt │ │ ├── v.txt │ │ ├── w.txt │ │ ├── x.txt │ │ ├── y.txt │ │ └── z.txt │ └── stringify.sorted.txt ├── test.ts ├── test_segment.test.ts ├── version.spec.ts ├── versions.spec.ts ├── word.novel.test.ts ├── word.test.ts ├── z.0010.test.ts ├── z.gc.not.test.ts └── z.gc.test.ts ├── tsconfig.json ├── typedoc.config.js ├── version.d.ts ├── version.js └── version.ts /.mocharc.yml: -------------------------------------------------------------------------------- 1 | require: 2 | # - esm 3 | - ts-node/register 4 | timeout: 0 5 | color: true 6 | extension: 7 | - ts 8 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | /.pnp 2 | .pnp.js 3 | .idea 4 
| ~ci.list.txt 5 | ~ci.log.txt 6 | ~ci.errors.txt 7 | *.stackdump 8 | *.bak 9 | *.old 10 | *.log 11 | tsconfig.json 12 | package-lock.json 13 | test 14 | .github 15 | .gitkeep 16 | /.*/ 17 | /.* 18 | tests 19 | /~* 20 | __test__ 21 | __tests__ 22 | node_modules 23 | /node_modules/ 24 | **/node_modules/ 25 | *.ts 26 | !*.d.ts 27 | /bin/**/*.d.ts 28 | /bin/*.d.ts 29 | 30 | !*.d.mts 31 | /bin/**/*.d.mts 32 | /bin/*.d.mts 33 | 34 | !*.d.cts 35 | /bin/**/*.d.cts 36 | /bin/*.d.cts 37 | 38 | !/src/**/*.ts 39 | !/src/**/*.cts 40 | !/src/**/*.mts 41 | !/src/**/*.tsx 42 | 43 | /src/**/*.d.ts 44 | /src/**/*.js 45 | /src/**/*.d.cts 46 | /src/**/*.d.mts 47 | /src/**/*.cjs 48 | /src/**/*.mjs 49 | /src/**/*.jsx 50 | 51 | *.tgz 52 | /tsconfig.json.tpl 53 | yarn-error.log 54 | .git 55 | yarn.lock 56 | .env.local 57 | .env.*.local 58 | npm-debug.log* 59 | yarn-debug.log* 60 | yarn-error.log* 61 | .vscode 62 | *.suo 63 | *.ntvs* 64 | *.njsproj 65 | *.sln 66 | *.sw? 67 | *.vue.js 68 | *.vue.d.ts 69 | *.vue.js.map 70 | .nyc_output 71 | coverage 72 | /*.tpl 73 | webpack.config.js 74 | vue.config.js 75 | /jestconfig.json 76 | /tslint.json 77 | .git 78 | webpack.*.config.js 79 | webpack.*.config.d.ts 80 | webpack.*.config.js.map 81 | webpack.*.config.ts 82 | karma.conf.js 83 | /_config.yml 84 | intellij-style-guide.xml 85 | jest.config.js 86 | *.tsbuildinfo 87 | tsconfig.*.json 88 | tsconfig.esm.json.tpl 89 | /package.d.ts 90 | .mocharc.yml 91 | jest.config.js 92 | jest.config.* 93 | /jest-preset.* 94 | /report.*.json 95 | now.json 96 | /Makefile 97 | *.spec.d.ts 98 | *.spec.js 99 | *.spec.ts 100 | 101 | *.spec.d.cts 102 | *.spec.cjs 103 | *.spec.cts 104 | 105 | *.spec.d.mts 106 | *.spec.mjs 107 | *.spec.mts 108 | 109 | *.spec.d.tsx 110 | *.spec.tsx 111 | 112 | __mocks__ 113 | __tests__ 114 | __snapshots__ 115 | *.snap 116 | npm-shrinkwrap.json 117 | /example/ 118 | *.stat 119 | .vercel 120 | tsdx.config.js 121 | /report.json 122 | 123 | /_*/ 124 | _snowpack 125 | 126 | 
/snowpack.config.js 127 | web_modules 128 | cz-adapter 129 | 130 | tsc-multi.json.tpl 131 | tsc-multi.json 132 | 133 | changelog-option.js 134 | 135 | bin/tsconfig.json 136 | bin/tsconfig.*.json 137 | 138 | .yarnrc.yml 139 | .turbo 140 | __file_snapshots__ 141 | __fixtures__ 142 | /fixture/ 143 | -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | node 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "node" 4 | 5 | #deploy: 6 | # provider: pages 7 | # skip-cleanup: true 8 | # github-token: $GITHUB_TOKEN 9 | # keep-history: true 10 | # on: 11 | # branch: master 12 | # local-dir: docs 13 | 14 | 15 | cache: 16 | yarn: true 17 | directories: 18 | - "node_modules" 19 | 20 | before_install: 21 | #- npm install -g typedoc typedoc-themes-color typedoc-plugin-nojekyll 22 | - npm install -g typescript@next ts-node mocha chai 23 | 24 | install: 25 | # - yarn install 26 | - yarn add fs-extra chai-string @types/mocha typescript@next ts-node mocha chai chai-asserttype-extra 27 | 28 | 29 | before_script: 30 | - yarn list segment-dict 31 | - echo before_script 32 | script: 33 | # npm run travis 34 | yarn run test 35 | 36 | env: 37 | global: 38 | - TS_NODE_TRANSPILE_ONLY=true 39 | - color=1 40 | - FORCE_COLOR=1 41 | -------------------------------------------------------------------------------- /MIT-License: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2015 Zongmin Lei (雷宗民) 2 | http://ucdok.com 3 | 4 | The MIT License 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 
9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /__root.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2020/7/1. 3 | */ 4 | export default __dirname; 5 | -------------------------------------------------------------------------------- /__root.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2020/7/1. 4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.default = __dirname; 7 | //# sourceMappingURL=__root.js.map -------------------------------------------------------------------------------- /__root.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2020/7/1. 
3 | */ 4 | 5 | import { join, normalize } from "path"; 6 | 7 | export default __dirname; 8 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # title 2 | 3 | ```ts 4 | const segment = new Segment(); 5 | ``` 6 | 7 | ## 如何將目前已加入的字典匯出 8 | 9 | ```ts 10 | // 用來確保字典的確已載入 11 | segment.autoInit() 12 | 13 | // 字典類型 14 | let type = 'TABLE'; 15 | 16 | let db_dict = segment.getDictDatabase(type) 17 | fs.writeFileSync('./exported.table.dict.txt', db_dict.stringify()) 18 | ``` 19 | 20 | ## 段落切分 21 | 22 | > 由於 segment 是利用對內容的前後文分析來進行分詞 23 | > 所以如何切割段落對於結果就會產生不同影響 24 | 25 | | | | 26 | |:------|:--| 27 | | `SPLIT` | `RegExp` or 具有 `.[Symbol.split](input: string, limit?: number) => string[]` 的物件 | 28 | | `SPLIT_FILTER` | `RegExp` or 具有 `.test(input: string) => boolean` 的物件 | 29 | 30 | ```ts 31 | /** 32 | * 分段 33 | * `RegExp` or 具有 `.[Symbol.split](input: string, limit?: number) => string[]` 的物件 34 | * 35 | * @type {Segment.ISPLIT} 36 | */ 37 | segment.SPLIT: ISPLIT = /([\r\n]+|^[ \s+]+|[ \s]+$|[ \s]{2,})/gm as ISPLIT; 38 | 39 | /** 40 | * 分段之後 如果符合以下條件 則直接忽略分析 41 | * `RegExp` or 具有 `.test(input: string) => boolean` 的物件 42 | * 43 | * @type {Segment.ISPLIT_FILTER} 44 | */ 45 | segment.SPLIT_FILTER: ISPLIT_FILTER = /^([\r\n]+)$/g as ISPLIT_FILTER; 46 | ``` 47 | 48 | ## dictionary 49 | 50 | > 以下方法會載入字典 `name` 51 | 52 | `name` 可以為 53 | 54 | * 字典檔案絕對/相對路徑 55 | * 字典檔名(可以忽略副檔名) 56 | 57 | 當只輸入檔名時 58 | 會呼叫 `_resolveDictFilename(name: string, pathPlus?: string[], extPlus?: string[]): string;` 59 | 依照以下順序搜尋第一個符合的檔案 60 | 61 | 1. 目前 `cwd` 的相對路徑 62 | 2. novel-segment 模組底下的 [`novel-segment/dicts`](https://github.com/bluelovers/node-segment/tree/master/dicts) 63 | 3. 如果是呼叫 `loadSynonymDict` 時 會額外搜尋 [`segment-dict/dict/synonym`](https://github.com/bluelovers/node-segment-dict/tree/master/dict/synonym) 64 | 4. 
如果是呼叫 `loadStopwordDict` 時 會額外搜尋 [`segment-dict/dict/stopword`](https://github.com/bluelovers/node-segment-dict/tree/master/dict/stopword) 65 | 5. `segment-dict` 模組底下的 [`segment-dict/dict/segment`](https://github.com/bluelovers/node-segment-dict/tree/master/dict/segment) 66 | 67 | 副檔名為以下順序 68 | 69 | 1. `''` => 無 也就是與 `name` 同名的檔案 70 | 2. `.utf8` 71 | 3. `.txt` 72 | 73 | ```ts 74 | /** 75 | * 载入字典文件 76 | * 77 | * @param {String} name 字典文件名 78 | * @param {String} type 类型 79 | * @param {Boolean} convert_to_lower 是否全部转换为小写 80 | * @return {Segment} 81 | */ 82 | loadDict(name: string, type?: string, convert_to_lower?: boolean, skipExists?: boolean): this; 83 | /** 84 | * 载入同义词词典 85 | * 86 | * @param {String} name 字典文件名 87 | */ 88 | loadSynonymDict(name: string, skipExists?: boolean): this; 89 | /** 90 | * 载入停止符词典 91 | * 92 | * @param {String} name 字典文件名 93 | */ 94 | loadStopwordDict(name: string): this; 95 | ``` 96 | 97 | -------------------------------------------------------------------------------- /demo/sample/果体.txt: -------------------------------------------------------------------------------- 1 | 但如果體內的營養成分消失 2 | 就接近了看了刊登異性的果體照片的週刊雜誌的感覺。 3 | 就接近了看了刊登異性的果體照片的週刊雜志的感覺。 4 | 雨果體力的冒険者之魂在沸騰 5 | 穿着像是果體一樣的H服裝 6 | 應該是和萊娜訓練的成果體現出來了 7 | “果體圍裙” 8 | 果體後綁起來 9 | 我處於果體狀態 10 | 11 | 12 | 但如果体内的营养成分消失 13 | 就接近了看了刊登异性的果体照片的周刊杂志的感觉。 14 | 就接近了看了刊登异性的果体照片的周刊杂志的感觉。 15 | 雨果体力的冒険者之魂在沸腾 16 | 穿着像是果体一样的H服装 17 | 应该是和莱娜训练的成果体现出来了 18 | “果体围裙” 19 | 果体后绑起来 20 | 我处于果体状态 21 | -------------------------------------------------------------------------------- /dicts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluelovers/node-segment/f5688c9dd2c31a16d7eefd2275aa9b206a5f7134/dicts/.gitkeep -------------------------------------------------------------------------------- /index.d.ts: -------------------------------------------------------------------------------- 1 | import { Segment } from './lib/Segment'; 2 | import { POSTAG 
} from '@novel-segment/postag/lib/postag/ids'; 3 | declare const _Segment: typeof Segment & { 4 | version: string; 5 | version_dict: string; 6 | versions: { 7 | "novel-segment": string; 8 | "segment-dict": string; 9 | "regexp-cjk": string; 10 | "cjk-conv": string; 11 | }; 12 | /** 13 | * 分词接口 14 | */ 15 | Segment: typeof Segment; 16 | /** 17 | * 词性接口 18 | */ 19 | POSTAG: typeof POSTAG; 20 | }; 21 | declare const __Segment: typeof _Segment & { 22 | default: typeof _Segment; 23 | }; 24 | export = __Segment; 25 | export * from './version'; 26 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 中文分词器 3 | * 4 | * @author 老雷 5 | */ 6 | 'use strict'; 7 | const tslib_1 = require("tslib"); 8 | const Segment_1 = require("./lib/Segment"); 9 | const ids_1 = require("@novel-segment/postag/lib/postag/ids"); 10 | const _Segment = Segment_1.Segment; 11 | const __Segment = _Segment; 12 | Object.defineProperty(__Segment, "version", { 13 | get() { 14 | return require('./version').version; 15 | } 16 | }); 17 | Object.defineProperty(__Segment, "version_dict", { 18 | get() { 19 | return require('./version').version_dict; 20 | } 21 | }); 22 | Object.defineProperty(__Segment, "versions", { 23 | get() { 24 | return require('./version').versions; 25 | } 26 | }); 27 | // @ts-ignore 28 | tslib_1.__exportStar(require("./version"), exports); 29 | __Segment.POSTAG = ids_1.POSTAG; 30 | __Segment.Segment = Segment_1.Segment; 31 | __Segment.default = __Segment; 32 | module.exports = __Segment; 33 | //# sourceMappingURL=index.js.map -------------------------------------------------------------------------------- /index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 中文分词器 3 | * 4 | * @author 老雷 5 | */ 6 | 'use strict'; 7 | 8 | import { Segment, IWord, IDICT, IOptionsSegment, IDICT2, IDICT_STOPWORD, IDICT_SYNONYM, 
IOptionsDoSegment } from './lib/Segment'; 9 | import { POSTAG } from '@novel-segment/postag/lib/postag/ids'; 10 | 11 | const _Segment = Segment as typeof Segment & { 12 | version: string, 13 | version_dict: string, 14 | 15 | versions: { 16 | 'novel-segment': string, 17 | 'segment-dict': string, 18 | 'regexp-cjk': string, 19 | 'cjk-conv': string, 20 | }, 21 | 22 | /** 23 | * 分词接口 24 | */ 25 | Segment: typeof Segment, 26 | /** 27 | * 词性接口 28 | */ 29 | POSTAG: typeof POSTAG, 30 | }; 31 | 32 | const __Segment = _Segment as typeof _Segment & { 33 | default: typeof _Segment, 34 | }; 35 | 36 | Object.defineProperty(__Segment, "version", { 37 | get() 38 | { 39 | return require('./version').version 40 | } 41 | }); 42 | 43 | Object.defineProperty(__Segment, "version_dict", { 44 | get() 45 | { 46 | return require('./version').version_dict 47 | } 48 | }); 49 | 50 | Object.defineProperty(__Segment, "versions", { 51 | get() 52 | { 53 | return require('./version').versions 54 | } 55 | }); 56 | 57 | // @ts-ignore 58 | export = __Segment; 59 | 60 | // @ts-ignore 61 | export * from './version'; 62 | 63 | __Segment.POSTAG = POSTAG; 64 | __Segment.Segment = Segment; 65 | __Segment.default = __Segment; 66 | 67 | /* 68 | 使用示例: 69 | 70 | var segment = new Segment(); 71 | // 使用默认的识别模块及字典 72 | segment.useDefault(); 73 | // 开始分词 74 | console.log(segment.doSegment('这是一个基于Node.js的中文分词模块。')); 75 | */ 76 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | 3 | const { basename, extname, dirname } = require('path'); 4 | 5 | /** 6 | * // @type { import('@jest/types').Config.InitialOptions } 7 | * @type { import('ts-jest').InitialOptionsTsJest } 8 | */ 9 | let jestConfig = { 10 | 11 | } 12 | 13 | /** 14 | * @param {string} name 15 | * @returns {string} 16 | * @private 17 | */ 18 | function _requireResolve(name) 19 | { 20 | let result; 21 | 22 | try 23 
| { 24 | // @ts-ignore 25 | const { requireResolveExtra, requireResolveCore } = require('@yarn-tool/require-resolve'); 26 | 27 | const paths = [ 28 | requireResolveExtra('@bluelovers/tsdx').result, 29 | requireResolveExtra('tsdx').result, 30 | ].filter(Boolean); 31 | 32 | result = requireResolveCore(name, { 33 | includeGlobal: true, 34 | includeCurrentDirectory: true, 35 | paths, 36 | }) 37 | } 38 | catch (e) 39 | { 40 | 41 | } 42 | 43 | result = result || require.resolve(name); 44 | 45 | console.info('[require.resolve]', name, '=>', result) 46 | 47 | return result 48 | } 49 | 50 | let _ok = true; 51 | 52 | try 53 | { 54 | if (!jestConfig.preset) 55 | { 56 | 57 | let result = require('@yarn-tool/ws-find-up-paths').findUpPathsWorkspaces([ 58 | 'jest-preset.js', 59 | 'jest.config.js', 60 | ], { 61 | ignoreCurrentPackage: true, 62 | onlyFiles: true, 63 | }).result; 64 | 65 | if (result) 66 | { 67 | let name = basename(result, extname(result)) 68 | 69 | switch (name) 70 | { 71 | case 'jest-preset': 72 | jestConfig.preset = dirname(result); 73 | break; 74 | default: 75 | jestConfig = { 76 | ...require(result), 77 | jestConfig, 78 | }; 79 | break; 80 | } 81 | 82 | _ok = false; 83 | } 84 | } 85 | } 86 | catch (e) 87 | { 88 | 89 | } 90 | 91 | try 92 | { 93 | if (_ok && !jestConfig.preset) 94 | { 95 | let result = _requireResolve('@bluelovers/jest-config/package.json'); 96 | if (result) 97 | { 98 | jestConfig.preset = dirname(result); 99 | _ok = false; 100 | } 101 | } 102 | } 103 | catch (e) 104 | { 105 | 106 | } 107 | 108 | if (_ok && !jestConfig.preset) 109 | { 110 | jestConfig.preset = '@bluelovers/jest-config'; 111 | _ok = false; 112 | } 113 | 114 | console.info(`jest.config.preset: ${jestConfig.preset}`); 115 | 116 | module.exports = jestConfig 117 | -------------------------------------------------------------------------------- /lib/POSTAG.d.ts: -------------------------------------------------------------------------------- 1 | import { POSTAG } from 
'@novel-segment/postag/lib/postag/ids'; 2 | export { POSTAG }; 3 | export default POSTAG; 4 | -------------------------------------------------------------------------------- /lib/POSTAG.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.POSTAG = void 0; 4 | const ids_1 = require("@novel-segment/postag/lib/postag/ids"); 5 | Object.defineProperty(exports, "POSTAG", { enumerable: true, get: function () { return ids_1.POSTAG; } }); 6 | exports.default = ids_1.POSTAG; 7 | //# sourceMappingURL=POSTAG.js.map -------------------------------------------------------------------------------- /lib/POSTAG.ts: -------------------------------------------------------------------------------- 1 | import { POSTAG } from '@novel-segment/postag/lib/postag/ids'; 2 | 3 | export { POSTAG } 4 | 5 | export default POSTAG 6 | -------------------------------------------------------------------------------- /lib/const.d.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluelovers/node-segment/f5688c9dd2c31a16d7eefd2275aa9b206a5f7134/lib/const.d.ts -------------------------------------------------------------------------------- /lib/const.js: -------------------------------------------------------------------------------- 1 | //# sourceMappingURL=const.js.map -------------------------------------------------------------------------------- /lib/const.ts: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lib/defaults/dict.d.ts: -------------------------------------------------------------------------------- 1 | import { Segment } from '../Segment'; 2 | import { IUseDefaultOptionsDicts } from './index'; 3 | export declare function useDefaultDicts(segment: Segment, options?: 
IUseDefaultOptionsDicts): Segment; 4 | export declare function useDefaultSynonymDict(segment: Segment, options?: IUseDefaultOptionsDicts): Segment; 5 | export declare function useDefaultBlacklistDict(segment: Segment, options?: IUseDefaultOptionsDicts): Segment; 6 | -------------------------------------------------------------------------------- /lib/defaults/dict.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.useDefaultDicts = useDefaultDicts; 4 | exports.useDefaultSynonymDict = useDefaultSynonymDict; 5 | exports.useDefaultBlacklistDict = useDefaultBlacklistDict; 6 | function useDefaultDicts(segment, options = {}) { 7 | if (!options.nodict) { 8 | // 字典文件 9 | segment 10 | //.loadDict('jieba') <=== bad file 11 | .loadDict('char') 12 | // 盘古词典 13 | .loadDict('pangu/phrases') 14 | .loadDict('pangu/phrases2') 15 | .loadDict('phrases/*') 16 | .loadDict('dict') 17 | .loadDict('dict2') 18 | .loadDict('dict3') 19 | .loadDict('dict4') 20 | .loadDict('pangu/dict005') 21 | .loadDict('pangu/dict006') 22 | //.loadDict('synonym/后') 23 | //.loadDict('synonym/參') 24 | //.loadDict('synonym/发') 25 | .loadDict('dict_synonym/*') 26 | //.loadDict('pangu/wildcard', 'WILDCARD', true) // 通配符 27 | .loadStopwordDict('stopword') // 停止符 28 | .loadDict('lazy/dict_synonym') 29 | /* 30 | .loadDict('names/area') 31 | .loadDict('names/job') 32 | .loadDict('names/food') 33 | 34 | .loadDict('names/other') 35 | .loadDict('names/jp') 36 | .loadDict('names/zh') 37 | .loadDict('names/en') 38 | .loadDict('names/name') 39 | */ 40 | .loadDict('names/*') 41 | .loadDict('lazy/*') 42 | .loadDict('pangu/num') 43 | .loadDict('lazy/badword') 44 | .loadDict('pangu/wildcard', 'WILDCARD', true); 45 | useDefaultSynonymDict(segment, options); 46 | useDefaultBlacklistDict(segment, options); 47 | segment.doBlacklist(); 48 | } 49 | return segment; 50 | } 51 | function 
/**
 * Load the default dictionary set into `segment`.
 *
 * Does nothing when `options.nodict` is set. Otherwise it loads the word
 * dictionaries, the stop-word dictionary and the wildcard dictionary in a fixed
 * order, then the default synonym and blacklist dictionaries, and finally calls
 * `segment.doBlacklist()`.
 *
 * @param segment instance that receives the dictionaries
 * @param options `nodict` skips everything; the object is also forwarded to
 *                `useDefaultSynonymDict` and `useDefaultBlacklistDict`
 * @returns the same `segment` that was passed in
 */
export function useDefaultDicts(segment: Segment, options: IUseDefaultOptionsDicts = {})
{
	if (options.nodict)
	{
		return segment
	}

	// dictionary files
	// NOTE: 'jieba' is intentionally not loaded (the original marks it "bad file")
	segment
		.loadDict('char')

		// Pangu dictionaries
		.loadDict('pangu/phrases')
		.loadDict('pangu/phrases2')
		.loadDict('phrases/*')

		.loadDict('dict')
		.loadDict('dict2')
		.loadDict('dict3')
		.loadDict('dict4')
		.loadDict('pangu/dict005')
		.loadDict('pangu/dict006')

		// presumably supersedes the individual synonym/... files that used to be listed here
		.loadDict('dict_synonym/*')

		// stop words
		.loadStopwordDict('stopword')

		.loadDict('lazy/dict_synonym')

		// presumably supersedes the individual names/... files that used to be listed here
		.loadDict('names/*')

		.loadDict('lazy/*')

		.loadDict('pangu/num')

		.loadDict('lazy/badword')

		// wildcards
		.loadDict('pangu/wildcard', 'WILDCARD', true)
	;

	useDefaultSynonymDict(segment, options);
	useDefaultBlacklistDict(segment, options);

	segment.doBlacklist();

	return segment
}

/**
 * Load the default synonym dictionaries into `segment`.
 *
 * Skipped entirely when `options.nodict` is set. `options.nodeNovelMode`
 * additionally loads the `badword.synonym` and `zht.common.synonym` dictionaries.
 *
 * @returns the same `segment` that was passed in
 */
export function useDefaultSynonymDict(segment: Segment, options: IUseDefaultOptionsDicts = {})
{
	if (options.nodict)
	{
		return segment
	}

	// synonyms
	segment
		.loadSynonymDict('synonym')
		.loadSynonymDict('zht.synonym', false)
	;

	if (options.nodeNovelMode)
	{
		segment
			.loadSynonymDict('badword.synonym', false)
			.loadSynonymDict('zht.common.synonym', false)
	}

	return segment
}

/**
 * Load the default blacklist dictionaries (`blacklist`, `blacklist.name`,
 * `blacklist.synonym`) into `segment`, unless `options.nodict` is set.
 *
 * @returns the same `segment` that was passed in
 */
export function useDefaultBlacklistDict(segment: Segment, options: IUseDefaultOptionsDicts = {})
{
	if (options.nodict)
	{
		return segment
	}

	segment
		.loadBlacklistDict('blacklist')
		.loadBlacklistOptimizerDict('blacklist.name')
		.loadBlacklistSynonymDict('blacklist.synonym')
	;

	return segment
}
3 | */ 4 | import { Segment } from '../Segment'; 5 | /** 6 | * @private 7 | */ 8 | export interface IUseDefaultOptionsDicts { 9 | /** 10 | * 不載入 字典 11 | */ 12 | nodict?: boolean; 13 | /** 14 | * 載入 node-novel 相關字典 15 | */ 16 | nodeNovelMode?: boolean; 17 | } 18 | /** 19 | * @private 20 | */ 21 | export interface IUseDefaultOptionsMods { 22 | all_mod?: boolean; 23 | nomod?: boolean; 24 | } 25 | export interface IUseDefaultOptions extends IUseDefaultOptionsDicts, IUseDefaultOptionsMods { 26 | } 27 | export declare function useDefault(segment: Segment, options?: IUseDefaultOptions): Segment; 28 | -------------------------------------------------------------------------------- /lib/defaults/index.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.useDefault = useDefault; 4 | const mods_1 = require("./mods"); 5 | const dict_1 = require("./dict"); 6 | function useDefault(segment, options = {}) { 7 | // 识别模块 8 | !options.nomod && (0, mods_1.useDefaultMods)(segment, options); 9 | // 字典文件 10 | !options.nodict && (0, dict_1.useDefaultDicts)(segment, options); 11 | return segment; 12 | } 13 | //# sourceMappingURL=index.js.map -------------------------------------------------------------------------------- /lib/defaults/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2019/6/26. 
/**
 * Dictionary-related switches accepted by `useDefault`.
 *
 * @private
 */
export interface IUseDefaultOptionsDicts
{
	/**
	 * Do not load any dictionaries
	 */
	nodict?: boolean,
	/**
	 * Load the node-novel related dictionaries
	 */
	nodeNovelMode?: boolean,
}

/**
 * Module-related switches accepted by `useDefault`.
 *
 * @private
 */
export interface IUseDefaultOptionsMods
{
	/** Forwarded to `useDefaultMods` as part of `options`. */
	all_mod?: boolean,
	/** Skip registering the default modules */
	nomod?: boolean,
}

/**
 * All switches accepted by `useDefault`.
 */
export interface IUseDefaultOptions extends IUseDefaultOptionsDicts, IUseDefaultOptionsMods
{

}

/**
 * Apply the default setup to `segment`: first the default modules
 * (unless `options.nomod`), then the default dictionaries (unless `options.nodict`).
 *
 * @param segment instance to configure
 * @param options forwarded unchanged to `useDefaultMods` and `useDefaultDicts`
 * @returns the same `segment` that was passed in
 */
export function useDefault(segment: Segment, options: IUseDefaultOptions = {})
{
	// recognition modules
	if (!options.nomod)
	{
		useDefaultMods(segment, options);
	}

	// dictionary files
	if (!options.nodict)
	{
		useDefaultDicts(segment, options);
	}

	return segment;
}
/**
 * Register the default module list on a segmenter.
 *
 * When `options.nomod` is truthy nothing is registered. Otherwise the list
 * returned by `getDefaultModList(options.all_mod)` is handed to
 * `segment.use()`.
 *
 * @param segment - object exposing a `use()` method (untyped in this signature)
 * @param options - `nomod` disables registration; `all_mod` is forwarded to `getDefaultModList`
 * @returns the `segment` argument itself
 */
export function useDefaultMods(segment, options: IUseDefaultOptionsMods = {})
{
	if (options.nomod)
	{
		return segment;
	}

	const modList = getDefaultModList(options.all_mod);
	segment.use(modList);

	return segment;
}
3 | */ 4 | import getDefaultModList from './mod'; 5 | import { Segment } from './Segment'; 6 | import { useDefault } from './defaults'; 7 | export { getDefaultModList }; 8 | export { Segment }; 9 | export { useDefault }; 10 | export default Segment; 11 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2018/4/16/016. 4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.useDefault = exports.Segment = exports.getDefaultModList = void 0; 7 | const tslib_1 = require("tslib"); 8 | const mod_1 = tslib_1.__importDefault(require("./mod")); 9 | exports.getDefaultModList = mod_1.default; 10 | const Segment_1 = require("./Segment"); 11 | Object.defineProperty(exports, "Segment", { enumerable: true, get: function () { return Segment_1.Segment; } }); 12 | const defaults_1 = require("./defaults"); 13 | Object.defineProperty(exports, "useDefault", { enumerable: true, get: function () { return defaults_1.useDefault; } }); 14 | exports.default = Segment_1.Segment; 15 | //# sourceMappingURL=index.js.map -------------------------------------------------------------------------------- /lib/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/16/016. 3 | */ 4 | 5 | import getDefaultModList from './mod'; 6 | import { Segment } from './Segment'; 7 | import { useDefault } from './defaults'; 8 | 9 | export { getDefaultModList } 10 | 11 | export { Segment } 12 | 13 | export { useDefault } 14 | 15 | export default Segment; 16 | -------------------------------------------------------------------------------- /lib/loader.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/2/24/024. 
3 | */ 4 | import SegmentDict from 'segment-dict'; 5 | import * as SegmentDictLoader from 'segment-dict/lib/loader/segment'; 6 | import * as SegmentSynonymLoader from '@novel-segment/loaders/segment/synonym'; 7 | export { SegmentDict }; 8 | export { SegmentDictLoader, SegmentSynonymLoader }; 9 | declare const _default: { 10 | SegmentDict: typeof import("segment-dict"); 11 | SegmentDictLoader: typeof SegmentDictLoader; 12 | SegmentSynonymLoader: typeof SegmentSynonymLoader; 13 | }; 14 | export default _default; 15 | -------------------------------------------------------------------------------- /lib/loader.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2018/2/24/024. 4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.SegmentSynonymLoader = exports.SegmentDictLoader = exports.SegmentDict = void 0; 7 | const tslib_1 = require("tslib"); 8 | const segment_dict_1 = tslib_1.__importDefault(require("segment-dict")); 9 | exports.SegmentDict = segment_dict_1.default; 10 | const SegmentDictLoader = tslib_1.__importStar(require("segment-dict/lib/loader/segment")); 11 | exports.SegmentDictLoader = SegmentDictLoader; 12 | const SegmentSynonymLoader = tslib_1.__importStar(require("@novel-segment/loaders/segment/synonym")); 13 | exports.SegmentSynonymLoader = SegmentSynonymLoader; 14 | exports.default = { 15 | SegmentDict: segment_dict_1.default, 16 | SegmentDictLoader, 17 | SegmentSynonymLoader, 18 | }; 19 | //# sourceMappingURL=loader.js.map -------------------------------------------------------------------------------- /lib/loader.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/2/24/024. 
3 | */ 4 | 5 | // @ts-ignore 6 | import * as fs from 'fs'; 7 | import SegmentDict from 'segment-dict'; 8 | import * as SegmentDictLoader from 'segment-dict/lib/loader/segment'; 9 | import * as SegmentSynonymLoader from '@novel-segment/loaders/segment/synonym'; 10 | 11 | export { SegmentDict } 12 | export { SegmentDictLoader, SegmentSynonymLoader } 13 | 14 | export default { 15 | SegmentDict, 16 | SegmentDictLoader, 17 | SegmentSynonymLoader, 18 | }; 19 | -------------------------------------------------------------------------------- /lib/mod/CHS_NAMES.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 中文姓 3 | */ 4 | import { IDICT } from '../Segment'; 5 | export declare namespace _CHS_NAMES { 6 | const FAMILY_NAME_1: string[]; 7 | const FAMILY_NAME_2: string[]; 8 | const DOUBLE_NAME_1: string[]; 9 | const DOUBLE_NAME_2: string[]; 10 | const SINGLE_NAME: string[]; 11 | const SINGLE_NAME_NO_REPEAT: string[]; 12 | const SHARE_NAME: string[]; 13 | function p(a: string[], n: number): IDICT; 14 | } 15 | /** 16 | * 单姓 17 | */ 18 | export declare const FAMILY_NAME_1: IDICT; 19 | /** 20 | * 复姓 21 | */ 22 | export declare const FAMILY_NAME_2: IDICT; 23 | /** 24 | * 双字姓名第一个字 25 | */ 26 | export declare const DOUBLE_NAME_1: IDICT; 27 | /** 28 | * 双字姓名第二个字 29 | */ 30 | export declare const DOUBLE_NAME_2: IDICT; 31 | /** 32 | * 单字姓名 33 | */ 34 | export declare const SINGLE_NAME: IDICT; 35 | /** 36 | * 单字姓名 不重覆 37 | */ 38 | export declare const SINGLE_NAME_NO_REPEAT: IDICT; 39 | declare const _default: typeof import("./CHS_NAMES"); 40 | export default _default; 41 | -------------------------------------------------------------------------------- /lib/mod/COLORS.d.ts: -------------------------------------------------------------------------------- 1 | import { IDICT } from '../Segment'; 2 | export declare namespace _COLORS { 3 | const ZH = "\u8272"; 4 | const COLOR_HAIR: string[]; 5 | const COLOR_WITH_RGB: string[][]; 6 | const 
COLOR_ALL: string[]; 7 | function p(a: string[]): IDICT; 8 | } 9 | export declare const COLOR_HAIR: IDICT; 10 | export declare const COLOR_ALL: IDICT; 11 | declare const _default: typeof import("./COLORS"); 12 | export default _default; 13 | -------------------------------------------------------------------------------- /lib/mod/Optimizer.d.ts: -------------------------------------------------------------------------------- 1 | import { IWord, Segment } from '../Segment'; 2 | import { ISubSModule, SModule, SubSModule } from './mod'; 3 | export type ISubOptimizer = ISubSModule & { 4 | type: 'optimizer'; 5 | doOptimize(words: IWord[], ...argv: any[]): IWord[]; 6 | }; 7 | export type ISubOptimizerCreate = { 8 | (segment: Segment, ...argv: any[]): T & R; 9 | }; 10 | export declare class SubSModuleOptimizer extends SubSModule implements ISubOptimizer { 11 | static readonly type = "optimizer"; 12 | readonly type = "optimizer"; 13 | doOptimize(words: IWord[], ...argv: any[]): IWord[]; 14 | init(segment: Segment, ...argv: any[]): this; 15 | static init(segment: Segment, ...argv: any[]): T; 16 | } 17 | /** 18 | * 分词模块管理器 19 | */ 20 | export declare class Optimizer extends SModule { 21 | type: string; 22 | /** 23 | * 对一段文本进行分词 24 | * 25 | * @param {array} words 单词数组 26 | * @param {array} modules 分词模块数组 27 | * @return {array} 28 | */ 29 | doOptimize(words: IWord[], mods: ISubOptimizer[], ...argv: any[]): IWord[]; 30 | } 31 | export default Optimizer; 32 | -------------------------------------------------------------------------------- /lib/mod/Optimizer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 优化模块管理器 3 | * 4 | * @author 老雷 5 | */ 6 | 'use strict'; 7 | Object.defineProperty(exports, "__esModule", { value: true }); 8 | exports.Optimizer = exports.SubSModuleOptimizer = void 0; 9 | const tslib_1 = require("tslib"); 10 | const core_decorators_1 = require("core-decorators"); 11 | const mod_1 = require("./mod"); 12 | let 
SubSModuleOptimizer = class SubSModuleOptimizer extends mod_1.SubSModule { 13 | constructor() { 14 | super(...arguments); 15 | this.type = 'optimizer'; 16 | } 17 | doOptimize(words, ...argv) { 18 | throw new Error(); 19 | } 20 | init(segment, ...argv) { 21 | super.init(segment, ...argv); 22 | return this; 23 | } 24 | static init(segment, ...argv) { 25 | // @ts-ignore 26 | return super.init(segment, ...argv); 27 | } 28 | }; 29 | exports.SubSModuleOptimizer = SubSModuleOptimizer; 30 | SubSModuleOptimizer.type = 'optimizer'; 31 | exports.SubSModuleOptimizer = SubSModuleOptimizer = tslib_1.__decorate([ 32 | core_decorators_1.autobind 33 | // @ts-ignore 34 | ], SubSModuleOptimizer); 35 | /** 36 | * 分词模块管理器 37 | */ 38 | class Optimizer extends mod_1.SModule { 39 | constructor() { 40 | super(...arguments); 41 | this.type = 'optimizer'; 42 | } 43 | /** 44 | * 对一段文本进行分词 45 | * 46 | * @param {array} words 单词数组 47 | * @param {array} modules 分词模块数组 48 | * @return {array} 49 | */ 50 | doOptimize(words, mods, ...argv) { 51 | return this._doMethod('doOptimize', words, mods, ...argv); 52 | } 53 | } 54 | exports.Optimizer = Optimizer; 55 | exports.default = Optimizer; 56 | //# sourceMappingURL=Optimizer.js.map -------------------------------------------------------------------------------- /lib/mod/Optimizer.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 优化模块管理器 3 | * 4 | * @author 老雷 5 | */ 6 | 'use strict'; 7 | 8 | import { autobind } from 'core-decorators'; 9 | import { IWord, Segment } from '../Segment'; 10 | import { ISubSModule, SModule, SubSModule } from './mod'; 11 | 12 | export type ISubOptimizer = ISubSModule & { 13 | type: 'optimizer', 14 | doOptimize(words: IWord[], ...argv): IWord[], 15 | } 16 | 17 | export type ISubOptimizerCreate = { 18 | (segment: Segment, ...argv): T & R, 19 | }; 20 | 21 | @autobind 22 | // @ts-ignore 23 | export class SubSModuleOptimizer extends SubSModule implements ISubOptimizer 24 | { 25 | 
public static override readonly type = 'optimizer'; 26 | public override readonly type = 'optimizer'; 27 | 28 | public doOptimize(words: IWord[], ...argv): IWord[] 29 | { 30 | throw new Error(); 31 | } 32 | 33 | public override init(segment: Segment, ...argv) 34 | { 35 | super.init(segment, ...argv); 36 | 37 | return this; 38 | } 39 | 40 | public static override init(segment: Segment, ...argv): T 41 | { 42 | // @ts-ignore 43 | return super.init(segment, ...argv); 44 | } 45 | } 46 | 47 | /** 48 | * 分词模块管理器 49 | */ 50 | export class Optimizer extends SModule 51 | { 52 | override type = 'optimizer'; 53 | 54 | /** 55 | * 对一段文本进行分词 56 | * 57 | * @param {array} words 单词数组 58 | * @param {array} modules 分词模块数组 59 | * @return {array} 60 | */ 61 | doOptimize(words: IWord[], mods: ISubOptimizer[], ...argv): IWord[] 62 | { 63 | return this._doMethod('doOptimize', words, mods, ...argv); 64 | } 65 | } 66 | 67 | export default Optimizer; 68 | -------------------------------------------------------------------------------- /lib/mod/Tokenizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 分词模块管理器 3 | * 4 | * @author 老雷 5 | */ 6 | import { IWord, Segment } from '../Segment'; 7 | import { ISubSModule, SModule, SubSModule } from './mod'; 8 | export type ISubTokenizer = ISubSModule & { 9 | type: 'tokenizer'; 10 | split(words: IWord[], ...argv: any[]): IWord[]; 11 | }; 12 | export type ISubTokenizerCreate = { 13 | (...argv: Parameters): T & R; 14 | (segment: Segment, ...argv: any[]): T & R; 15 | }; 16 | export declare abstract class SubSModuleTokenizer extends SubSModule implements ISubTokenizer { 17 | static readonly type = "tokenizer"; 18 | readonly type = "tokenizer"; 19 | abstract split(words: IWord[], ...argv: any[]): IWord[]; 20 | init(segment: Segment, ...argv: any[]): this; 21 | static init(segment: Segment, ...argv: any[]): T; 22 | /** 23 | * 仅对未识别的词进行匹配 24 | * 不包含 p 為 0 25 | */ 26 | protected _splitUnset(words: T[], fn: 
(text: string, ...argv: any[]) => U[]): U[]; 27 | /** 28 | * 仅对未识别的词进行匹配 29 | * 包含已存在 但 p 為 0 30 | */ 31 | protected _splitUnknow(words: T[], fn: (text: string, ...argv: any[]) => U[]): U[]; 32 | } 33 | /** 34 | * 分词模块管理器 35 | */ 36 | export declare class Tokenizer extends SModule { 37 | type: string; 38 | /** 39 | * 对一段文本进行分词 40 | * 41 | * @param {string} text 文本 42 | * @param {array} modules 分词模块数组 43 | * @return {array} 44 | */ 45 | split(text: string, mods: ISubTokenizer[], ...argv: any[]): IWord[]; 46 | } 47 | export default Tokenizer; 48 | -------------------------------------------------------------------------------- /lib/mod/const.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/19/019. 3 | */ 4 | import { IDICT } from '../Segment'; 5 | /** 6 | * 日期时间常见组合 7 | */ 8 | export declare let _DATETIME: string[]; 9 | export declare const DATETIME: IDICT; 10 | declare const _default: typeof import("./const"); 11 | export default _default; 12 | -------------------------------------------------------------------------------- /lib/mod/const.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2018/4/19/019. 
4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.DATETIME = exports._DATETIME = void 0; 7 | const list_1 = require("@lazy-cjk/zh-table-list/list"); 8 | /** 9 | * 日期时间常见组合 10 | */ 11 | exports._DATETIME = [ 12 | '世纪', '年', '年份', '年度', '月', '月份', '月度', '日', '号', 13 | '时', '点', '点钟', '分', '分钟', '秒', '毫秒' 14 | ]; 15 | exports.DATETIME = (0, list_1.arrCjk)(exports._DATETIME) 16 | .reduce(function (data, v) { 17 | data[v] = v.length; 18 | return data; 19 | }, {}); 20 | exports.default = exports; 21 | //# sourceMappingURL=const.js.map -------------------------------------------------------------------------------- /lib/mod/const.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/19/019. 3 | */ 4 | 5 | import { IDICT } from '../Segment'; 6 | import { arrCjk as arr_cjk } from '@lazy-cjk/zh-table-list/list'; 7 | 8 | /** 9 | * 日期时间常见组合 10 | */ 11 | export let _DATETIME = [ 12 | '世纪', '年', '年份', '年度', '月', '月份', '月度', '日', '号', 13 | '时', '点', '点钟', '分', '分钟', '秒', '毫秒' 14 | ]; 15 | 16 | export const DATETIME: IDICT = arr_cjk(_DATETIME) 17 | .reduce(function (data, v) 18 | { 19 | data[v] = v.length; 20 | 21 | return data; 22 | }, {}) 23 | ; 24 | 25 | export default exports as typeof import('./const'); 26 | -------------------------------------------------------------------------------- /lib/mod/data/STOPWORD.d.ts: -------------------------------------------------------------------------------- 1 | export declare namespace NS_STOPWORD { 2 | const _TABLE: string; 3 | const _STOPWORD: string[], STOPWORD: { 4 | [key: string]: number; 5 | }, STOPWORD2: { 6 | [key: number]: { 7 | [key: string]: number; 8 | }; 9 | }; 10 | function parseStopWord(_STOPWORD: string | string[]): { 11 | _STOPWORD: string[]; 12 | STOPWORD: { 13 | [key: string]: number; 14 | }; 15 | STOPWORD2: { 16 | [key: number]: { 17 | [key: string]: number; 18 | }; 19 | }; 20 | }; 21 | } 22 | export declare 
const _STOPWORD: string[], STOPWORD: { 23 | [key: string]: number; 24 | }, STOPWORD2: { 25 | [key: number]: { 26 | [key: string]: number; 27 | }; 28 | }; 29 | declare const _default: typeof import("./STOPWORD"); 30 | export default _default; 31 | -------------------------------------------------------------------------------- /lib/mod/data/STOPWORD.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.STOPWORD2 = exports.STOPWORD = exports._STOPWORD = exports.NS_STOPWORD = void 0; 4 | const array_hyper_unique_1 = require("array-hyper-unique"); 5 | var NS_STOPWORD; 6 | (function (NS_STOPWORD) { 7 | var _a; 8 | NS_STOPWORD._TABLE = [ 9 | ' ,.;+-|/\\\'":?<>[]{}=!@#$%^&*()~`' + 10 | '。,、':∶;?‘’“”〝〞ˆˇ﹕︰﹔﹖﹑·¨….¸;!´?!~—ˉ|‖"〃`@﹫¡¿﹏﹋﹌︴々﹟#﹩$﹠&﹪%*﹡﹢﹦' + 11 | '﹤‐ ̄¯―﹨ˆ˜﹍﹎+=<­__-\ˇ~﹉﹊()〈〉‹›﹛﹜『』〖〗[]《》〔〕{}「」【】︵︷︿︹︽_﹁﹃︻︶︸' + 12 | '﹀︺︾ˉ﹂﹄︼+-×÷﹢﹣±/=≈≡≠∧∨∑∏∪∩∈⊙⌒⊥∥∠∽≌<>≤≥≮≯∧∨√﹙﹚[]﹛﹜∫∮∝∞⊙∏' + 13 | '┌┬┐┏┳┓╒╤╕─│├┼┤┣╋┫╞╪╡━┃└┴┘┗┻┛╘╧╛┄┆┅┇╭─╮┏━┓╔╦╗┈┊│╳│┃┃╠╬╣┉┋╰─╯┗━┛' + 14 | '╚╩╝╲╱┞┟┠┡┢┦┧┨┩┪╉╊┭┮┯┰┱┲┵┶┷┸╇╈┹┺┽┾┿╀╁╂╃╄╅╆' + 15 | '○◇□△▽☆●◆■▲▼★♠♥♦♣☼☺◘♀√☻◙♂×▁▂▃▄▅▆▇█⊙◎۞卍卐╱╲▁▏↖↗↑←↔◤◥╲╱▔▕↙↘↓→↕◣◢∷▒░℡™', 16 | '.・ ※', 17 | '⋯', 18 | /** 19 | * 丶並非標點符號 而為部首 但有的人會用這個作為 標點符號使用 20 | */ 21 | '丶', 22 | ].join(''); 23 | _a = parseStopWord(NS_STOPWORD._TABLE), NS_STOPWORD._STOPWORD = _a._STOPWORD, NS_STOPWORD.STOPWORD = _a.STOPWORD, NS_STOPWORD.STOPWORD2 = _a.STOPWORD2; 24 | function parseStopWord(_STOPWORD) { 25 | var _a; 26 | if (typeof _STOPWORD === 'string') { 27 | _STOPWORD = _STOPWORD.split(''); 28 | //_STOPWORD = UString.split(_STOPWORD, ''); 29 | } 30 | else if (!Array.isArray(_STOPWORD)) { 31 | throw new TypeError(`table must is string or string[]`); 32 | } 33 | _STOPWORD = (0, array_hyper_unique_1.array_unique)(_STOPWORD); 34 | let STOPWORD = {}; 35 | let STOPWORD2 = {}; 36 | for (const _STOPWORDItem of _STOPWORD) { 37 | if (_STOPWORDItem === '') 38 | continue; 39 | let len = 
/**
 * Build the stop-word lookup tables from a table of stop words.
 *
 * @param _STOPWORD - either a string, in which case every character
 *   (Unicode code point) becomes one stop word, or an array of stop words
 * @returns an object with
 *   - `_STOPWORD`: the de-duplicated word list, in first-seen order
 *     (an empty-string entry of an array input is kept here),
 *   - `STOPWORD`: map of word -> `word.length`,
 *   - `STOPWORD2`: map of `word.length` -> (map of word -> `word.length`).
 *   Empty strings are never added to `STOPWORD` / `STOPWORD2`.
 * @throws TypeError when the input is neither a string nor an array
 */
export function parseStopWord(_STOPWORD: string | string[])
{
	let list: string[];

	if (typeof _STOPWORD === 'string')
	{
		// Split by code point, not by UTF-16 code unit: `split('')` would cut
		// characters outside the BMP into two lone surrogate halves.
		list = Array.from(_STOPWORD);
	}
	else if (Array.isArray(_STOPWORD))
	{
		list = _STOPWORD;
	}
	else
	{
		throw new TypeError(`table must is string or string[]`)
	}

	// A Set keeps first-seen order and, for strings, removes exactly the
	// duplicates; a new array is produced so the caller's input is not mutated.
	const unique = [...new Set(list)];

	const STOPWORD: { [key: string]: number } = {};
	const STOPWORD2: { [key: number]: typeof STOPWORD } = {};

	for (const word of unique)
	{
		if (word === '') continue;

		const len = word.length;

		STOPWORD[word] = len;
		STOPWORD2[len] = STOPWORD2[len] ?? {};
		STOPWORD2[len][word] = len;
	}

	return {
		_STOPWORD: unique,
		STOPWORD,
		STOPWORD2,
	}
}
| /** 45 | * 部首 46 | */ 47 | /** 48 | * 邮箱地址识别 49 | */ 50 | EmailOptimizer = "EmailOptimizer", 51 | /** 52 | * 人名识别优化 53 | */ 54 | ChsNameOptimizer = "ChsNameOptimizer", 55 | /** 56 | * 词典识别优化 57 | */ 58 | DictOptimizer = "DictOptimizer", 59 | /** 60 | * 日期时间识别优化 61 | */ 62 | DatetimeOptimizer = "DatetimeOptimizer", 63 | /** 64 | * 合併外文與中文的詞 65 | * 例如 T恤 66 | */ 67 | ForeignOptimizer = "ForeignOptimizer", 68 | /** 69 | * 自動處理 `里|裏|后` 70 | */ 71 | ZhtSynonymOptimizer = "ZhtSynonymOptimizer", 72 | AdjectiveOptimizer = "AdjectiveOptimizer" 73 | } 74 | /** 75 | * 不包含在預設模組列表內 需要手動指定 76 | */ 77 | export declare enum ENUM_SUBMODS_OTHER { 78 | /** 79 | * 单字切分模块 80 | */ 81 | SingleTokenizer = "SingleTokenizer" 82 | } 83 | export type ENUM_SUBMODS_NAME = ENUM_SUBMODS | ENUM_SUBMODS_OTHER; 84 | export declare const LIST_SUBMODS_NOT_DEF: ENUM_SUBMODS[]; 85 | export declare const SUBMODS_LIST: import("ts-enum-util").EnumWrapper; 86 | export declare const SUBMODS_OTHER_LIST: import("ts-enum-util").EnumWrapper; 87 | /** 88 | * 取得列表並且保持 ENUM 順序 89 | * @param {boolean} all 90 | * @returns {ENUM_SUBMODS[]} 91 | */ 92 | export declare function getDefault(all?: boolean): ENUM_SUBMODS[]; 93 | export default getDefault; 94 | -------------------------------------------------------------------------------- /lib/mod/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/16/016. 
3 | */ 4 | 5 | import { $enum } from "ts-enum-util"; 6 | import { ISubOptimizer, ISubOptimizerCreate, Optimizer, SubSModuleOptimizer } from './Optimizer'; 7 | import { ISubTokenizer, ISubTokenizerCreate, SubSModuleTokenizer, Tokenizer } from './Tokenizer'; 8 | import { ISubSModule, ISubSModuleCreate, ISubSModuleMethod, SubSModule } from './mod'; 9 | 10 | export { Optimizer, SubSModuleOptimizer, ISubOptimizer, ISubOptimizerCreate } 11 | export { Tokenizer, SubSModuleTokenizer, ISubTokenizer, ISubTokenizerCreate } 12 | export { SubSModule, ISubSModule, ISubSModuleCreate, ISubSModuleMethod } 13 | 14 | /** 15 | * 识别模块 16 | * 强制分割类单词识别 17 | */ 18 | export enum ENUM_SUBMODS 19 | { 20 | /** 21 | * URL识别 22 | */ 23 | URLTokenizer = 'URLTokenizer', 24 | /** 25 | * 通配符,必须在标点符号识别之前 26 | */ 27 | WildcardTokenizer = 'WildcardTokenizer', 28 | /** 29 | * 标点符号识别 30 | */ 31 | PunctuationTokenizer = 'PunctuationTokenizer', 32 | /** 33 | * 外文字符、数字识别,必须在标点符号识别之后 34 | */ 35 | ForeignTokenizer = 'ForeignTokenizer', 36 | 37 | // 中文单词识别 38 | 39 | /** 40 | * 词典识别 41 | */ 42 | DictTokenizer = 'DictTokenizer', 43 | /** 44 | * 人名识别,建议在词典识别之后 45 | */ 46 | ChsNameTokenizer = 'ChsNameTokenizer', 47 | 48 | JpSimpleTokenizer = 'JpSimpleTokenizer', 49 | 50 | /** 51 | * 注音 52 | */ 53 | ZhuyinTokenizer = 'ZhuyinTokenizer', 54 | 55 | /** 56 | * 部首 57 | */ 58 | //ZhRadicalTokenizer = 'ZhRadicalTokenizer', 59 | 60 | // @todo 优化模块 61 | 62 | /** 63 | * 邮箱地址识别 64 | */ 65 | EmailOptimizer = 'EmailOptimizer', 66 | /** 67 | * 人名识别优化 68 | */ 69 | ChsNameOptimizer = 'ChsNameOptimizer', 70 | /** 71 | * 词典识别优化 72 | */ 73 | DictOptimizer = 'DictOptimizer', 74 | /** 75 | * 日期时间识别优化 76 | */ 77 | DatetimeOptimizer = 'DatetimeOptimizer', 78 | 79 | /** 80 | * 合併外文與中文的詞 81 | * 例如 T恤 82 | */ 83 | ForeignOptimizer = 'ForeignOptimizer', 84 | 85 | /** 86 | * 自動處理 `里|裏|后` 87 | */ 88 | ZhtSynonymOptimizer = 'ZhtSynonymOptimizer', 89 | 90 | AdjectiveOptimizer = 'AdjectiveOptimizer', 91 | } 92 | 93 | /** 94 | * 不包含在預設模組列表內 
/**
 * Get the default sub-module list, keeping the ENUM declaration order.
 *
 * Every key of `ENUM_SUBMODS` that is also reported by
 * `SUBMODS_LIST.getKeys()` is mapped to its enum value. Values listed in
 * `LIST_SUBMODS_NOT_DEF` are left out unless `all` is truthy.
 *
 * @param {boolean} all - when truthy, also include the entries of `LIST_SUBMODS_NOT_DEF`
 * @returns {ENUM_SUBMODS[]}
 */
export function getDefault(all?: boolean): ENUM_SUBMODS[]
{
	const knownKeys = new Set<string>(SUBMODS_LIST.getKeys());

	return (Object.keys(ENUM_SUBMODS) as (keyof typeof ENUM_SUBMODS)[])
		.filter((key) => knownKeys.has(key))
		// Return enum VALUES (what the signature promises) and compare values
		// with values; the old code pushed keys, which only worked because
		// every key currently equals its value.
		.map((key) => ENUM_SUBMODS[key])
		.filter((mod) => all || !LIST_SUBMODS_NOT_DEF.includes(mod))
	;
}
3 | */ 4 | import { IDICT_BLACKLIST, IWord, Segment } from '../Segment'; 5 | import { IWordDebug, IWordDebugInfo } from '../util/index'; 6 | import { ENUM_SUBMODS_NAME } from './index'; 7 | import { POSTAG } from '@novel-segment/postag/lib/postag/ids'; 8 | export type ISModuleType = 'optimizer' | 'tokenizer' | string; 9 | export declare class SModule implements ISModule { 10 | type?: ISModuleType; 11 | segment: Segment; 12 | /** 13 | * @param {Segment} segment 分词接口 14 | */ 15 | constructor(segment: Segment); 16 | protected _doMethod(fn: string, target: S[], mods: T[], ...argv: any[]): S[]; 17 | } 18 | export declare class SubSModule implements ISubSModule { 19 | static type: ISModuleType; 20 | type: ISModuleType; 21 | segment: Segment; 22 | priority?: number; 23 | inited?: boolean; 24 | static NAME: string; 25 | name: string; 26 | protected _TABLE?: any; 27 | protected _POSTAG?: typeof POSTAG; 28 | protected _BLACKLIST?: IDICT_BLACKLIST; 29 | constructor(type?: ISModuleType, segment?: Segment, ...argv: any[]); 30 | static init(segment: Segment, ...argv: any[]): T; 31 | protected static _init(libThis: IModuleStatic, segment: Segment, ...argv: any[]): T; 32 | init(segment: Segment, ...argv: any[]): this; 33 | protected _cache(...argv: any[]): void; 34 | /** 35 | * 回傳最簡版的 IWord { w, p, f, s } 36 | */ 37 | protected createRawToken(data: T, ow?: Partial, attr?: U & IWordDebugInfo): T; 38 | protected createToken(data: T, skipCheck?: boolean, attr?: U & IWordDebugInfo): T; 39 | protected sliceToken(words: T[], pos: number, len: number, data: T, skipCheck?: boolean, attr?: U & IWordDebugInfo): T[]; 40 | protected debugToken(data: T, attr?: U & IWordDebugInfo, returnToken?: true, ...argv: any[]): T; 41 | } 42 | export interface ISubSModuleMethod { 43 | (words: T[], ...argv: any[]): U[]; 44 | } 45 | export interface ISubSModuleCreate { 46 | (segment: Segment, ...argv: any[]): T & R; 47 | } 48 | export interface ISModule { 49 | type?: ISModuleType; 50 | segment: Segment; 51 | 
} 52 | export interface IModuleStatic { 53 | type: ISModuleType; 54 | new (type?: ISModuleType, segment?: Segment, ...argv: any[]): T; 55 | init(segment: Segment, ...argv: any[]): T; 56 | } 57 | export interface ISubSModule { 58 | type: ISModuleType; 59 | segment: Segment; 60 | name?: ENUM_SUBMODS_NAME | string; 61 | priority?: number; 62 | init(segment: Segment, ...argv: any[]): ISubSModule; 63 | } 64 | declare const _default: typeof import("./mod"); 65 | export default _default; 66 | -------------------------------------------------------------------------------- /lib/mod/mod.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2018/2/21/021. 4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.SubSModule = exports.SModule = void 0; 7 | const debug_1 = require("../util/debug"); 8 | class SModule { 9 | /** 10 | * @param {Segment} segment 分词接口 11 | */ 12 | constructor(segment) { 13 | this.segment = segment; 14 | } 15 | _doMethod(fn, target, mods, ...argv) { 16 | mods.forEach(function (mod) { 17 | // @ts-ignore 18 | if (typeof mod._cache === 'function') { 19 | // @ts-ignore 20 | mod._cache(); 21 | } 22 | target = mod[fn](target, ...argv); 23 | }); 24 | return target; 25 | } 26 | } 27 | exports.SModule = SModule; 28 | class SubSModule { 29 | constructor(type, segment, ...argv) { 30 | if (type) { 31 | this.type = type; 32 | } 33 | if (!this.type) { 34 | throw new Error(); 35 | } 36 | if (segment) { 37 | this.init(segment, ...argv); 38 | this.inited = true; 39 | } 40 | } 41 | static init(segment, ...argv) { 42 | // @ts-ignore 43 | return this._init(this, segment, ...argv); 44 | } 45 | static _init(libThis, segment, ...argv) { 46 | if (!libThis.type) { 47 | throw new Error(); 48 | } 49 | let mod = new libThis(libThis.type, segment, ...argv); 50 | if (!mod.inited) { 51 | mod.init(segment, ...argv); 52 | mod.inited = true; 53 | } 54 | // @ts-ignore 55 | 
return mod; 56 | } 57 | init(segment, ...argv) { 58 | this.segment = segment; 59 | this.inited = true; 60 | //this._cache(); 61 | return this; 62 | } 63 | _cache(...argv) { 64 | this._POSTAG = this.segment.POSTAG; 65 | } 66 | /** 67 | * 回傳最簡版的 IWord { w, p, f, s } 68 | */ 69 | createRawToken(data, ow, attr) { 70 | var _a, _b, _c, _d; 71 | // @ts-ignore 72 | ow = ow || {}; 73 | let nw = { 74 | w: (_a = data.w) !== null && _a !== void 0 ? _a : ow.w, 75 | p: (_b = data.p) !== null && _b !== void 0 ? _b : ow.p, 76 | f: (_c = data.f) !== null && _c !== void 0 ? _c : ow.f, 77 | s: (_d = data.s) !== null && _d !== void 0 ? _d : ow.s, 78 | }; 79 | if (attr) { 80 | this.debugToken(nw, attr); 81 | } 82 | return nw; 83 | } 84 | createToken(data, skipCheck, attr) { 85 | let TABLE = this._TABLE; 86 | if (!skipCheck && TABLE && !(data.w in TABLE)) { 87 | this.debugToken(data, { 88 | autoCreate: true, 89 | }); 90 | } 91 | // 自動將模組名稱血入 debug 資訊 92 | if (this.name) { 93 | attr = Object.assign(attr || {}); 94 | if (!(this.name in attr)) { 95 | // @ts-ignore 96 | attr[this.name] = true; 97 | } 98 | } 99 | if (attr) { 100 | this.debugToken(data, attr); 101 | } 102 | return data; 103 | } 104 | sliceToken(words, pos, len, data, skipCheck, attr) { 105 | words.splice(pos, len, this.createToken(data, skipCheck, attr)); 106 | return words; 107 | } 108 | debugToken(data, attr, returnToken, ...argv) { 109 | return (0, debug_1.debugToken)(data, attr, returnToken, ...argv); 110 | } 111 | } 112 | exports.SubSModule = SubSModule; 113 | exports.default = exports; 114 | //# sourceMappingURL=mod.js.map -------------------------------------------------------------------------------- /lib/segment/defaults.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2019/6/26. 
3 | */ 4 | import { IOptionsDoSegment } from './types'; 5 | export declare const defaultOptionsDoSegment: IOptionsDoSegment; 6 | -------------------------------------------------------------------------------- /lib/segment/defaults.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.defaultOptionsDoSegment = void 0; 4 | exports.defaultOptionsDoSegment = {}; 5 | //# sourceMappingURL=defaults.js.map -------------------------------------------------------------------------------- /lib/segment/defaults.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2019/6/26. 3 | */ 4 | import { IOptionsDoSegment } from './types'; 5 | 6 | export const defaultOptionsDoSegment: IOptionsDoSegment = {}; 7 | -------------------------------------------------------------------------------- /lib/segment/index.d.ts: -------------------------------------------------------------------------------- 1 | import SegmentCore from './core'; 2 | export declare class SegmentBase extends SegmentCore { 3 | } 4 | export default SegmentBase; 5 | -------------------------------------------------------------------------------- /lib/segment/index.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.SegmentBase = void 0; 4 | const tslib_1 = require("tslib"); 5 | const core_1 = tslib_1.__importDefault(require("./core")); 6 | class SegmentBase extends core_1.default { 7 | } 8 | exports.SegmentBase = SegmentBase; 9 | exports.default = SegmentBase; 10 | //# sourceMappingURL=index.js.map -------------------------------------------------------------------------------- /lib/segment/index.ts: -------------------------------------------------------------------------------- 1 | import SegmentCore 
from './core'; 2 | 3 | export class SegmentBase extends SegmentCore 4 | { 5 | 6 | } 7 | 8 | export default SegmentBase 9 | -------------------------------------------------------------------------------- /lib/segment/method.d.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluelovers/node-segment/f5688c9dd2c31a16d7eefd2275aa9b206a5f7134/lib/segment/method.d.ts -------------------------------------------------------------------------------- /lib/segment/method.js: -------------------------------------------------------------------------------- 1 | //# sourceMappingURL=method.js.map -------------------------------------------------------------------------------- /lib/segment/method.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluelovers/node-segment/f5688c9dd2c31a16d7eefd2275aa9b206a5f7134/lib/segment/method.ts -------------------------------------------------------------------------------- /lib/segment/methods/_get_text.d.ts: -------------------------------------------------------------------------------- 1 | export declare function _get_text(text: string | Buffer): string; 2 | -------------------------------------------------------------------------------- /lib/segment/methods/_get_text.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports._get_text = _get_text; 4 | const crlf_normalize_1 = require("crlf-normalize"); 5 | function _get_text(text) { 6 | try { 7 | if (Buffer.isBuffer(text)) { 8 | text = text.toString(); 9 | } 10 | } 11 | catch (e) { } 12 | finally { 13 | if (typeof text !== 'string') { 14 | throw new TypeError(`text must is string or Buffer`); 15 | } 16 | text = (0, crlf_normalize_1.crlf)(text); 17 | } 18 | return text; 19 | } 20 | //# sourceMappingURL=_get_text.js.map 
/**
 * Normalise segmenter input into a string.
 *
 * A `Buffer` is decoded with `toString()` (no explicit encoding argument);
 * the resulting, or already-string, text is then handed to `crlf()` from
 * `crlf-normalize` and that result is returned.
 *
 * The previous version performed the type check inside a `finally` block
 * behind an empty `catch`; a `throw` in `finally` replaces whatever exception
 * was in flight and the empty `catch` hid it. The checks now run in plain
 * sequence with the same outcome for every string / Buffer / other input.
 *
 * @param text - raw input, either a string or a Buffer
 * @returns the text after `crlf()` has been applied
 * @throws TypeError when `text` is neither a string nor a Buffer
 */
export function _get_text(text: string | Buffer): string
{
	if (Buffer.isBuffer(text))
	{
		text = text.toString();
	}

	if (typeof text !== 'string')
	{
		// wording kept verbatim: it is runtime-visible and callers may match on it
		throw new TypeError(`text must is string or Buffer`)
	}

	return crlf(text);
}
exports.convertSynonym = convertSynonym; 4 | const tslib_1 = require("tslib"); 5 | const core_1 = tslib_1.__importDefault(require("deepmerge-plus/core")); 6 | const debug_1 = require("../../util/debug"); 7 | function convertSynonym(ret, options) { 8 | const { showcount, POSTAG, DICT_SYNONYM, DICT_TABLE } = options; 9 | let total_count = 0; 10 | //const RAW = Symbol.for('RAW'); 11 | // 转换同义词 12 | function _convertSynonym(list) { 13 | let count = 0; 14 | list = list.reduce(function (a, item) { 15 | let bool; 16 | let w = item.w; 17 | let nw; 18 | let debug = (0, debug_1.debugToken)(item); 19 | if (w in DICT_SYNONYM) { 20 | bool = true; 21 | nw = DICT_SYNONYM[w]; 22 | } 23 | else if (debug.autoCreate && !debug.convertSynonym && !item.ow && item.m && item.m.length) { 24 | nw = item.m.reduce(function (a, b) { 25 | if (typeof b === 'string') { 26 | a.push(b); 27 | } 28 | else if (b.w in DICT_SYNONYM) { 29 | a.push(DICT_SYNONYM[b.w]); 30 | bool = true; 31 | } 32 | else { 33 | a.push(b.w); 34 | } 35 | return a; 36 | }, []).join(''); 37 | } 38 | if (bool) { 39 | count++; 40 | total_count++; 41 | //return { w: DICT_SYNONYM[item.w], p: item.p }; 42 | let p = item.p; 43 | if (w in DICT_TABLE) { 44 | p = DICT_TABLE[w].p || p; 45 | } 46 | if (p & POSTAG.BAD) { 47 | p = p ^ POSTAG.BAD; 48 | } 49 | let item_new = (0, debug_1.debugToken)({ 50 | ...item, 51 | w: nw, 52 | ow: w, 53 | p, 54 | op: item.p, 55 | //[RAW]: item, 56 | //source: item, 57 | }, { 58 | convertSynonym: true, 59 | //_source: item, 60 | /** 61 | * JSON.stringify 62 | * avoid TypeError: Converting circular structure to JSON 63 | */ 64 | _source: (0, core_1.default)({}, item), 65 | }, true); 66 | a.push(item_new); 67 | } 68 | else { 69 | a.push(item); 70 | } 71 | debug = undefined; 72 | return a; 73 | }, []); 74 | return { count: count, list: list }; 75 | } 76 | let result; 77 | do { 78 | result = _convertSynonym(ret); 79 | ret = result.list; 80 | result.list = undefined; 81 | } while (result.count > 0); 82 | result 
= undefined; 83 | if (showcount) { 84 | return { count: total_count, list: ret }; 85 | } 86 | return ret; 87 | } 88 | //# sourceMappingURL=convertSynonym.js.map -------------------------------------------------------------------------------- /lib/segment/methods/convertSynonym.ts: -------------------------------------------------------------------------------- 1 | import deepmerge from 'deepmerge-plus/core'; 2 | import { debugToken, IWordDebug } from '../../util/debug'; 3 | import { IDICT, IDICT_SYNONYM } from '../types'; 4 | import { ITSOverwrite } from 'ts-type'; 5 | import { POSTAG } from '@novel-segment/postag/lib/postag/ids'; 6 | import { IWord } from '@novel-segment/types'; 7 | 8 | interface IOptions 9 | { 10 | /** 11 | * for debug 12 | */ 13 | showcount?: boolean, 14 | DICT_SYNONYM: IDICT_SYNONYM, 15 | DICT_TABLE: IDICT, 16 | POSTAG: typeof POSTAG, 17 | } 18 | 19 | export interface IConvertSynonymWithShowcount 20 | { 21 | count: number, 22 | list: IWordDebug[], 23 | } 24 | 25 | /** 26 | * 转换同义词 27 | */ 28 | export function convertSynonym(ret: IWordDebug[], options: ITSOverwrite): { 31 | count: number, 32 | list: IWordDebug[], 33 | } 34 | /** 35 | * 转换同义词 36 | */ 37 | export function convertSynonym(ret: IWordDebug[], options?: IOptions): IWordDebug[] 38 | export function convertSynonym(ret: IWordDebug[], options: IOptions) 39 | { 40 | const { showcount, POSTAG, DICT_SYNONYM, DICT_TABLE } = options; 41 | 42 | let total_count = 0; 43 | 44 | //const RAW = Symbol.for('RAW'); 45 | 46 | // 转换同义词 47 | function _convertSynonym(list: IWordDebug[]) 48 | { 49 | let count = 0; 50 | list = list.reduce(function (a, item: IWordDebug) 51 | { 52 | let bool: boolean; 53 | let w = item.w; 54 | let nw: string; 55 | 56 | let debug = debugToken(item); 57 | 58 | if (w in DICT_SYNONYM) 59 | { 60 | bool = true; 61 | nw = DICT_SYNONYM[w]; 62 | } 63 | else if (debug.autoCreate && !debug.convertSynonym && !item.ow && item.m && item.m.length) 64 | { 65 | nw = item.m.reduce(function (a: 
string[], b) 66 | { 67 | if (typeof b === 'string') 68 | { 69 | a.push(b); 70 | } 71 | else if (b.w in DICT_SYNONYM) 72 | { 73 | a.push(DICT_SYNONYM[b.w]); 74 | bool = true; 75 | } 76 | else 77 | { 78 | a.push(b.w); 79 | } 80 | 81 | return a; 82 | }, []).join(''); 83 | } 84 | 85 | if (bool) 86 | { 87 | count++; 88 | total_count++; 89 | //return { w: DICT_SYNONYM[item.w], p: item.p }; 90 | 91 | let p = item.p; 92 | 93 | if (w in DICT_TABLE) 94 | { 95 | p = DICT_TABLE[w].p || p; 96 | } 97 | 98 | if (p & POSTAG.BAD) 99 | { 100 | p = p ^ POSTAG.BAD; 101 | } 102 | 103 | let item_new = debugToken({ 104 | ...item, 105 | 106 | w: nw, 107 | ow: w, 108 | p, 109 | op: item.p, 110 | 111 | //[RAW]: item, 112 | 113 | //source: item, 114 | }, { 115 | convertSynonym: true, 116 | //_source: item, 117 | 118 | /** 119 | * JSON.stringify 120 | * avoid TypeError: Converting circular structure to JSON 121 | */ 122 | _source: deepmerge({}, item) as IWordDebug, 123 | 124 | }, true); 125 | 126 | a.push(item_new); 127 | } 128 | else 129 | { 130 | a.push(item); 131 | } 132 | 133 | debug = undefined; 134 | 135 | return a; 136 | }, [] as IWordDebug[]); 137 | return { count: count, list: list } as IConvertSynonymWithShowcount; 138 | } 139 | 140 | let result: IConvertSynonymWithShowcount; 141 | do 142 | { 143 | result = _convertSynonym(ret); 144 | ret = result.list; 145 | 146 | result.list = undefined; 147 | } 148 | while (result.count > 0); 149 | 150 | result = undefined; 151 | 152 | if (showcount) 153 | { 154 | return { count: total_count, list: ret }; 155 | } 156 | 157 | return ret; 158 | } 159 | -------------------------------------------------------------------------------- /lib/segment/methods/doSegment.d.ts: -------------------------------------------------------------------------------- 1 | import { IWordDebug } from '../../util/debug'; 2 | import { POSTAG } from '@novel-segment/postag/lib/postag/ids'; 3 | export declare function _doSegmentStripPOSTAG(ret: IWordDebug[], postag: POSTAG): 
/**
 * Matches a token made up of whitespace only.
 * Deliberately without the `g` flag: a shared global regex would carry
 * `lastIndex` from one `test()` call to the next.
 */
const _RE_ONLY_SPACE = /^\s+$/;

/**
 * Drop every token whose part-of-speech value `p` is strictly equal to `postag`.
 *
 * @param ret - token list (not mutated)
 * @param postag - the part-of-speech value to remove
 * @returns a new array without the matching tokens
 */
export function _doSegmentStripPOSTAG(ret: IWordDebug[], postag: POSTAG)
{
	return ret.filter((item) => item.p !== postag);
}

/**
 * Remove stop words.
 *
 * A token is removed when its text `w` is an own key of `STOPWORD`.
 * The lookup uses `hasOwnProperty` rather than `in`, so words that happen to
 * be named like inherited object members ("constructor", "toString", ...)
 * are no longer treated as stop words when the table is a plain object.
 *
 * @param ret - token list (not mutated)
 * @param STOPWORD - lookup table keyed by stop word
 * @returns a new array without the stop words
 */
export function _doSegmentStripStopword(ret: IWordDebug[], STOPWORD)
{
	return ret.filter((item) => !Object.prototype.hasOwnProperty.call(STOPWORD, item.w));
}

/**
 * Remove tokens whose text consists solely of whitespace characters.
 *
 * @param ret - token list (not mutated)
 * @returns a new array without the whitespace-only tokens
 */
export function _doSegmentStripSpace(ret: IWordDebug[])
{
	return ret.filter((item) => !_RE_ONLY_SPACE.test(item.w));
}

/**
 * Return only the word text of each token.
 *
 * @param ret - token list
 * @returns the `w` field of every token, in order
 */
export function _doSegmentSimple(ret: IWordDebug[]): string[]
{
	return ret.map((item) => item.w);
}
/**
 * Find the position of a word, or of a part-of-speech value, in a token array.
 *
 * When `s` is a string it is compared against each token's text `w`;
 * otherwise it is compared against each token's part-of-speech `p`.
 * Comparison is strict (`===`).
 *
 * The start index is normalised before scanning: a missing or non-numeric
 * value becomes 0, fractions are truncated and negative values are clamped
 * to 0. Previously a negative or fractional `cur` indexed a non-existent
 * element and threw.
 *
 * @param words - token array to search
 * @param s - word text (string) or part-of-speech value (number) to look for
 * @param cur - index to start from, defaults to 0
 * @returns index of the first match at or after `cur`, or -1 when not found
 */
export function indexOf(words: IWord[], s: string | number, cur?: number, ...argv)
{
	const field = typeof s === 'string' ? 'w' : 'p';
	// `|| 0` turns NaN (undefined / non-numeric input) into 0
	const start = Math.max(0, Math.trunc(Number(cur)) || 0);

	for (let i = start; i < words.length; i++)
	{
		if (words[i][field] === s) return i;
	}

	return -1;
}
/**
 * Decide whether one module is switched off by `disableModules`.
 *
 * A module is disabled when its (truthy) `name` is listed, or when the module
 * object itself is listed. Matching by reference also for named modules keeps
 * this in line with `_isIgnoreModules`, which tests `includes(mod)`.
 */
function _isModuleDisabled(mod: { name?: unknown }, disableModules: readonly unknown[]): boolean
{
	const listedByName = Boolean(mod.name) && disableModules.includes(mod.name);

	return listedByName || disableModules.includes(mod);
}

/**
 * Distribute `list` over the `enable` / `disable` buckets, preserving order.
 */
function _sortModules<M extends { name?: unknown }>(
	list: readonly M[],
	disableModules: readonly unknown[],
	enable: M[],
	disable: M[],
): void
{
	list.forEach((mod) =>
	{
		(_isModuleDisabled(mod, disableModules) ? disable : enable).push(mod);
	});
}

/**
 * Split the registered tokenizer / optimizer modules into enabled and
 * disabled groups according to `options.disableModules`.
 *
 * When `options` or `options.disableModules` is missing, every module is
 * enabled. The returned arrays are always new arrays; `modules` is not mutated.
 *
 * @param modules - registered modules, grouped as `tokenizer` and `optimizer`
 * @param options - per-call options; only `disableModules` is read
 * @returns `{ enable: { tokenizer, optimizer }, disable: { tokenizer, optimizer } }`
 */
export function listModules(modules: Segment["modules"], options: IOptionsDoSegment)
{
	const ret = {
		enable: {
			tokenizer: [] as ISubTokenizer[],
			optimizer: [] as ISubOptimizer[],
		},
		disable: {
			tokenizer: [] as ISubTokenizer[],
			optimizer: [] as ISubOptimizer[],
		},
	};

	const disableModules = options?.disableModules;

	if (disableModules)
	{
		_sortModules(modules.tokenizer, disableModules, ret.enable.tokenizer, ret.disable.tokenizer);
		_sortModules(modules.optimizer, disableModules, ret.enable.optimizer, ret.disable.optimizer);
	}
	else
	{
		ret.enable.tokenizer = modules.tokenizer.slice();
		ret.enable.optimizer = modules.optimizer.slice();
	}

	return ret;
}
/**
 * Split a token array at every occurrence of a word or part-of-speech value.
 *
 * When `s` is a string it is compared against each token's text `w`;
 * otherwise against each token's part-of-speech `p` (strict equality).
 * Every separator token is kept, as a one-element group of its own, between
 * the groups of tokens that surround it. Empty groups are not emitted.
 *
 * Fix: the trailing group used to be dropped when it held exactly one token
 * (`lasti < words.length - 1`), so `[a , b]` lost `b` and a one-token input
 * returned `[]`.
 *
 * NOTE(review): the declared return type is `IWord[]`, but the value built
 * here is an array of token groups (`IWord[][]`). The annotation is left
 * untouched to avoid changing the public signature — confirm and correct it
 * together with the callers.
 *
 * @param words - token array to split (not mutated)
 * @param s - separator: word text (string) or part-of-speech value (number)
 * @returns the groups, in original order
 */
export function split(words: IWord[], s: string | number, ...argv): IWord[]
{
	const field = typeof s === 'string' ? 'w' : 'p';
	// `any[]` only because of the declared return type, see NOTE above
	const groups: any[] = [];
	let start = 0;

	for (let i = 0; i < words.length; i++)
	{
		if (words[i][field] !== s) continue;

		if (start < i) groups.push(words.slice(start, i));
		groups.push(words.slice(i, i + 1));
		start = i + 1;
	}

	// whatever follows the last separator, including a single token
	if (start < words.length)
	{
		groups.push(words.slice(start));
	}

	return groups;
}
exports.stringify = void 0; 4 | var stringify_1 = require("@novel-segment/stringify"); 5 | Object.defineProperty(exports, "stringify", { enumerable: true, get: function () { return stringify_1.stringify; } }); 6 | Object.defineProperty(exports, "default", { enumerable: true, get: function () { return stringify_1.stringify; } }); 7 | //# sourceMappingURL=stringify.js.map -------------------------------------------------------------------------------- /lib/segment/methods/stringify.ts: -------------------------------------------------------------------------------- 1 | 2 | export { stringify, stringify as default } from '@novel-segment/stringify'; 3 | -------------------------------------------------------------------------------- /lib/segment/methods/useModules.d.ts: -------------------------------------------------------------------------------- 1 | import SegmentCore from '../core'; 2 | import { ISubOptimizer } from '../../mod/Optimizer'; 3 | import { ISubTokenizer } from '../../mod/Tokenizer'; 4 | export declare function _isIgnoreModules(me: T, mod: ISubOptimizer | ISubTokenizer | any, ...argv: any[]): boolean; 5 | export declare function _warnIgnoreModules(mod: any): void; 6 | export declare function useModules(me: T, mod: ISubOptimizer | ISubTokenizer | any, ...argv: any[]): T; 7 | -------------------------------------------------------------------------------- /lib/segment/methods/useModules.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports._isIgnoreModules = _isIgnoreModules; 4 | exports._warnIgnoreModules = _warnIgnoreModules; 5 | exports.useModules = useModules; 6 | function _isIgnoreModules(me, mod, ...argv) { 7 | var _a, _b; 8 | return ((_b = (_a = me.options) === null || _a === void 0 ? void 0 : _a.disableModules) === null || _b === void 0 ? 
/**
 * Short human-readable label for a module, used in messages.
 * Prefers a non-empty string `name`; falls back to `String(mod)`.
 */
function _describeModule(mod: unknown): string
{
	const name = (mod as { name?: unknown } | null | undefined)?.name;

	return (typeof name === 'string' && name !== '') ? name : String(mod);
}

/**
 * Tell whether `mod` is listed in `me.options.disableModules`.
 *
 * The value is looked up as given (module name string or module object).
 * Always returns a real boolean; previously `undefined` leaked out when
 * `options` or `disableModules` was missing.
 *
 * NOTE(review): the generic parameter list was lost in the text this block
 * was recovered from; `<T extends SegmentCore>` is reconstructed from the
 * file's `SegmentCore` import — confirm against the original source.
 */
export function _isIgnoreModules<T extends SegmentCore>(me: T, mod: ISubOptimizer | ISubTokenizer | any, ...argv)
{
	return me.options?.disableModules?.includes(mod) ?? false;
}

/**
 * Warn on the console that a module was not registered because it is disabled.
 * Objects are reported by their `name` instead of `[object Object]`.
 */
export function _warnIgnoreModules(mod)
{
	console.warn(`can't use this mod, because it got disable: ${_describeModule(mod)}`)
}

/**
 * Initialise a module and register it on the segmenter.
 *
 * A disabled module only triggers a warning. Otherwise `mod.init(me, ...argv)`
 * is called; when it returns anything other than `undefined`, that return
 * value replaces `mod`. The module is then appended to `me.modules[mod.type]`.
 *
 * @param me - segmenter instance; its `options` and `modules` are used
 * @param mod - module to register
 * @param argv - extra arguments forwarded to `mod.init`
 * @returns `me`, to allow chaining
 * @throws TypeError when `mod` has no `init` function, or when its `type`
 *   is neither `'tokenizer'` nor `'optimizer'`
 */
export function useModules<T extends SegmentCore>(me: T, mod: ISubOptimizer | ISubTokenizer | any, ...argv)
{
	if (_isIgnoreModules(me as any, mod, ...argv))
	{
		_warnIgnoreModules(mod)

		return me;
	}

	// fail with a readable message instead of "cannot read properties of undefined"
	if (typeof mod?.init !== 'function')
	{
		throw new TypeError(`not a valid module, ${_describeModule(mod)}`)
	}

	// initialise the module; `init` may hand back a replacement instance
	const inited = mod.init(me, ...argv);

	if (typeof inited !== 'undefined')
	{
		mod = inited;
	}

	if (!['tokenizer', 'optimizer'].includes(mod?.type))
	{
		throw new TypeError(`not a valid module, ${_describeModule(mod)}`)
	}

	// @ts-ignore
	me.modules[mod.type].push(mod);

	return me;
}
/**
 * Register one module, or a list of modules, on the segmenter.
 *
 * - An array is processed element by element, each with the same `argv`.
 * - A string that is not disabled is resolved to the built-in module of that
 *   name from `BuildInSubMod`; a disabled string is passed on unchanged so the
 *   underlying `useModules` can report it.
 * - Anything else is handed to the underlying `useModules` as is.
 *
 * An unknown built-in name now fails right here with a message naming it;
 * before, `undefined` (or an inherited object member such as `toString`) was
 * passed on and only blew up inside the module initialisation.
 *
 * NOTE(review): the generic parameter list was lost in the text this block
 * was recovered from; a bare `<T>` is assumed — confirm against the original.
 *
 * @param me - segmenter instance
 * @param mod - module object, built-in module name, or an array of either
 * @param argv - extra arguments forwarded to the module's initialisation
 * @returns `me`, to allow chaining
 * @throws TypeError when a module name is not an export of `BuildInSubMod`
 */
export function useModules<T>(me: T, mod: ISubOptimizer | ISubTokenizer | any | string | (ISubTokenizer | ISubOptimizer | string)[], ...argv)
{
	if (Array.isArray(mod))
	{
		mod.forEach((entry) =>
		{
			useModules(me as any, entry, ...argv)
		});

		return me;
	}

	if (typeof mod === 'string' && !_isIgnoreModules(me as any, mod, ...argv))
	{
		// own-property check keeps names like "toString" from resolving to Object.prototype members
		if (!Object.prototype.hasOwnProperty.call(BuildInSubMod, mod) || BuildInSubMod[mod] == null)
		{
			throw new TypeError(`unknown build-in module: ${mod}`)
		}

		mod = BuildInSubMod[mod]
	}

	_useModules(me as any, mod, ...argv)

	return me;
}
stripStopword?: boolean; 48 | stripSpace?: boolean; 49 | disableModules?: (ENUM_SUBMODS_NAME | unknown)[]; 50 | } 51 | -------------------------------------------------------------------------------- /lib/segment/types.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2019/6/26. 4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | //# sourceMappingURL=types.js.map -------------------------------------------------------------------------------- /lib/segment/types.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2019/6/26. 3 | */ 4 | 5 | import { IOptions as IOptionsTableDict } from '@novel-segment/table-core-abstract'; 6 | import { TableDict } from '@novel-segment/table-dict'; 7 | import { ENUM_SUBMODS_NAME } from '../mod/index'; 8 | import { IUseDefaultOptions } from '../defaults/index'; 9 | 10 | export { IWord } from '@novel-segment/types'; 11 | 12 | export type ISPLIT = RegExp | string | { 13 | [Symbol.split](input: string, limit?: number): string[], 14 | }; 15 | 16 | export type ISPLIT_FILTER = RegExp | { 17 | test(input: string): boolean, 18 | }; 19 | 20 | export interface IDICT 21 | { 22 | [key: string]: T, 23 | } 24 | 25 | export interface IDICT2 26 | { 27 | [key: number]: IDICT, 28 | } 29 | 30 | export interface IOptionsSegment extends IOptionsTableDict, IUseDefaultOptions 31 | { 32 | db?: TableDict[], 33 | optionsDoSegment?: IOptionsDoSegment, 34 | 35 | maxChunkCount?: number, 36 | minChunkCount?: number, 37 | 38 | disableModules?: (ENUM_SUBMODS_NAME | unknown)[], 39 | } 40 | 41 | export type IDICT_SYNONYM = IDICT; 42 | export type IDICT_STOPWORD = IDICT; 43 | export type IDICT_BLACKLIST = IDICT; 44 | 45 | export interface IOptionsDoSegment 46 | { 47 | /** 48 | * 不返回词性 49 | */ 50 | simple?: boolean, 51 | 52 | /** 53 | * 去除标点符号 54 | */ 55 | stripPunctuation?: boolean, 56 | 57 | /** 
58 | * 转换同义词 59 | */ 60 | convertSynonym?: boolean, 61 | 62 | /** 63 | * 去除停止符 64 | */ 65 | stripStopword?: boolean, 66 | 67 | stripSpace?: boolean, 68 | 69 | disableModules?: (ENUM_SUBMODS_NAME | unknown)[], 70 | } 71 | -------------------------------------------------------------------------------- /lib/submod.d.ts: -------------------------------------------------------------------------------- 1 | import * as AdjectiveOptimizer from './submod/AdjectiveOptimizer'; 2 | import * as ChsNameOptimizer from './submod/ChsNameOptimizer'; 3 | import * as ChsNameTokenizer from './submod/ChsNameTokenizer'; 4 | import * as DatetimeOptimizer from './submod/DatetimeOptimizer'; 5 | import * as DictOptimizer from './submod/DictOptimizer'; 6 | import * as DictTokenizer from './submod/DictTokenizer'; 7 | import * as EmailOptimizer from './submod/EmailOptimizer'; 8 | import * as ForeignOptimizer from './submod/ForeignOptimizer'; 9 | import * as ForeignTokenizer from './submod/ForeignTokenizer'; 10 | import * as JpSimpleTokenizer from './submod/JpSimpleTokenizer'; 11 | import * as PunctuationTokenizer from './submod/PunctuationTokenizer'; 12 | import * as SingleTokenizer from './submod/SingleTokenizer'; 13 | import * as URLTokenizer from './submod/URLTokenizer'; 14 | import * as WildcardTokenizer from './submod/WildcardTokenizer'; 15 | import * as ZhRadicalTokenizer from './submod/ZhRadicalTokenizer'; 16 | import * as ZhtSynonymOptimizer from './submod/ZhtSynonymOptimizer'; 17 | import * as ZhuyinTokenizer from './submod/ZhuyinTokenizer'; 18 | export { AdjectiveOptimizer }; 19 | export { ChsNameOptimizer }; 20 | export { ChsNameTokenizer }; 21 | export { DatetimeOptimizer }; 22 | export { DictOptimizer }; 23 | export { DictTokenizer }; 24 | export { EmailOptimizer }; 25 | export { ForeignOptimizer }; 26 | export { ForeignTokenizer }; 27 | export { JpSimpleTokenizer }; 28 | export { PunctuationTokenizer }; 29 | export { SingleTokenizer }; 30 | export { URLTokenizer }; 31 | export { 
WildcardTokenizer }; 32 | export { ZhRadicalTokenizer }; 33 | export { ZhtSynonymOptimizer }; 34 | export { ZhuyinTokenizer }; 35 | -------------------------------------------------------------------------------- /lib/submod.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.ZhuyinTokenizer = exports.ZhtSynonymOptimizer = exports.ZhRadicalTokenizer = exports.WildcardTokenizer = exports.URLTokenizer = exports.SingleTokenizer = exports.PunctuationTokenizer = exports.JpSimpleTokenizer = exports.ForeignTokenizer = exports.ForeignOptimizer = exports.EmailOptimizer = exports.DictTokenizer = exports.DictOptimizer = exports.DatetimeOptimizer = exports.ChsNameTokenizer = exports.ChsNameOptimizer = exports.AdjectiveOptimizer = void 0; 4 | const tslib_1 = require("tslib"); 5 | const AdjectiveOptimizer = tslib_1.__importStar(require("./submod/AdjectiveOptimizer")); 6 | exports.AdjectiveOptimizer = AdjectiveOptimizer; 7 | const ChsNameOptimizer = tslib_1.__importStar(require("./submod/ChsNameOptimizer")); 8 | exports.ChsNameOptimizer = ChsNameOptimizer; 9 | const ChsNameTokenizer = tslib_1.__importStar(require("./submod/ChsNameTokenizer")); 10 | exports.ChsNameTokenizer = ChsNameTokenizer; 11 | const DatetimeOptimizer = tslib_1.__importStar(require("./submod/DatetimeOptimizer")); 12 | exports.DatetimeOptimizer = DatetimeOptimizer; 13 | const DictOptimizer = tslib_1.__importStar(require("./submod/DictOptimizer")); 14 | exports.DictOptimizer = DictOptimizer; 15 | const DictTokenizer = tslib_1.__importStar(require("./submod/DictTokenizer")); 16 | exports.DictTokenizer = DictTokenizer; 17 | const EmailOptimizer = tslib_1.__importStar(require("./submod/EmailOptimizer")); 18 | exports.EmailOptimizer = EmailOptimizer; 19 | const ForeignOptimizer = tslib_1.__importStar(require("./submod/ForeignOptimizer")); 20 | exports.ForeignOptimizer = ForeignOptimizer; 
21 | const ForeignTokenizer = tslib_1.__importStar(require("./submod/ForeignTokenizer")); 22 | exports.ForeignTokenizer = ForeignTokenizer; 23 | const JpSimpleTokenizer = tslib_1.__importStar(require("./submod/JpSimpleTokenizer")); 24 | exports.JpSimpleTokenizer = JpSimpleTokenizer; 25 | const PunctuationTokenizer = tslib_1.__importStar(require("./submod/PunctuationTokenizer")); 26 | exports.PunctuationTokenizer = PunctuationTokenizer; 27 | const SingleTokenizer = tslib_1.__importStar(require("./submod/SingleTokenizer")); 28 | exports.SingleTokenizer = SingleTokenizer; 29 | const URLTokenizer = tslib_1.__importStar(require("./submod/URLTokenizer")); 30 | exports.URLTokenizer = URLTokenizer; 31 | const WildcardTokenizer = tslib_1.__importStar(require("./submod/WildcardTokenizer")); 32 | exports.WildcardTokenizer = WildcardTokenizer; 33 | const ZhRadicalTokenizer = tslib_1.__importStar(require("./submod/ZhRadicalTokenizer")); 34 | exports.ZhRadicalTokenizer = ZhRadicalTokenizer; 35 | const ZhtSynonymOptimizer = tslib_1.__importStar(require("./submod/ZhtSynonymOptimizer")); 36 | exports.ZhtSynonymOptimizer = ZhtSynonymOptimizer; 37 | const ZhuyinTokenizer = tslib_1.__importStar(require("./submod/ZhuyinTokenizer")); 38 | exports.ZhuyinTokenizer = ZhuyinTokenizer; 39 | //# sourceMappingURL=submod.js.map -------------------------------------------------------------------------------- /lib/submod.ts: -------------------------------------------------------------------------------- 1 | 2 | import * as AdjectiveOptimizer from './submod/AdjectiveOptimizer'; 3 | import * as ChsNameOptimizer from './submod/ChsNameOptimizer'; 4 | import * as ChsNameTokenizer from './submod/ChsNameTokenizer'; 5 | import * as DatetimeOptimizer from './submod/DatetimeOptimizer'; 6 | import * as DictOptimizer from './submod/DictOptimizer'; 7 | import * as DictTokenizer from './submod/DictTokenizer'; 8 | import * as EmailOptimizer from './submod/EmailOptimizer'; 9 | import * as ForeignOptimizer from 
'./submod/ForeignOptimizer'; 10 | import * as ForeignTokenizer from './submod/ForeignTokenizer'; 11 | import * as JpSimpleTokenizer from './submod/JpSimpleTokenizer'; 12 | import * as PunctuationTokenizer from './submod/PunctuationTokenizer'; 13 | import * as SingleTokenizer from './submod/SingleTokenizer'; 14 | import * as URLTokenizer from './submod/URLTokenizer'; 15 | import * as WildcardTokenizer from './submod/WildcardTokenizer'; 16 | import * as ZhRadicalTokenizer from './submod/ZhRadicalTokenizer'; 17 | import * as ZhtSynonymOptimizer from './submod/ZhtSynonymOptimizer'; 18 | import * as ZhuyinTokenizer from './submod/ZhuyinTokenizer'; 19 | 20 | export { AdjectiveOptimizer } 21 | export { ChsNameOptimizer } 22 | export { ChsNameTokenizer } 23 | export { DatetimeOptimizer } 24 | export { DictOptimizer } 25 | export { DictTokenizer } 26 | export { EmailOptimizer } 27 | export { ForeignOptimizer } 28 | export { ForeignTokenizer } 29 | export { JpSimpleTokenizer } 30 | export { PunctuationTokenizer } 31 | export { SingleTokenizer } 32 | export { URLTokenizer } 33 | export { WildcardTokenizer } 34 | export { ZhRadicalTokenizer } 35 | export { ZhtSynonymOptimizer } 36 | export { ZhuyinTokenizer } 37 | -------------------------------------------------------------------------------- /lib/submod/AdjectiveOptimizer.d.ts: -------------------------------------------------------------------------------- 1 | import { SubSModuleOptimizer } from '../mod'; 2 | import { IWordDebug } from '../util'; 3 | /** 4 | * 把一些错认为名词的词标注为形容词,或者对名词作定语的情况 5 | */ 6 | export declare class AdjectiveOptimizer extends SubSModuleOptimizer { 7 | name: string; 8 | doOptimize(words: IWordDebug[]): IWordDebug[]; 9 | isNominal(pos: number | number[]): boolean; 10 | } 11 | export declare const init: typeof AdjectiveOptimizer.init; 12 | export declare const type = "optimizer"; 13 | export default AdjectiveOptimizer; 14 | -------------------------------------------------------------------------------- 
/lib/submod/AdjectiveOptimizer.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.type = exports.init = exports.AdjectiveOptimizer = void 0; 4 | const mod_1 = require("../mod"); 5 | const COLORS_1 = require("../mod/COLORS"); 6 | /** 7 | * 把一些错认为名词的词标注为形容词,或者对名词作定语的情况 8 | */ 9 | class AdjectiveOptimizer extends mod_1.SubSModuleOptimizer { 10 | constructor() { 11 | super(...arguments); 12 | this.name = 'AdjectiveOptimizer'; 13 | } 14 | doOptimize(words) { 15 | const POSTAG = this._POSTAG; 16 | let index = 0; 17 | while (index < words.length) { 18 | const word = words[index]; 19 | const nextword = words[index + 1]; 20 | if (nextword) { 21 | // 对于<颜色>+<的>,直接判断颜色是形容词(字典里颜色都是名词) 22 | if (nextword.p & POSTAG.D_U && COLORS_1.COLOR_ALL[word.w]) { 23 | word.op = word.op || word.p; 24 | word.p |= POSTAG.D_A; 25 | this.debugToken(word, { 26 | [this.name]: true, 27 | }); 28 | } 29 | // 如果是连续的两个名词,前一个是颜色,那这个颜色也是形容词 30 | if (word.p & POSTAG.D_N && this.isNominal(nextword.p) && COLORS_1.COLOR_ALL[word.w]) { 31 | word.op = word.op || word.p; 32 | word.p |= POSTAG.D_A; 33 | word.p |= POSTAG.D_N; 34 | this.debugToken(word, { 35 | [this.name]: true, 36 | }); 37 | } 38 | if ((word.w === '純' || word.w === '纯') && COLORS_1.COLOR_HAIR[nextword.w]) { 39 | word.op = word.op || word.p; 40 | word.p |= POSTAG.D_A; 41 | this.debugToken(word, { 42 | [this.name]: true, 43 | }); 44 | } 45 | } 46 | // 移到下一个单词 47 | index += 1; 48 | } 49 | return words; 50 | } 51 | isNominal(pos) { 52 | /* 53 | if (Array.isArray(pos)) 54 | { 55 | return this.isNominal(pos[0]); 56 | } 57 | */ 58 | const POSTAG = this._POSTAG; 59 | return (pos === POSTAG.D_N || 60 | pos === POSTAG.A_NT || 61 | pos === POSTAG.A_NX || 62 | pos === POSTAG.A_NZ || 63 | pos === POSTAG.A_NR || 64 | pos === POSTAG.A_NS || 65 | pos === POSTAG.URL); 66 | } 67 | } 68 | exports.AdjectiveOptimizer = AdjectiveOptimizer; 
69 | exports.init = AdjectiveOptimizer.init.bind(AdjectiveOptimizer); 70 | exports.type = AdjectiveOptimizer.type; 71 | exports.default = AdjectiveOptimizer; 72 | //# sourceMappingURL=AdjectiveOptimizer.js.map -------------------------------------------------------------------------------- /lib/submod/AdjectiveOptimizer.ts: --------------------------------------------------------------------------------
import { SubSModuleOptimizer } from '../mod';

import { COLOR_ALL, COLOR_HAIR } from '../mod/COLORS';
import { IWordDebug } from '../util';

/**
 * Marks some words that were mistaken for nouns as adjectives, and handles
 * nouns that act as attributives.
 */
export class AdjectiveOptimizer extends SubSModuleOptimizer
{
	override name = 'AdjectiveOptimizer';

	/**
	 * Walks every adjacent pair of tokens and adds the adjective tag to the
	 * first token of the pair when one of the three rules below matches.
	 * Tokens are modified in place; the same array is returned.
	 *
	 * @param words token list to adjust
	 * @returns the `words` array that was passed in
	 */
	override doOptimize(words: IWordDebug[]): IWordDebug[]
	{
		const tags = this._POSTAG;

		// Shared tagging step: remember the original tag in `op` (only the
		// first time), add D_A plus any extra bits, then record debug info.
		const flagAsAdjective = (token: IWordDebug, extraBits: number = 0): void =>
		{
			token.op = token.op || token.p;
			token.p |= tags.D_A;
			token.p |= extraBits;

			this.debugToken(token, {
				[this.name]: true,
			});
		};

		for (let idx = 0; idx < words.length; idx++)
		{
			const current = words[idx];
			const following = words[idx + 1];

			// Nothing to compare against (last token).
			if (!following)
			{
				continue;
			}

			// <colour> + <的>: treat the colour as an adjective
			// (colours are all nouns in the dictionary).
			if (following.p & tags.D_U && COLOR_ALL[current.w])
			{
				flagAsAdjective(current);
			}

			// Two consecutive nouns where the first one is a colour:
			// that colour is an adjective as well.
			if (current.p & tags.D_N && this.isNominal(following.p) && COLOR_ALL[current.w])
			{
				flagAsAdjective(current, tags.D_N);
			}

			// '純' / '纯' directly in front of a COLOR_HAIR entry.
			if ((current.w === '純' || current.w === '纯') && COLOR_HAIR[following.w])
			{
				flagAsAdjective(current);
			}
		}

		return words;
	}

	/**
	 * Tells whether `pos` is exactly one of the tags this optimizer treats as
	 * nominal. The comparison is strict equality against single tag values,
	 * so an array argument always yields `false`.
	 *
	 * @param pos tag value to test
	 */
	isNominal(pos: number | number[]): boolean
	{
		const tags = this._POSTAG;
		const nominalTags = [
			tags.D_N,
			tags.A_NT,
			tags.A_NX,
			tags.A_NZ,
			tags.A_NR,
			tags.A_NS,
			tags.URL,
		];

		return nominalTags.some((tag) => tag === pos);
	}
}

export const init = AdjectiveOptimizer.init.bind(AdjectiveOptimizer) as typeof AdjectiveOptimizer.init;

export const type = AdjectiveOptimizer.type;

export default AdjectiveOptimizer
-------------------------------------------------------------------------------- /lib/submod/ChsNameOptimizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 人名优化模块 3 | * 4 | * @author 老雷 5 | * @version 0.1 6 | */ 7 | import { SubSModuleOptimizer } from '../mod'; 8 | import { IDICT, IWord } from '../Segment'; 9 | /** 10 | * @todo 支援 XX氏 11 | */ 12 | export declare class ChsNameOptimizer extends SubSModuleOptimizer { 13 | protected _TABLE: IDICT; 14 | name: string; 15 | _cache(): void; 16 | isBlackList(nw: string): boolean; 17 | isMergeable2(...words: string[]): boolean; 18 | isMergeable(word: IWord, nextword: IWord): boolean; 19 | /** 20 | * 只有新詞屬於人名或未知詞時才會合併 21 | */ 22 | validUnknownNewWord(ws: W, cb?: (nw: string, ew: IWord, ws: W) => IWord | boolean | void): true | IWord; 23 | /** 24 | * 姓 25 | */ 26 | isFamilyName(w: string): boolean; 27 | /** 28 | * 双字姓名 29 | */ 30 | isDoubleName(w1: string, w2: string): boolean; 31 | isSingleNameRepeat(w1: string, w2: string): boolean; 32 | /** 33 | * 单字姓名 34 | */ 35 | isSingleName(w1: string): boolean; 36 | /** 37 | * 单字姓名 不重覆 38 | */ 39 | isSingleNameNoRepeat(w1: string): boolean; 40 | isFirstName(w1: string, w2: string): boolean; 41 | /** 42 | * 对可能是人名的单词进行优化 43 | * 44 | * @param {array} words 单词数组 45 | * @return {array} 46 | */ 47 | doOptimize(words: IWord[]): IWord[]; 48 | } 49 | export declare const init: typeof ChsNameOptimizer.init; 50 | export declare const type = "optimizer"; 51 | export default ChsNameOptimizer;
52 | -------------------------------------------------------------------------------- /lib/submod/ChsNameTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | import { SubSModuleTokenizer } from '../mod'; 2 | import { IDICT, IWord } from '../Segment'; 3 | export declare class ChsNameTokenizer extends SubSModuleTokenizer { 4 | protected _TABLE: IDICT; 5 | name: string; 6 | _cache(): void; 7 | /** 8 | * 对未识别的单词进行分词 9 | * 10 | * @param {array} words 单词数组 11 | * @return {array} 12 | */ 13 | split(words: IWord[]): IWord[]; 14 | /** 15 | * 匹配包含的人名,并返回相关信息 16 | * 17 | * @param {string} text 文本 18 | * @param {int} cur 开始位置 19 | * @return {array} 返回格式 {w: '人名', c: 开始位置} 20 | */ 21 | matchName(text: string, cur?: number): IWord[]; 22 | } 23 | export declare const init: typeof ChsNameTokenizer.init; 24 | export declare const type = "tokenizer"; 25 | export default ChsNameTokenizer; 26 | -------------------------------------------------------------------------------- /lib/submod/DatetimeOptimizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 日期时间优化模块 3 | * 4 | * @author 老雷 5 | */ 6 | import Segment, { IWord } from '../Segment'; 7 | /** 模块类型 */ 8 | export declare const type = "optimizer"; 9 | export declare let segment: Segment; 10 | /** 11 | * 模块初始化 12 | * 13 | * @param {Segment} segment 分词接口 14 | */ 15 | export declare function init(_segment: any): void; 16 | /** 17 | * 日期时间优化 18 | * 19 | * @param {array} words 单词数组 20 | * @param {bool} is_not_first 是否为管理器调用的 21 | * @return {array} 22 | */ 23 | export declare function doOptimize(words: IWord[], is_not_first?: boolean): Segment.IWord[]; 24 | -------------------------------------------------------------------------------- /lib/submod/DatetimeOptimizer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | 
exports.segment = exports.type = void 0; 4 | exports.init = init; 5 | exports.doOptimize = doOptimize; 6 | const const_1 = require("../mod/const"); 7 | /** 模块类型 */ 8 | exports.type = 'optimizer'; 9 | /** 10 | * 模块初始化 11 | * 12 | * @param {Segment} segment 分词接口 13 | */ 14 | function init(_segment) { 15 | exports.segment = _segment; 16 | } 17 | /** 18 | * 日期时间优化 19 | * 20 | * @param {array} words 单词数组 21 | * @param {bool} is_not_first 是否为管理器调用的 22 | * @return {array} 23 | */ 24 | function doOptimize(words, is_not_first) { 25 | if (typeof is_not_first === 'undefined') { 26 | is_not_first = false; 27 | } 28 | // 合并相邻的能组成一个单词的两个词 29 | const TABLE = exports.segment.getDict('TABLE'); 30 | const POSTAG = exports.segment.POSTAG; 31 | let i = 0; 32 | let ie = words.length - 1; 33 | while (i < ie) { 34 | let w1 = words[i]; 35 | let w2 = words[i + 1]; 36 | //debug(w1.w + ', ' + w2.w); 37 | if ((w1.p & POSTAG.A_M) > 0) { 38 | // ========================================= 39 | // 日期时间组合 数字 + 日期单位,如 “2005年" 40 | if (w2.w in const_1.DATETIME) { 41 | let nw = w1.w + w2.w; 42 | let len = 2; 43 | let ma = [w1, w2]; 44 | // 继续搜索后面连续的日期时间描述,必须符合 数字 + 日期单位 45 | while (true) { 46 | let w11 = words[i + len]; 47 | let w22 = words[i + len + 1]; 48 | if (w11 && w22 && (w11.p & POSTAG.A_M) > 0 && w22.w in const_1.DATETIME) { 49 | len += 2; 50 | nw += w11.w + w22.w; 51 | ma.push(w11); 52 | ma.push(w22); 53 | } 54 | else { 55 | break; 56 | } 57 | } 58 | words.splice(i, len, { 59 | w: nw, 60 | p: POSTAG.D_T, 61 | m: ma, 62 | }); 63 | ie -= len - 1; 64 | continue; 65 | } 66 | // ========================================= 67 | } 68 | // 移到下一个词 69 | i++; 70 | } 71 | return words; 72 | } 73 | //# sourceMappingURL=DatetimeOptimizer.js.map -------------------------------------------------------------------------------- /lib/submod/DatetimeOptimizer.ts: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * 日期时间优化模块 5 | * 6 | * @author 老雷 7 | 
*/

import Segment, { IWord } from '../Segment';
import { DATETIME } from '../mod/const';

/** Module type. */
export const type = 'optimizer';
/** Segment instance used by `doOptimize`; assigned by `init`. */
export let segment: Segment;

/**
 * Module initialisation: stores the segment instance for later use.
 *
 * @param _segment the segmenter this module works for
 */
export function init(_segment)
{
	segment = _segment;
}

/**
 * Date/time optimisation: merges every run of `<number> <date unit>` pairs
 * (for example "2005年") into a single token tagged `POSTAG.D_T`. The merged
 * token keeps the source tokens in its `m` property.
 *
 * The array is modified in place and also returned.
 *
 * @param words token list to optimise
 * @param is_not_first whether the call comes from the manager; accepted for
 *        interface compatibility, this implementation does not read it
 * @returns the `words` array that was passed in
 */
export function doOptimize(words: IWord[], is_not_first?: boolean)
{
	const POSTAG = segment.POSTAG;

	// A token counts as a number when its tag has the A_M bit set.
	const isNumber = (word: IWord): boolean => (word.p & POSTAG.A_M) > 0;

	// Own-property test on purpose: the `in` operator also matches keys
	// inherited from Object.prototype, which made tokens such as "toString"
	// or "constructor" look like date units.
	const isDatetimeUnit = (word: IWord): boolean =>
		Object.prototype.hasOwnProperty.call(DATETIME, word.w);

	let i = 0;
	let ie = words.length - 1;
	while (i < ie)
	{
		const w1 = words[i];
		const w2 = words[i + 1];

		// Date/time combination: number + date unit, e.g. "2005年".
		if (isNumber(w1) && isDatetimeUnit(w2))
		{
			let nw = w1.w + w2.w;
			let len = 2;
			const ma = [w1, w2];

			// Keep absorbing following pairs as long as each one is again
			// number + date unit.
			while (true)
			{
				const w11 = words[i + len];
				const w22 = words[i + len + 1];
				if (w11 && w22 && isNumber(w11) && isDatetimeUnit(w22))
				{
					len += 2;
					nw += w11.w + w22.w;
					ma.push(w11, w22);
				}
				else
				{
					break;
				}
			}

			words.splice(i, len, {
				w: nw,
				p: POSTAG.D_T,
				m: ma,
			});
			// `len` tokens became one, so the last index moves by len - 1.
			ie -= len - 1;
			// Stay on the same index: the merged token is re-examined.
			continue;
		}

		// Move on to the next token.
		i++;
	}

	return words;
}
-------------------------------------------------------------------------------- /lib/submod/DictOptimizer.d.ts: -------------------------------------------------------------------------------- 1 | import { ISubOptimizerCreate,
SubSModuleOptimizer } from '../mod'; 2 | import { IDICT, IWord } from '../Segment'; 3 | import { POSTAG as IPOSTAG } from '@novel-segment/postag/lib/postag/ids'; 4 | /** 5 | * 词典优化模块 6 | * 7 | * @author 老雷 8 | */ 9 | export declare class DictOptimizer extends SubSModuleOptimizer { 10 | protected _TABLE: IDICT; 11 | name: string; 12 | _cache(): void; 13 | isMergeable(w1: IWord, w2: IWord, { POSTAG, TABLE, nw, i, nw_cache, nw_cache_exists, }: { 14 | POSTAG: typeof IPOSTAG; 15 | TABLE: IDICT; 16 | nw: string; 17 | i: number; 18 | nw_cache: IWord; 19 | nw_cache_exists: boolean; 20 | }): boolean; 21 | _getWordCache(nw: string, nw_cache: IWord, nw_cache_exists: boolean): { 22 | nw: string; 23 | nw_cache: IWord; 24 | nw_cache_exists: boolean; 25 | }; 26 | /** 27 | * 词典优化 28 | * 29 | * @param {array} words 单词数组 30 | * @param {bool} is_not_first 是否为管理器调用的 31 | * @return {array} 32 | */ 33 | doOptimize(words: IWord[], is_not_first: boolean): IWord[]; 34 | /** 35 | * 數詞 + 量詞 36 | */ 37 | _mergeWordHowManyProp(p: number, p2: number, p3?: number): number; 38 | } 39 | export declare const init: ISubOptimizerCreate; 40 | export declare const type = "optimizer"; 41 | export default DictOptimizer; 42 | -------------------------------------------------------------------------------- /lib/submod/EmailOptimizer.d.ts: -------------------------------------------------------------------------------- 1 | import { ISubOptimizerCreate, SubSModuleOptimizer } from '../mod'; 2 | import { IDICT, IWord } from '../Segment'; 3 | /** 4 | * 邮箱地址中允许出现的字符 5 | * 参考:http://www.cs.tut.fi/~jkorpela/rfc/822addr.html 6 | */ 7 | export declare const _EMAILCHAR: string[]; 8 | export declare const EMAILCHAR: IDICT; 9 | /** 10 | * 邮箱地址识别优化模块 11 | * 12 | * @author 老雷 13 | */ 14 | export declare class EmailOptimizer extends SubSModuleOptimizer { 15 | /** 16 | * 对可能是邮箱地址的单词进行优化 17 | * 18 | * @param {array} words 单词数组 19 | * @return {array} 20 | */ 21 | doOptimize(words: any): any; 22 | /** 23 | * 根据一组单词生成邮箱地址 24 | 
* 25 | * @param {array} words 单词数组 26 | * @return {string} 27 | */ 28 | toEmailAddress(words: IWord[]): string; 29 | } 30 | export declare const init: ISubOptimizerCreate; 31 | export declare const type = "optimizer"; 32 | export default EmailOptimizer; 33 | -------------------------------------------------------------------------------- /lib/submod/EmailOptimizer.ts: --------------------------------------------------------------------------------
'use strict';

import { ISubOptimizerCreate, SubSModuleOptimizer } from '../mod';
import { IDICT, IWord } from '../Segment';

/**
 * Characters allowed to appear in an email address.
 * Reference: http://www.cs.tut.fi/~jkorpela/rfc/822addr.html
 */
export const _EMAILCHAR = '!"#$%&\'*+-/0123456789=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~.'.split('');
/** Lookup table built from `_EMAILCHAR`: every allowed character maps to 1. */
export const EMAILCHAR: IDICT = {};
// for...of instead of for...in: iterate the characters themselves rather
// than enumerable keys of the array object.
for (const ch of _EMAILCHAR) EMAILCHAR[ch] = 1;

/**
 * Email address recognition optimizer.
 *
 * @author 老雷
 */
export class EmailOptimizer extends SubSModuleOptimizer
{

	/**
	 * Merges runs of tokens that look like an email address into a single
	 * token tagged `POSTAG.URL`.
	 *
	 * A candidate starts at an ASCII token (tag `A_NX`, or tag `A_M` whose
	 * first character code is below 128), must contain an `@` that is
	 * followed by at least one more token, and extends over ASCII tokens and
	 * tokens listed in `EMAILCHAR`. The array is modified in place and
	 * returned.
	 *
	 * @param words token list to optimise
	 * @returns the `words` array that was passed in
	 */
	override doOptimize(words)
	{
		const POSTAG = this.segment.POSTAG;

		// One definition of "ASCII token", used for every token including
		// the last one.
		const isAsciiWord = (word: IWord): boolean =>
			word.p === POSTAG.A_NX
			|| (word.p === POSTAG.A_M && word.w.charCodeAt(0) < 128);

		// Replace words[from, to) by one merged address token.
		const mergeAddress = (from: number, to: number): void =>
		{
			const parts: IWord[] = words.slice(from, to);
			words.splice(from, parts.length, {
				w: this.toEmailAddress(parts),
				p: POSTAG.URL,
			});
		};

		// Index of the first token of the open candidate, -1 when none is
		// open. A numeric sentinel keeps index 0 distinguishable from
		// "no candidate".
		let start = -1;
		// Whether the open candidate already contains an "@".
		let hasAt = false;
		let i = 0;

		// Every token is examined, including the last one, so an address
		// followed only by closing punctuation is still detected.
		while (i < words.length)
		{
			const word = words[i];
			const isAscii = isAsciiWord(word);

			if (start === -1)
			{
				// Only an ASCII token can open a candidate. An "@" seen here
				// has no local part in front of it and is ignored.
				if (isAscii)
				{
					start = i;
				}
				i++;
				continue;
			}

			// First "@" inside the candidate.
			if (!hasAt && word.w === '@')
			{
				hasAt = true;
				i++;
				continue;
			}

			// Token still belongs to the address.
			if (isAscii || word.w in EMAILCHAR)
			{
				i++;
				continue;
			}

			// Any other token ends the candidate. It is an address only if an
			// "@" was seen and something follows that "@".
			if (hasAt && words[i - 1].w !== '@')
			{
				mergeAddress(start, i);
				// The terminating token now sits right after the merged one.
				i = start + 1;
			}
			else
			{
				i++;
			}
			start = -1;
			hasAt = false;
		}

		// Candidate still open at the end of the input.
		if (start !== -1 && hasAt && words[words.length - 1].w !== '@')
		{
			mergeAddress(start, words.length);
		}

		return words;
	}

	/**
	 * Builds the address text by concatenating the `w` of every token.
	 *
	 * @param words tokens that make up the address
	 * @returns the concatenated text
	 */
	toEmailAddress(words: IWord[])
	{
		return words.map((word) => word.w).join('');
	}

}

export const init = EmailOptimizer.init.bind(EmailOptimizer) as ISubOptimizerCreate;

export const type = EmailOptimizer.type;

export default EmailOptimizer;
-------------------------------------------------------------------------------- /lib/submod/ForeignOptimizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/8/18/018. 3 | */ 4 | import { SubSModuleOptimizer } from '../mod'; 5 | import { IDICT, IWord } from '../Segment'; 6 | import { IWordDebug } from '../util'; 7 | export declare class ForeignOptimizer extends SubSModuleOptimizer { 8 | name: string; 9 | protected _TABLE: IDICT; 10 | _cache(): void; 11 | doOptimize(words: T[]): T[]; 12 | } 13 | export declare const init: typeof ForeignOptimizer.init; 14 | export declare const type = "optimizer"; 15 | export default ForeignOptimizer; 16 | -------------------------------------------------------------------------------- /lib/submod/ForeignOptimizer.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2018/8/18/018.
4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.type = exports.init = exports.ForeignOptimizer = void 0; 7 | const mod_1 = require("../mod"); 8 | class ForeignOptimizer extends mod_1.SubSModuleOptimizer { 9 | constructor() { 10 | super(...arguments); 11 | this.name = 'ForeignOptimizer'; 12 | } 13 | _cache() { 14 | super._cache(); 15 | this._TABLE = this.segment.getDict('TABLE'); 16 | this._POSTAG = this.segment.POSTAG; 17 | } 18 | doOptimize(words) { 19 | const self = this; 20 | const POSTAG = this._POSTAG; 21 | const TABLE = this._TABLE; 22 | let i = 0; 23 | let len = words.length - 1; 24 | while (i < len) { 25 | let w0 = words[i - 1]; 26 | let w1 = words[i]; 27 | let w2 = words[i + 1]; 28 | if (!(w1.p === POSTAG.A_NX)) { 29 | i++; 30 | continue; 31 | } 32 | if (w2) { 33 | let nw = w1.w + w2.w; 34 | let mw = TABLE[nw]; 35 | if (mw) { 36 | let new_w = self.debugToken({ 37 | ...mw, 38 | w: nw, 39 | m: [w1, w2], 40 | }, { 41 | [this.name]: 1, 42 | }, true); 43 | this.sliceToken(words, i, 2, new_w); 44 | len--; 45 | continue; 46 | } 47 | } 48 | i++; 49 | } 50 | return words; 51 | } 52 | } 53 | exports.ForeignOptimizer = ForeignOptimizer; 54 | exports.init = ForeignOptimizer.init.bind(ForeignOptimizer); 55 | exports.type = ForeignOptimizer.type; 56 | exports.default = ForeignOptimizer; 57 | //# sourceMappingURL=ForeignOptimizer.js.map -------------------------------------------------------------------------------- /lib/submod/ForeignOptimizer.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/8/18/018. 
3 | */

import { SubSModuleOptimizer } from '../mod';
import { IDICT, IWord } from '../Segment';
import { IWordDebug } from '../util';

/**
 * Optimizer that joins a foreign-word token (tag `A_NX`) with the token
 * after it when the concatenation of both is an entry of the `TABLE`
 * dictionary.
 */
export class ForeignOptimizer extends SubSModuleOptimizer
{
	override name = 'ForeignOptimizer';

	protected override _TABLE: IDICT;

	/** Caches the `TABLE` dictionary and the POSTAG table of the segmenter. */
	override _cache()
	{
		super._cache();

		this._TABLE = this.segment.getDict('TABLE');
		this._POSTAG = this.segment.POSTAG;
	}

	/**
	 * Scans adjacent token pairs; when the first token is tagged `A_NX` and
	 * `first.w + second.w` is found in `TABLE`, both tokens are replaced by
	 * one token that copies the dictionary entry, carries the joined text in
	 * `w` and the two source tokens in `m`.
	 *
	 * NOTE(review): the type-parameter list was missing in this copy of the
	 * file; `T extends IWordDebug` is reconstructed from how the elements are
	 * used below - confirm against the repository.
	 *
	 * @param words token list to optimise (modified in place)
	 * @returns the `words` array that was passed in
	 */
	override doOptimize<T extends IWordDebug>(words: T[]): T[]
	{
		const POSTAG = this._POSTAG;
		const TABLE = this._TABLE;

		let i = 0;
		// Index of the last token; a pair needs one token after `i`.
		let len = words.length - 1;

		while (i < len)
		{
			const w1: IWordDebug = words[i];
			const w2: IWordDebug = words[i + 1];

			if (w1.p === POSTAG.A_NX && w2)
			{
				const nw: string = w1.w + w2.w;
				const mw: IWordDebug = TABLE[nw];

				if (mw)
				{
					const new_w: IWordDebug = this.debugToken({
						...mw,
						w: nw,
						m: [w1, w2],
					}, {
						[this.name]: 1,
					}, true);

					this.sliceToken(words, i, 2, new_w);

					// Two tokens became one; stay on this index so the
					// merged token is checked against its new neighbour.
					len--;
					continue;
				}
			}

			i++;
		}

		return words;
	}
}

export const init = ForeignOptimizer.init.bind(ForeignOptimizer) as typeof ForeignOptimizer.init;

export const type = ForeignOptimizer.type;

export default ForeignOptimizer;
-------------------------------------------------------------------------------- /lib/submod/ForeignTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 外文字符、数字识别模块 3 | * 4 | * @author 老雷 5 | */ 6 | import { ISubTokenizerCreate, SubSModuleTokenizer } from '../mod'; 7 | import { IWord } from '../Segment'; 8 | import { IWordDebugInfo } from '../util/index'; 9 | export declare class ForeignTokenizer extends SubSModuleTokenizer {
10 | name: string; 11 | /** 12 | * 分詞用(包含中文) 13 | */ 14 | _REGEXP_SPLIT_1: RegExp; 15 | /** 16 | * 分詞用(不包含中文的全詞符合) 17 | */ 18 | _REGEXP_SPLIT_2: RegExp; 19 | _cache(): void; 20 | /** 21 | * 对未识别的单词进行分词 22 | * 23 | * @param {array} words 单词数组 24 | * @return {array} 25 | */ 26 | split(words: IWord[]): IWord[]; 27 | /** 28 | * 支援更多外文判定(但可能會降低效率) 29 | * 30 | * 並且避免誤切割 例如 latīna Русский 31 | */ 32 | splitForeign2(text: string, cur?: number): IWord[]; 33 | /** 34 | * 匹配包含的英文字符和数字,并分割 35 | * 36 | * @param {string} text 文本 37 | * @param {int} cur 开始位置 38 | * @return {array} 返回格式 {w: '单词', c: 开始位置} 39 | */ 40 | splitForeign(text: string, cur?: number): IWord[]; 41 | createForeignToken(word: IWord, lasttype?: number, attr?: IWordDebugInfo): IWord; 42 | } 43 | export declare const init: ISubTokenizerCreate; 44 | export declare const type = "tokenizer"; 45 | export default ForeignTokenizer; 46 | -------------------------------------------------------------------------------- /lib/submod/JpSimpleTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/19/019. 
3 | */ 4 | import { SubSModuleTokenizer } from '../mod'; 5 | import { IWord } from '../Segment'; 6 | import { IWordDebug } from '../util'; 7 | export declare const enum EnumJpSimpleTokenizerType { 8 | /** 9 | * 平仮名 10 | * https://en.wikipedia.org/wiki/Hiragana 11 | */ 12 | HIRAGANA = 1, 13 | /** 14 | * 片仮名 15 | * https://en.wikipedia.org/wiki/Katakana 16 | */ 17 | KATAKANA = 2 18 | } 19 | export declare class JpSimpleTokenizer extends SubSModuleTokenizer { 20 | static NAME: "JpSimpleTokenizer"; 21 | name: "JpSimpleTokenizer"; 22 | split(words: IWord[], ...argv: any[]): IWord[]; 23 | protected createJpSimpleToken(data: T, type: EnumJpSimpleTokenizerType): T; 24 | protected _splitText(text: string): IWord[]; 25 | } 26 | export declare const init: typeof JpSimpleTokenizer.init; 27 | export declare const type = "tokenizer"; 28 | export default JpSimpleTokenizer; 29 | -------------------------------------------------------------------------------- /lib/submod/JpSimpleTokenizer.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2018/4/19/019. 
4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.type = exports.init = exports.JpSimpleTokenizer = exports.EnumJpSimpleTokenizerType = void 0; 7 | const mod_1 = require("../mod"); 8 | var EnumJpSimpleTokenizerType; 9 | (function (EnumJpSimpleTokenizerType) { 10 | /** 11 | * 平仮名 12 | * https://en.wikipedia.org/wiki/Hiragana 13 | */ 14 | EnumJpSimpleTokenizerType[EnumJpSimpleTokenizerType["HIRAGANA"] = 1] = "HIRAGANA"; 15 | /** 16 | * 片仮名 17 | * https://en.wikipedia.org/wiki/Katakana 18 | */ 19 | EnumJpSimpleTokenizerType[EnumJpSimpleTokenizerType["KATAKANA"] = 2] = "KATAKANA"; 20 | })(EnumJpSimpleTokenizerType || (exports.EnumJpSimpleTokenizerType = EnumJpSimpleTokenizerType = {})); 21 | class JpSimpleTokenizer extends mod_1.SubSModuleTokenizer { 22 | constructor() { 23 | super(...arguments); 24 | this.name = 'JpSimpleTokenizer'; 25 | } 26 | split(words, ...argv) { 27 | return this._splitUnset(words, this._splitText); 28 | } 29 | createJpSimpleToken(data, type) { 30 | return super.debugToken(data, { 31 | [this.name]: type, 32 | }, true); 33 | } 34 | _splitText(text) { 35 | //const POSTAG = this.segment.POSTAG; 36 | let self = this; 37 | let b1 = /[ぁ-ん]/.test(text); 38 | let b2 = /[ァ-ヴーア-ン゙ー]/.test(text); 39 | if (b1 === false || b2 === false) { 40 | if (b1 === true && /^[ぁ-ん]+$/.test(text) || b2 === true && /^[ァ-ヴーア-ン゙ー]+$/.test(text)) { 41 | return [self.createJpSimpleToken({ 42 | w: text, 43 | }, b1 ? 1 /* EnumJpSimpleTokenizerType.HIRAGANA */ : 2 /* EnumJpSimpleTokenizerType.KATAKANA */)]; 44 | } 45 | return null; 46 | } 47 | let ret = []; 48 | text 49 | .split(/((?:[^ァ-ヴーア-ン゙ー]+)?[ぁ-ん]+(?=[ァ-ヴーア-ン゙ー])|(?:[^ぁ-ん]+)?[ァ-ヴーア-ン゙ー]+(?=[ぁ-ん]))/) 50 | .forEach(function (w, i) { 51 | if (w !== '') { 52 | ret.push(self.createJpSimpleToken({ 53 | w, 54 | }, /[ぁ-ん]/.test(w) ? 
1 /* EnumJpSimpleTokenizerType.HIRAGANA */ 55 | : 2 /* EnumJpSimpleTokenizerType.KATAKANA */)); 56 | } 57 | }); 58 | return ret; 59 | } 60 | } 61 | exports.JpSimpleTokenizer = JpSimpleTokenizer; 62 | JpSimpleTokenizer.NAME = 'JpSimpleTokenizer'; 63 | exports.init = JpSimpleTokenizer.init.bind(JpSimpleTokenizer); 64 | exports.type = JpSimpleTokenizer.type; 65 | exports.default = JpSimpleTokenizer; 66 | //# sourceMappingURL=JpSimpleTokenizer.js.map -------------------------------------------------------------------------------- /lib/submod/JpSimpleTokenizer.ts: --------------------------------------------------------------------------------
/**
 * Created by user on 2018/4/19/019.
 */

import { SubSModuleTokenizer } from '../mod';
import { IWord } from '../Segment';
import { IWordDebug } from '../util';

/**
 * Kana type stored in a token's debug info under the tokenizer's name.
 */
export const enum EnumJpSimpleTokenizerType
{
	/**
	 * Hiragana
	 * https://en.wikipedia.org/wiki/Hiragana
	 */
	HIRAGANA = 0x1,
	/**
	 * Katakana
	 * https://en.wikipedia.org/wiki/Katakana
	 */
	KATAKANA = 0x2,
}

/**
 * Tokenizer that cuts text at hiragana/katakana boundaries and tags every
 * piece with the kana type it was classified as.
 */
export class JpSimpleTokenizer extends SubSModuleTokenizer
{
	static override NAME = 'JpSimpleTokenizer' as const;

	override name = 'JpSimpleTokenizer' as const;

	/**
	 * Runs `_splitText` over the words through the inherited `_splitUnset`.
	 *
	 * @param words word list to process
	 * @param argv  extra arguments; not used by this method
	 * @returns the word list produced by `_splitUnset`
	 */
	split(words: IWord[], ...argv): IWord[]
	{
		// NOTE(review): `_splitText` is passed unbound and relies on `this`;
		// presumably `_splitUnset` invokes it with this instance - confirm.
		return this._splitUnset(words, this._splitText);
	}

	/**
	 * Attaches `{ [this.name]: type }` to the token's debug info and returns
	 * the token itself (third argument of `debugToken` is `true`).
	 *
	 * NOTE(review): the type parameter was not declared in the signature as
	 * found; `T extends IWordDebug` is assumed because `IWordDebug` is
	 * imported by this file and used nowhere else - confirm.
	 *
	 * @param data token to tag
	 * @param type kana type to record
	 * @returns `data`, with the debug entry added
	 */
	protected createJpSimpleToken<T extends IWordDebug>(data: T, type: EnumJpSimpleTokenizerType)
	{
		return super.debugToken(data, {
			[this.name]: type,
		}, true);
	}

	/**
	 * Splits one text into kana-tagged tokens.
	 *
	 * - Text lacking hiragana or katakana (or both): returns a single tagged
	 *   token when the whole text is made of one kana type, otherwise `null`.
	 * - Text containing both: splits where one kana run is directly followed
	 *   by the other kind; each non-empty piece is tagged HIRAGANA when it
	 *   contains any hiragana character and KATAKANA otherwise.
	 *
	 * @param text text to split
	 * @returns tagged tokens, or `null` when nothing applies
	 */
	protected _splitText(text: string): IWord[]
	{
		const hasHiragana = /[ぁ-ん]/.test(text);
		const hasKatakana = /[ァ-ヴーア-ン゙ー]/.test(text);

		if (!hasHiragana || !hasKatakana)
		{
			// At most one kana type is present: accept only a pure run of it.
			if (hasHiragana && /^[ぁ-ん]+$/.test(text) || hasKatakana && /^[ァ-ヴーア-ン゙ー]+$/.test(text))
			{
				return [this.createJpSimpleToken({
					w: text,
				}, hasHiragana ? EnumJpSimpleTokenizerType.HIRAGANA : EnumJpSimpleTokenizerType.KATAKANA)];
			}

			return null;
		}

		const ret: IWord[] = [];

		// The capturing group keeps the matched runs in the split() result.
		const pieces = text
			.split(/((?:[^ァ-ヴーア-ン゙ー]+)?[ぁ-ん]+(?=[ァ-ヴーア-ン゙ー])|(?:[^ぁ-ん]+)?[ァ-ヴーア-ン゙ー]+(?=[ぁ-ん]))/);

		for (const w of pieces)
		{
			if (w === '')
			{
				continue;
			}

			ret.push(this.createJpSimpleToken({
				w,
			}, /[ぁ-ん]/.test(w)
				? EnumJpSimpleTokenizerType.HIRAGANA
				: EnumJpSimpleTokenizerType.KATAKANA));
		}

		return ret;
	}

}

export const init = JpSimpleTokenizer.init.bind(JpSimpleTokenizer) as typeof JpSimpleTokenizer.init;

export const type = JpSimpleTokenizer.type;

export default JpSimpleTokenizer;

-------------------------------------------------------------------------------- /lib/submod/PunctuationTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 标点符号识别模块 3 | * 4 | * @author 老雷 5 | */ 6 | import { SubSModuleTokenizer } from '../mod'; 7 | import { IWord } from '../Segment'; 8 | export declare class PunctuationTokenizer extends SubSModuleTokenizer { 9 | name: string; 10 | _STOPWORD: string[]; 11 | STOPWORD: { 12 | [key: string]: number; 13 | }; 14 | STOPWORD2: { 15 | [key: number]: { 16 | [key: string]: number; 17 | }; 18 | }; 19 | /** 20 | * 对未识别的单词进行分词 21 | * 22 | * @param {array} words 单词数组 23 | * @return {array} 24 | */ 25 | split(words: IWord[]): IWord[]; 26 | /** 27 | * 匹配包含的标点符号,返回相关信息 28 | * 29 | * @param {string} text 文本 30 | * @param {int} cur 开始位置 31 | * @return {array} 返回格式 {w: '网址', c: 开始位置} 32 | */ 33 | matchStopword(text: string, cur?: number): IWord[]; 34 | } 35 | export declare const init: typeof PunctuationTokenizer.init; 36 | export declare const type = "tokenizer"; 37 | export default PunctuationTokenizer; 38 | --------------------------------------------------------------------------------
/lib/submod/PunctuationTokenizer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.type = exports.init = exports.PunctuationTokenizer = void 0; 4 | /** 5 | * 标点符号识别模块 6 | * 7 | * @author 老雷 8 | */ 9 | const mod_1 = require("../mod"); 10 | const STOPWORD_1 = require("../mod/data/STOPWORD"); 11 | class PunctuationTokenizer extends mod_1.SubSModuleTokenizer { 12 | constructor() { 13 | super(...arguments); 14 | this.name = 'PunctuationTokenizer'; 15 | this._STOPWORD = STOPWORD_1._STOPWORD; 16 | this.STOPWORD = STOPWORD_1.STOPWORD; 17 | this.STOPWORD2 = STOPWORD_1.STOPWORD2; 18 | } 19 | /** 20 | * 对未识别的单词进行分词 21 | * 22 | * @param {array} words 单词数组 23 | * @return {array} 24 | */ 25 | split(words) { 26 | const POSTAG = this._POSTAG; 27 | const self = this; 28 | let ret = []; 29 | for (let i = 0, word; word = words[i]; i++) { 30 | if (word.p > 0) { 31 | ret.push(word); 32 | continue; 33 | } 34 | // 仅对未识别的词进行匹配 35 | let stopinfo = self.matchStopword(word.w); 36 | if (stopinfo.length < 1) { 37 | ret.push(word); 38 | continue; 39 | } 40 | // 分离出标点符号 41 | let lastc = 0; 42 | for (let ui = 0, sw; sw = stopinfo[ui]; ui++) { 43 | if (sw.c > lastc) { 44 | ret.push({ 45 | w: word.w.substr(lastc, sw.c - lastc) 46 | }); 47 | } 48 | ret.push(self.debugToken({ 49 | w: sw.w, 50 | p: POSTAG.D_W 51 | }, { 52 | [self.name]: true, 53 | }, true)); 54 | lastc = sw.c + sw.w.length; 55 | } 56 | let lastsw = stopinfo[stopinfo.length - 1]; 57 | if (lastsw.c + lastsw.w.length < word.w.length) { 58 | ret.push({ 59 | w: word.w.substr(lastsw.c + lastsw.w.length) 60 | }); 61 | } 62 | } 63 | return ret; 64 | } 65 | /** 66 | * 匹配包含的标点符号,返回相关信息 67 | * 68 | * @param {string} text 文本 69 | * @param {int} cur 开始位置 70 | * @return {array} 返回格式 {w: '网址', c: 开始位置} 71 | */ 72 | matchStopword(text, cur) { 73 | const STOPWORD2 = this.STOPWORD2; 74 | if (isNaN(cur)) 75 | cur = 0; 76 | 
let ret = []; 77 | let isMatch = false; 78 | while (cur < text.length) { 79 | let w; 80 | for (let i in STOPWORD2) { 81 | w = text.substr(cur, i); 82 | if (w in STOPWORD2[i]) { 83 | ret.push({ w: w, c: cur }); 84 | isMatch = true; 85 | break; 86 | } 87 | } 88 | cur += isMatch === false ? 1 : w.length; 89 | isMatch = false; 90 | } 91 | return ret; 92 | } 93 | } 94 | exports.PunctuationTokenizer = PunctuationTokenizer; 95 | // debug(STOPWORD2); 96 | exports.init = PunctuationTokenizer.init.bind(PunctuationTokenizer); 97 | exports.type = PunctuationTokenizer.type; 98 | exports.default = PunctuationTokenizer; 99 | //# sourceMappingURL=PunctuationTokenizer.js.map -------------------------------------------------------------------------------- /lib/submod/PunctuationTokenizer.ts: --------------------------------------------------------------------------------
'use strict';

/**
 * Punctuation recognition module
 *
 * @author 老雷
 */

import { SubSModuleTokenizer } from '../mod';
import { IWord } from '../Segment';
import { _STOPWORD, STOPWORD, STOPWORD2 } from '../mod/data/STOPWORD';

/**
 * Tokenizer that cuts stopword-table entries out of words whose `p` is not
 * a positive number and tags the cut-out pieces with `POSTAG.D_W`.
 */
export class PunctuationTokenizer extends SubSModuleTokenizer
{
	override name = 'PunctuationTokenizer';

	public _STOPWORD = _STOPWORD;
	public STOPWORD = STOPWORD;
	public STOPWORD2 = STOPWORD2;

	/**
	 * Splits stopword-table entries out of the words that are not tagged yet.
	 *
	 * Words with `p > 0` and words without any match are passed through
	 * unchanged. The text between matches is emitted as tokens carrying only
	 * `w`. Iteration stops at the first falsy entry of `words`.
	 *
	 * @param words word list
	 * @returns new word list
	 */
	split(words: IWord[]): IWord[]
	{
		const POSTAG = this._POSTAG;

		const ret = [];
		for (const word of words)
		{
			if (!word)
			{
				break;
			}

			if (word.p > 0)
			{
				ret.push(word);
				continue;
			}

			// Only words that are not tagged yet are matched.
			const hits = this.matchStopword(word.w);
			if (hits.length < 1)
			{
				ret.push(word);
				continue;
			}

			// Interleave the untagged gaps with the matched entries.
			let consumed = 0;
			for (const hit of hits)
			{
				if (!hit)
				{
					break;
				}

				if (hit.c > consumed)
				{
					ret.push({
						w: word.w.substr(consumed, hit.c - consumed)
					});
				}

				ret.push(this.debugToken({
					w: hit.w,
					p: POSTAG.D_W
				}, {
					[this.name]: true,
				}, true));

				consumed = hit.c + hit.w.length;
			}

			// Whatever follows the last match is kept as an untagged token.
			const tail = hits[hits.length - 1];
			const tailEnd = tail.c + tail.w.length;
			if (tailEnd < word.w.length)
			{
				ret.push({
					w: word.w.substr(tailEnd)
				});
			}
		}
		return ret;
	}

	/**
	 * Finds the stopword-table entries contained in `text`.
	 *
	 * At every position the lengths listed in `STOPWORD2` are tried in the
	 * table's enumeration order and the first hit wins; the scan then jumps
	 * past the hit, or advances by one character when nothing matched.
	 *
	 * @param text text to scan
	 * @param cur  start position; values for which `isNaN` is true
	 *             (including `undefined`) are treated as 0
	 * @returns matches in the form `{w: matched text, c: start position}`
	 */
	matchStopword(text: string, cur?: number): IWord[]
	{
		const table = this.STOPWORD2;

		if (isNaN(cur)) cur = 0;
		const ret = [];
		while (cur < text.length)
		{
			let w;
			let matched = false;
			for (const len in table)
			{
				w = text.substr(cur, len as any as number);
				if (w in table[len])
				{
					ret.push({ w: w, c: cur });
					matched = true;
					break;
				}
			}
			cur += matched ? w.length : 1;
		}

		return ret;
	}
}

// debug(STOPWORD2);

export const init = PunctuationTokenizer.init.bind(PunctuationTokenizer) as typeof PunctuationTokenizer.init;

export const type = PunctuationTokenizer.type;

export default PunctuationTokenizer;
-------------------------------------------------------------------------------- /lib/submod/SingleTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | import { SubSModuleTokenizer } from '../mod'; 2 | import { IWord } from '../Segment'; 3 | /** 4 | * 单字切分模块 5 | * 此模組不包含模組列表內 需要手動指定 6 | * 7 | * @author 老雷 8 | */ 9 | export declare class SingleTokenizer extends SubSModuleTokenizer { 10 | /** 11 | * 对未识别的单词进行分词 12 | * 13 | * @param {array} words 单词数组 14 | * @return {array} 15 | */ 16 | split(words: IWord[]): IWord[]; 17 | /** 18 | * 单字切分 19 | * 20 | * @param {string} text 要切分的文本 21 | * @param {int} cur 开始位置 22 | * @return {array} 23 | */ 24 | splitSingle(text: any, cur?: number): IWord[]; 25 | } 26 | export declare const init: typeof SingleTokenizer.init; 27 | export declare const type = "tokenizer"; 28 | export default SingleTokenizer; 29 | -------------------------------------------------------------------------------- /lib/submod/SingleTokenizer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.type = exports.init = exports.SingleTokenizer = void 0; 4 | const tslib_1 = require("tslib"); 5 | const mod_1 = require("../mod"); 6 | const uni_string_1 = tslib_1.__importDefault(require("uni-string")); 7 | /** 8 | * 单字切分模块 9 | * 此模組不包含模組列表內 需要手動指定 10 | * 11 | * @author 老雷 12 | */ 13 | class SingleTokenizer extends mod_1.SubSModuleTokenizer { 14 | /** 15 | * 对未识别的单词进行分词 16 | * 17 | * @param {array} words 单词数组 18 | * @return {array} 19 | */ 20 | 
split(words) { 21 | const POSTAG = this.segment.POSTAG; 22 | let ret = []; 23 | for (let i = 0, word; word = words[i]; i++) { 24 | if (typeof word.p === 'undefined' || word.p) { 25 | ret.push(word); 26 | } 27 | else { 28 | // 仅对未识别的词进行匹配 29 | ret = ret.concat(this.splitSingle(word.w)); 30 | } 31 | } 32 | return ret; 33 | } 34 | /** 35 | * 单字切分 36 | * 37 | * @param {string} text 要切分的文本 38 | * @param {int} cur 开始位置 39 | * @return {array} 40 | */ 41 | splitSingle(text, cur) { 42 | const POSTAG = this.segment.POSTAG; 43 | if (isNaN(cur)) 44 | cur = 0; 45 | if (cur > 0) { 46 | text = text.slice(cur); 47 | } 48 | let ret = []; 49 | uni_string_1.default 50 | .split(text, '') 51 | .forEach(function (w, i) { 52 | ret.push({ 53 | w, 54 | p: POSTAG.UNK, 55 | }); 56 | }); 57 | return ret; 58 | } 59 | } 60 | exports.SingleTokenizer = SingleTokenizer; 61 | exports.init = SingleTokenizer.init.bind(SingleTokenizer); 62 | exports.type = SingleTokenizer.type; 63 | exports.default = SingleTokenizer; 64 | //# sourceMappingURL=SingleTokenizer.js.map -------------------------------------------------------------------------------- /lib/submod/SingleTokenizer.ts: --------------------------------------------------------------------------------
'use strict';

import { SubSModuleTokenizer } from '../mod';
import { IWord } from '../Segment';
import UString from 'uni-string';

/**
 * Single-character splitting module.
 * This module is not part of the default module list and has to be
 * specified manually.
 *
 * @author 老雷
 */
export class SingleTokenizer extends SubSModuleTokenizer
{

	/**
	 * Splits words into single characters.
	 *
	 * A word is passed through unchanged when its `p` is `undefined` or
	 * truthy; only words whose `p` is defined but falsy (e.g. `0`) are split
	 * with `splitSingle`. Iteration stops at the first falsy entry of `words`.
	 *
	 * @param words word list
	 * @returns new word list
	 */
	split(words: IWord[]): IWord[]
	{
		const ret: IWord[] = [];

		for (const word of words)
		{
			if (!word)
			{
				break;
			}

			if (typeof word.p === 'undefined' || word.p)
			{
				ret.push(word);
				continue;
			}

			// Append in place instead of re-allocating the result per word.
			for (const piece of this.splitSingle(word.w))
			{
				ret.push(piece);
			}
		}

		return ret;
	}

	/**
	 * Cuts a text into one token per element returned by
	 * `UString.split(text, '')`, each tagged `POSTAG.UNK`.
	 *
	 * @param text text to split
	 * @param cur  start position; values for which `isNaN` is true
	 *             (including `undefined`) are treated as 0, and a positive
	 *             value drops that many leading UTF-16 units via `slice`
	 * @returns one token per split element
	 */
	splitSingle(text, cur?: number): IWord[]
	{
		const POSTAG = this.segment.POSTAG;

		if (isNaN(cur)) cur = 0;

		if (cur > 0)
		{
			text = text.slice(cur);
		}

		const ret: IWord[] = [];

		UString
			.split(text, '')
			.forEach((w) =>
			{
				ret.push({
					w,
					p: POSTAG.UNK,
				});
			})
		;

		return ret;
	}
}

export const init = SingleTokenizer.init.bind(SingleTokenizer) as typeof SingleTokenizer.init;

export const type = SingleTokenizer.type;

export default SingleTokenizer;
-------------------------------------------------------------------------------- /lib/submod/URLTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | import Segment, { IWord } from '../Segment'; 2 | /** 3 | * URL识别模块 4 | * 5 | * @author 老雷 6 | */ 7 | /** 8 | * 模块类型 9 | * */ 10 | export declare const type = "tokenizer"; 11 | export declare let segment: Segment; 12 | /** 13 | * 模块初始化 14 | * 15 | * @param {Segment} segment 分词接口 16 | */ 17 | export declare function init(_segment: Segment): void; 18 | /** 19 | * 对未识别的单词进行分词 20 | * 21 | * @param {array} words 单词数组 22 | * @return {array} 23 | */ 24 | export declare function split(words: IWord[]): IWord[]; 25 | /** 26 | * 匹配包含的网址,返回相关信息 27 | * 28 | * @param {string} text 文本 29 | * @param {int} cur 开始位置 30 | * @return {array} 返回格式 {w: '网址', c: 开始位置} 31 | */ 32 | export declare function matchURL(text: string, cur?: number): any[]; 33 | -------------------------------------------------------------------------------- /lib/submod/WildcardTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 通配符识别模块 3 | * 4 | * @author 老雷 5 | */ 6 | import { ISubTokenizerCreate, SubSModuleTokenizer
} from '../mod'; 7 | import { IDICT, IDICT2, IWord } from '../Segment'; 8 | import { IWordDebugInfo } from '../util/index'; 9 | export declare class WildcardTokenizer extends SubSModuleTokenizer { 10 | name: string; 11 | protected _TABLE: IDICT; 12 | protected _TABLE2: IDICT2; 13 | _cache(): void; 14 | /** 15 | * 对未识别的单词进行分词 16 | * 17 | * @param {array} words 单词数组 18 | * @return {array} 19 | */ 20 | split(words: IWord[]): IWord[]; 21 | createWildcardToken(word: IWord, lasttype?: number, attr?: IWordDebugInfo): IWord; 22 | splitWildcard(text: string, cur?: number): IWord[]; 23 | /** 24 | * 匹配单词,返回相关信息 25 | * 26 | * @param {string} text 文本 27 | * @param {int} cur 开始位置 28 | * @return {array} 返回格式 {w: '单词', c: 开始位置} 29 | */ 30 | matchWord(text: string, cur?: number): IWord[]; 31 | } 32 | export declare const init: ISubTokenizerCreate; 33 | export declare const type = "tokenizer"; 34 | export default WildcardTokenizer; 35 | -------------------------------------------------------------------------------- /lib/submod/WildcardTokenizer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.type = exports.init = exports.WildcardTokenizer = void 0; 4 | /** 5 | * 通配符识别模块 6 | * 7 | * @author 老雷 8 | */ 9 | const mod_1 = require("../mod"); 10 | class WildcardTokenizer extends mod_1.SubSModuleTokenizer { 11 | constructor() { 12 | super(...arguments); 13 | this.name = 'WildcardTokenizer'; 14 | } 15 | _cache() { 16 | super._cache(); 17 | this._TABLE = this.segment.getDict('WILDCARD'); 18 | this._TABLE2 = this.segment.getDict('WILDCARD2'); 19 | } 20 | /** 21 | * 对未识别的单词进行分词 22 | * 23 | * @param {array} words 单词数组 24 | * @return {array} 25 | */ 26 | split(words) { 27 | //return this._splitUnknow(words, this.splitForeign); 28 | return this._splitUnknow(words, this.splitWildcard); 29 | } 30 | createWildcardToken(word, lasttype, attr) { 31 | let nw = 
this.createToken(word, true, attr); 32 | return nw; 33 | } 34 | splitWildcard(text, cur) { 35 | var _a; 36 | //const POSTAG = this._POSTAG; 37 | const TABLE = this._TABLE; 38 | let ret = []; 39 | let self = this; 40 | // 分离出已识别的单词 41 | let wordinfo = self.matchWord(text); 42 | if (wordinfo.length) { 43 | let lastc = 0; 44 | for (let ui = 0, bw; bw = wordinfo[ui]; ui++) { 45 | if (bw.c > lastc) { 46 | ret.push({ 47 | w: text.substr(lastc, bw.c - lastc), 48 | }); 49 | } 50 | let nw = self.createWildcardToken({ 51 | w: bw.w, 52 | p: (_a = TABLE[bw.w.toLowerCase()]) === null || _a === void 0 ? void 0 : _a.p, 53 | }); 54 | ret.push(nw); 55 | lastc = bw.c + bw.w.length; 56 | } 57 | let lastword = wordinfo[wordinfo.length - 1]; 58 | if (lastword.c + lastword.w.length < text.length) { 59 | ret.push({ 60 | w: text.substr(lastword.c + lastword.w.length), 61 | }); 62 | } 63 | } 64 | return ret.length ? ret : undefined; 65 | } 66 | /** 67 | * 匹配单词,返回相关信息 68 | * 69 | * @param {string} text 文本 70 | * @param {int} cur 开始位置 71 | * @return {array} 返回格式 {w: '单词', c: 开始位置} 72 | */ 73 | matchWord(text, cur) { 74 | //const POSTAG = this._POSTAG; 75 | const TABLE = this._TABLE2; 76 | if (isNaN(cur)) 77 | cur = 0; 78 | let ret = []; 79 | //let self = this; 80 | let s = false; 81 | // 匹配可能出现的单词,取长度最大的那个 82 | let lowertext = text.toLowerCase(); 83 | while (cur < text.length) { 84 | let stopword = null; 85 | for (let i in TABLE) { 86 | if (lowertext.substr(cur, i) in TABLE[i]) { 87 | stopword = { 88 | w: text.substr(cur, i), 89 | c: cur, 90 | }; 91 | } 92 | } 93 | if (stopword !== null) { 94 | ret.push(stopword); 95 | cur += stopword.w.length; 96 | } 97 | else { 98 | cur++; 99 | } 100 | } 101 | return ret; 102 | } 103 | } 104 | exports.WildcardTokenizer = WildcardTokenizer; 105 | exports.init = WildcardTokenizer.init.bind(WildcardTokenizer); 106 | exports.type = WildcardTokenizer.type; 107 | exports.default = WildcardTokenizer; 108 | //# sourceMappingURL=WildcardTokenizer.js.map 
-------------------------------------------------------------------------------- /lib/submod/WildcardTokenizer.ts: --------------------------------------------------------------------------------
'use strict';

/**
 * Wildcard recognition module
 *
 * @author 老雷
 */
import { ISubTokenizerCreate, SubSModuleTokenizer } from '../mod';
import { IDICT, IDICT2, IWord } from '../Segment';
import { IWordDebugInfo } from '../util/index';

/**
 * Tokenizer that cuts entries of the `WILDCARD` / `WILDCARD2` dictionaries
 * out of text.
 */
export class WildcardTokenizer extends SubSModuleTokenizer
{

	override name = 'WildcardTokenizer';

	protected override _TABLE: IDICT;
	protected _TABLE2: IDICT2;

	/**
	 * Refreshes the cached dictionaries from the segment:
	 * `WILDCARD` into `_TABLE` and `WILDCARD2` into `_TABLE2`.
	 */
	override _cache()
	{
		super._cache();
		this._TABLE = this.segment.getDict('WILDCARD');
		this._TABLE2 = this.segment.getDict('WILDCARD2');
	}

	/**
	 * Runs `splitWildcard` over the words through the inherited
	 * `_splitUnknow`.
	 *
	 * @param words word list
	 * @returns the word list produced by `_splitUnknow`
	 */
	split(words: IWord[]): IWord[]
	{
		return this._splitUnknow(words, this.splitWildcard);
	}

	/**
	 * Builds the token for a matched wildcard via `createToken(word, true, attr)`.
	 *
	 * @param word     token data
	 * @param lasttype accepted for signature compatibility; not used
	 * @param attr     debug info forwarded to `createToken`
	 * @returns the token returned by `createToken`
	 */
	createWildcardToken(word: IWord, lasttype?: number, attr?: IWordDebugInfo)
	{
		return this.createToken(word, true, attr);
	}

	/**
	 * Splits a text into wildcard tokens and the untagged text around them.
	 *
	 * Matched entries get the `p` of their lower-cased `_TABLE` entry
	 * (`undefined` when there is no such entry); text between and after the
	 * matches is emitted as tokens carrying only `w`.
	 *
	 * @param text text to split
	 * @param cur  accepted for signature compatibility; not used - the scan
	 *             always starts at the beginning of `text`
	 * @returns the token list, or `undefined` when nothing matched
	 */
	splitWildcard(text: string, cur?: number): IWord[]
	{
		const TABLE = this._TABLE;

		const matches = this.matchWord(text);
		if (!matches.length)
		{
			return undefined;
		}

		const ret: IWord[] = [];
		let lastc = 0;

		for (const bw of matches)
		{
			// Untagged text in front of this match.
			if (bw.c > lastc)
			{
				ret.push({
					w: text.substr(lastc, bw.c - lastc),
				});
			}

			ret.push(this.createWildcardToken({
				w: bw.w,
				p: TABLE[bw.w.toLowerCase()]?.p,
			}));

			lastc = bw.c + bw.w.length;
		}

		// Untagged text behind the last match.
		if (lastc < text.length)
		{
			ret.push({
				w: text.substr(lastc),
			});
		}

		return ret;
	}

	/**
	 * Finds dictionary entries in `text`, comparing case-insensitively.
	 *
	 * At every position all lengths listed in `_TABLE2` are tried and the
	 * last one that matches is kept; integer-like keys enumerate in ascending
	 * order, so that is the longest match. The scan then jumps past the
	 * match, or advances by one character when nothing matched.
	 *
	 * NOTE(review): offsets are computed on `text.toLowerCase()` but applied
	 * to `text`; this assumes lower-casing never changes the string length -
	 * confirm for the inputs in use.
	 *
	 * @param text text to scan
	 * @param cur  start position; values for which `isNaN` is true
	 *             (including `undefined`) are treated as 0
	 * @returns matches in the form `{w: matched text, c: start position}`
	 */
	matchWord(text: string, cur?: number): IWord[]
	{
		const TABLE = this._TABLE2;

		if (isNaN(cur)) cur = 0;

		const ret: IWord[] = [];
		const lowertext = text.toLowerCase();

		while (cur < text.length)
		{
			let longest: IWord = null;
			for (const len in TABLE)
			{
				if (lowertext.substr(cur, len as any) in TABLE[len])
				{
					longest = {
						w: text.substr(cur, len as any),
						c: cur,
					};
				}
			}

			if (longest === null)
			{
				cur++;
				continue;
			}

			ret.push(longest);
			cur += longest.w.length;
		}

		return ret;
	}

}

export const init = WildcardTokenizer.init.bind(WildcardTokenizer) as ISubTokenizerCreate;

export const type = WildcardTokenizer.type;

export default WildcardTokenizer;
-------------------------------------------------------------------------------- /lib/submod/ZhRadicalTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | import { ISubTokenizerCreate, SubSModuleTokenizer } from '../mod'; 2 | import { IDICT, IDICT2, IWord } from '../Segment'; 3 | /** 4 | * 此模組目前無任何用處與效果 5 | * 6 | * @todo 部首 7 | */ 8 | export declare class ZhRadicalTokenizer extends SubSModuleTokenizer { 9 | name: string; 10 | protected _TABLE: IDICT; 11 | protected _TABLE2: IDICT2; 12 | protected _cache(...argv: any[]): void; 13 | split(words: IWord[]): IWord[]; 14 | splitZhRadical(text: string, cur?:
number): IWord[]; 15 | } 16 | export declare const init: ISubTokenizerCreate; 17 | export declare const type = "tokenizer"; 18 | export default ZhRadicalTokenizer; 19 | -------------------------------------------------------------------------------- /lib/submod/ZhRadicalTokenizer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.type = exports.init = exports.ZhRadicalTokenizer = void 0; 4 | const mod_1 = require("../mod"); 5 | /** 6 | * 此模組目前無任何用處與效果 7 | * 8 | * @todo 部首 9 | */ 10 | class ZhRadicalTokenizer extends mod_1.SubSModuleTokenizer { 11 | constructor() { 12 | super(...arguments); 13 | this.name = 'ZhRadicalTokenizer'; 14 | } 15 | _cache(...argv) { 16 | super._cache(...argv); 17 | } 18 | split(words) { 19 | return this._splitUnset(words, this.splitZhRadical); 20 | } 21 | splitZhRadical(text, cur) { 22 | let ret = []; 23 | let self = this; 24 | let _r = /[\u4136\u4137]/u; 25 | if (!_r.test(text)) { 26 | return null; 27 | } 28 | text 29 | .split(/([\u4136\u4137]+)/u) 30 | .forEach(function (w, i) { 31 | if (w !== '') { 32 | if (_r.test(w)) { 33 | ret.push(self.debugToken({ 34 | w, 35 | }, { 36 | [self.name]: true, 37 | }, true)); 38 | } 39 | else { 40 | ret.push({ 41 | w, 42 | }); 43 | } 44 | } 45 | }); 46 | return ret.length ? 
ret : null; 47 | } 48 | } 49 | exports.ZhRadicalTokenizer = ZhRadicalTokenizer; 50 | exports.init = ZhRadicalTokenizer.init.bind(ZhRadicalTokenizer); 51 | exports.type = ZhRadicalTokenizer.type; 52 | exports.default = ZhRadicalTokenizer; 53 | //# sourceMappingURL=ZhRadicalTokenizer.js.map -------------------------------------------------------------------------------- /lib/submod/ZhRadicalTokenizer.ts: --------------------------------------------------------------------------------
'use strict';

import { ISubTokenizerCreate, SubSModuleTokenizer } from '../mod';
import { IDICT, IDICT2, IWord } from '../Segment';

/**
 * This module currently has no use or effect.
 *
 * @todo radicals
 */
export class ZhRadicalTokenizer extends SubSModuleTokenizer
{

	override name = 'ZhRadicalTokenizer';

	protected override _TABLE: IDICT;
	protected _TABLE2: IDICT2;

	/**
	 * Forwards every argument to the inherited `_cache`.
	 */
	protected override _cache(...argv)
	{
		super._cache(...argv);
	}

	/**
	 * Runs `splitZhRadical` over the words through the inherited
	 * `_splitUnset`.
	 *
	 * @param words word list
	 * @returns the word list produced by `_splitUnset`
	 */
	split(words: IWord[]): IWord[]
	{
		return this._splitUnset(words, this.splitZhRadical);
	}

	/**
	 * Separates runs of U+4136 / U+4137 from the rest of the text.
	 *
	 * Runs of those characters become tokens carrying a
	 * `{ [this.name]: true }` debug entry; everything else becomes tokens
	 * with only `w`. Empty pieces are dropped.
	 *
	 * @param text text to split
	 * @param cur  accepted for signature compatibility; not used
	 * @returns the token list, or `null` when the text contains neither
	 *          character
	 */
	splitZhRadical(text: string, cur?: number): IWord[]
	{
		const _r = /[\u4136\u4137]/u;

		if (!_r.test(text))
		{
			return null;
		}

		const ret: IWord[] = [];

		// The capturing group keeps the matched runs in the split() result.
		for (const w of text.split(/([\u4136\u4137]+)/u))
		{
			if (w === '')
			{
				continue;
			}

			if (_r.test(w))
			{
				ret.push(this.debugToken({
					w,
				}, {
					[this.name]: true,
				}, true));
				continue;
			}

			ret.push({
				w,
			});
		}

		return ret.length ? ret : null;
	}

}

export const init = ZhRadicalTokenizer.init.bind(ZhRadicalTokenizer) as ISubTokenizerCreate;

export const type = ZhRadicalTokenizer.type;

export default ZhRadicalTokenizer;
-------------------------------------------------------------------------------- /lib/submod/ZhtSynonymOptimizer.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/16/016. 3 | */ 4 | import { SubSModuleOptimizer } from '../mod'; 5 | import { IDICT, IDICT_SYNONYM, IWord } from '../Segment'; 6 | import { IWordDebug } from '../util'; 7 | /** 8 | * 以詞意來自動轉換 而不需要手動加入字典於 synonym.txt 9 | * 適用於比較容易需要人工處理的轉換 10 | * 11 | * 自動處理 `里|后` 12 | * 13 | * 建議在字典內追加人名地名等等名字 來增加準確性 14 | * 防止轉換錯誤 15 | * 16 | * @todo 發于余干松冲准呆只范舍涂 17 | */ 18 | export declare class ZhtSynonymOptimizer extends SubSModuleOptimizer { 19 | name: string; 20 | protected _SYNONYM?: IDICT_SYNONYM; 21 | protected _TABLE: IDICT; 22 | _cache(): void; 23 | isSynonymBlacklist(w: string): boolean; 24 | protected _getSynonym(w: string, nw: string): string; 25 | doOptimize(words: T[]): T[]; 26 | } 27 | export declare const init: typeof ZhtSynonymOptimizer.init; 28 | export declare const type = "optimizer"; 29 | export default ZhtSynonymOptimizer; 30 | -------------------------------------------------------------------------------- /lib/submod/ZhuyinTokenizer.d.ts: -------------------------------------------------------------------------------- 1 | import { ISubTokenizerCreate, SubSModuleTokenizer } from '../mod'; 2 | import { IDICT, IDICT2, IWord } from '../Segment'; 3 | /** 4 | * 注音 5 | */ 6 | export declare class ZhuyinTokenizer extends SubSModuleTokenizer { 7 | name: string; 8 | protected _TABLE: IDICT; 9 | protected _TABLE2: IDICT2; 10 | protected _cache(...argv: any[]): void; 11 | split(words: IWord[]): IWord[]; 12 | splitZhuyin(text: string, cur?: number): IWord[]; 13 | } 14 | export declare const init:
ISubTokenizerCreate; 15 | export declare const type = "tokenizer"; 16 | export default ZhuyinTokenizer; 17 | -------------------------------------------------------------------------------- /lib/submod/ZhuyinTokenizer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | exports.type = exports.init = exports.ZhuyinTokenizer = void 0; 4 | const mod_1 = require("../mod"); 5 | /** 6 | * 注音 7 | */ 8 | class ZhuyinTokenizer extends mod_1.SubSModuleTokenizer { 9 | constructor() { 10 | super(...arguments); 11 | this.name = 'ZhuyinTokenizer'; 12 | } 13 | _cache(...argv) { 14 | super._cache(...argv); 15 | } 16 | split(words) { 17 | return this._splitUnset(words, this.splitZhuyin); 18 | } 19 | splitZhuyin(text, cur) { 20 | let ret = []; 21 | let self = this; 22 | let _r = /[\u31A0-\u31BA\u3105-\u312E]/u; 23 | if (!_r.test(text)) { 24 | return null; 25 | } 26 | text 27 | .split(/([\u31A0-\u31BA\u3105-\u312E]+)/u) 28 | .forEach(function (w, i) { 29 | if (w !== '') { 30 | if (_r.test(w)) { 31 | ret.push(self.debugToken({ 32 | w, 33 | }, { 34 | [self.name]: true, 35 | }, true)); 36 | } 37 | else { 38 | ret.push({ 39 | w, 40 | }); 41 | } 42 | } 43 | }); 44 | return ret.length ? 
ret : null; 45 | } 46 | } 47 | exports.ZhuyinTokenizer = ZhuyinTokenizer; 48 | exports.init = ZhuyinTokenizer.init.bind(ZhuyinTokenizer); 49 | exports.type = ZhuyinTokenizer.type; 50 | exports.default = ZhuyinTokenizer; 51 | //# sourceMappingURL=ZhuyinTokenizer.js.map -------------------------------------------------------------------------------- /lib/submod/ZhuyinTokenizer.ts: --------------------------------------------------------------------------------
'use strict';

import { ISubTokenizerCreate, SubSModuleTokenizer } from '../mod';
import { IDICT, IDICT2, IWord } from '../Segment';

/**
 * Zhuyin (Bopomofo)
 */
export class ZhuyinTokenizer extends SubSModuleTokenizer
{

	override name = 'ZhuyinTokenizer';

	protected override _TABLE: IDICT;
	protected _TABLE2: IDICT2;

	/**
	 * Forwards every argument to the inherited `_cache`.
	 */
	protected override _cache(...argv)
	{
		super._cache(...argv);
	}

	/**
	 * Runs `splitZhuyin` over the words through the inherited `_splitUnset`.
	 *
	 * @param words word list
	 * @returns the word list produced by `_splitUnset`
	 */
	split(words: IWord[]): IWord[]
	{
		return this._splitUnset(words, this.splitZhuyin);
	}

	/**
	 * Separates runs of characters in U+3105-U+312E and U+31A0-U+31BA from
	 * the rest of the text.
	 *
	 * Runs of those characters become tokens carrying a
	 * `{ [this.name]: true }` debug entry; everything else becomes tokens
	 * with only `w`. Empty pieces are dropped.
	 *
	 * @param text text to split
	 * @param cur  accepted for signature compatibility; not used
	 * @returns the token list, or `null` when the text contains none of
	 *          those characters
	 */
	splitZhuyin(text: string, cur?: number): IWord[]
	{
		const _r = /[\u31A0-\u31BA\u3105-\u312E]/u;

		if (!_r.test(text))
		{
			return null;
		}

		const ret: IWord[] = [];

		// The capturing group keeps the matched runs in the split() result.
		for (const w of text.split(/([\u31A0-\u31BA\u3105-\u312E]+)/u))
		{
			if (w === '')
			{
				continue;
			}

			if (_r.test(w))
			{
				ret.push(this.debugToken({
					w,
				}, {
					[this.name]: true,
				}, true));
				continue;
			}

			ret.push({
				w,
			});
		}

		return ret.length ? ret : null;
	}

}

export const init = ZhuyinTokenizer.init.bind(ZhuyinTokenizer) as ISubTokenizerCreate;

export const type = ZhuyinTokenizer.type;

export default ZhuyinTokenizer;
-------------------------------------------------------------------------------- /lib/util/debug.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/19/019. 3 | */ 4 | import { IWord } from '@novel-segment/types'; 5 | export declare const SYMBOL_DEBUG_KEY = "_debug"; 6 | export type IWordDebugInfo = { 7 | ZhtSynonymOptimizer?: boolean; 8 | convertSynonym?: boolean; 9 | autoCreate?: boolean; 10 | _source?: T & IWordDebug; 11 | index?: number; 12 | ps_en?: string; 13 | [key: string]: any; 14 | [key: number]: any; 15 | }; 16 | export type IWordDebug = IWord & { 17 | m?: Array; 18 | ps?: string; 19 | pp?: string; 20 | ow?: string; 21 | op?: number; 22 | ops?: string; 23 | opp?: string; 24 | os?: boolean; 25 | [SYMBOL_DEBUG_KEY]?: IWordDebugInfo; 26 | }; 27 | export declare function clearTokemDebug(data: IWordDebugInfo, returnClone?: false): data is IWord; 28 | export declare function clearTokemDebug(data: IWordDebugInfo, returnClone?: true): IWord; 29 | export declare function debugToken(data: T, attr: U & IWordDebugInfo, returnToken: true, ...argv: any[]): T; 30 | export declare function debugToken(data: T, attr?: U & IWordDebugInfo, returnToken?: boolean, ...argv: any[]): U & IWordDebugInfo; 31 | export declare function debug_token(ks: Array, returnSource?: boolean): Array; 32 | export declare function token_add_info(v: T): T; 33 | export declare function toHex(p: number): string; 34 | -------------------------------------------------------------------------------- /lib/util/debug.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2018/4/19/019. 
// lib/util/debug.js — CommonJS module; the tslib helpers and the trailing
// sourceMappingURL suggest this is compiler output rather than hand-written code.
Object.defineProperty(exports, "__esModule", { value: true });
exports.SYMBOL_DEBUG_KEY = void 0;
exports.clearTokemDebug = clearTokemDebug;
exports.debugToken = debugToken;
exports.debug_token = debug_token;
exports.token_add_info = token_add_info;
exports.toHex = toHex;
const tslib_1 = require("tslib");
const sort_object_keys2_1 = tslib_1.__importDefault(require("sort-object-keys2"));
const i18n_1 = require("@novel-segment/postag/lib/i18n");
//export const SYMBOL_DEBUG_KEY = Symbol.for('_debug');
// Property name under which debug information is stored on a token.
exports.SYMBOL_DEBUG_KEY = '_debug';
/**
 * Reduce a token to its `w` / `p` / `f` fields.
 *
 * With a truthy `returnClone` a new object holding only those three fields is
 * returned and `data` is left untouched. Otherwise every other enumerable key
 * is deleted from `data` in place, the debug property is deleted as well, and
 * `data` itself is returned.
 */
function clearTokemDebug(data, returnClone) {
    if (returnClone) {
        return {
            w: data.w,
            p: data.p,
            f: data.f,
        };
    }
    for (let k in data) {
        if (k !== 'w' && k !== 'p' && k !== 'f') {
            delete data[k];
        }
    }
    // explicit delete also covers a debug property the loop did not enumerate
    delete data[exports.SYMBOL_DEBUG_KEY];
    return data;
}
/**
 * Merge `attr` into the token's debug object (created on demand).
 *
 * Returns `data` when `returnToken` is truthy, otherwise the debug object.
 * When the token has no debug object and `attr` is falsy, a fresh `{}` is
 * returned that is NOT attached to `data`. Extra arguments are ignored.
 */
function debugToken(data, attr, returnToken, ...argv) {
    if (attr) {
        data[exports.SYMBOL_DEBUG_KEY] = Object.assign(data[exports.SYMBOL_DEBUG_KEY] || {}, attr);
    }
    if (returnToken) {
        return data;
    }
    return (data[exports.SYMBOL_DEBUG_KEY] || {});
}
/**
 * Annotate a token list in place for debugging.
 *
 * Each token gets its position stored as `index` in its debug object. Tokens
 * with a truthy `p` are passed to `token_add_info`; tokens without `p` but
 * with `m` have every member of `m` passed to it; the remaining tokens are
 * collected into `ks2`.
 *
 * Returns the input list when `returnSource` is truthy, otherwise `ks2`
 * (the tokens that have neither `p` nor `m`).
 */
function debug_token(ks, returnSource) {
    let ks2 = [];
    // map() is used only for its side effects; its result is discarded
    // @ts-ignore
    ks.map(function (v, index) {
        //v.index = index;
        // @ts-ignore
        debugToken(v, {
            index,
        });
        if (v.p) {
            // @ts-ignore
            token_add_info(v);
        }
        else if (v.m) {
            // NOTE(review): members are passed straight to token_add_info, including
            // any string members that token_add_info's own `m` loop skips — confirm
            // this is intended.
            // @ts-ignore
            v.m.map(token_add_info);
        }
        else {
            // @ts-ignore
            ks2.push(v);
        }
    });
    return returnSource ? ks : ks2;
}
/**
 * Add human-readable part-of-speech fields to a token and return it.
 *
 * When `v.p` is truthy: `ps` is set from `zhName(p)`, `ps_en` (inside the
 * debug object) from `enName(p)` and `pp` from `toHex(p)`; `ops` / `opp` are
 * derived the same way from `op` when it is truthy. Non-string members of
 * `v.m` and the debug object's `_source` are processed recursively.
 * Finally the token is handed to sort-object-keys2 with a fixed key order.
 *
 * NOTE(review): `v.p` is read before the `if (v)` check, so a null or
 * undefined `v` throws before that guard is reached.
 */
function token_add_info(v) {
    if (v.p) {
        v.ps = (0, i18n_1.zhName)(v.p);
        //v.ps_en = POSTAG.enName(v.p);
        let debug = debugToken(v, {
            ps_en: (0, i18n_1.enName)(v.p),
        });
        v.pp = toHex(v.p);
        if (v.op) {
            v.ops = (0, i18n_1.zhName)(v.op);
            v.opp = toHex(v.op);
        }
        if (v.m) {
            // result of map() is discarded; only the recursive side effects matter
            v.m.map(function (v) {
                if (typeof v === 'string') {
                    return v;
                }
                return token_add_info(v);
            });
        }
        if (debug._source) {
            token_add_info(debug._source);
        }
    }
    if (v) {
        // presumably reorders the keys on `v` itself (useSource: true) — confirm
        // against the sort-object-keys2 documentation
        (0, sort_object_keys2_1.default)(v, {
            keys: [
                'w',
                'p',
                'f',
                'ps',
                'pp',
                'ow',
                'op',
                'ops',
                'opp',
                'os',
            ],
            useSource: true,
        });
    }
    return v;
}
/**
 * Format a number as `0x` followed by its upper-case hexadecimal digits,
 * left-padded with zeros to at least four digits (255 -> "0x00FF").
 */
function toHex(p) {
    return '0x' + p
        .toString(16)
        .padStart(4, '0')
        .toUpperCase();
}
//# sourceMappingURL=debug.js.map
/**
 * Render every entry of `argv` with `util.inspect`.
 *
 * @param argv    values to render
 * @param options inspect options; `colors` defaults to `true` unless overridden
 * @returns one formatted string per input value, in the same order
 */
export function debug_inspect(argv: any[], options: InspectOptions = {}): string[]
{
	const merged: InspectOptions = Object.assign({
		colors: true,
	}, options);

	return argv.map((value) => inspect(value, merged));
}

/**
 * Print all arguments to stdout, each rendered by {@link debug_inspect}
 * with its default options.
 */
export function debug(...argv: any[]): void
{
	console.log(...debug_inspect(argv));
}

/**
 * Same as {@link debug}, but takes the values as an array so that inspect
 * options can be supplied.
 */
export function debug_options(argv: any[], options?: InspectOptions): void
{
	console.log(...debug_inspect(argv, options));
}

/**
 * Test `n` against a list of bit masks and report the first overlap.
 *
 * @returns `n` itself when no mask is given; otherwise `n & mask` for the
 *          first mask sharing a bit with `n`, or `0` when none does
 */
export function hexAndAny(n: number, p?: number, ...argv: number[]): number
export function hexAndAny(n: number, ...argv: number[]): number
{
	if (!argv.length)
	{
		return n;
	}

	for (const mask of argv)
	{
		const overlap = n & mask;

		if (overlap)
		{
			return overlap;
		}
	}

	return 0;
}

/**
 * Require `n` to share at least one bit with every given mask.
 *
 * @returns `n` itself when no mask is given; `0` as soon as one mask has no
 *          bit in common with `n`; otherwise the bitwise OR of all masks
 *
 * @example
 * hexAnd(0x6000 | 0x8000, 0x2000, 0x4000) // => 0x6000
 */
export function hexAnd(n: number, p?: number, ...argv: number[]): number
export function hexAnd(n: number, ...argv: number[]): number
{
	if (!argv.length)
	{
		return n;
	}

	let combined = 0;

	for (const mask of argv)
	{
		if (!(n & mask))
		{
			return 0;
		}

		combined |= mask;
	}

	return combined;
}

/**
 * Bitwise OR of `n` with every additional value.
 */
export function hexOr(n: number, p?: number, ...argv: number[]): number
export function hexOr(n: number, ...argv: number[]): number
{
	for (const mask of argv)
	{
		n |= mask;
	}

	return n;
}
/**
 * Type guard: `true` when `val` is `undefined` or `null`.
 *
 * Other falsy values (`0`, `''`, `false`, `NaN`) count as set.
 */
export function isUnset<T>(val: T): val is Extract<T, null | undefined>
{
	return typeof val === 'undefined' || val === null
}

/**
 * Type guard: `true` when `val` is neither `undefined` nor `null`
 * (the exact negation of {@link isUnset}).
 */
export function isSet<T>(val: T): val is Exclude<T, null | undefined>
{
	return typeof val !== 'undefined' && val !== null
}

export default isUnset
#!/usr/bin/env node

// Interactive playground: starts a Node REPL whose context exposes a ready
// Segment instance plus a few shortcuts (s, ss, reload).

const fs = require('fs');
const path = require('path');
const repl = require('repl');

const server = repl.start('> ');
const ctx = server.context;

/**
 * (Re)create the segmenter and publish it on the REPL context:
 *   Segment – the module exported by this package
 *   segment – an instance with useDefault() applied
 *   s(...)  – forwards to segment.doSegment
 *   ss(...) – like s(), but returns the words joined with '/'
 */
ctx._load = function ()
{
	ctx.Segment = require('./');

	const instance = new ctx.Segment();
	instance.useDefault();
	ctx.segment = instance;

	// both shortcuts look up ctx.segment / ctx.s at call time
	ctx.s = function (...args)
	{
		return ctx.segment.doSegment(...args);
	};

	ctx.ss = function (...args)
	{
		return ctx.s
			.apply(null, args)
			.map((token) => token.w)
			.join('/');
	};
};

/**
 * Drop every cached module that lives under this directory, then load the
 * segmenter again and report how long that took.
 */
ctx.reload = function ()
{
	const startedAt = Date.now();
	const ownPrefix = path.resolve(__dirname) + path.sep;

	Object.keys(require.cache)
		.filter((id) => id.startsWith(ownPrefix))
		.forEach((id) =>
		{
			delete require.cache[id];
		});

	ctx._load();
	console.log('OK. (spent %sms)', Date.now() - startedAt);
};

ctx._load();
/**
 * Post-publish hook: commits all tracked changes and creates an annotated
 * tag named after the package version. Does nothing (apart from a warning)
 * unless the git root found from this directory is the project root itself.
 */
(async () =>
{
	let crossSpawn: typeof CrossSpawn;
	// @ts-ignore
	crossSpawn = await import('cross-spawn-extra');

	// @ts-ignore
	const findGitRoot = await import('git-root2').then(m => m.sync);
	// @ts-ignore
	const gitroot: string = findGitRoot(__dirname);

	// an empty relative path means both locations are the same directory
	const isProjectGitRoot = gitroot && !path.relative(gitroot, ProjectConfig.project_root);

	if (!isProjectGitRoot)
	{
		console.warn(`no git exists`);
		return;
	}

	const options = {
		cwd: ProjectConfig.project_root,
		stdio: 'inherit',
	};

	const versionList = ['novel-segment', 'segment-dict', 'cjk-conv', 'regexp-cjk']
		.map((name) => `${name}@${index.versions[name]}`)
		.join(', ');

	const msg = `npm publish ${PackageJson.version}` + `\n\n${versionList}`;

	await crossSpawn('git', [
		'commit',
		'-a',
		'-m',
		msg,
	], options);

	// short pause between the commit and the tag
	await new Promise((resolve) => setTimeout(resolve, 500));

	await crossSpawn('git', [
		'tag',
		'-a',
		PackageJson.version,
		'-m',
		msg,
	], options);

})().catch(e => console.error(e));
/**
 * Post-publish hook for the `test` folder: commits the changes below
 * `<project_root>/test` with a message listing the dependency versions.
 *
 * The commit is only attempted when the git root found from this directory
 * is either the project root or the workspace root exported by
 * `../../../__root_ws`; otherwise diagnostics are printed and nothing is run.
 */
(async () =>
{
	let crossSpawn: typeof CrossSpawn;
	// @ts-ignore
	crossSpawn = await import('cross-spawn-extra');

	let gitroot: string;

	// @ts-ignore
	gitroot = await import('git-root2').then(m => m.sync);
	// @ts-ignore
	gitroot = gitroot(__dirname);

	if (!gitroot || path.relative(gitroot, ProjectConfig.project_root))
	{
		// best effort: the workspace root module may not exist
		let __root_ws = await import('../../../__root_ws')
			.then(m => m.__root_ws)
			.catch(e => null)
		;

		// path.relative() throws on non-string input, so it must not be reached
		// when no git root was found; that case has to end in the warnings below
		const hasGitRoot = typeof gitroot === 'string';

		if (!hasGitRoot || !__root_ws || path.relative(gitroot, __root_ws))
		{
			console.warn(`no git exists`);
			console.warn(`__root_ws`, __root_ws);
			console.warn(`gitroot`, gitroot);
			console.warn(`path.relative`, hasGitRoot
				? path.relative(gitroot, ProjectConfig.project_root)
				: undefined);
			return;
		}
	}

	let cwd = join(ProjectConfig.project_root, 'test');

	let options = {
		cwd,
		stdio: 'inherit',
	};

	let msg = `novel-segment@${index.versions['novel-segment']}, segment-dict@${index.versions['segment-dict']}, cjk-conv@${index.versions['cjk-conv']}, regexp-cjk@${index.versions['regexp-cjk']}`;

	// commits only the current directory ('.'), i.e. the test folder
	await crossSpawn('git', [
		'commit',
		//'-a',
		'-m',
		msg,
		'.',
	], options);

})().catch(e => console.error(e));
/**
 * Adapt a (possibly async) test body to Mocha's callback style.
 *
 * The returned function takes Mocha's `done` callback and deliberately
 * returns nothing: Mocha rejects a test that both accepts `done` and returns
 * a Promise ("Resolution method is overspecified").
 *
 * @param fn test body; may return a value or a Promise, or throw
 * @returns a function that runs `fn` and then calls `done()` on success or
 *          `done(err)` when `fn` throws or its Promise rejects
 */
export function mochaAsync(fn: Function)
{
	return (done: (err?: unknown) => void): void =>
	{
		// the async wrapper still invokes fn synchronously and turns a
		// synchronous throw into a rejection, so both failure modes reach done(err)
		(async () => fn())()
			.then(() => done(), (err) => done(err));
	};
}
// ---- test/chk-fixme.ts ----
// Runs the `lazy.fixme` test file through Mocha without ever failing the
// process: failures are only reported as a warning.

import Mocha = require('mocha');
import fs = require('fs');
import path = require('path');
import yargs = require('yargs');

// parsed command-line arguments, handed to Mocha as its options object
let cli = yargs
	.argv
;

// @ts-ignore
const mocha = new Mocha(cli);

mocha.addFile(
	path.join(__dirname, 'lazy.fixme')
);

mocha.run(function(failures) {

	failures && console.warn(`Tests failed: ${failures}`);

	// keep the exit code at 0 even when tests failed
	process.exitCode = 0;
});

// NOTE(review): allowUncaught() is called after run() has already been
// started, and any error it throws is swallowed — confirm this order is intended.
try
{
	mocha.allowUncaught()
}
catch (e)
{

}

process.exitCode = 0;

// ---- test/lib/delete-cache.ts ----
/**
 * Created by user on 2018/4/17/017.
 *
 * Deletes every `cache.db` / `cache*.db` file found below `temp_root` and
 * logs each removed file relative to the project root. The default export is
 * the chain returned by the glob call, so importers can wait for completion.
 */

import { relative } from 'path';
import { removeSync } from 'fs-extra';
import { temp_root, project_root } from '../../project.config';
import { debug } from '../../lib/util';
import { async as FastGlob } from '@bluelovers/fast-glob/bluebird';

// timer is closed in the final tap() below
console.time(`[delete] cache`);

export default FastGlob([
	'**/cache.db',
	'**/cache*.db',
], {
	cwd: temp_root,
	absolute: true,
})
	.map((cache_file) => {

		debug(relative(project_root, cache_file));
		removeSync(cache_file);

	})
	.tap(() => {

		console.timeEnd(`[delete] cache`);

	})
;
/**
 * Wrap one of the `@novel-segment/assert` matchers so that its third
 * argument (the options object) always carries an `inspectFn`.
 *
 * The options object is shallow-copied first, so the caller's object is not
 * modified; an `inspectFn` supplied by the caller is kept, otherwise
 * `chai.util.inspect` is used.
 */
function _wrapFn<T extends (...argv: any[]) => any>(fn: T): T
{
	return ((...argv: Parameters<T>) => {
		argv[2] = {
			...(argv[2] ?? {}),
		};
		argv[2].inspectFn ??= chai.util.inspect;
		// @ts-ignore
		return fn(...argv)
	}) as T
}

export const lazyMatch = _wrapFn(_.lazyMatch);
export const lazyMatch002 = _wrapFn(_.lazyMatch002);
export const lazyMatchNot = _wrapFn(_.lazyMatchNot);
export const lazyMatchSynonym001 = _wrapFn(_.lazyMatchSynonym001);
export const lazyMatchSynonym001Not = _wrapFn(_.lazyMatchSynonym001Not);

/**
 * Apply the shared Mocha settings (30 second timeout) to a context and
 * return that same context.
 */
export function mochaSetup(mocha: Mocha.Context)
{
	mocha.timeout(30000);

	return mocha;
}

/**
 * Extract the `w` (word text) field of every token, keeping the order.
 */
export function toStringArray<T extends IWord[]>(arr: T)
{
	return arr.map(function (w)
	{
		return w.w;
	});
}

export default exports as typeof import('./util');

/**
 * Sort a list of test tuples IN PLACE: first by the stringified second
 * element, then by the first element, both compared with `zhDictCompare`.
 * Nothing is returned.
 */
export function sortTests<T extends [string, any, any?][]>(list: T)
{
	list.sort(function (a, b)
	{
		return zhDictCompare(String(a[1]), String(b[1]))
			|| zhDictCompare(a[0], b[0])
	})
}
3 | */ 4 | 5 | export const fixedGC = [ 6 | '接著就是對市政府的對外放送讓他能跑來避難以及多虧了市政府提供了這般安全的地方讓他歇息表達深深地感謝', 7 | `(二)控股股东,是指其出资额占有限责任公司资本总额百分之五十以上或者其持有的股份占股份有限公司股本总额百分之五十以上的股东;出资额或者持有股份的比例虽然不足百分之五十,但依其出资额或者持有的股份所享有的表决权已足以对股东会、股东大会的决议产生重大影响的股东。`, 8 | ]; 9 | 10 | export default fixedGC 11 | -------------------------------------------------------------------------------- /test/res/gc.not/666962621.txt: -------------------------------------------------------------------------------- 1 | https://github.com/leizongmin/node-segment/issues/35#issuecomment-666962621 2 | 3 | 一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十 4 | -------------------------------------------------------------------------------- /test/res/lazy.index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2019/4/12. 3 | * 4 | * 測試段落 每次發布版本時 會保證以下分析轉換是符合預期 5 | */ 6 | 7 | import { 8 | sortTests, 9 | 10 | } from '../lib/util'; 11 | 12 | import tests_lazy_base from './lazy.index/tests_lazy_base'; 13 | import tests_lazy_base_not from './lazy.index/tests_lazy_base_not'; 14 | import tests_lazy_array from './lazy.index/tests_lazy_array'; 15 | import tests_lazy_indexof from './lazy.index/tests_lazy_indexof'; 16 | import tests_lazy_indexof_not from './lazy.index/tests_lazy_indexof_not'; 17 | import { 18 | lazyMatch, 19 | lazyMatch002, 20 | lazyMatchNot, 21 | lazyMatchSynonym001, 22 | lazyMatchSynonym001Not, 23 | } from '@novel-segment/assert'; 24 | 25 | sortTests(tests_lazy_base); 26 | sortTests(tests_lazy_base_not); 27 | sortTests(tests_lazy_array); 28 | sortTests(tests_lazy_indexof); 29 | sortTests(tests_lazy_indexof_not); 30 | 31 | export { 32 | tests_lazy_base, 33 | tests_lazy_base_not, 34 | tests_lazy_array, 35 | tests_lazy_indexof, 36 | tests_lazy_indexof_not, 37 | }; 38 | 39 | export default { 40 | tests_lazy_base, 41 | tests_lazy_base_not, 42 | tests_lazy_array, 43 | 
tests_lazy_indexof, 44 | tests_lazy_indexof_not, 45 | }; 46 | -------------------------------------------------------------------------------- /test/res/lazy.index/tests_lazy_array.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2020/1/17. 3 | */ 4 | 5 | import { lazyMatch002 } from '@novel-segment/assert'; 6 | 7 | /** 8 | * 分析後應該要符合以下其中一個結果 9 | */ 10 | export const tests_lazy_array: [string, Parameters['1'], Parameters['2']?][] = [ 11 | 12 | [ 13 | '胡锦涛出席APEC领导人会议后回京', 14 | [ 15 | [ 16 | '会议', 17 | '回京', 18 | ], 19 | ], 20 | ], 21 | 22 | [ 23 | '在這裡有兩具自動人偶隨侍在側的烏列爾', 24 | [ 25 | [ 26 | '兩具', 27 | '自動', 28 | '人偶', 29 | '隨侍', 30 | ], 31 | [ 32 | '兩具', 33 | '自動人偶', 34 | '隨侍', 35 | ], 36 | ], 37 | ], 38 | 39 | [ 40 | '我摀住嘴', 41 | [ 42 | [ 43 | '我', 44 | '摀住', 45 | '嘴', 46 | ], 47 | [ 48 | '我', 49 | '摀住嘴', 50 | ], 51 | ], 52 | ], 53 | 54 | [ 55 | '世間萬物終歸于虛無', 56 | [ 57 | [ 58 | '世間', 59 | '萬物', 60 | '終歸', 61 | '於', 62 | '虛無', 63 | ], 64 | [ 65 | '世間', 66 | '萬物', 67 | '終歸於', 68 | '虛無', 69 | ], 70 | ], 71 | ], 72 | 73 | ]; 74 | 75 | export default tests_lazy_array 76 | -------------------------------------------------------------------------------- /test/res/lazy.index/tests_lazy_indexof_not.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2020/1/17. 
3 | */ 4 | 5 | import { lazyMatchSynonym001Not } from '@novel-segment/assert'; 6 | 7 | /** 8 | * 分析轉換後不應該具有以下字詞 9 | */ 10 | export const tests_lazy_indexof_not: [string, Parameters['1'], Parameters['2']?][] = [ 11 | 12 | [ 13 | '那是里靈魂的世界。', 14 | [ 15 | '裡', 16 | ], 17 | ], 18 | 19 | [ 20 | '原因還是在於教會對于究極療癒所抱持的想法吧', 21 | [ 22 | '于', 23 | ], 24 | ], 25 | 26 | [ 27 | '遥遥领先于帝位争夺的皇太子战死于战场都是太过奇怪的事了', 28 | [ 29 | '于', 30 | ], 31 | ], 32 | 33 | [ 34 | '那里民风保守', 35 | [ 36 | '里', 37 | ], 38 | ], 39 | 40 | [ 41 | '似乎在一栋别墅里长住不走了', 42 | [ 43 | '里', 44 | ], 45 | ], 46 | 47 | [ 48 | '生活里长期充满了无奈', 49 | [ 50 | '里', 51 | ], 52 | ], 53 | 54 | [ 55 | '异次元里拉出来', 56 | [ 57 | '里', 58 | ], 59 | ], 60 | 61 | [ 62 | '他被寄养在别的家庭里长达十年', 63 | [ 64 | '里', 65 | ], 66 | ], 67 | 68 | [ 69 | '我在这本第一卷里见到的雷普凌辱鬼畜等词汇比我这一整年在其他地方看到的都多', 70 | [ 71 | '里', 72 | ], 73 | ], 74 | 75 | [ 76 | '「好耶!来去冲冲水!!」', 77 | [ 78 | '衝', 79 | ], 80 | ], 81 | 82 | [ 83 | '那是她連在血斗場都未曾見識過的速度。', 84 | [ 85 | '斗', 86 | ], 87 | ], 88 | 89 | [ 90 | '先不说那个擦过了她手和嘴的脏手帕干不干净,', 91 | [ 92 | '干', 93 | '幹', 94 | ], 95 | ], 96 | 97 | [ 98 | '而是于公于私的看法都相同,', 99 | [ 100 | '于', 101 | ] 102 | ], 103 | 104 | [ 105 | '「維爾德拉流斗殺法」', 106 | [ 107 | '斗', 108 | ] 109 | ], 110 | 111 | [ 112 | '。面對一隻做困獸之斗的老虎,', 113 | [ 114 | '斗', 115 | ] 116 | ], 117 | 118 | [ 119 | ',比我想象的还强──', 120 | [ 121 | '象', 122 | ] 123 | ], 124 | 125 | [ 126 | '不论那副姿态有多么不堪入目', 127 | [ 128 | '么', 129 | ] 130 | ], 131 | 132 | [ 133 | '钢铁制面具隐藏的嘴部现形了', 134 | [ 135 | '制', 136 | '麵', 137 | ] 138 | ], 139 | 140 | [ 141 | ',人类根本干不出这种事这样?', 142 | [ 143 | '干', 144 | ] 145 | ], 146 | 147 | [ 148 | '和庵彼此搏命相斗,会操控火焰的男人。', 149 | [ 150 | '斗', 151 | ], 152 | ], 153 | 154 | [ 155 | '。我干我该干的就对了。」', 156 | [ 157 | '干', 158 | ], 159 | ], 160 | 161 | [ 162 | ',让我沉溺在你们两人的温柔乡里吧。」', 163 | [ 164 | '里', 165 | ], 166 | ], 167 | 168 | ]; 169 | 170 | export default tests_lazy_indexof_not 171 | -------------------------------------------------------------------------------- /test/res/lazy.novel.ts: 
-------------------------------------------------------------------------------- 1 | /** 2 | * 測試段落 每次發布版本時 會保證以下分析轉換是符合預期 3 | * 4 | * 此檔案內的測試只有在開啟 nodeNovelMode 後才會符合預期 5 | */ 6 | 7 | import { 8 | sortTests, 9 | 10 | } from '../lib/util'; 11 | import { 12 | lazyMatch, 13 | lazyMatch002, 14 | lazyMatchNot, 15 | lazyMatchSynonym001, 16 | lazyMatchSynonym001Not, 17 | } from '@novel-segment/assert'; 18 | 19 | /** 20 | * 分析後應該要符合以下結果 21 | */ 22 | export const tests_lazy_novel_base: [string, Parameters['1'], Parameters['2']?][] = [ 23 | 24 | ]; 25 | 26 | /** 27 | * 分析後不應該存在符合以下結果 28 | */ 29 | export const tests_lazy_novel_base_not: [string, Parameters['1'], Parameters['2']?][] = [ 30 | 31 | 32 | 33 | ]; 34 | 35 | /** 36 | * 分析後應該要符合以下其中一個結果 37 | */ 38 | export const tests_lazy_novel_array: [string, Parameters['1'], Parameters['2']?][] = [ 39 | 40 | ]; 41 | 42 | /** 43 | * 分析轉換後應該要具有以下字詞 44 | */ 45 | export const tests_lazy_novel_indexof: [string, Parameters['1'], Parameters['2']?][] = [ 46 | 47 | [ 48 | '但是在發出邀請後卻被回以“吾才不要去那種魚龍混雜的地方呢”,義正言辭的回絕了', 49 | [ 50 | '義正辭嚴', 51 | ], 52 | ], 53 | 54 | [ 55 | '也許是不知道有普通人被卷入了結界', 56 | [ 57 | '捲', 58 | ], 59 | ], 60 | 61 | [ 62 | '不過好象只是杞人憂天。', 63 | [ 64 | '像', 65 | ], 66 | ], 67 | 68 | [ 69 | '鶫讓自己的身體深深的陷入政府準備的轎車后座', 70 | [ 71 | '後', 72 | ], 73 | ], 74 | 75 | [ 76 | '貝爾不遜的回復道。', 77 | [ 78 | '覆', 79 | ], 80 | ], 81 | 82 | [ 83 | '「沒有借口啊」', 84 | [ 85 | '藉', 86 | ], 87 | ], 88 | 89 | [ 90 | '「⋯⋯你啊。说了吧。会把我卷进你的漩涡里,让我无处可逃。」', 91 | [ 92 | '捲', 93 | ], 94 | ], 95 | 96 | [ 97 | '「⋯⋯好困啊。都快要做像黃金絲綢般美好的夢了。」', 98 | [ 99 | '睏', 100 | ], 101 | ], 102 | 103 | [ 104 | '廣瀨的下一個對象是我嗎?很遺憾,對男人沒有興趣。', 105 | [ 106 | '象', 107 | ], 108 | ], 109 | 110 | [ 111 | ' 墮入絕望的深淵的她,最後抓住的對象是――', 112 | [ 113 | '象', 114 | ], 115 | ], 116 | 117 | [ 118 | '基本上的印象是伊撒古向勞拉絕對服從', 119 | [ 120 | '象', 121 | ], 122 | ], 123 | 124 | [ 125 | '企圖甩掉貨物的八腳獨眼象用它的巨大身軀衝撞正在拆解中的鷹架', 126 | [ 127 | '象', 128 | ], 129 | ], 130 | 131 | [ 132 | '「艾莉卡!卡露米雅,你們把大象抓起來!」', 133 | [ 134 | '象', 135 | ], 136 
| ], 137 | 138 | [ 139 | '還有因為最近在就業活動中占有優勢的等等。', 140 | [ 141 | '佔', 142 | ], 143 | ], 144 | 145 | [ 146 | '将系成蝴蝶结的一头拉动的话', 147 | [ 148 | '繫', 149 | ], 150 | ], 151 | 152 | [ 153 | '由於發生了恐怖分子佔領事件,因此換了一輛列車。', 154 | [ 155 | '份', 156 | ], 157 | ], 158 | 159 | [ 160 | '「哦⋯⋯『業火灼熱拉面』,這怎麼看都是很危險的東西。」', 161 | [ 162 | '麵', 163 | ], 164 | ], 165 | 166 | [ 167 | '伊甸人非常喜爱地上的牲畜及谷类。', 168 | [ 169 | '穀', 170 | ], 171 | ], 172 | 173 | [ 174 | '一邊喂我喝了些甚麼。', 175 | [ 176 | '餵', 177 | ], 178 | ], 179 | 180 | [ 181 | '艾倫諾拉征得龍龍同意後', 182 | [ 183 | '徵', 184 | ], 185 | ], 186 | 187 | [ 188 | '讓婚約者的心系在自己身上也是尤菲莉亞的任務', 189 | [ 190 | '繫', 191 | ], 192 | ], 193 | 194 | [ 195 | '是以蕎麥湯面', 196 | [ 197 | '麵', 198 | ], 199 | ], 200 | 201 | ]; 202 | 203 | /** 204 | * 分析轉換後不應該具有以下字詞 205 | */ 206 | export const tests_lazy_novel_indexof_not: [string, Parameters['1'], Parameters['2']?][] = [ 207 | 208 | [ 209 | '還有一碗名叫『麻辣力湯面』的拉面類料理。', 210 | [ 211 | '面', 212 | ], 213 | ], 214 | 215 | ]; 216 | 217 | sortTests(tests_lazy_novel_base); 218 | sortTests(tests_lazy_novel_base_not); 219 | sortTests(tests_lazy_novel_array); 220 | sortTests(tests_lazy_novel_indexof); 221 | sortTests(tests_lazy_novel_indexof_not); 222 | 223 | export default exports as typeof import('./lazy.novel'); 224 | -------------------------------------------------------------------------------- /test/script/build-submod.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2020/7/1. 
3 | */ 4 | 5 | import FastGlob from '@bluelovers/fast-glob'; 6 | import { join, parse } from 'path'; 7 | import { outputFile } from 'fs-extra'; 8 | 9 | const __root = join(__dirname, '../..'); 10 | 11 | FastGlob 12 | .async([ 13 | '!*.d.ts', 14 | '*.ts', 15 | ], { 16 | cwd: join(__root, 'lib', 'submod') 17 | }) 18 | .then(ls => { 19 | 20 | let record = { 21 | Optimizer: [] as string[], 22 | Tokenizer: [] as string[], 23 | all: [] as string[], 24 | } 25 | 26 | ls.sort(); 27 | 28 | let lines = [] as string[]; 29 | 30 | lines.push(''); 31 | 32 | ls.forEach(row => { 33 | 34 | let name = parse(row).name; 35 | 36 | if (/Optimizer$/.test(name)) 37 | { 38 | record.Optimizer.push(name) 39 | } 40 | else if (/Tokenizer$/.test(name)) 41 | { 42 | record.Tokenizer.push(name) 43 | } 44 | 45 | record.all.push(name) 46 | 47 | lines.push(`import * as ${name} from './submod/${name}';`) 48 | 49 | }); 50 | 51 | lines.push(''); 52 | 53 | record.all.forEach(name => { 54 | 55 | lines.push(`export { ${name} }`) 56 | 57 | }) 58 | 59 | lines.push(''); 60 | 61 | return outputFile(join(__root, 'lib', 'submod.ts'), lines.join(`\n`)) 62 | }) 63 | ; 64 | 65 | function _record(list: string[]) 66 | { 67 | return list.map(m => `\t${m},`) 68 | } 69 | -------------------------------------------------------------------------------- /test/sleep.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/13/013. 
3 | */ 4 | 5 | import * as deasync from 'deasync'; 6 | import { sleepSync, awaitSync } from '../lib/util/sleep'; 7 | 8 | const timstamp = Date.now(); 9 | 10 | function f(n: number) 11 | { 12 | return new Promise(function (done) 13 | { 14 | setTimeout(done, n); 15 | }) 16 | .then(function () 17 | { 18 | logWithTime(n); 19 | 20 | return n; 21 | }) 22 | ; 23 | } 24 | 25 | console.time(); 26 | 27 | f(500); 28 | 29 | let p2 = sleepSync(250); 30 | 31 | p2.thenSync(function (n) 32 | { 33 | return logWithTime('thenSync', n); 34 | }); 35 | p2.then(function (n) 36 | { 37 | return logWithTime(n); 38 | }); 39 | 40 | awaitSync(p2) 41 | 42 | .then(function (n) 43 | { 44 | logWithTime(666, n); 45 | }); 46 | 47 | let p = f(1500); 48 | 49 | deasync.sleep(1000); 50 | //msleep(1000); 51 | logWithTime(1000); 52 | 53 | let p33 = awaitSync(p); 54 | 55 | let v33 = p33.thenSync(function (n) 56 | { 57 | return logWithTime('thenSync', n); 58 | }); 59 | 60 | logWithTime('print v33', v33); 61 | 62 | p33 63 | .then(function (n) 64 | { 65 | logWithTime(777, n); 66 | }); 67 | 68 | console.timeEnd(); 69 | 70 | function logWithTime(...argv) 71 | { 72 | console.log(`[${Date.now() - timstamp}]`, ...argv); 73 | 74 | return argv; 75 | } 76 | -------------------------------------------------------------------------------- /test/submod.spec.ts: -------------------------------------------------------------------------------- 1 | import FastGlob from '@bluelovers/fast-glob'; 2 | import { join, parse } from "path"; 3 | import __root from '../__root'; 4 | import { ISubOptimizer, ISubTokenizer } from '../lib/mod'; 5 | import { dirname } from 'path'; 6 | import { array_unique, array_unique_overwrite } from 'array-hyper-unique'; 7 | import SegmentCore from '../lib/segment/core'; 8 | import isUnset, { isSet } from '../lib/util/isUnset'; 9 | import * as SubmodList from '../lib/submod'; 10 | 11 | const segment = new SegmentCore; 12 | 13 | describe(`check all files`, () => 14 | { 15 | 16 | const files = FastGlob 
17 | .sync([ 18 | '!*.d.ts', 19 | '*.ts', 20 | ], { 21 | cwd: join(__root, 'lib', 'submod'), 22 | absolute: true, 23 | }) 24 | ; 25 | 26 | FastGlob 27 | .sync([ 28 | '*/index.ts', 29 | ], { 30 | cwd: join(__root, 'lib', 'submod'), 31 | absolute: true, 32 | }) 33 | .forEach(m => { 34 | 35 | files.push(dirname(m)) 36 | 37 | }) 38 | ; 39 | 40 | array_unique_overwrite(files); 41 | 42 | files.sort(); 43 | 44 | files.forEach(row => { 45 | 46 | let name = parse(row).name; 47 | 48 | describe(name, () => { 49 | 50 | test(`import`, async () => 51 | { 52 | const mod = await import(row); 53 | 54 | _check(mod, name); 55 | }); 56 | 57 | test(`require`, async () => 58 | { 59 | const mod = require(row); 60 | 61 | _check(mod, name); 62 | }); 63 | 64 | }) 65 | 66 | }) 67 | 68 | }) 69 | 70 | function _check(mod: ISubOptimizer | ISubTokenizer, name: string) 71 | { 72 | expect(typeof mod).toStrictEqual('object'); 73 | expect(typeof mod.init).toStrictEqual('function'); 74 | 75 | 76 | let actual = mod.init(segment as any); 77 | // @ts-ignore 78 | let _mod: ISubOptimizer | ISubTokenizer = actual ?? 
mod; 79 | 80 | if (/Optimizer$/.test(name)) 81 | { 82 | 83 | //expect(typeof (mod as ISubOptimizer).doOptimize).toStrictEqual('function'); 84 | 85 | expect(mod).toHaveProperty('type', 'optimizer'); 86 | 87 | _checkApi(_mod, name) 88 | 89 | if (isUnset(actual)) 90 | { 91 | _checkApi(mod, name) 92 | } 93 | 94 | } 95 | else if (/Tokenizer$/.test(name)) 96 | { 97 | //expect(typeof (mod as ISubTokenizer).split).toStrictEqual('function'); 98 | 99 | expect(mod).toHaveProperty('type', 'tokenizer'); 100 | 101 | _checkApi(_mod, name) 102 | 103 | if (isUnset(actual)) 104 | { 105 | _checkApi(mod, name) 106 | } 107 | 108 | } 109 | else 110 | { 111 | expect(name).toMatch(/(?:Tokenizer|Optimizer)$/) 112 | } 113 | } 114 | 115 | describe(`submod index`, () => 116 | { 117 | 118 | Object.entries(SubmodList) 119 | .forEach(([name, mod]) => { 120 | 121 | test(name, () => { 122 | _check(mod as any, name); 123 | }) 124 | 125 | }) 126 | ; 127 | 128 | }) 129 | 130 | function _checkApi(mod: ISubOptimizer | ISubTokenizer, name: string) 131 | { 132 | if (/Optimizer$/.test(name)) 133 | { 134 | 135 | //expect(typeof (mod as ISubOptimizer).doOptimize).toStrictEqual('function'); 136 | 137 | expect(mod).toHaveProperty('type', 'optimizer'); 138 | 139 | expect(typeof (mod as ISubOptimizer).doOptimize).toStrictEqual('function'); 140 | 141 | } 142 | else if (/Tokenizer$/.test(name)) 143 | { 144 | //expect(typeof (mod as ISubTokenizer).split).toStrictEqual('function'); 145 | 146 | expect(mod).toHaveProperty('type', 'tokenizer'); 147 | 148 | expect(typeof (mod as ISubTokenizer).split).toStrictEqual('function'); 149 | 150 | } 151 | else 152 | { 153 | expect(name).toMatch(/(?:Tokenizer|Optimizer)$/) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /test/temp/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | !/cache/**/* 3 | !/cache 4 | !/cache.db.info.json 
5 | !/stringify.sorted.txt 6 | /stringify.txt 7 | /cache.common.synonym.db.info.json 8 | /cache.db.info.json 9 | -------------------------------------------------------------------------------- /test/temp/cache/0/eng.txt: -------------------------------------------------------------------------------- 1 | 3G|0x100000|100 2 | App|0x100010|0 3 | T恤|0x100010|0 4 | VR|0x100010|0 5 | 6 | -------------------------------------------------------------------------------- /test/temp/cache/0/other.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /test/temp/cache/i.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /test/temp/cache/u.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /test/temp/cache/v.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /test/test_segment.test.ts: -------------------------------------------------------------------------------- 1 | import { createSegment } from './lib'; 2 | import { IOptionsDoSegment, Segment } from '../lib/Segment'; 3 | import { mochaSetup, toStringArray } from './lib/util'; 4 | import { ENUM_SUBMODS } from '../lib/mod/index'; 5 | import { tests_old } from './res/default'; 6 | import { console } from 'debug-color2'; 7 | import { chai, relative, expect, path, assert, util, mochaAsync } from './_local-dev'; 8 | 9 | describe(relative(__filename), function () 10 | { 11 | 12 | let segment: Segment = null; 13 | 14 | before(function () 15 | { 16 | mochaSetup(this); 17 | 18 | segment = createSegment(false, { 19 | disableModules: [ 
20 | //ENUM_SUBMODS.ZhtSynonymOptimizer, 21 | ] 22 | }); 23 | }); 24 | 25 | function doSegment(a: string, options?: IOptionsDoSegment) 26 | { 27 | return segment.doSegment(a, { 28 | convertSynonym: false, 29 | disableModules: [ 30 | ENUM_SUBMODS.ZhtSynonymOptimizer, 31 | ], 32 | ...options, 33 | }) 34 | } 35 | 36 | it('init', function () 37 | { 38 | 39 | }); 40 | 41 | describe('default test', function () 42 | { 43 | 44 | let equal = function (a, b) 45 | { 46 | //console.info(a); 47 | 48 | let c = toStringArray(doSegment(a)); 49 | console.debug(c.join('/')); 50 | //assert.equal(c.toString('\t'), b.toString('\t')); 51 | 52 | expect(c).to.deep.equal(b) 53 | }; 54 | 55 | //console.info('分词测试'); 56 | 57 | tests_old.forEach(function (args) 58 | { 59 | it(args[0], function () 60 | { 61 | equal(...args); 62 | }); 63 | }); 64 | 65 | }); 66 | 67 | it('options: simple=true', function () 68 | { 69 | assert.equal(doSegment('永和服装饰品有限公司', { simple: true }).join('\t'), 70 | ['永和', '服装', '饰品', '有限公司'].join('\t'), 71 | ); 72 | }); 73 | 74 | it('options: stripPunctuation=true', function () 75 | { 76 | assert.equal(doSegment('王五和张三丰、李强是谁', { simple: true, stripPunctuation: true }).join('').includes('丰李'), 77 | true, 78 | ); 79 | }); 80 | 81 | /* 82 | it('options: convertSynonym=true', function () 83 | { 84 | assert.equal(doSegment('何时入睡', { simple: true, convertSynonym: true }).join('\t'), 85 | ['什么时候', '入眠'].join('\t'), 86 | ); 87 | }); 88 | */ 89 | 90 | it('options: stripStopword=true', function () 91 | { 92 | assert.equal(doSegment('因为李三买了一张三角桌子', { simple: true, stripStopword: true }).join('\t'), 93 | ['李三', '买', '一张', '三角', '桌子'].join('\t'), 94 | ); 95 | }); 96 | 97 | }); 98 | -------------------------------------------------------------------------------- /test/version.spec.ts: -------------------------------------------------------------------------------- 1 | import _m0 = require('../version'); 2 | import _m1 from '../version'; 3 | import { version } from '../package.json'; 4 
| 5 | test(`export version check`, () => 6 | { 7 | 8 | expect(_m0.version).toStrictEqual(_m1); 9 | expect(_m0.version).toStrictEqual(version); 10 | 11 | }); 12 | -------------------------------------------------------------------------------- /test/versions.spec.ts: -------------------------------------------------------------------------------- 1 | import _m1, { versions } from '../version'; 2 | import { version } from '../package.json'; 3 | 4 | test(`export version check`, () => 5 | { 6 | 7 | expect(_m1).toStrictEqual(versions['novel-segment']); 8 | expect(_m1).toStrictEqual(version); 9 | 10 | }); 11 | 12 | test(`export versions check 2`, () => 13 | { 14 | 15 | expect(versions).toMatchObject({ 16 | 'novel-segment': expect.any(String), 17 | 'segment-dict': expect.any(String), 18 | 'regexp-cjk': expect.any(String), 19 | 'cjk-conv': expect.any(String), 20 | }); 21 | 22 | }); 23 | -------------------------------------------------------------------------------- /test/z.0010.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by User on 2019/6/12. 
3 | */ 4 | 5 | /// 6 | /// 7 | /// 8 | /// 9 | 10 | import { chai, relative, expect, path, assert, util, mochaAsync } from './_local-dev'; 11 | import { mochaSetup, toStringArray } from './lib/util'; 12 | import { createSegment } from './lib'; 13 | import { Segment } from '../lib'; 14 | import { console } from 'debug-color2'; 15 | import fixedGC from './res/gc.data'; 16 | import { IOptionsDoSegment } from '../lib/Segment'; 17 | 18 | console.setOptions({ 19 | label: true, 20 | }); 21 | 22 | // @ts-ignore 23 | describe(relative(__filename), () => 24 | { 25 | let currentTest: Mocha.Test; 26 | 27 | let segment: Segment = null; 28 | 29 | // @ts-ignore 30 | before(function () 31 | { 32 | // @ts-ignore 33 | this.timeout(60000); 34 | 35 | segment = createSegment(true, { 36 | disableModules: [ 37 | //ENUM_SUBMODS.ZhtSynonymOptimizer, 38 | ] 39 | }); 40 | }); 41 | 42 | // @ts-ignore 43 | beforeEach(function () 44 | { 45 | // @ts-ignore 46 | currentTest = this.currentTest; 47 | 48 | //console.log('it:before', currentTest.title); 49 | //console.log('it:before', currentTest.fullTitle()); 50 | }); 51 | 52 | // @ts-ignore 53 | describe(`干支`, function () 54 | { 55 | 56 | `甲子|乙丑|丙寅|丁卯|戊辰|己巳|庚午|辛未|壬申|癸酉|甲戌|乙亥|丙子|丁丑|戊寅|己卯|庚辰|辛巳|壬午|癸未|甲申|乙酉|丙戌|丁亥|戊子|己丑|庚寅|辛卯|壬辰|癸巳|甲午|乙未|丙申|丁酉|戊戌|己亥|庚子|辛丑|壬寅|癸卯|甲辰|乙巳|丙午|丁未|戊申|己酉|庚戌|辛亥|壬子|癸丑|甲寅|乙卯|丙辰|丁巳|戊午|己未|庚申|辛酉|壬戌|癸亥|寅月|丙寅月|戊寅月|庚寅月|壬寅月|甲寅月|卯月|丁卯月|己卯月|辛卯月|癸卯月|乙卯月|辰月|戊辰月|庚辰月|壬辰月|甲辰月|丙辰月|巳月|己巳月|辛巳月|癸巳月|乙巳月|丁巳月|午月|庚午月|壬午月|甲午月|丙午月|戊午月|未月|辛未月|癸未月|乙未月|丁未月|己未月|申月|壬申月|甲申月|丙申月|戊申月|庚申月|酉月|癸酉月|乙酉月|丁酉月|己酉月|辛酉月|戌月|甲戌月|丙戌月|戊戌月|庚戌月|壬戌月|亥月|乙亥月|丁亥月|己亥月|辛亥月|癸亥月|子月|丙子月|戊子月|庚子月|壬子月|甲子月|丑月|丁丑月|己丑月|辛丑月|癸丑月|乙丑月`.split('|') 57 | .forEach(text => { 58 | 59 | // @ts-ignore 60 | it(text, function () 61 | { 62 | let actual = toStringArray(doSegment(text)); 63 | 64 | expect(actual).length.gt(0).lte(2); 65 | 66 | if (actual.length === 2) 67 | { 68 | expect(actual).to.have.deep 69 | .property('1', '月') 70 | ; 71 | 72 | if (actual[0].length === 1) 73 | { 74 | 
expect(actual[0]).length.gt(1) 75 | } 76 | } 77 | }); 78 | 79 | }) 80 | 81 | }); 82 | 83 | function doSegment(a: string, options?: IOptionsDoSegment) 84 | { 85 | return segment.doSegment(a, { 86 | ...options, 87 | }) 88 | } 89 | }); 90 | -------------------------------------------------------------------------------- /test/z.gc.not.test.ts: -------------------------------------------------------------------------------- 1 | import { createSegment } from './lib'; 2 | import FastGlob from '@bluelovers/fast-glob/bluebird'; 3 | import { join } from 'path'; 4 | import { readFileSync } from 'fs'; 5 | 6 | describe(`check not gc`, () => 7 | { 8 | const segment = createSegment(true, { 9 | nodeNovelMode: true, 10 | }); 11 | 12 | const __res = join(__dirname, 'res/gc.not'); 13 | 14 | FastGlob 15 | .sync([ 16 | '**/*.txt', 17 | ], { 18 | cwd: __res 19 | }) 20 | .forEach(file => { 21 | 22 | it(file, () => 23 | { 24 | console.time(file) 25 | const text = readFileSync(join(__res, file)) 26 | let actual = segment.doSegment(text); 27 | console.timeEnd(file) 28 | }); 29 | 30 | }) 31 | ; 32 | 33 | }) 34 | -------------------------------------------------------------------------------- /test/z.gc.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by User on 2019/6/12. 
3 | */ 4 | 5 | /// 6 | /// 7 | /// 8 | /// 9 | 10 | import { chai, relative, expect, path, assert, util, mochaAsync } from './_local-dev'; 11 | import { mochaSetup, toStringArray } from './lib/util'; 12 | import { createSegment } from './lib'; 13 | import { Segment } from '../lib'; 14 | import { console } from 'debug-color2'; 15 | import fixedGC from './res/gc.data'; 16 | import { IOptionsDoSegment } from '../lib/Segment'; 17 | 18 | console.setOptions({ 19 | label: true, 20 | }); 21 | 22 | // @ts-ignore 23 | describe(relative(__filename), () => 24 | { 25 | // @ts-ignore 26 | let currentTest: Mocha.Test; 27 | 28 | let segment: Segment = null; 29 | 30 | // @ts-ignore 31 | before(function () 32 | { 33 | this.timeout(60000); 34 | 35 | segment = createSegment(true, { 36 | disableModules: [ 37 | //ENUM_SUBMODS.ZhtSynonymOptimizer, 38 | ] 39 | }); 40 | }); 41 | 42 | // @ts-ignore 43 | beforeEach(function () 44 | { 45 | // @ts-ignore 46 | currentTest = this.currentTest; 47 | 48 | //console.log('it:before', currentTest.title); 49 | //console.log('it:before', currentTest.fullTitle()); 50 | }); 51 | 52 | // @ts-ignore 53 | describe(`suite`, function () 54 | { 55 | 56 | fixedGC.forEach(text => { 57 | 58 | // @ts-ignore 59 | it(text, function () 60 | { 61 | // @ts-ignore 62 | this.timeout(60000); 63 | 64 | let actual = toStringArray(doSegment(text)); 65 | 66 | console.debug(actual.join('/')); 67 | 68 | }); 69 | 70 | }) 71 | 72 | }); 73 | 74 | function doSegment(a: string, options?: IOptionsDoSegment) 75 | { 76 | return segment.doSegment(a, { 77 | ...options, 78 | }) 79 | } 80 | }); 81 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@bluelovers/tsconfig/esm/mapfile.json", 3 | "compilerOptions": { 4 | "importHelpers": true, 5 | "noPropertyAccessFromIndexSignature": false 6 | } 7 | } 8 | 
-------------------------------------------------------------------------------- /typedoc.config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2018/4/21/021. 3 | */ 4 | 5 | const path = require('path'); 6 | 7 | let p = path.resolve(path.join(path.dirname(require.resolve('typedoc-themes-color')), 'theme')); 8 | 9 | console.log(p); 10 | console.log(path.relative(process.cwd(), p)); 11 | 12 | module.exports = { 13 | src : '.', 14 | out: './docs', 15 | //theme: './my-theme', 16 | // theme: path.relative(process.cwd(), p), 17 | theme: p, 18 | ignoreCompilerErrors: true, 19 | excludeExternals: true, 20 | 21 | externalPattern: "**/node_modules/**", 22 | 23 | exclude: [ 24 | "test", 25 | "node_modules", 26 | "test/", 27 | "node_modules/", 28 | "**/test", 29 | "**/node_modules", 30 | "**/test/**/*", 31 | "**/node_modules/**/*", 32 | ], 33 | }; 34 | 35 | console.log(module.exports); 36 | -------------------------------------------------------------------------------- /version.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2020/6/2. 3 | */ 4 | export declare let version: string; 5 | export default version; 6 | export declare let version_dict: string; 7 | export declare let versions: { 8 | 'novel-segment': string; 9 | 'segment-dict': string; 10 | 'regexp-cjk': string; 11 | 'cjk-conv': string; 12 | }; 13 | -------------------------------------------------------------------------------- /version.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | /** 3 | * Created by user on 2020/6/2. 
4 | */ 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | exports.versions = exports.version_dict = exports.version = void 0; 7 | exports.default = exports.version; 8 | Object.defineProperty(exports, "version", { 9 | get() { 10 | return require('./package.json').version; 11 | } 12 | }); 13 | Object.defineProperty(exports, "version_dict", { 14 | get() { 15 | return require('segment-dict/version').version; 16 | } 17 | }); 18 | Object.defineProperty(exports, "versions", { 19 | get() { 20 | return { 21 | 'novel-segment': exports.version, 22 | 'segment-dict': exports.version_dict, 23 | 'regexp-cjk': require('regexp-cjk/version').version, 24 | 'cjk-conv': require('cjk-conv/version').version, 25 | }; 26 | } 27 | }); 28 | Object.defineProperty(exports, "default", { 29 | get() { 30 | return exports.version; 31 | } 32 | }); 33 | //# sourceMappingURL=version.js.map -------------------------------------------------------------------------------- /version.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by user on 2020/6/2. 
3 | */ 4 | 5 | // @ts-ignore 6 | export let version: string; 7 | export default version 8 | // @ts-ignore 9 | export let version_dict: string; 10 | 11 | // @ts-ignore 12 | export let versions: { 13 | 'novel-segment': string; 14 | 'segment-dict': string; 15 | 'regexp-cjk': string; 16 | 'cjk-conv': string; 17 | } 18 | 19 | Object.defineProperty(exports, "version", { 20 | get() 21 | { 22 | return require('./package.json').version 23 | } 24 | }); 25 | 26 | Object.defineProperty(exports, "version_dict", { 27 | get() 28 | { 29 | return require('segment-dict/version').version 30 | } 31 | }); 32 | 33 | Object.defineProperty(exports, "versions", { 34 | get() 35 | { 36 | return { 37 | 'novel-segment': version, 38 | 'segment-dict': version_dict, 39 | 'regexp-cjk': require('regexp-cjk/version').version, 40 | 'cjk-conv': require('cjk-conv/version').version, 41 | } 42 | } 43 | }); 44 | 45 | Object.defineProperty(exports, "default", { 46 | get() 47 | { 48 | return version 49 | } 50 | }); 51 | 52 | --------------------------------------------------------------------------------