├── .github └── workflows │ ├── build-n-publish.yml │ └── pytest.yml ├── .gitignore ├── .gitmodules ├── CITATION ├── LICENSE ├── README.md ├── benchmark ├── benchmark.py ├── run_benchmark_ginza.sh ├── run_benchmark_spacy.sh └── setup_benchmark.sh ├── config ├── ja_ginza.analysis.cfg ├── ja_ginza.cfg ├── ja_ginza.meta.json ├── ja_ginza_bert_large.cfg ├── ja_ginza_bert_large.meta.json ├── ja_ginza_bert_large_analysis.cfg ├── ja_ginza_electra.analysis.cfg ├── ja_ginza_electra.cfg └── ja_ginza_electra.meta.json ├── docs ├── _config.yml ├── bunsetu_api.md ├── command_line_tool.md ├── developer_reference.md └── index.md ├── ginza ├── __init__.py ├── __main__.py ├── analyzer.py ├── bunsetu_recognizer.py ├── command_line.py ├── compound_splitter.py ├── disable_sentencizer.py ├── ene_ontonotes_mapper.py └── tests │ ├── conftest.py │ ├── test_analyzer.py │ ├── test_command_line.py │ └── test_models.py ├── ginza_util ├── __init__.py ├── browse_trees.py ├── conllu_to_json.py ├── conv_connlu_to_json.rea.sh ├── evaluate_conllu.py ├── evaluate_model.py ├── gsk2014a.py └── setup_meta.py ├── requirements.txt ├── setup.cfg └── setup.py /.github/workflows/build-n-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | jobs: 8 | build-n-publish: 9 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 10 | runs-on: ubuntu-18.04 11 | steps: 12 | - uses: actions/checkout@master 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.9 17 | - name: Install pypa/build 18 | run: >- 19 | python -m 20 | pip install 21 | build 22 | --user 23 | - name: Build a binary wheel and a source tarball 24 | run: >- 25 | python -m 26 | build 27 | --sdist 28 | --wheel 29 | --outdir dist/ 30 | . 31 | - name: Publish distribution 📦 to PyPI 32 | if: startsWith(github.ref, 'refs/tags') 33 | uses: pypa/gh-action-pypi-publish@master 34 | with: 35 | password: ${{ secrets.PYPI_API_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - develop 10 | 11 | jobs: 12 | pytest: 13 | name: Run tests with pytest 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: [3.7, 3.8] 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Upgrade pip 26 | run: >- 27 | python -m 28 | pip install -U pip 29 | - name: Install dependencies 30 | run: >- 31 | python -m 32 | pip install . 
pytest pytest-mock ja-ginza ja-ginza-electra 33 | - name: Run Tests 34 | run: pytest 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bccwj*/ 2 | /build/ 3 | /config/ja_gsd* 4 | /corpus*/ 5 | /dist/ 6 | /electra* 7 | /embedding*/ 8 | /ja_* 9 | /log* 10 | /megagonlabs/ 11 | /models/ 12 | /old/ 13 | /rtx* 14 | /submodules/ 15 | /sudachi* 16 | /target/ 17 | /test/ 18 | /vector* 19 | /venv* 20 | __pycache__/ 21 | *.pyc 22 | *.egg-info/ 23 | .DS_Store 24 | .eggs -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/megagonlabs/ginza/f67b4987af09bad939d75c89b4e9483b90c064ee/.gitmodules -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @ARTICLE{GiNZA NLP, 2 | AUTHOR = {Hiroshi, Mai and Masayuki}, 3 | TITLE = {短単位品詞の用法曖昧性解決と依存関係ラベリングの同時学習}, 4 | YEAR = {2019}, 5 | JOURNAL = {言語処理学会第25回年次大会}, 6 | URL = {http://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/F2-3.pdf} 7 | } 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Megagon Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import sys 4 | 5 | 6 | REPEAT = 5 7 | BATCH_SIZE = 128 8 | 9 | assert len(sys.argv) >= 2, "Usage: python {sys.argv[0]} [-g] model_name1 [model_name2 [...]]" 10 | if sys.argv[1] == "-g": 11 | require_gpu = True 12 | device = "GPU" 13 | model_names = sys.argv[2:] 14 | else: 15 | require_gpu = False 16 | device = "CPU" 17 | model_names = sys.argv[1:] 18 | 19 | sents = [_.rstrip("\n") for _ in sys.stdin] 20 | 21 | results = {} 22 | 23 | 24 | print("timestamp ", "[msec]", "device", 'procedure description', sep="\t", file=sys.stderr) 25 | start = datetime.now() 26 | prev = start 27 | print(start, 0, f"benchmark started with {len(sents)} sentences", sep="\t", file=sys.stderr) 28 | 29 | import spacy 30 | if require_gpu: 31 | spacy.require_gpu() 32 | lap = datetime.now() 33 | dur = int((lap - prev).total_seconds() * 1000) 34 | print(lap, dur, device, 'import spacy', sep="\t", file=sys.stderr) 35 | prev = lap 36 | 37 | for model_name in model_names: 38 | results = {} 39 | nlp = spacy.load(model_name) 40 | lap = datetime.now() 41 | dur = int((lap - prev).total_seconds() * 1000) 42 | results[f"spacy.load()"] = [dur] 43 | print(lap, dur, device, f"spacy.load({model_name})", sep="\t", file=sys.stderr) 44 | prev = lap 45 | 46 | results[f"nlp.pipe(batch={BATCH_SIZE})"] = [] 47 | for repeat in range(1, REPEAT + 1): 48 | for _ in range((len(sents) - 1) // BATCH_SIZE + 1): 49 | docs = nlp.pipe(sents[_ * BATCH_SIZE:(_ + 1) * BATCH_SIZE]) 50 | for doc in docs: 51 | len(doc) 52 | lap = datetime.now() 53 | dur = int((lap - prev).total_seconds() * 1000) 54 | results[f"nlp.pipe(batch={BATCH_SIZE})"].append(dur / len(sents)) 55 | print( 56 | lap, 57 | dur, 58 | device, 59 | f"#{repeat} {model_name}->nlp.pipe(batch={BATCH_SIZE}): {dur / len(sents):.03f}[msec/sent]", 60 | sep="\t", file=sys.stderr, 61 | ) 62 | prev = lap 63 | 64 | results[f"nlp(batch=1)"] = [] 65 | for repeat in range(1, REPEAT + 1): 66 | for sent in sents: 67 | doc = nlp(sent) 68 | len(doc) 69 | lap = datetime.now() 70 | dur = int((lap - prev).total_seconds() * 1000) 71 | results[f"nlp(batch=1)"].append(dur / len(sents)) 72 | print( 73 | lap, 74 | dur, 75 | device, 76 | f"#{repeat} {model_name}->nlp(batch=1): {dur / len(sents):.03f}[msec/sent]", 77 | sep="\t", file=sys.stderr, 78 | ) 79 | prev = lap 80 | 81 | dur = int((lap - start).total_seconds() * 1000) 82 | print(lap, dur, device, model_name, 'finished', sep="\t", file=sys.stderr) 83 | 84 | for k, v in results.items(): 85 | l = sorted(v) 86 | results[k] = l[len(l) // 2] 87 | 88 | json.dump( 89 | {"model": model_name, "device": device, "results": results}, 90 | sys.stdout, 91 | ensure_ascii=False, 92 | ) 93 | print() 94 | -------------------------------------------------------------------------------- /benchmark/run_benchmark_ginza.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cat gsd/dev.txt gsd/test.txt | python benchmark.py -g ja_ginza ja_ginza_electra 3 | cat gsd/dev.txt gsd/test.txt | python benchmark.py ja_ginza ja_ginza_electra 4 | -------------------------------------------------------------------------------- /benchmark/run_benchmark_spacy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cat gsd/dev.txt gsd/test.txt | python 
benchmark.py -g ja_core_news_md ja_core_news_trf 3 | cat gsd/dev.txt gsd/test.txt | python benchmark.py ja_core_news_md ja_core_news_trf 4 | -------------------------------------------------------------------------------- /benchmark/setup_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | mkdir -p gsd 4 | for t in train dev test ; do 5 | curl "https://raw.githubusercontent.com/megagonlabs/UD_Japanese-GSD/c614040872a74587912a15ef4637eabc0dc29a60/ja_gsd-ud-${t}.ne.conllu?raw=true" | grep "# text = " | sed 's/# text = //' > gsd/${t}.txt 6 | done 7 | echo 8 | echo '=== CUDA Related Installation Steps ===' 9 | echo 'The pytorch should be installed with cuda support. See https://pytorch.org/get-started/previous-versions/#linux-and-windows-1' 10 | echo 'Also you need to install spacy with appropriate cuda specifier as `pip install -U spacy[cudaXXX]`. See https://spacy.io/usage#gpu' 11 | echo 'And then, install GiNZA as `pip install -U ginza ja-ginza ja-ginza-electra`.' 12 | echo 'To evaluate the performance of spaCy official models, install models as `python -m spacy download ja_core_news_md ; python -m spacy download ja_core_news_trf`.' 13 | -------------------------------------------------------------------------------- /config/ja_ginza.analysis.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["tok2vec","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 1000 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = false 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | 41 | [components.morphologizer.model] 42 | @architectures = "spacy.Tagger.v1" 43 | nO = null 44 | 45 | [components.morphologizer.model.tok2vec] 46 | @architectures = "spacy.Tok2VecListener.v1" 47 | width = ${components.tok2vec.model.encode.width} 48 | upstream = "*" 49 | 50 | [components.ner] 51 | factory = "ner" 52 | incorrect_spans_key = null 53 | moves = null 54 | update_with_oracle_cut_size = 100 55 | 56 | [components.ner.model] 57 | @architectures = "spacy.TransitionBasedParser.v2" 58 | state_type = "ner" 59 | extra_state_tokens = false 60 | hidden_width = 64 61 | maxout_pieces = 2 62 | use_upper = true 63 | nO = null 64 | 65 | [components.ner.model.tok2vec] 66 | @architectures = "spacy.Tok2VecListener.v1" 67 | width = ${components.tok2vec.model.encode.width} 68 | upstream = "*" 69 | 70 | [components.parser] 71 | factory = "parser" 72 | learn_tokens = false 73 | min_action_freq = 30 74 | moves = null 75 | update_with_oracle_cut_size = 100 76 | 77 | [components.parser.model] 78 | @architectures = "spacy.TransitionBasedParser.v2" 79 | state_type = "parser" 80 | 
extra_state_tokens = false 81 | hidden_width = 128 82 | maxout_pieces = 3 83 | use_upper = true 84 | nO = null 85 | 86 | [components.parser.model.tok2vec] 87 | @architectures = "spacy.Tok2VecListener.v1" 88 | width = ${components.tok2vec.model.encode.width} 89 | upstream = "*" 90 | 91 | [components.tok2vec] 92 | factory = "tok2vec" 93 | 94 | [components.tok2vec.model] 95 | @architectures = "spacy.Tok2Vec.v2" 96 | 97 | [components.tok2vec.model.embed] 98 | @architectures = "spacy.MultiHashEmbed.v2" 99 | width = ${components.tok2vec.model.encode.width} 100 | attrs = ["ORTH","SHAPE"] 101 | rows = [5000,2500] 102 | include_static_vectors = true 103 | 104 | [components.tok2vec.model.encode] 105 | @architectures = "spacy.MaxoutWindowEncoder.v2" 106 | width = 256 107 | depth = 8 108 | window_size = 1 109 | maxout_pieces = 3 110 | 111 | [corpora] 112 | 113 | [corpora.dev] 114 | @readers = "spacy.Corpus.v1" 115 | path = ${paths.dev} 116 | max_length = 0 117 | gold_preproc = false 118 | limit = 0 119 | augmenter = null 120 | 121 | [corpora.train] 122 | @readers = "spacy.Corpus.v1" 123 | path = ${paths.train} 124 | max_length = 2000 125 | gold_preproc = false 126 | limit = 0 127 | augmenter = null 128 | 129 | [training] 130 | dev_corpus = "corpora.dev" 131 | train_corpus = "corpora.train" 132 | seed = ${system.seed} 133 | gpu_allocator = ${system.gpu_allocator} 134 | dropout = 0.1 135 | accumulate_gradient = 1 136 | patience = 0 137 | max_epochs = 0 138 | max_steps = 50000 139 | eval_frequency = 200 140 | frozen_components = [] 141 | before_to_disk = null 142 | annotating_components = [] 143 | 144 | [training.batcher] 145 | @batchers = "spacy.batch_by_words.v1" 146 | discard_oversize = false 147 | tolerance = 0.2 148 | get_length = null 149 | 150 | [training.batcher.size] 151 | @schedules = "compounding.v1" 152 | start = 100 153 | stop = 1000 154 | compound = 1.001 155 | t = 0.0 156 | 157 | [training.logger] 158 | @loggers = "spacy.ConsoleLogger.v1" 159 | progress_bar = false 160 | 161 | [training.optimizer] 162 | @optimizers = "Adam.v1" 163 | beta1 = 0.9 164 | beta2 = 0.999 165 | L2_is_weight_decay = true 166 | L2 = 0.01 167 | grad_clip = 1.0 168 | use_averages = false 169 | eps = 0.00000001 170 | learn_rate = 0.001 171 | 172 | [training.score_weights] 173 | dep_uas = 0.25 174 | dep_las = 0.25 175 | dep_las_per_type = null 176 | sents_p = null 177 | sents_r = null 178 | sents_f = 0.1 179 | ents_f = 0.25 180 | ents_p = 0.0 181 | ents_r = 0.0 182 | ents_per_type = null 183 | pos_acc = 0.15 184 | morph_acc = 0.0 185 | morph_per_feat = null 186 | tag_acc = 0.0 187 | 188 | [pretraining] 189 | 190 | [initialize] 191 | vectors = "vectors/" 192 | init_tok2vec = ${paths.init_tok2vec} 193 | vocab_data = null 194 | lookups = null 195 | before_init = null 196 | after_init = null 197 | 198 | [initialize.components] 199 | 200 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["tok2vec","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 1000 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | 
after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = true 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | 41 | [components.morphologizer.model] 42 | @architectures = "spacy.Tagger.v1" 43 | nO = null 44 | 45 | [components.morphologizer.model.tok2vec] 46 | @architectures = "spacy.Tok2VecListener.v1" 47 | width = ${components.tok2vec.model.encode.width} 48 | upstream = "*" 49 | 50 | [components.ner] 51 | factory = "ner" 52 | incorrect_spans_key = null 53 | moves = null 54 | update_with_oracle_cut_size = 100 55 | 56 | [components.ner.model] 57 | @architectures = "spacy.TransitionBasedParser.v2" 58 | state_type = "ner" 59 | extra_state_tokens = false 60 | hidden_width = 64 61 | maxout_pieces = 2 62 | use_upper = true 63 | nO = null 64 | 65 | [components.ner.model.tok2vec] 66 | @architectures = "spacy.Tok2VecListener.v1" 67 | width = ${components.tok2vec.model.encode.width} 68 | upstream = "*" 69 | 70 | [components.parser] 71 | factory = "parser" 72 | learn_tokens = false 73 | min_action_freq = 30 74 | moves = null 75 | update_with_oracle_cut_size = 100 76 | 77 | [components.parser.model] 78 | @architectures = "spacy.TransitionBasedParser.v2" 79 | state_type = "parser" 80 | extra_state_tokens = false 81 | hidden_width = 128 82 | maxout_pieces = 3 83 | use_upper = true 84 | nO = null 85 | 86 | [components.parser.model.tok2vec] 87 | @architectures = "spacy.Tok2VecListener.v1" 88 | width = ${components.tok2vec.model.encode.width} 89 | upstream = "*" 90 | 91 | [components.tok2vec] 92 | factory = "tok2vec" 93 | 94 | [components.tok2vec.model] 95 | @architectures = "spacy.Tok2Vec.v2" 96 | 97 | [components.tok2vec.model.embed] 98 | @architectures = "spacy.MultiHashEmbed.v2" 99 | width = ${components.tok2vec.model.encode.width} 100 | attrs = ["ORTH","SHAPE"] 101 | rows = [5000,2500] 102 | include_static_vectors = true 103 | 104 | [components.tok2vec.model.encode] 105 | @architectures = "spacy.MaxoutWindowEncoder.v2" 106 | width = 256 107 | depth = 8 108 | window_size = 1 109 | maxout_pieces = 3 110 | 111 | [corpora] 112 | 113 | [corpora.dev] 114 | @readers = "spacy.Corpus.v1" 115 | path = ${paths.dev} 116 | max_length = 0 117 | gold_preproc = false 118 | limit = 0 119 | augmenter = null 120 | 121 | [corpora.train] 122 | @readers = "spacy.Corpus.v1" 123 | path = ${paths.train} 124 | max_length = 2000 125 | gold_preproc = false 126 | limit = 0 127 | augmenter = null 128 | 129 | [training] 130 | dev_corpus = "corpora.dev" 131 | train_corpus = "corpora.train" 132 | seed = ${system.seed} 133 | gpu_allocator = ${system.gpu_allocator} 134 | dropout = 0.1 135 | accumulate_gradient = 1 136 | patience = 0 137 | max_epochs = 0 138 | max_steps = 50000 139 | eval_frequency = 200 140 | frozen_components = [] 141 | before_to_disk = null 142 | annotating_components = [] 143 | 144 | [training.batcher] 145 | @batchers = "spacy.batch_by_words.v1" 146 | discard_oversize = false 147 | tolerance = 0.2 148 | get_length = null 149 | 150 | [training.batcher.size] 151 | @schedules = "compounding.v1" 152 | start = 100 153 | stop = 1000 154 | compound = 1.001 155 | t = 0.0 156 | 157 | 
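# Editorial note (assumption: spaCy/Thinc configs accept `#` comments): the
# compounding.v1 schedule above grows the batch_by_words word budget from
# start (100) toward stop (1000), multiplying it by compound (1.001) after each batch.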
[training.logger] 158 | @loggers = "spacy.ConsoleLogger.v1" 159 | progress_bar = false 160 | 161 | [training.optimizer] 162 | @optimizers = "Adam.v1" 163 | beta1 = 0.9 164 | beta2 = 0.999 165 | L2_is_weight_decay = true 166 | L2 = 0.01 167 | grad_clip = 1.0 168 | use_averages = false 169 | eps = 0.00000001 170 | learn_rate = 0.001 171 | 172 | [training.score_weights] 173 | dep_uas = 0.25 174 | dep_las = 0.25 175 | dep_las_per_type = null 176 | sents_p = null 177 | sents_r = null 178 | sents_f = 0.1 179 | ents_f = 0.25 180 | ents_p = 0.0 181 | ents_r = 0.0 182 | ents_per_type = null 183 | pos_acc = 0.15 184 | morph_acc = 0.0 185 | morph_per_feat = null 186 | tag_acc = 0.0 187 | 188 | [pretraining] 189 | 190 | [initialize] 191 | vectors = "vectors/" 192 | init_tok2vec = ${paths.init_tok2vec} 193 | vocab_data = null 194 | lookups = null 195 | before_init = null 196 | after_init = null 197 | 198 | [initialize.components] 199 | 200 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"ja", 3 | "name":"ginza", 4 | "version":"5.2.0", 5 | "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, atteribute_ruler, compound_splitter, bunsetu_recognizer.", 6 | "author":"Megagon Labs Tokyo.", 7 | "email":"ginza@megagon.ai", 8 | "url":"https://github.com/megagonlabs/ginza", 9 | "license":"MIT License", 10 | "sources":[ 11 | { 12 | "name":"UD_Japanese-BCCWJ r2.8", 13 | "url":"https://github.com/UniversalDependencies/UD_Japanese-BCCWJ", 14 | "license":"CC BY-NC-SA 4.0", 15 | "author":"Asahara, M., Kanayama, H., Tanaka, T., Miyao, Y., Uematsu, S., Mori, S., Matsumoto, Y., Omura, M., & Murawaki, Y." 16 | }, 17 | { 18 | "name":"GSK2014-A(2019)", 19 | "url":"https://www.gsk.or.jp/catalog/gsk2014-a/", 20 | "license":"Individually defined commercial license", 21 | "author":"Tokyo Institute of Technology" 22 | }, 23 | { 24 | "name":"SudachiDict_core", 25 | "url":"https://github.com/WorksApplications/SudachiDict", 26 | "license":"Apache License 2.0", 27 | "author":"Works Applications Enterprise Co., Ltd." 28 | }, 29 | { 30 | "name":"chiVe", 31 | "url":"https://github.com/WorksApplications/chiVe", 32 | "license":"Apache License 2.0", 33 | "author":"Works Applications Enterprise Co., Ltd." 
34 | } 35 | ], 36 | "parent_package":"spacy", 37 | "spacy_version":">=3.4.4,<4.0.0", 38 | "spacy_git_version":"0fc3dee77", 39 | "vectors":{ 40 | "width":300, 41 | "vectors":20000, 42 | "keys":480443, 43 | "name":"ja_vectors" 44 | }, 45 | "pipeline":[ 46 | "tok2vec", 47 | "parser", 48 | "attribute_ruler", 49 | "ner", 50 | "morphologizer", 51 | "compound_splitter", 52 | "bunsetu_recognizer" 53 | ], 54 | "components":[ 55 | "tok2vec", 56 | "parser", 57 | "attribute_ruler", 58 | "ner", 59 | "morphologizer", 60 | "compound_splitter", 61 | "bunsetu_recognizer" 62 | ], 63 | "disabled":[ 64 | "attribute_ruler" 65 | ], 66 | "requirements":[ 67 | "sudachipy>=0.6.2,<0.7.0", 68 | "sudachidict_core>=20210802", 69 | "ginza>=5.2.0,<5.3.0" 70 | ] 71 | } 72 | -------------------------------------------------------------------------------- /config/ja_ginza_bert_large.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = true 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | extend = true 41 | overwrite = true 42 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 43 | 44 | [components.morphologizer.model] 45 | @architectures = "spacy.Tagger.v2" 46 | nO = null 47 | normalize = false 48 | 49 | [components.morphologizer.model.tok2vec] 50 | @architectures = "spacy-transformers.TransformerListener.v1" 51 | grad_factor = 1.0 52 | pooling = {"@layers":"reduce_mean.v1"} 53 | upstream = "*" 54 | 55 | [components.ner] 56 | factory = "ner" 57 | incorrect_spans_key = null 58 | moves = null 59 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 60 | update_with_oracle_cut_size = 100 61 | 62 | [components.ner.model] 63 | @architectures = "spacy.TransitionBasedParser.v2" 64 | state_type = "ner" 65 | extra_state_tokens = false 66 | hidden_width = 64 67 | maxout_pieces = 2 68 | use_upper = false 69 | nO = null 70 | 71 | [components.ner.model.tok2vec] 72 | @architectures = "spacy-transformers.TransformerListener.v1" 73 | grad_factor = 1.0 74 | pooling = {"@layers":"reduce_mean.v1"} 75 | upstream = "*" 76 | 77 | [components.parser] 78 | factory = "parser" 79 | learn_tokens = false 80 | min_action_freq = 30 81 | moves = null 82 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 83 | update_with_oracle_cut_size = 100 84 | 85 | [components.parser.model] 86 | @architectures = "spacy.TransitionBasedParser.v2" 87 | state_type = "parser" 88 | extra_state_tokens = false 89 | hidden_width = 128 90 | maxout_pieces = 3 91 | use_upper = false 92 | nO = null 93 | 94 | [components.parser.model.tok2vec] 95 | @architectures = 
"spacy-transformers.TransformerListener.v1" 96 | grad_factor = 1.0 97 | pooling = {"@layers":"reduce_mean.v1"} 98 | upstream = "*" 99 | 100 | [components.transformer] 101 | factory = "transformer" 102 | max_batch_items = 4096 103 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 104 | 105 | [components.transformer.model] 106 | @architectures = "spacy-transformers.TransformerModel.v3" 107 | name = "cl-tohoku/bert-large-japanese-v2" 108 | mixed_precision = false 109 | 110 | [components.transformer.model.get_spans] 111 | @span_getters = "spacy-transformers.strided_spans.v1" 112 | window = 128 113 | stride = 96 114 | 115 | [components.transformer.model.grad_scaler_config] 116 | 117 | [components.transformer.model.tokenizer_config] 118 | use_fast = false 119 | tokenizer_class = "BertJapaneseTokenizer" 120 | do_lower_case = false 121 | word_tokenizer_type = mecab 122 | subword_tokenizer_type = wordpiece 123 | mecab_kwargs = {"mecab_dic":"unidic_lite"} 124 | 125 | [components.transformer.model.transformer_config] 126 | 127 | [corpora] 128 | 129 | [corpora.dev] 130 | @readers = "spacy.Corpus.v1" 131 | path = ${paths.dev} 132 | max_length = 0 133 | gold_preproc = false 134 | limit = 0 135 | augmenter = null 136 | 137 | [corpora.train] 138 | @readers = "spacy.Corpus.v1" 139 | path = ${paths.train} 140 | max_length = 0 141 | gold_preproc = false 142 | limit = 0 143 | augmenter = null 144 | 145 | [training] 146 | accumulate_gradient = 3 147 | dev_corpus = "corpora.dev" 148 | train_corpus = "corpora.train" 149 | seed = ${system.seed} 150 | gpu_allocator = ${system.gpu_allocator} 151 | dropout = 0.1 152 | patience = 0 153 | max_epochs = 0 154 | max_steps = 20000 155 | eval_frequency = 200 156 | frozen_components = [] 157 | annotating_components = [] 158 | before_to_disk = null 159 | before_update = null 160 | 161 | [training.batcher] 162 | @batchers = "spacy.batch_by_padded.v1" 163 | discard_oversize = true 164 | size = 2000 165 | buffer = 256 166 | get_length = null 167 | 168 | [training.logger] 169 | @loggers = "spacy.ConsoleLogger.v1" 170 | progress_bar = false 171 | 172 | [training.optimizer] 173 | @optimizers = "Adam.v1" 174 | beta1 = 0.9 175 | beta2 = 0.999 176 | L2_is_weight_decay = true 177 | L2 = 0.01 178 | grad_clip = 1.0 179 | use_averages = false 180 | eps = 0.00000001 181 | 182 | [training.optimizer.learn_rate] 183 | @schedules = "warmup_linear.v1" 184 | warmup_steps = 250 185 | total_steps = 20000 186 | initial_rate = 0.00005 187 | 188 | [training.score_weights] 189 | pos_acc = 0.15 190 | morph_micro_f = 0.0 191 | morph_per_feat = null 192 | dep_uas = 0.25 193 | dep_las = 0.25 194 | dep_las_per_type = null 195 | sents_p = null 196 | sents_r = null 197 | sents_f = 0.1 198 | ents_f = 0.25 199 | ents_p = 0.0 200 | ents_r = 0.0 201 | ents_per_type = null 202 | tag_acc = 0.0 203 | 204 | [pretraining] 205 | 206 | [initialize] 207 | vectors = null 208 | init_tok2vec = ${paths.init_tok2vec} 209 | vocab_data = null 210 | lookups = null 211 | before_init = null 212 | after_init = null 213 | 214 | [initialize.components] 215 | 216 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza_bert_large.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"ja", 3 | "name":"ginza_bert_large", 4 | "version":"5.2.0b1", 5 | "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + 
transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.", 6 | "author":"Megagon Labs Tokyo.", 7 | "email":"ginza@megagon.ai", 8 | "url":"https://github.com/megagonlabs/ginza", 9 | "license":"MIT License", 10 | "sources":[ 11 | { 12 | "name":"UD_Japanese-BCCWJ r2.8", 13 | "url":"https://github.com/UniversalDependencies/UD_Japanese-BCCWJ", 14 | "license":"CC BY-NC-SA 4.0", 15 | "author":"Asahara, M., Kanayama, H., Tanaka, T., Miyao, Y., Uematsu, S., Mori, S., Matsumoto, Y., Omura, M., & Murawaki, Y." 16 | }, 17 | { 18 | "name":"GSK2014-A(2019)", 19 | "url":"https://www.gsk.or.jp/catalog/gsk2014-a/", 20 | "license":"Individually defined commercial license", 21 | "author":"Tokyo Institute of Technology" 22 | }, 23 | { 24 | "name":"SudachiDict_core", 25 | "url":"https://github.com/WorksApplications/SudachiDict", 26 | "license":"Apache License 2.0", 27 | "author":"Works Applications Enterprise Co., Ltd." 28 | }, 29 | { 30 | "name":"cl-tohoku/bert-large-japanese-v2", 31 | "url":"https://huggingface.co/cl-tohoku/bert-large-japanese-v2", 32 | "license":"Apache License 2.0", 33 | "author":"Tohoku University" 34 | } 35 | ], 36 | "spacy_version":">=3.6.1,<4.0.0", 37 | "spacy_git_version":"458bc5f45", 38 | "pipeline":[ 39 | "transformer", 40 | "parser", 41 | "ner", 42 | "morphologizer", 43 | "compound_splitter", 44 | "bunsetu_recognizer" 45 | ], 46 | "components":[ 47 | "transformer", 48 | "parser", 49 | "attribute_ruler", 50 | "ner", 51 | "morphologizer", 52 | "compound_splitter", 53 | "bunsetu_recognizer" 54 | ], 55 | "disabled":[ 56 | "attribute_ruler" 57 | ], 58 | "vectors":{ 59 | "width":0, 60 | "vectors":0, 61 | "keys":0, 62 | "name":null, 63 | "mode":"default" 64 | }, 65 | "requirements":[ 66 | "sudachipy>=0.6.7,<0.7.0", 67 | "sudachidict_core>=20230711", 68 | "spacy-transformers>=1.2.5,<1.3.0", 69 | "fugashi>=1.3.0", 70 | "unidic-lite>=1.0.8", 71 | "ginza>=5.2.0,<5.3.0" 72 | ] 73 | } -------------------------------------------------------------------------------- /config/ja_ginza_bert_large_analysis.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = false 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | extend = true 41 | overwrite = true 42 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 43 | 44 | [components.morphologizer.model] 45 | @architectures = "spacy.Tagger.v2" 46 | nO = null 47 | normalize = false 48 | 49 | [components.morphologizer.model.tok2vec] 50 | @architectures = 
"spacy-transformers.TransformerListener.v1" 51 | grad_factor = 1.0 52 | pooling = {"@layers":"reduce_mean.v1"} 53 | upstream = "*" 54 | 55 | [components.ner] 56 | factory = "ner" 57 | incorrect_spans_key = null 58 | moves = null 59 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 60 | update_with_oracle_cut_size = 100 61 | 62 | [components.ner.model] 63 | @architectures = "spacy.TransitionBasedParser.v2" 64 | state_type = "ner" 65 | extra_state_tokens = false 66 | hidden_width = 64 67 | maxout_pieces = 2 68 | use_upper = false 69 | nO = null 70 | 71 | [components.ner.model.tok2vec] 72 | @architectures = "spacy-transformers.TransformerListener.v1" 73 | grad_factor = 1.0 74 | pooling = {"@layers":"reduce_mean.v1"} 75 | upstream = "*" 76 | 77 | [components.parser] 78 | factory = "parser" 79 | learn_tokens = false 80 | min_action_freq = 30 81 | moves = null 82 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 83 | update_with_oracle_cut_size = 100 84 | 85 | [components.parser.model] 86 | @architectures = "spacy.TransitionBasedParser.v2" 87 | state_type = "parser" 88 | extra_state_tokens = false 89 | hidden_width = 128 90 | maxout_pieces = 3 91 | use_upper = false 92 | nO = null 93 | 94 | [components.parser.model.tok2vec] 95 | @architectures = "spacy-transformers.TransformerListener.v1" 96 | grad_factor = 1.0 97 | pooling = {"@layers":"reduce_mean.v1"} 98 | upstream = "*" 99 | 100 | [components.transformer] 101 | factory = "transformer" 102 | max_batch_items = 4096 103 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 104 | 105 | [components.transformer.model] 106 | @architectures = "spacy-transformers.TransformerModel.v3" 107 | name = "cl-tohoku/bert-large-japanese-v2" 108 | mixed_precision = false 109 | 110 | [components.transformer.model.get_spans] 111 | @span_getters = "spacy-transformers.strided_spans.v1" 112 | window = 128 113 | stride = 96 114 | 115 | [components.transformer.model.grad_scaler_config] 116 | 117 | [components.transformer.model.tokenizer_config] 118 | use_fast = false 119 | tokenizer_class = "BertJapaneseTokenizer" 120 | do_lower_case = false 121 | word_tokenizer_type = mecab 122 | subword_tokenizer_type = wordpiece 123 | mecab_kwargs = {"mecab_dic":"unidic_lite"} 124 | 125 | [components.transformer.model.transformer_config] 126 | 127 | [corpora] 128 | 129 | [corpora.dev] 130 | @readers = "spacy.Corpus.v1" 131 | path = ${paths.dev} 132 | max_length = 0 133 | gold_preproc = false 134 | limit = 0 135 | augmenter = null 136 | 137 | [corpora.train] 138 | @readers = "spacy.Corpus.v1" 139 | path = ${paths.train} 140 | max_length = 0 141 | gold_preproc = false 142 | limit = 0 143 | augmenter = null 144 | 145 | [training] 146 | accumulate_gradient = 3 147 | dev_corpus = "corpora.dev" 148 | train_corpus = "corpora.train" 149 | seed = ${system.seed} 150 | gpu_allocator = ${system.gpu_allocator} 151 | dropout = 0.1 152 | patience = 0 153 | max_epochs = 0 154 | max_steps = 20000 155 | eval_frequency = 200 156 | frozen_components = [] 157 | annotating_components = [] 158 | before_to_disk = null 159 | before_update = null 160 | 161 | [training.batcher] 162 | @batchers = "spacy.batch_by_padded.v1" 163 | discard_oversize = true 164 | size = 2000 165 | buffer = 256 166 | get_length = null 167 | 168 | [training.logger] 169 | @loggers = "spacy.ConsoleLogger.v1" 170 | progress_bar = false 171 | 172 | [training.optimizer] 173 | @optimizers = "Adam.v1" 174 | beta1 = 0.9 175 | beta2 = 0.999 176 | L2_is_weight_decay = true 177 | L2 = 0.01 178 | grad_clip 
= 1.0 179 | use_averages = false 180 | eps = 0.00000001 181 | 182 | [training.optimizer.learn_rate] 183 | @schedules = "warmup_linear.v1" 184 | warmup_steps = 250 185 | total_steps = 20000 186 | initial_rate = 0.00005 187 | 188 | [training.score_weights] 189 | pos_acc = 0.15 190 | morph_micro_f = 0.0 191 | morph_per_feat = null 192 | dep_uas = 0.25 193 | dep_las = 0.25 194 | dep_las_per_type = null 195 | sents_p = null 196 | sents_r = null 197 | sents_f = 0.1 198 | ents_f = 0.25 199 | ents_p = 0.0 200 | ents_r = 0.0 201 | ents_per_type = null 202 | tag_acc = 0.0 203 | 204 | [pretraining] 205 | 206 | [initialize] 207 | vectors = null 208 | init_tok2vec = ${paths.init_tok2vec} 209 | vocab_data = null 210 | lookups = null 211 | before_init = null 212 | after_init = null 213 | 214 | [initialize.components] 215 | 216 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza_electra.analysis.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"} 29 | validate = false 30 | 31 | [components.bunsetu_recognizer] 32 | factory = "bunsetu_recognizer" 33 | remain_bunsetu_suffix = false 34 | 35 | [components.compound_splitter] 36 | factory = "compound_splitter" 37 | split_mode = null 38 | 39 | [components.morphologizer] 40 | factory = "morphologizer" 41 | extend = true 42 | overwrite = true 43 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 44 | 45 | [components.morphologizer.model] 46 | @architectures = "spacy.Tagger.v1" 47 | nO = null 48 | 49 | [components.morphologizer.model.tok2vec] 50 | @architectures = "spacy-transformers.TransformerListener.v1" 51 | grad_factor = 1.0 52 | pooling = {"@layers":"reduce_mean.v1"} 53 | upstream = "*" 54 | 55 | [components.ner] 56 | factory = "ner" 57 | incorrect_spans_key = null 58 | moves = null 59 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 60 | update_with_oracle_cut_size = 100 61 | 62 | [components.ner.model] 63 | @architectures = "spacy.TransitionBasedParser.v2" 64 | state_type = "ner" 65 | extra_state_tokens = false 66 | hidden_width = 64 67 | maxout_pieces = 2 68 | use_upper = false 69 | nO = null 70 | 71 | [components.ner.model.tok2vec] 72 | @architectures = "spacy-transformers.TransformerListener.v1" 73 | grad_factor = 1.0 74 | pooling = {"@layers":"reduce_mean.v1"} 75 | upstream = "*" 76 | 77 | [components.parser] 78 | factory = "parser" 79 | learn_tokens = false 80 | min_action_freq = 30 81 | moves = null 82 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 83 | update_with_oracle_cut_size = 100 84 | 85 | [components.parser.model] 86 | @architectures = "spacy.TransitionBasedParser.v2" 87 | state_type = "parser" 88 | extra_state_tokens = false 89 | hidden_width = 128 90 | 
maxout_pieces = 3 91 | use_upper = false 92 | nO = null 93 | 94 | [components.parser.model.tok2vec] 95 | @architectures = "spacy-transformers.TransformerListener.v1" 96 | grad_factor = 1.0 97 | pooling = {"@layers":"reduce_mean.v1"} 98 | upstream = "*" 99 | 100 | [components.transformer] 101 | factory = "transformer_custom" 102 | max_batch_items = 4096 103 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 104 | 105 | [components.transformer.model] 106 | @architectures = "spacy-transformers.TransformerModel.v3" 107 | name = "megagonlabs/transformers-ud-japanese-electra-base-ginza-510" 108 | mixed_precision = false 109 | 110 | [components.transformer.model.get_spans] 111 | @span_getters = "spacy-transformers.strided_spans.v1" 112 | window = 128 113 | stride = 96 114 | 115 | [components.transformer.model.grad_scaler_config] 116 | 117 | [components.transformer.model.tokenizer_config] 118 | use_fast = false 119 | tokenizer_class = "sudachitra.tokenization_electra_sudachipy.ElectraSudachipyTokenizer" 120 | do_lower_case = false 121 | do_word_tokenize = true 122 | do_subword_tokenize = true 123 | word_tokenizer_type = "sudachipy" 124 | subword_tokenizer_type = "wordpiece" 125 | word_form_type = "dictionary_and_surface" 126 | 127 | [components.transformer.model.tokenizer_config.sudachipy_kwargs] 128 | split_mode = "A" 129 | dict_type = "core" 130 | 131 | [components.transformer.model.transformer_config] 132 | 133 | [corpora] 134 | 135 | [corpora.dev] 136 | @readers = "spacy.Corpus.v1" 137 | path = ${paths.dev} 138 | max_length = 0 139 | gold_preproc = false 140 | limit = 0 141 | augmenter = null 142 | 143 | [corpora.train] 144 | @readers = "spacy.Corpus.v1" 145 | path = ${paths.train} 146 | max_length = 500 147 | gold_preproc = false 148 | limit = 0 149 | augmenter = null 150 | 151 | [training] 152 | accumulate_gradient = 3 153 | dev_corpus = "corpora.dev" 154 | train_corpus = "corpora.train" 155 | seed = ${system.seed} 156 | gpu_allocator = ${system.gpu_allocator} 157 | dropout = 0.1 158 | patience = 0 159 | max_epochs = 0 160 | max_steps = 50000 161 | eval_frequency = 200 162 | frozen_components = [] 163 | annotating_components = [] 164 | before_to_disk = null 165 | 166 | [training.batcher] 167 | @batchers = "spacy.batch_by_padded.v1" 168 | discard_oversize = true 169 | size = 2000 170 | buffer = 256 171 | get_length = null 172 | 173 | [training.logger] 174 | @loggers = "spacy.ConsoleLogger.v1" 175 | progress_bar = false 176 | 177 | [training.optimizer] 178 | @optimizers = "Adam.v1" 179 | beta1 = 0.9 180 | beta2 = 0.999 181 | L2_is_weight_decay = true 182 | L2 = 0.01 183 | grad_clip = 1.0 184 | use_averages = false 185 | eps = 0.00000001 186 | 187 | [training.optimizer.learn_rate] 188 | @schedules = "warmup_linear.v1" 189 | warmup_steps = 250 190 | total_steps = 50000 191 | initial_rate = 0.00005 192 | 193 | [training.score_weights] 194 | dep_uas = 0.25 195 | dep_las = 0.25 196 | dep_las_per_type = null 197 | sents_p = null 198 | sents_r = null 199 | sents_f = 0.1 200 | ents_f = 0.25 201 | ents_p = 0.0 202 | ents_r = 0.0 203 | ents_per_type = null 204 | pos_acc = 0.15 205 | morph_acc = 0.0 206 | morph_per_feat = null 207 | tag_acc = 0.0 208 | 209 | [pretraining] 210 | 211 | [initialize] 212 | vectors = null 213 | init_tok2vec = ${paths.init_tok2vec} 214 | vocab_data = null 215 | lookups = null 216 | before_init = null 217 | after_init = null 218 | 219 | [initialize.components] 220 | 221 | [initialize.tokenizer] 
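The ja_ginza_electra analysis config above and the training config that follows describe the same packaged pipeline and differ mainly in `remain_bunsetu_suffix` (false for analysis, true for training) and in the transformer checkpoint name. A minimal sketch of loading and inspecting that packaged pipeline is shown below; it is illustrative only, assumes the ja-ginza-electra package is installed, and uses the sample sentence from docs/command_line_tool.md.

```python
# Illustrative sketch (assumes `pip install ja-ginza-electra` has been run).
import spacy
import ginza

nlp = spacy.load("ja_ginza_electra")
print(nlp.pipe_names)  # transformer, parser, ner, morphologizer, compound_splitter, bunsetu_recognizer
print(nlp.disabled)    # attribute_ruler, per the disabled list in the [nlp] block

# The [nlp.tokenizer] block sets SudachiPy split mode "C"; it can be switched at runtime.
ginza.set_split_mode(nlp, "A")
doc = nlp("銀座でランチをご一緒しましょう。")
print([(t.orth_, t.pos_, t.dep_) for t in doc])
```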
-------------------------------------------------------------------------------- /config/ja_ginza_electra.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = true 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | extend = true 41 | overwrite = true 42 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 43 | 44 | [components.morphologizer.model] 45 | @architectures = "spacy.Tagger.v1" 46 | nO = null 47 | 48 | [components.morphologizer.model.tok2vec] 49 | @architectures = "spacy-transformers.TransformerListener.v1" 50 | grad_factor = 1.0 51 | pooling = {"@layers":"reduce_mean.v1"} 52 | upstream = "*" 53 | 54 | [components.ner] 55 | factory = "ner" 56 | incorrect_spans_key = null 57 | moves = null 58 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 59 | update_with_oracle_cut_size = 100 60 | 61 | [components.ner.model] 62 | @architectures = "spacy.TransitionBasedParser.v2" 63 | state_type = "ner" 64 | extra_state_tokens = false 65 | hidden_width = 64 66 | maxout_pieces = 2 67 | use_upper = false 68 | nO = null 69 | 70 | [components.ner.model.tok2vec] 71 | @architectures = "spacy-transformers.TransformerListener.v1" 72 | grad_factor = 1.0 73 | pooling = {"@layers":"reduce_mean.v1"} 74 | upstream = "*" 75 | 76 | [components.parser] 77 | factory = "parser" 78 | learn_tokens = false 79 | min_action_freq = 30 80 | moves = null 81 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 82 | update_with_oracle_cut_size = 100 83 | 84 | [components.parser.model] 85 | @architectures = "spacy.TransitionBasedParser.v2" 86 | state_type = "parser" 87 | extra_state_tokens = false 88 | hidden_width = 128 89 | maxout_pieces = 3 90 | use_upper = false 91 | nO = null 92 | 93 | [components.parser.model.tok2vec] 94 | @architectures = "spacy-transformers.TransformerListener.v1" 95 | grad_factor = 1.0 96 | pooling = {"@layers":"reduce_mean.v1"} 97 | upstream = "*" 98 | 99 | [components.transformer] 100 | factory = "transformer_custom" 101 | max_batch_items = 4096 102 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 103 | 104 | [components.transformer.model] 105 | @architectures = "spacy-transformers.TransformerModel.v3" 106 | name = "megagonlabs/transformers-ud-japanese-electra-base-discriminator" 107 | mixed_precision = false 108 | 109 | [components.transformer.model.get_spans] 110 | @span_getters = "spacy-transformers.strided_spans.v1" 111 | window = 128 112 | stride = 96 113 | 114 | [components.transformer.model.grad_scaler_config] 115 | 116 | 
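# The tokenizer_config below wires the transformer to SudachiPy: sudachitra's
# ElectraSudachipyTokenizer word-tokenizes with SudachiPy (split_mode "A", core
# dictionary, dictionary_and_surface word forms) and then applies WordPiece
# subword splitting.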
[components.transformer.model.tokenizer_config] 117 | use_fast = false 118 | tokenizer_class = "sudachitra.tokenization_electra_sudachipy.ElectraSudachipyTokenizer" 119 | do_lower_case = false 120 | do_word_tokenize = true 121 | do_subword_tokenize = true 122 | word_tokenizer_type = "sudachipy" 123 | subword_tokenizer_type = "wordpiece" 124 | word_form_type = "dictionary_and_surface" 125 | 126 | [components.transformer.model.tokenizer_config.sudachipy_kwargs] 127 | split_mode = "A" 128 | dict_type = "core" 129 | 130 | [components.transformer.model.transformer_config] 131 | 132 | [corpora] 133 | 134 | [corpora.dev] 135 | @readers = "spacy.Corpus.v1" 136 | path = ${paths.dev} 137 | max_length = 0 138 | gold_preproc = false 139 | limit = 0 140 | augmenter = null 141 | 142 | [corpora.train] 143 | @readers = "spacy.Corpus.v1" 144 | path = ${paths.train} 145 | max_length = 500 146 | gold_preproc = false 147 | limit = 0 148 | augmenter = null 149 | 150 | [training] 151 | accumulate_gradient = 3 152 | dev_corpus = "corpora.dev" 153 | train_corpus = "corpora.train" 154 | seed = ${system.seed} 155 | gpu_allocator = ${system.gpu_allocator} 156 | dropout = 0.1 157 | patience = 0 158 | max_epochs = 0 159 | max_steps = 50000 160 | eval_frequency = 200 161 | frozen_components = [] 162 | before_to_disk = null 163 | annotating_components = [] 164 | 165 | [training.batcher] 166 | @batchers = "spacy.batch_by_padded.v1" 167 | discard_oversize = true 168 | size = 2000 169 | buffer = 256 170 | get_length = null 171 | 172 | [training.logger] 173 | @loggers = "spacy.ConsoleLogger.v1" 174 | progress_bar = false 175 | 176 | [training.optimizer] 177 | @optimizers = "Adam.v1" 178 | beta1 = 0.9 179 | beta2 = 0.999 180 | L2_is_weight_decay = true 181 | L2 = 0.01 182 | grad_clip = 1.0 183 | use_averages = false 184 | eps = 0.00000001 185 | 186 | [training.optimizer.learn_rate] 187 | @schedules = "warmup_linear.v1" 188 | warmup_steps = 250 189 | total_steps = 50000 190 | initial_rate = 0.00005 191 | 192 | [training.score_weights] 193 | dep_uas = 0.25 194 | dep_las = 0.25 195 | dep_las_per_type = null 196 | sents_p = null 197 | sents_r = null 198 | sents_f = 0.1 199 | ents_f = 0.25 200 | ents_p = 0.0 201 | ents_r = 0.0 202 | ents_per_type = null 203 | pos_acc = 0.15 204 | morph_acc = 0.0 205 | morph_per_feat = null 206 | tag_acc = 0.0 207 | 208 | [pretraining] 209 | 210 | [initialize] 211 | vectors = null 212 | init_tok2vec = ${paths.init_tok2vec} 213 | vocab_data = null 214 | lookups = null 215 | before_init = null 216 | after_init = null 217 | 218 | [initialize.components] 219 | 220 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza_electra.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"ja", 3 | "name":"ginza_electra", 4 | "version":"5.2.0", 5 | "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. 
Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.", 6 | "author":"Megagon Labs Tokyo.", 7 | "email":"ginza@megagon.ai", 8 | "url":"https://github.com/megagonlabs/ginza", 9 | "license":"MIT License", 10 | "sources":[ 11 | { 12 | "name":"UD_Japanese-BCCWJ r2.8", 13 | "url":"https://github.com/UniversalDependencies/UD_Japanese-BCCWJ", 14 | "license":"CC BY-NC-SA 4.0", 15 | "author":"Asahara, M., Kanayama, H., Tanaka, T., Miyao, Y., Uematsu, S., Mori, S., Matsumoto, Y., Omura, M., & Murawaki, Y." 16 | }, 17 | { 18 | "name":"GSK2014-A(2019)", 19 | "url":"https://www.gsk.or.jp/catalog/gsk2014-a/", 20 | "license":"Individually defined commercial license", 21 | "author":"Tokyo Institute of Technology" 22 | }, 23 | { 24 | "name":"SudachiDict_core", 25 | "url":"https://github.com/WorksApplications/SudachiDict", 26 | "license":"Apache License 2.0", 27 | "author":"Works Applications Enterprise Co., Ltd." 28 | }, 29 | { 30 | "name":"mC4", 31 | "url":"https://huggingface.co/datasets/mc4", 32 | "license":"ODC-BY-1.0", 33 | "title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer", 34 | "author":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, & Peter J. Liu" 35 | }, 36 | { 37 | "name":"megagonlabs/transformers-ud-japanese-electra-base-ginza-5.1.0", 38 | "url":"https://huggingface.co/megagonlabs/transformers-ud-japanese-electra-base-ginza-5.1.0", 39 | "license":"MIT Licence", 40 | "author":"Hiroshi Matsuda (Megagon Labs Tokyo, Recruit Co., Ltd.)" 41 | } 42 | ], 43 | "parent_package":"spacy", 44 | "spacy_version":">=3.4.4,<4.0.0", 45 | "spacy_git_version":"0fc3dee77", 46 | "pipeline":[ 47 | "transformer", 48 | "parser", 49 | "attribute_ruler", 50 | "ner", 51 | "morphologizer", 52 | "compound_splitter", 53 | "bunsetu_recognizer" 54 | ], 55 | "components":[ 56 | "transformer", 57 | "parser", 58 | "attribute_ruler", 59 | "ner", 60 | "morphologizer", 61 | "compound_splitter", 62 | "bunsetu_recognizer" 63 | ], 64 | "disabled": [ 65 | "attribute_ruler" 66 | ], 67 | "vectors":{ 68 | "width":0, 69 | "vectors":0, 70 | "keys":0, 71 | "name":null 72 | }, 73 | "requirements":[ 74 | "sudachipy>=0.6.2,<0.7.0", 75 | "sudachidict_core>=20210802", 76 | "sudachitra>=0.1.6,<0.2.0", 77 | "ginza-transformers>=0.4.0,<0.5.0", 78 | "ginza>=5.2.0,<5.3.0" 79 | ] 80 | } 81 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | title: GiNZA - Japanese NLP Library 3 | description: Universal Dependenciesに基づくオープンソース日本語NLPライブラリ 4 | -------------------------------------------------------------------------------- /docs/bunsetu_api.md: -------------------------------------------------------------------------------- 1 | # 文節APIの解説 2 | 3 | ## GiNZAの解析モデルと文節単位の解析API 4 | 5 | GiNZA独自の文節解析モデルにより、Universal Dependenciesの枠組みの中で日本語に特徴的な文節構造を考慮することができます。 6 | 7 | ![bunsetu_heads](https://github.com/megagonlabs/ginza/raw/static/docs/images/bunsetu_heads.png) 8 | 9 | またGiNZA v4で追加された解析APIを用いることで、文節やその主辞を単位とした分析がこれまでよりずっと容易になります。 10 | ```python 11 | from ginza import * 12 | import spacy 13 | nlp = spacy.load("ja_ginza") # GiNZAモデルの読み込み 14 | 15 | from collections import defaultdict 16 | frames = defaultdict(lambda: 0) # 依存関係の出現頻度を格納 17 | sentences = set() # 重複文検出用のset 18 | 19 | with open("sentences.txt", "r") as fin: # 解析対象のテキストファイルから 20 | for 
line in fin: # 一行ごとに 21 | try: 22 | doc = nlp(line.rstrip()) # 解析を実行し 23 | except: 24 | continue 25 | for sent in doc.sents: # 文単位でループ 26 | if sent.text in sentences: 27 | continue # 重複文はスキップ 28 | sentences.add(sent.text) 29 | for t in bunsetu_head_tokens(sent): # 文節主辞トークンのうち 30 | if t.pos_ not in {"ADJ", "VERB"}: 31 | continue # 述語以外はスキップ 32 | v = phrase(lemma_)(t) # 述語とその格要素(主語・目的語相当)の句を集める 33 | dep_phrases = sub_phrases(t, phrase(lemma_), is_not_stop) 34 | subj = [phrase for dep, phrase in dep_phrases if dep in {"nsubj"}] 35 | obj = [phrase for dep, phrase in dep_phrases if dep in {"obj", "iobj"}] 36 | for s in subj: 37 | for o in obj: 38 | frames[(s, o, v)] += 1 # 格要素と述語の組み合わせをカウント 39 | 40 | for frame, count in sorted(frames.items(), key=lambda t: -t[1]): 41 | print(count, *frame, sep="\t") # 出現頻度の高い順に表示 42 | ``` 43 | 44 | #### 表1 GiNZAの文節APIの一覧 45 | 46 | | category | func or variable | description | 47 | | --- | --- | --- | 48 | | Span-based | | | 49 | | | bunsetu_spans() | 文節SpanのIterable。 | 50 | | | bunsetu_phrase_spans() | 文節主辞SpanのIterable。 | 51 | | | bunsetu_span() | トークンが属する文節のSpan。 | 52 | | | bunsetu_phrase_span() | トークンが属する文節の主辞Span。 | 53 | | Construction | | | 54 | | | bunsetu() | 文節中のトークン列を指定された形に整形して返す。 | 55 | | | phrase() | 文節主辞中のトークン列を指定された形に整形して
返す。 | 56 | | | sub_phrases() | 従属文節を指定された形に整形して返す。 | 57 | | | phrases() | スパンに含まれる文節を指定された形に整形して返す。 | 58 | | Utility | | | 59 | | | traverse() | 構文木を指定された方法で巡回し指定された形に整形して返す。 | 60 | | | default_join_func() | デフォルトのトークン列の結合方法。 | 61 | | | SEP | デフォルトのトークン区切り文字。 | 62 | | Token-based | | | 63 | | | bunsetu_head_list() | DocやSpanに含まれる文節のヘッドトークンのインデックスのリスト。 | 64 | | | bunsetu_head_tokens() | DocやSpanに含まれる文節のヘッドトークンのリスト。 | 65 | | | bunsetu_bi_labels() | DocやSpanに含まれるトークンが文節開始位置にある場合は"B"、それ以外は"I"とするリスト。 | 66 | | | bunsetu_position_types() | DocやSpanに含まれるトークンを{"ROOT", "SEM_HEAD", "SYN_HEAD", "NO_HEAD", "FUNC", "CONT"}に分類したリスト。 | 67 | | | is_bunsetu_head() | トークンが文節のヘッドの場合はTrue、それ以外はFalse。 | 68 | | | bunsetu_bi_label() | トークンが文節開始位置にある場合は"B"、それ以外は"I"。 | 69 | | | bunsetu_position_type() | トークンを{"ROOT", "SEM_HEAD", "SYN_HEAD", "NO_HEAD", "FUNC", "CONT"}に分類。 | 70 | | Proxy | | | 71 | | | * | spacy.tokens.Tokenクラスのプロパティと
同名・同機能の関数群。 | 72 | | Subtoken | | | 73 | | | sub_tokens() | トークンの分割情報。 | 74 | | | set_split_mode() | デフォルトの分割モードの変更。 | 75 | | Clause | | | 76 | | | clauses() | 節単位に分割されたトークン列。(experimental) | 77 | | | clause_head() | トークンが属する節のヘッドとなるトークン。(experimental) | 78 | | | clause_head_i() | トークンが属する節のヘッドとなるトークン番号。(experimental) | 79 | 80 | ## 解説資料 81 | 82 | 詳細な解説はこちらの記事をご覧ください。 83 | 84 | - [GiNZA version 4.0: 多言語依存構造解析技術への文節APIの統合 - Megagon Labs Blog](https://www.megagon.ai/jp/blog/ginza-version-4-0/) 85 | - [GiNZA - Universal Dependenciesによる実用的日本語解析 - 自然言語処理 Volume 27 Number 3](https://www.jstage.jst.go.jp/article/jnlp/27/3/27_695/_article/-char/ja/) 86 | -------------------------------------------------------------------------------- /docs/command_line_tool.md: -------------------------------------------------------------------------------- 1 | # コマンドラインツールの解説 2 | 3 | ## ginza 4 | 5 | `ginza`コマンドはコマンドライン引数で指定されたファイル(指定されない場合は標準入力)から一行を単位としてテキストを読み込み、解析結果を標準出力に[CoNLL-U Syntactic Annotation](https://universaldependencies.org/format.html#syntactic-annotation) 形式で出力します。 6 | ```console 7 | $ ginza 8 | 銀座でランチをご一緒しましょう。 9 | # text = 銀座でランチをご一緒しましょう。 10 | 1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 nmod _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City|ClauseHead=6 11 | 2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ|ClauseHead=6 12 | 3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ|ClauseHead=6 13 | 4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ|ClauseHead=6 14 | 5 ご ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|NP_B|Reading=ゴ|ClauseHead=6 15 | 6 一緒 一緒 NOUN 名詞-普通名詞-サ変可能 _ 0 root _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|NP_I|Reading=イッショ|ClauseHead=6 16 | 7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ|ClauseHead=6 17 | 8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ|ClauseHead=6 18 | 9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。|ClauseHead=6 19 | 20 | ``` 21 | 22 | ## ginzame 23 | 24 | `ginzame`コマンドでオープンソース形態素解析エンジン [MeCab](https://taku910.github.io/mecab/) の`mecab`コマンドに近い形式で解析結果を出力することができます。 25 | `ginzame`コマンドは形態素解析処理のみをマルチプロセスで高速に実行します。 26 | このコマンドと`mecab`の出力形式の相違点として、最終フィールド(発音)が常に`*`となること、 27 | ginza の split_mode はデフォルトが `C` なので unidic 相当の単語分割を得るためには `-s A` を指定する必要があることに注意して下さい。 28 | ```console 29 | $ ginzame 30 | 銀座でランチをご一緒しましょう。 31 | 銀座 名詞,固有名詞,地名,一般,*,*,銀座,ギンザ,* 32 | で 助詞,格助詞,*,*,*,*,で,デ,* 33 | ランチ 名詞,普通名詞,一般,*,*,*,ランチ,ランチ,* 34 | を 助詞,格助詞,*,*,*,*,を,ヲ,* 35 | ご 接頭辞,*,*,*,*,*,御,ゴ,* 36 | 一緒 名詞,普通名詞,サ変可能,*,*,*,一緒,イッショ,* 37 | し 動詞,非自立可能,*,*,サ行変格,連用形-一般,為る,シ,* 38 | ましょう 助動詞,*,*,*,助動詞-マス,意志推量形,ます,マショウ,* 39 | 。 補助記号,句点,*,*,*,*,。,。,* 40 | EOS 41 | 42 | ``` 43 | 44 | ## OPTIONS 45 | `ginza`コマンドでは以下のオプションを指定することができます。 46 | `ginzame`コマンドでは `--split-mode` `--hash-comment` `output-path` `--use-normalized-form` `--parallel` オプションが利用可能です。 47 | 48 | - `--model-path `, `-b ` 49 | `spacy.language.Language` 形式の学習済みモデルが保存されたディレクトリを指定します。 50 | `--ensure-model` オプションと同時に指定することはできません。 51 | - `--ensure-model `, `-m ` 52 | ginza および spaCy が公開している学習済みモデル名を指定します。`--model-path` オプションと同時に指定することはできません。次の値のいずれかを指定できます。 53 | - `ja_ginza`, `ja_ginza_electra` 54 | - [spaCy Models & 
Languages](https://spacy.io/usage/models)で公開されている日本語以外を含む全ての言語のモデル (例: en_core_web_md) 55 | 使用するモデルに応じて、事前に `pip install ja-ginza-electra` のようにパッケージをダウンロードする必要があります。 56 | `--model-path`, `--ensure-model` のどちらも指定されない場合には `ja_ginza_electra`、`ja_ginza` の順の優先度でロード可能なモデルを利用します。 57 | - `--split-mode `, `-s ` 58 | 複合名詞の分割モードを指定します。モードは [sudachi](https://github.com/WorksApplications/Sudachi#the-modes-of-splitting) に準拠し、`A`、`B`、`C`のいずれかを指定できます。`ginza`コマンドのデフォルト値は `C`、`ginzame`コマンドのデフォルト値はMeCab UniDicに近い `A` です。 59 | `A`が分割が最も短く複合名詞が UniDic 短単位まで分割され、 `C` では固有名詞が抽出されます。`B` は二つの中間の単位に分割されます。 60 | - `--hash-comment `, `-c ` 61 | 行頭が `#` から始まる行を解析対象とするかのモードを指定します。次の値のいずれかを指定できます。 62 | - `print` 63 | 解析対象とはしないが、解析結果には入力をそのまま出力します。 64 | - `skip` 65 | 解析対象とせず、解析結果にも出力しません。 66 | - `analyze` 67 | `#` から始まる行についても解析を行い、結果を出力します。ただし`-f json`が指定されている場合は `-c`の指定に依らず常に`analyze`が適用されます。 68 | デフォルト値は `print` です。 69 | - `--output-path `, `-o ` 70 | 解析結果を出力するファイルのパスを指定します。指定しない場合には標準出力に解析結果が出力されます。 71 | - `--output-format `, `-f ` 72 | [解析結果のフォーマット](#出力形式の指定)を指定します。次の値のいずれかを指定できます。 73 | - `0`, `conllu` 74 | - `1`, `cabocha` 75 | - `2`, `mecab` 76 | - `3`, `json` 77 | デフォルト値は `conllu` です。 78 | - `--require-gpu `, `-g ` 79 | 引数で指定されたgpu_idのGPUを使用して解析を行います。引数に-1を指定(デフォルト)するとCPUを使用します。ただし、[spaCyおよびcupyの制約](https://github.com/explosion/spaCy/issues/5507)から、`--require-gpu`は`--parallel`と同時に指定できません。 80 | - `--use-normalized-form`, `-n` 81 | `-f conllu`のlemmaフィールドに [sudachi](https://github.com/WorksApplications/Sudachi#normalized-form) を使用するためのブールスイッチ。 82 | - `--disable-sentencizer`, `-d` 83 | `ja_ginza`、 `ja_ginza_electra` モデル利用時に[disable_sentencizer](https://github.com/megagonlabs/ginza/blob/develop/ginza/disable_sentencizer.py)を有効化するブールスイッチ。 84 | - `--parallel `, `-p ` 85 | 並列実行するプロセス数を指定します。0 を指定すると cpu コア数分のプロセスを起動します。デフォルト値は1です。 86 | 87 | ## 出力形式の指定 88 | 89 | ### JSON 90 | 91 | spaCyの学習用JSON形式での出力は`ginza -f 3` または `ginza -f json`を実行してください。 92 | ```console 93 | $ ginza -f json 94 | 銀座でランチをご一緒しましょう。 95 | [ 96 | { 97 | "paragraphs": [ 98 | { 99 | "raw": "銀座でランチをご一緒しましょう。", 100 | "sentences": [ 101 | { 102 | "tokens": [ 103 | {"id": 1, "orth": "銀座", "tag": "名詞-固有名詞-地名-一般", "pos": "PROPN", "lemma": "銀座", "head": 5, "dep": "obl", "ner": "B-City"}, 104 | {"id": 2, "orth": "で", "tag": "助詞-格助詞", "pos": "ADP", "lemma": "で", "head": -1, "dep": "case", "ner": "O"}, 105 | {"id": 3, "orth": "ランチ", "tag": "名詞-普通名詞-一般", "pos": "NOUN", "lemma": "ランチ", "head": 3, "dep": "obj", "ner": "O"}, 106 | {"id": 4, "orth": "を", "tag": "助詞-格助詞", "pos": "ADP", "lemma": "を", "head": -1, "dep": "case", "ner": "O"}, 107 | {"id": 5, "orth": "ご", "tag": "接頭辞", "pos": "NOUN", "lemma": "ご", "head": 1, "dep": "compound", "ner": "O"}, 108 | {"id": 6, "orth": "一緒", "tag": "名詞-普通名詞-サ変可能", "pos": "VERB", "lemma": "一緒", "head": 0, "dep": "ROOT", "ner": "O"}, 109 | {"id": 7, "orth": "し", "tag": "動詞-非自立可能", "pos": "AUX", "lemma": "する", "head": -1, "dep": "advcl", "ner": "O"}, 110 | {"id": 8, "orth": "ましょう", "tag": "助動詞", "pos": "AUX", "lemma": "ます", "head": -2, "dep": "aux", "ner": "O"}, 111 | {"id": 9, "orth": "。", "tag": "補助記号-句点", "pos": "PUNCT", "lemma": "。", "head": -3, "dep": "punct", "ner": "O"} 112 | ] 113 | } 114 | ] 115 | } 116 | ] 117 | } 118 | ] 119 | ``` 120 | 121 | ### CaboCha 122 | 123 | 日本語係り受け解析器 [CaboCha](https://taku910.github.io/cabocha/) の`cabocha -f1`のラティス形式に近い解析結果を出力する場合は 124 | `ginza -f 1` または `ginza -f cabocha` を実行して下さい。 125 | このオプションと`cabocha -f1`の出力形式の相違点として、 126 | スラッシュ記号`/`に続く`func_index`フィールドが常に自立語の終了位置(機能語があればその開始位置に一致)を示すこと、 127 | 
機能語認定基準が一部異なること、 128 | に注意して下さい。 129 | ```console 130 | $ ginza -f cabocha 131 | 銀座でランチをご一緒しましょう。 132 | * 0 2D 0/1 0.000000 133 | 銀座 名詞,固有名詞,地名,一般,,銀座,ギンザ,* B-City 134 | で 助詞,格助詞,*,*,,で,デ,* O 135 | * 1 2D 0/1 0.000000 136 | ランチ 名詞,普通名詞,一般,*,,ランチ,ランチ,* O 137 | を 助詞,格助詞,*,*,,を,ヲ,* O 138 | * 2 -1D 0/2 0.000000 139 | ご 接頭辞,*,*,*,,ご,ゴ,* O 140 | 一緒 名詞,普通名詞,サ変可能,*,,一緒,イッショ,* O 141 | し 動詞,非自立可能,*,*,サ行変格,連用形-一般,する,シ,* O 142 | ましょう 助動詞,*,*,*,助動詞-マス,意志推量形,ます,マショウ,* O 143 | 。 補助記号,句点,*,*,,。,。,* O 144 | EOS 145 | 146 | ``` 147 | 148 | ## マルチプロセス実行 (Experimental) 149 | 150 | `-p NUM_PROCESS` オプションで解析処理のマルチプロセス実行が可能になります。 151 | `NUM_PROCESS`には並列実行するプロセス数を整数で指定します。 152 | 0以下の値は`実行環境のCPUコア数+NUM_PROCESS`を指定したのと等価になります。 153 | 154 | `ginza -f mecab`とそのエイリアスである`ginzame`以外で`-p NUM_PROCESS`オプションを使用する場合は、 155 | 実行環境の空きメモリ容量が十分あることを事前に確認してください。 156 | マルチプロセス実行では1プロセスあたり`ja_ginza`で数百MB、`ja_ginza_electra`で数GBのメモリが必要です。 157 | -------------------------------------------------------------------------------- /docs/developer_reference.md: -------------------------------------------------------------------------------- 1 | # 開発者向けの情報 2 | 3 | ## 開発環境 4 | 5 | ### 開発環境のセットアップ 6 | 7 | #### 1. githubからclone 8 | ```console 9 | $ git clone 'https://github.com/megagonlabs/ginza.git' 10 | ``` 11 | 12 | #### 2. pip install および setup.sh の実行 13 | ```console 14 | $ pip install -U -r requirements.txt 15 | $ python setup.py develop 16 | ``` 17 | 18 | #### 3. GPU用ライブラリのセットアップ (Optional) 19 | CUDA v11.0の場合は次のように指定します。 20 | ```console 21 | $ pip install -U spacy[cuda110] 22 | ``` 23 | 24 | ### 訓練の実行 25 | GiNZAの解析モデル `ja_ginza` はspaCy標準コマンドを使用して学習を行っています。 26 | ```console 27 | $ python -m spacy train ja ja_ginza-4.0.0 corpus/ja_ginza-ud-train.json corpus/ja_ginza-ud-dev.json -b ja_vectors_chive_mc90_35k/ -ovl 0.3 -n 100 -m meta.json.ginza -V 4.0.0 28 | ``` 29 | 30 | ### トラブルシューティング 31 | 32 | Google Colab 環境ではインストール後にパッケージ情報の再読込が必要な場合があります。詳細はリンクの記事をご確認下さい。 33 | ```python 34 | import pkg_resources, imp 35 | imp.reload(pkg_resources) 36 | ``` 37 | [【GiNZA】GoogleColabで日本語NLPライブラリGiNZAがloadできない](https://www.sololance.tokyo/2019/10/colab-load-ginza.html) 38 | 39 | インストール時にCythonに関するエラーが発生した場合は、次のように環境変数CFLAGSを設定してください。 40 | ```console 41 | $ CFLAGS='-stdlib=libc++' pip install ginza 42 | ``` 43 | 44 | ## ユーザ辞書の使用 45 | 46 | GiNZAはTokenizer(形態素解析レイヤ)にSudachiPyを使用しています。 47 | GiNZAでユーザ辞書を使用するにはSudachiPyの辞書設定ファイル `sudachi.json` の `userDict` フィールドに、 48 | コンパイル済みのユーザ辞書ファイルのパスのリストを指定します。 49 | 50 | SudachiPyのユーザ辞書ファイルのコンパイル方法についてはSudachiPyのGitHubリポジトリで公開されているドキュメントを参照してください。 51 | [SudachiPy - User defined Dictionary](https://github.com/WorksApplications/SudachiPy#user-defined-dictionary) 52 | [Sudachi ユーザー辞書作成方法](https://github.com/WorksApplications/Sudachi/blob/develop/docs/user_dict.md) 53 | -------------------------------------------------------------------------------- /ginza/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import singledispatch 2 | from typing import Callable, Iterable, Union, Tuple, TypeVar 3 | 4 | from sudachipy.morpheme import Morpheme 5 | 6 | from spacy.lang.ja import DetailedToken 7 | from spacy.language import Language 8 | from spacy.tokens import Doc, Span, Token 9 | 10 | from .bunsetu_recognizer import * 11 | from .compound_splitter import * 12 | from .disable_sentencizer import * 13 | from .ene_ontonotes_mapper import ENE_ONTONOTES_MAPPING 14 | 15 | 16 | __all__ = [ 17 | "make_compound_splitter", "make_bunsetu_recognizer", "make_disable_sentencizer", 18 | 
"force_using_normalized_form_as_lemma", "set_split_mode", 19 | "token_i", "text", "text_with_ws", "orth", "orth_", 20 | "ent_type", "ent_type_", "ent_iob", "ent_iob_", 21 | "lemma", "lemma_", "norm", "norm_", 22 | "pos", "pos_", "tag", "tag_", "dep", "dep_", 23 | "is_sent_start", "is_stop", "is_not_stop", 24 | "ent_label_ene", "ent_label_ontonotes", 25 | "reading_form", "inflection", 26 | "bunsetu_bi_label", "bunsetu_position_type", "is_bunsetu_head", 27 | "clauses","token_clause_head", 28 | "SEP", "default_join_func", 29 | "traverse", 30 | "head", "ancestors", "conjuncts", "children", "lefts", "rights", "subtree", 31 | "bunsetu", "phrase", "sub_phrases", "phrases", 32 | "sub_tokens", 33 | # from bunsetu_recognizer 34 | "bunsetu_span", 35 | "bunsetu_spans", 36 | "bunsetu_phrase_span", 37 | "bunsetu_phrase_spans", 38 | "bunsetu_head_list", 39 | "bunsetu_head_tokens", 40 | "bunsetu_bi_labels", 41 | "bunsetu_position_types", 42 | "clauses", 43 | "clause_head", 44 | "clause_head_i", 45 | "BunsetuRecognizer", 46 | # from compound_splitter 47 | "CompoundSplitter", 48 | "tag_to_pos", 49 | ] 50 | 51 | 52 | @Language.factory( 53 | "compound_splitter", 54 | requires=[], 55 | assigns=[], 56 | retokenizes=True, 57 | default_config={"split_mode": None}, 58 | ) 59 | def make_compound_splitter( 60 | nlp: Language, 61 | name: str, 62 | split_mode: str = None, 63 | ): 64 | return CompoundSplitter( 65 | nlp.vocab, 66 | split_mode, 67 | ) 68 | 69 | 70 | @Language.factory( 71 | "bunsetu_recognizer", 72 | requires=["token.dep"], 73 | assigns=["token.dep"], 74 | retokenizes=False, 75 | default_config={}, 76 | ) 77 | def make_bunsetu_recognizer( 78 | nlp: Language, 79 | name: str, 80 | remain_bunsetu_suffix: bool = False, 81 | ): 82 | return BunsetuRecognizer( 83 | nlp.vocab, 84 | remain_bunsetu_suffix, 85 | ) 86 | 87 | @Language.factory( 88 | "disable_sentencizer", 89 | requires=[], 90 | assigns=[], 91 | retokenizes=False, 92 | default_config={}, 93 | ) 94 | def make_disable_sentencizer( 95 | nlp: Language, 96 | name: str, 97 | ): 98 | return DisableSentencizer( 99 | nlp.vocab, 100 | ) 101 | 102 | 103 | _morpheme_dictionary_form = None 104 | 105 | 106 | def force_using_normalized_form_as_lemma(force: bool): 107 | global _morpheme_dictionary_form 108 | if force and not _morpheme_dictionary_form: 109 | _morpheme_dictionary_form = Morpheme.dictionary_form 110 | Morpheme.dictionary_form = Morpheme.normalized_form 111 | elif not force and _morpheme_dictionary_form: 112 | Morpheme.dictionary_form = _morpheme_dictionary_form 113 | 114 | 115 | def set_split_mode(nlp: Language, mode: str): 116 | if nlp.has_pipe("compound_splitter"): 117 | splitter = nlp.get_pipe("compound_splitter") 118 | splitter.split_mode = mode 119 | 120 | 121 | # token field getters 122 | 123 | def token_i(token: Token) -> int: 124 | return token.i 125 | 126 | 127 | def text(token: Token) -> str: 128 | return token.text 129 | 130 | 131 | def text_with_ws(token: Token) -> str: 132 | return token.text_with_ws 133 | 134 | 135 | def orth(token: Token) -> int: 136 | return token.orth 137 | 138 | 139 | def orth_(token: Token) -> str: 140 | return token.orth_ 141 | 142 | 143 | def ent_type(token: Token) -> int: 144 | return token.ent_type 145 | 146 | 147 | def ent_type_(token: Token) -> str: 148 | return ENE_ONTONOTES_MAPPING.get(token.ent_type_, "OTHERS") 149 | 150 | 151 | def ent_iob(token: Token) -> int: 152 | return token.ent_iob 153 | 154 | 155 | def ent_iob_(token: Token) -> str: 156 | return token.ent_iob_ 157 | 158 | 159 | def lemma(token: Token) 
-> int: 160 | return token.lemma 161 | 162 | 163 | def lemma_(token: Token) -> str: 164 | return token.lemma_ 165 | 166 | 167 | def norm(token: Token) -> int: 168 | return token.norm 169 | 170 | 171 | def norm_(token: Token) -> str: 172 | return token.norm_ 173 | 174 | 175 | def pos(token: Token) -> int: 176 | return token.pos 177 | 178 | 179 | def pos_(token: Token) -> str: 180 | return token.pos_ 181 | 182 | 183 | def tag(token: Token) -> int: 184 | return token.tag 185 | 186 | 187 | def tag_(token: Token) -> str: 188 | return token.tag_ 189 | 190 | 191 | def dep(token: Token) -> int: 192 | return token.dep 193 | 194 | 195 | def dep_(token: Token) -> str: 196 | return token.dep_ 197 | 198 | 199 | def is_sent_start(token: Token) -> bool: 200 | return token.is_sent_start 201 | 202 | 203 | def is_stop(token: Token) -> bool: 204 | return token.is_stop 205 | 206 | 207 | def is_not_stop(token: Token) -> bool: 208 | return not token.is_stop 209 | 210 | 211 | def ent_label_ene(token: Token) -> str: 212 | if token.ent_iob_ in "BI": 213 | return token.ent_iob_ + "-" + token.ent_type_ 214 | else: 215 | return token.ent_iob_ 216 | 217 | 218 | def ent_label_ontonotes(token: Token) -> str: 219 | if token.ent_iob_ in "BI": 220 | return token.ent_iob_ + "-" + ENE_ONTONOTES_MAPPING.get(token.ent_type_, "OTHERS") 221 | else: 222 | return token.ent_iob_ 223 | 224 | 225 | # token field getters for Doc.user_data 226 | 227 | def reading_form(token: Token, use_orth_if_none: bool) -> str: 228 | reading = token.morph.get("Reading") 229 | if reading: 230 | return reading[0] 231 | elif use_orth_if_none: 232 | return token.orth_ 233 | else: 234 | return None 235 | 236 | 237 | def inflection(token: Token) -> str: 238 | inf = token.morph.get("Inflection") 239 | if inf: 240 | return inf[0].replace(";", ",") 241 | else: 242 | return "" 243 | 244 | 245 | # bunsetu related field getters for Doc.user_data 246 | 247 | def bunsetu_bi_label(token: Token): 248 | return bunsetu_bi_labels(token.doc)[token.i] 249 | 250 | 251 | def bunsetu_position_type(token: Token): 252 | return bunsetu_position_types(token.doc)[token.i] 253 | 254 | 255 | def is_bunsetu_head(token: Token): 256 | return token.i in token.doc.user_data["bunsetu_heads"] 257 | 258 | 259 | SEP = "+" 260 | 261 | 262 | def default_join_func(elements): 263 | return SEP.join([element if isinstance(element, str) else str(element) for element in elements]) 264 | 265 | 266 | T = TypeVar('T') 267 | U = TypeVar('U') 268 | V = TypeVar('V') 269 | 270 | 271 | # curried function: ex. traverse(children, lemma_)(token) 272 | @singledispatch 273 | def traverse( 274 | traverse_func: Callable[[Token], Iterable[Token]], 275 | element_func: Callable[[Token], T] = lambda token: token, 276 | condition_func: Callable[[Token], bool] = lambda token: True, 277 | join_func: Callable[[Iterable[T]], U] = lambda lst: lst, 278 | ) -> Callable[[Union[Token, Span]], U]: 279 | return lambda token: join_func([ 280 | element_func(t) for t in traverse_func(token) if condition_func(t) 281 | ]) 282 | 283 | 284 | # overload: ex. 
traverse(token, children, lemma_) 285 | @traverse.register(Token) 286 | def _traverse( 287 | token: Token, 288 | traverse_func: Callable[[Token], Iterable[Token]], 289 | element_func: Callable[[Token], T] = lambda token: token, 290 | condition_func: Callable[[Token], bool] = lambda token: True, 291 | join_func: Callable[[Iterable[T]], U] = lambda lst: lst, 292 | ) -> U: 293 | return traverse(traverse_func, element_func, condition_func, join_func)(token) 294 | 295 | 296 | def head(token: Token) -> Token: 297 | return token.head 298 | 299 | 300 | def ancestors(token: Token) -> Iterable[Token]: 301 | return token.ancestors 302 | 303 | 304 | def conjuncts(token: Token) -> Tuple[Token]: 305 | return token.conjuncts 306 | 307 | 308 | def children(token: Token) -> Iterable[Token]: 309 | return token.children 310 | 311 | 312 | def lefts(token: Token) -> Iterable[Token]: 313 | return token.lefts 314 | 315 | 316 | def rights(token: Token) -> Iterable[Token]: 317 | return token.rights 318 | 319 | 320 | def subtree(token: Token) -> Iterable[Token]: 321 | return token.subtree 322 | 323 | 324 | # curried function: ex. bunsetu(lemma_)(token) 325 | @singledispatch 326 | def bunsetu( 327 | element_func: Callable[[Token], T] = lambda token: token, 328 | condition_func: Callable[[Token], bool] = lambda token: True, 329 | join_func: Callable[[Iterable[T]], U] = default_join_func, 330 | ) -> Callable[[Token], U]: 331 | return traverse(bunsetu_span, element_func, condition_func, join_func) 332 | 333 | 334 | # overload: ex. bunsetu(token, lemma_) 335 | @bunsetu.register(Token) 336 | def _bunsetu( 337 | token: Token, 338 | element_func: Callable[[Token], T] = lambda token: token, 339 | condition_func: Callable[[Token], bool] = lambda token: True, 340 | join_func: Callable[[Iterable[T]], U] = default_join_func, 341 | ) -> U: 342 | return traverse(bunsetu_span, element_func, condition_func, join_func)(token) 343 | 344 | 345 | # curried function: ex. phrase(lemma_)(token) 346 | @singledispatch 347 | def phrase( 348 | element_func: Callable[[Token], T] = lambda token: token, 349 | condition_func: Callable[[Token], bool] = lambda token: True, 350 | join_func: Callable[[Iterable[T]], U] = default_join_func, 351 | ) -> Callable[[Token], U]: 352 | return traverse(bunsetu_phrase_span, element_func, condition_func, join_func) 353 | 354 | 355 | # overload: ex. phrase(token) 356 | @phrase.register(Token) 357 | def _phrase( 358 | token: Token, 359 | element_func: Callable[[Token], T] = lambda token: token, 360 | condition_func: Callable[[Token], bool] = lambda token: True, 361 | join_func: Callable[[Iterable[T]], U] = default_join_func, 362 | ) -> U: 363 | return traverse(bunsetu_phrase_span, element_func, condition_func, join_func)(token) 364 | 365 | 366 | # curried function: ex. sub_phrases(lemma_)(token) 367 | @singledispatch 368 | def sub_phrases( 369 | phrase_func: Callable[[Token], U] = _phrase, 370 | condition_func: Callable[[Token], bool] = lambda token: True, 371 | ) -> Callable[[Token], Iterable[Tuple[str, U]]]: 372 | return lambda token: _sub_phrases( 373 | token, 374 | phrase_func, 375 | condition_func, 376 | ) 377 | 378 | 379 | # overload: ex. 
sub_phrases(token, lemma_) 380 | @sub_phrases.register(Token) 381 | def _sub_phrases( 382 | token: Token, 383 | phrase_func: Callable[[Token], U] = _phrase, 384 | condition_func: Callable[[Token], bool] = lambda token: True, 385 | ) -> Iterable[Tuple[str, U]]: 386 | return [ 387 | ( 388 | t.dep_, 389 | phrase_func(t), 390 | ) for t in bunsetu_span(token).root.children if t.i in bunsetu_head_list(token.doc) and condition_func(t) 391 | ] 392 | 393 | 394 | # curried function: ex. phrases(lemma_)(sent) 395 | @singledispatch 396 | def phrases( 397 | phrase_func: Callable[[Token], U] = _phrase, 398 | condition_func: Callable[[Token], bool] = lambda token: True, 399 | ) -> Callable[[Span], Iterable[U]]: 400 | return lambda sent: _phrases_span( 401 | sent, 402 | phrase_func, 403 | condition_func, 404 | ) if isinstance(sent, Span) else _phrases_doc( 405 | sent, 406 | phrase_func, 407 | condition_func, 408 | ) 409 | 410 | 411 | # overload: ex. phrases(sent, lemma_) 412 | @phrases.register(Span) 413 | def _phrases_span( 414 | sent: Span, 415 | phrase_func: Callable[[Token], U] = _phrase, 416 | condition_func: Callable[[Token], bool] = lambda token: True, 417 | ) -> Iterable[U]: 418 | return [ 419 | phrase_func(t) for t in bunsetu_head_tokens(sent) if condition_func(t) 420 | ] 421 | 422 | 423 | # overload: ex. phrases(doc, lemma_) 424 | @phrases.register(Doc) 425 | def _phrases_doc( 426 | doc: Doc, 427 | phrase_func: Callable[[Token], U] = _phrase, 428 | condition_func: Callable[[Token], bool] = lambda token: True, 429 | ) -> Iterable[U]: 430 | return [ 431 | phrase_func(t) for t in bunsetu_head_tokens(doc[:]) if condition_func(t) 432 | ] 433 | 434 | 435 | # curried function: ex. sub_tokens("B", lambda sub_token: sub_token.lemma)(token) 436 | @singledispatch 437 | def sub_tokens( 438 | mode: str = "A", # "A" or "B" 439 | sub_token_func: Callable[[DetailedToken], T] = lambda sub_token: sub_token, 440 | join_func: Callable[[Iterable[T]], U] = default_join_func, 441 | ) -> Callable[[Token], U]: 442 | return lambda token: _sub_tokens(token, mode, sub_token_func, join_func) 443 | 444 | 445 | # overload: ex. 
sub_tokens(token, "B", lambda sub_token: sub_token.lemma) 446 | @sub_tokens.register(Token) 447 | def _sub_tokens( 448 | token: Token, 449 | mode: str = "A", # "A" or "B" 450 | sub_token_func: Callable[[DetailedToken], T] = lambda sub_token: sub_token.surface, 451 | join_func: Callable[[Iterable[T]], U] = default_join_func, 452 | ) -> U: 453 | if token.doc.user_data["sub_tokens"][token.i]: 454 | elements = token.doc.user_data["sub_tokens"][token.i][{"A": 0, "B": 1}[mode]] 455 | else: 456 | elements = [ 457 | DetailedToken( 458 | token.orth_, 459 | token.tag_, 460 | inflection(token), 461 | token.lemma_, 462 | reading_form(token, True), 463 | None, 464 | ) 465 | ] 466 | return join_func([ 467 | sub_token_func(element) for element in elements 468 | ]) 469 | -------------------------------------------------------------------------------- /ginza/__main__.py: -------------------------------------------------------------------------------- 1 | import plac 2 | 3 | from .command_line import run_ginza, run_ginzame 4 | 5 | 6 | def main_ginzame(): 7 | plac.call(run_ginzame) 8 | 9 | 10 | def main_ginza(): 11 | plac.call(run_ginza) 12 | 13 | 14 | if __name__ == "__main__": 15 | plac.call(run_ginza) 16 | -------------------------------------------------------------------------------- /ginza/analyzer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import sys 3 | from typing import Iterable, Optional 4 | 5 | import thinc 6 | 7 | import spacy 8 | from spacy.tokens import Doc, Span 9 | from spacy.language import Language 10 | from spacy.lang.ja import Japanese 11 | 12 | from . import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes, bunsetu_bi_label, bunsetu_position_type, clause_head_i 13 | from .bunsetu_recognizer import bunsetu_available, bunsetu_head_list, bunsetu_phrase_span 14 | 15 | 16 | def try_sudachi_import(split_mode: str): 17 | """SudachiPy is required for Japanese support, so check for it. 18 | It it's not available blow up and explain how to fix it. 19 | split_mode should be one of these values: "A", "B", "C", None->"A".""" 20 | try: 21 | from sudachipy import dictionary, tokenizer 22 | 23 | split_mode = { 24 | None: tokenizer.Tokenizer.SplitMode.A, 25 | "A": tokenizer.Tokenizer.SplitMode.A, 26 | "B": tokenizer.Tokenizer.SplitMode.B, 27 | "C": tokenizer.Tokenizer.SplitMode.C, 28 | }[split_mode] 29 | tok = dictionary.Dictionary().create(mode=split_mode) 30 | return tok 31 | except ImportError: 32 | raise ImportError( 33 | "Japanese support requires SudachiPy and SudachiDict-core " 34 | "(https://github.com/WorksApplications/SudachiPy). " 35 | "Install with `pip install sudachipy sudachidict_core` or " 36 | "install spaCy with `pip install spacy[ja]`." 
37 | ) from None 38 | 39 | 40 | class Analyzer: 41 | def __init__( 42 | self, 43 | model_name_or_path: str, 44 | split_mode: str, 45 | hash_comment: str, 46 | output_format: str, 47 | require_gpu: int, 48 | disable_sentencizer: bool, 49 | use_normalized_form: bool, 50 | ) -> None: 51 | self.model_name_or_path = model_name_or_path 52 | self.split_mode = split_mode 53 | self.hash_comment = hash_comment 54 | self.output_format = output_format 55 | self.require_gpu = require_gpu 56 | self.disable_sentencizer = disable_sentencizer 57 | self.use_normalized_form = use_normalized_form 58 | self.nlp: Optional[Language] = None 59 | 60 | def set_nlp(self) -> None: 61 | if self.nlp: 62 | return 63 | 64 | if self.require_gpu >= 0: 65 | thinc.api.require_gpu(self.require_gpu) 66 | 67 | if self.output_format in ["2", "mecab"]: 68 | nlp = try_sudachi_import(self.split_mode) 69 | else: 70 | # Work-around for pickle error. Need to share model data. 71 | if self.model_name_or_path: 72 | nlp = spacy.load(self.model_name_or_path) 73 | else: 74 | try: 75 | nlp = spacy.load("ja_ginza_electra") 76 | except IOError as e: 77 | try: 78 | nlp = spacy.load("ja_ginza") 79 | except IOError as e: 80 | try: 81 | nlp = spacy.load("ja_ginza_bert_large") 82 | except IOError as e: 83 | raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.') 84 | 85 | if self.disable_sentencizer: 86 | nlp.add_pipe("disable_sentencizer", before="parser") 87 | 88 | if self.split_mode: 89 | set_split_mode(nlp, self.split_mode) 90 | 91 | self.nlp = nlp 92 | self.use_orth_if_reading_is_none = isinstance(self.nlp, Japanese) 93 | 94 | def analyze_batch(self, lines: Iterable[str]) -> str: 95 | self.set_nlp() 96 | if self.output_format in ["2", "mecab"]: 97 | return "".join(self.analyze_line(line) for line in lines) 98 | 99 | if self.hash_comment == "print": 100 | batch = list(self.nlp.pipe(line.rstrip("\n") for line in lines if not line.startswith("#"))) 101 | docs = [] 102 | index = 0 103 | for line in lines: 104 | if line.startswith("#"): 105 | docs.append(line) 106 | else: 107 | docs.append(batch[index]) 108 | index += 1 109 | else: 110 | lines = [line.rstrip("\n") for line in lines if self.hash_comment != "skip" or not line.startswith("#")] 111 | docs = self.nlp.pipe(lines) 112 | 113 | if self.output_format in ["3", "json"]: 114 | sep = ",\n" 115 | else: 116 | sep = "" 117 | return sep.join(format_doc(doc, self.output_format, self.use_normalized_form, self.use_orth_if_reading_is_none) if isinstance(doc, Doc) else doc for doc in docs) 118 | 119 | def analyze_line(self, input_line: str) -> str: 120 | line = input_line.rstrip("\n") 121 | if line.startswith("#"): 122 | if self.hash_comment == "print": 123 | return input_line 124 | elif self.hash_comment == "skip": 125 | return "" 126 | if line == "": 127 | return "\n" 128 | if self.output_format in ["2", "mecab"]: 129 | doc = self.nlp.tokenize(line) 130 | else: 131 | doc = self.nlp(line) 132 | return format_doc(doc, self.output_format, self.use_normalized_form, self.use_orth_if_reading_is_none) 133 | 134 | 135 | def format_doc( 136 | doc: Doc, output_format: str, use_normalized_form: bool, use_orth_if_reading_is_none: bool, 137 | ) -> str: 138 | if output_format in ["0", "conllu"]: 139 | return "".join(format_conllu(sent, use_normalized_form, use_orth_if_reading_is_none) for sent in doc.sents) 140 | elif output_format in ["1", "cabocha"]: 141 | return "".join(format_cabocha(sent, use_normalized_form) for sent 
in doc.sents) 142 | elif output_format in ["2", "mecab"]: 143 | return "".join(format_mecab(doc, use_normalized_form)) 144 | elif output_format in ["3", "json"]: 145 | return ",\n".join(format_json(sent) for sent in doc.sents) 146 | else: 147 | raise Exception(output_format + " is not supported") 148 | 149 | 150 | def format_json(sent: Span) -> str: 151 | token_lines = ",\n".join( 152 | f""" {{"id":{ 153 | token.i - sent.start + 1 154 | },"orth":"{ 155 | token.orth_ 156 | }","tag":"{ 157 | token.tag_ 158 | }","pos":"{ 159 | token.pos_ 160 | }","lemma":"{ 161 | token.lemma_ 162 | }","norm":"{ 163 | token.norm_ 164 | }","head":{ 165 | token.head.i - token.i 166 | },"dep":"{ 167 | token.dep_ 168 | }","ner":"{ 169 | token.ent_iob_ 170 | }{ 171 | "-" + token.ent_type_ if token.ent_type_ else "" 172 | }"{ 173 | ',"whitespacce":"' + token.whitespace_ + '"' if token.whitespace_ else "" 174 | }}}""" for token in sent 175 | ) 176 | return f""" {{ 177 | "paragraphs": [ 178 | {{ 179 | "raw": "{sent.text}", 180 | "sentences": [ 181 | {{ 182 | "tokens": [ 183 | {token_lines} 184 | ] 185 | }} 186 | ] 187 | }} 188 | ] 189 | }}""" 190 | 191 | 192 | def format_conllu(sent: Span, use_normalized_form, use_orth_if_reading_is_none, print_origin=True) -> str: 193 | np_labels = [""] * len(sent) 194 | use_bunsetu = bunsetu_available(sent) 195 | if use_bunsetu: 196 | for head_i in bunsetu_head_list(sent): 197 | bunsetu_head_token = sent[head_i] 198 | phrase = bunsetu_phrase_span(bunsetu_head_token) 199 | if phrase.label_ == "NP": 200 | for idx in range(phrase.start - sent.start, phrase.end - sent.start): 201 | np_labels[idx] = "NP_B" if idx == phrase.start else "NP_I" 202 | token_lines = "".join(conllu_token_line(sent, token, np_label, use_bunsetu, use_normalized_form, use_orth_if_reading_is_none) for token, np_label in zip(sent, np_labels)) 203 | if print_origin: 204 | return f"# text = {sent.text}\n{token_lines}\n" 205 | else: 206 | return f"{token_lines}\n" 207 | 208 | 209 | def conllu_token_line(sent, token, np_label, use_bunsetu, use_normalized_form, use_orth_if_reading_is_none) -> str: 210 | bunsetu_bi = bunsetu_bi_label(token) if use_bunsetu else None 211 | position_type = bunsetu_position_type(token) if use_bunsetu else None 212 | inf = inflection(token) 213 | reading = reading_form(token, use_orth_if_reading_is_none) 214 | ne = ent_label_ontonotes(token) 215 | ene = ent_label_ene(token) 216 | clause_head = clause_head_i(token) + 1 217 | misc = "|".join( 218 | filter( 219 | lambda s: s, 220 | ( 221 | "SpaceAfter=Yes" if token.whitespace_ else "SpaceAfter=No", 222 | "" if not bunsetu_bi else f"BunsetuBILabel={bunsetu_bi}", 223 | "" if not position_type else f"BunsetuPositionType={position_type}", 224 | np_label, 225 | "" if not inf else f"Inf={inf}", 226 | "" if not reading else "Reading={}".format(reading.replace("|", "\\|").replace("\\", "\\\\")), 227 | "" if not ne or ne == "O" else f"NE={ne}", 228 | "" if not ene or ene == "O" else f"ENE={ene}", 229 | "" if not clause_head else f"ClauseHead={clause_head}", 230 | ) 231 | ) 232 | ) 233 | 234 | return "\t".join( 235 | [ 236 | str(token.i - sent.start + 1), 237 | token.orth_, 238 | token.norm_ if use_normalized_form else token.lemma_, 239 | token.pos_, 240 | token.tag_.replace(",*", "").replace(",", "-"), 241 | "NumType=Card" if token.pos_ == "NUM" else "_", 242 | "0" if token.head.i == token.i else str(token.head.i - sent.start + 1), 243 | token.dep_.lower() if token.dep_ else "_", 244 | "_", 245 | misc if misc else "_", 246 | ] 247 | ) + "\n" 248 | 249 | 
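# Editor's note: the following helper is an illustrative sketch added alongside this review
# and is not part of the original analyzer.py; the name parse_conllu_misc is hypothetical.
# conllu_token_line() above assembles the CoNLL-U MISC column by joining "key=value" pairs
# and bare flags with "|" (e.g. "SpaceAfter=No|BunsetuBILabel=B|NP_B|Reading=ギンザ|NE=B-GPE").
# A minimal reader for that column could look like this; it does not handle the escaped
# "\|" sequences that Reading values may contain.
def parse_conllu_misc(misc: str) -> dict:
    """Split a CoNLL-U MISC field into a dict; bare flags such as NP_B map to True."""
    if misc == "_":
        return {}
    fields = {}
    for item in misc.split("|"):
        if "=" in item:
            key, value = item.split("=", 1)
            fields[key] = value
        else:
            fields[item] = True  # positional flags like NP_B / NP_I
    return fields
# Hypothetical usage: parse_conllu_misc("SpaceAfter=No|NP_B|Reading=ギンザ")
# returns {"SpaceAfter": "No", "NP_B": True, "Reading": "ギンザ"}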
250 | def format_cabocha(sent: Span, use_normalized_form) -> str: 251 | bunsetu_index_list = {} 252 | bunsetu_index = -1 253 | for token in sent: 254 | if bunsetu_bi_label(token) == "B": 255 | bunsetu_index += 1 256 | bunsetu_index_list[token.i] = bunsetu_index 257 | 258 | lines = [] 259 | for token in sent: 260 | if bunsetu_bi_label(token) == "B": 261 | lines.append(cabocha_bunsetu_line(sent, bunsetu_index_list, token)) 262 | lines.append(cabocha_token_line(token, use_normalized_form)) 263 | lines.append("EOS\n\n") 264 | return "".join(lines) 265 | 266 | 267 | def cabocha_bunsetu_line(sent: Span, bunsetu_index_list, token) -> str: 268 | bunsetu_head_index = None 269 | bunsetu_dep_index = None 270 | bunsetu_func_index = None 271 | dep_type = "D" 272 | for t in token.doc[token.i : sent.end]: 273 | if bunsetu_index_list[t.i] != bunsetu_index_list[token.i]: 274 | if bunsetu_func_index is None: 275 | bunsetu_func_index = t.i - token.i 276 | break 277 | tbi = bunsetu_index_list[t.head.i] 278 | if bunsetu_index_list[t.i] != tbi: 279 | bunsetu_head_index = t.i - token.i 280 | bunsetu_dep_index = tbi 281 | if bunsetu_func_index is None and bunsetu_position_type(t) in {"FUNC", "SYN_HEAD"}: 282 | bunsetu_func_index = t.i - token.i 283 | else: 284 | if bunsetu_func_index is None: 285 | bunsetu_func_index = len(sent) - token.i 286 | if bunsetu_head_index is None: 287 | bunsetu_head_index = 0 288 | if bunsetu_dep_index is None: 289 | bunsetu_dep_index = -1 290 | return "* {} {}{} {}/{} 0.000000\n".format( 291 | bunsetu_index_list[token.i], 292 | bunsetu_dep_index, 293 | dep_type, 294 | bunsetu_head_index, 295 | bunsetu_func_index, 296 | ) 297 | 298 | 299 | def cabocha_token_line(token, use_normalized_form) -> str: 300 | part_of_speech = token.tag_.replace("-", ",") 301 | inf = inflection(token) 302 | part_of_speech += ",*" * (3 - part_of_speech.count(",")) + "," + (inf if inf else "*,*") 303 | reading = reading_form(token, True) 304 | return "{}\t{},{},{},{}\t{}\n".format( 305 | token.orth_, 306 | part_of_speech, 307 | token.norm_ if use_normalized_form else token.lemma_, 308 | reading if reading else token.orth_, 309 | "*", 310 | "O" if token.ent_iob_ == "O" else "{}-{}".format(token.ent_iob_, token.ent_type_), 311 | ) 312 | 313 | 314 | def format_mecab(sudachipy_tokens, use_normalized_form) -> str: 315 | return "".join(mecab_token_line(t, use_normalized_form) for t in sudachipy_tokens) + "EOS\n\n" 316 | 317 | 318 | def mecab_token_line(token, use_normalized_form) -> str: 319 | reading = token.reading_form() 320 | return "{}\t{},{},{},{}\n".format( 321 | token.surface(), 322 | ",".join(token.part_of_speech()), 323 | token.normalized_form() if use_normalized_form else token.dictionary_form(), 324 | reading if reading else token.surface(), 325 | "*", 326 | ) 327 | -------------------------------------------------------------------------------- /ginza/bunsetu_recognizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, Iterable, List, Optional, Set 3 | 4 | from spacy.language import Language 5 | from spacy.tokens import Doc, Span, Token 6 | 7 | __all__ = [ 8 | "bunsetu_available", 9 | "bunsetu_span", 10 | "bunsetu_spans", 11 | "bunsetu_phrase_span", 12 | "bunsetu_phrase_spans", 13 | "bunsetu_head_list", 14 | "bunsetu_head_tokens", 15 | "bunsetu_bi_labels", 16 | "bunsetu_position_types", 17 | "BunsetuRecognizer", 18 | "append_bunsetu_head_dep_suffix", 19 | "BUNSETU_HEAD_SUFFIX", 20 | "PHRASE_RELATIONS", 21 | "POS_PHRASE_MAP", 22 
| "clauses", 23 | "clause_head", 24 | "clause_head_i", 25 | "CLAUSE_MARKER_RULES", 26 | "MIN_BUNSETU_NUM_IN_CLAUSE", 27 | ] 28 | 29 | 30 | BUNSETU_HEAD_SUFFIX = "_bunsetu" 31 | 32 | PHRASE_RELATIONS = ("compound", "nummod", "nmod") 33 | 34 | POS_PHRASE_MAP = { 35 | "NOUN": "NP", 36 | "NUM": "NP", 37 | "PRON": "NP", 38 | "PROPN": "NP", 39 | 40 | "VERB": "VP", 41 | 42 | "ADJ": "ADJP", 43 | 44 | "ADV": "ADVP", 45 | 46 | "CCONJ": "CCONJP", 47 | } 48 | 49 | CLAUSE_MARKER_RULES = [ 50 | { 51 | "tag_": "補助記号-読点", 52 | }, 53 | ] 54 | 55 | MIN_BUNSETU_NUM_IN_CLAUSE = 2 56 | 57 | 58 | def bunsetu_available(span: Span): 59 | return "bunsetu_heads" in span.doc.user_data 60 | 61 | 62 | def bunsetu_head_list(span: Span) -> Iterable[int]: 63 | doc = span.doc 64 | heads = doc.user_data["bunsetu_heads"] 65 | if isinstance(span, Doc): 66 | return heads 67 | else: 68 | start = span.start 69 | end = span.end 70 | return [i - start for i in heads if start <= i < end] 71 | 72 | 73 | def bunsetu_head_tokens(span: Span) -> Iterable[Token]: 74 | doc = span.doc 75 | heads = doc.user_data["bunsetu_heads"] 76 | if isinstance(span, Doc): 77 | start = 0 78 | end = len(span) 79 | else: 80 | start = span.start 81 | end = span.end 82 | return [span[i - start] for i in heads if start <= i < end] 83 | 84 | 85 | def bunsetu_spans(span: Span) -> Iterable[Span]: 86 | return [ 87 | bunsetu_span(head) for head in bunsetu_head_tokens(span) 88 | ] 89 | 90 | 91 | def bunsetu_span(token: Token) -> Span: 92 | bunsetu_bi_list = bunsetu_bi_labels(token.doc) 93 | start = token.i 94 | end = start + 1 95 | for idx in range(start, 0, -1): 96 | if bunsetu_bi_list[idx] == "B" or token.doc[idx].is_sent_start: 97 | start = idx 98 | break 99 | else: 100 | start = 0 101 | doc_len = len(token.doc) 102 | for idx in range(end, doc_len): 103 | if bunsetu_bi_list[idx] == "B": 104 | end = idx 105 | break 106 | else: 107 | end = doc_len 108 | 109 | doc = token.doc 110 | return Span(doc, start=start, end=end, label=POS_PHRASE_MAP.get(doc[start:end].root.pos_, "")) 111 | 112 | 113 | def bunsetu_phrase_spans(span: Span, phrase_relations: Iterable[str] = PHRASE_RELATIONS) -> Iterable[Span]: 114 | return [ 115 | bunsetu_phrase_span(head, phrase_relations) for head in bunsetu_head_tokens(span) 116 | ] 117 | 118 | 119 | def bunsetu_phrase_span(token: Token, phrase_relations: Iterable[str] = PHRASE_RELATIONS) -> Span: 120 | def _traverse(head, _bunsetu, result): 121 | for t in head.children: 122 | if _bunsetu.start <= t.i < _bunsetu.end: 123 | if t.dep_ in phrase_relations: 124 | _traverse(t, _bunsetu, result) 125 | result.append(head.i) 126 | bunsetu = bunsetu_span(token) 127 | phrase_tokens = [] 128 | _traverse(bunsetu.root, bunsetu, phrase_tokens) 129 | start = min(phrase_tokens) 130 | end = max(phrase_tokens) + 1 131 | return Span(token.doc, start=start, end=end, label=bunsetu.label_) 132 | 133 | 134 | def bunsetu_bi_labels(span: Span) -> List[str]: 135 | doc = span.doc 136 | bunsetu_bi = doc.user_data["bunsetu_bi_labels"] 137 | if isinstance(span, Doc): 138 | return bunsetu_bi 139 | else: 140 | start = span.start 141 | end = span.end 142 | return bunsetu_bi[start:end] 143 | 144 | 145 | def bunsetu_position_types(span: Span) -> List[str]: 146 | doc = span.doc 147 | position_types = doc.user_data["bunsetu_position_types"] 148 | if isinstance(span, Doc): 149 | return position_types 150 | else: 151 | start = span.start 152 | end = span.end 153 | return position_types[start:end] 154 | 155 | 156 | def clauses(doc: Doc) -> List[Token]: 157 | clauses = 
doc.user_data["clauses"] 158 | return [[doc[token] for token in tokens] for tokens in clauses.values()] 159 | 160 | 161 | def clause_head(token: Token) -> Token: 162 | return token.doc[token.doc.user_data["clause_heads"][token.i]] 163 | 164 | 165 | def clause_head_i(token: Token) -> int: 166 | doc = token.doc 167 | return doc.user_data["clause_heads"][token.i] - token.sent.start 168 | 169 | 170 | class BunsetuRecognizer: 171 | def __init__( 172 | self, 173 | nlp: Language, 174 | remain_bunsetu_suffix: bool = False, 175 | clause_marker_rules: List[Dict[str, str]] = CLAUSE_MARKER_RULES, 176 | min_bunsetu_num_in_clause: int = MIN_BUNSETU_NUM_IN_CLAUSE, 177 | ) -> None: 178 | self.nlp = nlp 179 | self._remain_bunsetu_suffix = remain_bunsetu_suffix 180 | self._clause_marker_rules = [{k: re.compile(v) for k, v in rule.items()} for rule in clause_marker_rules] 181 | self._min_bunsetu_num_in_clause = min_bunsetu_num_in_clause 182 | 183 | @property 184 | def remain_bunsetu_suffix(self) -> str: 185 | return self._remain_bunsetu_suffix 186 | 187 | @remain_bunsetu_suffix.setter 188 | def remain_bunsetu_suffix(self, remain: bool): 189 | self._remain_bunsetu_suffix = remain 190 | 191 | @property 192 | def clause_marker_rules(self) -> List[Dict[str, str]]: 193 | return [{k: v.pattern for k, v in rules.items()} for rules in self._clause_marker_rules] 194 | 195 | @clause_marker_rules.setter 196 | def clause_marker_rules(self, _clause_marker_rules: List[Dict[str, str]]): 197 | self._clause_markers = [{k: re.compile(v) for k, v in rules} for rules in _clause_marker_rules] 198 | 199 | @property 200 | def min_bunsetu_num_in_clause(self) -> int: 201 | return self._min_bunsetu_num_in_clause 202 | 203 | @min_bunsetu_num_in_clause.setter 204 | def min_bunsetu_num_in_clause(self, _min_bunsetu_num_in_clause: int): 205 | self._min_bunsetu_num_in_clause = _min_bunsetu_num_in_clause 206 | 207 | def __call__(self, doc: Doc) -> Doc: 208 | debug = False 209 | heads = [False] * len(doc) 210 | for t in doc: 211 | if t.dep_ == "ROOT": 212 | heads[t.i] = True 213 | elif t.dep_.endswith(BUNSETU_HEAD_SUFFIX): 214 | heads[t.i] = True 215 | if not self._remain_bunsetu_suffix: 216 | t.dep_ = t.dep_[:-len(BUNSETU_HEAD_SUFFIX)] 217 | for t in doc: # recovering uncovered subtrees 218 | if heads[t.i]: 219 | while t.head.i < t.i and not heads[t.head.i]: 220 | heads[t.head.i] = t.head.pos_ not in {"PUNCT"} 221 | if debug and heads[t.head.i]: 222 | print("========= A", t.i + 1, t.orth_, "=========") 223 | print(list((t.i + 1, t.orth_, t.head.i + 1) for t, is_head in zip(doc, heads) if is_head)) 224 | t = t.head 225 | heads[t.head.i] = True 226 | 227 | for ent in doc.ents: # removing head inside ents 228 | head = None 229 | outer = None 230 | for t in ent: 231 | if t.head.i == t.i or t.head.i < ent.start or ent.end <= t.head.i: 232 | if not outer: 233 | head = t 234 | outer = t.head 235 | elif outer.i != t.head.i: 236 | break 237 | else: 238 | if head: 239 | for t in ent: 240 | if t.i != head.i: 241 | heads[t.i] = False 242 | 243 | bunsetu_heads = tuple(idx for idx, is_head in enumerate(heads) if is_head) 244 | 245 | bunsetu_bi = ["I"] * len(doc) 246 | if bunsetu_bi: 247 | bunsetu_bi[0] = "B" 248 | for head_i, next_head_i in zip(bunsetu_heads[:-1], bunsetu_heads[1:]): 249 | l_head = doc[head_i] 250 | r_head = doc[next_head_i] 251 | if l_head.right_edge.i + 1 == r_head.left_edge.i or l_head.right_edge.i >= r_head.i: # (l)(r) or (l (r)) 252 | bunsetu_bi[r_head.left_edge.i] = "B" 253 | elif l_head.i <= r_head.left_edge.i: # ((l) r) 254 | 
bunsetu_bi[l_head.right_edge.i + 1] = "B" 255 | else: # ((l) (missed_tokens) (r)) 256 | l_ancestors = set(t.i for t in l_head.ancestors) 257 | r_ancestors = set(t.i for t in r_head.ancestors) 258 | for m in doc[l_head.right_edge.i + 1: r_head.left_edge.i]: # find closer branch 259 | found = False 260 | for m_ancestor in [m] + list(m.ancestors): 261 | if m_ancestor.i in r_ancestors: 262 | bunsetu_bi[m_ancestor.i] = "B" 263 | found = True 264 | break 265 | elif m_ancestor.i in l_ancestors: 266 | break 267 | if found: 268 | break 269 | else: 270 | bunsetu_bi[l_head.right_edge.i + 1] = "B" 271 | 272 | doc.user_data["bunsetu_heads"] = bunsetu_heads 273 | doc.user_data["bunsetu_bi_labels"] = bunsetu_bi 274 | 275 | position_types = [None] * len(doc) 276 | for head in bunsetu_heads: 277 | phrase = bunsetu_phrase_span(doc[head]) 278 | for t in phrase: 279 | if t.i == t.head.i: 280 | position_types[t.i] = "ROOT" 281 | elif t.i == head: 282 | position_types[t.i] = "NO_HEAD" if t.dep_ == "punct" else "SEM_HEAD" 283 | else: 284 | position_types[t.i] = "CONT" 285 | first_func = True 286 | for t, bi, position_type in reversed(list(zip(doc, bunsetu_bi, position_types))): 287 | if bi: 288 | first_func = True 289 | if position_type is None: 290 | if t.pos_ in {'AUX', 'ADP', 'SCONJ', 'CCONJ', 'PART'}: 291 | if first_func: 292 | position_types[t.i] = "SYN_HEAD" 293 | first_func = False 294 | else: 295 | position_types[t.i] = "FUNC" 296 | else: 297 | position_types[t.i] = "CONT" 298 | doc.user_data["bunsetu_position_types"] = position_types 299 | 300 | bunsetu_heads_set = set(bunsetu_heads) 301 | clause_head_candidates = set() 302 | roots = set() 303 | for t in doc: 304 | for rule in self._clause_marker_rules: 305 | if t.dep_.lower() == "root": 306 | roots.add(t.i) 307 | continue 308 | for attr, pattern in rule.items(): 309 | if not pattern.fullmatch(getattr(t, attr)): 310 | break 311 | else: 312 | if t.i in bunsetu_heads_set: 313 | clause_head_candidates.add(t.i) 314 | else: 315 | for ancestor in t.ancestors: 316 | if ancestor.i in bunsetu_heads_set: 317 | clause_head_candidates.add(t.head.i) 318 | break 319 | break 320 | clause_head_candidates -= roots 321 | 322 | for clause_head in list(sorted(clause_head_candidates)): 323 | subtree = set(_.i for _ in doc[clause_head].subtree) 324 | if len(subtree & bunsetu_heads_set) < self._min_bunsetu_num_in_clause: 325 | clause_head_candidates.remove(clause_head) 326 | 327 | clause_head_candidates |= roots 328 | for clause_head in list(sorted(clause_head_candidates)): 329 | subtree = set(_.i for _ in doc[clause_head].subtree) 330 | subtree_bunsetu = subtree & bunsetu_heads_set 331 | descendant_clauses = subtree & clause_head_candidates - {clause_head} 332 | for subclause in descendant_clauses: 333 | subtree_bunsetu -= set(_.i for _ in doc[subclause].subtree) 334 | if len(subtree_bunsetu) < self._min_bunsetu_num_in_clause: 335 | if clause_head in roots: 336 | clause_head_candidates -= descendant_clauses 337 | else: 338 | clause_head_candidates.remove(clause_head) 339 | 340 | clause_heads = list(sorted(clause_head_candidates)) 341 | 342 | def _children_except_clause_heads(idx): 343 | children = [] 344 | for t in doc[idx].lefts: 345 | if t.i in clause_heads: 346 | continue 347 | children += _children_except_clause_heads(t.i) 348 | children.append(idx) 349 | for t in doc[idx].rights: 350 | if t.i in clause_heads: 351 | continue 352 | children += _children_except_clause_heads(t.i) 353 | return children 354 | 355 | clauses = {head: _children_except_clause_heads(head) for head 
in clause_heads} 356 | doc.user_data["clauses"] = clauses 357 | clause_heads = [-1] * len(doc) 358 | for head, tokens in clauses.items(): 359 | for token in tokens: 360 | clause_heads[token] = head 361 | doc.user_data["clause_heads"] = clause_heads 362 | return doc 363 | 364 | 365 | def append_bunsetu_head_dep_suffix(tokens: List[Token], suffix: str = BUNSETU_HEAD_SUFFIX) -> None: 366 | if not suffix: 367 | return 368 | for token in tokens: 369 | if token.dep_.lower() == "root": 370 | return 371 | if token.head.i < tokens[0].i or tokens[-1].i < token.head.i: 372 | token.dep_ += suffix 373 | return 374 | -------------------------------------------------------------------------------- /ginza/command_line.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from multiprocessing import Process, Queue, Event, cpu_count 3 | from pathlib import Path 4 | import queue 5 | import re 6 | import sys 7 | import traceback 8 | from typing import Generator, Iterable, Optional, List 9 | 10 | import plac 11 | from .analyzer import Analyzer 12 | 13 | MINI_BATCH_SIZE = 100 14 | GINZA_MODEL_PATTERN = re.compile(r"^(ja_ginza|ja_ginza_electra)$") 15 | SPACY_MODEL_PATTERN = re.compile(r"^[a-z]{2}[-_].+[-_].+(sm|md|lg|trf)$") 16 | 17 | 18 | class _OutputWrapper: 19 | def __init__(self, output_path, output_format): 20 | self.output = None 21 | self.output_path = output_path 22 | self.output_format = output_format 23 | self.output_json_opened = False 24 | 25 | @property 26 | def is_json(self): 27 | return self.output_format in ["3", "json"] 28 | 29 | def open(self): 30 | if self.output_path: 31 | self.output = open(self.output_path, "w", encoding="utf-8") 32 | else: 33 | self.output = sys.stdout 34 | 35 | def close(self): 36 | if self.is_json and self.output_json_opened: 37 | print("\n]", file=self.output) 38 | self.output_json_opened = False 39 | if self.output_path: 40 | self.output.close() 41 | else: 42 | pass 43 | 44 | def write(self, result: str): 45 | if self.is_json: 46 | if not self.output_json_opened: 47 | print("[", file=self.output) 48 | self.output_json_opened = True 49 | else: 50 | print(",", file=self.output) 51 | print(result, end="", file=self.output) 52 | 53 | 54 | def run( 55 | model_path: Optional[str] = None, 56 | ensure_model: Optional[str] = None, 57 | split_mode: Optional[str] = None, 58 | hash_comment: str = "print", 59 | output_path: Optional[Path] = None, 60 | output_format: str = "0", 61 | require_gpu: int = -1, 62 | disable_sentencizer: bool = False, 63 | use_normalized_form: bool = False, 64 | parallel_level: int = 1, 65 | files: List[str] = None, 66 | ): 67 | if output_format in ["3", "json"] and hash_comment != "analyze": 68 | print( 69 | f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyze".', 70 | file=sys.stderr 71 | ) 72 | 73 | assert parallel_level == 1 or require_gpu == -1, "require_gpu not allowed for multi-processing. 
https://github.com/explosion/spaCy/issues/5507" 74 | 75 | if parallel_level <= 0: 76 | level = max(1, cpu_count() + parallel_level) 77 | if output_format in [2, "mecab"]: 78 | if require_gpu >= 0: 79 | print("GPU not used for mecab mode", file=sys.stderr) 80 | require_gpu = False 81 | elif parallel_level <= 0: 82 | if require_gpu >= 0: 83 | if level < 4: 84 | print(f"GPU #{require_gpu} enabled: parallel_level' set to {level}", end="", file=sys.stderr) 85 | else: 86 | print(f"GPU #{require_gpu} enabled: parallel_level' set to {level} but seems it's too much", end="", file=sys.stderr) 87 | else: 88 | print(f"'parallel_level' set to {level}", file=sys.stderr) 89 | elif require_gpu: 90 | print(f"GPU #{require_gpu} enabled", file=sys.stderr) 91 | parallel_level = level 92 | 93 | assert model_path is None or ensure_model is None 94 | if ensure_model: 95 | ensure_model = ensure_model.replace("-", "_") 96 | try: 97 | from importlib import import_module 98 | import_module(ensure_model) 99 | except ModuleNotFoundError: 100 | if GINZA_MODEL_PATTERN.match(ensure_model): 101 | print("Installing", ensure_model, file=sys.stderr) 102 | import pip 103 | pip.main(["install", ensure_model]) 104 | print("Successfully installed", ensure_model, file=sys.stderr) 105 | elif SPACY_MODEL_PATTERN.match(ensure_model): 106 | print("Installing", ensure_model, file=sys.stderr) 107 | from spacy.cli.download import download 108 | download(ensure_model) 109 | print("Successfully installed", ensure_model, file=sys.stderr) 110 | else: 111 | raise OSError("E050", f'You need to install "{ensure_model}" before executing ginza.') 112 | model_name_or_path = ensure_model 113 | else: 114 | model_name_or_path = model_path 115 | 116 | analyzer = Analyzer( 117 | model_name_or_path, 118 | split_mode, 119 | hash_comment, 120 | output_format, 121 | require_gpu, 122 | disable_sentencizer, 123 | use_normalized_form, 124 | ) 125 | 126 | output = _OutputWrapper(output_path, output_format) 127 | output.open() 128 | try: 129 | if not files and sys.stdin.isatty(): 130 | _analyze_tty(analyzer, output) 131 | else: 132 | if not files: 133 | files = [0] 134 | if parallel_level == 1: 135 | _analyze_single(analyzer, output, files) 136 | else: 137 | _analyze_parallel(analyzer, output, files, parallel_level) 138 | finally: 139 | output.close() 140 | 141 | 142 | def _analyze_tty(analyzer: Analyzer, output: _OutputWrapper) -> None: 143 | try: 144 | analyzer.set_nlp() 145 | while True: 146 | line = input() 147 | output.write(analyzer.analyze_line(line)) 148 | except EOFError: 149 | pass 150 | except KeyboardInterrupt: 151 | pass 152 | 153 | 154 | def _analyze_single(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str]) -> None: 155 | try: 156 | analyzer.set_nlp() 157 | batch = [] 158 | for path in files: 159 | with open(path, "r", encoding="utf-8") as f: 160 | for line in f: 161 | batch.append(line) 162 | if len(batch) < MINI_BATCH_SIZE: 163 | continue 164 | output.write(analyzer.analyze_batch(batch)) 165 | batch.clear() 166 | if batch: 167 | output.write(analyzer.analyze_batch(batch)) 168 | except KeyboardInterrupt: 169 | pass 170 | 171 | 172 | def _analyze_parallel(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str], parallel_level: int) -> None: 173 | try: 174 | in_queue = Queue(maxsize=parallel_level * 2) 175 | out_queue = Queue() 176 | 177 | p_analyzes = [] 178 | abort = Event() 179 | for _ in range(parallel_level): 180 | p = Process(target=_multi_process_analyze, args=(analyzer, in_queue, out_queue, abort), daemon=True) 181 
| p.start() 182 | p_analyzes.append(p) 183 | 184 | p_load = Process(target=_multi_process_load, args=(in_queue, files, MINI_BATCH_SIZE, parallel_level, abort), daemon=True) 185 | p_load.start() 186 | 187 | _main_process_write(out_queue, output, parallel_level, abort) 188 | 189 | except KeyboardInterrupt: 190 | abort.set() 191 | finally: 192 | for p in [p_load] + p_analyzes: 193 | try: 194 | p.join(timeout=1) 195 | except: 196 | if p.is_alive(): 197 | p.terminate() 198 | p.join() 199 | 200 | 201 | def _data_loader(files: List[str], batch_size: int) -> Generator[List[str], None, None]: 202 | mini_batch = [] 203 | for path in files: 204 | with open(path, "r", encoding="utf-8") as f: 205 | for line in f: 206 | mini_batch.append(line) 207 | if len(mini_batch) == batch_size: 208 | yield mini_batch 209 | mini_batch = [] 210 | if mini_batch: 211 | yield mini_batch 212 | 213 | 214 | def _multi_process_load(in_queue: Queue, files: List[str], batch_size: int, n_analyze_process: int, abort: Event): 215 | try: 216 | for i, mini_batch in enumerate(_data_loader(files, batch_size)): 217 | if abort.is_set(): 218 | break 219 | in_queue.put((i, mini_batch)) 220 | else: 221 | for _ in range(n_analyze_process): 222 | in_queue.put("terminate") 223 | except KeyboardInterrupt: 224 | pass 225 | except: 226 | traceback.print_exc() 227 | abort.set() 228 | 229 | 230 | def _multi_process_analyze(analyzer: Analyzer, in_queue: Queue, out_queue: Queue, abort: Event): 231 | i = None 232 | mini_batch = [] 233 | try: 234 | while True: 235 | if abort.is_set(): 236 | break 237 | try: 238 | msg = in_queue.get(timeout=0.1) 239 | except queue.Empty: 240 | continue 241 | if msg == "terminate": 242 | out_queue.put(("terminating", i, None)) 243 | break 244 | i, mini_batch = msg 245 | result = analyzer.analyze_batch(mini_batch) 246 | out_queue.put((None, i, result)) 247 | except KeyboardInterrupt: 248 | pass 249 | except Exception as err: 250 | out_queue.put(("Error: {}\n{}".format(err, "".join(mini_batch)), i, None)) 251 | traceback.print_exc() 252 | abort.set() 253 | 254 | 255 | def _main_process_write(out_queue: queue, output: _OutputWrapper, parallel_level: int, abort: Event): 256 | cur = 0 257 | results = dict() 258 | terminating = 0 259 | while True: 260 | if abort.is_set(): 261 | return 262 | try: 263 | msg, mini_batch_index, result = out_queue.get(timeout=0.1) 264 | except queue.Empty: 265 | continue 266 | 267 | if msg is not None: 268 | if msg == "terminating": 269 | terminating += 1 270 | if terminating == parallel_level: 271 | return 272 | continue 273 | else: 274 | print(f"Analysis failed in mini_batch #{mini_batch_index}. 
Stopping all the processes.", file=sys.stderr) 275 | print(msg, file=sys.stderr) 276 | return 277 | 278 | # output must be ordered same as input text 279 | results[mini_batch_index] = result 280 | while results: 281 | if cur not in results.keys(): 282 | break 283 | result = results[cur] 284 | del results[cur] 285 | cur += 1 286 | output.write(result) 287 | 288 | 289 | @plac.annotations( 290 | split_mode=("split mode", "option", "s", str, ["A", "B", "C"]), 291 | hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]), 292 | output_path=("output path", "option", "o", Path), 293 | parallel=("parallel level (default=-1, all_cpus=0)", "option", "p", int), 294 | files=("input files", "positional"), 295 | ) 296 | def run_ginzame( 297 | split_mode=None, 298 | hash_comment="print", 299 | output_path=None, 300 | parallel=-1, 301 | *files, 302 | ): 303 | run( 304 | model_path=None, 305 | ensure_model=None, 306 | split_mode=split_mode, 307 | hash_comment=hash_comment, 308 | output_path=output_path, 309 | output_format="mecab", 310 | require_gpu=-1, 311 | use_normalized_form=True, 312 | parallel_level=parallel, 313 | disable_sentencizer=False, 314 | files=files, 315 | ) 316 | 317 | 318 | def main_ginzame(): 319 | plac.call(run_ginzame) 320 | 321 | 322 | @plac.annotations( 323 | model_path=("model directory path", "option", "b", str), 324 | ensure_model=("select model package of ginza or spacy", "option", "m", str), 325 | split_mode=("split mode", "option", "s", str, ["A", "B", "C"]), 326 | hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]), 327 | output_path=("output path", "option", "o", Path), 328 | output_format=("output format", "option", "f", str, ["0", "conllu", "1", "cabocha", "2", "mecab", "3", "json"]), 329 | require_gpu=("enable require_gpu", "option", "g", int), 330 | use_normalized_form=("Use Token.norm_ instead of Token.lemma_", "flag", "n"), 331 | disable_sentencizer=("disable spaCy's sentence separator", "flag", "d"), 332 | parallel=("parallel level (default=1, all_cpus=0)", "option", "p", int), 333 | files=("input files", "positional"), 334 | ) 335 | def run_ginza( 336 | model_path=None, 337 | ensure_model=None, 338 | split_mode="C", 339 | hash_comment="print", 340 | output_path=None, 341 | output_format="conllu", 342 | require_gpu=-1, 343 | use_normalized_form=False, 344 | disable_sentencizer=False, 345 | parallel=1, 346 | *files, 347 | ): 348 | run( 349 | model_path=model_path, 350 | ensure_model=ensure_model, 351 | split_mode=split_mode, 352 | hash_comment=hash_comment, 353 | output_path=output_path, 354 | output_format=output_format, 355 | require_gpu=require_gpu, 356 | use_normalized_form=use_normalized_form, 357 | disable_sentencizer=disable_sentencizer, 358 | parallel_level=parallel, 359 | files=files, 360 | ) 361 | 362 | 363 | def main_ginza(): 364 | plac.call(run_ginza) 365 | 366 | 367 | if __name__ == "__main__": 368 | plac.call(run_ginza) 369 | -------------------------------------------------------------------------------- /ginza/compound_splitter.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | from collections import OrderedDict 3 | import re 4 | 5 | import srsly 6 | 7 | from spacy import util 8 | from spacy.language import Language 9 | from spacy.lang.ja import resolve_pos 10 | from spacy.tokens import Doc, MorphAnalysis 11 | 12 | __all__ = [ 13 | "CompoundSplitter", 14 | "tag_to_pos", 15 | ] 16 | 17 | 18 | TAG_DEP_MAP = { 19 | "ADJ": "amod", 20 | "ADP": 
"case", 21 | "NUM": "nummod", 22 | "PART": "mark", 23 | "PUNCT": "punct", 24 | } 25 | 26 | 27 | def tag_dep_map(tag): 28 | return TAG_DEP_MAP.get(tag, "compound") 29 | 30 | 31 | def tag_to_pos(sub_tokens, next_token_tag): 32 | pos_list = [] 33 | next_pos = None 34 | for t1, t2 in zip(sub_tokens[:-1], sub_tokens[1:]): 35 | if next_pos: 36 | pos = next_pos 37 | next_pos = None 38 | else: 39 | pos, next_pos = resolve_pos(t1.surface, t1.tag, t2.tag) 40 | pos_list.append(pos) 41 | if next_pos: 42 | pos = next_pos 43 | else: 44 | pos, next_pos = resolve_pos(sub_tokens[-1].surface, sub_tokens[-1].tag, next_token_tag) 45 | pos_list.append(pos) 46 | return pos_list 47 | 48 | 49 | def _replace_list_entries(lst, index, inserting_list): 50 | return lst[:index] + inserting_list + lst[index + 1:] 51 | 52 | 53 | class CompoundSplitter: 54 | def __init__(self, vocab, split_mode=None): 55 | self.vocab = vocab 56 | self.split_mode = split_mode 57 | 58 | def __call__(self, doc: Doc): 59 | if "sub_tokens" not in doc.user_data: 60 | return doc 61 | if self._split_mode is None: 62 | return doc 63 | elif self._split_mode == "C": 64 | del doc.user_data["sub_tokens"] 65 | return doc 66 | elif self._split_mode == "B": 67 | sub_tokens_index = 1 68 | elif self._split_mode == "A": 69 | sub_tokens_index = 0 70 | else: 71 | raise Exception("invalid split_mode: " + self._split_mode) 72 | 73 | sub_tokens_list = [ 74 | sub_tokens[sub_tokens_index] if sub_tokens else None for sub_tokens in doc.user_data["sub_tokens"] 75 | ] 76 | 77 | for token_i, sub_tokens in reversed(tuple(zip(range(len(doc)), sub_tokens_list))): 78 | token = doc[token_i] 79 | token_ent_type = token.ent_type 80 | 81 | # edit token.dep_ 82 | if token.head.i == token.i: 83 | dep = "ROOT" 84 | else: 85 | dep = token.dep_ 86 | 87 | compounds = dep in {"compound", "nummod", "punct"} 88 | 89 | # retokenize 90 | if sub_tokens_index is not None and sub_tokens: 91 | deps = [tag_dep_map(dtoken.tag) for dtoken in sub_tokens[:-1]] + [token.dep_] 92 | last = len(sub_tokens) - 1 93 | if token.head.i == token.i: 94 | heads = [(token, last) for _ in range(last + 1)] 95 | elif compounds: 96 | heads = [token.head for _ in range(len(sub_tokens))] 97 | else: 98 | heads = [(token, last) for _ in range(last)] + [token.head] 99 | surfaces = [dtoken.surface for dtoken in sub_tokens] 100 | def morph(dtoken): 101 | m = {} 102 | if dtoken.inf: 103 | m["Inflection"] = dtoken.inf 104 | if dtoken.reading: 105 | m["Reading"] = re.sub("[=|]", "_", dtoken.reading) 106 | return "|".join(f"{k}={v}" for k, v in m.items()) 107 | attrs = { 108 | "TAG": [dtoken.tag for dtoken in sub_tokens], 109 | "DEP": deps, 110 | "POS": tag_to_pos( 111 | sub_tokens, 112 | doc[token.i + 1].tag_ if token.i < len(doc) - 1 else None 113 | ), 114 | "LEMMA": [dtoken.lemma for dtoken in sub_tokens], 115 | "NORM": [dtoken.norm for dtoken in sub_tokens], 116 | "ENT_TYPE": [token_ent_type for dtoken in sub_tokens], 117 | "MORPH": [morph(dtoken) for dtoken in sub_tokens], 118 | } 119 | try: 120 | with doc.retokenize() as retokenizer: 121 | retokenizer.split(token, surfaces, heads=heads, attrs=attrs) 122 | except Exception as e: 123 | import sys 124 | print("Retokenization error:", file=sys.stderr) 125 | print(doc.text, file=sys.stderr) 126 | print([(t.i, t.orth_) for t in doc], file=sys.stderr) 127 | print(list(enumerate(doc.user_data["sub_tokens"])), file=sys.stderr) 128 | raise e 129 | 130 | # work-around: retokenize() does not consider the head of the splitted tokens 131 | if not compounds: 132 | for t in doc: 133 
| if t.i < token_i or token_i + len(sub_tokens) <= t.i: 134 | if t.head.i == token_i: 135 | t.head = doc[token_i + last] 136 | 137 | del doc.user_data["sub_tokens"] 138 | return doc 139 | 140 | @property 141 | def split_mode(self) -> str: 142 | return self._split_mode 143 | 144 | @split_mode.setter 145 | def split_mode(self, mode: str): 146 | assert mode in (None, "A", "B", "C"), 'split_mode should be "A", "B", "C", or None' 147 | self._split_mode = mode 148 | 149 | def _get_config(self): 150 | config = OrderedDict( 151 | ( 152 | ("split_mode", self._split_mode), 153 | ) 154 | ) 155 | return config 156 | 157 | def _set_config(self, config=None): 158 | self.split_mode = config.get("split_mode", None) if config else None 159 | 160 | def to_bytes(self, **_kwargs): 161 | serializers = OrderedDict( 162 | ( 163 | ("cfg", lambda: srsly.json_dumps(self._get_config())), 164 | ) 165 | ) 166 | return util.to_bytes(serializers, []) 167 | 168 | def from_bytes(self, data, **_kwargs): 169 | deserializers = OrderedDict( 170 | ( 171 | ("cfg", lambda b: self._set_config(srsly.json_loads(b))), 172 | ) 173 | ) 174 | util.from_bytes(data, deserializers, []) 175 | return self 176 | 177 | def to_disk(self, path, **_kwargs): 178 | path = util.ensure_path(path) 179 | serializers = OrderedDict( 180 | ( 181 | ("cfg", lambda p: srsly.write_json(p, self._get_config())), 182 | ) 183 | ) 184 | return util.to_disk(path, serializers, []) 185 | 186 | def from_disk(self, path, **_kwargs): 187 | path = util.ensure_path(path) 188 | serializers = OrderedDict( 189 | ( 190 | ("cfg", lambda p: self._set_config(srsly.read_json(p))), 191 | ) 192 | ) 193 | util.from_disk(path, serializers, []) 194 | -------------------------------------------------------------------------------- /ginza/disable_sentencizer.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | from collections import OrderedDict 3 | 4 | import srsly 5 | 6 | from spacy import util 7 | 8 | 9 | __all__ = [ 10 | "DisableSentencizer", 11 | ] 12 | 13 | 14 | 15 | class DisableSentencizer: 16 | def __init__(self, nlp): 17 | self.nlp = nlp 18 | 19 | def __call__(self, doc): 20 | for t in doc[1:]: 21 | t.is_sent_start = False 22 | return doc 23 | 24 | def _get_config(self): 25 | return {} 26 | 27 | def _set_config(self, config=None): 28 | pass 29 | 30 | def to_bytes(self, **_kwargs): 31 | serializers = OrderedDict( 32 | ( 33 | ("cfg", lambda: srsly.json_dumps(self._get_config())), 34 | ) 35 | ) 36 | return util.to_bytes(serializers, []) 37 | 38 | def from_bytes(self, data, **_kwargs): 39 | deserializers = OrderedDict( 40 | ( 41 | ("cfg", lambda b: self._set_config(srsly.json_loads(b))), 42 | ) 43 | ) 44 | util.from_bytes(data, deserializers, []) 45 | return self 46 | 47 | def to_disk(self, path, **_kwargs): 48 | path = util.ensure_path(path) 49 | serializers = OrderedDict( 50 | ( 51 | ("cfg", lambda p: srsly.write_json(p, self._get_config())), 52 | ) 53 | ) 54 | return util.to_disk(path, serializers, []) 55 | 56 | def from_disk(self, path, **_kwargs): 57 | path = util.ensure_path(path) 58 | serializers = OrderedDict( 59 | ( 60 | ("cfg", lambda p: self._set_config(srsly.read_json(p))), 61 | ) 62 | ) 63 | util.from_disk(path, serializers, []) 64 | -------------------------------------------------------------------------------- /ginza/ene_ontonotes_mapper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | 3 | __all__ = [ 4 | "ENE_ONTONOTES_MAPPING", 5 | ] 6 | 7 | 8 | 
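# ENE_ONTONOTES_MAPPING converts Extended Named Entity (ENE) label names into
# OntoNotes5-style entity types, and ENE8_LABELS further down lists the ENE v8
# hierarchy IDs for those names. A minimal lookup sketch (kept as comments here;
# the helper name and the fallback label "OTHERS" for names missing from the
# table are assumptions, not necessarily how GiNZA itself resolves them):
#
#     def ene_to_ontonotes(ene_label: str) -> str:
#         return ENE_ONTONOTES_MAPPING.get(ene_label, "OTHERS")
#
#     ene_to_ontonotes("Company")   # -> "ORG"
#     ene_to_ontonotes("Province")  # -> "GPE"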
ENE_ONTONOTES_MAPPING = { 9 | "Person": "PERSON", 10 | "God": "PERSON", 11 | 12 | "International_Organization": "NORP", 13 | "Ethnic_Group": "NORP", 14 | "Ethnic_Group_Other": "NORP", 15 | "Nationality": "NORP", 16 | "Political_Organization": "NORP", 17 | "Political_Organization_Other": "NORP", 18 | "Political_Party": "NORP", 19 | "Religion": "NORP", 20 | 21 | "Language": "LANGUAGE", 22 | "Language_Other": "LANGUAGE", 23 | "National_Language": "LANGUAGE", 24 | 25 | "Location_Other": "GPE", 26 | "GPE": "GPE", 27 | "GPE_Other": "GPE", 28 | "City": "GPE", 29 | "Province": "GPE", 30 | "Country": "GPE", 31 | "Spa": "GPE", 32 | "Address": "GPE", 33 | "Address_Other": "GPE", 34 | "Postal_Address": "GPE", 35 | "County": "GPE", 36 | 37 | "Region": "LOC", 38 | "Region_Other": "LOC", 39 | "Continental_Region": "LOC", 40 | "Domestic_Region": "LOC", 41 | "Geological_Region": "LOC", 42 | "Geological_Region_Other": "LOC", 43 | "Mountain": "LOC", 44 | "Island": "LOC", 45 | "River": "LOC", 46 | "Lake": "LOC", 47 | "Sea": "LOC", 48 | "Bay": "LOC", 49 | 50 | "Event_Other": "EVENT", 51 | "Occasion": "EVENT", 52 | "Occasion_Other": "EVENT", 53 | "Election": "EVENT", 54 | "Religious_Festival": "EVENT", 55 | "Competition": "EVENT", 56 | "Game": "EVENT", # used in v7 ("Competition" in v8) 57 | "Conference": "EVENT", 58 | "Incident": "EVENT", 59 | "Incident_Other": "EVENT", 60 | "War": "EVENT", 61 | "Natural_Phenomenon": "EVENT", 62 | "Natural_Phenomenon_Other": "EVENT", 63 | "Natural_Disaster": "EVENT", 64 | "Earthquake": "EVENT", 65 | 66 | "Facility": "FAC", 67 | "Facility_Other": "FAC", 68 | "Facility_Part": "FAC", 69 | "Dam": "FAC", 70 | "Archaeological_Place": "FAC", 71 | "Archaeological_Place_Other": "FAC", 72 | "Tomb": "FAC", 73 | "FOE": "FAC", 74 | "FOE_Other": "FAC", 75 | "GOE_Other": "FAC", # used in v7 ("FOE_Other" in v8) 76 | "Military_Base": "FAC", 77 | "Power_Plant": "FAC", 78 | "Park": "FAC", 79 | "Shopping_Complex": "FAC", 80 | "Sports_Facility": "FAC", 81 | "Museum": "FAC", 82 | "Zoo": "FAC", 83 | "Amusement_Park": "FAC", 84 | "Theater": "FAC", 85 | "Worship_Place": "FAC", 86 | "Castle": "FAC", 87 | "Palace": "FAC", 88 | "Public_Institution": "FAC", 89 | "Accommodation": "FAC", 90 | "Medical_Institution": "FAC", 91 | "School": "FAC", 92 | "Research_Institute": "FAC", 93 | "Market": "FAC", 94 | "Transport_Facility": "FAC", 95 | "Transport_Facility_Other": "FAC", 96 | "Car_Stop": "FAC", 97 | "Station": "FAC", 98 | "Airport": "FAC", 99 | "Port": "FAC", 100 | "Line": "FAC", 101 | "Line_Other": "FAC", 102 | "Railroad": "FAC", 103 | "Road": "FAC", 104 | "Canal": "FAC", 105 | "Water_Route": "FAC", 106 | "Tunnel": "FAC", 107 | "Bridge": "FAC", 108 | "Tumulus": "FAC", 109 | 110 | "Organization": "ORG", 111 | "Organization_Other": "ORG", 112 | "Show_Organization": "ORG", 113 | "Family": "ORG", 114 | "Sports_Organization": "ORG", 115 | "Sports_Organization_Other": "ORG", 116 | "Pro_Sports_Organization": "ORG", # used in v7 ("Sports_Organization" in v8) 117 | "Sports_Federation": "ORG", 118 | "Sports_League": "ORG", 119 | "Sports_Team": "ORG", 120 | "Juridical_Person": "ORG", 121 | "Juridical_Person_Other": "ORG", 122 | "Channel": "ORG", 123 | "Corporation_Other": "ORG", # used in v7 ("Juridical_Person_Other" in v8) 124 | "Nonprofit_Organization": "ORG", 125 | "Company": "ORG", 126 | "Company_Group": "ORG", 127 | "Government": "ORG", 128 | "Cabinet": "ORG", 129 | "Military": "ORG", 130 | 131 | "Product_Other": "PRODUCT", 132 | "Service": "PRODUCT", 133 | "Character": "PRODUCT", 134 | "ID_Number": "PRODUCT", 
135 | "Game_Other": "PRODUCT", 136 | "Digital_Game": "PRODUCT", 137 | "Software": "PRODUCT", 138 | "Vehicle": "PRODUCT", 139 | "Vehicle_Other": "PRODUCT", 140 | "Car": "PRODUCT", 141 | "Train": "PRODUCT", 142 | "Aircraft": "PRODUCT", 143 | "Spaceship": "PRODUCT", 144 | "Ship": "PRODUCT", 145 | "Food_Other": "PRODUCT", 146 | "Musical_Instrument": "PRODUCT", 147 | "Clothing": "PRODUCT", 148 | "Money_Form": "PRODUCT", 149 | "Drug": "PRODUCT", 150 | "Weapon": "PRODUCT", 151 | "Stock": "PRODUCT", 152 | "Award": "PRODUCT", 153 | "Decoration": "PRODUCT", 154 | 155 | "Video_Work": "WORK_OF_ART", 156 | "Art": "WORK_OF_ART", 157 | "Art_Other": "WORK_OF_ART", 158 | "Painting": "WORK_OF_ART", 159 | "Broadcast_Program": "WORK_OF_ART", 160 | "Movie": "WORK_OF_ART", 161 | "Show": "WORK_OF_ART", 162 | "Music": "WORK_OF_ART", 163 | "Book": "WORK_OF_ART", 164 | "Printing": "WORK_OF_ART", 165 | "Printing_Other": "WORK_OF_ART", 166 | "Newspaper": "WORK_OF_ART", 167 | "Magazine": "WORK_OF_ART", 168 | "Picture": "WORK_OF_ART", 169 | 170 | "Offense": "LAW", 171 | "Doctrine_Method_Other": "LAW", 172 | "Movement": "LAW", 173 | "Plan": "LAW", 174 | "Rule": "LAW", 175 | "Rule_Other": "LAW", 176 | "Treaty": "LAW", 177 | "Law": "LAW", 178 | 179 | "Timex": "DATE", 180 | "Timex_Other": "DATE", 181 | "Timeex": "DATE", 182 | "Timeex_Other": "DATE", 183 | "Date": "DATE", 184 | "Day_Of_Week": "DATE", 185 | "Era": "DATE", 186 | "Periodx": "DATE", 187 | "Periodx_Other": "DATE", 188 | "Period_Day": "DATE", 189 | "Period_Week": "DATE", 190 | "Period_Month": "DATE", 191 | "Period_Year": "DATE", 192 | "Time_Top_Other": "DATE", 193 | 194 | "Time": "TIME", 195 | "Period_Time": "TIME", 196 | 197 | "Percent": "PERCENT", 198 | 199 | "Currency": "MONEY", 200 | "Money": "MONEY", 201 | 202 | "Unit_Other": "QUANTITY", 203 | "Latitude_Longitude": "QUANTITY", 204 | "Latitude_Longtitude": "QUANTITY", # used in v7 ("Latitude_Longitude" in v8) 205 | "Measurement": "QUANTITY", 206 | "Measurement_Other": "QUANTITY", 207 | "Physical_Extent": "QUANTITY", 208 | "Seismic_Magnitude": "QUANTITY", 209 | "Space": "QUANTITY", 210 | "Volume": "QUANTITY", 211 | "Weight": "QUANTITY", 212 | "Speed": "QUANTITY", 213 | "Intensity": "QUANTITY", 214 | "Temperature": "QUANTITY", 215 | "Calorie": "QUANTITY", 216 | "Seismic_Intensity": "QUANTITY", 217 | "Countx": "QUANTITY", 218 | "Countx_Other": "QUANTITY", 219 | "N_Person": "QUANTITY", 220 | "N_Organization": "QUANTITY", 221 | "N_Location": "QUANTITY", 222 | "N_Location_Other": "QUANTITY", 223 | "N_Country": "QUANTITY", 224 | "N_Facility": "QUANTITY", 225 | "N_Product": "QUANTITY", 226 | "N_Event": "QUANTITY", 227 | "N_Natural_Object": "QUANTITY", 228 | "N_Natural_Object_Other": "QUANTITY", 229 | "N_Animal": "QUANTITY", 230 | "N_Flora": "QUANTITY", 231 | "Point": "QUANTITY", 232 | "Multiplication": "QUANTITY", 233 | "Frequency": "QUANTITY", 234 | "Age": "QUANTITY", 235 | 236 | "Rank": "ORDINAL", 237 | "School_Age": "ORDINAL", 238 | "Ordinal_Number": "ORDINAL", 239 | 240 | "Stock_Index": "CARDINAL", 241 | 242 | "Phone_Number": "PHONE", 243 | 244 | "Email": "EMAIL", 245 | 246 | "URL": "URL", 247 | 248 | "Individual_Animal": "ANIMAL", 249 | "Individual_Animal_Other": "ANIMAL", 250 | "Racehorse": "ANIMAL", 251 | 252 | "Name": "OTHERS", 253 | "Name_Other": "OTHERS", 254 | "Natural_Object": "OTHERS", 255 | "Natural_Object_Other": "OTHERS", 256 | "Element": "OTHERS", 257 | "Compound": "OTHERS", 258 | "Mineral": "OTHERS", 259 | "Living_Thing": "OTHERS", 260 | "Living_Thing_Other": "OTHERS", 261 | "Fungus": "OTHERS", 262 
| "Mollusk_Arthropod": "OTHERS", 263 | "Mollusc_Arthropod": "OTHERS", # used in v7 ("Mollusk_Arthropod" in v8) 264 | "Insect": "OTHERS", 265 | "Fish": "OTHERS", 266 | "Amphibia": "OTHERS", 267 | "Reptile": "OTHERS", 268 | "Bird": "OTHERS", 269 | "Mammal": "OTHERS", 270 | "Flora": "OTHERS", 271 | "Living_Thing_Part": "OTHERS", 272 | "Living_Thing_Part_Other": "OTHERS", 273 | "Animal_Part": "OTHERS", 274 | "Flora_Part": "OTHERS", 275 | "Disease": "OTHERS", 276 | "Disease_Other": "OTHERS", 277 | "Animal_Disease": "OTHERS", 278 | "Color": "OTHERS", 279 | "Color_Other": "OTHERS", 280 | "Nature_Color": "OTHERS", 281 | "Location": "OTHERS", 282 | "Astronomical_Object": "OTHERS", 283 | "Astronomical_Object_Other": "OTHERS", 284 | "Star": "OTHERS", 285 | "Planet": "OTHERS", 286 | "Constellation": "OTHERS", 287 | "Product": "OTHERS", 288 | "Class": "OTHERS", 289 | "Food": "OTHERS", 290 | "Dish": "OTHERS", 291 | "Doctrine_Method": "OTHERS", 292 | "Culture": "OTHERS", 293 | "Academic": "OTHERS", 294 | "Sport": "OTHERS", 295 | "Style": "OTHERS", 296 | "Theory": "OTHERS", 297 | "Title": "OTHERS", 298 | "Title_Other": "OTHERS", 299 | "Position_Vocation": "OTHERS", 300 | "Unit": "OTHERS", 301 | "Virtual_Address": "OTHERS", 302 | "Virtual_Address_Other": "OTHERS", 303 | "Event": "OTHERS", 304 | "Numex": "OTHERS", 305 | "Numex_Other": "OTHERS", 306 | "Astral_Body_Other": "OTHERS", 307 | "Material": "OTHERS", 308 | } 309 | 310 | ENE8_LABELS = { 311 | "Name": "1", 312 | "Name_Other": "1.0", 313 | "Person": "1.1", 314 | "God": "1.2", 315 | "Individual_Animal": "1.3", 316 | "Individual_Animal_Other": "1.3.0", 317 | "Racehorse": "1.3.1", 318 | "Organization": "1.4", 319 | "Organization_Other": "1.4.0", 320 | "International_Organization": "1.4.1", 321 | "Show_Organization": "1.4.2", 322 | "Family": "1.4.3", 323 | "Ethnic_Group": "1.4.4", 324 | "Ethnic_Group_Other": "1.4.4.0", 325 | "Nationality": "1.4.4.1", 326 | "Sports_Organization": "1.4.5", 327 | "Sports_Organization_Other": "1.4.5.0", 328 | "Sports_Federation": "1.4.5.1", 329 | "Sports_League": "1.4.5.2", 330 | "Sports_Team": "1.4.5.3", 331 | "Juridical_Person": "1.4.6", 332 | "Juridical_Person_Other": "1.4.6.0", 333 | "Nonprofit_Organization": "1.4.6.1", 334 | "Company": "1.4.6.2", 335 | "Company_Group": "1.4.6.3", 336 | "Political_Organization": "1.4.7", 337 | "Political_Organization_Other": "1.4.7.0", 338 | "Government": "1.4.7.1", 339 | "Political_Party": "1.4.7.2", 340 | "Cabinet": "1.4.7.3", 341 | "Military": "1.4.7.4", 342 | "Location": "1.5", 343 | "Location_Other": "1.5.0", 344 | "GPE": "1.5.1", 345 | "GPE_Other": "1.5.1.0", 346 | "City": "1.5.1.1", 347 | "Province": "1.5.1.2", 348 | "Country": "1.5.1.3", 349 | "Region": "1.5.2", 350 | "Region_Other": "1.5.2.0", 351 | "Continental_Region": "1.5.2.1", 352 | "Domestic_Region": "1.5.2.2", 353 | "Geological_Region": "1.5.3", 354 | "Geological_Region_Other": "1.5.3.0", 355 | "Spa": "1.5.3.1", 356 | "Mountain": "1.5.3.2", 357 | "Island": "1.5.3.3", 358 | "River": "1.5.3.4", 359 | "Lake": "1.5.3.5", 360 | "Sea": "1.5.3.6", 361 | "Bay": "1.5.3.7", 362 | "Astronomical_Object": "1.5.4", 363 | "Astronomical_Object_Other": "1.5.4.0", 364 | "Star": "1.5.4.1", 365 | "Planet": "1.5.4.2", 366 | "Constellation": "1.5.4.3", 367 | "Address": "1.5.5", 368 | "Address_Other": "1.5.5.0", 369 | "Postal_Address": "1.5.5.1", 370 | "Facility": "1.6", 371 | "Facility_Other": "1.6.0", 372 | "Facility_Part": "1.6.1", 373 | "Dam": "1.6.2", 374 | "Archaeological_Place": "1.6.3", 375 | "Archaeological_Place_Other": "1.6.3.0", 376 | 
"Tomb": "1.6.3.1", 377 | "FOE": "1.6.4", 378 | "FOE_Other": "1.6.4.0", 379 | "Military_Base": "1.6.4.1", 380 | "Castle": "1.6.4.2", 381 | "Palace": "1.6.4.3", 382 | "Public_Institution": "1.6.4.4", 383 | "Accommodation": "1.6.4.5", 384 | "Medical_Institution": "1.6.4.6", 385 | "School": "1.6.4.7", 386 | "Research_Institute": "1.6.4.8", 387 | "Market": "1.6.4.9", 388 | "Power_Plant": "1.6.4.10", 389 | "Park": "1.6.4.11", 390 | "Shopping_Complex": "1.6.4.12", 391 | "Sports_Facility": "1.6.4.13", 392 | "Museum": "1.6.4.14", 393 | "Zoo": "1.6.4.15", 394 | "Amusement_Park": "1.6.4.16", 395 | "Theater": "1.6.4.17", 396 | "Worship_Place": "1.6.4.18", 397 | "Transport_Facility": "1.6.5", 398 | "Transport_Facility_Other": "1.6.5.0", 399 | "Car_Stop": "1.6.5.1", 400 | "Station": "1.6.5.2", 401 | "Airport": "1.6.5.3", 402 | "Port": "1.6.5.4", 403 | "Line": "1.6.6", 404 | "Line_Other": "1.6.6.0", 405 | "Railroad": "1.6.6.1", 406 | "Road": "1.6.6.2", 407 | "Canal": "1.6.6.3", 408 | "Water_Route": "1.6.6.4", 409 | "Tunnel": "1.6.6.5", 410 | "Bridge": "1.6.6.6", 411 | "Product": "1.7", 412 | "Product_Other": "1.7.0", 413 | "Video_Work": "1.7.1", 414 | "Musical_Instrument": "1.7.2", 415 | "Clothing": "1.7.3", 416 | "Money_Form": "1.7.4", 417 | "Drug": "1.7.5", 418 | "Weapon": "1.7.6", 419 | "Stock": "1.7.7", 420 | "Award": "1.7.8", 421 | "Decoration": "1.7.9", 422 | "Offense": "1.7.10", 423 | "Service": "1.7.11", 424 | "Class": "1.7.12", 425 | "Character": "1.7.13", 426 | "ID_Number": "1.7.14", 427 | "Game": "1.7.15", 428 | "Game_Other": "1.7.15.0", 429 | "Digital_Game": "1.7.15.1", 430 | "Software": "1.7.16", 431 | "Vehicle": "1.7.17", 432 | "Vehicle_Other": "1.7.17.0", 433 | "Car": "1.7.17.1", 434 | "Train": "1.7.17.2", 435 | "Aircraft": "1.7.17.3", 436 | "Spaceship": "1.7.17.4", 437 | "Ship": "1.7.17.5", 438 | "Food": "1.7.18", 439 | "Food_Other": "1.7.18.0", 440 | "Dish": "1.7.18.1", 441 | "Art": "1.7.19", 442 | "Art_Other": "1.7.19.0", 443 | "Painting": "1.7.19.1", 444 | "Broadcast_Program": "1.7.19.2", 445 | "Movie": "1.7.19.3", 446 | "Show": "1.7.19.4", 447 | "Music": "1.7.19.5", 448 | "Book": "1.7.19.6", 449 | "Printing": "1.7.20", 450 | "Printing_Other": "1.7.20.0", 451 | "Newspaper": "1.7.20.1", 452 | "Magazine": "1.7.20.2", 453 | "Doctrine_Method": "1.7.21", 454 | "Doctrine_Method_Other": "1.7.21.0", 455 | "Culture": "1.7.21.1", 456 | "Religion": "1.7.21.2", 457 | "Academic": "1.7.21.3", 458 | "Sport": "1.7.21.4", 459 | "Style": "1.7.21.5", 460 | "Movement": "1.7.21.6", 461 | "Theory": "1.7.21.7", 462 | "Plan": "1.7.21.8", 463 | "Rule": "1.7.22", 464 | "Rule_Other": "1.7.22.0", 465 | "Treaty": "1.7.22.1", 466 | "Law": "1.7.22.2", 467 | "Title": "1.7.23", 468 | "Title_Other": "1.7.23.0", 469 | "Position_Vocation": "1.7.23.1", 470 | "Language": "1.7.24", 471 | "Language_Other": "1.7.24.0", 472 | "National_Language": "1.7.24.1", 473 | "Unit": "1.7.25", 474 | "Unit_Other": "1.7.25.0", 475 | "Currency": "1.7.25.1", 476 | "Virtual_Address": "1.8", 477 | "Virtual_Address_Other": "1.8.0", 478 | "Channel": "1.8.1", 479 | "Phone_Number": "1.8.2", 480 | "Email": "1.8.3", 481 | "URL": "1.8.4", 482 | "Event": "1.9", 483 | "Event_Other": "1.9.0", 484 | "Occasion": "1.9.1", 485 | "Occasion_Other": "1.9.1.0", 486 | "Election": "1.9.1.1", 487 | "Religious_Festival": "1.9.1.2", 488 | "Competition": "1.9.1.3", 489 | "Conference": "1.9.1.4", 490 | "Incident": "1.9.2", 491 | "Incident_Other": "1.9.2.0", 492 | "War": "1.9.2.1", 493 | "Natural_Phenomenon": "1.9.3", 494 | "Natural_Phenomenon_Other": "1.9.3.0", 495 | 
"Natural_Disaster": "1.9.3.1", 496 | "Earthquake": "1.9.3.2", 497 | "Natural_Object": "1.10", 498 | "Natural_Object_Other": "1.10.0", 499 | "Element": "1.10.1", 500 | "Compound": "1.10.2", 501 | "Mineral": "1.10.3", 502 | "Living_Thing": "1.10.4", 503 | "Living_Thing_Other": "1.10.4.0", 504 | "Fungus": "1.10.4.1", 505 | "Mollusk_Arthropod": "1.10.4.2", 506 | "Insect": "1.10.4.3", 507 | "Fish": "1.10.4.4", 508 | "Amphibia": "1.10.4.5", 509 | "Reptile": "1.10.4.6", 510 | "Bird": "1.10.4.7", 511 | "Mammal": "1.10.4.8", 512 | "Flora": "1.10.4.9", 513 | "Living_Thing_Part": "1.10.5", 514 | "Living_Thing_Part_Other": "1.10.5.0", 515 | "Animal_Part": "1.10.5.1", 516 | "Flora_Part": "1.10.5.2", 517 | "Disease": "1.11", 518 | "Disease_Other": "1.11.0", 519 | "Animal_Disease": "1.11.1", 520 | "Color": "1.12", 521 | "Color_Other": "1.12.0", 522 | "Nature_Color": "1.12.1", 523 | "Timex": "2", 524 | "Timex_Other": "2.0", 525 | "Timeex": "2.1", 526 | "Timeex_Other": "2.1.0", 527 | "Time": "2.1.1", 528 | "Date": "2.1.2", 529 | "Day_Of_Week": "2.1.3", 530 | "Era": "2.1.4", 531 | "Periodx": "2.2", 532 | "Periodx_Other": "2.2.0", 533 | "Period_Time": "2.2.1", 534 | "Period_Day": "2.2.2", 535 | "Period_Week": "2.2.3", 536 | "Period_Month": "2.2.4", 537 | "Period_Year": "2.2.5", 538 | "Numex": "3", 539 | "Numex_Other": "3.0", 540 | "Money": "3.1", 541 | "Stock_Index": "3.2", 542 | "Point": "3.3", 543 | "Percent": "3.4", 544 | "Multiplication": "3.5", 545 | "Frequency": "3.6", 546 | "Age": "3.7", 547 | "School_Age": "3.8", 548 | "Ordinal_Number": "3.9", 549 | "Rank": "3.10", 550 | "Latitude_Longitude": "3.11", 551 | "Measurement": "3.12", 552 | "Measurement_Other": "3.12.0", 553 | "Physical_Extent": "3.12.1", 554 | "Space": "3.12.2", 555 | "Volume": "3.12.3", 556 | "Weight": "3.12.4", 557 | "Speed": "3.12.5", 558 | "Intensity": "3.12.6", 559 | "Temperature": "3.12.7", 560 | "Calorie": "3.12.8", 561 | "Seismic_Intensity": "3.12.9", 562 | "Seismic_Magnitude": "3.12.10", 563 | "Countx": "3.13", 564 | "Countx_Other": "3.13.0", 565 | "N_Person": "3.13.1", 566 | "N_Organization": "3.13.2", 567 | "N_Location": "3.13.3", 568 | "N_Location_Other": "3.13.3.0", 569 | "N_Country": "3.13.3.1", 570 | "N_Facility": "3.13.4", 571 | "N_Product": "3.13.5", 572 | "N_Event": "3.13.6", 573 | "N_Natural_Object": "3.13.7", 574 | "N_Natural_Object_Other": "3.13.7.0", 575 | "N_Animal": "3.13.7.1", 576 | "N_Flora": "3.13.7.2", 577 | } 578 | 579 | """ 580 | import json 581 | import sys 582 | 583 | if __name__ == "__main__": 584 | for ne, idx in ENE8_LABELS.items(): 585 | if ne not in ENE_NE_MAPPING: 586 | print(idx, ne, "not in mapping") 587 | with open(sys.argv[1], "r") as f: 588 | meta_json = json.load(f) 589 | for ne in meta_json["labels"]["ner"]: 590 | if ne not in ENE_NE_MAPPING: 591 | print(ne, "not in mapping") 592 | for ent, idx in ENE_NE_MAPPING.items(): 593 | if ent not in ENE8_LABELS and ent not in meta_json["labels"]["ner"]: 594 | print(idx, ent, "not used") 595 | 596 | for ne, idx, ent in sorted([ 597 | (ne, ENE8_LABELS[ent] if ent in ENE8_LABELS else "_", ent) for ent, ne in ENT_NE_MAPPING.items() 598 | ]): 599 | print("\t"{}": "{}",".format(ent, ne)) 600 | """ 601 | -------------------------------------------------------------------------------- /ginza/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import subprocess as sp 3 | import sys 4 | from pathlib import Path 5 | from functools import partial 6 | 7 | import pytest 8 | 9 | run_cmd = 
partial(sp.run, encoding="utf-8", stdout=sp.PIPE) 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def tmpdir() -> Path: 14 | with tempfile.TemporaryDirectory() as dir_name: 15 | yield Path(dir_name) 16 | -------------------------------------------------------------------------------- /ginza/tests/test_analyzer.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import pytest 4 | 5 | from ginza.analyzer import Analyzer 6 | 7 | 8 | TOKEN_TESTS = [ 9 | ["今日はかつ丼を食べた。明日は蕎麦を食べたい。", ["今日","は","かつ丼","を","食べ","た","。","明日","は","蕎麦","を","食べ","たい","。"]] 10 | ] 11 | 12 | MECAB_TESTS = [ 13 | ["今日はかつ丼を食べた。明日は蕎麦を食べたい。", ["今日","は","かつ","丼","を","食べ","た","。","明日","は","蕎麦","を","食べ","たい","。"]] 14 | ] 15 | 16 | BATCH_TESTS = [ 17 | [ 18 | [ 19 | "銀座でランチをご一緒しましょう。", 20 | "東京タワーの近くに住んでいます。", 21 | "東京都選挙管理委員会の担当者は、次のように説明した。", 22 | ], [ 23 | ["銀座","で","ランチ","を","ご","一緒","し","ましょう","。"], 24 | ["東京","タワー","の","近く","に","住ん","で","い","ます","。"], 25 | ["東京都","選挙管理委員会","の","担当者","は","、","次","の","よう","に","説明","し","た","。"], 26 | ] 27 | ] 28 | ] 29 | 30 | @pytest.fixture 31 | def analyzer() -> Analyzer: 32 | default_params = dict( 33 | model_name_or_path=None, 34 | split_mode=None, 35 | hash_comment="print", 36 | output_format="conllu", 37 | require_gpu=-1, 38 | disable_sentencizer=False, 39 | use_normalized_form=False, 40 | ) 41 | yield Analyzer(**default_params) 42 | 43 | 44 | def _tokens_conllu(result: str): 45 | ret = [] 46 | for line in result.split("\n"): 47 | if line.startswith("#") or line.strip() == "": 48 | continue 49 | ret.append(line.split("\t")[1]) 50 | return ret 51 | 52 | 53 | def _tokens_cabocha(result: str): 54 | ret = [] 55 | for line in result.split("\n"): 56 | if line.startswith("*") or line.strip() in ("","EOS"): 57 | continue 58 | ret.append(line.split("\t")[0]) 59 | return ret 60 | 61 | 62 | def _tokens_mecab(result: str): 63 | ret = [] 64 | for line in result.split("\n"): 65 | if line.startswith("#") or line.strip() in ("","EOS"): 66 | continue 67 | ret.append(line.split("\t")[0]) 68 | return ret 69 | 70 | 71 | def _tokens_json(result: str): 72 | data = json.loads(f"[{result}]") 73 | ret = [] 74 | for d in data: 75 | for p in d["paragraphs"]: 76 | for s in p["sentences"]: 77 | for t in s["tokens"]: 78 | ret.append(t["orth"]) 79 | return ret 80 | 81 | class TestAnalyzer: 82 | def test_model_name_or_path_ja_ginza(self, mocker, analyzer): 83 | spacy_load_mock = mocker.patch("spacy.load") 84 | analyzer.model_name_or_path = "ja_ginza" 85 | analyzer.set_nlp() 86 | spacy_load_mock.assert_called_once_with("ja_ginza") 87 | 88 | def test_model_name_or_path_ja_ginza_electra(self, mocker, analyzer): 89 | spacy_load_mock = mocker.patch("spacy.load") 90 | analyzer.model_name_or_path = "ja_ginza_electra" 91 | analyzer.set_nlp() 92 | spacy_load_mock.assert_called_once_with("ja_ginza_electra") 93 | 94 | def test_require_gpu(self, mocker, analyzer): 95 | require_gpu_mock = mocker.patch("thinc.api.require_gpu") 96 | analyzer.require_gpu = 0 97 | analyzer.set_nlp() 98 | require_gpu_mock.assert_called_once() 99 | 100 | @pytest.mark.parametrize("input_text, tokens", TOKEN_TESTS) 101 | @pytest.mark.parametrize( 102 | "output_format, raises_analysis_before_set, tokens_func", 103 | [ 104 | ("conllu", TypeError, _tokens_conllu), 105 | ("cabocha", TypeError, _tokens_cabocha), 106 | ("json", TypeError, _tokens_json), 107 | ], 108 | ) 109 | def test_analyze_line(self, output_format, raises_analysis_before_set, input_text, tokens, tokens_func, analyzer): 110 | 
analyzer.output_format = output_format 111 | with pytest.raises(raises_analysis_before_set): 112 | analyzer.analyze_line(input_text) 113 | 114 | analyzer.set_nlp() 115 | ret = analyzer.analyze_line(input_text) 116 | assert tokens_func(ret) == tokens 117 | 118 | @pytest.mark.parametrize("input_text, tokens", MECAB_TESTS) 119 | @pytest.mark.parametrize( 120 | "output_format, raises_analysis_before_set, tokens_func", 121 | [ 122 | ("mecab", AttributeError, _tokens_mecab), 123 | ], 124 | ) 125 | def test_analyze_line_mecab(self, output_format, raises_analysis_before_set, input_text, tokens, tokens_func, analyzer): 126 | analyzer.output_format = output_format 127 | with pytest.raises(raises_analysis_before_set): 128 | analyzer.analyze_line(input_text) 129 | 130 | analyzer.set_nlp() 131 | ret = analyzer.analyze_line(input_text) 132 | assert tokens_func(ret) == tokens 133 | 134 | @pytest.mark.parametrize("input_batch, tokens_batch", BATCH_TESTS) 135 | @pytest.mark.parametrize( 136 | "output_format, tokens_func", 137 | [ 138 | ("conllu", _tokens_conllu), 139 | ("cabocha", _tokens_cabocha), 140 | ("json", _tokens_json), 141 | ], 142 | ) 143 | def test_analyze_batch(self, output_format, input_batch, tokens_batch, tokens_func, analyzer): 144 | analyzer.output_format = output_format 145 | ret = analyzer.analyze_batch(input_batch) 146 | assert tokens_func(ret) == sum(tokens_batch, []) 147 | 148 | @pytest.mark.parametrize( 149 | "raises_analysis_before_set, tokens_func", 150 | [ 151 | (TypeError, _tokens_conllu) 152 | ], 153 | ) 154 | @pytest.mark.parametrize( 155 | "split_mode, input_text, tokens", 156 | [ 157 | ("A", "機能性食品", ["機能", "性", "食品"]), 158 | ("B", "機能性食品", ["機能性", "食品"]), 159 | ("C", "機能性食品", ["機能性食品"]), 160 | ], 161 | ) 162 | def test_analyze_split(self, split_mode, input_text, tokens, raises_analysis_before_set, tokens_func, analyzer): 163 | analyzer.split_mode = split_mode 164 | with pytest.raises(raises_analysis_before_set): 165 | analyzer.analyze_line(input_text) 166 | 167 | analyzer.set_nlp() 168 | ret = analyzer.analyze_line(input_text) 169 | assert tokens_func(ret) == tokens 170 | -------------------------------------------------------------------------------- /ginza/tests/test_command_line.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess as sp 4 | from functools import partial 5 | from pathlib import Path 6 | from typing import Iterable, List 7 | 8 | import pytest 9 | 10 | import torch 11 | import ginza.command_line as cli 12 | 13 | TEST_TEXT = "#コメント\n今日はかつ丼を食べた。\n明日は東京で蕎麦を食べる。明後日は酒が飲みたい。" 14 | 15 | run_cmd = partial(sp.run, encoding="utf-8", stdout=sp.PIPE) 16 | 17 | 18 | @pytest.fixture(scope="module") 19 | def input_file(tmpdir: Path) -> Path: 20 | file_path = (tmpdir / "test_input.txt").resolve() 21 | with open(file_path, "w") as fp: 22 | print(TEST_TEXT, file=fp) 23 | yield file_path 24 | file_path.unlink() 25 | 26 | 27 | @pytest.fixture(scope="module") 28 | def input_files(tmpdir: Path) -> Iterable[Path]: 29 | paths = [] 30 | for i, text in enumerate(TEST_TEXT.split("\n")): 31 | file_path = (tmpdir / f"test_input_{i}.txt").resolve() 32 | with open(file_path, "w") as fp: 33 | print(text, file=fp) 34 | paths.append(file_path) 35 | yield paths 36 | for file_path in paths: 37 | file_path.unlink() 38 | 39 | 40 | @pytest.fixture(scope="module") 41 | def long_input_file(tmpdir: Path) -> Iterable[Path]: 42 | file_path = (tmpdir / "test_long_input.txt").resolve() 43 | with open(file_path, "w") as 
fp: 44 | for _ in range(10): 45 | print(TEST_TEXT, file=fp) 46 | yield file_path 47 | file_path.unlink() 48 | 49 | 50 | @pytest.fixture 51 | def output_file(tmpdir: Path) -> Path: 52 | file_path = (tmpdir / "test_output.txt").resolve() 53 | file_path.touch() 54 | yield file_path 55 | file_path.unlink() 56 | 57 | 58 | def _conllu_parsable(result: str): 59 | for line in result.split("\n"): 60 | if line.startswith("#") or line.strip() == "": 61 | continue 62 | if not len(line.strip().split("\t")) == 10: 63 | raise Exception 64 | 65 | 66 | def _cabocha_parsable(result: str): 67 | for line in result.split("\n"): 68 | if line.strip() in ("", "EOS") or line.startswith("*") or line.startswith("#"): 69 | continue 70 | if not len(line.split("\t")) == 3: 71 | raise Exception 72 | if not len(line.split("\t")[1].split(",")) in [8, 9]: 73 | raise Exception 74 | 75 | 76 | def _mecab_parsable(result: str): 77 | for line in result.split("\n"): 78 | if line.strip() in ("", "EOS") or line.startswith("#"): 79 | continue 80 | if not len(line.split("\t")) == 2: 81 | raise Exception 82 | if not len(line.split("\t")[1].split(",")) == 9: 83 | raise Exception 84 | 85 | 86 | def _json_parsable(result: str): 87 | data = json.loads(result) 88 | for d in data: 89 | if not type(d) == dict: 90 | raise Exception 91 | if not "paragraphs" in d.keys(): 92 | raise Exception 93 | 94 | 95 | class TestCLIGinza: 96 | def test_help(self): 97 | for opt in ["-h", "--help"]: 98 | p = run_cmd(["ginza", opt]) 99 | assert p.returncode == 0 100 | 101 | def test_input(self, input_file): 102 | # input file 103 | p = run_cmd(["ginza", input_file]) 104 | 105 | # input from stdin 106 | p_stdin = sp.Popen(["ginza"], stdin=sp.PIPE, stdout=sp.PIPE) 107 | o, e = p_stdin.communicate(input=TEST_TEXT.encode()) 108 | assert e is None 109 | assert o.decode("utf-8") == p.stdout 110 | 111 | def test_multiple_input(self, input_files, input_file): 112 | p_multi = run_cmd(["ginza", *input_files]) 113 | assert p_multi.returncode == 0 114 | 115 | p_single = run_cmd(["ginza", input_file]) 116 | assert p_multi.stdout == p_single.stdout 117 | 118 | # TODO: add user defined model to fixture and test it here 119 | @pytest.mark.parametrize( 120 | "model_path, exit_ok", 121 | [ 122 | ("ja_ginza", True), 123 | ("not-exist-model", False), 124 | ], 125 | ) 126 | def test_model_path(self, model_path, exit_ok, input_file): 127 | p = run_cmd(["ginza", "-b", model_path, input_file]) 128 | assert (p.returncode == 0) is exit_ok 129 | 130 | @pytest.mark.parametrize( 131 | "ensure_model, exit_ok", 132 | [ 133 | ("ja_ginza", True), 134 | ("ja-ginza", True), 135 | ("ja-ginza-electra", True), 136 | ("ja_ginza_electra", True), 137 | ("ja-ginza_electra", True), 138 | ("not-exist-model", False), 139 | ], 140 | ) 141 | def test_ensure_model(self, ensure_model, exit_ok, input_file): 142 | p = run_cmd(["ginza", "-m", ensure_model, input_file]) 143 | assert (p.returncode == 0) is exit_ok 144 | 145 | def test_double_model_spcification(self, input_file): 146 | p = run_cmd(["ginza", "-b", "ja_ginza", "-m", "ja_ginza", input_file]) 147 | assert p.returncode != 0 148 | 149 | @pytest.mark.parametrize( 150 | "split_mode, input_text, expected", 151 | [ 152 | ("A", "機能性食品", ["機能", "性", "食品"]), 153 | ("B", "機能性食品", ["機能性", "食品"]), 154 | ("C", "機能性食品", ["機能性食品"]), 155 | ], 156 | ) 157 | def test_split_mode(self, split_mode, input_text, expected): 158 | p = run_cmd(["ginza", "-s", split_mode], input=input_text) 159 | assert p.returncode == 0 160 | 161 | def _sub_words(lines: Iterable) -> 
List[str]: 162 | return [l.split("\t")[1] for l in lines if len(l.split("\t")) > 1] 163 | 164 | assert _sub_words(p.stdout.split("\n")) == expected 165 | 166 | @pytest.mark.parametrize( 167 | "hash_comment, n_sentence, n_analyzed_sentence, exit_ok", 168 | [ 169 | ("print", 4, 3, True), 170 | ("skip", 3, 3, True), 171 | ("analyze", 4, 4, True), 172 | ], 173 | ) 174 | def test_hash_comment(self, hash_comment, n_sentence, n_analyzed_sentence, exit_ok, input_file): 175 | def _n_sentence(lines: Iterable) -> int: 176 | return len(list(filter(lambda x: x.startswith("#"), lines))) 177 | 178 | def _n_analyzed_sentence(lines: Iterable) -> int: 179 | return len(list(filter(lambda x: x.startswith("# text = "), lines))) 180 | 181 | p = run_cmd(["ginza", "-c", hash_comment, input_file]) 182 | assert (p.returncode == 0) is exit_ok 183 | assert _n_sentence(p.stdout.split("\n")) == n_sentence 184 | assert _n_analyzed_sentence(p.stdout.split("\n")) == n_analyzed_sentence 185 | 186 | def test_output_path(self, input_file, output_file): 187 | p_s = run_cmd(["ginza", input_file]) 188 | p_o = run_cmd(["ginza", "-o", output_file, input_file]) 189 | assert p_o.returncode == 0 190 | 191 | def _file_output(): 192 | with open(output_file, "r") as fp: 193 | return [l.strip() for l in fp if l.strip()] 194 | 195 | def _pipe_output(): 196 | return [l.strip() for l in p_s.stdout.split("\n") if l.strip()] 197 | 198 | assert _file_output() == _pipe_output() 199 | 200 | @pytest.mark.parametrize( 201 | "output_format, result_parsable", 202 | [ 203 | ("conllu", _conllu_parsable), 204 | ("cabocha", _cabocha_parsable), 205 | ("mecab", _mecab_parsable), 206 | ("json", _json_parsable), 207 | ], 208 | ) 209 | def test_output_format(self, output_format, result_parsable, input_file): 210 | p = run_cmd(["ginza", "-c", "analyze", "-f", output_format, input_file]) 211 | assert p.returncode == 0 212 | result_parsable(p.stdout.strip()) 213 | 214 | @pytest.mark.parametrize( 215 | "hash_comment", ["print", "skip"] 216 | ) 217 | def test_warn_if_json_hash_comment_not_analyze(self, hash_comment, input_file): 218 | p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file], stderr=sp.PIPE) 219 | assert p.returncode == 0 220 | msg = ( 221 | f'hash_comment="{hash_comment}" not permitted for JSON output. 
Forced to use hash_comment="analyze"' 222 | ) 223 | assert msg in p.stderr 224 | 225 | def test_require_gpu(self, input_file): 226 | p = run_cmd(["ginza", "-g", "0", input_file]) 227 | gpu_available = torch.cuda.is_available() 228 | assert (p.returncode == 0) is gpu_available 229 | 230 | def test_do_not_use_normalized_form(self, input_file): 231 | p = run_cmd(["ginza", input_file]) 232 | lemmas = [l.split("\t")[2] for l in p.stdout.split("\n") if len(l.split("\t")) > 1] 233 | # 'かつ丼' is dictionary_form of 'かつ丼' 234 | assert p.returncode == 0 235 | assert "かつ丼" in lemmas 236 | 237 | def test_use_normalized_form(self, input_file): 238 | p = run_cmd(["ginza", "-n", input_file]) 239 | lemmas = [l.split("\t")[2] for l in p.stdout.split("\n") if len(l.split("\t")) > 1] 240 | # 'カツ丼' is normlized_form of 'かつ丼' 241 | assert p.returncode == 0 242 | assert "カツ丼" in lemmas 243 | 244 | def test_disable_sentencizer(self, input_file): 245 | p = run_cmd(["ginza", "-d", input_file]) 246 | 247 | def _n_analyzed_sentence(lines: Iterable) -> int: 248 | return len(list(filter(lambda x: x.startswith("# text = "), lines))) 249 | 250 | assert p.returncode == 0 251 | assert _n_analyzed_sentence(p.stdout.split("\n")) == 2 252 | 253 | def test_parallel(self, input_file): 254 | p = run_cmd(["ginza", "-p", "2", input_file]) 255 | assert p.returncode == 0 256 | 257 | 258 | class TestCLIGinzame: 259 | def test_ginzame(self, input_file): 260 | p_ginzame = run_cmd(["ginzame", input_file]) 261 | p_ginza = run_cmd(["ginza", "-n", "-m", "ja_ginza", "-f", "2", "-s", "A", input_file]) 262 | 263 | assert p_ginzame.returncode == 0 264 | assert p_ginzame.stdout == p_ginza.stdout 265 | 266 | 267 | class TestRun: 268 | def test_run_as_single_when_input_is_a_tty(self, mocker, output_file, long_input_file): 269 | i = 0 270 | 271 | def f_mock_input(): 272 | nonlocal i 273 | if i >= 1: 274 | raise KeyboardInterrupt 275 | else: 276 | i += 1 277 | return "今日はいい天気だ" 278 | 279 | mocker.patch.object(cli, "MINI_BATCH_SIZE", 5) 280 | mocker.patch("ginza.command_line.sys.stdin.isatty", return_value=True) 281 | input_mock = mocker.patch.object(cli, "input", side_effect=f_mock_input) 282 | analyze_parallel_mock = mocker.patch.object(cli, "_analyze_parallel") 283 | cli.run(parallel_level=2, output_path=output_file, files=None) 284 | assert input_mock.call_count == 2 285 | analyze_parallel_mock.assert_not_called() 286 | 287 | @pytest.mark.parametrize( 288 | "output_format", 289 | ["conllu", "cabocha", "mecab", "json"], 290 | ) 291 | def test_parallel_output_same_as_single(self, output_format, mocker, tmpdir, long_input_file): 292 | mocker.patch.object(cli, "MINI_BATCH_SIZE", 5) 293 | 294 | out_single = tmpdir / "single_output.txt" 295 | if out_single.exists(): 296 | out_single.unlink() 297 | cli.run( 298 | parallel_level=1, 299 | output_path=out_single, 300 | output_format=output_format, 301 | files=[long_input_file], 302 | ensure_model="ja_ginza", 303 | ) 304 | 305 | out_parallel = tmpdir / "parallel_output.txt" 306 | if out_parallel.exists(): 307 | out_parallel.unlink() 308 | try: 309 | cli.run( 310 | parallel_level=2, 311 | output_path=out_parallel, 312 | output_format=output_format, 313 | files=[long_input_file], 314 | ensure_model="ja_ginza", 315 | ) 316 | except: 317 | pytest.fail("parallel run failed") 318 | 319 | def f_len(path): 320 | with open(path, "r") as f: 321 | return sum([1 for _ in f]) 322 | 323 | assert f_len(out_single) == f_len(out_parallel) 324 | with open(out_single, "r") as f_s: 325 | with open(out_parallel, "r") as f_p: 
326 | for s, p in zip(f_s, f_p): 327 | assert s == p 328 | -------------------------------------------------------------------------------- /ginza/tests/test_models.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import pytest 3 | from copy import deepcopy 4 | 5 | from ginza import set_split_mode 6 | 7 | 8 | MODELS = ["ja_ginza", "ja_ginza_electra"] 9 | 10 | TOKENIZER_TESTS = [ 11 | ("銀座でランチをご一緒しましょう。", ["銀座", "で", "ランチ", "を", "ご", "一緒", "し", "ましょう", "。"]), 12 | ("すもももももももものうち", ["すもも", "も", "もも", "も", "もも", "の", "うち"]), 13 | ] 14 | 15 | COMPOUND_SPLITER_TESTS = [ 16 | ("選挙管理委員会", 4, 3, 1), 17 | ("客室乗務員", 3, 2, 1), 18 | ("労働者協同組合", 4, 3, 1), 19 | ("機能性食品", 3, 2, 1), 20 | ] 21 | 22 | TAG_TESTS = [ 23 | ("銀座でランチをご一緒しましょう。", ["名詞-固有名詞-地名-一般", "助詞-格助詞", "名詞-普通名詞-一般", "助詞-格助詞", "接頭辞", "名詞-普通名詞-サ変可能", "動詞-非自立可能", "助動詞", "補助記号-句点"]), 24 | ("すもももももももものうち", ["名詞-普通名詞-一般", "助詞-係助詞", "名詞-普通名詞-一般", "助詞-係助詞", "名詞-普通名詞-一般", "助詞-格助詞", "名詞-普通名詞-副詞可能"]), 25 | ] 26 | 27 | POS_TESTS_JA_GINZA = [ 28 | ("銀座でランチをご一緒しましょう。", ["PROPN", "ADP", "NOUN", "ADP", "NOUN", "NOUN", "AUX", "AUX", "PUNCT"]), 29 | ("すもももももももものうち", ["NOUN", "ADP", "NOUN", "ADP", "NOUN", "ADP", "NOUN"]), 30 | ] 31 | 32 | POS_TESTS_JA_GINZA_ELECTRA = [ 33 | ("銀座でランチをご一緒しましょう。", ["PROPN", "ADP", "NOUN", "ADP", "NOUN", "VERB", "AUX", "AUX", "PUNCT"]), 34 | ("すもももももももものうち", ["NOUN", "ADP", "NOUN", "ADP", "NOUN", "ADP", "NOUN"]), 35 | ] 36 | 37 | LEMMATIZE_TESTS = [ 38 | ("新しく", "新しい"), 39 | ("いただきました", "いただく"), 40 | ("なった", "なる"), 41 | ] 42 | 43 | NORMALIZE_TESTS = [ 44 | ("かつ丼", "カツ丼"), 45 | ("附属", "付属"), 46 | ("SUMMER", "サマー"), 47 | ("シュミレーション", "シミュレーション"), 48 | ] 49 | 50 | EMPTYISH_TESTS = [ 51 | ("", 0), 52 | (" ", 1), 53 | ("\n\n\t\t\n\n", 1), 54 | ("\r\n\r\n", 1), 55 | ("\n \n\n", 5), 56 | ] 57 | 58 | NAUGHTY_STRINGS = [ 59 | # ASCII punctuation 60 | r",./;'[]\-=", 61 | r'<>?:"{}|_+', 62 | r'!@#$%^&*()`~"', 63 | # Unicode additional control characters, byte order marks 64 | r"­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪", 65 | r"￾", 66 | # Unicode Symbols 67 | r"Ω≈ç√∫˜µ≤≥÷", 68 | r"åß∂ƒ©˙∆˚¬…æ", 69 | "œ∑´®†¥¨ˆøπ“‘", 70 | r"¡™£¢∞§¶•ªº–≠", 71 | r"¸˛Ç◊ı˜Â¯˘¿", 72 | r"ÅÍÎÏ˝ÓÔÒÚÆ☃", 73 | r"Œ„´‰ˇÁ¨ˆØ∏”’", 74 | r"`⁄€‹›fifl‡°·‚—±", 75 | r"⅛⅜⅝⅞", 76 | r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", 77 | r"٠١٢٣٤٥٦٧٨٩", 78 | # Unicode Subscript/Superscript/Accents 79 | r"⁰⁴⁵", 80 | r"₀₁₂", 81 | r"⁰⁴⁵₀₁₂", 82 | r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็", 83 | r" ̄ ̄", 84 | # Two-Byte Characters 85 | r"田中さんにあげて下さい", 86 | r"パーティーへ行かないか", 87 | r"和製漢語", 88 | r"部落格", 89 | r"사회과학원 어학연구소", 90 | r"찦차를 타고 온 펲시맨과 쑛다리 똠방각하", 91 | r"社會科學院語學研究所", 92 | r"울란바토르", 93 | r"𠜎𠜱𠝹𠱓𠱸𠲖𠳏", 94 | # Japanese Emoticons 95 | r"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ", 96 | r"(。◕ ∀ ◕。)", 97 | r"`ィ(´∀`∩", 98 | r"__ロ(,_,*)", 99 | r"・( ̄∀ ̄)・:*:", 100 | r"゚・✿ヾ╲(。◕‿◕。)╱✿・゚", 101 | r",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’", 102 | r"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻", 103 | r"┬─┬ノ( º _ ºノ)", 104 | r"( ͡° ͜ʖ ͡°)", 105 | # Emoji 106 | r"😍", 107 | r"👩🏽", 108 | r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", 109 | r"🐵 🙈 🙉 🙊", 110 | r"❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙", 111 | r"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿", 112 | r"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧", 113 | r"0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟", 114 | # Regional Indicator Symbols 115 | r"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸", 116 | r"🇺🇸🇷🇺🇸🇦🇫🇦🇲", 117 | r"🇺🇸🇷🇺🇸🇦", 118 
| # Unicode Numbers 119 | r"123", 120 | r"١٢٣", 121 | # Right-To-Left Strings 122 | r"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.", 123 | r"إيو.", 124 | r"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ", 125 | r"הָיְתָהtestالصفحات التّحول", 126 | r"﷽", 127 | r"ﷺ", 128 | r"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،", 129 | # Trick Unicode 130 | r"‪‪test‪", 131 | r"‫test", 132 | r"
test
", 133 | r"test⁠test", 134 | r"⁦test⁧", 135 | # Zalgo Text 136 | r"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣", 137 | r"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰", 138 | r"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", 139 | r"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕", 140 | r"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮", 141 | # Unicode Upsidedown 142 | r"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥", 143 | r"00˙Ɩ$-", 144 | # Unicode font 145 | r"The quick brown fox jumps over the lazy dog", 146 | r"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠", 147 | r"𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌", 148 | r"𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈", 149 | r"𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰", 150 | r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘", 151 | r"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐", 152 | r"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢", 153 | # File paths 154 | r"../../../../../../../../../../../etc/passwd%00", 155 | r"../../../../../../../../../../../etc/hosts", 156 | # iOS Vulnerabilities 157 | r"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗", 158 | r"🏳0🌈️", 159 | ] 160 | 161 | 162 | @pytest.fixture(scope="module") 163 | def nlp(request): 164 | return spacy.load(request.param) 165 | 166 | 167 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 168 | @pytest.mark.parametrize("text, expected_tokens", TOKENIZER_TESTS) 169 | def test_tokenize(nlp, text, expected_tokens): 170 | tokens = [token.text for token in nlp(text)] 171 | assert tokens == expected_tokens 172 | 173 | 174 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 175 | @pytest.mark.parametrize("text, len_a, len_b, len_c", COMPOUND_SPLITER_TESTS) 176 | def test_compound_spliter(nlp, text, len_a, len_b, len_c): 177 | assert len(nlp(text)) == len_c 178 | for split_mode, l in zip(["A", "B", "C"], [len_a, len_b, len_c]): 179 | set_split_mode(nlp, split_mode) 180 | assert len(nlp(text)) == l 181 | 182 | 183 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 184 | @pytest.mark.parametrize("text, expected_tags", TAG_TESTS) 185 | def test_tag(nlp, text, expected_tags): 186 | tags = [token.tag_ for token in nlp(text)] 187 | assert tags == expected_tags 188 | 189 | 190 | @pytest.mark.parametrize("nlp", ["ja_ginza"], indirect=True) 191 | @pytest.mark.parametrize("text, expected_poss", POS_TESTS_JA_GINZA) 192 | def test_pos_ja_ginza(nlp, text, expected_poss): 193 | poss = [token.pos_ for token in nlp(text)] 194 | assert poss == expected_poss 195 | 196 | 197 | @pytest.mark.parametrize("nlp", ["ja_ginza_electra"], indirect=True) 198 | @pytest.mark.parametrize("text, expected_poss", POS_TESTS_JA_GINZA_ELECTRA) 199 | def test_pos_ja_ginza_electra(nlp, text, expected_poss): 200 | poss = [token.pos_ for token in nlp(text)] 201 | assert poss == expected_poss 202 | 203 | 204 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 205 | @pytest.mark.parametrize("text, lemma", LEMMATIZE_TESTS) 206 | def 
test_lemmatize(nlp, text, lemma): 207 | doc = nlp(text) 208 | assert lemma == doc[0].lemma_ 209 | 210 | 211 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 212 | @pytest.mark.parametrize("text, norm", NORMALIZE_TESTS) 213 | def test_normalize(nlp, text, norm): 214 | doc = nlp(text) 215 | assert norm == doc[0].norm_ 216 | 217 | 218 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 219 | @pytest.mark.parametrize("text, expected_len", EMPTYISH_TESTS) 220 | def test_emptyish_texts(nlp, text, expected_len): 221 | doc = nlp(text) 222 | assert len(doc) == expected_len 223 | 224 | 225 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 226 | @pytest.mark.parametrize("text", NAUGHTY_STRINGS) 227 | def test_naughty_strings(nlp, text): 228 | doc = nlp(text) 229 | assert doc.text_with_ws == text 230 | -------------------------------------------------------------------------------- /ginza_util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/megagonlabs/ginza/f67b4987af09bad939d75c89b4e9483b90c064ee/ginza_util/__init__.py -------------------------------------------------------------------------------- /ginza_util/browse_trees.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import plac 3 | import sys 4 | import threading 5 | import time 6 | import webbrowser 7 | import spacy 8 | from spacy import displacy 9 | from spacy.gold import GoldCorpus 10 | from ginza import * 11 | 12 | 13 | @plac.annotations( 14 | model_path=("model directory path", "option", "b", str), 15 | split_mode=("split mode", "option", "s", str, ["A", "B", "C", None]), 16 | style=("displacy style (default=dep)", "option", "d", str), 17 | compact=("compact", "flag", "c"), 18 | browser_command=("web browser command", "option", "w", str), 19 | ) 20 | def main( 21 | model_path=None, 22 | split_mode=None, 23 | style='dep', 24 | compact=False, 25 | browser_command=None, 26 | ): 27 | if model_path: 28 | nlp = spacy.load(model_path) 29 | else: 30 | nlp = spacy.load("ja_ginza") 31 | 32 | if split_mode: 33 | set_split_mode(nlp, split_mode) 34 | 35 | if browser_command: 36 | browser = webbrowser.get(browser_command) 37 | else: 38 | browser = None 39 | 40 | print("Input a sentence line:", file=sys.stderr) 41 | line = input() 42 | docs = [nlp(line)] 43 | 44 | display(browser, docs, style, compact) 45 | 46 | 47 | def display(browser, docs, style='dep', compact=False, url='http://localhost:5000'): 48 | if browser: 49 | thread = threading.Thread(target=open_browser, args=[browser, url]) 50 | thread.start() 51 | else: 52 | print('open following url by web browser', file=sys.stderr) 53 | print(url, file=sys.stderr) 54 | displacy.serve(docs, style, options={'compact': compact, 'collapse_punct': False}) 55 | 56 | 57 | def open_browser(browser, url, wait=0.5): 58 | if wait: 59 | time.sleep(wait) 60 | browser.open(url) 61 | 62 | 63 | if __name__ == '__main__': 64 | plac.call(main) 65 | -------------------------------------------------------------------------------- /ginza_util/conv_connlu_to_json.rea.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | corpus_dir=$1 6 | corpus_title=$2 7 | 8 | for s in dev test train; do 9 | for n in 1 10 -15; do 10 | 11 | if ((n == -15)); then 12 | file_n_sents=random_sents 13 | else 14 | file_n_sents=$n 15 | fi 16 | 17 | python ginza_util/conllu_to_json.py -n $n -r C -e -a 
$corpus_dir/$corpus_title-$s.ne.conllu > $corpus_dir/$corpus_title-$s.ne.rea.$file_n_sents.json 18 | 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /ginza_util/evaluate_conllu.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals, print_function 3 | 4 | import json 5 | import sys 6 | 7 | import spacy 8 | 9 | 10 | USAGE = ''' 11 | Usage: python evaluate_model.py spacy_model_path json_file1 [json_file2 ...] 12 | ''' 13 | 14 | 15 | def evaluate_from_file( 16 | conllu_file, 17 | json_files, 18 | ): 19 | gold = [] 20 | for file in json_files: 21 | with open(file, 'r', encoding="utf-8") as f: 22 | for doc in json.load(f): 23 | for paragraph in doc['paragraphs']: 24 | for sentence in paragraph['sentences']: 25 | tokens = sentence['tokens'] 26 | gold.append(tokens) 27 | 28 | system = load_conllu(conllu_file) 29 | 30 | return evaluate( 31 | gold, 32 | system, 33 | sys.stdout, 34 | ) 35 | 36 | 37 | class TokenFake: 38 | pass 39 | 40 | def load_conllu(conllu_file): 41 | with open(conllu_file, "r", encoding="utf8") as fin: 42 | sentences = {} 43 | sentence = None 44 | tokens = [] 45 | idx = 0 46 | for line in fin: 47 | line = line.rstrip() 48 | if not line: 49 | assert sentence and tokens 50 | for t in tokens: 51 | if t.dep_ == "root": 52 | t.dep_ = "ROOT" 53 | t.head = t 54 | else: 55 | t.head = tokens[t.head] 56 | sentences[sentence] = tokens 57 | sentence = None 58 | tokens = [] 59 | idx = 0 60 | continue 61 | if line.startswith("# text = "): 62 | assert not sentence and not tokens 63 | sentence = line[9:] 64 | continue 65 | r = line.split("\t") 66 | t = TokenFake() 67 | t.i = int(r[0]) - 1 68 | t.orth_, t.lemma_, t.pos_, t.tag_ = r[1:5] 69 | t.head = int(r[6]) - 1 70 | t.dep_ = r[7] 71 | t.ent_type_ = None 72 | t.ent_iob_ = "O" 73 | if "NE=" in r[9]: 74 | t.ent_iob_, t.ent_type_ = r[9].split("NE=")[1].split("|")[0].split("-") 75 | print(sentence, t.orth_) 76 | t.idx = sentence.index(t.orth_, idx) 77 | idx = t.idx + len(t.orth_) 78 | tokens.append(t) 79 | assert not sentence and not tokens 80 | return sentences 81 | 82 | 83 | def evaluate( 84 | gold_corpus, 85 | system, 86 | fout=sys.stdout, 87 | morph_custom_condition=lambda g, r: g['pos'] == r.pos_ if g['tag'].find('可能') >= 0 else None, 88 | ): 89 | stats = Stats() 90 | 91 | print('Evaluate {} sentences'.format(len(gold_corpus)), file=sys.stderr, flush=True) 92 | for i, gold_tokens in enumerate(gold_corpus): 93 | if i % 100 == 0: 94 | print('.', end='', file=sys.stderr, flush=True) 95 | 96 | offset = 0 97 | sentence = '' 98 | for idx, t in enumerate(gold_tokens): 99 | t['head'] = gold_tokens[idx + t['head']] 100 | t['offset'] = offset 101 | offset += len(t['orth']) 102 | t['end'] = offset 103 | sentence += t['orth'] 104 | if 'whitespace' in t and t['whitespace']: 105 | offset += 1 106 | sentence += ' ' 107 | try: 108 | doc = system[sentence] 109 | stats.evaluate(gold_tokens, doc, morph_custom_condition) 110 | except Exception as e: 111 | print("Evaluation error:", sentence, file=sys.stderr) 112 | raise e 113 | print(file=sys.stderr, flush=True) 114 | 115 | stats.print(fout) 116 | 117 | return stats 118 | 119 | 120 | COMMON_FORMAT = "LAS={:.4f},UAS={:.4f},LAS_POS={:.4f},UAS_POS={:.4f},POS={:.4f},TAG={:.4f},boundary={:.4f}" 121 | 122 | 123 | class Stats: 124 | def __init__(self): 125 | self.sentences = 0 126 | self.gold_tokens = 0 127 | self.result_tokens = 0 128 | self.custom_tokens = 0 129 | 
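        # The counters below follow the metrics reported by print(): "correct"
        # counts cover token boundaries, TAG/POS matches, and unlabeled (UAS) and
        # labeled (LAS) attachments, both per token and per whole sentence, plus
        # confusion matrices and NE span/label counts. print() derives recall as
        # correct/gold_tokens, precision as correct/result_tokens, and
        # F1 = 2*precision*recall / (precision + recall).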
self.correct_tokens = 0 130 | self.correct_tag_tokens = 0 131 | self.correct_pos_tokens = 0 132 | self.correct_pos_uas_tokens = 0 133 | self.correct_pos_las_tokens = 0 134 | self.correct_uas_tokens = 0 135 | self.correct_las_tokens = 0 136 | self.correct_custom_tokens = 0 137 | self.correct_sentences = 0 138 | self.correct_tag_sentences = 0 139 | self.correct_pos_sentences = 0 140 | self.correct_pos_uas_sentences = 0 141 | self.correct_pos_las_sentences = 0 142 | self.correct_uas_sentences = 0 143 | self.correct_las_sentences = 0 144 | self.correct_roots = 0 145 | self.dep_confusion = {} 146 | self.pos_confusion = {} 147 | 148 | self.gold_ents = 0 149 | self.result_ents = 0 150 | self.correct_ent_spans = 0 151 | self.correct_ent_labels = 0 152 | self.ent_confusion = {} 153 | 154 | def score(self): 155 | return sum([ 156 | self.correct_tokens, 157 | self.correct_pos_tokens, 158 | self.correct_pos_uas_tokens, 159 | self.correct_pos_las_tokens, 160 | self.correct_uas_tokens, 161 | self.correct_las_tokens, 162 | self.correct_custom_tokens, 163 | self.correct_sentences, 164 | self.correct_pos_sentences, 165 | self.correct_pos_uas_sentences, 166 | self.correct_pos_las_sentences, 167 | self.correct_uas_sentences, 168 | self.correct_las_sentences, 169 | self.correct_roots, 170 | ]) 171 | 172 | def print(self, file=sys.stdout): 173 | def f1(p, r): 174 | if p + r == 0.0: 175 | return 0.0 176 | else: 177 | return 2 * p * r / (p + r) 178 | 179 | for title, matrix in ( 180 | ('pos_confusion', self.pos_confusion), 181 | ('dep_confusion', self.dep_confusion), 182 | ('ent_confusion', self.ent_confusion), 183 | ): 184 | print(' {}'.format(title), file=file) 185 | max_label_len = str(max(len(g) for g in matrix.keys())) 186 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 187 | results = matrix[gold] 188 | print((' {:<' + max_label_len + '}({:>6}): {}').format(gold, sum(results.values()), ', '.join([ 189 | '{}={}'.format(pos, num) for pos, num in sorted(results.items(), key=lambda t:-t[1]) 190 | ])), file=file) 191 | print(' precision, recall, f1', file=file) 192 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 193 | results = matrix[gold] 194 | total = sum(results.values()) 195 | correct = results.get(gold, results.get(gold.upper(), 0)) 196 | output = sum(sum(v for k, v in r.items() if k.lower() == gold.lower()) for r in matrix.values()) 197 | p = correct / output if output else 0 198 | r = correct / total if total else 0 199 | f = p * r * 2 / (p + r) if p and r else 0 200 | print((' {:<' + max_label_len + '}: {:.3f}, {:.3f}, {:.3f}').format(gold, p, r, f), file=file) 201 | 202 | print("sentence={}, gold_token={}, result_token={}, custom_cond={:.4f}({}/{})".format( 203 | self.sentences, 204 | self.gold_tokens, 205 | self.result_tokens, 206 | (self.correct_custom_tokens / self.custom_tokens) if self.custom_tokens > 0 else 0, 207 | self.correct_custom_tokens, 208 | self.custom_tokens, 209 | ), file=file) 210 | print((" token_f1:" + COMMON_FORMAT).format( 211 | f1(self.correct_las_tokens / self.gold_tokens, self.correct_las_tokens / self.result_tokens), 212 | f1(self.correct_uas_tokens / self.gold_tokens, self.correct_uas_tokens / self.result_tokens), 213 | f1(self.correct_pos_las_tokens / self.gold_tokens, self.correct_pos_las_tokens / self.result_tokens), 214 | f1(self.correct_pos_uas_tokens / self.gold_tokens, self.correct_pos_uas_tokens / self.result_tokens), 215 | f1(self.correct_pos_tokens / self.gold_tokens, self.correct_pos_tokens / self.result_tokens), 216 | 
f1(self.correct_tag_tokens / self.gold_tokens, self.correct_tag_tokens / self.result_tokens), 217 | f1(self.correct_tokens / self.gold_tokens, self.correct_tokens / self.result_tokens), 218 | ), file=file) 219 | print((" token_recall:" + COMMON_FORMAT).format( 220 | self.correct_las_tokens / self.gold_tokens, 221 | self.correct_uas_tokens / self.gold_tokens, 222 | self.correct_pos_las_tokens / self.gold_tokens, 223 | self.correct_pos_uas_tokens / self.gold_tokens, 224 | self.correct_pos_tokens / self.gold_tokens, 225 | self.correct_tag_tokens / self.gold_tokens, 226 | self.correct_tokens / self.gold_tokens, 227 | ), file=file) 228 | print((" token_precision:" + COMMON_FORMAT).format( 229 | self.correct_las_tokens / self.result_tokens, 230 | self.correct_uas_tokens / self.result_tokens, 231 | self.correct_pos_las_tokens / self.result_tokens, 232 | self.correct_pos_uas_tokens / self.result_tokens, 233 | self.correct_pos_tokens / self.result_tokens, 234 | self.correct_tag_tokens / self.result_tokens, 235 | self.correct_tokens / self.result_tokens, 236 | ), file=file) 237 | print((" whole_sentence:" + COMMON_FORMAT + ",root={:.4f}").format( 238 | self.correct_las_sentences / self.sentences, 239 | self.correct_uas_sentences / self.sentences, 240 | self.correct_pos_las_sentences / self.sentences, 241 | self.correct_pos_uas_sentences / self.sentences, 242 | self.correct_pos_sentences / self.sentences, 243 | self.correct_tag_sentences / self.sentences, 244 | self.correct_sentences / self.sentences, 245 | self.correct_roots / self.sentences, 246 | ), file=file) 247 | print("ent_gold={}, ent_result={}".format( 248 | self.gold_ents, 249 | self.result_ents, 250 | ), file=file) 251 | if self.gold_ents and self.result_ents: 252 | print(" ent_f1:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 253 | f1(self.correct_ent_labels / self.gold_ents, self.correct_ent_labels / self.result_ents), 254 | f1(self.correct_ent_spans / self.gold_ents, self.correct_ent_spans / self.result_ents), 255 | ), file=file) 256 | print(" ent_recall:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 257 | self.correct_ent_labels / self.gold_ents, 258 | self.correct_ent_spans / self.gold_ents, 259 | ), file=file) 260 | print(" ent_precision:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 261 | self.correct_ent_labels / self.result_ents, 262 | self.correct_ent_spans / self.result_ents, 263 | ), file=file) 264 | file.flush() 265 | 266 | def evaluate(self, gold, doc, morph_custom_condition, debug=False): 267 | def count(matrix, l1, l2): 268 | if l1 not in matrix: 269 | matrix[l1] = {} 270 | m2 = matrix[l1] 271 | if l2 in m2: 272 | m2[l2] += 1 273 | else: 274 | m2[l2] = 1 275 | 276 | self.sentences += 1 277 | self.gold_tokens += len(gold) 278 | self.result_tokens += len(doc) 279 | 280 | correct_tokens = 0 281 | correct_tag_tokens = 0 282 | correct_pos_tokens = 0 283 | correct_uas_tokens = 0 284 | correct_las_tokens = 0 285 | correct_pos_uas_tokens = 0 286 | correct_pos_las_tokens = 0 287 | custom_tokens = 0 288 | correct_custom_tokens = 0 289 | index_g = 0 290 | index_r = 0 291 | last_match_g = 0 292 | last_match_r = 0 293 | while index_g < len(gold) and index_r < len(doc): 294 | g = gold[index_g] 295 | g_end = g['end'] 296 | r = doc[index_r] 297 | r_end = r.idx + len(r.orth_) 298 | if g['offset'] == r.idx: 299 | if g_end == r_end: 300 | correct_tokens += 1 301 | count(self.pos_confusion, g['pos'], r.pos_) 302 | if g['tag'] == r.tag_: 303 | correct_tag_tokens += 1 304 | if g['pos'] == r.pos_: 305 | correct_pos_tokens += 1 306 | if is_correct_dep(g, 
r): 307 | correct_uas_tokens += 1 308 | count(self.dep_confusion, g['dep'].lower(), r.dep_) 309 | if g['pos'] == r.pos_: 310 | correct_pos_uas_tokens += 1 311 | if g['dep'].lower() == r.dep_.lower(): 312 | correct_las_tokens += 1 313 | if g['pos'] == r.pos_: 314 | correct_pos_las_tokens += 1 315 | else: 316 | count(self.dep_confusion, g['dep'].lower(), '_') 317 | if g['dep'].lower() == 'root' and r.dep_.lower() == 'root': 318 | self.correct_roots += 1 319 | elif g_end < r_end: 320 | count(self.pos_confusion, g['pos'], '_') 321 | count(self.dep_confusion, g['dep'].lower(), '_') 322 | elif g_end < r_end: 323 | count(self.pos_confusion, g['pos'], '_') 324 | count(self.dep_confusion, g['dep'].lower(), '_') 325 | 326 | if debug: 327 | if g_end == r_end: 328 | print('{}\t{}\t{}'.format( 329 | '=' if index_g == last_match_g and index_r == last_match_r else 330 | '>' if index_g == last_match_g else 331 | '<' if index_r == last_match_r else 332 | '!', 333 | ','.join(['-'.join(( 334 | m['orth'], m['pos'], m['dep'], str(m['head']['offset']), str(m['head']['end']) 335 | )) for m in gold[last_match_g:index_g + 1]]), 336 | ','.join(['-'.join(( 337 | m.orth_, m.pos_, m.dep_, str(m.head.idx), str(m.head.idx + len(m.head.orth_)) 338 | )) for m in doc[last_match_r:index_r + 1]]), 339 | )) 340 | last_match_g = index_g + 1 341 | last_match_r = index_r + 1 342 | if g_end <= r_end: 343 | index_g += 1 344 | if g_end >= r_end: 345 | index_r += 1 346 | 347 | tokens = len(gold) 348 | self.correct_tokens += correct_tokens 349 | if correct_tokens == tokens: 350 | self.correct_sentences += 1 351 | self.correct_tag_tokens += correct_tag_tokens 352 | if correct_tag_tokens == tokens: 353 | self.correct_tag_sentences += 1 354 | self.correct_pos_tokens += correct_pos_tokens 355 | if correct_pos_tokens == tokens: 356 | self.correct_pos_sentences += 1 357 | self.correct_uas_tokens += correct_uas_tokens 358 | if correct_uas_tokens == tokens: 359 | self.correct_uas_sentences += 1 360 | self.correct_las_tokens += correct_las_tokens 361 | if correct_las_tokens == tokens: 362 | self.correct_las_sentences += 1 363 | self.correct_pos_uas_tokens += correct_pos_uas_tokens 364 | if correct_pos_uas_tokens == tokens: 365 | self.correct_pos_uas_sentences += 1 366 | self.correct_pos_las_tokens += correct_pos_las_tokens 367 | if correct_pos_las_tokens == tokens: 368 | self.correct_pos_las_sentences += 1 369 | 370 | result_borders = {r.idx: (len(r.orth_), r) for r in doc} 371 | for g in gold: 372 | length, r = result_borders.get(g['offset'], (0, None)) 373 | if length == len(g['orth']): 374 | custom = morph_custom_condition(g, r) 375 | if custom is not None: 376 | custom_tokens += 1 377 | if custom: 378 | correct_custom_tokens += 1 379 | # else: 380 | # print(custom, g.surface, r.surface, g.pos, r.pos, g.tag, r.tag) 381 | self.custom_tokens += custom_tokens 382 | self.correct_custom_tokens += correct_custom_tokens 383 | 384 | gold_ents = {} 385 | ent_label = None 386 | ent_begin = None 387 | for g in gold: 388 | ner = g['ner'] if 'ner' in g else '-' 389 | if ner.startswith('B-'): 390 | ent_label = ner[2:] 391 | ent_begin = g['offset'] 392 | elif ner.startswith('L-'): 393 | gold_ents[(ent_begin, g['end'])] = ent_label 394 | ent_label = None 395 | ent_begin = None 396 | elif ner.startswith('U-'): 397 | gold_ents[(g['offset'], g['end'])] = ner[2:] 398 | result_ents = {} 399 | ent_label = None 400 | ent_begin = None 401 | ent_end = None 402 | for r in doc: 403 | if ent_label and r.ent_iob_ != 'I': 404 | result_ents[(ent_begin, ent_end)] = 
ent_label 405 | ent_label = None 406 | ent_begin = None 407 | ent_end = None 408 | if r.ent_iob_ == 'B': 409 | ent_label = r.ent_type_ 410 | ent_begin = r.idx 411 | ent_end = r.idx + len(r.orth_) 412 | elif r.ent_iob_ == 'I': 413 | ent_end = r.idx + len(r.orth_) 414 | if ent_label: 415 | result_ents[(ent_begin, ent_end)] = ent_label 416 | 417 | self.gold_ents += len(gold_ents) 418 | self.result_ents += len(result_ents) 419 | for k, gold_label in gold_ents.items(): 420 | if k in result_ents: 421 | self.correct_ent_spans += 1 422 | result_label = result_ents[k] 423 | count(self.ent_confusion, gold_label, result_label) 424 | if gold_label == result_label: 425 | self.correct_ent_labels += 1 426 | else: 427 | count(self.ent_confusion, gold_label, '_') 428 | 429 | 430 | def is_correct_dep(g, r): 431 | return g['head']['offset'] <= r.head.idx and g['head']['end'] >= r.head.idx + len(r.head.orth_) or \ 432 | g['head']['offset'] >= r.head.idx and g['head']['end'] <= r.head.idx + len(r.head.orth_) 433 | 434 | 435 | if __name__ == '__main__': 436 | if len(sys.argv) < 3: 437 | print(USAGE, file=sys.stderr) 438 | exit(2) 439 | evaluate_from_file(sys.argv[1], sys.argv[2:]) 440 | -------------------------------------------------------------------------------- /ginza_util/evaluate_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals, print_function 3 | 4 | import json 5 | import sys 6 | 7 | import spacy 8 | 9 | 10 | USAGE = ''' 11 | Usage: python evaluate_model.py spacy_model_path json_file1 [json_file2 ...] 12 | ''' 13 | 14 | 15 | def evaluate_from_file( 16 | model_path, 17 | json_files, 18 | ): 19 | gold = [] 20 | for file in json_files: 21 | with open(file, 'r', encoding="utf-8") as f: 22 | for doc in json.load(f): 23 | for paragraph in doc['paragraphs']: 24 | for sentence in paragraph['sentences']: 25 | tokens = sentence['tokens'] 26 | gold.append(tokens) 27 | 28 | nlp = spacy.load(model_path) 29 | nlp.tokenizer.use_sentence_separator = False 30 | 31 | return evaluate( 32 | gold, 33 | nlp, 34 | sys.stdout, 35 | ) 36 | 37 | 38 | def evaluate( 39 | gold_corpus, 40 | nlp, 41 | fout=sys.stdout, 42 | morph_custom_condition=lambda g, r: g['pos'] == r.pos_ if g['tag'].find('可能') >= 0 else None, 43 | ): 44 | stats = Stats() 45 | 46 | print('Evaluate {} sentences'.format(len(gold_corpus)), file=sys.stderr, flush=True) 47 | for i, gold_tokens in enumerate(gold_corpus): 48 | if i % 100 == 0: 49 | print('.', end='', file=sys.stderr, flush=True) 50 | 51 | offset = 0 52 | sentence = '' 53 | for idx, t in enumerate(gold_tokens): 54 | t['head'] = gold_tokens[idx + t['head']] 55 | t['offset'] = offset 56 | offset += len(t['orth']) 57 | t['end'] = offset 58 | sentence += t['orth'] 59 | if 'whitespace' in t and t['whitespace']: 60 | offset += 1 61 | sentence += ' ' 62 | try: 63 | doc = nlp(sentence) 64 | stats.evaluate(gold_tokens, doc, morph_custom_condition) 65 | except Exception as e: 66 | print("Evaluation error:", sentence, file=sys.stderr) 67 | raise e 68 | print(file=sys.stderr, flush=True) 69 | 70 | stats.print(fout) 71 | 72 | return stats 73 | 74 | 75 | COMMON_FORMAT = "LAS={:.4f},UAS={:.4f},LAS_POS={:.4f},UAS_POS={:.4f},POS={:.4f},TAG={:.4f},boundary={:.4f}" 76 | 77 | 78 | class Stats: 79 | def __init__(self): 80 | self.sentences = 0 81 | self.gold_tokens = 0 82 | self.result_tokens = 0 83 | self.custom_tokens = 0 84 | self.correct_tokens = 0 85 | self.correct_tag_tokens = 0 86 | self.correct_pos_tokens = 0 87 
| self.correct_pos_uas_tokens = 0 88 | self.correct_pos_las_tokens = 0 89 | self.correct_uas_tokens = 0 90 | self.correct_las_tokens = 0 91 | self.correct_custom_tokens = 0 92 | self.correct_sentences = 0 93 | self.correct_tag_sentences = 0 94 | self.correct_pos_sentences = 0 95 | self.correct_pos_uas_sentences = 0 96 | self.correct_pos_las_sentences = 0 97 | self.correct_uas_sentences = 0 98 | self.correct_las_sentences = 0 99 | self.correct_roots = 0 100 | self.dep_confusion = {} 101 | self.pos_confusion = {} 102 | 103 | self.gold_ents = 0 104 | self.result_ents = 0 105 | self.correct_ent_spans = 0 106 | self.correct_ent_labels = 0 107 | self.ent_confusion = {} 108 | 109 | def score(self): 110 | return sum([ 111 | self.correct_tokens, 112 | self.correct_pos_tokens, 113 | self.correct_pos_uas_tokens, 114 | self.correct_pos_las_tokens, 115 | self.correct_uas_tokens, 116 | self.correct_las_tokens, 117 | self.correct_custom_tokens, 118 | self.correct_sentences, 119 | self.correct_pos_sentences, 120 | self.correct_pos_uas_sentences, 121 | self.correct_pos_las_sentences, 122 | self.correct_uas_sentences, 123 | self.correct_las_sentences, 124 | self.correct_roots, 125 | ]) 126 | 127 | def print(self, file=sys.stdout): 128 | def f1(p, r): 129 | if p + r == 0.0: 130 | return 0.0 131 | else: 132 | return 2 * p * r / (p + r) 133 | 134 | for title, matrix in ( 135 | ('pos_confusion', self.pos_confusion), 136 | ('dep_confusion', self.dep_confusion), 137 | ('ent_confusion', self.ent_confusion), 138 | ): 139 | print(' {}'.format(title), file=file) 140 | max_label_len = str(max(len(g) for g in matrix.keys())) 141 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 142 | results = matrix[gold] 143 | print((' {:<' + max_label_len + '}({:>6}): {}').format(gold, sum(results.values()), ', '.join([ 144 | '{}={}'.format(pos, num) for pos, num in sorted(results.items(), key=lambda t:-t[1]) 145 | ])), file=file) 146 | print(' precision, recall, f1', file=file) 147 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 148 | results = matrix[gold] 149 | total = sum(results.values()) 150 | correct = results.get(gold, results.get(gold.upper(), 0)) 151 | output = sum(sum(v for k, v in r.items() if k.lower() == gold.lower()) for r in matrix.values()) 152 | p = correct / output if output else 0 153 | r = correct / total if total else 0 154 | f = p * r * 2 / (p + r) if p and r else 0 155 | print((' {:<' + max_label_len + '}: {:.3f}, {:.3f}, {:.3f}').format(gold, p, r, f), file=file) 156 | 157 | print("sentence={}, gold_token={}, result_token={}, custom_cond={:.4f}({}/{})".format( 158 | self.sentences, 159 | self.gold_tokens, 160 | self.result_tokens, 161 | (self.correct_custom_tokens / self.custom_tokens) if self.custom_tokens > 0 else 0, 162 | self.correct_custom_tokens, 163 | self.custom_tokens, 164 | ), file=file) 165 | print((" token_f1:" + COMMON_FORMAT).format( 166 | f1(self.correct_las_tokens / self.gold_tokens, self.correct_las_tokens / self.result_tokens), 167 | f1(self.correct_uas_tokens / self.gold_tokens, self.correct_uas_tokens / self.result_tokens), 168 | f1(self.correct_pos_las_tokens / self.gold_tokens, self.correct_pos_las_tokens / self.result_tokens), 169 | f1(self.correct_pos_uas_tokens / self.gold_tokens, self.correct_pos_uas_tokens / self.result_tokens), 170 | f1(self.correct_pos_tokens / self.gold_tokens, self.correct_pos_tokens / self.result_tokens), 171 | f1(self.correct_tag_tokens / self.gold_tokens, self.correct_tag_tokens / self.result_tokens), 172 | 
f1(self.correct_tokens / self.gold_tokens, self.correct_tokens / self.result_tokens), 173 | ), file=file) 174 | print((" token_recall:" + COMMON_FORMAT).format( 175 | self.correct_las_tokens / self.gold_tokens, 176 | self.correct_uas_tokens / self.gold_tokens, 177 | self.correct_pos_las_tokens / self.gold_tokens, 178 | self.correct_pos_uas_tokens / self.gold_tokens, 179 | self.correct_pos_tokens / self.gold_tokens, 180 | self.correct_tag_tokens / self.gold_tokens, 181 | self.correct_tokens / self.gold_tokens, 182 | ), file=file) 183 | print((" token_precision:" + COMMON_FORMAT).format( 184 | self.correct_las_tokens / self.result_tokens, 185 | self.correct_uas_tokens / self.result_tokens, 186 | self.correct_pos_las_tokens / self.result_tokens, 187 | self.correct_pos_uas_tokens / self.result_tokens, 188 | self.correct_pos_tokens / self.result_tokens, 189 | self.correct_tag_tokens / self.result_tokens, 190 | self.correct_tokens / self.result_tokens, 191 | ), file=file) 192 | print((" whole_sentence:" + COMMON_FORMAT + ",root={:.4f}").format( 193 | self.correct_las_sentences / self.sentences, 194 | self.correct_uas_sentences / self.sentences, 195 | self.correct_pos_las_sentences / self.sentences, 196 | self.correct_pos_uas_sentences / self.sentences, 197 | self.correct_pos_sentences / self.sentences, 198 | self.correct_tag_sentences / self.sentences, 199 | self.correct_sentences / self.sentences, 200 | self.correct_roots / self.sentences, 201 | ), file=file) 202 | print("ent_gold={}, ent_result={}".format( 203 | self.gold_ents, 204 | self.result_ents, 205 | ), file=file) 206 | if self.gold_ents and self.result_ents: 207 | print(" ent_f1:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 208 | f1(self.correct_ent_labels / self.gold_ents, self.correct_ent_labels / self.result_ents), 209 | f1(self.correct_ent_spans / self.gold_ents, self.correct_ent_spans / self.result_ents), 210 | ), file=file) 211 | print(" ent_recall:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 212 | self.correct_ent_labels / self.gold_ents, 213 | self.correct_ent_spans / self.gold_ents, 214 | ), file=file) 215 | print(" ent_precision:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 216 | self.correct_ent_labels / self.result_ents, 217 | self.correct_ent_spans / self.result_ents, 218 | ), file=file) 219 | file.flush() 220 | 221 | def evaluate(self, gold, doc, morph_custom_condition, debug=False): 222 | def count(matrix, l1, l2): 223 | if l1 not in matrix: 224 | matrix[l1] = {} 225 | m2 = matrix[l1] 226 | if l2 in m2: 227 | m2[l2] += 1 228 | else: 229 | m2[l2] = 1 230 | 231 | self.sentences += 1 232 | self.gold_tokens += len(gold) 233 | self.result_tokens += len(doc) 234 | 235 | correct_tokens = 0 236 | correct_tag_tokens = 0 237 | correct_pos_tokens = 0 238 | correct_uas_tokens = 0 239 | correct_las_tokens = 0 240 | correct_pos_uas_tokens = 0 241 | correct_pos_las_tokens = 0 242 | custom_tokens = 0 243 | correct_custom_tokens = 0 244 | index_g = 0 245 | index_r = 0 246 | last_match_g = 0 247 | last_match_r = 0 248 | while index_g < len(gold) and index_r < len(doc): 249 | g = gold[index_g] 250 | g_end = g['end'] 251 | r = doc[index_r] 252 | r_end = r.idx + len(r.orth_) 253 | if g['offset'] == r.idx: 254 | if g_end == r_end: 255 | correct_tokens += 1 256 | count(self.pos_confusion, g['pos'], r.pos_) 257 | if g['tag'] == r.tag_: 258 | correct_tag_tokens += 1 259 | if g['pos'] == r.pos_: 260 | correct_pos_tokens += 1 261 | if is_correct_dep(g, r): 262 | correct_uas_tokens += 1 263 | count(self.dep_confusion, g['dep'].lower(), r.dep_) 264 | if 
g['pos'] == r.pos_: 265 | correct_pos_uas_tokens += 1 266 | if g['dep'].lower() == r.dep_.lower(): 267 | correct_las_tokens += 1 268 | if g['pos'] == r.pos_: 269 | correct_pos_las_tokens += 1 270 | else: 271 | count(self.dep_confusion, g['dep'].lower(), '_') 272 | if g['dep'].lower() == 'root' and r.dep_.lower() == 'root': 273 | self.correct_roots += 1 274 | elif g_end < r_end: 275 | count(self.pos_confusion, g['pos'], '_') 276 | count(self.dep_confusion, g['dep'].lower(), '_') 277 | elif g_end < r_end: 278 | count(self.pos_confusion, g['pos'], '_') 279 | count(self.dep_confusion, g['dep'].lower(), '_') 280 | 281 | if debug: 282 | if g_end == r_end: 283 | print('{}\t{}\t{}'.format( 284 | '=' if index_g == last_match_g and index_r == last_match_r else 285 | '>' if index_g == last_match_g else 286 | '<' if index_r == last_match_r else 287 | '!', 288 | ','.join(['-'.join(( 289 | m['orth'], m['pos'], m['dep'], str(m['head']['offset']), str(m['head']['end']) 290 | )) for m in gold[last_match_g:index_g + 1]]), 291 | ','.join(['-'.join(( 292 | m.orth_, m.pos_, m.dep_, str(m.head.idx), str(m.head.idx + len(m.head.orth_)) 293 | )) for m in doc[last_match_r:index_r + 1]]), 294 | )) 295 | last_match_g = index_g + 1 296 | last_match_r = index_r + 1 297 | if g_end <= r_end: 298 | index_g += 1 299 | if g_end >= r_end: 300 | index_r += 1 301 | 302 | tokens = len(gold) 303 | self.correct_tokens += correct_tokens 304 | if correct_tokens == tokens: 305 | self.correct_sentences += 1 306 | self.correct_tag_tokens += correct_tag_tokens 307 | if correct_tag_tokens == tokens: 308 | self.correct_tag_sentences += 1 309 | self.correct_pos_tokens += correct_pos_tokens 310 | if correct_pos_tokens == tokens: 311 | self.correct_pos_sentences += 1 312 | self.correct_uas_tokens += correct_uas_tokens 313 | if correct_uas_tokens == tokens: 314 | self.correct_uas_sentences += 1 315 | self.correct_las_tokens += correct_las_tokens 316 | if correct_las_tokens == tokens: 317 | self.correct_las_sentences += 1 318 | self.correct_pos_uas_tokens += correct_pos_uas_tokens 319 | if correct_pos_uas_tokens == tokens: 320 | self.correct_pos_uas_sentences += 1 321 | self.correct_pos_las_tokens += correct_pos_las_tokens 322 | if correct_pos_las_tokens == tokens: 323 | self.correct_pos_las_sentences += 1 324 | 325 | result_borders = {r.idx: (len(r.orth_), r) for r in doc} 326 | for g in gold: 327 | length, r = result_borders.get(g['offset'], (0, None)) 328 | if length == len(g['orth']): 329 | custom = morph_custom_condition(g, r) 330 | if custom is not None: 331 | custom_tokens += 1 332 | if custom: 333 | correct_custom_tokens += 1 334 | # else: 335 | # print(custom, g.surface, r.surface, g.pos, r.pos, g.tag, r.tag) 336 | self.custom_tokens += custom_tokens 337 | self.correct_custom_tokens += correct_custom_tokens 338 | 339 | gold_ents = {} 340 | ent_label = None 341 | ent_begin = None 342 | for g in gold: 343 | ner = g['ner'] if 'ner' in g else '-' 344 | if ner.startswith('B-'): 345 | ent_label = ner[2:] 346 | ent_begin = g['offset'] 347 | elif ner.startswith('L-'): 348 | gold_ents[(ent_begin, g['end'])] = ent_label 349 | ent_label = None 350 | ent_begin = None 351 | elif ner.startswith('U-'): 352 | gold_ents[(g['offset'], g['end'])] = ner[2:] 353 | result_ents = {} 354 | ent_label = None 355 | ent_begin = None 356 | ent_end = None 357 | for r in doc: 358 | if ent_label and r.ent_iob_ != 'I': 359 | result_ents[(ent_begin, ent_end)] = ent_label 360 | ent_label = None 361 | ent_begin = None 362 | ent_end = None 363 | if r.ent_iob_ == 'B': 
364 | ent_label = r.ent_type_ 365 | ent_begin = r.idx 366 | ent_end = r.idx + len(r.orth_) 367 | elif r.ent_iob_ == 'I': 368 | ent_end = r.idx + len(r.orth_) 369 | if ent_label: 370 | result_ents[(ent_begin, ent_end)] = ent_label 371 | 372 | self.gold_ents += len(gold_ents) 373 | self.result_ents += len(result_ents) 374 | for k, gold_label in gold_ents.items(): 375 | if k in result_ents: 376 | self.correct_ent_spans += 1 377 | result_label = result_ents[k] 378 | count(self.ent_confusion, gold_label, result_label) 379 | if gold_label == result_label: 380 | self.correct_ent_labels += 1 381 | else: 382 | count(self.ent_confusion, gold_label, '_') 383 | 384 | 385 | def is_correct_dep(g, r): 386 | return g['head']['offset'] <= r.head.idx and g['head']['end'] >= r.head.idx + len(r.head.orth_) or \ 387 | g['head']['offset'] >= r.head.idx and g['head']['end'] <= r.head.idx + len(r.head.orth_) 388 | 389 | 390 | if __name__ == '__main__': 391 | if len(sys.argv) < 3: 392 | print(USAGE, file=sys.stderr) 393 | exit(2) 394 | evaluate_from_file(sys.argv[1], sys.argv[2:]) 395 | -------------------------------------------------------------------------------- /ginza_util/gsk2014a.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | from __future__ import unicode_literals, print_function 3 | 4 | import re 5 | import sys 6 | 7 | 8 | ID_PATTERN = re.compile( 9 | r'^.*((OC|OW|OY|PB|PM|PN)(..)_([0-9]{5}))(|\..+)$' 10 | ) 11 | 12 | 13 | def read_gsk2014a_xml(_path): 14 | in_text = False 15 | text = '' 16 | stack = [] 17 | tags = [] 18 | with open(_path, 'r') as xml: 19 | for line in xml: 20 | if line.startswith(''): 21 | in_text = True 22 | elif line.startswith(''): 23 | break 24 | elif in_text: 25 | prev = 0 26 | for m in re.finditer(r'<(/?[^>]+)>', line): 27 | text += line[prev:m.start(0)] 28 | prev = m.end(0) 29 | tag = m.group(1) 30 | offset = len(text) 31 | if tag.startswith('/'): 32 | begin_tag, begin = stack.pop() 33 | assert begin_tag == tag[1:], _path + ' ' + str(offset) + ' ' + begin_tag + ' ' + tag 34 | if not stack: 35 | tags.append((begin_tag, begin, offset)) 36 | elif tag.startswith('rejectedBlock'): 37 | return None, None 38 | else: 39 | stack.append((tag, offset)) 40 | text += line[prev:] 41 | assert not stack, _path + ' ' + str(stack) 42 | return text, tags 43 | 44 | 45 | def main(): 46 | output_base_path = sys.argv[1] 47 | for conllu_path in sys.argv[2:]: 48 | file_id = ID_PATTERN.match(conllu_path).group(1) 49 | text, tags = read_gsk2014a_xml('corpus/gsk-ene-19.6.25/bccwj/xml/{}/{}.xml'.format(file_id[:2], file_id)) 50 | tag_idx = 0 51 | in_tag = False 52 | offset = 0 53 | debug_sentence = '' 54 | output = [] 55 | with open(conllu_path, 'r') as fin: 56 | for line in fin: 57 | line = line.rstrip('\n') 58 | if not text: 59 | output.append((line, None)) 60 | continue 61 | if line.startswith('# text = '): 62 | in_tag = False 63 | debug_sentence = line 64 | output.append((line, None)) 65 | continue 66 | if line.startswith('#'): 67 | output.append((line, None)) 68 | continue 69 | if line == '': 70 | if in_tag: # for multi sentence NEs such as URLs 71 | in_tag = False 72 | l, n = output[-1] 73 | if n.startswith('B'): 74 | n = 'U-' + tag 75 | elif n.startswith('I'): 76 | n = 'L-' + tag 77 | output[-1] = (l, n) 78 | print( 79 | 'dividing ne span:', 80 | file_id, 81 | tag_begin, 82 | tag_end, 83 | tag, 84 | text[tag_begin:offset].replace('\n', '\\n'), 85 | '|', 86 | text[offset:tag_end].replace('\n', '\\n'), 87 | debug_sentence, 88 | 
file=sys.stderr 89 | ) 90 | output.append((line, None)) 91 | continue 92 | orth = line.split('\t')[1] 93 | new_offset = text.find(orth, offset) 94 | if new_offset == -1: 95 | new_offset = text.find(orth.replace(' ', ' '), offset) 96 | if new_offset == -1: 97 | if orth == 'ミュージカル': 98 | orth = 'ミュージ\nカル' 99 | elif orth == 'モテる': 100 | orth = 'モテ\nる' 101 | elif orth == 'すぎる': 102 | orth = 'す\n\nぎる' 103 | elif orth == 'いう': 104 | orth = 'い\n\nう' 105 | elif orth == '位置': 106 | orth = '位\n置' 107 | elif orth == '用いれ': 108 | orth = '用\n\nいれ' 109 | elif orth == '見込ま': 110 | orth = '見\n\n込ま' 111 | elif orth == 'なる': 112 | orth = 'な\n\nる' 113 | elif orth == '載せる': 114 | orth = '載せ\n\nる' 115 | new_offset = text.find(orth, offset) 116 | if new_offset - offset >= 2 and len(text[offset:new_offset].strip()) >= 2: 117 | if orth == '不能': 118 | orth = '不\n\n能' 119 | elif orth == '退職': 120 | orth = '退\n\n職' 121 | elif orth == 'から': 122 | orth = 'か\nら' 123 | elif orth == '思う': 124 | orth = '思\n\nう' 125 | elif orth == '中敷き': 126 | orth = '中敷\nき' 127 | new_offset = text.find(orth, offset) 128 | assert new_offset >= 0, 'lost token: {} {}\n{}\n{}\n{}'.format( 129 | file_id, 130 | offset, 131 | line, 132 | text[offset:].replace('\n', '\\n'), 133 | debug_sentence, 134 | ) 135 | if text[offset:new_offset].strip() != '': 136 | print( 137 | 'skipping text:', 138 | file_id, 139 | offset, 140 | new_offset, 141 | text[offset:new_offset].replace('\n', '\\n'), 142 | debug_sentence, 143 | file=sys.stderr 144 | ) 145 | offset = new_offset 146 | 147 | end = offset + len(orth) 148 | if 'SpaceAfter=No' not in line: 149 | end += 1 150 | 151 | if tag_idx < len(tags): 152 | tag, tag_begin, tag_end = tags[tag_idx] 153 | if end <= tag_begin: 154 | assert not in_tag, '{} {} {} {}\n{}\n{}\n{}'.format( 155 | file_id, 156 | offset, 157 | end, 158 | tag_begin, 159 | tag_end, 160 | line, 161 | text[offset], 162 | ) 163 | ner = 'O' 164 | elif offset < tag_end and not in_tag: 165 | if end < tag_end: 166 | ner = 'B-' + tag 167 | in_tag = True 168 | else: 169 | ner = 'U-' + tag 170 | tag_idx += 1 171 | elif end < tag_end: 172 | assert in_tag, '{} {} {} {}\n{}\n{}\n{}'.format( 173 | file_id, 174 | offset, 175 | end, 176 | tag_begin, 177 | tag_end, 178 | line, 179 | text[offset:], 180 | ) 181 | ner = 'I-' + tag 182 | elif tag_end <= end: 183 | if in_tag: 184 | ner = 'L-' + tag 185 | tag_idx += 1 186 | in_tag = False 187 | elif tag_begin < offset: 188 | ner = 'U-' + tag 189 | tag_idx += 1 190 | else: 191 | ner = 'O' 192 | tag_idx += 1 193 | print( 194 | 'skipping tag:', 195 | file_id, 196 | tag_begin, 197 | tag_end, 198 | tag, 199 | text[tag_begin:tag_end].replace('\n', '\\n'), 200 | debug_sentence, 201 | file=sys.stderr 202 | ) 203 | else: 204 | raise Exception("Unexpected state: token={} {}-{} {}, ne={}-{} {} {}".format( 205 | file_id, 206 | offset, 207 | end, 208 | text[offset:end].replace('\n', '\\n'), 209 | tag_begin, 210 | tag_end, 211 | text[tag_begin:tag_end].replace('\n', '\\n'), 212 | tag, 213 | )) 214 | else: 215 | ner = 'O' 216 | output.append((line, ner)) 217 | offset = end 218 | if tags and tag_idx < len(tags): 219 | for tag_idx in range(tag_idx, len(tags)): 220 | print( 221 | 'skipping tag:', 222 | file_id, 223 | tag_begin, 224 | tag_end, 225 | text[tag_begin:tag_end].replace('\n', '\\n'), 226 | '', 227 | file=sys.stderr 228 | ) 229 | prev_ner = 'O' 230 | for line, ner in output: 231 | if not ner: 232 | ner = 'O' 233 | assert prev_ner[0] not in ['B', 'I'] or ner[0] in ['I', 'L'], '{}\n{} {} {}\n{}'.format( 234 | '\n'.join([line + 
' ' + str(ner) for line, ner in output]), 235 | conllu_path, 236 | prev_ner, 237 | ner, 238 | line, 239 | ) 240 | assert prev_ner[0] not in ['L', 'U', 'O'] or ner[0] in ['B', 'U', 'O'], '{}\n{} {} {}\n{}'.format( 241 | '\n'.join([line + ' ' + str(ner) for line, ner in output]), 242 | conllu_path, 243 | prev_ner, 244 | ner, 245 | line, 246 | ) 247 | prev_ner = ner 248 | 249 | with open(output_base_path + '/' + conllu_path.split('/')[-1], 'w') as fout: 250 | for line, ner in output: 251 | if ner: 252 | if line.endswith('\t'): 253 | print(line + 'NE=' + ner, file=fout) 254 | elif line.endswith('_'): 255 | print(line[:-1] + '|NE=' + ner, file=fout) 256 | else: 257 | print(line + '|NE=' + ner, file=fout) 258 | else: 259 | print(line, file=fout) 260 | 261 | 262 | if __name__ == "__main__": 263 | # execute only if run as a script 264 | main() 265 | -------------------------------------------------------------------------------- /ginza_util/setup_meta.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | 5 | with open(sys.argv[1], "r") as fin: 6 | master = json.load(fin) 7 | 8 | with open(sys.argv[2], "r") as fin: 9 | target = json.load(fin) 10 | 11 | target.update(master) 12 | 13 | json.dump(target, sys.stdout, indent=1, ensure_ascii=False) 14 | 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy>=3.4.4,<4.0.0 2 | plac>=1.3.3 3 | SudachiPy>=0.6.2,<0.7.0 4 | SudachiDict-core>=20210802 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test = pytest 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | author="Megagon Labs, Tokyo.", 6 | author_email="ginza@megagon.ai", 7 | description="GiNZA, An Open Source Japanese NLP Library, based on Universal Dependencies", 8 | entry_points={ 9 | "spacy_factories": [ 10 | "bunsetu_recognizer = ginza:make_bunsetu_recognizer", 11 | "compound_splitter = ginza:make_compound_splitter", 12 | "disable_sentencizer = ginza:disable_sentencizer", 13 | ], 14 | "console_scripts": [ 15 | "ginza = ginza.command_line:main_ginza", 16 | "ginzame = ginza.command_line:main_ginzame", 17 | ], 18 | }, 19 | python_requires=">=3.8", 20 | install_requires=[ 21 | "spacy>=3.4.4,<4.0.0", 22 | "plac>=1.3.3", 23 | "SudachiPy>=0.6.2,<0.7.0", 24 | "SudachiDict-core>=20210802", 25 | ], 26 | setup_requires=["pytest-runner"], 27 | tests_require=["pytest", "pytest-cov", "pytest-mock"], 28 | license="MIT", 29 | name="ginza", 30 | packages=find_packages(include=["ginza"]), 31 | url="https://github.com/megagonlabs/ginza", 32 | version='5.2.0', 33 | ) 34 | --------------------------------------------------------------------------------
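Note: evaluate() in ginza_util/evaluate_model.py takes a gold corpus as a list of token lists, where each token carries orth/tag/pos/dep, a relative head offset, and an optional BILOU ner tag, while evaluate_from_file() builds that list from JSON files structured as paragraphs -> sentences -> tokens (see the USAGE string: python evaluate_model.py spacy_model_path json_file1 ...). The sketch below is not part of the repository; it calls evaluate() directly on a tiny in-memory corpus, assumes the repository root is on PYTHONPATH and that the ja_ginza model is installed, and uses tag, dep, and NER values that are illustrative only.

# minimal sketch, assuming ja_ginza is installed and ginza_util is importable
import spacy
from ginza_util.evaluate_model import evaluate

# one gold sentence: 銀座へ行く ("go to Ginza"); 'head' is a relative token offset,
# so 銀座 (+2) and へ (-1) both resolve to their syntactic heads by index arithmetic
gold_corpus = [[
    {"orth": "銀座", "tag": "名詞-固有名詞-地名-一般", "pos": "PROPN",
     "dep": "obl",  "head": 2,  "ner": "U-City"},
    {"orth": "へ",   "tag": "助詞-格助詞",           "pos": "ADP",
     "dep": "case", "head": -1, "ner": "O"},
    {"orth": "行く", "tag": "動詞-非自立可能",        "pos": "VERB",
     "dep": "ROOT", "head": 0,  "ner": "O"},
]]

nlp = spacy.load("ja_ginza")        # any installed GiNZA pipeline should work here
stats = evaluate(gold_corpus, nlp)  # prints confusion matrices plus token/sentence/NER scores to stdout
print(stats.score())                # aggregate correct-count sum, handy for comparing runs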