├── .github └── workflows │ ├── build-n-publish.yml │ └── pytest.yml ├── .gitignore ├── .gitmodules ├── CITATION ├── LICENSE ├── README.md ├── benchmark ├── benchmark.py ├── run_benchmark_ginza.sh ├── run_benchmark_spacy.sh └── setup_benchmark.sh ├── config ├── ja_ginza.analysis.cfg ├── ja_ginza.cfg ├── ja_ginza.meta.json ├── ja_ginza_bert_large.cfg ├── ja_ginza_bert_large.meta.json ├── ja_ginza_bert_large_analysis.cfg ├── ja_ginza_electra.analysis.cfg ├── ja_ginza_electra.cfg └── ja_ginza_electra.meta.json ├── docs ├── _config.yml ├── bunsetu_api.md ├── command_line_tool.md ├── developer_reference.md └── index.md ├── ginza ├── __init__.py ├── __main__.py ├── analyzer.py ├── bunsetu_recognizer.py ├── command_line.py ├── compound_splitter.py ├── disable_sentencizer.py ├── ene_ontonotes_mapper.py └── tests │ ├── conftest.py │ ├── test_analyzer.py │ ├── test_command_line.py │ └── test_models.py ├── ginza_util ├── __init__.py ├── browse_trees.py ├── conllu_to_json.py ├── conv_connlu_to_json.rea.sh ├── evaluate_conllu.py ├── evaluate_model.py ├── gsk2014a.py └── setup_meta.py ├── requirements.txt ├── setup.cfg └── setup.py /.github/workflows/build-n-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | jobs: 8 | build-n-publish: 9 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 10 | runs-on: ubuntu-18.04 11 | steps: 12 | - uses: actions/checkout@master 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.9 17 | - name: Install pypa/build 18 | run: >- 19 | python -m 20 | pip install 21 | build 22 | --user 23 | - name: Build a binary wheel and a source tarball 24 | run: >- 25 | python -m 26 | build 27 | --sdist 28 | --wheel 29 | --outdir dist/ 30 | . 31 | - name: Publish distribution 📦 to PyPI 32 | if: startsWith(github.ref, 'refs/tags') 33 | uses: pypa/gh-action-pypi-publish@master 34 | with: 35 | password: ${{ secrets.PYPI_API_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - develop 10 | 11 | jobs: 12 | pytest: 13 | name: Run tests with pytest 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: [3.7, 3.8] 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Upgrade pip 26 | run: >- 27 | python -m 28 | pip install -U pip 29 | - name: Install dependencies 30 | run: >- 31 | python -m 32 | pip install . 
pytest pytest-mock ja-ginza ja-ginza-electra 33 | - name: Run Tests 34 | run: pytest 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bccwj*/ 2 | /build/ 3 | /config/ja_gsd* 4 | /corpus*/ 5 | /dist/ 6 | /electra* 7 | /embedding*/ 8 | /ja_* 9 | /log* 10 | /megagonlabs/ 11 | /models/ 12 | /old/ 13 | /rtx* 14 | /submodules/ 15 | /sudachi* 16 | /target/ 17 | /test/ 18 | /vector* 19 | /venv* 20 | __pycache__/ 21 | *.pyc 22 | *.egg-info/ 23 | .DS_Store 24 | .eggs -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/megagonlabs/ginza/f67b4987af09bad939d75c89b4e9483b90c064ee/.gitmodules -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @ARTICLE{GiNZA NLP, 2 | AUTHOR = {Hiroshi, Mai and Masayuki}, 3 | TITLE = {短単位品詞の用法曖昧性解決と依存関係ラベリングの同時学習}, 4 | YEAR = {2019}, 5 | JOURNAL = {言語処理学会第25回年次大会}, 6 | URL = {http://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/F2-3.pdf} 7 | } 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Megagon Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import sys 4 | 5 | 6 | REPEAT = 5 7 | BATCH_SIZE = 128 8 | 9 | assert len(sys.argv) >= 2, "Usage: python {sys.argv[0]} [-g] model_name1 [model_name2 [...]]" 10 | if sys.argv[1] == "-g": 11 | require_gpu = True 12 | device = "GPU" 13 | model_names = sys.argv[2:] 14 | else: 15 | require_gpu = False 16 | device = "CPU" 17 | model_names = sys.argv[1:] 18 | 19 | sents = [_.rstrip("\n") for _ in sys.stdin] 20 | 21 | results = {} 22 | 23 | 24 | print("timestamp ", "[msec]", "device", 'procedure description', sep="\t", file=sys.stderr) 25 | start = datetime.now() 26 | prev = start 27 | print(start, 0, f"benchmark started with {len(sents)} sentences", sep="\t", file=sys.stderr) 28 | 29 | import spacy 30 | if require_gpu: 31 | spacy.require_gpu() 32 | lap = datetime.now() 33 | dur = int((lap - prev).total_seconds() * 1000) 34 | print(lap, dur, device, 'import spacy', sep="\t", file=sys.stderr) 35 | prev = lap 36 | 37 | for model_name in model_names: 38 | results = {} 39 | nlp = spacy.load(model_name) 40 | lap = datetime.now() 41 | dur = int((lap - prev).total_seconds() * 1000) 42 | results[f"spacy.load()"] = [dur] 43 | print(lap, dur, device, f"spacy.load({model_name})", sep="\t", file=sys.stderr) 44 | prev = lap 45 | 46 | results[f"nlp.pipe(batch={BATCH_SIZE})"] = [] 47 | for repeat in range(1, REPEAT + 1): 48 | for _ in range((len(sents) - 1) // BATCH_SIZE + 1): 49 | docs = nlp.pipe(sents[_ * BATCH_SIZE:(_ + 1) * BATCH_SIZE]) 50 | for doc in docs: 51 | len(doc) 52 | lap = datetime.now() 53 | dur = int((lap - prev).total_seconds() * 1000) 54 | results[f"nlp.pipe(batch={BATCH_SIZE})"].append(dur / len(sents)) 55 | print( 56 | lap, 57 | dur, 58 | device, 59 | f"#{repeat} {model_name}->nlp.pipe(batch={BATCH_SIZE}): {dur / len(sents):.03f}[msec/sent]", 60 | sep="\t", file=sys.stderr, 61 | ) 62 | prev = lap 63 | 64 | results[f"nlp(batch=1)"] = [] 65 | for repeat in range(1, REPEAT + 1): 66 | for sent in sents: 67 | doc = nlp(sent) 68 | len(doc) 69 | lap = datetime.now() 70 | dur = int((lap - prev).total_seconds() * 1000) 71 | results[f"nlp(batch=1)"].append(dur / len(sents)) 72 | print( 73 | lap, 74 | dur, 75 | device, 76 | f"#{repeat} {model_name}->nlp(batch=1): {dur / len(sents):.03f}[msec/sent]", 77 | sep="\t", file=sys.stderr, 78 | ) 79 | prev = lap 80 | 81 | dur = int((lap - start).total_seconds() * 1000) 82 | print(lap, dur, device, model_name, 'finished', sep="\t", file=sys.stderr) 83 | 84 | for k, v in results.items(): 85 | l = sorted(v) 86 | results[k] = l[len(l) // 2] 87 | 88 | json.dump( 89 | {"model": model_name, "device": device, "results": results}, 90 | sys.stdout, 91 | ensure_ascii=False, 92 | ) 93 | print() 94 | -------------------------------------------------------------------------------- /benchmark/run_benchmark_ginza.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cat gsd/dev.txt gsd/test.txt | python benchmark.py -g ja_ginza ja_ginza_electra 3 | cat gsd/dev.txt gsd/test.txt | python benchmark.py ja_ginza ja_ginza_electra 4 | -------------------------------------------------------------------------------- /benchmark/run_benchmark_spacy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cat gsd/dev.txt gsd/test.txt | python 
benchmark.py -g ja_core_news_md ja_core_news_trf 3 | cat gsd/dev.txt gsd/test.txt | python benchmark.py ja_core_news_md ja_core_news_trf 4 | -------------------------------------------------------------------------------- /benchmark/setup_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | mkdir -p gsd 4 | for t in train dev test ; do 5 | curl "https://raw.githubusercontent.com/megagonlabs/UD_Japanese-GSD/c614040872a74587912a15ef4637eabc0dc29a60/ja_gsd-ud-${t}.ne.conllu?raw=true" | grep "# text = " | sed 's/# text = //' > gsd/${t}.txt 6 | done 7 | echo 8 | echo '=== CUDA Related Installation Steps ===' 9 | echo 'The pytorch should be installed with cuda support. See https://pytorch.org/get-started/previous-versions/#linux-and-windows-1' 10 | echo 'Also you need to install spacy with appropriate cuda specifier as `pip install -U spacy[cudaXXX]`. See https://spacy.io/usage#gpu' 11 | echo 'And then, install GiNZA as `pip install -U ginza ja-ginza ja-ginza-electra`.' 12 | echo 'To evaluate the performance of spaCy official models, install models as `python -m spacy download ja_core_news_md ; python -m spacy download ja_core_news_trf`.' 13 | -------------------------------------------------------------------------------- /config/ja_ginza.analysis.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["tok2vec","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 1000 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = false 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | 41 | [components.morphologizer.model] 42 | @architectures = "spacy.Tagger.v1" 43 | nO = null 44 | 45 | [components.morphologizer.model.tok2vec] 46 | @architectures = "spacy.Tok2VecListener.v1" 47 | width = ${components.tok2vec.model.encode.width} 48 | upstream = "*" 49 | 50 | [components.ner] 51 | factory = "ner" 52 | incorrect_spans_key = null 53 | moves = null 54 | update_with_oracle_cut_size = 100 55 | 56 | [components.ner.model] 57 | @architectures = "spacy.TransitionBasedParser.v2" 58 | state_type = "ner" 59 | extra_state_tokens = false 60 | hidden_width = 64 61 | maxout_pieces = 2 62 | use_upper = true 63 | nO = null 64 | 65 | [components.ner.model.tok2vec] 66 | @architectures = "spacy.Tok2VecListener.v1" 67 | width = ${components.tok2vec.model.encode.width} 68 | upstream = "*" 69 | 70 | [components.parser] 71 | factory = "parser" 72 | learn_tokens = false 73 | min_action_freq = 30 74 | moves = null 75 | update_with_oracle_cut_size = 100 76 | 77 | [components.parser.model] 78 | @architectures = "spacy.TransitionBasedParser.v2" 79 | state_type = "parser" 80 | 
extra_state_tokens = false 81 | hidden_width = 128 82 | maxout_pieces = 3 83 | use_upper = true 84 | nO = null 85 | 86 | [components.parser.model.tok2vec] 87 | @architectures = "spacy.Tok2VecListener.v1" 88 | width = ${components.tok2vec.model.encode.width} 89 | upstream = "*" 90 | 91 | [components.tok2vec] 92 | factory = "tok2vec" 93 | 94 | [components.tok2vec.model] 95 | @architectures = "spacy.Tok2Vec.v2" 96 | 97 | [components.tok2vec.model.embed] 98 | @architectures = "spacy.MultiHashEmbed.v2" 99 | width = ${components.tok2vec.model.encode.width} 100 | attrs = ["ORTH","SHAPE"] 101 | rows = [5000,2500] 102 | include_static_vectors = true 103 | 104 | [components.tok2vec.model.encode] 105 | @architectures = "spacy.MaxoutWindowEncoder.v2" 106 | width = 256 107 | depth = 8 108 | window_size = 1 109 | maxout_pieces = 3 110 | 111 | [corpora] 112 | 113 | [corpora.dev] 114 | @readers = "spacy.Corpus.v1" 115 | path = ${paths.dev} 116 | max_length = 0 117 | gold_preproc = false 118 | limit = 0 119 | augmenter = null 120 | 121 | [corpora.train] 122 | @readers = "spacy.Corpus.v1" 123 | path = ${paths.train} 124 | max_length = 2000 125 | gold_preproc = false 126 | limit = 0 127 | augmenter = null 128 | 129 | [training] 130 | dev_corpus = "corpora.dev" 131 | train_corpus = "corpora.train" 132 | seed = ${system.seed} 133 | gpu_allocator = ${system.gpu_allocator} 134 | dropout = 0.1 135 | accumulate_gradient = 1 136 | patience = 0 137 | max_epochs = 0 138 | max_steps = 50000 139 | eval_frequency = 200 140 | frozen_components = [] 141 | before_to_disk = null 142 | annotating_components = [] 143 | 144 | [training.batcher] 145 | @batchers = "spacy.batch_by_words.v1" 146 | discard_oversize = false 147 | tolerance = 0.2 148 | get_length = null 149 | 150 | [training.batcher.size] 151 | @schedules = "compounding.v1" 152 | start = 100 153 | stop = 1000 154 | compound = 1.001 155 | t = 0.0 156 | 157 | [training.logger] 158 | @loggers = "spacy.ConsoleLogger.v1" 159 | progress_bar = false 160 | 161 | [training.optimizer] 162 | @optimizers = "Adam.v1" 163 | beta1 = 0.9 164 | beta2 = 0.999 165 | L2_is_weight_decay = true 166 | L2 = 0.01 167 | grad_clip = 1.0 168 | use_averages = false 169 | eps = 0.00000001 170 | learn_rate = 0.001 171 | 172 | [training.score_weights] 173 | dep_uas = 0.25 174 | dep_las = 0.25 175 | dep_las_per_type = null 176 | sents_p = null 177 | sents_r = null 178 | sents_f = 0.1 179 | ents_f = 0.25 180 | ents_p = 0.0 181 | ents_r = 0.0 182 | ents_per_type = null 183 | pos_acc = 0.15 184 | morph_acc = 0.0 185 | morph_per_feat = null 186 | tag_acc = 0.0 187 | 188 | [pretraining] 189 | 190 | [initialize] 191 | vectors = "vectors/" 192 | init_tok2vec = ${paths.init_tok2vec} 193 | vocab_data = null 194 | lookups = null 195 | before_init = null 196 | after_init = null 197 | 198 | [initialize.components] 199 | 200 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["tok2vec","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 1000 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | 
after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = true 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | 41 | [components.morphologizer.model] 42 | @architectures = "spacy.Tagger.v1" 43 | nO = null 44 | 45 | [components.morphologizer.model.tok2vec] 46 | @architectures = "spacy.Tok2VecListener.v1" 47 | width = ${components.tok2vec.model.encode.width} 48 | upstream = "*" 49 | 50 | [components.ner] 51 | factory = "ner" 52 | incorrect_spans_key = null 53 | moves = null 54 | update_with_oracle_cut_size = 100 55 | 56 | [components.ner.model] 57 | @architectures = "spacy.TransitionBasedParser.v2" 58 | state_type = "ner" 59 | extra_state_tokens = false 60 | hidden_width = 64 61 | maxout_pieces = 2 62 | use_upper = true 63 | nO = null 64 | 65 | [components.ner.model.tok2vec] 66 | @architectures = "spacy.Tok2VecListener.v1" 67 | width = ${components.tok2vec.model.encode.width} 68 | upstream = "*" 69 | 70 | [components.parser] 71 | factory = "parser" 72 | learn_tokens = false 73 | min_action_freq = 30 74 | moves = null 75 | update_with_oracle_cut_size = 100 76 | 77 | [components.parser.model] 78 | @architectures = "spacy.TransitionBasedParser.v2" 79 | state_type = "parser" 80 | extra_state_tokens = false 81 | hidden_width = 128 82 | maxout_pieces = 3 83 | use_upper = true 84 | nO = null 85 | 86 | [components.parser.model.tok2vec] 87 | @architectures = "spacy.Tok2VecListener.v1" 88 | width = ${components.tok2vec.model.encode.width} 89 | upstream = "*" 90 | 91 | [components.tok2vec] 92 | factory = "tok2vec" 93 | 94 | [components.tok2vec.model] 95 | @architectures = "spacy.Tok2Vec.v2" 96 | 97 | [components.tok2vec.model.embed] 98 | @architectures = "spacy.MultiHashEmbed.v2" 99 | width = ${components.tok2vec.model.encode.width} 100 | attrs = ["ORTH","SHAPE"] 101 | rows = [5000,2500] 102 | include_static_vectors = true 103 | 104 | [components.tok2vec.model.encode] 105 | @architectures = "spacy.MaxoutWindowEncoder.v2" 106 | width = 256 107 | depth = 8 108 | window_size = 1 109 | maxout_pieces = 3 110 | 111 | [corpora] 112 | 113 | [corpora.dev] 114 | @readers = "spacy.Corpus.v1" 115 | path = ${paths.dev} 116 | max_length = 0 117 | gold_preproc = false 118 | limit = 0 119 | augmenter = null 120 | 121 | [corpora.train] 122 | @readers = "spacy.Corpus.v1" 123 | path = ${paths.train} 124 | max_length = 2000 125 | gold_preproc = false 126 | limit = 0 127 | augmenter = null 128 | 129 | [training] 130 | dev_corpus = "corpora.dev" 131 | train_corpus = "corpora.train" 132 | seed = ${system.seed} 133 | gpu_allocator = ${system.gpu_allocator} 134 | dropout = 0.1 135 | accumulate_gradient = 1 136 | patience = 0 137 | max_epochs = 0 138 | max_steps = 50000 139 | eval_frequency = 200 140 | frozen_components = [] 141 | before_to_disk = null 142 | annotating_components = [] 143 | 144 | [training.batcher] 145 | @batchers = "spacy.batch_by_words.v1" 146 | discard_oversize = false 147 | tolerance = 0.2 148 | get_length = null 149 | 150 | [training.batcher.size] 151 | @schedules = "compounding.v1" 152 | start = 100 153 | stop = 1000 154 | compound = 1.001 155 | t = 0.0 156 | 157 | 
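# Editorial note (assumption: spaCy/Thinc configs accept `#` comments): the
# compounding.v1 schedule above grows the batch_by_words word budget from
# start (100) toward stop (1000), multiplying it by compound (1.001) after each batch.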
[training.logger] 158 | @loggers = "spacy.ConsoleLogger.v1" 159 | progress_bar = false 160 | 161 | [training.optimizer] 162 | @optimizers = "Adam.v1" 163 | beta1 = 0.9 164 | beta2 = 0.999 165 | L2_is_weight_decay = true 166 | L2 = 0.01 167 | grad_clip = 1.0 168 | use_averages = false 169 | eps = 0.00000001 170 | learn_rate = 0.001 171 | 172 | [training.score_weights] 173 | dep_uas = 0.25 174 | dep_las = 0.25 175 | dep_las_per_type = null 176 | sents_p = null 177 | sents_r = null 178 | sents_f = 0.1 179 | ents_f = 0.25 180 | ents_p = 0.0 181 | ents_r = 0.0 182 | ents_per_type = null 183 | pos_acc = 0.15 184 | morph_acc = 0.0 185 | morph_per_feat = null 186 | tag_acc = 0.0 187 | 188 | [pretraining] 189 | 190 | [initialize] 191 | vectors = "vectors/" 192 | init_tok2vec = ${paths.init_tok2vec} 193 | vocab_data = null 194 | lookups = null 195 | before_init = null 196 | after_init = null 197 | 198 | [initialize.components] 199 | 200 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"ja", 3 | "name":"ginza", 4 | "version":"5.2.0", 5 | "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, atteribute_ruler, compound_splitter, bunsetu_recognizer.", 6 | "author":"Megagon Labs Tokyo.", 7 | "email":"ginza@megagon.ai", 8 | "url":"https://github.com/megagonlabs/ginza", 9 | "license":"MIT License", 10 | "sources":[ 11 | { 12 | "name":"UD_Japanese-BCCWJ r2.8", 13 | "url":"https://github.com/UniversalDependencies/UD_Japanese-BCCWJ", 14 | "license":"CC BY-NC-SA 4.0", 15 | "author":"Asahara, M., Kanayama, H., Tanaka, T., Miyao, Y., Uematsu, S., Mori, S., Matsumoto, Y., Omura, M., & Murawaki, Y." 16 | }, 17 | { 18 | "name":"GSK2014-A(2019)", 19 | "url":"https://www.gsk.or.jp/catalog/gsk2014-a/", 20 | "license":"Individually defined commercial license", 21 | "author":"Tokyo Institute of Technology" 22 | }, 23 | { 24 | "name":"SudachiDict_core", 25 | "url":"https://github.com/WorksApplications/SudachiDict", 26 | "license":"Apache License 2.0", 27 | "author":"Works Applications Enterprise Co., Ltd." 28 | }, 29 | { 30 | "name":"chiVe", 31 | "url":"https://github.com/WorksApplications/chiVe", 32 | "license":"Apache License 2.0", 33 | "author":"Works Applications Enterprise Co., Ltd." 
34 | } 35 | ], 36 | "parent_package":"spacy", 37 | "spacy_version":">=3.4.4,<4.0.0", 38 | "spacy_git_version":"0fc3dee77", 39 | "vectors":{ 40 | "width":300, 41 | "vectors":20000, 42 | "keys":480443, 43 | "name":"ja_vectors" 44 | }, 45 | "pipeline":[ 46 | "tok2vec", 47 | "parser", 48 | "attribute_ruler", 49 | "ner", 50 | "morphologizer", 51 | "compound_splitter", 52 | "bunsetu_recognizer" 53 | ], 54 | "components":[ 55 | "tok2vec", 56 | "parser", 57 | "attribute_ruler", 58 | "ner", 59 | "morphologizer", 60 | "compound_splitter", 61 | "bunsetu_recognizer" 62 | ], 63 | "disabled":[ 64 | "attribute_ruler" 65 | ], 66 | "requirements":[ 67 | "sudachipy>=0.6.2,<0.7.0", 68 | "sudachidict_core>=20210802", 69 | "ginza>=5.2.0,<5.3.0" 70 | ] 71 | } 72 | -------------------------------------------------------------------------------- /config/ja_ginza_bert_large.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = true 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | extend = true 41 | overwrite = true 42 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 43 | 44 | [components.morphologizer.model] 45 | @architectures = "spacy.Tagger.v2" 46 | nO = null 47 | normalize = false 48 | 49 | [components.morphologizer.model.tok2vec] 50 | @architectures = "spacy-transformers.TransformerListener.v1" 51 | grad_factor = 1.0 52 | pooling = {"@layers":"reduce_mean.v1"} 53 | upstream = "*" 54 | 55 | [components.ner] 56 | factory = "ner" 57 | incorrect_spans_key = null 58 | moves = null 59 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 60 | update_with_oracle_cut_size = 100 61 | 62 | [components.ner.model] 63 | @architectures = "spacy.TransitionBasedParser.v2" 64 | state_type = "ner" 65 | extra_state_tokens = false 66 | hidden_width = 64 67 | maxout_pieces = 2 68 | use_upper = false 69 | nO = null 70 | 71 | [components.ner.model.tok2vec] 72 | @architectures = "spacy-transformers.TransformerListener.v1" 73 | grad_factor = 1.0 74 | pooling = {"@layers":"reduce_mean.v1"} 75 | upstream = "*" 76 | 77 | [components.parser] 78 | factory = "parser" 79 | learn_tokens = false 80 | min_action_freq = 30 81 | moves = null 82 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 83 | update_with_oracle_cut_size = 100 84 | 85 | [components.parser.model] 86 | @architectures = "spacy.TransitionBasedParser.v2" 87 | state_type = "parser" 88 | extra_state_tokens = false 89 | hidden_width = 128 90 | maxout_pieces = 3 91 | use_upper = false 92 | nO = null 93 | 94 | [components.parser.model.tok2vec] 95 | @architectures = 
"spacy-transformers.TransformerListener.v1" 96 | grad_factor = 1.0 97 | pooling = {"@layers":"reduce_mean.v1"} 98 | upstream = "*" 99 | 100 | [components.transformer] 101 | factory = "transformer" 102 | max_batch_items = 4096 103 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 104 | 105 | [components.transformer.model] 106 | @architectures = "spacy-transformers.TransformerModel.v3" 107 | name = "cl-tohoku/bert-large-japanese-v2" 108 | mixed_precision = false 109 | 110 | [components.transformer.model.get_spans] 111 | @span_getters = "spacy-transformers.strided_spans.v1" 112 | window = 128 113 | stride = 96 114 | 115 | [components.transformer.model.grad_scaler_config] 116 | 117 | [components.transformer.model.tokenizer_config] 118 | use_fast = false 119 | tokenizer_class = "BertJapaneseTokenizer" 120 | do_lower_case = false 121 | word_tokenizer_type = mecab 122 | subword_tokenizer_type = wordpiece 123 | mecab_kwargs = {"mecab_dic":"unidic_lite"} 124 | 125 | [components.transformer.model.transformer_config] 126 | 127 | [corpora] 128 | 129 | [corpora.dev] 130 | @readers = "spacy.Corpus.v1" 131 | path = ${paths.dev} 132 | max_length = 0 133 | gold_preproc = false 134 | limit = 0 135 | augmenter = null 136 | 137 | [corpora.train] 138 | @readers = "spacy.Corpus.v1" 139 | path = ${paths.train} 140 | max_length = 0 141 | gold_preproc = false 142 | limit = 0 143 | augmenter = null 144 | 145 | [training] 146 | accumulate_gradient = 3 147 | dev_corpus = "corpora.dev" 148 | train_corpus = "corpora.train" 149 | seed = ${system.seed} 150 | gpu_allocator = ${system.gpu_allocator} 151 | dropout = 0.1 152 | patience = 0 153 | max_epochs = 0 154 | max_steps = 20000 155 | eval_frequency = 200 156 | frozen_components = [] 157 | annotating_components = [] 158 | before_to_disk = null 159 | before_update = null 160 | 161 | [training.batcher] 162 | @batchers = "spacy.batch_by_padded.v1" 163 | discard_oversize = true 164 | size = 2000 165 | buffer = 256 166 | get_length = null 167 | 168 | [training.logger] 169 | @loggers = "spacy.ConsoleLogger.v1" 170 | progress_bar = false 171 | 172 | [training.optimizer] 173 | @optimizers = "Adam.v1" 174 | beta1 = 0.9 175 | beta2 = 0.999 176 | L2_is_weight_decay = true 177 | L2 = 0.01 178 | grad_clip = 1.0 179 | use_averages = false 180 | eps = 0.00000001 181 | 182 | [training.optimizer.learn_rate] 183 | @schedules = "warmup_linear.v1" 184 | warmup_steps = 250 185 | total_steps = 20000 186 | initial_rate = 0.00005 187 | 188 | [training.score_weights] 189 | pos_acc = 0.15 190 | morph_micro_f = 0.0 191 | morph_per_feat = null 192 | dep_uas = 0.25 193 | dep_las = 0.25 194 | dep_las_per_type = null 195 | sents_p = null 196 | sents_r = null 197 | sents_f = 0.1 198 | ents_f = 0.25 199 | ents_p = 0.0 200 | ents_r = 0.0 201 | ents_per_type = null 202 | tag_acc = 0.0 203 | 204 | [pretraining] 205 | 206 | [initialize] 207 | vectors = null 208 | init_tok2vec = ${paths.init_tok2vec} 209 | vocab_data = null 210 | lookups = null 211 | before_init = null 212 | after_init = null 213 | 214 | [initialize.components] 215 | 216 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza_bert_large.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"ja", 3 | "name":"ginza_bert_large", 4 | "version":"5.2.0b1", 5 | "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + 
transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.", 6 | "author":"Megagon Labs Tokyo.", 7 | "email":"ginza@megagon.ai", 8 | "url":"https://github.com/megagonlabs/ginza", 9 | "license":"MIT License", 10 | "sources":[ 11 | { 12 | "name":"UD_Japanese-BCCWJ r2.8", 13 | "url":"https://github.com/UniversalDependencies/UD_Japanese-BCCWJ", 14 | "license":"CC BY-NC-SA 4.0", 15 | "author":"Asahara, M., Kanayama, H., Tanaka, T., Miyao, Y., Uematsu, S., Mori, S., Matsumoto, Y., Omura, M., & Murawaki, Y." 16 | }, 17 | { 18 | "name":"GSK2014-A(2019)", 19 | "url":"https://www.gsk.or.jp/catalog/gsk2014-a/", 20 | "license":"Individually defined commercial license", 21 | "author":"Tokyo Institute of Technology" 22 | }, 23 | { 24 | "name":"SudachiDict_core", 25 | "url":"https://github.com/WorksApplications/SudachiDict", 26 | "license":"Apache License 2.0", 27 | "author":"Works Applications Enterprise Co., Ltd." 28 | }, 29 | { 30 | "name":"cl-tohoku/bert-large-japanese-v2", 31 | "url":"https://huggingface.co/cl-tohoku/bert-large-japanese-v2", 32 | "license":"Apache License 2.0", 33 | "author":"Tohoku University" 34 | } 35 | ], 36 | "spacy_version":">=3.6.1,<4.0.0", 37 | "spacy_git_version":"458bc5f45", 38 | "pipeline":[ 39 | "transformer", 40 | "parser", 41 | "ner", 42 | "morphologizer", 43 | "compound_splitter", 44 | "bunsetu_recognizer" 45 | ], 46 | "components":[ 47 | "transformer", 48 | "parser", 49 | "attribute_ruler", 50 | "ner", 51 | "morphologizer", 52 | "compound_splitter", 53 | "bunsetu_recognizer" 54 | ], 55 | "disabled":[ 56 | "attribute_ruler" 57 | ], 58 | "vectors":{ 59 | "width":0, 60 | "vectors":0, 61 | "keys":0, 62 | "name":null, 63 | "mode":"default" 64 | }, 65 | "requirements":[ 66 | "sudachipy>=0.6.7,<0.7.0", 67 | "sudachidict_core>=20230711", 68 | "spacy-transformers>=1.2.5,<1.3.0", 69 | "fugashi>=1.3.0", 70 | "unidic-lite>=1.0.8", 71 | "ginza>=5.2.0,<5.3.0" 72 | ] 73 | } -------------------------------------------------------------------------------- /config/ja_ginza_bert_large_analysis.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = false 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | extend = true 41 | overwrite = true 42 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 43 | 44 | [components.morphologizer.model] 45 | @architectures = "spacy.Tagger.v2" 46 | nO = null 47 | normalize = false 48 | 49 | [components.morphologizer.model.tok2vec] 50 | @architectures = 
"spacy-transformers.TransformerListener.v1" 51 | grad_factor = 1.0 52 | pooling = {"@layers":"reduce_mean.v1"} 53 | upstream = "*" 54 | 55 | [components.ner] 56 | factory = "ner" 57 | incorrect_spans_key = null 58 | moves = null 59 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 60 | update_with_oracle_cut_size = 100 61 | 62 | [components.ner.model] 63 | @architectures = "spacy.TransitionBasedParser.v2" 64 | state_type = "ner" 65 | extra_state_tokens = false 66 | hidden_width = 64 67 | maxout_pieces = 2 68 | use_upper = false 69 | nO = null 70 | 71 | [components.ner.model.tok2vec] 72 | @architectures = "spacy-transformers.TransformerListener.v1" 73 | grad_factor = 1.0 74 | pooling = {"@layers":"reduce_mean.v1"} 75 | upstream = "*" 76 | 77 | [components.parser] 78 | factory = "parser" 79 | learn_tokens = false 80 | min_action_freq = 30 81 | moves = null 82 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 83 | update_with_oracle_cut_size = 100 84 | 85 | [components.parser.model] 86 | @architectures = "spacy.TransitionBasedParser.v2" 87 | state_type = "parser" 88 | extra_state_tokens = false 89 | hidden_width = 128 90 | maxout_pieces = 3 91 | use_upper = false 92 | nO = null 93 | 94 | [components.parser.model.tok2vec] 95 | @architectures = "spacy-transformers.TransformerListener.v1" 96 | grad_factor = 1.0 97 | pooling = {"@layers":"reduce_mean.v1"} 98 | upstream = "*" 99 | 100 | [components.transformer] 101 | factory = "transformer" 102 | max_batch_items = 4096 103 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 104 | 105 | [components.transformer.model] 106 | @architectures = "spacy-transformers.TransformerModel.v3" 107 | name = "cl-tohoku/bert-large-japanese-v2" 108 | mixed_precision = false 109 | 110 | [components.transformer.model.get_spans] 111 | @span_getters = "spacy-transformers.strided_spans.v1" 112 | window = 128 113 | stride = 96 114 | 115 | [components.transformer.model.grad_scaler_config] 116 | 117 | [components.transformer.model.tokenizer_config] 118 | use_fast = false 119 | tokenizer_class = "BertJapaneseTokenizer" 120 | do_lower_case = false 121 | word_tokenizer_type = mecab 122 | subword_tokenizer_type = wordpiece 123 | mecab_kwargs = {"mecab_dic":"unidic_lite"} 124 | 125 | [components.transformer.model.transformer_config] 126 | 127 | [corpora] 128 | 129 | [corpora.dev] 130 | @readers = "spacy.Corpus.v1" 131 | path = ${paths.dev} 132 | max_length = 0 133 | gold_preproc = false 134 | limit = 0 135 | augmenter = null 136 | 137 | [corpora.train] 138 | @readers = "spacy.Corpus.v1" 139 | path = ${paths.train} 140 | max_length = 0 141 | gold_preproc = false 142 | limit = 0 143 | augmenter = null 144 | 145 | [training] 146 | accumulate_gradient = 3 147 | dev_corpus = "corpora.dev" 148 | train_corpus = "corpora.train" 149 | seed = ${system.seed} 150 | gpu_allocator = ${system.gpu_allocator} 151 | dropout = 0.1 152 | patience = 0 153 | max_epochs = 0 154 | max_steps = 20000 155 | eval_frequency = 200 156 | frozen_components = [] 157 | annotating_components = [] 158 | before_to_disk = null 159 | before_update = null 160 | 161 | [training.batcher] 162 | @batchers = "spacy.batch_by_padded.v1" 163 | discard_oversize = true 164 | size = 2000 165 | buffer = 256 166 | get_length = null 167 | 168 | [training.logger] 169 | @loggers = "spacy.ConsoleLogger.v1" 170 | progress_bar = false 171 | 172 | [training.optimizer] 173 | @optimizers = "Adam.v1" 174 | beta1 = 0.9 175 | beta2 = 0.999 176 | L2_is_weight_decay = true 177 | L2 = 0.01 178 | grad_clip 
= 1.0 179 | use_averages = false 180 | eps = 0.00000001 181 | 182 | [training.optimizer.learn_rate] 183 | @schedules = "warmup_linear.v1" 184 | warmup_steps = 250 185 | total_steps = 20000 186 | initial_rate = 0.00005 187 | 188 | [training.score_weights] 189 | pos_acc = 0.15 190 | morph_micro_f = 0.0 191 | morph_per_feat = null 192 | dep_uas = 0.25 193 | dep_las = 0.25 194 | dep_las_per_type = null 195 | sents_p = null 196 | sents_r = null 197 | sents_f = 0.1 198 | ents_f = 0.25 199 | ents_p = 0.0 200 | ents_r = 0.0 201 | ents_per_type = null 202 | tag_acc = 0.0 203 | 204 | [pretraining] 205 | 206 | [initialize] 207 | vectors = null 208 | init_tok2vec = ${paths.init_tok2vec} 209 | vocab_data = null 210 | lookups = null 211 | before_init = null 212 | after_init = null 213 | 214 | [initialize.components] 215 | 216 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza_electra.analysis.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"} 29 | validate = false 30 | 31 | [components.bunsetu_recognizer] 32 | factory = "bunsetu_recognizer" 33 | remain_bunsetu_suffix = false 34 | 35 | [components.compound_splitter] 36 | factory = "compound_splitter" 37 | split_mode = null 38 | 39 | [components.morphologizer] 40 | factory = "morphologizer" 41 | extend = true 42 | overwrite = true 43 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 44 | 45 | [components.morphologizer.model] 46 | @architectures = "spacy.Tagger.v1" 47 | nO = null 48 | 49 | [components.morphologizer.model.tok2vec] 50 | @architectures = "spacy-transformers.TransformerListener.v1" 51 | grad_factor = 1.0 52 | pooling = {"@layers":"reduce_mean.v1"} 53 | upstream = "*" 54 | 55 | [components.ner] 56 | factory = "ner" 57 | incorrect_spans_key = null 58 | moves = null 59 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 60 | update_with_oracle_cut_size = 100 61 | 62 | [components.ner.model] 63 | @architectures = "spacy.TransitionBasedParser.v2" 64 | state_type = "ner" 65 | extra_state_tokens = false 66 | hidden_width = 64 67 | maxout_pieces = 2 68 | use_upper = false 69 | nO = null 70 | 71 | [components.ner.model.tok2vec] 72 | @architectures = "spacy-transformers.TransformerListener.v1" 73 | grad_factor = 1.0 74 | pooling = {"@layers":"reduce_mean.v1"} 75 | upstream = "*" 76 | 77 | [components.parser] 78 | factory = "parser" 79 | learn_tokens = false 80 | min_action_freq = 30 81 | moves = null 82 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 83 | update_with_oracle_cut_size = 100 84 | 85 | [components.parser.model] 86 | @architectures = "spacy.TransitionBasedParser.v2" 87 | state_type = "parser" 88 | extra_state_tokens = false 89 | hidden_width = 128 90 | 
maxout_pieces = 3 91 | use_upper = false 92 | nO = null 93 | 94 | [components.parser.model.tok2vec] 95 | @architectures = "spacy-transformers.TransformerListener.v1" 96 | grad_factor = 1.0 97 | pooling = {"@layers":"reduce_mean.v1"} 98 | upstream = "*" 99 | 100 | [components.transformer] 101 | factory = "transformer_custom" 102 | max_batch_items = 4096 103 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 104 | 105 | [components.transformer.model] 106 | @architectures = "spacy-transformers.TransformerModel.v3" 107 | name = "megagonlabs/transformers-ud-japanese-electra-base-ginza-510" 108 | mixed_precision = false 109 | 110 | [components.transformer.model.get_spans] 111 | @span_getters = "spacy-transformers.strided_spans.v1" 112 | window = 128 113 | stride = 96 114 | 115 | [components.transformer.model.grad_scaler_config] 116 | 117 | [components.transformer.model.tokenizer_config] 118 | use_fast = false 119 | tokenizer_class = "sudachitra.tokenization_electra_sudachipy.ElectraSudachipyTokenizer" 120 | do_lower_case = false 121 | do_word_tokenize = true 122 | do_subword_tokenize = true 123 | word_tokenizer_type = "sudachipy" 124 | subword_tokenizer_type = "wordpiece" 125 | word_form_type = "dictionary_and_surface" 126 | 127 | [components.transformer.model.tokenizer_config.sudachipy_kwargs] 128 | split_mode = "A" 129 | dict_type = "core" 130 | 131 | [components.transformer.model.transformer_config] 132 | 133 | [corpora] 134 | 135 | [corpora.dev] 136 | @readers = "spacy.Corpus.v1" 137 | path = ${paths.dev} 138 | max_length = 0 139 | gold_preproc = false 140 | limit = 0 141 | augmenter = null 142 | 143 | [corpora.train] 144 | @readers = "spacy.Corpus.v1" 145 | path = ${paths.train} 146 | max_length = 500 147 | gold_preproc = false 148 | limit = 0 149 | augmenter = null 150 | 151 | [training] 152 | accumulate_gradient = 3 153 | dev_corpus = "corpora.dev" 154 | train_corpus = "corpora.train" 155 | seed = ${system.seed} 156 | gpu_allocator = ${system.gpu_allocator} 157 | dropout = 0.1 158 | patience = 0 159 | max_epochs = 0 160 | max_steps = 50000 161 | eval_frequency = 200 162 | frozen_components = [] 163 | annotating_components = [] 164 | before_to_disk = null 165 | 166 | [training.batcher] 167 | @batchers = "spacy.batch_by_padded.v1" 168 | discard_oversize = true 169 | size = 2000 170 | buffer = 256 171 | get_length = null 172 | 173 | [training.logger] 174 | @loggers = "spacy.ConsoleLogger.v1" 175 | progress_bar = false 176 | 177 | [training.optimizer] 178 | @optimizers = "Adam.v1" 179 | beta1 = 0.9 180 | beta2 = 0.999 181 | L2_is_weight_decay = true 182 | L2 = 0.01 183 | grad_clip = 1.0 184 | use_averages = false 185 | eps = 0.00000001 186 | 187 | [training.optimizer.learn_rate] 188 | @schedules = "warmup_linear.v1" 189 | warmup_steps = 250 190 | total_steps = 50000 191 | initial_rate = 0.00005 192 | 193 | [training.score_weights] 194 | dep_uas = 0.25 195 | dep_las = 0.25 196 | dep_las_per_type = null 197 | sents_p = null 198 | sents_r = null 199 | sents_f = 0.1 200 | ents_f = 0.25 201 | ents_p = 0.0 202 | ents_r = 0.0 203 | ents_per_type = null 204 | pos_acc = 0.15 205 | morph_acc = 0.0 206 | morph_per_feat = null 207 | tag_acc = 0.0 208 | 209 | [pretraining] 210 | 211 | [initialize] 212 | vectors = null 213 | init_tok2vec = ${paths.init_tok2vec} 214 | vocab_data = null 215 | lookups = null 216 | before_init = null 217 | after_init = null 218 | 219 | [initialize.components] 220 | 221 | [initialize.tokenizer] 
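The ja_ginza_electra analysis config above and the training config that follows describe the same packaged pipeline and differ mainly in `remain_bunsetu_suffix` (false for analysis, true for training) and in the transformer checkpoint name. A minimal sketch of loading and inspecting that packaged pipeline is shown below; it is illustrative only, assumes the ja-ginza-electra package is installed, and uses the sample sentence from docs/command_line_tool.md.

```python
# Illustrative sketch (assumes `pip install ja-ginza-electra` has been run).
import spacy
import ginza

nlp = spacy.load("ja_ginza_electra")
print(nlp.pipe_names)  # transformer, parser, ner, morphologizer, compound_splitter, bunsetu_recognizer
print(nlp.disabled)    # attribute_ruler, per the disabled list in the [nlp] block

# The [nlp.tokenizer] block sets SudachiPy split mode "C"; it can be switched at runtime.
ginza.set_split_mode(nlp, "A")
doc = nlp("銀座でランチをご一緒しましょう。")
print([(t.orth_, t.pos_, t.dep_) for t in doc])
```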
-------------------------------------------------------------------------------- /config/ja_ginza_electra.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy" 3 | dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = "pytorch" 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "ja" 13 | pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"] 14 | batch_size = 128 15 | disabled = ["attribute_ruler"] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [nlp.tokenizer] 21 | @tokenizers = "spacy.ja.JapaneseTokenizer" 22 | split_mode = "C" 23 | 24 | [components] 25 | 26 | [components.attribute_ruler] 27 | factory = "attribute_ruler" 28 | validate = false 29 | 30 | [components.bunsetu_recognizer] 31 | factory = "bunsetu_recognizer" 32 | remain_bunsetu_suffix = true 33 | 34 | [components.compound_splitter] 35 | factory = "compound_splitter" 36 | split_mode = null 37 | 38 | [components.morphologizer] 39 | factory = "morphologizer" 40 | extend = true 41 | overwrite = true 42 | scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} 43 | 44 | [components.morphologizer.model] 45 | @architectures = "spacy.Tagger.v1" 46 | nO = null 47 | 48 | [components.morphologizer.model.tok2vec] 49 | @architectures = "spacy-transformers.TransformerListener.v1" 50 | grad_factor = 1.0 51 | pooling = {"@layers":"reduce_mean.v1"} 52 | upstream = "*" 53 | 54 | [components.ner] 55 | factory = "ner" 56 | incorrect_spans_key = null 57 | moves = null 58 | scorer = {"@scorers":"spacy.ner_scorer.v1"} 59 | update_with_oracle_cut_size = 100 60 | 61 | [components.ner.model] 62 | @architectures = "spacy.TransitionBasedParser.v2" 63 | state_type = "ner" 64 | extra_state_tokens = false 65 | hidden_width = 64 66 | maxout_pieces = 2 67 | use_upper = false 68 | nO = null 69 | 70 | [components.ner.model.tok2vec] 71 | @architectures = "spacy-transformers.TransformerListener.v1" 72 | grad_factor = 1.0 73 | pooling = {"@layers":"reduce_mean.v1"} 74 | upstream = "*" 75 | 76 | [components.parser] 77 | factory = "parser" 78 | learn_tokens = false 79 | min_action_freq = 30 80 | moves = null 81 | scorer = {"@scorers":"spacy.parser_scorer.v1"} 82 | update_with_oracle_cut_size = 100 83 | 84 | [components.parser.model] 85 | @architectures = "spacy.TransitionBasedParser.v2" 86 | state_type = "parser" 87 | extra_state_tokens = false 88 | hidden_width = 128 89 | maxout_pieces = 3 90 | use_upper = false 91 | nO = null 92 | 93 | [components.parser.model.tok2vec] 94 | @architectures = "spacy-transformers.TransformerListener.v1" 95 | grad_factor = 1.0 96 | pooling = {"@layers":"reduce_mean.v1"} 97 | upstream = "*" 98 | 99 | [components.transformer] 100 | factory = "transformer_custom" 101 | max_batch_items = 4096 102 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 103 | 104 | [components.transformer.model] 105 | @architectures = "spacy-transformers.TransformerModel.v3" 106 | name = "megagonlabs/transformers-ud-japanese-electra-base-discriminator" 107 | mixed_precision = false 108 | 109 | [components.transformer.model.get_spans] 110 | @span_getters = "spacy-transformers.strided_spans.v1" 111 | window = 128 112 | stride = 96 113 | 114 | [components.transformer.model.grad_scaler_config] 115 | 116 | 
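# The tokenizer_config below wires the transformer to SudachiPy: sudachitra's
# ElectraSudachipyTokenizer word-tokenizes with SudachiPy (split_mode "A", core
# dictionary, dictionary_and_surface word forms) and then applies WordPiece
# subword splitting.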
[components.transformer.model.tokenizer_config] 117 | use_fast = false 118 | tokenizer_class = "sudachitra.tokenization_electra_sudachipy.ElectraSudachipyTokenizer" 119 | do_lower_case = false 120 | do_word_tokenize = true 121 | do_subword_tokenize = true 122 | word_tokenizer_type = "sudachipy" 123 | subword_tokenizer_type = "wordpiece" 124 | word_form_type = "dictionary_and_surface" 125 | 126 | [components.transformer.model.tokenizer_config.sudachipy_kwargs] 127 | split_mode = "A" 128 | dict_type = "core" 129 | 130 | [components.transformer.model.transformer_config] 131 | 132 | [corpora] 133 | 134 | [corpora.dev] 135 | @readers = "spacy.Corpus.v1" 136 | path = ${paths.dev} 137 | max_length = 0 138 | gold_preproc = false 139 | limit = 0 140 | augmenter = null 141 | 142 | [corpora.train] 143 | @readers = "spacy.Corpus.v1" 144 | path = ${paths.train} 145 | max_length = 500 146 | gold_preproc = false 147 | limit = 0 148 | augmenter = null 149 | 150 | [training] 151 | accumulate_gradient = 3 152 | dev_corpus = "corpora.dev" 153 | train_corpus = "corpora.train" 154 | seed = ${system.seed} 155 | gpu_allocator = ${system.gpu_allocator} 156 | dropout = 0.1 157 | patience = 0 158 | max_epochs = 0 159 | max_steps = 50000 160 | eval_frequency = 200 161 | frozen_components = [] 162 | before_to_disk = null 163 | annotating_components = [] 164 | 165 | [training.batcher] 166 | @batchers = "spacy.batch_by_padded.v1" 167 | discard_oversize = true 168 | size = 2000 169 | buffer = 256 170 | get_length = null 171 | 172 | [training.logger] 173 | @loggers = "spacy.ConsoleLogger.v1" 174 | progress_bar = false 175 | 176 | [training.optimizer] 177 | @optimizers = "Adam.v1" 178 | beta1 = 0.9 179 | beta2 = 0.999 180 | L2_is_weight_decay = true 181 | L2 = 0.01 182 | grad_clip = 1.0 183 | use_averages = false 184 | eps = 0.00000001 185 | 186 | [training.optimizer.learn_rate] 187 | @schedules = "warmup_linear.v1" 188 | warmup_steps = 250 189 | total_steps = 50000 190 | initial_rate = 0.00005 191 | 192 | [training.score_weights] 193 | dep_uas = 0.25 194 | dep_las = 0.25 195 | dep_las_per_type = null 196 | sents_p = null 197 | sents_r = null 198 | sents_f = 0.1 199 | ents_f = 0.25 200 | ents_p = 0.0 201 | ents_r = 0.0 202 | ents_per_type = null 203 | pos_acc = 0.15 204 | morph_acc = 0.0 205 | morph_per_feat = null 206 | tag_acc = 0.0 207 | 208 | [pretraining] 209 | 210 | [initialize] 211 | vectors = null 212 | init_tok2vec = ${paths.init_tok2vec} 213 | vocab_data = null 214 | lookups = null 215 | before_init = null 216 | after_init = null 217 | 218 | [initialize.components] 219 | 220 | [initialize.tokenizer] -------------------------------------------------------------------------------- /config/ja_ginza_electra.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"ja", 3 | "name":"ginza_electra", 4 | "version":"5.2.0", 5 | "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. 
Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.", 6 | "author":"Megagon Labs Tokyo.", 7 | "email":"ginza@megagon.ai", 8 | "url":"https://github.com/megagonlabs/ginza", 9 | "license":"MIT License", 10 | "sources":[ 11 | { 12 | "name":"UD_Japanese-BCCWJ r2.8", 13 | "url":"https://github.com/UniversalDependencies/UD_Japanese-BCCWJ", 14 | "license":"CC BY-NC-SA 4.0", 15 | "author":"Asahara, M., Kanayama, H., Tanaka, T., Miyao, Y., Uematsu, S., Mori, S., Matsumoto, Y., Omura, M., & Murawaki, Y." 16 | }, 17 | { 18 | "name":"GSK2014-A(2019)", 19 | "url":"https://www.gsk.or.jp/catalog/gsk2014-a/", 20 | "license":"Individually defined commercial license", 21 | "author":"Tokyo Institute of Technology" 22 | }, 23 | { 24 | "name":"SudachiDict_core", 25 | "url":"https://github.com/WorksApplications/SudachiDict", 26 | "license":"Apache License 2.0", 27 | "author":"Works Applications Enterprise Co., Ltd." 28 | }, 29 | { 30 | "name":"mC4", 31 | "url":"https://huggingface.co/datasets/mc4", 32 | "license":"ODC-BY-1.0", 33 | "title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer", 34 | "author":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, & Peter J. Liu" 35 | }, 36 | { 37 | "name":"megagonlabs/transformers-ud-japanese-electra-base-ginza-5.1.0", 38 | "url":"https://huggingface.co/megagonlabs/transformers-ud-japanese-electra-base-ginza-5.1.0", 39 | "license":"MIT Licence", 40 | "author":"Hiroshi Matsuda (Megagon Labs Tokyo, Recruit Co., Ltd.)" 41 | } 42 | ], 43 | "parent_package":"spacy", 44 | "spacy_version":">=3.4.4,<4.0.0", 45 | "spacy_git_version":"0fc3dee77", 46 | "pipeline":[ 47 | "transformer", 48 | "parser", 49 | "attribute_ruler", 50 | "ner", 51 | "morphologizer", 52 | "compound_splitter", 53 | "bunsetu_recognizer" 54 | ], 55 | "components":[ 56 | "transformer", 57 | "parser", 58 | "attribute_ruler", 59 | "ner", 60 | "morphologizer", 61 | "compound_splitter", 62 | "bunsetu_recognizer" 63 | ], 64 | "disabled": [ 65 | "attribute_ruler" 66 | ], 67 | "vectors":{ 68 | "width":0, 69 | "vectors":0, 70 | "keys":0, 71 | "name":null 72 | }, 73 | "requirements":[ 74 | "sudachipy>=0.6.2,<0.7.0", 75 | "sudachidict_core>=20210802", 76 | "sudachitra>=0.1.6,<0.2.0", 77 | "ginza-transformers>=0.4.0,<0.5.0", 78 | "ginza>=5.2.0,<5.3.0" 79 | ] 80 | } 81 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | title: GiNZA - Japanese NLP Library 3 | description: Universal Dependenciesに基づくオープンソース日本語NLPライブラリ 4 | -------------------------------------------------------------------------------- /docs/bunsetu_api.md: -------------------------------------------------------------------------------- 1 | # 文節APIの解説 2 | 3 | ## GiNZAの解析モデルと文節単位の解析API 4 | 5 | GiNZA独自の文節解析モデルにより、Universal Dependenciesの枠組みの中で日本語に特徴的な文節構造を考慮することができます。 6 | 7 | ![bunsetu_heads](https://github.com/megagonlabs/ginza/raw/static/docs/images/bunsetu_heads.png) 8 | 9 | またGiNZA v4で追加された解析APIを用いることで、文節やその主辞を単位とした分析がこれまでよりずっと容易になります。 10 | ```python 11 | from ginza import * 12 | import spacy 13 | nlp = spacy.load("ja_ginza") # GiNZAモデルの読み込み 14 | 15 | from collections import defaultdict 16 | frames = defaultdict(lambda: 0) # 依存関係の出現頻度を格納 17 | sentences = set() # 重複文検出用のset 18 | 19 | with open("sentences.txt", "r") as fin: # 解析対象のテキストファイルから 20 | for 
line in fin: # 一行ごとに 21 | try: 22 | doc = nlp(line.rstrip()) # 解析を実行し 23 | except: 24 | continue 25 | for sent in doc.sents: # 文単位でループ 26 | if sent.text in sentences: 27 | continue # 重複文はスキップ 28 | sentences.add(sent.text) 29 | for t in bunsetu_head_tokens(sent): # 文節主辞トークンのうち 30 | if t.pos_ not in {"ADJ", "VERB"}: 31 | continue # 述語以外はスキップ 32 | v = phrase(lemma_)(t) # 述語とその格要素(主語・目的語相当)の句を集める 33 | dep_phrases = sub_phrases(t, phrase(lemma_), is_not_stop) 34 | subj = [phrase for dep, phrase in dep_phrases if dep in {"nsubj"}] 35 | obj = [phrase for dep, phrase in dep_phrases if dep in {"obj", "iobj"}] 36 | for s in subj: 37 | for o in obj: 38 | frames[(s, o, v)] += 1 # 格要素と述語の組み合わせをカウント 39 | 40 | for frame, count in sorted(frames.items(), key=lambda t: -t[1]): 41 | print(count, *frame, sep="\t") # 出現頻度の高い順に表示 42 | ``` 43 | 44 | #### 表1 GiNZAの文節APIの一覧 45 | 46 | | category | func or variable | description | 47 | | --- | --- | --- | 48 | | Span-based | | | 49 | | | bunsetu_spans() | 文節SpanのIterable。 | 50 | | | bunsetu_phrase_spans() | 文節主辞SpanのIterable。 | 51 | | | bunsetu_span() | トークンが属する文節のSpan。 | 52 | | | bunsetu_phrase_span() | トークンが属する文節の主辞Span。 | 53 | | Construction | | | 54 | | | bunsetu() | 文節中のトークン列を指定された形に整形して返す。 | 55 | | | phrase() | 文節主辞中のトークン列を指定された形に整形して
返す。 | 56 | | | sub_phrases() | 従属文節を指定された形に整形して返す。 | 57 | | | phrases() | スパンに含まれる文節を指定された形に整形して返す。 | 58 | | Utility | | | 59 | | | traverse() | 構文木を指定された方法で巡回し指定された形に整形して返す。 | 60 | | | default_join_func() | デフォルトのトークン列の結合方法。 | 61 | | | SEP | デフォルトのトークン区切り文字。 | 62 | | Token-based | | | 63 | | | bunsetu_head_list() | DocやSpanに含まれる文節のヘッドトークンのインデックスのリスト。 | 64 | | | bunsetu_head_tokens() | DocやSpanに含まれる文節のヘッドトークンのリスト。 | 65 | | | bunsetu_bi_labels() | DocやSpanに含まれるトークンが文節開始位置にある場合は"B"、それ以外は"I"とするリスト。 | 66 | | | bunsetu_position_types() | DocやSpanに含まれるトークンを{"ROOT", "SEM_HEAD", "SYN_HEAD", "NO_HEAD", "FUNC", "CONT"}に分類したリスト。 | 67 | | | is_bunsetu_head() | トークンが文節のヘッドの場合はTrue、それ以外はFalse。 | 68 | | | bunsetu_bi_label() | トークンが文節開始位置にある場合は"B"、それ以外は"I"。 | 69 | | | bunsetu_position_type() | トークンを{"ROOT", "SEM_HEAD", "SYN_HEAD", "NO_HEAD", "FUNC", "CONT"}に分類。 | 70 | | Proxy | | | 71 | | | * | spacy.tokens.Tokenクラスのプロパティと
同名・同機能の関数群。 | 72 | | Subtoken | | | 73 | | | sub_tokens() | トークンの分割情報。 | 74 | | | set_split_mode() | デフォルトの分割モードの変更。 | 75 | | Clause | | | 76 | | | clauses() | 節単位に分割されたトークン列。(experimental) | 77 | | | clause_head() | トークンが属する節のヘッドとなるトークン。(experimental) | 78 | | | clause_head_i() | トークンが属する節のヘッドとなるトークン番号。(experimental) | 79 | 80 | ## 解説資料 81 | 82 | 詳細な解説はこちらの記事をご覧ください。 83 | 84 | - [GiNZA version 4.0: 多言語依存構造解析技術への文節APIの統合 - Megagon Labs Blog](https://www.megagon.ai/jp/blog/ginza-version-4-0/) 85 | - [GiNZA - Universal Dependenciesによる実用的日本語解析 - 自然言語処理 Volume 27 Number 3](https://www.jstage.jst.go.jp/article/jnlp/27/3/27_695/_article/-char/ja/) 86 | -------------------------------------------------------------------------------- /docs/command_line_tool.md: -------------------------------------------------------------------------------- 1 | # コマンドラインツールの解説 2 | 3 | ## ginza 4 | 5 | `ginza`コマンドはコマンドライン引数で指定されたファイル(指定されない場合は標準入力)から一行を単位としてテキストを読み込み、解析結果を標準出力に[CoNLL-U Syntactic Annotation](https://universaldependencies.org/format.html#syntactic-annotation) 形式で出力します。 6 | ```console 7 | $ ginza 8 | 銀座でランチをご一緒しましょう。 9 | # text = 銀座でランチをご一緒しましょう。 10 | 1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 nmod _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City|ClauseHead=6 11 | 2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ|ClauseHead=6 12 | 3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ|ClauseHead=6 13 | 4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ|ClauseHead=6 14 | 5 ご ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|NP_B|Reading=ゴ|ClauseHead=6 15 | 6 一緒 一緒 NOUN 名詞-普通名詞-サ変可能 _ 0 root _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|NP_I|Reading=イッショ|ClauseHead=6 16 | 7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ|ClauseHead=6 17 | 8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ|ClauseHead=6 18 | 9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。|ClauseHead=6 19 | 20 | ``` 21 | 22 | ## ginzame 23 | 24 | `ginzame`コマンドでオープンソース形態素解析エンジン [MeCab](https://taku910.github.io/mecab/) の`mecab`コマンドに近い形式で解析結果を出力することができます。 25 | `ginzame`コマンドは形態素解析処理のみをマルチプロセスで高速に実行します。 26 | このコマンドと`mecab`の出力形式の相違点として、最終フィールド(発音)が常に`*`となること、 27 | ginza の split_mode はデフォルトが `C` なので unidic 相当の単語分割を得るためには `-s A` を指定する必要があることに注意して下さい。 28 | ```console 29 | $ ginzame 30 | 銀座でランチをご一緒しましょう。 31 | 銀座 名詞,固有名詞,地名,一般,*,*,銀座,ギンザ,* 32 | で 助詞,格助詞,*,*,*,*,で,デ,* 33 | ランチ 名詞,普通名詞,一般,*,*,*,ランチ,ランチ,* 34 | を 助詞,格助詞,*,*,*,*,を,ヲ,* 35 | ご 接頭辞,*,*,*,*,*,御,ゴ,* 36 | 一緒 名詞,普通名詞,サ変可能,*,*,*,一緒,イッショ,* 37 | し 動詞,非自立可能,*,*,サ行変格,連用形-一般,為る,シ,* 38 | ましょう 助動詞,*,*,*,助動詞-マス,意志推量形,ます,マショウ,* 39 | 。 補助記号,句点,*,*,*,*,。,。,* 40 | EOS 41 | 42 | ``` 43 | 44 | ## OPTIONS 45 | `ginza`コマンドでは以下のオプションを指定することができます。 46 | `ginzame`コマンドでは `--split-mode` `--hash-comment` `output-path` `--use-normalized-form` `--parallel` オプションが利用可能です。 47 | 48 | - `--model-path `, `-b ` 49 | `spacy.language.Language` 形式の学習済みモデルが保存されたディレクトリを指定します。 50 | `--ensure-model` オプションと同時に指定することはできません。 51 | - `--ensure-model `, `-m ` 52 | ginza および spaCy が公開している学習済みモデル名を指定します。`--model-path` オプションと同時に指定することはできません。次の値のいずれかを指定できます。 53 | - `ja_ginza`, `ja_ginza_electra` 54 | - [spaCy Models & 
Languages](https://spacy.io/usage/models)で公開されている日本語以外を含む全ての言語のモデル (例: en_core_web_md) 55 | 使用するモデルに応じて、事前に `pip install ja-ginza-electra` のようにパッケージをダウンロードする必要があります。 56 | `--model-path`, `--ensure-model` のどちらも指定されない場合には `ja_ginza_electra`、`ja_ginza` の順の優先度でロード可能なモデルを利用します。 57 | - `--split-mode `, `-s ` 58 | 複合名詞の分割モードを指定します。モードは [sudachi](https://github.com/WorksApplications/Sudachi#the-modes-of-splitting) に準拠し、`A`、`B`、`C`のいずれかを指定できます。`ginza`コマンドのデフォルト値は `C`、`ginzame`コマンドのデフォルト値はMeCab UniDicに近い `A` です。 59 | `A`が分割が最も短く複合名詞が UniDic 短単位まで分割され、 `C` では固有名詞が抽出されます。`B` は二つの中間の単位に分割されます。 60 | - `--hash-comment `, `-c ` 61 | 行頭が `#` から始まる行を解析対象とするかのモードを指定します。次の値のいずれかを指定できます。 62 | - `print` 63 | 解析対象とはしないが、解析結果には入力をそのまま出力します。 64 | - `skip` 65 | 解析対象とせず、解析結果にも出力しません。 66 | - `analyze` 67 | `#` から始まる行についても解析を行い、結果を出力します。ただし`-f json`が指定されている場合は `-c`の指定に依らず常に`analyze`が適用されます。 68 | デフォルト値は `print` です。 69 | - `--output-path `, `-o ` 70 | 解析結果を出力するファイルのパスを指定します。指定しない場合には標準出力に解析結果が出力されます。 71 | - `--output-format `, `-f ` 72 | [解析結果のフォーマット](#出力形式の指定)を指定します。次の値のいずれかを指定できます。 73 | - `0`, `conllu` 74 | - `1`, `cabocha` 75 | - `2`, `mecab` 76 | - `3`, `json` 77 | デフォルト値は `conllu` です。 78 | - `--require-gpu `, `-g ` 79 | 引数で指定されたgpu_idのGPUを使用して解析を行います。引数に-1を指定(デフォルト)するとCPUを使用します。ただし、[spaCyおよびcupyの制約](https://github.com/explosion/spaCy/issues/5507)から、`--require-gpu`は`--parallel`と同時に指定できません。 80 | - `--use-normalized-form`, `-n` 81 | `-f conllu`のlemmaフィールドに [sudachi](https://github.com/WorksApplications/Sudachi#normalized-form) を使用するためのブールスイッチ。 82 | - `--disable-sentencizer`, `-d` 83 | `ja_ginza`、 `ja_ginza_electra` モデル利用時に[disable_sentencizer](https://github.com/megagonlabs/ginza/blob/develop/ginza/disable_sentencizer.py)を有効化するブールスイッチ。 84 | - `--parallel `, `-p ` 85 | 並列実行するプロセス数を指定します。0 を指定すると cpu コア数分のプロセスを起動します。デフォルト値は1です。 86 | 87 | ## 出力形式の指定 88 | 89 | ### JSON 90 | 91 | spaCyの学習用JSON形式での出力は`ginza -f 3` または `ginza -f json`を実行してください。 92 | ```console 93 | $ ginza -f json 94 | 銀座でランチをご一緒しましょう。 95 | [ 96 | { 97 | "paragraphs": [ 98 | { 99 | "raw": "銀座でランチをご一緒しましょう。", 100 | "sentences": [ 101 | { 102 | "tokens": [ 103 | {"id": 1, "orth": "銀座", "tag": "名詞-固有名詞-地名-一般", "pos": "PROPN", "lemma": "銀座", "head": 5, "dep": "obl", "ner": "B-City"}, 104 | {"id": 2, "orth": "で", "tag": "助詞-格助詞", "pos": "ADP", "lemma": "で", "head": -1, "dep": "case", "ner": "O"}, 105 | {"id": 3, "orth": "ランチ", "tag": "名詞-普通名詞-一般", "pos": "NOUN", "lemma": "ランチ", "head": 3, "dep": "obj", "ner": "O"}, 106 | {"id": 4, "orth": "を", "tag": "助詞-格助詞", "pos": "ADP", "lemma": "を", "head": -1, "dep": "case", "ner": "O"}, 107 | {"id": 5, "orth": "ご", "tag": "接頭辞", "pos": "NOUN", "lemma": "ご", "head": 1, "dep": "compound", "ner": "O"}, 108 | {"id": 6, "orth": "一緒", "tag": "名詞-普通名詞-サ変可能", "pos": "VERB", "lemma": "一緒", "head": 0, "dep": "ROOT", "ner": "O"}, 109 | {"id": 7, "orth": "し", "tag": "動詞-非自立可能", "pos": "AUX", "lemma": "する", "head": -1, "dep": "advcl", "ner": "O"}, 110 | {"id": 8, "orth": "ましょう", "tag": "助動詞", "pos": "AUX", "lemma": "ます", "head": -2, "dep": "aux", "ner": "O"}, 111 | {"id": 9, "orth": "。", "tag": "補助記号-句点", "pos": "PUNCT", "lemma": "。", "head": -3, "dep": "punct", "ner": "O"} 112 | ] 113 | } 114 | ] 115 | } 116 | ] 117 | } 118 | ] 119 | ``` 120 | 121 | ### CaboCha 122 | 123 | 日本語係り受け解析器 [CaboCha](https://taku910.github.io/cabocha/) の`cabocha -f1`のラティス形式に近い解析結果を出力する場合は 124 | `ginza -f 1` または `ginza -f cabocha` を実行して下さい。 125 | このオプションと`cabocha -f1`の出力形式の相違点として、 126 | スラッシュ記号`/`に続く`func_index`フィールドが常に自立語の終了位置(機能語があればその開始位置に一致)を示すこと、 127 | 
機能語認定基準が一部異なること、 128 | に注意して下さい。 129 | ```console 130 | $ ginza -f cabocha 131 | 銀座でランチをご一緒しましょう。 132 | * 0 2D 0/1 0.000000 133 | 銀座 名詞,固有名詞,地名,一般,,銀座,ギンザ,* B-City 134 | で 助詞,格助詞,*,*,,で,デ,* O 135 | * 1 2D 0/1 0.000000 136 | ランチ 名詞,普通名詞,一般,*,,ランチ,ランチ,* O 137 | を 助詞,格助詞,*,*,,を,ヲ,* O 138 | * 2 -1D 0/2 0.000000 139 | ご 接頭辞,*,*,*,,ご,ゴ,* O 140 | 一緒 名詞,普通名詞,サ変可能,*,,一緒,イッショ,* O 141 | し 動詞,非自立可能,*,*,サ行変格,連用形-一般,する,シ,* O 142 | ましょう 助動詞,*,*,*,助動詞-マス,意志推量形,ます,マショウ,* O 143 | 。 補助記号,句点,*,*,,。,。,* O 144 | EOS 145 | 146 | ``` 147 | 148 | ## マルチプロセス実行 (Experimental) 149 | 150 | `-p NUM_PROCESS` オプションで解析処理のマルチプロセス実行が可能になります。 151 | `NUM_PROCESS`には並列実行するプロセス数を整数で指定します。 152 | 0以下の値は`実行環境のCPUコア数+NUM_PROCESS`を指定したのと等価になります。 153 | 154 | `ginza -f mecab`とそのエイリアスである`ginzame`以外で`-p NUM_PROCESS`オプションを使用する場合は、 155 | 実行環境の空きメモリ容量が十分あることを事前に確認してください。 156 | マルチプロセス実行では1プロセスあたり`ja_ginza`で数百MB、`ja_ginza_electra`で数GBのメモリが必要です。 157 | -------------------------------------------------------------------------------- /docs/developer_reference.md: -------------------------------------------------------------------------------- 1 | # 開発者向けの情報 2 | 3 | ## 開発環境 4 | 5 | ### 開発環境のセットアップ 6 | 7 | #### 1. githubからclone 8 | ```console 9 | $ git clone 'https://github.com/megagonlabs/ginza.git' 10 | ``` 11 | 12 | #### 2. pip install および setup.sh の実行 13 | ```console 14 | $ pip install -U -r requirements.txt 15 | $ python setup.py develop 16 | ``` 17 | 18 | #### 3. GPU用ライブラリのセットアップ (Optional) 19 | CUDA v11.0の場合は次のように指定します。 20 | ```console 21 | $ pip install -U spacy[cuda110] 22 | ``` 23 | 24 | ### 訓練の実行 25 | GiNZAの解析モデル `ja_ginza` はspaCy標準コマンドを使用して学習を行っています。 26 | ```console 27 | $ python -m spacy train ja ja_ginza-4.0.0 corpus/ja_ginza-ud-train.json corpus/ja_ginza-ud-dev.json -b ja_vectors_chive_mc90_35k/ -ovl 0.3 -n 100 -m meta.json.ginza -V 4.0.0 28 | ``` 29 | 30 | ### トラブルシューティング 31 | 32 | Google Colab 環境ではインストール後にパッケージ情報の再読込が必要な場合があります。詳細はリンクの記事をご確認下さい。 33 | ```python 34 | import pkg_resources, imp 35 | imp.reload(pkg_resources) 36 | ``` 37 | [【GiNZA】GoogleColabで日本語NLPライブラリGiNZAがloadできない](https://www.sololance.tokyo/2019/10/colab-load-ginza.html) 38 | 39 | インストール時にCythonに関するエラーが発生した場合は、次のように環境変数CFLAGSを設定してください。 40 | ```console 41 | $ CFLAGS='-stdlib=libc++' pip install ginza 42 | ``` 43 | 44 | ## ユーザ辞書の使用 45 | 46 | GiNZAはTokenizer(形態素解析レイヤ)にSudachiPyを使用しています。 47 | GiNZAでユーザ辞書を使用するにはSudachiPyの辞書設定ファイル `sudachi.json` の `userDict` フィールドに、 48 | コンパイル済みのユーザ辞書ファイルのパスのリストを指定します。 49 | 50 | SudachiPyのユーザ辞書ファイルのコンパイル方法についてはSudachiPyのGitHubリポジトリで公開されているドキュメントを参照してください。 51 | [SudachiPy - User defined Dictionary](https://github.com/WorksApplications/SudachiPy#user-defined-dictionary) 52 | [Sudachi ユーザー辞書作成方法](https://github.com/WorksApplications/Sudachi/blob/develop/docs/user_dict.md) 53 | -------------------------------------------------------------------------------- /ginza/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import singledispatch 2 | from typing import Callable, Iterable, Union, Tuple, TypeVar 3 | 4 | from sudachipy.morpheme import Morpheme 5 | 6 | from spacy.lang.ja import DetailedToken 7 | from spacy.language import Language 8 | from spacy.tokens import Doc, Span, Token 9 | 10 | from .bunsetu_recognizer import * 11 | from .compound_splitter import * 12 | from .disable_sentencizer import * 13 | from .ene_ontonotes_mapper import ENE_ONTONOTES_MAPPING 14 | 15 | 16 | __all__ = [ 17 | "make_compound_splitter", "make_bunsetu_recognizer", "make_disable_sentencizer", 18 | 
"force_using_normalized_form_as_lemma", "set_split_mode", 19 | "token_i", "text", "text_with_ws", "orth", "orth_", 20 | "ent_type", "ent_type_", "ent_iob", "ent_iob_", 21 | "lemma", "lemma_", "norm", "norm_", 22 | "pos", "pos_", "tag", "tag_", "dep", "dep_", 23 | "is_sent_start", "is_stop", "is_not_stop", 24 | "ent_label_ene", "ent_label_ontonotes", 25 | "reading_form", "inflection", 26 | "bunsetu_bi_label", "bunsetu_position_type", "is_bunsetu_head", 27 | "clauses","token_clause_head", 28 | "SEP", "default_join_func", 29 | "traverse", 30 | "head", "ancestors", "conjuncts", "children", "lefts", "rights", "subtree", 31 | "bunsetu", "phrase", "sub_phrases", "phrases", 32 | "sub_tokens", 33 | # from bunsetu_recognizer 34 | "bunsetu_span", 35 | "bunsetu_spans", 36 | "bunsetu_phrase_span", 37 | "bunsetu_phrase_spans", 38 | "bunsetu_head_list", 39 | "bunsetu_head_tokens", 40 | "bunsetu_bi_labels", 41 | "bunsetu_position_types", 42 | "clauses", 43 | "clause_head", 44 | "clause_head_i", 45 | "BunsetuRecognizer", 46 | # from compound_splitter 47 | "CompoundSplitter", 48 | "tag_to_pos", 49 | ] 50 | 51 | 52 | @Language.factory( 53 | "compound_splitter", 54 | requires=[], 55 | assigns=[], 56 | retokenizes=True, 57 | default_config={"split_mode": None}, 58 | ) 59 | def make_compound_splitter( 60 | nlp: Language, 61 | name: str, 62 | split_mode: str = None, 63 | ): 64 | return CompoundSplitter( 65 | nlp.vocab, 66 | split_mode, 67 | ) 68 | 69 | 70 | @Language.factory( 71 | "bunsetu_recognizer", 72 | requires=["token.dep"], 73 | assigns=["token.dep"], 74 | retokenizes=False, 75 | default_config={}, 76 | ) 77 | def make_bunsetu_recognizer( 78 | nlp: Language, 79 | name: str, 80 | remain_bunsetu_suffix: bool = False, 81 | ): 82 | return BunsetuRecognizer( 83 | nlp.vocab, 84 | remain_bunsetu_suffix, 85 | ) 86 | 87 | @Language.factory( 88 | "disable_sentencizer", 89 | requires=[], 90 | assigns=[], 91 | retokenizes=False, 92 | default_config={}, 93 | ) 94 | def make_disable_sentencizer( 95 | nlp: Language, 96 | name: str, 97 | ): 98 | return DisableSentencizer( 99 | nlp.vocab, 100 | ) 101 | 102 | 103 | _morpheme_dictionary_form = None 104 | 105 | 106 | def force_using_normalized_form_as_lemma(force: bool): 107 | global _morpheme_dictionary_form 108 | if force and not _morpheme_dictionary_form: 109 | _morpheme_dictionary_form = Morpheme.dictionary_form 110 | Morpheme.dictionary_form = Morpheme.normalized_form 111 | elif not force and _morpheme_dictionary_form: 112 | Morpheme.dictionary_form = _morpheme_dictionary_form 113 | 114 | 115 | def set_split_mode(nlp: Language, mode: str): 116 | if nlp.has_pipe("compound_splitter"): 117 | splitter = nlp.get_pipe("compound_splitter") 118 | splitter.split_mode = mode 119 | 120 | 121 | # token field getters 122 | 123 | def token_i(token: Token) -> int: 124 | return token.i 125 | 126 | 127 | def text(token: Token) -> str: 128 | return token.text 129 | 130 | 131 | def text_with_ws(token: Token) -> str: 132 | return token.text_with_ws 133 | 134 | 135 | def orth(token: Token) -> int: 136 | return token.orth 137 | 138 | 139 | def orth_(token: Token) -> str: 140 | return token.orth_ 141 | 142 | 143 | def ent_type(token: Token) -> int: 144 | return token.ent_type 145 | 146 | 147 | def ent_type_(token: Token) -> str: 148 | return ENE_ONTONOTES_MAPPING.get(token.ent_type_, "OTHERS") 149 | 150 | 151 | def ent_iob(token: Token) -> int: 152 | return token.ent_iob 153 | 154 | 155 | def ent_iob_(token: Token) -> str: 156 | return token.ent_iob_ 157 | 158 | 159 | def lemma(token: Token) 
-> int: 160 | return token.lemma 161 | 162 | 163 | def lemma_(token: Token) -> str: 164 | return token.lemma_ 165 | 166 | 167 | def norm(token: Token) -> int: 168 | return token.norm 169 | 170 | 171 | def norm_(token: Token) -> str: 172 | return token.norm_ 173 | 174 | 175 | def pos(token: Token) -> int: 176 | return token.pos 177 | 178 | 179 | def pos_(token: Token) -> str: 180 | return token.pos_ 181 | 182 | 183 | def tag(token: Token) -> int: 184 | return token.tag 185 | 186 | 187 | def tag_(token: Token) -> str: 188 | return token.tag_ 189 | 190 | 191 | def dep(token: Token) -> int: 192 | return token.dep 193 | 194 | 195 | def dep_(token: Token) -> str: 196 | return token.dep_ 197 | 198 | 199 | def is_sent_start(token: Token) -> bool: 200 | return token.is_sent_start 201 | 202 | 203 | def is_stop(token: Token) -> bool: 204 | return token.is_stop 205 | 206 | 207 | def is_not_stop(token: Token) -> bool: 208 | return not token.is_stop 209 | 210 | 211 | def ent_label_ene(token: Token) -> str: 212 | if token.ent_iob_ in "BI": 213 | return token.ent_iob_ + "-" + token.ent_type_ 214 | else: 215 | return token.ent_iob_ 216 | 217 | 218 | def ent_label_ontonotes(token: Token) -> str: 219 | if token.ent_iob_ in "BI": 220 | return token.ent_iob_ + "-" + ENE_ONTONOTES_MAPPING.get(token.ent_type_, "OTHERS") 221 | else: 222 | return token.ent_iob_ 223 | 224 | 225 | # token field getters for Doc.user_data 226 | 227 | def reading_form(token: Token, use_orth_if_none: bool) -> str: 228 | reading = token.morph.get("Reading") 229 | if reading: 230 | return reading[0] 231 | elif use_orth_if_none: 232 | return token.orth_ 233 | else: 234 | return None 235 | 236 | 237 | def inflection(token: Token) -> str: 238 | inf = token.morph.get("Inflection") 239 | if inf: 240 | return inf[0].replace(";", ",") 241 | else: 242 | return "" 243 | 244 | 245 | # bunsetu related field getters for Doc.user_data 246 | 247 | def bunsetu_bi_label(token: Token): 248 | return bunsetu_bi_labels(token.doc)[token.i] 249 | 250 | 251 | def bunsetu_position_type(token: Token): 252 | return bunsetu_position_types(token.doc)[token.i] 253 | 254 | 255 | def is_bunsetu_head(token: Token): 256 | return token.i in token.doc.user_data["bunsetu_heads"] 257 | 258 | 259 | SEP = "+" 260 | 261 | 262 | def default_join_func(elements): 263 | return SEP.join([element if isinstance(element, str) else str(element) for element in elements]) 264 | 265 | 266 | T = TypeVar('T') 267 | U = TypeVar('U') 268 | V = TypeVar('V') 269 | 270 | 271 | # curried function: ex. traverse(children, lemma_)(token) 272 | @singledispatch 273 | def traverse( 274 | traverse_func: Callable[[Token], Iterable[Token]], 275 | element_func: Callable[[Token], T] = lambda token: token, 276 | condition_func: Callable[[Token], bool] = lambda token: True, 277 | join_func: Callable[[Iterable[T]], U] = lambda lst: lst, 278 | ) -> Callable[[Union[Token, Span]], U]: 279 | return lambda token: join_func([ 280 | element_func(t) for t in traverse_func(token) if condition_func(t) 281 | ]) 282 | 283 | 284 | # overload: ex. 
traverse(token, children, lemma_) 285 | @traverse.register(Token) 286 | def _traverse( 287 | token: Token, 288 | traverse_func: Callable[[Token], Iterable[Token]], 289 | element_func: Callable[[Token], T] = lambda token: token, 290 | condition_func: Callable[[Token], bool] = lambda token: True, 291 | join_func: Callable[[Iterable[T]], U] = lambda lst: lst, 292 | ) -> U: 293 | return traverse(traverse_func, element_func, condition_func, join_func)(token) 294 | 295 | 296 | def head(token: Token) -> Token: 297 | return token.head 298 | 299 | 300 | def ancestors(token: Token) -> Iterable[Token]: 301 | return token.ancestors 302 | 303 | 304 | def conjuncts(token: Token) -> Tuple[Token]: 305 | return token.conjuncts 306 | 307 | 308 | def children(token: Token) -> Iterable[Token]: 309 | return token.children 310 | 311 | 312 | def lefts(token: Token) -> Iterable[Token]: 313 | return token.lefts 314 | 315 | 316 | def rights(token: Token) -> Iterable[Token]: 317 | return token.rights 318 | 319 | 320 | def subtree(token: Token) -> Iterable[Token]: 321 | return token.subtree 322 | 323 | 324 | # curried function: ex. bunsetu(lemma_)(token) 325 | @singledispatch 326 | def bunsetu( 327 | element_func: Callable[[Token], T] = lambda token: token, 328 | condition_func: Callable[[Token], bool] = lambda token: True, 329 | join_func: Callable[[Iterable[T]], U] = default_join_func, 330 | ) -> Callable[[Token], U]: 331 | return traverse(bunsetu_span, element_func, condition_func, join_func) 332 | 333 | 334 | # overload: ex. bunsetu(token, lemma_) 335 | @bunsetu.register(Token) 336 | def _bunsetu( 337 | token: Token, 338 | element_func: Callable[[Token], T] = lambda token: token, 339 | condition_func: Callable[[Token], bool] = lambda token: True, 340 | join_func: Callable[[Iterable[T]], U] = default_join_func, 341 | ) -> U: 342 | return traverse(bunsetu_span, element_func, condition_func, join_func)(token) 343 | 344 | 345 | # curried function: ex. phrase(lemma_)(token) 346 | @singledispatch 347 | def phrase( 348 | element_func: Callable[[Token], T] = lambda token: token, 349 | condition_func: Callable[[Token], bool] = lambda token: True, 350 | join_func: Callable[[Iterable[T]], U] = default_join_func, 351 | ) -> Callable[[Token], U]: 352 | return traverse(bunsetu_phrase_span, element_func, condition_func, join_func) 353 | 354 | 355 | # overload: ex. phrase(token) 356 | @phrase.register(Token) 357 | def _phrase( 358 | token: Token, 359 | element_func: Callable[[Token], T] = lambda token: token, 360 | condition_func: Callable[[Token], bool] = lambda token: True, 361 | join_func: Callable[[Iterable[T]], U] = default_join_func, 362 | ) -> U: 363 | return traverse(bunsetu_phrase_span, element_func, condition_func, join_func)(token) 364 | 365 | 366 | # curried function: ex. sub_phrases(lemma_)(token) 367 | @singledispatch 368 | def sub_phrases( 369 | phrase_func: Callable[[Token], U] = _phrase, 370 | condition_func: Callable[[Token], bool] = lambda token: True, 371 | ) -> Callable[[Token], Iterable[Tuple[str, U]]]: 372 | return lambda token: _sub_phrases( 373 | token, 374 | phrase_func, 375 | condition_func, 376 | ) 377 | 378 | 379 | # overload: ex. 
sub_phrases(token, lemma_) 380 | @sub_phrases.register(Token) 381 | def _sub_phrases( 382 | token: Token, 383 | phrase_func: Callable[[Token], U] = _phrase, 384 | condition_func: Callable[[Token], bool] = lambda token: True, 385 | ) -> Iterable[Tuple[str, U]]: 386 | return [ 387 | ( 388 | t.dep_, 389 | phrase_func(t), 390 | ) for t in bunsetu_span(token).root.children if t.i in bunsetu_head_list(token.doc) and condition_func(t) 391 | ] 392 | 393 | 394 | # curried function: ex. phrases(lemma_)(sent) 395 | @singledispatch 396 | def phrases( 397 | phrase_func: Callable[[Token], U] = _phrase, 398 | condition_func: Callable[[Token], bool] = lambda token: True, 399 | ) -> Callable[[Span], Iterable[U]]: 400 | return lambda sent: _phrases_span( 401 | sent, 402 | phrase_func, 403 | condition_func, 404 | ) if isinstance(sent, Span) else _phrases_doc( 405 | sent, 406 | phrase_func, 407 | condition_func, 408 | ) 409 | 410 | 411 | # overload: ex. phrases(sent, lemma_) 412 | @phrases.register(Span) 413 | def _phrases_span( 414 | sent: Span, 415 | phrase_func: Callable[[Token], U] = _phrase, 416 | condition_func: Callable[[Token], bool] = lambda token: True, 417 | ) -> Iterable[U]: 418 | return [ 419 | phrase_func(t) for t in bunsetu_head_tokens(sent) if condition_func(t) 420 | ] 421 | 422 | 423 | # overload: ex. phrases(doc, lemma_) 424 | @phrases.register(Doc) 425 | def _phrases_doc( 426 | doc: Doc, 427 | phrase_func: Callable[[Token], U] = _phrase, 428 | condition_func: Callable[[Token], bool] = lambda token: True, 429 | ) -> Iterable[U]: 430 | return [ 431 | phrase_func(t) for t in bunsetu_head_tokens(doc[:]) if condition_func(t) 432 | ] 433 | 434 | 435 | # curried function: ex. sub_tokens("B", lambda sub_token: sub_token.lemma)(token) 436 | @singledispatch 437 | def sub_tokens( 438 | mode: str = "A", # "A" or "B" 439 | sub_token_func: Callable[[DetailedToken], T] = lambda sub_token: sub_token, 440 | join_func: Callable[[Iterable[T]], U] = default_join_func, 441 | ) -> Callable[[Token], U]: 442 | return lambda token: _sub_tokens(token, mode, sub_token_func, join_func) 443 | 444 | 445 | # overload: ex. 
sub_tokens(token, "B", lambda sub_token: sub_token.lemma) 446 | @sub_tokens.register(Token) 447 | def _sub_tokens( 448 | token: Token, 449 | mode: str = "A", # "A" or "B" 450 | sub_token_func: Callable[[DetailedToken], T] = lambda sub_token: sub_token.surface, 451 | join_func: Callable[[Iterable[T]], U] = default_join_func, 452 | ) -> U: 453 | if token.doc.user_data["sub_tokens"][token.i]: 454 | elements = token.doc.user_data["sub_tokens"][token.i][{"A": 0, "B": 1}[mode]] 455 | else: 456 | elements = [ 457 | DetailedToken( 458 | token.orth_, 459 | token.tag_, 460 | inflection(token), 461 | token.lemma_, 462 | reading_form(token, True), 463 | None, 464 | ) 465 | ] 466 | return join_func([ 467 | sub_token_func(element) for element in elements 468 | ]) 469 | -------------------------------------------------------------------------------- /ginza/__main__.py: -------------------------------------------------------------------------------- 1 | import plac 2 | 3 | from .command_line import run_ginza, run_ginzame 4 | 5 | 6 | def main_ginzame(): 7 | plac.call(run_ginzame) 8 | 9 | 10 | def main_ginza(): 11 | plac.call(run_ginza) 12 | 13 | 14 | if __name__ == "__main__": 15 | plac.call(run_ginza) 16 | -------------------------------------------------------------------------------- /ginza/analyzer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import sys 3 | from typing import Iterable, Optional 4 | 5 | import thinc 6 | 7 | import spacy 8 | from spacy.tokens import Doc, Span 9 | from spacy.language import Language 10 | from spacy.lang.ja import Japanese 11 | 12 | from . import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes, bunsetu_bi_label, bunsetu_position_type, clause_head_i 13 | from .bunsetu_recognizer import bunsetu_available, bunsetu_head_list, bunsetu_phrase_span 14 | 15 | 16 | def try_sudachi_import(split_mode: str): 17 | """SudachiPy is required for Japanese support, so check for it. 18 | It it's not available blow up and explain how to fix it. 19 | split_mode should be one of these values: "A", "B", "C", None->"A".""" 20 | try: 21 | from sudachipy import dictionary, tokenizer 22 | 23 | split_mode = { 24 | None: tokenizer.Tokenizer.SplitMode.A, 25 | "A": tokenizer.Tokenizer.SplitMode.A, 26 | "B": tokenizer.Tokenizer.SplitMode.B, 27 | "C": tokenizer.Tokenizer.SplitMode.C, 28 | }[split_mode] 29 | tok = dictionary.Dictionary().create(mode=split_mode) 30 | return tok 31 | except ImportError: 32 | raise ImportError( 33 | "Japanese support requires SudachiPy and SudachiDict-core " 34 | "(https://github.com/WorksApplications/SudachiPy). " 35 | "Install with `pip install sudachipy sudachidict_core` or " 36 | "install spaCy with `pip install spacy[ja]`." 
37 | ) from None 38 | 39 | 40 | class Analyzer: 41 | def __init__( 42 | self, 43 | model_name_or_path: str, 44 | split_mode: str, 45 | hash_comment: str, 46 | output_format: str, 47 | require_gpu: int, 48 | disable_sentencizer: bool, 49 | use_normalized_form: bool, 50 | ) -> None: 51 | self.model_name_or_path = model_name_or_path 52 | self.split_mode = split_mode 53 | self.hash_comment = hash_comment 54 | self.output_format = output_format 55 | self.require_gpu = require_gpu 56 | self.disable_sentencizer = disable_sentencizer 57 | self.use_normalized_form = use_normalized_form 58 | self.nlp: Optional[Language] = None 59 | 60 | def set_nlp(self) -> None: 61 | if self.nlp: 62 | return 63 | 64 | if self.require_gpu >= 0: 65 | thinc.api.require_gpu(self.require_gpu) 66 | 67 | if self.output_format in ["2", "mecab"]: 68 | nlp = try_sudachi_import(self.split_mode) 69 | else: 70 | # Work-around for pickle error. Need to share model data. 71 | if self.model_name_or_path: 72 | nlp = spacy.load(self.model_name_or_path) 73 | else: 74 | try: 75 | nlp = spacy.load("ja_ginza_electra") 76 | except IOError as e: 77 | try: 78 | nlp = spacy.load("ja_ginza") 79 | except IOError as e: 80 | try: 81 | nlp = spacy.load("ja_ginza_bert_large") 82 | except IOError as e: 83 | raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.') 84 | 85 | if self.disable_sentencizer: 86 | nlp.add_pipe("disable_sentencizer", before="parser") 87 | 88 | if self.split_mode: 89 | set_split_mode(nlp, self.split_mode) 90 | 91 | self.nlp = nlp 92 | self.use_orth_if_reading_is_none = isinstance(self.nlp, Japanese) 93 | 94 | def analyze_batch(self, lines: Iterable[str]) -> str: 95 | self.set_nlp() 96 | if self.output_format in ["2", "mecab"]: 97 | return "".join(self.analyze_line(line) for line in lines) 98 | 99 | if self.hash_comment == "print": 100 | batch = list(self.nlp.pipe(line.rstrip("\n") for line in lines if not line.startswith("#"))) 101 | docs = [] 102 | index = 0 103 | for line in lines: 104 | if line.startswith("#"): 105 | docs.append(line) 106 | else: 107 | docs.append(batch[index]) 108 | index += 1 109 | else: 110 | lines = [line.rstrip("\n") for line in lines if self.hash_comment != "skip" or not line.startswith("#")] 111 | docs = self.nlp.pipe(lines) 112 | 113 | if self.output_format in ["3", "json"]: 114 | sep = ",\n" 115 | else: 116 | sep = "" 117 | return sep.join(format_doc(doc, self.output_format, self.use_normalized_form, self.use_orth_if_reading_is_none) if isinstance(doc, Doc) else doc for doc in docs) 118 | 119 | def analyze_line(self, input_line: str) -> str: 120 | line = input_line.rstrip("\n") 121 | if line.startswith("#"): 122 | if self.hash_comment == "print": 123 | return input_line 124 | elif self.hash_comment == "skip": 125 | return "" 126 | if line == "": 127 | return "\n" 128 | if self.output_format in ["2", "mecab"]: 129 | doc = self.nlp.tokenize(line) 130 | else: 131 | doc = self.nlp(line) 132 | return format_doc(doc, self.output_format, self.use_normalized_form, self.use_orth_if_reading_is_none) 133 | 134 | 135 | def format_doc( 136 | doc: Doc, output_format: str, use_normalized_form: bool, use_orth_if_reading_is_none: bool, 137 | ) -> str: 138 | if output_format in ["0", "conllu"]: 139 | return "".join(format_conllu(sent, use_normalized_form, use_orth_if_reading_is_none) for sent in doc.sents) 140 | elif output_format in ["1", "cabocha"]: 141 | return "".join(format_cabocha(sent, use_normalized_form) for sent 
in doc.sents) 142 | elif output_format in ["2", "mecab"]: 143 | return "".join(format_mecab(doc, use_normalized_form)) 144 | elif output_format in ["3", "json"]: 145 | return ",\n".join(format_json(sent) for sent in doc.sents) 146 | else: 147 | raise Exception(output_format + " is not supported") 148 | 149 | 150 | def format_json(sent: Span) -> str: 151 | token_lines = ",\n".join( 152 | f""" {{"id":{ 153 | token.i - sent.start + 1 154 | },"orth":"{ 155 | token.orth_ 156 | }","tag":"{ 157 | token.tag_ 158 | }","pos":"{ 159 | token.pos_ 160 | }","lemma":"{ 161 | token.lemma_ 162 | }","norm":"{ 163 | token.norm_ 164 | }","head":{ 165 | token.head.i - token.i 166 | },"dep":"{ 167 | token.dep_ 168 | }","ner":"{ 169 | token.ent_iob_ 170 | }{ 171 | "-" + token.ent_type_ if token.ent_type_ else "" 172 | }"{ 173 | ',"whitespacce":"' + token.whitespace_ + '"' if token.whitespace_ else "" 174 | }}}""" for token in sent 175 | ) 176 | return f""" {{ 177 | "paragraphs": [ 178 | {{ 179 | "raw": "{sent.text}", 180 | "sentences": [ 181 | {{ 182 | "tokens": [ 183 | {token_lines} 184 | ] 185 | }} 186 | ] 187 | }} 188 | ] 189 | }}""" 190 | 191 | 192 | def format_conllu(sent: Span, use_normalized_form, use_orth_if_reading_is_none, print_origin=True) -> str: 193 | np_labels = [""] * len(sent) 194 | use_bunsetu = bunsetu_available(sent) 195 | if use_bunsetu: 196 | for head_i in bunsetu_head_list(sent): 197 | bunsetu_head_token = sent[head_i] 198 | phrase = bunsetu_phrase_span(bunsetu_head_token) 199 | if phrase.label_ == "NP": 200 | for idx in range(phrase.start - sent.start, phrase.end - sent.start): 201 | np_labels[idx] = "NP_B" if idx == phrase.start else "NP_I" 202 | token_lines = "".join(conllu_token_line(sent, token, np_label, use_bunsetu, use_normalized_form, use_orth_if_reading_is_none) for token, np_label in zip(sent, np_labels)) 203 | if print_origin: 204 | return f"# text = {sent.text}\n{token_lines}\n" 205 | else: 206 | return f"{token_lines}\n" 207 | 208 | 209 | def conllu_token_line(sent, token, np_label, use_bunsetu, use_normalized_form, use_orth_if_reading_is_none) -> str: 210 | bunsetu_bi = bunsetu_bi_label(token) if use_bunsetu else None 211 | position_type = bunsetu_position_type(token) if use_bunsetu else None 212 | inf = inflection(token) 213 | reading = reading_form(token, use_orth_if_reading_is_none) 214 | ne = ent_label_ontonotes(token) 215 | ene = ent_label_ene(token) 216 | clause_head = clause_head_i(token) + 1 217 | misc = "|".join( 218 | filter( 219 | lambda s: s, 220 | ( 221 | "SpaceAfter=Yes" if token.whitespace_ else "SpaceAfter=No", 222 | "" if not bunsetu_bi else f"BunsetuBILabel={bunsetu_bi}", 223 | "" if not position_type else f"BunsetuPositionType={position_type}", 224 | np_label, 225 | "" if not inf else f"Inf={inf}", 226 | "" if not reading else "Reading={}".format(reading.replace("|", "\\|").replace("\\", "\\\\")), 227 | "" if not ne or ne == "O" else f"NE={ne}", 228 | "" if not ene or ene == "O" else f"ENE={ene}", 229 | "" if not clause_head else f"ClauseHead={clause_head}", 230 | ) 231 | ) 232 | ) 233 | 234 | return "\t".join( 235 | [ 236 | str(token.i - sent.start + 1), 237 | token.orth_, 238 | token.norm_ if use_normalized_form else token.lemma_, 239 | token.pos_, 240 | token.tag_.replace(",*", "").replace(",", "-"), 241 | "NumType=Card" if token.pos_ == "NUM" else "_", 242 | "0" if token.head.i == token.i else str(token.head.i - sent.start + 1), 243 | token.dep_.lower() if token.dep_ else "_", 244 | "_", 245 | misc if misc else "_", 246 | ] 247 | ) + "\n" 248 | 249 | 
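# Editor's note: the following helper is an illustrative sketch added alongside this review
# and is not part of the original analyzer.py; the name parse_conllu_misc is hypothetical.
# conllu_token_line() above assembles the CoNLL-U MISC column by joining "key=value" pairs
# and bare flags with "|" (e.g. "SpaceAfter=No|BunsetuBILabel=B|NP_B|Reading=ギンザ|NE=B-GPE").
# A minimal reader for that column could look like this; it does not handle the escaped
# "\|" sequences that Reading values may contain.
def parse_conllu_misc(misc: str) -> dict:
    """Split a CoNLL-U MISC field into a dict; bare flags such as NP_B map to True."""
    if misc == "_":
        return {}
    fields = {}
    for item in misc.split("|"):
        if "=" in item:
            key, value = item.split("=", 1)
            fields[key] = value
        else:
            fields[item] = True  # positional flags like NP_B / NP_I
    return fields
# Hypothetical usage: parse_conllu_misc("SpaceAfter=No|NP_B|Reading=ギンザ")
# returns {"SpaceAfter": "No", "NP_B": True, "Reading": "ギンザ"}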
250 | def format_cabocha(sent: Span, use_normalized_form) -> str: 251 | bunsetu_index_list = {} 252 | bunsetu_index = -1 253 | for token in sent: 254 | if bunsetu_bi_label(token) == "B": 255 | bunsetu_index += 1 256 | bunsetu_index_list[token.i] = bunsetu_index 257 | 258 | lines = [] 259 | for token in sent: 260 | if bunsetu_bi_label(token) == "B": 261 | lines.append(cabocha_bunsetu_line(sent, bunsetu_index_list, token)) 262 | lines.append(cabocha_token_line(token, use_normalized_form)) 263 | lines.append("EOS\n\n") 264 | return "".join(lines) 265 | 266 | 267 | def cabocha_bunsetu_line(sent: Span, bunsetu_index_list, token) -> str: 268 | bunsetu_head_index = None 269 | bunsetu_dep_index = None 270 | bunsetu_func_index = None 271 | dep_type = "D" 272 | for t in token.doc[token.i : sent.end]: 273 | if bunsetu_index_list[t.i] != bunsetu_index_list[token.i]: 274 | if bunsetu_func_index is None: 275 | bunsetu_func_index = t.i - token.i 276 | break 277 | tbi = bunsetu_index_list[t.head.i] 278 | if bunsetu_index_list[t.i] != tbi: 279 | bunsetu_head_index = t.i - token.i 280 | bunsetu_dep_index = tbi 281 | if bunsetu_func_index is None and bunsetu_position_type(t) in {"FUNC", "SYN_HEAD"}: 282 | bunsetu_func_index = t.i - token.i 283 | else: 284 | if bunsetu_func_index is None: 285 | bunsetu_func_index = len(sent) - token.i 286 | if bunsetu_head_index is None: 287 | bunsetu_head_index = 0 288 | if bunsetu_dep_index is None: 289 | bunsetu_dep_index = -1 290 | return "* {} {}{} {}/{} 0.000000\n".format( 291 | bunsetu_index_list[token.i], 292 | bunsetu_dep_index, 293 | dep_type, 294 | bunsetu_head_index, 295 | bunsetu_func_index, 296 | ) 297 | 298 | 299 | def cabocha_token_line(token, use_normalized_form) -> str: 300 | part_of_speech = token.tag_.replace("-", ",") 301 | inf = inflection(token) 302 | part_of_speech += ",*" * (3 - part_of_speech.count(",")) + "," + (inf if inf else "*,*") 303 | reading = reading_form(token, True) 304 | return "{}\t{},{},{},{}\t{}\n".format( 305 | token.orth_, 306 | part_of_speech, 307 | token.norm_ if use_normalized_form else token.lemma_, 308 | reading if reading else token.orth_, 309 | "*", 310 | "O" if token.ent_iob_ == "O" else "{}-{}".format(token.ent_iob_, token.ent_type_), 311 | ) 312 | 313 | 314 | def format_mecab(sudachipy_tokens, use_normalized_form) -> str: 315 | return "".join(mecab_token_line(t, use_normalized_form) for t in sudachipy_tokens) + "EOS\n\n" 316 | 317 | 318 | def mecab_token_line(token, use_normalized_form) -> str: 319 | reading = token.reading_form() 320 | return "{}\t{},{},{},{}\n".format( 321 | token.surface(), 322 | ",".join(token.part_of_speech()), 323 | token.normalized_form() if use_normalized_form else token.dictionary_form(), 324 | reading if reading else token.surface(), 325 | "*", 326 | ) 327 | -------------------------------------------------------------------------------- /ginza/bunsetu_recognizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, Iterable, List, Optional, Set 3 | 4 | from spacy.language import Language 5 | from spacy.tokens import Doc, Span, Token 6 | 7 | __all__ = [ 8 | "bunsetu_available", 9 | "bunsetu_span", 10 | "bunsetu_spans", 11 | "bunsetu_phrase_span", 12 | "bunsetu_phrase_spans", 13 | "bunsetu_head_list", 14 | "bunsetu_head_tokens", 15 | "bunsetu_bi_labels", 16 | "bunsetu_position_types", 17 | "BunsetuRecognizer", 18 | "append_bunsetu_head_dep_suffix", 19 | "BUNSETU_HEAD_SUFFIX", 20 | "PHRASE_RELATIONS", 21 | "POS_PHRASE_MAP", 22 
| "clauses", 23 | "clause_head", 24 | "clause_head_i", 25 | "CLAUSE_MARKER_RULES", 26 | "MIN_BUNSETU_NUM_IN_CLAUSE", 27 | ] 28 | 29 | 30 | BUNSETU_HEAD_SUFFIX = "_bunsetu" 31 | 32 | PHRASE_RELATIONS = ("compound", "nummod", "nmod") 33 | 34 | POS_PHRASE_MAP = { 35 | "NOUN": "NP", 36 | "NUM": "NP", 37 | "PRON": "NP", 38 | "PROPN": "NP", 39 | 40 | "VERB": "VP", 41 | 42 | "ADJ": "ADJP", 43 | 44 | "ADV": "ADVP", 45 | 46 | "CCONJ": "CCONJP", 47 | } 48 | 49 | CLAUSE_MARKER_RULES = [ 50 | { 51 | "tag_": "補助記号-読点", 52 | }, 53 | ] 54 | 55 | MIN_BUNSETU_NUM_IN_CLAUSE = 2 56 | 57 | 58 | def bunsetu_available(span: Span): 59 | return "bunsetu_heads" in span.doc.user_data 60 | 61 | 62 | def bunsetu_head_list(span: Span) -> Iterable[int]: 63 | doc = span.doc 64 | heads = doc.user_data["bunsetu_heads"] 65 | if isinstance(span, Doc): 66 | return heads 67 | else: 68 | start = span.start 69 | end = span.end 70 | return [i - start for i in heads if start <= i < end] 71 | 72 | 73 | def bunsetu_head_tokens(span: Span) -> Iterable[Token]: 74 | doc = span.doc 75 | heads = doc.user_data["bunsetu_heads"] 76 | if isinstance(span, Doc): 77 | start = 0 78 | end = len(span) 79 | else: 80 | start = span.start 81 | end = span.end 82 | return [span[i - start] for i in heads if start <= i < end] 83 | 84 | 85 | def bunsetu_spans(span: Span) -> Iterable[Span]: 86 | return [ 87 | bunsetu_span(head) for head in bunsetu_head_tokens(span) 88 | ] 89 | 90 | 91 | def bunsetu_span(token: Token) -> Span: 92 | bunsetu_bi_list = bunsetu_bi_labels(token.doc) 93 | start = token.i 94 | end = start + 1 95 | for idx in range(start, 0, -1): 96 | if bunsetu_bi_list[idx] == "B" or token.doc[idx].is_sent_start: 97 | start = idx 98 | break 99 | else: 100 | start = 0 101 | doc_len = len(token.doc) 102 | for idx in range(end, doc_len): 103 | if bunsetu_bi_list[idx] == "B": 104 | end = idx 105 | break 106 | else: 107 | end = doc_len 108 | 109 | doc = token.doc 110 | return Span(doc, start=start, end=end, label=POS_PHRASE_MAP.get(doc[start:end].root.pos_, "")) 111 | 112 | 113 | def bunsetu_phrase_spans(span: Span, phrase_relations: Iterable[str] = PHRASE_RELATIONS) -> Iterable[Span]: 114 | return [ 115 | bunsetu_phrase_span(head, phrase_relations) for head in bunsetu_head_tokens(span) 116 | ] 117 | 118 | 119 | def bunsetu_phrase_span(token: Token, phrase_relations: Iterable[str] = PHRASE_RELATIONS) -> Span: 120 | def _traverse(head, _bunsetu, result): 121 | for t in head.children: 122 | if _bunsetu.start <= t.i < _bunsetu.end: 123 | if t.dep_ in phrase_relations: 124 | _traverse(t, _bunsetu, result) 125 | result.append(head.i) 126 | bunsetu = bunsetu_span(token) 127 | phrase_tokens = [] 128 | _traverse(bunsetu.root, bunsetu, phrase_tokens) 129 | start = min(phrase_tokens) 130 | end = max(phrase_tokens) + 1 131 | return Span(token.doc, start=start, end=end, label=bunsetu.label_) 132 | 133 | 134 | def bunsetu_bi_labels(span: Span) -> List[str]: 135 | doc = span.doc 136 | bunsetu_bi = doc.user_data["bunsetu_bi_labels"] 137 | if isinstance(span, Doc): 138 | return bunsetu_bi 139 | else: 140 | start = span.start 141 | end = span.end 142 | return bunsetu_bi[start:end] 143 | 144 | 145 | def bunsetu_position_types(span: Span) -> List[str]: 146 | doc = span.doc 147 | position_types = doc.user_data["bunsetu_position_types"] 148 | if isinstance(span, Doc): 149 | return position_types 150 | else: 151 | start = span.start 152 | end = span.end 153 | return position_types[start:end] 154 | 155 | 156 | def clauses(doc: Doc) -> List[Token]: 157 | clauses = 
doc.user_data["clauses"] 158 | return [[doc[token] for token in tokens] for tokens in clauses.values()] 159 | 160 | 161 | def clause_head(token: Token) -> Token: 162 | return token.doc[token.doc.user_data["clause_heads"][token.i]] 163 | 164 | 165 | def clause_head_i(token: Token) -> int: 166 | doc = token.doc 167 | return doc.user_data["clause_heads"][token.i] - token.sent.start 168 | 169 | 170 | class BunsetuRecognizer: 171 | def __init__( 172 | self, 173 | nlp: Language, 174 | remain_bunsetu_suffix: bool = False, 175 | clause_marker_rules: List[Dict[str, str]] = CLAUSE_MARKER_RULES, 176 | min_bunsetu_num_in_clause: int = MIN_BUNSETU_NUM_IN_CLAUSE, 177 | ) -> None: 178 | self.nlp = nlp 179 | self._remain_bunsetu_suffix = remain_bunsetu_suffix 180 | self._clause_marker_rules = [{k: re.compile(v) for k, v in rule.items()} for rule in clause_marker_rules] 181 | self._min_bunsetu_num_in_clause = min_bunsetu_num_in_clause 182 | 183 | @property 184 | def remain_bunsetu_suffix(self) -> str: 185 | return self._remain_bunsetu_suffix 186 | 187 | @remain_bunsetu_suffix.setter 188 | def remain_bunsetu_suffix(self, remain: bool): 189 | self._remain_bunsetu_suffix = remain 190 | 191 | @property 192 | def clause_marker_rules(self) -> List[Dict[str, str]]: 193 | return [{k: v.pattern for k, v in rules.items()} for rules in self._clause_marker_rules] 194 | 195 | @clause_marker_rules.setter 196 | def clause_marker_rules(self, _clause_marker_rules: List[Dict[str, str]]): 197 | self._clause_markers = [{k: re.compile(v) for k, v in rules} for rules in _clause_marker_rules] 198 | 199 | @property 200 | def min_bunsetu_num_in_clause(self) -> int: 201 | return self._min_bunsetu_num_in_clause 202 | 203 | @min_bunsetu_num_in_clause.setter 204 | def min_bunsetu_num_in_clause(self, _min_bunsetu_num_in_clause: int): 205 | self._min_bunsetu_num_in_clause = _min_bunsetu_num_in_clause 206 | 207 | def __call__(self, doc: Doc) -> Doc: 208 | debug = False 209 | heads = [False] * len(doc) 210 | for t in doc: 211 | if t.dep_ == "ROOT": 212 | heads[t.i] = True 213 | elif t.dep_.endswith(BUNSETU_HEAD_SUFFIX): 214 | heads[t.i] = True 215 | if not self._remain_bunsetu_suffix: 216 | t.dep_ = t.dep_[:-len(BUNSETU_HEAD_SUFFIX)] 217 | for t in doc: # recovering uncovered subtrees 218 | if heads[t.i]: 219 | while t.head.i < t.i and not heads[t.head.i]: 220 | heads[t.head.i] = t.head.pos_ not in {"PUNCT"} 221 | if debug and heads[t.head.i]: 222 | print("========= A", t.i + 1, t.orth_, "=========") 223 | print(list((t.i + 1, t.orth_, t.head.i + 1) for t, is_head in zip(doc, heads) if is_head)) 224 | t = t.head 225 | heads[t.head.i] = True 226 | 227 | for ent in doc.ents: # removing head inside ents 228 | head = None 229 | outer = None 230 | for t in ent: 231 | if t.head.i == t.i or t.head.i < ent.start or ent.end <= t.head.i: 232 | if not outer: 233 | head = t 234 | outer = t.head 235 | elif outer.i != t.head.i: 236 | break 237 | else: 238 | if head: 239 | for t in ent: 240 | if t.i != head.i: 241 | heads[t.i] = False 242 | 243 | bunsetu_heads = tuple(idx for idx, is_head in enumerate(heads) if is_head) 244 | 245 | bunsetu_bi = ["I"] * len(doc) 246 | if bunsetu_bi: 247 | bunsetu_bi[0] = "B" 248 | for head_i, next_head_i in zip(bunsetu_heads[:-1], bunsetu_heads[1:]): 249 | l_head = doc[head_i] 250 | r_head = doc[next_head_i] 251 | if l_head.right_edge.i + 1 == r_head.left_edge.i or l_head.right_edge.i >= r_head.i: # (l)(r) or (l (r)) 252 | bunsetu_bi[r_head.left_edge.i] = "B" 253 | elif l_head.i <= r_head.left_edge.i: # ((l) r) 254 | 
bunsetu_bi[l_head.right_edge.i + 1] = "B" 255 | else: # ((l) (missed_tokens) (r)) 256 | l_ancestors = set(t.i for t in l_head.ancestors) 257 | r_ancestors = set(t.i for t in r_head.ancestors) 258 | for m in doc[l_head.right_edge.i + 1: r_head.left_edge.i]: # find closer branch 259 | found = False 260 | for m_ancestor in [m] + list(m.ancestors): 261 | if m_ancestor.i in r_ancestors: 262 | bunsetu_bi[m_ancestor.i] = "B" 263 | found = True 264 | break 265 | elif m_ancestor.i in l_ancestors: 266 | break 267 | if found: 268 | break 269 | else: 270 | bunsetu_bi[l_head.right_edge.i + 1] = "B" 271 | 272 | doc.user_data["bunsetu_heads"] = bunsetu_heads 273 | doc.user_data["bunsetu_bi_labels"] = bunsetu_bi 274 | 275 | position_types = [None] * len(doc) 276 | for head in bunsetu_heads: 277 | phrase = bunsetu_phrase_span(doc[head]) 278 | for t in phrase: 279 | if t.i == t.head.i: 280 | position_types[t.i] = "ROOT" 281 | elif t.i == head: 282 | position_types[t.i] = "NO_HEAD" if t.dep_ == "punct" else "SEM_HEAD" 283 | else: 284 | position_types[t.i] = "CONT" 285 | first_func = True 286 | for t, bi, position_type in reversed(list(zip(doc, bunsetu_bi, position_types))): 287 | if bi: 288 | first_func = True 289 | if position_type is None: 290 | if t.pos_ in {'AUX', 'ADP', 'SCONJ', 'CCONJ', 'PART'}: 291 | if first_func: 292 | position_types[t.i] = "SYN_HEAD" 293 | first_func = False 294 | else: 295 | position_types[t.i] = "FUNC" 296 | else: 297 | position_types[t.i] = "CONT" 298 | doc.user_data["bunsetu_position_types"] = position_types 299 | 300 | bunsetu_heads_set = set(bunsetu_heads) 301 | clause_head_candidates = set() 302 | roots = set() 303 | for t in doc: 304 | for rule in self._clause_marker_rules: 305 | if t.dep_.lower() == "root": 306 | roots.add(t.i) 307 | continue 308 | for attr, pattern in rule.items(): 309 | if not pattern.fullmatch(getattr(t, attr)): 310 | break 311 | else: 312 | if t.i in bunsetu_heads_set: 313 | clause_head_candidates.add(t.i) 314 | else: 315 | for ancestor in t.ancestors: 316 | if ancestor.i in bunsetu_heads_set: 317 | clause_head_candidates.add(t.head.i) 318 | break 319 | break 320 | clause_head_candidates -= roots 321 | 322 | for clause_head in list(sorted(clause_head_candidates)): 323 | subtree = set(_.i for _ in doc[clause_head].subtree) 324 | if len(subtree & bunsetu_heads_set) < self._min_bunsetu_num_in_clause: 325 | clause_head_candidates.remove(clause_head) 326 | 327 | clause_head_candidates |= roots 328 | for clause_head in list(sorted(clause_head_candidates)): 329 | subtree = set(_.i for _ in doc[clause_head].subtree) 330 | subtree_bunsetu = subtree & bunsetu_heads_set 331 | descendant_clauses = subtree & clause_head_candidates - {clause_head} 332 | for subclause in descendant_clauses: 333 | subtree_bunsetu -= set(_.i for _ in doc[subclause].subtree) 334 | if len(subtree_bunsetu) < self._min_bunsetu_num_in_clause: 335 | if clause_head in roots: 336 | clause_head_candidates -= descendant_clauses 337 | else: 338 | clause_head_candidates.remove(clause_head) 339 | 340 | clause_heads = list(sorted(clause_head_candidates)) 341 | 342 | def _children_except_clause_heads(idx): 343 | children = [] 344 | for t in doc[idx].lefts: 345 | if t.i in clause_heads: 346 | continue 347 | children += _children_except_clause_heads(t.i) 348 | children.append(idx) 349 | for t in doc[idx].rights: 350 | if t.i in clause_heads: 351 | continue 352 | children += _children_except_clause_heads(t.i) 353 | return children 354 | 355 | clauses = {head: _children_except_clause_heads(head) for head 
in clause_heads} 356 | doc.user_data["clauses"] = clauses 357 | clause_heads = [-1] * len(doc) 358 | for head, tokens in clauses.items(): 359 | for token in tokens: 360 | clause_heads[token] = head 361 | doc.user_data["clause_heads"] = clause_heads 362 | return doc 363 | 364 | 365 | def append_bunsetu_head_dep_suffix(tokens: List[Token], suffix: str = BUNSETU_HEAD_SUFFIX) -> None: 366 | if not suffix: 367 | return 368 | for token in tokens: 369 | if token.dep_.lower() == "root": 370 | return 371 | if token.head.i < tokens[0].i or tokens[-1].i < token.head.i: 372 | token.dep_ += suffix 373 | return 374 | -------------------------------------------------------------------------------- /ginza/command_line.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from multiprocessing import Process, Queue, Event, cpu_count 3 | from pathlib import Path 4 | import queue 5 | import re 6 | import sys 7 | import traceback 8 | from typing import Generator, Iterable, Optional, List 9 | 10 | import plac 11 | from .analyzer import Analyzer 12 | 13 | MINI_BATCH_SIZE = 100 14 | GINZA_MODEL_PATTERN = re.compile(r"^(ja_ginza|ja_ginza_electra)$") 15 | SPACY_MODEL_PATTERN = re.compile(r"^[a-z]{2}[-_].+[-_].+(sm|md|lg|trf)$") 16 | 17 | 18 | class _OutputWrapper: 19 | def __init__(self, output_path, output_format): 20 | self.output = None 21 | self.output_path = output_path 22 | self.output_format = output_format 23 | self.output_json_opened = False 24 | 25 | @property 26 | def is_json(self): 27 | return self.output_format in ["3", "json"] 28 | 29 | def open(self): 30 | if self.output_path: 31 | self.output = open(self.output_path, "w", encoding="utf-8") 32 | else: 33 | self.output = sys.stdout 34 | 35 | def close(self): 36 | if self.is_json and self.output_json_opened: 37 | print("\n]", file=self.output) 38 | self.output_json_opened = False 39 | if self.output_path: 40 | self.output.close() 41 | else: 42 | pass 43 | 44 | def write(self, result: str): 45 | if self.is_json: 46 | if not self.output_json_opened: 47 | print("[", file=self.output) 48 | self.output_json_opened = True 49 | else: 50 | print(",", file=self.output) 51 | print(result, end="", file=self.output) 52 | 53 | 54 | def run( 55 | model_path: Optional[str] = None, 56 | ensure_model: Optional[str] = None, 57 | split_mode: Optional[str] = None, 58 | hash_comment: str = "print", 59 | output_path: Optional[Path] = None, 60 | output_format: str = "0", 61 | require_gpu: int = -1, 62 | disable_sentencizer: bool = False, 63 | use_normalized_form: bool = False, 64 | parallel_level: int = 1, 65 | files: List[str] = None, 66 | ): 67 | if output_format in ["3", "json"] and hash_comment != "analyze": 68 | print( 69 | f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyze".', 70 | file=sys.stderr 71 | ) 72 | 73 | assert parallel_level == 1 or require_gpu == -1, "require_gpu not allowed for multi-processing. 
https://github.com/explosion/spaCy/issues/5507" 74 | 75 | if parallel_level <= 0: 76 | level = max(1, cpu_count() + parallel_level) 77 | if output_format in [2, "mecab"]: 78 | if require_gpu >= 0: 79 | print("GPU not used for mecab mode", file=sys.stderr) 80 | require_gpu = False 81 | elif parallel_level <= 0: 82 | if require_gpu >= 0: 83 | if level < 4: 84 | print(f"GPU #{require_gpu} enabled: parallel_level' set to {level}", end="", file=sys.stderr) 85 | else: 86 | print(f"GPU #{require_gpu} enabled: parallel_level' set to {level} but seems it's too much", end="", file=sys.stderr) 87 | else: 88 | print(f"'parallel_level' set to {level}", file=sys.stderr) 89 | elif require_gpu: 90 | print(f"GPU #{require_gpu} enabled", file=sys.stderr) 91 | parallel_level = level 92 | 93 | assert model_path is None or ensure_model is None 94 | if ensure_model: 95 | ensure_model = ensure_model.replace("-", "_") 96 | try: 97 | from importlib import import_module 98 | import_module(ensure_model) 99 | except ModuleNotFoundError: 100 | if GINZA_MODEL_PATTERN.match(ensure_model): 101 | print("Installing", ensure_model, file=sys.stderr) 102 | import pip 103 | pip.main(["install", ensure_model]) 104 | print("Successfully installed", ensure_model, file=sys.stderr) 105 | elif SPACY_MODEL_PATTERN.match(ensure_model): 106 | print("Installing", ensure_model, file=sys.stderr) 107 | from spacy.cli.download import download 108 | download(ensure_model) 109 | print("Successfully installed", ensure_model, file=sys.stderr) 110 | else: 111 | raise OSError("E050", f'You need to install "{ensure_model}" before executing ginza.') 112 | model_name_or_path = ensure_model 113 | else: 114 | model_name_or_path = model_path 115 | 116 | analyzer = Analyzer( 117 | model_name_or_path, 118 | split_mode, 119 | hash_comment, 120 | output_format, 121 | require_gpu, 122 | disable_sentencizer, 123 | use_normalized_form, 124 | ) 125 | 126 | output = _OutputWrapper(output_path, output_format) 127 | output.open() 128 | try: 129 | if not files and sys.stdin.isatty(): 130 | _analyze_tty(analyzer, output) 131 | else: 132 | if not files: 133 | files = [0] 134 | if parallel_level == 1: 135 | _analyze_single(analyzer, output, files) 136 | else: 137 | _analyze_parallel(analyzer, output, files, parallel_level) 138 | finally: 139 | output.close() 140 | 141 | 142 | def _analyze_tty(analyzer: Analyzer, output: _OutputWrapper) -> None: 143 | try: 144 | analyzer.set_nlp() 145 | while True: 146 | line = input() 147 | output.write(analyzer.analyze_line(line)) 148 | except EOFError: 149 | pass 150 | except KeyboardInterrupt: 151 | pass 152 | 153 | 154 | def _analyze_single(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str]) -> None: 155 | try: 156 | analyzer.set_nlp() 157 | batch = [] 158 | for path in files: 159 | with open(path, "r", encoding="utf-8") as f: 160 | for line in f: 161 | batch.append(line) 162 | if len(batch) < MINI_BATCH_SIZE: 163 | continue 164 | output.write(analyzer.analyze_batch(batch)) 165 | batch.clear() 166 | if batch: 167 | output.write(analyzer.analyze_batch(batch)) 168 | except KeyboardInterrupt: 169 | pass 170 | 171 | 172 | def _analyze_parallel(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str], parallel_level: int) -> None: 173 | try: 174 | in_queue = Queue(maxsize=parallel_level * 2) 175 | out_queue = Queue() 176 | 177 | p_analyzes = [] 178 | abort = Event() 179 | for _ in range(parallel_level): 180 | p = Process(target=_multi_process_analyze, args=(analyzer, in_queue, out_queue, abort), daemon=True) 181 
| p.start() 182 | p_analyzes.append(p) 183 | 184 | p_load = Process(target=_multi_process_load, args=(in_queue, files, MINI_BATCH_SIZE, parallel_level, abort), daemon=True) 185 | p_load.start() 186 | 187 | _main_process_write(out_queue, output, parallel_level, abort) 188 | 189 | except KeyboardInterrupt: 190 | abort.set() 191 | finally: 192 | for p in [p_load] + p_analyzes: 193 | try: 194 | p.join(timeout=1) 195 | except: 196 | if p.is_alive(): 197 | p.terminate() 198 | p.join() 199 | 200 | 201 | def _data_loader(files: List[str], batch_size: int) -> Generator[List[str], None, None]: 202 | mini_batch = [] 203 | for path in files: 204 | with open(path, "r", encoding="utf-8") as f: 205 | for line in f: 206 | mini_batch.append(line) 207 | if len(mini_batch) == batch_size: 208 | yield mini_batch 209 | mini_batch = [] 210 | if mini_batch: 211 | yield mini_batch 212 | 213 | 214 | def _multi_process_load(in_queue: Queue, files: List[str], batch_size: int, n_analyze_process: int, abort: Event): 215 | try: 216 | for i, mini_batch in enumerate(_data_loader(files, batch_size)): 217 | if abort.is_set(): 218 | break 219 | in_queue.put((i, mini_batch)) 220 | else: 221 | for _ in range(n_analyze_process): 222 | in_queue.put("terminate") 223 | except KeyboardInterrupt: 224 | pass 225 | except: 226 | traceback.print_exc() 227 | abort.set() 228 | 229 | 230 | def _multi_process_analyze(analyzer: Analyzer, in_queue: Queue, out_queue: Queue, abort: Event): 231 | i = None 232 | mini_batch = [] 233 | try: 234 | while True: 235 | if abort.is_set(): 236 | break 237 | try: 238 | msg = in_queue.get(timeout=0.1) 239 | except queue.Empty: 240 | continue 241 | if msg == "terminate": 242 | out_queue.put(("terminating", i, None)) 243 | break 244 | i, mini_batch = msg 245 | result = analyzer.analyze_batch(mini_batch) 246 | out_queue.put((None, i, result)) 247 | except KeyboardInterrupt: 248 | pass 249 | except Exception as err: 250 | out_queue.put(("Error: {}\n{}".format(err, "".join(mini_batch)), i, None)) 251 | traceback.print_exc() 252 | abort.set() 253 | 254 | 255 | def _main_process_write(out_queue: queue, output: _OutputWrapper, parallel_level: int, abort: Event): 256 | cur = 0 257 | results = dict() 258 | terminating = 0 259 | while True: 260 | if abort.is_set(): 261 | return 262 | try: 263 | msg, mini_batch_index, result = out_queue.get(timeout=0.1) 264 | except queue.Empty: 265 | continue 266 | 267 | if msg is not None: 268 | if msg == "terminating": 269 | terminating += 1 270 | if terminating == parallel_level: 271 | return 272 | continue 273 | else: 274 | print(f"Analysis failed in mini_batch #{mini_batch_index}. 
Stopping all the processes.", file=sys.stderr) 275 | print(msg, file=sys.stderr) 276 | return 277 | 278 | # output must be ordered same as input text 279 | results[mini_batch_index] = result 280 | while results: 281 | if cur not in results.keys(): 282 | break 283 | result = results[cur] 284 | del results[cur] 285 | cur += 1 286 | output.write(result) 287 | 288 | 289 | @plac.annotations( 290 | split_mode=("split mode", "option", "s", str, ["A", "B", "C"]), 291 | hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]), 292 | output_path=("output path", "option", "o", Path), 293 | parallel=("parallel level (default=-1, all_cpus=0)", "option", "p", int), 294 | files=("input files", "positional"), 295 | ) 296 | def run_ginzame( 297 | split_mode=None, 298 | hash_comment="print", 299 | output_path=None, 300 | parallel=-1, 301 | *files, 302 | ): 303 | run( 304 | model_path=None, 305 | ensure_model=None, 306 | split_mode=split_mode, 307 | hash_comment=hash_comment, 308 | output_path=output_path, 309 | output_format="mecab", 310 | require_gpu=-1, 311 | use_normalized_form=True, 312 | parallel_level=parallel, 313 | disable_sentencizer=False, 314 | files=files, 315 | ) 316 | 317 | 318 | def main_ginzame(): 319 | plac.call(run_ginzame) 320 | 321 | 322 | @plac.annotations( 323 | model_path=("model directory path", "option", "b", str), 324 | ensure_model=("select model package of ginza or spacy", "option", "m", str), 325 | split_mode=("split mode", "option", "s", str, ["A", "B", "C"]), 326 | hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]), 327 | output_path=("output path", "option", "o", Path), 328 | output_format=("output format", "option", "f", str, ["0", "conllu", "1", "cabocha", "2", "mecab", "3", "json"]), 329 | require_gpu=("enable require_gpu", "option", "g", int), 330 | use_normalized_form=("Use Token.norm_ instead of Token.lemma_", "flag", "n"), 331 | disable_sentencizer=("disable spaCy's sentence separator", "flag", "d"), 332 | parallel=("parallel level (default=1, all_cpus=0)", "option", "p", int), 333 | files=("input files", "positional"), 334 | ) 335 | def run_ginza( 336 | model_path=None, 337 | ensure_model=None, 338 | split_mode="C", 339 | hash_comment="print", 340 | output_path=None, 341 | output_format="conllu", 342 | require_gpu=-1, 343 | use_normalized_form=False, 344 | disable_sentencizer=False, 345 | parallel=1, 346 | *files, 347 | ): 348 | run( 349 | model_path=model_path, 350 | ensure_model=ensure_model, 351 | split_mode=split_mode, 352 | hash_comment=hash_comment, 353 | output_path=output_path, 354 | output_format=output_format, 355 | require_gpu=require_gpu, 356 | use_normalized_form=use_normalized_form, 357 | disable_sentencizer=disable_sentencizer, 358 | parallel_level=parallel, 359 | files=files, 360 | ) 361 | 362 | 363 | def main_ginza(): 364 | plac.call(run_ginza) 365 | 366 | 367 | if __name__ == "__main__": 368 | plac.call(run_ginza) 369 | -------------------------------------------------------------------------------- /ginza/compound_splitter.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | from collections import OrderedDict 3 | import re 4 | 5 | import srsly 6 | 7 | from spacy import util 8 | from spacy.language import Language 9 | from spacy.lang.ja import resolve_pos 10 | from spacy.tokens import Doc, MorphAnalysis 11 | 12 | __all__ = [ 13 | "CompoundSplitter", 14 | "tag_to_pos", 15 | ] 16 | 17 | 18 | TAG_DEP_MAP = { 19 | "ADJ": "amod", 20 | "ADP": 
"case", 21 | "NUM": "nummod", 22 | "PART": "mark", 23 | "PUNCT": "punct", 24 | } 25 | 26 | 27 | def tag_dep_map(tag): 28 | return TAG_DEP_MAP.get(tag, "compound") 29 | 30 | 31 | def tag_to_pos(sub_tokens, next_token_tag): 32 | pos_list = [] 33 | next_pos = None 34 | for t1, t2 in zip(sub_tokens[:-1], sub_tokens[1:]): 35 | if next_pos: 36 | pos = next_pos 37 | next_pos = None 38 | else: 39 | pos, next_pos = resolve_pos(t1.surface, t1.tag, t2.tag) 40 | pos_list.append(pos) 41 | if next_pos: 42 | pos = next_pos 43 | else: 44 | pos, next_pos = resolve_pos(sub_tokens[-1].surface, sub_tokens[-1].tag, next_token_tag) 45 | pos_list.append(pos) 46 | return pos_list 47 | 48 | 49 | def _replace_list_entries(lst, index, inserting_list): 50 | return lst[:index] + inserting_list + lst[index + 1:] 51 | 52 | 53 | class CompoundSplitter: 54 | def __init__(self, vocab, split_mode=None): 55 | self.vocab = vocab 56 | self.split_mode = split_mode 57 | 58 | def __call__(self, doc: Doc): 59 | if "sub_tokens" not in doc.user_data: 60 | return doc 61 | if self._split_mode is None: 62 | return doc 63 | elif self._split_mode == "C": 64 | del doc.user_data["sub_tokens"] 65 | return doc 66 | elif self._split_mode == "B": 67 | sub_tokens_index = 1 68 | elif self._split_mode == "A": 69 | sub_tokens_index = 0 70 | else: 71 | raise Exception("invalid split_mode: " + self._split_mode) 72 | 73 | sub_tokens_list = [ 74 | sub_tokens[sub_tokens_index] if sub_tokens else None for sub_tokens in doc.user_data["sub_tokens"] 75 | ] 76 | 77 | for token_i, sub_tokens in reversed(tuple(zip(range(len(doc)), sub_tokens_list))): 78 | token = doc[token_i] 79 | token_ent_type = token.ent_type 80 | 81 | # edit token.dep_ 82 | if token.head.i == token.i: 83 | dep = "ROOT" 84 | else: 85 | dep = token.dep_ 86 | 87 | compounds = dep in {"compound", "nummod", "punct"} 88 | 89 | # retokenize 90 | if sub_tokens_index is not None and sub_tokens: 91 | deps = [tag_dep_map(dtoken.tag) for dtoken in sub_tokens[:-1]] + [token.dep_] 92 | last = len(sub_tokens) - 1 93 | if token.head.i == token.i: 94 | heads = [(token, last) for _ in range(last + 1)] 95 | elif compounds: 96 | heads = [token.head for _ in range(len(sub_tokens))] 97 | else: 98 | heads = [(token, last) for _ in range(last)] + [token.head] 99 | surfaces = [dtoken.surface for dtoken in sub_tokens] 100 | def morph(dtoken): 101 | m = {} 102 | if dtoken.inf: 103 | m["Inflection"] = dtoken.inf 104 | if dtoken.reading: 105 | m["Reading"] = re.sub("[=|]", "_", dtoken.reading) 106 | return "|".join(f"{k}={v}" for k, v in m.items()) 107 | attrs = { 108 | "TAG": [dtoken.tag for dtoken in sub_tokens], 109 | "DEP": deps, 110 | "POS": tag_to_pos( 111 | sub_tokens, 112 | doc[token.i + 1].tag_ if token.i < len(doc) - 1 else None 113 | ), 114 | "LEMMA": [dtoken.lemma for dtoken in sub_tokens], 115 | "NORM": [dtoken.norm for dtoken in sub_tokens], 116 | "ENT_TYPE": [token_ent_type for dtoken in sub_tokens], 117 | "MORPH": [morph(dtoken) for dtoken in sub_tokens], 118 | } 119 | try: 120 | with doc.retokenize() as retokenizer: 121 | retokenizer.split(token, surfaces, heads=heads, attrs=attrs) 122 | except Exception as e: 123 | import sys 124 | print("Retokenization error:", file=sys.stderr) 125 | print(doc.text, file=sys.stderr) 126 | print([(t.i, t.orth_) for t in doc], file=sys.stderr) 127 | print(list(enumerate(doc.user_data["sub_tokens"])), file=sys.stderr) 128 | raise e 129 | 130 | # work-around: retokenize() does not consider the head of the splitted tokens 131 | if not compounds: 132 | for t in doc: 133 
| if t.i < token_i or token_i + len(sub_tokens) <= t.i: 134 | if t.head.i == token_i: 135 | t.head = doc[token_i + last] 136 | 137 | del doc.user_data["sub_tokens"] 138 | return doc 139 | 140 | @property 141 | def split_mode(self) -> str: 142 | return self._split_mode 143 | 144 | @split_mode.setter 145 | def split_mode(self, mode: str): 146 | assert mode in (None, "A", "B", "C"), 'split_mode should be "A", "B", "C", or None' 147 | self._split_mode = mode 148 | 149 | def _get_config(self): 150 | config = OrderedDict( 151 | ( 152 | ("split_mode", self._split_mode), 153 | ) 154 | ) 155 | return config 156 | 157 | def _set_config(self, config=None): 158 | self.split_mode = config.get("split_mode", None) if config else None 159 | 160 | def to_bytes(self, **_kwargs): 161 | serializers = OrderedDict( 162 | ( 163 | ("cfg", lambda: srsly.json_dumps(self._get_config())), 164 | ) 165 | ) 166 | return util.to_bytes(serializers, []) 167 | 168 | def from_bytes(self, data, **_kwargs): 169 | deserializers = OrderedDict( 170 | ( 171 | ("cfg", lambda b: self._set_config(srsly.json_loads(b))), 172 | ) 173 | ) 174 | util.from_bytes(data, deserializers, []) 175 | return self 176 | 177 | def to_disk(self, path, **_kwargs): 178 | path = util.ensure_path(path) 179 | serializers = OrderedDict( 180 | ( 181 | ("cfg", lambda p: srsly.write_json(p, self._get_config())), 182 | ) 183 | ) 184 | return util.to_disk(path, serializers, []) 185 | 186 | def from_disk(self, path, **_kwargs): 187 | path = util.ensure_path(path) 188 | serializers = OrderedDict( 189 | ( 190 | ("cfg", lambda p: self._set_config(srsly.read_json(p))), 191 | ) 192 | ) 193 | util.from_disk(path, serializers, []) 194 | -------------------------------------------------------------------------------- /ginza/disable_sentencizer.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | from collections import OrderedDict 3 | 4 | import srsly 5 | 6 | from spacy import util 7 | 8 | 9 | __all__ = [ 10 | "DisableSentencizer", 11 | ] 12 | 13 | 14 | 15 | class DisableSentencizer: 16 | def __init__(self, nlp): 17 | self.nlp = nlp 18 | 19 | def __call__(self, doc): 20 | for t in doc[1:]: 21 | t.is_sent_start = False 22 | return doc 23 | 24 | def _get_config(self): 25 | return {} 26 | 27 | def _set_config(self, config=None): 28 | pass 29 | 30 | def to_bytes(self, **_kwargs): 31 | serializers = OrderedDict( 32 | ( 33 | ("cfg", lambda: srsly.json_dumps(self._get_config())), 34 | ) 35 | ) 36 | return util.to_bytes(serializers, []) 37 | 38 | def from_bytes(self, data, **_kwargs): 39 | deserializers = OrderedDict( 40 | ( 41 | ("cfg", lambda b: self._set_config(srsly.json_loads(b))), 42 | ) 43 | ) 44 | util.from_bytes(data, deserializers, []) 45 | return self 46 | 47 | def to_disk(self, path, **_kwargs): 48 | path = util.ensure_path(path) 49 | serializers = OrderedDict( 50 | ( 51 | ("cfg", lambda p: srsly.write_json(p, self._get_config())), 52 | ) 53 | ) 54 | return util.to_disk(path, serializers, []) 55 | 56 | def from_disk(self, path, **_kwargs): 57 | path = util.ensure_path(path) 58 | serializers = OrderedDict( 59 | ( 60 | ("cfg", lambda p: self._set_config(srsly.read_json(p))), 61 | ) 62 | ) 63 | util.from_disk(path, serializers, []) 64 | -------------------------------------------------------------------------------- /ginza/ene_ontonotes_mapper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | 3 | __all__ = [ 4 | "ENE_ONTONOTES_MAPPING", 5 | ] 6 | 7 | 8 | 
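# ENE_ONTONOTES_MAPPING converts Extended Named Entity (ENE) label names into
# OntoNotes5-style entity types, and ENE8_LABELS further down lists the ENE v8
# hierarchy IDs for those names. A minimal lookup sketch (kept as comments here;
# the helper name and the fallback label "OTHERS" for names missing from the
# table are assumptions, not necessarily how GiNZA itself resolves them):
#
#     def ene_to_ontonotes(ene_label: str) -> str:
#         return ENE_ONTONOTES_MAPPING.get(ene_label, "OTHERS")
#
#     ene_to_ontonotes("Company")   # -> "ORG"
#     ene_to_ontonotes("Province")  # -> "GPE"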
ENE_ONTONOTES_MAPPING = { 9 | "Person": "PERSON", 10 | "God": "PERSON", 11 | 12 | "International_Organization": "NORP", 13 | "Ethnic_Group": "NORP", 14 | "Ethnic_Group_Other": "NORP", 15 | "Nationality": "NORP", 16 | "Political_Organization": "NORP", 17 | "Political_Organization_Other": "NORP", 18 | "Political_Party": "NORP", 19 | "Religion": "NORP", 20 | 21 | "Language": "LANGUAGE", 22 | "Language_Other": "LANGUAGE", 23 | "National_Language": "LANGUAGE", 24 | 25 | "Location_Other": "GPE", 26 | "GPE": "GPE", 27 | "GPE_Other": "GPE", 28 | "City": "GPE", 29 | "Province": "GPE", 30 | "Country": "GPE", 31 | "Spa": "GPE", 32 | "Address": "GPE", 33 | "Address_Other": "GPE", 34 | "Postal_Address": "GPE", 35 | "County": "GPE", 36 | 37 | "Region": "LOC", 38 | "Region_Other": "LOC", 39 | "Continental_Region": "LOC", 40 | "Domestic_Region": "LOC", 41 | "Geological_Region": "LOC", 42 | "Geological_Region_Other": "LOC", 43 | "Mountain": "LOC", 44 | "Island": "LOC", 45 | "River": "LOC", 46 | "Lake": "LOC", 47 | "Sea": "LOC", 48 | "Bay": "LOC", 49 | 50 | "Event_Other": "EVENT", 51 | "Occasion": "EVENT", 52 | "Occasion_Other": "EVENT", 53 | "Election": "EVENT", 54 | "Religious_Festival": "EVENT", 55 | "Competition": "EVENT", 56 | "Game": "EVENT", # used in v7 ("Competition" in v8) 57 | "Conference": "EVENT", 58 | "Incident": "EVENT", 59 | "Incident_Other": "EVENT", 60 | "War": "EVENT", 61 | "Natural_Phenomenon": "EVENT", 62 | "Natural_Phenomenon_Other": "EVENT", 63 | "Natural_Disaster": "EVENT", 64 | "Earthquake": "EVENT", 65 | 66 | "Facility": "FAC", 67 | "Facility_Other": "FAC", 68 | "Facility_Part": "FAC", 69 | "Dam": "FAC", 70 | "Archaeological_Place": "FAC", 71 | "Archaeological_Place_Other": "FAC", 72 | "Tomb": "FAC", 73 | "FOE": "FAC", 74 | "FOE_Other": "FAC", 75 | "GOE_Other": "FAC", # used in v7 ("FOE_Other" in v8) 76 | "Military_Base": "FAC", 77 | "Power_Plant": "FAC", 78 | "Park": "FAC", 79 | "Shopping_Complex": "FAC", 80 | "Sports_Facility": "FAC", 81 | "Museum": "FAC", 82 | "Zoo": "FAC", 83 | "Amusement_Park": "FAC", 84 | "Theater": "FAC", 85 | "Worship_Place": "FAC", 86 | "Castle": "FAC", 87 | "Palace": "FAC", 88 | "Public_Institution": "FAC", 89 | "Accommodation": "FAC", 90 | "Medical_Institution": "FAC", 91 | "School": "FAC", 92 | "Research_Institute": "FAC", 93 | "Market": "FAC", 94 | "Transport_Facility": "FAC", 95 | "Transport_Facility_Other": "FAC", 96 | "Car_Stop": "FAC", 97 | "Station": "FAC", 98 | "Airport": "FAC", 99 | "Port": "FAC", 100 | "Line": "FAC", 101 | "Line_Other": "FAC", 102 | "Railroad": "FAC", 103 | "Road": "FAC", 104 | "Canal": "FAC", 105 | "Water_Route": "FAC", 106 | "Tunnel": "FAC", 107 | "Bridge": "FAC", 108 | "Tumulus": "FAC", 109 | 110 | "Organization": "ORG", 111 | "Organization_Other": "ORG", 112 | "Show_Organization": "ORG", 113 | "Family": "ORG", 114 | "Sports_Organization": "ORG", 115 | "Sports_Organization_Other": "ORG", 116 | "Pro_Sports_Organization": "ORG", # used in v7 ("Sports_Organization" in v8) 117 | "Sports_Federation": "ORG", 118 | "Sports_League": "ORG", 119 | "Sports_Team": "ORG", 120 | "Juridical_Person": "ORG", 121 | "Juridical_Person_Other": "ORG", 122 | "Channel": "ORG", 123 | "Corporation_Other": "ORG", # used in v7 ("Juridical_Person_Other" in v8) 124 | "Nonprofit_Organization": "ORG", 125 | "Company": "ORG", 126 | "Company_Group": "ORG", 127 | "Government": "ORG", 128 | "Cabinet": "ORG", 129 | "Military": "ORG", 130 | 131 | "Product_Other": "PRODUCT", 132 | "Service": "PRODUCT", 133 | "Character": "PRODUCT", 134 | "ID_Number": "PRODUCT", 
135 | "Game_Other": "PRODUCT", 136 | "Digital_Game": "PRODUCT", 137 | "Software": "PRODUCT", 138 | "Vehicle": "PRODUCT", 139 | "Vehicle_Other": "PRODUCT", 140 | "Car": "PRODUCT", 141 | "Train": "PRODUCT", 142 | "Aircraft": "PRODUCT", 143 | "Spaceship": "PRODUCT", 144 | "Ship": "PRODUCT", 145 | "Food_Other": "PRODUCT", 146 | "Musical_Instrument": "PRODUCT", 147 | "Clothing": "PRODUCT", 148 | "Money_Form": "PRODUCT", 149 | "Drug": "PRODUCT", 150 | "Weapon": "PRODUCT", 151 | "Stock": "PRODUCT", 152 | "Award": "PRODUCT", 153 | "Decoration": "PRODUCT", 154 | 155 | "Video_Work": "WORK_OF_ART", 156 | "Art": "WORK_OF_ART", 157 | "Art_Other": "WORK_OF_ART", 158 | "Painting": "WORK_OF_ART", 159 | "Broadcast_Program": "WORK_OF_ART", 160 | "Movie": "WORK_OF_ART", 161 | "Show": "WORK_OF_ART", 162 | "Music": "WORK_OF_ART", 163 | "Book": "WORK_OF_ART", 164 | "Printing": "WORK_OF_ART", 165 | "Printing_Other": "WORK_OF_ART", 166 | "Newspaper": "WORK_OF_ART", 167 | "Magazine": "WORK_OF_ART", 168 | "Picture": "WORK_OF_ART", 169 | 170 | "Offense": "LAW", 171 | "Doctrine_Method_Other": "LAW", 172 | "Movement": "LAW", 173 | "Plan": "LAW", 174 | "Rule": "LAW", 175 | "Rule_Other": "LAW", 176 | "Treaty": "LAW", 177 | "Law": "LAW", 178 | 179 | "Timex": "DATE", 180 | "Timex_Other": "DATE", 181 | "Timeex": "DATE", 182 | "Timeex_Other": "DATE", 183 | "Date": "DATE", 184 | "Day_Of_Week": "DATE", 185 | "Era": "DATE", 186 | "Periodx": "DATE", 187 | "Periodx_Other": "DATE", 188 | "Period_Day": "DATE", 189 | "Period_Week": "DATE", 190 | "Period_Month": "DATE", 191 | "Period_Year": "DATE", 192 | "Time_Top_Other": "DATE", 193 | 194 | "Time": "TIME", 195 | "Period_Time": "TIME", 196 | 197 | "Percent": "PERCENT", 198 | 199 | "Currency": "MONEY", 200 | "Money": "MONEY", 201 | 202 | "Unit_Other": "QUANTITY", 203 | "Latitude_Longitude": "QUANTITY", 204 | "Latitude_Longtitude": "QUANTITY", # used in v7 ("Latitude_Longitude" in v8) 205 | "Measurement": "QUANTITY", 206 | "Measurement_Other": "QUANTITY", 207 | "Physical_Extent": "QUANTITY", 208 | "Seismic_Magnitude": "QUANTITY", 209 | "Space": "QUANTITY", 210 | "Volume": "QUANTITY", 211 | "Weight": "QUANTITY", 212 | "Speed": "QUANTITY", 213 | "Intensity": "QUANTITY", 214 | "Temperature": "QUANTITY", 215 | "Calorie": "QUANTITY", 216 | "Seismic_Intensity": "QUANTITY", 217 | "Countx": "QUANTITY", 218 | "Countx_Other": "QUANTITY", 219 | "N_Person": "QUANTITY", 220 | "N_Organization": "QUANTITY", 221 | "N_Location": "QUANTITY", 222 | "N_Location_Other": "QUANTITY", 223 | "N_Country": "QUANTITY", 224 | "N_Facility": "QUANTITY", 225 | "N_Product": "QUANTITY", 226 | "N_Event": "QUANTITY", 227 | "N_Natural_Object": "QUANTITY", 228 | "N_Natural_Object_Other": "QUANTITY", 229 | "N_Animal": "QUANTITY", 230 | "N_Flora": "QUANTITY", 231 | "Point": "QUANTITY", 232 | "Multiplication": "QUANTITY", 233 | "Frequency": "QUANTITY", 234 | "Age": "QUANTITY", 235 | 236 | "Rank": "ORDINAL", 237 | "School_Age": "ORDINAL", 238 | "Ordinal_Number": "ORDINAL", 239 | 240 | "Stock_Index": "CARDINAL", 241 | 242 | "Phone_Number": "PHONE", 243 | 244 | "Email": "EMAIL", 245 | 246 | "URL": "URL", 247 | 248 | "Individual_Animal": "ANIMAL", 249 | "Individual_Animal_Other": "ANIMAL", 250 | "Racehorse": "ANIMAL", 251 | 252 | "Name": "OTHERS", 253 | "Name_Other": "OTHERS", 254 | "Natural_Object": "OTHERS", 255 | "Natural_Object_Other": "OTHERS", 256 | "Element": "OTHERS", 257 | "Compound": "OTHERS", 258 | "Mineral": "OTHERS", 259 | "Living_Thing": "OTHERS", 260 | "Living_Thing_Other": "OTHERS", 261 | "Fungus": "OTHERS", 262 
| "Mollusk_Arthropod": "OTHERS", 263 | "Mollusc_Arthropod": "OTHERS", # used in v7 ("Mollusk_Arthropod" in v8) 264 | "Insect": "OTHERS", 265 | "Fish": "OTHERS", 266 | "Amphibia": "OTHERS", 267 | "Reptile": "OTHERS", 268 | "Bird": "OTHERS", 269 | "Mammal": "OTHERS", 270 | "Flora": "OTHERS", 271 | "Living_Thing_Part": "OTHERS", 272 | "Living_Thing_Part_Other": "OTHERS", 273 | "Animal_Part": "OTHERS", 274 | "Flora_Part": "OTHERS", 275 | "Disease": "OTHERS", 276 | "Disease_Other": "OTHERS", 277 | "Animal_Disease": "OTHERS", 278 | "Color": "OTHERS", 279 | "Color_Other": "OTHERS", 280 | "Nature_Color": "OTHERS", 281 | "Location": "OTHERS", 282 | "Astronomical_Object": "OTHERS", 283 | "Astronomical_Object_Other": "OTHERS", 284 | "Star": "OTHERS", 285 | "Planet": "OTHERS", 286 | "Constellation": "OTHERS", 287 | "Product": "OTHERS", 288 | "Class": "OTHERS", 289 | "Food": "OTHERS", 290 | "Dish": "OTHERS", 291 | "Doctrine_Method": "OTHERS", 292 | "Culture": "OTHERS", 293 | "Academic": "OTHERS", 294 | "Sport": "OTHERS", 295 | "Style": "OTHERS", 296 | "Theory": "OTHERS", 297 | "Title": "OTHERS", 298 | "Title_Other": "OTHERS", 299 | "Position_Vocation": "OTHERS", 300 | "Unit": "OTHERS", 301 | "Virtual_Address": "OTHERS", 302 | "Virtual_Address_Other": "OTHERS", 303 | "Event": "OTHERS", 304 | "Numex": "OTHERS", 305 | "Numex_Other": "OTHERS", 306 | "Astral_Body_Other": "OTHERS", 307 | "Material": "OTHERS", 308 | } 309 | 310 | ENE8_LABELS = { 311 | "Name": "1", 312 | "Name_Other": "1.0", 313 | "Person": "1.1", 314 | "God": "1.2", 315 | "Individual_Animal": "1.3", 316 | "Individual_Animal_Other": "1.3.0", 317 | "Racehorse": "1.3.1", 318 | "Organization": "1.4", 319 | "Organization_Other": "1.4.0", 320 | "International_Organization": "1.4.1", 321 | "Show_Organization": "1.4.2", 322 | "Family": "1.4.3", 323 | "Ethnic_Group": "1.4.4", 324 | "Ethnic_Group_Other": "1.4.4.0", 325 | "Nationality": "1.4.4.1", 326 | "Sports_Organization": "1.4.5", 327 | "Sports_Organization_Other": "1.4.5.0", 328 | "Sports_Federation": "1.4.5.1", 329 | "Sports_League": "1.4.5.2", 330 | "Sports_Team": "1.4.5.3", 331 | "Juridical_Person": "1.4.6", 332 | "Juridical_Person_Other": "1.4.6.0", 333 | "Nonprofit_Organization": "1.4.6.1", 334 | "Company": "1.4.6.2", 335 | "Company_Group": "1.4.6.3", 336 | "Political_Organization": "1.4.7", 337 | "Political_Organization_Other": "1.4.7.0", 338 | "Government": "1.4.7.1", 339 | "Political_Party": "1.4.7.2", 340 | "Cabinet": "1.4.7.3", 341 | "Military": "1.4.7.4", 342 | "Location": "1.5", 343 | "Location_Other": "1.5.0", 344 | "GPE": "1.5.1", 345 | "GPE_Other": "1.5.1.0", 346 | "City": "1.5.1.1", 347 | "Province": "1.5.1.2", 348 | "Country": "1.5.1.3", 349 | "Region": "1.5.2", 350 | "Region_Other": "1.5.2.0", 351 | "Continental_Region": "1.5.2.1", 352 | "Domestic_Region": "1.5.2.2", 353 | "Geological_Region": "1.5.3", 354 | "Geological_Region_Other": "1.5.3.0", 355 | "Spa": "1.5.3.1", 356 | "Mountain": "1.5.3.2", 357 | "Island": "1.5.3.3", 358 | "River": "1.5.3.4", 359 | "Lake": "1.5.3.5", 360 | "Sea": "1.5.3.6", 361 | "Bay": "1.5.3.7", 362 | "Astronomical_Object": "1.5.4", 363 | "Astronomical_Object_Other": "1.5.4.0", 364 | "Star": "1.5.4.1", 365 | "Planet": "1.5.4.2", 366 | "Constellation": "1.5.4.3", 367 | "Address": "1.5.5", 368 | "Address_Other": "1.5.5.0", 369 | "Postal_Address": "1.5.5.1", 370 | "Facility": "1.6", 371 | "Facility_Other": "1.6.0", 372 | "Facility_Part": "1.6.1", 373 | "Dam": "1.6.2", 374 | "Archaeological_Place": "1.6.3", 375 | "Archaeological_Place_Other": "1.6.3.0", 376 | 
"Tomb": "1.6.3.1", 377 | "FOE": "1.6.4", 378 | "FOE_Other": "1.6.4.0", 379 | "Military_Base": "1.6.4.1", 380 | "Castle": "1.6.4.2", 381 | "Palace": "1.6.4.3", 382 | "Public_Institution": "1.6.4.4", 383 | "Accommodation": "1.6.4.5", 384 | "Medical_Institution": "1.6.4.6", 385 | "School": "1.6.4.7", 386 | "Research_Institute": "1.6.4.8", 387 | "Market": "1.6.4.9", 388 | "Power_Plant": "1.6.4.10", 389 | "Park": "1.6.4.11", 390 | "Shopping_Complex": "1.6.4.12", 391 | "Sports_Facility": "1.6.4.13", 392 | "Museum": "1.6.4.14", 393 | "Zoo": "1.6.4.15", 394 | "Amusement_Park": "1.6.4.16", 395 | "Theater": "1.6.4.17", 396 | "Worship_Place": "1.6.4.18", 397 | "Transport_Facility": "1.6.5", 398 | "Transport_Facility_Other": "1.6.5.0", 399 | "Car_Stop": "1.6.5.1", 400 | "Station": "1.6.5.2", 401 | "Airport": "1.6.5.3", 402 | "Port": "1.6.5.4", 403 | "Line": "1.6.6", 404 | "Line_Other": "1.6.6.0", 405 | "Railroad": "1.6.6.1", 406 | "Road": "1.6.6.2", 407 | "Canal": "1.6.6.3", 408 | "Water_Route": "1.6.6.4", 409 | "Tunnel": "1.6.6.5", 410 | "Bridge": "1.6.6.6", 411 | "Product": "1.7", 412 | "Product_Other": "1.7.0", 413 | "Video_Work": "1.7.1", 414 | "Musical_Instrument": "1.7.2", 415 | "Clothing": "1.7.3", 416 | "Money_Form": "1.7.4", 417 | "Drug": "1.7.5", 418 | "Weapon": "1.7.6", 419 | "Stock": "1.7.7", 420 | "Award": "1.7.8", 421 | "Decoration": "1.7.9", 422 | "Offense": "1.7.10", 423 | "Service": "1.7.11", 424 | "Class": "1.7.12", 425 | "Character": "1.7.13", 426 | "ID_Number": "1.7.14", 427 | "Game": "1.7.15", 428 | "Game_Other": "1.7.15.0", 429 | "Digital_Game": "1.7.15.1", 430 | "Software": "1.7.16", 431 | "Vehicle": "1.7.17", 432 | "Vehicle_Other": "1.7.17.0", 433 | "Car": "1.7.17.1", 434 | "Train": "1.7.17.2", 435 | "Aircraft": "1.7.17.3", 436 | "Spaceship": "1.7.17.4", 437 | "Ship": "1.7.17.5", 438 | "Food": "1.7.18", 439 | "Food_Other": "1.7.18.0", 440 | "Dish": "1.7.18.1", 441 | "Art": "1.7.19", 442 | "Art_Other": "1.7.19.0", 443 | "Painting": "1.7.19.1", 444 | "Broadcast_Program": "1.7.19.2", 445 | "Movie": "1.7.19.3", 446 | "Show": "1.7.19.4", 447 | "Music": "1.7.19.5", 448 | "Book": "1.7.19.6", 449 | "Printing": "1.7.20", 450 | "Printing_Other": "1.7.20.0", 451 | "Newspaper": "1.7.20.1", 452 | "Magazine": "1.7.20.2", 453 | "Doctrine_Method": "1.7.21", 454 | "Doctrine_Method_Other": "1.7.21.0", 455 | "Culture": "1.7.21.1", 456 | "Religion": "1.7.21.2", 457 | "Academic": "1.7.21.3", 458 | "Sport": "1.7.21.4", 459 | "Style": "1.7.21.5", 460 | "Movement": "1.7.21.6", 461 | "Theory": "1.7.21.7", 462 | "Plan": "1.7.21.8", 463 | "Rule": "1.7.22", 464 | "Rule_Other": "1.7.22.0", 465 | "Treaty": "1.7.22.1", 466 | "Law": "1.7.22.2", 467 | "Title": "1.7.23", 468 | "Title_Other": "1.7.23.0", 469 | "Position_Vocation": "1.7.23.1", 470 | "Language": "1.7.24", 471 | "Language_Other": "1.7.24.0", 472 | "National_Language": "1.7.24.1", 473 | "Unit": "1.7.25", 474 | "Unit_Other": "1.7.25.0", 475 | "Currency": "1.7.25.1", 476 | "Virtual_Address": "1.8", 477 | "Virtual_Address_Other": "1.8.0", 478 | "Channel": "1.8.1", 479 | "Phone_Number": "1.8.2", 480 | "Email": "1.8.3", 481 | "URL": "1.8.4", 482 | "Event": "1.9", 483 | "Event_Other": "1.9.0", 484 | "Occasion": "1.9.1", 485 | "Occasion_Other": "1.9.1.0", 486 | "Election": "1.9.1.1", 487 | "Religious_Festival": "1.9.1.2", 488 | "Competition": "1.9.1.3", 489 | "Conference": "1.9.1.4", 490 | "Incident": "1.9.2", 491 | "Incident_Other": "1.9.2.0", 492 | "War": "1.9.2.1", 493 | "Natural_Phenomenon": "1.9.3", 494 | "Natural_Phenomenon_Other": "1.9.3.0", 495 | 
"Natural_Disaster": "1.9.3.1", 496 | "Earthquake": "1.9.3.2", 497 | "Natural_Object": "1.10", 498 | "Natural_Object_Other": "1.10.0", 499 | "Element": "1.10.1", 500 | "Compound": "1.10.2", 501 | "Mineral": "1.10.3", 502 | "Living_Thing": "1.10.4", 503 | "Living_Thing_Other": "1.10.4.0", 504 | "Fungus": "1.10.4.1", 505 | "Mollusk_Arthropod": "1.10.4.2", 506 | "Insect": "1.10.4.3", 507 | "Fish": "1.10.4.4", 508 | "Amphibia": "1.10.4.5", 509 | "Reptile": "1.10.4.6", 510 | "Bird": "1.10.4.7", 511 | "Mammal": "1.10.4.8", 512 | "Flora": "1.10.4.9", 513 | "Living_Thing_Part": "1.10.5", 514 | "Living_Thing_Part_Other": "1.10.5.0", 515 | "Animal_Part": "1.10.5.1", 516 | "Flora_Part": "1.10.5.2", 517 | "Disease": "1.11", 518 | "Disease_Other": "1.11.0", 519 | "Animal_Disease": "1.11.1", 520 | "Color": "1.12", 521 | "Color_Other": "1.12.0", 522 | "Nature_Color": "1.12.1", 523 | "Timex": "2", 524 | "Timex_Other": "2.0", 525 | "Timeex": "2.1", 526 | "Timeex_Other": "2.1.0", 527 | "Time": "2.1.1", 528 | "Date": "2.1.2", 529 | "Day_Of_Week": "2.1.3", 530 | "Era": "2.1.4", 531 | "Periodx": "2.2", 532 | "Periodx_Other": "2.2.0", 533 | "Period_Time": "2.2.1", 534 | "Period_Day": "2.2.2", 535 | "Period_Week": "2.2.3", 536 | "Period_Month": "2.2.4", 537 | "Period_Year": "2.2.5", 538 | "Numex": "3", 539 | "Numex_Other": "3.0", 540 | "Money": "3.1", 541 | "Stock_Index": "3.2", 542 | "Point": "3.3", 543 | "Percent": "3.4", 544 | "Multiplication": "3.5", 545 | "Frequency": "3.6", 546 | "Age": "3.7", 547 | "School_Age": "3.8", 548 | "Ordinal_Number": "3.9", 549 | "Rank": "3.10", 550 | "Latitude_Longitude": "3.11", 551 | "Measurement": "3.12", 552 | "Measurement_Other": "3.12.0", 553 | "Physical_Extent": "3.12.1", 554 | "Space": "3.12.2", 555 | "Volume": "3.12.3", 556 | "Weight": "3.12.4", 557 | "Speed": "3.12.5", 558 | "Intensity": "3.12.6", 559 | "Temperature": "3.12.7", 560 | "Calorie": "3.12.8", 561 | "Seismic_Intensity": "3.12.9", 562 | "Seismic_Magnitude": "3.12.10", 563 | "Countx": "3.13", 564 | "Countx_Other": "3.13.0", 565 | "N_Person": "3.13.1", 566 | "N_Organization": "3.13.2", 567 | "N_Location": "3.13.3", 568 | "N_Location_Other": "3.13.3.0", 569 | "N_Country": "3.13.3.1", 570 | "N_Facility": "3.13.4", 571 | "N_Product": "3.13.5", 572 | "N_Event": "3.13.6", 573 | "N_Natural_Object": "3.13.7", 574 | "N_Natural_Object_Other": "3.13.7.0", 575 | "N_Animal": "3.13.7.1", 576 | "N_Flora": "3.13.7.2", 577 | } 578 | 579 | """ 580 | import json 581 | import sys 582 | 583 | if __name__ == "__main__": 584 | for ne, idx in ENE8_LABELS.items(): 585 | if ne not in ENE_NE_MAPPING: 586 | print(idx, ne, "not in mapping") 587 | with open(sys.argv[1], "r") as f: 588 | meta_json = json.load(f) 589 | for ne in meta_json["labels"]["ner"]: 590 | if ne not in ENE_NE_MAPPING: 591 | print(ne, "not in mapping") 592 | for ent, idx in ENE_NE_MAPPING.items(): 593 | if ent not in ENE8_LABELS and ent not in meta_json["labels"]["ner"]: 594 | print(idx, ent, "not used") 595 | 596 | for ne, idx, ent in sorted([ 597 | (ne, ENE8_LABELS[ent] if ent in ENE8_LABELS else "_", ent) for ent, ne in ENT_NE_MAPPING.items() 598 | ]): 599 | print("\t"{}": "{}",".format(ent, ne)) 600 | """ 601 | -------------------------------------------------------------------------------- /ginza/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import subprocess as sp 3 | import sys 4 | from pathlib import Path 5 | from functools import partial 6 | 7 | import pytest 8 | 9 | run_cmd = 
partial(sp.run, encoding="utf-8", stdout=sp.PIPE) 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def tmpdir() -> Path: 14 | with tempfile.TemporaryDirectory() as dir_name: 15 | yield Path(dir_name) 16 | -------------------------------------------------------------------------------- /ginza/tests/test_analyzer.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import pytest 4 | 5 | from ginza.analyzer import Analyzer 6 | 7 | 8 | TOKEN_TESTS = [ 9 | ["今日はかつ丼を食べた。明日は蕎麦を食べたい。", ["今日","は","かつ丼","を","食べ","た","。","明日","は","蕎麦","を","食べ","たい","。"]] 10 | ] 11 | 12 | MECAB_TESTS = [ 13 | ["今日はかつ丼を食べた。明日は蕎麦を食べたい。", ["今日","は","かつ","丼","を","食べ","た","。","明日","は","蕎麦","を","食べ","たい","。"]] 14 | ] 15 | 16 | BATCH_TESTS = [ 17 | [ 18 | [ 19 | "銀座でランチをご一緒しましょう。", 20 | "東京タワーの近くに住んでいます。", 21 | "東京都選挙管理委員会の担当者は、次のように説明した。", 22 | ], [ 23 | ["銀座","で","ランチ","を","ご","一緒","し","ましょう","。"], 24 | ["東京","タワー","の","近く","に","住ん","で","い","ます","。"], 25 | ["東京都","選挙管理委員会","の","担当者","は","、","次","の","よう","に","説明","し","た","。"], 26 | ] 27 | ] 28 | ] 29 | 30 | @pytest.fixture 31 | def analyzer() -> Analyzer: 32 | default_params = dict( 33 | model_name_or_path=None, 34 | split_mode=None, 35 | hash_comment="print", 36 | output_format="conllu", 37 | require_gpu=-1, 38 | disable_sentencizer=False, 39 | use_normalized_form=False, 40 | ) 41 | yield Analyzer(**default_params) 42 | 43 | 44 | def _tokens_conllu(result: str): 45 | ret = [] 46 | for line in result.split("\n"): 47 | if line.startswith("#") or line.strip() == "": 48 | continue 49 | ret.append(line.split("\t")[1]) 50 | return ret 51 | 52 | 53 | def _tokens_cabocha(result: str): 54 | ret = [] 55 | for line in result.split("\n"): 56 | if line.startswith("*") or line.strip() in ("","EOS"): 57 | continue 58 | ret.append(line.split("\t")[0]) 59 | return ret 60 | 61 | 62 | def _tokens_mecab(result: str): 63 | ret = [] 64 | for line in result.split("\n"): 65 | if line.startswith("#") or line.strip() in ("","EOS"): 66 | continue 67 | ret.append(line.split("\t")[0]) 68 | return ret 69 | 70 | 71 | def _tokens_json(result: str): 72 | data = json.loads(f"[{result}]") 73 | ret = [] 74 | for d in data: 75 | for p in d["paragraphs"]: 76 | for s in p["sentences"]: 77 | for t in s["tokens"]: 78 | ret.append(t["orth"]) 79 | return ret 80 | 81 | class TestAnalyzer: 82 | def test_model_name_or_path_ja_ginza(self, mocker, analyzer): 83 | spacy_load_mock = mocker.patch("spacy.load") 84 | analyzer.model_name_or_path = "ja_ginza" 85 | analyzer.set_nlp() 86 | spacy_load_mock.assert_called_once_with("ja_ginza") 87 | 88 | def test_model_name_or_path_ja_ginza_electra(self, mocker, analyzer): 89 | spacy_load_mock = mocker.patch("spacy.load") 90 | analyzer.model_name_or_path = "ja_ginza_electra" 91 | analyzer.set_nlp() 92 | spacy_load_mock.assert_called_once_with("ja_ginza_electra") 93 | 94 | def test_require_gpu(self, mocker, analyzer): 95 | require_gpu_mock = mocker.patch("thinc.api.require_gpu") 96 | analyzer.require_gpu = 0 97 | analyzer.set_nlp() 98 | require_gpu_mock.assert_called_once() 99 | 100 | @pytest.mark.parametrize("input_text, tokens", TOKEN_TESTS) 101 | @pytest.mark.parametrize( 102 | "output_format, raises_analysis_before_set, tokens_func", 103 | [ 104 | ("conllu", TypeError, _tokens_conllu), 105 | ("cabocha", TypeError, _tokens_cabocha), 106 | ("json", TypeError, _tokens_json), 107 | ], 108 | ) 109 | def test_analyze_line(self, output_format, raises_analysis_before_set, input_text, tokens, tokens_func, analyzer): 110 | 
analyzer.output_format = output_format 111 | with pytest.raises(raises_analysis_before_set): 112 | analyzer.analyze_line(input_text) 113 | 114 | analyzer.set_nlp() 115 | ret = analyzer.analyze_line(input_text) 116 | assert tokens_func(ret) == tokens 117 | 118 | @pytest.mark.parametrize("input_text, tokens", MECAB_TESTS) 119 | @pytest.mark.parametrize( 120 | "output_format, raises_analysis_before_set, tokens_func", 121 | [ 122 | ("mecab", AttributeError, _tokens_mecab), 123 | ], 124 | ) 125 | def test_analyze_line_mecab(self, output_format, raises_analysis_before_set, input_text, tokens, tokens_func, analyzer): 126 | analyzer.output_format = output_format 127 | with pytest.raises(raises_analysis_before_set): 128 | analyzer.analyze_line(input_text) 129 | 130 | analyzer.set_nlp() 131 | ret = analyzer.analyze_line(input_text) 132 | assert tokens_func(ret) == tokens 133 | 134 | @pytest.mark.parametrize("input_batch, tokens_batch", BATCH_TESTS) 135 | @pytest.mark.parametrize( 136 | "output_format, tokens_func", 137 | [ 138 | ("conllu", _tokens_conllu), 139 | ("cabocha", _tokens_cabocha), 140 | ("json", _tokens_json), 141 | ], 142 | ) 143 | def test_analyze_batch(self, output_format, input_batch, tokens_batch, tokens_func, analyzer): 144 | analyzer.output_format = output_format 145 | ret = analyzer.analyze_batch(input_batch) 146 | assert tokens_func(ret) == sum(tokens_batch, []) 147 | 148 | @pytest.mark.parametrize( 149 | "raises_analysis_before_set, tokens_func", 150 | [ 151 | (TypeError, _tokens_conllu) 152 | ], 153 | ) 154 | @pytest.mark.parametrize( 155 | "split_mode, input_text, tokens", 156 | [ 157 | ("A", "機能性食品", ["機能", "性", "食品"]), 158 | ("B", "機能性食品", ["機能性", "食品"]), 159 | ("C", "機能性食品", ["機能性食品"]), 160 | ], 161 | ) 162 | def test_analyze_split(self, split_mode, input_text, tokens, raises_analysis_before_set, tokens_func, analyzer): 163 | analyzer.split_mode = split_mode 164 | with pytest.raises(raises_analysis_before_set): 165 | analyzer.analyze_line(input_text) 166 | 167 | analyzer.set_nlp() 168 | ret = analyzer.analyze_line(input_text) 169 | assert tokens_func(ret) == tokens 170 | -------------------------------------------------------------------------------- /ginza/tests/test_command_line.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess as sp 4 | from functools import partial 5 | from pathlib import Path 6 | from typing import Iterable, List 7 | 8 | import pytest 9 | 10 | import torch 11 | import ginza.command_line as cli 12 | 13 | TEST_TEXT = "#コメント\n今日はかつ丼を食べた。\n明日は東京で蕎麦を食べる。明後日は酒が飲みたい。" 14 | 15 | run_cmd = partial(sp.run, encoding="utf-8", stdout=sp.PIPE) 16 | 17 | 18 | @pytest.fixture(scope="module") 19 | def input_file(tmpdir: Path) -> Path: 20 | file_path = (tmpdir / "test_input.txt").resolve() 21 | with open(file_path, "w") as fp: 22 | print(TEST_TEXT, file=fp) 23 | yield file_path 24 | file_path.unlink() 25 | 26 | 27 | @pytest.fixture(scope="module") 28 | def input_files(tmpdir: Path) -> Iterable[Path]: 29 | paths = [] 30 | for i, text in enumerate(TEST_TEXT.split("\n")): 31 | file_path = (tmpdir / f"test_input_{i}.txt").resolve() 32 | with open(file_path, "w") as fp: 33 | print(text, file=fp) 34 | paths.append(file_path) 35 | yield paths 36 | for file_path in paths: 37 | file_path.unlink() 38 | 39 | 40 | @pytest.fixture(scope="module") 41 | def long_input_file(tmpdir: Path) -> Iterable[Path]: 42 | file_path = (tmpdir / "test_long_input.txt").resolve() 43 | with open(file_path, "w") as 
fp: 44 | for _ in range(10): 45 | print(TEST_TEXT, file=fp) 46 | yield file_path 47 | file_path.unlink() 48 | 49 | 50 | @pytest.fixture 51 | def output_file(tmpdir: Path) -> Path: 52 | file_path = (tmpdir / "test_output.txt").resolve() 53 | file_path.touch() 54 | yield file_path 55 | file_path.unlink() 56 | 57 | 58 | def _conllu_parsable(result: str): 59 | for line in result.split("\n"): 60 | if line.startswith("#") or line.strip() == "": 61 | continue 62 | if not len(line.strip().split("\t")) == 10: 63 | raise Exception 64 | 65 | 66 | def _cabocha_parsable(result: str): 67 | for line in result.split("\n"): 68 | if line.strip() in ("", "EOS") or line.startswith("*") or line.startswith("#"): 69 | continue 70 | if not len(line.split("\t")) == 3: 71 | raise Exception 72 | if not len(line.split("\t")[1].split(",")) in [8, 9]: 73 | raise Exception 74 | 75 | 76 | def _mecab_parsable(result: str): 77 | for line in result.split("\n"): 78 | if line.strip() in ("", "EOS") or line.startswith("#"): 79 | continue 80 | if not len(line.split("\t")) == 2: 81 | raise Exception 82 | if not len(line.split("\t")[1].split(",")) == 9: 83 | raise Exception 84 | 85 | 86 | def _json_parsable(result: str): 87 | data = json.loads(result) 88 | for d in data: 89 | if not type(d) == dict: 90 | raise Exception 91 | if not "paragraphs" in d.keys(): 92 | raise Exception 93 | 94 | 95 | class TestCLIGinza: 96 | def test_help(self): 97 | for opt in ["-h", "--help"]: 98 | p = run_cmd(["ginza", opt]) 99 | assert p.returncode == 0 100 | 101 | def test_input(self, input_file): 102 | # input file 103 | p = run_cmd(["ginza", input_file]) 104 | 105 | # input from stdin 106 | p_stdin = sp.Popen(["ginza"], stdin=sp.PIPE, stdout=sp.PIPE) 107 | o, e = p_stdin.communicate(input=TEST_TEXT.encode()) 108 | assert e is None 109 | assert o.decode("utf-8") == p.stdout 110 | 111 | def test_multiple_input(self, input_files, input_file): 112 | p_multi = run_cmd(["ginza", *input_files]) 113 | assert p_multi.returncode == 0 114 | 115 | p_single = run_cmd(["ginza", input_file]) 116 | assert p_multi.stdout == p_single.stdout 117 | 118 | # TODO: add user defined model to fixture and test it here 119 | @pytest.mark.parametrize( 120 | "model_path, exit_ok", 121 | [ 122 | ("ja_ginza", True), 123 | ("not-exist-model", False), 124 | ], 125 | ) 126 | def test_model_path(self, model_path, exit_ok, input_file): 127 | p = run_cmd(["ginza", "-b", model_path, input_file]) 128 | assert (p.returncode == 0) is exit_ok 129 | 130 | @pytest.mark.parametrize( 131 | "ensure_model, exit_ok", 132 | [ 133 | ("ja_ginza", True), 134 | ("ja-ginza", True), 135 | ("ja-ginza-electra", True), 136 | ("ja_ginza_electra", True), 137 | ("ja-ginza_electra", True), 138 | ("not-exist-model", False), 139 | ], 140 | ) 141 | def test_ensure_model(self, ensure_model, exit_ok, input_file): 142 | p = run_cmd(["ginza", "-m", ensure_model, input_file]) 143 | assert (p.returncode == 0) is exit_ok 144 | 145 | def test_double_model_spcification(self, input_file): 146 | p = run_cmd(["ginza", "-b", "ja_ginza", "-m", "ja_ginza", input_file]) 147 | assert p.returncode != 0 148 | 149 | @pytest.mark.parametrize( 150 | "split_mode, input_text, expected", 151 | [ 152 | ("A", "機能性食品", ["機能", "性", "食品"]), 153 | ("B", "機能性食品", ["機能性", "食品"]), 154 | ("C", "機能性食品", ["機能性食品"]), 155 | ], 156 | ) 157 | def test_split_mode(self, split_mode, input_text, expected): 158 | p = run_cmd(["ginza", "-s", split_mode], input=input_text) 159 | assert p.returncode == 0 160 | 161 | def _sub_words(lines: Iterable) -> 
List[str]: 162 | return [l.split("\t")[1] for l in lines if len(l.split("\t")) > 1] 163 | 164 | assert _sub_words(p.stdout.split("\n")) == expected 165 | 166 | @pytest.mark.parametrize( 167 | "hash_comment, n_sentence, n_analyzed_sentence, exit_ok", 168 | [ 169 | ("print", 4, 3, True), 170 | ("skip", 3, 3, True), 171 | ("analyze", 4, 4, True), 172 | ], 173 | ) 174 | def test_hash_comment(self, hash_comment, n_sentence, n_analyzed_sentence, exit_ok, input_file): 175 | def _n_sentence(lines: Iterable) -> int: 176 | return len(list(filter(lambda x: x.startswith("#"), lines))) 177 | 178 | def _n_analyzed_sentence(lines: Iterable) -> int: 179 | return len(list(filter(lambda x: x.startswith("# text = "), lines))) 180 | 181 | p = run_cmd(["ginza", "-c", hash_comment, input_file]) 182 | assert (p.returncode == 0) is exit_ok 183 | assert _n_sentence(p.stdout.split("\n")) == n_sentence 184 | assert _n_analyzed_sentence(p.stdout.split("\n")) == n_analyzed_sentence 185 | 186 | def test_output_path(self, input_file, output_file): 187 | p_s = run_cmd(["ginza", input_file]) 188 | p_o = run_cmd(["ginza", "-o", output_file, input_file]) 189 | assert p_o.returncode == 0 190 | 191 | def _file_output(): 192 | with open(output_file, "r") as fp: 193 | return [l.strip() for l in fp if l.strip()] 194 | 195 | def _pipe_output(): 196 | return [l.strip() for l in p_s.stdout.split("\n") if l.strip()] 197 | 198 | assert _file_output() == _pipe_output() 199 | 200 | @pytest.mark.parametrize( 201 | "output_format, result_parsable", 202 | [ 203 | ("conllu", _conllu_parsable), 204 | ("cabocha", _cabocha_parsable), 205 | ("mecab", _mecab_parsable), 206 | ("json", _json_parsable), 207 | ], 208 | ) 209 | def test_output_format(self, output_format, result_parsable, input_file): 210 | p = run_cmd(["ginza", "-c", "analyze", "-f", output_format, input_file]) 211 | assert p.returncode == 0 212 | result_parsable(p.stdout.strip()) 213 | 214 | @pytest.mark.parametrize( 215 | "hash_comment", ["print", "skip"] 216 | ) 217 | def test_warn_if_json_hash_comment_not_analyze(self, hash_comment, input_file): 218 | p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file], stderr=sp.PIPE) 219 | assert p.returncode == 0 220 | msg = ( 221 | f'hash_comment="{hash_comment}" not permitted for JSON output. 
Forced to use hash_comment="analyze"' 222 | ) 223 | assert msg in p.stderr 224 | 225 | def test_require_gpu(self, input_file): 226 | p = run_cmd(["ginza", "-g", "0", input_file]) 227 | gpu_available = torch.cuda.is_available() 228 | assert (p.returncode == 0) is gpu_available 229 | 230 | def test_do_not_use_normalized_form(self, input_file): 231 | p = run_cmd(["ginza", input_file]) 232 | lemmas = [l.split("\t")[2] for l in p.stdout.split("\n") if len(l.split("\t")) > 1] 233 | # 'かつ丼' is dictionary_form of 'かつ丼' 234 | assert p.returncode == 0 235 | assert "かつ丼" in lemmas 236 | 237 | def test_use_normalized_form(self, input_file): 238 | p = run_cmd(["ginza", "-n", input_file]) 239 | lemmas = [l.split("\t")[2] for l in p.stdout.split("\n") if len(l.split("\t")) > 1] 240 | # 'カツ丼' is normlized_form of 'かつ丼' 241 | assert p.returncode == 0 242 | assert "カツ丼" in lemmas 243 | 244 | def test_disable_sentencizer(self, input_file): 245 | p = run_cmd(["ginza", "-d", input_file]) 246 | 247 | def _n_analyzed_sentence(lines: Iterable) -> int: 248 | return len(list(filter(lambda x: x.startswith("# text = "), lines))) 249 | 250 | assert p.returncode == 0 251 | assert _n_analyzed_sentence(p.stdout.split("\n")) == 2 252 | 253 | def test_parallel(self, input_file): 254 | p = run_cmd(["ginza", "-p", "2", input_file]) 255 | assert p.returncode == 0 256 | 257 | 258 | class TestCLIGinzame: 259 | def test_ginzame(self, input_file): 260 | p_ginzame = run_cmd(["ginzame", input_file]) 261 | p_ginza = run_cmd(["ginza", "-n", "-m", "ja_ginza", "-f", "2", "-s", "A", input_file]) 262 | 263 | assert p_ginzame.returncode == 0 264 | assert p_ginzame.stdout == p_ginza.stdout 265 | 266 | 267 | class TestRun: 268 | def test_run_as_single_when_input_is_a_tty(self, mocker, output_file, long_input_file): 269 | i = 0 270 | 271 | def f_mock_input(): 272 | nonlocal i 273 | if i >= 1: 274 | raise KeyboardInterrupt 275 | else: 276 | i += 1 277 | return "今日はいい天気だ" 278 | 279 | mocker.patch.object(cli, "MINI_BATCH_SIZE", 5) 280 | mocker.patch("ginza.command_line.sys.stdin.isatty", return_value=True) 281 | input_mock = mocker.patch.object(cli, "input", side_effect=f_mock_input) 282 | analyze_parallel_mock = mocker.patch.object(cli, "_analyze_parallel") 283 | cli.run(parallel_level=2, output_path=output_file, files=None) 284 | assert input_mock.call_count == 2 285 | analyze_parallel_mock.assert_not_called() 286 | 287 | @pytest.mark.parametrize( 288 | "output_format", 289 | ["conllu", "cabocha", "mecab", "json"], 290 | ) 291 | def test_parallel_output_same_as_single(self, output_format, mocker, tmpdir, long_input_file): 292 | mocker.patch.object(cli, "MINI_BATCH_SIZE", 5) 293 | 294 | out_single = tmpdir / "single_output.txt" 295 | if out_single.exists(): 296 | out_single.unlink() 297 | cli.run( 298 | parallel_level=1, 299 | output_path=out_single, 300 | output_format=output_format, 301 | files=[long_input_file], 302 | ensure_model="ja_ginza", 303 | ) 304 | 305 | out_parallel = tmpdir / "parallel_output.txt" 306 | if out_parallel.exists(): 307 | out_parallel.unlink() 308 | try: 309 | cli.run( 310 | parallel_level=2, 311 | output_path=out_parallel, 312 | output_format=output_format, 313 | files=[long_input_file], 314 | ensure_model="ja_ginza", 315 | ) 316 | except: 317 | pytest.fail("parallel run failed") 318 | 319 | def f_len(path): 320 | with open(path, "r") as f: 321 | return sum([1 for _ in f]) 322 | 323 | assert f_len(out_single) == f_len(out_parallel) 324 | with open(out_single, "r") as f_s: 325 | with open(out_parallel, "r") as f_p: 
326 | for s, p in zip(f_s, f_p): 327 | assert s == p 328 | -------------------------------------------------------------------------------- /ginza/tests/test_models.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import pytest 3 | from copy import deepcopy 4 | 5 | from ginza import set_split_mode 6 | 7 | 8 | MODELS = ["ja_ginza", "ja_ginza_electra"] 9 | 10 | TOKENIZER_TESTS = [ 11 | ("銀座でランチをご一緒しましょう。", ["銀座", "で", "ランチ", "を", "ご", "一緒", "し", "ましょう", "。"]), 12 | ("すもももももももものうち", ["すもも", "も", "もも", "も", "もも", "の", "うち"]), 13 | ] 14 | 15 | COMPOUND_SPLITER_TESTS = [ 16 | ("選挙管理委員会", 4, 3, 1), 17 | ("客室乗務員", 3, 2, 1), 18 | ("労働者協同組合", 4, 3, 1), 19 | ("機能性食品", 3, 2, 1), 20 | ] 21 | 22 | TAG_TESTS = [ 23 | ("銀座でランチをご一緒しましょう。", ["名詞-固有名詞-地名-一般", "助詞-格助詞", "名詞-普通名詞-一般", "助詞-格助詞", "接頭辞", "名詞-普通名詞-サ変可能", "動詞-非自立可能", "助動詞", "補助記号-句点"]), 24 | ("すもももももももものうち", ["名詞-普通名詞-一般", "助詞-係助詞", "名詞-普通名詞-一般", "助詞-係助詞", "名詞-普通名詞-一般", "助詞-格助詞", "名詞-普通名詞-副詞可能"]), 25 | ] 26 | 27 | POS_TESTS_JA_GINZA = [ 28 | ("銀座でランチをご一緒しましょう。", ["PROPN", "ADP", "NOUN", "ADP", "NOUN", "NOUN", "AUX", "AUX", "PUNCT"]), 29 | ("すもももももももものうち", ["NOUN", "ADP", "NOUN", "ADP", "NOUN", "ADP", "NOUN"]), 30 | ] 31 | 32 | POS_TESTS_JA_GINZA_ELECTRA = [ 33 | ("銀座でランチをご一緒しましょう。", ["PROPN", "ADP", "NOUN", "ADP", "NOUN", "VERB", "AUX", "AUX", "PUNCT"]), 34 | ("すもももももももものうち", ["NOUN", "ADP", "NOUN", "ADP", "NOUN", "ADP", "NOUN"]), 35 | ] 36 | 37 | LEMMATIZE_TESTS = [ 38 | ("新しく", "新しい"), 39 | ("いただきました", "いただく"), 40 | ("なった", "なる"), 41 | ] 42 | 43 | NORMALIZE_TESTS = [ 44 | ("かつ丼", "カツ丼"), 45 | ("附属", "付属"), 46 | ("SUMMER", "サマー"), 47 | ("シュミレーション", "シミュレーション"), 48 | ] 49 | 50 | EMPTYISH_TESTS = [ 51 | ("", 0), 52 | (" ", 1), 53 | ("\n\n\t\t\n\n", 1), 54 | ("\r\n\r\n", 1), 55 | ("\n \n\n", 5), 56 | ] 57 | 58 | NAUGHTY_STRINGS = [ 59 | # ASCII punctuation 60 | r",./;'[]\-=", 61 | r'<>?:"{}|_+', 62 | r'!@#$%^&*()`~"', 63 | # Unicode additional control characters, byte order marks 64 | r"­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪", 65 | r"￾", 66 | # Unicode Symbols 67 | r"Ω≈ç√∫˜µ≤≥÷", 68 | r"åß∂ƒ©˙∆˚¬…æ", 69 | "œ∑´®†¥¨ˆøπ“‘", 70 | r"¡™£¢∞§¶•ªº–≠", 71 | r"¸˛Ç◊ı˜Â¯˘¿", 72 | r"ÅÍÎÏ˝ÓÔÒÚÆ☃", 73 | r"Œ„´‰ˇÁ¨ˆØ∏”’", 74 | r"`⁄€‹›fifl‡°·‚—±", 75 | r"⅛⅜⅝⅞", 76 | r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", 77 | r"٠١٢٣٤٥٦٧٨٩", 78 | # Unicode Subscript/Superscript/Accents 79 | r"⁰⁴⁵", 80 | r"₀₁₂", 81 | r"⁰⁴⁵₀₁₂", 82 | r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็", 83 | r" ̄ ̄", 84 | # Two-Byte Characters 85 | r"田中さんにあげて下さい", 86 | r"パーティーへ行かないか", 87 | r"和製漢語", 88 | r"部落格", 89 | r"사회과학원 어학연구소", 90 | r"찦차를 타고 온 펲시맨과 쑛다리 똠방각하", 91 | r"社會科學院語學研究所", 92 | r"울란바토르", 93 | r"𠜎𠜱𠝹𠱓𠱸𠲖𠳏", 94 | # Japanese Emoticons 95 | r"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ", 96 | r"(。◕ ∀ ◕。)", 97 | r"`ィ(´∀`∩", 98 | r"__ロ(,_,*)", 99 | r"・( ̄∀ ̄)・:*:", 100 | r"゚・✿ヾ╲(。◕‿◕。)╱✿・゚", 101 | r",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’", 102 | r"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻", 103 | r"┬─┬ノ( º _ ºノ)", 104 | r"( ͡° ͜ʖ ͡°)", 105 | # Emoji 106 | r"😍", 107 | r"👩🏽", 108 | r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", 109 | r"🐵 🙈 🙉 🙊", 110 | r"❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙", 111 | r"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿", 112 | r"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧", 113 | r"0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟", 114 | # Regional Indicator Symbols 115 | r"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸", 116 | r"🇺🇸🇷🇺🇸🇦🇫🇦🇲", 117 | r"🇺🇸🇷🇺🇸🇦", 118 
| # Unicode Numbers 119 | r"123", 120 | r"١٢٣", 121 | # Right-To-Left Strings 122 | r"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.", 123 | r"إيو.", 124 | r"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ", 125 | r"הָיְתָהtestالصفحات التّحول", 126 | r"﷽", 127 | r"ﷺ", 128 | r"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،", 129 | # Trick Unicode 130 | r"‪‪test‪", 131 | r"‫test", 132 | r"
test
", 133 | r"test⁠test", 134 | r"⁦test⁧", 135 | # Zalgo Text 136 | r"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣", 137 | r"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰", 138 | r"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", 139 | r"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕", 140 | r"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮", 141 | # Unicode Upsidedown 142 | r"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥", 143 | r"00˙Ɩ$-", 144 | # Unicode font 145 | r"The quick brown fox jumps over the lazy dog", 146 | r"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠", 147 | r"𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌", 148 | r"𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈", 149 | r"𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰", 150 | r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘", 151 | r"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐", 152 | r"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢", 153 | # File paths 154 | r"../../../../../../../../../../../etc/passwd%00", 155 | r"../../../../../../../../../../../etc/hosts", 156 | # iOS Vulnerabilities 157 | r"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗", 158 | r"🏳0🌈️", 159 | ] 160 | 161 | 162 | @pytest.fixture(scope="module") 163 | def nlp(request): 164 | return spacy.load(request.param) 165 | 166 | 167 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 168 | @pytest.mark.parametrize("text, expected_tokens", TOKENIZER_TESTS) 169 | def test_tokenize(nlp, text, expected_tokens): 170 | tokens = [token.text for token in nlp(text)] 171 | assert tokens == expected_tokens 172 | 173 | 174 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 175 | @pytest.mark.parametrize("text, len_a, len_b, len_c", COMPOUND_SPLITER_TESTS) 176 | def test_compound_spliter(nlp, text, len_a, len_b, len_c): 177 | assert len(nlp(text)) == len_c 178 | for split_mode, l in zip(["A", "B", "C"], [len_a, len_b, len_c]): 179 | set_split_mode(nlp, split_mode) 180 | assert len(nlp(text)) == l 181 | 182 | 183 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 184 | @pytest.mark.parametrize("text, expected_tags", TAG_TESTS) 185 | def test_tag(nlp, text, expected_tags): 186 | tags = [token.tag_ for token in nlp(text)] 187 | assert tags == expected_tags 188 | 189 | 190 | @pytest.mark.parametrize("nlp", ["ja_ginza"], indirect=True) 191 | @pytest.mark.parametrize("text, expected_poss", POS_TESTS_JA_GINZA) 192 | def test_pos_ja_ginza(nlp, text, expected_poss): 193 | poss = [token.pos_ for token in nlp(text)] 194 | assert poss == expected_poss 195 | 196 | 197 | @pytest.mark.parametrize("nlp", ["ja_ginza_electra"], indirect=True) 198 | @pytest.mark.parametrize("text, expected_poss", POS_TESTS_JA_GINZA_ELECTRA) 199 | def test_pos_ja_ginza_electra(nlp, text, expected_poss): 200 | poss = [token.pos_ for token in nlp(text)] 201 | assert poss == expected_poss 202 | 203 | 204 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 205 | @pytest.mark.parametrize("text, lemma", LEMMATIZE_TESTS) 206 | def 
test_lemmatize(nlp, text, lemma): 207 | doc = nlp(text) 208 | assert lemma == doc[0].lemma_ 209 | 210 | 211 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 212 | @pytest.mark.parametrize("text, norm", NORMALIZE_TESTS) 213 | def test_normalize(nlp, text, norm): 214 | doc = nlp(text) 215 | assert norm == doc[0].norm_ 216 | 217 | 218 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 219 | @pytest.mark.parametrize("text, expected_len", EMPTYISH_TESTS) 220 | def test_emptyish_texts(nlp, text, expected_len): 221 | doc = nlp(text) 222 | assert len(doc) == expected_len 223 | 224 | 225 | @pytest.mark.parametrize("nlp", MODELS, indirect=True) 226 | @pytest.mark.parametrize("text", NAUGHTY_STRINGS) 227 | def test_naughty_strings(nlp, text): 228 | doc = nlp(text) 229 | assert doc.text_with_ws == text 230 | -------------------------------------------------------------------------------- /ginza_util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/megagonlabs/ginza/f67b4987af09bad939d75c89b4e9483b90c064ee/ginza_util/__init__.py -------------------------------------------------------------------------------- /ginza_util/browse_trees.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import plac 3 | import sys 4 | import threading 5 | import time 6 | import webbrowser 7 | import spacy 8 | from spacy import displacy 9 | from spacy.gold import GoldCorpus 10 | from ginza import * 11 | 12 | 13 | @plac.annotations( 14 | model_path=("model directory path", "option", "b", str), 15 | split_mode=("split mode", "option", "s", str, ["A", "B", "C", None]), 16 | style=("displacy style (default=dep)", "option", "d", str), 17 | compact=("compact", "flag", "c"), 18 | browser_command=("web browser command", "option", "w", str), 19 | ) 20 | def main( 21 | model_path=None, 22 | split_mode=None, 23 | style='dep', 24 | compact=False, 25 | browser_command=None, 26 | ): 27 | if model_path: 28 | nlp = spacy.load(model_path) 29 | else: 30 | nlp = spacy.load("ja_ginza") 31 | 32 | if split_mode: 33 | set_split_mode(nlp, split_mode) 34 | 35 | if browser_command: 36 | browser = webbrowser.get(browser_command) 37 | else: 38 | browser = None 39 | 40 | print("Input a sentence line:", file=sys.stderr) 41 | line = input() 42 | docs = [nlp(line)] 43 | 44 | display(browser, docs, style, compact) 45 | 46 | 47 | def display(browser, docs, style='dep', compact=False, url='http://localhost:5000'): 48 | if browser: 49 | thread = threading.Thread(target=open_browser, args=[browser, url]) 50 | thread.start() 51 | else: 52 | print('open following url by web browser', file=sys.stderr) 53 | print(url, file=sys.stderr) 54 | displacy.serve(docs, style, options={'compact': compact, 'collapse_punct': False}) 55 | 56 | 57 | def open_browser(browser, url, wait=0.5): 58 | if wait: 59 | time.sleep(wait) 60 | browser.open(url) 61 | 62 | 63 | if __name__ == '__main__': 64 | plac.call(main) 65 | -------------------------------------------------------------------------------- /ginza_util/conv_connlu_to_json.rea.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | corpus_dir=$1 6 | corpus_title=$2 7 | 8 | for s in dev test train; do 9 | for n in 1 10 -15; do 10 | 11 | if ((n == -15)); then 12 | file_n_sents=random_sents 13 | else 14 | file_n_sents=$n 15 | fi 16 | 17 | python ginza_util/conllu_to_json.py -n $n -r C -e -a 
$corpus_dir/$corpus_title-$s.ne.conllu > $corpus_dir/$corpus_title-$s.ne.rea.$file_n_sents.json 18 | 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /ginza_util/evaluate_conllu.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals, print_function 3 | 4 | import json 5 | import sys 6 | 7 | import spacy 8 | 9 | 10 | USAGE = ''' 11 | Usage: python evaluate_model.py spacy_model_path json_file1 [json_file2 ...] 12 | ''' 13 | 14 | 15 | def evaluate_from_file( 16 | conllu_file, 17 | json_files, 18 | ): 19 | gold = [] 20 | for file in json_files: 21 | with open(file, 'r', encoding="utf-8") as f: 22 | for doc in json.load(f): 23 | for paragraph in doc['paragraphs']: 24 | for sentence in paragraph['sentences']: 25 | tokens = sentence['tokens'] 26 | gold.append(tokens) 27 | 28 | system = load_conllu(conllu_file) 29 | 30 | return evaluate( 31 | gold, 32 | system, 33 | sys.stdout, 34 | ) 35 | 36 | 37 | class TokenFake: 38 | pass 39 | 40 | def load_conllu(conllu_file): 41 | with open(conllu_file, "r", encoding="utf8") as fin: 42 | sentences = {} 43 | sentence = None 44 | tokens = [] 45 | idx = 0 46 | for line in fin: 47 | line = line.rstrip() 48 | if not line: 49 | assert sentence and tokens 50 | for t in tokens: 51 | if t.dep_ == "root": 52 | t.dep_ = "ROOT" 53 | t.head = t 54 | else: 55 | t.head = tokens[t.head] 56 | sentences[sentence] = tokens 57 | sentence = None 58 | tokens = [] 59 | idx = 0 60 | continue 61 | if line.startswith("# text = "): 62 | assert not sentence and not tokens 63 | sentence = line[9:] 64 | continue 65 | r = line.split("\t") 66 | t = TokenFake() 67 | t.i = int(r[0]) - 1 68 | t.orth_, t.lemma_, t.pos_, t.tag_ = r[1:5] 69 | t.head = int(r[6]) - 1 70 | t.dep_ = r[7] 71 | t.ent_type_ = None 72 | t.ent_iob_ = "O" 73 | if "NE=" in r[9]: 74 | t.ent_iob_, t.ent_type_ = r[9].split("NE=")[1].split("|")[0].split("-") 75 | print(sentence, t.orth_) 76 | t.idx = sentence.index(t.orth_, idx) 77 | idx = t.idx + len(t.orth_) 78 | tokens.append(t) 79 | assert not sentence and not tokens 80 | return sentences 81 | 82 | 83 | def evaluate( 84 | gold_corpus, 85 | system, 86 | fout=sys.stdout, 87 | morph_custom_condition=lambda g, r: g['pos'] == r.pos_ if g['tag'].find('可能') >= 0 else None, 88 | ): 89 | stats = Stats() 90 | 91 | print('Evaluate {} sentences'.format(len(gold_corpus)), file=sys.stderr, flush=True) 92 | for i, gold_tokens in enumerate(gold_corpus): 93 | if i % 100 == 0: 94 | print('.', end='', file=sys.stderr, flush=True) 95 | 96 | offset = 0 97 | sentence = '' 98 | for idx, t in enumerate(gold_tokens): 99 | t['head'] = gold_tokens[idx + t['head']] 100 | t['offset'] = offset 101 | offset += len(t['orth']) 102 | t['end'] = offset 103 | sentence += t['orth'] 104 | if 'whitespace' in t and t['whitespace']: 105 | offset += 1 106 | sentence += ' ' 107 | try: 108 | doc = system[sentence] 109 | stats.evaluate(gold_tokens, doc, morph_custom_condition) 110 | except Exception as e: 111 | print("Evaluation error:", sentence, file=sys.stderr) 112 | raise e 113 | print(file=sys.stderr, flush=True) 114 | 115 | stats.print(fout) 116 | 117 | return stats 118 | 119 | 120 | COMMON_FORMAT = "LAS={:.4f},UAS={:.4f},LAS_POS={:.4f},UAS_POS={:.4f},POS={:.4f},TAG={:.4f},boundary={:.4f}" 121 | 122 | 123 | class Stats: 124 | def __init__(self): 125 | self.sentences = 0 126 | self.gold_tokens = 0 127 | self.result_tokens = 0 128 | self.custom_tokens = 0 129 | 
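        # The counters below follow the metrics reported by print(): "correct"
        # counts cover token boundaries, TAG/POS matches, and unlabeled (UAS) and
        # labeled (LAS) attachments, both per token and per whole sentence, plus
        # confusion matrices and NE span/label counts. print() derives recall as
        # correct/gold_tokens, precision as correct/result_tokens, and
        # F1 = 2*precision*recall / (precision + recall).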
self.correct_tokens = 0 130 | self.correct_tag_tokens = 0 131 | self.correct_pos_tokens = 0 132 | self.correct_pos_uas_tokens = 0 133 | self.correct_pos_las_tokens = 0 134 | self.correct_uas_tokens = 0 135 | self.correct_las_tokens = 0 136 | self.correct_custom_tokens = 0 137 | self.correct_sentences = 0 138 | self.correct_tag_sentences = 0 139 | self.correct_pos_sentences = 0 140 | self.correct_pos_uas_sentences = 0 141 | self.correct_pos_las_sentences = 0 142 | self.correct_uas_sentences = 0 143 | self.correct_las_sentences = 0 144 | self.correct_roots = 0 145 | self.dep_confusion = {} 146 | self.pos_confusion = {} 147 | 148 | self.gold_ents = 0 149 | self.result_ents = 0 150 | self.correct_ent_spans = 0 151 | self.correct_ent_labels = 0 152 | self.ent_confusion = {} 153 | 154 | def score(self): 155 | return sum([ 156 | self.correct_tokens, 157 | self.correct_pos_tokens, 158 | self.correct_pos_uas_tokens, 159 | self.correct_pos_las_tokens, 160 | self.correct_uas_tokens, 161 | self.correct_las_tokens, 162 | self.correct_custom_tokens, 163 | self.correct_sentences, 164 | self.correct_pos_sentences, 165 | self.correct_pos_uas_sentences, 166 | self.correct_pos_las_sentences, 167 | self.correct_uas_sentences, 168 | self.correct_las_sentences, 169 | self.correct_roots, 170 | ]) 171 | 172 | def print(self, file=sys.stdout): 173 | def f1(p, r): 174 | if p + r == 0.0: 175 | return 0.0 176 | else: 177 | return 2 * p * r / (p + r) 178 | 179 | for title, matrix in ( 180 | ('pos_confusion', self.pos_confusion), 181 | ('dep_confusion', self.dep_confusion), 182 | ('ent_confusion', self.ent_confusion), 183 | ): 184 | print(' {}'.format(title), file=file) 185 | max_label_len = str(max(len(g) for g in matrix.keys())) 186 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 187 | results = matrix[gold] 188 | print((' {:<' + max_label_len + '}({:>6}): {}').format(gold, sum(results.values()), ', '.join([ 189 | '{}={}'.format(pos, num) for pos, num in sorted(results.items(), key=lambda t:-t[1]) 190 | ])), file=file) 191 | print(' precision, recall, f1', file=file) 192 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 193 | results = matrix[gold] 194 | total = sum(results.values()) 195 | correct = results.get(gold, results.get(gold.upper(), 0)) 196 | output = sum(sum(v for k, v in r.items() if k.lower() == gold.lower()) for r in matrix.values()) 197 | p = correct / output if output else 0 198 | r = correct / total if total else 0 199 | f = p * r * 2 / (p + r) if p and r else 0 200 | print((' {:<' + max_label_len + '}: {:.3f}, {:.3f}, {:.3f}').format(gold, p, r, f), file=file) 201 | 202 | print("sentence={}, gold_token={}, result_token={}, custom_cond={:.4f}({}/{})".format( 203 | self.sentences, 204 | self.gold_tokens, 205 | self.result_tokens, 206 | (self.correct_custom_tokens / self.custom_tokens) if self.custom_tokens > 0 else 0, 207 | self.correct_custom_tokens, 208 | self.custom_tokens, 209 | ), file=file) 210 | print((" token_f1:" + COMMON_FORMAT).format( 211 | f1(self.correct_las_tokens / self.gold_tokens, self.correct_las_tokens / self.result_tokens), 212 | f1(self.correct_uas_tokens / self.gold_tokens, self.correct_uas_tokens / self.result_tokens), 213 | f1(self.correct_pos_las_tokens / self.gold_tokens, self.correct_pos_las_tokens / self.result_tokens), 214 | f1(self.correct_pos_uas_tokens / self.gold_tokens, self.correct_pos_uas_tokens / self.result_tokens), 215 | f1(self.correct_pos_tokens / self.gold_tokens, self.correct_pos_tokens / self.result_tokens), 216 | 
f1(self.correct_tag_tokens / self.gold_tokens, self.correct_tag_tokens / self.result_tokens), 217 | f1(self.correct_tokens / self.gold_tokens, self.correct_tokens / self.result_tokens), 218 | ), file=file) 219 | print((" token_recall:" + COMMON_FORMAT).format( 220 | self.correct_las_tokens / self.gold_tokens, 221 | self.correct_uas_tokens / self.gold_tokens, 222 | self.correct_pos_las_tokens / self.gold_tokens, 223 | self.correct_pos_uas_tokens / self.gold_tokens, 224 | self.correct_pos_tokens / self.gold_tokens, 225 | self.correct_tag_tokens / self.gold_tokens, 226 | self.correct_tokens / self.gold_tokens, 227 | ), file=file) 228 | print((" token_precision:" + COMMON_FORMAT).format( 229 | self.correct_las_tokens / self.result_tokens, 230 | self.correct_uas_tokens / self.result_tokens, 231 | self.correct_pos_las_tokens / self.result_tokens, 232 | self.correct_pos_uas_tokens / self.result_tokens, 233 | self.correct_pos_tokens / self.result_tokens, 234 | self.correct_tag_tokens / self.result_tokens, 235 | self.correct_tokens / self.result_tokens, 236 | ), file=file) 237 | print((" whole_sentence:" + COMMON_FORMAT + ",root={:.4f}").format( 238 | self.correct_las_sentences / self.sentences, 239 | self.correct_uas_sentences / self.sentences, 240 | self.correct_pos_las_sentences / self.sentences, 241 | self.correct_pos_uas_sentences / self.sentences, 242 | self.correct_pos_sentences / self.sentences, 243 | self.correct_tag_sentences / self.sentences, 244 | self.correct_sentences / self.sentences, 245 | self.correct_roots / self.sentences, 246 | ), file=file) 247 | print("ent_gold={}, ent_result={}".format( 248 | self.gold_ents, 249 | self.result_ents, 250 | ), file=file) 251 | if self.gold_ents and self.result_ents: 252 | print(" ent_f1:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 253 | f1(self.correct_ent_labels / self.gold_ents, self.correct_ent_labels / self.result_ents), 254 | f1(self.correct_ent_spans / self.gold_ents, self.correct_ent_spans / self.result_ents), 255 | ), file=file) 256 | print(" ent_recall:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 257 | self.correct_ent_labels / self.gold_ents, 258 | self.correct_ent_spans / self.gold_ents, 259 | ), file=file) 260 | print(" ent_precision:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 261 | self.correct_ent_labels / self.result_ents, 262 | self.correct_ent_spans / self.result_ents, 263 | ), file=file) 264 | file.flush() 265 | 266 | def evaluate(self, gold, doc, morph_custom_condition, debug=False): 267 | def count(matrix, l1, l2): 268 | if l1 not in matrix: 269 | matrix[l1] = {} 270 | m2 = matrix[l1] 271 | if l2 in m2: 272 | m2[l2] += 1 273 | else: 274 | m2[l2] = 1 275 | 276 | self.sentences += 1 277 | self.gold_tokens += len(gold) 278 | self.result_tokens += len(doc) 279 | 280 | correct_tokens = 0 281 | correct_tag_tokens = 0 282 | correct_pos_tokens = 0 283 | correct_uas_tokens = 0 284 | correct_las_tokens = 0 285 | correct_pos_uas_tokens = 0 286 | correct_pos_las_tokens = 0 287 | custom_tokens = 0 288 | correct_custom_tokens = 0 289 | index_g = 0 290 | index_r = 0 291 | last_match_g = 0 292 | last_match_r = 0 293 | while index_g < len(gold) and index_r < len(doc): 294 | g = gold[index_g] 295 | g_end = g['end'] 296 | r = doc[index_r] 297 | r_end = r.idx + len(r.orth_) 298 | if g['offset'] == r.idx: 299 | if g_end == r_end: 300 | correct_tokens += 1 301 | count(self.pos_confusion, g['pos'], r.pos_) 302 | if g['tag'] == r.tag_: 303 | correct_tag_tokens += 1 304 | if g['pos'] == r.pos_: 305 | correct_pos_tokens += 1 306 | if is_correct_dep(g, 
r): 307 | correct_uas_tokens += 1 308 | count(self.dep_confusion, g['dep'].lower(), r.dep_) 309 | if g['pos'] == r.pos_: 310 | correct_pos_uas_tokens += 1 311 | if g['dep'].lower() == r.dep_.lower(): 312 | correct_las_tokens += 1 313 | if g['pos'] == r.pos_: 314 | correct_pos_las_tokens += 1 315 | else: 316 | count(self.dep_confusion, g['dep'].lower(), '_') 317 | if g['dep'].lower() == 'root' and r.dep_.lower() == 'root': 318 | self.correct_roots += 1 319 | elif g_end < r_end: 320 | count(self.pos_confusion, g['pos'], '_') 321 | count(self.dep_confusion, g['dep'].lower(), '_') 322 | elif g_end < r_end: 323 | count(self.pos_confusion, g['pos'], '_') 324 | count(self.dep_confusion, g['dep'].lower(), '_') 325 | 326 | if debug: 327 | if g_end == r_end: 328 | print('{}\t{}\t{}'.format( 329 | '=' if index_g == last_match_g and index_r == last_match_r else 330 | '>' if index_g == last_match_g else 331 | '<' if index_r == last_match_r else 332 | '!', 333 | ','.join(['-'.join(( 334 | m['orth'], m['pos'], m['dep'], str(m['head']['offset']), str(m['head']['end']) 335 | )) for m in gold[last_match_g:index_g + 1]]), 336 | ','.join(['-'.join(( 337 | m.orth_, m.pos_, m.dep_, str(m.head.idx), str(m.head.idx + len(m.head.orth_)) 338 | )) for m in doc[last_match_r:index_r + 1]]), 339 | )) 340 | last_match_g = index_g + 1 341 | last_match_r = index_r + 1 342 | if g_end <= r_end: 343 | index_g += 1 344 | if g_end >= r_end: 345 | index_r += 1 346 | 347 | tokens = len(gold) 348 | self.correct_tokens += correct_tokens 349 | if correct_tokens == tokens: 350 | self.correct_sentences += 1 351 | self.correct_tag_tokens += correct_tag_tokens 352 | if correct_tag_tokens == tokens: 353 | self.correct_tag_sentences += 1 354 | self.correct_pos_tokens += correct_pos_tokens 355 | if correct_pos_tokens == tokens: 356 | self.correct_pos_sentences += 1 357 | self.correct_uas_tokens += correct_uas_tokens 358 | if correct_uas_tokens == tokens: 359 | self.correct_uas_sentences += 1 360 | self.correct_las_tokens += correct_las_tokens 361 | if correct_las_tokens == tokens: 362 | self.correct_las_sentences += 1 363 | self.correct_pos_uas_tokens += correct_pos_uas_tokens 364 | if correct_pos_uas_tokens == tokens: 365 | self.correct_pos_uas_sentences += 1 366 | self.correct_pos_las_tokens += correct_pos_las_tokens 367 | if correct_pos_las_tokens == tokens: 368 | self.correct_pos_las_sentences += 1 369 | 370 | result_borders = {r.idx: (len(r.orth_), r) for r in doc} 371 | for g in gold: 372 | length, r = result_borders.get(g['offset'], (0, None)) 373 | if length == len(g['orth']): 374 | custom = morph_custom_condition(g, r) 375 | if custom is not None: 376 | custom_tokens += 1 377 | if custom: 378 | correct_custom_tokens += 1 379 | # else: 380 | # print(custom, g.surface, r.surface, g.pos, r.pos, g.tag, r.tag) 381 | self.custom_tokens += custom_tokens 382 | self.correct_custom_tokens += correct_custom_tokens 383 | 384 | gold_ents = {} 385 | ent_label = None 386 | ent_begin = None 387 | for g in gold: 388 | ner = g['ner'] if 'ner' in g else '-' 389 | if ner.startswith('B-'): 390 | ent_label = ner[2:] 391 | ent_begin = g['offset'] 392 | elif ner.startswith('L-'): 393 | gold_ents[(ent_begin, g['end'])] = ent_label 394 | ent_label = None 395 | ent_begin = None 396 | elif ner.startswith('U-'): 397 | gold_ents[(g['offset'], g['end'])] = ner[2:] 398 | result_ents = {} 399 | ent_label = None 400 | ent_begin = None 401 | ent_end = None 402 | for r in doc: 403 | if ent_label and r.ent_iob_ != 'I': 404 | result_ents[(ent_begin, ent_end)] = 
ent_label 405 | ent_label = None 406 | ent_begin = None 407 | ent_end = None 408 | if r.ent_iob_ == 'B': 409 | ent_label = r.ent_type_ 410 | ent_begin = r.idx 411 | ent_end = r.idx + len(r.orth_) 412 | elif r.ent_iob_ == 'I': 413 | ent_end = r.idx + len(r.orth_) 414 | if ent_label: 415 | result_ents[(ent_begin, ent_end)] = ent_label 416 | 417 | self.gold_ents += len(gold_ents) 418 | self.result_ents += len(result_ents) 419 | for k, gold_label in gold_ents.items(): 420 | if k in result_ents: 421 | self.correct_ent_spans += 1 422 | result_label = result_ents[k] 423 | count(self.ent_confusion, gold_label, result_label) 424 | if gold_label == result_label: 425 | self.correct_ent_labels += 1 426 | else: 427 | count(self.ent_confusion, gold_label, '_') 428 | 429 | 430 | def is_correct_dep(g, r): 431 | return g['head']['offset'] <= r.head.idx and g['head']['end'] >= r.head.idx + len(r.head.orth_) or \ 432 | g['head']['offset'] >= r.head.idx and g['head']['end'] <= r.head.idx + len(r.head.orth_) 433 | 434 | 435 | if __name__ == '__main__': 436 | if len(sys.argv) < 3: 437 | print(USAGE, file=sys.stderr) 438 | exit(2) 439 | evaluate_from_file(sys.argv[1], sys.argv[2:]) 440 | -------------------------------------------------------------------------------- /ginza_util/evaluate_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals, print_function 3 | 4 | import json 5 | import sys 6 | 7 | import spacy 8 | 9 | 10 | USAGE = ''' 11 | Usage: python evaluate_model.py spacy_model_path json_file1 [json_file2 ...] 12 | ''' 13 | 14 | 15 | def evaluate_from_file( 16 | model_path, 17 | json_files, 18 | ): 19 | gold = [] 20 | for file in json_files: 21 | with open(file, 'r', encoding="utf-8") as f: 22 | for doc in json.load(f): 23 | for paragraph in doc['paragraphs']: 24 | for sentence in paragraph['sentences']: 25 | tokens = sentence['tokens'] 26 | gold.append(tokens) 27 | 28 | nlp = spacy.load(model_path) 29 | nlp.tokenizer.use_sentence_separator = False 30 | 31 | return evaluate( 32 | gold, 33 | nlp, 34 | sys.stdout, 35 | ) 36 | 37 | 38 | def evaluate( 39 | gold_corpus, 40 | nlp, 41 | fout=sys.stdout, 42 | morph_custom_condition=lambda g, r: g['pos'] == r.pos_ if g['tag'].find('可能') >= 0 else None, 43 | ): 44 | stats = Stats() 45 | 46 | print('Evaluate {} sentences'.format(len(gold_corpus)), file=sys.stderr, flush=True) 47 | for i, gold_tokens in enumerate(gold_corpus): 48 | if i % 100 == 0: 49 | print('.', end='', file=sys.stderr, flush=True) 50 | 51 | offset = 0 52 | sentence = '' 53 | for idx, t in enumerate(gold_tokens): 54 | t['head'] = gold_tokens[idx + t['head']] 55 | t['offset'] = offset 56 | offset += len(t['orth']) 57 | t['end'] = offset 58 | sentence += t['orth'] 59 | if 'whitespace' in t and t['whitespace']: 60 | offset += 1 61 | sentence += ' ' 62 | try: 63 | doc = nlp(sentence) 64 | stats.evaluate(gold_tokens, doc, morph_custom_condition) 65 | except Exception as e: 66 | print("Evaluation error:", sentence, file=sys.stderr) 67 | raise e 68 | print(file=sys.stderr, flush=True) 69 | 70 | stats.print(fout) 71 | 72 | return stats 73 | 74 | 75 | COMMON_FORMAT = "LAS={:.4f},UAS={:.4f},LAS_POS={:.4f},UAS_POS={:.4f},POS={:.4f},TAG={:.4f},boundary={:.4f}" 76 | 77 | 78 | class Stats: 79 | def __init__(self): 80 | self.sentences = 0 81 | self.gold_tokens = 0 82 | self.result_tokens = 0 83 | self.custom_tokens = 0 84 | self.correct_tokens = 0 85 | self.correct_tag_tokens = 0 86 | self.correct_pos_tokens = 0 87 
| self.correct_pos_uas_tokens = 0 88 | self.correct_pos_las_tokens = 0 89 | self.correct_uas_tokens = 0 90 | self.correct_las_tokens = 0 91 | self.correct_custom_tokens = 0 92 | self.correct_sentences = 0 93 | self.correct_tag_sentences = 0 94 | self.correct_pos_sentences = 0 95 | self.correct_pos_uas_sentences = 0 96 | self.correct_pos_las_sentences = 0 97 | self.correct_uas_sentences = 0 98 | self.correct_las_sentences = 0 99 | self.correct_roots = 0 100 | self.dep_confusion = {} 101 | self.pos_confusion = {} 102 | 103 | self.gold_ents = 0 104 | self.result_ents = 0 105 | self.correct_ent_spans = 0 106 | self.correct_ent_labels = 0 107 | self.ent_confusion = {} 108 | 109 | def score(self): 110 | return sum([ 111 | self.correct_tokens, 112 | self.correct_pos_tokens, 113 | self.correct_pos_uas_tokens, 114 | self.correct_pos_las_tokens, 115 | self.correct_uas_tokens, 116 | self.correct_las_tokens, 117 | self.correct_custom_tokens, 118 | self.correct_sentences, 119 | self.correct_pos_sentences, 120 | self.correct_pos_uas_sentences, 121 | self.correct_pos_las_sentences, 122 | self.correct_uas_sentences, 123 | self.correct_las_sentences, 124 | self.correct_roots, 125 | ]) 126 | 127 | def print(self, file=sys.stdout): 128 | def f1(p, r): 129 | if p + r == 0.0: 130 | return 0.0 131 | else: 132 | return 2 * p * r / (p + r) 133 | 134 | for title, matrix in ( 135 | ('pos_confusion', self.pos_confusion), 136 | ('dep_confusion', self.dep_confusion), 137 | ('ent_confusion', self.ent_confusion), 138 | ): 139 | print(' {}'.format(title), file=file) 140 | max_label_len = str(max(len(g) for g in matrix.keys())) 141 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 142 | results = matrix[gold] 143 | print((' {:<' + max_label_len + '}({:>6}): {}').format(gold, sum(results.values()), ', '.join([ 144 | '{}={}'.format(pos, num) for pos, num in sorted(results.items(), key=lambda t:-t[1]) 145 | ])), file=file) 146 | print(' precision, recall, f1', file=file) 147 | for gold, results in sorted(matrix.items(), key=lambda t: t[0]): 148 | results = matrix[gold] 149 | total = sum(results.values()) 150 | correct = results.get(gold, results.get(gold.upper(), 0)) 151 | output = sum(sum(v for k, v in r.items() if k.lower() == gold.lower()) for r in matrix.values()) 152 | p = correct / output if output else 0 153 | r = correct / total if total else 0 154 | f = p * r * 2 / (p + r) if p and r else 0 155 | print((' {:<' + max_label_len + '}: {:.3f}, {:.3f}, {:.3f}').format(gold, p, r, f), file=file) 156 | 157 | print("sentence={}, gold_token={}, result_token={}, custom_cond={:.4f}({}/{})".format( 158 | self.sentences, 159 | self.gold_tokens, 160 | self.result_tokens, 161 | (self.correct_custom_tokens / self.custom_tokens) if self.custom_tokens > 0 else 0, 162 | self.correct_custom_tokens, 163 | self.custom_tokens, 164 | ), file=file) 165 | print((" token_f1:" + COMMON_FORMAT).format( 166 | f1(self.correct_las_tokens / self.gold_tokens, self.correct_las_tokens / self.result_tokens), 167 | f1(self.correct_uas_tokens / self.gold_tokens, self.correct_uas_tokens / self.result_tokens), 168 | f1(self.correct_pos_las_tokens / self.gold_tokens, self.correct_pos_las_tokens / self.result_tokens), 169 | f1(self.correct_pos_uas_tokens / self.gold_tokens, self.correct_pos_uas_tokens / self.result_tokens), 170 | f1(self.correct_pos_tokens / self.gold_tokens, self.correct_pos_tokens / self.result_tokens), 171 | f1(self.correct_tag_tokens / self.gold_tokens, self.correct_tag_tokens / self.result_tokens), 172 | 
f1(self.correct_tokens / self.gold_tokens, self.correct_tokens / self.result_tokens), 173 | ), file=file) 174 | print((" token_recall:" + COMMON_FORMAT).format( 175 | self.correct_las_tokens / self.gold_tokens, 176 | self.correct_uas_tokens / self.gold_tokens, 177 | self.correct_pos_las_tokens / self.gold_tokens, 178 | self.correct_pos_uas_tokens / self.gold_tokens, 179 | self.correct_pos_tokens / self.gold_tokens, 180 | self.correct_tag_tokens / self.gold_tokens, 181 | self.correct_tokens / self.gold_tokens, 182 | ), file=file) 183 | print((" token_precision:" + COMMON_FORMAT).format( 184 | self.correct_las_tokens / self.result_tokens, 185 | self.correct_uas_tokens / self.result_tokens, 186 | self.correct_pos_las_tokens / self.result_tokens, 187 | self.correct_pos_uas_tokens / self.result_tokens, 188 | self.correct_pos_tokens / self.result_tokens, 189 | self.correct_tag_tokens / self.result_tokens, 190 | self.correct_tokens / self.result_tokens, 191 | ), file=file) 192 | print((" whole_sentence:" + COMMON_FORMAT + ",root={:.4f}").format( 193 | self.correct_las_sentences / self.sentences, 194 | self.correct_uas_sentences / self.sentences, 195 | self.correct_pos_las_sentences / self.sentences, 196 | self.correct_pos_uas_sentences / self.sentences, 197 | self.correct_pos_sentences / self.sentences, 198 | self.correct_tag_sentences / self.sentences, 199 | self.correct_sentences / self.sentences, 200 | self.correct_roots / self.sentences, 201 | ), file=file) 202 | print("ent_gold={}, ent_result={}".format( 203 | self.gold_ents, 204 | self.result_ents, 205 | ), file=file) 206 | if self.gold_ents and self.result_ents: 207 | print(" ent_f1:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 208 | f1(self.correct_ent_labels / self.gold_ents, self.correct_ent_labels / self.result_ents), 209 | f1(self.correct_ent_spans / self.gold_ents, self.correct_ent_spans / self.result_ents), 210 | ), file=file) 211 | print(" ent_recall:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 212 | self.correct_ent_labels / self.gold_ents, 213 | self.correct_ent_spans / self.gold_ents, 214 | ), file=file) 215 | print(" ent_precision:SPAN_LABEL={:.4f},SPAN_ONLY={:.4f}".format( 216 | self.correct_ent_labels / self.result_ents, 217 | self.correct_ent_spans / self.result_ents, 218 | ), file=file) 219 | file.flush() 220 | 221 | def evaluate(self, gold, doc, morph_custom_condition, debug=False): 222 | def count(matrix, l1, l2): 223 | if l1 not in matrix: 224 | matrix[l1] = {} 225 | m2 = matrix[l1] 226 | if l2 in m2: 227 | m2[l2] += 1 228 | else: 229 | m2[l2] = 1 230 | 231 | self.sentences += 1 232 | self.gold_tokens += len(gold) 233 | self.result_tokens += len(doc) 234 | 235 | correct_tokens = 0 236 | correct_tag_tokens = 0 237 | correct_pos_tokens = 0 238 | correct_uas_tokens = 0 239 | correct_las_tokens = 0 240 | correct_pos_uas_tokens = 0 241 | correct_pos_las_tokens = 0 242 | custom_tokens = 0 243 | correct_custom_tokens = 0 244 | index_g = 0 245 | index_r = 0 246 | last_match_g = 0 247 | last_match_r = 0 248 | while index_g < len(gold) and index_r < len(doc): 249 | g = gold[index_g] 250 | g_end = g['end'] 251 | r = doc[index_r] 252 | r_end = r.idx + len(r.orth_) 253 | if g['offset'] == r.idx: 254 | if g_end == r_end: 255 | correct_tokens += 1 256 | count(self.pos_confusion, g['pos'], r.pos_) 257 | if g['tag'] == r.tag_: 258 | correct_tag_tokens += 1 259 | if g['pos'] == r.pos_: 260 | correct_pos_tokens += 1 261 | if is_correct_dep(g, r): 262 | correct_uas_tokens += 1 263 | count(self.dep_confusion, g['dep'].lower(), r.dep_) 264 | if 
g['pos'] == r.pos_: 265 | correct_pos_uas_tokens += 1 266 | if g['dep'].lower() == r.dep_.lower(): 267 | correct_las_tokens += 1 268 | if g['pos'] == r.pos_: 269 | correct_pos_las_tokens += 1 270 | else: 271 | count(self.dep_confusion, g['dep'].lower(), '_') 272 | if g['dep'].lower() == 'root' and r.dep_.lower() == 'root': 273 | self.correct_roots += 1 274 | elif g_end < r_end: 275 | count(self.pos_confusion, g['pos'], '_') 276 | count(self.dep_confusion, g['dep'].lower(), '_') 277 | elif g_end < r_end: 278 | count(self.pos_confusion, g['pos'], '_') 279 | count(self.dep_confusion, g['dep'].lower(), '_') 280 | 281 | if debug: 282 | if g_end == r_end: 283 | print('{}\t{}\t{}'.format( 284 | '=' if index_g == last_match_g and index_r == last_match_r else 285 | '>' if index_g == last_match_g else 286 | '<' if index_r == last_match_r else 287 | '!', 288 | ','.join(['-'.join(( 289 | m['orth'], m['pos'], m['dep'], str(m['head']['offset']), str(m['head']['end']) 290 | )) for m in gold[last_match_g:index_g + 1]]), 291 | ','.join(['-'.join(( 292 | m.orth_, m.pos_, m.dep_, str(m.head.idx), str(m.head.idx + len(m.head.orth_)) 293 | )) for m in doc[last_match_r:index_r + 1]]), 294 | )) 295 | last_match_g = index_g + 1 296 | last_match_r = index_r + 1 297 | if g_end <= r_end: 298 | index_g += 1 299 | if g_end >= r_end: 300 | index_r += 1 301 | 302 | tokens = len(gold) 303 | self.correct_tokens += correct_tokens 304 | if correct_tokens == tokens: 305 | self.correct_sentences += 1 306 | self.correct_tag_tokens += correct_tag_tokens 307 | if correct_tag_tokens == tokens: 308 | self.correct_tag_sentences += 1 309 | self.correct_pos_tokens += correct_pos_tokens 310 | if correct_pos_tokens == tokens: 311 | self.correct_pos_sentences += 1 312 | self.correct_uas_tokens += correct_uas_tokens 313 | if correct_uas_tokens == tokens: 314 | self.correct_uas_sentences += 1 315 | self.correct_las_tokens += correct_las_tokens 316 | if correct_las_tokens == tokens: 317 | self.correct_las_sentences += 1 318 | self.correct_pos_uas_tokens += correct_pos_uas_tokens 319 | if correct_pos_uas_tokens == tokens: 320 | self.correct_pos_uas_sentences += 1 321 | self.correct_pos_las_tokens += correct_pos_las_tokens 322 | if correct_pos_las_tokens == tokens: 323 | self.correct_pos_las_sentences += 1 324 | 325 | result_borders = {r.idx: (len(r.orth_), r) for r in doc} 326 | for g in gold: 327 | length, r = result_borders.get(g['offset'], (0, None)) 328 | if length == len(g['orth']): 329 | custom = morph_custom_condition(g, r) 330 | if custom is not None: 331 | custom_tokens += 1 332 | if custom: 333 | correct_custom_tokens += 1 334 | # else: 335 | # print(custom, g.surface, r.surface, g.pos, r.pos, g.tag, r.tag) 336 | self.custom_tokens += custom_tokens 337 | self.correct_custom_tokens += correct_custom_tokens 338 | 339 | gold_ents = {} 340 | ent_label = None 341 | ent_begin = None 342 | for g in gold: 343 | ner = g['ner'] if 'ner' in g else '-' 344 | if ner.startswith('B-'): 345 | ent_label = ner[2:] 346 | ent_begin = g['offset'] 347 | elif ner.startswith('L-'): 348 | gold_ents[(ent_begin, g['end'])] = ent_label 349 | ent_label = None 350 | ent_begin = None 351 | elif ner.startswith('U-'): 352 | gold_ents[(g['offset'], g['end'])] = ner[2:] 353 | result_ents = {} 354 | ent_label = None 355 | ent_begin = None 356 | ent_end = None 357 | for r in doc: 358 | if ent_label and r.ent_iob_ != 'I': 359 | result_ents[(ent_begin, ent_end)] = ent_label 360 | ent_label = None 361 | ent_begin = None 362 | ent_end = None 363 | if r.ent_iob_ == 'B': 
364 | ent_label = r.ent_type_ 365 | ent_begin = r.idx 366 | ent_end = r.idx + len(r.orth_) 367 | elif r.ent_iob_ == 'I': 368 | ent_end = r.idx + len(r.orth_) 369 | if ent_label: 370 | result_ents[(ent_begin, ent_end)] = ent_label 371 | 372 | self.gold_ents += len(gold_ents) 373 | self.result_ents += len(result_ents) 374 | for k, gold_label in gold_ents.items(): 375 | if k in result_ents: 376 | self.correct_ent_spans += 1 377 | result_label = result_ents[k] 378 | count(self.ent_confusion, gold_label, result_label) 379 | if gold_label == result_label: 380 | self.correct_ent_labels += 1 381 | else: 382 | count(self.ent_confusion, gold_label, '_') 383 | 384 | 385 | def is_correct_dep(g, r): 386 | return g['head']['offset'] <= r.head.idx and g['head']['end'] >= r.head.idx + len(r.head.orth_) or \ 387 | g['head']['offset'] >= r.head.idx and g['head']['end'] <= r.head.idx + len(r.head.orth_) 388 | 389 | 390 | if __name__ == '__main__': 391 | if len(sys.argv) < 3: 392 | print(USAGE, file=sys.stderr) 393 | exit(2) 394 | evaluate_from_file(sys.argv[1], sys.argv[2:]) 395 | -------------------------------------------------------------------------------- /ginza_util/gsk2014a.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | from __future__ import unicode_literals, print_function 3 | 4 | import re 5 | import sys 6 | 7 | 8 | ID_PATTERN = re.compile( 9 | r'^.*((OC|OW|OY|PB|PM|PN)(..)_([0-9]{5}))(|\..+)$' 10 | ) 11 | 12 | 13 | def read_gsk2014a_xml(_path): 14 | in_text = False 15 | text = '' 16 | stack = [] 17 | tags = [] 18 | with open(_path, 'r') as xml: 19 | for line in xml: 20 | if line.startswith(''): 21 | in_text = True 22 | elif line.startswith(''): 23 | break 24 | elif in_text: 25 | prev = 0 26 | for m in re.finditer(r'<(/?[^>]+)>', line): 27 | text += line[prev:m.start(0)] 28 | prev = m.end(0) 29 | tag = m.group(1) 30 | offset = len(text) 31 | if tag.startswith('/'): 32 | begin_tag, begin = stack.pop() 33 | assert begin_tag == tag[1:], _path + ' ' + str(offset) + ' ' + begin_tag + ' ' + tag 34 | if not stack: 35 | tags.append((begin_tag, begin, offset)) 36 | elif tag.startswith('rejectedBlock'): 37 | return None, None 38 | else: 39 | stack.append((tag, offset)) 40 | text += line[prev:] 41 | assert not stack, _path + ' ' + str(stack) 42 | return text, tags 43 | 44 | 45 | def main(): 46 | output_base_path = sys.argv[1] 47 | for conllu_path in sys.argv[2:]: 48 | file_id = ID_PATTERN.match(conllu_path).group(1) 49 | text, tags = read_gsk2014a_xml('corpus/gsk-ene-19.6.25/bccwj/xml/{}/{}.xml'.format(file_id[:2], file_id)) 50 | tag_idx = 0 51 | in_tag = False 52 | offset = 0 53 | debug_sentence = '' 54 | output = [] 55 | with open(conllu_path, 'r') as fin: 56 | for line in fin: 57 | line = line.rstrip('\n') 58 | if not text: 59 | output.append((line, None)) 60 | continue 61 | if line.startswith('# text = '): 62 | in_tag = False 63 | debug_sentence = line 64 | output.append((line, None)) 65 | continue 66 | if line.startswith('#'): 67 | output.append((line, None)) 68 | continue 69 | if line == '': 70 | if in_tag: # for multi sentence NEs such as URLs 71 | in_tag = False 72 | l, n = output[-1] 73 | if n.startswith('B'): 74 | n = 'U-' + tag 75 | elif n.startswith('I'): 76 | n = 'L-' + tag 77 | output[-1] = (l, n) 78 | print( 79 | 'dividing ne span:', 80 | file_id, 81 | tag_begin, 82 | tag_end, 83 | tag, 84 | text[tag_begin:offset].replace('\n', '\\n'), 85 | '|', 86 | text[offset:tag_end].replace('\n', '\\n'), 87 | debug_sentence, 88 | 
file=sys.stderr 89 | ) 90 | output.append((line, None)) 91 | continue 92 | orth = line.split('\t')[1] 93 | new_offset = text.find(orth, offset) 94 | if new_offset == -1: 95 | new_offset = text.find(orth.replace(' ', ' '), offset) 96 | if new_offset == -1: 97 | if orth == 'ミュージカル': 98 | orth = 'ミュージ\nカル' 99 | elif orth == 'モテる': 100 | orth = 'モテ\nる' 101 | elif orth == 'すぎる': 102 | orth = 'す\n\nぎる' 103 | elif orth == 'いう': 104 | orth = 'い\n\nう' 105 | elif orth == '位置': 106 | orth = '位\n置' 107 | elif orth == '用いれ': 108 | orth = '用\n\nいれ' 109 | elif orth == '見込ま': 110 | orth = '見\n\n込ま' 111 | elif orth == 'なる': 112 | orth = 'な\n\nる' 113 | elif orth == '載せる': 114 | orth = '載せ\n\nる' 115 | new_offset = text.find(orth, offset) 116 | if new_offset - offset >= 2 and len(text[offset:new_offset].strip()) >= 2: 117 | if orth == '不能': 118 | orth = '不\n\n能' 119 | elif orth == '退職': 120 | orth = '退\n\n職' 121 | elif orth == 'から': 122 | orth = 'か\nら' 123 | elif orth == '思う': 124 | orth = '思\n\nう' 125 | elif orth == '中敷き': 126 | orth = '中敷\nき' 127 | new_offset = text.find(orth, offset) 128 | assert new_offset >= 0, 'lost token: {} {}\n{}\n{}\n{}'.format( 129 | file_id, 130 | offset, 131 | line, 132 | text[offset:].replace('\n', '\\n'), 133 | debug_sentence, 134 | ) 135 | if text[offset:new_offset].strip() != '': 136 | print( 137 | 'skipping text:', 138 | file_id, 139 | offset, 140 | new_offset, 141 | text[offset:new_offset].replace('\n', '\\n'), 142 | debug_sentence, 143 | file=sys.stderr 144 | ) 145 | offset = new_offset 146 | 147 | end = offset + len(orth) 148 | if 'SpaceAfter=No' not in line: 149 | end += 1 150 | 151 | if tag_idx < len(tags): 152 | tag, tag_begin, tag_end = tags[tag_idx] 153 | if end <= tag_begin: 154 | assert not in_tag, '{} {} {} {}\n{}\n{}\n{}'.format( 155 | file_id, 156 | offset, 157 | end, 158 | tag_begin, 159 | tag_end, 160 | line, 161 | text[offset], 162 | ) 163 | ner = 'O' 164 | elif offset < tag_end and not in_tag: 165 | if end < tag_end: 166 | ner = 'B-' + tag 167 | in_tag = True 168 | else: 169 | ner = 'U-' + tag 170 | tag_idx += 1 171 | elif end < tag_end: 172 | assert in_tag, '{} {} {} {}\n{}\n{}\n{}'.format( 173 | file_id, 174 | offset, 175 | end, 176 | tag_begin, 177 | tag_end, 178 | line, 179 | text[offset:], 180 | ) 181 | ner = 'I-' + tag 182 | elif tag_end <= end: 183 | if in_tag: 184 | ner = 'L-' + tag 185 | tag_idx += 1 186 | in_tag = False 187 | elif tag_begin < offset: 188 | ner = 'U-' + tag 189 | tag_idx += 1 190 | else: 191 | ner = 'O' 192 | tag_idx += 1 193 | print( 194 | 'skipping tag:', 195 | file_id, 196 | tag_begin, 197 | tag_end, 198 | tag, 199 | text[tag_begin:tag_end].replace('\n', '\\n'), 200 | debug_sentence, 201 | file=sys.stderr 202 | ) 203 | else: 204 | raise Exception("Unexpected state: token={} {}-{} {}, ne={}-{} {} {}".format( 205 | file_id, 206 | offset, 207 | end, 208 | text[offset:end].replace('\n', '\\n'), 209 | tag_begin, 210 | tag_end, 211 | text[tag_begin:tag_end].replace('\n', '\\n'), 212 | tag, 213 | )) 214 | else: 215 | ner = 'O' 216 | output.append((line, ner)) 217 | offset = end 218 | if tags and tag_idx < len(tags): 219 | for tag_idx in range(tag_idx, len(tags)): 220 | print( 221 | 'skipping tag:', 222 | file_id, 223 | tag_begin, 224 | tag_end, 225 | text[tag_begin:tag_end].replace('\n', '\\n'), 226 | '', 227 | file=sys.stderr 228 | ) 229 | prev_ner = 'O' 230 | for line, ner in output: 231 | if not ner: 232 | ner = 'O' 233 | assert prev_ner[0] not in ['B', 'I'] or ner[0] in ['I', 'L'], '{}\n{} {} {}\n{}'.format( 234 | '\n'.join([line + 
' ' + str(ner) for line, ner in output]), 235 | conllu_path, 236 | prev_ner, 237 | ner, 238 | line, 239 | ) 240 | assert prev_ner[0] not in ['L', 'U', 'O'] or ner[0] in ['B', 'U', 'O'], '{}\n{} {} {}\n{}'.format( 241 | '\n'.join([line + ' ' + str(ner) for line, ner in output]), 242 | conllu_path, 243 | prev_ner, 244 | ner, 245 | line, 246 | ) 247 | prev_ner = ner 248 | 249 | with open(output_base_path + '/' + conllu_path.split('/')[-1], 'w') as fout: 250 | for line, ner in output: 251 | if ner: 252 | if line.endswith('\t'): 253 | print(line + 'NE=' + ner, file=fout) 254 | elif line.endswith('_'): 255 | print(line[:-1] + '|NE=' + ner, file=fout) 256 | else: 257 | print(line + '|NE=' + ner, file=fout) 258 | else: 259 | print(line, file=fout) 260 | 261 | 262 | if __name__ == "__main__": 263 | # execute only if run as a script 264 | main() 265 | -------------------------------------------------------------------------------- /ginza_util/setup_meta.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | 5 | with open(sys.argv[1], "r") as fin: 6 | master = json.load(fin) 7 | 8 | with open(sys.argv[2], "r") as fin: 9 | target = json.load(fin) 10 | 11 | target.update(master) 12 | 13 | json.dump(target, sys.stdout, indent=1, ensure_ascii=False) 14 | 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy>=3.4.4,<4.0.0 2 | plac>=1.3.3 3 | SudachiPy>=0.6.2,<0.7.0 4 | SudachiDict-core>=20210802 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test = pytest 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | author="Megagon Labs, Tokyo.", 6 | author_email="ginza@megagon.ai", 7 | description="GiNZA, An Open Source Japanese NLP Library, based on Universal Dependencies", 8 | entry_points={ 9 | "spacy_factories": [ 10 | "bunsetu_recognizer = ginza:make_bunsetu_recognizer", 11 | "compound_splitter = ginza:make_compound_splitter", 12 | "disable_sentencizer = ginza:disable_sentencizer", 13 | ], 14 | "console_scripts": [ 15 | "ginza = ginza.command_line:main_ginza", 16 | "ginzame = ginza.command_line:main_ginzame", 17 | ], 18 | }, 19 | python_requires=">=3.8", 20 | install_requires=[ 21 | "spacy>=3.4.4,<4.0.0", 22 | "plac>=1.3.3", 23 | "SudachiPy>=0.6.2,<0.7.0", 24 | "SudachiDict-core>=20210802", 25 | ], 26 | setup_requires=["pytest-runner"], 27 | tests_require=["pytest", "pytest-cov", "pytest-mock"], 28 | license="MIT", 29 | name="ginza", 30 | packages=find_packages(include=["ginza"]), 31 | url="https://github.com/megagonlabs/ginza", 32 | version='5.2.0', 33 | ) 34 | --------------------------------------------------------------------------------
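Note: evaluate() in ginza_util/evaluate_model.py takes a gold corpus as a list of token lists, where each token carries orth/tag/pos/dep, a relative head offset, and an optional BILOU ner tag, while evaluate_from_file() builds that list from JSON files structured as paragraphs -> sentences -> tokens (see the USAGE string: python evaluate_model.py spacy_model_path json_file1 ...). The sketch below is not part of the repository; it calls evaluate() directly on a tiny in-memory corpus, assumes the repository root is on PYTHONPATH and that the ja_ginza model is installed, and uses tag, dep, and NER values that are illustrative only.

# minimal sketch, assuming ja_ginza is installed and ginza_util is importable
import spacy
from ginza_util.evaluate_model import evaluate

# one gold sentence: 銀座へ行く ("go to Ginza"); 'head' is a relative token offset,
# so 銀座 (+2) and へ (-1) both resolve to their syntactic heads by index arithmetic
gold_corpus = [[
    {"orth": "銀座", "tag": "名詞-固有名詞-地名-一般", "pos": "PROPN",
     "dep": "obl",  "head": 2,  "ner": "U-City"},
    {"orth": "へ",   "tag": "助詞-格助詞",           "pos": "ADP",
     "dep": "case", "head": -1, "ner": "O"},
    {"orth": "行く", "tag": "動詞-非自立可能",        "pos": "VERB",
     "dep": "ROOT", "head": 0,  "ner": "O"},
]]

nlp = spacy.load("ja_ginza")        # any installed GiNZA pipeline should work here
stats = evaluate(gold_corpus, nlp)  # prints confusion matrices plus token/sentence/NER scores to stdout
print(stats.score())                # aggregate correct-count sum, handy for comparing runs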