├── .dockerignore ├── .screenshots └── sbb_ner_demo.png ├── Dockerfile ├── Dockerfile.cpu ├── LICENSE ├── Makefile ├── README.md ├── __init__.py ├── doc └── sbb_ner_model_card.md ├── qurator ├── __init__.py └── sbb_ner │ ├── __init__.py │ ├── ground_truth │ ├── __init__.py │ ├── conll.py │ ├── data_processor.py │ ├── europeana_historic.py │ ├── germeval.py │ ├── join_gt.py │ └── wikiner.py │ ├── models │ ├── __init__.py │ ├── bert.py │ ├── corpus.py │ ├── finetune_on_pregenerated.py │ ├── pregenerate_training_data.py │ └── tokenization.py │ └── webapp │ ├── __init__.py │ ├── app.py │ ├── config-8GB-GPU.json │ ├── config.json │ ├── static │ ├── __init__.py │ ├── css │ │ ├── __init__.py │ │ └── bootstrap.min.css │ ├── index.html │ └── js │ │ ├── __init__.py │ │ ├── jquery-3.4.1.js │ │ ├── ner-demo.js │ │ └── ner.js │ └── wsgi.py ├── requirements.txt └── setup.py /.dockerignore: -------------------------------------------------------------------------------- 1 | data/* 2 | *.egg_info 3 | venv 4 | models 5 | *.tar.gz 6 | -------------------------------------------------------------------------------- /.screenshots/sbb_ner_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qurator-spk/sbb_ner/0b943e0eb532291b064b9060c154fb0da3aab371/.screenshots/sbb_ner_demo.png -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base 2 | 3 | ARG http_proxy 4 | ENV http_proxy=$http_proxy 5 | ENV https_proxy=$http_proxy 6 | 7 | RUN apt-get update && \ 8 | apt-get -y install build-essential && \ 9 | apt-get -y install python3-pip && \ 10 | apt-get clean && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY requirements.txt /tmp 13 | RUN pip3 --no-cache-dir install -r /tmp/requirements.txt 14 | 15 | COPY . /usr/src/qurator-sbb-ner 16 | 17 | RUN mkdir -p /usr/src/qurator-sbb-ner/konvens2019 18 | RUN mkdir -p /usr/src/qurator-sbb-ner/digisam 19 | 20 | RUN pip3 --no-cache-dir install -e /usr/src/qurator-sbb-ner 21 | 22 | WORKDIR /usr/src/qurator-sbb-ner 23 | CMD export LANG=C.UTF-8; env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0 24 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM python:3.6-slim-stretch 2 | 3 | ARG http_proxy 4 | ENV http_proxy=$http_proxy 5 | ENV https_proxy=$http_proxy 6 | 7 | RUN apt-get update && \ 8 | apt-get -y install build-essential && \ 9 | apt-get clean && rm -rf /var/lib/apt/lists/* 10 | 11 | COPY requirements.txt /tmp 12 | RUN pip3 --no-cache-dir install -r /tmp/requirements.txt 13 | 14 | COPY . 
/usr/src/qurator-sbb-ner 15 | 16 | RUN mkdir -p /usr/src/qurator-sbb-ner/konvens2019 17 | RUN mkdir -p /usr/src/qurator-sbb-ner/digisam 18 | 19 | RUN pip3 --no-cache-dir install -e /usr/src/qurator-sbb-ner 20 | 21 | WORKDIR /usr/src/qurator-sbb-ner 22 | CMD env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 qurator 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | REPO_PATH ?=$(shell pwd) 2 | 3 | BERT_BASE_PATH ?=$(REPO_PATH)/data/BERT 4 | 5 | BERT_URL ?=https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip 6 | REL_BERT_PATH ?=multi_cased_L-12_H-768_A-12 7 | 8 | #BERT_URL ?=https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip 9 | #REL_BERT_PATH ?=wwm_uncased_L-24_H-1024_A-16 10 | 11 | BERT_MODEL_PATH ?=$(BERT_BASE_PATH)/$(REL_BERT_PATH) 12 | 13 | DIGISAM_PATH ?=$(REPO_PATH)/data/digisam 14 | 15 | REL_FINETUNED_PATH ?=data/digisam/BERT_de_finetuned 16 | #REL_FINETUNED_PATH ?=data/digisam/BERT-large_de_finetuned 17 | BERT_FINETUNED_PATH ?=$(REPO_PATH)/$(REL_FINETUNED_PATH) 18 | 19 | NER_DATA_PATH ?=$(REPO_PATH)/data/NER 20 | 21 | REL_BUILD_PATH ?=data/build 22 | BUILD_PATH ?=$(REPO_PATH)/$(REL_BUILD_PATH) 23 | 24 | EPOCHS ?=1 25 | EPOCH_FILE ?=pytorch_model_ep$(EPOCHS).bin 26 | MODEL_FILE ?=pytorch_model.bin 27 | CROSS_VAL_FILE ?=cross_validation_results.pkl 28 | 29 | WEIGHT_DECAY ?=0.03 30 | WARMUP_PROPORTION ?=0.4 31 | 32 | BATCH_SIZE ?=32 33 | GRAD_ACC_STEPS ?=2 34 | 35 | # BATCH_SIZE ?=128 # <===== unsupervised 36 | # GRAD_ACC_STEPS ?=4 # <===== unsupervised 37 | 38 | MAX_SEQ_LEN ?=128 39 | 40 | # EXTRA_OPTIONS="--dry_run --no_cuda" <- Test if everything works. 41 | EXTRA_OPTIONS ?= 42 | 43 | DO_LOWER_CASE ?= 44 | 45 | BERT_NER_OPTIONS ?=--task_name=ner --max_seq_length=$(MAX_SEQ_LEN) --num_train_epochs=$(EPOCHS) --warmup_proportion=$(WARMUP_PROPORTION) --gradient_accumulation_steps=$(GRAD_ACC_STEPS) --train_batch_size=$(BATCH_SIZE) --gt_file=$(BUILD_PATH)/gt.pkl --weight_decay=$(WEIGHT_DECAY) $(DO_LOWER_CASE) $(EXTRA_OPTIONS) 46 | 47 | BERT_NER_EVAL_OPTIONS ?=--eval_batch_size=8 --task_name=ner --gt_file=$(BUILD_PATH)/gt.pkl $(DO_LOWER_CASE) $(EXTRA_OPTIONS) 48 | 49 | ############################################################################### 50 | # directories 51 | # 52 | 53 | $(BUILD_PATH): 54 | mkdir -p $(BUILD_PATH) 55 | 56 | $(BERT_FINETUNED_PATH): 57 | mkdir -p $(BERT_FINETUNED_PATH) 58 | cp -L $(BERT_MODEL_PATH)/pytorch_model.bin $(BERT_FINETUNED_PATH)/pytorch_model.bin 59 | chmod u+rw $(BERT_FINETUNED_PATH)/pytorch_model.bin 60 | ln -sfn $(BERT_MODEL_PATH)/bert_config.json $(BERT_FINETUNED_PATH)/bert_config.json 61 | ln -sfn $(BERT_MODEL_PATH)/vocab.txt $(BERT_FINETUNED_PATH)/vocab.txt 62 | 63 | dirs: $(BUILD_PATH) $(BERT_FINETUNED_PATH) 64 | 65 | ############################################################################### 66 | # BERT unsupervised on "Digitale Sammlungen": 67 | # 68 | 69 | TEMP_PREFIX ?=/tmp/ 70 | 71 | $(BERT_MODEL_PATH)/bert_model.ckpt.index: 72 | wget -nc --directory-prefix=$(BERT_BASE_PATH) $(BERT_URL) 73 | unzip -d $(BERT_BASE_PATH) $(BERT_MODEL_PATH).zip 74 | 75 | $(BERT_MODEL_PATH)/pytorch_model.bin: $(BERT_MODEL_PATH)/bert_model.ckpt.index 76 | pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch $(BERT_MODEL_PATH)/bert_model.ckpt $(BERT_MODEL_PATH)/bert_config.json $(BERT_MODEL_PATH)/pytorch_model.bin 77 | 78 | $(DIGISAM_PATH)/de_corpus.txt: 79 | collectcorpus $(DIGISAM_PATH)/fulltext.sqlite3 $(DIGISAM_PATH)/selection_de.pkl $(DIGISAM_PATH)/de_corpus.txt --chunksize=10000 80 | 81 | $(BERT_MODEL_PATH)/epoch_0.json: $(DIGISAM_PATH)/de_corpus.txt $(BERT_MODEL_PATH)/pytorch_model.bin 82 | bert-pregenerate-trainingdata --train_corpus $(DIGISAM_PATH)/de_corpus.txt 
--output_dir $(BERT_MODEL_PATH) --bert_model $(BERT_MODEL_PATH) --reduce_memory --epochs $(EPOCHS) 83 | 84 | bert-digisam-unsupervised: $(BERT_MODEL_PATH)/epoch_0.json 85 | bert-finetune --pregenerated_data $(BERT_MODEL_PATH) --output_dir $(BERT_FINETUNED_PATH) --bert_model $(BERT_MODEL_PATH) --reduce_memory --fp16 --gradient_accumulation_steps 4 --train_batch_size 32 --epochs $(EPOCHS) --temp_prefix $(TEMP_PREFIX) 86 | 87 | bert-digisam-unsupervised-continued: $(BERT_FINETUNED_PATH) $(BERT_MODEL_PATH)/epoch_0.json 88 | bert-finetune --pregenerated_data $(BERT_MODEL_PATH) --output_dir $(BERT_FINETUNED_PATH) --bert_model $(BERT_FINETUNED_PATH) --reduce_memory --fp16 --gradient_accumulation_steps=$(GRAD_ACC_STEPS) --train_batch_size=$(BATCH_SIZE) --epochs $(EPOCHS) --temp_prefix $(TEMP_PREFIX) 89 | 90 | get-bert: $(BERT_MODEL_PATH)/bert_model.ckpt.index 91 | 92 | convert-bert: $(BERT_MODEL_PATH)/pytorch_model.bin 93 | 94 | ############################################################################### 95 | #NER ground truth: 96 | 97 | $(NER_DATA_PATH)/ner-corpora: 98 | git clone https://github.com/EuropeanaNewspapers/ner-corpora $(NER_DATA_PATH)/ner-corpora 99 | 100 | $(BUILD_PATH)/europeana_historic.pkl: $(NER_DATA_PATH)/ner-corpora 101 | compile_europeana_historic $(NER_DATA_PATH)/ner-corpora $(BUILD_PATH)/europeana_historic.pkl 102 | 103 | $(BUILD_PATH)/germ_eval.pkl: $(NER_DATA_PATH)/GermEval 104 | compile_germ_eval $(NER_DATA_PATH)/GermEval $(BUILD_PATH)/germ_eval.pkl 105 | 106 | $(BUILD_PATH)/conll2003.pkl: $(NER_DATA_PATH)/conll2003 107 | compile_conll $(NER_DATA_PATH)/conll2003 $(BUILD_PATH)/conll2003.pkl 108 | 109 | $(BUILD_PATH)/wikiner.pkl: $(NER_DATA_PATH)/wikiner 110 | compile_wikiner $(NER_DATA_PATH)/wikiner $(BUILD_PATH)/wikiner.pkl 111 | 112 | $(BUILD_PATH)/gt.pkl: 113 | join-gt $(BUILD_PATH)/germ_eval.pkl $(BUILD_PATH)/europeana_historic.pkl $(BUILD_PATH)/conll2003.pkl $(BUILD_PATH)/wikiner.pkl $(BUILD_PATH)/gt.pkl 114 | 115 | ner-ground-truth: dirs $(BUILD_PATH)/europeana_historic.pkl $(BUILD_PATH)/germ_eval.pkl $(BUILD_PATH)/conll2003.pkl $(BUILD_PATH)/wikiner.pkl $(BUILD_PATH)/gt.pkl 116 | 117 | ############################################################################### 118 | #BERT NER training: 119 | 120 | .PRECIOUS: %/vocab.txt %/bert_config.json %/$(MODEL_FILE) 121 | 122 | %/vocab.txt: 123 | ln -sfnr $(BERT_FINETUNED_PATH)/vocab.txt $(@D)/vocab.txt 124 | 125 | %/bert_config.json: 126 | ln -sfnr $(BERT_FINETUNED_PATH)/bert_config.json $(@D)/bert_config.json 127 | 128 | %/$(MODEL_FILE): 129 | ln -sfnr $(@D)/$(EPOCH_FILE) $(@D)/$(MODEL_FILE) 130 | 131 | ######################################## 132 | # baseline 133 | 134 | $(BUILD_PATH)/bert-conll2003-en-baseline/$(EPOCH_FILE): 135 | bert-ner --train_sets='EN-CONLL-TRAIN' --dev_sets='EN-CONLL-TESTA' --bert_model=bert-base-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 136 | 137 | $(BUILD_PATH)/bert-conll2003-de-baseline/$(EPOCH_FILE): 138 | bert-ner --train_sets='DE-CONLL-TRAIN' --dev_sets='DE-CONLL-TESTA' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 139 | 140 | $(BUILD_PATH)/bert-germ-eval-baseline/$(EPOCH_FILE): 141 | bert-ner --train_sets='GERM-EVAL-TRAIN' --dev_sets='GERM-EVAL-DEV' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 142 | 143 | 144 | $(BUILD_PATH)/bert-all-german-baseline/$(EPOCH_FILE): 145 | bert-ner --train_sets='GERM-EVAL-TRAIN|DE-CONLL-TRAIN' 
--dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 146 | 147 | 148 | $(BUILD_PATH)/bert-wikiner-baseline/$(EPOCH_FILE): 149 | bert-ner --train_sets='WIKINER-WP3' --dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 150 | 151 | 152 | $(BUILD_PATH)/bert-lft-baseline/$(EPOCH_FILE): 153 | bert-ner --train_sets='LFT' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 154 | 155 | $(BUILD_PATH)/bert-onb-baseline/$(EPOCH_FILE): 156 | bert-ner --train_sets='ONB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 157 | 158 | $(BUILD_PATH)/bert-sbb-baseline/$(EPOCH_FILE): 159 | bert-ner --train_sets='SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 160 | 161 | $(BUILD_PATH)/bert-lft-sbb-baseline/$(EPOCH_FILE): 162 | bert-ner --train_sets='LFT|SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 163 | 164 | $(BUILD_PATH)/bert-onb-sbb-baseline/$(EPOCH_FILE): 165 | bert-ner --train_sets='ONB|SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 166 | 167 | $(BUILD_PATH)/bert-onb-lft-baseline/$(EPOCH_FILE): 168 | bert-ner --train_sets='ONB|LFT' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 169 | 170 | 171 | bert-%-baseline: $(BUILD_PATH)/bert-%-baseline/$(EPOCH_FILE) $(BUILD_PATH)/bert-%-baseline/vocab.txt $(BUILD_PATH)/bert-%-baseline/bert_config.json $(BUILD_PATH)/bert-%-baseline/$(MODEL_FILE) ; 172 | 173 | bert-baseline: dirs ner-ground-truth bert-conll2003-en-baseline bert-conll2003-de-baseline bert-germ-eval-baseline bert-all-german-baseline bert-wikiner-baseline bert-lft-baseline bert-onb-baseline bert-sbb-baseline bert-lft-sbb-baseline bert-onb-sbb-baseline bert-onb-lft-baseline 174 | 175 | 176 | $(BUILD_PATH)/bert-lft-baseline/$(CROSS_VAL_FILE): 177 | bert-ner --do_cross_validation --train_sets='LFT' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 178 | 179 | $(BUILD_PATH)/bert-onb-baseline/$(CROSS_VAL_FILE): 180 | bert-ner --do_cross_validation --train_sets='ONB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 181 | 182 | $(BUILD_PATH)/bert-sbb-baseline/$(CROSS_VAL_FILE): 183 | bert-ner --do_cross_validation --train_sets='SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 184 | 185 | bert-cv-%-baseline: $(BUILD_PATH)/bert-%-baseline/$(CROSS_VAL_FILE) $(BUILD_PATH)/bert-%-baseline/vocab.txt $(BUILD_PATH)/bert-%-baseline/bert_config.json $(BUILD_PATH)/bert-%-baseline/$(MODEL_FILE) ; 186 | 187 | bert-cv-baseline: bert-cv-lft-baseline bert-cv-onb-baseline bert-cv-sbb-baseline 188 | 189 | ######################################## 190 | #de-finetuned 191 | 192 | $(BUILD_PATH)/bert-conll2003-de-finetuned/$(EPOCH_FILE): 193 | bert-ner --train_sets='DE-CONLL-TRAIN' --dev_sets='DE-CONLL-TESTA' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 194 | 195 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/$(EPOCH_FILE): 196 | bert-ner --train_sets='GERM-EVAL-TRAIN' --dev_sets='GERM-EVAL-DEV' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) 
$(BERT_NER_OPTIONS) >> $(@D).log 2<&1 197 | 198 | $(BUILD_PATH)/bert-all-german-de-finetuned/$(EPOCH_FILE): 199 | bert-ner --train_sets='GERM-EVAL-TRAIN|DE-CONLL-TRAIN' --dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 200 | 201 | $(BUILD_PATH)/bert-complete-de-finetuned/$(EPOCH_FILE): 202 | bert-ner --train_sets='GERM-EVAL-TRAIN|GERM-EVAL-DEV|DE-CONLL-TRAIN|DE-CONLL-DEV|SBB|ONB|LFT|DE-CONLL-TESTA|DE-CONLL-TESTB|GERM-EVAL-TEST' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 203 | 204 | $(BUILD_PATH)/bert-multilang-de-finetuned/$(EPOCH_FILE): 205 | bert-ner --train_sets='GERM-EVAL-DEV|GERM-EVAL-TEST|GERM-EVAL-TRAIN|SBB|ONB|LFT|BNF|KB|DE-CONLL-DEV|DE-CONLL-TESTA|DE-CONLL-TESTB|DE-CONLL-TRAIN|EN-CONLL-TESTA|EN-CONLL-TESTB|EN-CONLL-TRAIN' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 206 | 207 | 208 | $(BUILD_PATH)/bert-wikiner-de-finetuned/$(EPOCH_FILE): 209 | bert-ner --train_sets='WIKINER-WP3' --dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 210 | 211 | 212 | $(BUILD_PATH)/bert-lft-de-finetuned/$(EPOCH_FILE): 213 | bert-ner --train_sets='LFT' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 214 | 215 | $(BUILD_PATH)/bert-onb-de-finetuned/$(EPOCH_FILE): 216 | bert-ner --train_sets='ONB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 217 | 218 | $(BUILD_PATH)/bert-sbb-de-finetuned/$(EPOCH_FILE): 219 | bert-ner --train_sets='SBB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 220 | 221 | $(BUILD_PATH)/bert-lft-sbb-de-finetuned/$(EPOCH_FILE): 222 | bert-ner --train_sets='LFT|SBB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 223 | 224 | $(BUILD_PATH)/bert-onb-sbb-de-finetuned/$(EPOCH_FILE): 225 | bert-ner --train_sets='ONB|SBB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 226 | 227 | $(BUILD_PATH)/bert-onb-lft-de-finetuned/$(EPOCH_FILE): 228 | bert-ner --train_sets='ONB|LFT' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 229 | 230 | 231 | bert-%-de-finetuned: ner-ground-truth $(BUILD_PATH)/bert-%-de-finetuned/$(EPOCH_FILE) $(BUILD_PATH)/bert-%-de-finetuned/vocab.txt $(BUILD_PATH)/bert-%-de-finetuned/bert_config.json $(BUILD_PATH)/bert-%-de-finetuned/$(MODEL_FILE) ; 232 | 233 | bert-finetuned: dirs ner-ground-truth bert-conll2003-de-finetuned bert-germ-eval-de-finetuned bert-all-german-de-finetuned bert-wikiner-de-finetuned bert-lft-de-finetuned bert-onb-de-finetuned bert-sbb-de-finetuned bert-lft-sbb-de-finetuned bert-onb-sbb-de-finetuned bert-onb-lft-de-finetuned 234 | 235 | 236 | $(BUILD_PATH)/bert-lft-de-finetuned/$(CROSS_VAL_FILE): 237 | bert-ner --do_cross_validation --train_sets='LFT' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) # >> $(@D).log 2<&1 238 | 239 | $(BUILD_PATH)/bert-onb-de-finetuned/$(CROSS_VAL_FILE): 240 | bert-ner --do_cross_validation --train_sets='ONB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 241 | 242 | $(BUILD_PATH)/bert-sbb-de-finetuned/$(CROSS_VAL_FILE): 243 | bert-ner --do_cross_validation --train_sets='SBB' --bert_model=$(BERT_FINETUNED_PATH) 
--output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 244 | 245 | bert-cv-%-de-finetuned: $(BUILD_PATH)/bert-%-de-finetuned/$(CROSS_VAL_FILE) $(BUILD_PATH)/bert-%-de-finetuned/vocab.txt $(BUILD_PATH)/bert-%-de-finetuned/bert_config.json $(BUILD_PATH)/bert-%-de-finetuned/$(MODEL_FILE) ; 246 | 247 | bert-cv-de-finetuned: bert-cv-lft-de-finetuned bert-cv-onb-de-finetuned bert-cv-sbb-de-finetuned 248 | 249 | bert-cv: bert-cv-de-finetuned bert-cv-baseline 250 | 251 | bert-train: bert-finetuned bert-baseline 252 | 253 | ############################################################################### 254 | # Evaluation 255 | # 256 | 257 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTA.pkl: 258 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 259 | 260 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTB.pkl: 261 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 262 | 263 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-LFT.pkl: 264 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 265 | 266 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-SBB.pkl: 267 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 268 | 269 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-ONB.pkl: 270 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 271 | 272 | # 273 | 274 | 275 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-GERM-EVAL-TEST.pkl: 276 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 277 | 278 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-GERM-EVAL-TEST.pkl: 279 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 280 | 281 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-GERM-EVAL-TEST.pkl: 282 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 283 | 284 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-LFT.pkl: 285 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 286 | 287 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-SBB.pkl: 288 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 289 | 290 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-ONB.pkl: 291 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 292 | 293 | # 294 | 295 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTA.pkl: 296 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 297 | 298 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTA.pkl: 299 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 300 | 301 | # 302 | 303 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTB.pkl: 304 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) 
$(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 305 | 306 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTB.pkl: 307 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 308 | 309 | # 310 | 311 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-LFT.pkl: 312 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 313 | 314 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-SBB.pkl: 315 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 316 | 317 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-ONB.pkl: 318 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 319 | 320 | # 321 | 322 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-LFT.pkl: 323 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 324 | 325 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-SBB.pkl: 326 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 327 | 328 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-ONB.pkl: 329 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 330 | 331 | 332 | $(BUILD_PATH)/bert-lft-baseline/eval_results-ONB.pkl: 333 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 334 | 335 | $(BUILD_PATH)/bert-lft-baseline/eval_results-SBB.pkl: 336 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 337 | 338 | 339 | $(BUILD_PATH)/bert-onb-baseline/eval_results-LFT.pkl: 340 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 341 | 342 | $(BUILD_PATH)/bert-onb-baseline/eval_results-SBB.pkl: 343 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 344 | 345 | 346 | $(BUILD_PATH)/bert-sbb-baseline/eval_results-LFT.pkl: 347 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 348 | 349 | $(BUILD_PATH)/bert-sbb-baseline/eval_results-ONB.pkl: 350 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 351 | 352 | 353 | $(BUILD_PATH)/bert-lft-sbb-baseline/eval_results-ONB.pkl: 354 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 355 | 356 | $(BUILD_PATH)/bert-onb-sbb-baseline/eval_results-LFT.pkl: 357 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 358 | 359 | $(BUILD_PATH)/bert-onb-lft-baseline/eval_results-SBB.pkl: 360 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 361 | 362 | 363 | bert-ner-evaluation-baseline: dirs $(BUILD_PATH)/bert-all-german-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-SBB.pkl 
$(BUILD_PATH)/bert-wikiner-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-lft-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-onb-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-sbb-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-sbb-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-sbb-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-onb-sbb-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-lft-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-ONB.pkl 364 | 365 | ####################################### 366 | 367 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 368 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 369 | 370 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTB.pkl: 371 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 372 | 373 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-LFT.pkl: 374 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 375 | 376 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-SBB.pkl: 377 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 378 | 379 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-ONB.pkl: 380 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 381 | 382 | 383 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-GERM-EVAL-TEST.pkl: 384 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 385 | 386 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-GERM-EVAL-TEST.pkl: 387 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 388 | 389 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-GERM-EVAL-TEST.pkl: 390 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 391 | # 392 | 393 | 394 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-LFT.pkl: 395 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 396 | 397 | 
$(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-SBB.pkl: 398 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 399 | 400 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-ONB.pkl: 401 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 402 | 403 | # 404 | 405 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 406 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 407 | 408 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 409 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 410 | 411 | # 412 | 413 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTB.pkl: 414 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 415 | 416 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTB.pkl: 417 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 418 | 419 | # 420 | 421 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-LFT.pkl: 422 | bert-ner --dev_sets='LFT' --output_dir=$(BUILD_PATH)/bert-all-german-de-finetuned --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 423 | 424 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-SBB.pkl: 425 | bert-ner --dev_sets='SBB' --output_dir=$(BUILD_PATH)/bert-all-german-de-finetuned --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 426 | 427 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-ONB.pkl: 428 | bert-ner --dev_sets='ONB' --output_dir=$(BUILD_PATH)/bert-all-german-de-finetuned --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 429 | 430 | # 431 | 432 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-LFT.pkl: 433 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 434 | 435 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-SBB.pkl: 436 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 437 | 438 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-ONB.pkl: 439 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 440 | 441 | 442 | $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-ONB.pkl: 443 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 444 | 445 | $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-SBB.pkl: 446 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 447 | 448 | 449 | $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-LFT.pkl: 450 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 451 | 452 | $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-SBB.pkl: 453 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 454 | 455 | 456 | $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-LFT.pkl: 457 | bert-ner --dev_sets='LFT' --output_dir=$(@D) 
--num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 458 | 459 | $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-ONB.pkl: 460 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 461 | 462 | 463 | $(BUILD_PATH)/bert-lft-sbb-de-finetuned/eval_results-ONB.pkl: 464 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 465 | 466 | $(BUILD_PATH)/bert-onb-sbb-de-finetuned/eval_results-LFT.pkl: 467 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 468 | 469 | $(BUILD_PATH)/bert-onb-lft-de-finetuned/eval_results-SBB.pkl: 470 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 471 | 472 | 473 | bert-ner-evaluation-de-finetuned: dirs $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-sbb-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-onb-sbb-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-lft-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-ONB.pkl 474 | 475 | bert-evaluation: bert-ner-evaluation-baseline bert-ner-evaluation-de-finetuned 476 | 477 | ############################################################################### 478 | #wikipedia 479 | 480 | WIKI_DATA_DIR=data/wikipedia 481 | WP_EPOCH_SIZE=100000 482 | 483 | wikipedia-ner-baseline-train: $(WIKI_DATA_DIR)/wikipedia-tagged.csv 484 | bert-ner --gt_file=$(WIKI_DATA_DIR)/wikipedia-tagged.csv --train_sets=$(WIKI_DATA_DIR)/ner-train-index.pkl --dev_sets=$(WIKI_DATA_DIR)/ner-dev-index.pkl --bert_model=bert-base-multilingual-cased --task_name=wikipedia-ner --output_dir=$(BUILD_PATH)/wikipedia-baseline --num_train_epochs $(EPOCHS) --num_data_epochs=$(EPOCHS) 
--epoch_size=$(WP_EPOCH_SIZE) $(BERT_NER_OPTIONS) 485 | 486 | wikipedia-ner-de-finetuned-train: $(WIKI_DATA_DIR)/wikipedia-tagged.csv 487 | bert-ner --gt_file=$(WIKI_DATA_DIR)/wikipedia-tagged.csv --train_sets=$(WIKI_DATA_DIR)/ner-train-index.pkl --dev_sets=$(WIKI_DATA_DIR)/ner-dev-index.pkl --bert_model=$(DIGISAM_PATH)/BERT_de_finetuned --task_name=wikipedia-ner --output_dir=$(BUILD_PATH)/wikipedia-de-finetuned --num_train_epochs $(EPOCHS) --num_data_epochs=$(EPOCHS) --epoch_size=$(WP_EPOCH_SIZE) $(BERT_NER_OPTIONS) 488 | 489 | ######################## 490 | 491 | $(BUILD_PATH)/wikipedia-baseline/eval_results-LFT.pkl: 492 | bert-ner --dev_sets='LFT' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 493 | 494 | $(BUILD_PATH)/wikipedia-baseline/eval_results-SBB.pkl: 495 | bert-ner --dev_sets='SBB' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 496 | 497 | $(BUILD_PATH)/wikipedia-baseline/eval_results-ONB.pkl: 498 | bert-ner --dev_sets='ONB' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 499 | 500 | $(BUILD_PATH)/wikipedia-baseline/eval_results-DE-CONLL-TESTA.pkl: 501 | bert-ner --dev_sets='DE-CONLL-TESTA' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 502 | 503 | 504 | $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-LFT.pkl: 505 | bert-ner --dev_sets='LFT' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 506 | 507 | $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl: 508 | bert-ner --dev_sets='SBB' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 509 | 510 | $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 511 | bert-ner --dev_sets='DE-CONLL-TESTA' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 512 | 513 | wikipedia-baseline-evaluation: $(BUILD_PATH)/wikipedia-baseline/eval_results-SBB.pkl $(BUILD_PATH)/wikipedia-baseline/eval_results-LFT.pkl $(BUILD_PATH)/wikipedia-baseline/eval_results-ONB.pkl $(BUILD_PATH)/wikipedia-baseline/eval_results-DE-CONLL-TESTA.pkl 514 | 515 | wikipedia-evaluation: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-LFT.pkl 516 | wikipedia-evaluation2: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl 517 | wikipedia-evaluation3: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl 518 | 519 | ############################### 520 | 521 | model_archive: 522 | tar --exclude='*ep[1-6]*' --exclude='*eval*' --exclude='pytorch_model.bin' --exclude='*.pkl' -chzf models.tar.gz data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-all-german-baseline 523 | 524 | models_from_git_annex: 525 | cd data;git annex get konvens2019/build-wd_0.03/bert-all-german-de-finetuned 526 | cd data;git annex get konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned 527 | cd data;git annex get konvens2019/build-wd_0.03/bert-sbb-de-finetuned 528 | cd data;git annex get konvens2019/build-wd_0.03/bert-all-german-baseline 529 | 530 | 531 | 532 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![sbb-ner-demo example](.screenshots/sbb_ner_demo.png?raw=true) 2 | 3 | How the models have been obtained is described in our [paper](https://corpora.linguistik.uni-erlangen.de/data/konvens/proceedings/papers/KONVENS2019_paper_4.pdf). 
4 | 5 | *** 6 | 7 | # Installation: 8 | 9 | The recommended Python version is 3.11. 10 | Consider using [pyenv](https://github.com/pyenv/pyenv) if that Python version is not available on your system. 11 | 12 | Activate a virtual environment (virtualenv): 13 | ``` 14 | source venv/bin/activate 15 | ``` 16 | or (pyenv): 17 | ``` 18 | pyenv activate my-python-3.11-virtualenv 19 | ``` 20 | 21 | Update pip: 22 | ``` 23 | pip install -U pip 24 | ``` 25 | Install sbb_ner: 26 | ``` 27 | pip install git+https://github.com/qurator-spk/sbb_ner.git 28 | ``` 29 | Download the required models: https://qurator-data.de/sbb_ner/models.tar.gz 30 | 31 | Extract the model archive: 32 | ``` 33 | tar -xzf models.tar.gz 34 | ``` 35 | 36 | Copy the [config file](qurator/sbb_ner/webapp/config.json) into the working directory. 37 | Set the USE_CUDA environment variable to True/False depending on GPU availability. 38 | 39 | Run the webapp directly: 40 | 41 | ``` 42 | env CONFIG=config.json env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True/False flask run --host=0.0.0.0 43 | ``` 44 | 45 | For production purposes, rather use: 46 | ``` 47 | env CONFIG=config.json env USE_CUDA=True/False gunicorn --bind 0.0.0.0:5000 qurator.sbb_ner.webapp.wsgi:app 48 | ``` 49 | 50 | # Docker 51 | 52 | ## CPU-only: 53 | 54 | ``` 55 | docker build --build-arg http_proxy=$http_proxy -t qurator/webapp-ner-cpu -f Dockerfile.cpu . 56 | ``` 57 | 58 | ``` 59 | docker run -ti --rm=true --mount type=bind,source=data/konvens2019,target=/usr/src/qurator-sbb-ner/data/konvens2019 -p 5000:5000 qurator/webapp-ner-cpu 60 | ``` 61 | 62 | ## GPU: 63 | 64 | Make sure that your GPU is correctly set up and that nvidia-docker has been installed. 65 | 66 | ``` 67 | docker build --build-arg http_proxy=$http_proxy -t qurator/webapp-ner-gpu -f Dockerfile . 68 | ``` 69 | 70 | ``` 71 | docker run -ti --rm=true --mount type=bind,source=data/konvens2019,target=/usr/src/qurator-sbb-ner/data/konvens2019 -p 5000:5000 qurator/webapp-ner-gpu 72 | ``` 73 | 74 | The NER web interface is available at http://localhost:5000 . 75 | 76 | # REST Interface 77 | 78 | Get the available models: 79 | ``` 80 | curl http://localhost:5000/models 81 | ``` 82 | 83 | Output: 84 | 85 | ``` 86 | [ 87 | { 88 | "default": true, 89 | "id": 1, 90 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned", 91 | "name": "DC-SBB + CONLL + GERMEVAL" 92 | }, 93 | { 94 | "default": false, 95 | "id": 2, 96 | "model_dir": "data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned", 97 | "name": "DC-SBB + CONLL + GERMEVAL + SBB" 98 | }, 99 | { 100 | "default": false, 101 | "id": 3, 102 | "model_dir": "data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned", 103 | "name": "DC-SBB + SBB" 104 | }, 105 | { 106 | "default": false, 107 | "id": 4, 108 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-baseline", 109 | "name": "CONLL + GERMEVAL" 110 | } 111 | ] 112 | ``` 113 | 114 | Perform NER using model 1: 115 | 116 | ``` 117 | curl -d '{ "text": "Paris Hilton wohnt im Hilton Paris in Paris."
}' -H "Content-Type: application/json" http://localhost:5000/ner/1 118 | ``` 119 | 120 | Output: 121 | 122 | ``` 123 | [ 124 | [ 125 | { 126 | "prediction": "B-PER", 127 | "word": "Paris" 128 | }, 129 | { 130 | "prediction": "I-PER", 131 | "word": "Hilton" 132 | }, 133 | { 134 | "prediction": "O", 135 | "word": "wohnt" 136 | }, 137 | { 138 | "prediction": "O", 139 | "word": "im" 140 | }, 141 | { 142 | "prediction": "B-ORG", 143 | "word": "Hilton" 144 | }, 145 | { 146 | "prediction": "I-ORG", 147 | "word": "Paris" 148 | }, 149 | { 150 | "prediction": "O", 151 | "word": "in" 152 | }, 153 | { 154 | "prediction": "B-LOC", 155 | "word": "Paris" 156 | }, 157 | { 158 | "prediction": "O", 159 | "word": "." 160 | } 161 | ] 162 | ] 163 | ``` 164 | The JSON above is the expected input format of the 165 | [SBB named entity linking and disambiguation system](https://github.com/qurator-spk/sbb_ned). 166 | # Model-Training 167 | 168 | *** 169 | ## Preprocessing of NER ground-truth: 170 | 171 | 172 | ### compile_conll 173 | 174 | Read CONLL 2003 ner ground truth files from directory and 175 | write the outcome of the data parsing to some pandas DataFrame that is 176 | stored as pickle. 177 | 178 | #### Usage 179 | 180 | ``` 181 | compile_conll --help 182 | ``` 183 | 184 | ### compile_germ_eval 185 | 186 | Read germ eval .tsv files from directory and write the 187 | outcome of the data parsing to some pandas DataFrame that is stored as 188 | pickle. 189 | 190 | #### Usage 191 | 192 | ``` 193 | compile_germ_eval --help 194 | ``` 195 | 196 | ### compile_europeana_historic 197 | 198 | Read europeana historic ner ground truth .bio files from directory 199 | and write the outcome of the data parsing to some pandas 200 | DataFrame that is stored as pickle. 201 | 202 | #### Usage 203 | 204 | ``` 205 | compile_europeana_historic --help 206 | ``` 207 | 208 | 209 | ### compile_wikiner 210 | 211 | Read wikiner files from directory and write the outcome 212 | of the data parsing to some pandas DataFrame that is stored as pickle. 213 | 214 | #### Usage 215 | 216 | ``` 217 | compile_wikiner --help 218 | ``` 219 | 220 | *** 221 | ## Train BERT - NER model: 222 | 223 | ### bert-ner 224 | 225 | Perform BERT for NER supervised training and test/cross-validation. 226 | 227 | #### Usage 228 | 229 | ``` 230 | bert-ner --help 231 | ``` 232 | 233 | ## BERT-Pre-training: 234 | 235 | ### collectcorpus 236 | 237 | ``` 238 | collectcorpus --help 239 | 240 | Usage: collectcorpus [OPTIONS] FULLTEXT_FILE SELECTION_FILE CORPUS_FILE 241 | 242 | Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and 243 | write it to one big text file. 244 | 245 | FULLTEXT_FILE: The CSV or SQLITE3 file to read from. 246 | 247 | SELECTION_FILE: Consider only a subset of all pages that is defined by the 248 | DataFrame that is stored in . 249 | 250 | CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata. 251 | 252 | Options: 253 | --chunksize INTEGER Process the corpus in chunks of . 254 | default:10**4 255 | 256 | --processes INTEGER Number of parallel processes. default: 6 257 | --min-line-len INTEGER Lower bound of line length in output file. 258 | default:80 259 | 260 | --help Show this message and exit. 261 | 262 | ``` 263 | 264 | ### bert-pregenerate-trainingdata 265 | 266 | Generate data for BERT pre-training from a corpus text file where 267 | the documents are separated by an empty line (output of corpuscollect). 
268 | 269 | #### Usage 270 | 271 | ``` 272 | bert-pregenerate-trainingdata --help 273 | ``` 274 | 275 | ### bert-finetune 276 | 277 | Perform BERT pre-training on pre-generated data. 278 | 279 | #### Usage 280 | 281 | ``` 282 | bert-finetune --help 283 | ``` 284 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /doc/sbb_ner_model_card.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - pytorch 4 | - token-classification 5 | - sequence-tagger-model 6 | language: de 7 | datasets: 8 | - conll2003 9 | - germeval_14 10 | license: apache-2.0 11 | --- 12 | 13 | 14 | 15 | 16 | 17 | 18 | # Model Card for sbb_ner 19 | 20 | 21 | A BERT model trained on three German corpora containing contemporary and historical texts for named entity recognition tasks. It predicts the classes PER, LOC and ORG. 22 | Questions and comments about the model can be directed to Clemens Neudecker at clemens.neudecker@sbb.spk-berlin.de. 23 | 24 | 25 | 26 | 27 | # Table of Contents 28 | 29 | - [Model Card for sbb_ner](#model-card-for--model_id-) 30 | - [Table of Contents](#table-of-contents) 31 | - [Model Details](#model-details) 32 | - [Model Description](#model-description) 33 | - [Uses](#uses) 34 | - [Direct Use](#direct-use) 35 | - [Downstream Use [Optional]](#downstream-use-optional) 36 | - [Out-of-Scope Use](#out-of-scope-use) 37 | - [Bias, Risks, and Limitations](#bias-risks-and-limitations) 38 | - [Recommendations](#recommendations) 39 | - [Training Details](#training-details) 40 | - [Training Data](#training-data) 41 | - [Training Procedure](#training-procedure) 42 | - [Preprocessing](#preprocessing) 43 | - [Speeds, Sizes, Times](#speeds-sizes-times) 44 | - [Evaluation](#evaluation) 45 | - [Testing Data, Factors & Metrics](#testing-data-factors--metrics) 46 | - [Testing Data](#testing-data) 47 | - [Factors](#factors) 48 | - [Metrics](#metrics) 49 | - [Results](#results) 50 | - [Model Examination](#model-examination) 51 | - [Environmental Impact](#environmental-impact) 52 | - [Technical Specifications [optional]](#technical-specifications-optional) 53 | - [Model Architecture and Objective](#model-architecture-and-objective) 54 | - [Compute Infrastructure](#compute-infrastructure) 55 | - [Hardware](#hardware) 56 | - [Software](#software) 57 | - [Citation](#citation) 58 | - [Glossary [optional]](#glossary-optional) 59 | - [More Information [optional]](#more-information-optional) 60 | - [Model Card Authors [optional]](#model-card-authors-optional) 61 | - [Model Card Contact](#model-card-contact) 62 | - [How to Get Started with the Model](#how-to-get-started-with-the-model) 63 | 64 | 65 | # Model Details 66 | 67 | ## Model Description 68 | 69 | 70 | A BERT model trained on three German corpora containing contemporary and historical texts for named entity recognition tasks. 71 | It predicts the classes PER, LOC and ORG. 
72 | 73 | - **Developed by:** [Kai Labusch](kai.labusch@sbb.spk-berlin.de), [Clemens Neudecker](clemens.neudecker@sbb.spk-berlin.de), David Zellhöfer 74 | - **Shared by [Optional]:** Staatsbibliothek zu Berlin / Berlin State Library 75 | - **Model type:** Language model 76 | - **Language(s) (NLP):** de 77 | - **License:** apache-2.0 78 | - **Parent Model:** The BERT base multilingual cased model as provided by [Google](https://huggingface.co/bert-base-multilingual-cased) 79 | - **Resources for more information:** 80 | - [GitHub Repo](https://github.com/qurator-spk/sbb_ner) 81 | - [Associated Paper](https://konvens.org/proceedings/2019/papers/KONVENS2019_paper_4.pdf) 82 | 83 | # Uses 84 | 85 | 86 | 87 | ## Direct Use 88 | 89 | The model can be used directly to perform NER on historical German texts obtained by OCR from digitized documents. 90 | Supported entity types are PER, LOC and ORG. 91 | 92 | 93 | 94 | 95 | ## Downstream Use 96 | 97 | The model has been pre-trained on 2,300,000 pages of OCR text from the digitized collections of the Berlin State Library. 98 | It is therefore adapted to OCR-error-prone historical German texts and might be used for particular applications that involve such text material. 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | ## Out-of-Scope Use 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | # Bias, Risks, and Limitations 115 | 116 | 117 | 118 | The identification of named entities in historical and contemporary texts contributes to knowledge creation, aiming to enhance scientific research and the discoverability of information in digitized historical texts. The aim of the development of this model was to improve this knowledge creation process, an endeavour that is not for profit. The results of the applied model are freely accessible to the users of the digital collections of the Berlin State Library. Against this backdrop, ethical challenges cannot be identified. As a limitation, it has to be noted that there is considerable performance to be gained for historical text by adding more historical ground-truth data. 119 | 120 | 121 | ## Recommendations 122 | 123 | 124 | 125 | The general observation that historical texts often remain silent about, and avoid naming, subjects from the colonies and address them anonymously cannot be remedied by named entity recognition. Disambiguation of named entities proves to be challenging beyond the task of automatically identifying them. Broad variation in the spelling of person and place names, caused by non-normalized orthography and linguistic change, as well as context-dependent changes in the naming of places, adds to this challenge. Historical texts, especially newspapers, contain narrative descriptions and visual representations of minorities and disadvantaged groups without naming them; de-anonymizing such persons and groups is a research task in itself, which has only begun to be tackled in the 2020s.
126 | 127 | 128 | # Training Details 129 | 130 | ## Training Data 131 | 132 | 133 | 134 | 1) CoNLL 2003 German Named Entity Recognition Ground Truth (Tjong Kim Sang and De Meulder, 2003) 135 | 2) GermEval Konvens 2014 Shared Task Data (Benikova et al., 2014) 136 | 3) DC-SBB Digital Collections of the Berlin State Library (Labusch and Zellhöfer, 2019) 137 | 4) Europeana Newspapers Historic German Datasets (Neudecker, 2016) 138 | 139 | 140 | ## Training Procedure 141 | 142 | 143 | 144 | The BERT model is trained directly on the NER task, using the same method that has been proposed by the BERT authors (Devlin et al., 2018). We applied unsupervised pre-training on 2,333,647 pages of unlabeled historical German text from the Berlin State Library digital collections, and supervised pre-training on two datasets with contemporary German text, conll2003 and germeval_14. Unsupervised and supervised pre-training are combined, with the unsupervised step on the DC-SBB data performed first and the supervised step on contemporary NER ground truth performed second. Performance on different combinations of training and test sets was explored, and a 5-fold cross-validation as well as a comparison with state-of-the-art approaches was conducted. 145 | 146 | ### Preprocessing 147 | 148 | The model was pre-trained on 2,300,000 pages of German text from the digitized collections of the Berlin State Library. 149 | The texts have been obtained by OCR from the page scans of the documents. 150 | 151 | ### Speeds, Sizes, Times 152 | 153 | 154 | 155 | Since the model is an incarnation of the original BERT model published by Google, all speed, size and time considerations of that original model hold. 156 | 157 | # Evaluation 158 | 159 | 160 | The model has been evaluated by 5-fold cross-validation on several German historical OCR ground truth datasets. 161 | See the publication for details. 162 | 163 | ## Testing Data, Factors & Metrics 164 | 165 | ### Testing Data 166 | 167 | 168 | 169 | Two different test sets contained in the CoNLL 2003 German Named Entity Recognition Ground Truth, 170 | i.e. TEST-A and TEST-B, have been used for testing (DE-CoNLL-TEST). 171 | Additionally, historical OCR-based ground truth datasets have been used for testing - see the publication for details. 172 | 173 | 174 | ### Factors 175 | 176 | 177 | 178 | The evaluation focuses on NER in historical German documents; see the publication for details. 179 | 180 | ### Metrics 181 | 182 | 183 | 184 | The performance metrics used in the evaluation are precision, recall and F1-score. 185 | See the paper for the actual results in terms of these metrics. 186 | 187 | ## Results 188 | 189 | See publication. 190 | 191 | # Model Examination 192 | 193 | See publication. 194 | 195 | # Environmental Impact 196 | 197 | 198 | 199 | Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 200 | 201 | - **Hardware Type:** V100 202 | - **Hours used:** Roughly 1-2 weeks for pre-training. Roughly 1 hour for the final NER training. 203 | - **Cloud Provider:** No cloud. 204 | - **Compute Region:** Germany. 205 | - **Carbon Emitted:** More information needed 206 | 207 | # Technical Specifications [optional] 208 | 209 | ## Model Architecture and Objective 210 | 211 | See original BERT publication. 212 | 213 | ## Compute Infrastructure 214 | 215 | Training and pre-training have been performed on a single V100.
216 | 217 | ### Hardware 218 | 219 | See above. 220 | 221 | ### Software 222 | 223 | See published code on github. 224 | 225 | # Citation 226 | 227 | 228 | 229 | **BibTeX:** 230 | 231 | @article{labusch_bert_2019, 232 | title = {{BERT} for {Named} {Entity} {Recognition} in {Contemporary} and {Historical} {German}}, 233 | volume = {Conference on Natural Language Processing}, 234 | url = {https://konvens.org/proceedings/2019/papers/KONVENS2019_paper_4.pdf}, 235 | abstract = {We apply a pre-trained transformer based representational language model, i.e. BERT (Devlin et al., 2018), to named entity recognition (NER) in contemporary and historical German text and observe state of the art performance for both text categories. We further improve the recognition performance for historical German by unsupervised pre-training on a large corpus of historical German texts of the Berlin State Library and show that best performance for historical German is obtained by unsupervised pre-training on historical German plus supervised pre-training with contemporary NER ground-truth.}, 236 | language = {en}, 237 | author = {Labusch, Kai and Neudecker, Clemens and Zellhöfer, David}, 238 | year = {2019}, 239 | pages = {9}, 240 | } 241 | 242 | **APA:** 243 | 244 | (Labusch et al., 2019) 245 | 246 | # Glossary [optional] 247 | 248 | 249 | 250 | More information needed 251 | 252 | # More Information [optional] 253 | 254 | More information needed 255 | 256 | # Model Card Authors [optional] 257 | 258 | 259 | 260 | [Kai Labusch](kai.labusch@sbb.spk-berlin.de) and [Jörg Lehmann](joerg.lehmann@sbb.spk-berlin.de) 261 | 262 | 263 | # Model Card Contact 264 | 265 | Questions and comments about the model can be directed to Clemens Neudecker at clemens.neudecker@sbb.spk-berlin.de, 266 | questions and comments about the model card can be directed to Jörg Lehmann at joerg.lehmann@sbb.spk-berlin.de 267 | 268 | # How to Get Started with the Model 269 | 270 | Use the code below to get started with the model. 271 | 272 |
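Below is a minimal sketch of loading a fine-tuned model directly in Python, mirroring the loading and prediction code in `qurator/sbb_ner/models/bert.py` (`model_eval` / `model_predict`). It is illustrative only: the directory name `my_model_dir`, the epoch number `7` and the example sentence are placeholders; an actual model directory written by `bert-ner` contains `model_config.json`, the BERT config file and per-epoch weight files named `pytorch_model_ep<N>.bin`.

```
import json
import os
import torch

from pytorch_pretrained_bert.modeling import BertConfig, BertForTokenClassification, CONFIG_NAME

from qurator.sbb_ner.models.tokenization import BertTokenizer
from qurator.sbb_ner.models.bert import get_device, model_predict
from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, InputExample

model_dir = "my_model_dir"  # placeholder: directory written by bert-ner
epoch = 7                   # placeholder: pick one of the stored training epochs

device, n_gpu = get_device(no_cuda=True)  # force CPU for this sketch

# model_config.json stores the BERT variant, casing, sequence length and label map
# (written by model_train in qurator/sbb_ner/models/bert.py).
model_config = json.load(open(os.path.join(model_dir, "model_config.json"), "r"))

label_to_id = model_config['label_map']
id_to_label = {v: k for k, v in label_to_id.items()}

tokenizer = BertTokenizer.from_pretrained(model_config['bert_model'],
                                          do_lower_case=model_config['do_lower'])

config = BertConfig(os.path.join(model_dir, CONFIG_NAME))
model = BertForTokenClassification(config, num_labels=len(label_to_id))
model.load_state_dict(torch.load(os.path.join(model_dir, "pytorch_model_ep{}.bin".format(epoch)),
                                 map_location=lambda storage, loc: storage))
model.to(device)
model.eval()

# Wrap a pre-tokenized sentence as an InputExample; the dummy 'O' labels are only
# needed because the feature conversion expects one label per word.
words = ["Paris", "Hilton", "wohnt", "im", "Hilton", "Paris", "in", "Paris", "."]
example = InputExample(guid="demo-0", text_a=words, text_b=None, label=["O"] * len(words))

data_loader = NerProcessor.make_data_loader([example], batch_size=8, local_rank=-1,
                                            label_map=label_to_id,
                                            max_seq_length=model_config['max_seq_length'],
                                            tokenizer=tokenizer, sequential=True)

predictions = model_predict(data_loader, device, id_to_label, model)

# model_predict yields one tag per BERT word piece, so the output can be longer than
# the input word list when words are split; realignment with the original words may
# still be required.
print(predictions[0])
```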
273 | How to get started with this model is explained in the README file of the GitHub repository [over here](https://github.com/qurator-spk/sbb_ner). 274 |
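In addition to the command-line workflow described there, a running instance of the Flask demo can be queried programmatically. The sketch below is an illustration only: it posts to the `/ner/1` endpoint on port 5000 shown in the README's curl example; the name of the `text` field in the request body is an assumption about the demo's request schema, while the response format is the nested list of `word`/`prediction` pairs documented in the README.

```
import requests

# Assumes the demo web service from the README is running locally on port 5000.
url = "http://localhost:5000/ner/1"

# The 'text' field name is an assumption about the request schema of the demo app.
payload = {"text": "Paris Hilton wohnt im Hilton Paris in Paris."}

response = requests.post(url, json=payload)  # sends Content-Type: application/json
response.raise_for_status()

# The service answers with one list of {"word", "prediction"} objects per sentence.
for sentence in response.json():
    for token in sentence:
        print(token["word"], token["prediction"])
```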
275 | -------------------------------------------------------------------------------- /qurator/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/conll.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import codecs 4 | import os 5 | 6 | 7 | def read_gt(files, datasets): 8 | sentence_number = 300000 9 | gt_data = list() 10 | 11 | for filename, dataset in zip(files, datasets): 12 | gt_lines = [l.strip() for l in codecs.open(filename, 'r', 'latin-1')] 13 | 14 | word_number = 0 15 | 16 | for li in gt_lines: 17 | 18 | if li == '': 19 | 20 | if word_number > 0: 21 | 22 | sentence_number += 1 23 | word_number = 0 24 | 25 | continue 26 | 27 | if li.startswith('-DOCSTART-'): 28 | continue 29 | 30 | parts = li.split() 31 | 32 | if len(parts) == 5: 33 | word, _, _, _, tag = li.split() 34 | else: 35 | word, _, _, tag = li.split() 36 | 37 | tag = tag.upper() 38 | tag = tag.replace('_', '-') 39 | tag = tag.replace('.', '-') 40 | 41 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 42 | tag = 'O' 43 | 44 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 45 | 46 | word_number += 1 47 | 48 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 49 | 50 | 51 | @click.command() 52 | @click.argument('path-to-conll', type=click.Path(exists=True), required=True, nargs=1) 53 | @click.argument('conll-ground-truth-file', type=click.Path(), required=True, nargs=1) 54 | def main(path_to_conll, conll_ground_truth_file): 55 | """ 56 | Read CONLL 2003 ner ground truth files from directory and 57 | write the outcome of the data parsing to some pandas DataFrame 58 | that is stored as pickle in file . 
59 | """ 60 | 61 | os.makedirs(os.path.dirname(conll_ground_truth_file), exist_ok=True) 62 | 63 | gt_all = read_gt(['{}/deu.dev'.format(path_to_conll), 64 | '{}/deu.testa'.format(path_to_conll), 65 | '{}/deu.testb'.format(path_to_conll), 66 | '{}/deu.train'.format(path_to_conll), 67 | '{}/eng.testa'.format(path_to_conll), 68 | '{}/eng.testb'.format(path_to_conll), 69 | '{}/eng.train'.format(path_to_conll)], 70 | ['DE-CONLL-DEV', 'DE-CONLL-TESTA', 'DE-CONLL-TESTB', 'DE-CONLL-TRAIN', 71 | 'EN-CONLL-TESTA', 'EN-CONLL-TESTB', 'EN-CONLL-TRAIN']) 72 | 73 | gt_all.to_pickle(conll_ground_truth_file) 74 | 75 | 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/data_processor.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import json 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import torch 10 | 11 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 12 | TensorDataset, Dataset) 13 | from torch.utils.data.distributed import DistributedSampler 14 | 15 | 16 | class InputExample(object): 17 | """A single training/test example for simple sequence classification.""" 18 | 19 | def __init__(self, guid, text_a, text_b=None, label=None): 20 | """Constructs a InputExample. 21 | 22 | Args: 23 | guid: Unique id for the example. 24 | text_a: string. The untokenized text of the first sequence. For single 25 | sequence tasks, only this sequence must be specified. 26 | text_b: (Optional) string. The untokenized text of the second sequence. 27 | Only must be specified for sequence pair tasks. 28 | label: (Optional) string. The label of the example. This should be 29 | specified for train and dev examples, but not for test examples. 
30 | """ 31 | self.guid = guid 32 | self.text_a = text_a 33 | self.text_b = text_b 34 | self.label = label 35 | 36 | 37 | class InputFeatures(object): 38 | """A single set of features of data.""" 39 | 40 | def __init__(self, guid, input_ids, input_mask, segment_ids, label_id, tokens): 41 | self.guid = guid 42 | self.input_ids = input_ids 43 | self.input_mask = input_mask 44 | self.segment_ids = segment_ids 45 | self.label_id = label_id 46 | self.tokens = tokens 47 | 48 | 49 | class WikipediaDataset(Dataset): 50 | """ 51 | """ 52 | 53 | def __init__(self, set_file, gt_file, data_epochs, epoch_size, 54 | label_map, tokenizer, max_seq_length, 55 | queue_size=1000, no_entity_fraction=0.0, seed=23, 56 | min_sen_len=10, min_article_len=20): 57 | 58 | self._set_file = set_file 59 | self._subset = pd.read_pickle(set_file) 60 | self._gt_file = gt_file 61 | self._data_epochs = data_epochs 62 | self._epoch_size = epoch_size 63 | self._label_map = label_map 64 | self._tokenizer = tokenizer 65 | self._max_seq_length = max_seq_length 66 | self._queue_size = queue_size 67 | self._no_entity_fraction = no_entity_fraction 68 | self._seed = seed 69 | self._min_sen_len = min_sen_len 70 | self._min_article_len = min_article_len 71 | 72 | self._queue = None 73 | self._data_sequence = None 74 | self._counter = None 75 | # noinspection PyUnresolvedReferences 76 | self._random_state = np.random.RandomState(seed=self._seed) 77 | 78 | self._features = [] 79 | 80 | self._reset() 81 | 82 | return 83 | 84 | def _next_sample_should_have_entities(self): 85 | 86 | if self._no_entity_fraction <= 0.0: 87 | return True 88 | 89 | return int(self._counter) % int(1.0 / self._no_entity_fraction) != 0 90 | 91 | def _get_features(self): 92 | 93 | if self._counter > self._data_epochs * self._epoch_size: 94 | self._reset() 95 | 96 | while True: 97 | 98 | # get next random sentence 99 | sen_words, sen_tags = self._queue_next() 100 | 101 | if len(sen_words) < self._min_sen_len: # Skip all sentences that are to short. 
102 | continue 103 | 104 | if self._has_entities(sen_tags): 105 | 106 | if not self._next_sample_should_have_entities(): # Skip sample if next sample is supposed to 107 | # be a no-entity sample 108 | continue 109 | else: 110 | if self._next_sample_should_have_entities(): # Skip sample if next sample is supposed to be a entity 111 | # sample 112 | continue 113 | break 114 | 115 | sample = InputExample(guid="%s-%s" % (self._set_file, self._counter), 116 | text_a=sen_words, text_b=None, label=sen_tags) 117 | 118 | return [fe for fe in 119 | convert_examples_to_features(sample, self._label_map, self._max_seq_length, self._tokenizer)] 120 | 121 | def __getitem__(self, index): 122 | 123 | del index 124 | 125 | if len(self._features) == 0: 126 | self._features = self._get_features() 127 | 128 | fe = self._features.pop() 129 | 130 | self._counter += 1 131 | 132 | return torch.tensor(fe.input_ids, dtype=torch.long), \ 133 | torch.tensor(fe.input_mask, dtype=torch.long), \ 134 | torch.tensor(fe.segment_ids, dtype=torch.long), \ 135 | torch.tensor(fe.label_id, dtype=torch.long) 136 | 137 | def __len__(self): 138 | 139 | return int(self._epoch_size) 140 | 141 | def _reset(self): 142 | 143 | # print('================= WikipediaDataset:_reset ====================== ') 144 | 145 | self._queue = list() 146 | self._data_sequence = self._sequence() 147 | self._counter = 0 148 | # noinspection PyUnresolvedReferences 149 | # self._random_state = np.random.RandomState(seed=self._seed) 150 | 151 | for _ in range(0, self._queue_size): 152 | self._queue.append(list()) 153 | 154 | def _sequence(self): 155 | 156 | while True: 157 | 158 | for row in pd.read_csv(self._gt_file, chunksize=1, sep=';'): 159 | 160 | page_id = row.page_id.iloc[0] 161 | text = row.text.iloc[0] 162 | tags = row.tags.iloc[0] 163 | 164 | if page_id not in self._subset.index: 165 | continue 166 | 167 | sentences = [(sen_text, sen_tag) for sen_text, sen_tag in zip(json.loads(text), json.loads(tags))] 168 | 169 | if len(sentences) < self._min_article_len: # Skip very short articles. 
170 | continue 171 | 172 | print(page_id) 173 | 174 | yield sentences 175 | 176 | def _queue_next(self): 177 | 178 | nqueue = self._random_state.randint(len(self._queue)) 179 | 180 | while len(self._queue[nqueue]) <= 0: 181 | self._queue[nqueue] = next(self._data_sequence) 182 | 183 | return self._queue[nqueue].pop() 184 | 185 | @staticmethod 186 | def _has_entities(sen_tags): 187 | 188 | for t in sen_tags: 189 | 190 | if t != 'O': 191 | return True 192 | 193 | return False 194 | 195 | 196 | class DataProcessor(object): 197 | """Base class for data converters for sequence classification data sets.""" 198 | 199 | def get_train_examples(self, batch_size, local_rank): 200 | """Gets a collection of `InputExample`s for the train set.""" 201 | raise NotImplementedError() 202 | 203 | def get_dev_examples(self, batch_size, local_rank): 204 | """Gets a collection of `InputExample`s for the dev set.""" 205 | raise NotImplementedError() 206 | 207 | def get_labels(self): 208 | """Gets the list of labels for this data set.""" 209 | raise NotImplementedError() 210 | 211 | def get_evaluation_file(self): 212 | raise NotImplementedError() 213 | 214 | 215 | class WikipediaNerProcessor(DataProcessor): 216 | 217 | def __init__(self, train_sets, dev_sets, test_sets, gt_file, max_seq_length, tokenizer, 218 | data_epochs, epoch_size, **kwargs): 219 | del kwargs 220 | 221 | self._max_seq_length = max_seq_length 222 | self._tokenizer = tokenizer 223 | self._train_set_file = train_sets 224 | self._dev_set_file = dev_sets 225 | self._test_set_file = test_sets 226 | self._gt_file = gt_file 227 | self._data_epochs = data_epochs 228 | self._epoch_size = epoch_size 229 | 230 | def get_train_examples(self, batch_size, local_rank): 231 | """See base class.""" 232 | 233 | return self._make_data_loader(self._train_set_file, batch_size, local_rank) 234 | 235 | def get_dev_examples(self, batch_size, local_rank): 236 | """See base class.""" 237 | 238 | return self._make_data_loader(self._dev_set_file, batch_size, local_rank) 239 | 240 | def get_labels(self): 241 | """See base class.""" 242 | 243 | labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "X", "[CLS]", "[SEP]"] 244 | 245 | return {label: i for i, label in enumerate(labels)} 246 | 247 | def get_evaluation_file(self): 248 | dev_set_name = os.path.splitext(os.path.basename(self._dev_set_file))[0] 249 | 250 | return "eval_results-{}.pkl".format(dev_set_name) 251 | 252 | def _make_data_loader(self, set_file, batch_size, local_rank): 253 | del local_rank 254 | 255 | data = WikipediaDataset(set_file=set_file, gt_file=self._gt_file, 256 | data_epochs=self._data_epochs, epoch_size=self._epoch_size, 257 | label_map=self.get_labels(), tokenizer=self._tokenizer, 258 | max_seq_length=self._max_seq_length) 259 | 260 | sampler = SequentialSampler(data) 261 | 262 | return DataLoader(data, sampler=sampler, batch_size=batch_size) 263 | 264 | 265 | class NerProcessor(DataProcessor): 266 | 267 | def __init__(self, train_sets, dev_sets, test_sets, max_seq_length, tokenizer, 268 | label_map=None, gt=None, gt_file=None, **kwargs): 269 | 270 | del kwargs 271 | 272 | self._max_seg_length = max_seq_length 273 | self._tokenizer = tokenizer 274 | self._train_sets = set(train_sets.split('|')) if train_sets is not None else set() 275 | self._dev_sets = set(dev_sets.split('|')) if dev_sets is not None else set() 276 | self._test_sets = set(test_sets.split('|')) if test_sets is not None else set() 277 | 278 | self._gt = gt 279 | 280 | if self._gt is None: 281 | self._gt = 
pd.read_pickle(gt_file) 282 | 283 | self._label_map = label_map 284 | 285 | print('TRAIN SETS: ', train_sets) 286 | print('DEV SETS: ', dev_sets) 287 | print('TEST SETS: ', test_sets) 288 | 289 | def get_train_examples(self, batch_size, local_rank): 290 | """See base class.""" 291 | 292 | return self.make_data_loader( 293 | self.create_examples(self._read_lines(self._train_sets), "train"), batch_size, local_rank, 294 | self.get_labels(), self._max_seg_length, self._tokenizer) 295 | 296 | def get_dev_examples(self, batch_size, local_rank): 297 | """See base class.""" 298 | return self.make_data_loader( 299 | self.create_examples(self._read_lines(self._dev_sets), "dev"), batch_size, local_rank, 300 | self.get_labels(), self._max_seg_length, self._tokenizer) 301 | 302 | def get_labels(self): 303 | """See base class.""" 304 | 305 | if self._label_map is not None: 306 | return self._label_map 307 | 308 | gt = self._gt 309 | gt = gt.loc[gt.dataset.isin(self._train_sets.union(self._dev_sets).union(self._test_sets))] 310 | 311 | labels = sorted(gt.tag.unique().tolist()) + ["X", "[CLS]", "[SEP]"] 312 | 313 | self._label_map = {label: i for i, label in enumerate(labels, 1)} 314 | 315 | self._label_map['UNK'] = 0 316 | 317 | return self._label_map 318 | 319 | def get_evaluation_file(self): 320 | 321 | return "eval_results-{}.pkl".format("-".join(sorted(self._dev_sets))) 322 | 323 | @staticmethod 324 | def create_examples(lines, set_type): 325 | 326 | for i, (sentence, label) in enumerate(lines): 327 | guid = "%s-%s" % (set_type, i) 328 | text_a = sentence 329 | text_b = None 330 | label = label 331 | 332 | yield InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label) 333 | 334 | @staticmethod 335 | def make_data_loader(examples, batch_size, local_rank, label_map, max_seq_length, tokenizer, features=None, 336 | sequential=False): 337 | 338 | if features is None: 339 | features = [fe for ex in examples for fe in 340 | convert_examples_to_features(ex, label_map, max_seq_length, tokenizer)] 341 | 342 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 343 | all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) 344 | all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) 345 | all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) 346 | 347 | data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) 348 | 349 | if local_rank == -1: 350 | if sequential: 351 | train_sampler = SequentialSampler(data) 352 | else: 353 | train_sampler = RandomSampler(data) 354 | else: 355 | if sequential: 356 | train_sampler = SequentialSampler(data) 357 | else: 358 | train_sampler = DistributedSampler(data) 359 | 360 | return DataLoader(data, sampler=train_sampler, batch_size=batch_size) 361 | 362 | def _read_lines(self, sets): 363 | 364 | gt = self._gt 365 | gt = gt.loc[gt.dataset.isin(sets)] 366 | 367 | data = list() 368 | for i, sent in gt.groupby('nsentence'): 369 | 370 | sent = sent.sort_values('nword', ascending=True) 371 | 372 | data.append((sent.word.tolist(), sent.tag.tolist())) 373 | 374 | return data 375 | 376 | 377 | def convert_examples_to_features(example, label_map, max_seq_len, tokenizer): 378 | """ 379 | :param example: instance of InputExample 380 | :param label_map: Maps labels like B-ORG ... to numbers (ids). 381 | :param max_seq_len: Maximum length of sequences to be delivered to the model. 
382 | :param tokenizer: BERT-Tokenizer 383 | :return: 384 | """ 385 | tokens = [] 386 | labels = [] 387 | 388 | for i, word in enumerate(example.text_a): # example.text_a is a sequence of words 389 | 390 | token = tokenizer.tokenize(word) 391 | 392 | # import ipdb;ipdb.set_trace() 393 | 394 | tokens.extend(token) 395 | 396 | label_1 = example.label[i] if i < len(example.label) else 'O' 397 | 398 | for m in range(len(token)): # a word might have been split into several tokens 399 | if m == 0: 400 | labels.append(label_1) 401 | else: 402 | labels.append("X") 403 | 404 | start_pos = 0 405 | while start_pos < len(tokens): 406 | 407 | window_len = min(max_seq_len - 2, len(tokens) - start_pos) # -2 since we also need [CLS] and [SEP] 408 | 409 | # Make sure that we do not split the sentence within a word. 410 | while window_len > 1 and start_pos + window_len < len(tokens) and\ 411 | tokens[start_pos + window_len].startswith('##'): 412 | window_len -= 1 413 | 414 | if window_len == 1: 415 | window_len = min(max_seq_len - 2, len(tokens) - start_pos) 416 | 417 | token_window = tokens[start_pos:start_pos+window_len] 418 | start_pos += window_len 419 | 420 | augmented_tokens = ["[CLS]"] + token_window + ["[SEP]"] 421 | 422 | input_ids = tokenizer.convert_tokens_to_ids(augmented_tokens) + max(0, max_seq_len - len(augmented_tokens))*[0] 423 | 424 | input_mask = [1] * len(augmented_tokens) + max(0, max_seq_len - len(augmented_tokens))*[0] 425 | 426 | segment_ids = [0] + len(token_window) * [0] + [0] + max(0, max_seq_len - len(augmented_tokens))*[0] 427 | 428 | label_ids = [label_map["[CLS]"]] + [label_map[labels[i]] for i in range(len(token_window))] + \ 429 | [label_map["[SEP]"]] + max(0, max_seq_len - len(augmented_tokens)) * [0] 430 | 431 | assert len(input_ids) == max_seq_len 432 | assert len(input_mask) == max_seq_len 433 | assert len(segment_ids) == max_seq_len 434 | assert len(label_ids) == max_seq_len 435 | 436 | yield InputFeatures(guid=example.guid, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, 437 | label_id=label_ids, tokens=augmented_tokens) 438 | 439 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/europeana_historic.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import click 4 | import os 5 | 6 | 7 | def read_gt(files, datasets): 8 | sentence_number = 100000 9 | sentence = '' 10 | gt_data = list() 11 | 12 | for filename, dataset in zip(files, datasets): 13 | gt_lines = [l.strip() for l in open(filename) if not l.startswith('<--')] 14 | 15 | word_number = 0 16 | 17 | for l in gt_lines: 18 | 19 | try: 20 | word, tag = l.split(' ') 21 | except ValueError: 22 | word = l.replace(' ', '_') 23 | tag = 'O' 24 | 25 | tag = tag.upper() 26 | 27 | tag = tag.replace('_', '-') 28 | tag = tag.replace('.', '-') 29 | 30 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 31 | tag = 'O' 32 | 33 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 34 | 35 | if re.match(r'.*[.|?|!]$', word) \ 36 | and not re.match(r'[0-9]+[.]$', word) \ 37 | and not re.match(r'.*[0-9]+\s*$', sentence)\ 38 | and not re.match(r'.*\s+[\S]{1,2}$', sentence): 39 | 40 | sentence_number += 1 41 | sentence = '' 42 | word_number = 0 43 | else: 44 | word_number += 1 45 | sentence += ' ' + word 46 | 47 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 48 | 49 | 50 | @click.command() 51 | 
@click.argument('path-to-ner-corpora', type=click.Path(exists=True), required=True, nargs=1) 52 | @click.argument('ner-ground-truth-file', type=click.Path(), required=True, nargs=1) 53 | def main(path_to_ner_corpora, ner_ground_truth_file): 54 | """ 55 | Read europeana historic ner ground truth .bio files from directory and 56 | write the outcome of the data parsing to some pandas DataFrame 57 | that is stored as pickle in file . 58 | """ 59 | 60 | os.makedirs(os.path.dirname(ner_ground_truth_file), exist_ok=True) 61 | 62 | gt_all = read_gt(['{}/enp_DE.sbb.bio/enp_DE.sbb.bio'.format(path_to_ner_corpora), 63 | '{}/enp_DE.onb.bio/enp_DE.onb.bio'.format(path_to_ner_corpora), 64 | '{}/enp_DE.lft.bio/enp_DE.lft.bio'.format(path_to_ner_corpora), 65 | '{}/enp_FR.bnf.bio/enp_FR.bnf.bio'.format(path_to_ner_corpora), 66 | '{}/enp_NL.kb.bio/enp_NL.kb.bio'.format(path_to_ner_corpora)], 67 | ['SBB', 'ONB', 'LFT', 'BNF', 'KB']) 68 | 69 | gt_all.to_pickle(ner_ground_truth_file) 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/germeval.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import os 4 | 5 | 6 | def read_gt(files, datasets): 7 | sentence_number = 200000 8 | gt_data = list() 9 | 10 | for filename, dataset in zip(files, datasets): 11 | gt_lines = [l.strip() for l in open(filename)] 12 | 13 | word_number = 0 14 | 15 | for li in gt_lines: 16 | 17 | if li == '': 18 | 19 | if word_number > 0: 20 | sentence_number += 1 21 | word_number = 0 22 | 23 | continue 24 | 25 | if li.startswith('#'): 26 | continue 27 | 28 | _, word, tag, _ = li.split() 29 | 30 | tag = tag.upper() 31 | tag = tag.replace('_', '-') 32 | tag = tag.replace('.', '-') 33 | 34 | if len(tag) > 5: 35 | tag = tag[0:5] 36 | 37 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 38 | tag = 'O' 39 | 40 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 41 | 42 | word_number += 1 43 | 44 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 45 | 46 | 47 | @click.command() 48 | @click.argument('path-to-germ-eval', type=click.Path(exists=True), required=True, nargs=1) 49 | @click.argument('germ-eval-ground-truth-file', type=click.Path(), required=True, nargs=1) 50 | def main(path_to_germ_eval, germ_eval_ground_truth_file): 51 | """ 52 | Read germ eval .tsv files from directory and 53 | write the outcome of the data parsing to some pandas DataFrame 54 | that is stored as pickle in file . 
55 | """ 56 | 57 | os.makedirs(os.path.dirname(germ_eval_ground_truth_file), exist_ok=True) 58 | 59 | gt_all = read_gt(['{}/NER-de-dev.tsv'.format(path_to_germ_eval), 60 | '{}/NER-de-test.tsv'.format(path_to_germ_eval), 61 | '{}/NER-de-train.tsv'.format(path_to_germ_eval)], 62 | ['GERM-EVAL-DEV', 'GERM-EVAL-TEST', 'GERM-EVAL-TRAIN']) 63 | 64 | gt_all.to_pickle(germ_eval_ground_truth_file) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/join_gt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import os 4 | 5 | 6 | @click.command() 7 | @click.argument('files', nargs=-1, type=click.Path()) 8 | def main(files): 9 | """ 10 | Join multiple pandas DataFrame pickles of NER ground-truth into one big file. 11 | """ 12 | 13 | assert(len(files) > 1) 14 | 15 | gt = list() 16 | 17 | for filename in files[:-1]: 18 | 19 | gt.append(pd.read_pickle(filename)) 20 | 21 | gt = pd.concat(gt, axis=0) 22 | 23 | os.makedirs(os.path.dirname(files[-1]), exist_ok=True) 24 | 25 | gt.to_pickle(files[-1]) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/wikiner.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import os 4 | 5 | 6 | def read_gt(files, datasets): 7 | 8 | sentence_number = 1000000 9 | gt_data = list() 10 | 11 | for filename, dataset in zip(files, datasets): 12 | 13 | for li in open(filename, encoding='iso-8859-1'): 14 | 15 | li = li.strip() 16 | 17 | parts = li.split(' ') 18 | 19 | prev_tag = 'O' 20 | for word_number, pa in enumerate(parts): 21 | 22 | if len(pa) == 0: 23 | continue 24 | 25 | word, pos, tag = pa.split('|') 26 | 27 | tag = tag.upper() 28 | tag = tag.replace('_', '-') 29 | tag = tag.replace('.', '-') 30 | 31 | if len(tag) > 5: 32 | tag = tag[0:5] 33 | 34 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 35 | tag = 'O' 36 | 37 | if tag.startswith('I') and prev_tag == 'O': 38 | tag = 'B' + tag[1:] 39 | 40 | prev_tag = tag 41 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 42 | 43 | sentence_number += 1 44 | 45 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 46 | 47 | 48 | @click.command() 49 | @click.argument('path-to-wikiner', type=click.Path(exists=True), required=True, nargs=1) 50 | @click.argument('wikiner-ground-truth-file', type=click.Path(), required=True, nargs=1) 51 | def main(path_to_wikiner, wikiner_ground_truth_file): 52 | """ 53 | Read wikiner files from directory and 54 | write the outcome of the data parsing to some pandas DataFrame 55 | that is stored as pickle in file . 
56 | """ 57 | 58 | os.makedirs(os.path.dirname(wikiner_ground_truth_file), exist_ok=True) 59 | 60 | gt_all = read_gt(['{}/aij-wikiner-de-wp2'.format(path_to_wikiner), 61 | '{}/aij-wikiner-de-wp3'.format(path_to_wikiner)], 62 | ['WIKINER-WP2', 'WIKINER-WP3']) 63 | 64 | gt_all.to_pickle(wikiner_ground_truth_file) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/models/bert.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | # from inspect import currentframe 3 | 4 | import argparse 5 | import logging 6 | import os 7 | import random 8 | import json 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 16 | from pytorch_pretrained_bert.modeling import (CONFIG_NAME, # WEIGHTS_NAME, 17 | BertConfig, 18 | BertForTokenClassification) 19 | from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule 20 | # from pytorch_pretrained_bert.tokenization import BertTokenizer 21 | from .tokenization import BertTokenizer 22 | 23 | 24 | from conlleval import evaluate as conll_eval 25 | 26 | from tqdm import tqdm, trange 27 | 28 | from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, WikipediaNerProcessor 29 | 30 | from sklearn.model_selection import GroupKFold 31 | 32 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 33 | datefmt='%m/%d/%Y %H:%M:%S', 34 | level=logging.INFO) 35 | logger = logging.getLogger(__name__) 36 | 37 | 38 | def model_train(bert_model, max_seq_length, do_lower_case, 39 | num_train_epochs, train_batch_size, gradient_accumulation_steps, 40 | learning_rate, weight_decay, loss_scale, warmup_proportion, 41 | processor, device, n_gpu, fp16, cache_dir, local_rank, 42 | dry_run, no_cuda, output_dir=None): 43 | 44 | label_map = processor.get_labels() 45 | 46 | if gradient_accumulation_steps < 1: 47 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 48 | gradient_accumulation_steps)) 49 | 50 | train_batch_size = train_batch_size // gradient_accumulation_steps 51 | 52 | train_dataloader = processor.get_train_examples(train_batch_size, local_rank) 53 | 54 | # Batch sampler divides by batch_size! 
55 | num_train_optimization_steps = int(len(train_dataloader)*num_train_epochs/gradient_accumulation_steps) 56 | 57 | if local_rank != -1: 58 | num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() 59 | 60 | # Prepare model 61 | cache_dir = cache_dir if cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 62 | 'distributed_{}'.format(local_rank)) 63 | 64 | model = BertForTokenClassification.from_pretrained(bert_model, cache_dir=cache_dir, num_labels=len(label_map)) 65 | 66 | if fp16: 67 | model.half() 68 | 69 | model.to(device) 70 | 71 | if local_rank != -1: 72 | try: 73 | from apex.parallel import DistributedDataParallel as DDP 74 | except ImportError: 75 | raise ImportError( 76 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 77 | 78 | model = DDP(model) 79 | elif n_gpu > 1: 80 | model = torch.nn.DataParallel(model) 81 | 82 | param_optimizer = list(model.named_parameters()) 83 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 84 | optimizer_grouped_parameters = [ 85 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 86 | 'weight_decay': weight_decay}, 87 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 88 | ] 89 | 90 | if fp16: 91 | try: 92 | from apex.optimizers import FP16_Optimizer 93 | from apex.optimizers import FusedAdam 94 | except ImportError: 95 | raise ImportError( 96 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 97 | 98 | optimizer = FusedAdam(optimizer_grouped_parameters, 99 | lr=learning_rate, 100 | bias_correction=False, 101 | max_grad_norm=1.0) 102 | if loss_scale == 0: 103 | optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 104 | else: 105 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale) 106 | 107 | warmup_linear = WarmupLinearSchedule(warmup=warmup_proportion, t_total=num_train_optimization_steps) 108 | else: 109 | optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, 110 | t_total=num_train_optimization_steps) 111 | warmup_linear = None 112 | 113 | global_step = 0 114 | logger.info("***** Running training *****") 115 | logger.info(" Num examples = %d", len(train_dataloader)) 116 | logger.info(" Batch size = %d", train_batch_size) 117 | logger.info(" Num steps = %d", num_train_optimization_steps) 118 | logger.info(" Num epochs = %d", num_train_epochs) 119 | 120 | model_config = {"bert_model": bert_model, "do_lower": do_lower_case, 121 | "max_seq_length": max_seq_length, "label_map": label_map} 122 | 123 | def save_model(lh): 124 | 125 | if output_dir is None: 126 | return 127 | 128 | output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(ep)) 129 | 130 | # Save a trained model and the associated configuration 131 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 132 | 133 | torch.save(model_to_save.state_dict(), output_model_file) 134 | 135 | output_config_file = os.path.join(output_dir, CONFIG_NAME) 136 | with open(output_config_file, 'w') as f: 137 | f.write(model_to_save.config.to_json_string()) 138 | 139 | json.dump(model_config, open(os.path.join(output_dir, "model_config.json"), "w")) 140 | 141 | lh = pd.DataFrame(lh, columns=['global_step', 'loss']) 142 | 143 | loss_history_file = os.path.join(output_dir, "loss_ep{}.pkl".format(ep)) 144 | 145 | 
lh.to_pickle(loss_history_file) 146 | 147 | def load_model(epoch): 148 | 149 | if output_dir is None: 150 | 151 | return False 152 | 153 | output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(epoch)) 154 | 155 | if not os.path.exists(output_model_file): 156 | 157 | return False 158 | 159 | logger.info("Loading epoch {} from disk...".format(epoch)) 160 | model.load_state_dict(torch.load(output_model_file, 161 | map_location=lambda storage, loc: storage if no_cuda else None)) 162 | return True 163 | 164 | model.train() 165 | for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"): 166 | 167 | if dry_run and ep > 1: 168 | logger.info("Dry run. Stop.") 169 | break 170 | 171 | if load_model(ep): 172 | global_step += len(train_dataloader) // gradient_accumulation_steps 173 | continue 174 | 175 | loss_history = list() 176 | tr_loss = 0 177 | nb_tr_examples, nb_tr_steps = 0, 0 178 | with tqdm(total=len(train_dataloader), desc=f"Epoch {ep}") as pbar: 179 | 180 | for step, batch in enumerate(train_dataloader): 181 | 182 | batch = tuple(t.to(device) for t in batch) 183 | 184 | input_ids, input_mask, segment_ids, label_ids = batch 185 | 186 | loss = model(input_ids, segment_ids, input_mask, label_ids) 187 | 188 | if n_gpu > 1: 189 | loss = loss.mean() # mean() to average on multi-gpu. 190 | if gradient_accumulation_steps > 1: 191 | loss = loss / gradient_accumulation_steps 192 | 193 | if fp16: 194 | optimizer.backward(loss) 195 | else: 196 | loss.backward() 197 | 198 | loss_history.append((global_step, loss.item())) 199 | 200 | tr_loss += loss.item() 201 | nb_tr_examples += input_ids.size(0) 202 | nb_tr_steps += 1 203 | pbar.update(1) 204 | mean_loss = tr_loss * gradient_accumulation_steps / nb_tr_steps 205 | pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") 206 | 207 | if dry_run and len(loss_history) > 2: 208 | logger.info("Dry run. Stop.") 209 | break 210 | 211 | if (step + 1) % gradient_accumulation_steps == 0: 212 | if fp16: 213 | # modify learning rate with special warm up BERT uses 214 | # if args.fp16 is False, BertAdam is used that handles this automatically 215 | lr_this_step = learning_rate * warmup_linear.get_lr(global_step, warmup_proportion) 216 | 217 | for param_group in optimizer.param_groups: 218 | param_group['lr'] = lr_this_step 219 | 220 | optimizer.step() 221 | optimizer.zero_grad() 222 | global_step += 1 223 | 224 | save_model(loss_history) 225 | 226 | return model, model_config 227 | 228 | 229 | def model_eval(batch_size, label_map, processor, device, num_train_epochs=1, output_dir=None, model=None, 230 | local_rank=-1, no_cuda=False, dry_run=False): 231 | 232 | output_eval_file = None 233 | if output_dir is not None: 234 | output_eval_file = os.path.join(output_dir, processor.get_evaluation_file()) 235 | logger.info('Write evaluation results to: {}'.format(output_eval_file)) 236 | 237 | dataloader = processor.get_dev_examples(batch_size, local_rank) 238 | 239 | logger.info("***** Running evaluation *****") 240 | logger.info(" Num examples = %d", len(dataloader)) 241 | logger.info(" Batch size = %d", batch_size) 242 | 243 | results = list() 244 | 245 | output_config_file = None 246 | if output_dir is not None: 247 | output_config_file = os.path.join(output_dir, CONFIG_NAME) 248 | 249 | for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"): 250 | 251 | if dry_run and ep > 1: 252 | logger.info("Dry run. 
Stop.") 253 | break 254 | 255 | if output_config_file is not None: 256 | # Load a trained model and config that you have fine-tuned 257 | output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(ep)) 258 | 259 | if not os.path.exists(output_model_file): 260 | logger.info("Stopping at epoch {} since model file is missing.".format(ep)) 261 | break 262 | 263 | config = BertConfig(output_config_file) 264 | model = BertForTokenClassification(config, num_labels=len(label_map)) 265 | model.load_state_dict(torch.load(output_model_file, 266 | map_location=lambda storage, loc: storage if no_cuda else None)) 267 | model.to(device) 268 | 269 | if model is None: 270 | raise ValueError('Model required for evaluation.') 271 | 272 | model.eval() 273 | 274 | y_pred, y_true = model_predict_compare(dataloader, device, label_map, model, dry_run) 275 | 276 | lines = ['empty ' + 'XXX ' + v + ' ' + p for yt, yp in zip(y_true, y_pred) for v, p in zip(yt, yp)] 277 | 278 | res = conll_eval(lines) 279 | 280 | # print(res) 281 | 282 | evals = \ 283 | pd.concat([pd.DataFrame.from_dict(res['overall']['evals'], orient='index', columns=['ALL']), 284 | pd.DataFrame.from_dict(res['slots']['LOC']['evals'], orient='index', columns=['LOC']), 285 | pd.DataFrame.from_dict(res['slots']['PER']['evals'], orient='index', columns=['PER']), 286 | pd.DataFrame.from_dict(res['slots']['ORG']['evals'], orient='index', columns=['ORG']), 287 | ], axis=1).T 288 | 289 | stats = \ 290 | pd.concat( 291 | [pd.DataFrame.from_dict(res['overall']['stats'], orient='index', columns=['ALL']), 292 | pd.DataFrame.from_dict(res['slots']['LOC']['stats'], orient='index', columns=['LOC']), 293 | pd.DataFrame.from_dict(res['slots']['PER']['stats'], orient='index', columns=['PER']), 294 | pd.DataFrame.from_dict(res['slots']['ORG']['stats'], orient='index', columns=['ORG'])], 295 | axis=1, sort=True).T 296 | 297 | evals['epoch'] = ep 298 | stats['epoch'] = ep 299 | 300 | results.append(pd.concat([evals.reset_index().set_index(['index', 'epoch']), 301 | stats.reset_index().set_index(['index', 'epoch'])], axis=1)) 302 | 303 | if output_eval_file is not None: 304 | pd.concat(results).to_pickle(output_eval_file) 305 | 306 | results = pd.concat(results) 307 | print(results) 308 | 309 | return results 310 | 311 | 312 | def model_predict_compare(dataloader, device, label_map, model, dry_run=False): 313 | 314 | y_true = [] 315 | y_pred = [] 316 | covered = set() 317 | for input_ids, input_mask, segment_ids, label_ids in tqdm(dataloader, desc="Evaluating"): 318 | input_ids = input_ids.to(device) 319 | input_mask = input_mask.to(device) 320 | segment_ids = segment_ids.to(device) 321 | label_ids = label_ids.to(device) 322 | 323 | with torch.no_grad(): 324 | logits = model(input_ids, segment_ids, input_mask) 325 | 326 | logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) 327 | logits = logits.detach().cpu().numpy() 328 | label_ids = label_ids.to('cpu').numpy() 329 | input_mask = input_mask.to('cpu').numpy() 330 | 331 | for i, mask in enumerate(input_mask): 332 | temp_1 = [] 333 | temp_2 = [] 334 | for j, m in enumerate(mask): 335 | if j == 0: 336 | continue 337 | if m: 338 | if label_map[label_ids[i][j]] != "X": 339 | temp_1.append(label_map[label_ids[i][j]]) 340 | temp_2.append(label_map[logits[i][j]]) 341 | else: 342 | temp_1.pop() 343 | temp_2.pop() 344 | y_true.append(temp_1) 345 | y_pred.append(temp_2) 346 | 347 | covered = covered.union(set(temp_1)) 348 | break 349 | 350 | if dry_run: 351 | 352 | if 'I-LOC' not in covered: 353 | continue 
354 | if 'I-ORG' not in covered: 355 | continue 356 | if 'I-PER' not in covered: 357 | continue 358 | 359 | break 360 | return y_pred, y_true 361 | 362 | 363 | def model_predict(dataloader, device, label_map, model): 364 | 365 | y_pred = [] 366 | for input_ids, input_mask, segment_ids, label_ids in dataloader: 367 | input_ids = input_ids.to(device) 368 | input_mask = input_mask.to(device) 369 | segment_ids = segment_ids.to(device) 370 | 371 | with torch.no_grad(): 372 | logits = model(input_ids, segment_ids, input_mask) 373 | 374 | logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) 375 | logits = logits.detach().cpu().numpy() 376 | input_mask = input_mask.to('cpu').numpy() 377 | 378 | for i, mask in enumerate(input_mask): 379 | temp_2 = [] 380 | for j, m in enumerate(mask): 381 | if j == 0: # skip first token since its [CLS] 382 | continue 383 | if m: 384 | temp_2.append(label_map[logits[i][j]]) 385 | else: 386 | temp_2.pop() # skip last token since its [SEP] 387 | y_pred.append(temp_2) 388 | break 389 | else: 390 | temp_2.pop() # skip last token since its [SEP] 391 | y_pred.append(temp_2) 392 | 393 | return y_pred 394 | 395 | 396 | def get_device(local_rank=-1, no_cuda=False): 397 | if local_rank == -1 or no_cuda: 398 | device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu") 399 | n_gpu = torch.cuda.device_count() 400 | else: 401 | torch.cuda.set_device(local_rank) 402 | device = torch.device("cuda", local_rank) 403 | n_gpu = 1 404 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 405 | torch.distributed.init_process_group(backend='nccl') 406 | return device, n_gpu 407 | 408 | 409 | def main(): 410 | 411 | parser = get_arg_parser() 412 | 413 | args = parser.parse_args() 414 | 415 | do_eval = len(args.dev_sets) > 0 and not args.do_cross_validation 416 | do_train = len(args.train_sets) > 0 and not args.do_cross_validation 417 | 418 | device, n_gpu = get_device(args.local_rank, args.no_cuda) 419 | 420 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 421 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 422 | 423 | random.seed(args.seed) 424 | np.random.seed(args.seed) 425 | torch.manual_seed(args.seed) 426 | 427 | if not do_train and not do_eval and not args.do_cross_validation: 428 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 429 | 430 | if not os.path.exists(args.output_dir): 431 | os.makedirs(args.output_dir) 432 | 433 | task_name = args.task_name.lower() 434 | 435 | processors = {"ner": NerProcessor, "wikipedia-ner": WikipediaNerProcessor} 436 | 437 | if task_name not in processors: 438 | raise ValueError("Task not found: %s" % task_name) 439 | 440 | if args.do_cross_validation: 441 | 442 | cross_val_result_file = "cross_validation_results.pkl" 443 | 444 | cross_val_result_file = os.path.join(args.output_dir, cross_val_result_file) 445 | 446 | sets = set(args.train_sets.split('|')) if args.train_sets is not None else set() 447 | 448 | gt = pd.read_pickle(args.gt_file) 449 | 450 | gt = gt.loc[gt.dataset.isin(sets)] 451 | 452 | k_fold = GroupKFold(n_splits=args.n_splits) 453 | 454 | eval_results = list() 455 | 456 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 457 | 458 | for ep in range(1, int(args.num_train_epochs) + 1): 459 | 460 | for sp, (train, test) in enumerate(k_fold.split(X=gt, groups=gt.nsentence)): 461 | 462 | tr = gt.iloc[train].copy() 463 | te = gt.iloc[test].copy() 464 | 
465 | tr['dataset'] = 'TRAIN' 466 | te['dataset'] = 'TEST' 467 | 468 | gt_tmp = pd.concat([tr, te]) 469 | 470 | processor = \ 471 | processors[task_name](train_sets='TRAIN', dev_sets='TEST', test_sets='TEST', 472 | gt=gt_tmp, max_seq_length=args.max_seq_length, 473 | tokenizer=tokenizer, data_epochs=args.num_data_epochs, 474 | epoch_size=args.epoch_size) 475 | 476 | model, model_config = \ 477 | model_train(bert_model=args.bert_model, max_seq_length=args.max_seq_length, 478 | do_lower_case=args.do_lower_case, num_train_epochs=ep, 479 | train_batch_size=args.train_batch_size, 480 | gradient_accumulation_steps=args.gradient_accumulation_steps, 481 | learning_rate=args.learning_rate, weight_decay=args.weight_decay, 482 | loss_scale=args.loss_scale, warmup_proportion=args.warmup_proportion, 483 | processor=processor, device=device, n_gpu=n_gpu, fp16=args.fp16, 484 | cache_dir=args.cache_dir, local_rank=args.local_rank, dry_run=args.dry_run, 485 | no_cuda=args.no_cuda) 486 | 487 | label_map = {v: k for k, v in model_config['label_map'].items()} 488 | 489 | eval_result =\ 490 | model_eval(model=model, label_map=label_map, processor=processor, device=device, 491 | batch_size=args.eval_batch_size, local_rank=args.local_rank, 492 | no_cuda=args.no_cuda, dry_run=args.dry_run).reset_index() 493 | 494 | eval_result['split'] = sp 495 | eval_result['epoch'] = ep 496 | eval_results.append(eval_result) 497 | 498 | del model # release CUDA memory 499 | 500 | pd.concat(eval_results).to_pickle(cross_val_result_file) 501 | 502 | if do_train: 503 | 504 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 505 | 506 | processor = \ 507 | processors[task_name](train_sets=args.train_sets, dev_sets=args.dev_sets, test_sets=args.test_sets, 508 | gt_file=args.gt_file, max_seq_length=args.max_seq_length, 509 | tokenizer=tokenizer, data_epochs=args.num_data_epochs, 510 | epoch_size=args.epoch_size) 511 | 512 | model_train(bert_model=args.bert_model, output_dir=args.output_dir, max_seq_length=args.max_seq_length, 513 | do_lower_case=args.do_lower_case, num_train_epochs=args.num_train_epochs, 514 | train_batch_size=args.train_batch_size, 515 | gradient_accumulation_steps=args.gradient_accumulation_steps, 516 | learning_rate=args.learning_rate, weight_decay=args.weight_decay, loss_scale=args.loss_scale, 517 | warmup_proportion=args.warmup_proportion, processor=processor, device=device, n_gpu=n_gpu, 518 | fp16=args.fp16, cache_dir=args.cache_dir, local_rank=args.local_rank, dry_run=args.dry_run, 519 | no_cuda=args.no_cuda) 520 | 521 | if do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): 522 | 523 | model_config = json.load(open(os.path.join(args.output_dir, "model_config.json"), "r")) 524 | 525 | label_to_id = model_config['label_map'] 526 | 527 | label_map = {v: k for k, v in model_config['label_map'].items()} 528 | 529 | tokenizer = BertTokenizer.from_pretrained(model_config['bert_model'], 530 | do_lower_case=model_config['do_lower']) 531 | 532 | processor = \ 533 | processors[task_name](train_sets=None, dev_sets=args.dev_sets, test_sets=args.test_sets, 534 | gt_file=args.gt_file, max_seq_length=model_config['max_seq_length'], 535 | tokenizer=tokenizer, data_epochs=args.num_data_epochs, 536 | epoch_size=args.epoch_size, label_map=label_to_id) 537 | 538 | model_eval(label_map=label_map, processor=processor, device=device, num_train_epochs=args.num_train_epochs, 539 | output_dir=args.output_dir, batch_size=args.eval_batch_size, local_rank=args.local_rank, 
540 | no_cuda=args.no_cuda, dry_run=args.dry_run) 541 | 542 | 543 | def get_arg_parser(): 544 | 545 | parser = argparse.ArgumentParser() 546 | 547 | 548 | parser.add_argument("--gt_file", 549 | default=None, 550 | type=str, 551 | required=True, 552 | help="The pickle file that contains all NER ground truth as pandas DataFrame." 553 | " Required columns: ['nsentence', 'nword', 'word', 'tag', 'dataset]." 554 | " The selection of training, test and dev set is performed on the 'dataset' column.") 555 | 556 | parser.add_argument("--train_sets", 557 | default='', 558 | type=str, 559 | required=False, 560 | help="Specifiy one or more tags from the dataset column in order to mark samples" 561 | " that belong to the training set. Example: 'GERM-EVAL-TRAIN|DE-CONLL-TRAIN'. ") 562 | 563 | parser.add_argument("--dev_sets", 564 | default='', 565 | type=str, 566 | required=False, 567 | help="Specifiy one or more tags from the dataset column in order to mark samples" 568 | " that belong to the dev set. Example: 'GERM-EVAL-DEV|DE-CONLL-TESTA'. ") 569 | 570 | parser.add_argument("--test_sets", 571 | default='', 572 | type=str, 573 | required=False, 574 | help="Specifiy one or more tags from the dataset column in order to mark samples" 575 | " that belong to the test set. Example: 'GERM-EVAL-TEST|DE-CONLL-TESTB'. ") 576 | 577 | parser.add_argument("--bert_model", default=None, type=str, required=False, 578 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 579 | "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " 580 | "bert-base-multilingual-cased, bert-base-chinese.") 581 | 582 | parser.add_argument("--task_name", 583 | default=None, 584 | type=str, 585 | required=True, 586 | help="The name of the task to train.") 587 | 588 | parser.add_argument("--output_dir", 589 | default=None, 590 | type=str, 591 | required=False, 592 | help="The output directory where the model predictions and checkpoints will be written.") 593 | 594 | # Other parameters 595 | parser.add_argument("--cache_dir", 596 | default="", 597 | type=str, 598 | help="Where do you want to store the pre-trained models downloaded from s3") 599 | 600 | parser.add_argument("--max_seq_length", 601 | default=128, 602 | type=int, 603 | help="The maximum total input sequence length after WordPiece tokenization. 
\n" 604 | "Sequences longer than this will be truncated, and sequences shorter \n" 605 | "than this will be padded.") 606 | 607 | parser.add_argument("--do_lower_case", 608 | action='store_true', 609 | help="Set this flag if you are using an uncased model.") 610 | 611 | parser.add_argument("--train_batch_size", 612 | default=32, 613 | type=int, 614 | help="Total batch size for training.") 615 | 616 | parser.add_argument("--eval_batch_size", 617 | default=8, 618 | type=int, 619 | help="Total batch size for eval.") 620 | 621 | parser.add_argument("--learning_rate", 622 | default=3e-5, 623 | type=float, 624 | help="The initial learning rate for Adam.") 625 | 626 | parser.add_argument("--weight_decay", 627 | default=0.01, 628 | type=float, 629 | help="Weight decay for Adam.") 630 | 631 | parser.add_argument("--num_train_epochs", 632 | default=3.0, 633 | type=float, 634 | help="Total number of training epochs to perform/evaluate.") 635 | 636 | parser.add_argument("--num_data_epochs", 637 | default=1.0, 638 | type=float, 639 | help="Re-cycle data after num_data_epochs.") 640 | 641 | parser.add_argument("--epoch_size", 642 | default=10000, 643 | type=float, 644 | help="Size of one epoch.") 645 | 646 | parser.add_argument("--do_cross_validation", 647 | action='store_true', 648 | help="Do cross-validation.") 649 | 650 | parser.add_argument("--n_splits", 651 | default=5, 652 | type=int, 653 | help="Number of folds in cross_validation.") 654 | 655 | parser.add_argument("--warmup_proportion", 656 | default=0.1, 657 | type=float, 658 | help="Proportion of training to perform linear learning rate warmup for. " 659 | "E.g., 0.1 = 10%% of training.") 660 | 661 | parser.add_argument("--no_cuda", 662 | action='store_true', 663 | help="Whether not to use CUDA when available") 664 | 665 | parser.add_argument("--dry_run", 666 | action='store_true', 667 | help="Test mode.") 668 | 669 | parser.add_argument("--local_rank", 670 | type=int, 671 | default=-1, 672 | help="local_rank for distributed training on gpus") 673 | 674 | parser.add_argument('--seed', 675 | type=int, 676 | default=42, 677 | help="random seed for initialization") 678 | 679 | parser.add_argument('--gradient_accumulation_steps', 680 | type=int, 681 | default=1, 682 | help="Number of updates steps to accumulate before performing a backward/update pass.") 683 | 684 | parser.add_argument('--fp16', 685 | action='store_true', 686 | help="Whether to use 16-bit float precision instead of 32-bit") 687 | 688 | parser.add_argument('--loss_scale', 689 | type=float, default=0, 690 | help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" 691 | "0 (default value): dynamic loss scaling.\n" 692 | "Positive power of 2: static loss scaling value.\n") 693 | return parser 694 | 695 | 696 | if __name__ == "__main__": 697 | main() 698 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/corpus.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | from tqdm import tqdm as tqdm 4 | import click 5 | import codecs 6 | import os 7 | import sqlite3 8 | 9 | from qurator.utils.parallel import run as prun 10 | 11 | 12 | class ChunkTask: 13 | 14 | selection = None 15 | 16 | def __init__(self, chunk, min_line_len): 17 | 18 | self._chunk = chunk 19 | self._min_line_len = min_line_len 20 | 21 | def __call__(self, *args, **kwargs): 22 | 23 | return ChunkTask.reformat_chunk(self._chunk, self._min_line_len) 24 | 25 | @staticmethod 26 | def reformat_chunk(chunk, min_line_len): 27 | """ 28 | Process a chunk of documents. 29 | 30 | :param chunk: pandas DataFrame that contains one document per row. 31 | :param min_line_len: Break the document text up in lines that have this minimum length. 32 | :return: One big text where the documents are separated by an empty line. 33 | """ 34 | 35 | text = '' 36 | 37 | for i, r in chunk.iterrows(): 38 | 39 | if type(r.text) != str: 40 | continue 41 | 42 | ppn = r.ppn if str(r.ppn).startswith('PPN') else 'PPN' + r.ppn 43 | 44 | filename = str(r['file name']) 45 | 46 | if not ChunkTask.selection.loc[(ppn, filename)].selected.iloc[0]: 47 | continue 48 | 49 | for se in sentence_split(str(r.text), min_line_len): 50 | 51 | text += se 52 | 53 | text += '\n\n' 54 | 55 | return text 56 | 57 | @staticmethod 58 | def initialize(selection_file): 59 | 60 | ChunkTask.selection = \ 61 | pd.read_pickle(selection_file).\ 62 | reset_index().\ 63 | set_index(['ppn', 'filename']).\ 64 | sort_index() 65 | 66 | 67 | def get_csv_chunks(alto_csv_file, chunksize): 68 | 69 | for ch in tqdm(pd.read_csv(alto_csv_file, chunksize=chunksize)): 70 | 71 | yield ch 72 | 73 | 74 | def get_sqlite_chunks(alto_sqlite_file, chunksize): 75 | 76 | yield pd.DataFrame() 77 | 78 | with sqlite3.connect(alto_sqlite_file) as conn: 79 | 80 | conn.execute('pragma journal_mode=wal') 81 | 82 | total = int(conn.execute('select count(*) from text;').fetchone()[0] / chunksize) 83 | 84 | for ch in tqdm(pd.read_sql('select * from text', conn, chunksize=chunksize), total=total): 85 | 86 | yield ch 87 | 88 | 89 | def get_chunk_tasks(chunks, min_len_len): 90 | 91 | for chunk in chunks: 92 | 93 | if len(chunk) == 0: 94 | continue 95 | 96 | yield ChunkTask(chunk, min_len_len) 97 | 98 | 99 | def sentence_split(s, min_len): 100 | """ 101 | Reformat text of an entire document such that each line has at least length min_len 102 | :param s: str 103 | :param min_len: minimum line length 104 | :return: reformatted text 105 | """ 106 | 107 | parts = s.split(' ') 108 | 109 | se = '' 110 | for p in parts: 111 | 112 | se += ' ' + p 113 | 114 | if len(se) > min_len and len(p) > 2 and re.match(r'.*([^0-9])[.]$', p): 115 | yield se + '\n' 116 | se = '' 117 | 118 | yield se + '\n' 119 | 120 | 121 | @click.command() 122 | @click.argument('fulltext-file', type=click.Path(exists=True), required=True, nargs=1) 123 | @click.argument('selection-file', type=click.Path(exists=True), required=True, nargs=1) 124 | @click.argument('corpus-file', type=click.Path(), required=True, nargs=1) 125 | @click.option('--chunksize', default=10**4, 
help="Process the corpus in chunks of . default:10**4") 126 | @click.option('--processes', default=6, help="Number of parallel processes. default: 6") 127 | @click.option('--min-line-len', default=80, help="Lower bound of line length in output file. default:80") 128 | def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len): 129 | """ 130 | Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and write it to one big text file. 131 | 132 | FULLTEXT_FILE: The CSV or SQLITE3 file to read from. 133 | 134 | SELECTION_FILE: Consider only a subset of all pages that is defined by the DataFrame 135 | that is stored in . 136 | 137 | CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata. 138 | """ 139 | os.makedirs(os.path.dirname(corpus_file), exist_ok=True) 140 | 141 | print('Open {}.'.format(corpus_file)) 142 | corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8') 143 | corpus_fh.write(u'\ufeff') 144 | 145 | if fulltext_file.endswith('.csv'): 146 | chunks = get_csv_chunks(fulltext_file, chunksize) 147 | elif fulltext_file.endswith('.sqlite3'): 148 | chunks = get_sqlite_chunks(fulltext_file, chunksize) 149 | else: 150 | raise RuntimeError('Unsupported input file format.') 151 | 152 | for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes, initializer=ChunkTask.initialize, 153 | initargs=(selection_file,)): 154 | 155 | corpus_fh.write(text) 156 | 157 | corpus_fh.close() 158 | 159 | return 160 | 161 | 162 | if __name__ == '__main__': 163 | main() 164 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/finetune_on_pregenerated.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from pathlib import Path 3 | import torch 4 | import logging 5 | import json 6 | import random 7 | import numpy as np 8 | from collections import namedtuple 9 | from tempfile import TemporaryDirectory 10 | 11 | from torch.utils.data import DataLoader, Dataset, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from tqdm import tqdm 14 | 15 | from pytorch_pretrained_bert.modeling import BertForPreTraining 16 | from pytorch_pretrained_bert.tokenization import BertTokenizer 17 | from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule 18 | 19 | InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") 20 | 21 | log_format = '%(asctime)-10s: %(message)s' 22 | logging.basicConfig(level=logging.INFO, format=log_format) 23 | 24 | 25 | def convert_example_to_features(example, tokenizer, max_seq_length): 26 | tokens = example["tokens"] 27 | segment_ids = example["segment_ids"] 28 | is_random_next = example["is_random_next"] 29 | masked_lm_positions = example["masked_lm_positions"] 30 | masked_lm_labels = example["masked_lm_labels"] 31 | 32 | assert len(tokens) == len(segment_ids) <= max_seq_length # The preprocessed data should be already truncated 33 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 34 | masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) 35 | 36 | input_array = np.zeros(max_seq_length, dtype=np.int) 37 | input_array[:len(input_ids)] = input_ids 38 | 39 | mask_array = np.zeros(max_seq_length, dtype=np.bool) 40 | mask_array[:len(input_ids)] = 1 41 | 42 | segment_array = np.zeros(max_seq_length, dtype=np.bool) 43 | segment_array[:len(segment_ids)] = segment_ids 44 | 45 | 
lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1) 46 | lm_label_array[masked_lm_positions] = masked_label_ids 47 | 48 | features = InputFeatures(input_ids=input_array, 49 | input_mask=mask_array, 50 | segment_ids=segment_array, 51 | lm_label_ids=lm_label_array, 52 | is_next=is_random_next) 53 | return features 54 | 55 | 56 | class PregeneratedDataset(Dataset): 57 | def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False, prefix=None): 58 | self.vocab = tokenizer.vocab 59 | self.tokenizer = tokenizer 60 | self.epoch = epoch 61 | self.data_epoch = epoch % num_data_epochs 62 | data_file = training_path / f"epoch_{self.data_epoch}.json" 63 | metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json" 64 | assert data_file.is_file() and metrics_file.is_file() 65 | metrics = json.loads(metrics_file.read_text()) 66 | num_samples = metrics['num_training_examples'] 67 | seq_len = metrics['max_seq_len'] 68 | self.temp_dir = None 69 | self.working_dir = None 70 | if reduce_memory: 71 | self.temp_dir = TemporaryDirectory(prefix=prefix) 72 | self.working_dir = Path(self.temp_dir.name) 73 | input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap', 74 | mode='w+', dtype=np.int32, shape=(num_samples, seq_len)) 75 | input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap', 76 | shape=(num_samples, seq_len), mode='w+', dtype=np.bool) 77 | segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap', 78 | shape=(num_samples, seq_len), mode='w+', dtype=np.bool) 79 | lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap', 80 | shape=(num_samples, seq_len), mode='w+', dtype=np.int32) 81 | lm_label_ids[:] = -1 82 | is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap', 83 | shape=(num_samples,), mode='w+', dtype=np.bool) 84 | else: 85 | input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32) 86 | input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) 87 | segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) 88 | lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1) 89 | is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool) 90 | logging.info(f"Loading training examples for epoch {epoch}") 91 | with data_file.open() as f: 92 | for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): 93 | line = line.strip() 94 | example = json.loads(line) 95 | features = convert_example_to_features(example, tokenizer, seq_len) 96 | input_ids[i] = features.input_ids 97 | segment_ids[i] = features.segment_ids 98 | input_masks[i] = features.input_mask 99 | lm_label_ids[i] = features.lm_label_ids 100 | is_nexts[i] = features.is_next 101 | assert i == num_samples - 1 # Assert that the sample count metric was true 102 | logging.info("Loading complete!") 103 | self.num_samples = num_samples 104 | self.seq_len = seq_len 105 | self.input_ids = input_ids 106 | self.input_masks = input_masks 107 | self.segment_ids = segment_ids 108 | self.lm_label_ids = lm_label_ids 109 | self.is_nexts = is_nexts 110 | 111 | def __len__(self): 112 | return self.num_samples 113 | 114 | def __getitem__(self, item): 115 | return (torch.tensor(self.input_ids[item].astype(np.int64)), 116 | torch.tensor(self.input_masks[item].astype(np.int64)), 117 | torch.tensor(self.segment_ids[item].astype(np.int64)), 118 | torch.tensor(self.lm_label_ids[item].astype(np.int64)), 119 | torch.tensor(self.is_nexts[item].astype(np.int64))) 120 | 121 | 122 | def 
main(): 123 | parser = ArgumentParser() 124 | parser.add_argument('--pregenerated_data', type=Path, required=True) 125 | parser.add_argument('--output_dir', type=Path, required=True) 126 | parser.add_argument("--bert_model", type=str, required=True, help="Directory where the Bert pre-trained model can be found " 127 | "or Bert pre-trained model selected in the list: bert-base-uncased, " 128 | "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") 129 | parser.add_argument("--do_lower_case", action="store_true") 130 | parser.add_argument("--reduce_memory", action="store_true", 131 | help="Store training data as on-disc memmaps to massively reduce memory usage") 132 | 133 | parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") 134 | parser.add_argument("--local_rank", 135 | type=int, 136 | default=-1, 137 | help="local_rank for distributed training on gpus") 138 | parser.add_argument("--no_cuda", 139 | action='store_true', 140 | help="Whether not to use CUDA when available") 141 | parser.add_argument('--gradient_accumulation_steps', 142 | type=int, 143 | default=1, 144 | help="Number of updates steps to accumulate before performing a backward/update pass.") 145 | parser.add_argument("--train_batch_size", 146 | default=32, 147 | type=int, 148 | help="Total batch size for training.") 149 | parser.add_argument("--save_interval", 150 | default=20000, 151 | type=int, 152 | help="Save model every save_interval training steps.") 153 | parser.add_argument('--fp16', 154 | action='store_true', 155 | help="Whether to use 16-bit float precision instead of 32-bit") 156 | parser.add_argument('--loss_scale', 157 | type=float, default=0, 158 | help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" 159 | "0 (default value): dynamic loss scaling.\n" 160 | "Positive power of 2: static loss scaling value.\n") 161 | parser.add_argument("--warmup_proportion", 162 | default=0.1, 163 | type=float, 164 | help="Proportion of training to perform linear learning rate warmup for. " 165 | "E.g., 0.1 = 10%% of training.") 166 | parser.add_argument("--learning_rate", 167 | default=3e-5, 168 | type=float, 169 | help="The initial learning rate for Adam.") 170 | parser.add_argument('--seed', 171 | type=int, 172 | default=42, 173 | help="random seed for initialization") 174 | parser.add_argument('--temp_prefix', 175 | type=str, 176 | default=None, 177 | help="where to store temporary data") 178 | 179 | args = parser.parse_args() 180 | 181 | assert args.pregenerated_data.is_dir(), \ 182 | "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" 183 | 184 | samples_per_epoch = [] 185 | for i in range(args.epochs): 186 | epoch_file = args.pregenerated_data / f"epoch_{i}.json" 187 | metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" 188 | if epoch_file.is_file() and metrics_file.is_file(): 189 | metrics = json.loads(metrics_file.read_text()) 190 | samples_per_epoch.append(metrics['num_training_examples']) 191 | else: 192 | if i == 0: 193 | exit("No training data was found!") 194 | print(f"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).") 195 | print("This script will loop over the available data, but training diversity may be negatively impacted.") 196 | num_data_epochs = i 197 | break 198 | else: 199 | num_data_epochs = args.epochs 200 | 201 | if args.local_rank == -1 or args.no_cuda: 202 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 203 | n_gpu = torch.cuda.device_count() 204 | else: 205 | torch.cuda.set_device(args.local_rank) 206 | device = torch.device("cuda", args.local_rank) 207 | n_gpu = 1 208 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 209 | torch.distributed.init_process_group(backend='nccl') 210 | logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 211 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 212 | 213 | if args.gradient_accumulation_steps < 1: 214 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 215 | args.gradient_accumulation_steps)) 216 | 217 | args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps 218 | 219 | random.seed(args.seed) 220 | np.random.seed(args.seed) 221 | torch.manual_seed(args.seed) 222 | if n_gpu > 0: 223 | torch.cuda.manual_seed_all(args.seed) 224 | 225 | if args.output_dir.is_dir() and list(args.output_dir.iterdir()): 226 | logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!") 227 | args.output_dir.mkdir(parents=True, exist_ok=True) 228 | 229 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 230 | 231 | total_train_examples = 0 232 | for i in range(args.epochs): 233 | # The modulo takes into account the fact that we may loop over limited epochs of data 234 | total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] 235 | 236 | num_train_optimization_steps = int( 237 | total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) 238 | 239 | if args.local_rank != -1: 240 | num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() 241 | 242 | # Prepare model 243 | model = BertForPreTraining.from_pretrained(args.bert_model) 244 | if args.fp16: 245 | model.half() 246 | model.to(device) 247 | if args.local_rank != -1: 248 | try: 249 | from apex.parallel import DistributedDataParallel as DDP 250 | except ImportError: 251 | raise ImportError( 252 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 253 | model = DDP(model) 254 | elif n_gpu > 1: 255 | model = torch.nn.DataParallel(model) 256 | 257 | # Prepare optimizer 258 | param_optimizer = list(model.named_parameters()) 259 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 260 | optimizer_grouped_parameters = [ 261 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 262 | 'weight_decay': 0.01}, 263 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 264 | ] 265 | 266 | if args.fp16: 267 | try: 268 | from apex.optimizers import FP16_Optimizer 269 | from apex.optimizers import FusedAdam 270 | except ImportError: 271 | raise ImportError( 272 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 273 | 274 | optimizer = FusedAdam(optimizer_grouped_parameters, 275 | lr=args.learning_rate, 276 | 
bias_correction=False, 277 | max_grad_norm=1.0) 278 | if args.loss_scale == 0: 279 | optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 280 | else: 281 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) 282 | warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, 283 | t_total=num_train_optimization_steps) 284 | else: 285 | optimizer = BertAdam(optimizer_grouped_parameters, 286 | lr=args.learning_rate, 287 | warmup=args.warmup_proportion, 288 | t_total=num_train_optimization_steps) 289 | 290 | global_step = 0 291 | logging.info("***** Running training *****") 292 | logging.info(f" Num examples = {total_train_examples}") 293 | logging.info(" Batch size = %d", args.train_batch_size) 294 | logging.info(" Num steps = %d", num_train_optimization_steps) 295 | model.train() 296 | 297 | def save_model(): 298 | 299 | logging.info("** ** * Saving fine-tuned model ** ** * ") 300 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 301 | output_model_file = args.output_dir / "pytorch_model.bin" 302 | torch.save(model_to_save.state_dict(), str(output_model_file)) 303 | 304 | for epoch in range(args.epochs): 305 | epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, 306 | num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory, 307 | prefix=args.temp_prefix) 308 | if args.local_rank == -1: 309 | train_sampler = RandomSampler(epoch_dataset) 310 | else: 311 | train_sampler = DistributedSampler(epoch_dataset) 312 | train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 313 | tr_loss = 0 314 | nb_tr_examples, nb_tr_steps = 0, 0 315 | with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: 316 | for step, batch in enumerate(train_dataloader): 317 | 318 | batch = tuple(t.to(device) for t in batch) 319 | input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch 320 | loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) 321 | if n_gpu > 1: 322 | loss = loss.mean() # mean() to average on multi-gpu. 
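# Dividing by gradient_accumulation_steps below keeps the effective gradient scale
# equal to that of one large batch accumulated over several smaller forward/backward passes.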
323 | if args.gradient_accumulation_steps > 1: 324 | loss = loss / args.gradient_accumulation_steps 325 | if args.fp16: 326 | optimizer.backward(loss) 327 | else: 328 | loss.backward() 329 | tr_loss += loss.item() 330 | nb_tr_examples += input_ids.size(0) 331 | nb_tr_steps += 1 332 | pbar.update(1) 333 | mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps 334 | pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") 335 | 336 | if step % args.save_interval == 0: 337 | save_model() 338 | 339 | if (step + 1) % args.gradient_accumulation_steps == 0: 340 | if args.fp16: 341 | # modify learning rate with special warm up BERT uses 342 | # if args.fp16 is False, BertAdam is used that handles this automatically 343 | lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) 344 | 345 | for param_group in optimizer.param_groups: 346 | param_group['lr'] = lr_this_step 347 | 348 | optimizer.step() 349 | optimizer.zero_grad() 350 | 351 | global_step += 1 352 | 353 | # Save a trained model 354 | # logging.info("** ** * Saving fine-tuned model ** ** * ") 355 | # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 356 | # output_model_file = args.output_dir / "pytorch_model.bin" 357 | # torch.save(model_to_save.state_dict(), str(output_model_file)) 358 | 359 | save_model() 360 | 361 | 362 | if __name__ == '__main__': 363 | main() 364 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/pregenerate_training_data.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from pathlib import Path 3 | from tqdm import tqdm, trange 4 | from tempfile import TemporaryDirectory 5 | import shelve 6 | 7 | from random import random, randrange, randint, shuffle, choice, sample 8 | from pytorch_pretrained_bert.tokenization import BertTokenizer 9 | import numpy as np 10 | import json 11 | 12 | 13 | class DocumentDatabase: 14 | def __init__(self, reduce_memory=False): 15 | if reduce_memory: 16 | self.temp_dir = TemporaryDirectory() 17 | self.working_dir = Path(self.temp_dir.name) 18 | self.document_shelf_filepath = self.working_dir / 'shelf.db' 19 | self.document_shelf = shelve.open(str(self.document_shelf_filepath), 20 | flag='n', protocol=-1) 21 | self.documents = None 22 | else: 23 | self.documents = [] 24 | self.document_shelf = None 25 | self.document_shelf_filepath = None 26 | self.temp_dir = None 27 | self.doc_lengths = [] 28 | self.doc_cumsum = None 29 | self.cumsum_max = None 30 | self.reduce_memory = reduce_memory 31 | 32 | def add_document(self, document): 33 | if not document: 34 | return 35 | if self.reduce_memory: 36 | current_idx = len(self.doc_lengths) 37 | self.document_shelf[str(current_idx)] = document 38 | else: 39 | self.documents.append(document) 40 | self.doc_lengths.append(len(document)) 41 | 42 | def _precalculate_doc_weights(self): 43 | self.doc_cumsum = np.cumsum(self.doc_lengths) 44 | self.cumsum_max = self.doc_cumsum[-1] 45 | 46 | def sample_doc(self, current_idx, sentence_weighted=True): 47 | # Uses the current iteration counter to ensure we don't sample the same doc twice 48 | if sentence_weighted: 49 | # With sentence weighting, we sample docs proportionally to their sentence length 50 | if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths): 51 | self._precalculate_doc_weights() 52 | rand_start = self.doc_cumsum[current_idx] 53 | rand_end = rand_start + 
self.cumsum_max - self.doc_lengths[current_idx] 54 | sentence_index = randrange(rand_start, rand_end) % self.cumsum_max 55 | sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right') 56 | else: 57 | # If we don't use sentence weighting, then every doc has an equal chance to be chosen 58 | sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths) 59 | assert sampled_doc_index != current_idx 60 | if self.reduce_memory: 61 | return self.document_shelf[str(sampled_doc_index)] 62 | else: 63 | return self.documents[sampled_doc_index] 64 | 65 | def __len__(self): 66 | return len(self.doc_lengths) 67 | 68 | def __getitem__(self, item): 69 | if self.reduce_memory: 70 | return self.document_shelf[str(item)] 71 | else: 72 | return self.documents[item] 73 | 74 | def __enter__(self): 75 | return self 76 | 77 | def __exit__(self, exc_type, exc_val, traceback): 78 | if self.document_shelf is not None: 79 | self.document_shelf.close() 80 | if self.temp_dir is not None: 81 | self.temp_dir.cleanup() 82 | 83 | 84 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens): 85 | """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo.""" 86 | while True: 87 | total_length = len(tokens_a) + len(tokens_b) 88 | if total_length <= max_num_tokens: 89 | break 90 | 91 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 92 | assert len(trunc_tokens) >= 1 93 | 94 | # We want to sometimes truncate from the front and sometimes from the 95 | # back to add more randomness and avoid biases. 96 | if random() < 0.5: 97 | del trunc_tokens[0] 98 | else: 99 | trunc_tokens.pop() 100 | 101 | 102 | def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): 103 | """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but 104 | with several refactors to clean it up and remove a lot of unnecessary variables.""" 105 | cand_indices = [] 106 | for (i, token) in enumerate(tokens): 107 | if token == "[CLS]" or token == "[SEP]": 108 | continue 109 | cand_indices.append(i) 110 | 111 | num_to_mask = min(max_predictions_per_seq, 112 | max(1, int(round(len(tokens) * masked_lm_prob)))) 113 | shuffle(cand_indices) 114 | mask_indices = sorted(sample(cand_indices, num_to_mask)) 115 | masked_token_labels = [] 116 | for index in mask_indices: 117 | # 80% of the time, replace with [MASK] 118 | if random() < 0.8: 119 | masked_token = "[MASK]" 120 | else: 121 | # 10% of the time, keep original 122 | if random() < 0.5: 123 | masked_token = tokens[index] 124 | # 10% of the time, replace with random word 125 | else: 126 | masked_token = choice(vocab_list) 127 | masked_token_labels.append(tokens[index]) 128 | # Once we've saved the true label for that token, we can overwrite it with the masked version 129 | tokens[index] = masked_token 130 | 131 | return tokens, mask_indices, masked_token_labels 132 | 133 | 134 | def create_instances_from_document( 135 | doc_database, doc_idx, max_seq_length, short_seq_prob, 136 | masked_lm_prob, max_predictions_per_seq, vocab_list): 137 | """This code is mostly a duplicate of the equivalent function from Google BERT's repo. 138 | However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function. 
139 | Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence 140 | (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task.""" 141 | document = doc_database[doc_idx] 142 | # Account for [CLS], [SEP], [SEP] 143 | max_num_tokens = max_seq_length - 3 144 | 145 | # We *usually* want to fill up the entire sequence since we are padding 146 | # to `max_seq_length` anyways, so short sequences are generally wasted 147 | # computation. However, we *sometimes* 148 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 149 | # sequences to minimize the mismatch between pre-training and fine-tuning. 150 | # The `target_seq_length` is just a rough target however, whereas 151 | # `max_seq_length` is a hard limit. 152 | target_seq_length = max_num_tokens 153 | if random() < short_seq_prob: 154 | target_seq_length = randint(2, max_num_tokens) 155 | 156 | # We DON'T just concatenate all of the tokens from a document into a long 157 | # sequence and choose an arbitrary split point because this would make the 158 | # next sentence prediction task too easy. Instead, we split the input into 159 | # segments "A" and "B" based on the actual "sentences" provided by the user 160 | # input. 161 | instances = [] 162 | current_chunk = [] 163 | current_length = 0 164 | i = 0 165 | while i < len(document): 166 | segment = document[i] 167 | current_chunk.append(segment) 168 | current_length += len(segment) 169 | if i == len(document) - 1 or current_length >= target_seq_length: 170 | if current_chunk: 171 | # `a_end` is how many segments from `current_chunk` go into the `A` 172 | # (first) sentence. 173 | a_end = 1 174 | if len(current_chunk) >= 2: 175 | a_end = randrange(1, len(current_chunk)) 176 | 177 | tokens_a = [] 178 | for j in range(a_end): 179 | tokens_a.extend(current_chunk[j]) 180 | 181 | tokens_b = [] 182 | 183 | # Random next 184 | if len(current_chunk) == 1 or random() < 0.5: 185 | is_random_next = True 186 | target_b_length = target_seq_length - len(tokens_a) 187 | 188 | # Sample a random document, with longer docs being sampled more frequently 189 | random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True) 190 | 191 | random_start = randrange(0, len(random_document)) 192 | for j in range(random_start, len(random_document)): 193 | tokens_b.extend(random_document[j]) 194 | if len(tokens_b) >= target_b_length: 195 | break 196 | # We didn't actually use these segments so we "put them back" so 197 | # they don't go to waste. 
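# Rewind the document index by the number of segments that were not consumed,
# so they are picked up again in the next iteration of the while loop.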
198 | num_unused_segments = len(current_chunk) - a_end 199 | i -= num_unused_segments 200 | # Actual next 201 | else: 202 | is_random_next = False 203 | for j in range(a_end, len(current_chunk)): 204 | tokens_b.extend(current_chunk[j]) 205 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens) 206 | 207 | assert len(tokens_a) >= 1 208 | assert len(tokens_b) >= 1 209 | 210 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] 211 | # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP] 212 | # They are 1 for the B tokens and the final [SEP] 213 | segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)] 214 | 215 | tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( 216 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) 217 | 218 | instance = { 219 | "tokens": tokens, 220 | "segment_ids": segment_ids, 221 | "is_random_next": is_random_next, 222 | "masked_lm_positions": masked_lm_positions, 223 | "masked_lm_labels": masked_lm_labels} 224 | instances.append(instance) 225 | current_chunk = [] 226 | current_length = 0 227 | i += 1 228 | 229 | return instances 230 | 231 | 232 | def main(): 233 | parser = ArgumentParser() 234 | parser.add_argument('--train_corpus', type=Path, required=True) 235 | parser.add_argument("--output_dir", type=Path, required=True) 236 | parser.add_argument("--bert_model", type=str, required=True) # , 237 | # choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", 238 | # "bert-base-multilingual", "bert-base-chinese"]) 239 | parser.add_argument("--do_lower_case", action="store_true") 240 | 241 | parser.add_argument("--reduce_memory", action="store_true", 242 | help="Reduce memory usage for large datasets by keeping data on disc rather than in memory") 243 | 244 | parser.add_argument("--epochs_to_generate", type=int, default=3, 245 | help="Number of epochs of data to pregenerate") 246 | parser.add_argument("--max_seq_len", type=int, default=128) 247 | parser.add_argument("--short_seq_prob", type=float, default=0.1, 248 | help="Probability of making a short sentence as a training example") 249 | parser.add_argument("--masked_lm_prob", type=float, default=0.15, 250 | help="Probability of masking each token for the LM task") 251 | parser.add_argument("--max_predictions_per_seq", type=int, default=20, 252 | help="Maximum number of tokens to mask in each sequence") 253 | 254 | args = parser.parse_args() 255 | 256 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 257 | vocab_list = list(tokenizer.vocab.keys()) 258 | with DocumentDatabase(reduce_memory=args.reduce_memory) as docs: 259 | with args.train_corpus.open() as f: 260 | doc = [] 261 | for line in tqdm(f, desc="Loading Dataset", unit=" lines"): 262 | line = line.strip() 263 | if line == "": 264 | docs.add_document(doc) 265 | doc = [] 266 | else: 267 | tokens = tokenizer.tokenize(line) 268 | doc.append(tokens) 269 | if doc: 270 | docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added 271 | if len(docs) <= 1: 272 | exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to " 273 | "ensure that random NextSentences are not sampled from the same document. Please add blank lines to " 274 | "indicate breaks between documents in your input file. 
If your dataset does not contain multiple " 275 | "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, " 276 | "sections or paragraphs.") 277 | 278 | args.output_dir.mkdir(exist_ok=True) 279 | for epoch in trange(args.epochs_to_generate, desc="Epoch"): 280 | epoch_filename = args.output_dir / f"epoch_{epoch}.json" 281 | num_instances = 0 282 | with epoch_filename.open('w') as epoch_file: 283 | for doc_idx in trange(len(docs), desc="Document"): 284 | doc_instances = create_instances_from_document( 285 | docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, 286 | masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, 287 | vocab_list=vocab_list) 288 | doc_instances = [json.dumps(instance) for instance in doc_instances] 289 | for instance in doc_instances: 290 | epoch_file.write(instance + '\n') 291 | num_instances += 1 292 | metrics_file = args.output_dir / f"epoch_{epoch}_metrics.json" 293 | with metrics_file.open('w') as metrics_file: 294 | metrics = { 295 | "num_training_examples": num_instances, 296 | "max_seq_len": args.max_seq_len 297 | } 298 | metrics_file.write(json.dumps(metrics)) 299 | 300 | 301 | if __name__ == '__main__': 302 | main() 303 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from pytorch_pretrained_bert.file_utils import cached_path 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 37 | } 38 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 39 | 'bert-base-uncased': 512, 40 | 'bert-large-uncased': 512, 41 | 'bert-base-cased': 512, 42 | 'bert-large-cased': 512, 43 | 'bert-base-multilingual-uncased': 512, 44 | 'bert-base-multilingual-cased': 512, 45 | 'bert-base-chinese': 512, 46 | } 47 | VOCAB_NAME = 'vocab.txt' 48 | 49 | 50 | def load_vocab(vocab_file): 51 | """Loads a vocabulary file into a dictionary.""" 52 | vocab = collections.OrderedDict() 53 | index = 0 54 | with open(vocab_file, "r", encoding="utf-8") as reader: 55 | while True: 56 | token = reader.readline() 57 | if not token: 58 | break 59 | token = token.strip() 60 | vocab[token] = index 61 | index += 1 62 | return vocab 63 | 64 | 65 | def whitespace_tokenize(text): 66 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 67 | text = text.strip() 68 | if not text: 69 | return [] 70 | tokens = text.split() 71 | return tokens 72 | 73 | 74 | class BertTokenizer(object): 75 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 76 | 77 | def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, 78 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 79 | """Constructs a BertTokenizer. 80 | 81 | Args: 82 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 83 | do_lower_case: Whether to lower case the input 84 | Only has an effect when do_wordpiece_only=False 85 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 86 | max_len: An artificial maximum length to truncate tokenized sequences to; 87 | Effective maximum length is always the minimum of this 88 | value (if specified) and the underlying BERT model's 89 | sequence length. 90 | never_split: List of tokens which will never be split during tokenization. 91 | Only has an effect when do_wordpiece_only=False 92 | """ 93 | if not os.path.isfile(vocab_file): 94 | raise ValueError( 95 | "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " 96 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 97 | self.vocab = load_vocab(vocab_file) 98 | self.ids_to_tokens = collections.OrderedDict( 99 | [(ids, tok) for tok, ids in self.vocab.items()]) 100 | self.do_basic_tokenize = do_basic_tokenize 101 | if do_basic_tokenize: 102 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 103 | never_split=never_split) 104 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 105 | self.max_len = max_len if max_len is not None else int(1e12) 106 | 107 | def tokenize(self, text): 108 | split_tokens = [] 109 | if self.do_basic_tokenize: 110 | for token in self.basic_tokenizer.tokenize(text): 111 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 112 | split_tokens.append(sub_token) 113 | else: 114 | split_tokens = self.wordpiece_tokenizer.tokenize(text) 115 | return split_tokens 116 | 117 | def convert_tokens_to_ids(self, tokens): 118 | """Converts a sequence of tokens into ids using the vocab.""" 119 | ids = [] 120 | for token in tokens: 121 | ids.append(self.vocab[token]) 122 | if len(ids) > self.max_len: 123 | logger.warning( 124 | "Token indices sequence length is longer than the specified maximum " 125 | " sequence length for this BERT model ({} > {}). Running this" 126 | " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) 127 | ) 128 | return ids 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | """Converts a sequence of ids in wordpiece tokens using the vocab.""" 132 | tokens = [] 133 | for i in ids: 134 | tokens.append(self.ids_to_tokens[i]) 135 | return tokens 136 | 137 | def save_vocabulary(self, vocab_path): 138 | """Save the tokenizer vocabulary to a directory or file.""" 139 | index = 0 140 | if os.path.isdir(vocab_path): 141 | vocab_file = os.path.join(vocab_path, VOCAB_NAME) 142 | with open(vocab_file, "w", encoding="utf-8") as writer: 143 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 144 | if index != token_index: 145 | logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 146 | " Please check that the vocabulary is not corrupted!".format(vocab_file)) 147 | index = token_index 148 | writer.write(token + u'\n') 149 | index += 1 150 | return vocab_file 151 | 152 | @classmethod 153 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 154 | """ 155 | Instantiate a PreTrainedBertModel from a pre-trained model file. 156 | Download and cache the pre-trained model file if needed. 157 | """ 158 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 159 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 160 | if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): 161 | logger.warning("The pre-trained model you are loading is a cased model but you have not set " 162 | "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " 163 | "you may want to check this behavior.") 164 | kwargs['do_lower_case'] = False 165 | elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): 166 | logger.warning("The pre-trained model you are loading is an uncased model but you have set " 167 | "`do_lower_case` to False. 
We are setting `do_lower_case=True` for you " 168 | "but you may want to check this behavior.") 169 | kwargs['do_lower_case'] = True 170 | else: 171 | vocab_file = pretrained_model_name_or_path 172 | if os.path.isdir(vocab_file): 173 | vocab_file = os.path.join(vocab_file, VOCAB_NAME) 174 | # redirect to the cache, if necessary 175 | try: 176 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 177 | except EnvironmentError: 178 | logger.error( 179 | "Model name '{}' was not found in model name list ({}). " 180 | "We assumed '{}' was a path or url but couldn't find any file " 181 | "associated to this path or url.".format( 182 | pretrained_model_name_or_path, 183 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 184 | vocab_file)) 185 | return None 186 | if resolved_vocab_file == vocab_file: 187 | logger.info("loading vocabulary file {}".format(vocab_file)) 188 | else: 189 | logger.info("loading vocabulary file {} from cache at {}".format( 190 | vocab_file, resolved_vocab_file)) 191 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 192 | # if we're using a pretrained model, ensure the tokenizer wont index sequences longer 193 | # than the number of positional embeddings 194 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 195 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 196 | # Instantiate tokenizer. 197 | tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) 198 | return tokenizer 199 | 200 | 201 | class BasicTokenizer(object): 202 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 203 | 204 | def __init__(self, 205 | do_lower_case=True, 206 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 207 | """Constructs a BasicTokenizer. 208 | 209 | Args: 210 | do_lower_case: Whether to lower case the input. 211 | """ 212 | self.do_lower_case = do_lower_case 213 | self.never_split = never_split 214 | 215 | def tokenize(self, text): 216 | """Tokenizes a piece of text.""" 217 | text = self._clean_text(text) 218 | # This was added on November 1st, 2018 for the multilingual and Chinese 219 | # models. This is also applied to the English models now, but it doesn't 220 | # matter since the English models were not trained on any Chinese data 221 | # and generally don't have any Chinese data in them (there are Chinese 222 | # characters in the vocabulary because Wikipedia does have some Chinese 223 | # words in the English Wikipedia.). 
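# Pad every CJK character with surrounding whitespace so that each one ends up as
# its own token after the whitespace split below.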
224 | text = self._tokenize_chinese_chars(text) 225 | orig_tokens = whitespace_tokenize(text) 226 | split_tokens = [] 227 | for token in orig_tokens: 228 | if self.do_lower_case and token not in self.never_split: 229 | token = token.lower() 230 | token = self._run_strip_accents(token) 231 | split_tokens.extend(self._run_split_on_punc(token)) 232 | 233 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 234 | return output_tokens 235 | 236 | def _run_strip_accents(self, text): 237 | """Strips accents from a piece of text.""" 238 | text = unicodedata.normalize("NFD", text) 239 | output = [] 240 | for char in text: 241 | cat = unicodedata.category(char) 242 | if cat == "Mn": 243 | continue 244 | output.append(char) 245 | return "".join(output) 246 | 247 | def _run_split_on_punc(self, text): 248 | """Splits punctuation on a piece of text.""" 249 | if text in self.never_split: 250 | return [text] 251 | chars = list(text) 252 | i = 0 253 | start_new_word = True 254 | output = [] 255 | while i < len(chars): 256 | char = chars[i] 257 | if _is_punctuation(char): 258 | output.append([char]) 259 | start_new_word = True 260 | else: 261 | if start_new_word: 262 | output.append([]) 263 | start_new_word = False 264 | output[-1].append(char) 265 | i += 1 266 | 267 | return ["".join(x) for x in output] 268 | 269 | def _tokenize_chinese_chars(self, text): 270 | """Adds whitespace around any CJK character.""" 271 | output = [] 272 | for char in text: 273 | cp = ord(char) 274 | if self._is_chinese_char(cp): 275 | output.append(" ") 276 | output.append(char) 277 | output.append(" ") 278 | else: 279 | output.append(char) 280 | return "".join(output) 281 | 282 | def _is_chinese_char(self, cp): 283 | """Checks whether CP is the codepoint of a CJK character.""" 284 | # This defines a "chinese character" as anything in the CJK Unicode block: 285 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 286 | # 287 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 288 | # despite its name. The modern Korean Hangul alphabet is a different block, 289 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 290 | # space-separated words, so they are not treated specially and handled 291 | # like the all of the other languages. 292 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 293 | (cp >= 0x3400 and cp <= 0x4DBF) or # 294 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 295 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 296 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 297 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 298 | (cp >= 0xF900 and cp <= 0xFAFF) or # 299 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 300 | return True 301 | 302 | return False 303 | 304 | def _clean_text(self, text): 305 | """Performs invalid character removal and whitespace cleanup on text.""" 306 | output = [] 307 | for char in text: 308 | cp = ord(char) 309 | if cp == 0 or cp == 0xfffd or _is_control(char): 310 | continue 311 | if _is_whitespace(char): 312 | output.append(" ") 313 | else: 314 | output.append(char) 315 | return "".join(output) 316 | 317 | 318 | class WordpieceTokenizer(object): 319 | """Runs WordPiece tokenization.""" 320 | 321 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 322 | self.vocab = vocab 323 | self.unk_token = unk_token 324 | self.max_input_chars_per_word = max_input_chars_per_word 325 | 326 | def tokenize(self, text): 327 | """Tokenizes a piece of text into its word pieces. 
328 | 329 | This uses a greedy longest-match-first algorithm to perform tokenization 330 | using the given vocabulary. 331 | 332 | For example: 333 | input = "unaffable" 334 | output = ["un", "##aff", "##able"] 335 | 336 | Args: 337 | text: A single token or whitespace separated tokens. This should have 338 | already been passed through `BasicTokenizer`. 339 | 340 | Returns: 341 | A list of wordpiece tokens. 342 | """ 343 | 344 | output_tokens = [] 345 | for token in whitespace_tokenize(text): 346 | chars = list(token) 347 | # if len(chars) > self.max_input_chars_per_word: 348 | # output_tokens.append(self.unk_token) 349 | # continue 350 | 351 | # is_bad = False 352 | start = 0 353 | sub_tokens = [] 354 | while start < len(chars): 355 | end = len(chars) 356 | cur_substr = None 357 | while start < end: 358 | substr = "".join(chars[start:end]) 359 | if start > 0: 360 | substr = "##" + substr 361 | if substr in self.vocab: 362 | cur_substr = substr 363 | break 364 | end -= 1 365 | if cur_substr is None: 366 | # is_bad = True 367 | # break 368 | sub_tokens.append(self.unk_token) 369 | start += 1 370 | else: 371 | sub_tokens.append(cur_substr) 372 | start = end 373 | 374 | # if is_bad: 375 | # output_tokens.append(self.unk_token) 376 | # else: 377 | output_tokens.extend(sub_tokens) 378 | 379 | return output_tokens 380 | 381 | 382 | def _is_whitespace(char): 383 | """Checks whether `chars` is a whitespace character.""" 384 | # \t, \n, and \r are technically contorl characters but we treat them 385 | # as whitespace since they are generally considered as such. 386 | if char == " " or char == "\t" or char == "\n" or char == "\r": 387 | return True 388 | cat = unicodedata.category(char) 389 | if cat == "Zs": 390 | return True 391 | return False 392 | 393 | 394 | def _is_control(char): 395 | """Checks whether `chars` is a control character.""" 396 | # These are technically control characters but we count them as whitespace 397 | # characters. 398 | if char == "\t" or char == "\n" or char == "\r": 399 | return False 400 | cat = unicodedata.category(char) 401 | if cat.startswith("C"): 402 | return True 403 | return False 404 | 405 | 406 | def _is_punctuation(char): 407 | """Checks whether `chars` is a punctuation character.""" 408 | cp = ord(char) 409 | # We treat all non-letter/number ASCII as punctuation. 410 | # Characters such as "^", "$", and "`" are not in the Unicode 411 | # Punctuation class but we treat them as punctuation anyways, for 412 | # consistency. 
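# The four ASCII ranges below cover !"#$%&'()*+,-./ (33-47), :;<=>?@ (58-64),
# [\]^_` (91-96) and {|}~ (123-126), i.e. all printable non-alphanumeric ASCII.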
413 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 414 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 415 | return True 416 | cat = unicodedata.category(char) 417 | if cat.startswith("P"): 418 | return True 419 | return False 420 | -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from flask import Flask, send_from_directory, redirect, jsonify, request 4 | from flask_caching import Cache 5 | from hashlib import sha256 6 | import html 7 | import json 8 | import torch 9 | from somajo import Tokenizer, SentenceSplitter 10 | 11 | from qurator.sbb_ner.models.bert import get_device, model_predict 12 | from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, convert_examples_to_features 13 | from qurator.sbb_ner.models.tokenization import BertTokenizer 14 | from pytorch_pretrained_bert.modeling import (CONFIG_NAME, 15 | BertConfig, 16 | BertForTokenClassification) 17 | app = Flask(__name__) 18 | 19 | app.config.from_file(os.path.join(os.getcwd(), 20 | 'config.json' if not os.environ.get('CONFIG') 21 | else os.environ.get('CONFIG')), load=json.load) 22 | cache = Cache(app) 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class NERPredictor: 28 | 29 | def __init__(self, model_dir, batch_size, epoch, max_seq_length=128, local_rank=-1, no_cuda=False): 30 | 31 | self._batch_size = batch_size 32 | self._local_rank = local_rank 33 | self._max_seq_length = max_seq_length 34 | 35 | self._device, self._n_gpu = get_device(no_cuda=no_cuda) 36 | 37 | self._model_config = json.load(open(os.path.join(model_dir, "model_config.json"), "r")) 38 | 39 | self._label_to_id = self._model_config['label_map'] 40 | 41 | self._label_map = {v: k for k, v in self._model_config['label_map'].items()} 42 | 43 | self._bert_tokenizer = \ 44 | BertTokenizer.from_pretrained(model_dir, 45 | do_lower_case=self._model_config['do_lower']) 46 | 47 | output_config_file = os.path.join(model_dir, CONFIG_NAME) 48 | 49 | output_model_file = os.path.join(model_dir, "pytorch_model_ep{}.bin".format(epoch)) 50 | 51 | config = BertConfig(output_config_file) 52 | 53 | self._model = BertForTokenClassification(config, num_labels=len(self._label_map)) 54 | self._model.load_state_dict(torch.load(output_model_file, 55 | map_location=lambda storage, loc: storage if no_cuda else None)) 56 | self._model.to(self._device) 57 | self._model.eval() 58 | 59 | return 60 | 61 | def classify_text(self, sentences): 62 | 63 | examples = NerProcessor.create_examples(sentences, 'test') 64 | 65 | features = [fe for ex in examples for fe in 66 | convert_examples_to_features(ex, self._label_to_id, self._max_seq_length, self._bert_tokenizer)] 67 | 68 | data_loader = NerProcessor.make_data_loader(None, self._batch_size, self._local_rank, self._label_to_id, 69 | self._max_seq_length, self._bert_tokenizer, features=features, 70 | sequential=True) 71 | 72 | prediction_tmp = model_predict(data_loader, self._device, self._label_map, self._model) 73 | 74 | assert len(prediction_tmp) == len(features) 75 | 76 | prediction = [] 77 | prev_guid = None 78 | for fe, pr in zip(features, prediction_tmp): 79 
| # longer sentences might have been processed in several steps 80 | # therefore we have to glue them together. This can be done on the basis of the guid. 81 | 82 | if prev_guid != fe.guid: 83 | prediction.append((fe.tokens[1:-1], pr)) 84 | else: 85 | prediction[-1] = (prediction[-1][0] + fe.tokens[1:-1], prediction[-1][1] + pr) 86 | 87 | prev_guid = fe.guid 88 | 89 | try: 90 | assert len(sentences) == len(prediction) 91 | except AssertionError: 92 | print('Sentences:\n') 93 | print(sentences) 94 | print('\n\nPrediciton:\n') 95 | print(prediction) 96 | 97 | return prediction 98 | 99 | 100 | class NERTokenizer: 101 | 102 | def __init__(self): 103 | 104 | self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False) 105 | 106 | self._sentence_splitter = SentenceSplitter() 107 | 108 | def parse_text(self, text): 109 | tokens = self._word_tokenizer.tokenize_paragraph(text) 110 | 111 | sentences_tokenized = self._sentence_splitter.split(tokens) 112 | 113 | sentences = [] 114 | for sen in sentences_tokenized: 115 | 116 | sen = [tok.replace(" ", "") for tok in sen] 117 | 118 | if len(sen) == 0: 119 | continue 120 | 121 | sentences.append((sen, [])) 122 | 123 | return sentences 124 | 125 | 126 | class PredictorStore: 127 | 128 | def __init__(self): 129 | 130 | self._predictor = None 131 | self._model_id = None 132 | 133 | def get(self, model_id): 134 | 135 | if model_id is not None: 136 | model = next((m for m in app.config['MODELS'] if m['id'] == int(model_id))) 137 | else: 138 | model = next((m for m in app.config['MODELS'] if m['default'])) 139 | 140 | if self._model_id != model['id']: 141 | 142 | self._predictor = NERPredictor(model_dir=model['model_dir'], 143 | epoch=model['epoch'], 144 | batch_size=app.config['BATCH_SIZE'], 145 | no_cuda=False if not os.environ.get('USE_CUDA') else 146 | os.environ.get('USE_CUDA').lower() == 'false') 147 | self._model_id = model['id'] 148 | 149 | return self._predictor 150 | 151 | 152 | predictor_store = PredictorStore() 153 | 154 | tokenizer = NERTokenizer() 155 | 156 | 157 | def key_prefix(): 158 | return "{}:{}".format(request.path, sha256(str(request.json).encode('utf-8')).hexdigest()) 159 | 160 | 161 | @app.route('/') 162 | def entry(): 163 | return redirect("/index.html", code=302) 164 | 165 | 166 | @app.route('/models') 167 | def get_models(): 168 | return jsonify(app.config['MODELS']) 169 | 170 | 171 | @app.route('/tokenized', methods=['GET', 'POST']) 172 | @cache.cached(key_prefix=key_prefix) 173 | def tokenized(): 174 | 175 | raw_text = request.json['text'] 176 | 177 | sentences = tokenizer.parse_text(raw_text) 178 | 179 | result = [(sen, i) for i, (sen, _) in enumerate(sentences)] 180 | 181 | return jsonify(result) 182 | 183 | 184 | @app.route('/ner-bert-tokens', methods=['GET', 'POST']) 185 | @app.route('/ner-bert-tokens/', methods=['GET', 'POST']) 186 | @cache.cached(key_prefix=key_prefix) 187 | def ner_bert_tokens(model_id=None): 188 | 189 | raw_text = request.json['text'] 190 | 191 | sentences = tokenizer.parse_text(raw_text) 192 | 193 | prediction = predictor_store.get(model_id).classify_text(sentences) 194 | 195 | output = [] 196 | 197 | for tokens, word_predictions in prediction: 198 | 199 | output_sentence = [] 200 | 201 | for token, word_pred in zip(tokens, word_predictions): 202 | 203 | output_sentence.append({'token': html.escape(token), 'prediction': word_pred}) 204 | 205 | output.append(output_sentence) 206 | 207 | return jsonify(output) 208 | 209 | 210 | @app.route('/ner', methods=['GET', 'POST']) 211 
| @app.route('/ner/', methods=['GET', 'POST']) 212 | @cache.cached(key_prefix=key_prefix) 213 | def ner(model_id=None): 214 | 215 | raw_text = request.json['text'] 216 | 217 | sentences = tokenizer.parse_text(raw_text) 218 | 219 | prediction = predictor_store.get(model_id).classify_text(sentences) 220 | 221 | output = [] 222 | 223 | for (tokens, token_predictions), (input_sentence, _) in zip(prediction, sentences): 224 | 225 | output_text = "" 226 | original_text = "".join(input_sentence) 227 | original_word_positions = \ 228 | [pos for positions in [[idx] * len(word) for idx, word in enumerate(input_sentence)] for pos in positions] 229 | 230 | word = '' 231 | word_prediction = 'O' 232 | output_sentence = [] 233 | 234 | for pos, (token, token_prediction) in enumerate(zip(tokens, token_predictions)): 235 | 236 | if not token.startswith('##') and token_prediction == 'X' or token_prediction == '[SEP]': 237 | token_prediction = 'O' 238 | 239 | orig_pos = len(output_text + word) 240 | 241 | # if the current word length is greater than 0 242 | # and its either a word start token (does not start with ##) and not an unknown token or the original text 243 | # positions indicate a word break 244 | if len(word) > 0 and ((not token.startswith('##') and token != '[UNK]') or 245 | (orig_pos > 0 and 246 | original_word_positions[orig_pos-1] != original_word_positions[orig_pos])): 247 | output_sentence.append({'word': word, 'prediction': word_prediction}) 248 | output_text += word 249 | word = '' 250 | word_prediction = 'O' 251 | 252 | if token == '[UNK]': 253 | 254 | orig_pos = len(output_text + word) 255 | 256 | # are we on a word boundary? 257 | if len(word) > 0 and orig_pos > 0 \ 258 | and original_word_positions[orig_pos-1] != original_word_positions[orig_pos]: 259 | 260 | # we are on a word boundary - start a new word ... 261 | output_sentence.append({'word': word, 'prediction': word_prediction}) 262 | output_text += word 263 | word = '' 264 | word_prediction = 'O' 265 | 266 | # get character that corresponds to [UNK] token from original text 267 | token = original_text[orig_pos] 268 | 269 | else: 270 | token = token[2:] if token.startswith('##') else token 271 | 272 | # if the output_text plus the current word and token is not a prefix of the original text, it means, that 273 | # we would miss characters. Therefore we take the missing characters from the original text at the current 274 | # word position 275 | while not original_text.startswith(output_text + word + token) \ 276 | and len(output_text + word) < len(original_text): 277 | 278 | word += original_text[len(output_text + word)] 279 | 280 | orig_pos = len(output_text + word) 281 | 282 | # are we on a word boundary? 283 | if orig_pos > 0 and original_word_positions[orig_pos - 1] != original_word_positions[orig_pos]: 284 | # we are on a word boundary - start a new word ... 
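# Flush the word accumulated so far, together with its aggregated prediction,
# before a new word is started.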
285 | output_sentence.append({'word': word, 'prediction': word_prediction}) 286 | output_text += word 287 | word = '' 288 | word_prediction = 'O' 289 | 290 | word += token 291 | 292 | if token_prediction != 'X': 293 | word_prediction = token_prediction 294 | 295 | if len(word) > 0: 296 | output_text += word 297 | output_sentence.append({'word': word, 'prediction': word_prediction}) 298 | 299 | output.append(output_sentence) 300 | 301 | try: 302 | assert output_text == original_text 303 | except AssertionError: 304 | import ipdb;ipdb.set_trace() 305 | 306 | for output_sentence, (input_sentence, _) in zip(output, sentences): 307 | 308 | try: 309 | assert "".join([pred['word'] for pred in output_sentence]) == "".join(input_sentence) 310 | except AssertionError: 311 | logger.warning('Input and output different!!! \n\n\nInput: {}\n\nOutput: {}\n'. 312 | format("".join(input_sentence).replace(" ", ""), 313 | "".join([pred['word'] for pred in output_sentence]))) 314 | 315 | torch.cuda.empty_cache() 316 | 317 | return jsonify(output) 318 | 319 | 320 | @app.route('/<path:path>') 321 | def send_js(path): 322 | return send_from_directory('static', path) 323 | -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/config-8GB-GPU.json: -------------------------------------------------------------------------------- 1 | { 2 | "BATCH_SIZE": 16, 3 | "MODELS": [ 4 | { 5 | "name": "DC-SBB + CONLL + GERMEVAL", 6 | "id": 1, 7 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned", 8 | "epoch": 7, 9 | "default": true 10 | }, 11 | { 12 | "name": "DC-SBB + CONLL + GERMEVAL + SBB", 13 | "id": 2, 14 | "model_dir": "data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned", 15 | "epoch": 7, 16 | "default": false 17 | }, 18 | { 19 | "name": "DC-SBB + SBB", 20 | "id": 3, 21 | "model_dir": "data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned", 22 | "epoch": 7, 23 | "default": false 24 | }, 25 | { 26 | "name": "CONLL + GERMEVAL", 27 | "id": 4, 28 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-baseline", 29 | "epoch": 7, 30 | "default": false 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "BATCH_SIZE": 256, 3 | "MODELS": [ 4 | { 5 | "name": "DC-SBB + CONLL + GERMEVAL", 6 | "id": 1, 7 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned", 8 | "epoch": 7, 9 | "default": true 10 | }, 11 | { 12 | "name": "DC-SBB + CONLL + GERMEVAL + SBB", 13 | "id": 2, 14 | "model_dir": "data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned", 15 | "epoch": 7, 16 | "default": false 17 | }, 18 | { 19 | "name": "DC-SBB + SBB", 20 | "id": 3, 21 | "model_dir": "data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned", 22 | "epoch": 7, 23 | "default": false 24 | }, 25 | { 26 | "name": "CONLL + GERMEVAL", 27 | "id": 4, 28 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-baseline", 29 | "epoch": 7, 30 | "default": false 31 | }, 32 | { 33 | "name": "MULTILANG", 34 | "id": 5, 35 | "model_dir": "data/BERT/build-wd_0.03/bert-multilang-de-finetuned", 36 | "epoch": 20, 37 | "default": false 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/__init__.py: -------------------------------------------------------------------------------- 1 |
__import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/css/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 12 | NER - Demo 13 | 14 | 15 | 16 |
[The remaining markup of index.html is not recoverable from this dump: the HTML tags were stripped during extraction and only the source line numbers survived. The page is the "NER - Demo" form that ner-demo.js and ner.js operate on; it contains the elements referenced there: the #nerform form with the #task and #model selectors (the latter wrapped in #model_select), the #inputtext field, and the #resultregion and #legende containers.]
76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/js/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/js/ner-demo.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function(){ 2 | 3 | $('#nerform').submit( 4 | function(e){ 5 | e.preventDefault(); 6 | 7 | update(); 8 | } 9 | ); 10 | 11 | $.get( "models") 12 | .done( 13 | function( data ) { 14 | var tmp=""; 15 | $.each(data, 16 | function(index, item){ 17 | 18 | selected="" 19 | if (item.default) { 20 | selected = "selected" 21 | } 22 | 23 | tmp += '' 24 | }); 25 | $('#model').html(tmp); 26 | 27 | var url_params = new URLSearchParams(window.location.search); 28 | 29 | var do_update=false; 30 | 31 | if (url_params.has('text')) { 32 | 33 | var text = decodeURIComponent(url_params.get('text')) 34 | 35 | $('#inputtext').val(text); 36 | 37 | do_update = true; 38 | 39 | window.history.replaceState({}, '', `${location.pathname}`); 40 | } 41 | 42 | task_select() 43 | 44 | if (do_update) update(); 45 | } 46 | ); 47 | }); 48 | 49 | function update() { 50 | 51 | var task = $('#task').val(); 52 | var model_id = $('#model').val(); 53 | var input_text = $('#inputtext').val() 54 | 55 | if (input_text.length < 30000) { 56 | 57 | var url_params = new URLSearchParams(window.location.search); 58 | 59 | url_params.set('text', encodeURIComponent(input_text)) 60 | 61 | window.history.replaceState({}, '', `${location.pathname}?${url_params}`); 62 | } 63 | else { 64 | window.history.replaceState({}, '', `${location.pathname}`); 65 | } 66 | 67 | 68 | 69 | do_task(task, model_id, input_text); 70 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/js/ner.js: -------------------------------------------------------------------------------- 1 | 2 | function task_select() { 3 | 4 | var task = $('#task').val(); 5 | 6 | if ((task != "ner") && (task != "bert-tokens")){ 7 | $('#model_select').hide() 8 | } 9 | else { 10 | $('#model_select').show() 11 | } 12 | 13 | $("#resultregion").html(""); 14 | $("#legende").html(""); 15 | } 16 | 17 | function do_task(task, model_id, input_text) { 18 | 19 | var post_data = { "text" : input_text } 20 | 21 | var text_region_html = 22 | `
23 |
24 | Ergebnis: 25 |
26 |
27 |
28 |
29 |
`; 30 | 31 | var legende_html = 32 | `
33 |
34 | Legende: 35 |
[Person]
36 |
[Ort]
37 |
[Organisation]
38 |
[keine Named Entity]
39 |
40 |
`; 41 | 42 | var spinner_html = 43 | `
44 |
45 | Loading... 46 |
47 |
`; 48 | 49 | $("#legende").html(""); 50 | 51 | if (task == "fulltext") { 52 | $("#resultregion").html(text_region_html) 53 | $("#textregion").html(input_text) 54 | } 55 | else if (task == "tokenize") { 56 | 57 | $("#resultregion").html(spinner_html) 58 | 59 | $.ajax( 60 | { 61 | url: "tokenized", 62 | data: JSON.stringify(post_data), 63 | type: 'POST', 64 | contentType: "application/json", 65 | success: 66 | function( data ) { 67 | text_html = "" 68 | data.forEach( 69 | function(sentence) { 70 | 71 | text_html += JSON.stringify(sentence) 72 | 73 | text_html += '
' 74 | } 75 | ) 76 | $("#resultregion").html(text_region_html) 77 | $("#textregion").html(text_html) 78 | $("#legende").html(legende_html) 79 | } 80 | , 81 | error: 82 | function(error) { 83 | console.log(error); 84 | } 85 | }) 86 | } 87 | else if (task == "ner") { 88 | 89 | $("#resultregion").html(spinner_html) 90 | 91 | $.ajax({ 92 | url: "ner/" + model_id, 93 | data: JSON.stringify(post_data), 94 | type: 'POST', 95 | contentType: "application/json", 96 | success: 97 | function( data ) { 98 | text_html = "" 99 | data.forEach( 100 | function(sentence) { 101 | sentence.forEach( 102 | function(token) { 103 | 104 | if (text_html != "") text_html += ' ' 105 | 106 | if (token.prediction == 'O') 107 | text_html += token.word 108 | else if (token.prediction.endsWith('PER')) 109 | text_html += '' + token.word + '' 110 | else if (token.prediction.endsWith('LOC')) 111 | text_html += '' + token.word + '' 112 | else if (token.prediction.endsWith('ORG')) 113 | text_html += '' + token.word + '' 114 | }) 115 | text_html += '
' 116 | } 117 | ) 118 | $("#resultregion").html(text_region_html) 119 | $("#textregion").html(text_html) 120 | $("#legende").html(legende_html) 121 | } 122 | , 123 | error: function(error) { 124 | console.log(error); 125 | } 126 | }); 127 | } 128 | else if (task == "bert-tokens") { 129 | $("#resultregion").html(spinner_html); 130 | 131 | $.ajax( 132 | { 133 | url: "ner-bert-tokens/" + model_id, 134 | data: JSON.stringify(post_data), 135 | type: 'POST', 136 | contentType: "application/json", 137 | success: 138 | function( data ) { 139 | text_html = "" 140 | data.forEach( 141 | function(sentence) { 142 | sentence.forEach( 143 | function(part) { 144 | 145 | if (text_html != "") text_html += ' ' 146 | 147 | text_html += part.token + "(" + part.prediction + ")" 148 | }) 149 | text_html += '
' 150 | } 151 | ) 152 | $("#resultregion").html(text_region_html) 153 | $("#textregion").html(text_html) 154 | $("#legende").html(legende_html) 155 | } 156 | , 157 | error: 158 | function(error) { 159 | console.log(error); 160 | } 161 | }) 162 | } 163 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/wsgi.py: -------------------------------------------------------------------------------- 1 | from .app import app 2 | import logging 3 | 4 | if __name__ == "__main__": 5 | app.run() 6 | else: 7 | gunicorn_logger = logging.getLogger('gunicorn.error') 8 | app.logger.handlers = gunicorn_logger.handlers 9 | app.logger.setLevel(gunicorn_logger.level) 10 | 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | tqdm 4 | pytorch-pretrained-bert==0.6.2 5 | scikit-learn 6 | click 7 | langid 8 | seqeval 9 | conlleval 10 | toolz 11 | cloudpickle 12 | pytest 13 | pytest-cov 14 | flask 15 | Flask-Caching 16 | gunicorn 17 | somajo 18 | qurator-sbb-utils @ git+https://github.com/qurator-spk/sbb_utils.git 19 | qurator-sbb-tools @ git+https://github.com/qurator-spk/sbb_tools.git 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | from setuptools import find_packages, setup 3 | 4 | with open('requirements.txt') as fp: 5 | install_requires = fp.read() 6 | 7 | setup( 8 | name="qurator-sbb-ner", 9 | version="0.0.1", 10 | author="The Qurator Team", 11 | author_email="qurator@sbb.spk-berlin.de", 12 | description="Qurator", 13 | long_description=open("README.md", "r", encoding='utf-8').read(), 14 | long_description_content_type="text/markdown", 15 | keywords='qurator', 16 | license='Apache', 17 | url="https://qurator.ai", 18 | packages=find_packages(exclude=["*.tests", "*.tests.*", 19 | "tests.*", "tests"]), 20 | package_data={'': ['*.html', '*.js', '*.css', '*.map', '*.png', '*.txt']}, 21 | install_requires=install_requires, 22 | entry_points={ 23 | 'console_scripts': [ 24 | "compile_europeana_historic=qurator.sbb_ner.ground_truth.europeana_historic:main", 25 | "compile_germ_eval=qurator.sbb_ner.ground_truth.germeval:main", 26 | "compile_conll=qurator.sbb_ner.ground_truth.conll:main", 27 | "compile_wikiner=qurator.sbb_ner.ground_truth.wikiner:main", 28 | "join-gt=qurator.sbb_ner.ground_truth.join_gt:main", 29 | "bert-ner=qurator.sbb_ner.models.bert:main", 30 | 31 | "collectcorpus=qurator.sbb_ner.models.corpus:collect", 32 | "bert-pregenerate-trainingdata=qurator.sbb_ner.models.pregenerate_training_data:main", 33 | "bert-finetune=qurator.sbb_ner.models.finetune_on_pregenerated:main" 34 | ] 35 | }, 36 | python_requires='>=3.6.0', 37 | tests_require=['pytest'], 38 | classifiers=[ 39 | 'Intended Audience :: Science/Research', 40 | 'License :: OSI Approved :: Apache Software License', 41 | 'Programming Language :: Python :: 3', 42 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 43 | ], 44 | ) --------------------------------------------------------------------------------
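Usage note (editorial addition, not a file of the repository): the web service defined in qurator/sbb_ner/webapp/app.py can also be called directly instead of through the browser demo. The sketch below is a minimal, hypothetical client; it assumes the Flask app is running on the default port 5000 (as started by the Dockerfile CMD), that the models referenced in the active config file exist under data/, and that the third-party requests package is installed (it is not listed in requirements.txt).

# Minimal client sketch for the sbb_ner web service -- not part of the repository.
# Assumptions: Flask serves on http://localhost:5000 and the configured models are available.
import requests

BASE_URL = "http://localhost:5000"

# GET /models returns the MODELS list from the active config file
# (config.json or config-8GB-GPU.json).
models = requests.get(BASE_URL + "/models").json()
default_model = next(m for m in models if m["default"])

text = "Albrecht Dürer wurde in Nürnberg geboren."

# POST /ner/<model_id> expects a JSON body with a "text" field and returns one list
# per sentence, each element being a dict of the form {"word": ..., "prediction": ...}.
resp = requests.post(BASE_URL + "/ner/{}".format(default_model["id"]), json={"text": text})
resp.raise_for_status()

for sentence in resp.json():
    print(" ".join("{}/{}".format(t["word"], t["prediction"]) for t in sentence))

The /ner-bert-tokens/<model_id> and /tokenized endpoints accept the same request body; the former returns the BERT word-piece tokens with their predicted tags, the latter only the sentence-split tokenization.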