├── .dockerignore ├── .screenshots └── sbb_ner_demo.png ├── Dockerfile ├── Dockerfile.cpu ├── LICENSE ├── Makefile ├── README.md ├── __init__.py ├── doc └── sbb_ner_model_card.md ├── qurator ├── __init__.py └── sbb_ner │ ├── __init__.py │ ├── ground_truth │ ├── __init__.py │ ├── conll.py │ ├── data_processor.py │ ├── europeana_historic.py │ ├── germeval.py │ ├── join_gt.py │ └── wikiner.py │ ├── models │ ├── __init__.py │ ├── bert.py │ ├── corpus.py │ ├── finetune_on_pregenerated.py │ ├── pregenerate_training_data.py │ └── tokenization.py │ └── webapp │ ├── __init__.py │ ├── app.py │ ├── config-8GB-GPU.json │ ├── config.json │ ├── static │ ├── __init__.py │ ├── css │ │ ├── __init__.py │ │ └── bootstrap.min.css │ ├── index.html │ └── js │ │ ├── __init__.py │ │ ├── jquery-3.4.1.js │ │ ├── ner-demo.js │ │ └── ner.js │ └── wsgi.py ├── requirements.txt └── setup.py /.dockerignore: -------------------------------------------------------------------------------- 1 | data/* 2 | *.egg_info 3 | venv 4 | models 5 | *.tar.gz 6 | -------------------------------------------------------------------------------- /.screenshots/sbb_ner_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qurator-spk/sbb_ner/0b943e0eb532291b064b9060c154fb0da3aab371/.screenshots/sbb_ner_demo.png -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base 2 | 3 | ARG http_proxy 4 | ENV http_proxy=$http_proxy 5 | ENV https_proxy=$http_proxy 6 | 7 | RUN apt-get update && \ 8 | apt-get -y install build-essential && \ 9 | apt-get -y install python3-pip && \ 10 | apt-get clean && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY requirements.txt /tmp 13 | RUN pip3 --no-cache-dir install -r /tmp/requirements.txt 14 | 15 | COPY . /usr/src/qurator-sbb-ner 16 | 17 | RUN mkdir -p /usr/src/qurator-sbb-ner/konvens2019 18 | RUN mkdir -p /usr/src/qurator-sbb-ner/digisam 19 | 20 | RUN pip3 --no-cache-dir install -e /usr/src/qurator-sbb-ner 21 | 22 | WORKDIR /usr/src/qurator-sbb-ner 23 | CMD export LANG=C.UTF-8; env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0 24 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM python:3.6-slim-stretch 2 | 3 | ARG http_proxy 4 | ENV http_proxy=$http_proxy 5 | ENV https_proxy=$http_proxy 6 | 7 | RUN apt-get update && \ 8 | apt-get -y install build-essential && \ 9 | apt-get clean && rm -rf /var/lib/apt/lists/* 10 | 11 | COPY requirements.txt /tmp 12 | RUN pip3 --no-cache-dir install -r /tmp/requirements.txt 13 | 14 | COPY . 
/usr/src/qurator-sbb-ner 15 | 16 | RUN mkdir -p /usr/src/qurator-sbb-ner/konvens2019 17 | RUN mkdir -p /usr/src/qurator-sbb-ner/digisam 18 | 19 | RUN pip3 --no-cache-dir install -e /usr/src/qurator-sbb-ner 20 | 21 | WORKDIR /usr/src/qurator-sbb-ner 22 | CMD env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 qurator 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | REPO_PATH ?=$(shell pwd) 2 | 3 | BERT_BASE_PATH ?=$(REPO_PATH)/data/BERT 4 | 5 | BERT_URL ?=https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip 6 | REL_BERT_PATH ?=multi_cased_L-12_H-768_A-12 7 | 8 | #BERT_URL ?=https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip 9 | #REL_BERT_PATH ?=wwm_uncased_L-24_H-1024_A-16 10 | 11 | BERT_MODEL_PATH ?=$(BERT_BASE_PATH)/$(REL_BERT_PATH) 12 | 13 | DIGISAM_PATH ?=$(REPO_PATH)/data/digisam 14 | 15 | REL_FINETUNED_PATH ?=data/digisam/BERT_de_finetuned 16 | #REL_FINETUNED_PATH ?=data/digisam/BERT-large_de_finetuned 17 | BERT_FINETUNED_PATH ?=$(REPO_PATH)/$(REL_FINETUNED_PATH) 18 | 19 | NER_DATA_PATH ?=$(REPO_PATH)/data/NER 20 | 21 | REL_BUILD_PATH ?=data/build 22 | BUILD_PATH ?=$(REPO_PATH)/$(REL_BUILD_PATH) 23 | 24 | EPOCHS ?=1 25 | EPOCH_FILE ?=pytorch_model_ep$(EPOCHS).bin 26 | MODEL_FILE ?=pytorch_model.bin 27 | CROSS_VAL_FILE ?=cross_validation_results.pkl 28 | 29 | WEIGHT_DECAY ?=0.03 30 | WARMUP_PROPORTION ?=0.4 31 | 32 | BATCH_SIZE ?=32 33 | GRAD_ACC_STEPS ?=2 34 | 35 | # BATCH_SIZE ?=128 # <===== unsupervised 36 | # GRAD_ACC_STEPS ?=4 # <===== unsupervised 37 | 38 | MAX_SEQ_LEN ?=128 39 | 40 | # EXTRA_OPTIONS="--dry_run --no_cuda" <- Test if everything works. 41 | EXTRA_OPTIONS ?= 42 | 43 | DO_LOWER_CASE ?= 44 | 45 | BERT_NER_OPTIONS ?=--task_name=ner --max_seq_length=$(MAX_SEQ_LEN) --num_train_epochs=$(EPOCHS) --warmup_proportion=$(WARMUP_PROPORTION) --gradient_accumulation_steps=$(GRAD_ACC_STEPS) --train_batch_size=$(BATCH_SIZE) --gt_file=$(BUILD_PATH)/gt.pkl --weight_decay=$(WEIGHT_DECAY) $(DO_LOWER_CASE) $(EXTRA_OPTIONS) 46 | 47 | BERT_NER_EVAL_OPTIONS ?=--eval_batch_size=8 --task_name=ner --gt_file=$(BUILD_PATH)/gt.pkl $(DO_LOWER_CASE) $(EXTRA_OPTIONS) 48 | 49 | ############################################################################### 50 | # directories 51 | # 52 | 53 | $(BUILD_PATH): 54 | mkdir -p $(BUILD_PATH) 55 | 56 | $(BERT_FINETUNED_PATH): 57 | mkdir -p $(BERT_FINETUNED_PATH) 58 | cp -L $(BERT_MODEL_PATH)/pytorch_model.bin $(BERT_FINETUNED_PATH)/pytorch_model.bin 59 | chmod u+rw $(BERT_FINETUNED_PATH)/pytorch_model.bin 60 | ln -sfn $(BERT_MODEL_PATH)/bert_config.json $(BERT_FINETUNED_PATH)/bert_config.json 61 | ln -sfn $(BERT_MODEL_PATH)/vocab.txt $(BERT_FINETUNED_PATH)/vocab.txt 62 | 63 | dirs: $(BUILD_PATH) $(BERT_FINETUNED_PATH) 64 | 65 | ############################################################################### 66 | # BERT unsupervised on "Digitale Sammlungen": 67 | # 68 | 69 | TEMP_PREFIX ?=/tmp/ 70 | 71 | $(BERT_MODEL_PATH)/bert_model.ckpt.index: 72 | wget -nc --directory-prefix=$(BERT_BASE_PATH) $(BERT_URL) 73 | unzip -d $(BERT_BASE_PATH) $(BERT_MODEL_PATH).zip 74 | 75 | $(BERT_MODEL_PATH)/pytorch_model.bin: $(BERT_MODEL_PATH)/bert_model.ckpt.index 76 | pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch $(BERT_MODEL_PATH)/bert_model.ckpt $(BERT_MODEL_PATH)/bert_config.json $(BERT_MODEL_PATH)/pytorch_model.bin 77 | 78 | $(DIGISAM_PATH)/de_corpus.txt: 79 | collectcorpus $(DIGISAM_PATH)/fulltext.sqlite3 $(DIGISAM_PATH)/selection_de.pkl $(DIGISAM_PATH)/de_corpus.txt --chunksize=10000 80 | 81 | $(BERT_MODEL_PATH)/epoch_0.json: $(DIGISAM_PATH)/de_corpus.txt $(BERT_MODEL_PATH)/pytorch_model.bin 82 | bert-pregenerate-trainingdata --train_corpus $(DIGISAM_PATH)/de_corpus.txt 
--output_dir $(BERT_MODEL_PATH) --bert_model $(BERT_MODEL_PATH) --reduce_memory --epochs $(EPOCHS) 83 | 84 | bert-digisam-unsupervised: $(BERT_MODEL_PATH)/epoch_0.json 85 | bert-finetune --pregenerated_data $(BERT_MODEL_PATH) --output_dir $(BERT_FINETUNED_PATH) --bert_model $(BERT_MODEL_PATH) --reduce_memory --fp16 --gradient_accumulation_steps 4 --train_batch_size 32 --epochs $(EPOCHS) --temp_prefix $(TEMP_PREFIX) 86 | 87 | bert-digisam-unsupervised-continued: $(BERT_FINETUNED_PATH) $(BERT_MODEL_PATH)/epoch_0.json 88 | bert-finetune --pregenerated_data $(BERT_MODEL_PATH) --output_dir $(BERT_FINETUNED_PATH) --bert_model $(BERT_FINETUNED_PATH) --reduce_memory --fp16 --gradient_accumulation_steps=$(GRAD_ACC_STEPS) --train_batch_size=$(BATCH_SIZE) --epochs $(EPOCHS) --temp_prefix $(TEMP_PREFIX) 89 | 90 | get-bert: $(BERT_MODEL_PATH)/bert_model.ckpt.index 91 | 92 | convert-bert: $(BERT_MODEL_PATH)/pytorch_model.bin 93 | 94 | ############################################################################### 95 | #NER ground truth: 96 | 97 | $(NER_DATA_PATH)/ner-corpora: 98 | git clone https://github.com/EuropeanaNewspapers/ner-corpora $(NER_DATA_PATH)/ner-corpora 99 | 100 | $(BUILD_PATH)/europeana_historic.pkl: $(NER_DATA_PATH)/ner-corpora 101 | compile_europeana_historic $(NER_DATA_PATH)/ner-corpora $(BUILD_PATH)/europeana_historic.pkl 102 | 103 | $(BUILD_PATH)/germ_eval.pkl: $(NER_DATA_PATH)/GermEval 104 | compile_germ_eval $(NER_DATA_PATH)/GermEval $(BUILD_PATH)/germ_eval.pkl 105 | 106 | $(BUILD_PATH)/conll2003.pkl: $(NER_DATA_PATH)/conll2003 107 | compile_conll $(NER_DATA_PATH)/conll2003 $(BUILD_PATH)/conll2003.pkl 108 | 109 | $(BUILD_PATH)/wikiner.pkl: $(NER_DATA_PATH)/wikiner 110 | compile_wikiner $(NER_DATA_PATH)/wikiner $(BUILD_PATH)/wikiner.pkl 111 | 112 | $(BUILD_PATH)/gt.pkl: 113 | join-gt $(BUILD_PATH)/germ_eval.pkl $(BUILD_PATH)/europeana_historic.pkl $(BUILD_PATH)/conll2003.pkl $(BUILD_PATH)/wikiner.pkl $(BUILD_PATH)/gt.pkl 114 | 115 | ner-ground-truth: dirs $(BUILD_PATH)/europeana_historic.pkl $(BUILD_PATH)/germ_eval.pkl $(BUILD_PATH)/conll2003.pkl $(BUILD_PATH)/wikiner.pkl $(BUILD_PATH)/gt.pkl 116 | 117 | ############################################################################### 118 | #BERT NER training: 119 | 120 | .PRECIOUS: %/vocab.txt %/bert_config.json %/$(MODEL_FILE) 121 | 122 | %/vocab.txt: 123 | ln -sfnr $(BERT_FINETUNED_PATH)/vocab.txt $(@D)/vocab.txt 124 | 125 | %/bert_config.json: 126 | ln -sfnr $(BERT_FINETUNED_PATH)/bert_config.json $(@D)/bert_config.json 127 | 128 | %/$(MODEL_FILE): 129 | ln -sfnr $(@D)/$(EPOCH_FILE) $(@D)/$(MODEL_FILE) 130 | 131 | ######################################## 132 | # baseline 133 | 134 | $(BUILD_PATH)/bert-conll2003-en-baseline/$(EPOCH_FILE): 135 | bert-ner --train_sets='EN-CONLL-TRAIN' --dev_sets='EN-CONLL-TESTA' --bert_model=bert-base-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 136 | 137 | $(BUILD_PATH)/bert-conll2003-de-baseline/$(EPOCH_FILE): 138 | bert-ner --train_sets='DE-CONLL-TRAIN' --dev_sets='DE-CONLL-TESTA' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 139 | 140 | $(BUILD_PATH)/bert-germ-eval-baseline/$(EPOCH_FILE): 141 | bert-ner --train_sets='GERM-EVAL-TRAIN' --dev_sets='GERM-EVAL-DEV' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 142 | 143 | 144 | $(BUILD_PATH)/bert-all-german-baseline/$(EPOCH_FILE): 145 | bert-ner --train_sets='GERM-EVAL-TRAIN|DE-CONLL-TRAIN' 
--dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 146 | 147 | 148 | $(BUILD_PATH)/bert-wikiner-baseline/$(EPOCH_FILE): 149 | bert-ner --train_sets='WIKINER-WP3' --dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 150 | 151 | 152 | $(BUILD_PATH)/bert-lft-baseline/$(EPOCH_FILE): 153 | bert-ner --train_sets='LFT' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 154 | 155 | $(BUILD_PATH)/bert-onb-baseline/$(EPOCH_FILE): 156 | bert-ner --train_sets='ONB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 157 | 158 | $(BUILD_PATH)/bert-sbb-baseline/$(EPOCH_FILE): 159 | bert-ner --train_sets='SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 160 | 161 | $(BUILD_PATH)/bert-lft-sbb-baseline/$(EPOCH_FILE): 162 | bert-ner --train_sets='LFT|SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 163 | 164 | $(BUILD_PATH)/bert-onb-sbb-baseline/$(EPOCH_FILE): 165 | bert-ner --train_sets='ONB|SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 166 | 167 | $(BUILD_PATH)/bert-onb-lft-baseline/$(EPOCH_FILE): 168 | bert-ner --train_sets='ONB|LFT' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 169 | 170 | 171 | bert-%-baseline: $(BUILD_PATH)/bert-%-baseline/$(EPOCH_FILE) $(BUILD_PATH)/bert-%-baseline/vocab.txt $(BUILD_PATH)/bert-%-baseline/bert_config.json $(BUILD_PATH)/bert-%-baseline/$(MODEL_FILE) ; 172 | 173 | bert-baseline: dirs ner-ground-truth bert-conll2003-en-baseline bert-conll2003-de-baseline bert-germ-eval-baseline bert-all-german-baseline bert-wikiner-baseline bert-lft-baseline bert-onb-baseline bert-sbb-baseline bert-lft-sbb-baseline bert-onb-sbb-baseline bert-onb-lft-baseline 174 | 175 | 176 | $(BUILD_PATH)/bert-lft-baseline/$(CROSS_VAL_FILE): 177 | bert-ner --do_cross_validation --train_sets='LFT' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 178 | 179 | $(BUILD_PATH)/bert-onb-baseline/$(CROSS_VAL_FILE): 180 | bert-ner --do_cross_validation --train_sets='ONB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 181 | 182 | $(BUILD_PATH)/bert-sbb-baseline/$(CROSS_VAL_FILE): 183 | bert-ner --do_cross_validation --train_sets='SBB' --bert_model=bert-base-multilingual-cased --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 184 | 185 | bert-cv-%-baseline: $(BUILD_PATH)/bert-%-baseline/$(CROSS_VAL_FILE) $(BUILD_PATH)/bert-%-baseline/vocab.txt $(BUILD_PATH)/bert-%-baseline/bert_config.json $(BUILD_PATH)/bert-%-baseline/$(MODEL_FILE) ; 186 | 187 | bert-cv-baseline: bert-cv-lft-baseline bert-cv-onb-baseline bert-cv-sbb-baseline 188 | 189 | ######################################## 190 | #de-finetuned 191 | 192 | $(BUILD_PATH)/bert-conll2003-de-finetuned/$(EPOCH_FILE): 193 | bert-ner --train_sets='DE-CONLL-TRAIN' --dev_sets='DE-CONLL-TESTA' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 194 | 195 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/$(EPOCH_FILE): 196 | bert-ner --train_sets='GERM-EVAL-TRAIN' --dev_sets='GERM-EVAL-DEV' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) 
$(BERT_NER_OPTIONS) >> $(@D).log 2<&1 197 | 198 | $(BUILD_PATH)/bert-all-german-de-finetuned/$(EPOCH_FILE): 199 | bert-ner --train_sets='GERM-EVAL-TRAIN|DE-CONLL-TRAIN' --dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 200 | 201 | $(BUILD_PATH)/bert-complete-de-finetuned/$(EPOCH_FILE): 202 | bert-ner --train_sets='GERM-EVAL-TRAIN|GERM-EVAL-DEV|DE-CONLL-TRAIN|DE-CONLL-DEV|SBB|ONB|LFT|DE-CONLL-TESTA|DE-CONLL-TESTB|GERM-EVAL-TEST' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 203 | 204 | $(BUILD_PATH)/bert-multilang-de-finetuned/$(EPOCH_FILE): 205 | bert-ner --train_sets='GERM-EVAL-DEV|GERM-EVAL-TEST|GERM-EVAL-TRAIN|SBB|ONB|LFT|BNF|KB|DE-CONLL-DEV|DE-CONLL-TESTA|DE-CONLL-TESTB|DE-CONLL-TRAIN|EN-CONLL-TESTA|EN-CONLL-TESTB|EN-CONLL-TRAIN' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 206 | 207 | 208 | $(BUILD_PATH)/bert-wikiner-de-finetuned/$(EPOCH_FILE): 209 | bert-ner --train_sets='WIKINER-WP3' --dev_sets='GERM-EVAL-DEV|DE-CONLL-TESTA' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 210 | 211 | 212 | $(BUILD_PATH)/bert-lft-de-finetuned/$(EPOCH_FILE): 213 | bert-ner --train_sets='LFT' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 214 | 215 | $(BUILD_PATH)/bert-onb-de-finetuned/$(EPOCH_FILE): 216 | bert-ner --train_sets='ONB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 217 | 218 | $(BUILD_PATH)/bert-sbb-de-finetuned/$(EPOCH_FILE): 219 | bert-ner --train_sets='SBB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 220 | 221 | $(BUILD_PATH)/bert-lft-sbb-de-finetuned/$(EPOCH_FILE): 222 | bert-ner --train_sets='LFT|SBB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 223 | 224 | $(BUILD_PATH)/bert-onb-sbb-de-finetuned/$(EPOCH_FILE): 225 | bert-ner --train_sets='ONB|SBB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 226 | 227 | $(BUILD_PATH)/bert-onb-lft-de-finetuned/$(EPOCH_FILE): 228 | bert-ner --train_sets='ONB|LFT' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 229 | 230 | 231 | bert-%-de-finetuned: ner-ground-truth $(BUILD_PATH)/bert-%-de-finetuned/$(EPOCH_FILE) $(BUILD_PATH)/bert-%-de-finetuned/vocab.txt $(BUILD_PATH)/bert-%-de-finetuned/bert_config.json $(BUILD_PATH)/bert-%-de-finetuned/$(MODEL_FILE) ; 232 | 233 | bert-finetuned: dirs ner-ground-truth bert-conll2003-de-finetuned bert-germ-eval-de-finetuned bert-all-german-de-finetuned bert-wikiner-de-finetuned bert-lft-de-finetuned bert-onb-de-finetuned bert-sbb-de-finetuned bert-lft-sbb-de-finetuned bert-onb-sbb-de-finetuned bert-onb-lft-de-finetuned 234 | 235 | 236 | $(BUILD_PATH)/bert-lft-de-finetuned/$(CROSS_VAL_FILE): 237 | bert-ner --do_cross_validation --train_sets='LFT' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) # >> $(@D).log 2<&1 238 | 239 | $(BUILD_PATH)/bert-onb-de-finetuned/$(CROSS_VAL_FILE): 240 | bert-ner --do_cross_validation --train_sets='ONB' --bert_model=$(BERT_FINETUNED_PATH) --output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 241 | 242 | $(BUILD_PATH)/bert-sbb-de-finetuned/$(CROSS_VAL_FILE): 243 | bert-ner --do_cross_validation --train_sets='SBB' --bert_model=$(BERT_FINETUNED_PATH) 
--output_dir=$(@D) $(BERT_NER_OPTIONS) >> $(@D).log 2<&1 244 | 245 | bert-cv-%-de-finetuned: $(BUILD_PATH)/bert-%-de-finetuned/$(CROSS_VAL_FILE) $(BUILD_PATH)/bert-%-de-finetuned/vocab.txt $(BUILD_PATH)/bert-%-de-finetuned/bert_config.json $(BUILD_PATH)/bert-%-de-finetuned/$(MODEL_FILE) ; 246 | 247 | bert-cv-de-finetuned: bert-cv-lft-de-finetuned bert-cv-onb-de-finetuned bert-cv-sbb-de-finetuned 248 | 249 | bert-cv: bert-cv-de-finetuned bert-cv-baseline 250 | 251 | bert-train: bert-finetuned bert-baseline 252 | 253 | ############################################################################### 254 | # Evaluation 255 | # 256 | 257 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTA.pkl: 258 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 259 | 260 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTB.pkl: 261 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 262 | 263 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-LFT.pkl: 264 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 265 | 266 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-SBB.pkl: 267 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 268 | 269 | $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-ONB.pkl: 270 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 271 | 272 | # 273 | 274 | 275 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-GERM-EVAL-TEST.pkl: 276 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 277 | 278 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-GERM-EVAL-TEST.pkl: 279 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 280 | 281 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-GERM-EVAL-TEST.pkl: 282 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 283 | 284 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-LFT.pkl: 285 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 286 | 287 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-SBB.pkl: 288 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 289 | 290 | $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-ONB.pkl: 291 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 292 | 293 | # 294 | 295 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTA.pkl: 296 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 297 | 298 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTA.pkl: 299 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 300 | 301 | # 302 | 303 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTB.pkl: 304 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) 
$(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 305 | 306 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTB.pkl: 307 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 308 | 309 | # 310 | 311 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-LFT.pkl: 312 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 313 | 314 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-SBB.pkl: 315 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 316 | 317 | $(BUILD_PATH)/bert-all-german-baseline/eval_results-ONB.pkl: 318 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 319 | 320 | # 321 | 322 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-LFT.pkl: 323 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 324 | 325 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-SBB.pkl: 326 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 327 | 328 | $(BUILD_PATH)/bert-wikiner-baseline/eval_results-ONB.pkl: 329 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 330 | 331 | 332 | $(BUILD_PATH)/bert-lft-baseline/eval_results-ONB.pkl: 333 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 334 | 335 | $(BUILD_PATH)/bert-lft-baseline/eval_results-SBB.pkl: 336 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 337 | 338 | 339 | $(BUILD_PATH)/bert-onb-baseline/eval_results-LFT.pkl: 340 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 341 | 342 | $(BUILD_PATH)/bert-onb-baseline/eval_results-SBB.pkl: 343 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 344 | 345 | 346 | $(BUILD_PATH)/bert-sbb-baseline/eval_results-LFT.pkl: 347 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 348 | 349 | $(BUILD_PATH)/bert-sbb-baseline/eval_results-ONB.pkl: 350 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 351 | 352 | 353 | $(BUILD_PATH)/bert-lft-sbb-baseline/eval_results-ONB.pkl: 354 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 355 | 356 | $(BUILD_PATH)/bert-onb-sbb-baseline/eval_results-LFT.pkl: 357 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 358 | 359 | $(BUILD_PATH)/bert-onb-lft-baseline/eval_results-SBB.pkl: 360 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 361 | 362 | 363 | bert-ner-evaluation-baseline: dirs $(BUILD_PATH)/bert-all-german-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-SBB.pkl 
$(BUILD_PATH)/bert-wikiner-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-all-german-baseline/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-wikiner-baseline/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-lft-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-onb-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-sbb-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-sbb-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-sbb-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-germ-eval-baseline/eval_results-ONB.pkl $(BUILD_PATH)/bert-onb-sbb-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-lft-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-LFT.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-baseline/eval_results-ONB.pkl 364 | 365 | ####################################### 366 | 367 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 368 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 369 | 370 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTB.pkl: 371 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 372 | 373 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-LFT.pkl: 374 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 375 | 376 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-SBB.pkl: 377 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 378 | 379 | $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-ONB.pkl: 380 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 381 | 382 | 383 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-GERM-EVAL-TEST.pkl: 384 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 385 | 386 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-GERM-EVAL-TEST.pkl: 387 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 388 | 389 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-GERM-EVAL-TEST.pkl: 390 | bert-ner --dev_sets='GERM-EVAL-TEST' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 391 | # 392 | 393 | 394 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-LFT.pkl: 395 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 396 | 397 | 
$(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-SBB.pkl: 398 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 399 | 400 | $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-ONB.pkl: 401 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 402 | 403 | # 404 | 405 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 406 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 407 | 408 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 409 | bert-ner --dev_sets='DE-CONLL-TESTA' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 410 | 411 | # 412 | 413 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTB.pkl: 414 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 415 | 416 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTB.pkl: 417 | bert-ner --dev_sets='DE-CONLL-TESTB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 418 | 419 | # 420 | 421 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-LFT.pkl: 422 | bert-ner --dev_sets='LFT' --output_dir=$(BUILD_PATH)/bert-all-german-de-finetuned --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 423 | 424 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-SBB.pkl: 425 | bert-ner --dev_sets='SBB' --output_dir=$(BUILD_PATH)/bert-all-german-de-finetuned --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 426 | 427 | $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-ONB.pkl: 428 | bert-ner --dev_sets='ONB' --output_dir=$(BUILD_PATH)/bert-all-german-de-finetuned --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 429 | 430 | # 431 | 432 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-LFT.pkl: 433 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 434 | 435 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-SBB.pkl: 436 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 437 | 438 | $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-ONB.pkl: 439 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 440 | 441 | 442 | $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-ONB.pkl: 443 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 444 | 445 | $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-SBB.pkl: 446 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 447 | 448 | 449 | $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-LFT.pkl: 450 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 451 | 452 | $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-SBB.pkl: 453 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 454 | 455 | 456 | $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-LFT.pkl: 457 | bert-ner --dev_sets='LFT' --output_dir=$(@D) 
--num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 458 | 459 | $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-ONB.pkl: 460 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 461 | 462 | 463 | $(BUILD_PATH)/bert-lft-sbb-de-finetuned/eval_results-ONB.pkl: 464 | bert-ner --dev_sets='ONB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 465 | 466 | $(BUILD_PATH)/bert-onb-sbb-de-finetuned/eval_results-LFT.pkl: 467 | bert-ner --dev_sets='LFT' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 468 | 469 | $(BUILD_PATH)/bert-onb-lft-de-finetuned/eval_results-SBB.pkl: 470 | bert-ner --dev_sets='SBB' --output_dir=$(@D) --num_train_epochs $(EPOCHS) $(BERT_NER_EVAL_OPTIONS) >> $(@D).log 2<&1 471 | 472 | 473 | bert-ner-evaluation-de-finetuned: dirs $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-all-german-de-finetuned/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-wikiner-de-finetuned/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-GERM-EVAL-TEST.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTA.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-DE-CONLL-TESTB.pkl $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-sbb-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-lft-sbb-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-germ-eval-de-finetuned/eval_results-ONB.pkl $(BUILD_PATH)/bert-onb-sbb-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-onb-lft-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-LFT.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-SBB.pkl $(BUILD_PATH)/bert-conll2003-de-finetuned/eval_results-ONB.pkl 474 | 475 | bert-evaluation: bert-ner-evaluation-baseline bert-ner-evaluation-de-finetuned 476 | 477 | ############################################################################### 478 | #wikipedia 479 | 480 | WIKI_DATA_DIR=data/wikipedia 481 | WP_EPOCH_SIZE=100000 482 | 483 | wikipedia-ner-baseline-train: $(WIKI_DATA_DIR)/wikipedia-tagged.csv 484 | bert-ner --gt_file=$(WIKI_DATA_DIR)/wikipedia-tagged.csv --train_sets=$(WIKI_DATA_DIR)/ner-train-index.pkl --dev_sets=$(WIKI_DATA_DIR)/ner-dev-index.pkl --bert_model=bert-base-multilingual-cased --task_name=wikipedia-ner --output_dir=$(BUILD_PATH)/wikipedia-baseline --num_train_epochs $(EPOCHS) --num_data_epochs=$(EPOCHS) 
--epoch_size=$(WP_EPOCH_SIZE) $(BERT_NER_OPTIONS) 485 | 486 | wikipedia-ner-de-finetuned-train: $(WIKI_DATA_DIR)/wikipedia-tagged.csv 487 | bert-ner --gt_file=$(WIKI_DATA_DIR)/wikipedia-tagged.csv --train_sets=$(WIKI_DATA_DIR)/ner-train-index.pkl --dev_sets=$(WIKI_DATA_DIR)/ner-dev-index.pkl --bert_model=$(DIGISAM_PATH)/BERT_de_finetuned --task_name=wikipedia-ner --output_dir=$(BUILD_PATH)/wikipedia-de-finetuned --num_train_epochs $(EPOCHS) --num_data_epochs=$(EPOCHS) --epoch_size=$(WP_EPOCH_SIZE) $(BERT_NER_OPTIONS) 488 | 489 | ######################## 490 | 491 | $(BUILD_PATH)/wikipedia-baseline/eval_results-LFT.pkl: 492 | bert-ner --dev_sets='LFT' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 493 | 494 | $(BUILD_PATH)/wikipedia-baseline/eval_results-SBB.pkl: 495 | bert-ner --dev_sets='SBB' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 496 | 497 | $(BUILD_PATH)/wikipedia-baseline/eval_results-ONB.pkl: 498 | bert-ner --dev_sets='ONB' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 499 | 500 | $(BUILD_PATH)/wikipedia-baseline/eval_results-DE-CONLL-TESTA.pkl: 501 | bert-ner --dev_sets='DE-CONLL-TESTA' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 502 | 503 | 504 | $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-LFT.pkl: 505 | bert-ner --dev_sets='LFT' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 506 | 507 | $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl: 508 | bert-ner --dev_sets='SBB' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 509 | 510 | $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl: 511 | bert-ner --dev_sets='DE-CONLL-TESTA' --task_name=ner --output_dir=$(@D) $(BERT_NER_EVAL_OPTIONS) 512 | 513 | wikipedia-baseline-evaluation: $(BUILD_PATH)/wikipedia-baseline/eval_results-SBB.pkl $(BUILD_PATH)/wikipedia-baseline/eval_results-LFT.pkl $(BUILD_PATH)/wikipedia-baseline/eval_results-ONB.pkl $(BUILD_PATH)/wikipedia-baseline/eval_results-DE-CONLL-TESTA.pkl 514 | 515 | wikipedia-evaluation: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-LFT.pkl 516 | wikipedia-evaluation2: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl 517 | wikipedia-evaluation3: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl 518 | 519 | ############################### 520 | 521 | model_archive: 522 | tar --exclude='*ep[1-6]*' --exclude='*eval*' --exclude='pytorch_model.bin' --exclude='*.pkl' -chzf models.tar.gz data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-all-german-baseline 523 | 524 | models_from_git_annex: 525 | cd data;git annex get konvens2019/build-wd_0.03/bert-all-german-de-finetuned 526 | cd data;git annex get konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned 527 | cd data;git annex get konvens2019/build-wd_0.03/bert-sbb-de-finetuned 528 | cd data;git annex get konvens2019/build-wd_0.03/bert-all-german-baseline 529 | 530 | 531 | 532 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![sbb-ner-demo example](.screenshots/sbb_ner_demo.png?raw=true) 2 | 3 | How the models have been obtained is described in our [paper](https://corpora.linguistik.uni-erlangen.de/data/konvens/proceedings/papers/KONVENS2019_paper_4.pdf). 
4 | 5 | *** 6 | 7 | # Installation: 8 | 9 | The recommended Python version is 3.11. 10 | Consider using [pyenv](https://github.com/pyenv/pyenv) if that Python version is not available on your system. 11 | 12 | Activate a virtual environment (virtualenv): 13 | ``` 14 | source venv/bin/activate 15 | ``` 16 | or (pyenv): 17 | ``` 18 | pyenv activate my-python-3.11-virtualenv 19 | ``` 20 | 21 | Update pip: 22 | ``` 23 | pip install -U pip 24 | ``` 25 | Install sbb_ner: 26 | ``` 27 | pip install git+https://github.com/qurator-spk/sbb_ner.git 28 | ``` 29 | Download the required models: https://qurator-data.de/sbb_ner/models.tar.gz 30 | 31 | Extract the model archive: 32 | ``` 33 | tar -xzf models.tar.gz 34 | ``` 35 | 36 | Copy the [config file](qurator/sbb_ner/webapp/config.json) into the working directory. 37 | Set the USE_CUDA environment variable to True/False depending on GPU availability. 38 | 39 | Run the webapp directly: 40 | 41 | ``` 42 | env CONFIG=config.json env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True/False flask run --host=0.0.0.0 43 | ``` 44 | 45 | For production purposes, rather use: 46 | ``` 47 | env CONFIG=config.json env USE_CUDA=True/False gunicorn --bind 0.0.0.0:5000 qurator.sbb_ner.webapp.wsgi:app 48 | ``` 49 | 50 | # Docker 51 | 52 | ## CPU-only: 53 | 54 | ``` 55 | docker build --build-arg http_proxy=$http_proxy -t qurator/webapp-ner-cpu -f Dockerfile.cpu . 56 | ``` 57 | 58 | ``` 59 | docker run -ti --rm=true --mount type=bind,source=data/konvens2019,target=/usr/src/qurator-sbb-ner/data/konvens2019 -p 5000:5000 qurator/webapp-ner-cpu 60 | ``` 61 | 62 | ## GPU: 63 | 64 | Make sure that your GPU is correctly set up and that nvidia-docker has been installed. 65 | 66 | ``` 67 | docker build --build-arg http_proxy=$http_proxy -t qurator/webapp-ner-gpu -f Dockerfile . 68 | ``` 69 | 70 | ``` 71 | docker run -ti --rm=true --mount type=bind,source=data/konvens2019,target=/usr/src/qurator-sbb-ner/data/konvens2019 -p 5000:5000 qurator/webapp-ner-gpu 72 | ``` 73 | 74 | The NER web interface is available at http://localhost:5000 . 75 | 76 | # REST Interface 77 | 78 | Get the available models: 79 | ``` 80 | curl http://localhost:5000/models 81 | ``` 82 | 83 | Output: 84 | 85 | ``` 86 | [ 87 | { 88 | "default": true, 89 | "id": 1, 90 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned", 91 | "name": "DC-SBB + CONLL + GERMEVAL" 92 | }, 93 | { 94 | "default": false, 95 | "id": 2, 96 | "model_dir": "data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned", 97 | "name": "DC-SBB + CONLL + GERMEVAL + SBB" 98 | }, 99 | { 100 | "default": false, 101 | "id": 3, 102 | "model_dir": "data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned", 103 | "name": "DC-SBB + SBB" 104 | }, 105 | { 106 | "default": false, 107 | "id": 4, 108 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-baseline", 109 | "name": "CONLL + GERMEVAL" 110 | } 111 | ] 112 | ``` 113 | 114 | Perform NER using model 1: 115 | 116 | ``` 117 | curl -d '{ "text": "Paris Hilton wohnt im Hilton Paris in Paris."
}' -H "Content-Type: application/json" http://localhost:5000/ner/1 118 | ``` 119 | 120 | Output: 121 | 122 | ``` 123 | [ 124 | [ 125 | { 126 | "prediction": "B-PER", 127 | "word": "Paris" 128 | }, 129 | { 130 | "prediction": "I-PER", 131 | "word": "Hilton" 132 | }, 133 | { 134 | "prediction": "O", 135 | "word": "wohnt" 136 | }, 137 | { 138 | "prediction": "O", 139 | "word": "im" 140 | }, 141 | { 142 | "prediction": "B-ORG", 143 | "word": "Hilton" 144 | }, 145 | { 146 | "prediction": "I-ORG", 147 | "word": "Paris" 148 | }, 149 | { 150 | "prediction": "O", 151 | "word": "in" 152 | }, 153 | { 154 | "prediction": "B-LOC", 155 | "word": "Paris" 156 | }, 157 | { 158 | "prediction": "O", 159 | "word": "." 160 | } 161 | ] 162 | ] 163 | ``` 164 | The JSON above is the expected input format of the 165 | [SBB named entity linking and disambiguation system](https://github.com/qurator-spk/sbb_ned). 166 | # Model-Training 167 | 168 | *** 169 | ## Preprocessing of NER ground-truth: 170 | 171 | 172 | ### compile_conll 173 | 174 | Read CONLL 2003 ner ground truth files from directory and 175 | write the outcome of the data parsing to some pandas DataFrame that is 176 | stored as pickle. 177 | 178 | #### Usage 179 | 180 | ``` 181 | compile_conll --help 182 | ``` 183 | 184 | ### compile_germ_eval 185 | 186 | Read germ eval .tsv files from directory and write the 187 | outcome of the data parsing to some pandas DataFrame that is stored as 188 | pickle. 189 | 190 | #### Usage 191 | 192 | ``` 193 | compile_germ_eval --help 194 | ``` 195 | 196 | ### compile_europeana_historic 197 | 198 | Read europeana historic ner ground truth .bio files from directory 199 | and write the outcome of the data parsing to some pandas 200 | DataFrame that is stored as pickle. 201 | 202 | #### Usage 203 | 204 | ``` 205 | compile_europeana_historic --help 206 | ``` 207 | 208 | 209 | ### compile_wikiner 210 | 211 | Read wikiner files from directory and write the outcome 212 | of the data parsing to some pandas DataFrame that is stored as pickle. 213 | 214 | #### Usage 215 | 216 | ``` 217 | compile_wikiner --help 218 | ``` 219 | 220 | *** 221 | ## Train BERT - NER model: 222 | 223 | ### bert-ner 224 | 225 | Perform BERT for NER supervised training and test/cross-validation. 226 | 227 | #### Usage 228 | 229 | ``` 230 | bert-ner --help 231 | ``` 232 | 233 | ## BERT-Pre-training: 234 | 235 | ### collectcorpus 236 | 237 | ``` 238 | collectcorpus --help 239 | 240 | Usage: collectcorpus [OPTIONS] FULLTEXT_FILE SELECTION_FILE CORPUS_FILE 241 | 242 | Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and 243 | write it to one big text file. 244 | 245 | FULLTEXT_FILE: The CSV or SQLITE3 file to read from. 246 | 247 | SELECTION_FILE: Consider only a subset of all pages that is defined by the 248 | DataFrame that is stored in . 249 | 250 | CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata. 251 | 252 | Options: 253 | --chunksize INTEGER Process the corpus in chunks of . 254 | default:10**4 255 | 256 | --processes INTEGER Number of parallel processes. default: 6 257 | --min-line-len INTEGER Lower bound of line length in output file. 258 | default:80 259 | 260 | --help Show this message and exit. 261 | 262 | ``` 263 | 264 | ### bert-pregenerate-trainingdata 265 | 266 | Generate data for BERT pre-training from a corpus text file where 267 | the documents are separated by an empty line (output of corpuscollect). 
268 | 269 | #### Usage 270 | 271 | ``` 272 | bert-pregenerate-trainingdata --help 273 | ``` 274 | 275 | ### bert-finetune 276 | 277 | Perform BERT pre-training on pre-generated data. 278 | 279 | #### Usage 280 | 281 | ``` 282 | bert-finetune --help 283 | ``` 284 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /doc/sbb_ner_model_card.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - pytorch 4 | - token-classification 5 | - sequence-tagger-model 6 | language: de 7 | datasets: 8 | - conll2003 9 | - germeval_14 10 | license: apache-2.0 11 | --- 12 | 13 | 14 | 15 | 16 | 17 | 18 | # Model Card for sbb_ner 19 | 20 | 21 | A BERT model trained on three German corpora containing contemporary and historical texts for named entity recognition tasks. It predicts the classes PER, LOC and ORG. 22 | Questions and comments about the model can be directed to Clemens Neudecker at clemens.neudecker@sbb.spk-berlin.de. 23 | 24 | 25 | 26 | 27 | # Table of Contents 28 | 29 | - [Model Card for sbb_ner](#model-card-for--model_id-) 30 | - [Table of Contents](#table-of-contents) 31 | - [Model Details](#model-details) 32 | - [Model Description](#model-description) 33 | - [Uses](#uses) 34 | - [Direct Use](#direct-use) 35 | - [Downstream Use [Optional]](#downstream-use-optional) 36 | - [Out-of-Scope Use](#out-of-scope-use) 37 | - [Bias, Risks, and Limitations](#bias-risks-and-limitations) 38 | - [Recommendations](#recommendations) 39 | - [Training Details](#training-details) 40 | - [Training Data](#training-data) 41 | - [Training Procedure](#training-procedure) 42 | - [Preprocessing](#preprocessing) 43 | - [Speeds, Sizes, Times](#speeds-sizes-times) 44 | - [Evaluation](#evaluation) 45 | - [Testing Data, Factors & Metrics](#testing-data-factors--metrics) 46 | - [Testing Data](#testing-data) 47 | - [Factors](#factors) 48 | - [Metrics](#metrics) 49 | - [Results](#results) 50 | - [Model Examination](#model-examination) 51 | - [Environmental Impact](#environmental-impact) 52 | - [Technical Specifications [optional]](#technical-specifications-optional) 53 | - [Model Architecture and Objective](#model-architecture-and-objective) 54 | - [Compute Infrastructure](#compute-infrastructure) 55 | - [Hardware](#hardware) 56 | - [Software](#software) 57 | - [Citation](#citation) 58 | - [Glossary [optional]](#glossary-optional) 59 | - [More Information [optional]](#more-information-optional) 60 | - [Model Card Authors [optional]](#model-card-authors-optional) 61 | - [Model Card Contact](#model-card-contact) 62 | - [How to Get Started with the Model](#how-to-get-started-with-the-model) 63 | 64 | 65 | # Model Details 66 | 67 | ## Model Description 68 | 69 | 70 | A BERT model trained on three German corpora containing contemporary and historical texts for named entity recognition tasks. 71 | It predicts the classes PER, LOC and ORG. 
72 | 73 | - **Developed by:** [Kai Labusch](kai.labusch@sbb.spk-berlin.de), [Clemens Neudecker](clemens.neudecker@sbb.spk-berlin.de), David Zellhöfer 74 | - **Shared by [Optional]:** Staatsbibliothek zu Berlin / Berlin State Library 75 | - **Model type:** Language model 76 | - **Language(s) (NLP):** de 77 | - **License:** apache-2.0 78 | - **Parent Model:** The BERT base multilingual cased model as provided by [Google](https://huggingface.co/bert-base-multilingual-cased) 79 | - **Resources for more information:** 80 | - [GitHub Repo](https://github.com/qurator-spk/sbb_ner) 81 | - [Associated Paper](https://konvens.org/proceedings/2019/papers/KONVENS2019_paper_4.pdf) 82 | 83 | # Uses 84 | 85 | 86 | 87 | ## Direct Use 88 | 89 | The model can be used directly to perform NER on historical German texts obtained by OCR from digitized documents. 90 | Supported entity types are PER, LOC and ORG. 91 | 92 | 93 | 94 | 95 | ## Downstream Use 96 | 97 | The model has been pre-trained on 2,300,000 pages of OCR text from the digitized collections of the Berlin State Library. 98 | It is therefore adapted to OCR-error-prone historical German texts and might be used for particular applications that involve such text material. 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | ## Out-of-Scope Use 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | # Bias, Risks, and Limitations 115 | 116 | 117 | 118 | The identification of named entities in historical and contemporary texts contributes to knowledge creation, aiming to enhance scientific research and the discoverability of information in digitized historical texts. The aim of the development of this model was to improve this knowledge creation process, an endeavour that is not for profit. The results of the applied model are freely accessible to the users of the digital collections of the Berlin State Library. Against this backdrop, ethical challenges cannot be identified. As a limitation, it has to be noted that there is considerable performance to be gained for historical text by adding more historical ground-truth data. 119 | 120 | 121 | ## Recommendations 122 | 123 | 124 | 125 | The general observation that historical texts often remain silent about, and avoid naming, subjects from the colonies and address them anonymously cannot be remedied by named entity recognition. Disambiguation of named entities proves to be challenging beyond the task of automatically identifying them. Broad variation in the spelling of person and place names, caused by non-normalized orthography and linguistic change, as well as context-dependent changes in the naming of places, adds to this challenge. Historical texts, especially newspapers, contain narrative descriptions and visual representations of minorities and disadvantaged groups without naming them; de-anonymizing such persons and groups is a research task in itself, which has only begun to be tackled in the 2020s.
126 | 127 | 128 | # Training Details 129 | 130 | ## Training Data 131 | 132 | 133 | 134 | 1) CoNLL 2003 German Named Entity Recognition Ground Truth (Tjong Kim Sang and De Meulder, 2003) 135 | 2) GermEval Konvens 2014 Shared Task Data (Benikova et al., 2014) 136 | 3) DC-SBB Digital Collections of the Berlin State Library (Labusch and Zellhöfer, 2019) 137 | 4) Europeana Newspapers Historic German Datasets (Neudecker, 2016) 138 | 139 | 140 | ## Training Procedure 141 | 142 | 143 | 144 | The BERT model is trained directly on the NER task, using the same method that has been proposed by the BERT authors (Devlin et al., 2018). We applied unsupervised pre-training on 2,333,647 pages of unlabeled historical German text from the Berlin State Library digital collections, and supervised pre-training on two datasets with contemporary German text, conll2003 and germeval_14. Unsupervised and supervised pre-training are combined, with the unsupervised step on the DC-SBB data performed first and the supervised step on contemporary NER ground truth performed second. Performance on different combinations of training and test sets was explored, and a 5-fold cross-validation as well as a comparison with state-of-the-art approaches was conducted. 145 | 146 | ### Preprocessing 147 | 148 | The model was pre-trained on 2,300,000 pages of German text from the digitized collections of the Berlin State Library. 149 | The texts have been obtained by OCR from the page scans of the documents. 150 | 151 | ### Speeds, Sizes, Times 152 | 153 | 154 | 155 | Since the model is an incarnation of the original BERT model published by Google, all speed, size and time considerations of that original model hold. 156 | 157 | # Evaluation 158 | 159 | 160 | The model has been evaluated by 5-fold cross-validation on several German historical OCR ground truth datasets. 161 | See the publication for details. 162 | 163 | ## Testing Data, Factors & Metrics 164 | 165 | ### Testing Data 166 | 167 | 168 | 169 | Two different test sets contained in the CoNLL 2003 German Named Entity Recognition Ground Truth, 170 | i.e. TEST-A and TEST-B, have been used for testing (DE-CoNLL-TEST). 171 | Additionally, historical OCR-based ground truth datasets have been used for testing - see the publication for details. 172 | 173 | 174 | ### Factors 175 | 176 | 177 | 178 | The evaluation focuses on NER in historical German documents; see the publication for details. 179 | 180 | ### Metrics 181 | 182 | 183 | 184 | The performance metrics used in the evaluation are precision, recall and F1-score. 185 | See the paper for the actual results in terms of these metrics. 186 | 187 | ## Results 188 | 189 | See publication. 190 | 191 | # Model Examination 192 | 193 | See publication. 194 | 195 | # Environmental Impact 196 | 197 | 198 | 199 | Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 200 | 201 | - **Hardware Type:** V100 202 | - **Hours used:** Roughly 1-2 weeks for pre-training. Roughly 1 hour for the final NER training. 203 | - **Cloud Provider:** No cloud. 204 | - **Compute Region:** Germany. 205 | - **Carbon Emitted:** More information needed 206 | 207 | # Technical Specifications [optional] 208 | 209 | ## Model Architecture and Objective 210 | 211 | See original BERT publication. 212 | 213 | ## Compute Infrastructure 214 | 215 | Training and pre-training have been performed on a single V100.
216 | 217 | ### Hardware 218 | 219 | See above. 220 | 221 | ### Software 222 | 223 | See published code on github. 224 | 225 | # Citation 226 | 227 | 228 | 229 | **BibTeX:** 230 | 231 | @article{labusch_bert_2019, 232 | title = {{BERT} for {Named} {Entity} {Recognition} in {Contemporary} and {Historical} {German}}, 233 | volume = {Conference on Natural Language Processing}, 234 | url = {https://konvens.org/proceedings/2019/papers/KONVENS2019_paper_4.pdf}, 235 | abstract = {We apply a pre-trained transformer based representational language model, i.e. BERT (Devlin et al., 2018), to named entity recognition (NER) in contemporary and historical German text and observe state of the art performance for both text categories. We further improve the recognition performance for historical German by unsupervised pre-training on a large corpus of historical German texts of the Berlin State Library and show that best performance for historical German is obtained by unsupervised pre-training on historical German plus supervised pre-training with contemporary NER ground-truth.}, 236 | language = {en}, 237 | author = {Labusch, Kai and Neudecker, Clemens and Zellhöfer, David}, 238 | year = {2019}, 239 | pages = {9}, 240 | } 241 | 242 | **APA:** 243 | 244 | (Labusch et al., 2019) 245 | 246 | # Glossary [optional] 247 | 248 | 249 | 250 | More information needed 251 | 252 | # More Information [optional] 253 | 254 | More information needed 255 | 256 | # Model Card Authors [optional] 257 | 258 | 259 | 260 | [Kai Labusch](kai.labusch@sbb.spk-berlin.de) and [Jörg Lehmann](joerg.lehmann@sbb.spk-berlin.de) 261 | 262 | 263 | # Model Card Contact 264 | 265 | Questions and comments about the model can be directed to Clemens Neudecker at clemens.neudecker@sbb.spk-berlin.de, 266 | questions and comments about the model card can be directed to Jörg Lehmann at joerg.lehmann@sbb.spk-berlin.de 267 | 268 | # How to Get Started with the Model 269 | 270 | Use the code below to get started with the model. 271 | 272 |
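Below is a minimal sketch of loading a fine-tuned model directly in Python, mirroring the loading and prediction code in `qurator/sbb_ner/models/bert.py` (`model_eval` / `model_predict`). It is illustrative only: the directory name `my_model_dir`, the epoch number `7` and the example sentence are placeholders; an actual model directory written by `bert-ner` contains `model_config.json`, the BERT config file and per-epoch weight files named `pytorch_model_ep<N>.bin`.

```
import json
import os
import torch

from pytorch_pretrained_bert.modeling import BertConfig, BertForTokenClassification, CONFIG_NAME

from qurator.sbb_ner.models.tokenization import BertTokenizer
from qurator.sbb_ner.models.bert import get_device, model_predict
from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, InputExample

model_dir = "my_model_dir"  # placeholder: directory written by bert-ner
epoch = 7                   # placeholder: pick one of the stored training epochs

device, n_gpu = get_device(no_cuda=True)  # force CPU for this sketch

# model_config.json stores the BERT variant, casing, sequence length and label map
# (written by model_train in qurator/sbb_ner/models/bert.py).
model_config = json.load(open(os.path.join(model_dir, "model_config.json"), "r"))

label_to_id = model_config['label_map']
id_to_label = {v: k for k, v in label_to_id.items()}

tokenizer = BertTokenizer.from_pretrained(model_config['bert_model'],
                                          do_lower_case=model_config['do_lower'])

config = BertConfig(os.path.join(model_dir, CONFIG_NAME))
model = BertForTokenClassification(config, num_labels=len(label_to_id))
model.load_state_dict(torch.load(os.path.join(model_dir, "pytorch_model_ep{}.bin".format(epoch)),
                                 map_location=lambda storage, loc: storage))
model.to(device)
model.eval()

# Wrap a pre-tokenized sentence as an InputExample; the dummy 'O' labels are only
# needed because the feature conversion expects one label per word.
words = ["Paris", "Hilton", "wohnt", "im", "Hilton", "Paris", "in", "Paris", "."]
example = InputExample(guid="demo-0", text_a=words, text_b=None, label=["O"] * len(words))

data_loader = NerProcessor.make_data_loader([example], batch_size=8, local_rank=-1,
                                            label_map=label_to_id,
                                            max_seq_length=model_config['max_seq_length'],
                                            tokenizer=tokenizer, sequential=True)

predictions = model_predict(data_loader, device, id_to_label, model)

# model_predict yields one tag per BERT word piece, so the output can be longer than
# the input word list when words are split; realignment with the original words may
# still be required.
print(predictions[0])
```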
273 | How to get started with this model is explained in the README file of the GitHub repository [over here](https://github.com/qurator-spk/sbb_ner). 274 |
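In addition to the command-line workflow described there, a running instance of the Flask demo can be queried programmatically. The sketch below is an illustration only: it posts to the `/ner/1` endpoint on port 5000 shown in the README's curl example; the name of the `text` field in the request body is an assumption about the demo's request schema, while the response format is the nested list of `word`/`prediction` pairs documented in the README.

```
import requests

# Assumes the demo web service from the README is running locally on port 5000.
url = "http://localhost:5000/ner/1"

# The 'text' field name is an assumption about the request schema of the demo app.
payload = {"text": "Paris Hilton wohnt im Hilton Paris in Paris."}

response = requests.post(url, json=payload)  # sends Content-Type: application/json
response.raise_for_status()

# The service answers with one list of {"word", "prediction"} objects per sentence.
for sentence in response.json():
    for token in sentence:
        print(token["word"], token["prediction"])
```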
275 | -------------------------------------------------------------------------------- /qurator/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) 2 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/conll.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import codecs 4 | import os 5 | 6 | 7 | def read_gt(files, datasets): 8 | sentence_number = 300000 9 | gt_data = list() 10 | 11 | for filename, dataset in zip(files, datasets): 12 | gt_lines = [l.strip() for l in codecs.open(filename, 'r', 'latin-1')] 13 | 14 | word_number = 0 15 | 16 | for li in gt_lines: 17 | 18 | if li == '': 19 | 20 | if word_number > 0: 21 | 22 | sentence_number += 1 23 | word_number = 0 24 | 25 | continue 26 | 27 | if li.startswith('-DOCSTART-'): 28 | continue 29 | 30 | parts = li.split() 31 | 32 | if len(parts) == 5: 33 | word, _, _, _, tag = li.split() 34 | else: 35 | word, _, _, tag = li.split() 36 | 37 | tag = tag.upper() 38 | tag = tag.replace('_', '-') 39 | tag = tag.replace('.', '-') 40 | 41 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 42 | tag = 'O' 43 | 44 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 45 | 46 | word_number += 1 47 | 48 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 49 | 50 | 51 | @click.command() 52 | @click.argument('path-to-conll', type=click.Path(exists=True), required=True, nargs=1) 53 | @click.argument('conll-ground-truth-file', type=click.Path(), required=True, nargs=1) 54 | def main(path_to_conll, conll_ground_truth_file): 55 | """ 56 | Read CONLL 2003 ner ground truth files from directory and 57 | write the outcome of the data parsing to some pandas DataFrame 58 | that is stored as pickle in file . 
59 | """ 60 | 61 | os.makedirs(os.path.dirname(conll_ground_truth_file), exist_ok=True) 62 | 63 | gt_all = read_gt(['{}/deu.dev'.format(path_to_conll), 64 | '{}/deu.testa'.format(path_to_conll), 65 | '{}/deu.testb'.format(path_to_conll), 66 | '{}/deu.train'.format(path_to_conll), 67 | '{}/eng.testa'.format(path_to_conll), 68 | '{}/eng.testb'.format(path_to_conll), 69 | '{}/eng.train'.format(path_to_conll)], 70 | ['DE-CONLL-DEV', 'DE-CONLL-TESTA', 'DE-CONLL-TESTB', 'DE-CONLL-TRAIN', 71 | 'EN-CONLL-TESTA', 'EN-CONLL-TESTB', 'EN-CONLL-TRAIN']) 72 | 73 | gt_all.to_pickle(conll_ground_truth_file) 74 | 75 | 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/data_processor.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import json 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import torch 10 | 11 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 12 | TensorDataset, Dataset) 13 | from torch.utils.data.distributed import DistributedSampler 14 | 15 | 16 | class InputExample(object): 17 | """A single training/test example for simple sequence classification.""" 18 | 19 | def __init__(self, guid, text_a, text_b=None, label=None): 20 | """Constructs a InputExample. 21 | 22 | Args: 23 | guid: Unique id for the example. 24 | text_a: string. The untokenized text of the first sequence. For single 25 | sequence tasks, only this sequence must be specified. 26 | text_b: (Optional) string. The untokenized text of the second sequence. 27 | Only must be specified for sequence pair tasks. 28 | label: (Optional) string. The label of the example. This should be 29 | specified for train and dev examples, but not for test examples. 
30 | """ 31 | self.guid = guid 32 | self.text_a = text_a 33 | self.text_b = text_b 34 | self.label = label 35 | 36 | 37 | class InputFeatures(object): 38 | """A single set of features of data.""" 39 | 40 | def __init__(self, guid, input_ids, input_mask, segment_ids, label_id, tokens): 41 | self.guid = guid 42 | self.input_ids = input_ids 43 | self.input_mask = input_mask 44 | self.segment_ids = segment_ids 45 | self.label_id = label_id 46 | self.tokens = tokens 47 | 48 | 49 | class WikipediaDataset(Dataset): 50 | """ 51 | """ 52 | 53 | def __init__(self, set_file, gt_file, data_epochs, epoch_size, 54 | label_map, tokenizer, max_seq_length, 55 | queue_size=1000, no_entity_fraction=0.0, seed=23, 56 | min_sen_len=10, min_article_len=20): 57 | 58 | self._set_file = set_file 59 | self._subset = pd.read_pickle(set_file) 60 | self._gt_file = gt_file 61 | self._data_epochs = data_epochs 62 | self._epoch_size = epoch_size 63 | self._label_map = label_map 64 | self._tokenizer = tokenizer 65 | self._max_seq_length = max_seq_length 66 | self._queue_size = queue_size 67 | self._no_entity_fraction = no_entity_fraction 68 | self._seed = seed 69 | self._min_sen_len = min_sen_len 70 | self._min_article_len = min_article_len 71 | 72 | self._queue = None 73 | self._data_sequence = None 74 | self._counter = None 75 | # noinspection PyUnresolvedReferences 76 | self._random_state = np.random.RandomState(seed=self._seed) 77 | 78 | self._features = [] 79 | 80 | self._reset() 81 | 82 | return 83 | 84 | def _next_sample_should_have_entities(self): 85 | 86 | if self._no_entity_fraction <= 0.0: 87 | return True 88 | 89 | return int(self._counter) % int(1.0 / self._no_entity_fraction) != 0 90 | 91 | def _get_features(self): 92 | 93 | if self._counter > self._data_epochs * self._epoch_size: 94 | self._reset() 95 | 96 | while True: 97 | 98 | # get next random sentence 99 | sen_words, sen_tags = self._queue_next() 100 | 101 | if len(sen_words) < self._min_sen_len: # Skip all sentences that are to short. 
102 | continue 103 | 104 | if self._has_entities(sen_tags): 105 | 106 | if not self._next_sample_should_have_entities(): # Skip sample if next sample is supposed to 107 | # be a no-entity sample 108 | continue 109 | else: 110 | if self._next_sample_should_have_entities(): # Skip sample if next sample is supposed to be a entity 111 | # sample 112 | continue 113 | break 114 | 115 | sample = InputExample(guid="%s-%s" % (self._set_file, self._counter), 116 | text_a=sen_words, text_b=None, label=sen_tags) 117 | 118 | return [fe for fe in 119 | convert_examples_to_features(sample, self._label_map, self._max_seq_length, self._tokenizer)] 120 | 121 | def __getitem__(self, index): 122 | 123 | del index 124 | 125 | if len(self._features) == 0: 126 | self._features = self._get_features() 127 | 128 | fe = self._features.pop() 129 | 130 | self._counter += 1 131 | 132 | return torch.tensor(fe.input_ids, dtype=torch.long), \ 133 | torch.tensor(fe.input_mask, dtype=torch.long), \ 134 | torch.tensor(fe.segment_ids, dtype=torch.long), \ 135 | torch.tensor(fe.label_id, dtype=torch.long) 136 | 137 | def __len__(self): 138 | 139 | return int(self._epoch_size) 140 | 141 | def _reset(self): 142 | 143 | # print('================= WikipediaDataset:_reset ====================== ') 144 | 145 | self._queue = list() 146 | self._data_sequence = self._sequence() 147 | self._counter = 0 148 | # noinspection PyUnresolvedReferences 149 | # self._random_state = np.random.RandomState(seed=self._seed) 150 | 151 | for _ in range(0, self._queue_size): 152 | self._queue.append(list()) 153 | 154 | def _sequence(self): 155 | 156 | while True: 157 | 158 | for row in pd.read_csv(self._gt_file, chunksize=1, sep=';'): 159 | 160 | page_id = row.page_id.iloc[0] 161 | text = row.text.iloc[0] 162 | tags = row.tags.iloc[0] 163 | 164 | if page_id not in self._subset.index: 165 | continue 166 | 167 | sentences = [(sen_text, sen_tag) for sen_text, sen_tag in zip(json.loads(text), json.loads(tags))] 168 | 169 | if len(sentences) < self._min_article_len: # Skip very short articles. 
170 | continue 171 | 172 | print(page_id) 173 | 174 | yield sentences 175 | 176 | def _queue_next(self): 177 | 178 | nqueue = self._random_state.randint(len(self._queue)) 179 | 180 | while len(self._queue[nqueue]) <= 0: 181 | self._queue[nqueue] = next(self._data_sequence) 182 | 183 | return self._queue[nqueue].pop() 184 | 185 | @staticmethod 186 | def _has_entities(sen_tags): 187 | 188 | for t in sen_tags: 189 | 190 | if t != 'O': 191 | return True 192 | 193 | return False 194 | 195 | 196 | class DataProcessor(object): 197 | """Base class for data converters for sequence classification data sets.""" 198 | 199 | def get_train_examples(self, batch_size, local_rank): 200 | """Gets a collection of `InputExample`s for the train set.""" 201 | raise NotImplementedError() 202 | 203 | def get_dev_examples(self, batch_size, local_rank): 204 | """Gets a collection of `InputExample`s for the dev set.""" 205 | raise NotImplementedError() 206 | 207 | def get_labels(self): 208 | """Gets the list of labels for this data set.""" 209 | raise NotImplementedError() 210 | 211 | def get_evaluation_file(self): 212 | raise NotImplementedError() 213 | 214 | 215 | class WikipediaNerProcessor(DataProcessor): 216 | 217 | def __init__(self, train_sets, dev_sets, test_sets, gt_file, max_seq_length, tokenizer, 218 | data_epochs, epoch_size, **kwargs): 219 | del kwargs 220 | 221 | self._max_seq_length = max_seq_length 222 | self._tokenizer = tokenizer 223 | self._train_set_file = train_sets 224 | self._dev_set_file = dev_sets 225 | self._test_set_file = test_sets 226 | self._gt_file = gt_file 227 | self._data_epochs = data_epochs 228 | self._epoch_size = epoch_size 229 | 230 | def get_train_examples(self, batch_size, local_rank): 231 | """See base class.""" 232 | 233 | return self._make_data_loader(self._train_set_file, batch_size, local_rank) 234 | 235 | def get_dev_examples(self, batch_size, local_rank): 236 | """See base class.""" 237 | 238 | return self._make_data_loader(self._dev_set_file, batch_size, local_rank) 239 | 240 | def get_labels(self): 241 | """See base class.""" 242 | 243 | labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "X", "[CLS]", "[SEP]"] 244 | 245 | return {label: i for i, label in enumerate(labels)} 246 | 247 | def get_evaluation_file(self): 248 | dev_set_name = os.path.splitext(os.path.basename(self._dev_set_file))[0] 249 | 250 | return "eval_results-{}.pkl".format(dev_set_name) 251 | 252 | def _make_data_loader(self, set_file, batch_size, local_rank): 253 | del local_rank 254 | 255 | data = WikipediaDataset(set_file=set_file, gt_file=self._gt_file, 256 | data_epochs=self._data_epochs, epoch_size=self._epoch_size, 257 | label_map=self.get_labels(), tokenizer=self._tokenizer, 258 | max_seq_length=self._max_seq_length) 259 | 260 | sampler = SequentialSampler(data) 261 | 262 | return DataLoader(data, sampler=sampler, batch_size=batch_size) 263 | 264 | 265 | class NerProcessor(DataProcessor): 266 | 267 | def __init__(self, train_sets, dev_sets, test_sets, max_seq_length, tokenizer, 268 | label_map=None, gt=None, gt_file=None, **kwargs): 269 | 270 | del kwargs 271 | 272 | self._max_seg_length = max_seq_length 273 | self._tokenizer = tokenizer 274 | self._train_sets = set(train_sets.split('|')) if train_sets is not None else set() 275 | self._dev_sets = set(dev_sets.split('|')) if dev_sets is not None else set() 276 | self._test_sets = set(test_sets.split('|')) if test_sets is not None else set() 277 | 278 | self._gt = gt 279 | 280 | if self._gt is None: 281 | self._gt = 
pd.read_pickle(gt_file) 282 | 283 | self._label_map = label_map 284 | 285 | print('TRAIN SETS: ', train_sets) 286 | print('DEV SETS: ', dev_sets) 287 | print('TEST SETS: ', test_sets) 288 | 289 | def get_train_examples(self, batch_size, local_rank): 290 | """See base class.""" 291 | 292 | return self.make_data_loader( 293 | self.create_examples(self._read_lines(self._train_sets), "train"), batch_size, local_rank, 294 | self.get_labels(), self._max_seg_length, self._tokenizer) 295 | 296 | def get_dev_examples(self, batch_size, local_rank): 297 | """See base class.""" 298 | return self.make_data_loader( 299 | self.create_examples(self._read_lines(self._dev_sets), "dev"), batch_size, local_rank, 300 | self.get_labels(), self._max_seg_length, self._tokenizer) 301 | 302 | def get_labels(self): 303 | """See base class.""" 304 | 305 | if self._label_map is not None: 306 | return self._label_map 307 | 308 | gt = self._gt 309 | gt = gt.loc[gt.dataset.isin(self._train_sets.union(self._dev_sets).union(self._test_sets))] 310 | 311 | labels = sorted(gt.tag.unique().tolist()) + ["X", "[CLS]", "[SEP]"] 312 | 313 | self._label_map = {label: i for i, label in enumerate(labels, 1)} 314 | 315 | self._label_map['UNK'] = 0 316 | 317 | return self._label_map 318 | 319 | def get_evaluation_file(self): 320 | 321 | return "eval_results-{}.pkl".format("-".join(sorted(self._dev_sets))) 322 | 323 | @staticmethod 324 | def create_examples(lines, set_type): 325 | 326 | for i, (sentence, label) in enumerate(lines): 327 | guid = "%s-%s" % (set_type, i) 328 | text_a = sentence 329 | text_b = None 330 | label = label 331 | 332 | yield InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label) 333 | 334 | @staticmethod 335 | def make_data_loader(examples, batch_size, local_rank, label_map, max_seq_length, tokenizer, features=None, 336 | sequential=False): 337 | 338 | if features is None: 339 | features = [fe for ex in examples for fe in 340 | convert_examples_to_features(ex, label_map, max_seq_length, tokenizer)] 341 | 342 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 343 | all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) 344 | all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) 345 | all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) 346 | 347 | data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) 348 | 349 | if local_rank == -1: 350 | if sequential: 351 | train_sampler = SequentialSampler(data) 352 | else: 353 | train_sampler = RandomSampler(data) 354 | else: 355 | if sequential: 356 | train_sampler = SequentialSampler(data) 357 | else: 358 | train_sampler = DistributedSampler(data) 359 | 360 | return DataLoader(data, sampler=train_sampler, batch_size=batch_size) 361 | 362 | def _read_lines(self, sets): 363 | 364 | gt = self._gt 365 | gt = gt.loc[gt.dataset.isin(sets)] 366 | 367 | data = list() 368 | for i, sent in gt.groupby('nsentence'): 369 | 370 | sent = sent.sort_values('nword', ascending=True) 371 | 372 | data.append((sent.word.tolist(), sent.tag.tolist())) 373 | 374 | return data 375 | 376 | 377 | def convert_examples_to_features(example, label_map, max_seq_len, tokenizer): 378 | """ 379 | :param example: instance of InputExample 380 | :param label_map: Maps labels like B-ORG ... to numbers (ids). 381 | :param max_seq_len: Maximum length of sequences to be delivered to the model. 
382 | :param tokenizer: BERT-Tokenizer 383 | :return: 384 | """ 385 | tokens = [] 386 | labels = [] 387 | 388 | for i, word in enumerate(example.text_a): # example.text_a is a sequence of words 389 | 390 | token = tokenizer.tokenize(word) 391 | 392 | # import ipdb;ipdb.set_trace() 393 | 394 | tokens.extend(token) 395 | 396 | label_1 = example.label[i] if i < len(example.label) else 'O' 397 | 398 | for m in range(len(token)): # a word might have been split into several tokens 399 | if m == 0: 400 | labels.append(label_1) 401 | else: 402 | labels.append("X") 403 | 404 | start_pos = 0 405 | while start_pos < len(tokens): 406 | 407 | window_len = min(max_seq_len - 2, len(tokens) - start_pos) # -2 since we also need [CLS] and [SEP] 408 | 409 | # Make sure that we do not split the sentence within a word. 410 | while window_len > 1 and start_pos + window_len < len(tokens) and\ 411 | tokens[start_pos + window_len].startswith('##'): 412 | window_len -= 1 413 | 414 | if window_len == 1: 415 | window_len = min(max_seq_len - 2, len(tokens) - start_pos) 416 | 417 | token_window = tokens[start_pos:start_pos+window_len] 418 | start_pos += window_len 419 | 420 | augmented_tokens = ["[CLS]"] + token_window + ["[SEP]"] 421 | 422 | input_ids = tokenizer.convert_tokens_to_ids(augmented_tokens) + max(0, max_seq_len - len(augmented_tokens))*[0] 423 | 424 | input_mask = [1] * len(augmented_tokens) + max(0, max_seq_len - len(augmented_tokens))*[0] 425 | 426 | segment_ids = [0] + len(token_window) * [0] + [0] + max(0, max_seq_len - len(augmented_tokens))*[0] 427 | 428 | label_ids = [label_map["[CLS]"]] + [label_map[labels[i]] for i in range(len(token_window))] + \ 429 | [label_map["[SEP]"]] + max(0, max_seq_len - len(augmented_tokens)) * [0] 430 | 431 | assert len(input_ids) == max_seq_len 432 | assert len(input_mask) == max_seq_len 433 | assert len(segment_ids) == max_seq_len 434 | assert len(label_ids) == max_seq_len 435 | 436 | yield InputFeatures(guid=example.guid, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, 437 | label_id=label_ids, tokens=augmented_tokens) 438 | 439 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/europeana_historic.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import click 4 | import os 5 | 6 | 7 | def read_gt(files, datasets): 8 | sentence_number = 100000 9 | sentence = '' 10 | gt_data = list() 11 | 12 | for filename, dataset in zip(files, datasets): 13 | gt_lines = [l.strip() for l in open(filename) if not l.startswith('<--')] 14 | 15 | word_number = 0 16 | 17 | for l in gt_lines: 18 | 19 | try: 20 | word, tag = l.split(' ') 21 | except ValueError: 22 | word = l.replace(' ', '_') 23 | tag = 'O' 24 | 25 | tag = tag.upper() 26 | 27 | tag = tag.replace('_', '-') 28 | tag = tag.replace('.', '-') 29 | 30 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 31 | tag = 'O' 32 | 33 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 34 | 35 | if re.match(r'.*[.|?|!]$', word) \ 36 | and not re.match(r'[0-9]+[.]$', word) \ 37 | and not re.match(r'.*[0-9]+\s*$', sentence)\ 38 | and not re.match(r'.*\s+[\S]{1,2}$', sentence): 39 | 40 | sentence_number += 1 41 | sentence = '' 42 | word_number = 0 43 | else: 44 | word_number += 1 45 | sentence += ' ' + word 46 | 47 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 48 | 49 | 50 | @click.command() 51 | 
@click.argument('path-to-ner-corpora', type=click.Path(exists=True), required=True, nargs=1) 52 | @click.argument('ner-ground-truth-file', type=click.Path(), required=True, nargs=1) 53 | def main(path_to_ner_corpora, ner_ground_truth_file): 54 | """ 55 | Read europeana historic ner ground truth .bio files from directory and 56 | write the outcome of the data parsing to some pandas DataFrame 57 | that is stored as pickle in file . 58 | """ 59 | 60 | os.makedirs(os.path.dirname(ner_ground_truth_file), exist_ok=True) 61 | 62 | gt_all = read_gt(['{}/enp_DE.sbb.bio/enp_DE.sbb.bio'.format(path_to_ner_corpora), 63 | '{}/enp_DE.onb.bio/enp_DE.onb.bio'.format(path_to_ner_corpora), 64 | '{}/enp_DE.lft.bio/enp_DE.lft.bio'.format(path_to_ner_corpora), 65 | '{}/enp_FR.bnf.bio/enp_FR.bnf.bio'.format(path_to_ner_corpora), 66 | '{}/enp_NL.kb.bio/enp_NL.kb.bio'.format(path_to_ner_corpora)], 67 | ['SBB', 'ONB', 'LFT', 'BNF', 'KB']) 68 | 69 | gt_all.to_pickle(ner_ground_truth_file) 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/germeval.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import os 4 | 5 | 6 | def read_gt(files, datasets): 7 | sentence_number = 200000 8 | gt_data = list() 9 | 10 | for filename, dataset in zip(files, datasets): 11 | gt_lines = [l.strip() for l in open(filename)] 12 | 13 | word_number = 0 14 | 15 | for li in gt_lines: 16 | 17 | if li == '': 18 | 19 | if word_number > 0: 20 | sentence_number += 1 21 | word_number = 0 22 | 23 | continue 24 | 25 | if li.startswith('#'): 26 | continue 27 | 28 | _, word, tag, _ = li.split() 29 | 30 | tag = tag.upper() 31 | tag = tag.replace('_', '-') 32 | tag = tag.replace('.', '-') 33 | 34 | if len(tag) > 5: 35 | tag = tag[0:5] 36 | 37 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 38 | tag = 'O' 39 | 40 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 41 | 42 | word_number += 1 43 | 44 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 45 | 46 | 47 | @click.command() 48 | @click.argument('path-to-germ-eval', type=click.Path(exists=True), required=True, nargs=1) 49 | @click.argument('germ-eval-ground-truth-file', type=click.Path(), required=True, nargs=1) 50 | def main(path_to_germ_eval, germ_eval_ground_truth_file): 51 | """ 52 | Read germ eval .tsv files from directory and 53 | write the outcome of the data parsing to some pandas DataFrame 54 | that is stored as pickle in file . 
55 | """ 56 | 57 | os.makedirs(os.path.dirname(germ_eval_ground_truth_file), exist_ok=True) 58 | 59 | gt_all = read_gt(['{}/NER-de-dev.tsv'.format(path_to_germ_eval), 60 | '{}/NER-de-test.tsv'.format(path_to_germ_eval), 61 | '{}/NER-de-train.tsv'.format(path_to_germ_eval)], 62 | ['GERM-EVAL-DEV', 'GERM-EVAL-TEST', 'GERM-EVAL-TRAIN']) 63 | 64 | gt_all.to_pickle(germ_eval_ground_truth_file) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/join_gt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import os 4 | 5 | 6 | @click.command() 7 | @click.argument('files', nargs=-1, type=click.Path()) 8 | def main(files): 9 | """ 10 | Join multiple pandas DataFrame pickles of NER ground-truth into one big file. 11 | """ 12 | 13 | assert(len(files) > 1) 14 | 15 | gt = list() 16 | 17 | for filename in files[:-1]: 18 | 19 | gt.append(pd.read_pickle(filename)) 20 | 21 | gt = pd.concat(gt, axis=0) 22 | 23 | os.makedirs(os.path.dirname(files[-1]), exist_ok=True) 24 | 25 | gt.to_pickle(files[-1]) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /qurator/sbb_ner/ground_truth/wikiner.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import click 3 | import os 4 | 5 | 6 | def read_gt(files, datasets): 7 | 8 | sentence_number = 1000000 9 | gt_data = list() 10 | 11 | for filename, dataset in zip(files, datasets): 12 | 13 | for li in open(filename, encoding='iso-8859-1'): 14 | 15 | li = li.strip() 16 | 17 | parts = li.split(' ') 18 | 19 | prev_tag = 'O' 20 | for word_number, pa in enumerate(parts): 21 | 22 | if len(pa) == 0: 23 | continue 24 | 25 | word, pos, tag = pa.split('|') 26 | 27 | tag = tag.upper() 28 | tag = tag.replace('_', '-') 29 | tag = tag.replace('.', '-') 30 | 31 | if len(tag) > 5: 32 | tag = tag[0:5] 33 | 34 | if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}: 35 | tag = 'O' 36 | 37 | if tag.startswith('I') and prev_tag == 'O': 38 | tag = 'B' + tag[1:] 39 | 40 | prev_tag = tag 41 | gt_data.append((sentence_number, word_number, word, tag, dataset)) 42 | 43 | sentence_number += 1 44 | 45 | return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset']) 46 | 47 | 48 | @click.command() 49 | @click.argument('path-to-wikiner', type=click.Path(exists=True), required=True, nargs=1) 50 | @click.argument('wikiner-ground-truth-file', type=click.Path(), required=True, nargs=1) 51 | def main(path_to_wikiner, wikiner_ground_truth_file): 52 | """ 53 | Read wikiner files from directory and 54 | write the outcome of the data parsing to some pandas DataFrame 55 | that is stored as pickle in file . 
56 | """ 57 | 58 | os.makedirs(os.path.dirname(wikiner_ground_truth_file), exist_ok=True) 59 | 60 | gt_all = read_gt(['{}/aij-wikiner-de-wp2'.format(path_to_wikiner), 61 | '{}/aij-wikiner-de-wp3'.format(path_to_wikiner)], 62 | ['WIKINER-WP2', 'WIKINER-WP3']) 63 | 64 | gt_all.to_pickle(wikiner_ground_truth_file) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/models/bert.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | # from inspect import currentframe 3 | 4 | import argparse 5 | import logging 6 | import os 7 | import random 8 | import json 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 16 | from pytorch_pretrained_bert.modeling import (CONFIG_NAME, # WEIGHTS_NAME, 17 | BertConfig, 18 | BertForTokenClassification) 19 | from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule 20 | # from pytorch_pretrained_bert.tokenization import BertTokenizer 21 | from .tokenization import BertTokenizer 22 | 23 | 24 | from conlleval import evaluate as conll_eval 25 | 26 | from tqdm import tqdm, trange 27 | 28 | from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, WikipediaNerProcessor 29 | 30 | from sklearn.model_selection import GroupKFold 31 | 32 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 33 | datefmt='%m/%d/%Y %H:%M:%S', 34 | level=logging.INFO) 35 | logger = logging.getLogger(__name__) 36 | 37 | 38 | def model_train(bert_model, max_seq_length, do_lower_case, 39 | num_train_epochs, train_batch_size, gradient_accumulation_steps, 40 | learning_rate, weight_decay, loss_scale, warmup_proportion, 41 | processor, device, n_gpu, fp16, cache_dir, local_rank, 42 | dry_run, no_cuda, output_dir=None): 43 | 44 | label_map = processor.get_labels() 45 | 46 | if gradient_accumulation_steps < 1: 47 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 48 | gradient_accumulation_steps)) 49 | 50 | train_batch_size = train_batch_size // gradient_accumulation_steps 51 | 52 | train_dataloader = processor.get_train_examples(train_batch_size, local_rank) 53 | 54 | # Batch sampler divides by batch_size! 
55 | num_train_optimization_steps = int(len(train_dataloader)*num_train_epochs/gradient_accumulation_steps) 56 | 57 | if local_rank != -1: 58 | num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() 59 | 60 | # Prepare model 61 | cache_dir = cache_dir if cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 62 | 'distributed_{}'.format(local_rank)) 63 | 64 | model = BertForTokenClassification.from_pretrained(bert_model, cache_dir=cache_dir, num_labels=len(label_map)) 65 | 66 | if fp16: 67 | model.half() 68 | 69 | model.to(device) 70 | 71 | if local_rank != -1: 72 | try: 73 | from apex.parallel import DistributedDataParallel as DDP 74 | except ImportError: 75 | raise ImportError( 76 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 77 | 78 | model = DDP(model) 79 | elif n_gpu > 1: 80 | model = torch.nn.DataParallel(model) 81 | 82 | param_optimizer = list(model.named_parameters()) 83 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 84 | optimizer_grouped_parameters = [ 85 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 86 | 'weight_decay': weight_decay}, 87 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 88 | ] 89 | 90 | if fp16: 91 | try: 92 | from apex.optimizers import FP16_Optimizer 93 | from apex.optimizers import FusedAdam 94 | except ImportError: 95 | raise ImportError( 96 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 97 | 98 | optimizer = FusedAdam(optimizer_grouped_parameters, 99 | lr=learning_rate, 100 | bias_correction=False, 101 | max_grad_norm=1.0) 102 | if loss_scale == 0: 103 | optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 104 | else: 105 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale) 106 | 107 | warmup_linear = WarmupLinearSchedule(warmup=warmup_proportion, t_total=num_train_optimization_steps) 108 | else: 109 | optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, 110 | t_total=num_train_optimization_steps) 111 | warmup_linear = None 112 | 113 | global_step = 0 114 | logger.info("***** Running training *****") 115 | logger.info(" Num examples = %d", len(train_dataloader)) 116 | logger.info(" Batch size = %d", train_batch_size) 117 | logger.info(" Num steps = %d", num_train_optimization_steps) 118 | logger.info(" Num epochs = %d", num_train_epochs) 119 | 120 | model_config = {"bert_model": bert_model, "do_lower": do_lower_case, 121 | "max_seq_length": max_seq_length, "label_map": label_map} 122 | 123 | def save_model(lh): 124 | 125 | if output_dir is None: 126 | return 127 | 128 | output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(ep)) 129 | 130 | # Save a trained model and the associated configuration 131 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 132 | 133 | torch.save(model_to_save.state_dict(), output_model_file) 134 | 135 | output_config_file = os.path.join(output_dir, CONFIG_NAME) 136 | with open(output_config_file, 'w') as f: 137 | f.write(model_to_save.config.to_json_string()) 138 | 139 | json.dump(model_config, open(os.path.join(output_dir, "model_config.json"), "w")) 140 | 141 | lh = pd.DataFrame(lh, columns=['global_step', 'loss']) 142 | 143 | loss_history_file = os.path.join(output_dir, "loss_ep{}.pkl".format(ep)) 144 | 145 | 
lh.to_pickle(loss_history_file) 146 | 147 | def load_model(epoch): 148 | 149 | if output_dir is None: 150 | 151 | return False 152 | 153 | output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(epoch)) 154 | 155 | if not os.path.exists(output_model_file): 156 | 157 | return False 158 | 159 | logger.info("Loading epoch {} from disk...".format(epoch)) 160 | model.load_state_dict(torch.load(output_model_file, 161 | map_location=lambda storage, loc: storage if no_cuda else None)) 162 | return True 163 | 164 | model.train() 165 | for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"): 166 | 167 | if dry_run and ep > 1: 168 | logger.info("Dry run. Stop.") 169 | break 170 | 171 | if load_model(ep): 172 | global_step += len(train_dataloader) // gradient_accumulation_steps 173 | continue 174 | 175 | loss_history = list() 176 | tr_loss = 0 177 | nb_tr_examples, nb_tr_steps = 0, 0 178 | with tqdm(total=len(train_dataloader), desc=f"Epoch {ep}") as pbar: 179 | 180 | for step, batch in enumerate(train_dataloader): 181 | 182 | batch = tuple(t.to(device) for t in batch) 183 | 184 | input_ids, input_mask, segment_ids, label_ids = batch 185 | 186 | loss = model(input_ids, segment_ids, input_mask, label_ids) 187 | 188 | if n_gpu > 1: 189 | loss = loss.mean() # mean() to average on multi-gpu. 190 | if gradient_accumulation_steps > 1: 191 | loss = loss / gradient_accumulation_steps 192 | 193 | if fp16: 194 | optimizer.backward(loss) 195 | else: 196 | loss.backward() 197 | 198 | loss_history.append((global_step, loss.item())) 199 | 200 | tr_loss += loss.item() 201 | nb_tr_examples += input_ids.size(0) 202 | nb_tr_steps += 1 203 | pbar.update(1) 204 | mean_loss = tr_loss * gradient_accumulation_steps / nb_tr_steps 205 | pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") 206 | 207 | if dry_run and len(loss_history) > 2: 208 | logger.info("Dry run. Stop.") 209 | break 210 | 211 | if (step + 1) % gradient_accumulation_steps == 0: 212 | if fp16: 213 | # modify learning rate with special warm up BERT uses 214 | # if args.fp16 is False, BertAdam is used that handles this automatically 215 | lr_this_step = learning_rate * warmup_linear.get_lr(global_step, warmup_proportion) 216 | 217 | for param_group in optimizer.param_groups: 218 | param_group['lr'] = lr_this_step 219 | 220 | optimizer.step() 221 | optimizer.zero_grad() 222 | global_step += 1 223 | 224 | save_model(loss_history) 225 | 226 | return model, model_config 227 | 228 | 229 | def model_eval(batch_size, label_map, processor, device, num_train_epochs=1, output_dir=None, model=None, 230 | local_rank=-1, no_cuda=False, dry_run=False): 231 | 232 | output_eval_file = None 233 | if output_dir is not None: 234 | output_eval_file = os.path.join(output_dir, processor.get_evaluation_file()) 235 | logger.info('Write evaluation results to: {}'.format(output_eval_file)) 236 | 237 | dataloader = processor.get_dev_examples(batch_size, local_rank) 238 | 239 | logger.info("***** Running evaluation *****") 240 | logger.info(" Num examples = %d", len(dataloader)) 241 | logger.info(" Batch size = %d", batch_size) 242 | 243 | results = list() 244 | 245 | output_config_file = None 246 | if output_dir is not None: 247 | output_config_file = os.path.join(output_dir, CONFIG_NAME) 248 | 249 | for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"): 250 | 251 | if dry_run and ep > 1: 252 | logger.info("Dry run. 
Stop.") 253 | break 254 | 255 | if output_config_file is not None: 256 | # Load a trained model and config that you have fine-tuned 257 | output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(ep)) 258 | 259 | if not os.path.exists(output_model_file): 260 | logger.info("Stopping at epoch {} since model file is missing.".format(ep)) 261 | break 262 | 263 | config = BertConfig(output_config_file) 264 | model = BertForTokenClassification(config, num_labels=len(label_map)) 265 | model.load_state_dict(torch.load(output_model_file, 266 | map_location=lambda storage, loc: storage if no_cuda else None)) 267 | model.to(device) 268 | 269 | if model is None: 270 | raise ValueError('Model required for evaluation.') 271 | 272 | model.eval() 273 | 274 | y_pred, y_true = model_predict_compare(dataloader, device, label_map, model, dry_run) 275 | 276 | lines = ['empty ' + 'XXX ' + v + ' ' + p for yt, yp in zip(y_true, y_pred) for v, p in zip(yt, yp)] 277 | 278 | res = conll_eval(lines) 279 | 280 | # print(res) 281 | 282 | evals = \ 283 | pd.concat([pd.DataFrame.from_dict(res['overall']['evals'], orient='index', columns=['ALL']), 284 | pd.DataFrame.from_dict(res['slots']['LOC']['evals'], orient='index', columns=['LOC']), 285 | pd.DataFrame.from_dict(res['slots']['PER']['evals'], orient='index', columns=['PER']), 286 | pd.DataFrame.from_dict(res['slots']['ORG']['evals'], orient='index', columns=['ORG']), 287 | ], axis=1).T 288 | 289 | stats = \ 290 | pd.concat( 291 | [pd.DataFrame.from_dict(res['overall']['stats'], orient='index', columns=['ALL']), 292 | pd.DataFrame.from_dict(res['slots']['LOC']['stats'], orient='index', columns=['LOC']), 293 | pd.DataFrame.from_dict(res['slots']['PER']['stats'], orient='index', columns=['PER']), 294 | pd.DataFrame.from_dict(res['slots']['ORG']['stats'], orient='index', columns=['ORG'])], 295 | axis=1, sort=True).T 296 | 297 | evals['epoch'] = ep 298 | stats['epoch'] = ep 299 | 300 | results.append(pd.concat([evals.reset_index().set_index(['index', 'epoch']), 301 | stats.reset_index().set_index(['index', 'epoch'])], axis=1)) 302 | 303 | if output_eval_file is not None: 304 | pd.concat(results).to_pickle(output_eval_file) 305 | 306 | results = pd.concat(results) 307 | print(results) 308 | 309 | return results 310 | 311 | 312 | def model_predict_compare(dataloader, device, label_map, model, dry_run=False): 313 | 314 | y_true = [] 315 | y_pred = [] 316 | covered = set() 317 | for input_ids, input_mask, segment_ids, label_ids in tqdm(dataloader, desc="Evaluating"): 318 | input_ids = input_ids.to(device) 319 | input_mask = input_mask.to(device) 320 | segment_ids = segment_ids.to(device) 321 | label_ids = label_ids.to(device) 322 | 323 | with torch.no_grad(): 324 | logits = model(input_ids, segment_ids, input_mask) 325 | 326 | logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) 327 | logits = logits.detach().cpu().numpy() 328 | label_ids = label_ids.to('cpu').numpy() 329 | input_mask = input_mask.to('cpu').numpy() 330 | 331 | for i, mask in enumerate(input_mask): 332 | temp_1 = [] 333 | temp_2 = [] 334 | for j, m in enumerate(mask): 335 | if j == 0: 336 | continue 337 | if m: 338 | if label_map[label_ids[i][j]] != "X": 339 | temp_1.append(label_map[label_ids[i][j]]) 340 | temp_2.append(label_map[logits[i][j]]) 341 | else: 342 | temp_1.pop() 343 | temp_2.pop() 344 | y_true.append(temp_1) 345 | y_pred.append(temp_2) 346 | 347 | covered = covered.union(set(temp_1)) 348 | break 349 | 350 | if dry_run: 351 | 352 | if 'I-LOC' not in covered: 353 | continue 
354 | if 'I-ORG' not in covered: 355 | continue 356 | if 'I-PER' not in covered: 357 | continue 358 | 359 | break 360 | return y_pred, y_true 361 | 362 | 363 | def model_predict(dataloader, device, label_map, model): 364 | 365 | y_pred = [] 366 | for input_ids, input_mask, segment_ids, label_ids in dataloader: 367 | input_ids = input_ids.to(device) 368 | input_mask = input_mask.to(device) 369 | segment_ids = segment_ids.to(device) 370 | 371 | with torch.no_grad(): 372 | logits = model(input_ids, segment_ids, input_mask) 373 | 374 | logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) 375 | logits = logits.detach().cpu().numpy() 376 | input_mask = input_mask.to('cpu').numpy() 377 | 378 | for i, mask in enumerate(input_mask): 379 | temp_2 = [] 380 | for j, m in enumerate(mask): 381 | if j == 0: # skip first token since its [CLS] 382 | continue 383 | if m: 384 | temp_2.append(label_map[logits[i][j]]) 385 | else: 386 | temp_2.pop() # skip last token since its [SEP] 387 | y_pred.append(temp_2) 388 | break 389 | else: 390 | temp_2.pop() # skip last token since its [SEP] 391 | y_pred.append(temp_2) 392 | 393 | return y_pred 394 | 395 | 396 | def get_device(local_rank=-1, no_cuda=False): 397 | if local_rank == -1 or no_cuda: 398 | device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu") 399 | n_gpu = torch.cuda.device_count() 400 | else: 401 | torch.cuda.set_device(local_rank) 402 | device = torch.device("cuda", local_rank) 403 | n_gpu = 1 404 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 405 | torch.distributed.init_process_group(backend='nccl') 406 | return device, n_gpu 407 | 408 | 409 | def main(): 410 | 411 | parser = get_arg_parser() 412 | 413 | args = parser.parse_args() 414 | 415 | do_eval = len(args.dev_sets) > 0 and not args.do_cross_validation 416 | do_train = len(args.train_sets) > 0 and not args.do_cross_validation 417 | 418 | device, n_gpu = get_device(args.local_rank, args.no_cuda) 419 | 420 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 421 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 422 | 423 | random.seed(args.seed) 424 | np.random.seed(args.seed) 425 | torch.manual_seed(args.seed) 426 | 427 | if not do_train and not do_eval and not args.do_cross_validation: 428 | raise ValueError("At least one of `do_train` or `do_eval` must be True.") 429 | 430 | if not os.path.exists(args.output_dir): 431 | os.makedirs(args.output_dir) 432 | 433 | task_name = args.task_name.lower() 434 | 435 | processors = {"ner": NerProcessor, "wikipedia-ner": WikipediaNerProcessor} 436 | 437 | if task_name not in processors: 438 | raise ValueError("Task not found: %s" % task_name) 439 | 440 | if args.do_cross_validation: 441 | 442 | cross_val_result_file = "cross_validation_results.pkl" 443 | 444 | cross_val_result_file = os.path.join(args.output_dir, cross_val_result_file) 445 | 446 | sets = set(args.train_sets.split('|')) if args.train_sets is not None else set() 447 | 448 | gt = pd.read_pickle(args.gt_file) 449 | 450 | gt = gt.loc[gt.dataset.isin(sets)] 451 | 452 | k_fold = GroupKFold(n_splits=args.n_splits) 453 | 454 | eval_results = list() 455 | 456 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 457 | 458 | for ep in range(1, int(args.num_train_epochs) + 1): 459 | 460 | for sp, (train, test) in enumerate(k_fold.split(X=gt, groups=gt.nsentence)): 461 | 462 | tr = gt.iloc[train].copy() 463 | te = gt.iloc[test].copy() 464 | 
465 | tr['dataset'] = 'TRAIN' 466 | te['dataset'] = 'TEST' 467 | 468 | gt_tmp = pd.concat([tr, te]) 469 | 470 | processor = \ 471 | processors[task_name](train_sets='TRAIN', dev_sets='TEST', test_sets='TEST', 472 | gt=gt_tmp, max_seq_length=args.max_seq_length, 473 | tokenizer=tokenizer, data_epochs=args.num_data_epochs, 474 | epoch_size=args.epoch_size) 475 | 476 | model, model_config = \ 477 | model_train(bert_model=args.bert_model, max_seq_length=args.max_seq_length, 478 | do_lower_case=args.do_lower_case, num_train_epochs=ep, 479 | train_batch_size=args.train_batch_size, 480 | gradient_accumulation_steps=args.gradient_accumulation_steps, 481 | learning_rate=args.learning_rate, weight_decay=args.weight_decay, 482 | loss_scale=args.loss_scale, warmup_proportion=args.warmup_proportion, 483 | processor=processor, device=device, n_gpu=n_gpu, fp16=args.fp16, 484 | cache_dir=args.cache_dir, local_rank=args.local_rank, dry_run=args.dry_run, 485 | no_cuda=args.no_cuda) 486 | 487 | label_map = {v: k for k, v in model_config['label_map'].items()} 488 | 489 | eval_result =\ 490 | model_eval(model=model, label_map=label_map, processor=processor, device=device, 491 | batch_size=args.eval_batch_size, local_rank=args.local_rank, 492 | no_cuda=args.no_cuda, dry_run=args.dry_run).reset_index() 493 | 494 | eval_result['split'] = sp 495 | eval_result['epoch'] = ep 496 | eval_results.append(eval_result) 497 | 498 | del model # release CUDA memory 499 | 500 | pd.concat(eval_results).to_pickle(cross_val_result_file) 501 | 502 | if do_train: 503 | 504 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 505 | 506 | processor = \ 507 | processors[task_name](train_sets=args.train_sets, dev_sets=args.dev_sets, test_sets=args.test_sets, 508 | gt_file=args.gt_file, max_seq_length=args.max_seq_length, 509 | tokenizer=tokenizer, data_epochs=args.num_data_epochs, 510 | epoch_size=args.epoch_size) 511 | 512 | model_train(bert_model=args.bert_model, output_dir=args.output_dir, max_seq_length=args.max_seq_length, 513 | do_lower_case=args.do_lower_case, num_train_epochs=args.num_train_epochs, 514 | train_batch_size=args.train_batch_size, 515 | gradient_accumulation_steps=args.gradient_accumulation_steps, 516 | learning_rate=args.learning_rate, weight_decay=args.weight_decay, loss_scale=args.loss_scale, 517 | warmup_proportion=args.warmup_proportion, processor=processor, device=device, n_gpu=n_gpu, 518 | fp16=args.fp16, cache_dir=args.cache_dir, local_rank=args.local_rank, dry_run=args.dry_run, 519 | no_cuda=args.no_cuda) 520 | 521 | if do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): 522 | 523 | model_config = json.load(open(os.path.join(args.output_dir, "model_config.json"), "r")) 524 | 525 | label_to_id = model_config['label_map'] 526 | 527 | label_map = {v: k for k, v in model_config['label_map'].items()} 528 | 529 | tokenizer = BertTokenizer.from_pretrained(model_config['bert_model'], 530 | do_lower_case=model_config['do_lower']) 531 | 532 | processor = \ 533 | processors[task_name](train_sets=None, dev_sets=args.dev_sets, test_sets=args.test_sets, 534 | gt_file=args.gt_file, max_seq_length=model_config['max_seq_length'], 535 | tokenizer=tokenizer, data_epochs=args.num_data_epochs, 536 | epoch_size=args.epoch_size, label_map=label_to_id) 537 | 538 | model_eval(label_map=label_map, processor=processor, device=device, num_train_epochs=args.num_train_epochs, 539 | output_dir=args.output_dir, batch_size=args.eval_batch_size, local_rank=args.local_rank, 
540 | no_cuda=args.no_cuda, dry_run=args.dry_run) 541 | 542 | 543 | def get_arg_parser(): 544 | 545 | parser = argparse.ArgumentParser() 546 | 547 | 548 | parser.add_argument("--gt_file", 549 | default=None, 550 | type=str, 551 | required=True, 552 | help="The pickle file that contains all NER ground truth as pandas DataFrame." 553 | " Required columns: ['nsentence', 'nword', 'word', 'tag', 'dataset]." 554 | " The selection of training, test and dev set is performed on the 'dataset' column.") 555 | 556 | parser.add_argument("--train_sets", 557 | default='', 558 | type=str, 559 | required=False, 560 | help="Specifiy one or more tags from the dataset column in order to mark samples" 561 | " that belong to the training set. Example: 'GERM-EVAL-TRAIN|DE-CONLL-TRAIN'. ") 562 | 563 | parser.add_argument("--dev_sets", 564 | default='', 565 | type=str, 566 | required=False, 567 | help="Specifiy one or more tags from the dataset column in order to mark samples" 568 | " that belong to the dev set. Example: 'GERM-EVAL-DEV|DE-CONLL-TESTA'. ") 569 | 570 | parser.add_argument("--test_sets", 571 | default='', 572 | type=str, 573 | required=False, 574 | help="Specifiy one or more tags from the dataset column in order to mark samples" 575 | " that belong to the test set. Example: 'GERM-EVAL-TEST|DE-CONLL-TESTB'. ") 576 | 577 | parser.add_argument("--bert_model", default=None, type=str, required=False, 578 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 579 | "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " 580 | "bert-base-multilingual-cased, bert-base-chinese.") 581 | 582 | parser.add_argument("--task_name", 583 | default=None, 584 | type=str, 585 | required=True, 586 | help="The name of the task to train.") 587 | 588 | parser.add_argument("--output_dir", 589 | default=None, 590 | type=str, 591 | required=False, 592 | help="The output directory where the model predictions and checkpoints will be written.") 593 | 594 | # Other parameters 595 | parser.add_argument("--cache_dir", 596 | default="", 597 | type=str, 598 | help="Where do you want to store the pre-trained models downloaded from s3") 599 | 600 | parser.add_argument("--max_seq_length", 601 | default=128, 602 | type=int, 603 | help="The maximum total input sequence length after WordPiece tokenization. 
\n" 604 | "Sequences longer than this will be truncated, and sequences shorter \n" 605 | "than this will be padded.") 606 | 607 | parser.add_argument("--do_lower_case", 608 | action='store_true', 609 | help="Set this flag if you are using an uncased model.") 610 | 611 | parser.add_argument("--train_batch_size", 612 | default=32, 613 | type=int, 614 | help="Total batch size for training.") 615 | 616 | parser.add_argument("--eval_batch_size", 617 | default=8, 618 | type=int, 619 | help="Total batch size for eval.") 620 | 621 | parser.add_argument("--learning_rate", 622 | default=3e-5, 623 | type=float, 624 | help="The initial learning rate for Adam.") 625 | 626 | parser.add_argument("--weight_decay", 627 | default=0.01, 628 | type=float, 629 | help="Weight decay for Adam.") 630 | 631 | parser.add_argument("--num_train_epochs", 632 | default=3.0, 633 | type=float, 634 | help="Total number of training epochs to perform/evaluate.") 635 | 636 | parser.add_argument("--num_data_epochs", 637 | default=1.0, 638 | type=float, 639 | help="Re-cycle data after num_data_epochs.") 640 | 641 | parser.add_argument("--epoch_size", 642 | default=10000, 643 | type=float, 644 | help="Size of one epoch.") 645 | 646 | parser.add_argument("--do_cross_validation", 647 | action='store_true', 648 | help="Do cross-validation.") 649 | 650 | parser.add_argument("--n_splits", 651 | default=5, 652 | type=int, 653 | help="Number of folds in cross_validation.") 654 | 655 | parser.add_argument("--warmup_proportion", 656 | default=0.1, 657 | type=float, 658 | help="Proportion of training to perform linear learning rate warmup for. " 659 | "E.g., 0.1 = 10%% of training.") 660 | 661 | parser.add_argument("--no_cuda", 662 | action='store_true', 663 | help="Whether not to use CUDA when available") 664 | 665 | parser.add_argument("--dry_run", 666 | action='store_true', 667 | help="Test mode.") 668 | 669 | parser.add_argument("--local_rank", 670 | type=int, 671 | default=-1, 672 | help="local_rank for distributed training on gpus") 673 | 674 | parser.add_argument('--seed', 675 | type=int, 676 | default=42, 677 | help="random seed for initialization") 678 | 679 | parser.add_argument('--gradient_accumulation_steps', 680 | type=int, 681 | default=1, 682 | help="Number of updates steps to accumulate before performing a backward/update pass.") 683 | 684 | parser.add_argument('--fp16', 685 | action='store_true', 686 | help="Whether to use 16-bit float precision instead of 32-bit") 687 | 688 | parser.add_argument('--loss_scale', 689 | type=float, default=0, 690 | help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" 691 | "0 (default value): dynamic loss scaling.\n" 692 | "Positive power of 2: static loss scaling value.\n") 693 | return parser 694 | 695 | 696 | if __name__ == "__main__": 697 | main() 698 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/corpus.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | from tqdm import tqdm as tqdm 4 | import click 5 | import codecs 6 | import os 7 | import sqlite3 8 | 9 | from qurator.utils.parallel import run as prun 10 | 11 | 12 | class ChunkTask: 13 | 14 | selection = None 15 | 16 | def __init__(self, chunk, min_line_len): 17 | 18 | self._chunk = chunk 19 | self._min_line_len = min_line_len 20 | 21 | def __call__(self, *args, **kwargs): 22 | 23 | return ChunkTask.reformat_chunk(self._chunk, self._min_line_len) 24 | 25 | @staticmethod 26 | def reformat_chunk(chunk, min_line_len): 27 | """ 28 | Process a chunk of documents. 29 | 30 | :param chunk: pandas DataFrame that contains one document per row. 31 | :param min_line_len: Break the document text up in lines that have this minimum length. 32 | :return: One big text where the documents are separated by an empty line. 33 | """ 34 | 35 | text = '' 36 | 37 | for i, r in chunk.iterrows(): 38 | 39 | if type(r.text) != str: 40 | continue 41 | 42 | ppn = r.ppn if str(r.ppn).startswith('PPN') else 'PPN' + r.ppn 43 | 44 | filename = str(r['file name']) 45 | 46 | if not ChunkTask.selection.loc[(ppn, filename)].selected.iloc[0]: 47 | continue 48 | 49 | for se in sentence_split(str(r.text), min_line_len): 50 | 51 | text += se 52 | 53 | text += '\n\n' 54 | 55 | return text 56 | 57 | @staticmethod 58 | def initialize(selection_file): 59 | 60 | ChunkTask.selection = \ 61 | pd.read_pickle(selection_file).\ 62 | reset_index().\ 63 | set_index(['ppn', 'filename']).\ 64 | sort_index() 65 | 66 | 67 | def get_csv_chunks(alto_csv_file, chunksize): 68 | 69 | for ch in tqdm(pd.read_csv(alto_csv_file, chunksize=chunksize)): 70 | 71 | yield ch 72 | 73 | 74 | def get_sqlite_chunks(alto_sqlite_file, chunksize): 75 | 76 | yield pd.DataFrame() 77 | 78 | with sqlite3.connect(alto_sqlite_file) as conn: 79 | 80 | conn.execute('pragma journal_mode=wal') 81 | 82 | total = int(conn.execute('select count(*) from text;').fetchone()[0] / chunksize) 83 | 84 | for ch in tqdm(pd.read_sql('select * from text', conn, chunksize=chunksize), total=total): 85 | 86 | yield ch 87 | 88 | 89 | def get_chunk_tasks(chunks, min_len_len): 90 | 91 | for chunk in chunks: 92 | 93 | if len(chunk) == 0: 94 | continue 95 | 96 | yield ChunkTask(chunk, min_len_len) 97 | 98 | 99 | def sentence_split(s, min_len): 100 | """ 101 | Reformat text of an entire document such that each line has at least length min_len 102 | :param s: str 103 | :param min_len: minimum line length 104 | :return: reformatted text 105 | """ 106 | 107 | parts = s.split(' ') 108 | 109 | se = '' 110 | for p in parts: 111 | 112 | se += ' ' + p 113 | 114 | if len(se) > min_len and len(p) > 2 and re.match(r'.*([^0-9])[.]$', p): 115 | yield se + '\n' 116 | se = '' 117 | 118 | yield se + '\n' 119 | 120 | 121 | @click.command() 122 | @click.argument('fulltext-file', type=click.Path(exists=True), required=True, nargs=1) 123 | @click.argument('selection-file', type=click.Path(exists=True), required=True, nargs=1) 124 | @click.argument('corpus-file', type=click.Path(), required=True, nargs=1) 125 | @click.option('--chunksize', default=10**4, 
help="Process the corpus in chunks of . default:10**4") 126 | @click.option('--processes', default=6, help="Number of parallel processes. default: 6") 127 | @click.option('--min-line-len', default=80, help="Lower bound of line length in output file. default:80") 128 | def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len): 129 | """ 130 | Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and write it to one big text file. 131 | 132 | FULLTEXT_FILE: The CSV or SQLITE3 file to read from. 133 | 134 | SELECTION_FILE: Consider only a subset of all pages that is defined by the DataFrame 135 | that is stored in . 136 | 137 | CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata. 138 | """ 139 | os.makedirs(os.path.dirname(corpus_file), exist_ok=True) 140 | 141 | print('Open {}.'.format(corpus_file)) 142 | corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8') 143 | corpus_fh.write(u'\ufeff') 144 | 145 | if fulltext_file.endswith('.csv'): 146 | chunks = get_csv_chunks(fulltext_file, chunksize) 147 | elif fulltext_file.endswith('.sqlite3'): 148 | chunks = get_sqlite_chunks(fulltext_file, chunksize) 149 | else: 150 | raise RuntimeError('Unsupported input file format.') 151 | 152 | for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes, initializer=ChunkTask.initialize, 153 | initargs=(selection_file,)): 154 | 155 | corpus_fh.write(text) 156 | 157 | corpus_fh.close() 158 | 159 | return 160 | 161 | 162 | if __name__ == '__main__': 163 | main() 164 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/finetune_on_pregenerated.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from pathlib import Path 3 | import torch 4 | import logging 5 | import json 6 | import random 7 | import numpy as np 8 | from collections import namedtuple 9 | from tempfile import TemporaryDirectory 10 | 11 | from torch.utils.data import DataLoader, Dataset, RandomSampler 12 | from torch.utils.data.distributed import DistributedSampler 13 | from tqdm import tqdm 14 | 15 | from pytorch_pretrained_bert.modeling import BertForPreTraining 16 | from pytorch_pretrained_bert.tokenization import BertTokenizer 17 | from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule 18 | 19 | InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") 20 | 21 | log_format = '%(asctime)-10s: %(message)s' 22 | logging.basicConfig(level=logging.INFO, format=log_format) 23 | 24 | 25 | def convert_example_to_features(example, tokenizer, max_seq_length): 26 | tokens = example["tokens"] 27 | segment_ids = example["segment_ids"] 28 | is_random_next = example["is_random_next"] 29 | masked_lm_positions = example["masked_lm_positions"] 30 | masked_lm_labels = example["masked_lm_labels"] 31 | 32 | assert len(tokens) == len(segment_ids) <= max_seq_length # The preprocessed data should be already truncated 33 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 34 | masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) 35 | 36 | input_array = np.zeros(max_seq_length, dtype=np.int) 37 | input_array[:len(input_ids)] = input_ids 38 | 39 | mask_array = np.zeros(max_seq_length, dtype=np.bool) 40 | mask_array[:len(input_ids)] = 1 41 | 42 | segment_array = np.zeros(max_seq_length, dtype=np.bool) 43 | segment_array[:len(segment_ids)] = segment_ids 44 | 45 | 
lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1) 46 | lm_label_array[masked_lm_positions] = masked_label_ids 47 | 48 | features = InputFeatures(input_ids=input_array, 49 | input_mask=mask_array, 50 | segment_ids=segment_array, 51 | lm_label_ids=lm_label_array, 52 | is_next=is_random_next) 53 | return features 54 | 55 | 56 | class PregeneratedDataset(Dataset): 57 | def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False, prefix=None): 58 | self.vocab = tokenizer.vocab 59 | self.tokenizer = tokenizer 60 | self.epoch = epoch 61 | self.data_epoch = epoch % num_data_epochs 62 | data_file = training_path / f"epoch_{self.data_epoch}.json" 63 | metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json" 64 | assert data_file.is_file() and metrics_file.is_file() 65 | metrics = json.loads(metrics_file.read_text()) 66 | num_samples = metrics['num_training_examples'] 67 | seq_len = metrics['max_seq_len'] 68 | self.temp_dir = None 69 | self.working_dir = None 70 | if reduce_memory: 71 | self.temp_dir = TemporaryDirectory(prefix=prefix) 72 | self.working_dir = Path(self.temp_dir.name) 73 | input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap', 74 | mode='w+', dtype=np.int32, shape=(num_samples, seq_len)) 75 | input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap', 76 | shape=(num_samples, seq_len), mode='w+', dtype=np.bool) 77 | segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap', 78 | shape=(num_samples, seq_len), mode='w+', dtype=np.bool) 79 | lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap', 80 | shape=(num_samples, seq_len), mode='w+', dtype=np.int32) 81 | lm_label_ids[:] = -1 82 | is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap', 83 | shape=(num_samples,), mode='w+', dtype=np.bool) 84 | else: 85 | input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32) 86 | input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) 87 | segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool) 88 | lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1) 89 | is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool) 90 | logging.info(f"Loading training examples for epoch {epoch}") 91 | with data_file.open() as f: 92 | for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): 93 | line = line.strip() 94 | example = json.loads(line) 95 | features = convert_example_to_features(example, tokenizer, seq_len) 96 | input_ids[i] = features.input_ids 97 | segment_ids[i] = features.segment_ids 98 | input_masks[i] = features.input_mask 99 | lm_label_ids[i] = features.lm_label_ids 100 | is_nexts[i] = features.is_next 101 | assert i == num_samples - 1 # Assert that the sample count metric was true 102 | logging.info("Loading complete!") 103 | self.num_samples = num_samples 104 | self.seq_len = seq_len 105 | self.input_ids = input_ids 106 | self.input_masks = input_masks 107 | self.segment_ids = segment_ids 108 | self.lm_label_ids = lm_label_ids 109 | self.is_nexts = is_nexts 110 | 111 | def __len__(self): 112 | return self.num_samples 113 | 114 | def __getitem__(self, item): 115 | return (torch.tensor(self.input_ids[item].astype(np.int64)), 116 | torch.tensor(self.input_masks[item].astype(np.int64)), 117 | torch.tensor(self.segment_ids[item].astype(np.int64)), 118 | torch.tensor(self.lm_label_ids[item].astype(np.int64)), 119 | torch.tensor(self.is_nexts[item].astype(np.int64))) 120 | 121 | 122 | def 
main(): 123 | parser = ArgumentParser() 124 | parser.add_argument('--pregenerated_data', type=Path, required=True) 125 | parser.add_argument('--output_dir', type=Path, required=True) 126 | parser.add_argument("--bert_model", type=str, required=True, help="Directory where the Bert pre-trained model can be found " 127 | "or Bert pre-trained model selected in the list: bert-base-uncased, " 128 | "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") 129 | parser.add_argument("--do_lower_case", action="store_true") 130 | parser.add_argument("--reduce_memory", action="store_true", 131 | help="Store training data as on-disc memmaps to massively reduce memory usage") 132 | 133 | parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") 134 | parser.add_argument("--local_rank", 135 | type=int, 136 | default=-1, 137 | help="local_rank for distributed training on gpus") 138 | parser.add_argument("--no_cuda", 139 | action='store_true', 140 | help="Whether not to use CUDA when available") 141 | parser.add_argument('--gradient_accumulation_steps', 142 | type=int, 143 | default=1, 144 | help="Number of updates steps to accumulate before performing a backward/update pass.") 145 | parser.add_argument("--train_batch_size", 146 | default=32, 147 | type=int, 148 | help="Total batch size for training.") 149 | parser.add_argument("--save_interval", 150 | default=20000, 151 | type=int, 152 | help="Save model every save_interval training steps.") 153 | parser.add_argument('--fp16', 154 | action='store_true', 155 | help="Whether to use 16-bit float precision instead of 32-bit") 156 | parser.add_argument('--loss_scale', 157 | type=float, default=0, 158 | help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" 159 | "0 (default value): dynamic loss scaling.\n" 160 | "Positive power of 2: static loss scaling value.\n") 161 | parser.add_argument("--warmup_proportion", 162 | default=0.1, 163 | type=float, 164 | help="Proportion of training to perform linear learning rate warmup for. " 165 | "E.g., 0.1 = 10%% of training.") 166 | parser.add_argument("--learning_rate", 167 | default=3e-5, 168 | type=float, 169 | help="The initial learning rate for Adam.") 170 | parser.add_argument('--seed', 171 | type=int, 172 | default=42, 173 | help="random seed for initialization") 174 | parser.add_argument('--temp_prefix', 175 | type=str, 176 | default=None, 177 | help="where to store temporary data") 178 | 179 | args = parser.parse_args() 180 | 181 | assert args.pregenerated_data.is_dir(), \ 182 | "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" 183 | 184 | samples_per_epoch = [] 185 | for i in range(args.epochs): 186 | epoch_file = args.pregenerated_data / f"epoch_{i}.json" 187 | metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" 188 | if epoch_file.is_file() and metrics_file.is_file(): 189 | metrics = json.loads(metrics_file.read_text()) 190 | samples_per_epoch.append(metrics['num_training_examples']) 191 | else: 192 | if i == 0: 193 | exit("No training data was found!") 194 | print(f"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).") 195 | print("This script will loop over the available data, but training diversity may be negatively impacted.") 196 | num_data_epochs = i 197 | break 198 | else: 199 | num_data_epochs = args.epochs 200 | 201 | if args.local_rank == -1 or args.no_cuda: 202 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 203 | n_gpu = torch.cuda.device_count() 204 | else: 205 | torch.cuda.set_device(args.local_rank) 206 | device = torch.device("cuda", args.local_rank) 207 | n_gpu = 1 208 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 209 | torch.distributed.init_process_group(backend='nccl') 210 | logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 211 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 212 | 213 | if args.gradient_accumulation_steps < 1: 214 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 215 | args.gradient_accumulation_steps)) 216 | 217 | args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps 218 | 219 | random.seed(args.seed) 220 | np.random.seed(args.seed) 221 | torch.manual_seed(args.seed) 222 | if n_gpu > 0: 223 | torch.cuda.manual_seed_all(args.seed) 224 | 225 | if args.output_dir.is_dir() and list(args.output_dir.iterdir()): 226 | logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!") 227 | args.output_dir.mkdir(parents=True, exist_ok=True) 228 | 229 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 230 | 231 | total_train_examples = 0 232 | for i in range(args.epochs): 233 | # The modulo takes into account the fact that we may loop over limited epochs of data 234 | total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] 235 | 236 | num_train_optimization_steps = int( 237 | total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) 238 | 239 | if args.local_rank != -1: 240 | num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() 241 | 242 | # Prepare model 243 | model = BertForPreTraining.from_pretrained(args.bert_model) 244 | if args.fp16: 245 | model.half() 246 | model.to(device) 247 | if args.local_rank != -1: 248 | try: 249 | from apex.parallel import DistributedDataParallel as DDP 250 | except ImportError: 251 | raise ImportError( 252 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 253 | model = DDP(model) 254 | elif n_gpu > 1: 255 | model = torch.nn.DataParallel(model) 256 | 257 | # Prepare optimizer 258 | param_optimizer = list(model.named_parameters()) 259 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 260 | optimizer_grouped_parameters = [ 261 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 262 | 'weight_decay': 0.01}, 263 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 264 | ] 265 | 266 | if args.fp16: 267 | try: 268 | from apex.optimizers import FP16_Optimizer 269 | from apex.optimizers import FusedAdam 270 | except ImportError: 271 | raise ImportError( 272 | "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 273 | 274 | optimizer = FusedAdam(optimizer_grouped_parameters, 275 | lr=args.learning_rate, 276 | 
bias_correction=False, 277 | max_grad_norm=1.0) 278 | if args.loss_scale == 0: 279 | optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 280 | else: 281 | optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) 282 | warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, 283 | t_total=num_train_optimization_steps) 284 | else: 285 | optimizer = BertAdam(optimizer_grouped_parameters, 286 | lr=args.learning_rate, 287 | warmup=args.warmup_proportion, 288 | t_total=num_train_optimization_steps) 289 | 290 | global_step = 0 291 | logging.info("***** Running training *****") 292 | logging.info(f" Num examples = {total_train_examples}") 293 | logging.info(" Batch size = %d", args.train_batch_size) 294 | logging.info(" Num steps = %d", num_train_optimization_steps) 295 | model.train() 296 | 297 | def save_model(): 298 | 299 | logging.info("** ** * Saving fine-tuned model ** ** * ") 300 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 301 | output_model_file = args.output_dir / "pytorch_model.bin" 302 | torch.save(model_to_save.state_dict(), str(output_model_file)) 303 | 304 | for epoch in range(args.epochs): 305 | epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, 306 | num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory, 307 | prefix=args.temp_prefix) 308 | if args.local_rank == -1: 309 | train_sampler = RandomSampler(epoch_dataset) 310 | else: 311 | train_sampler = DistributedSampler(epoch_dataset) 312 | train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 313 | tr_loss = 0 314 | nb_tr_examples, nb_tr_steps = 0, 0 315 | with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: 316 | for step, batch in enumerate(train_dataloader): 317 | 318 | batch = tuple(t.to(device) for t in batch) 319 | input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch 320 | loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) 321 | if n_gpu > 1: 322 | loss = loss.mean() # mean() to average on multi-gpu. 
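# Dividing by gradient_accumulation_steps below keeps the effective gradient scale
# equal to that of one large batch accumulated over several smaller forward/backward passes.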
323 | if args.gradient_accumulation_steps > 1: 324 | loss = loss / args.gradient_accumulation_steps 325 | if args.fp16: 326 | optimizer.backward(loss) 327 | else: 328 | loss.backward() 329 | tr_loss += loss.item() 330 | nb_tr_examples += input_ids.size(0) 331 | nb_tr_steps += 1 332 | pbar.update(1) 333 | mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps 334 | pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") 335 | 336 | if step % args.save_interval == 0: 337 | save_model() 338 | 339 | if (step + 1) % args.gradient_accumulation_steps == 0: 340 | if args.fp16: 341 | # modify learning rate with special warm up BERT uses 342 | # if args.fp16 is False, BertAdam is used that handles this automatically 343 | lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) 344 | 345 | for param_group in optimizer.param_groups: 346 | param_group['lr'] = lr_this_step 347 | 348 | optimizer.step() 349 | optimizer.zero_grad() 350 | 351 | global_step += 1 352 | 353 | # Save a trained model 354 | # logging.info("** ** * Saving fine-tuned model ** ** * ") 355 | # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 356 | # output_model_file = args.output_dir / "pytorch_model.bin" 357 | # torch.save(model_to_save.state_dict(), str(output_model_file)) 358 | 359 | save_model() 360 | 361 | 362 | if __name__ == '__main__': 363 | main() 364 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/pregenerate_training_data.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from pathlib import Path 3 | from tqdm import tqdm, trange 4 | from tempfile import TemporaryDirectory 5 | import shelve 6 | 7 | from random import random, randrange, randint, shuffle, choice, sample 8 | from pytorch_pretrained_bert.tokenization import BertTokenizer 9 | import numpy as np 10 | import json 11 | 12 | 13 | class DocumentDatabase: 14 | def __init__(self, reduce_memory=False): 15 | if reduce_memory: 16 | self.temp_dir = TemporaryDirectory() 17 | self.working_dir = Path(self.temp_dir.name) 18 | self.document_shelf_filepath = self.working_dir / 'shelf.db' 19 | self.document_shelf = shelve.open(str(self.document_shelf_filepath), 20 | flag='n', protocol=-1) 21 | self.documents = None 22 | else: 23 | self.documents = [] 24 | self.document_shelf = None 25 | self.document_shelf_filepath = None 26 | self.temp_dir = None 27 | self.doc_lengths = [] 28 | self.doc_cumsum = None 29 | self.cumsum_max = None 30 | self.reduce_memory = reduce_memory 31 | 32 | def add_document(self, document): 33 | if not document: 34 | return 35 | if self.reduce_memory: 36 | current_idx = len(self.doc_lengths) 37 | self.document_shelf[str(current_idx)] = document 38 | else: 39 | self.documents.append(document) 40 | self.doc_lengths.append(len(document)) 41 | 42 | def _precalculate_doc_weights(self): 43 | self.doc_cumsum = np.cumsum(self.doc_lengths) 44 | self.cumsum_max = self.doc_cumsum[-1] 45 | 46 | def sample_doc(self, current_idx, sentence_weighted=True): 47 | # Uses the current iteration counter to ensure we don't sample the same doc twice 48 | if sentence_weighted: 49 | # With sentence weighting, we sample docs proportionally to their sentence length 50 | if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths): 51 | self._precalculate_doc_weights() 52 | rand_start = self.doc_cumsum[current_idx] 53 | rand_end = rand_start + 
self.cumsum_max - self.doc_lengths[current_idx] 54 | sentence_index = randrange(rand_start, rand_end) % self.cumsum_max 55 | sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right') 56 | else: 57 | # If we don't use sentence weighting, then every doc has an equal chance to be chosen 58 | sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths) 59 | assert sampled_doc_index != current_idx 60 | if self.reduce_memory: 61 | return self.document_shelf[str(sampled_doc_index)] 62 | else: 63 | return self.documents[sampled_doc_index] 64 | 65 | def __len__(self): 66 | return len(self.doc_lengths) 67 | 68 | def __getitem__(self, item): 69 | if self.reduce_memory: 70 | return self.document_shelf[str(item)] 71 | else: 72 | return self.documents[item] 73 | 74 | def __enter__(self): 75 | return self 76 | 77 | def __exit__(self, exc_type, exc_val, traceback): 78 | if self.document_shelf is not None: 79 | self.document_shelf.close() 80 | if self.temp_dir is not None: 81 | self.temp_dir.cleanup() 82 | 83 | 84 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens): 85 | """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo.""" 86 | while True: 87 | total_length = len(tokens_a) + len(tokens_b) 88 | if total_length <= max_num_tokens: 89 | break 90 | 91 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 92 | assert len(trunc_tokens) >= 1 93 | 94 | # We want to sometimes truncate from the front and sometimes from the 95 | # back to add more randomness and avoid biases. 96 | if random() < 0.5: 97 | del trunc_tokens[0] 98 | else: 99 | trunc_tokens.pop() 100 | 101 | 102 | def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): 103 | """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but 104 | with several refactors to clean it up and remove a lot of unnecessary variables.""" 105 | cand_indices = [] 106 | for (i, token) in enumerate(tokens): 107 | if token == "[CLS]" or token == "[SEP]": 108 | continue 109 | cand_indices.append(i) 110 | 111 | num_to_mask = min(max_predictions_per_seq, 112 | max(1, int(round(len(tokens) * masked_lm_prob)))) 113 | shuffle(cand_indices) 114 | mask_indices = sorted(sample(cand_indices, num_to_mask)) 115 | masked_token_labels = [] 116 | for index in mask_indices: 117 | # 80% of the time, replace with [MASK] 118 | if random() < 0.8: 119 | masked_token = "[MASK]" 120 | else: 121 | # 10% of the time, keep original 122 | if random() < 0.5: 123 | masked_token = tokens[index] 124 | # 10% of the time, replace with random word 125 | else: 126 | masked_token = choice(vocab_list) 127 | masked_token_labels.append(tokens[index]) 128 | # Once we've saved the true label for that token, we can overwrite it with the masked version 129 | tokens[index] = masked_token 130 | 131 | return tokens, mask_indices, masked_token_labels 132 | 133 | 134 | def create_instances_from_document( 135 | doc_database, doc_idx, max_seq_length, short_seq_prob, 136 | masked_lm_prob, max_predictions_per_seq, vocab_list): 137 | """This code is mostly a duplicate of the equivalent function from Google BERT's repo. 138 | However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function. 
139 | Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence 140 | (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task.""" 141 | document = doc_database[doc_idx] 142 | # Account for [CLS], [SEP], [SEP] 143 | max_num_tokens = max_seq_length - 3 144 | 145 | # We *usually* want to fill up the entire sequence since we are padding 146 | # to `max_seq_length` anyways, so short sequences are generally wasted 147 | # computation. However, we *sometimes* 148 | # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter 149 | # sequences to minimize the mismatch between pre-training and fine-tuning. 150 | # The `target_seq_length` is just a rough target however, whereas 151 | # `max_seq_length` is a hard limit. 152 | target_seq_length = max_num_tokens 153 | if random() < short_seq_prob: 154 | target_seq_length = randint(2, max_num_tokens) 155 | 156 | # We DON'T just concatenate all of the tokens from a document into a long 157 | # sequence and choose an arbitrary split point because this would make the 158 | # next sentence prediction task too easy. Instead, we split the input into 159 | # segments "A" and "B" based on the actual "sentences" provided by the user 160 | # input. 161 | instances = [] 162 | current_chunk = [] 163 | current_length = 0 164 | i = 0 165 | while i < len(document): 166 | segment = document[i] 167 | current_chunk.append(segment) 168 | current_length += len(segment) 169 | if i == len(document) - 1 or current_length >= target_seq_length: 170 | if current_chunk: 171 | # `a_end` is how many segments from `current_chunk` go into the `A` 172 | # (first) sentence. 173 | a_end = 1 174 | if len(current_chunk) >= 2: 175 | a_end = randrange(1, len(current_chunk)) 176 | 177 | tokens_a = [] 178 | for j in range(a_end): 179 | tokens_a.extend(current_chunk[j]) 180 | 181 | tokens_b = [] 182 | 183 | # Random next 184 | if len(current_chunk) == 1 or random() < 0.5: 185 | is_random_next = True 186 | target_b_length = target_seq_length - len(tokens_a) 187 | 188 | # Sample a random document, with longer docs being sampled more frequently 189 | random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True) 190 | 191 | random_start = randrange(0, len(random_document)) 192 | for j in range(random_start, len(random_document)): 193 | tokens_b.extend(random_document[j]) 194 | if len(tokens_b) >= target_b_length: 195 | break 196 | # We didn't actually use these segments so we "put them back" so 197 | # they don't go to waste. 
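# Rewind the document index by the number of segments that were not consumed,
# so they are picked up again in the next iteration of the while loop.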
198 | num_unused_segments = len(current_chunk) - a_end 199 | i -= num_unused_segments 200 | # Actual next 201 | else: 202 | is_random_next = False 203 | for j in range(a_end, len(current_chunk)): 204 | tokens_b.extend(current_chunk[j]) 205 | truncate_seq_pair(tokens_a, tokens_b, max_num_tokens) 206 | 207 | assert len(tokens_a) >= 1 208 | assert len(tokens_b) >= 1 209 | 210 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] 211 | # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP] 212 | # They are 1 for the B tokens and the final [SEP] 213 | segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)] 214 | 215 | tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( 216 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) 217 | 218 | instance = { 219 | "tokens": tokens, 220 | "segment_ids": segment_ids, 221 | "is_random_next": is_random_next, 222 | "masked_lm_positions": masked_lm_positions, 223 | "masked_lm_labels": masked_lm_labels} 224 | instances.append(instance) 225 | current_chunk = [] 226 | current_length = 0 227 | i += 1 228 | 229 | return instances 230 | 231 | 232 | def main(): 233 | parser = ArgumentParser() 234 | parser.add_argument('--train_corpus', type=Path, required=True) 235 | parser.add_argument("--output_dir", type=Path, required=True) 236 | parser.add_argument("--bert_model", type=str, required=True) # , 237 | # choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", 238 | # "bert-base-multilingual", "bert-base-chinese"]) 239 | parser.add_argument("--do_lower_case", action="store_true") 240 | 241 | parser.add_argument("--reduce_memory", action="store_true", 242 | help="Reduce memory usage for large datasets by keeping data on disc rather than in memory") 243 | 244 | parser.add_argument("--epochs_to_generate", type=int, default=3, 245 | help="Number of epochs of data to pregenerate") 246 | parser.add_argument("--max_seq_len", type=int, default=128) 247 | parser.add_argument("--short_seq_prob", type=float, default=0.1, 248 | help="Probability of making a short sentence as a training example") 249 | parser.add_argument("--masked_lm_prob", type=float, default=0.15, 250 | help="Probability of masking each token for the LM task") 251 | parser.add_argument("--max_predictions_per_seq", type=int, default=20, 252 | help="Maximum number of tokens to mask in each sequence") 253 | 254 | args = parser.parse_args() 255 | 256 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 257 | vocab_list = list(tokenizer.vocab.keys()) 258 | with DocumentDatabase(reduce_memory=args.reduce_memory) as docs: 259 | with args.train_corpus.open() as f: 260 | doc = [] 261 | for line in tqdm(f, desc="Loading Dataset", unit=" lines"): 262 | line = line.strip() 263 | if line == "": 264 | docs.add_document(doc) 265 | doc = [] 266 | else: 267 | tokens = tokenizer.tokenize(line) 268 | doc.append(tokens) 269 | if doc: 270 | docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added 271 | if len(docs) <= 1: 272 | exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to " 273 | "ensure that random NextSentences are not sampled from the same document. Please add blank lines to " 274 | "indicate breaks between documents in your input file. 
If your dataset does not contain multiple " 275 | "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, " 276 | "sections or paragraphs.") 277 | 278 | args.output_dir.mkdir(exist_ok=True) 279 | for epoch in trange(args.epochs_to_generate, desc="Epoch"): 280 | epoch_filename = args.output_dir / f"epoch_{epoch}.json" 281 | num_instances = 0 282 | with epoch_filename.open('w') as epoch_file: 283 | for doc_idx in trange(len(docs), desc="Document"): 284 | doc_instances = create_instances_from_document( 285 | docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, 286 | masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, 287 | vocab_list=vocab_list) 288 | doc_instances = [json.dumps(instance) for instance in doc_instances] 289 | for instance in doc_instances: 290 | epoch_file.write(instance + '\n') 291 | num_instances += 1 292 | metrics_file = args.output_dir / f"epoch_{epoch}_metrics.json" 293 | with metrics_file.open('w') as metrics_file: 294 | metrics = { 295 | "num_training_examples": num_instances, 296 | "max_seq_len": args.max_seq_len 297 | } 298 | metrics_file.write(json.dumps(metrics)) 299 | 300 | 301 | if __name__ == '__main__': 302 | main() 303 | -------------------------------------------------------------------------------- /qurator/sbb_ner/models/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from pytorch_pretrained_bert.file_utils import cached_path 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 37 | } 38 | PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { 39 | 'bert-base-uncased': 512, 40 | 'bert-large-uncased': 512, 41 | 'bert-base-cased': 512, 42 | 'bert-large-cased': 512, 43 | 'bert-base-multilingual-uncased': 512, 44 | 'bert-base-multilingual-cased': 512, 45 | 'bert-base-chinese': 512, 46 | } 47 | VOCAB_NAME = 'vocab.txt' 48 | 49 | 50 | def load_vocab(vocab_file): 51 | """Loads a vocabulary file into a dictionary.""" 52 | vocab = collections.OrderedDict() 53 | index = 0 54 | with open(vocab_file, "r", encoding="utf-8") as reader: 55 | while True: 56 | token = reader.readline() 57 | if not token: 58 | break 59 | token = token.strip() 60 | vocab[token] = index 61 | index += 1 62 | return vocab 63 | 64 | 65 | def whitespace_tokenize(text): 66 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 67 | text = text.strip() 68 | if not text: 69 | return [] 70 | tokens = text.split() 71 | return tokens 72 | 73 | 74 | class BertTokenizer(object): 75 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 76 | 77 | def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, 78 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 79 | """Constructs a BertTokenizer. 80 | 81 | Args: 82 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 83 | do_lower_case: Whether to lower case the input 84 | Only has an effect when do_wordpiece_only=False 85 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 86 | max_len: An artificial maximum length to truncate tokenized sequences to; 87 | Effective maximum length is always the minimum of this 88 | value (if specified) and the underlying BERT model's 89 | sequence length. 90 | never_split: List of tokens which will never be split during tokenization. 91 | Only has an effect when do_wordpiece_only=False 92 | """ 93 | if not os.path.isfile(vocab_file): 94 | raise ValueError( 95 | "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " 96 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 97 | self.vocab = load_vocab(vocab_file) 98 | self.ids_to_tokens = collections.OrderedDict( 99 | [(ids, tok) for tok, ids in self.vocab.items()]) 100 | self.do_basic_tokenize = do_basic_tokenize 101 | if do_basic_tokenize: 102 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 103 | never_split=never_split) 104 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 105 | self.max_len = max_len if max_len is not None else int(1e12) 106 | 107 | def tokenize(self, text): 108 | split_tokens = [] 109 | if self.do_basic_tokenize: 110 | for token in self.basic_tokenizer.tokenize(text): 111 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 112 | split_tokens.append(sub_token) 113 | else: 114 | split_tokens = self.wordpiece_tokenizer.tokenize(text) 115 | return split_tokens 116 | 117 | def convert_tokens_to_ids(self, tokens): 118 | """Converts a sequence of tokens into ids using the vocab.""" 119 | ids = [] 120 | for token in tokens: 121 | ids.append(self.vocab[token]) 122 | if len(ids) > self.max_len: 123 | logger.warning( 124 | "Token indices sequence length is longer than the specified maximum " 125 | " sequence length for this BERT model ({} > {}). Running this" 126 | " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) 127 | ) 128 | return ids 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | """Converts a sequence of ids in wordpiece tokens using the vocab.""" 132 | tokens = [] 133 | for i in ids: 134 | tokens.append(self.ids_to_tokens[i]) 135 | return tokens 136 | 137 | def save_vocabulary(self, vocab_path): 138 | """Save the tokenizer vocabulary to a directory or file.""" 139 | index = 0 140 | if os.path.isdir(vocab_path): 141 | vocab_file = os.path.join(vocab_path, VOCAB_NAME) 142 | with open(vocab_file, "w", encoding="utf-8") as writer: 143 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 144 | if index != token_index: 145 | logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 146 | " Please check that the vocabulary is not corrupted!".format(vocab_file)) 147 | index = token_index 148 | writer.write(token + u'\n') 149 | index += 1 150 | return vocab_file 151 | 152 | @classmethod 153 | def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): 154 | """ 155 | Instantiate a PreTrainedBertModel from a pre-trained model file. 156 | Download and cache the pre-trained model file if needed. 157 | """ 158 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: 159 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] 160 | if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): 161 | logger.warning("The pre-trained model you are loading is a cased model but you have not set " 162 | "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " 163 | "you may want to check this behavior.") 164 | kwargs['do_lower_case'] = False 165 | elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): 166 | logger.warning("The pre-trained model you are loading is an uncased model but you have set " 167 | "`do_lower_case` to False. 
We are setting `do_lower_case=True` for you " 168 | "but you may want to check this behavior.") 169 | kwargs['do_lower_case'] = True 170 | else: 171 | vocab_file = pretrained_model_name_or_path 172 | if os.path.isdir(vocab_file): 173 | vocab_file = os.path.join(vocab_file, VOCAB_NAME) 174 | # redirect to the cache, if necessary 175 | try: 176 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 177 | except EnvironmentError: 178 | logger.error( 179 | "Model name '{}' was not found in model name list ({}). " 180 | "We assumed '{}' was a path or url but couldn't find any file " 181 | "associated to this path or url.".format( 182 | pretrained_model_name_or_path, 183 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 184 | vocab_file)) 185 | return None 186 | if resolved_vocab_file == vocab_file: 187 | logger.info("loading vocabulary file {}".format(vocab_file)) 188 | else: 189 | logger.info("loading vocabulary file {} from cache at {}".format( 190 | vocab_file, resolved_vocab_file)) 191 | if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: 192 | # if we're using a pretrained model, ensure the tokenizer wont index sequences longer 193 | # than the number of positional embeddings 194 | max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] 195 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 196 | # Instantiate tokenizer. 197 | tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) 198 | return tokenizer 199 | 200 | 201 | class BasicTokenizer(object): 202 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 203 | 204 | def __init__(self, 205 | do_lower_case=True, 206 | never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): 207 | """Constructs a BasicTokenizer. 208 | 209 | Args: 210 | do_lower_case: Whether to lower case the input. 211 | """ 212 | self.do_lower_case = do_lower_case 213 | self.never_split = never_split 214 | 215 | def tokenize(self, text): 216 | """Tokenizes a piece of text.""" 217 | text = self._clean_text(text) 218 | # This was added on November 1st, 2018 for the multilingual and Chinese 219 | # models. This is also applied to the English models now, but it doesn't 220 | # matter since the English models were not trained on any Chinese data 221 | # and generally don't have any Chinese data in them (there are Chinese 222 | # characters in the vocabulary because Wikipedia does have some Chinese 223 | # words in the English Wikipedia.). 
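# Pad every CJK character with surrounding whitespace so that each one ends up as
# its own token after the whitespace split below.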
224 | text = self._tokenize_chinese_chars(text) 225 | orig_tokens = whitespace_tokenize(text) 226 | split_tokens = [] 227 | for token in orig_tokens: 228 | if self.do_lower_case and token not in self.never_split: 229 | token = token.lower() 230 | token = self._run_strip_accents(token) 231 | split_tokens.extend(self._run_split_on_punc(token)) 232 | 233 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 234 | return output_tokens 235 | 236 | def _run_strip_accents(self, text): 237 | """Strips accents from a piece of text.""" 238 | text = unicodedata.normalize("NFD", text) 239 | output = [] 240 | for char in text: 241 | cat = unicodedata.category(char) 242 | if cat == "Mn": 243 | continue 244 | output.append(char) 245 | return "".join(output) 246 | 247 | def _run_split_on_punc(self, text): 248 | """Splits punctuation on a piece of text.""" 249 | if text in self.never_split: 250 | return [text] 251 | chars = list(text) 252 | i = 0 253 | start_new_word = True 254 | output = [] 255 | while i < len(chars): 256 | char = chars[i] 257 | if _is_punctuation(char): 258 | output.append([char]) 259 | start_new_word = True 260 | else: 261 | if start_new_word: 262 | output.append([]) 263 | start_new_word = False 264 | output[-1].append(char) 265 | i += 1 266 | 267 | return ["".join(x) for x in output] 268 | 269 | def _tokenize_chinese_chars(self, text): 270 | """Adds whitespace around any CJK character.""" 271 | output = [] 272 | for char in text: 273 | cp = ord(char) 274 | if self._is_chinese_char(cp): 275 | output.append(" ") 276 | output.append(char) 277 | output.append(" ") 278 | else: 279 | output.append(char) 280 | return "".join(output) 281 | 282 | def _is_chinese_char(self, cp): 283 | """Checks whether CP is the codepoint of a CJK character.""" 284 | # This defines a "chinese character" as anything in the CJK Unicode block: 285 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 286 | # 287 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 288 | # despite its name. The modern Korean Hangul alphabet is a different block, 289 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 290 | # space-separated words, so they are not treated specially and handled 291 | # like the all of the other languages. 292 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 293 | (cp >= 0x3400 and cp <= 0x4DBF) or # 294 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 295 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 296 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 297 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 298 | (cp >= 0xF900 and cp <= 0xFAFF) or # 299 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 300 | return True 301 | 302 | return False 303 | 304 | def _clean_text(self, text): 305 | """Performs invalid character removal and whitespace cleanup on text.""" 306 | output = [] 307 | for char in text: 308 | cp = ord(char) 309 | if cp == 0 or cp == 0xfffd or _is_control(char): 310 | continue 311 | if _is_whitespace(char): 312 | output.append(" ") 313 | else: 314 | output.append(char) 315 | return "".join(output) 316 | 317 | 318 | class WordpieceTokenizer(object): 319 | """Runs WordPiece tokenization.""" 320 | 321 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 322 | self.vocab = vocab 323 | self.unk_token = unk_token 324 | self.max_input_chars_per_word = max_input_chars_per_word 325 | 326 | def tokenize(self, text): 327 | """Tokenizes a piece of text into its word pieces. 
328 | 329 | This uses a greedy longest-match-first algorithm to perform tokenization 330 | using the given vocabulary. 331 | 332 | For example: 333 | input = "unaffable" 334 | output = ["un", "##aff", "##able"] 335 | 336 | Args: 337 | text: A single token or whitespace separated tokens. This should have 338 | already been passed through `BasicTokenizer`. 339 | 340 | Returns: 341 | A list of wordpiece tokens. 342 | """ 343 | 344 | output_tokens = [] 345 | for token in whitespace_tokenize(text): 346 | chars = list(token) 347 | # if len(chars) > self.max_input_chars_per_word: 348 | # output_tokens.append(self.unk_token) 349 | # continue 350 | 351 | # is_bad = False 352 | start = 0 353 | sub_tokens = [] 354 | while start < len(chars): 355 | end = len(chars) 356 | cur_substr = None 357 | while start < end: 358 | substr = "".join(chars[start:end]) 359 | if start > 0: 360 | substr = "##" + substr 361 | if substr in self.vocab: 362 | cur_substr = substr 363 | break 364 | end -= 1 365 | if cur_substr is None: 366 | # is_bad = True 367 | # break 368 | sub_tokens.append(self.unk_token) 369 | start += 1 370 | else: 371 | sub_tokens.append(cur_substr) 372 | start = end 373 | 374 | # if is_bad: 375 | # output_tokens.append(self.unk_token) 376 | # else: 377 | output_tokens.extend(sub_tokens) 378 | 379 | return output_tokens 380 | 381 | 382 | def _is_whitespace(char): 383 | """Checks whether `chars` is a whitespace character.""" 384 | # \t, \n, and \r are technically contorl characters but we treat them 385 | # as whitespace since they are generally considered as such. 386 | if char == " " or char == "\t" or char == "\n" or char == "\r": 387 | return True 388 | cat = unicodedata.category(char) 389 | if cat == "Zs": 390 | return True 391 | return False 392 | 393 | 394 | def _is_control(char): 395 | """Checks whether `chars` is a control character.""" 396 | # These are technically control characters but we count them as whitespace 397 | # characters. 398 | if char == "\t" or char == "\n" or char == "\r": 399 | return False 400 | cat = unicodedata.category(char) 401 | if cat.startswith("C"): 402 | return True 403 | return False 404 | 405 | 406 | def _is_punctuation(char): 407 | """Checks whether `chars` is a punctuation character.""" 408 | cp = ord(char) 409 | # We treat all non-letter/number ASCII as punctuation. 410 | # Characters such as "^", "$", and "`" are not in the Unicode 411 | # Punctuation class but we treat them as punctuation anyways, for 412 | # consistency. 
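# The four ASCII ranges below cover !"#$%&'()*+,-./ (33-47), :;<=>?@ (58-64),
# [\]^_` (91-96) and {|}~ (123-126), i.e. all printable non-alphanumeric ASCII.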
413 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 414 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 415 | return True 416 | cat = unicodedata.category(char) 417 | if cat.startswith("P"): 418 | return True 419 | return False 420 | -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from flask import Flask, send_from_directory, redirect, jsonify, request 4 | from flask_caching import Cache 5 | from hashlib import sha256 6 | import html 7 | import json 8 | import torch 9 | from somajo import Tokenizer, SentenceSplitter 10 | 11 | from qurator.sbb_ner.models.bert import get_device, model_predict 12 | from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, convert_examples_to_features 13 | from qurator.sbb_ner.models.tokenization import BertTokenizer 14 | from pytorch_pretrained_bert.modeling import (CONFIG_NAME, 15 | BertConfig, 16 | BertForTokenClassification) 17 | app = Flask(__name__) 18 | 19 | app.config.from_file(os.path.join(os.getcwd(), 20 | 'config.json' if not os.environ.get('CONFIG') 21 | else os.environ.get('CONFIG')), load=json.load) 22 | cache = Cache(app) 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class NERPredictor: 28 | 29 | def __init__(self, model_dir, batch_size, epoch, max_seq_length=128, local_rank=-1, no_cuda=False): 30 | 31 | self._batch_size = batch_size 32 | self._local_rank = local_rank 33 | self._max_seq_length = max_seq_length 34 | 35 | self._device, self._n_gpu = get_device(no_cuda=no_cuda) 36 | 37 | self._model_config = json.load(open(os.path.join(model_dir, "model_config.json"), "r")) 38 | 39 | self._label_to_id = self._model_config['label_map'] 40 | 41 | self._label_map = {v: k for k, v in self._model_config['label_map'].items()} 42 | 43 | self._bert_tokenizer = \ 44 | BertTokenizer.from_pretrained(model_dir, 45 | do_lower_case=self._model_config['do_lower']) 46 | 47 | output_config_file = os.path.join(model_dir, CONFIG_NAME) 48 | 49 | output_model_file = os.path.join(model_dir, "pytorch_model_ep{}.bin".format(epoch)) 50 | 51 | config = BertConfig(output_config_file) 52 | 53 | self._model = BertForTokenClassification(config, num_labels=len(self._label_map)) 54 | self._model.load_state_dict(torch.load(output_model_file, 55 | map_location=lambda storage, loc: storage if no_cuda else None)) 56 | self._model.to(self._device) 57 | self._model.eval() 58 | 59 | return 60 | 61 | def classify_text(self, sentences): 62 | 63 | examples = NerProcessor.create_examples(sentences, 'test') 64 | 65 | features = [fe for ex in examples for fe in 66 | convert_examples_to_features(ex, self._label_to_id, self._max_seq_length, self._bert_tokenizer)] 67 | 68 | data_loader = NerProcessor.make_data_loader(None, self._batch_size, self._local_rank, self._label_to_id, 69 | self._max_seq_length, self._bert_tokenizer, features=features, 70 | sequential=True) 71 | 72 | prediction_tmp = model_predict(data_loader, self._device, self._label_map, self._model) 73 | 74 | assert len(prediction_tmp) == len(features) 75 | 76 | prediction = [] 77 | prev_guid = None 78 | for fe, pr in zip(features, prediction_tmp): 79 
| # longer sentences might have been processed in several steps 80 | # therefore we have to glue them together. This can be done on the basis of the guid. 81 | 82 | if prev_guid != fe.guid: 83 | prediction.append((fe.tokens[1:-1], pr)) 84 | else: 85 | prediction[-1] = (prediction[-1][0] + fe.tokens[1:-1], prediction[-1][1] + pr) 86 | 87 | prev_guid = fe.guid 88 | 89 | try: 90 | assert len(sentences) == len(prediction) 91 | except AssertionError: 92 | print('Sentences:\n') 93 | print(sentences) 94 | print('\n\nPrediciton:\n') 95 | print(prediction) 96 | 97 | return prediction 98 | 99 | 100 | class NERTokenizer: 101 | 102 | def __init__(self): 103 | 104 | self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False) 105 | 106 | self._sentence_splitter = SentenceSplitter() 107 | 108 | def parse_text(self, text): 109 | tokens = self._word_tokenizer.tokenize_paragraph(text) 110 | 111 | sentences_tokenized = self._sentence_splitter.split(tokens) 112 | 113 | sentences = [] 114 | for sen in sentences_tokenized: 115 | 116 | sen = [tok.replace(" ", "") for tok in sen] 117 | 118 | if len(sen) == 0: 119 | continue 120 | 121 | sentences.append((sen, [])) 122 | 123 | return sentences 124 | 125 | 126 | class PredictorStore: 127 | 128 | def __init__(self): 129 | 130 | self._predictor = None 131 | self._model_id = None 132 | 133 | def get(self, model_id): 134 | 135 | if model_id is not None: 136 | model = next((m for m in app.config['MODELS'] if m['id'] == int(model_id))) 137 | else: 138 | model = next((m for m in app.config['MODELS'] if m['default'])) 139 | 140 | if self._model_id != model['id']: 141 | 142 | self._predictor = NERPredictor(model_dir=model['model_dir'], 143 | epoch=model['epoch'], 144 | batch_size=app.config['BATCH_SIZE'], 145 | no_cuda=False if not os.environ.get('USE_CUDA') else 146 | os.environ.get('USE_CUDA').lower() == 'false') 147 | self._model_id = model['id'] 148 | 149 | return self._predictor 150 | 151 | 152 | predictor_store = PredictorStore() 153 | 154 | tokenizer = NERTokenizer() 155 | 156 | 157 | def key_prefix(): 158 | return "{}:{}".format(request.path, sha256(str(request.json).encode('utf-8')).hexdigest()) 159 | 160 | 161 | @app.route('/') 162 | def entry(): 163 | return redirect("/index.html", code=302) 164 | 165 | 166 | @app.route('/models') 167 | def get_models(): 168 | return jsonify(app.config['MODELS']) 169 | 170 | 171 | @app.route('/tokenized', methods=['GET', 'POST']) 172 | @cache.cached(key_prefix=key_prefix) 173 | def tokenized(): 174 | 175 | raw_text = request.json['text'] 176 | 177 | sentences = tokenizer.parse_text(raw_text) 178 | 179 | result = [(sen, i) for i, (sen, _) in enumerate(sentences)] 180 | 181 | return jsonify(result) 182 | 183 | 184 | @app.route('/ner-bert-tokens', methods=['GET', 'POST']) 185 | @app.route('/ner-bert-tokens/', methods=['GET', 'POST']) 186 | @cache.cached(key_prefix=key_prefix) 187 | def ner_bert_tokens(model_id=None): 188 | 189 | raw_text = request.json['text'] 190 | 191 | sentences = tokenizer.parse_text(raw_text) 192 | 193 | prediction = predictor_store.get(model_id).classify_text(sentences) 194 | 195 | output = [] 196 | 197 | for tokens, word_predictions in prediction: 198 | 199 | output_sentence = [] 200 | 201 | for token, word_pred in zip(tokens, word_predictions): 202 | 203 | output_sentence.append({'token': html.escape(token), 'prediction': word_pred}) 204 | 205 | output.append(output_sentence) 206 | 207 | return jsonify(output) 208 | 209 | 210 | @app.route('/ner', methods=['GET', 'POST']) 211 
| @app.route('/ner/', methods=['GET', 'POST']) 212 | @cache.cached(key_prefix=key_prefix) 213 | def ner(model_id=None): 214 | 215 | raw_text = request.json['text'] 216 | 217 | sentences = tokenizer.parse_text(raw_text) 218 | 219 | prediction = predictor_store.get(model_id).classify_text(sentences) 220 | 221 | output = [] 222 | 223 | for (tokens, token_predictions), (input_sentence, _) in zip(prediction, sentences): 224 | 225 | output_text = "" 226 | original_text = "".join(input_sentence) 227 | original_word_positions = \ 228 | [pos for positions in [[idx] * len(word) for idx, word in enumerate(input_sentence)] for pos in positions] 229 | 230 | word = '' 231 | word_prediction = 'O' 232 | output_sentence = [] 233 | 234 | for pos, (token, token_prediction) in enumerate(zip(tokens, token_predictions)): 235 | 236 | if not token.startswith('##') and token_prediction == 'X' or token_prediction == '[SEP]': 237 | token_prediction = 'O' 238 | 239 | orig_pos = len(output_text + word) 240 | 241 | # if the current word length is greater than 0 242 | # and its either a word start token (does not start with ##) and not an unknown token or the original text 243 | # positions indicate a word break 244 | if len(word) > 0 and ((not token.startswith('##') and token != '[UNK]') or 245 | (orig_pos > 0 and 246 | original_word_positions[orig_pos-1] != original_word_positions[orig_pos])): 247 | output_sentence.append({'word': word, 'prediction': word_prediction}) 248 | output_text += word 249 | word = '' 250 | word_prediction = 'O' 251 | 252 | if token == '[UNK]': 253 | 254 | orig_pos = len(output_text + word) 255 | 256 | # are we on a word boundary? 257 | if len(word) > 0 and orig_pos > 0 \ 258 | and original_word_positions[orig_pos-1] != original_word_positions[orig_pos]: 259 | 260 | # we are on a word boundary - start a new word ... 261 | output_sentence.append({'word': word, 'prediction': word_prediction}) 262 | output_text += word 263 | word = '' 264 | word_prediction = 'O' 265 | 266 | # get character that corresponds to [UNK] token from original text 267 | token = original_text[orig_pos] 268 | 269 | else: 270 | token = token[2:] if token.startswith('##') else token 271 | 272 | # if the output_text plus the current word and token is not a prefix of the original text, it means, that 273 | # we would miss characters. Therefore we take the missing characters from the original text at the current 274 | # word position 275 | while not original_text.startswith(output_text + word + token) \ 276 | and len(output_text + word) < len(original_text): 277 | 278 | word += original_text[len(output_text + word)] 279 | 280 | orig_pos = len(output_text + word) 281 | 282 | # are we on a word boundary? 283 | if orig_pos > 0 and original_word_positions[orig_pos - 1] != original_word_positions[orig_pos]: 284 | # we are on a word boundary - start a new word ... 
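# Flush the word accumulated so far, together with its aggregated prediction,
# before a new word is started.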
285 | output_sentence.append({'word': word, 'prediction': word_prediction}) 286 | output_text += word 287 | word = '' 288 | word_prediction = 'O' 289 | 290 | word += token 291 | 292 | if token_prediction != 'X': 293 | word_prediction = token_prediction 294 | 295 | if len(word) > 0: 296 | output_text += word 297 | output_sentence.append({'word': word, 'prediction': word_prediction}) 298 | 299 | output.append(output_sentence) 300 | 301 | try: 302 | assert output_text == original_text 303 | except AssertionError: 304 | import ipdb;ipdb.set_trace() 305 | 306 | for output_sentence, (input_sentence, _) in zip(output, sentences): 307 | 308 | try: 309 | assert "".join([pred['word'] for pred in output_sentence]) == "".join(input_sentence) 310 | except AssertionError: 311 | logger.warning('Input and output different!!! \n\n\nInput: {}\n\nOutput: {}\n'. 312 | format("".join(input_sentence).replace(" ", ""), 313 | "".join([pred['word'] for pred in output_sentence]))) 314 | 315 | torch.cuda.empty_cache() 316 | 317 | return jsonify(output) 318 | 319 | 320 | @app.route('/<path:path>') 321 | def send_js(path): 322 | return send_from_directory('static', path) 323 | -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/config-8GB-GPU.json: -------------------------------------------------------------------------------- 1 | { 2 | "BATCH_SIZE": 16, 3 | "MODELS": [ 4 | { 5 | "name": "DC-SBB + CONLL + GERMEVAL", 6 | "id": 1, 7 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned", 8 | "epoch": 7, 9 | "default": true 10 | }, 11 | { 12 | "name": "DC-SBB + CONLL + GERMEVAL + SBB", 13 | "id": 2, 14 | "model_dir": "data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned", 15 | "epoch": 7, 16 | "default": false 17 | }, 18 | { 19 | "name": "DC-SBB + SBB", 20 | "id": 3, 21 | "model_dir": "data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned", 22 | "epoch": 7, 23 | "default": false 24 | }, 25 | { 26 | "name": "CONLL + GERMEVAL", 27 | "id": 4, 28 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-baseline", 29 | "epoch": 7, 30 | "default": false 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "BATCH_SIZE": 256, 3 | "MODELS": [ 4 | { 5 | "name": "DC-SBB + CONLL + GERMEVAL", 6 | "id": 1, 7 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned", 8 | "epoch": 7, 9 | "default": true 10 | }, 11 | { 12 | "name": "DC-SBB + CONLL + GERMEVAL + SBB", 13 | "id": 2, 14 | "model_dir": "data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned", 15 | "epoch": 7, 16 | "default": false 17 | }, 18 | { 19 | "name": "DC-SBB + SBB", 20 | "id": 3, 21 | "model_dir": "data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned", 22 | "epoch": 7, 23 | "default": false 24 | }, 25 | { 26 | "name": "CONLL + GERMEVAL", 27 | "id": 4, 28 | "model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-baseline", 29 | "epoch": 7, 30 | "default": false 31 | }, 32 | { 33 | "name": "MULTILANG", 34 | "id": 5, 35 | "model_dir": "data/BERT/build-wd_0.03/bert-multilang-de-finetuned", 36 | "epoch": 20, 37 | "default": false 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/__init__.py: -------------------------------------------------------------------------------- 1 |
__import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/css/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 12 | NER - Demo 13 | 14 | 15 | 16 |
[The remaining markup of index.html is not recoverable from this dump: the HTML tags were stripped during extraction and only the source line numbers survived. The page is the "NER - Demo" form that ner-demo.js and ner.js operate on; it contains the elements referenced there: the #nerform form with the #task and #model selectors (the latter wrapped in #model_select), the #inputtext field, and the #resultregion and #legende containers.]
76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/js/__init__.py: -------------------------------------------------------------------------------- 1 | __import__('pkg_resources').declare_namespace(__name__) -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/js/ner-demo.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function(){ 2 | 3 | $('#nerform').submit( 4 | function(e){ 5 | e.preventDefault(); 6 | 7 | update(); 8 | } 9 | ); 10 | 11 | $.get( "models") 12 | .done( 13 | function( data ) { 14 | var tmp=""; 15 | $.each(data, 16 | function(index, item){ 17 | 18 | selected="" 19 | if (item.default) { 20 | selected = "selected" 21 | } 22 | 23 | tmp += '' 24 | }); 25 | $('#model').html(tmp); 26 | 27 | var url_params = new URLSearchParams(window.location.search); 28 | 29 | var do_update=false; 30 | 31 | if (url_params.has('text')) { 32 | 33 | var text = decodeURIComponent(url_params.get('text')) 34 | 35 | $('#inputtext').val(text); 36 | 37 | do_update = true; 38 | 39 | window.history.replaceState({}, '', `${location.pathname}`); 40 | } 41 | 42 | task_select() 43 | 44 | if (do_update) update(); 45 | } 46 | ); 47 | }); 48 | 49 | function update() { 50 | 51 | var task = $('#task').val(); 52 | var model_id = $('#model').val(); 53 | var input_text = $('#inputtext').val() 54 | 55 | if (input_text.length < 30000) { 56 | 57 | var url_params = new URLSearchParams(window.location.search); 58 | 59 | url_params.set('text', encodeURIComponent(input_text)) 60 | 61 | window.history.replaceState({}, '', `${location.pathname}?${url_params}`); 62 | } 63 | else { 64 | window.history.replaceState({}, '', `${location.pathname}`); 65 | } 66 | 67 | 68 | 69 | do_task(task, model_id, input_text); 70 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/static/js/ner.js: -------------------------------------------------------------------------------- 1 | 2 | function task_select() { 3 | 4 | var task = $('#task').val(); 5 | 6 | if ((task != "ner") && (task != "bert-tokens")){ 7 | $('#model_select').hide() 8 | } 9 | else { 10 | $('#model_select').show() 11 | } 12 | 13 | $("#resultregion").html(""); 14 | $("#legende").html(""); 15 | } 16 | 17 | function do_task(task, model_id, input_text) { 18 | 19 | var post_data = { "text" : input_text } 20 | 21 | var text_region_html = 22 | `
23 |
24 | Ergebnis: 25 |
26 |
27 |
28 |
29 |
`; 30 | 31 | var legende_html = 32 | `
33 |
34 | Legende: 35 |
[Person]
36 |
[Ort]
37 |
[Organisation]
38 |
[keine Named Entity]
39 |
40 |
`; 41 | 42 | var spinner_html = 43 | `
44 |
45 | Loading... 46 |
47 |
`; 48 | 49 | $("#legende").html(""); 50 | 51 | if (task == "fulltext") { 52 | $("#resultregion").html(text_region_html) 53 | $("#textregion").html(input_text) 54 | } 55 | else if (task == "tokenize") { 56 | 57 | $("#resultregion").html(spinner_html) 58 | 59 | $.ajax( 60 | { 61 | url: "tokenized", 62 | data: JSON.stringify(post_data), 63 | type: 'POST', 64 | contentType: "application/json", 65 | success: 66 | function( data ) { 67 | text_html = "" 68 | data.forEach( 69 | function(sentence) { 70 | 71 | text_html += JSON.stringify(sentence) 72 | 73 | text_html += '
' 74 | } 75 | ) 76 | $("#resultregion").html(text_region_html) 77 | $("#textregion").html(text_html) 78 | $("#legende").html(legende_html) 79 | } 80 | , 81 | error: 82 | function(error) { 83 | console.log(error); 84 | } 85 | }) 86 | } 87 | else if (task == "ner") { 88 | 89 | $("#resultregion").html(spinner_html) 90 | 91 | $.ajax({ 92 | url: "ner/" + model_id, 93 | data: JSON.stringify(post_data), 94 | type: 'POST', 95 | contentType: "application/json", 96 | success: 97 | function( data ) { 98 | text_html = "" 99 | data.forEach( 100 | function(sentence) { 101 | sentence.forEach( 102 | function(token) { 103 | 104 | if (text_html != "") text_html += ' ' 105 | 106 | if (token.prediction == 'O') 107 | text_html += token.word 108 | else if (token.prediction.endsWith('PER')) 109 | text_html += '' + token.word + '' 110 | else if (token.prediction.endsWith('LOC')) 111 | text_html += '' + token.word + '' 112 | else if (token.prediction.endsWith('ORG')) 113 | text_html += '' + token.word + '' 114 | }) 115 | text_html += '
' 116 | } 117 | ) 118 | $("#resultregion").html(text_region_html) 119 | $("#textregion").html(text_html) 120 | $("#legende").html(legende_html) 121 | } 122 | , 123 | error: function(error) { 124 | console.log(error); 125 | } 126 | }); 127 | } 128 | else if (task == "bert-tokens") { 129 | $("#resultregion").html(spinner_html); 130 | 131 | $.ajax( 132 | { 133 | url: "ner-bert-tokens/" + model_id, 134 | data: JSON.stringify(post_data), 135 | type: 'POST', 136 | contentType: "application/json", 137 | success: 138 | function( data ) { 139 | text_html = "" 140 | data.forEach( 141 | function(sentence) { 142 | sentence.forEach( 143 | function(part) { 144 | 145 | if (text_html != "") text_html += ' ' 146 | 147 | text_html += part.token + "(" + part.prediction + ")" 148 | }) 149 | text_html += '
' 150 | } 151 | ) 152 | $("#resultregion").html(text_region_html) 153 | $("#textregion").html(text_html) 154 | $("#legende").html(legende_html) 155 | } 156 | , 157 | error: 158 | function(error) { 159 | console.log(error); 160 | } 161 | }) 162 | } 163 | } -------------------------------------------------------------------------------- /qurator/sbb_ner/webapp/wsgi.py: -------------------------------------------------------------------------------- 1 | from .app import app 2 | import logging 3 | 4 | if __name__ == "__main__": 5 | app.run() 6 | else: 7 | gunicorn_logger = logging.getLogger('gunicorn.error') 8 | app.logger.handlers = gunicorn_logger.handlers 9 | app.logger.setLevel(gunicorn_logger.level) 10 | 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | tqdm 4 | pytorch-pretrained-bert==0.6.2 5 | scikit-learn 6 | click 7 | langid 8 | seqeval 9 | conlleval 10 | toolz 11 | cloudpickle 12 | pytest 13 | pytest-cov 14 | flask 15 | Flask-Caching 16 | gunicorn 17 | somajo 18 | qurator-sbb-utils @ git+https://github.com/qurator-spk/sbb_utils.git 19 | qurator-sbb-tools @ git+https://github.com/qurator-spk/sbb_tools.git 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | from setuptools import find_packages, setup 3 | 4 | with open('requirements.txt') as fp: 5 | install_requires = fp.read() 6 | 7 | setup( 8 | name="qurator-sbb-ner", 9 | version="0.0.1", 10 | author="The Qurator Team", 11 | author_email="qurator@sbb.spk-berlin.de", 12 | description="Qurator", 13 | long_description=open("README.md", "r", encoding='utf-8').read(), 14 | long_description_content_type="text/markdown", 15 | keywords='qurator', 16 | license='Apache', 17 | url="https://qurator.ai", 18 | packages=find_packages(exclude=["*.tests", "*.tests.*", 19 | "tests.*", "tests"]), 20 | package_data={'': ['*.html', '*.js', '*.css', '*.map', '*.png', '*.txt']}, 21 | install_requires=install_requires, 22 | entry_points={ 23 | 'console_scripts': [ 24 | "compile_europeana_historic=qurator.sbb_ner.ground_truth.europeana_historic:main", 25 | "compile_germ_eval=qurator.sbb_ner.ground_truth.germeval:main", 26 | "compile_conll=qurator.sbb_ner.ground_truth.conll:main", 27 | "compile_wikiner=qurator.sbb_ner.ground_truth.wikiner:main", 28 | "join-gt=qurator.sbb_ner.ground_truth.join_gt:main", 29 | "bert-ner=qurator.sbb_ner.models.bert:main", 30 | 31 | "collectcorpus=qurator.sbb_ner.models.corpus:collect", 32 | "bert-pregenerate-trainingdata=qurator.sbb_ner.models.pregenerate_training_data:main", 33 | "bert-finetune=qurator.sbb_ner.models.finetune_on_pregenerated:main" 34 | ] 35 | }, 36 | python_requires='>=3.6.0', 37 | tests_require=['pytest'], 38 | classifiers=[ 39 | 'Intended Audience :: Science/Research', 40 | 'License :: OSI Approved :: Apache Software License', 41 | 'Programming Language :: Python :: 3', 42 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 43 | ], 44 | ) --------------------------------------------------------------------------------
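Usage note (editorial addition, not a file of the repository): the web service defined in qurator/sbb_ner/webapp/app.py can also be called directly instead of through the browser demo. The sketch below is a minimal, hypothetical client; it assumes the Flask app is running on the default port 5000 (as started by the Dockerfile CMD), that the models referenced in the active config file exist under data/, and that the third-party requests package is installed (it is not listed in requirements.txt).

# Minimal client sketch for the sbb_ner web service -- not part of the repository.
# Assumptions: Flask serves on http://localhost:5000 and the configured models are available.
import requests

BASE_URL = "http://localhost:5000"

# GET /models returns the MODELS list from the active config file
# (config.json or config-8GB-GPU.json).
models = requests.get(BASE_URL + "/models").json()
default_model = next(m for m in models if m["default"])

text = "Albrecht Dürer wurde in Nürnberg geboren."

# POST /ner/<model_id> expects a JSON body with a "text" field and returns one list
# per sentence, each element being a dict of the form {"word": ..., "prediction": ...}.
resp = requests.post(BASE_URL + "/ner/{}".format(default_model["id"]), json={"text": text})
resp.raise_for_status()

for sentence in resp.json():
    print(" ".join("{}/{}".format(t["word"], t["prediction"]) for t in sentence))

The /ner-bert-tokens/<model_id> and /tokenized endpoints accept the same request body; the former returns the BERT word-piece tokens with their predicted tags, the latter only the sentence-split tokenization.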