├── stanza ├── models │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── seq2seq_constant.py │ │ ├── exceptions.py │ │ ├── trainer.py │ │ ├── stanza_object.py │ │ ├── count_ner_coverage.py │ │ ├── maxout_linear.py │ │ ├── count_pretrain_coverage.py │ │ └── convert_pretrain.py │ ├── coref │ │ ├── __init__.py │ │ ├── tokenizer_customization.py │ │ ├── const.py │ │ ├── coref_chain.py │ │ ├── loss.py │ │ ├── config.py │ │ └── predict.py │ ├── langid │ │ ├── __init__.py │ │ └── trainer.py │ ├── lemma │ │ ├── __init__.py │ │ ├── scorer.py │ │ ├── vocab.py │ │ ├── edit.py │ │ └── attach_lemma_classifier.py │ ├── mwt │ │ ├── __init__.py │ │ ├── scorer.py │ │ └── vocab.py │ ├── ner │ │ └── __init__.py │ ├── pos │ │ ├── __init__.py │ │ ├── scorer.py │ │ └── xpos_vocab_utils.py │ ├── classifiers │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── config.py │ │ └── base_classifier.py │ ├── constituency │ │ ├── __init__.py │ │ ├── evaluate_treebanks.py │ │ └── tree_stack.py │ ├── depparse │ │ └── __init__.py │ ├── tokenization │ │ ├── __init__.py │ │ └── vocab.py │ ├── lemma_classifier │ │ ├── __init__.py │ │ ├── constants.py │ │ └── baseline_model.py │ └── _training_logging.py ├── pipeline │ ├── __init__.py │ ├── demo │ │ ├── __init__.py │ │ ├── loading.gif │ │ ├── Astloch-Bold.ttf │ │ ├── Liberation_Sans-Regular.ttf │ │ ├── PT_Sans-Caption-Web-Regular.ttf │ │ ├── README.md │ │ └── stanza-brat.css │ ├── external │ │ ├── __init__.py │ │ └── corenlp_converter_depparse.py │ ├── registry.py │ ├── _constants.py │ └── mwt_processor.py ├── resources │ ├── __init__.py │ └── print_charlm_depparse.py ├── tests │ ├── mwt │ │ ├── __init__.py │ │ └── test_utils.py │ ├── ner │ │ ├── __init__.py │ │ ├── test_models_ner_scorer.py │ │ ├── test_from_conllu.py │ │ ├── test_combine_ner_datasets.py │ │ ├── test_convert_starlang_ner.py │ │ ├── test_ner_trainer.py │ │ └── test_pay_amt_annotators.py │ ├── pos │ │ └── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── test_short_name_to_treebank.py 
│ │ ├── test_dropout.py │ │ ├── test_bert_embedding.py │ │ ├── test_chuliu_edmonds.py │ │ ├── test_foundation_cache.py │ │ ├── test_common_data.py │ │ ├── test_relative_attn.py │ │ └── test_data_objects.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ner │ │ │ ├── __init__.py │ │ │ ├── test_utils.py │ │ │ └── test_prepare_ner_file.py │ │ ├── coref │ │ │ ├── __init__.py │ │ │ └── test_hebrew_iahlt.py │ │ └── test_vietnamese_renormalization.py │ ├── depparse │ │ └── __init__.py │ ├── langid │ │ └── __init__.py │ ├── lemma │ │ └── __init__.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── test_arabic_pipeline.py │ │ ├── test_pipeline_depparse_processor.py │ │ ├── test_pipeline_sentiment_processor.py │ │ ├── pipeline_device_tests.py │ │ └── test_pipeline_pos_processor.py │ ├── server │ │ ├── __init__.py │ │ ├── test_morphology.py │ │ ├── test_ud_enhancer.py │ │ ├── test_tokensregex.py │ │ ├── test_parser_eval.py │ │ └── test_server_pretokenized.py │ ├── classifiers │ │ └── __init__.py │ ├── constituency │ │ ├── __init__.py │ │ ├── test_positional_encoding.py │ │ ├── test_tree_stack.py │ │ └── test_convert_starlang.py │ ├── resources │ │ ├── __init__.py │ │ ├── test_default_packages.py │ │ ├── test_prepare_resources.py │ │ ├── test_charlm_depparse.py │ │ └── test_installation.py │ ├── tokenization │ │ ├── __init__.py │ │ ├── test_tokenize_files.py │ │ └── test_replace_long_tokens.py │ ├── lemma_classifier │ │ ├── __init__.py │ │ └── test_training.py │ ├── data │ │ ├── external_server.properties │ │ ├── tiny_emb.csv │ │ ├── tiny_emb.txt │ │ ├── test.dat │ │ ├── tiny_emb.gz │ │ ├── tiny_emb.pt │ │ ├── tiny_emb.xz │ │ ├── tiny_emb.zip │ │ └── aws_annotations.zip │ └── pytest.ini ├── utils │ ├── __init__.py │ ├── ner │ │ └── __init__.py │ ├── charlm │ │ └── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ner │ │ │ ├── __init__.py │ │ │ ├── combine_ner_datasets.py │ │ │ ├── count_entities.py │ │ │ ├── compare_entities.py │ │ │ ├── convert_kk_kazNERD.py │ │ │ ├── 
preprocess_wikiner.py │ │ │ ├── convert_nytk.py │ │ │ ├── convert_en_conll03.py │ │ │ ├── convert_starlang_ner.py │ │ │ ├── convert_mr_l3cube.py │ │ │ ├── json_to_bio.py │ │ │ ├── check_for_duplicates.py │ │ │ └── conll_to_iob.py │ │ ├── pos │ │ │ ├── __init__.py │ │ │ └── remove_columns.py │ │ ├── coref │ │ │ ├── __init__.py │ │ │ ├── balance_languages.py │ │ │ └── convert_hebrew_mixed.py │ │ ├── pretrain │ │ │ ├── __init__.py │ │ │ └── word_in_pretrain.py │ │ ├── sentiment │ │ │ ├── __init__.py │ │ │ └── process_vsfc_vietnamese.py │ │ ├── constituency │ │ │ ├── __init__.py │ │ │ ├── count_common_words.py │ │ │ ├── common_trees.py │ │ │ ├── utils.py │ │ │ ├── convert_spmrl.py │ │ │ ├── treebank_to_labeled_brackets.py │ │ │ ├── relabel_tags.py │ │ │ ├── reduce_dataset.py │ │ │ ├── extract_all_silver_dataset.py │ │ │ └── extract_silver_dataset.py │ │ ├── tokenization │ │ │ └── __init__.py │ │ ├── vietnamese │ │ │ └── __init__.py │ │ ├── prepare_pos_treebank.py │ │ ├── thai_syllable_dict_generator.py │ │ └── contract_mwt.py │ ├── lemma │ │ ├── __init__.py │ │ └── count_ambiguous_lemmas.py │ ├── pretrain │ │ ├── __init__.py │ │ └── compare_pretrains.py │ ├── training │ │ └── __init__.py │ ├── constituency │ │ ├── __init__.py │ │ ├── list_tensors.py │ │ ├── grep_test_logs.py │ │ ├── check_transitions.py │ │ └── grep_dev_logs.py │ ├── languages │ │ └── __init__.py │ ├── visualization │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── constants.py │ │ └── README │ ├── max_mwt_length.py │ ├── select_backoff.py │ ├── avg_sent_len.py │ ├── helper_func.py │ ├── get_tqdm.py │ └── default_paths.py ├── _version.py ├── server │ ├── __init__.py │ └── tokensregex.py ├── __init__.py └── protobuf │ └── __init__.py ├── images └── stanza-logo.png ├── LICENSE ├── demo ├── scenegraph.py ├── ssurgeon_script.txt ├── semgrex.py ├── semgrex_sample.conllu ├── CONLL_Dependency_Visualizer_Example.ipynb └── Dependency_Visualization_Testing.ipynb ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md 
│ ├── bug_report.md │ └── question.md ├── pull_request_template.md ├── stale.yml └── workflows │ └── stanza-tests.yaml ├── .travis.yml ├── CONTRIBUTING.md └── scripts └── config.sh /stanza/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/resources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/mwt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/ner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/pos/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/ner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/coref/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/langid/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/lemma/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/mwt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/ner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/pos/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/pipeline/demo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/depparse/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/langid/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/lemma/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/charlm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/lemma/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/stanza/models/constituency/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/depparse/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/pipeline/external/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/constituency/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/datasets/ner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/resources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/constituency/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /stanza/utils/datasets/ner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/pos/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/languages/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/models/lemma_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/datasets/coref/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/lemma_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/coref/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/sentiment/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/constituency/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/utils/datasets/vietnamese/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stanza/tests/data/external_server.properties: -------------------------------------------------------------------------------- 1 | annotators = tokenize,ssplit,pos 2 | -------------------------------------------------------------------------------- /stanza/tests/data/tiny_emb.csv: -------------------------------------------------------------------------------- 1 | 3 4 2 | unban,1,2,3,4 3 | mox,5,6,7,8 4 | opal,9,10,11,12 5 | -------------------------------------------------------------------------------- /stanza/tests/data/tiny_emb.txt: -------------------------------------------------------------------------------- 1 | 3 4 2 | unban 1 2 3 4 3 | mox 5 6 7 8 4 | opal 9 10 11 12 5 | -------------------------------------------------------------------------------- /images/stanza-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/images/stanza-logo.png -------------------------------------------------------------------------------- /stanza/tests/data/test.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/test.dat -------------------------------------------------------------------------------- /stanza/tests/data/tiny_emb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.gz -------------------------------------------------------------------------------- /stanza/tests/data/tiny_emb.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.pt -------------------------------------------------------------------------------- /stanza/tests/data/tiny_emb.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.xz -------------------------------------------------------------------------------- /stanza/pipeline/demo/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/loading.gif -------------------------------------------------------------------------------- /stanza/tests/data/tiny_emb.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.zip -------------------------------------------------------------------------------- /stanza/pipeline/demo/Astloch-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/Astloch-Bold.ttf -------------------------------------------------------------------------------- /stanza/tests/data/aws_annotations.zip: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/aws_annotations.zip -------------------------------------------------------------------------------- /stanza/models/_training_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger('stanza') 4 | logger.setLevel(logging.DEBUG) -------------------------------------------------------------------------------- /stanza/_version.py: -------------------------------------------------------------------------------- 1 | """ Single source of truth for version number """ 2 | 3 | __version__ = "1.11.0" 4 | __resources_version__ = '1.11.0' 5 | -------------------------------------------------------------------------------- /stanza/pipeline/demo/Liberation_Sans-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/Liberation_Sans-Regular.ttf -------------------------------------------------------------------------------- /stanza/pipeline/demo/PT_Sans-Caption-Web-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/PT_Sans-Caption-Web-Regular.ttf -------------------------------------------------------------------------------- /stanza/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | travis: all tests that will be run in travis CI 4 | client: all tests that are related to the CoreNLP client interface 5 | pipeline: all tests that are related to the Stanza neural pipeline 6 | -------------------------------------------------------------------------------- /stanza/pipeline/registry.py: 
-------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | # these two get filled by register_processor 4 | NAME_TO_PROCESSOR_CLASS = dict() 5 | PIPELINE_NAMES = [] 6 | 7 | # this gets filled by register_processor_variant 8 | PROCESSOR_VARIANTS = defaultdict(dict) 9 | -------------------------------------------------------------------------------- /stanza/models/common/seq2seq_constant.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants for seq2seq models. 3 | """ 4 | 5 | PAD = '<PAD>' 6 | PAD_ID = 0 7 | UNK = '<UNK>' 8 | UNK_ID = 1 9 | SOS = '<SOS>' 10 | SOS_ID = 2 11 | EOS = '<EOS>' 12 | EOS_ID = 3 13 | 14 | VOCAB_PREFIX = [PAD, UNK, SOS, EOS] 15 | 16 | EMB_INIT_RANGE = 1.0 17 | INFINITY_NUMBER = 1e12 18 | -------------------------------------------------------------------------------- /stanza/pipeline/_constants.py: -------------------------------------------------------------------------------- 1 | """ Module defining constants """ 2 | 3 | # string constants for processor names 4 | LANGID = 'langid' 5 | TOKENIZE = 'tokenize' 6 | MWT = 'mwt' 7 | POS = 'pos' 8 | LEMMA = 'lemma' 9 | DEPPARSE = 'depparse' 10 | NER = 'ner' 11 | SENTIMENT = 'sentiment' 12 | CONSTITUENCY = 'constituency' 13 | COREF = 'coref' 14 | -------------------------------------------------------------------------------- /stanza/utils/max_mwt_length.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import json 4 | 5 | def max_mwt_length(filenames): 6 | max_len = 0 7 | for filename in filenames: 8 | with open(filename) as f: 9 | d = json.load(f) 10 | max_len = max([max_len] + [len(" ".join(x[0][1])) for x in d]) 11 | return max_len 12 | 13 | if __name__ == '__main__': 14 | print(max_mwt_length(sys.argv[1:])) 15 | -------------------------------------------------------------------------------- /stanza/models/mwt/scorer.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Utils and wrappers for scoring MWT 3 | """ 4 | from stanza.models.common.utils import ud_scores 5 | 6 | def score(system_conllu_file, gold_conllu_file): 7 | """ Wrapper for word segmenter scorer. """ 8 | evaluation = ud_scores(gold_conllu_file, system_conllu_file) 9 | el = evaluation["Words"] 10 | p, r, f = el.precision, el.recall, el.f1 11 | return p, r, f 12 | 13 | -------------------------------------------------------------------------------- /stanza/utils/datasets/constituency/count_common_words.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from collections import Counter 4 | 5 | from stanza.models.constituency import parse_tree 6 | from stanza.models.constituency import tree_reader 7 | 8 | word_counter = Counter() 9 | count_words = lambda x: word_counter.update(x.leaf_labels()) 10 | 11 | tree_reader.read_tree_file(sys.argv[1], tree_callback=count_words) 12 | print(word_counter.most_common()[:100]) 13 | -------------------------------------------------------------------------------- /stanza/utils/constituency/list_tensors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lists all the tensors in a constituency model. 3 | 4 | Currently useful in combination with torchshow for displaying a series of tensors as they change. 
5 | """ 6 | 7 | import sys 8 | 9 | from stanza.models.constituency.trainer import Trainer 10 | 11 | 12 | trainer = Trainer.load(sys.argv[1]) 13 | model = trainer.model 14 | 15 | for name, param in model.named_parameters(): 16 | print(name, param.requires_grad) 17 | -------------------------------------------------------------------------------- /stanza/models/lemma_classifier/constants.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | UNKNOWN_TOKEN = "unk" # token name for unknown tokens 4 | UNKNOWN_TOKEN_IDX = -1 # custom index we apply to unknown tokens 5 | 6 | # TODO: ModelType could just be LSTM and TRANSFORMER 7 | # and then the transformer baseline would have the transformer as another argument 8 | class ModelType(Enum): 9 | LSTM = 1 10 | TRANSFORMER = 2 11 | BERT = 3 12 | ROBERTA = 4 13 | 14 | DEFAULT_BATCH_SIZE = 16 -------------------------------------------------------------------------------- /stanza/utils/select_backoff.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | backoff_models = { "UD_Breton-KEB": "ga_idt", 4 | "UD_Czech-PUD": "cs_pdt", 5 | "UD_English-PUD": "en_ewt", 6 | "UD_Faroese-OFT": "nn_nynorsk", 7 | "UD_Finnish-PUD": "fi_tdt", 8 | "UD_Japanese-Modern": "ja_gsd", 9 | "UD_Naija-NSC": "en_ewt", 10 | "UD_Swedish-PUD": "sv_talbanken" 11 | } 12 | 13 | print(backoff_models[sys.argv[1]]) 14 | -------------------------------------------------------------------------------- /stanza/models/common/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | A couple more specific FileNotFoundError exceptions 3 | 4 | The idea being, the caller can catch it and report a more useful error resolution 5 | """ 6 | 7 | import errno 8 | 9 | class ForwardCharlmNotFoundError(FileNotFoundError): 10 | def __init__(self, msg, filename): 11 | super().__init__(errno.ENOENT, msg, filename) 12 | 
13 | class BackwardCharlmNotFoundError(FileNotFoundError): 14 | def __init__(self, msg, filename): 15 | super().__init__(errno.ENOENT, msg, filename) 16 | -------------------------------------------------------------------------------- /stanza/utils/visualization/utils.py: -------------------------------------------------------------------------------- 1 | def find_nth(haystack, needle, n): 2 | """ 3 | Returns the starting index of the nth occurrence of the substring 'needle' in the string 'haystack'. 4 | """ 5 | start = haystack.find(needle) 6 | while start >= 0 and n > 1: 7 | start = haystack.find(needle, start + len(needle)) 8 | n -= 1 9 | return start 10 | 11 | 12 | def round_base(num, base=10): 13 | """ 14 | Rounding a number to its nearest multiple of the base. round_base(49.2, base=50) = 50. 15 | """ 16 | return base * round(num / base) -------------------------------------------------------------------------------- /stanza/models/lemma/scorer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utils and wrappers for scoring lemmatizers. 3 | """ 4 | 5 | import logging 6 | 7 | from stanza.models.common.utils import ud_scores 8 | 9 | logger = logging.getLogger('stanza') 10 | 11 | def score(system_conllu_file, gold_conllu_file): 12 | """ Wrapper for lemma scorer. 
""" 13 | logger.debug("Evaluating system file %s vs gold file %s", system_conllu_file, gold_conllu_file) 14 | evaluation = ud_scores(gold_conllu_file, system_conllu_file) 15 | el = evaluation["Lemmas"] 16 | p, r, f = el.precision, el.recall, el.f1 17 | return p, r, f 18 | 19 | -------------------------------------------------------------------------------- /stanza/utils/avg_sent_len.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | def avg_sent_len(toklabels): 5 | if toklabels.endswith('.json'): 6 | with open(toklabels, 'r') as f: 7 | l = json.load(f) 8 | 9 | l = [''.join([str(x[1]) for x in para]) for para in l] 10 | else: 11 | with open(toklabels, 'r') as f: 12 | l = ''.join(f.readlines()) 13 | 14 | l = l.split('\n\n') 15 | 16 | sentlen = [len(x) + 1 for para in l for x in para.split('2')] 17 | return sum(sentlen) / len(sentlen) 18 | 19 | if __name__ == '__main__': 20 | print(avg_sent_len(sys.argv[1])) 21 | -------------------------------------------------------------------------------- /stanza/utils/datasets/constituency/common_trees.py: -------------------------------------------------------------------------------- 1 | """ 2 | Look through 2 files, only output the common trees 3 | 4 | pretty basic - could use some more options 5 | """ 6 | 7 | import sys 8 | 9 | def main(): 10 | in1 = sys.argv[1] 11 | with open(in1, encoding="utf-8") as fin: 12 | lines1 = fin.readlines() 13 | in2 = sys.argv[2] 14 | with open(in2, encoding="utf-8") as fin: 15 | lines2 = fin.readlines() 16 | 17 | common = [l1 for l1, l2 in zip(lines1, lines2) if l1 == l2] 18 | for l in common: 19 | print(l.strip()) 20 | 21 | if __name__ == '__main__': 22 | main() 23 | 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 The Board of Trustees of The Leland Stanford Junior University 2 
| 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /stanza/tests/common/test_short_name_to_treebank.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import stanza 4 | from stanza.models.common import short_name_to_treebank 5 | 6 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline] 7 | 8 | def test_short_name(): 9 | assert short_name_to_treebank.short_name_to_treebank("en_ewt") == "UD_English-EWT" 10 | 11 | def test_canonical_name(): 12 | assert short_name_to_treebank.canonical_treebank_name("UD_URDU-UDTB") == "UD_Urdu-UDTB" 13 | assert short_name_to_treebank.canonical_treebank_name("ur_udtb") == "UD_Urdu-UDTB" 14 | assert short_name_to_treebank.canonical_treebank_name("Unban_Mox_Opal") == "Unban_Mox_Opal" 15 | -------------------------------------------------------------------------------- /stanza/server/__init__.py: -------------------------------------------------------------------------------- 1 | from stanza.protobuf import to_text 2 | from stanza.protobuf import Document, Sentence, Token, IndexedWord, Span 3 | from stanza.protobuf import ParseTree, DependencyGraph, CorefChain 4 | from stanza.protobuf import Mention, NERMention, Entity, Relation, RelationTriple, Timex 5 | from stanza.protobuf import Quote, SpeakerInfo 6 | from stanza.protobuf import Operator, Polarity 7 | from stanza.protobuf import 
SentenceFragment, TokenLocation 8 | from stanza.protobuf import MapStringString, MapIntString 9 | from .client import CoreNLPClient, AnnotationException, TimeoutException, PermanentlyFailedException, StartServer 10 | from .annotator import Annotator 11 | -------------------------------------------------------------------------------- /demo/scenegraph.py: -------------------------------------------------------------------------------- 1 | """ 2 | Very short demo for the SceneGraph interface in the CoreNLP server 3 | 4 | Requires CoreNLP >= 4.5.5, Stanza >= 1.5.1 5 | """ 6 | 7 | import json 8 | 9 | from stanza.server import CoreNLPClient 10 | 11 | # start_server=None if you have the server running in another process on the same host 12 | # you can start it with whatever normal options CoreNLPClient has 13 | # 14 | # preload=False avoids having the server unnecessarily load annotators 15 | # if you don't plan on using them 16 | with CoreNLPClient(preload=False) as client: 17 | result = client.scenegraph("Jennifer's antennae are on her head.") 18 | print(json.dumps(result, indent=2)) 19 | 20 | 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /stanza/models/lemma/vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from stanza.models.common.vocab import BaseVocab, BaseMultiVocab 4 | from stanza.models.common.seq2seq_constant import VOCAB_PREFIX 5 | 6 | class Vocab(BaseVocab): 7 | def build_vocab(self): 8 | counter = Counter(self.data) 9 | self._id2unit = VOCAB_PREFIX + list(sorted(list(counter.keys()), key=lambda k: counter[k], reverse=True)) 10 | self._unit2id = {w:i for i, w in enumerate(self._id2unit)} 11 | 12 | class MultiVocab(BaseMultiVocab): 13 | @classmethod 14 | def load_state_dict(cls, state_dict): 15 | new = cls() 16 | for k,v in state_dict.items(): 17 | new[k] = Vocab.load_state_dict(v) 18 | return new 19 | -------------------------------------------------------------------------------- /stanza/utils/constituency/grep_test_logs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | filenames = sys.argv[1:] 5 | 6 | total_score = 0.0 7 | num_scores = 0 8 | 9 | for filename in filenames: 10 | grep_cmd = ["grep", "F1 score.*test.*", filename] 11 | grep_result = subprocess.run(grep_cmd, stdout=subprocess.PIPE, encoding="utf-8") 12 | grep_result = grep_result.stdout.strip() 13 | if not grep_result: 14 | print("{}: no result".format(filename)) 15 | continue 16 | 17 | score = float(grep_result.split()[-1]) 18 | print("{}: {}".format(filename, score)) 19 | total_score += score 20 | num_scores += 1 21 | 22 | if num_scores > 0: 23 | avg = total_score / num_scores 24 | print("Avg: {}".format(avg)) 25 | -------------------------------------------------------------------------------- /stanza/models/common/trainer.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class Trainer: 4 | def change_lr(self, new_lr): 5 | for param_group in self.optimizer.param_groups: 6 | param_group['lr'] = new_lr 7 | 8 | def save(self, filename): 9 | savedict = { 10 | 'model': self.model.state_dict(), 11 | 'optimizer': self.optimizer.state_dict() 12 | } 13 | torch.save(savedict, filename) 14 | 15 | def load(self, filename): 16 | savedict = torch.load(filename, lambda storage, loc: storage, weights_only=True) 17 | 18 | self.model.load_state_dict(savedict['model']) 19 | if self.args['mode'] == 'train': 20 | self.optimizer.load_state_dict(savedict['optimizer']) 21 | -------------------------------------------------------------------------------- /stanza/models/lemma/edit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for calculating edits between word and lemma forms. 3 | """ 4 | 5 | EDIT_TO_ID = {'none': 0, 'identity': 1, 'lower': 2} 6 | 7 | def get_edit_type(word, lemma): 8 | """ Calculate edit types. """ 9 | if lemma == word: 10 | return 'identity' 11 | elif lemma == word.lower(): 12 | return 'lower' 13 | return 'none' 14 | 15 | def edit_word(word, pred, edit_id): 16 | """ 17 | Edit a word, given edit and seq2seq predictions. 
18 | """ 19 | if edit_id == 1: 20 | return word 21 | elif edit_id == 2: 22 | return word.lower() 23 | elif edit_id == 0: 24 | return pred 25 | else: 26 | raise Exception("Unrecognized edit ID: {}".format(edit_id)) 27 | 28 | -------------------------------------------------------------------------------- /stanza/models/mwt/vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from stanza.models.common.vocab import BaseVocab 4 | import stanza.models.common.seq2seq_constant as constant 5 | 6 | class Vocab(BaseVocab): 7 | def build_vocab(self): 8 | pairs = self.data 9 | allchars = "".join([src + tgt for src, tgt in pairs]) 10 | counter = Counter(allchars) 11 | 12 | self._id2unit = constant.VOCAB_PREFIX + list(sorted(list(counter.keys()), key=lambda k: counter[k], reverse=True)) 13 | self._unit2id = {w:i for i, w in enumerate(self._id2unit)} 14 | 15 | def add_unit(self, unit): 16 | if unit in self._unit2id: 17 | return 18 | self._unit2id[unit] = len(self._id2unit) 19 | self._id2unit.append(unit) 20 | -------------------------------------------------------------------------------- /stanza/models/pos/scorer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utils and wrappers for scoring taggers. 3 | """ 4 | import logging 5 | 6 | from stanza.models.common.utils import ud_scores 7 | 8 | logger = logging.getLogger('stanza') 9 | 10 | def score(system_conllu_file, gold_conllu_file, verbose=True, eval_type='AllTags'): 11 | """ Wrapper for tagger scorer. 
""" 12 | evaluation = ud_scores(gold_conllu_file, system_conllu_file) 13 | el = evaluation[eval_type] 14 | p = el.precision 15 | r = el.recall 16 | f = el.f1 17 | if verbose: 18 | scores = [evaluation[k].f1 * 100 for k in ['UPOS', 'XPOS', 'UFeats', 'AllTags']] 19 | logger.info("UPOS\tXPOS\tUFeats\tAllTags") 20 | logger.info("{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}".format(*scores)) 21 | return p, r, f 22 | 23 | -------------------------------------------------------------------------------- /stanza/utils/datasets/pos/remove_columns.py: -------------------------------------------------------------------------------- 1 | """ 2 | Remove xpos and feats from each file given at the command line. 3 | 4 | Useful to strip unwanted tags when combining files of two different 5 | types (or two different stages in the annotation process). 6 | 7 | Super rudimentary right now. Will be upgraded if needed 8 | """ 9 | 10 | import sys 11 | 12 | from stanza.utils.conll import CoNLL 13 | 14 | def remove_columns(filename): 15 | doc = CoNLL.conll2doc(filename) 16 | 17 | for sentence in doc.sentences: 18 | for word in sentence.words: 19 | word.xpos = None 20 | word.feats = None 21 | 22 | CoNLL.write_doc2conll(doc, filename) 23 | 24 | if __name__ == '__main__': 25 | for filename in sys.argv[1:]: 26 | remove_columns(filename) 27 | -------------------------------------------------------------------------------- /stanza/tests/server/test_morphology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the most basic functionality of the morphology script 3 | """ 4 | 5 | import pytest 6 | 7 | from stanza.server.morphology import Morphology, process_text 8 | 9 | words = ["Jennifer", "has", "the", "prettiest", "antennae"] 10 | tags = ["NNP", "VBZ", "DT", "JJS", "NNS"] 11 | expected = ["Jennifer", "have", "the", "pretty", "antenna"] 12 | 13 | def test_process_text(): 14 | result = process_text(words, tags) 15 | lemma = [x.lemma for x in result.words] 16 | 
print(lemma) 17 | assert lemma == expected 18 | 19 | def test_basic_morphology(): 20 | with Morphology() as morph: 21 | result = morph.process(words, tags) 22 | lemma = [x.lemma for x in result.words] 23 | assert lemma == expected 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Environment (please complete the following information):** 24 | - OS: [e.g. Windows, Ubuntu, CentOS, MacOS] 25 | - Python version: [e.g. Python 3.6.8 from Anaconda] 26 | - Stanza version: [e.g., 1.0.0] 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /stanza/models/coref/tokenizer_customization.py: -------------------------------------------------------------------------------- 1 | """ This file defines functions used to modify the default behaviour 2 | of transformers.AutoTokenizer. These changes are necessary, because some 3 | tokenizers are meant to be used with raw text, while the OntoNotes documents 4 | have already been split into words. 5 | All the functions are used in coref_model.CorefModel._get_docs. 
""" 6 | 7 | 8 | # Filters out unwanted tokens produced by the tokenizer 9 | TOKENIZER_FILTERS = { 10 | "albert-xxlarge-v2": (lambda token: token != "▁"), # U+2581, not just "_" 11 | "albert-large-v2": (lambda token: token != "▁"), 12 | } 13 | 14 | # Maps some words to tokens directly, without a tokenizer 15 | TOKENIZER_MAPS = { 16 | "roberta-large": {".": ["."], ",": [","], "!": ["!"], "?": ["?"], 17 | ":":[":"], ";":[";"], "'s": ["'s"]} 18 | } 19 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | **BEFORE YOU START**: please make sure your pull request is against the `dev` branch. 2 | We cannot accept pull requests against the `main` branch. 3 | See our [contributing guide](https://github.com/stanfordnlp/stanza/blob/main/CONTRIBUTING.md) for details. 4 | 5 | ## Description 6 | A brief and concise description of what your pull request is trying to accomplish. 7 | 8 | ## Fixes Issues 9 | A list of issues/bugs with # references. (e.g., #123) 10 | 11 | ## Unit test coverage 12 | Are there unit tests in place to make sure your code is functioning correctly? 13 | (see [here](https://github.com/stanfordnlp/stanza/blob/master/tests/test_tagger.py) for a simple example) 14 | 15 | ## Known breaking changes/behaviors 16 | Does this break anything in Stanza's existing user interface? If so, what is it and how is it addressed? 
17 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | - fixed on dev 10 | - bug 11 | - enhancement 12 | # Label to use when marking an issue as stale 13 | staleLabel: stale 14 | # Comment to post when marking an issue as stale. Set to `false` to disable 15 | markComment: > 16 | This issue has been automatically marked as stale because it has not had 17 | recent activity. It will be closed if no further activity occurs. Thank you 18 | for your contributions. 19 | # Comment to post when closing a stale issue. Set to `false` to disable 20 | closeComment: > 21 | This issue has been automatically closed due to inactivity. 22 | -------------------------------------------------------------------------------- /stanza/tests/tokenization/test_tokenize_files.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from stanza.models.tokenization import tokenize_files 4 | from stanza.tests import TEST_MODELS_DIR 5 | 6 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis] 7 | 8 | EXPECTED = """ 9 | This is a test . This is a second sentence . 10 | I took my daughter ice skating 11 | """.lstrip() 12 | 13 | def test_tokenize_files(tmp_path): 14 | input_file = tmp_path / "input.txt" 15 | with open(input_file, "w") as fout: 16 | fout.write("This is a test. 
This is a second sentence.\n\nI took my daughter ice skating") 17 | 18 | output_file = tmp_path / "output.txt" 19 | tokenize_files.main([str(input_file), "--lang", "en", "--output_file", str(output_file), "--model_dir", TEST_MODELS_DIR]) 20 | 21 | with open(output_file) as fin: 22 | text = fin.read() 23 | 24 | assert EXPECTED == text 25 | -------------------------------------------------------------------------------- /stanza/utils/lemma/count_ambiguous_lemmas.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read in a UD file, report any word/UPOS pairs which get lemmatized to different lemmas 3 | """ 4 | 5 | from collections import Counter, defaultdict 6 | import sys 7 | 8 | from stanza.utils.conll import CoNLL 9 | 10 | filename = sys.argv[1] 11 | print(filename) 12 | 13 | lemma_counters = defaultdict(Counter) 14 | 15 | doc = CoNLL.conll2doc(input_file=filename) 16 | for sentence in doc.sentences: 17 | for word in sentence.words: 18 | text = word.text 19 | upos = word.upos 20 | lemma = word.lemma 21 | 22 | lemma_counters[(text, upos)][lemma] += 1 23 | 24 | keys = lemma_counters.keys() 25 | keys = sorted(keys, reverse=True, key=lambda x: sum(lemma_counters[x][y] for y in lemma_counters[x])) 26 | for text, upos in keys: 27 | if len(lemma_counters[(text, upos)]) > 1: 28 | print(text, upos, lemma_counters[(text, upos)]) 29 | 30 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.6.5 4 | notifications: 5 | email: false 6 | install: 7 | - pip install --quiet . 
8 | - export CORENLP_HOME=~/corenlp-latest CORENLP_VERSION=stanford-corenlp-latest 9 | - export CORENLP_URL="http://nlp.stanford.edu/software/${CORENLP_VERSION}.zip" 10 | - wget $CORENLP_URL -O corenlp-latest.zip 11 | - unzip corenlp-latest.zip > unzip.log 12 | - export CORENLP_UNZIP=`grep creating unzip.log | head -n 1 | cut -d ":" -f 2` 13 | - mv $CORENLP_UNZIP $CORENLP_HOME 14 | - mkdir ~/stanza_test 15 | - mkdir ~/stanza_test/in 16 | - mkdir ~/stanza_test/out 17 | - mkdir ~/stanza_test/scripts 18 | - cp tests/data/external_server.properties ~/stanza_test/scripts 19 | - cp tests/data/example_french.json ~/stanza_test/out 20 | - cp tests/data/tiny_emb.* ~/stanza_test/in 21 | - export STANZA_TEST_HOME=~/stanza_test 22 | script: 23 | - python -m pytest -m travis tests/ 24 | -------------------------------------------------------------------------------- /stanza/resources/print_charlm_depparse.py: -------------------------------------------------------------------------------- 1 | """ 2 | A small utility script to output which depparse models use charlm 3 | 4 | (It should skip en_genia, en_craft, but currently doesn't) 5 | 6 | Not frequently useful, but seems like the kind of thing that might get used a couple times 7 | """ 8 | 9 | from stanza.resources.common import load_resources_json 10 | from stanza.resources.default_packages import default_charlms, depparse_charlms 11 | 12 | def list_depparse(): 13 | charlm_langs = list(default_charlms.keys()) 14 | resources = load_resources_json() 15 | 16 | models = ["%s_%s" % (lang, model) for lang in charlm_langs for model in resources[lang].get("depparse", {}) 17 | if lang not in depparse_charlms or model not in depparse_charlms[lang] or depparse_charlms[lang][model] is not None] 18 | return models 19 | 20 | if __name__ == "__main__": 21 | models = list_depparse() 22 | print(" ".join(models)) 23 | -------------------------------------------------------------------------------- 
/stanza/tests/resources/test_default_packages.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import stanza 4 | 5 | from stanza.resources import default_packages 6 | 7 | def test_default_pretrains(): 8 | """ 9 | Test that all languages with a default treebank have a default pretrain or are specifically marked as not having a pretrain 10 | """ 11 | for lang in default_packages.default_treebanks.keys(): 12 | assert lang in default_packages.no_pretrain_languages or lang in default_packages.default_pretrains, "Lang %s does not have a default pretrain marked!" % lang 13 | 14 | def test_no_pretrain_languages(): 15 | """ 16 | Test that no languages have no_default_pretrain marked despite having a pretrain 17 | """ 18 | for lang in default_packages.no_pretrain_languages: 19 | assert lang not in default_packages.default_pretrains, "Lang %s is marked as no_pretrain but has a default pretrain!" % lang 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /demo/ssurgeon_script.txt: -------------------------------------------------------------------------------- 1 | # To run this, use the stanza/server/ssurgeon.py main file. 2 | # For example: 3 | # python3 stanza/server/ssurgeon.py --edit_file demo/ssurgeon_script.txt --no_print_input --input_file ../data/ud2_11/UD_English-Pronouns/en_pronouns-ud-test.conllu > en_pronouns.updated.conllu 4 | # This script updates the UD 2.11 version of UD_English-Pronouns to 5 | # better match punctuation attachments, MWT, and no double subjects. 6 | 7 | # This turns unwanted csubj into advcl 8 | {}=source >nsubj {} >csubj=bad {} 9 | relabelNamedEdge -edge bad -reln advcl 10 | 11 | # This detects punctuations which are not attached to the root and reattaches them 12 | {word:/[.]/}=punct )`. 
24 | -------------------------------------------------------------------------------- /stanza/utils/visualization/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants used for visualization tooling 3 | """ 4 | 5 | # Ssurgeon constants 6 | SAMPLE_SSURGEON_DOC = """ 7 | # sent_id = 271 8 | # text = Hers is easy to clean. 9 | # previous = What did the dealer like about Alex's car? 10 | # comment = extraction/raising via "tough extraction" and clausal subject 11 | 1 Hers hers PRON PRP Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs 3 nsubj _ _ 12 | 2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 cop _ _ 13 | 3 easy easy ADJ JJ Degree=Pos 0 root _ _ 14 | 4 to to PART TO _ 5 mark _ _ 15 | 5 clean clean VERB VB VerbForm=Inf 3 csubj _ SpaceAfter=No 16 | 6 . . PUNCT . _ 5 punct _ _ 17 | """ 18 | 19 | # Semgrex constants 20 | DEFAULT_SAMPLE_TEXT = "Banning opal removed artifact decks from the meta." 21 | DEFAULT_SEMGREX_QUERY = "{pos:NN}=object 0 and num_zeros < batch.shape[0] 29 | -------------------------------------------------------------------------------- /stanza/utils/datasets/ner/combine_ner_datasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from stanza.utils.default_paths import get_default_paths 4 | from stanza.utils.datasets.ner.utils import combine_dataset 5 | 6 | SHARDS = ("train", "dev", "test") 7 | 8 | def main(args=None): 9 | ner_data_dir = get_default_paths()['NER_DATA_DIR'] 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--output_dataset', type=str, help='What dataset to output') 13 | parser.add_argument('input_datasets', type=str, nargs='+', help='Which datasets to input') 14 | 15 | parser.add_argument('--input_dir', type=str, default=ner_data_dir, help='Which directory to find the datasets') 16 | parser.add_argument('--output_dir', type=str, default=ner_data_dir, 
help='Which directory to write the dataset') 17 | args = parser.parse_args(args) 18 | 19 | input_dir = args.input_dir 20 | output_dir = args.output_dir 21 | 22 | combine_dataset(input_dir, output_dir, args.input_datasets, args.output_dataset) 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /stanza/models/lemma/attach_lemma_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from stanza.models.lemma.trainer import Trainer 4 | from stanza.models.lemma_classifier.base_model import LemmaClassifier 5 | 6 | def attach_classifier(input_filename, output_filename, classifiers): 7 | trainer = Trainer(model_file=input_filename) 8 | 9 | for classifier in classifiers: 10 | classifier = LemmaClassifier.load(classifier) 11 | trainer.contextual_lemmatizers.append(classifier) 12 | 13 | trainer.save(output_filename) 14 | 15 | def main(args=None): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--input', type=str, required=True, help='Which lemmatizer to start from') 18 | parser.add_argument('--output', type=str, required=True, help='Where to save the lemmatizer') 19 | parser.add_argument('--classifier', type=str, required=True, nargs='+', help='Lemma classifier to attach') 20 | args = parser.parse_args(args) 21 | 22 | attach_classifier(args.input, args.output, args.classifier) 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /stanza/tests/datasets/test_vietnamese_renormalization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from stanza.utils.datasets.vietnamese import renormalize 5 | 6 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline] 7 | 8 | def test_replace_all(): 9 | text = "SỌAmple tụy test file" 10 | expected = "SOẠmple tuỵ test file" 
11 | 12 | assert renormalize.replace_all(text) == expected 13 | 14 | def test_replace_file(tmp_path): 15 | text = "SỌAmple tụy test file" 16 | expected = "SOẠmple tuỵ test file" 17 | 18 | orig = tmp_path / "orig.txt" 19 | converted = tmp_path / "converted.txt" 20 | 21 | with open(orig, "w", encoding="utf-8") as fout: 22 | for i in range(10): 23 | fout.write(text) 24 | fout.write("\n") 25 | 26 | renormalize.convert_file(orig, converted) 27 | 28 | assert os.path.exists(converted) 29 | with open(converted, encoding="utf-8") as fin: 30 | lines = fin.readlines() 31 | 32 | assert len(lines) == 10 33 | for i in lines: 34 | assert i.strip() == expected 35 | 36 | -------------------------------------------------------------------------------- /stanza/tests/pipeline/test_arabic_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small test of loading the Arabic pipeline 3 | 4 | The main goal is to check that nothing goes wrong with RtL languages, 5 | but incidentally this would have caught a bug where the xpos tags 6 | were split into individual pieces instead of reassembled as expected 7 | """ 8 | 9 | import pytest 10 | import stanza 11 | 12 | from stanza.tests import TEST_MODELS_DIR 13 | 14 | pytestmark = pytest.mark.pipeline 15 | 16 | def test_arabic_pos_pipeline(): 17 | pipe = stanza.Pipeline(**{'processors': 'tokenize,pos', 'dir': TEST_MODELS_DIR, 'download_method': None, 'lang': 'ar'}) 18 | text = "ولم يتم اعتقال احد بحسب المتحدث باسم الشرطة." 
19 | 20 | doc = pipe(text) 21 | # the first token translates to "and not", seems common enough 22 | # that we should be able to rely on it having a stable MWT and tag 23 | 24 | assert len(doc.sentences) == 1 25 | assert doc.sentences[0].tokens[0].text == "ولم" 26 | assert doc.sentences[0].words[0].xpos == "C---------" 27 | assert doc.sentences[0].words[1].xpos == "F---------" 28 | -------------------------------------------------------------------------------- /stanza/__init__.py: -------------------------------------------------------------------------------- 1 | from stanza.pipeline.core import DownloadMethod, Pipeline 2 | from stanza.pipeline.multilingual import MultilingualPipeline 3 | from stanza.models.common.doc import Document 4 | from stanza.resources.common import download 5 | from stanza.resources.installation import install_corenlp, download_corenlp_models 6 | from stanza._version import __version__, __resources_version__ 7 | 8 | import logging 9 | logger = logging.getLogger('stanza') 10 | 11 | # if the client application hasn't set the log level, we set it 12 | # ourselves to INFO 13 | if logger.level == 0: 14 | logger.setLevel(logging.INFO) 15 | 16 | log_handler = logging.StreamHandler() 17 | log_formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", 18 | datefmt='%Y-%m-%d %H:%M:%S') 19 | log_handler.setFormatter(log_formatter) 20 | 21 | # also, if the client hasn't added any handlers for this logger 22 | # (or a default handler), we add a handler of our own 23 | # 24 | # client can later do 25 | # logger.removeHandler(stanza.log_handler) 26 | if not logger.hasHandlers(): 27 | logger.addHandler(log_handler) 28 | -------------------------------------------------------------------------------- /demo/semgrex_sample.conllu: -------------------------------------------------------------------------------- 1 | 2 | # sent_id = reviews-181748-0003 3 | # text = My experience was awful though. 
4 | 1 My my PRON PRP$ Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs 2 nmod:poss 2:nmod:poss _ 5 | 2 experience experience NOUN NN Number=Sing 4 nsubj 4:nsubj _ 6 | 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 cop 4:cop _ 7 | 4 awful awful ADJ JJ Degree=Pos 0 root 0:root _ 8 | 5 though though ADV RB _ 4 advmod 4:advmod SpaceAfter=No 9 | 6 . . PUNCT . _ 4 punct 4:punct _ 10 | 11 | 12 | 13 | # sent_id = reviews-117115-0005 14 | # text = The intruders slit the screen of the window. 15 | 1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _ 16 | 2 intruders intruder NOUN NNS Number=Plur 3 nsubj 3:nsubj _ 17 | 3 slit slit VERB VBD Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 0 root 0:root _ 18 | 4 the the DET DT Definite=Def|PronType=Art 5 det 5:det _ 19 | 5 screen screen NOUN NN Number=Sing 3 obj 3:obj _ 20 | 6 of of ADP IN _ 8 case 8:case _ 21 | 7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _ 22 | 8 window window NOUN NN Number=Sing 5 nmod 5:nmod:of SpaceAfter=No 23 | 9 . . PUNCT . 
_ 3 punct 3:punct _ 24 | 25 | -------------------------------------------------------------------------------- /stanza/tests/common/test_bert_embedding.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from stanza.models.common.bert_embedding import load_bert, extract_bert_embeddings 5 | 6 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline] 7 | 8 | BERT_MODEL = "hf-internal-testing/tiny-bert" 9 | 10 | @pytest.fixture(scope="module") 11 | def tiny_bert(): 12 | m, t = load_bert(BERT_MODEL) 13 | return m, t 14 | 15 | def test_load_bert(tiny_bert): 16 | """ 17 | Empty method that just tests loading the bert 18 | """ 19 | m, t = tiny_bert 20 | 21 | def test_run_bert(tiny_bert): 22 | m, t = tiny_bert 23 | device = next(m.parameters()).device 24 | extract_bert_embeddings(BERT_MODEL, t, m, [["This", "is", "a", "test"]], device, True) 25 | 26 | def test_run_bert_empty_word(tiny_bert): 27 | m, t = tiny_bert 28 | device = next(m.parameters()).device 29 | foo = extract_bert_embeddings(BERT_MODEL, t, m, [["This", "is", "-", "a", "test"]], device, True) 30 | bar = extract_bert_embeddings(BERT_MODEL, t, m, [["This", "is", "", "a", "test"]], device, True) 31 | 32 | assert len(foo) == 1 33 | assert torch.allclose(foo[0], bar[0]) 34 | -------------------------------------------------------------------------------- /stanza/tests/resources/test_prepare_resources.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import stanza 4 | import stanza.resources.prepare_resources as prepare_resources 5 | 6 | from stanza.tests import * 7 | 8 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline] 9 | 10 | def test_split_model_name(): 11 | # Basic test 12 | lang, package, processor = prepare_resources.split_model_name('ro_nonstandard_tagger.pt') 13 | assert lang == 'ro' 14 | assert package == 'nonstandard' 15 | assert processor == 'pos' 16 | 17 | # 
Check that nertagger is found even though it also ends with tagger 18 | # Check that ncbi_disease is correctly partitioned despite the extra _ 19 | lang, package, processor = prepare_resources.split_model_name('en_ncbi_disease_nertagger.pt') 20 | assert lang == 'en' 21 | assert package == 'ncbi_disease' 22 | assert processor == 'ner' 23 | 24 | # assert that processors with _ in them are also okay 25 | lang, package, processor = prepare_resources.split_model_name('en_pubmed_forward_charlm.pt') 26 | assert lang == 'en' 27 | assert package == 'pubmed' 28 | assert processor == 'forward_charlm' 29 | 30 | 31 | -------------------------------------------------------------------------------- /stanza/tests/pipeline/test_pipeline_depparse_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic testing of the dependency parsing (depparse) processor 3 | """ 4 | 5 | import pytest 6 | import stanza 7 | from stanza.models.common.vocab import VOCAB_PREFIX 8 | 9 | from stanza.tests import TEST_MODELS_DIR 10 | 11 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis] 12 | 13 | class TestClassifier: 14 | @pytest.fixture(scope="class") 15 | def english_depparse(self): 16 | """ 17 | Get a depparse_processor for English 18 | """ 19 | nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'}) 20 | assert 'depparse' in nlp.processors 21 | return nlp.processors['depparse'] 22 | 23 | def test_get_known_relations(self, english_depparse): 24 | """ 25 | Test getting the known relations from a processor. 
26 | 27 | Doesn't test that all the relations exist, since who knows what will change in the future 28 | """ 29 | relations = english_depparse.get_known_relations() 30 | assert len(relations) > 5 31 | assert 'case' in relations 32 | for i in VOCAB_PREFIX: 33 | assert i not in relations 34 | -------------------------------------------------------------------------------- /stanza/models/classifiers/utils.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from torch import nn 4 | 5 | """ 6 | Defines some methods which may occur in multiple model types 7 | """ 8 | # NLP machines: 9 | # word2vec are in 10 | # /u/nlp/data/stanfordnlp/model_production/stanfordnlp/extern_data/word2vec 11 | # google vectors are in 12 | # /scr/nlp/data/wordvectors/en/google/GoogleNews-vectors-negative300.txt 13 | 14 | class WVType(Enum): 15 | WORD2VEC = 1 16 | GOOGLE = 2 17 | FASTTEXT = 3 18 | OTHER = 4 19 | 20 | class ExtraVectors(Enum): 21 | NONE = 1 22 | CONCAT = 2 23 | SUM = 3 24 | 25 | class ModelType(Enum): 26 | CNN = 1 27 | CONSTITUENCY = 2 28 | 29 | def build_output_layers(fc_input_size, fc_shapes, num_classes): 30 | """ 31 | Build a sequence of fully connected layers to go from the final conv layer to num_classes 32 | 33 | Returns an nn.ModuleList 34 | """ 35 | fc_layers = [] 36 | previous_layer_size = fc_input_size 37 | for shape in fc_shapes: 38 | fc_layers.append(nn.Linear(previous_layer_size, shape)) 39 | previous_layer_size = shape 40 | fc_layers.append(nn.Linear(previous_layer_size, num_classes)) 41 | return nn.ModuleList(fc_layers) 42 | -------------------------------------------------------------------------------- /stanza/tests/common/test_chuliu_edmonds.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test some use cases of the chuliu_edmonds algorithm 3 | 4 | (currently just the tarjan implementation) 5 | """ 6 | 7 | import numpy as np 8 | import pytest 9 | 
def test_tarjan_cycle():
    """
    tarjan should return one boolean mask per cycle in the head array
    """
    one_cycle = np.array([0, 3, 1, 2])
    found = tarjan(one_cycle)
    assert len(found) == 1
    np.testing.assert_array_equal(found[0], np.array([False, True, True, True]))

    # two disjoint cycles: nodes 1-3 and nodes 4-6
    two_cycles = np.array([0, 3, 1, 2, 5, 6, 4])
    found = tarjan(two_cycles)
    assert len(found) == 2
    masks = [np.array([False, True, True, True, False, False, False]),
             np.array([False, False, False, False, True, True, True])]
    for cycle, mask in zip(found, masks):
        np.testing.assert_array_equal(cycle, mask)
def test_charlm_cache():
    """
    A charlm loaded through the FoundationCache stays available after its file is gone
    """
    charlm_glob = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
    candidates = glob.glob(charlm_glob)
    # we expect at least one English model downloaded for the tests
    assert len(candidates) >= 1
    source_model = candidates[0]

    cache = FoundationCache()
    with tempfile.TemporaryDirectory(dir=".") as scratch_dir:
        copied_model = os.path.join(scratch_dir, "charlm.pt")
        shutil.copy2(source_model, copied_model)
        # loading directly works while the file exists
        load_charlm(copied_model)
        # loading via the cache is what saves the model
        cache.load_charlm(copied_model)

    # the temporary directory has been removed, so a direct load must fail
    with pytest.raises(FileNotFoundError):
        load_charlm(copied_model)

    # the cache should still remember the model under the same path
    cache.load_charlm(copied_model)
def _readonly_setter(self, name):
    """
    Setter used for properties which were registered without one: always raises ValueError
    """
    owner = self.__class__
    module = owner.__module__
    # fall back to the bare qualified name when the class reports no module
    qualified = owner.__qualname__ if module is None else module + '.' + owner.__qualname__
    raise ValueError(f'Property "{name}" of "{qualified}" is read-only.')

class StanzaObject(object):
    """
    Common base of the Stanza data objects, letting new annotation properties be attached to a class
    """

    @classmethod
    def add_property(cls, name, default=None, getter=None, setter=None):
        """
        Register a property self.{name} on the class, backed by the attribute self._{name}.

        default is stored on the class as _{name}.  If no getter is given, the
        property reads _{name}.  If no setter is given, assigning to the property
        raises ValueError.  Registering a name the class already has is also a
        ValueError.
        """
        if hasattr(cls, name):
            raise ValueError(f'Property by the name of {name} already exists in {cls}. Maybe you want to find another name?')

        backing_field = f'_{name}'
        setattr(cls, backing_field, default)

        def read_backing_field(self):
            return getattr(self, backing_field)

        def refuse_assignment(self, value):
            return _readonly_setter(self, name)

        accessor = property(read_backing_field if getter is None else getter,
                            refuse_assignment if setter is None else setter)
        setattr(cls, name, accessor)
def copy_dev_test(base_path, input_dataset, output_dataset):
    """
    Reuse the dev and test files of one dataset under the name of another

    Inside base_path, {input_dataset}_dev.mrg and {input_dataset}_test.mrg are
    copied with shutil.copy2 to the same filenames prefixed with output_dataset.
    """
    # dev first, then test
    for template in ("%s_dev.mrg", "%s_test.mrg"):
        source = os.path.join(base_path, template % input_dataset)
        destination = os.path.join(base_path, template % output_dataset)
        shutil.copy2(source, destination)
def split_treebank(treebank, train_size, dev_size):
    """
    Split a treebank deterministically

    treebank: a sliceable sequence of trees
    train_size: fraction of the trees (0 to 1) which go to the train section
    dev_size: fraction of the trees (0 to 1) which go to the dev section

    Returns a tuple (train, dev, test) of consecutive slices; test is whatever
    remains after train and dev.
    """
    num_trees = len(treebank)
    # Fractions such as 0.7 + 0.2 are not exact in binary floating point, so
    # 10 * (0.7 + 0.2) evaluates to 8.999999999999998 and a bare int() would
    # cut a tree from the section.  Snapping to 9 decimals first only changes
    # the result when the product is within rounding noise of an integer.
    train_end = int(round(num_trees * train_size, 9))
    dev_end = int(round(num_trees * (train_size + dev_size), 9))
    return treebank[:train_end], treebank[train_end:dev_end], treebank[dev_end:]
def count_entities(*filenames):
    """
    Print the number of tokens in each file and the number of entities of each type over all files

    filenames: paths to json files, each of which is loaded and passed to Document

    Nothing is returned; the counts are printed, with the entity types in sorted order.
    """
    entity_collection = defaultdict(list)

    for filename in filenames:
        # read explicitly as UTF-8 rather than the platform default encoding,
        # which would fail on non-ASCII text wherever the default is not UTF-8
        with open(filename, encoding="utf-8") as fin:
            doc = Document(json.load(fin))
        num_tokens = sum(1 for sentence in doc.sentences for token in sentence.tokens)
        print("Number of tokens in %s: %d" % (filename, num_tokens))
        entities = list_doc_entities(doc)

        # each entity is indexed as ent[0] (the value collected) and ent[1] (the key it is grouped by)
        for ent in entities:
            entity_collection[ent[1]].append(ent[0])

    keys = sorted(entity_collection.keys())
    for k in keys:
        print(k, len(entity_collection[k]))
def read_ner(filename):
    """
    Read the words which belong to an entity from a tab separated NER file

    Each non-blank line is expected to have the word in the first column and
    its tag in the second.  Blank lines and words tagged 'O' are skipped.

    Returns the list of remaining words, in file order.
    Raises IndexError if a non-blank line has no tab separated second column.
    """
    words = []
    # the context manager closes the file (the handle used to be leaked), and
    # the explicit encoding keeps non-ASCII text from depending on the platform default
    with open(filename, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            pieces = line.split("\t")
            if pieces[1] == 'O':
                continue
            words.append(pieces[0])
    return words
def test_add():
    """
    The amount AddSinusoidalEncoding adds should not depend on the input values or the batch size
    """
    add_encoding = AddSinusoidalEncoding(d_model=10, max_len=4)
    # run on zeros, the output is exactly what the module adds
    offset = add_encoding(torch.zeros(1, 4, 10))

    single = torch.randn(1, 4, 10)
    assert torch.allclose(add_encoding(single) - single, offset, atol=1e-07)

    batch = torch.randn(2, 4, 10)
    shifted = add_encoding(batch)
    for row in range(2):
        assert torch.allclose(shifted[row] - batch[row], offset, atol=1e-07)
def copy_conllu_file_or_zip(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
    """
    Copy one prepared file of a treebank, using the zipped version if there is one

    If {tokenizer_dir}/{short_name}.{tokenizer_file}.zip exists, it is copied to
    {dest_dir}/{short_name}.{dest_file}.zip.  Otherwise the work is handed to
    prepare_tokenizer_treebank.copy_conllu_file with the same arguments.
    """
    original = f"{tokenizer_dir}/{short_name}.{tokenizer_file}.zip"
    copied = f"{dest_dir}/{short_name}.{dest_file}.zip"

    if not os.path.exists(original):
        # no zip was produced for this file, so fall back to the plain copy
        prepare_tokenizer_treebank.copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name)
        return

    print("Copying from %s to %s" % (original, copied))
    shutil.copyfile(original, copied)
'converter') 10 | class ConverterDepparse(ProcessorVariant): 11 | # set of processor requirements for this processor 12 | REQUIRES_DEFAULT = set([TOKENIZE, CONSTITUENCY]) 13 | 14 | def __init__(self, config): 15 | if config['lang'] != 'en': 16 | raise ValueError("Constituency to dependency converter only works for English") 17 | 18 | # TODO: get classpath from config 19 | # TODO: close this when finished? 20 | # a more involved approach would be to turn the Pipeline into 21 | # a context with __enter__ and __exit__ 22 | # __exit__ would try to free all resources, although some 23 | # might linger such as GPU allocations 24 | # maybe it isn't worth even trying to clean things up on account of that 25 | self.converter = DependencyConverter(classpath="$CLASSPATH") 26 | self.converter.open_pipe() 27 | 28 | def process(self, document): 29 | return self.converter.process(document) 30 | -------------------------------------------------------------------------------- /stanza/tests/ner/test_from_conllu.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from stanza import Pipeline 4 | from stanza.utils.conll import CoNLL 5 | from stanza.tests import TEST_MODELS_DIR 6 | 7 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis] 8 | 9 | def test_from_conllu(): 10 | """ 11 | If the doc does not have the entire text available, make sure it still safely processes the text 12 | 13 | Test case supplied from user - see issue #1428 14 | """ 15 | pipe = Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize,ner", download_method=None) 16 | doc = pipe("In February, I traveled to Seattle. Dr. 
def main():
    """
    Score one or more predicted treebanks against a gold treebank

    The command line takes the gold file followed by one or more prediction
    files.  With a single prediction file, kbest is None; with several, kbest
    is the number of prediction files.
    """
    parser = argparse.ArgumentParser(description='Get scores for one or more treebanks against the gold')
    parser.add_argument('gold', type=str, help='Which file to load as the gold trees')
    parser.add_argument('pred', type=str, nargs='+', help='Which file(s) are the predictions. If more than one is given, the evaluation will be "k-best" with the first prediction treated as the canonical')
    args = parser.parse_args()

    print("Loading gold treebank: " + args.gold)
    gold = tree_reader.read_treebank(args.gold)
    # args.pred is a list because of nargs='+', so it must be joined before it
    # can be appended to a str; concatenating it directly raised TypeError
    print("Loading predicted treebanks: " + ", ".join(args.pred))
    pred = [tree_reader.read_treebank(x) for x in args.pred]

    # pair each gold tree with the tree at the same position in every prediction file
    full_results = [ParseResult(parses[0], [*parses[1:]])
                    for parses in zip(gold, *pred)]

    if len(pred) <= 1:
        kbest = None
    else:
        kbest = len(pred)

    with EvaluateParser(kbest=kbest) as evaluator:
        # NOTE(review): the response is not used here - presumably the
        # evaluator reports the scores itself; confirm against EvaluateParser
        response = evaluator.process(full_results)
def main():
    """
    Report, for each word on the command line, whether it is in the vocab of a pretrain

    The pretrain comes either from --pretrain (a path to a converted PT file) or
    from --package (LANG_NAME, which is downloaded and then read from
    DEFAULT_MODEL_DIR).  Exactly one of the two must be given.
    """
    parser = argparse.ArgumentParser()
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--pretrain", default=None, type=str, help="Where to read the converted PT file")
    source.add_argument("--package", default=None, type=str, help="Use a pretrain package instead")
    parser.add_argument("--download_json", default=False, action='store_true', help="Download the json even if it already exists")
    parser.add_argument("words", type=str, nargs="+", help="Which words to search for")
    args = parser.parse_args()

    pt_filename = args.pretrain
    if not pt_filename:
        # the package is named LANG_NAME; only the first underscore separates the two
        lang, package = args.package.split("_", 1)
        download(lang=lang, package=None, processors={"pretrain": package}, download_json=args.download_json)
        pt_filename = os.path.join(DEFAULT_MODEL_DIR, lang, "pretrain", "%s.pt" % package)
    pt = Pretrain(pt_filename)

    for word in args.words:
        print("{}: {}".format(word, word in pt.vocab))
class MaxoutLinear(nn.Module):
    """
    Maxout layer: maxout_k linear maps from in_channels to out_channels, keeping the elementwise max

    The maxout_k maps are stored as one nn.Linear of width out_channels * maxout_k.
    """
    def __init__(self, in_channels, out_channels, maxout_k):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.maxout_k = maxout_k

        self.linear = nn.Linear(in_channels, out_channels * maxout_k)

    def forward(self, inputs):
        """
        Apply the single wide linear, then reduce over the maxout_k pieces

        The last dimension of the linear output is viewed as (maxout_k, out_channels)
        and the max is taken over the maxout_k axis, leaving (..., out_channels)
        """
        projected = self.linear(inputs)
        leading_dims = projected.shape[:-1]
        pieces = projected.view(*leading_dims, self.maxout_k, self.out_channels)
        return pieces.max(dim=-2).values
def test_len():
    """
    len() of a TreeStack should be 1 for a fresh stack and grow by one with each push
    """
    bottom = TreeStack(value=5, parent=None, length=1)
    assert len(bottom) == 1

    # push returns the new top of the stack, so the pushes can be chained
    top = bottom.push(3).push(1)
    assert len(top) == 3
def get_tqdm():
    """
    Return a tqdm appropriate for the situation

    imports tqdm depending on if we're at a console, redir to a file, notebook, etc

    from @tcrimi at https://github.com/tqdm/tqdm/issues/506

    This replaces `import tqdm`, so for example, you do this:
      from stanza.utils.get_tqdm import get_tqdm
      tqdm = get_tqdm()
    then do this when you want a scroll bar or regular iterator depending on context:
      tqdm(list)

    If there is no tty, the returned tqdm will always be disabled
    unless disable=False is specifically set.
    """
    # the type name of the running IPython shell, or "" if IPython is not installed
    ipy_str = ""
    try:
        from IPython import get_ipython
        ipy_str = str(type(get_ipython()))
    except ImportError:
        pass

    if 'zmqshell' in ipy_str:
        # tqdm.tqdm_notebook is a deprecated wrapper which warns on every call
        # and is announced for removal in tqdm 5; prefer tqdm.notebook.tqdm,
        # keeping the old name only for tqdm versions without tqdm.notebook
        try:
            from tqdm.notebook import tqdm
        except ImportError:
            from tqdm import tqdm_notebook as tqdm
        return tqdm

    from tqdm import tqdm
    if 'terminal' in ipy_str:
        return tqdm

    if sys.stderr is not None and hasattr(sys.stderr, "isatty") and sys.stderr.isatty():
        return tqdm

    def hidden_tqdm(*args, **kwargs):
        # disabled by default, but an explicit disable=... from the caller wins
        kwargs.setdefault("disable", True)
        return tqdm(*args, **kwargs)

    return hidden_tqdm
def test_list_depparse():
    """
    Check which charlm / nocharlm depparse model names are reported by list_depparse
    """
    models = list_depparse()

    # A language with no entry in depparse_charlms should still be picked up.
    # First make sure the assumption behind the test still holds...
    # if it fails, find a different language which isn't in depparse_charlms
    assert "af" not in depparse_charlms
    assert "af" in default_charlms
    for model_name in ("af_afribooms_charlm", "af_afribooms_nocharlm"):
        assert model_name in models

    # A language which does have specific charlms that aren't None.
    # Again, check the assumptions first;
    # if one of these fails, just update the test
    assert "en" in depparse_charlms
    assert "en" in default_charlms
    en_charlms = depparse_charlms["en"]
    assert "ewt" not in en_charlms
    for dataset in ("craft", "mimic"):
        assert dataset in en_charlms

    # now, check the results
    for model_name in ("en_ewt_charlm", "en_ewt_nocharlm", "en_mimic_charlm"):
        assert model_name in models
    # haven't yet trained w/ and w/o for the bio models
    for model_name in ("en_mimic_nocharlm", "en_craft_charlm"):
        assert model_name not in models
    assert "en_craft_nocharlm" in models
def report_known_entities(train_file, test_file):
    """
    Print the fraction of entities in test_file which also occur in train_file

    An entity counts as known if its first field matches the first field
    of some entity read from train_file.
    NOTE(review): the first field is presumably the entity text - confirm
    against read_json_entities.

    If test_file has no entities, a message is printed instead of a ratio.
    """
    train_entities = read_json_entities(train_file)
    test_entities = read_json_entities(test_file)

    num_test = len(test_entities)
    if num_test == 0:
        # previously this case crashed with a ZeroDivisionError
        print(train_file, test_file, "no entities found in the test file")
        return

    # build the lookup set once so each membership test is O(1)
    known = {x[0] for x in train_entities}
    total_score = sum(1 for x in test_entities if x[0] in known)
    print(train_file, test_file, total_score / num_test)
class CorefAttachment:
    """
    Pairs a chain with flags for start, end, and representative
    """
    def __init__(self, chain, is_start, is_end, is_representative):
        self.chain = chain
        self.is_start = is_start
        self.is_end = is_end
        self.is_representative = is_representative

    def to_json(self):
        """
        Return a dict with the chain's index and representative text

        Each flag is only included (as True) when it is set
        """
        result = {
            "index": self.chain.index,
            "representative_text": self.chain.representative_text,
        }
        flags = (
            ('is_start', self.is_start),
            ('is_end', self.is_end),
            ('is_representative', self.is_representative),
        )
        for key, enabled in flags:
            if enabled:
                result[key] = True
        return result
def process_doc(doc, *patterns):
    """
    Build a TokensRegexRequest from the doc and patterns, send it, and return the response

    Each sentence is added along with the number of words in the
    sentences which came before it.
    """
    request = TokensRegexRequest()
    for regex in patterns:
        request.pattern.append(regex)

    proto_doc = request.doc
    proto_doc.text = doc.text

    word_offset = 0
    for sent in doc.sentences:
        add_sentence(proto_doc.sentence, sent, word_offset)
        # count words rather than tokens, as a token may hold several words
        word_offset += sum(len(tok.words) for tok in sent.tokens)

    return send_tokensregex_request(request)
def convert_dataset(in_directory, out_directory, short_name):
    """
    Reads in train, validation, and test data and converts them to .json file

    The IOB2 files are first copied to out_directory as
    <short_name>.<shard>.bio, then converted with convert_bio_to_json.
    out_directory is created if it does not exist yet.
    """
    # NOTE(review): assumes SHARDS is ordered train, dev, test to line up with these names - confirm
    filenames = ("IOB2_train.txt", "IOB2_valid.txt", "IOB2_test.txt")
    # shutil.copy fails if the destination directory is missing
    os.makedirs(out_directory, exist_ok=True)
    for shard, filename in zip(SHARDS, filenames):
        input_filename = os.path.join(in_directory, filename)
        output_filename = os.path.join(out_directory, "%s.%s.bio" % (short_name, shard))
        shutil.copy(input_filename, output_filename)
    convert_bio_to_json(out_directory, out_directory, short_name, "bio")
def test_single_sentence():
    """
    A plain text pattern should produce exactly one match in the one sentence doc
    """
    #expected:
    #match {
    #  sentence: 0
    #  match {
    #    text: "Opal"
    #    begin: 2
    #    end: 3
    #  }
    #}

    response = tokensregex.process_doc(ONE_SENTENCE_DOC, "Opal")
    assert len(response.match) == 1
    pattern_result = response.match[0]
    assert len(pattern_result.match) == 1
    found = pattern_result.match[0]
    assert found.sentence == 0
    assert found.match.text == "Opal"
    assert found.match.begin == 2
    assert found.match.end == 3
def test_combine(tmp_path):
    """
    Test that if we write two short datasets and combine them, we get back
    one slightly longer dataset

    To simplify matters, we just use the same input text with longer
    amounts of text for each shard.
    """
    SHARDS = ("train", "dev", "test")
    # eg, 1x, 2x, 3x the test data from test_ner_training
    for copies, shard in enumerate(SHARDS, start=1):
        t1_json = tmp_path / ("en_t1.%s.json" % shard)
        write_temp_file(t1_json, "\n\n".join([EN_TRAIN_BIO] * copies))

        t2_json = tmp_path / ("en_t2.%s.json" % shard)
        write_temp_file(t2_json, "\n\n".join([EN_DEV_BIO] * copies))

    directory = str(tmp_path)
    combine_ner_datasets.main(["--output_dataset", "en_c", "en_t1", "en_t2", "--input_dir", directory, "--output_dir", directory])

    for copies, shard in enumerate(SHARDS, start=1):
        filename = tmp_path / ("en_c.%s.json" % shard)
        assert os.path.exists(filename)

        with open(filename, encoding="utf-8") as fin:
            doc = Document(json.load(fin))
            assert len(doc.sentences) == copies * 3
def preprocess_wikiner(input_file, output_file, encoding="utf-8"):
    """
    Convert a WikiNER file to one "word tag" pair per line

    Each input line is one sentence of whitespace separated items, where
    the pieces of an item are separated by "|".  The first piece is the
    text and the last piece is the tag.  A blank line is written after
    each sentence, and a blank input line becomes a -DOCSTART- line.

    input_file is read using encoding; output_file is always written as utf-8.
    """
    with open(input_file, encoding=encoding) as fin, open(output_file, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                fout.write("-DOCSTART- O\n")
                fout.write("\n")
                continue

            for word in line.split():
                pieces = word.split("|")
                text = pieces[0]
                tag = pieces[-1]
                # some words look like Daniel_Bernoulli|I-PER
                # but the original .pl conversion script didn't take that into account
                # Leading, trailing, or doubled underscores would otherwise
                # produce lines with an empty word, so drop the empty chunks.
                # If nothing is left (eg the word is just "_"), keep the text as is.
                subtext = [chunk for chunk in text.split("_") if chunk] or [text]
                if tag.startswith("B-") and len(subtext) > 1:
                    # only the first chunk begins the entity; the rest continue it
                    fout.write("{} {}\n".format(subtext[0], tag))
                    for chunk in subtext[1:]:
                        fout.write("{} I-{}\n".format(chunk, tag[2:]))
                else:
                    for chunk in subtext:
                        fout.write("{} {}\n".format(chunk, tag))
            fout.write("\n")
def convert_nytk(base_input_path, base_output_path, short_name):
    """
    Collect the no-morph files of each shard into one <short_name>.<shard>.bio file

    For each of train, dev, test, every file matching */no-morph/* under
    data/train-devel-test/<shard> is read (dev is stored under "devel").
    The first line of each file must be the expected global.columns header.
    The first and sixth tab separated columns of the remaining lines are
    written as "word<TAB>tag".  Lines with a single column become blank
    lines, and a blank line is put between consecutive files.

    Raises ValueError if a file is empty or has a different header.
    """
    for shard in ('train', 'dev', 'test'):
        if shard == 'dev':
            base_input_subdir = os.path.join(base_input_path, "data/train-devel-test/devel")
        else:
            base_input_subdir = os.path.join(base_input_path, "data/train-devel-test", shard)

        shard_lines = []
        base_input_glob = base_input_subdir + "/*/no-morph/*"
        # glob returns files in arbitrary order; sort so the output is reproducible
        subpaths = sorted(glob.glob(base_input_glob))
        print("Reading %d input files from %s" % (len(subpaths), base_input_glob))
        for input_filename in subpaths:
            if len(shard_lines) > 0:
                shard_lines.append("")
            # be explicit about the encoding instead of relying on the platform default
            with open(input_filename, encoding="utf-8") as fin:
                lines = fin.readlines()
            # an empty file has no header either, so report it the same way
            if not lines or lines[0].strip() != '# global.columns = FORM LEMMA UPOS XPOS FEATS CONLL:NER':
                raise ValueError("Unexpected format in %s" % input_filename)
            lines = [x.strip().split("\t") for x in lines[1:]]
            lines = ["%s\t%s" % (x[0], x[5]) if len(x) > 1 else "" for x in lines]
            shard_lines.extend(lines)

        bio_filename = os.path.join(base_output_path, '%s.%s.bio' % (short_name, shard))
        with open(bio_filename, "w", encoding="utf-8") as fout:
            print("Writing %d lines to %s" % (len(shard_lines), bio_filename))
            for line in shard_lines:
                fout.write(line)
                fout.write("\n")
-------------------------------------------------------------------------------- 1 | # Contributing to Stanza 2 | 3 | We would love to see contributions to Stanza from the community! Contributions that we welcome include bugfixes and enhancements. If you want to report a bug or suggest a feature but don't intend to fix or implement it yourself, please create a corresponding issue on [our issues page](https://github.com/stanfordnlp/stanza/issues). If you plan to contribute a bugfix or enhancement, please read the following. 4 | 5 | ## 🛠️ Bugfixes 6 | 7 | For bugfixes, please follow these steps: 8 | 9 | - Make sure a fix does not already exist, by searching through existing [issues](https://github.com/stanfordnlp/stanza/issues) (including closed ones) and [pull requests](https://github.com/stanfordnlp/stanza/pulls). 10 | - Confirm the bug with us by creating a bug-report issue. In your issue, you should at least include the platform and environment that you are running on, and a minimal code snippet that will reproduce the bug. 11 | - Once the bug is confirmed, you can go ahead with implementing the bugfix, and create a pull request **against the `dev` branch**. 12 | 13 | ## 💡 Enhancements 14 | 15 | For enhancements, please follow these steps: 16 | 17 | - Make sure a similar enhancement suggestion does not already exist, by searching through existing [issues](https://github.com/stanfordnlp/stanza/issues). 18 | - Create a feature-request issue and discuss this enhancement with us. We'll need to make sure this enhancement won't break the existing user interface or functionality. 19 | - Once the enhancement is confirmed with us, you can go ahead with implementing it, and create a pull request **against the `dev` branch**. 
def create_dictionary(dataset_dir, save_dir):
    """
    Build a syllable -> id dictionary from the .ssg files under dataset_dir

    All .ssg files are found recursively.  Each line is split into
    syllables, and only non-empty syllables made entirely of characters
    in the Thai block (U+0E00 - U+0E7F) with no spaces are kept.
    The kept syllables are numbered in sorted order, so the same data
    always produces the same ids, and the result is written as json to
    save_dir (which, despite the name, is used as a filename).
    """
    # kept as local imports, as in the original
    import json
    import re

    syllables = set()

    for p in pathlib.Path(dataset_dir).rglob("*.ssg"): # iterate through all files
        # the data is Thai text, so don't rely on the platform default encoding
        with open(p, encoding="utf-8") as f:
            sentences = f.readlines()

        for sentence in sentences:
            sentence = sentence.replace("\n", "")
            # NOTE(review): the separator being replaced is an empty string here,
            # which puts "~" around every character - confirm the intended marker
            sentence = sentence.replace("", "~")
            syllables.update(sentence.split("~")) # add all syllables of this line

    print(len(syllables))

    # Filter out syllables with English words
    thai_only = re.compile("^[\u0E00-\u0E7F]*$")
    kept = [s for s in syllables if s != "" and " " not in s and thai_only.match(s)]

    # sort before numbering: set iteration order changes from run to run
    a = {syllable: idx for idx, syllable in enumerate(sorted(kept))}

    print(a)
    print(len(a))
    with open(save_dir, "w", encoding="utf-8") as fp:
        json.dump(a, fp)
def check_on_cpu(pipeline):
    """
    Check that the processors are all on the CPU and that basic execution works
    """
    for name, proc in pipeline.processors.items():
        # processors without a trainer keep their model in _model instead
        model = proc._model if proc.trainer is None else proc.trainer.model
        device = next(model.parameters()).device

        assert str(device).startswith("cpu"), "Processor %s was not on the CPU" % name

    # running some text checks that there are no cpu/cuda tensor conflicts
    # when running on the CPU
    pipeline("This is a small test")
def parse_args():
    """
    Parse the command line: any number of treebanks, and a --pretrain file

    If no treebanks are given, two Armenian training sets are used.
    """
    default_treebanks = [
        "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu",
        "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu",
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('treebanks', type=str, nargs='*', help='Which treebanks to run on')
    parser.add_argument('--pretrain', type=str, default="/home/john/extern_data/wordvec/glove/armenian.pt", help='Which pretrain to use')
    parser.set_defaults(treebanks=default_treebanks)
    return parser.parse_args()
def main():
    """
    Convert a treebank to transition sequences, then verify them --iterations times
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file', type=str, default="data/constituency/en_ptb3_train.mrg", help='Input file for data loader.')
    parser.add_argument('--transition_scheme', default=TransitionScheme.IN_ORDER, type=lambda x: TransitionScheme[x.upper()],
                        help='Transition scheme to use.  {}'.format(", ".join(x.name for x in TransitionScheme)))
    parser.add_argument('--reversed', default=False, action='store_true', help='Do the transition sequence reversed')
    parser.add_argument('--iterations', default=30, type=int, help='How many times to iterate, such as if doing a cProfile')
    args = vars(parser.parse_args())

    scheme = args['transition_scheme']
    is_reversed = args['reversed']

    train_trees = tree_reader.read_treebank(args['train_file'])
    unary_limit = max(t.count_unary_depth() for t in train_trees) + 1
    train_sequences, train_transitions = transition_sequence.convert_trees_to_sequences(train_trees, "training", scheme, is_reversed)
    root_labels = Tree.get_root_labels(train_trees)
    # the same verification is repeated so that it can be profiled
    for _ in range(args['iterations']):
        verify_transitions(train_trees, train_sequences, scheme, unary_limit, is_reversed, "train", root_labels)
len(vocab1), pt1_filename, len(vocab2), pt2_filename)) 17 | 18 | eps = 0.0001 19 | total_norm = 0.0 20 | total_close = 0 21 | 22 | words_different = [] 23 | 24 | for word, idx in vocab1._unit2id.items(): 25 | if word not in vocab2: 26 | continue 27 | v1 = pt1.emb[idx] 28 | v2 = pt2.emb[pt2.vocab[word]] 29 | norm = np.linalg.norm(v1 - v2) 30 | 31 | if norm < eps: 32 | total_close += 1 33 | else: 34 | total_norm += norm 35 | if len(words_different) < 10: 36 | words_different.append("|%s|" % word) 37 | #print(word, idx, pt2.vocab[word]) 38 | #print(v1) 39 | #print(v2) 40 | 41 | if total_close < len(common_words): 42 | avg_norm = total_norm / (len(common_words) - total_close) 43 | print("%d vectors were close. Average difference of the others: %f" % (total_close, avg_norm)) 44 | print("The first few different words were:\n %s" % "\n ".join(words_different)) 45 | else: 46 | print("All %d vectors were close!" % total_close) 47 | 48 | for word, idx in vocab1._unit2id.items(): 49 | if word not in vocab2: 50 | continue 51 | if pt2.vocab[word] != idx: 52 | break 53 | else: 54 | print("All indices are the same") 55 | -------------------------------------------------------------------------------- /stanza/utils/datasets/ner/convert_en_conll03.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads (if necessary) conll03 from Huggingface, then converts it to Stanza .json 3 | 4 | Some online sources for CoNLL 2003 require multiple pieces, but it is currently hosted on HF: 5 | https://huggingface.co/datasets/conll2003 6 | """ 7 | 8 | import os 9 | 10 | from stanza.utils.default_paths import get_default_paths 11 | from stanza.utils.datasets.ner.utils import write_dataset 12 | 13 | TAG_TO_ID = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8} 14 | ID_TO_TAG = {y: x for x, y in TAG_TO_ID.items()} 15 | 16 | def convert_dataset_section(section): 17 | sentences = [] 18 | for item 
def process_dataset(short_name, conll_path, ner_output_path):
    """
    Load CoNLL 2003 through the Huggingface datasets package and write it as Stanza NER data

    short_name: dataset name, passed through to write_dataset
    conll_path: directory given to load_dataset as its cache_dir
    ner_output_path: output directory, passed through to write_dataset

    Raises ImportError (chained to the original error) if the
    datasets package cannot be imported.
    """
    try:
        from datasets import load_dataset
    except ImportError as e:
        # keep the original failure attached so the real cause is not hidden
        raise ImportError("Please install the datasets package to process CoNLL03 with Stanza") from e

    dataset = load_dataset('conll2003', cache_dir=conll_path)
    # the sections are converted in train, validation, test order
    datasets = [convert_dataset_section(dataset[shard]) for shard in ('train', 'validation', 'test')]
    write_dataset(datasets, ner_output_path, short_name)
def main():
    """
    Read a PTB treebank and write each tree on its own line using the labeled bracket format

    Command line: ptb_file label_file [--separator SEP | --no_separator]
    """
    cli = argparse.ArgumentParser(
        description="Script that converts a PTB treebank into a labeled bracketed file suitable for LM training"
    )
    cli.add_argument('ptb_file', help='Where to get the original PTB format treebank')
    cli.add_argument('label_file', help='Where to write the labeled bracketed file')
    cli.add_argument('--separator', default="_", help='What separator to use in place of spaces')
    # --no_separator writes None into the same destination as --separator
    cli.add_argument('--no_separator', dest='separator', action='store_const', const=None,
                     help="Don't use a separator")
    options = cli.parse_args()

    trees = tree_reader.read_treebank(options.ptb_file)
    logger.info("Writing %d trees to %s", len(trees), options.label_file)

    # the separator, if any, becomes part of the format spec handed to the tree
    if options.separator:
        template = "{:%sL}\n" % options.separator
    else:
        template = "{:L}\n"

    with open(options.label_file, "w", encoding="utf-8") as fout:
        for tree in tqdm(trees):
            fout.write(template.format(tree))
"""
balance_languages.py
split a language balanced test set out of the concatenated langid training file

Reads ./corefud_concat_v1_0_langid.train.json and writes
./corefud_concat_v1_0_langid-bal.train.json and
./corefud_concat_v1_0_langid-bal.test.json
"""

import json
from collections import defaultdict

from random import Random

# fix random seed for reproducibility
R = Random(42)

with open("./corefud_concat_v1_0_langid.train.json", 'r', encoding="utf-8") as df:
    raw = json.load(df)

# group the documents by language
raw_by_language = defaultdict(list)
for doc in raw:
    raw_by_language[doc["lang"]].append(doc)

# the language with the LOWEST document count sets the sample size
min_lang_count = min(len(docs) for docs in raw_by_language.values())

# sample 10% of the smallest amount for test set
# this will look like an absurdly small number, but
# remember this is DOCUMENTS not TOKENS or UTTERANCES
# so its actually decent
# also its per language
test_set_size = int(0.1*min_lang_count)

train_set = []
test_set = []
# iterate the languages in sorted order: the order of a set of strings
# changes from run to run, which would make the seeded sampling
# pick different documents each time
for lang in sorted(raw_by_language):
    docs = raw_by_language[lang]
    # a set makes the membership test below O(1)
    choices = set(R.sample(range(len(docs)), test_set_size))

    for indx, doc in enumerate(docs):
        if indx in choices:
            test_set.append(doc)
        else:
            train_set.append(doc)

with open("./corefud_concat_v1_0_langid-bal.train.json", 'w', encoding="utf-8") as df:
    json.dump(train_set, df, indent=2)

with open("./corefud_concat_v1_0_langid-bal.test.json", 'w', encoding="utf-8") as df:
    json.dump(test_set, df, indent=2)
9 | 10 | ## How to run visualizations through Streamlit 11 | 12 | Streamlit can be used to visualize Semgrex and Ssurgeon results and process files. 13 | Here are instructions for setting up a Streamlit webpage: 14 | 15 | 1. install Streamlit. `pip install streamlit` 16 | 2. install Stanford CoreNLP if you have not. You can find an installation here: https://stanfordnlp.github.io/CoreNLP/download.html 17 | 3. set the $CLASSPATH environment variable to your local installation of CoreNLP. 18 | 4. install streamlit, spacy, and ipython. You can use the "visualization" stanza setup option for that 19 | 5. Run `streamlit run stanza/utils/visualization/semgrex_app.py --theme.backgroundColor "#FFFFFF"` 20 | 21 | This should begin a Streamlit runtime application on your local machine that can be interacted with. 22 | 23 | For instructions on how to use Ssurgeon and Semgrex, refer to these helpful pages: 24 | https://aclanthology.org/2023.tlt-1.7.pdf 25 | https://nlp.stanford.edu/nlp/javadoc/javanlp-3.5.0/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html 26 | https://stanfordnlp.github.io/stanza/client_regex.html 27 | https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#query-tokensregex-tokensregex 28 | -------------------------------------------------------------------------------- /stanza/models/classifiers/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Union 3 | 4 | # TODO: perhaps put the enums in this file 5 | from stanza.models.classifiers.utils import WVType, ExtraVectors, ModelType 6 | 7 | @dataclass 8 | class CNNConfig: # pylint: disable=too-many-instance-attributes, too-few-public-methods 9 | filter_channels: Union[int, tuple] 10 | filter_sizes: tuple 11 | fc_shapes: tuple 12 | dropout: float 13 | num_classes: int 14 | wordvec_type: WVType 15 | extra_wordvec_method: ExtraVectors 16 | extra_wordvec_dim: int 17 | extra_wordvec_max_norm: float 18 | 
def filter_data(data, idx):
    """
    Keep only the sentences where every token has a value at position idx

    data: list of sentences, each a list of indexable tokens
    idx: position in each token to check for None

    Returns a new list with the surviving sentences in their original order.
    A sentence with no tokens is kept.
    """
    return [sentence for sentence in data
            if all(token[idx] is not None for token in sentence)]
def contract_mwt(infile, outfile, ignore_gapping=True):
    """
    Simplify the gold tokenizer data for use as MWT processor test files

    Each MWT range line is kept, renumbered, and marked with MWT=Yes
    in its last column, and the words the range covers are dropped.
    With ignore_gapping=True, lines whose id contains a '.' (the copy
    words used for the dependencies) are dropped as well.

    Comments and blank lines are copied through; a blank line starts
    the numbering over for the next sentence.
    """
    with open(outfile, 'w', encoding='utf-8') as fout, open(infile, 'r', encoding='utf-8') as fin:
        word_id = 0
        span_start, span_end = 0, -1
        for raw_line in fin:
            text = raw_line.strip()

            if text.startswith('#'):
                print(text, file=fout)
                continue

            if not text:
                # sentence break: reset the counter and the MWT span
                print(text, file=fout)
                word_id = 0
                span_start, span_end = 0, -1
                continue

            fields = text.split('\t')
            token_id = fields[0]

            # ignore gapping word
            if ignore_gapping and '.' in token_id:
                continue

            if '-' in token_id:
                # the range line takes the next id, but does not use it up
                span_start, span_end = [int(x) for x in token_id.split('-')]
                misc = "MWT=Yes" if fields[-1] == '_' else fields[-1] + "|MWT=Yes"
                print("{}\t{}\t{}".format(word_id + 1, "\t".join(fields[1:-1]), misc), file=fout)
                continue

            word_id += 1
            if span_start <= word_id <= span_end:
                # this word is covered by the MWT already written
                continue
            print("{}\t{}".format(word_id, "\t".join(fields[1:])), file=fout)
def main():
    """
    Run the starlang conversion using read_tree to turn each tree into (word, tag) pairs

    Whatever convert_starlang.main returns is not used here.
    """
    convert_starlang.main(conversion=read_tree, log=False)
"B-PER", "E-PER"]), 21 | (["E-PER"], ["S-PER"]), 22 | (["E-PER", "O", "E-PER"], ["S-PER", "O", "S-PER"]), 23 | (["B-PER", "E-ORG", "O", "B-PER"], ["S-PER", "S-ORG", "O", "S-PER"]), 24 | (["I-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]), 25 | (["B-PER", "I-PER", "I-PER", "O", "B-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]), 26 | (["B-PER", "I-PER", "E-PER", "O", "I-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]), 27 | (["B-PER", "I-PER", "E-PER", "O", "B-PER", "I-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]), 28 | (["I-PER", "I-PER", "I-PER", "O", "I-PER", "I-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]), 29 | ] 30 | 31 | for unfixed, expected in TESTS: 32 | assert trainer.fix_singleton_tags(unfixed) == expected, "Error converting {} to {}".format(unfixed, expected) 33 | -------------------------------------------------------------------------------- /stanza/models/coref/config.py: -------------------------------------------------------------------------------- 1 | """ Describes Config, a simple namespace for config values. 2 | 3 | For description of all config values, refer to config.toml. 4 | """ 5 | 6 | from dataclasses import dataclass 7 | from typing import Dict, List 8 | 9 | 10 | @dataclass 11 | class Config: # pylint: disable=too-many-instance-attributes, too-few-public-methods 12 | """ Contains values needed to set up the coreference model. 
def parseFromDelimitedString(obj, buf, offset=0):
    """
    Stanford CoreNLP uses the Java "writeDelimitedTo" function, which
    writes the size of the message before writing the message itself.
    This function parses one such message out of @buf into @obj,
    starting at @offset.

    @param obj: the protobuf message to fill in via ParseFromString
    @param buf: the bytes holding the size-prefixed message
    @param offset: index in @buf where the size prefix starts

    If the message cannot be decoded, a RuntimeWarning is issued
    instead of raising, and @obj is left incomplete or empty.

    @returns how many bytes of @buf were consumed, counted from @offset.
    """
    # pos is an absolute index into buf (just past the size prefix),
    # so offset must not be added to it a second time
    size, pos = _DecodeVarint(buf, offset)
    try:
        obj.ParseFromString(buf[pos:pos+size])
    except DecodeError:
        warnings.warn("Failed to decode a serialized output from CoreNLP server. An incomplete or empty object will be returned.",
                      RuntimeWarning)
    return pos + size - offset
def test_attn():
    """
    Check the output shape and compare the attention output to the value projection

    The last position is expected to match the value projection and
    the first position is expected to differ from it.
    """
    attn = RelativeAttention(d_model=100, num_heads=2, window=8, dropout=0.0)
    inputs = torch.randn(10, 13, 100)

    output = attn(inputs)
    assert output.shape == inputs.shape

    projected = attn.value(inputs)
    # presumably the last position only attends to itself in the
    # non-reverse direction - TODO confirm against RelativeAttention
    last_output = output[:, -1, :]
    last_projected = projected[:, -1, :]
    if not torch.allclose(last_output, last_projected, atol=1e-06):
        # raise with the difference so a failure shows how far off it was
        raise ValueError(last_output - last_projected)
    assert torch.allclose(last_output, last_projected, atol=1e-06)

    first_output = output[:, 0, :]
    first_projected = projected[:, 0, :]
    assert not torch.allclose(first_output, first_projected)
def main():
    """
    Retag the trees in --input_file and write them to --output_file

    Trees are written one per line, as bracket labels if
    --bracket_labels is set and as S-expressions otherwise.
    """
    args = parse_args()

    retag_pipeline = retagging.build_retag_pipeline(args)

    train_trees = tree_reader.read_treebank(args['input_file'])
    logger.info("Retagging %d trees using %s", len(train_trees), args['retag_package'])
    train_trees = retag_trees(train_trees, retag_pipeline, args['retag_xpos'])
    tree_format = "{:L}\n" if args['bracket_labels'] else "{}\n"
    # write utf-8 explicitly: the platform default encoding may not be
    # able to represent the text of the trees
    with open(args['output_file'], "w", encoding="utf-8") as fout:
        for tree in train_trees:
            fout.write(tree_format.format(tree))
def main():
    """
    Run the tests in this file when it is executed as a script

    The tests take a pytest fixture as their argument, so they cannot
    be called directly with no arguments.  Running them through
    pytest lets the fixture be built as usual.

    Raises SystemExit with pytest's exit code if the run did not succeed.
    """
    exit_code = pytest.main([__file__])
    if exit_code != 0:
        raise SystemExit(exit_code)
class Trainer:
    """
    Wraps a LangIDBiLSTM model with the optimizer and the save / load logic

    config keys read here:
      model_path: where save() writes the model
      batch_size: optional, defaults to DEFAULT_BATCH_SIZE
      load_name: model file to load, when load_model is True
      char_to_idx, tag_to_idx, lang_weights: used to build a new model otherwise
    """

    DEFAULT_BATCH_SIZE = 64
    DEFAULT_LAYERS = 2
    DEFAULT_EMBEDDING_DIM = 150
    DEFAULT_HIDDEN_DIM = 150

    def __init__(self, config, load_model=False, device=None):
        """
        Load the model named in config["load_name"] or build a new one, then create the optimizer
        """
        self.model_path = config["model_path"]
        self.batch_size = config.get("batch_size", Trainer.DEFAULT_BATCH_SIZE)
        if load_model:
            self.load(config["load_name"], device)
        else:
            self.model = LangIDBiLSTM(config["char_to_idx"], config["tag_to_idx"], Trainer.DEFAULT_LAYERS,
                                      Trainer.DEFAULT_EMBEDDING_DIM,
                                      Trainer.DEFAULT_HIDDEN_DIM,
                                      batch_size=self.batch_size,
                                      weights=config["lang_weights"]).to(device)
        self.optimizer = optim.AdamW(self.model.parameters())

    def update(self, inputs):
        """
        Run one optimizer step on a batch

        inputs: a (sentences, targets) pair
        """
        self.model.train()
        sentences, targets = inputs
        self.optimizer.zero_grad()
        y_hat = self.model.forward(sentences)
        loss = self.model.loss(y_hat, targets)
        loss.backward()
        self.optimizer.step()

    def predict(self, inputs):
        """
        Return the argmax over dim 1 of the model output for a batch

        inputs: a (sentences, targets) pair; the targets are not used
        """
        self.model.eval()
        sentences, _ = inputs
        # no gradients are needed to pick the best scoring label
        with torch.no_grad():
            return torch.argmax(self.model(sentences), dim=1)

    def save(self, label=None):
        """
        Save the model to model_path, and also to a labeled copy if label is given

        The copy is named by putting -label before the .pt extension.
        """
        # save a copy of model with label
        if label:
            # only strip the extension if it is actually there
            stem = self.model_path[:-3] if self.model_path.endswith(".pt") else self.model_path
            self.model.save(f"{stem}-{label}.pt")
        self.model.save(self.model_path)

    def load(self, model_path=None, device=None):
        """
        Replace self.model with the model loaded from model_path (default: self.model_path)
        """
        if not model_path:
            model_path = self.model_path
        self.model = LangIDBiLSTM.load(model_path, device, self.batch_size)
def main():
    """
    Cut the first shard of a constituency treebank down to --size random trees

    Reads <input>_<shard>.mrg for each shard in SHARDS from the
    CONSTITUENCY_DATA_DIR, shuffles and truncates the first shard,
    leaves the next two as they are, and writes everything back to the
    same directory under the --output name using write_dataset.

    The random seed is fixed so that the same subset is chosen each time.
    """
    parser = argparse.ArgumentParser(description="Script that cuts a treebank down to size")
    # both names are required: with a default of None the script would
    # quietly look for and write files named None_<shard>.mrg
    parser.add_argument('--input', type=str, required=True, help='Input treebank')
    parser.add_argument('--output', type=str, required=True, help='Output treebank')
    parser.add_argument('--size', type=int, default=5000, help='How many trees')
    args = parser.parse_args()

    random.seed(1234)

    paths = default_paths.get_default_paths()
    output_directory = paths["CONSTITUENCY_DATA_DIR"]

    # data/constituency/en_ptb3_train.mrg
    input_filenames = [os.path.join(output_directory, "%s_%s.mrg" % (args.input, shard)) for shard in SHARDS]
    # only the first shard is reduced; zip() below stops after three shards
    shrink_datasets = [True, False, False]

    datasets = []
    for input_filename, shrink in zip(input_filenames, shrink_datasets):
        treebank = tree_reader.read_treebank(input_filename)
        if shrink:
            random.shuffle(treebank)
            treebank = treebank[:args.size]
        datasets.append(treebank)
    write_dataset(datasets, output_directory, args.output)
'__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /stanza/tests/pipeline/test_pipeline_pos_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic testing of part of speech tagging 3 | """ 4 | 5 | import pytest 6 | import stanza 7 | 8 | from stanza.tests import * 9 | 10 | pytestmark = pytest.mark.pipeline 11 | 12 | EN_DOC = "Joe Smith was born in California." 13 | 14 | EN_DOC_GOLD = """ 15 | ]> 16 | ]> 17 | ]> 18 | ]> 19 | ]> 20 | ]> 21 | ]> 22 | """.strip() 23 | 24 | @pytest.fixture(scope="module") 25 | def pos_pipeline(): 26 | return stanza.Pipeline(**{'processors': 'tokenize,pos', 'dir': TEST_MODELS_DIR, 'download_method': None, 'lang': 'en'}) 27 | 28 | def test_part_of_speech(pos_pipeline): 29 | doc = pos_pipeline(EN_DOC) 30 | assert EN_DOC_GOLD == '\n\n'.join([sent.tokens_string() for sent in doc.sentences]) 31 | 32 | def test_get_known_xpos(pos_pipeline): 33 | tags = pos_pipeline.processors['pos'].get_known_xpos() 34 | # make sure we have xpos... 35 | assert 'DT' in tags 36 | # ... and not upos 37 | assert 'DET' not in tags 38 | 39 | def test_get_known_upos(pos_pipeline): 40 | tags = pos_pipeline.processors['pos'].get_known_upos() 41 | # make sure we have upos... 42 | assert 'DET' in tags 43 | # ... 
def main():
    """
    Command line entry point: build a Pretrain from a vectors file

    Positional arguments are the output .pt path, the input vectors
    file, and an optional maximum number of vectors (-1 by default).
    An input name ending in .csv is handed over as csv_filename;
    anything else is passed as the second positional argument.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("output_pt", default=None, help="Where to write the converted PT file")
    cli.add_argument("input_vec", default=None, help="Unconverted vectors file")
    cli.add_argument("max_vocab", type=int, default=-1, nargs="?", help="How many vectors to convert. -1 means convert them all")
    options = cli.parse_args()

    target = options.output_pt
    source = options.input_vec

    # NOTE(review): execution continues after this message; presumably
    # Pretrain itself declines to overwrite an existing file - confirm
    if os.path.exists(target):
        print("Not overwriting existing pretrain file in %s" % target)

    if source.endswith(".csv"):
        embedding = pretrain.Pretrain(target, max_vocab=options.max_vocab, csv_filename=source)
    else:
        embedding = pretrain.Pretrain(target, source, max_vocab=options.max_vocab)

    vocab_size = len(embedding.vocab)
    print("Pretrain is of size {}".format(vocab_size))
None) 36 | 37 | def test_train_transformer(tmp_path, pretrain_file): 38 | converted_files = convert_english_dataset(tmp_path) 39 | 40 | save_name = str(tmp_path / 'lemma.pt') 41 | 42 | train_file = converted_files[0] 43 | eval_file = converted_files[1] 44 | train_args = ['--bert_model', 'hf-internal-testing/tiny-bert', 45 | '--save_name', save_name, 46 | '--train_file', train_file, 47 | '--eval_file', eval_file] 48 | trainer = train_transformer_model.main(train_args) 49 | 50 | evaluate_model(trainer.model, eval_file) 51 | 52 | # test that loading the model works 53 | model = LemmaClassifier.load(save_name, None) 54 | -------------------------------------------------------------------------------- /stanza/tests/constituency/test_convert_starlang.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test a couple different classes of trees to check the output of the Starlang conversion 3 | """ 4 | 5 | import os 6 | import tempfile 7 | 8 | import pytest 9 | 10 | from stanza.utils.datasets.constituency import convert_starlang 11 | 12 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis] 13 | 14 | TREE="( (S (NP (NP {morphologicalAnalysis=bayan+NOUN+A3SG+PNON+NOM}{metaMorphemes=bayan}{turkish=Bayan}{english=Ms.}{semantics=TUR10-0396530}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580}{englishSemantics=ENG31-06352895-n}) (NP {morphologicalAnalysis=haag+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=haag}{turkish=Haag}{english=Haag}{semantics=TUR10-0000000}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580})) (VP (NP {morphologicalAnalysis=elianti+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=elianti}{turkish=Elianti}{english=Elianti}{semantics=TUR10-0000000}{namedEntity=NONE}{propBank=ARG1$TUR10-0148580}) (VP {morphologicalAnalysis=çal+VERB+POS+AOR+A3SG}{metaMorphemes=çal+Ar}{turkish=çalar}{english=plays}{semantics=TUR10-0148580}{namedEntity=NONE}{propBank=PREDICATE$TUR10-0148580}{englishSemantics=ENG31-01730049-v})) (. 
{morphologicalAnalysis=.+PUNC}{metaMorphemes=.}{metaMorphemesMoved=.}{turkish=.}{english=.}{semantics=TUR10-1081860}{namedEntity=NONE}{propBank=NONE})) )" 15 | 16 | def test_read_tree(): 17 | """ 18 | Test a basic tree read 19 | """ 20 | tree = convert_starlang.read_tree(TREE) 21 | assert "(ROOT (S (NP (NP Bayan) (NP Haag)) (VP (NP Elianti) (VP çalar)) (. .)))" == str(tree) 22 | 23 | def test_missing_word(): 24 | """ 25 | Test that an error is thrown if the word is missing 26 | """ 27 | tree_text = TREE.replace("turkish=", "foo=") 28 | with pytest.raises(ValueError): 29 | tree = convert_starlang.read_tree(tree_text) 30 | 31 | def test_bad_label(): 32 | """ 33 | Test that an unexpected label results in an error 34 | """ 35 | tree_text = TREE.replace("(S", "(s") 36 | with pytest.raises(ValueError): 37 | tree = convert_starlang.read_tree(tree_text) 38 | -------------------------------------------------------------------------------- /demo/CONLL_Dependency_Visualizer_Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c0fd86c8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n", 11 | "\n", 12 | "# load necessary conllu files - expected to be in the demo directory along with the notebook\n", 13 | "en_file = \"en_test.conllu.txt\"\n", 14 | "\n", 15 | "# testing left to right languages\n", 16 | "conll_to_visual(en_file, \"en\", sent_count=2)\n", 17 | "conll_to_visual(en_file, \"en\", sent_count=10)\n", 18 | "#conll_to_visual(en_file, \"en\", display_all=True)\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "fc4b3f9b", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n", 29 | "\n", 30 | "jp_file = 
\"japanese_test.conllu.txt\"\n", 31 | "conll_to_visual(jp_file, \"ja\")\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "6852b8e8", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n", 42 | "\n", 43 | "# testing right to left languages\n", 44 | "ar_file = \"arabic_test.conllu.txt\"\n", 45 | "conll_to_visual(ar_file, \"ar\")" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3 (ipykernel)", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.9.22" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 5 70 | } 71 | -------------------------------------------------------------------------------- /stanza/tests/resources/test_installation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test installation functions. 3 | """ 4 | 5 | import os 6 | import pytest 7 | import shutil 8 | import tempfile 9 | 10 | import stanza 11 | from stanza.tests import TEST_WORKING_DIR 12 | 13 | pytestmark = [pytest.mark.travis, pytest.mark.client] 14 | 15 | def test_install_corenlp(): 16 | # we do not reset the CORENLP_HOME variable since this may impact the 17 | # client tests 18 | with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir: 19 | 20 | # the download method doesn't install over existing directories 21 | shutil.rmtree(test_dir) 22 | stanza.install_corenlp(dir=test_dir) 23 | 24 | assert os.path.isdir(test_dir), "Installation destination directory not found." 
25 | jar_files = [f for f in os.listdir(test_dir) \ 26 | if f.endswith('.jar') and f.startswith('stanford-corenlp')] 27 | assert len(jar_files) > 0, \ 28 | "Cannot find stanford-corenlp jar files in the installation directory." 29 | assert not os.path.exists(os.path.join(test_dir, 'corenlp.zip')), \ 30 | "Downloaded zip file was not removed." 31 | 32 | def test_download_corenlp_models(): 33 | model_name = "arabic" 34 | version = "4.2.2" 35 | 36 | with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir: 37 | stanza.download_corenlp_models(model=model_name, version=version, dir=test_dir) 38 | 39 | dest_file = os.path.join(test_dir, f"stanford-corenlp-{version}-models-{model_name}.jar") 40 | assert os.path.isfile(dest_file), "Downloaded model file not found." 41 | 42 | def test_download_tokenize_mwt(): 43 | with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir: 44 | stanza.download("en", model_dir=test_dir, processors="tokenize", package="ewt", verbose=False) 45 | pipeline = stanza.Pipeline("en", model_dir=test_dir, processors="tokenize", package="ewt") 46 | assert isinstance(pipeline, stanza.Pipeline) 47 | # mwt should be added to the list 48 | assert len(pipeline.loaded_processors) == 2 49 | -------------------------------------------------------------------------------- /stanza/tests/server/test_parser_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the parser eval interface 3 | """ 4 | 5 | import pytest 6 | import stanza 7 | from stanza.models.constituency import tree_reader 8 | from stanza.protobuf import EvaluateParserRequest, EvaluateParserResponse 9 | from stanza.server.parser_eval import build_request, collate, EvaluateParser, ParseResult 10 | from stanza.tests.server.test_java_protobuf_requests import check_tree 11 | 12 | from stanza.tests import * 13 | 14 | pytestmark = [pytest.mark.travis, pytest.mark.client] 15 | 16 | def build_one_tree_treebank(fake_scores=True): 17 | text 
def convert_json_to_bio(input_filename, output_filename):
    """
    Rewrite a json NER file as a tab separated tagged text file

    The json is loaded into a Document, the (text, ner) pair of every
    token is collected sentence by sentence, and the result is passed
    through process_tags with "bioes" (presumably converting the tags
    to that scheme - see process_tags).  Each pair is then written as
    text<TAB>tag on its own line, with a blank line after each sentence.
    """
    with open(input_filename, encoding="utf-8") as json_file:
        doc = Document(json.load(json_file))

    tagged = []
    for sentence in doc.sentences:
        tagged.append([(token.text, token.ner) for token in sentence.tokens])
    tagged = process_tags(tagged, "bioes")

    with open(output_filename, "w", encoding="utf-8") as bio_file:
        for sentence in tagged:
            bio_file.writelines("%s\t%s\n" % pair for pair in sentence)
            bio_file.write("\n")
+ args.output_suffix 39 | print("%s -> %s" % (input_filename, output_filename)) 40 | convert_json_to_bio(input_filename, output_filename) 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /stanza/tests/server/test_server_pretokenized.py: -------------------------------------------------------------------------------- 1 | """ 2 | Misc tests for the server 3 | """ 4 | 5 | import pytest 6 | import re 7 | 8 | from stanza.server import CoreNLPClient 9 | 10 | pytestmark = pytest.mark.client 11 | 12 | tokens = {} 13 | tags = {} 14 | 15 | # Italian examples 16 | tokens["italian"] = [ 17 | "È vero , tutti possiamo essere sostituiti .\n Alcune chiamate partirono da il Quirinale ." 18 | ] 19 | tags["italian"] = [ 20 | [ 21 | ["AUX", "ADJ", "PUNCT", "PRON", "AUX", "AUX", "VERB", "PUNCT"], 22 | ["DET", "NOUN", "VERB", "ADP", "DET", "PROPN", "PUNCT"], 23 | ], 24 | ] 25 | 26 | 27 | # French examples 28 | tokens["french"] = [ 29 | ( 30 | "Les études durent six ans mais leur contenu diffère donc selon les Facultés .\n" 31 | "Il est fêté le 22 mai ." 
def pretokenized_test(lang):
    """
    Check the POS tags CoreNLP produces for the pretokenized text of one language

    lang is used both as the properties name given to the client and as
    the key into the module level tokens and tags dictionaries.
    """
    with CoreNLPClient(
        properties=lang,
        annotators="pos",
        pretokenized=True,
        be_quiet=True,
    ) as client:
        for input_text, gold_tags in zip(tokens[lang], tags[lang]):
            ann = client.annotate(input_text)
            # zip stops at the shorter sequence, so an annotation with
            # fewer sentences than expected is not caught by this loop
            for sentence_tags, sentence in zip(gold_tags, ann.sentence):
                result_tags = [tok.pos for tok in sentence.token]
                assert sentence_tags == result_tags
def read_sentences(filename):
    """
    Read the sentences (without tags) from a BIO file

    Each non-blank line contributes its first tab separated column as a
    word.  Blank lines end the current sentence.

    Returns a list of tuples of words, one tuple per sentence.
    """
    sentences = []
    current_sentence = []
    # NER data is frequently non-ASCII, so do not depend on the platform's default encoding
    with open(filename, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                if current_sentence:
                    sentences.append(tuple(current_sentence))
                    current_sentence = []
                continue
            current_sentence.append(line.split("\t")[0])
    # the last sentence might not be followed by a blank line
    if current_sentence:
        sentences.append(tuple(current_sentence))
    return sentences

def check_for_duplicates(output_filenames, fail=False, check_self=False, print_all=False):
    """
    Checks for exact duplicates in a list of NER files

    fail: raise a ValueError at the first duplicate instead of printing
    check_self: also count sentences repeated within a single file
    print_all: print every duplicate rather than just the first one per file
    """
    # maps each sentence to the most recent file in which it was seen
    sentence_map = {}
    for output_filename in output_filenames:
        duplicates = 0
        for sentence in read_sentences(output_filename):
            other_file = sentence_map.get(sentence)
            if other_file is not None and (check_self or other_file != output_filename):
                if fail:
                    # separate the words with spaces so the sentence in the error is readable
                    raise ValueError("Duplicate sentence '{}', first in {}, also in {}".format(" ".join(sentence), other_file, output_filename))
                if duplicates == 0 and not print_all:
                    print("First duplicate:")
                if duplicates == 0 or print_all:
                    print("{}\nFound in {} and {}".format(sentence, other_file, output_filename))
                duplicates += 1
            sentence_map[sentence] = output_filename
        if duplicates > 0:
            print("%d duplicates found in %s" % (duplicates, output_filename))
Sentence, Word 8 | from stanza.tests import * 9 | 10 | pytestmark = pytest.mark.pipeline 11 | 12 | # data for testing 13 | EN_DOC = "This is a test document. Pretty cool!" 14 | 15 | EN_DOC_UPOS_XPOS = (('PRON_DT', 'AUX_VBZ', 'DET_DT', 'NOUN_NN', 'NOUN_NN', 'PUNCT_.'), ('ADV_RB', 'ADJ_JJ', 'PUNCT_.')) 16 | 17 | EN_DOC2 = "Chris Manning wrote a sentence. Then another." 18 | 19 | @pytest.fixture(scope="module") 20 | def nlp_pipeline(): 21 | nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en') 22 | return nlp 23 | 24 | def test_readonly(nlp_pipeline): 25 | Document.add_property('some_property', 123) 26 | doc = nlp_pipeline(EN_DOC) 27 | assert doc.some_property == 123 28 | with pytest.raises(ValueError): 29 | doc.some_property = 456 30 | 31 | 32 | def test_getter(nlp_pipeline): 33 | Word.add_property('upos_xpos', getter=lambda self: f"{self.upos}_{self.xpos}") 34 | 35 | doc = nlp_pipeline(EN_DOC) 36 | 37 | assert EN_DOC_UPOS_XPOS == tuple(tuple(word.upos_xpos for word in sentence.words) for sentence in doc.sentences) 38 | 39 | def test_setter_getter(nlp_pipeline): 40 | int2str = {0: 'ok', 1: 'good', 2: 'bad'} 41 | str2int = {'ok': 0, 'good': 1, 'bad': 2} 42 | def setter(self, value): 43 | self._classname = str2int[value] 44 | Sentence.add_property('classname', getter=lambda self: int2str[self._classname] if self._classname is not None else None, setter=setter) 45 | 46 | doc = nlp_pipeline(EN_DOC) 47 | sentence = doc.sentences[0] 48 | sentence.classname = 'good' 49 | assert sentence._classname == 1 50 | 51 | # don't try this at home 52 | sentence._classname = 2 53 | assert sentence.classname == 'bad' 54 | 55 | def test_backpointer(nlp_pipeline): 56 | doc = nlp_pipeline(EN_DOC2) 57 | ent = doc.ents[0] 58 | assert ent.sent is doc.sentences[0] 59 | assert list(doc.iter_words())[0].sent is doc.sentences[0] 60 | assert list(doc.iter_tokens())[-1].sent is doc.sentences[-1] 61 | -------------------------------------------------------------------------------- 
def parse_args(args=None):
    """
    Parse the command line

    args: list of argument strings, or None to read sys.argv
    """
    parser = argparse.ArgumentParser(description="After finding common trees using build_silver_dataset, this extracts them all or just the ones from a particular level of accuracy")
    parser.add_argument('--parsed_trees', type=str, nargs='+', required=True, help='Input file(s) of trees parsed into the build_silver_dataset json format.')
    # required, since otherwise the output files would be named None<score><suffix>
    parser.add_argument('--output_prefix', type=str, required=True, help='Prefix to use for outputting trees')
    parser.add_argument('--output_suffix', type=str, default=".mrg", help='Suffix to use for outputting trees')
    args = parser.parse_args(args)

    return args

def main(args=None):
    """
    Split the parsed trees into one output file per 'count' value

    Each non-blank input line is a json object with a 'tree' and a
    'count'.  The trees for a given count are written, one per line, to
    <output_prefix><count><output_suffix>

    args: list of argument strings, or None to read sys.argv
    """
    args = parse_args(args)

    trees = defaultdict(list)
    for filename in args.parsed_trees:
        with open(filename, encoding='utf-8') as fin:
            for line in fin:
                # a blank line, such as at the end of the file, is not valid json
                if not line.strip():
                    continue
                tree = json.loads(line)
                trees[tree['count']].append(tree['tree'])

    for score, tree_list in trees.items():
        filename = "%s%s%s" % (args.output_prefix, score, args.output_suffix)
        with open(filename, 'w', encoding='utf-8') as fout:
            for tree in tree_list:
                fout.write(tree)
                fout.write('\n')
def parse_args(args=None):
    """
    Parse the command line

    args: list of argument strings, or None to read sys.argv
    """
    parser = argparse.ArgumentParser(description="After finding common trees using build_silver_dataset, this extracts them all or just the ones from a particular level of accuracy")
    parser.add_argument('--parsed_trees', type=str, nargs='+', required=True, help='Input file(s) of trees parsed into the build_silver_dataset json format.')
    parser.add_argument('--keep_score', type=int, default=None, help='Which agreement level to keep.  None keeps all')
    parser.add_argument('--output_file', type=str, default=None, help='Where to put the output file')
    args = parser.parse_args(args)

    return args


def main(args=None):
    """
    Extract the trees with a particular 'count' from the parsed tree files

    Each non-blank input line is a json object with a 'tree' and a
    'count'.  Trees whose count equals --keep_score are kept, or all of
    them if no score was given.  The kept trees are written one per
    line to --output_file, or printed if there is no output file.

    args: list of argument strings, or None to read sys.argv
    """
    args = parse_args(args)

    trees = []
    for filename in args.parsed_trees:
        with open(filename, encoding='utf-8') as fin:
            for line in fin:
                # a blank line, such as at the end of the file, is not valid json
                if not line.strip():
                    continue
                tree = json.loads(line)
                if args.keep_score is None or tree['count'] == args.keep_score:
                    trees.append(tree['tree'])

    if args.output_file is None:
        for tree in trees:
            print(tree)
    else:
        with open(args.output_file, 'w', encoding='utf-8') as fout:
            for tree in trees:
                fout.write(tree)
                fout.write('\n')
def combine_columns(in_directory, dataset, nlp):
    """
    Read one shard of VSFC and pair each tokenized sentence with its label

    in_directory: directory with one subdirectory per shard
    dataset: name of the shard subdirectory, which holds sentiments.txt and sents.txt
    nlp: pipeline used to tokenize each line of text

    Returns a list of SentimentDatum, one per line the two files have in common
    """
    directory = os.path.join(in_directory, dataset)

    # the files are read as utf-8 explicitly: the text is Vietnamese,
    # and the platform default encoding is not utf-8 everywhere
    sentiment_file = os.path.join(directory, "sentiments.txt")
    with open(sentiment_file, encoding="utf-8") as fin:
        sentiment = fin.readlines()

    text_file = os.path.join(directory, "sents.txt")
    with open(text_file, encoding="utf-8") as fin:
        text = fin.readlines()

    # a line may be split into several sentences; the tokens are flattened into one list per line
    text = [[token.text for sentence in nlp(line.strip()).sentences for token in sentence.tokens]
            for line in tqdm(text)]

    phrases = [SentimentDatum(s.strip(), t) for s, t in zip(sentiment, text)]
    return phrases

def main(in_directory, out_directory, short_name):
    """
    Convert the train, dev, and test shards of VSFC

    Each shard is written to <out_directory>/<short_name>.<shard>.json
    """
    nlp = stanza.Pipeline('vi', processors='tokenize')
    for shard in ("train", "dev", "test"):
        phrases = combine_columns(in_directory, shard, nlp)
        output_file = os.path.join(out_directory, "%s.%s.json" % (short_name, shard))
        process_utils.write_list(output_file, phrases)
class TreeStack(namedtuple('TreeStack', ['value', 'parent', 'length'])):
    """
    A stack which can branch in several directions, as long as you
    keep track of the branching heads

    An example usage is when K constituents are removed at once
    to create a new constituent, and then the LSTM which tracks the
    values of the constituents is updated starting from the Kth
    output of the LSTM with the new value.

    We don't simply keep track of a single stack object using a deque
    because versions of the parser which use a beam will want to be
    able to branch in different directions from the same base stack

    Another possible usage is if an oracle is used for training
    in a manner where some fraction of steps are non-gold steps,
    but we also want to take a gold step from the same state.
    Eg, parser gets to state X, wants to make incorrect transition T
    instead of gold transition G, and so we continue training both
    X+G and X+T.  If we only represent the state X with standard
    python stacks, it would not be possible to track both of these
    states at the same time without copying the entire thing.

    Value can be a transition, a word, or a partially built constituent

    Each node holds its own value, the node underneath it (None for
    the bottom of the stack), and the stored length of the stack.

    Implemented as a namedtuple to make it a bit more efficient
    """
    # without this, each node of the subclass would get its own
    # instance __dict__, losing the memory benefit of the namedtuple
    __slots__ = ()

    def pop(self):
        # nothing is mutated: the "popped" stack is simply the node underneath,
        # so other heads which share that node are unaffected
        return self.parent

    def push(self, value):
        # returns a new stack node which points to this
        return TreeStack(value, self, self.length+1)

    def __iter__(self):
        # yields the values from the top of the stack (this node) to the bottom
        stack = self
        while stack.parent is not None:
            yield stack.value
            stack = stack.parent
        yield stack.value

    def __reversed__(self):
        # nodes only link downwards, so going bottom to top means collecting the values first
        yield from reversed(list(iter(self)))

    def __str__(self):
        return "TreeStack(%s)" % ", ".join([str(x) for x in self])

    def __len__(self):
        # the stored length, not the number of tuple fields
        return self.length
def main():
    """
    Build the he_mixed coref dataset in COREF_DATA_DIR

    The train file is the IAHLT train data followed by the UD Coref
    train data.  The dev and test files are copies of the IAHLT dev and test files.
    """
    paths = get_default_paths()
    coref_output_path = paths['COREF_DATA_DIR']
    with tempfile.TemporaryDirectory() as temp_dir_path:
        # both conversions write to the temp dir and return the filenames
        # they produced; index 0 is used as train, 1 as dev, 2 as test
        hebrew_filenames = convert_hebrew_iahlt.main(["--output_directory", temp_dir_path])
        udcoref_filenames = convert_udcoref.main(["--project", "gerrom", "--output_directory", temp_dir_path])

        with open(os.path.join(temp_dir_path, hebrew_filenames[0]), encoding="utf-8") as fin:
            hebrew_train = json.load(fin)
        udcoref_train_filename = os.path.join(temp_dir_path, udcoref_filenames[0])
        with open(udcoref_train_filename, encoding="utf-8") as fin:
            print("Reading extra udcoref json data from %s" % udcoref_train_filename)
            udcoref_train = json.load(fin)
        mixed_train = hebrew_train + udcoref_train
        with open(os.path.join(coref_output_path, "he_mixed.train.json"), "w", encoding="utf-8") as fout:
            # ensure_ascii=False keeps the text readable in the utf-8 output
            # instead of escaping every non-ASCII character
            json.dump(mixed_train, fout, indent=2, ensure_ascii=False)

        # the copies need to happen before the temp dir is cleaned up
        shutil.copyfile(os.path.join(temp_dir_path, hebrew_filenames[1]),
                        os.path.join(coref_output_path, "he_mixed.dev.json"))
        shutil.copyfile(os.path.join(temp_dir_path, hebrew_filenames[2]),
                        os.path.join(coref_output_path, "he_mixed.test.json"))
@register_processor(MWT)
class MWTProcessor(UDProcessor):
    """
    Pipeline processor which expands multi-word tokens into words

    NOTE(review): self.config, self.vocab and self.trainer are not
    defined in this class - presumably they come from UDProcessor
    """

    # set of processor requirements this processor fulfills
    PROVIDES_DEFAULT = {MWT}
    # set of processor requirements for this processor
    REQUIRES_DEFAULT = {TOKENIZE}

    def _set_up_model(self, config, pipeline, device):
        # pipeline is not needed to load the MWT model
        self._trainer = Trainer(model_file=config['model_path'], device=device)

    def build_batch(self, document):
        # wrap the document in the MWT DataLoader, in evaluation mode
        return DataLoader(document, self.config['batch_size'], self.config, vocab=self.vocab, evaluation=True, expand_unk_vocab=True)

    def process(self, document):
        """
        Predict the MWT expansions for the document and attach them to it

        Returns the document held by the batch
        """
        batch = self.build_batch(document)
        expansions = batch.doc.get_mwt_expansions(evaluation=True)

        if len(batch) == 0:
            # nothing to expand, so no model needs to run
            preds = []
        elif self.config['dict_only']:
            preds = self.trainer.predict_dict(expansions)
        else:
            preds = []
            with torch.no_grad():
                for chunk in batch.to_loader():
                    preds += self.trainer.predict(chunk, never_decode_unk=True, vocab=batch.vocab)

            if self.config.get('ensemble_dict', False):
                preds = self.trainer.ensemble(expansions, preds)

        batch.doc.set_mwt_expansions(preds, process_manual_expanded=False)
        return batch.doc

    def bulk_process(self, docs):
        """
        MWT processor counts some statistics on the individual docs, so we need to separately redo those stats
        """
        processed = super().bulk_process(docs)
        for processed_doc in processed:
            processed_doc._count_words()
        return processed
def get_default_paths():
    """
    Build the table of default data directories

    The root for the processed data is $DATA_ROOT, or "data" if that
    is not set.  Every other entry has a built in default, which is
    replaced by the environment variable of the same name if there is one.

    Returns a dict from path name to path, including "DATA_ROOT" itself
    """
    data_root = os.environ.get("DATA_ROOT", "data")

    # directories underneath the data root
    processed_subdirs = {
        "TOKENIZE_DATA_DIR": "tokenize",
        "MWT_DATA_DIR": "mwt",
        "LEMMA_DATA_DIR": "lemma",
        "POS_DATA_DIR": "pos",
        "DEPPARSE_DATA_DIR": "depparse",
        "ETE_DATA_DIR": "ete",
        "NER_DATA_DIR": "ner",
        "CHARLM_DATA_DIR": "charlm",
        "SENTIMENT_DATA_DIR": "sentiment",
        "CONSTITUENCY_DATA_DIR": "constituency",
        "COREF_DATA_DIR": "coref",
        "LEMMA_CLASSIFIER_DATA_DIR": "lemma_classifier",
    }

    # directories which do not depend on the data root
    external = {
        # Set directories to store external word vector data
        "WORDVEC_DIR": "extern_data/wordvec",

        # TODO: not sure what other people actually have
        # TODO: also, could make this automatically update to the latest
        "UDBASE": "extern_data/ud2/ud-treebanks-v2.11",
        "UDBASE_GIT": "extern_data/ud2/git",

        "NERBASE": "extern_data/ner",
        "CONSTITUENCY_BASE": "extern_data/constituency",
        "SENTIMENT_BASE": "extern_data/sentiment",
        "COREF_BASE": "extern_data/coref",

        # there's a stanford github, stanfordnlp/handparsed-treebank,
        # with some data for different languages
        "HANDPARSED_DIR": "extern_data/handparsed-treebank",

        # directory with the contents of https://nlp.stanford.edu/projects/stanza/bio/
        # on the cluster, for example, /u/nlp/software/stanza/bio_ud
        "BIO_UD_DIR": "extern_data/bio",

        # data root for other general input files, such as VI_VLSP
        "STANZA_EXTERN_DIR": "extern_data",
    }

    builtin = {name: data_root + "/" + subdir for name, subdir in processed_subdirs.items()}
    builtin.update(external)

    resolved = {"DATA_ROOT": data_root}
    for name, fallback in builtin.items():
        # an environment variable with the same name wins over the built in default
        resolved[name] = os.environ.get(name, fallback)
    return resolved
def test_extract_doc(tokenizer):
    """
    Extract a single document with one cluster of two mentions

    Checks the number of extracted documents and the coref spans
    found at entries 1 and 6 of the extracted document
    """
    raw_doc = dict(text=TEXT,
                   clusters=[CLUSTER],
                   metadata=dict(doc_id='test'))

    results = extract_doc(tokenizer, [raw_doc])
    assert len(results) == 1

    spans = results[0].coref_spans
    assert len(spans) == 2
    assert spans[1] == [(0, 4, 4)]
    assert spans[6] == [(0, 3, 4)]
class BaselineModel:
    """
    Baseline which predicts one fixed lemma for one fixed token

    token_to_lemmatize: text of the token the baseline applies to
    prediction_lemma: the lemma predicted for that token
    prediction_upos: collection of UPOS tags; evaluate() only scores words with one of these tags
    """

    def __init__(self, token_to_lemmatize, prediction_lemma, prediction_upos):
        self.token_to_lemmatize = token_to_lemmatize
        self.prediction_lemma = prediction_lemma
        self.prediction_upos = prediction_upos

    def predict(self, token):
        """
        Returns the fixed lemma for the token of interest, and None for any other token
        """
        return self.prediction_lemma if token == self.token_to_lemmatize else None

    def _is_target(self, word):
        # a word is scored if it has one of the expected UPOS tags and is the token of interest
        return word.upos in self.prediction_upos and word.text == self.token_to_lemmatize

    def evaluate(self, conll_path):
        """
        Evaluates the baseline model against the test set defined in conll_path.

        Returns a map where the keys are each class and the values are another map including the precision, recall and f1 scores
        for that class.

        Also returns confusion matrix. Keys are gold tags and inner keys are predicted tags
        """
        doc = load_doc_from_conll_file(conll_path)

        gold_tag_sequences = []
        pred_tag_sequences = []
        for sentence in doc.sentences:
            # one (possibly empty) sequence per sentence, with an entry for each target word
            gold_lemmas = [word.lemma for word in sentence.words if self._is_target(word)]
            gold_tag_sequences.append(gold_lemmas)
            # the baseline makes the same prediction for every target word
            pred_tag_sequences.append([self.prediction_lemma] * len(gold_lemmas))

        # the third result (weighted f1) is not part of what this method returns
        multiclass_result, confusion_mtx, _ = evaluate_sequences(gold_tag_sequences, pred_tag_sequences)
        return multiclass_result, confusion_mtx
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "64b2a9e0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from stanza.utils.visualization.dependency_visualization import visualize_strings\n", 11 | "\n", 12 | "ar_strings = ['برلين ترفض حصول شركة اميركية على رخصة تصنيع دبابة \"ليوبارد\" الالمانية', \"هل بإمكاني مساعدتك؟\", \n", 13 | " \"أراك في مابعد\", \"لحظة من فضلك\"]\n", 14 | "# Testing with right to left language\n", 15 | "visualize_strings(ar_strings, \"ar\")" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "35ef521b", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from stanza.utils.visualization.dependency_visualization import visualize_strings\n", 26 | "\n", 27 | "en_strings = [\"This is a sentence.\", \n", 28 | " \"He is wearing a red shirt\",\n", 29 | " \"Barack Obama was born in Hawaii. 
def process_conll(input_file, output_file, zip_file=None, conversion=None, attr_prefix="name", allow_empty=False):
    """
    Convert the NER tags in the MISC column of a conll file to a two column BIO file

    input_file: which conll file to read.  If zip_file is given, both are passed to the conll reader
    output_file: where to write the result, one "text<TAB>tag" line per token with a blank line after each sentence
    zip_file: optional path to a zip file, such as ddt.zip

    conversion: optional remapping of the tags.  If a dict, it maps the label part of a
      BIO-label tag to a new label, and labels not in the dict are kept.
      Otherwise it is called with the entire tag and returns the new tag.
      "O" tags are never converted.
    attr_prefix: which attribute to get from the misc field.  A trailing "=" is added if missing
    allow_empty: if True, a token without the attribute is tagged "O".  Otherwise it is an error

    Raises ValueError if a token has no NER attribute and allow_empty is False
    """
    if not attr_prefix.endswith("="):
        attr_prefix = attr_prefix + "="

    doc = CoNLL.conll2doc(input_file=input_file, zip_file=zip_file)

    with open(output_file, "w", encoding="utf-8") as fout:
        for sentence_idx, sentence in enumerate(doc.sentences):
            for token_idx, token in enumerate(sentence.tokens):
                # a token with no misc field has no attributes to search.
                # treating it as an empty list lets allow_empty decide
                # what happens, rather than crashing on None.split
                misc = token.misc.split("|") if token.misc else []
                for attr in misc:
                    if attr.startswith(attr_prefix):
                        ner = attr.split("=", 1)[1]
                        break
                else: # name= not found
                    if allow_empty:
                        ner = "O"
                    else:
                        raise ValueError("Could not find ner tag in document {}, sentence {}, token {}".format(input_file, sentence_idx, token_idx))

                if ner != "O" and conversion is not None:
                    if isinstance(conversion, dict):
                        # only the label is remapped; the B- / I- prefix is kept
                        bio, label = ner.split("-", 1)
                        label = conversion.get(label, label)
                        ner = "%s-%s" % (bio, label)
                    else:
                        ner = conversion(ner)
                fout.write("%s\t%s\n" % (token.text, ner))
            fout.write("\n")

def main():
    """
    Extract the training piece of the zip file from the Danish DDT dataset
    """
    process_conll(zip_file="extern_data/ner/da_ddt/ddt.zip", input_file="ddt.train.conllu", output_file="data/ner/da_ddt.train.bio")
argparser.add_argument("--batch-size", type=int, 17 | help="Adjust to override the config value if you're" 18 | " experiencing out-of-memory issues") 19 | argparser.add_argument("--weights", 20 | help="Path to file with weights to load." 21 | " If not supplied, in the latest" 22 | " weights of the experiment will be loaded;" 23 | " if there aren't any, an error is raised.") 24 | args = argparser.parse_args() 25 | 26 | model = CorefModel.load_model(path=args.weights, 27 | map_location="cpu", 28 | ignore={"bert_optimizer", "general_optimizer", 29 | "bert_scheduler", "general_scheduler"}) 30 | if args.batch_size: 31 | model.config.a_scoring_batch_size = args.batch_size 32 | model.training = False 33 | 34 | try: 35 | with open(args.input_file, encoding="utf-8") as fin: 36 | input_data = json.load(fin) 37 | except json.decoder.JSONDecodeError: 38 | # read the old jsonlines format if necessary 39 | with open(args.input_file, encoding="utf-8") as fin: 40 | text = "[" + ",\n".join(fin) + "]" 41 | input_data = json.loads(text) 42 | docs = [model.build_doc(doc) for doc in input_data] 43 | 44 | with torch.no_grad(): 45 | for doc in tqdm(docs, unit="docs"): 46 | result = model.run(doc) 47 | doc["span_clusters"] = result.span_clusters 48 | doc["word_clusters"] = result.word_clusters 49 | 50 | for key in ("word2subword", "subwords", "word_id", "head2span"): 51 | del doc[key] 52 | 53 | with open(args.output_file, mode="w") as fout: 54 | for doc in docs: 55 | json.dump(doc, fout) 56 | -------------------------------------------------------------------------------- /scripts/config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Set environment variables for the training and testing of stanza modules. 
4 | 5 | # Set UDBASE to the location of UD data folder 6 | # The data should be CoNLL-U format 7 | # For details, see 8 | # http://universaldependencies.org/conll18/data.html (CoNLL-18 UD data) 9 | # https://universaldependencies.org/ 10 | # When rebuilding models based on Universal Dependencies, download the 11 | # UD data to some directory, set UDBASE to that directory, and 12 | # uncomment this line. Alternatively, put UDBASE in your shell 13 | # config, Windows env variables, etc as relevant. 14 | # export UDBASE=/path/to/UD 15 | 16 | # Set NERBASE to the location of NER data folder 17 | # The data should be BIO format or convertible to that format 18 | # For details, see https://www.aclweb.org/anthology/W03-0419.pdf (CoNLL-03 NER paper) 19 | # There are other NER datasets, supported in 20 | # stanza/utils/datasets/ner/prepare_ner_dataset.py 21 | # If rebuilding NER data, choose a location for the NER directory 22 | # and set NERBASE to that location. 23 | # export NERBASE=/path/to/NER 24 | 25 | # Set CONSTITUENCY_BASE to the location of the constituency data folder 26 | # The data will be in some dataset-specific format 27 | # There is a conversion script which will turn this 28 | # into a PTB style format 29 | # stanza/utils/datasets/constituency/prepare_con_dataset.py 30 | # If processing constituency data, choose a location for the CON data 31 | # and set CONSTITUENCY_BASE to that location. 32 | # export CONSTITUENCY_BASE=/path/to/CON 33 | 34 | # Set directories to store processed training/evaluation files 35 | # $DATA_ROOT is a default home for where all the outputs from the 36 | # preparation scripts will go. The training scripts will then look 37 | # for the stanza formatted data in that directory. 
def check_json_file(doc, raw_text, expected_sentences, expected_tokens):
    """
    Check that a converted document agrees with the BIO text it was built from

    doc: the converted document, with sentences made of tokens which have text and ner
    raw_text: the BIO text, with a blank line between sentences
    expected_sentences: how many sentences there should be
    expected_tokens: tokens per sentence.  A single int is treated as a list of one length
    """
    token_counts = [expected_tokens] if isinstance(expected_tokens, int) else expected_tokens

    # first check the raw text itself has the expected shape
    raw_blocks = raw_text.strip().split("\n\n")
    assert len(raw_blocks) == expected_sentences
    bio_lines = [block.strip().split("\n") for block in raw_blocks]
    for lines, count in zip(bio_lines, token_counts):
        assert len(lines) == count

    # then check the document has the same shape
    assert len(doc.sentences) == expected_sentences
    for sentence, count in zip(doc.sentences, token_counts):
        assert len(sentence.tokens) == count

    # finally, each token should match the word and tag on its line
    for sentence, lines in zip(doc.sentences, bio_lines):
        for token, line in zip(sentence.tokens, lines):
            word, tag = line.strip().split()
            assert token.text == word
            assert token.ner == tag

def write_and_convert(tmp_path, raw_text):
    """
    Write the BIO text to a file in tmp_path, convert it, and load the result as a Document
    """
    bio_file = tmp_path / "test.bio"
    json_file = tmp_path / "json.bio"

    bio_file.write_text(raw_text, encoding="utf-8")
    process_dataset(bio_file, json_file)

    with open(json_file) as fin:
        return Document(json.load(fin))

def run_test(tmp_path, raw_text, expected_sentences, expected_tokens):
    """
    Convert the BIO text and check the converted document against it
    """
    converted = write_and_convert(tmp_path, raw_text)
    check_json_file(converted, raw_text, expected_sentences, expected_tokens)