├── stanza
    ├── models
    │   ├── __init__.py
    │   ├── common
    │   │   ├── __init__.py
    │   │   ├── seq2seq_constant.py
    │   │   ├── exceptions.py
    │   │   ├── trainer.py
    │   │   ├── stanza_object.py
    │   │   ├── count_ner_coverage.py
    │   │   ├── maxout_linear.py
    │   │   ├── count_pretrain_coverage.py
    │   │   └── convert_pretrain.py
    │   ├── coref
    │   │   ├── __init__.py
    │   │   ├── tokenizer_customization.py
    │   │   ├── const.py
    │   │   ├── coref_chain.py
    │   │   ├── loss.py
    │   │   ├── config.py
    │   │   └── predict.py
    │   ├── langid
    │   │   ├── __init__.py
    │   │   └── trainer.py
    │   ├── lemma
    │   │   ├── __init__.py
    │   │   ├── scorer.py
    │   │   ├── vocab.py
    │   │   ├── edit.py
    │   │   └── attach_lemma_classifier.py
    │   ├── mwt
    │   │   ├── __init__.py
    │   │   ├── scorer.py
    │   │   └── vocab.py
    │   ├── ner
    │   │   └── __init__.py
    │   ├── pos
    │   │   ├── __init__.py
    │   │   ├── scorer.py
    │   │   └── xpos_vocab_utils.py
    │   ├── classifiers
    │   │   ├── __init__.py
    │   │   ├── utils.py
    │   │   ├── config.py
    │   │   └── base_classifier.py
    │   ├── constituency
    │   │   ├── __init__.py
    │   │   ├── evaluate_treebanks.py
    │   │   └── tree_stack.py
    │   ├── depparse
    │   │   └── __init__.py
    │   ├── tokenization
    │   │   ├── __init__.py
    │   │   └── vocab.py
    │   ├── lemma_classifier
    │   │   ├── __init__.py
    │   │   ├── constants.py
    │   │   └── baseline_model.py
    │   └── _training_logging.py
    ├── pipeline
    │   ├── __init__.py
    │   ├── demo
    │   │   ├── __init__.py
    │   │   ├── loading.gif
    │   │   ├── Astloch-Bold.ttf
    │   │   ├── Liberation_Sans-Regular.ttf
    │   │   ├── PT_Sans-Caption-Web-Regular.ttf
    │   │   ├── README.md
    │   │   └── stanza-brat.css
    │   ├── external
    │   │   ├── __init__.py
    │   │   └── corenlp_converter_depparse.py
    │   ├── registry.py
    │   ├── _constants.py
    │   └── mwt_processor.py
    ├── resources
    │   ├── __init__.py
    │   └── print_charlm_depparse.py
    ├── tests
    │   ├── mwt
    │   │   ├── __init__.py
    │   │   └── test_utils.py
    │   ├── ner
    │   │   ├── __init__.py
    │   │   ├── test_models_ner_scorer.py
    │   │   ├── test_from_conllu.py
    │   │   ├── test_combine_ner_datasets.py
    │   │   ├── test_convert_starlang_ner.py
    │   │   ├── test_ner_trainer.py
    │   │   └── test_pay_amt_annotators.py
    │   ├── pos
    │   │   └── __init__.py
    │   ├── common
    │   │   ├── __init__.py
    │   │   ├── test_short_name_to_treebank.py
    │   │   ├── test_dropout.py
    │   │   ├── test_bert_embedding.py
    │   │   ├── test_chuliu_edmonds.py
    │   │   ├── test_foundation_cache.py
    │   │   ├── test_common_data.py
    │   │   ├── test_relative_attn.py
    │   │   └── test_data_objects.py
    │   ├── datasets
    │   │   ├── __init__.py
    │   │   ├── ner
    │   │   │   ├── __init__.py
    │   │   │   ├── test_utils.py
    │   │   │   └── test_prepare_ner_file.py
    │   │   ├── coref
    │   │   │   ├── __init__.py
    │   │   │   └── test_hebrew_iahlt.py
    │   │   └── test_vietnamese_renormalization.py
    │   ├── depparse
    │   │   └── __init__.py
    │   ├── langid
    │   │   └── __init__.py
    │   ├── lemma
    │   │   └── __init__.py
    │   ├── pipeline
    │   │   ├── __init__.py
    │   │   ├── test_arabic_pipeline.py
    │   │   ├── test_pipeline_depparse_processor.py
    │   │   ├── test_pipeline_sentiment_processor.py
    │   │   ├── pipeline_device_tests.py
    │   │   └── test_pipeline_pos_processor.py
    │   ├── server
    │   │   ├── __init__.py
    │   │   ├── test_morphology.py
    │   │   ├── test_ud_enhancer.py
    │   │   ├── test_tokensregex.py
    │   │   ├── test_parser_eval.py
    │   │   └── test_server_pretokenized.py
    │   ├── classifiers
    │   │   └── __init__.py
    │   ├── constituency
    │   │   ├── __init__.py
    │   │   ├── test_positional_encoding.py
    │   │   ├── test_tree_stack.py
    │   │   └── test_convert_starlang.py
    │   ├── resources
    │   │   ├── __init__.py
    │   │   ├── test_default_packages.py
    │   │   ├── test_prepare_resources.py
    │   │   ├── test_charlm_depparse.py
    │   │   └── test_installation.py
    │   ├── tokenization
    │   │   ├── __init__.py
    │   │   ├── test_tokenize_files.py
    │   │   └── test_replace_long_tokens.py
    │   ├── lemma_classifier
    │   │   ├── __init__.py
    │   │   └── test_training.py
    │   ├── data
    │   │   ├── external_server.properties
    │   │   ├── tiny_emb.csv
    │   │   ├── tiny_emb.txt
    │   │   ├── test.dat
    │   │   ├── tiny_emb.gz
    │   │   ├── tiny_emb.pt
    │   │   ├── tiny_emb.xz
    │   │   ├── tiny_emb.zip
    │   │   └── aws_annotations.zip
    │   └── pytest.ini
    ├── utils
    │   ├── __init__.py
    │   ├── ner
    │   │   └── __init__.py
    │   ├── charlm
    │   │   └── __init__.py
    │   ├── datasets
    │   │   ├── __init__.py
    │   │   ├── ner
    │   │   │   ├── __init__.py
    │   │   │   ├── combine_ner_datasets.py
    │   │   │   ├── count_entities.py
    │   │   │   ├── compare_entities.py
    │   │   │   ├── convert_kk_kazNERD.py
    │   │   │   ├── preprocess_wikiner.py
    │   │   │   ├── convert_nytk.py
    │   │   │   ├── convert_en_conll03.py
    │   │   │   ├── convert_starlang_ner.py
    │   │   │   ├── convert_mr_l3cube.py
    │   │   │   ├── json_to_bio.py
    │   │   │   ├── check_for_duplicates.py
    │   │   │   └── conll_to_iob.py
    │   │   ├── pos
    │   │   │   ├── __init__.py
    │   │   │   └── remove_columns.py
    │   │   ├── coref
    │   │   │   ├── __init__.py
    │   │   │   ├── balance_languages.py
    │   │   │   └── convert_hebrew_mixed.py
    │   │   ├── pretrain
    │   │   │   ├── __init__.py
    │   │   │   └── word_in_pretrain.py
    │   │   ├── sentiment
    │   │   │   ├── __init__.py
    │   │   │   └── process_vsfc_vietnamese.py
    │   │   ├── constituency
    │   │   │   ├── __init__.py
    │   │   │   ├── count_common_words.py
    │   │   │   ├── common_trees.py
    │   │   │   ├── utils.py
    │   │   │   ├── convert_spmrl.py
    │   │   │   ├── treebank_to_labeled_brackets.py
    │   │   │   ├── relabel_tags.py
    │   │   │   ├── reduce_dataset.py
    │   │   │   ├── extract_all_silver_dataset.py
    │   │   │   └── extract_silver_dataset.py
    │   │   ├── tokenization
    │   │   │   └── __init__.py
    │   │   ├── vietnamese
    │   │   │   └── __init__.py
    │   │   ├── prepare_pos_treebank.py
    │   │   ├── thai_syllable_dict_generator.py
    │   │   └── contract_mwt.py
    │   ├── lemma
    │   │   ├── __init__.py
    │   │   └── count_ambiguous_lemmas.py
    │   ├── pretrain
    │   │   ├── __init__.py
    │   │   └── compare_pretrains.py
    │   ├── training
    │   │   └── __init__.py
    │   ├── constituency
    │   │   ├── __init__.py
    │   │   ├── list_tensors.py
    │   │   ├── grep_test_logs.py
    │   │   ├── check_transitions.py
    │   │   └── grep_dev_logs.py
    │   ├── languages
    │   │   └── __init__.py
    │   ├── visualization
    │   │   ├── __init__.py
    │   │   ├── utils.py
    │   │   ├── constants.py
    │   │   └── README
    │   ├── max_mwt_length.py
    │   ├── select_backoff.py
    │   ├── avg_sent_len.py
    │   ├── helper_func.py
    │   ├── get_tqdm.py
    │   └── default_paths.py
    ├── _version.py
    ├── server
    │   ├── __init__.py
    │   └── tokensregex.py
    ├── __init__.py
    └── protobuf
    │   └── __init__.py
├── images
    └── stanza-logo.png
├── LICENSE
├── demo
    ├── scenegraph.py
    ├── ssurgeon_script.txt
    ├── semgrex.py
    ├── semgrex_sample.conllu
    ├── CONLL_Dependency_Visualizer_Example.ipynb
    └── Dependency_Visualization_Testing.ipynb
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   ├── bug_report.md
    │   └── question.md
    ├── pull_request_template.md
    ├── stale.yml
    └── workflows
    │   └── stanza-tests.yaml
├── .travis.yml
├── CONTRIBUTING.md
└── scripts
    └── config.sh


/stanza/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/resources/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/mwt/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/ner/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/pos/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/ner/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/coref/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/langid/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/lemma/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/mwt/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/ner/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/pos/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/pipeline/demo/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/depparse/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/langid/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/lemma/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/server/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/charlm/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/lemma/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/pretrain/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/training/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/classifiers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/constituency/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/depparse/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/tokenization/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/pipeline/external/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/classifiers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/constituency/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/datasets/ner/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/resources/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/tokenization/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/constituency/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/pos/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/languages/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/models/lemma_classifier/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/datasets/coref/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/lemma_classifier/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/coref/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/pretrain/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/sentiment/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/tokenization/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/vietnamese/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/stanza/tests/data/external_server.properties:
--------------------------------------------------------------------------------
1 | annotators = tokenize,ssplit,pos
2 | 


--------------------------------------------------------------------------------
/stanza/tests/data/tiny_emb.csv:
--------------------------------------------------------------------------------
1 | 3 4
2 | unban,1,2,3,4
3 | mox,5,6,7,8
4 | opal,9,10,11,12
5 | 


--------------------------------------------------------------------------------
/stanza/tests/data/tiny_emb.txt:
--------------------------------------------------------------------------------
1 | 3 4
2 | unban 1 2 3 4
3 | mox 5 6 7 8
4 | opal 9 10 11 12
5 | 


--------------------------------------------------------------------------------
/images/stanza-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/images/stanza-logo.png


--------------------------------------------------------------------------------
/stanza/tests/data/test.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/test.dat


--------------------------------------------------------------------------------
/stanza/tests/data/tiny_emb.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.gz


--------------------------------------------------------------------------------
/stanza/tests/data/tiny_emb.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.pt


--------------------------------------------------------------------------------
/stanza/tests/data/tiny_emb.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.xz


--------------------------------------------------------------------------------
/stanza/pipeline/demo/loading.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/loading.gif


--------------------------------------------------------------------------------
/stanza/tests/data/tiny_emb.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/tiny_emb.zip


--------------------------------------------------------------------------------
/stanza/pipeline/demo/Astloch-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/Astloch-Bold.ttf


--------------------------------------------------------------------------------
/stanza/tests/data/aws_annotations.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/tests/data/aws_annotations.zip


--------------------------------------------------------------------------------
/stanza/models/_training_logging.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
# Fetch the logger registered under the name 'stanza' and lower its threshold
# to DEBUG.  This happens as a side effect of importing this module; no
# handlers are attached here.
logger = logging.getLogger('stanza')
logger.setLevel(logging.DEBUG)


--------------------------------------------------------------------------------
/stanza/_version.py:
--------------------------------------------------------------------------------
""" Single source of truth for version number """

# Version string of the package itself
__version__ = "1.11.0"
# NOTE(review): presumably the version of the downloadable resources this
# release is paired with - confirm against the code which reads it
__resources_version__ = '1.11.0'
5 | 


--------------------------------------------------------------------------------
/stanza/pipeline/demo/Liberation_Sans-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/Liberation_Sans-Regular.ttf


--------------------------------------------------------------------------------
/stanza/pipeline/demo/PT_Sans-Caption-Web-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/stanza/HEAD/stanza/pipeline/demo/PT_Sans-Caption-Web-Regular.ttf


--------------------------------------------------------------------------------
/stanza/tests/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 |     travis: all tests that will be run in travis CI
4 |     client: all tests that are related to the CoreNLP client interface
5 |     pipeline: all tests that are related to the Stanza neural pipeline
6 | 


--------------------------------------------------------------------------------
/stanza/pipeline/registry.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
# Both of the following start out empty; register_processor fills them in
NAME_TO_PROCESSOR_CLASS = {}
PIPELINE_NAMES = list()

# Starts out empty; register_processor_variant fills it in.
# Looking up a key which is not present yet creates an empty dict for it.
PROCESSOR_VARIANTS = defaultdict(dict)
9 | 


--------------------------------------------------------------------------------
/stanza/models/common/seq2seq_constant.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constants for seq2seq models.
 3 | """
 4 | 
 5 | PAD = '<PAD>'
 6 | PAD_ID = 0
 7 | UNK = '<UNK>'
 8 | UNK_ID = 1
 9 | SOS = '<SOS>'
10 | SOS_ID = 2
11 | EOS = '<EOS>'
12 | EOS_ID = 3
13 | 
14 | VOCAB_PREFIX = [PAD, UNK, SOS, EOS]
15 | 
16 | EMB_INIT_RANGE = 1.0
17 | INFINITY_NUMBER = 1e12
18 | 


--------------------------------------------------------------------------------
/stanza/pipeline/_constants.py:
--------------------------------------------------------------------------------
 1 | """ Module defining constants """
 2 | 
 3 | # string constants for processor names
 4 | LANGID = 'langid'
 5 | TOKENIZE = 'tokenize'
 6 | MWT = 'mwt'
 7 | POS = 'pos'
 8 | LEMMA = 'lemma'
 9 | DEPPARSE = 'depparse'
10 | NER = 'ner'
11 | SENTIMENT = 'sentiment'
12 | CONSTITUENCY = 'constituency'
13 | COREF = 'coref'
14 | 


--------------------------------------------------------------------------------
/stanza/utils/max_mwt_length.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | import json
 4 | 
 5 | def max_mwt_length(filenames):
 6 |     max_len = 0
 7 |     for filename in filenames:
 8 |         with open(filename) as f:
 9 |             d = json.load(f)
10 |             max_len = max([max_len] + [len(" ".join(x[0][1])) for x in d])
11 |     return max_len
12 | 
13 | if __name__ == '__main__':
14 |     print(max_max_jlength(sys.argv[1:]))
15 | 


--------------------------------------------------------------------------------
/stanza/models/mwt/scorer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utils and wrappers for scoring MWT
 3 | """
 4 | from stanza.models.common.utils import ud_scores
 5 | 
 6 | def score(system_conllu_file, gold_conllu_file):
 7 |     """ Wrapper for word segmenter scorer. """
 8 |     evaluation = ud_scores(gold_conllu_file, system_conllu_file)
 9 |     el = evaluation["Words"]
10 |     p, r, f = el.precision, el.recall, el.f1
11 |     return p, r, f
12 | 
13 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/count_common_words.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | from collections import Counter
 4 | 
 5 | from stanza.models.constituency import parse_tree
 6 | from stanza.models.constituency import tree_reader
 7 | 
 8 | word_counter = Counter()
 9 | count_words = lambda x: word_counter.update(x.leaf_labels())
10 | 
11 | tree_reader.read_tree_file(sys.argv[1], tree_callback=count_words)
12 | print(word_counter.most_common()[:100])
13 | 


--------------------------------------------------------------------------------
/stanza/utils/constituency/list_tensors.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Lists all the tensors in a constituency model.
 3 | 
 4 | Currently useful in combination with torchshow for displaying a series of tensors as they change.
 5 | """
 6 | 
 7 | import sys
 8 | 
 9 | from stanza.models.constituency.trainer import Trainer
10 | 
11 | 
# the model to inspect is the first command line argument
model_path = sys.argv[1]
trainer = Trainer.load(model_path)
model = trainer.model

# one line per named parameter: its name and whether or not it requires a gradient
for param_name, parameter in model.named_parameters():
    print(param_name, parameter.requires_grad)
17 | 


--------------------------------------------------------------------------------
/stanza/models/lemma_classifier/constants.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | UNKNOWN_TOKEN = "unk"  # token name for unknown tokens
 4 | UNKNOWN_TOKEN_IDX = -1   # custom index we apply to unknown tokens
 5 | 
 6 | # TODO: ModelType could just be LSTM and TRANSFORMER
 7 | # and then the transformer baseline would have the transformer as another argument
 8 | class ModelType(Enum):
 9 |     LSTM               = 1
10 |     TRANSFORMER        = 2
11 |     BERT               = 3
12 |     ROBERTA            = 4
13 | 
14 | DEFAULT_BATCH_SIZE = 16


--------------------------------------------------------------------------------
/stanza/utils/select_backoff.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | backoff_models = { "UD_Breton-KEB": "ga_idt",
 4 |                    "UD_Czech-PUD": "cs_pdt",
 5 |                    "UD_English-PUD": "en_ewt",
 6 |                    "UD_Faroese-OFT": "nn_nynorsk",
 7 |                    "UD_Finnish-PUD": "fi_tdt",
 8 |                    "UD_Japanese-Modern": "ja_gsd",
 9 |                    "UD_Naija-NSC": "en_ewt",
10 |                    "UD_Swedish-PUD": "sv_talbanken"
11 |                  }
12 | 
13 | print(backoff_models[sys.argv[1]])
14 | 


--------------------------------------------------------------------------------
/stanza/models/common/exceptions.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A couple more specific FileNotFoundError exceptions
 3 | 
 4 | The idea being, the caller can catch it and report a more useful error resolution
 5 | """
 6 | 
 7 | import errno
 8 | 
 9 | class ForwardCharlmNotFoundError(FileNotFoundError):
10 |     def __init__(self, msg, filename):
11 |         super().__init__(errno.ENOENT, msg, filename)
12 | 
class BackwardCharlmNotFoundError(FileNotFoundError):
    """
    FileNotFoundError specifically for a backward charlm

    A caller can catch this particular type and report a more useful resolution.
    """
    def __init__(self, msg, filename):
        # errno, message, filename: the three argument form fills in
        # .errno, .strerror, and .filename on the exception
        os_error_args = (errno.ENOENT, msg, filename)
        super().__init__(*os_error_args)
16 | 


--------------------------------------------------------------------------------
/stanza/utils/visualization/utils.py:
--------------------------------------------------------------------------------
 1 | def find_nth(haystack, needle, n):
 2 |     """
 3 |     Returns the starting index of the nth occurrence of the substring 'needle' in the string 'haystack'.
 4 |     """
 5 |     start = haystack.find(needle)
 6 |     while start >= 0 and n > 1:
 7 |         start = haystack.find(needle, start + len(needle))
 8 |         n -= 1
 9 |     return start
10 | 
11 | 
def round_base(num, base=10):
    """
    Snap a number to a multiple of the base, e.g. round_base(49.2, base=50) = 50.

    The number of multiples is chosen with the builtin round(), so an exact tie
    goes to the even multiple: round_base(25) = 20.
    """
    multiples = round(num / base)
    return base * multiples


--------------------------------------------------------------------------------
/stanza/models/lemma/scorer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utils and wrappers for scoring lemmatizers.
 3 | """
 4 | 
 5 | import logging
 6 | 
 7 | from stanza.models.common.utils import ud_scores
 8 | 
 9 | logger = logging.getLogger('stanza')
10 | 
def score(system_conllu_file, gold_conllu_file):
    """
    Wrapper for lemma scorer.

    Runs ud_scores on the two files and returns the (precision, recall, f1)
    of the "Lemmas" entry of the result.
    """
    logger.debug("Evaluating system file %s vs gold file %s", system_conllu_file, gold_conllu_file)
    # ud_scores takes the gold file first, which is the reverse of this function's argument order
    lemmas = ud_scores(gold_conllu_file, system_conllu_file)["Lemmas"]
    return lemmas.precision, lemmas.recall, lemmas.f1
18 | 
19 | 


--------------------------------------------------------------------------------
/stanza/utils/avg_sent_len.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import json
 3 | 
 4 | def avg_sent_len(toklabels):
 5 |     if toklabels.endswith('.json'):
 6 |         with open(toklabels, 'r') as f:
 7 |             l = json.load(f)
 8 | 
 9 |         l = [''.join([str(x[1]) for x in para]) for para in l]
10 |     else:
11 |         with open(toklabels, 'r') as f:
12 |             l = ''.join(f.readlines())
13 | 
14 |         l = l.split('\n\n')
15 | 
16 |     sentlen = [len(x) + 1 for para in l for x in para.split('2')]
17 |     return sum(sentlen) / len(sentlen)
18 | 
19 | if __name__ == '__main__':
20 |     print(avg_sent_len(sys.args[1]))
21 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/common_trees.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Look through 2 files, only output the common trees
 3 | 
 4 | pretty basic - could use some more options
 5 | """
 6 | 
 7 | import sys
 8 | 
 9 | def main():
10 |     in1 = sys.argv[1]
11 |     with open(in1, encoding="utf-8") as fin:
12 |         lines1 = fin.readlines()
13 |     in2 = sys.argv[2]
14 |     with open(in2, encoding="utf-8") as fin:
15 |         lines2 = fin.readlines()
16 | 
17 |     common = [l1 for l1, l2 in zip(lines1, lines2) if l1 == l2]
18 |     for l in common:
19 |         print(l.strip())
20 | 
21 | if __name__ == '__main__':
22 |     main()
23 | 
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2019 The Board of Trustees of The Leland Stanford Junior University
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_short_name_to_treebank.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | import stanza
 4 | from stanza.models.common import short_name_to_treebank
 5 | 
 6 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 7 | 
 8 | def test_short_name():
 9 |     assert short_name_to_treebank.short_name_to_treebank("en_ewt") == "UD_English-EWT"
10 | 
11 | def test_canonical_name():
12 |     assert short_name_to_treebank.canonical_treebank_name("UD_URDU-UDTB") == "UD_Urdu-UDTB"
13 |     assert short_name_to_treebank.canonical_treebank_name("ur_udtb") == "UD_Urdu-UDTB"
14 |     assert short_name_to_treebank.canonical_treebank_name("Unban_Mox_Opal") == "Unban_Mox_Opal"
15 | 


--------------------------------------------------------------------------------
/stanza/server/__init__.py:
--------------------------------------------------------------------------------
 1 | from stanza.protobuf import to_text
 2 | from stanza.protobuf import Document, Sentence, Token, IndexedWord, Span
 3 | from stanza.protobuf import ParseTree, DependencyGraph, CorefChain
 4 | from stanza.protobuf import Mention, NERMention, Entity, Relation, RelationTriple, Timex
 5 | from stanza.protobuf import Quote, SpeakerInfo
 6 | from stanza.protobuf import Operator, Polarity
 7 | from stanza.protobuf import SentenceFragment, TokenLocation
 8 | from stanza.protobuf import MapStringString, MapIntString
 9 | from .client import CoreNLPClient, AnnotationException, TimeoutException, PermanentlyFailedException, StartServer
10 | from .annotator import Annotator
11 | 


--------------------------------------------------------------------------------
/demo/scenegraph.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Very short demo for the SceneGraph interface in the CoreNLP server
 3 | 
 4 | Requires CoreNLP >= 4.5.5, Stanza >= 1.5.1
 5 | """
 6 | 
 7 | import json
 8 | 
 9 | from stanza.server import CoreNLPClient
10 | 
11 | # start_server=None if you have the server running in another process on the same host
12 | # you can start it with whatever normal options CoreNLPClient has
13 | #
14 | # preload=False avoids having the server unnecessarily load annotators
15 | # if you don't plan on using them
16 | with CoreNLPClient(preload=False) as client:
17 |     result = client.scenegraph("Jennifer's antennae are on her head.")
18 |     print(json.dumps(result, indent=2))
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/stanza/models/lemma/vocab.py:
--------------------------------------------------------------------------------
 1 | from collections import Counter
 2 | 
 3 | from stanza.models.common.vocab import BaseVocab, BaseMultiVocab
 4 | from stanza.models.common.seq2seq_constant import VOCAB_PREFIX
 5 | 
 6 | class Vocab(BaseVocab):
 7 |     def build_vocab(self):
 8 |         counter = Counter(self.data)
 9 |         self._id2unit = VOCAB_PREFIX + list(sorted(list(counter.keys()), key=lambda k: counter[k], reverse=True))
10 |         self._unit2id = {w:i for i, w in enumerate(self._id2unit)}
11 | 
12 | class MultiVocab(BaseMultiVocab):
13 |     @classmethod
14 |     def load_state_dict(cls, state_dict):
15 |         new = cls()
16 |         for k,v in state_dict.items():
17 |             new[k] = Vocab.load_state_dict(v)
18 |         return new
19 | 


--------------------------------------------------------------------------------
/stanza/utils/constituency/grep_test_logs.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import sys
 3 | 
 4 | filenames = sys.argv[1:]
 5 | 
 6 | total_score = 0.0
 7 | num_scores = 0
 8 | 
 9 | for filename in filenames:
10 |     grep_cmd = ["grep", "F1 score.*test.*", filename]
11 |     grep_result = subprocess.run(grep_cmd, stdout=subprocess.PIPE, encoding="utf-8")
12 |     grep_result = grep_result.stdout.strip()
13 |     if not grep_result:
14 |         print("{}: no result".format(filename))
15 |         continue
16 | 
17 |     score = float(grep_result.split()[-1])
18 |     print("{}: {}".format(filename, score))
19 |     total_score += score
20 |     num_scores += 1
21 | 
22 | if num_scores > 0:
23 |     avg = total_score / num_scores
24 |     print("Avg: {}".format(avg))
25 | 


--------------------------------------------------------------------------------
/stanza/models/common/trainer.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | class Trainer:
 4 |     def change_lr(self, new_lr):
 5 |         for param_group in self.optimizer.param_groups:
 6 |             param_group['lr'] = new_lr
 7 | 
 8 |     def save(self, filename):
 9 |         savedict = {
10 |                    'model': self.model.state_dict(),
11 |                    'optimizer': self.optimizer.state_dict()
12 |                    }
13 |         torch.save(savedict, filename)
14 | 
15 |     def load(self, filename):
16 |         savedict = torch.load(filename, lambda storage, loc: storage, weights_only=True)
17 | 
18 |         self.model.load_state_dict(savedict['model'])
19 |         if self.args['mode'] == 'train':
20 |             self.optimizer.load_state_dict(savedict['optimizer'])
21 | 


--------------------------------------------------------------------------------
/stanza/models/lemma/edit.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utilities for calculating edits between word and lemma forms.
 3 | """
 4 | 
 5 | EDIT_TO_ID = {'none': 0, 'identity': 1, 'lower': 2}
 6 | 
 7 | def get_edit_type(word, lemma):
 8 |     """ Calculate edit types. """
 9 |     if lemma == word:
10 |         return 'identity'
11 |     elif lemma == word.lower():
12 |         return 'lower'
13 |     return 'none'
14 | 
15 | def edit_word(word, pred, edit_id):
16 |     """
17 |     Edit a word, given edit and seq2seq predictions.
18 |     """
19 |     if edit_id == 1:
20 |         return word
21 |     elif edit_id == 2:
22 |         return word.lower()
23 |     elif edit_id == 0:
24 |         return pred
25 |     else:
26 |         raise Exception("Unrecognized edit ID: {}".format(edit_id))
27 | 
28 | 


--------------------------------------------------------------------------------
/stanza/models/mwt/vocab.py:
--------------------------------------------------------------------------------
 1 | from collections import Counter
 2 | 
 3 | from stanza.models.common.vocab import BaseVocab
 4 | import stanza.models.common.seq2seq_constant as constant
 5 | 
 6 | class Vocab(BaseVocab):
 7 |     def build_vocab(self):
 8 |         pairs = self.data
 9 |         allchars = "".join([src + tgt for src, tgt in pairs])
10 |         counter = Counter(allchars)
11 | 
12 |         self._id2unit = constant.VOCAB_PREFIX + list(sorted(list(counter.keys()), key=lambda k: counter[k], reverse=True))
13 |         self._unit2id = {w:i for i, w in enumerate(self._id2unit)}
14 | 
15 |     def add_unit(self, unit):
16 |         if unit in self._unit2id:
17 |             return
18 |         self._unit2id[unit] = len(self._id2unit)
19 |         self._id2unit.append(unit)
20 | 


--------------------------------------------------------------------------------
/stanza/models/pos/scorer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utils and wrappers for scoring taggers.
 3 | """
 4 | import logging
 5 | 
 6 | from stanza.models.common.utils import ud_scores
 7 | 
 8 | logger = logging.getLogger('stanza')
 9 | 
10 | def score(system_conllu_file, gold_conllu_file, verbose=True, eval_type='AllTags'):
11 |     """ Wrapper for tagger scorer. """
12 |     evaluation = ud_scores(gold_conllu_file, system_conllu_file)
13 |     el = evaluation[eval_type]
14 |     p = el.precision
15 |     r = el.recall
16 |     f = el.f1
17 |     if verbose:
18 |         scores = [evaluation[k].f1 * 100 for k in ['UPOS', 'XPOS', 'UFeats', 'AllTags']]
19 |         logger.info("UPOS\tXPOS\tUFeats\tAllTags")
20 |         logger.info("{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}".format(*scores))
21 |     return p, r, f
22 | 
23 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/pos/remove_columns.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Remove xpos and feats from each file given at the command line.
 3 | 
 4 | Useful to strip unwanted tags when combining files of two different
 5 | types (or two different stages in the annotation process).
 6 | 
 7 | Super rudimentary right now.  Will be upgraded if needed
 8 | """
 9 | 
10 | import sys
11 | 
12 | from stanza.utils.conll import CoNLL
13 | 
14 | def remove_columns(filename):
15 |     doc = CoNLL.conll2doc(filename)
16 | 
17 |     for sentence in doc.sentences:
18 |         for word in sentence.words:
19 |             word.xpos = None
20 |             word.feats = None
21 | 
22 |     CoNLL.write_doc2conll(doc, filename)
23 | 
24 | if __name__ == '__main__':
25 |     for filename in sys.argv[1:]:
26 |         remove_columns(filename)
27 | 


--------------------------------------------------------------------------------
/stanza/tests/server/test_morphology.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test the most basic functionality of the morphology script
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | from stanza.server.morphology import Morphology, process_text
 8 | 
 9 | words    = ["Jennifer", "has",  "the", "prettiest", "antennae"]
10 | tags     = ["NNP",      "VBZ",  "DT",  "JJS",       "NNS"]
11 | expected = ["Jennifer", "have", "the", "pretty",    "antenna"]
12 | 
13 | def test_process_text():
14 |     result = process_text(words, tags)
15 |     lemma = [x.lemma for x in result.words]
16 |     print(lemma)
17 |     assert lemma == expected
18 | 
19 | def test_basic_morphology():
20 |     with Morphology() as morph:
21 |         result = morph.process(words, tags)
22 |         lemma = [x.lemma for x in result.words]
23 |         assert lemma == expected
24 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Environment (please complete the following information):**
24 |  - OS: [e.g. Windows, Ubuntu, CentOS, MacOS]
25 |  - Python version: [e.g. Python 3.6.8 from Anaconda]
26 |  - Stanza version: [e.g., 1.0.0]
27 | 
28 | **Additional context**
29 | Add any other context about the problem here.
30 | 


--------------------------------------------------------------------------------
/stanza/models/coref/tokenizer_customization.py:
--------------------------------------------------------------------------------
 1 | """ This file defines functions used to modify the default behaviour
 2 | of transformers.AutoTokenizer. These changes are necessary, because some
 3 | tokenizers are meant to be used with raw text, while the OntoNotes documents
 4 | have already been split into words.
 5 | All the functions are used in coref_model.CorefModel._get_docs. """
 6 | 
 7 | 
 8 | # Filters out unwanted tokens produced by the tokenizer
 9 | TOKENIZER_FILTERS = {
10 |     "albert-xxlarge-v2": (lambda token: token != "▁"),  # U+2581, not just "_"
11 |     "albert-large-v2": (lambda token: token != "▁"),
12 | }
13 | 
14 | # Maps some words to tokens directly, without a tokenizer
15 | TOKENIZER_MAPS = {
16 |     "roberta-large": {".": ["."], ",": [","], "!": ["!"], "?": ["?"],
17 |                       ":":[":"], ";":[";"], "'s": ["'s"]}
18 | }
19 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | **BEFORE YOU START**: please make sure your pull request is against the `dev` branch. 
 2 | We cannot accept pull requests against the `main` branch. 
 3 | See our [contributing guide](https://github.com/stanfordnlp/stanza/blob/main/CONTRIBUTING.md) for details.
 4 | 
 5 | ## Description
 6 | A brief and concise description of what your pull request is trying to accomplish.
 7 | 
 8 | ## Fixes Issues
 9 | A list of issues/bugs with # references. (e.g., #123)
10 | 
11 | ## Unit test coverage
12 | Are there unit tests in place to make sure your code is functioning correctly?
13 | (see [here](https://github.com/stanfordnlp/stanza/blob/master/tests/test_tagger.py) for a simple example)
14 | 
15 | ## Known breaking changes/behaviors
16 | Does this break anything in Stanza's existing user interface? If so, what is it and how is it addressed?
17 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Number of days of inactivity before an issue becomes stale
 2 | daysUntilStale: 60
 3 | # Number of days of inactivity before a stale issue is closed
 4 | daysUntilClose: 7
 5 | # Issues with these labels will never be considered stale
 6 | exemptLabels:
 7 |   - pinned
 8 |   - security
 9 |   - fixed on dev
10 |   - bug
11 |   - enhancement
12 | # Label to use when marking an issue as stale
13 | staleLabel: stale
14 | # Comment to post when marking an issue as stale. Set to `false` to disable
15 | markComment: >
16 |   This issue has been automatically marked as stale because it has not had
17 |   recent activity. It will be closed if no further activity occurs. Thank you
18 |   for your contributions.
19 | # Comment to post when closing a stale issue. Set to `false` to disable
20 | closeComment: >
21 |   This issue has been automatically closed due to inactivity.
22 | 


--------------------------------------------------------------------------------
/stanza/tests/tokenization/test_tokenize_files.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from stanza.models.tokenization import tokenize_files
 4 | from stanza.tests import TEST_MODELS_DIR
 5 | 
 6 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
 7 | 
 8 | EXPECTED = """
 9 | This is a test . This is a second sentence .
10 | I took my daughter ice skating
11 | """.lstrip()
12 | 
13 | def test_tokenize_files(tmp_path):
14 |     input_file = tmp_path / "input.txt"
15 |     with open(input_file, "w") as fout:
16 |         fout.write("This is a test.  This is a second sentence.\n\nI took my daughter ice skating")
17 | 
18 |     output_file = tmp_path / "output.txt"
19 |     tokenize_files.main([str(input_file), "--lang", "en", "--output_file", str(output_file), "--model_dir", TEST_MODELS_DIR])
20 | 
21 |     with open(output_file) as fin:
22 |         text = fin.read()
23 | 
24 |     assert EXPECTED == text
25 | 


--------------------------------------------------------------------------------
/stanza/utils/lemma/count_ambiguous_lemmas.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Read in a UD file, report any word/upos pairs which get lemmatized to different lemmas
 3 | """
 4 | 
 5 | from collections import Counter, defaultdict
 6 | import sys
 7 | 
 8 | from stanza.utils.conll import CoNLL
 9 | 
10 | filename = sys.argv[1]
11 | print(filename)
12 | 
13 | lemma_counters = defaultdict(Counter)
14 | 
15 | doc = CoNLL.conll2doc(input_file=filename)
16 | for sentence in doc.sentences:
17 |     for word in sentence.words:
18 |         text = word.text
19 |         upos = word.upos
20 |         lemma = word.lemma
21 | 
22 |         lemma_counters[(text, upos)][lemma] += 1
23 | 
24 | keys = lemma_counters.keys()
25 | keys = sorted(keys, reverse=True, key=lambda x: sum(lemma_counters[x][y] for y in lemma_counters[x]))
26 | for text, upos in keys:
27 |     if len(lemma_counters[(text, upos)]) > 1:
28 |         print(text, upos, lemma_counters[(text, upos)])
29 | 
30 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - 3.6.5
 4 | notifications:
 5 |   email: false
 6 | install:
 7 |   - pip install --quiet .
 8 |   - export CORENLP_HOME=~/corenlp-latest CORENLP_VERSION=stanford-corenlp-latest
 9 |   - export CORENLP_URL="http://nlp.stanford.edu/software/${CORENLP_VERSION}.zip"
10 |   - wget $CORENLP_URL -O corenlp-latest.zip
11 |   - unzip corenlp-latest.zip > unzip.log
12 |   - export CORENLP_UNZIP=`grep creating unzip.log | head -n 1 | cut -d ":" -f 2`
13 |   - mv $CORENLP_UNZIP $CORENLP_HOME
14 |   - mkdir ~/stanza_test
15 |   - mkdir ~/stanza_test/in
16 |   - mkdir ~/stanza_test/out
17 |   - mkdir ~/stanza_test/scripts
18 |   - cp tests/data/external_server.properties ~/stanza_test/scripts
19 |   - cp tests/data/example_french.json ~/stanza_test/out
20 |   - cp tests/data/tiny_emb.* ~/stanza_test/in
21 |   - export STANZA_TEST_HOME=~/stanza_test
22 | script:
23 |   - python -m pytest -m travis tests/
24 | 


--------------------------------------------------------------------------------
/stanza/resources/print_charlm_depparse.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A small utility script to output which depparse models use charlm
 3 | 
 4 | (It should skip en_genia, en_craft, but currently doesn't)
 5 | 
 6 | Not frequently useful, but seems like the kind of thing that might get used a couple times
 7 | """
 8 | 
 9 | from stanza.resources.common import load_resources_json
10 | from stanza.resources.default_packages import default_charlms, depparse_charlms
11 | 
12 | def list_depparse():
13 |     charlm_langs = list(default_charlms.keys())
14 |     resources = load_resources_json()
15 | 
16 |     models = ["%s_%s" % (lang, model) for lang in charlm_langs for model in resources[lang].get("depparse", {})
17 |               if lang not in depparse_charlms or model not in depparse_charlms[lang] or depparse_charlms[lang][model] is not None]
18 |     return models
19 | 
20 | if __name__ == "__main__":
21 |     models = list_depparse()
22 |     print(" ".join(models))
23 | 


--------------------------------------------------------------------------------
/stanza/tests/resources/test_default_packages.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | import stanza
 4 | 
 5 | from stanza.resources import default_packages
 6 | 
 7 | def test_default_pretrains():
 8 |     """
 9 |     Test that all languages with a default treebank have a default pretrain or are specifically marked as not having a pretrain
10 |     """
11 |     for lang in default_packages.default_treebanks.keys():
12 |         assert lang in default_packages.no_pretrain_languages or lang in default_packages.default_pretrains, "Lang %s does not have a default pretrain marked!" % lang
13 | 
14 | def test_no_pretrain_languages():
15 |     """
16 |     Test that no languages have no_default_pretrain marked despite having a pretrain
17 |     """
18 |     for lang in default_packages.no_pretrain_languages:
19 |         assert lang not in default_packages.default_pretrains, "Lang %s is marked as no_pretrain but has a default pretrain!" % lang
20 | 
21 | 
22 | 
23 | 
24 |     
25 | 


--------------------------------------------------------------------------------
/demo/ssurgeon_script.txt:
--------------------------------------------------------------------------------
 1 | # To run this, use the stanza/server/ssurgeon.py main file.
 2 | # For example:
 3 | # python3 stanza/server/ssurgeon.py  --edit_file demo/ssurgeon_script.txt --no_print_input --input_file ../data/ud2_11/UD_English-Pronouns/en_pronouns-ud-test.conllu > en_pronouns.updated.conllu
 4 | # This script updates the UD 2.11 version of UD_English-Pronouns to
 5 | # better match punctuation attachments, MWT, and no double subjects.
 6 | 
 7 | # This turns unwanted csubj into advcl
 8 | {}=source >nsubj {} >csubj=bad {}
 9 | relabelNamedEdge -edge bad -reln advcl
10 | 
11 | # This detects punctuations which are not attached to the root and reattaches them
12 | {word:/[.]/}=punct <punct=bad {}=parent << {$}=root : {}=parent << {}=root
13 | removeNamedEdge -edge bad
14 | addEdge -gov root -dep punct -reln punct
15 | 
16 | # This detects the specific MWT found in the 2.11 dataset
17 | {}=first . {word:/'s|n't|'ll/}=second
18 | combineMWT -node first -node second
19 | 


--------------------------------------------------------------------------------
/stanza/pipeline/demo/README.md:
--------------------------------------------------------------------------------
 1 | ## Interactive Demo for Stanza
 2 | 
 3 | ### Requirements
 4 | 
 5 | stanza, flask
 6 | 
 7 | ### Run the demo locally
 8 | 
 9 | 1. Make sure you know how to disable your browser's CORS rule. For Chrome, [this extension](https://mybrowseraddon.com/access-control-allow-origin.html) works pretty well.
10 | 2. From this directory, start the Stanza demo server
11 | 
12 | ```bash
13 | export FLASK_APP=demo_server.py
14 | flask run
15 | ```
16 | 
17 | 3. In `stanza-brat.js`, uncomment the line at the top that declares `serverAddress` and point it to where your flask is serving the demo server (usually `http://localhost:5000`)
18 | 
19 | 4. Open `stanza-brat.html` in your browser (with CORS disabled) and enjoy!
20 | 
21 | ### Common issues
22 | 
23 | Make sure you have the models corresponding to the language you want to test out locally before submitting requests to the server! (Models can be obtained by `import stanza; stanza.download(<language_code>)`.)
24 | 


--------------------------------------------------------------------------------
/stanza/utils/visualization/constants.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constants used for visualization tooling
 3 | """
 4 | 
 5 | # Ssurgeon constants
 6 | SAMPLE_SSURGEON_DOC = """
 7 |     # sent_id = 271
 8 |     # text = Hers is easy to clean.
 9 |     # previous = What did the dealer like about Alex's car?
10 |     # comment = extraction/raising via "tough extraction" and clausal subject
11 |     1	Hers	hers	PRON	PRP	Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs	3	nsubj	_	_
12 |     2	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	cop	_	_
13 |     3	easy	easy	ADJ	JJ	Degree=Pos	0	root	_	_
14 |     4	to	to	PART	TO	_	5	mark	_	_
15 |     5	clean	clean	VERB	VB	VerbForm=Inf	3	csubj	_	SpaceAfter=No
16 |     6	.	.	PUNCT	.	_	5	punct	_	_
17 |     """
18 | 
19 | # Semgrex constants
20 | DEFAULT_SAMPLE_TEXT = "Banning opal removed artifact decks from the meta."
21 | DEFAULT_SEMGREX_QUERY = "{pos:NN}=object <obl {}=action, {cpos:NOUN}=thing <obj {cpos:VERB}=action"
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/demo/semgrex.py:
--------------------------------------------------------------------------------
 1 | import stanza
 2 | from stanza.server.semgrex import Semgrex
 3 | 
 4 | nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
 5 | 
 6 | doc = nlp("Banning opal removed all artifact decks from the meta.  I miss playing lantern.")
 7 | with Semgrex(classpath="$CLASSPATH") as sem:
 8 |     semgrex_results = sem.process(doc,
 9 |                                   "{pos:NN}=object <obl {}=action",
10 |                                   "{cpos:NOUN}=thing <obj {cpos:VERB}=action")
11 |     print("COMPLETE RESULTS")
12 |     print(semgrex_results)
13 | 
14 |     print("Number of matches in graph 0 ('Banning opal...') for semgrex query 1 (thing <obj action): %d" % len(semgrex_results.result[0].result[1].match))
15 |     for match_idx, match in enumerate(semgrex_results.result[0].result[1].match):
16 |         print("Match {}:\n-----------\n{}".format(match_idx, match))
17 | 
18 |     print("graph 1 for semgrex query 0 is an empty match: len %d" % len(semgrex_results.result[1].result[0].match))
19 | 


--------------------------------------------------------------------------------
/stanza/models/coref/const.py:
--------------------------------------------------------------------------------
 1 | """ Contains type aliases for coref module """
 2 | 
 3 | from dataclasses import dataclass
 4 | from typing import Any, Dict, List, Tuple
 5 | 
 6 | import torch
 7 | 
 8 | 
 9 | EPSILON = 1e-7
10 | LARGE_VALUE = 1000  # used instead of inf due to bug #16762 in pytorch
11 | 
12 | Doc = Dict[str, Any]
13 | Span = Tuple[int, int]
14 | 
15 | 
16 | @dataclass
17 | class CorefResult:
18 |     coref_scores: torch.Tensor = None                  # [n_words, k + 1]
19 |     coref_y: torch.Tensor = None                       # [n_words, k + 1]
20 |     rough_y: torch.Tensor = None                       # [n_words, n_words]
21 | 
22 |     word_clusters: List[List[int]] = None
23 |     span_clusters: List[List[Span]] = None
24 | 
25 |     rough_scores: torch.Tensor = None                  # [n_words, n_words]
26 |     span_scores: torch.Tensor = None                   # [n_heads, n_words, 2]
27 |     span_y: Tuple[torch.Tensor, torch.Tensor] = None   # [n_heads] x2
28 | 
29 |     zero_scores: torch.Tensor = None
30 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Question
 3 | about: 'Question about general usage. '
 4 | title: "[QUESTION]"
 5 | labels: question
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | Before you start, make sure to check out:
11 | * Our documentation: https://stanfordnlp.github.io/stanza/
12 | * Our FAQ: https://stanfordnlp.github.io/stanza/faq.html
13 | * Github issues (especially closed ones)
14 | Your question might have an answer in these places!
15 | 
16 | If you still couldn't find the answer to your question, feel free to delete this text and write down your question. The more information you provide with your question, the faster we will be able to help you!
17 | 
18 | If you have a question about an issue you're facing when using Stanza, please try to provide a detailed step-by-step guide to reproduce the issue you're facing. Try to at least provide a minimal code sample to reproduce the problem you are facing, instead of just describing it. That would greatly help us in locating the issue faster and help you resolve it!
19 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_dropout.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | import torch
 4 | 
 5 | import stanza
 6 | from stanza.models.common.dropout import WordDropout
 7 | 
 8 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 9 | 
10 | def test_word_dropout():
11 |     """
12 |     Test that word_dropout is randomly dropping out the entire final dimension of a tensor
13 | 
14 |     Doing 600 small rows should be super fast, but it leaves us with
15 |     something like a 1 in 10^180 chance of the test failing.  Not very
16 |     common, in other words
17 |     """
18 |     wd = WordDropout(0.5)
19 |     batch = torch.randn(600, 4)
20 |     dropped = wd(batch)
21 |     # the one time any of this happens, it's going to be really confusing
22 |     assert not torch.allclose(batch, dropped)
23 |     num_zeros = 0
24 |     for i in range(batch.shape[0]):
25 |         assert torch.allclose(dropped[i], batch[i]) or torch.sum(dropped[i]) == 0.0
26 |         if torch.sum(dropped[i]) == 0.0:
27 |             num_zeros += 1
28 |     assert num_zeros > 0 and num_zeros < batch.shape[0]
29 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/combine_ner_datasets.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from stanza.utils.default_paths import get_default_paths
 4 | from stanza.utils.datasets.ner.utils import combine_dataset
 5 | 
 6 | SHARDS = ("train", "dev", "test")
 7 | 
 8 | def main(args=None):
 9 |     ner_data_dir = get_default_paths()['NER_DATA_DIR']
10 | 
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('--output_dataset', type=str, help='What dataset to output')
13 |     parser.add_argument('input_datasets', type=str, nargs='+', help='Which datasets to input')
14 | 
15 |     parser.add_argument('--input_dir', type=str, default=ner_data_dir, help='Which directory to find the datasets')
16 |     parser.add_argument('--output_dir', type=str, default=ner_data_dir, help='Which directory to write the dataset')
17 |     args = parser.parse_args(args)
18 | 
19 |     input_dir = args.input_dir
20 |     output_dir = args.output_dir
21 | 
22 |     combine_dataset(input_dir, output_dir, args.input_datasets, args.output_dataset)
23 | 
24 | if __name__ == '__main__':
25 |     main()
26 | 


--------------------------------------------------------------------------------
/stanza/models/lemma/attach_lemma_classifier.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from stanza.models.lemma.trainer import Trainer
 4 | from stanza.models.lemma_classifier.base_model import LemmaClassifier
 5 | 
 6 | def attach_classifier(input_filename, output_filename, classifiers):
 7 |     trainer = Trainer(model_file=input_filename)
 8 | 
 9 |     for classifier in classifiers:
10 |         classifier = LemmaClassifier.load(classifier)
11 |         trainer.contextual_lemmatizers.append(classifier)
12 | 
13 |     trainer.save(output_filename)
14 | 
def main(args=None):
    """
    Command line entry point for attaching lemma classifiers to a lemmatizer

    args: list of command line arguments; None lets argparse read sys.argv
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, required=True, help='Which lemmatizer to start from')
    parser.add_argument('--output', type=str, required=True, help='Where to save the lemmatizer')
    parser.add_argument('--classifier', type=str, required=True, nargs='+', help='Lemma classifier to attach')
    parsed = parser.parse_args(args)

    attach_classifier(input_filename=parsed.input,
                      output_filename=parsed.output,
                      classifiers=parsed.classifier)
23 | 
24 | if __name__ == '__main__':
25 |     main()
26 | 


--------------------------------------------------------------------------------
/stanza/tests/datasets/test_vietnamese_renormalization.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import os
 3 | 
 4 | from stanza.utils.datasets.vietnamese import renormalize
 5 | 
 6 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 7 | 
 8 | def test_replace_all():
 9 |     text     = "SỌAmple tụy test file"
10 |     expected = "SOẠmple tuỵ test file"
11 | 
12 |     assert renormalize.replace_all(text) == expected
13 | 
def test_replace_file(tmp_path):
    """
    Write a file of ten unnormalized lines, convert it, and check every converted line
    """
    text     = "SỌAmple tụy test file"
    expected = "SOẠmple tuỵ test file"
    num_lines = 10

    source_path = tmp_path / "orig.txt"
    target_path = tmp_path / "converted.txt"

    with open(source_path, "w", encoding="utf-8") as fout:
        fout.write((text + "\n") * num_lines)

    renormalize.convert_file(source_path, target_path)

    assert os.path.exists(target_path)
    with open(target_path, encoding="utf-8") as fin:
        converted_lines = fin.readlines()

    assert len(converted_lines) == num_lines
    assert all(line.strip() == expected for line in converted_lines)
35 |         
36 | 


--------------------------------------------------------------------------------
/stanza/tests/pipeline/test_arabic_pipeline.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Small test of loading the Arabic pipeline
 3 | 
 4 | The main goal is to check that nothing goes wrong with RtL languages,
 5 | but incidentally this would have caught a bug where the xpos tags
 6 | were split into individual pieces instead of reassembled as expected
 7 | """
 8 | 
 9 | import pytest
10 | import stanza
11 | 
12 | from stanza.tests import TEST_MODELS_DIR
13 | 
14 | pytestmark = pytest.mark.pipeline
15 | 
def test_arabic_pos_pipeline():
    """
    Run tokenize,pos on one Arabic sentence and check the first token and the first two xpos tags
    """
    pipe = stanza.Pipeline(processors='tokenize,pos',
                           dir=TEST_MODELS_DIR,
                           download_method=None,
                           lang='ar')
    text = "ولم يتم اعتقال احد بحسب المتحدث باسم الشرطة."

    doc = pipe(text)
    assert len(doc.sentences) == 1

    # the first token translates to "and not", which seems common enough
    # that its MWT split and tags should stay stable
    sentence = doc.sentences[0]
    assert sentence.tokens[0].text == "ولم"
    assert sentence.words[0].xpos == "C---------"
    assert sentence.words[1].xpos == "F---------"
27 |     assert doc.sentences[0].words[1].xpos == "F---------"
28 | 


--------------------------------------------------------------------------------
/stanza/__init__.py:
--------------------------------------------------------------------------------
 1 | from stanza.pipeline.core import DownloadMethod, Pipeline
 2 | from stanza.pipeline.multilingual import MultilingualPipeline
 3 | from stanza.models.common.doc import Document
 4 | from stanza.resources.common import download
 5 | from stanza.resources.installation import install_corenlp, download_corenlp_models
 6 | from stanza._version import __version__, __resources_version__
 7 | 
 8 | import logging
 9 | logger = logging.getLogger('stanza')
10 | 
11 | # if the client application hasn't set the log level, we set it
12 | # ourselves to INFO
13 | if logger.level == 0:
14 |     logger.setLevel(logging.INFO)
15 | 
16 | log_handler = logging.StreamHandler()
17 | log_formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s",
18 |                               datefmt='%Y-%m-%d %H:%M:%S')
19 | log_handler.setFormatter(log_formatter)
20 | 
21 | # also, if the client hasn't added any handlers for this logger
22 | # (or a default handler), we add a handler of our own
23 | #
24 | # client can later do
25 | #   logger.removeHandler(stanza.log_handler)
26 | if not logger.hasHandlers():
27 |     logger.addHandler(log_handler)
28 | 


--------------------------------------------------------------------------------
/demo/semgrex_sample.conllu:
--------------------------------------------------------------------------------
 1 | 
 2 | # sent_id = reviews-181748-0003
 3 | # text = My experience was awful though.
 4 | 1	My	my	PRON	PRP$	Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs	2	nmod:poss	2:nmod:poss	_
 5 | 2	experience	experience	NOUN	NN	Number=Sing	4	nsubj	4:nsubj	_
 6 | 3	was	be	AUX	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	4	cop	4:cop	_
 7 | 4	awful	awful	ADJ	JJ	Degree=Pos	0	root	0:root	_
 8 | 5	though	though	ADV	RB	_	4	advmod	4:advmod	SpaceAfter=No
 9 | 6	.	.	PUNCT	.	_	4	punct	4:punct	_
10 | 
11 | 
12 | 
13 | # sent_id = reviews-117115-0005
14 | # text = The intruders slit the screen of the window.
15 | 1	The	the	DET	DT	Definite=Def|PronType=Art	2	det	2:det	_
16 | 2	intruders	intruder	NOUN	NNS	Number=Plur	3	nsubj	3:nsubj	_
17 | 3	slit	slit	VERB	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	0	root	0:root	_
18 | 4	the	the	DET	DT	Definite=Def|PronType=Art	5	det	5:det	_
19 | 5	screen	screen	NOUN	NN	Number=Sing	3	obj	3:obj	_
20 | 6	of	of	ADP	IN	_	8	case	8:case	_
21 | 7	the	the	DET	DT	Definite=Def|PronType=Art	8	det	8:det	_
22 | 8	window	window	NOUN	NN	Number=Sing	5	nmod	5:nmod:of	SpaceAfter=No
23 | 9	.	.	PUNCT	.	_	3	punct	3:punct	_
24 | 
25 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_bert_embedding.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | from stanza.models.common.bert_embedding import load_bert, extract_bert_embeddings
 5 | 
 6 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 7 | 
 8 | BERT_MODEL = "hf-internal-testing/tiny-bert"
 9 | 
@pytest.fixture(scope="module")
def tiny_bert():
    """
    Load BERT_MODEL once per module and return the two items from load_bert as a tuple
    """
    model, tokenizer = load_bert(BERT_MODEL)
    return model, tokenizer
14 | 
def test_load_bert(tiny_bert):
    """
    Smoke test: passes as long as the tiny_bert fixture loads and unpacks into two items
    """
    model, tokenizer = tiny_bert
20 | 
def test_run_bert(tiny_bert):
    """
    Run one short sentence through extract_bert_embeddings; passes if nothing raises
    """
    model, tokenizer = tiny_bert
    model_device = next(model.parameters()).device
    sentences = [["This", "is", "a", "test"]]
    extract_bert_embeddings(BERT_MODEL, tokenizer, model, sentences, model_device, True)
25 | 
def test_run_bert_empty_word(tiny_bert):
    """
    A sentence with an empty word should embed the same as that sentence with "-" in its place
    """
    model, tokenizer = tiny_bert
    model_device = next(model.parameters()).device

    dash_sentence = [["This", "is", "-", "a", "test"]]
    blank_sentence = [["This", "is", "", "a", "test"]]
    with_dash = extract_bert_embeddings(BERT_MODEL, tokenizer, model, dash_sentence, model_device, True)
    with_blank = extract_bert_embeddings(BERT_MODEL, tokenizer, model, blank_sentence, model_device, True)

    assert len(with_dash) == 1
    assert torch.allclose(with_dash[0], with_blank[0])
34 | 


--------------------------------------------------------------------------------
/stanza/tests/resources/test_prepare_resources.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | import stanza
 4 | import stanza.resources.prepare_resources as prepare_resources
 5 | 
 6 | from stanza.tests import *
 7 | 
 8 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 9 | 
def test_split_model_name():
    """
    split_model_name should break a model filename into (lang, package, processor)
    """
    cases = [
        # Basic test
        ('ro_nonstandard_tagger.pt', ('ro', 'nonstandard', 'pos')),
        # nertagger must be found even though it also ends with tagger,
        # and ncbi_disease must stay in one piece despite the extra _
        ('en_ncbi_disease_nertagger.pt', ('en', 'ncbi_disease', 'ner')),
        # processors with _ in them are also okay
        ('en_pubmed_forward_charlm.pt', ('en', 'pubmed', 'forward_charlm')),
    ]
    for filename, expected in cases:
        lang, package, processor = prepare_resources.split_model_name(filename)
        assert (lang, package, processor) == expected
29 |     
30 |     
31 | 


--------------------------------------------------------------------------------
/stanza/tests/pipeline/test_pipeline_depparse_processor.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Basic testing of the dependency parsing (depparse) processor
 3 | """
 4 | 
 5 | import pytest
 6 | import stanza
 7 | from stanza.models.common.vocab import VOCAB_PREFIX
 8 | 
 9 | from stanza.tests import TEST_MODELS_DIR
10 | 
11 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
12 | 
class TestClassifier:
    """
    Tests of the depparse processor taken from an English pipeline
    """
    @pytest.fixture(scope="class")
    def english_depparse(self):
        """
        Build an English pipeline once for the class and return its depparse processor
        """
        pipeline = stanza.Pipeline(processors='tokenize,pos,lemma,depparse', dir=TEST_MODELS_DIR, lang='en')
        assert 'depparse' in pipeline.processors
        return pipeline.processors['depparse']

    def test_get_known_relations(self, english_depparse):
        """
        Check the known relations reported by the processor

        Only a few stable facts are checked rather than the full list,
        since the exact set of relations may change in the future.
        """
        known = english_depparse.get_known_relations()
        assert len(known) > 5
        assert 'case' in known
        # none of the special vocab entries should be reported as relations
        assert not any(prefix in known for prefix in VOCAB_PREFIX)
34 | 


--------------------------------------------------------------------------------
/stanza/models/classifiers/utils.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | from torch import nn
 4 | 
 5 | """
 6 | Defines some methods which may occur in multiple model types
 7 | """
 8 | # NLP machines:
 9 | # word2vec are in
10 | # /u/nlp/data/stanfordnlp/model_production/stanfordnlp/extern_data/word2vec
11 | # google vectors are in
12 | # /scr/nlp/data/wordvectors/en/google/GoogleNews-vectors-negative300.txt
13 | 
class WVType(Enum):
    """
    Word vector source labels

    NOTE(review): presumably selects how a word vector file is read - confirm against callers
    """
    WORD2VEC = 1
    GOOGLE = 2
    FASTTEXT = 3
    OTHER = 4
19 | 
class ExtraVectors(Enum):
    """
    Options for extra vectors: none, concatenated, or summed

    NOTE(review): the meaning of each option is inferred from its name - confirm against callers
    """
    NONE = 1
    CONCAT = 2
    SUM = 3
24 | 
class ModelType(Enum):
    """
    The kinds of classifier model: CNN or CONSTITUENCY
    """
    CNN = 1
    CONSTITUENCY = 2
28 | 
def build_output_layers(fc_input_size, fc_shapes, num_classes):
    """
    Build the chain of fully connected layers leading from the final conv layer to num_classes

    fc_input_size: input width of the first layer
    fc_shapes: widths of the intermediate layers, in order (may be empty)
    num_classes: output width of the last layer

    Returns an nn.ModuleList with len(fc_shapes) + 1 nn.Linear layers
    """
    layer_sizes = [fc_input_size]
    layer_sizes.extend(fc_shapes)
    layer_sizes.append(num_classes)
    # each layer maps one size in the chain to the next
    linears = [nn.Linear(in_size, out_size)
               for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:])]
    return nn.ModuleList(linears)
42 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_chuliu_edmonds.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test some use cases of the chuliu_edmonds algorithm
 3 | 
 4 | (currently just the tarjan implementation)
 5 | """
 6 | 
 7 | import numpy as np
 8 | import pytest
 9 | 
10 | from stanza.models.common.chuliu_edmonds import tarjan
11 | 
12 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
13 | 
def test_tarjan_basic():
    """
    tarjan should return an empty list for each of two small arrays that have no cycles
    """
    cycle_free = ([0, 4, 4, 4, 0],
                  [0, 2, 0, 4, 2, 2])
    for heads in cycle_free:
        assert tarjan(np.array(heads)) == []
22 | 
def test_tarjan_cycle():
    """
    tarjan should return one boolean mask per cycle, first for one cycle and then for two
    """
    one_cycle = tarjan(np.array([0, 3, 1, 2]))
    assert len(one_cycle) == 1
    np.testing.assert_array_equal(one_cycle[0], np.array([False,  True,  True,  True]))

    two_cycles = tarjan(np.array([0, 3, 1, 2, 5, 6, 4]))
    assert len(two_cycles) == 2
    wanted_masks = [np.array([False,  True,  True,  True, False, False, False]),
                    np.array([False, False, False, False,  True,  True,  True])]
    for found, wanted in zip(two_cycles, wanted_masks):
        np.testing.assert_array_equal(found, wanted)
37 | 


--------------------------------------------------------------------------------
/stanza/tests/tokenization/test_replace_long_tokens.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Check to make sure long tokens are replaced with "UNK" by the tokenization processor
 3 | """
 4 | import pytest
 5 | import stanza
 6 | 
 7 | from stanza.pipeline import tokenize_processor
 8 | 
 9 | from stanza.tests import TEST_MODELS_DIR
10 | 
11 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
12 | 
def test_replace_long_tokens():
    """
    A 10000 character token should come back as TOKEN_TOO_LONG_REPLACEMENT
    """
    pipeline = stanza.Pipeline(lang="en", download_method=None, model_dir=TEST_MODELS_DIR, processors="tokenize")

    text = " ".join(["foo", "x" * 10000, "bar"])
    doc = pipeline(text)

    second_word = doc.sentences[0].words[1]
    assert second_word.text == tokenize_processor.TOKEN_TOO_LONG_REPLACEMENT
21 | 
def test_set_max_len():
    """
    With tokenize_max_seqlen set to 20, the long final token should be replaced
    """
    pipeline = stanza.Pipeline(processors='tokenize',
                               dir=TEST_MODELS_DIR,
                               lang='en',
                               download_method=None,
                               tokenize_max_seqlen=20)
    doc = pipeline("This is a doc withaverylongtokenthatshouldbereplaced")
    assert len(doc.sentences) == 1

    words = doc.sentences[0].words
    assert len(words) == 5
    assert words[-1].text == tokenize_processor.TOKEN_TOO_LONG_REPLACEMENT
31 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_foundation_cache.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | import shutil
 4 | import tempfile
 5 | 
 6 | import pytest
 7 | 
 8 | import stanza
 9 | from stanza.models.common.foundation_cache import FoundationCache, load_charlm
10 | from stanza.tests import TEST_MODELS_DIR
11 | 
12 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
13 | 
def test_charlm_cache():
    """
    A FoundationCache should still return a charlm after the file it was loaded from is gone
    """
    pattern = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
    available = glob.glob(pattern)
    # the tests expect at least one English model to have been downloaded
    assert len(available) >= 1
    source_model = available[0]

    cache = FoundationCache()
    with tempfile.TemporaryDirectory(dir=".") as scratch_dir:
        copied_model = os.path.join(scratch_dir, "charlm.pt")
        shutil.copy2(source_model, copied_model)
        # loading directly works while the copy exists
        model = load_charlm(copied_model)

        # loading through the cache stores the model
        model = cache.load_charlm(copied_model)

    # the temp directory is gone, so a direct load must fail
    with pytest.raises(FileNotFoundError):
        model = load_charlm(copied_model)

    # ... but the cache should remember the model
    model = cache.load_charlm(copied_model)
37 | 


--------------------------------------------------------------------------------
/stanza/models/tokenization/vocab.py:
--------------------------------------------------------------------------------
 1 | from collections import Counter
 2 | import re
 3 | 
 4 | from stanza.models.common.vocab import BaseVocab
 5 | from stanza.models.common.vocab import UNK, PAD
 6 | 
 7 | SPACE_RE = re.compile(r'\s')
 8 | 
 9 | class Vocab(BaseVocab):
10 |     def __init__(self, *args, **kwargs):
11 |         super().__init__(*args, **kwargs)
12 |         self.lang_replaces_spaces = any([self.lang.startswith(x) for x in ['zh', 'ja', 'ko']])
13 | 
14 |     def build_vocab(self):
15 |         paras = self.data
16 |         counter = Counter()
17 |         for para in paras:
18 |             for unit in para:
19 |                 normalized = self.normalize_unit(unit[0])
20 |                 counter[normalized] += 1
21 | 
22 |         self._id2unit = [PAD, UNK] + list(sorted(list(counter.keys()), key=lambda k: counter[k], reverse=True))
23 |         self._unit2id = {w:i for i, w in enumerate(self._id2unit)}
24 | 
25 |     def normalize_unit(self, unit):
26 |         # Normalize minimal units used by the tokenizer
27 |         return unit
28 | 
29 |     def normalize_token(self, token):
30 |         token = SPACE_RE.sub(' ', token.lstrip())
31 | 
32 |         if self.lang_replaces_spaces:
33 |             token = token.replace(' ', '')
34 | 
35 |         return token
36 | 


--------------------------------------------------------------------------------
/stanza/models/common/stanza_object.py:
--------------------------------------------------------------------------------
 1 | def _readonly_setter(self, name):
 2 |     full_classname = self.__class__.__module__
 3 |     if full_classname is None:
 4 |         full_classname = self.__class__.__qualname__
 5 |     else:
 6 |         full_classname += '.' + self.__class__.__qualname__
 7 |     raise ValueError(f'Property "{name}" of "{full_classname}" is read-only.')
 8 | 
 9 | class StanzaObject(object):
10 |     """
11 |     Base class for all Stanza data objects that allows for some flexibility handling annotations
12 |     """
13 | 
14 |     @classmethod
15 |     def add_property(cls, name, default=None, getter=None, setter=None):
16 |         """
17 |         Add a property accessible through self.{name} with underlying variable self._{name}.
18 |         Optionally setup a setter as well.
19 |         """
20 | 
21 |         if hasattr(cls, name):
22 |             raise ValueError(f'Property by the name of {name} already exists in {cls}. Maybe you want to find another name?')
23 | 
24 |         setattr(cls, f'_{name}', default)
25 |         if getter is None:
26 |             getter = lambda self: getattr(self, f'_{name}')
27 |         if setter is None:
28 |             setter = lambda self, value: _readonly_setter(self, name)
29 | 
30 |         setattr(cls, name, property(getter, setter))
31 | 
32 | 


--------------------------------------------------------------------------------
/stanza/pipeline/demo/stanza-brat.css:
--------------------------------------------------------------------------------
 1 | 
 2 | .red {
 3 |   color:#990000
 4 | }
 5 | 
 6 | #wrap {
 7 |   min-height: 100%;
 8 |   height: auto;
 9 |   margin: 0 auto -6ex;
10 |   padding: 0 0 6ex;
11 | }
12 | 
13 | .pattern_tab {
14 |   margin: 1ex;
15 | }
16 | 
17 | .pattern_brat {
18 |   margin-top: 1ex;
19 | }
20 | 
21 | .label {
22 |   color: #777777;
23 |   font-size: small;
24 | }
25 | 
26 | .footer {
27 |   bottom: 0;
28 |   width: 100%;
29 |   /* Set the fixed height of the footer here */
30 |   height: 5ex;
31 |   padding-top: 1ex;
32 |   margin-top: 1ex;
33 |   background-color: #f5f5f5;
34 | }
35 | 
36 | .corenlp_error {
37 |   margin-top: 2ex;
38 | }
39 | 
40 | /* Styling for parse graph */
41 | .node rect {
42 |   stroke: #333;
43 |   fill: #fff;
44 | }
45 | 
46 | .parse-RULE rect {
47 |   fill: #C0D9AF;
48 | }
49 | 
50 | .parse-TERMINAL rect {
51 |   stroke: #333;
52 |   fill: #EEE8AA;
53 | }
54 | 
55 | .node.highlighted {
56 |   stroke: #ffff00;
57 | }
58 | 
59 | .edgePath path {
60 |   stroke: #333;
61 |   fill: #333;
62 |   stroke-width: 1.5px;
63 | }
64 | 
65 | .parse-EDGE path {
66 |   stroke: DarkGray;
67 |   fill: DarkGray;
68 |   stroke-width: 1.5px;
69 | }
70 | 
71 | .logo {
72 |     font-family: "Lato", "Gill Sans MT", "Gill Sans", "Helvetica", "Arial", sans-serif;
73 |     font-style: italic;
74 | }
75 | 


--------------------------------------------------------------------------------
/stanza/tests/ner/test_models_ner_scorer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Simple test of the scorer module for NER
 3 | """
 4 | 
 5 | import pytest
 6 | import stanza
 7 | 
 8 | from stanza.tests import *
 9 | from stanza.models.ner.scorer import score_by_token, score_by_entity
10 | 
11 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
12 | 
def test_ner_scorer():
    """
    Score two short predicted tag sequences against gold, both by token and by entity
    """
    pred_sequences = [['O', 'S-LOC', 'O', 'O', 'B-PER', 'E-PER'],
                      ['O', 'S-MISC', 'O', 'E-ORG', 'O', 'B-PER', 'I-PER', 'E-PER']]
    gold_sequences = [['O', 'B-LOC', 'E-LOC', 'O', 'B-PER', 'E-PER'],
                      ['O', 'S-MISC', 'B-ORG', 'E-ORG', 'O', 'B-PER', 'E-PER', 'S-LOC']]
    tolerance = 0.00001

    token_p, token_r, token_f, _confusion = score_by_token(pred_sequences, gold_sequences)
    assert token_p == pytest.approx(0.625, abs=tolerance)
    assert token_r == pytest.approx(0.5, abs=tolerance)
    assert token_f == pytest.approx(0.55555, abs=tolerance)

    entity_p, entity_r, entity_f, entity_f1 = score_by_entity(pred_sequences, gold_sequences)
    assert entity_p == pytest.approx(0.4, abs=tolerance)
    assert entity_r == pytest.approx(0.33333, abs=tolerance)
    assert entity_f == pytest.approx(0.36363, abs=tolerance)
    assert entity_f1 == {'LOC': 0.0, 'MISC': 1.0, 'ORG': 0.0, 'PER': 0.5}
29 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utilities for the processing of constituency treebanks
 3 | """
 4 | 
 5 | import os
 6 | import shutil
 7 | 
 8 | from stanza.models.constituency import parse_tree
 9 | 
10 | SHARDS = ("train", "dev", "test")
11 | 
def copy_dev_test(base_path, input_dataset, output_dataset):
    """
    Copy the dev and test .mrg files of input_dataset to the names used by output_dataset

    Both the originals and the copies live in base_path.  The train file is not touched.
    """
    for shard in ("dev", "test"):
        source = os.path.join(base_path, "%s_%s.mrg" % (input_dataset, shard))
        destination = os.path.join(base_path, "%s_%s.mrg" % (output_dataset, shard))
        shutil.copy2(source, destination)
17 | 
def write_dataset(datasets, output_dir, dataset_name):
    """
    Write each dataset to output_dir as {dataset_name}_{shard}.mrg

    datasets are paired up with SHARDS in order, so they are expected
    to arrive as train, dev, test.
    """
    for shard, trees in zip(SHARDS, datasets):
        shard_filename = "%s_%s.mrg" % (dataset_name, shard)
        output_filename = os.path.join(output_dir, shard_filename)
        print("Writing {} trees to {}".format(len(trees), output_filename))
        parse_tree.Tree.write_treebank(trees, output_filename)
23 | 
def split_treebank(treebank, train_size, dev_size):
    """
    Split a treebank deterministically into train, dev, test slices

    train_size and dev_size are fractions of the treebank.  The split
    points are rounded down, and test receives whatever is left over.
    """
    total = len(treebank)
    train_end = int(total * train_size)
    dev_end = int(total * (train_size + dev_size))

    train = treebank[:train_end]
    dev = treebank[train_end:dev_end]
    test = treebank[dev_end:]
    return train, dev, test
31 | 


--------------------------------------------------------------------------------
/stanza/tests/server/test_ud_enhancer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import stanza
 3 | from stanza.tests import *
 4 | 
 5 | from stanza.models.common.doc import Document
 6 | import stanza.server.ud_enhancer as ud_enhancer
 7 | 
 8 | pytestmark = [pytest.mark.pipeline]
 9 | 
def check_edges(graph, source, target, num, isExtra=None):
    """
    Assert that graph has exactly num edges going from source to target

    When num is 1, also assert that the isExtra field of that single edge equals isExtra.
    """
    matching = [edge for edge in graph.edge
                if (edge.source, edge.target) == (source, target)]
    assert len(matching) == num
    if num == 1:
        assert matching[0].isExtra == isExtra
15 | 
def test_one_sentence():
    """
    Enhance one parsed sentence and check that exactly one extra edge (7 -> 4) is added
    """
    pipeline = stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,pos,lemma,depparse")
    doc = pipeline("This is the car that I bought")
    result = ud_enhancer.process_doc(doc, language="en", pronouns_pattern=None)

    assert len(result.sentence) == 1
    sentence = result.sentence[0]

    basic_graph = sentence.basicDependencies
    assert len(basic_graph.node) == 7
    assert len(basic_graph.edge) == 6
    check_edges(basic_graph, 4, 7, 1, False)
    check_edges(basic_graph, 7, 4, 0)

    enhanced_graph = sentence.enhancedDependencies
    assert len(enhanced_graph.node) == 7
    assert len(enhanced_graph.edge) == 7
    check_edges(enhanced_graph, 4, 7, 1, False)
    # the edge which only exists in the enhanced graph
    check_edges(enhanced_graph, 7, 4, 1, True)
36 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/count_entities.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import argparse
 3 | from collections import defaultdict
 4 | import json
 5 | 
 6 | from stanza.models.common.doc import Document
 7 | from stanza.utils.datasets.ner.utils import list_doc_entities
 8 | 
 9 | def parse_args():
10 |     parser = argparse.ArgumentParser(description="Report the coverage of one NER file on another.")
11 |     parser.add_argument('filename', type=str, nargs='+', help='File(s) to count')
12 |     args = parser.parse_args()
13 |     return args
14 | 
15 | 
def count_entities(*filenames):
    """
    Print the number of tokens in each file and the total number of entities per key

    Each file is read as JSON and wrapped in a Document.  The entities
    come from list_doc_entities and are tallied by their second field.
    NOTE(review): presumably that field is the entity type - confirm against list_doc_entities
    """
    # only the totals are printed, so keep counts rather than the entities themselves
    entity_counts = defaultdict(int)

    for filename in filenames:
        # JSON is UTF-8; do not rely on the platform default encoding
        with open(filename, encoding="utf-8") as fin:
            doc = Document(json.load(fin))
        num_tokens = sum(len(sentence.tokens) for sentence in doc.sentences)
        print("Number of tokens in %s: %d" % (filename, num_tokens))

        for ent in list_doc_entities(doc):
            entity_counts[ent[1]] += 1

    for key in sorted(entity_counts):
        print(key, entity_counts[key])
32 | 
def main():
    """
    Command line entry point: count the entities in every file named on the command line
    """
    filenames = parse_args().filename
    count_entities(*filenames)
37 | 
38 | if __name__ == '__main__':
39 |     main()
40 | 


--------------------------------------------------------------------------------
/stanza/models/common/count_ner_coverage.py:
--------------------------------------------------------------------------------
 1 | from stanza.models.common import pretrain
 2 | import argparse
 3 | 
 4 | def parse_args():
 5 |     parser = argparse.ArgumentParser()
 6 |     parser.add_argument('ners', type=str, nargs='*', help='Which treebanks to run on')
 7 |     parser.add_argument('--pretrain', type=str, default="/home/john/stanza_resources/hi/pretrain/hdtb.pt", help='Which pretrain to use')
 8 |     parser.set_defaults(ners=["/home/john/stanza/data/ner/hi_fire2013.train.csv",
 9 |                               "/home/john/stanza/data/ner/hi_fire2013.dev.csv"])
10 |     args = parser.parse_args()
11 |     return args
12 | 
13 | 
def read_ner(filename):
    """
    Collect the words in a tab-separated NER file whose tag is anything other than 'O'

    Each non-blank line is expected to hold the word in the first column
    and its tag in the second.  Blank lines are skipped.

    Returns the list of words, in file order, duplicates included.
    Raises IndexError if a non-blank line has no tab.
    """
    words = []
    # the context manager closes the file; utf-8 avoids depending on the platform default
    with open(filename, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            pieces = line.split("\t")
            if pieces[1] == 'O':
                continue
            words.append(pieces[0])
    return words
24 | 
def count_coverage(pretrain, words):
    """
    Return the fraction of words which are found in pretrain.vocab

    Repeated words count once per occurrence.  An empty list of words
    has a coverage of 0.0 instead of raising ZeroDivisionError.
    """
    if not words:
        return 0.0
    found = sum(1 for w in words if w in pretrain.vocab)
    return found / len(words)
31 | 
def main():
    """
    Print, for each NER file on the command line, the fraction of its entity words covered by the pretrain
    """
    args = parse_args()
    pt = pretrain.Pretrain(args.pretrain)
    for dataset in args.ners:
        words = read_ner(dataset)
        print(dataset)
        print(count_coverage(pt, words))
        print()

# guard the script body so that importing this module does not parse
# the command line or load a pretrain as a side effect
if __name__ == '__main__':
    main()
39 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_common_data.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import stanza
 3 | 
 4 | from stanza.tests import *
 5 | from stanza.models.common.data import get_augment_ratio, augment_punct
 6 | 
 7 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 8 | 
 9 | def test_augment_ratio():
10 |     data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
11 |     should_augment = lambda x: x >= 3
12 |     can_augment = lambda x: x >= 4
13 |     # check that zero is returned if no augmentation is needed
14 |     # which will be the case since 2 are already satisfactory
15 |     assert get_augment_ratio(data, should_augment, can_augment, desired_ratio=0.1) == 0.0
16 | 
17 |     # this should throw an error
18 |     with pytest.raises(AssertionError):
19 |         get_augment_ratio(data, can_augment, should_augment)
20 | 
21 |     # with a desired ratio of 0.4,
22 |     # there are already 2 that don't need augmenting
23 |     # and 7 that are eligible to be augmented
24 |     # so 2/7 will need to be augmented
25 |     assert get_augment_ratio(data, should_augment, can_augment, desired_ratio=0.4) == pytest.approx(2/7)
26 | 
def test_augment_punct():
    """
    With an augment ratio of 1.0, the final "." of the only sentence should be removed
    """
    def ends_with_period(sentence):
        return sentence[-1] == "."

    data = [["Simple", "test", "."]]
    # the same predicate serves as both should_augment and can_augment
    augmented = augment_punct(data, 1.0, ends_with_period, ends_with_period)
    assert augmented == [["Simple", "test"]]
33 | 


--------------------------------------------------------------------------------
/stanza/tests/constituency/test_positional_encoding.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | import torch
 4 | 
 5 | from stanza import Pipeline
 6 | from stanza.models.constituency.positional_encoding import SinusoidalEncoding, AddSinusoidalEncoding
 7 | 
 8 | from stanza.tests import *
 9 | 
10 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
11 | 
12 | 
def test_positional_encoding():
    """
    Encoding a single position should give one row of model_dim values
    """
    encoder = SinusoidalEncoding(model_dim=10, max_len=6)
    positions = torch.tensor([5])
    encoded = encoder(positions)
    assert encoded.shape == (1, 10)
    # TODO: check the values
18 | 
def test_resize():
    """
    Asking for position 5 from an encoding built with max_len=3 should still give a (1, 10) result
    """
    encoder = SinusoidalEncoding(model_dim=10, max_len=3)
    positions = torch.tensor([5])
    encoded = encoder(positions)
    assert encoded.shape == (1, 10)
23 | 
24 | 
def test_arange():
    """
    Encoding positions 0..3 with an initial max_len of 2 should leave max_len() at 4
    """
    encoder = SinusoidalEncoding(model_dim=10, max_len=2)
    positions = torch.arange(4)
    encoded = encoder(positions)
    assert encoded.shape == (4, 10)
    assert encoder.max_len() == 4
30 | 
def test_add():
    """
    The encoding added to a random input should match the encoding added to zeros

    Checked for a batch of one and then for each item of a batch of two
    """
    encoder = AddSinusoidalEncoding(d_model=10, max_len=4)

    # encoding an all-zero input leaves just the added values
    baseline = encoder(torch.zeros(1, 4, 10))

    single = torch.randn(1, 4, 10)
    single_encoded = encoder(single)
    assert torch.allclose(single_encoded - single, baseline, atol=1e-07)

    batch = torch.randn(2, 4, 10)
    batch_encoded = encoder(batch)
    for original, encoded in zip(batch, batch_encoded):
        assert torch.allclose(encoded - original, baseline, atol=1e-07)
46 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/prepare_pos_treebank.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A script to prepare all pos datasets.
 3 | 
 4 | For example, do
 5 |   python -m stanza.utils.datasets.prepare_pos_treebank TREEBANK
 6 | such as
 7 |   python -m stanza.utils.datasets.prepare_pos_treebank UD_English-EWT
 8 | 
 9 | and it will prepare each of train, dev, test
10 | """
11 | 
12 | import os
13 | import shutil
14 | 
15 | import stanza.utils.datasets.common as common
16 | import stanza.utils.datasets.prepare_tokenizer_treebank as prepare_tokenizer_treebank
17 | 
def copy_conllu_file_or_zip(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
    """
    Copy {short_name}.{tokenizer_file}.zip from tokenizer_dir to dest_dir if that zip exists

    The copy is named {short_name}.{dest_file}.zip.  When there is no such zip,
    the work is handed to prepare_tokenizer_treebank.copy_conllu_file instead.
    """
    zip_source = f"{tokenizer_dir}/{short_name}.{tokenizer_file}.zip"
    if not os.path.exists(zip_source):
        prepare_tokenizer_treebank.copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name)
        return

    zip_dest = f"{dest_dir}/{short_name}.{dest_file}.zip"
    print(f"Copying from {zip_source} to {zip_dest}")
    shutil.copyfile(zip_source, zip_dest)
27 | 
28 | 
def process_treebank(treebank, model_type, paths, args):
    """
    Hand one treebank to prepare_tokenizer_treebank.copy_conllu_treebank

    The output directory is paths["POS_DATA_DIR"], and copy_conllu_file_or_zip
    is passed as the postprocess step.  args is not used here.
    """
    pos_dir = paths["POS_DATA_DIR"]
    prepare_tokenizer_treebank.copy_conllu_treebank(
        treebank,
        model_type,
        paths,
        pos_dir,
        postprocess=copy_conllu_file_or_zip,
    )
31 | 
def main():
    """
    Run the shared common.main entry point with the POS model type
    """
    model_type = common.ModelType.POS
    common.main(process_treebank, model_type)

if __name__ == '__main__':
    main()
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/stanza/pipeline/external/corenlp_converter_depparse.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A depparse processor which converts constituency trees using CoreNLP
 3 | """
 4 | 
 5 | from stanza.pipeline._constants import TOKENIZE, CONSTITUENCY, DEPPARSE
 6 | from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
 7 | from stanza.server.dependency_converter import DependencyConverter
 8 | 
 9 | @register_processor_variant(DEPPARSE, 'converter')
10 | class ConverterDepparse(ProcessorVariant):
11 |     # set of processor requirements for this processor
12 |     REQUIRES_DEFAULT = set([TOKENIZE, CONSTITUENCY])
13 | 
14 |     def __init__(self, config):
15 |         if config['lang'] != 'en':
16 |             raise ValueError("Constituency to dependency converter only works for English")
17 | 
18 |         # TODO: get classpath from config
19 |         # TODO: close this when finished?
20 |         #   a more involved approach would be to turn the Pipeline into
21 |         #   a context with __enter__ and __exit__
22 |         #   __exit__ would try to free all resources, although some
23 |         #   might linger such as GPU allocations
24 |         #   maybe it isn't worth even trying to clean things up on account of that
25 |         self.converter = DependencyConverter(classpath="$CLASSPATH")
26 |         self.converter.open_pipe()
27 | 
28 |     def process(self, document):
29 |         return self.converter.process(document)
30 | 


--------------------------------------------------------------------------------
/stanza/tests/ner/test_from_conllu.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from stanza import Pipeline
 4 | from stanza.utils.conll import CoNLL
 5 | from stanza.tests import TEST_MODELS_DIR
 6 | 
 7 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
 8 | 
 9 | def test_from_conllu():
10 |     """
11 |     If the doc does not have the entire text available, make sure it still safely processes the text
12 | 
13 |     Test case supplied from user - see issue #1428
14 |     """
15 |     pipe = Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize,ner", download_method=None)
16 |     doc = pipe("In February, I traveled to Seattle.  Dr. Pritchett gave me a new hip")
17 |     ents = [x.text for x in doc.ents]
18 |     # the default NER model ought to find these three
19 |     assert ents == ['February', 'Seattle', 'Pritchett']
20 | 
21 |     doc_conllu = "{:C}\n\n".format(doc)
22 |     doc = CoNLL.conll2doc(input_str=doc_conllu)
23 |     pipe = Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize,ner", tokenize_pretokenized=True, download_method=None)
24 |     pipe(doc)
25 |     ents = [x.text for x in doc.ents]
26 |     # this should still work when processed from a CoNLLu document
27 |     # the bug previously caused a crash because the text to construct
28 |     # the entities was not available, since the Document wouldn't have
29 |     # the entire document text available
30 |     assert ents == ['February', 'Seattle', 'Pritchett']
31 | 


--------------------------------------------------------------------------------
/stanza/models/constituency/evaluate_treebanks.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Read multiple treebanks, score the results.
 3 | 
 4 | Reports the k-best score if multiple predicted treebanks are given.
 5 | """
 6 | 
 7 | import argparse
 8 | 
 9 | from stanza.models.constituency import tree_reader
10 | from stanza.server.parser_eval import EvaluateParser, ParseResult
11 | 
12 | 
def main():
    """
    Score one or more predicted treebanks against a gold treebank

    Command line arguments:
      gold: the file to load as the gold trees
      pred: one or more files of predictions.  If more than one is given,
            the evaluation is "k-best" with the first prediction treated
            as the canonical one

    The i-th tree of each file is grouped into one ParseResult, and the
    whole list is passed to an EvaluateParser.
    """
    parser = argparse.ArgumentParser(description='Get scores for one or more treebanks against the gold')
    parser.add_argument('gold', type=str, help='Which file to load as the gold trees')
    parser.add_argument('pred', type=str, nargs='+', help='Which file(s) are the predictions.  If more than one is given, the evaluation will be "k-best" with the first prediction treated as the canonical')
    args = parser.parse_args()

    print("Loading gold treebank: " + args.gold)
    gold = tree_reader.read_treebank(args.gold)
    # nargs='+' makes args.pred a list, so it has to be joined before
    # it can be concatenated to the message
    print("Loading predicted treebanks: " + ", ".join(args.pred))
    pred = [tree_reader.read_treebank(x) for x in args.pred]

    # zip stops at the shortest treebank, so extra trees in longer files are not scored
    full_results = [ParseResult(gold_tree, pred_trees)
                    for gold_tree, *pred_trees in zip(gold, *pred)]

    # kbest is only passed when there is more than one prediction file
    kbest = len(pred) if len(pred) > 1 else None

    with EvaluateParser(kbest=kbest) as evaluator:
        # the return value is not used here
        evaluator.process(full_results)

if __name__ == '__main__':
    main()
37 | 


--------------------------------------------------------------------------------
/stanza/utils/helper_func.py:
--------------------------------------------------------------------------------
 1 | def make_table(header, content, column_width=None):
 2 |     '''
 3 |     Input:
 4 |     header -> List[str]: table header
 5 |     content -> List[List[str]]: table content
 6 |     column_width -> int: table column width; set to None for dynamically calculated widths
 7 |     
 8 |     Output:
 9 |     table_str -> str: well-formatted string for the table
10 |     '''
11 |     table_str = ''
12 |     len_column, len_row = len(header), len(content) + 1
13 |     if column_width is None:
14 |         # dynamically decide column widths
15 |         lens = [[len(str(h)) for h in header]]
16 |         lens += [[len(str(x)) for x in row] for row in content]
17 |         column_widths = [max(c)+3 for c in zip(*lens)]
18 |     else:
19 |         column_widths = [column_width] * len_column
20 |     
21 |     table_str += '=' * (sum(column_widths) + 1) + '\n'
22 |     
23 |     table_str += '|'
24 |     for i, item in enumerate(header):
25 |         table_str += ' ' + str(item).ljust(column_widths[i] - 2) + '|'
26 |     table_str += '\n'
27 |     
28 |     table_str += '-' * (sum(column_widths) + 1) + '\n'
29 |     
30 |     for line in content:
31 |         table_str += '|'
32 |         for i, item in enumerate(line):
33 |             table_str += ' ' + str(item).ljust(column_widths[i] - 2) + '|'
34 |         table_str += '\n'
35 |     
36 |     table_str += '=' * (sum(column_widths) + 1) + '\n'
37 |     
38 |     return table_str
39 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/pretrain/word_in_pretrain.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Simple tool to query a word vector file to see if certain words are in that file
 3 | """
 4 | 
 5 | import argparse
 6 | import os
 7 | 
 8 | from stanza.models.common.pretrain import Pretrain
 9 | from stanza.resources.common import DEFAULT_MODEL_DIR, download
10 | 
def main():
    """
    Report, for each word on the command line, whether it is in a pretrain's vocab

    Exactly one of --pretrain (a converted PT file) or --package must be given.
    A package is split at its first underscore into a language and a package
    name, passed to download, and then read from DEFAULT_MODEL_DIR.
    """
    parser = argparse.ArgumentParser()
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--pretrain", default=None, type=str, help="Where to read the converted PT file")
    source.add_argument("--package", default=None, type=str, help="Use a pretrain package instead")
    parser.add_argument("--download_json", default=False, action='store_true', help="Download the json even if it already exists")
    parser.add_argument("words", type=str, nargs="+", help="Which words to search for")
    args = parser.parse_args()

    if args.pretrain:
        pt_filename = args.pretrain
    else:
        lang, package = args.package.split("_", 1)
        download(lang=lang, package=None, processors={"pretrain": package}, download_json=args.download_json)
        pt_filename = os.path.join(DEFAULT_MODEL_DIR, lang, "pretrain", "%s.pt" % package)
    pt = Pretrain(pt_filename)

    for word in args.words:
        found = word in pt.vocab
        print(f"{word}: {found}")

if __name__ == "__main__":
    main()
33 | 


--------------------------------------------------------------------------------
/stanza/models/common/maxout_linear.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A layer which implements maxout from the "Maxout Networks" paper
 3 | 
 4 | https://arxiv.org/pdf/1302.4389v4.pdf
 5 | Goodfellow, Warde-Farley, Mirza, Courville, Bengio
 6 | 
 7 | or a simpler explanation here:
 8 | 
 9 | https://stats.stackexchange.com/questions/129698/what-is-maxout-in-neural-network/298705#298705
10 | 
11 | The implementation here:
12 | for k layers of maxout, in -> out channels, we make a single linear
13 |   map of size in -> out*k
14 | then we reshape the end to be (..., k, out)
15 | and return the max over the k layers
16 | """
17 | 
18 | 
19 | import torch
20 | import torch.nn as nn
21 | 
class MaxoutLinear(nn.Module):
    """
    A linear map to maxout_k candidates per output channel, followed by a max over the candidates
    """
    def __init__(self, in_channels, out_channels, maxout_k):
        super().__init__()

        self.in_channels, self.out_channels, self.maxout_k = in_channels, out_channels, maxout_k

        # one projection holds all maxout_k candidate maps
        self.linear = nn.Linear(in_channels, out_channels * maxout_k)

    def forward(self, inputs):
        """
        Apply the oversized linear once, then keep the max of the maxout_k candidates

        The last dimension of the projection is viewed as (maxout_k, out_channels),
        and the max is taken over the maxout_k dimension, so the result has
        out_channels as its last dimension.
        """
        candidates = self.linear(inputs)
        leading_dims = candidates.shape[:-1]
        candidates = candidates.view(*leading_dims, self.maxout_k, self.out_channels)
        best, _ = torch.max(candidates, dim=-2)
        return best
42 | 
43 | 


--------------------------------------------------------------------------------
/stanza/tests/constituency/test_tree_stack.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from stanza.models.constituency.tree_stack import TreeStack
 4 | 
 5 | from stanza.tests import *
 6 | 
 7 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
 8 | 
 9 | def test_simple():
10 |     stack = TreeStack(value=5, parent=None, length=1)
11 |     stack = stack.push(3)
12 |     stack = stack.push(1)
13 | 
14 |     expected_values = [1, 3, 5]
15 |     for value in expected_values:
16 |         assert stack.value == value
17 |         stack = stack.pop()
18 |     assert stack is None
19 | 
def test_iter():
    """
    Iterating a stack yields the most recently pushed value first
    """
    stack = TreeStack(value=5, parent=None, length=1)
    stack = stack.push(3)
    stack = stack.push(1)

    # check the first pass, which was previously built but never inspected
    stack_list = list(stack)
    assert stack_list == [1, 3, 5]
    # a second pass over the same stack should give the same values
    assert list(stack) == [1, 3, 5]
27 | 
def test_str():
    """
    str() of a stack lists the values with the most recently pushed first
    """
    stack = TreeStack(value=5, parent=None, length=1).push(3).push(1)

    rendered = str(stack)
    assert rendered == "TreeStack(1, 3, 5)"
34 | 
def test_len():
    """
    len() is 1 for a fresh stack and goes up by one for each push
    """
    bottom = TreeStack(value=5, parent=None, length=1)
    assert len(bottom) == 1

    taller = bottom.push(3).push(1)
    assert len(taller) == 3
42 | 
def test_long_len():
    """
    Original stack had a bug where this took exponential time...

    Builds a stack of 40 values and checks its length
    """
    stack = TreeStack(value=0, parent=None, length=1)
    for value in range(1, 40):
        stack = stack.push(value)
    expected_length = 40
    assert len(stack) == expected_length
51 | 


--------------------------------------------------------------------------------
/stanza/tests/datasets/ner/test_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test the utils file of the NER dataset processing
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | from stanza.utils.datasets.ner.utils import list_doc_entities
 8 | from stanza.tests.datasets.ner.test_prepare_ner_file import BIO_1, BIO_2, write_and_convert
 9 | 
def test_list_doc_entities(tmp_path):
    """
    Test the function which lists all of the entities in a doc

    Checks BIO_1 and BIO_2 on their own, then two concatenations of them
    """
    bio_1_entities = [(('Jennifer', "Sh'reyan"), 'PERSON')]
    bio_2_entities = [(('Jennifer',), 'PERSON'), (('Beckett',), 'PERSON'), (('Cerritos',), 'LOCATION')]

    cases = [
        (BIO_1, bio_1_entities),
        (BIO_2, bio_2_entities),
        ("\n\n".join([BIO_1, BIO_2]), bio_1_entities + bio_2_entities),
        ("\n\n".join([BIO_1, BIO_1, BIO_2]), bio_1_entities * 2 + bio_2_entities),
    ]

    for bio_text, expected in cases:
        doc = write_and_convert(tmp_path, bio_text)
        entities = list_doc_entities(doc)
        assert expected == entities
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/stanza/utils/get_tqdm.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | def get_tqdm():
 4 |     """
 5 |     Return a tqdm appropriate for the situation
 6 | 
 7 |     imports tqdm depending on if we're at a console, redir to a file, notebook, etc
 8 | 
 9 |     from @tcrimi at https://github.com/tqdm/tqdm/issues/506
10 | 
11 |     This replaces `import tqdm`, so for example, you do this:
12 |       from stanza.utils.get_tqdm import get_tqdm
13 |       tqdm = get_tqdm()
14 |     then do this when you want a scroll bar or regular iterator depending on context:
15 |       tqdm(list)
16 | 
17 |     If there is no tty, the returned tqdm will always be disabled
18 |     unless disable=False is specifically set.
19 |     """
20 |     ipy_str = ""
21 |     try:
22 |         from IPython import get_ipython
23 |         ipy_str = str(type(get_ipython()))
24 |     except ImportError:
25 |         pass
26 | 
27 |     if 'zmqshell' in ipy_str:
28 |         from tqdm import tqdm_notebook as tqdm
29 |         return tqdm
30 |     if 'terminal' in ipy_str:
31 |         from tqdm import tqdm
32 |         return tqdm
33 | 
34 |     if sys.stderr is not None and hasattr(sys.stderr, "isatty") and sys.stderr.isatty():
35 |         from tqdm import tqdm
36 |         return tqdm
37 | 
38 |     from tqdm import tqdm
39 |     def hidden_tqdm(*args, **kwargs):
40 |         if "disable" in kwargs:
41 |             return tqdm(*args, **kwargs)
42 |         kwargs["disable"] = True
43 |         return tqdm(*args, **kwargs)
44 | 
45 |     return hidden_tqdm
46 | 
47 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/convert_spmrl.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from stanza.models.constituency.parse_tree import Tree
 4 | from stanza.models.constituency.tree_reader import read_treebank
 5 | from stanza.utils.default_paths import get_default_paths
 6 | 
 7 | SHARDS = ("train", "dev", "test")
 8 | 
 9 | def add_root(tree):
10 |     if tree.label.startswith("NN"):
11 |         tree = Tree("NP", tree)
12 |     if tree.label.startswith("NE"):
13 |         tree = Tree("PN", tree)
14 |     elif tree.label.startswith("XY"):
15 |         tree = Tree("VROOT", tree)
16 |     return Tree("ROOT", tree)
17 | 
def convert_spmrl(input_directory, output_directory, short_name):
    """
    Read each shard's {shard}.German.gold.ptb and write it out one tree per line

    The input for a shard is in input_directory/{shard}.  Each tree goes through
    add_root as it is read.  The output is output_directory/{short_name}_{shard}.mrg
    """
    for shard in SHARDS:
        source = os.path.join(input_directory, shard, f"{shard}.German.gold.ptb")
        trees = read_treebank(source, tree_callback=add_root)

        destination = os.path.join(output_directory, f"{short_name}_{shard}.mrg")
        with open(destination, "w", encoding="utf-8") as fout:
            for tree in trees:
                fout.write(str(tree) + "\n")
        print(f"Wrote {len(trees)} trees to {destination}")
28 | 
if __name__ == '__main__':
    # output goes to the default constituency data directory;
    # the input location is a fixed relative path
    default_paths = get_default_paths()
    spmrl_directory = "extern_data/constituency/spmrl/SPMRL_SHARED_2014/GERMAN_SPMRL/gold/ptb"
    convert_spmrl(spmrl_directory, default_paths["CONSTITUENCY_DATA_DIR"], "de_spmrl")
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/stanza/tests/resources/test_charlm_depparse.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from stanza.resources.default_packages import default_charlms, depparse_charlms
 4 | from stanza.resources.print_charlm_depparse import list_depparse
 5 | 
 6 | def test_list_depparse():
 7 |     models = list_depparse()
 8 | 
 9 |     # check that it's picking up the models which don't have specific charlms
10 |     # first, make sure the default assumption of the test is still true...
11 |     # if this test fails, find a different language which isn't in depparse_charlms
12 |     assert "af" not in depparse_charlms
13 |     assert "af" in default_charlms
14 |     assert "af_afribooms_charlm" in models
15 |     assert "af_afribooms_nocharlm" in models
16 | 
17 |     # assert that it's picking up the models which do have specific charlms that aren't None
18 |     # again, first make sure the default assumptions are true
19 |     # if one of these next few tests fail, just update the test
20 |     assert "en" in depparse_charlms
21 |     assert "en" in default_charlms
22 |     assert "ewt" not in depparse_charlms["en"]
23 |     assert "craft" in depparse_charlms["en"]
24 |     assert "mimic" in depparse_charlms["en"]
25 |     # now, check the results
26 |     assert "en_ewt_charlm" in models
27 |     assert "en_ewt_nocharlm" in models
28 |     assert "en_mimic_charlm" in models
29 |     # haven't yet trained w/ and w/o for the bio models
30 |     assert "en_mimic_nocharlm" not in models
31 |     assert "en_craft_charlm" not in models
32 |     assert "en_craft_nocharlm" in models
33 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/compare_entities.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Report the fraction of NER entities in one file which are present in another.
 3 | 
 4 | Purpose: show the coverage of one file on another, such as reporting
 5 | the number of entities in one dataset on another
 6 | """
 7 | 
 8 | 
 9 | import argparse
10 | 
11 | from stanza.utils.datasets.ner.utils import read_json_entities
12 | 
def parse_args():
    """
    Parse the command line: --train and --test are both required and each takes one or more files
    """
    parser = argparse.ArgumentParser(description="Report the coverage of one NER file on another.")
    parser.add_argument('--train', type=str, nargs="+", required=True, help='File to use to collect the known entities (not necessarily train).')
    parser.add_argument('--test', type=str, nargs="+", required=True, help='File for which we want to know the ratio of known entities')
    return parser.parse_args()
19 | 
def report_known_entities(train_file, test_file):
    """
    Print the fraction of entities in test_file whose first field also occurs in train_file

    Both files are read with read_json_entities.  The output line is the
    two filenames followed by the fraction.
    """
    train_entities = read_json_entities(train_file)
    test_entities = read_json_entities(test_file)

    # only the first field of each entity is used for the comparison
    known = {entity[0] for entity in train_entities}
    num_known = len([entity for entity in test_entities if entity[0] in known])
    print(train_file, test_file, num_known / len(test_entities))
27 | 
def main():
    """
    Report the coverage of each --train file on each --test file

    The reports for different train files are separated by a blank line
    """
    args = parse_args()

    first_group = True
    for train_file in args.train:
        if not first_group:
            print()
        first_group = False
        for test_file in args.test:
            report_known_entities(train_file, test_file)

if __name__ == '__main__':
    main()
39 | 


--------------------------------------------------------------------------------
/stanza/models/coref/coref_chain.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Coref chain suitable for attaching to a Document after coref processing
 3 | """
 4 | 
 5 | # by not using namedtuple, we can use this object as output from the json module
 6 | # in the doc class as long as we wrap the encoder to print these out in dict() form
 7 | # CorefMention = namedtuple('CorefMention', ['sentence', 'start_word', 'end_word'])
 8 | class CorefMention:
 9 |     def __init__(self, sentence, start_word, end_word):
10 |         self.sentence = sentence
11 |         self.start_word = start_word
12 |         self.end_word = end_word
13 | 
class CorefChain:
    """
    Plain record of one chain: its index, its mentions, and its representative text and index
    """
    def __init__(self, index, mentions, representative_text, representative_index):
        self.index, self.mentions = index, mentions
        self.representative_text, self.representative_index = representative_text, representative_index
20 | 
class CorefAttachment:
    """
    A chain together with three flags: is_start, is_end, is_representative
    """
    def __init__(self, chain, is_start, is_end, is_representative):
        self.chain = chain
        self.is_start, self.is_end, self.is_representative = is_start, is_end, is_representative

    def to_json(self):
        """
        Build a dict with the chain's index and representative_text

        Each flag which is set is added with the value True; unset flags are left out
        """
        chain = self.chain
        result = {
            "index": chain.index,
            "representative_text": chain.representative_text
        }
        for flag in ('is_start', 'is_end', 'is_representative'):
            if getattr(self, flag):
                result[flag] = True
        return result
40 | 


--------------------------------------------------------------------------------
/stanza/server/tokensregex.py:
--------------------------------------------------------------------------------
 1 | """Invokes the Java tokensregex on a document
 2 | 
 3 | This operates tokensregex on docs processed with stanza models.
 4 | 
 5 | https://nlp.stanford.edu/software/tokensregex.html
 6 | 
 7 | A minimal example is the main method of this module.
 8 | """
 9 | 
10 | import stanza
11 | 
12 | from stanza.protobuf import TokensRegexRequest, TokensRegexResponse
13 | from stanza.server.java_protobuf_requests import send_request, add_sentence
14 | 
def send_tokensregex_request(request):
    """
    Pass the request to send_request, naming TokensRegexResponse and the Java class to run
    """
    java_main = "edu.stanford.nlp.ling.tokensregex.ProcessTokensRegexRequest"
    return send_request(request, TokensRegexResponse, java_main)
18 | 
def process_doc(doc, *patterns):
    """
    Build a TokensRegexRequest from a doc and the given patterns, then send it

    Each sentence is added along with the number of words in the sentences before it.
    Returns whatever send_tokensregex_request returns.
    """
    request = TokensRegexRequest()
    request.pattern.extend(patterns)

    request.doc.text = doc.text
    words_so_far = 0
    for sentence in doc.sentences:
        add_sentence(request.doc.sentence, sentence, words_so_far)
        words_so_far += sum(len(token.words) for token in sentence.tokens)

    return send_tokensregex_request(request)
32 | 
def main():
    """
    Minimal example: tokenize a short text and search it for the patterns "him" and "ruined"
    """
    # other annotators could be requested as well, such as
    #   processors='tokenize,pos,lemma,ner'
    nlp = stanza.Pipeline('en', processors='tokenize')

    doc = nlp('Uro ruined modern.  Fortunately, Wotc banned him')
    response = process_doc(doc, "him", "ruined")
    print(response)


if __name__ == '__main__':
    main()
45 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/convert_kk_kazNERD.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Convert a Kazakh NER dataset to our internal .json format
 3 | The dataset is here:
 4 | 
 5 | https://github.com/IS2AI/KazNERD/tree/main/KazNERD
 6 | """
 7 | 
 8 | import argparse
 9 | import os
10 | import shutil
11 | # import random
12 | 
13 | from stanza.utils.datasets.ner.utils import convert_bio_to_json, SHARDS
14 | 
def convert_dataset(in_directory, out_directory, short_name):
    """
    Reads in train, validation, and test data and converts them to .json file

    Each IOB2 file is first copied to out_directory as {short_name}.{shard}.bio,
    then convert_bio_to_json is run on out_directory.
    """
    iob2_files = ("IOB2_train.txt", "IOB2_valid.txt", "IOB2_test.txt")
    for shard, iob2_file in zip(SHARDS, iob2_files):
        source = os.path.join(in_directory, iob2_file)
        destination = os.path.join(out_directory, f"{short_name}.{shard}.bio")
        shutil.copy(source, destination)
    convert_bio_to_json(out_directory, out_directory, short_name, "bio")
25 | 
if __name__ == '__main__':
    # both paths default to locations under /nlp/scr/aaydin/kazNERD
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', type=str, default="/nlp/scr/aaydin/kazNERD/NER", help="Where to find the files")
    parser.add_argument('--output_path', type=str, default="/nlp/scr/aaydin/kazNERD/data/ner", help="Where to output the results")
    args = parser.parse_args()
    input_path, output_path = args.input_path, args.output_path
    convert_dataset(input_path, output_path, "kk_kazNERD")
35 | 
36 | 


--------------------------------------------------------------------------------
/stanza/tests/server/test_tokensregex.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from stanza.tests import *
 3 | 
 4 | from stanza.models.common.doc import Document
 5 | import stanza.server.tokensregex as tokensregex
 6 | 
 7 | pytestmark = [pytest.mark.travis, pytest.mark.client]
 8 | 
 9 | from stanza.tests.server.test_semgrex import ONE_SENTENCE_DOC, TWO_SENTENCE_DOC
10 | 
def test_single_sentence():
    """
    A plain word pattern should match "Opal" once in the one sentence doc
    """
    #expected:
    #match {
    #  sentence: 0
    #  match {
    #    text: "Opal"
    #    begin: 2
    #    end: 3
    #  }
    #}

    response = tokensregex.process_doc(ONE_SENTENCE_DOC, "Opal")
    assert len(response.match) == 1
    pattern_matches = response.match[0].match
    assert len(pattern_matches) == 1

    found = pattern_matches[0]
    assert found.sentence == 0
    assert found.match.text == "Opal"
    assert found.match.begin == 2
    assert found.match.end == 3
29 | 
30 | 
def test_ner_sentence():
    """
    A pattern on the ner tag GEM should match "Opal" once in the one sentence doc
    """
    #expected:
    #match {
    #  sentence: 0
    #  match {
    #    text: "Opal"
    #    begin: 2
    #    end: 3
    #  }
    #}

    response = tokensregex.process_doc(ONE_SENTENCE_DOC, "[ner: GEM]")
    assert len(response.match) == 1
    pattern_matches = response.match[0].match
    assert len(pattern_matches) == 1

    found = pattern_matches[0]
    assert found.sentence == 0
    assert found.match.text == "Opal"
    assert found.match.begin == 2
    assert found.match.end == 3
49 | 


--------------------------------------------------------------------------------
/stanza/tests/ner/test_combine_ner_datasets.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import pytest
 4 | 
 5 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 6 | 
 7 | from stanza.models.common.doc import Document
 8 | from stanza.tests.ner.test_ner_training import write_temp_file, EN_TRAIN_BIO, EN_DEV_BIO
 9 | from stanza.utils.datasets.ner import combine_ner_datasets
10 | 
11 | 
def test_combine(tmp_path):
    """
    Test that if we write two short datasets and combine them, we get back
    one slightly longer dataset

    To simplify matters, we just use the same input text with longer
    amounts of text for each shard.
    """
    SHARDS = ("train", "dev", "test")
    inputs = (("en_t1", EN_TRAIN_BIO), ("en_t2", EN_DEV_BIO))

    # eg, 1x, 2x, 3x the test data from test_ner_training
    for copies, shard in enumerate(SHARDS, start=1):
        for dataset, bio_text in inputs:
            shard_file = tmp_path / f"{dataset}.{shard}.json"
            write_temp_file(shard_file, "\n\n".join([bio_text] * copies))

    combine_ner_datasets.main([
        "--output_dataset", "en_c",
        "en_t1", "en_t2",
        "--input_dir", str(tmp_path),
        "--output_dir", str(tmp_path),
    ])

    for copies, shard in enumerate(SHARDS, start=1):
        combined_file = tmp_path / f"en_c.{shard}.json"
        assert os.path.exists(combined_file)

        with open(combined_file, encoding="utf-8") as fin:
            doc = Document(json.load(fin))
            assert len(doc.sentences) == copies * 3
39 | 
40 | 


--------------------------------------------------------------------------------
/stanza/tests/ner/test_convert_starlang_ner.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test a couple different classes of trees to check the output of the Starlang conversion for NER
 3 | """
 4 | 
 5 | import os
 6 | import tempfile
 7 | 
 8 | import pytest
 9 | 
10 | from stanza.utils.datasets.ner import convert_starlang_ner
11 | 
12 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
13 | 
# A single tree in the Starlang format, used as the input to test_read_tree.
# Each leaf carries {key=value} annotations; the test expects the {turkish=...}
# text of each leaf paired with its {namedEntity=...} label, with NONE reported as O
TREE="( (S (NP (NP {morphologicalAnalysis=bayan+NOUN+A3SG+PNON+NOM}{metaMorphemes=bayan}{turkish=Bayan}{english=Ms.}{semantics=TUR10-0396530}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580}{englishSemantics=ENG31-06352895-n}) (NP {morphologicalAnalysis=haag+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=haag}{turkish=Haag}{english=Haag}{semantics=TUR10-0000000}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580}))  (VP (NP {morphologicalAnalysis=elianti+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=elianti}{turkish=Elianti}{english=Elianti}{semantics=TUR10-0000000}{namedEntity=NONE}{propBank=ARG1$TUR10-0148580}) (VP {morphologicalAnalysis=çal+VERB+POS+AOR+A3SG}{metaMorphemes=çal+Ar}{turkish=çalar}{english=plays}{semantics=TUR10-0148580}{namedEntity=NONE}{propBank=PREDICATE$TUR10-0148580}{englishSemantics=ENG31-01730049-v}))  (. {morphologicalAnalysis=.+PUNC}{metaMorphemes=.}{metaMorphemesMoved=.}{turkish=.}{english=.}{semantics=TUR10-1081860}{namedEntity=NONE}{propBank=NONE}))  )"
15 | 
def test_read_tree():
    """
    Read the sample tree and check the (word, tag) pairs which come back
    """
    expected_words = ['Bayan', 'Haag', 'Elianti', 'çalar', '.']
    expected_tags = ['PERSON', 'PERSON', 'O', 'O', 'O']
    assert convert_starlang_ner.read_tree(TREE) == list(zip(expected_words, expected_tags))
23 | 
24 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/preprocess_wikiner.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Converts the WikiNER data format to a format usable by our processing tools
 3 | 
 4 | python preprocess_wikiner input output
 5 | """
 6 | 
 7 | import sys
 8 | 
 9 | def preprocess_wikiner(input_file, output_file, encoding="utf-8"):
10 |     with open(input_file, encoding=encoding) as fin:
11 |         with open(output_file, "w", encoding="utf-8") as fout:
12 |             for line in fin:
13 |                 line = line.strip()
14 |                 if not line:
15 |                     fout.write("-DOCSTART- O\n")
16 |                     fout.write("\n")
17 |                     continue
18 | 
19 |                 words = line.split()
20 |                 for word in words:
21 |                     pieces = word.split("|")
22 |                     text = pieces[0]
23 |                     tag = pieces[-1]
24 |                     # some words look like Daniel_Bernoulli|I-PER
25 |                     # but the original .pl conversion script didn't take that into account
26 |                     subtext = text.split("_")
27 |                     if tag.startswith("B-") and len(subtext) > 1:
28 |                         fout.write("{} {}\n".format(subtext[0], tag))
29 |                         for chunk in subtext[1:]:
30 |                             fout.write("{} I-{}\n".format(chunk, tag[2:]))
31 |                     else:
32 |                         for chunk in subtext:
33 |                             fout.write("{} {}\n".format(chunk, tag))
34 |                 fout.write("\n")
35 | 
# command line usage: the first argument is the input file, the second is the output file
# the input is read with the default encoding of preprocess_wikiner
if __name__ == '__main__':
    preprocess_wikiner(sys.argv[1], sys.argv[2])
38 | 


--------------------------------------------------------------------------------
/stanza/models/coref/loss.py:
--------------------------------------------------------------------------------
 1 | """ Describes the loss function used to train the model, which is a weighted
 2 | sum of NLML and BCE losses. """
 3 | 
 4 | import torch
 5 | 
 6 | 
 7 | class CorefLoss(torch.nn.Module):
 8 |     """ See the rationale for using NLML in Lee et al. 2017
 9 |     https://www.aclweb.org/anthology/D17-1018/
10 |     The added weighted summand of BCE helps the model learn even after
11 |     converging on the NLML task. """
12 | 
13 |     def __init__(self, bce_weight: float):
14 |         assert 0 <= bce_weight <= 1
15 |         super().__init__()
16 |         self._bce_module = torch.nn.BCEWithLogitsLoss()
17 |         self._bce_weight = bce_weight
18 | 
19 |     def forward(self,    # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
20 |                 input_: torch.Tensor,
21 |                 target: torch.Tensor) -> torch.Tensor:
22 |         """ Returns a weighted sum of two losses as a torch.Tensor """
23 |         return (self._nlml(input_, target)
24 |                 + self._bce(input_, target) * self._bce_weight)
25 | 
26 |     def _bce(self,
27 |              input_: torch.Tensor,
28 |              target: torch.Tensor) -> torch.Tensor:
29 |         """ For numerical stability, clamps the input before passing it to BCE.
30 |         """
31 |         return self._bce_module(torch.clamp(input_, min=-50, max=50), target)
32 | 
33 |     @staticmethod
34 |     def _nlml(input_: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
35 |         gold = torch.logsumexp(input_ + torch.log(target), dim=1)
36 |         input_ = torch.logsumexp(input_, dim=1)
37 |         return (input_ - gold).mean()
38 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/convert_nytk.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import glob
 3 | import os
 4 | 
 5 | def convert_nytk(base_input_path, base_output_path, short_name):
 6 |     for shard in ('train', 'dev', 'test'):
 7 |         if shard == 'dev':
 8 |             base_input_subdir = os.path.join(base_input_path, "data/train-devel-test/devel")
 9 |         else:
10 |             base_input_subdir = os.path.join(base_input_path, "data/train-devel-test", shard)
11 | 
12 |         shard_lines = []
13 |         base_input_glob = base_input_subdir + "/*/no-morph/*"
14 |         subpaths = glob.glob(base_input_glob)
15 |         print("Reading %d input files from %s" % (len(subpaths), base_input_glob))
16 |         for input_filename in subpaths:
17 |             if len(shard_lines) > 0:
18 |                 shard_lines.append("")
19 |             with open(input_filename) as fin:
20 |                 lines = fin.readlines()
21 |                 if lines[0].strip() != '# global.columns = FORM LEMMA UPOS XPOS FEATS CONLL:NER':
22 |                     raise ValueError("Unexpected format in %s" % input_filename)
23 |                 lines = [x.strip().split("\t") for x in lines[1:]]
24 |                 lines = ["%s\t%s" % (x[0], x[5]) if len(x) > 1 else "" for x in lines]
25 |                 shard_lines.extend(lines)
26 | 
27 |         bio_filename = os.path.join(base_output_path, '%s.%s.bio' % (short_name, shard))
28 |         with open(bio_filename, "w") as fout:
29 |             print("Writing %d lines to %s" % (len(shard_lines), bio_filename))
30 |             for line in shard_lines:
31 |                 fout.write(line)
32 |                 fout.write("\n")
33 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to Stanza
 2 | 
 3 | We would love to see contributions to Stanza from the community! Contributions that we welcome include bugfixes and enhancements. If you want to report a bug or suggest a feature but don't intend to fix or implement it by yourself, please create a corresponding issue on [our issues page](https://github.com/stanfordnlp/stanza/issues). If you plan to contribute a bugfix or enhancement, please read the following.
 4 | 
 5 | ## 🛠️ Bugfixes
 6 | 
 7 | For bugfixes, please follow these steps:
 8 | 
 9 | - Make sure a fix does not already exist, by searching through existing [issues](https://github.com/stanfordnlp/stanza/issues) (including closed ones) and [pull requests](https://github.com/stanfordnlp/stanza/pulls).
10 | - Confirm the bug with us by creating a bug-report issue. In your issue, you should at least include the platform and environment that you are running with, and a minimal code snippet that will reproduce the bug.
11 | - Once the bug is confirmed, you can go ahead with implementing the bugfix, and create a pull request **against the `dev` branch**.
12 | 
13 | ## 💡 Enhancements
14 | 
15 | For enhancements, please follow these steps:
16 | 
17 | - Make sure a similar enhancement suggestion does not already exist, by searching through existing [issues](https://github.com/stanfordnlp/stanza/issues).
18 | - Create a feature-request issue and discuss this enhancement with us. We'll need to make sure this enhancement won't break the existing user interface or functionality.
19 | - Once the enhancement is confirmed with us, you can go ahead with implementing it, and create a pull request **against the `dev` branch**.
20 | 


--------------------------------------------------------------------------------
/stanza/tests/pipeline/test_pipeline_sentiment_processor.py:
--------------------------------------------------------------------------------
 1 | import gc
 2 | 
 3 | import pytest
 4 | import stanza
 5 | from stanza.utils.conll import CoNLL
 6 | from stanza.models.common.doc import Document
 7 | 
 8 | from stanza.tests import *
 9 | 
10 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
11 | 
# data for testing
# three short documents, each run through the pipeline on its own in test_simple
EN_DOCS = ["Ragavan is terrible and should go away.",  "Today is okay.",  "Urza's Saga is great."]

# the same documents joined into a single text, used by test_multiple_sentences
EN_DOC = "  ".join(EN_DOCS)

# expected sentiment value for each entry of EN_DOCS, in order
# NOTE(review): presumably 0 / 1 / 2 mean negative / neutral / positive - confirm against the model's labels
EXPECTED = [0, 1, 2]
18 | 
class TestSentimentPipeline:
    """
    Tests of a pipeline built with the tokenize and sentiment processors
    """
    @pytest.fixture(scope="class")
    def pipeline(self):
        """
        A reusable pipeline with the sentiment module

        Built once for the whole class.  gc.collect() runs first, before
        the models are loaded.
        """
        gc.collect()
        return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,sentiment")

    def test_simple(self, pipeline):
        """
        Each document on its own should be one sentence with the expected sentiment
        """
        results = []
        for text in EN_DOCS:
            doc = pipeline(text)
            assert len(doc.sentences) == 1
            results.append(doc.sentences[0].sentiment)
        assert EXPECTED == results

    def test_multiple_sentences(self, pipeline):
        """
        The joined document should split into three sentences with the same sentiments
        """
        doc = pipeline(EN_DOC)
        assert len(doc.sentences) == 3
        results = [sentence.sentiment for sentence in doc.sentences]
        assert EXPECTED == results

    def test_empty_text(self, pipeline):
        """
        Test empty text and a text which might get reduced to empty text by removing dashes
        """
        doc = pipeline("")
        assert len(doc.sentences) == 0

        # the dashes are still expected to come back as one sentence
        doc = pipeline("--")
        assert len(doc.sentences) == 1
51 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/thai_syllable_dict_generator.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import pathlib
 3 | import argparse
 4 | 
 5 | 
 6 | def create_dictionary(dataset_dir, save_dir):
 7 |     syllables = set()
 8 | 
 9 |     for p in pathlib.Path(dataset_dir).rglob("*.ssg"): # iterate through all files
10 | 
11 |         with open(p) as f: # for each file
12 |             sentences = f.readlines()
13 | 
14 |         for i in range(len(sentences)):
15 | 
16 |             sentences[i] = sentences[i].replace("\n", "")
17 |             sentences[i] = sentences[i].replace("<s/>", "~")
18 |             sentences[i] = sentences[i].split("~") # create list of all syllables
19 | 
20 |             syllables = syllables.union(sentences[i])
21 | 
22 | 
23 |         print(len(syllables))
24 | 
25 |     # Filter out syllables with English words
26 |     import re
27 | 
28 |     a = []
29 | 
30 |     for s in syllables:
31 |         print("---")
32 |         if bool(re.match("^[\u0E00-\u0E7F]*$", s)) and s != "" and " " not in s:
33 |             a.append(s)
34 |         else:
35 |             pass
36 | 
37 |     a = set(a)
38 |     a = dict(zip(list(a), range(len(a))))
39 | 
40 |     import json
41 |     print(a)
42 |     print(len(a))
43 |     with open(save_dir, "w") as fp:
44 |         json.dump(a, fp)
45 | 
# command line entry point; both arguments are optional
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_dir', type=str, default="syllable_segmentation_data", help="Directory for syllable dataset")
    # despite the name, --save_dir is handed to open() in create_dictionary,
    # so it is the path of the json file to write rather than a directory
    parser.add_argument('--save_dir', type=str, default="thai-syllable.json", help="Directory for generated file")
    args = parser.parse_args()

    create_dictionary(args.dataset_dir, args.save_dir)
54 | 


--------------------------------------------------------------------------------
/stanza/tests/pipeline/pipeline_device_tests.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utility methods to check that all processors are on the expected device
 3 | 
 4 | Refactored since it can be used for multiple pipelines
 5 | """
 6 | 
 7 | import warnings
 8 | 
 9 | import torch
10 | 
def check_on_gpu(pipeline):
    """
    Assert every processor's model is on a cuda device, then run the pipeline once

    If torch reports that no GPU is available, a warning is issued
    and nothing is checked.
    """
    if not torch.cuda.is_available():
        warnings.warn("Unable to run the test that checks the pipeline is on the GPU, as there is no GPU available!")
        return

    for name, proc in pipeline.processors.items():
        # processors without a trainer keep their model in _model
        model = proc._model if proc.trainer is None else proc.trainer.model
        # the device of the first parameter stands in for the whole model
        first_device = next(model.parameters()).device
        assert str(first_device).startswith("cuda"), "Processor %s was not on the GPU" % name

    # just check that there are no cpu/cuda tensor conflicts
    # when running on the GPU
    pipeline("This is a small test")
30 | 
def check_on_cpu(pipeline):
    """
    Assert every processor's model is on the cpu, then run the pipeline once
    """
    for name, proc in pipeline.processors.items():
        # processors without a trainer keep their model in _model
        model = proc._model if proc.trainer is None else proc.trainer.model
        # the device of the first parameter stands in for the whole model
        first_device = next(model.parameters()).device
        assert str(first_device).startswith("cpu"), "Processor %s was not on the CPU" % name

    # just check that there are no cpu/cuda tensor conflicts
    # when running on the CPU
    pipeline("This is a small test")
46 | 


--------------------------------------------------------------------------------
/stanza/models/common/count_pretrain_coverage.py:
--------------------------------------------------------------------------------
 1 | """A simple script to count the fraction of words in a UD dataset which are in a particular pretrain.
 2 | 
 3 | For example, this script shows that the word2vec Armenian vectors,
 4 | truncated at 250K words, have 75% coverage of the Western Armenian
 5 | dataset, whereas the vectors available here have 88% coverage:
 6 | 
 7 | https://github.com/ispras-texterra/word-embeddings-eval-hy
 8 | """
 9 | 
10 | from stanza.models.common import pretrain
11 | from stanza.utils.conll import CoNLL
12 | 
13 | import argparse
14 | 
def parse_args():
    """
    Parse the command line: any number of treebank paths and a --pretrain path

    If no treebanks are given, the two Armenian training sets below are used.
    """
    default_treebanks = [
        "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu",
        "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu",
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('treebanks', type=str, nargs='*', help='Which treebanks to run on')
    parser.add_argument('--pretrain', type=str, default="/home/john/extern_data/wordvec/glove/armenian.pt", help='Which pretrain to use')
    parser.set_defaults(treebanks=default_treebanks)
    return parser.parse_args()
23 | 
24 | 
# NOTE: this runs at import time, as the script has no __main__ guard
args = parse_args()
pt = pretrain.Pretrain(args.pretrain)
pt.load()
print("Pretrain stats: {} vectors, {} dim".format(len(pt.vocab), pt.emb[0].shape[0]))

# for each treebank, print its name and then the fraction of its words found in the pretrain vocab
for treebank in args.treebanks:
    print(treebank)
    found = 0
    total = 0
    doc = CoNLL.conll2doc(treebank)
    for sentence in doc.sentences:
        for word in sentence.words:
            total = total + 1
            # exact match on the text of the word
            if word.text in pt.vocab:
                found = found + 1

    if total == 0:
        # a treebank with no words has no coverage to report, and would otherwise divide by zero
        print("No words found")
    else:
        print (found / total)
42 | 


--------------------------------------------------------------------------------
/stanza/utils/constituency/check_transitions.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from stanza.models.constituency import transition_sequence
 4 | from stanza.models.constituency import tree_reader
 5 | from stanza.models.constituency.parse_transitions import TransitionScheme
 6 | from stanza.models.constituency.parse_tree import Tree
 7 | from stanza.models.constituency.utils import verify_transitions
 8 | 
 9 | def main():
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument('--train_file', type=str, default="data/constituency/en_ptb3_train.mrg", help='Input file for data loader.')
12 |     parser.add_argument('--transition_scheme', default=TransitionScheme.IN_ORDER, type=lambda x: TransitionScheme[x.upper()],
13 |                         help='Transition scheme to use.  {}'.format(", ".join(x.name for x in TransitionScheme)))
14 |     parser.add_argument('--reversed', default=False, action='store_true', help='Do the transition sequence reversed')
15 |     parser.add_argument('--iterations', default=30, type=int, help='How many times to iterate, such as if doing a cProfile')
16 |     args = parser.parse_args()
17 |     args = vars(args)
18 | 
19 |     train_trees = tree_reader.read_treebank(args['train_file'])
20 |     unary_limit = max(t.count_unary_depth() for t in train_trees) + 1
21 |     train_sequences, train_transitions = transition_sequence.convert_trees_to_sequences(train_trees, "training", args['transition_scheme'], args['reversed'])
22 |     root_labels = Tree.get_root_labels(train_trees)
23 |     for i in range(args['iterations']):
24 |         verify_transitions(train_trees, train_sequences, args['transition_scheme'], unary_limit, args['reversed'], "train", root_labels)
25 | 
# only run the verification when executed as a script, not when imported
if __name__ == '__main__':
    main()
28 | 


--------------------------------------------------------------------------------
/stanza/utils/pretrain/compare_pretrains.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | 
 4 | from stanza.models.common.pretrain import Pretrain
 5 | 
# Compares two pretrains given on the command line:
#   first argument: filename of the first pretrain
#   second argument: filename of the second pretrain
# Prints how many words they share and how many of the shared words have
# (nearly) the same vector in both.
pt1_filename = sys.argv[1]
pt2_filename = sys.argv[2]

pt1 = Pretrain(pt1_filename)
pt2 = Pretrain(pt2_filename)

vocab1 = pt1.vocab
vocab2 = pt2.vocab

# the words of the first vocab which are also in the second
common_words = [x for x in vocab1 if x in vocab2]
print("%d shared words, out of %d in %s and %d in %s" % (len(common_words), len(vocab1), pt1_filename, len(vocab2), pt2_filename))

# two vectors count as "close" if the norm of their difference is below eps
eps = 0.0001
# sum of the difference norms over the shared words which were NOT close
total_norm = 0.0
# number of shared words whose vectors were close
total_close = 0

# the first few (at most 10) words with different vectors, to show as examples
words_different = []

for word, idx in vocab1._unit2id.items():
    if word not in vocab2:
        continue
    # the word may be at a different index in each pretrain
    v1 = pt1.emb[idx]
    v2 = pt2.emb[pt2.vocab[word]]
    norm = np.linalg.norm(v1 - v2)

    if norm < eps:
        total_close += 1
    else:
        total_norm += norm
        if len(words_different) < 10:
            words_different.append("|%s|" % word)
            #print(word, idx, pt2.vocab[word])
            #print(v1)
            #print(v2)

if total_close < len(common_words):
    # average only over the shared words which were not close
    avg_norm = total_norm / (len(common_words) - total_close)
    print("%d vectors were close.  Average difference of the others: %f" % (total_close, avg_norm))
    print("The first few different words were:\n  %s" % "\n  ".join(words_different))
else:
    print("All %d vectors were close!" % total_close)

    # when all vectors match, also check if the shared words are at the same indices
    for word, idx in vocab1._unit2id.items():
        if word not in vocab2:
            continue
        if pt2.vocab[word] != idx:
            break
    else:
        # for/else: only reached if the loop finished without hitting the break
        print("All indices are the same")
55 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/convert_en_conll03.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Downloads (if necessary) conll03 from Huggingface, then converts it to Stanza .json
 3 | 
 4 | Some online sources for CoNLL 2003 require multiple pieces, but it is currently hosted on HF:
 5 | https://huggingface.co/datasets/conll2003
 6 | """
 7 | 
 8 | import os
 9 | 
10 | from stanza.utils.default_paths import get_default_paths
11 | from stanza.utils.datasets.ner.utils import write_dataset
12 | 
# the position of each tag in this tuple is the integer id used for it in the dataset
TAG_TO_ID = {tag: tag_id for tag_id, tag in enumerate(('O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'))}
ID_TO_TAG = {tag_id: tag for tag, tag_id in TAG_TO_ID.items()}

def convert_dataset_section(section):
    """
    Turn one section of the dataset into a list of sentences

    Each item of the section has 'tokens' and integer 'ner_tags'.
    Each returned sentence is a list of (word, tag) pairs with the
    integer tags replaced by their names from ID_TO_TAG.
    """
    return [
        list(zip(item['tokens'], [ID_TO_TAG[tag_id] for tag_id in item['ner_tags']]))
        for item in section
    ]
23 | 
def process_dataset(short_name, conll_path, ner_output_path):
    """
    Load conll2003 via the datasets package and write it out with write_dataset

    short_name: dataset name passed on to write_dataset
    conll_path: used as the cache_dir for load_dataset
    ner_output_path: output location passed on to write_dataset

    Raises ImportError, chained to the original import failure, if the
    datasets package is not available.
    """
    # imported here so that the package is only needed when this conversion is run
    try:
        from datasets import load_dataset
    except ImportError as e:
        # chain the original error so the underlying reason for the failure is not lost
        raise ImportError("Please install the datasets package to process CoNLL03 with Stanza") from e

    dataset = load_dataset('conll2003', cache_dir=conll_path)
    # the 'validation' split is the second of the three datasets passed to write_dataset
    datasets = [convert_dataset_section(x) for x in [dataset['train'], dataset['validation'], dataset['test']]]
    write_dataset(datasets, ner_output_path, short_name)
33 | 
def main():
    """
    Convert en_conll03 using the locations from get_default_paths

    The data is cached under NERBASE/english/en_conll03 and the
    output goes to NER_DATA_DIR.
    """
    paths = get_default_paths()
    conll_path = os.path.join(paths['NERBASE'], "english", "en_conll03")
    process_dataset("en_conll03", conll_path, paths['NER_DATA_DIR'])
40 | 
# only run the conversion when executed as a script, not when imported
if __name__ == '__main__':
    main()
43 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/treebank_to_labeled_brackets.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Converts a PTB file to a format where all the brackets have labels on the start and end bracket.
 3 | 
 4 | Such a file should be suitable for training an LM
 5 | """
 6 | 
 7 | import argparse
 8 | import logging
 9 | import sys
10 | 
11 | from stanza.models.constituency import tree_reader
12 | from stanza.utils.get_tqdm import get_tqdm
13 | 
# progress bar wrapped around the treebank while the trees are written in main()
tqdm = get_tqdm()

# main() logs how many trees it writes, and where, to this logger
logger = logging.getLogger('stanza.constituency')
17 | 
def main():
    """
    Read the PTB treebank named on the command line and write it with labeled brackets

    Each tree is written on its own line using the tree's "L" format.
    If there is a separator (the default is "_"), it is put in front
    of the L in the format spec; --no_separator leaves it out.
    """
    parser = argparse.ArgumentParser(description="Script that converts a PTB treebank into a labeled bracketed file suitable for LM training")
    parser.add_argument('ptb_file', help='Where to get the original PTB format treebank')
    parser.add_argument('label_file', help='Where to write the labeled bracketed file')
    parser.add_argument('--separator', default="_", help='What separator to use in place of spaces')
    # writes None to the same destination as --separator
    parser.add_argument('--no_separator', dest='separator', action='store_const', const=None, help="Don't use a separator")
    args = parser.parse_args()

    treebank = tree_reader.read_treebank(args.ptb_file)
    logger.info("Writing %d trees to %s", len(treebank), args.label_file)

    # None and "" both mean there is no separator to put in the format
    if args.separator:
        tree_format = "{:%sL}\n" % args.separator
    else:
        tree_format = "{:L}\n"

    with open(args.label_file, "w", encoding="utf-8") as fout:
        fout.writelines(tree_format.format(tree) for tree in tqdm(treebank))
53 | 
# only run the conversion when executed as a script, not when imported
if __name__ == '__main__':
    main()
56 | 


--------------------------------------------------------------------------------
/stanza/utils/constituency/grep_dev_logs.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import sys
 3 | 
# Usage: grep_dev_logs.py <iteration> <log file> [<log file> ...]
#
# For each log file, greps for the "Dev score" line of the given iteration
# and prints the score found on the line after it, then prints the average
# over all of the files which had a result.
iteration = sys.argv[1]
filenames = sys.argv[2:]

# sum and count of the scores found, used for the average at the end
total_score = 0.0
num_scores = 0

for filename in filenames:
    # the iteration is put into the grep pattern as is, followed by a literal ")"
    # -A1 also returns the line after each match
    grep_cmd = ["grep", "Dev score.* %s[)]" % iteration, "-A1", filename]
    grep_result = subprocess.run(grep_cmd, stdout=subprocess.PIPE, encoding="utf-8")
    grep_result = grep_result.stdout.strip()
    if not grep_result:
        # this iteration is not in the log, so report how far the log did get
        max_cmd = ["grep", "Dev score", filename]
        max_result = subprocess.run(max_cmd, stdout=subprocess.PIPE, encoding="utf-8")
        max_result = max_result.stdout.strip()
        if not max_result:
            print("{}: no result".format(filename))
        else:
            # the last "Dev score" line in the file
            max_it = max_result.split("\n")[-1]
            # takes the text before the first ":", keeps what follows its last "(",
            # and drops the final character
            # NOTE(review): presumably the line looks like "... Dev score (  N): score" - confirm against the log format
            max_it = int(max_it.split(":")[0].split("(")[-1][:-1])
            epoch_finished_string = "Epoch %d finished" % max_it
            finish_cmd = ["grep", epoch_finished_string, filename]
            finish_result = subprocess.run(finish_cmd, stdout=subprocess.PIPE, encoding="utf-8")
            finish_result = finish_result.stdout.strip()
            # everything before " INFO"
            # NOTE(review): presumably this is the timestamp of the log line - confirm
            finish_time = finish_result.split(" INFO")[0]
            print("{}: no result.  max iteration: {}   finished at {}".format(filename, max_it, finish_time))
    else:
        # the last line of the output, which is the line after the (last) match because of -A1
        grep_result = grep_result.split("\n")[-1]
        # the score is whatever follows the last ":"
        score = float(grep_result.split(":")[-1])
        # the 5 characters before the final character of the second to last ":" field
        # NOTE(review): presumably a fixed width iteration number inside parens - confirm
        best_iteration = int(grep_result.split(":")[-2][-6:-1])
        print("{}: {}  ({})".format(filename, score, best_iteration))
        total_score += score
        num_scores += 1

# nothing is printed if none of the files had a score
if num_scores > 0:
    avg = total_score / num_scores
    print("Avg: {}".format(avg))
40 | 
41 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/coref/balance_languages.py:
--------------------------------------------------------------------------------
 1 | """
 2 | balance_concat.py
 3 | create a test set from a dev set which is language balanced
 4 | """
 5 | 
 6 | import json
 7 | from collections import defaultdict
 8 | 
 9 | from random import Random
10 | 
# fix random seed for reproducibility
R = Random(42)

# json is utf-8, so don't rely on the locale's default encoding
with open("./corefud_concat_v1_0_langid.train.json", 'r', encoding="utf-8") as df:
    raw = json.load(df)

# count the documents of each language; then, we will select the one
# which has the LOWEST counts as the sample rate
lang_counts = defaultdict(int)
for doc in raw:
    lang_counts[doc["lang"]] += 1

min_lang_count = min(lang_counts.values())

# sample 10% of the smallest amount for test set
# this will look like an absurdly small number, but
# remember this is DOCUMENTS not TOKENS or UTTERANCES
# so its actually decent
# also its per language
# NOTE(review): this comment previously said 20% while the code used 0.1;
# the code is left as it was - confirm which was intended
test_set_size = int(0.1*min_lang_count)

# sampling input by language
raw_by_language = defaultdict(list)
for doc in raw:
    raw_by_language[doc["lang"]].append(doc)
# the languages are visited in sorted order: iterating over a set of strings
# changes order from run to run, which would hand each language a different
# part of the random stream and so undo the fixed seed
languages = sorted(raw_by_language.keys())

train_set = []
test_set = []
for lang in languages:
    lang_docs = raw_by_language[lang]
    # indices of the documents of this language which go to the test set
    # kept as a set so the membership test below is cheap
    choices = set(R.sample(range(len(lang_docs)), test_set_size))

    for indx, doc in enumerate(lang_docs):
        if indx in choices:
            test_set.append(doc)
        else:
            train_set.append(doc)

with open("./corefud_concat_v1_0_langid-bal.train.json", 'w', encoding="utf-8") as df:
    json.dump(train_set, df, indent=2)

with open("./corefud_concat_v1_0_langid-bal.test.json", 'w', encoding="utf-8") as df:
    json.dump(test_set, df, indent=2)
55 | 
56 | 
57 | 
58 | # raw_by_language["en"]
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/stanza/utils/visualization/README:
--------------------------------------------------------------------------------
 1 | # Overview
 2 | 
 3 | The code in this directory contains tooling required for Semgrex and Ssurgeon visualization.
 4 | Searching dependency graphs and manipulating them can be a time-consuming and challenging task to get right.
 5 | Semgrex is a system for searching dependency graphs and Ssurgeon is a system for manipulating the output of Semgrex.
 6 | The compact language used by these systems allows for easy command line or API processing of dependencies.
 7 | 
 8 | We now offer Semgrex and Ssurgeon through a web interface with visualizations, accessible via Streamlit.
 9 | 
10 | ## How to run visualizations through Streamlit
11 | 
12 | Streamlit can be used to visualize Semgrex and Ssurgeon results and process files.
13 | Here are instructions for setting up a Streamlit webpage:
14 | 
15 | 1. Install Streamlit: `pip install streamlit`
16 | 2. Install Stanford CoreNLP if you have not already. You can find the download here: https://stanfordnlp.github.io/CoreNLP/download.html
17 | 3. Set the $CLASSPATH environment variable to point to your local installation of CoreNLP.
18 | 4. Install streamlit, spacy, and ipython.  You can use the "visualization" stanza setup option for that.
19 | 5. Run `streamlit run stanza/utils/visualization/semgrex_app.py --theme.backgroundColor "#FFFFFF"`
20 | 
21 | This should begin a Streamlit runtime application on your local machine that can be interacted with.
22 | 
23 | For instructions on how to use Ssurgeon and Semgrex, refer to these helpful pages:
24 | https://aclanthology.org/2023.tlt-1.7.pdf
25 | https://nlp.stanford.edu/nlp/javadoc/javanlp-3.5.0/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html
26 | https://stanfordnlp.github.io/stanza/client_regex.html
27 | https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#query-tokensregex-tokensregex
28 | 


--------------------------------------------------------------------------------
/stanza/models/classifiers/config.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import List, Union
 3 | 
 4 | # TODO: perhaps put the enums in this file
 5 | from stanza.models.classifiers.utils import WVType, ExtraVectors, ModelType
 6 | 
 7 | @dataclass
 8 | class CNNConfig:  # pylint: disable=too-many-instance-attributes, too-few-public-methods
 9 |         filter_channels: Union[int, tuple]
10 |         filter_sizes: tuple
11 |         fc_shapes: tuple
12 |         dropout: float
13 |         num_classes: int
14 |         wordvec_type: WVType
15 |         extra_wordvec_method: ExtraVectors
16 |         extra_wordvec_dim: int
17 |         extra_wordvec_max_norm: float
18 |         char_lowercase: bool
19 |         charlm_projection: int
20 |         has_charlm_forward: bool
21 |         has_charlm_backward: bool
22 | 
23 |         use_elmo: bool
24 |         elmo_projection: int
25 | 
26 |         bert_model: str
27 |         bert_finetune: bool
28 |         bert_hidden_layers: int
29 |         force_bert_saved: bool
30 | 
31 |         use_peft: bool
32 |         lora_rank: int
33 |         lora_alpha: float
34 |         lora_dropout: float
35 |         lora_modules_to_save: List
36 |         lora_target_modules: List
37 | 
38 |         bilstm: bool
39 |         bilstm_hidden_dim: int
40 |         maxpool_width: int
41 |         model_type: ModelType
42 | 
@dataclass
class ConstituencyConfig:  # pylint: disable=too-many-instance-attributes, too-few-public-methods
    """
    Plain container for the settings of a constituency based classifier

    Every field is required, as none of them have defaults.  The
    generated __init__ takes them in the order they are declared.
    """
    # fully connected layer settings, with the same names as in CNNConfig
    fc_shapes: tuple
    dropout: float
    num_classes: int

    # flags for how the constituency parser is used
    constituency_backprop: bool
    constituency_batch_norm: bool
    constituency_node_attn: bool
    constituency_top_layer: bool
    constituency_all_words: bool

    model_type: ModelType
56 | 


--------------------------------------------------------------------------------
/stanza/models/pos/xpos_vocab_utils.py:
--------------------------------------------------------------------------------
 1 | from collections import namedtuple
 2 | from enum import Enum
 3 | import logging
 4 | import os
 5 | 
 6 | from stanza.models.common.vocab import VOCAB_PREFIX
 7 | from stanza.models.pos.vocab import XPOSVocab, WordVocab
 8 | 
class XPOSType(Enum):
    """
    Which vocab class build_xpos_vocab should construct for the xpos column

    XPOS -> XPOSVocab (built with a separator), WORD -> WordVocab
    """
    XPOS     = 1
    WORD     = 2
12 | 
# xpos_type selects the vocab class to build; sep is the separator passed to XPOSVocab
XPOSDescription = namedtuple('XPOSDescription', ['xpos_type', 'sep'])
# what choose_simplest_factory returns when no XPOSVocab separator gives a smaller vocab
DEFAULT_KEY = XPOSDescription(XPOSType.WORD, None)
15 | 
16 | logger = logging.getLogger('stanza')
17 | 
def filter_data(data, idx):
    """
    Keep only the sentences in which no token has None at position idx

    data: iterable of sentences, each an iterable of indexable tokens
    idx: the position to check in each token

    Returns a new list holding the kept sentences, in their original order.
    A sentence with no tokens is always kept.
    """
    # the inner list is built in full on purpose: every token of every
    # sentence gets indexed, exactly as in a plain loop without a break
    return [sentence for sentence in data
            if all([token[idx] is not None for token in sentence])]
27 | 
def choose_simplest_factory(data, shorthand):
    """
    Choose the xpos vocab description which produces the smallest vocab

    Sentences with a None in column 2 are dropped first.  A WordVocab
    over column 2 is the baseline.  If it has more than 20 entries
    (not counting VOCAB_PREFIX), an XPOSVocab is built for each
    candidate separator, and the first separator with the strictly
    smallest total number of units replaces the baseline.

    Returns an XPOSDescription suitable for build_xpos_vocab
    """
    logger.info(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    logger.info(f'Filtered length = {len(data)}')

    word_vocab = WordVocab(data, shorthand, idx=2, ignore=["_"])
    num_prefix = len(VOCAB_PREFIX)
    best_key = DEFAULT_KEY
    best_size = len(word_vocab) - num_prefix
    if best_size <= 20:
        # small enough already, no need to try splitting the tags
        return best_key

    for sep in ('', '-', '+', '|', ',', ':'): # separators
        xpos_vocab = XPOSVocab(data, shorthand, idx=2, sep=sep)
        total = sum(len(units) - num_prefix for units in xpos_vocab._id2unit.values())
        if total < best_size:
            best_key, best_size = XPOSDescription(XPOSType.XPOS, sep), total
    return best_key
43 | 
def build_xpos_vocab(description, data, shorthand):
    """
    Build the vocab described by an XPOSDescription over column 2 of data

    XPOSType.WORD gives a WordVocab which ignores "_";
    anything else gives an XPOSVocab using description.sep
    """
    if description.xpos_type is not XPOSType.WORD:
        return XPOSVocab(data, shorthand, idx=2, sep=description.sep)

    return WordVocab(data, shorthand, idx=2, ignore=["_"])
49 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/contract_mwt.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | def contract_mwt(infile, outfile, ignore_gapping=True):
 4 |     """
 5 |     Simplify the gold tokenizer data for use as MWT processor test files
 6 | 
 7 |     The simplifications are to remove the expanded MWTs, and in the
 8 |     case of ignore_gapping=True, remove any copy words for the dependencies
 9 |     """
10 |     with open(outfile, 'w', encoding='utf-8') as fout:
11 |         with open(infile, 'r', encoding='utf-8') as fin:
12 |             idx = 0
13 |             mwt_begin = 0
14 |             mwt_end = -1
15 |             for line in fin:
16 |                 line = line.strip()
17 |     
18 |                 if line.startswith('#'):
19 |                     print(line, file=fout)
20 |                     continue
21 |                 elif len(line) <= 0:
22 |                     print(line, file=fout)
23 |                     idx = 0
24 |                     mwt_begin = 0
25 |                     mwt_end = -1
26 |                     continue
27 |     
28 |                 line = line.split('\t')
29 | 
30 |                 # ignore gapping word
31 |                 if ignore_gapping and '.' in line[0]:
32 |                     continue
33 | 
34 |                 idx += 1
35 |                 if '-' in line[0]:
36 |                     mwt_begin, mwt_end = [int(x) for x in line[0].split('-')]
37 |                     print("{}\t{}\t{}".format(idx, "\t".join(line[1:-1]), "MWT=Yes" if line[-1] == '_' else line[-1] + "|MWT=Yes"), file=fout)
38 |                     idx -= 1
39 |                 elif mwt_begin <= idx <= mwt_end:
40 |                     continue
41 |                 else:
42 |                     print("{}\t{}".format(idx, "\t".join(line[1:])), file=fout)
43 | 
44 | if __name__ == '__main__':
45 |     contract_mwt(sys.argv[1], sys.argv[2])
46 | 
47 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/convert_starlang_ner.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Convert the starlang trees to a NER dataset
 3 | 
 4 | Has to hide quite a few trees with missing NER labels
 5 | """
 6 | 
 7 | import re
 8 | 
 9 | from stanza.models.constituency import tree_reader
10 | import stanza.utils.datasets.constituency.convert_starlang as convert_starlang
11 | 
12 | TURKISH_WORD_RE = re.compile(r"[{]turkish=([^}]+)[}]")
13 | TURKISH_LABEL_RE = re.compile(r"[{]namedEntity=([^}]+)[}]")
14 | 
15 | 
16 | 
def read_tree(text):
    """
    Reads in a tree, then extracts the word and the NER tag of each leaf

    Each leaf label is expected to contain a {turkish=...} and a
    {namedEntity=...} annotation.  -LCB- and -RCB- in the word are
    turned back into braces, and the tags NONE and null become O.

    Returns a list of (word, tag) pairs.
    Raises ValueError if the text has more than one tree
    or a leaf is missing either annotation.

    One problem is that it is unknown if there are cases of two separate items occurring consecutively

    Note that this is quite similar to the convert_starlang script for constituency.
    """
    trees = tree_reader.read_trees(text)
    if len(trees) > 1:
        raise ValueError("Tree file had two trees!")

    pairs = []
    for leaf in trees[0].leaf_labels():
        word_match = TURKISH_WORD_RE.search(leaf)
        if word_match is None:
            raise ValueError("Could not find word in |{}|".format(leaf))
        ner_match = TURKISH_LABEL_RE.search(leaf)
        if ner_match is None:
            raise ValueError("Could not find ner in |{}|".format(leaf))

        word = word_match.group(1).replace("-LCB-", "{").replace("-RCB-", "}")
        tag = ner_match.group(1)
        if tag in ('NONE', "null"):
            tag = 'O'
        pairs.append((word, tag))

    return pairs
46 | 
def read_starlang(paths):
    """
    Read the given paths with the constituency starlang reader, converting each tree with read_tree
    """
    datasets = convert_starlang.read_starlang(paths, conversion=read_tree, log=False)
    return datasets
49 | 
def main():
    """
    Run the constituency starlang main program with read_tree as the conversion

    The three returned pieces are unpacked but not used any further here
    """
    _train, _dev, _test = convert_starlang.main(conversion=read_tree, log=False)
52 | 
53 | if __name__ == '__main__':
54 |     main()
55 | 
56 | 


--------------------------------------------------------------------------------
/stanza/tests/ner/test_ner_trainer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from stanza.tests import *
 4 | 
 5 | from stanza.models.ner import trainer
 6 | 
 7 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 8 | 
 9 | def test_fix_singleton_tags():
10 |     TESTS = [
11 |         (["O"], ["O"]),
12 |         (["B-PER"], ["S-PER"]),
13 |         (["B-PER", "I-PER"], ["B-PER", "E-PER"]),
14 |         (["B-PER", "O", "B-PER"], ["S-PER", "O", "S-PER"]),
15 |         (["B-PER", "B-PER", "I-PER"], ["S-PER", "B-PER", "E-PER"]),
16 |         (["B-PER", "I-PER", "O", "B-PER"], ["B-PER", "E-PER", "O", "S-PER"]),
17 |         (["B-PER", "B-PER", "I-PER", "B-PER"], ["S-PER", "B-PER", "E-PER", "S-PER"]),
18 |         (["B-PER", "I-ORG", "O", "B-PER"], ["S-PER", "S-ORG", "O", "S-PER"]),
19 |         (["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]),
20 |         (["S-PER", "B-PER", "E-PER"], ["S-PER", "B-PER", "E-PER"]),
21 |         (["E-PER"], ["S-PER"]),
22 |         (["E-PER", "O", "E-PER"], ["S-PER", "O", "S-PER"]),
23 |         (["B-PER", "E-ORG", "O", "B-PER"], ["S-PER", "S-ORG", "O", "S-PER"]),
24 |         (["I-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]),
25 |         (["B-PER", "I-PER", "I-PER", "O", "B-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]),
26 |         (["B-PER", "I-PER", "E-PER", "O", "I-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]),
27 |         (["B-PER", "I-PER", "E-PER", "O", "B-PER", "I-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]),
28 |         (["I-PER", "I-PER", "I-PER", "O", "I-PER", "I-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]),
29 |     ]
30 |              
31 |     for unfixed, expected in TESTS:
32 |         assert trainer.fix_singleton_tags(unfixed) == expected, "Error converting {} to {}".format(unfixed, expected)
33 | 


--------------------------------------------------------------------------------
/stanza/models/coref/config.py:
--------------------------------------------------------------------------------
 1 | """ Describes Config, a simple namespace for config values.
 2 | 
 3 | For description of all config values, refer to config.toml.
 4 | """
 5 | 
 6 | from dataclasses import dataclass
 7 | from typing import Dict, List
 8 | 
 9 | 
@dataclass
class Config:  # pylint: disable=too-many-instance-attributes, too-few-public-methods
    """ Contains values needed to set up the coreference model.

    Every field is required (none has a default).  The comments below
    only group the fields; for the meaning of each value see config.toml.
    """
    section: str

    # locations for data and saved models
    # TODO: can either eliminate data_dir or use it for the train/dev/test data
    data_dir: str
    save_dir: str
    save_name: str

    train_data: str
    dev_data: str
    test_data: str

    device: str

    # transformer settings
    bert_model: str
    bert_window_size: int

    # model sizes
    embedding_size: int
    sp_embedding_size: int
    a_scoring_batch_size: int
    hidden_size: int
    n_hidden_layers: int

    max_span_len: int

    rough_k: int

    # lora settings (see also lora_target_modules / lora_modules_to_save below)
    lora: bool
    lora_alpha: int
    lora_rank: int
    lora_dropout: float

    full_pairwise: bool

    lora_target_modules: List[str]
    lora_modules_to_save: List[str]

    # training settings
    clusters_starts_are_singletons: bool
    bert_finetune: bool
    dropout_rate: float
    learning_rate: float
    bert_learning_rate: float
    # we find that setting this to a small but non-zero number
    # makes the model less likely to forget how to do anything
    bert_finetune_begin_epoch: float
    train_epochs: int
    bce_loss_weight: float

    tokenizer_kwargs: Dict[str, dict]
    conll_log_dir: str

    # checkpointing / logging switches
    save_each_checkpoint: bool
    log_norms: bool
    singletons: bool
    
    max_train_len: int
    use_zeros: bool

    # NOTE(review): these are declared as str, not numeric - confirm the expected format
    lang_lr_attenuation: str
    lang_lr_weights: str
72 | 


--------------------------------------------------------------------------------
/stanza/protobuf/__init__.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | 
 3 | from io import BytesIO
 4 | import warnings
 5 | 
 6 | from google.protobuf.internal.encoder import _EncodeVarint
 7 | from google.protobuf.internal.decoder import _DecodeVarint
 8 | from google.protobuf.message import DecodeError
 9 | from .CoreNLP_pb2 import *
10 | 
def parseFromDelimitedString(obj, buf, offset=0):
    """
    Parse one length delimited protobuf message from buf into obj

    Stanford CoreNLP uses the Java "writeDelimitedTo" function, which
    writes the size of the message as a varint before writing the
    message itself.  This function reads that size starting at offset,
    then parses the message which follows it.

    obj: the protobuf message to fill in (via ParseFromString)
    buf: bytes holding the size prefix followed by the message
    offset: position in buf at which the size prefix starts

    If the message cannot be decoded, a RuntimeWarning is issued
    and obj is left incomplete or empty.

    @returns how many bytes of @buf were consumed, counting from offset
    (the size prefix plus the message).
    """
    # _DecodeVarint returns the decoded value and the absolute position
    # in buf just past the varint, so the message starts at pos itself.
    # Adding offset to pos again would skip part of the message whenever
    # offset is not 0.
    size, pos = _DecodeVarint(buf, offset)
    end = pos + size
    try:
        obj.ParseFromString(buf[pos:end])
    except DecodeError:
        warnings.warn("Failed to decode a serialized output from CoreNLP server. An incomplete or empty object will be returned.", \
            RuntimeWarning)
    return end - offset
26 | 
def writeToDelimitedString(obj, stream=None):
    """
    Write obj to a stream in the format of the Java "writeDelimitedTo" function

    Stanford CoreNLP reads messages in this format: the size of the
    message as a varint, followed by the serialized message.

    obj: the protobuf message to write
    stream: where to write.  A new BytesIO is created if this is None.

    @returns the stream which was written to
    """
    out = BytesIO() if stream is None else stream

    # size prefix first, then the message itself
    _EncodeVarint(out.write, obj.ByteSize(), True)
    out.write(obj.SerializeToString())
    return out
41 | 
def to_text(sentence):
    """
    Helper routine that converts a Sentence protobuf to a string from
    its tokens.

    Each token contributes its "before" text and then its word,
    except that the "before" text of the first token is left out.
    """
    pieces = []
    for position, token in enumerate(sentence.token):
        if position > 0:
            pieces.append(token.before)
        pieces.append(token.word)
    return "".join(pieces)
53 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_relative_attn.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | import torch
 4 | 
 5 | from stanza.models.common.relative_attn import RelativeAttention
 6 | 
 7 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 8 | 
 9 | 
def test_attn():
    """
    The output keeps the input shape; at the last position it is expected
    to match the module's value projection, at the first position it is not
    """
    attn = RelativeAttention(d_model=100, num_heads=2, window=8, dropout=0.0)
    inputs = torch.randn(10, 13, 100)
    output = attn(inputs)
    assert output.shape == inputs.shape

    values = attn.value(inputs)
    last_output, last_value = output[:, -1, :], values[:, -1, :]
    # raise with the actual difference first, so a failure shows how far off it was
    if not torch.allclose(last_output, last_value, atol=1e-06):
        raise ValueError(last_output - last_value)
    assert torch.allclose(last_output, last_value, atol=1e-06)
    assert not torch.allclose(output[:, 0, :], values[:, 0, :])
20 | 
21 | 
def test_shorter_sequence():
    """
    Same checks as test_attn, with a sequence (3) shorter than the window (5)
    """
    # originally this was failing because the batch was smaller than the window
    attn = RelativeAttention(d_model=20, num_heads=2, window=5, dropout=0.0)
    inputs = torch.randn(10, 3, 20)
    output = attn(inputs)
    assert output.shape == inputs.shape

    values = attn.value(inputs)
    last_output, last_value = output[:, -1, :], values[:, -1, :]
    # raise with the actual difference first, so a failure shows how far off it was
    if not torch.allclose(last_output, last_value, atol=1e-06):
        raise ValueError(last_output - last_value)
    assert torch.allclose(last_output, last_value, atol=1e-06)
    assert not torch.allclose(output[:, 0, :], values[:, 0, :])
34 | 
def test_reverse():
    """
    With reverse=True the roles flip: the first position is expected to
    match the value projection, and the last position is not
    """
    attn = RelativeAttention(d_model=100, num_heads=2, window=8, reverse=True, dropout=0.0)
    inputs = torch.randn(10, 13, 100)
    output = attn(inputs)
    assert output.shape == inputs.shape

    values = attn.value(inputs)
    first_output, first_value = output[:, 0, :], values[:, 0, :]
    # raise with the actual difference first, so a failure shows how far off it was
    if not torch.allclose(first_output, first_value, atol=1e-06):
        raise ValueError(first_output - first_value)
    assert torch.allclose(first_output, first_value, atol=1e-06)
    assert not torch.allclose(output[:, -1, :], values[:, -1, :])
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/relabel_tags.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Retag an S-expression tree with a new set of POS tags
 3 | 
 4 | Also includes an option to write the new trees as bracket_labels
 5 | (essentially, skipping the treebank_to_labeled_brackets step)
 6 | """
 7 | 
 8 | import argparse
 9 | import logging
10 | 
11 | from stanza import Pipeline
12 | from stanza.models.constituency import retagging
13 | from stanza.models.constituency import tree_reader
14 | from stanza.models.constituency.utils import retag_trees
15 | 
16 | logger = logging.getLogger('stanza')
17 | 
def parse_args():
    """
    Parse the command line, including the shared retagging arguments

    Returns the arguments as a dict, after retagging.postprocess_args has been applied to it
    """
    parser = argparse.ArgumentParser(description="Script that retags a tree file")
    parser.add_argument('--lang', default='vi', type=str, help='Language')
    parser.add_argument('--input_file', default='data/constituency/vi_vlsp21_train.mrg', help='File to retag')
    parser.add_argument('--output_file', default='vi_vlsp21_train_retagged.mrg', help='Where to write the retagged trees')
    retagging.add_retag_args(parser)

    parser.add_argument('--bracket_labels', action='store_true', help='Write the trees as bracket labels instead of S-expressions')

    parsed = vars(parser.parse_args())
    retagging.postprocess_args(parsed)
    return parsed
32 | 
def main():
    """
    Retag the trees of the input file and write them to the output file

    The trees are written one per line, either as bracket labels
    (--bracket_labels) or in their default format.
    """
    args = parse_args()

    retag_pipeline = retagging.build_retag_pipeline(args)

    train_trees = tree_reader.read_treebank(args['input_file'])
    logger.info("Retagging %d trees using %s", len(train_trees), args['retag_package'])
    train_trees = retag_trees(train_trees, retag_pipeline, args['retag_xpos'])
    tree_format = "{:L}" if args['bracket_labels'] else "{}"
    # always write utf-8: the trees contain non-ASCII text (the default
    # language is vi), and the platform default encoding may not handle it
    with open(args['output_file'], "w", encoding="utf-8") as fout:
        for tree in train_trees:
            fout.write(tree_format.format(tree))
            fout.write("\n")
46 | 
47 | if __name__ == '__main__':
48 |     main()
49 | 


--------------------------------------------------------------------------------
/.github/workflows/stanza-tests.yaml:
--------------------------------------------------------------------------------
 1 | name: Run Stanza Tests
 2 | on: [push]
 3 | jobs:
 4 |   Run-Stanza-Tests:
 5 |     runs-on: self-hosted
 6 |     steps:
 7 |       - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
 8 |       - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!"
 9 |       - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
10 |       - name: Check out repository code
11 |         uses: actions/checkout@v2
12 |       - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
13 |       - run: echo "🖥️ The workflow is now ready to test your code on the runner."
14 |       - name: Run Stanza Tests
15 |         run: |
16 |           # set up environment
17 |           echo "Setting up environment..."
18 |           bash
19 |           #. $CONDA_PREFIX/etc/profile.d/conda.sh
20 |           . /home/stanzabuild/miniconda3/etc/profile.d/conda.sh
21 |           conda activate stanza
22 |           export STANZA_TEST_HOME=/scr/stanza_test
23 |           export CORENLP_HOME=$STANZA_TEST_HOME/corenlp_dir
24 |           export CLASSPATH=$CORENLP_HOME/*:
25 |           echo CORENLP_HOME=$CORENLP_HOME
26 |           echo CLASSPATH=$CLASSPATH
27 |           # install from stanza repo being evaluated
28 |           echo PWD: $pwd
29 |           echo PATH: $PATH
30 |           pip3 install -e .
31 |           pip3 install -e .[test]
32 |           pip3 install -e .[transformers]
33 |           pip3 install -e .[tokenizers]
34 |           # set up for tests
35 |           echo "Running stanza test set up..."
36 |           rm -rf $STANZA_TEST_HOME
37 |           python3 stanza/tests/setup.py
38 |           # run tests
39 |           echo "Running tests..."
40 |           export CUDA_VISIBLE_DEVICES=2
41 |           pytest stanza/tests
42 |           
43 |       - run: echo "🍏 This job's status is ${{ job.status }}."
44 | 


--------------------------------------------------------------------------------
/stanza/tests/ner/test_pay_amt_annotators.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Simple test for tracking AMT annotator work
 3 | """
 4 | 
 5 | import os
 6 | import zipfile
 7 | 
 8 | import pytest
 9 | 
10 | from stanza.tests import TEST_WORKING_DIR
11 | from stanza.utils.ner import paying_annotators
12 | 
13 | DATA_SOURCE = os.path.join(TEST_WORKING_DIR, "in", "aws_annotations.zip")
14 | 
@pytest.fixture(scope="module")
def completed_amt_job_metadata(tmp_path_factory):
    """
    Unzip the annotations zip into a temp dir and return the ner/aws_labeling_copy path inside it
    """
    assert os.path.exists(DATA_SOURCE)
    extract_dir = tmp_path_factory.mktemp("amt_test")
    with zipfile.ZipFile(DATA_SOURCE, 'r') as archive:
        archive.extractall(extract_dir)
    return extract_dir / "ner" / "aws_labeling_copy"
23 | 
def test_amt_annotator_track(completed_amt_job_metadata):
    """
    Task counts should be reported under the worker names given in the map
    """
    # map AMT annotator subs to relevant identifier
    workers = {
        "7efc17ac-3397-4472-afe5-89184ad145d0": "Worker1",
        "afce8c28-969c-4e73-a20f-622ef122f585": "Worker2",
        "91f6236e-63c6-4a84-8fd6-1efbab6dedab": "Worker3",
        "6f202e93-e6b6-4e1d-8f07-0484b9a9093a": "Worker4",
        "2b674d33-f656-44b0-8f90-d70a1ab71ec2": "Worker5"
    }
    expected = {'Worker4': 20, 'Worker5': 20, 'Worker2': 3, 'Worker3': 16}

    counts = paying_annotators.track_tasks(completed_amt_job_metadata, workers)
    assert counts == expected
35 | 
36 | 
def test_amt_annotator_track_no_map(completed_amt_job_metadata):
    """
    Without a worker map, task counts should be reported under the annotator subs
    """
    expected = {
        '6f202e93-e6b6-4e1d-8f07-0484b9a9093a': 20,
        '2b674d33-f656-44b0-8f90-d70a1ab71ec2': 20,
        'afce8c28-969c-4e73-a20f-622ef122f585': 3,
        '91f6236e-63c6-4a84-8fd6-1efbab6dedab': 16,
    }
    counts = paying_annotators.track_tasks(completed_amt_job_metadata)
    assert counts == expected
41 | 
42 | 
def main():
    """
    Run the tests in this file through pytest

    Both tests take the completed_amt_job_metadata fixture as an
    argument, so calling them directly with no arguments raises a
    TypeError.  Handing the file to pytest lets it supply the fixture.
    """
    pytest.main([__file__])
46 | 
47 | 
48 | if __name__ == "__main__":
49 |     main()
50 |     print("TESTS COMPLETED!")
51 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/convert_mr_l3cube.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Reads one piece of the MR L3Cube dataset
 3 | 
 4 | The dataset is structured as a long list of words already in IOB format
 5 | The sentences have an ID which changes when a new sentence starts
 6 | The tags are labeled BNEM instead of B-NEM, so we update that.
 7 | (Could theoretically remap the tags to names more typical of other datasets as well)
 8 | """
 9 | 
def convert(input_file):
    """
    Converts one file of the dataset

    The first line is a header and is skipped, as are blank lines.
    Every other line must have three tab separated pieces: text, tag,
    and sentence id.  A new sentence starts when the sentence id changes.

    Return: a list of list of pairs, (text, tag)
    Raises ValueError if a line does not have exactly three pieces
    """
    with open(input_file, encoding="utf-8") as fin:
        lines = fin.readlines()

    sentences = []
    current_sentence = []
    prev_sent_id = None
    # first line of each of the segments is the header, so start after it
    # (idx still counts from the top of the file for the error message)
    for idx, raw_line in enumerate(lines[1:], start=1):
        stripped = raw_line.strip()
        if not stripped:
            continue

        pieces = stripped.split("\t")
        if len(pieces) != 3:
            raise ValueError("Unexpected number of pieces at line %d of %s" % (idx, input_file))
        text, ner, sent_id = pieces

        if ner != 'O':
            # ner symbols are written as BNEM, BNED, etc in this dataset
            ner = f"{ner[0]}-{ner[1:]}"

        if not prev_sent_id:
            prev_sent_id = sent_id
        elif sent_id != prev_sent_id:
            # the id changed, so the previous sentence is complete
            prev_sent_id = sent_id
            if len(current_sentence) == 0:
                raise ValueError("This should not happen!")
            sentences.append(current_sentence)
            current_sentence = []

        current_sentence.append((text, ner))

    if current_sentence:
        sentences.append(current_sentence)

    print("Read %d sentences in %d lines from %s" % (len(sentences), len(lines), input_file))
    return sentences
55 | 


--------------------------------------------------------------------------------
/stanza/models/langid/trainer.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.optim as optim
 3 | 
 4 | from stanza.models.langid.model import LangIDBiLSTM
 5 | 
 6 | 
 7 | class Trainer:
 8 | 
 9 |     DEFAULT_BATCH_SIZE = 64
10 |     DEFAULT_LAYERS = 2
11 |     DEFAULT_EMBEDDING_DIM = 150
12 |     DEFAULT_HIDDEN_DIM = 150
13 | 
14 |     def __init__(self, config, load_model=False, device=None):
15 |         self.model_path = config["model_path"]
16 |         self.batch_size = config.get("batch_size", Trainer.DEFAULT_BATCH_SIZE)
17 |         if load_model:
18 |             self.load(config["load_name"], device)
19 |         else:
20 |             self.model = LangIDBiLSTM(config["char_to_idx"], config["tag_to_idx"], Trainer.DEFAULT_LAYERS, 
21 |                                       Trainer.DEFAULT_EMBEDDING_DIM,
22 |                                       Trainer.DEFAULT_HIDDEN_DIM,
23 |                                       batch_size=self.batch_size,
24 |                                       weights=config["lang_weights"]).to(device)
25 |         self.optimizer = optim.AdamW(self.model.parameters())
26 | 
27 |     def update(self, inputs):
28 |         self.model.train()
29 |         sentences, targets = inputs
30 |         self.optimizer.zero_grad()
31 |         y_hat = self.model.forward(sentences)
32 |         loss = self.model.loss(y_hat, targets)
33 |         loss.backward()
34 |         self.optimizer.step()
35 | 
36 |     def predict(self, inputs):
37 |         self.model.eval()
38 |         sentences, targets = inputs
39 |         return torch.argmax(self.model(sentences), dim=1)
40 | 
41 |     def save(self, label=None):
42 |         # save a copy of model with label
43 |         if label:
44 |             self.model.save(f"{self.model_path[:-3]}-{label}.pt")
45 |         self.model.save(self.model_path)
46 | 
47 |     def load(self, model_path=None, device=None):
48 |         if not model_path:
49 |             model_path = self.model_path
50 |         self.model = LangIDBiLSTM.load(model_path, device, self.batch_size)
51 | 
52 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/reduce_dataset.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Cut short the training portion of a constituency dataset.
 3 | 
 4 | One could think this script isn't necessary, as shuf | head would work,
 5 | but some treebanks use multiple lines for representing trees.
 6 | Thus it is necessary to actually intelligently read the trees.
 7 | 
 8 | Run with
 9 | 
10 | python3  stanza/utils/datasets/constituency/reduce_dataset.py --input zh-hans_ctb-51b --output zh-hans_ctb5k
11 | """
12 | 
13 | import argparse
14 | import os
15 | import random
16 | 
17 | from stanza.models.constituency import tree_reader
18 | import stanza.utils.default_paths as default_paths
19 | from stanza.utils.datasets.constituency.utils import SHARDS, write_dataset
20 | 
def main():
    """
    Write a copy of a treebank in which the first shard is cut down to --size random trees

    The input files are <input>_<shard>.mrg in the constituency data
    directory, one per entry of SHARDS.  The first shard is shuffled
    (with a fixed seed) and truncated; the others are copied whole.
    The result is written under the --output name in the same directory.
    """
    parser = argparse.ArgumentParser(description="Script that cuts a treebank down to size")
    # both names are needed to build filenames; without them the script
    # would go looking for files named None_<shard>.mrg
    parser.add_argument('--input', type=str, required=True, help='Input treebank')
    parser.add_argument('--output', type=str, required=True, help='Output treebank')
    parser.add_argument('--size', type=int, default=5000, help='How many trees')
    args = parser.parse_args()

    random.seed(1234)

    paths = default_paths.get_default_paths()
    output_directory = paths["CONSTITUENCY_DATA_DIR"]

    # data/constituency/en_ptb3_train.mrg
    input_filenames = [os.path.join(output_directory, "%s_%s.mrg" % (args.input, shard)) for shard in SHARDS]
    # only the first shard gets cut down
    shrink_datasets = [True, False, False]

    datasets = []
    for input_filename, shrink in zip(input_filenames, shrink_datasets):
        treebank = tree_reader.read_treebank(input_filename)
        if shrink:
            random.shuffle(treebank)
            treebank = treebank[:args.size]
        datasets.append(treebank)
    write_dataset(datasets, output_directory, args.output)
46 | 
47 | if __name__ == '__main__':
48 |     main()
49 | 


--------------------------------------------------------------------------------
/stanza/tests/pipeline/test_pipeline_pos_processor.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Basic testing of part of speech tagging
 3 | """
 4 | 
 5 | import pytest
 6 | import stanza
 7 | 
 8 | from stanza.tests import *
 9 | 
10 | pytestmark = pytest.mark.pipeline
11 | 
12 | EN_DOC = "Joe Smith was born in California."
13 | 
14 | EN_DOC_GOLD = """
15 | <Token id=1;words=[<Word id=1;text=Joe;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
16 | <Token id=2;words=[<Word id=2;text=Smith;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
17 | <Token id=3;words=[<Word id=3;text=was;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin>]>
18 | <Token id=4;words=[<Word id=4;text=born;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass>]>
19 | <Token id=5;words=[<Word id=5;text=in;upos=ADP;xpos=IN>]>
20 | <Token id=6;words=[<Word id=6;text=California;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
21 | <Token id=7;words=[<Word id=7;text=.;upos=PUNCT;xpos=.>]>
22 | """.strip()
23 | 
@pytest.fixture(scope="module")
def pos_pipeline():
    """
    An English tokenize,pos pipeline using the test models dir, with downloading turned off
    """
    return stanza.Pipeline(processors='tokenize,pos',
                           dir=TEST_MODELS_DIR,
                           download_method=None,
                           lang='en')
27 | 
def test_part_of_speech(pos_pipeline):
    """
    The token strings of the tagged sentences should match the gold text
    """
    doc = pos_pipeline(EN_DOC)
    tagged = '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
    assert EN_DOC_GOLD == tagged
31 | 
def test_get_known_xpos(pos_pipeline):
    """
    The known xpos tags should include DT and should not include DET
    """
    known_xpos = pos_pipeline.processors['pos'].get_known_xpos()
    # make sure we have xpos...
    assert 'DT' in known_xpos
    # ... and not upos
    assert 'DET' not in known_xpos
38 | 
def test_get_known_upos(pos_pipeline):
    """
    The known upos tags should include DET and should not include DT
    """
    known_upos = pos_pipeline.processors['pos'].get_known_upos()
    # make sure we have upos...
    assert 'DET' in known_upos
    # ... and not xpos
    assert 'DT' not in known_upos
45 | 
46 | 
def test_get_known_feats(pos_pipeline):
    """
    The known feats should include Abbr, with Yes as one of its values
    """
    known_feats = pos_pipeline.processors['pos'].get_known_feats()
    # I appreciate how self-referential the Abbr feat is
    assert 'Abbr' in known_feats
    abbr_values = known_feats['Abbr']
    assert 'Yes' in abbr_values
52 | 


--------------------------------------------------------------------------------
/stanza/models/common/convert_pretrain.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A utility script to load a word embedding file from a text file and save it as a .pt
 3 | 
 4 | Run it as follows:
 5 |   python stanza/models/common/convert_pretrain.py <.pt file> <text file> <# vectors>
 6 | 
 7 | Note that -1 for # of vectors will keep all the vectors.
 8 | You probably want to keep fewer than that for most publicly released
 9 | embeddings, though, as they can get quite large.
10 | 
11 | As a concrete example, you can convert a newly downloaded Faroese WV file as follows:
12 |   python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/fo_farpahc.pretrain.pt ~/extern_data/wordvec/fasttext/faroese.txt -1
13 | or save part of an Icelandic WV file:
14 |   python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/is_icepahc.pretrain.pt ~/extern_data/wordvec/fasttext/icelandic.cc.is.300.vec 150000
15 | Note that if the pretrain already exists, nothing will be changed.  It will not overwrite an existing .pt file.
16 | 
17 | """
18 | 
19 | import argparse
20 | import os
21 | import sys
22 | 
23 | from stanza.models.common import pretrain
24 | 
def main():
    """
    Build a Pretrain from the command line arguments and print its vocab size

    Arguments are the output .pt file, the input vectors file, and
    optionally the max number of vectors.  An input ending in .csv is
    passed as csv_filename instead of as the vectors file.  A notice is
    printed if the output file already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("output_pt", default=None, help="Where to write the converted PT file")
    parser.add_argument("input_vec", default=None, help="Unconverted vectors file")
    parser.add_argument("max_vocab", type=int, default=-1, nargs="?", help="How many vectors to convert.  -1 means convert them all")
    args = parser.parse_args()

    output_pt, input_vec, max_vocab = args.output_pt, args.input_vec, args.max_vocab

    if os.path.exists(output_pt):
        print("Not overwriting existing pretrain file in %s" % output_pt)

    if not input_vec.endswith(".csv"):
        pt = pretrain.Pretrain(output_pt, input_vec, max_vocab=max_vocab)
    else:
        pt = pretrain.Pretrain(output_pt, max_vocab=max_vocab, csv_filename=input_vec)
    print("Pretrain is of size {}".format(len(pt.vocab)))
40 | 
41 | if __name__ == '__main__':
42 |     main()
43 | 


--------------------------------------------------------------------------------
/stanza/tests/lemma_classifier/test_training.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | 
 4 | import pytest
 5 | 
 6 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
 7 | 
 8 | from stanza.models.lemma_classifier import train_lstm_model
 9 | from stanza.models.lemma_classifier import train_transformer_model
10 | from stanza.models.lemma_classifier.base_model import LemmaClassifier
11 | from stanza.models.lemma_classifier.evaluate_models import evaluate_model
12 | 
13 | from stanza.tests import TEST_WORKING_DIR
14 | from stanza.tests.lemma_classifier.test_data_preparation import convert_english_dataset
15 | 
@pytest.fixture(scope="module")
def pretrain_file():
    """
    Path of the tiny embedding file under the test working directory
    """
    embedding_path = f'{TEST_WORKING_DIR}/in/tiny_emb.pt'
    return embedding_path
19 | 
def test_train_lstm(tmp_path, pretrain_file):
    """
    Train an LSTM model on the converted English dataset, evaluate it, then reload the saved file
    """
    dataset_files = convert_english_dataset(tmp_path)
    train_file = dataset_files[0]
    eval_file = dataset_files[1]

    save_name = str(tmp_path / 'lemma.pt')

    train_args = [
        '--wordvec_pretrain_file', pretrain_file,
        '--save_name', save_name,
        '--train_file', train_file,
        '--eval_file', eval_file,
    ]
    trainer = train_lstm_model.main(train_args)

    evaluate_model(trainer.model, eval_file)
    # test that loading the model works
    LemmaClassifier.load(save_name, None)
36 | 
def test_train_transformer(tmp_path, pretrain_file):
    """
    Train a transformer model on the converted English dataset, evaluate it, then reload the saved file
    """
    dataset_files = convert_english_dataset(tmp_path)
    train_file = dataset_files[0]
    eval_file = dataset_files[1]

    save_name = str(tmp_path / 'lemma.pt')

    train_args = [
        '--bert_model', 'hf-internal-testing/tiny-bert',
        '--save_name', save_name,
        '--train_file', train_file,
        '--eval_file', eval_file,
    ]
    trainer = train_transformer_model.main(train_args)

    evaluate_model(trainer.model, eval_file)

    # test that loading the model works
    LemmaClassifier.load(save_name, None)
54 | 


--------------------------------------------------------------------------------
/stanza/tests/constituency/test_convert_starlang.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test a couple different classes of trees to check the output of the Starlang conversion
 3 | """
 4 | 
 5 | import os
 6 | import tempfile
 7 | 
 8 | import pytest
 9 | 
10 | from stanza.utils.datasets.constituency import convert_starlang
11 | 
12 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
13 | 
# A sample tree in the Starlang format, shared by the tests in this file.
# Each leaf is a series of {key=value} annotations.
TREE="( (S (NP (NP {morphologicalAnalysis=bayan+NOUN+A3SG+PNON+NOM}{metaMorphemes=bayan}{turkish=Bayan}{english=Ms.}{semantics=TUR10-0396530}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580}{englishSemantics=ENG31-06352895-n}) (NP {morphologicalAnalysis=haag+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=haag}{turkish=Haag}{english=Haag}{semantics=TUR10-0000000}{namedEntity=PERSON}{propBank=ARG0$TUR10-0148580}))  (VP (NP {morphologicalAnalysis=elianti+NOUN+PROP+A3SG+PNON+NOM}{metaMorphemes=elianti}{turkish=Elianti}{english=Elianti}{semantics=TUR10-0000000}{namedEntity=NONE}{propBank=ARG1$TUR10-0148580}) (VP {morphologicalAnalysis=çal+VERB+POS+AOR+A3SG}{metaMorphemes=çal+Ar}{turkish=çalar}{english=plays}{semantics=TUR10-0148580}{namedEntity=NONE}{propBank=PREDICATE$TUR10-0148580}{englishSemantics=ENG31-01730049-v}))  (. {morphologicalAnalysis=.+PUNC}{metaMorphemes=.}{metaMorphemesMoved=.}{turkish=.}{english=.}{semantics=TUR10-1081860}{namedEntity=NONE}{propBank=NONE}))  )"
15 | 
def test_read_tree():
    """
    Read the sample tree and compare its string form to the expected bracketing
    """
    expected = "(ROOT (S (NP (NP Bayan) (NP Haag)) (VP (NP Elianti) (VP çalar)) (. .)))"
    result = convert_starlang.read_tree(TREE)
    assert str(result) == expected
22 | 
def test_missing_word():
    """
    Reading a tree whose leaves have no turkish= annotation should raise a ValueError
    """
    broken_tree = TREE.replace("turkish=", "foo=")
    with pytest.raises(ValueError):
        convert_starlang.read_tree(broken_tree)
30 | 
def test_bad_label():
    """
    Reading a tree with the S label lowercased should raise a ValueError
    """
    broken_tree = TREE.replace("(S", "(s")
    with pytest.raises(ValueError):
        convert_starlang.read_tree(broken_tree)
38 | 


--------------------------------------------------------------------------------
/demo/CONLL_Dependency_Visualizer_Example.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "id": "c0fd86c8",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n",
11 |     "\n",
12 |     "# load necessary conllu files - expected to be in the demo directory along with the notebook\n",
13 |     "en_file = \"en_test.conllu.txt\"\n",
14 |     "\n",
15 |     "# testing left to right languages\n",
16 |     "conll_to_visual(en_file, \"en\", sent_count=2)\n",
17 |     "conll_to_visual(en_file, \"en\", sent_count=10)\n",
18 |     "#conll_to_visual(en_file, \"en\", display_all=True)\n"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": null,
24 |    "id": "fc4b3f9b",
25 |    "metadata": {},
26 |    "outputs": [],
27 |    "source": [
28 |     "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n",
29 |     "\n",
30 |     "jp_file = \"japanese_test.conllu.txt\"\n",
31 |     "conll_to_visual(jp_file, \"ja\")\n"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": null,
37 |    "id": "6852b8e8",
38 |    "metadata": {},
39 |    "outputs": [],
40 |    "source": [
41 |     "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n",
42 |     "\n",
43 |     "# testing right to left languages\n",
44 |     "ar_file = \"arabic_test.conllu.txt\"\n",
45 |     "conll_to_visual(ar_file, \"ar\")"
46 |    ]
47 |   }
48 |  ],
49 |  "metadata": {
50 |   "kernelspec": {
51 |    "display_name": "Python 3 (ipykernel)",
52 |    "language": "python",
53 |    "name": "python3"
54 |   },
55 |   "language_info": {
56 |    "codemirror_mode": {
57 |     "name": "ipython",
58 |     "version": 3
59 |    },
60 |    "file_extension": ".py",
61 |    "mimetype": "text/x-python",
62 |    "name": "python",
63 |    "nbconvert_exporter": "python",
64 |    "pygments_lexer": "ipython3",
65 |    "version": "3.9.22"
66 |   }
67 |  },
68 |  "nbformat": 4,
69 |  "nbformat_minor": 5
70 | }
71 | 


--------------------------------------------------------------------------------
/stanza/tests/resources/test_installation.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test installation functions.
 3 | """
 4 | 
 5 | import os
 6 | import pytest
 7 | import shutil
 8 | import tempfile
 9 | 
10 | import stanza
11 | from stanza.tests import TEST_WORKING_DIR
12 | 
13 | pytestmark = [pytest.mark.travis, pytest.mark.client]
14 | 
def test_install_corenlp():
    """
    Install CoreNLP into a fresh directory and check the jars are there and the zip is gone
    """
    # we do not reset the CORENLP_HOME variable since this may impact the
    # client tests
    with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
        # the download method doesn't install over existing directories,
        # so remove the directory the context manager just created
        shutil.rmtree(test_dir)
        stanza.install_corenlp(dir=test_dir)

        assert os.path.isdir(test_dir), "Installation destination directory not found."

        jar_files = [name for name in os.listdir(test_dir)
                     if name.startswith('stanford-corenlp') and name.endswith('.jar')]
        assert jar_files, "Cannot find stanford-corenlp jar files in the installation directory."

        zip_path = os.path.join(test_dir, 'corenlp.zip')
        assert not os.path.exists(zip_path), "Downloaded zip file was not removed."
31 |     
def test_download_corenlp_models():
    """
    Download one CoreNLP models jar and check that it lands under the expected filename
    """
    model_name = "arabic"
    version = "4.2.2"
    jar_name = f"stanford-corenlp-{version}-models-{model_name}.jar"

    with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
        stanza.download_corenlp_models(model=model_name, version=version, dir=test_dir)

        assert os.path.isfile(os.path.join(test_dir, jar_name)), "Downloaded model file not found."
41 | 
def test_download_tokenize_mwt():
    """
    Download only the tokenize processor and check that the pipeline loads two processors
    """
    with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
        # the same model selection is used for the download and for the pipeline
        selection = dict(model_dir=test_dir, processors="tokenize", package="ewt")
        stanza.download("en", verbose=False, **selection)
        pipeline = stanza.Pipeline("en", **selection)
        assert isinstance(pipeline, stanza.Pipeline)
        # mwt should be added to the list
        assert len(pipeline.loaded_processors) == 2
49 | 


--------------------------------------------------------------------------------
/stanza/tests/server/test_parser_eval.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test the parser eval interface
 3 | """
 4 | 
 5 | import pytest
 6 | import stanza
 7 | from stanza.models.constituency import tree_reader
 8 | from stanza.protobuf import EvaluateParserRequest, EvaluateParserResponse
 9 | from stanza.server.parser_eval import build_request, collate, EvaluateParser, ParseResult
10 | from stanza.tests.server.test_java_protobuf_requests import check_tree
11 | 
12 | from stanza.tests import *
13 | 
14 | pytestmark = [pytest.mark.travis, pytest.mark.client]
15 | 
def build_one_tree_treebank(fake_scores=True):
    """
    Build a treebank of one tree where the prediction is the gold tree itself

    With fake_scores, the prediction is a (tree, 1.0) tuple wrapped in a ParseResult.
    Otherwise, the gold tree and the prediction are passed through collate.
    """
    trees = tree_reader.read_trees("((S (VP (VB Unban)) (NP (NNP Mox) (NNP Opal))))")
    assert len(trees) == 1
    gold = trees[0]

    if not fake_scores:
        return collate([gold], [gold])

    scored_prediction = (gold, 1.0)
    return [ParseResult(gold, [scored_prediction], None, None)]
28 | 
def check_build(fake_scores=True):
    """
    Build a request from the one tree treebank and check the gold and predicted trees in it
    """
    treebank = build_one_tree_treebank(fake_scores)
    request = build_request(treebank)

    assert len(request.treebank) == 1
    request_entry = request.treebank[0]
    check_tree(request_entry.gold, treebank[0][0], None)

    assert len(request_entry.predicted) == 1
    first_prediction = treebank[0][1][0]
    if fake_scores:
        # the prediction is a (tree, score) pair
        expected_tree, expected_score = first_prediction[0], first_prediction[1]
    else:
        # the prediction is just the tree
        expected_tree, expected_score = first_prediction, None
    check_tree(request_entry.predicted[0], expected_tree, expected_score)
40 | 
41 | 
def test_build_tuple_request():
    """
    Check building a request when the predictions carry scores
    """
    check_build(fake_scores=True)
44 | 
def test_build_notuple_request():
    """
    Check building a request when the predictions are bare trees
    """
    check_build(fake_scores=False)
47 | 
def test_score_one_tree_tuples():
    """
    Scoring a gold tree against itself, with a score attached, should give an F1 of 1.0
    """
    one_tree_treebank = build_one_tree_treebank(fake_scores=True)

    with EvaluateParser() as evaluator:
        result = evaluator.process(one_tree_treebank)
        assert result.f1 == pytest.approx(1.0)
54 | 
def test_score_one_tree_notuples():
    """
    Scoring a gold tree against itself, with no score attached, should give an F1 of 1.0
    """
    one_tree_treebank = build_one_tree_treebank(fake_scores=False)

    with EvaluateParser() as evaluator:
        result = evaluator.process(one_tree_treebank)
        assert result.f1 == pytest.approx(1.0)
61 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/json_to_bio.py:
--------------------------------------------------------------------------------
 1 | """
 2 | If you want to convert .json back to .bio for some reason, this will do it for you
 3 | """
 4 | 
 5 | import argparse
 6 | import json
 7 | import os
 8 | from stanza.models.common.doc import Document
 9 | from stanza.models.ner.utils import process_tags
10 | from stanza.utils.default_paths import get_default_paths
11 | 
def convert_json_to_bio(input_filename, output_filename):
    """
    Read a json document and write its tokens with their tags, one token per line

    The (text, ner) pairs of each sentence are passed through
    process_tags with "bioes" before being written.  Each output line
    is the two fields separated by a tab, with a blank line after each sentence.
    """
    with open(input_filename, encoding="utf-8") as fin:
        json_doc = json.load(fin)
    doc = Document(json_doc)

    tagged_sentences = []
    for sentence in doc.sentences:
        tagged_sentences.append([(token.text, token.ner) for token in sentence.tokens])
    tagged_sentences = process_tags(tagged_sentences, "bioes")

    with open(output_filename, "w", encoding="utf-8") as fout:
        for tagged_sentence in tagged_sentences:
            fout.writelines("%s\t%s\n" % pair for pair in tagged_sentence)
            fout.write("\n")
22 | 
def main(args=None):
    """
    Convert either a single json file or the train/dev/test shards of a dataset

    If --input_dataset is given, the three shards are looked up in --input_dir.
    Otherwise, only --input_filename is converted.  Each output filename is
    the input filename with its extension replaced by --output_suffix.
    """
    default_input_dir = get_default_paths()['NER_DATA_DIR']

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_filename', type=str, default="data/ner/en_foreign-4class.test.json", help='Convert an individual file')
    parser.add_argument('--input_dir', type=str, default=default_input_dir, help='Which directory to find the dataset, if using --input_dataset')
    parser.add_argument('--input_dataset', type=str, help='Convert an entire dataset')
    parser.add_argument('--output_suffix', type=str, default='bioes', help='suffix for output filenames')
    args = parser.parse_args(args)

    if not args.input_dataset:
        input_filenames = [args.input_filename]
    else:
        input_filenames = []
        for shard in ("train", "dev", "test"):
            shard_name = "%s.%s.json" % (args.input_dataset, shard)
            input_filenames.append(os.path.join(args.input_dir, shard_name))

    for input_filename in input_filenames:
        base_name = os.path.splitext(input_filename)[0]
        output_filename = base_name + "." + args.output_suffix
        print("%s -> %s" % (input_filename, output_filename))
        convert_json_to_bio(input_filename, output_filename)
41 | 
42 | if __name__ == '__main__':
43 |     main()
44 | 


--------------------------------------------------------------------------------
/stanza/tests/server/test_server_pretokenized.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Misc tests for the server
 3 | """
 4 | 
 5 | import pytest
 6 | import re
 7 | 
 8 | from stanza.server import CoreNLPClient
 9 | 
10 | pytestmark = pytest.mark.client
11 | 
# Test data, keyed by language name.
# tokens[lang] is a list of pretokenized input texts.
# tags[lang] is a parallel list: for each input text, one list of
# expected tags per sentence.
tokens = {}
tags = {}
14 | 
15 | # Italian examples
16 | tokens["italian"] = [
17 |     "È vero , tutti possiamo essere sostituiti .\n Alcune chiamate partirono da il Quirinale ."
18 | ]
19 | tags["italian"] = [
20 |     [
21 |         ["AUX", "ADJ", "PUNCT", "PRON", "AUX", "AUX", "VERB", "PUNCT"],
22 |         ["DET", "NOUN", "VERB", "ADP", "DET", "PROPN", "PUNCT"],
23 |     ],
24 | ]
25 | 
26 | 
27 | # French examples
28 | tokens["french"] = [
29 |     (
30 |      "Les études durent six ans mais leur contenu diffère donc selon les Facultés .\n"
31 |      "Il est fêté le 22 mai ."
32 |     )
33 | ]
34 | tags["french"] = [
35 |     [
36 |         ["DET", "NOUN", "VERB", "NUM", "NOUN", "CCONJ", "DET", "NOUN", "VERB", "ADV", "ADP", "DET", "PROPN", "PUNCT"],
37 |         ["PRON", "AUX", "VERB", "DET", "NUM", "NOUN", "PUNCT"]
38 |     ],
39 | ]
40 | 
41 | 
42 | # English examples
43 | tokens["english"] = ["This shouldn't be split .\n I hope it's not ."]
44 | tags["english"] = [
45 |     [
46 |         ["DT", "NN", "VB", "VBN", "."],
47 |         ["PRP", "VBP", "PRP$", "RB", "."],
48 |     ],
49 | ]
50 | 
51 | 
def pretokenized_test(lang):
    """
    Test submitting pretokenized text for the given language.

    lang is used both as the properties for the CoreNLPClient and as
    the key into the module level tokens and tags dicts.
    """
    with CoreNLPClient(
        properties=lang,
        annotators="pos",
        pretokenized=True,
        be_quiet=True,
    ) as client:
        for input_text, gold_tags in zip(tokens[lang], tags[lang]):
            ann = client.annotate(input_text)
            # zip stops at the shorter input, so without this check
            # a missing or extra sentence would go unnoticed
            assert len(gold_tags) == len(ann.sentence)
            for sentence_tags, sentence in zip(gold_tags, ann.sentence):
                result_tags = [tok.pos for tok in sentence.token]
                assert sentence_tags == result_tags
65 | 
66 | 
def test_english_pretokenized():
    """
    Run the pretokenized check on the English data
    """
    pretokenized_test(lang="english")
69 | 
70 | 
def test_italian_pretokenized():
    """
    Run the pretokenized check on the Italian data
    """
    pretokenized_test(lang="italian")
73 | 
74 | 
def test_french_pretokenized():
    """
    Run the pretokenized check on the French data
    """
    pretokenized_test(lang="french")
77 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/check_for_duplicates.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A simple tool to check if there are duplicates in a set of NER files
 3 | 
 4 | It's surprising how many datasets have a bunch of duplicates...
 5 | """
 6 | 
 7 | def read_sentences(filename):
 8 |     """
 9 |     Read the sentences (without tags) from a BIO file
10 |     """
11 |     sentences = []
12 |     with open(filename) as fin:
13 |         lines = fin.readlines()
14 |     current_sentence = []
15 |     for line in lines:
16 |         line = line.strip()
17 |         if not line:
18 |             if current_sentence:
19 |                 sentences.append(tuple(current_sentence))
20 |             current_sentence = []
21 |             continue
22 |         word = line.split("\t")[0]
23 |         current_sentence.append(word)
24 |     if len(current_sentence) > 0:
25 |         sentences.append(tuple(current_sentence))
26 |     return sentences
27 |     
def check_for_duplicates(output_filenames, fail=False, check_self=False, print_all=False):
    """
    Checks for exact duplicates in a list of NER files

    output_filenames: the files to check, read with read_sentences
    fail: raise a ValueError at the first duplicate instead of printing
    check_self: also count a sentence repeated within the same file
    print_all: print every duplicate rather than just the first one per file

    A summary count is printed for each file with at least one duplicate.
    """
    # maps each sentence to the most recent file it was seen in
    sentence_map = {}
    for output_filename in output_filenames:
        duplicates = 0
        sentences = read_sentences(output_filename)
        for sentence in sentences:
            other_file = sentence_map.get(sentence, None)
            if other_file is not None and (check_self or other_file != output_filename):
                if fail:
                    # separate the words with spaces so the sentence is readable
                    raise ValueError("Duplicate sentence '{}', first in {}, also in {}".format(" ".join(sentence), other_file, output_filename))
                if duplicates == 0 and not print_all:
                    print("First duplicate:")
                if duplicates == 0 or print_all:
                    print("{}\nFound in {} and {}".format(sentence, other_file, output_filename))
                duplicates = duplicates + 1
            sentence_map[sentence] = output_filename
        if duplicates > 0:
            print("%d duplicates found in %s" % (duplicates, output_filename))
50 | 


--------------------------------------------------------------------------------
/stanza/tests/common/test_data_objects.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Basic tests of the stanza data objects, especially the setter/getter routines
 3 | """
 4 | import pytest
 5 | 
 6 | import stanza
 7 | from stanza.models.common.doc import Document, Sentence, Word
 8 | from stanza.tests import *
 9 | 
10 | pytestmark = pytest.mark.pipeline
11 | 
12 | # data for testing
13 | EN_DOC = "This is a test document. Pretty cool!"
14 | 
15 | EN_DOC_UPOS_XPOS = (('PRON_DT', 'AUX_VBZ', 'DET_DT', 'NOUN_NN', 'NOUN_NN', 'PUNCT_.'), ('ADV_RB', 'ADJ_JJ', 'PUNCT_.'))
16 | 
17 | EN_DOC2 = "Chris Manning wrote a sentence. Then another."
18 | 
@pytest.fixture(scope="module")
def nlp_pipeline():
    """
    An English pipeline built from the test models directory
    """
    return stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
23 | 
def test_readonly(nlp_pipeline):
    """
    A property added with only a default value can be read, but setting it raises a ValueError
    """
    Document.add_property('some_property', 123)

    document = nlp_pipeline(EN_DOC)
    assert document.some_property == 123

    with pytest.raises(ValueError):
        document.some_property = 456
30 | 
31 | 
def test_getter(nlp_pipeline):
    """
    A property added to Word with a getter should be computed from the word's upos and xpos
    """
    def upos_xpos(self):
        return f"{self.upos}_{self.xpos}"

    Word.add_property('upos_xpos', getter=upos_xpos)

    doc = nlp_pipeline(EN_DOC)

    found = tuple(tuple(word.upos_xpos for word in sentence.words)
                  for sentence in doc.sentences)
    assert EN_DOC_UPOS_XPOS == found
38 | 
def test_setter_getter(nlp_pipeline):
    """
    A property added to Sentence with both a getter and a setter should convert in both directions
    """
    int2str = {0: 'ok', 1: 'good', 2: 'bad'}
    str2int = {name: index for index, name in int2str.items()}

    def getter(self):
        if self._classname is None:
            return None
        return int2str[self._classname]

    def setter(self, value):
        self._classname = str2int[value]

    Sentence.add_property('classname', getter=getter, setter=setter)

    doc = nlp_pipeline(EN_DOC)
    first_sentence = doc.sentences[0]

    # going through the setter stores the int
    first_sentence.classname = 'good'
    assert first_sentence._classname == 1

    # don't try this at home
    first_sentence._classname = 2
    assert first_sentence.classname == 'bad'
54 | 
def test_backpointer(nlp_pipeline):
    """
    Entities, words, and tokens should each point back to the sentence they came from
    """
    doc = nlp_pipeline(EN_DOC2)
    first_sentence = doc.sentences[0]
    last_sentence = doc.sentences[-1]

    first_entity = doc.ents[0]
    assert first_entity.sent is first_sentence

    words = list(doc.iter_words())
    assert words[0].sent is first_sentence

    all_tokens = list(doc.iter_tokens())
    assert all_tokens[-1].sent is last_sentence
61 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/extract_all_silver_dataset.py:
--------------------------------------------------------------------------------
 1 | """
 2 | After running build_silver_dataset.py, this extracts the trees of all match levels at once
 3 | 
 4 | For example
 5 | 
 6 | python stanza/utils/datasets/constituency/extract_all_silver_dataset.py --output_prefix /u/nlp/data/constituency-parser/chinese/2024_zh_wiki/zh_silver_ --parsed_trees /u/nlp/data/constituency-parser/chinese/2024_zh_wiki/zh_wiki_a*trees
 7 | 
 8 | cat /u/nlp/data/constituency-parser/chinese/2024_zh_wiki/zh_silver_[012345678].mrg | sort | uniq | shuf > /u/nlp/data/constituency-parser/chinese/2024_zh_wiki/zh_silver_sort.mrg
 9 | 
10 | shuf /u/nlp/data/constituency-parser/chinese/2024_zh_wiki/zh_silver_sort.mrg | head -n 200000 > /u/nlp/data/constituency-parser/chinese/2024_zh_wiki/zh_silver_200K.mrg
11 | """
12 | 
13 | import argparse
14 | from collections import defaultdict
15 | import json
16 | 
def parse_args():
    """
    Parse the command line: the input tree files and the prefix & suffix of the output files
    """
    parser = argparse.ArgumentParser(description="After finding common trees using build_silver_dataset, this extracts them all or just the ones from a particular level of accuracy")
    parser.add_argument('--parsed_trees', type=str, nargs='+', help='Input file(s) of trees parsed into the build_silver_dataset json format.')
    parser.add_argument('--output_prefix', type=str, default=None, help='Prefix to use for outputting trees')
    parser.add_argument('--output_suffix', type=str, default=".mrg", help='Suffix to use for outputting trees')
    return parser.parse_args()
25 | 
def main():
    """
    Group the trees of all the input files by their count, then write one output file per count

    Each input line is a json object with a 'count' and a 'tree'.
    The output filename is the prefix, the count, and the suffix put together.
    """
    args = parse_args()

    trees_by_count = defaultdict(list)
    for input_filename in args.parsed_trees:
        with open(input_filename, encoding='utf-8') as fin:
            for line in fin:
                entry = json.loads(line)
                trees_by_count[entry['count']].append(entry['tree'])

    for count, count_trees in trees_by_count.items():
        output_filename = "%s%s%s" % (args.output_prefix, count, args.output_suffix)
        with open(output_filename, 'w', encoding='utf-8') as fout:
            for tree in count_trees:
                fout.write(tree)
                fout.write('\n')
42 | 
43 | if __name__ == '__main__':
44 |     main()
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/stanza/models/classifiers/base_classifier.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | import logging
 4 | 
 5 | import torch
 6 | import torch.nn as nn
 7 | 
 8 | from stanza.models.common.utils import split_into_batches, sort_with_indices, unsort
 9 | 
10 | """
11 | A base classifier type
12 | 
13 | Currently, has the ability to process text or other inputs in a manner
14 | suitable for the particular model type.
15 | In other words, the CNNClassifier processes lists of words,
16 | and the ConstituencyClassifier processes trees
17 | """
18 | 
19 | logger = logging.getLogger('stanza')
20 | 
class BaseClassifier(ABC, nn.Module):
    """
    Shared interface for the classifiers

    Subclasses must say how to extract sentences from a document.
    Labeling a list of sentences is implemented here.
    """
    @abstractmethod
    def extract_sentences(self, doc):
        """
        Pull the sentences, or the relevant pieces of the sentences, out of a document
        """

    def preprocess_sentences(self, sentences):
        """
        A hook for subclasses.  The default returns the sentences unchanged
        """
        return sentences

    def label_sentences(self, sentences, batch_size=None):
        """
        Run the model on a list of sentences and return the list of predicted labels

        With no batch_size, all of the sentences go to the model at once.
        Otherwise, the sentences are sorted by length (longest first),
        split into batches, and the labels are put back in the
        original order at the end.

        Each label is the argmax over dim 1 of the model's output.
        """
        self.eval()

        sentences = self.preprocess_sentences(sentences)

        if batch_size is not None:
            sentences, orig_idx = sort_with_indices(sentences, key=len, reverse=True)
            intervals = split_into_batches(sentences, batch_size)
        else:
            orig_idx = None
            intervals = [(0, len(sentences))]

        labels = []
        for interval in intervals:
            start, end = interval[0], interval[1]
            if end - start == 0:
                # this can happen for empty text
                continue
            batch_output = self(sentences[start:end])
            labels.extend(torch.argmax(batch_output, dim=1).tolist())

        if orig_idx:
            # undo the sort by length
            sentences = unsort(sentences, orig_idx)
            labels = unsort(labels, orig_idx)

        logger.debug("Found labels")
        for labeled_pair in zip(labels, sentences):
            logger.debug(labeled_pair)

        return labels
66 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/constituency/extract_silver_dataset.py:
--------------------------------------------------------------------------------
 1 | """
 2 | After running build_silver_dataset.py, this extracts the trees of a certain match level
 3 | 
 4 | For example
 5 | 
 6 | python3 stanza/utils/datasets/constituency/extract_silver_dataset.py --parsed_trees /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/a*.trees --keep_score 0 --output_file /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/it_silver_0.mrg
 7 | 
 8 | for i in `echo 0 1 2 3 4 5 6 7 8 9 10`; do python3 stanza/utils/datasets/constituency/extract_silver_dataset.py --parsed_trees /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/a*.trees --keep_score $i --output_file /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/it_silver_$i.mrg; done
 9 | """
10 | 
11 | import argparse
12 | import json
13 | 
def parse_args():
    """
    Parse the command line: the input tree files, the agreement level to keep, and the output file
    """
    parser = argparse.ArgumentParser(description="After finding common trees using build_silver_dataset, this extracts them all or just the ones from a particular level of accuracy")
    parser.add_argument('--parsed_trees', type=str, nargs='+', help='Input file(s) of trees parsed into the build_silver_dataset json format.')
    parser.add_argument('--keep_score', type=int, default=None, help='Which agreement level to keep.  None keeps all')
    parser.add_argument('--output_file', type=str, default=None, help='Where to put the output file')
    return parser.parse_args()
22 | 
23 | 
def main():
    """
    Collect the trees with the requested count from the input files, then print or save them

    Each input line is a json object with a 'count' and a 'tree'.
    If no --keep_score is given, every tree is kept.
    If no --output_file is given, the trees are printed instead.
    """
    args = parse_args()
    keep_score = args.keep_score

    kept_trees = []
    for input_filename in args.parsed_trees:
        with open(input_filename, encoding='utf-8') as fin:
            for line in fin:
                entry = json.loads(line)
                if keep_score is not None and entry['count'] != keep_score:
                    continue
                kept_trees.append(entry['tree'])

    if args.output_file is not None:
        with open(args.output_file, 'w', encoding='utf-8') as fout:
            for tree in kept_trees:
                fout.write(tree)
                fout.write('\n')
    else:
        for tree in kept_trees:
            print(tree)
44 | 
45 | if __name__ == '__main__':
46 |     main()
47 | 
48 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/sentiment/process_vsfc_vietnamese.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VSFC sentiment dataset is available at
 3 |   https://drive.google.com/drive/folders/1xclbjHHK58zk2X6iqbvMPS2rcy9y9E0X
 4 | 
 5 | The format is extremely similar to ours - labels are 0,1,2.
 6 | Text needs to be tokenized, though.
 7 | Also, the files are split into two pieces, labels and text.
 8 | """
 9 | 
10 | import os
11 | import sys
12 | 
13 | from tqdm import tqdm
14 | 
15 | import stanza
16 | from stanza.models.classifiers.data import SentimentDatum
17 | import stanza.utils.datasets.sentiment.process_utils as process_utils
18 | 
19 | import stanza.utils.default_paths as default_paths
20 | 
def combine_columns(in_directory, dataset, nlp):
    """
    Combine the labels file and the text file of one shard into a list of SentimentDatum

    in_directory: base directory of the dataset
    dataset: name of the subdirectory to read, which is expected to
      contain sentiments.txt and sents.txt
    nlp: a pipeline used to tokenize each line of text

    Both files are read as utf-8 so that the text is decoded the same
    way regardless of the locale.  Lines are paired up in order.
    """
    directory = os.path.join(in_directory, dataset)

    sentiment_file = os.path.join(directory, "sentiments.txt")
    with open(sentiment_file, encoding="utf-8") as fin:
        sentiment = fin.readlines()

    text_file = os.path.join(directory, "sents.txt")
    with open(text_file, encoding="utf-8") as fin:
        text = fin.readlines()

    # each line becomes one flat list of tokens, even if the
    # tokenizer splits it into multiple sentences
    text = [[token.text for sentence in nlp(line.strip()).sentences for token in sentence.tokens]
            for line in tqdm(text)]

    phrases = [SentimentDatum(s.strip(), t) for s, t in zip(sentiment, text)]
    return phrases
37 | 
def main(in_directory, out_directory, short_name):
    """
    Tokenize the train, dev, and test shards and write each one as a json file

    The output files are named short_name.shard.json in out_directory.
    """
    nlp = stanza.Pipeline('vi', processors='tokenize')
    for shard in ("train", "dev", "test"):
        shard_filename = "%s.%s.json" % (short_name, shard)
        output_file = os.path.join(out_directory, shard_filename)
        phrases = combine_columns(in_directory, shard, nlp)
        process_utils.write_list(output_file, phrases)
44 | 
45 | 
if __name__ == '__main__':
    paths = default_paths.get_default_paths()
    argv = sys.argv

    # the three positional arguments are each optional:
    # input directory, output directory, short name of the dataset
    in_directory = argv[1] if len(argv) > 1 else os.path.join(paths['SENTIMENT_BASE'], "vietnamese", "_UIT-VSFC")
    out_directory = argv[2] if len(argv) > 2 else paths['SENTIMENT_DATA_DIR']
    short_name = argv[3] if len(argv) > 3 else 'vi_vsfc'

    main(in_directory, out_directory, short_name)
65 | 


--------------------------------------------------------------------------------
/stanza/models/constituency/tree_stack.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A utility class for keeping track of intermediate parse states
 3 | """
 4 | 
 5 | from collections import namedtuple
 6 | 
 7 | class TreeStack(namedtuple('TreeStack', ['value', 'parent', 'length'])):
 8 |     """
 9 |     A stack which can branch in several directions, as long as you
10 |     keep track of the branching heads
11 | 
12 |     An example usage is when K constituents are removed at once
13 |     to create a new constituent, and then the LSTM which tracks the
14 |     values of the constituents is updated starting from the Kth
15 |     output of the LSTM with the new value.
16 | 
17 |     We don't simply keep track of a single stack object using a deque
18 |     because versions of the parser which use a beam will want to be
19 |     able to branch in different directions from the same base stack
20 | 
21 |     Another possible usage is if an oracle is used for training
22 |     in a manner where some fraction of steps are non-gold steps,
23 |     but we also want to take a gold step from the same state.
24 |     Eg, parser gets to state X, wants to make incorrect transition T
25 |     instead of gold transition G, and so we continue training both
26 |     X+G and X+T.  If we only represent the state X with standard
27 |     python stacks, it would not be possible to track both of these
28 |     states at the same time without copying the entire thing.
29 | 
30 |     Value can be as transition, a word, or a partially built constituent
31 | 
32 |     Implemented as a namedtuple to make it a bit more efficient
33 |     """
34 |     def pop(self):
35 |         return self.parent
36 | 
37 |     def push(self, value):
38 |         # returns a new stack node which points to this
39 |         return TreeStack(value, self, self.length+1)
40 | 
41 |     def __iter__(self):
42 |         stack = self
43 |         while stack.parent is not None:
44 |             yield stack.value
45 |             stack = stack.parent
46 |         yield stack.value
47 | 
48 |     def __reversed__(self):
49 |         items = list(iter(self))
50 |         for item in reversed(items):
51 |             yield item
52 | 
53 |     def __str__(self):
54 |         return "TreeStack(%s)" % ", ".join([str(x) for x in self])
55 | 
56 |     def __len__(self):
57 |         return self.length
58 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/coref/convert_hebrew_mixed.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Build a dataset mixed with IAHLT Hebrew and UD Coref
 3 | 
 4 | We find that the IAHLT dataset by itself, trained using Stanza 1.11
 5 | with xlm-roberta-large and a lora finetuning layer, gets 49.7 F1.
 6 | This is a bit lower than the value the IAHLT group originally had, as
 7 | they reported 52.  Interestingly, we find that mixing in the 1.3 UD
 8 | Coref improves results, getting 51.7 under the same parameters
 9 | 
10 | This script runs the IAHLT conversion and the UD Coref conversion,
11 | then combines the files into one big training file
12 | """
13 | 
14 | import json
15 | import os
16 | import shutil
17 | import tempfile
18 | 
19 | from stanza.utils.datasets.coref import convert_hebrew_iahlt
20 | from stanza.utils.datasets.coref import convert_udcoref
21 | from stanza.utils.default_paths import get_default_paths
22 | 
def main():
    """
    Build the he_mixed coref dataset in the COREF_DATA_DIR directory

    Runs the IAHLT Hebrew conversion and the UD Coref conversion into a
    temporary directory.  The first file returned by each conversion is
    loaded as json and the two lists are concatenated into
    he_mixed.train.json.  The second and third files of the Hebrew
    conversion are copied unchanged as the dev and test sets.
    """
    paths = get_default_paths()
    coref_output_path = paths['COREF_DATA_DIR']
    with tempfile.TemporaryDirectory() as temp_dir_path:
        hebrew_filenames = convert_hebrew_iahlt.main(["--output_directory", temp_dir_path])
        udcoref_filenames = convert_udcoref.main(["--project", "gerrom", "--output_directory", temp_dir_path])

        with open(os.path.join(temp_dir_path, hebrew_filenames[0]), encoding="utf-8") as fin:
            hebrew_train = json.load(fin)
        udcoref_train_filename = os.path.join(temp_dir_path, udcoref_filenames[0])
        with open(udcoref_train_filename, encoding="utf-8") as fin:
            print("Reading extra udcoref json data from %s" % udcoref_train_filename)
            udcoref_train = json.load(fin)
        mixed_train = hebrew_train + udcoref_train
        with open(os.path.join(coref_output_path, "he_mixed.train.json"), "w", encoding="utf-8") as fout:
            # ensure_ascii=False keeps the Hebrew text readable in the output file
            json.dump(mixed_train, fout, indent=2, ensure_ascii=False)

        # only the training data is mixed; dev and test stay pure Hebrew
        shutil.copyfile(os.path.join(temp_dir_path, hebrew_filenames[1]),
                        os.path.join(coref_output_path, "he_mixed.dev.json"))
        shutil.copyfile(os.path.join(temp_dir_path, hebrew_filenames[2]),
                        os.path.join(coref_output_path, "he_mixed.test.json"))

if __name__ == '__main__':
    main()
47 | 


--------------------------------------------------------------------------------
/stanza/pipeline/mwt_processor.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Processor for performing multi-word-token expansion
 3 | """
 4 | 
 5 | import io
 6 | 
 7 | import torch
 8 | 
 9 | from stanza.models.mwt.data import DataLoader
10 | from stanza.models.mwt.trainer import Trainer
11 | from stanza.pipeline._constants import *
12 | from stanza.pipeline.processor import UDProcessor, register_processor
13 | 
@register_processor(MWT)
class MWTProcessor(UDProcessor):
    """
    Pipeline processor which expands multi-word tokens into their words
    """

    # set of processor requirements this processor fulfills
    PROVIDES_DEFAULT = {MWT}
    # set of processor requirements for this processor
    REQUIRES_DEFAULT = {TOKENIZE}

    def _set_up_model(self, config, pipeline, device):
        # the model is loaded from the path in the config onto the given device
        self._trainer = Trainer(model_file=config['model_path'], device=device)

    def build_batch(self, document):
        """
        Wrap the document in an evaluation mode MWT DataLoader
        """
        batch_size = self.config['batch_size']
        return DataLoader(document, batch_size, self.config, vocab=self.vocab, evaluation=True, expand_unk_vocab=True)

    def process(self, document):
        """
        Predict the MWT expansions for the document and attach them to it

        Returns the document held by the batch built from the input.
        """
        batch = self.build_batch(document)

        expansions = batch.doc.get_mwt_expansions(evaluation=True)
        if len(batch) == 0:
            # nothing to expand, so there is nothing to predict
            preds = []
        elif self.config['dict_only']:
            preds = self.trainer.predict_dict(expansions)
        else:
            with torch.no_grad():
                preds = []
                for chunk in batch.to_loader():
                    preds.extend(self.trainer.predict(chunk, never_decode_unk=True, vocab=batch.vocab))

            if self.config.get('ensemble_dict', False):
                preds = self.trainer.ensemble(expansions, preds)

        batch.doc.set_mwt_expansions(preds, process_manual_expanded=False)
        return batch.doc

    def bulk_process(self, docs):
        """
        MWT processor counts some statistics on the individual docs, so we need to separately redo those stats
        """
        processed = super().bulk_process(docs)
        for doc in processed:
            doc._count_words()
        return processed
60 | 


--------------------------------------------------------------------------------
/stanza/tests/mwt/test_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test the MWT resplitting of preexisting tokens without word splits
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | import stanza
 8 | from stanza.models.mwt.utils import resplit_mwt
 9 | 
10 | from stanza.tests import TEST_MODELS_DIR
11 | 
12 | pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
13 | 
@pytest.fixture(scope="module")
def pipeline():
    """
    A reusable English pipeline with the tokenize and mwt processors from the gum package
    """
    nlp = stanza.Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize,mwt", package="gum")
    return nlp
20 | 
21 | 
def test_resplit_keep_tokens(pipeline):
    """
    Test splitting with enforced token boundaries
    """
    tokens = [["I", "can't", "believe", "it"], ["I can't", "sleep"]]
    doc = resplit_mwt(tokens, pipeline)
    assert len(doc.sentences) == 2
    first_sentence, second_sentence = doc.sentences

    assert len(first_sentence.tokens) == 4
    contraction = first_sentence.tokens[1]
    assert len(contraction.words) == 2
    assert [word.text for word in contraction.words] == ["ca", "n't"]

    assert len(second_sentence.tokens) == 2
    # updated GUM MWT splits "I can't" into three segments
    # the way we want, "I - ca - n't"
    # previously it would split "I - can - 't"
    merged = second_sentence.tokens[0]
    assert len(merged.words) == 3
    assert [word.text for word in merged.words] == ["I", "ca", "n't"]
42 | 
43 | 
def test_resplit_no_keep_tokens(pipeline):
    """
    Test splitting without enforced token boundaries
    """
    tokens = [["I", "can't", "believe", "it"], ["I can't", "sleep"]]
    doc = resplit_mwt(tokens, pipeline, keep_tokens=False)
    assert len(doc.sentences) == 2
    first_sentence, second_sentence = doc.sentences

    assert len(first_sentence.tokens) == 4
    contraction = first_sentence.tokens[1]
    assert len(contraction.words) == 2
    assert [word.text for word in contraction.words] == ["ca", "n't"]

    # without enforced boundaries, "I can't" is retokenized into separate tokens
    assert len(second_sentence.tokens) == 3
    contraction = second_sentence.tokens[1]
    assert len(contraction.words) == 2
    assert [word.text for word in contraction.words] == ["ca", "n't"]
60 | 


--------------------------------------------------------------------------------
/stanza/utils/default_paths.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | def get_default_paths():
 4 |     """
 5 |     Gets base paths for the data directories
 6 | 
 7 |     If DATA_ROOT is set in the environment, use that as the root
 8 |     otherwise use "./data"
 9 |     individual paths can also be set in the environment
10 |     """
11 |     DATA_ROOT = os.environ.get("DATA_ROOT", "data")
12 |     defaults = {
13 |         "TOKENIZE_DATA_DIR": DATA_ROOT + "/tokenize",
14 |         "MWT_DATA_DIR": DATA_ROOT + "/mwt",
15 |         "LEMMA_DATA_DIR": DATA_ROOT + "/lemma",
16 |         "POS_DATA_DIR": DATA_ROOT + "/pos",
17 |         "DEPPARSE_DATA_DIR": DATA_ROOT + "/depparse",
18 |         "ETE_DATA_DIR": DATA_ROOT + "/ete",
19 |         "NER_DATA_DIR": DATA_ROOT + "/ner",
20 |         "CHARLM_DATA_DIR": DATA_ROOT + "/charlm",
21 |         "SENTIMENT_DATA_DIR": DATA_ROOT + "/sentiment",
22 |         "CONSTITUENCY_DATA_DIR": DATA_ROOT + "/constituency",
23 |         "COREF_DATA_DIR": DATA_ROOT + "/coref",
24 |         "LEMMA_CLASSIFIER_DATA_DIR": DATA_ROOT + "/lemma_classifier",
25 | 
26 |         # Set directories to store external word vector data
27 |         "WORDVEC_DIR": "extern_data/wordvec",
28 | 
29 |         # TODO: not sure what other people actually have
30 |         # TODO: also, could make this automatically update to the latest
31 |         "UDBASE": "extern_data/ud2/ud-treebanks-v2.11",
32 |         "UDBASE_GIT": "extern_data/ud2/git",
33 | 
34 |         "NERBASE": "extern_data/ner",
35 |         "CONSTITUENCY_BASE": "extern_data/constituency",
36 |         "SENTIMENT_BASE": "extern_data/sentiment",
37 |         "COREF_BASE": "extern_data/coref",
38 | 
39 |         # there's a stanford github, stanfordnlp/handparsed-treebank,
40 |         # with some data for different languages
41 |         "HANDPARSED_DIR": "extern_data/handparsed-treebank",
42 | 
43 |         # directory with the contents of https://nlp.stanford.edu/projects/stanza/bio/
44 |         # on the cluster, for example, /u/nlp/software/stanza/bio_ud
45 |         "BIO_UD_DIR": "extern_data/bio",
46 | 
47 |         # data root for other general input files, such as VI_VLSP
48 |         "STANZA_EXTERN_DIR": "extern_data",
49 |     }
50 | 
51 |     paths = { "DATA_ROOT" : DATA_ROOT }
52 |     for k, v in defaults.items():
53 |         paths[k] = os.environ.get(k, v)
54 | 
55 |     return paths
56 | 


--------------------------------------------------------------------------------
/stanza/tests/datasets/coref/test_hebrew_iahlt.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from stanza import Pipeline
 4 | from stanza.tests import TEST_MODELS_DIR
 5 | from stanza.utils.datasets.coref.convert_hebrew_iahlt import extract_doc
 6 | 
 7 | pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
 8 | 
 9 | @pytest.fixture(scope="module")
10 | def tokenizer():
11 |     pipe = Pipeline(lang="he", processors="tokenize", dir=TEST_MODELS_DIR, download_method=None)
12 |     return pipe
13 | 
14 | TEXT = """
15 | 
16 | 
17 | 
18 | מבולבלים​? גם אנחנו​: ל​מסעדנים ו​ה​מלצרים יש עוד סימני שאלה על ה​טיפים​
19 | 
20 | ה​פער בין פסיקת בית ה​דין ל​עבודה לבין פסיקה קודמת של בג"ץ​, משאיר את ה​ענף ב​חוסר וודאות​, ו​ה -​1 ב​ינואר כבר מעבר ל​פינה . "​מ​בחינת​י , הייתי מוסיף ל​תפריט תוספת שירות של 17​% "​, אמר בעלים של מסעדה ב​שדרות​
21 | 
22 | ב​רשות ה​מיסים מסתפקים ב​מסר עמום באשר ל​כוונותי​הם לאור פסק דין ה​טיפים ש​צפוי להיכנס ל​תוקפ​ו ב​-​1 ב​ינואר . על פי פרשנות​ם ה​מקצועית , הבהירו​, יש מקום לחייב את כספי ה​טיפים ב​מע"מ , "​עם זאת​, ה​רשות עדין בוחנת את ה​סוגיה ו​טרם התקבלה החלטה אופרטיבית ב​עניין "​. ו​איך אמורים ה​מסעדנים להיערך בינתיים ל​יישום ה​פסיקה ו​ל​מחזור ה​שנה ה​באה ? ב​יום חמישי יפגשו אנשי ארגון '​מסעדנים חזקים ביחד​' עם מנהל רשות ה​מיסים ערן יעקב​, ו​ידרשו תשובות ברורות​.​
23 | 
24 | "​אני עדיין לא מדבר עם ה​עובדים של​י , ו​אני גם לא יודע איך להיערך החל מ​עוד שבועיים​"​, אמר ל​'​דבר ראשון​' ניר שוחט​, ה​בעלים של מסעדת סושי מוטו ב​שדרות ו​מוסיף כי יהיה קשה להתאים את ה​פסיקה ל​מציאות ב​שטח . "​אף אחד לא יודע​. יש המון סתירות – עורך ה​דין אומר דבר אחד ו​רואה ה​חשבון דבר אחר​. עדיין לא הצליחו להבין את ה​חוק ל​אשור​ו "​.​
25 | 
26 | "​מ​בחינת​י , הייתי מוסיף ל​תפריט תוספת שירות של 17​% . זה יגלם גם את ה​מע"מ ו​ה​טיפים ו​מ​זה אני אשלם ל​מלצרים . די כבר עם ה​טיפים ה​אלה , מספיק​.​"​
27 | """
28 | 
29 | CLUSTER = {'metadata': {'name': 'המסעדנים', 'entity': 'person'}, 'mentions': [[28, 35, {}], [572, 581, {}]]}
30 | 
def test_extract_doc(tokenizer):
    """
    Extract a single document with one cluster of two mentions and check the resulting spans
    """
    doc = {
        'text': TEXT,
        'clusters': [CLUSTER],
        'metadata': {'doc_id': 'test'},
    }
    extracted = extract_doc(tokenizer, [doc])
    assert len(extracted) == 1

    coref_spans = extracted[0].coref_spans
    assert len(coref_spans) == 2
    assert coref_spans[1] == [(0, 4, 4)]
    assert coref_spans[6] == [(0, 3, 4)]
43 | 


--------------------------------------------------------------------------------
/stanza/models/lemma_classifier/baseline_model.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Baseline model for the existing lemmatizer which always predicts "be" and never "have" on the "'s" token.
 3 | 
 4 | The BaselineModel class can be updated to any arbitrary token and prediction lemma, not just "be" on the "'s" token.
 5 | """
 6 | 
 7 | import stanza
 8 | import os
 9 | from stanza.models.lemma_classifier.evaluate_models import evaluate_sequences
10 | from stanza.models.lemma_classifier.prepare_dataset import load_doc_from_conll_file
11 | 
class BaselineModel:
    """
    A baseline which always predicts the same lemma for one specific token text
    """

    def __init__(self, token_to_lemmatize, prediction_lemma, prediction_upos):
        # text of the token this baseline applies to
        self.token_to_lemmatize = token_to_lemmatize
        # lemma which is always predicted for that token
        self.prediction_lemma = prediction_lemma
        # container of upos tags; only words with one of these tags are evaluated
        self.prediction_upos = prediction_upos

    def predict(self, token):
        """
        Returns the fixed lemma if token is the target token, otherwise None
        """
        return self.prediction_lemma if token == self.token_to_lemmatize else None

    def evaluate(self, conll_path):
        """
        Evaluates the baseline model against the test set defined in conll_path.

        Only words whose text is the target token and whose upos is one of
        the prediction upos tags are scored.  One (possibly empty) sequence
        of gold lemmas and predicted lemmas is built per sentence.

        Returns a map where the keys are each class and the values are another map including the precision, recall and f1 scores
        for that class.

        Also returns confusion matrix. Keys are gold tags and inner keys are predicted tags
        """
        doc = load_doc_from_conll_file(conll_path)
        gold_tag_sequences, pred_tag_sequences = [], []
        for sentence in doc.sentences:
            targets = [word for word in sentence.words
                       if word.upos in self.prediction_upos and word.text == self.token_to_lemmatize]
            gold_tag_sequences.append([word.lemma for word in targets])
            pred_tag_sequences.append([self.prediction_lemma for _ in targets])

        # the third result, the weighted f1, is not reported by this method
        multiclass_result, confusion_mtx, _ = evaluate_sequences(gold_tag_sequences, pred_tag_sequences)
        return multiclass_result, confusion_mtx
47 | 
48 | 
if __name__ == "__main__":
    # evaluate the "'s" -> "be" baseline on the GUM training file next to this module
    coNLL_path = os.path.join(os.path.dirname(__file__), "en_gum-ud-train.conllu")
    bl_model = BaselineModel("'s", "be", ["AUX"])
    bl_model.evaluate(coNLL_path)
54 | 
55 | 


--------------------------------------------------------------------------------
/demo/Dependency_Visualization_Testing.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "id": "64b2a9e0",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "from stanza.utils.visualization.dependency_visualization import visualize_strings\n",
11 |     "\n",
12 |     "ar_strings = ['برلين ترفض حصول شركة اميركية على رخصة تصنيع دبابة \"ليوبارد\" الالمانية', \"هل بإمكاني مساعدتك؟\", \n",
13 |     "              \"أراك في مابعد\", \"لحظة من فضلك\"]\n",
14 |     "# Testing with right to left language\n",
15 |     "visualize_strings(ar_strings, \"ar\")"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "id": "35ef521b",
22 |    "metadata": {},
23 |    "outputs": [],
24 |    "source": [
25 |     "from stanza.utils.visualization.dependency_visualization import visualize_strings\n",
26 |     "\n",
27 |     "en_strings = [\"This is a sentence.\", \n",
28 |     "              \"He is wearing a red shirt\",\n",
29 |     "              \"Barack Obama was born in Hawaii. He was elected President of the United States in 2008.\"]\n",
30 |     "# Testing with left to right languages\n",
31 |     "visualize_strings(en_strings, \"en\")"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": null,
37 |    "id": "f3cf10ba",
38 |    "metadata": {},
39 |    "outputs": [],
40 |    "source": [
41 |     "from stanza.utils.visualization.dependency_visualization import visualize_strings\n",
42 |     "\n",
43 |     "zh_strings = [\"中国是一个很有意思的国家。\"]\n",
44 |     "# Testing with Chinese (left to right, no spaces between words)\n",
45 |     "visualize_strings(zh_strings, \"zh\")"
46 |    ]
47 |   },
48 |   {
49 |    "cell_type": "code",
50 |    "execution_count": null,
51 |    "id": "d2b9b574",
52 |    "metadata": {},
53 |    "outputs": [],
54 |    "source": []
55 |   }
56 |  ],
57 |  "metadata": {
58 |   "kernelspec": {
59 |    "display_name": "Python 3 (ipykernel)",
60 |    "language": "python",
61 |    "name": "python3"
62 |   },
63 |   "language_info": {
64 |    "codemirror_mode": {
65 |     "name": "ipython",
66 |     "version": 3
67 |    },
68 |    "file_extension": ".py",
69 |    "mimetype": "text/x-python",
70 |    "name": "python",
71 |    "nbconvert_exporter": "python",
72 |    "pygments_lexer": "ipython3",
73 |    "version": "3.9.22"
74 |   }
75 |  },
76 |  "nbformat": 4,
77 |  "nbformat_minor": 5
78 | }
79 | 


--------------------------------------------------------------------------------
/stanza/utils/datasets/ner/conll_to_iob.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Process a conll file into BIO
 3 | 
 4 | Includes the ability to process a file from a text file
 5 | or a text file within a zip
 6 | 
 7 | Main program extracts a piece of the zip file from the Danish DDT dataset
 8 | """
 9 | 
10 | import io
11 | import zipfile
12 | from zipfile import ZipFile
13 | from stanza.utils.conll import CoNLL
14 | 
def process_conll(input_file, output_file, zip_file=None, conversion=None, attr_prefix="name", allow_empty=False):
    """
    Convert a conll file with NER tags in the misc column to a tab separated BIO file

    input_file: which file to read, passed to CoNLL.conll2doc
    output_file: where to write the result, one "text<TAB>tag" line per token
      with a blank line after each sentence
    zip_file: passed to CoNLL.conll2doc along with input_file
      (for DDT, the path to ddt.zip)
    conversion: optional remapping of the tags other than "O".
      If a dict, the label after the first "-" is looked up and replaced
      when present.  Otherwise it is called with the whole tag.
    attr_prefix: which attribute to get from the misc field.
      A trailing "=" is added if missing.
    allow_empty: if True, a token without the attribute is tagged "O".
      Otherwise, a ValueError is raised for such a token.
    """
    if not attr_prefix.endswith("="):
        attr_prefix = attr_prefix + "="

    doc = CoNLL.conll2doc(input_file=input_file, zip_file=zip_file)

    with open(output_file, "w", encoding="utf-8") as fout:
        for sentence_idx, sentence in enumerate(doc.sentences):
            for token_idx, token in enumerate(sentence.tokens):
                # misc may be None or empty when the column has no attributes.
                # Treat that as "attribute not found" instead of crashing on None
                misc = token.misc.split("|") if token.misc else []
                for attr in misc:
                    if attr.startswith(attr_prefix):
                        ner = attr.split("=", 1)[1]
                        break
                else: # name= not found
                    if allow_empty:
                        ner = "O"
                    else:
                        raise ValueError("Could not find ner tag in document {}, sentence {}, token {}".format(input_file, sentence_idx, token_idx))

                if ner != "O" and conversion is not None:
                    if isinstance(conversion, dict):
                        # only the label is remapped; the B/I prefix is kept
                        bio, label = ner.split("-", 1)
                        if label in conversion:
                            label = conversion[label]
                        ner = "%s-%s" % (bio, label)
                    else:
                        ner = conversion(ner)
                fout.write("%s\t%s\n" % (token.text, ner))
            fout.write("\n")
54 | 
def main():
    """
    Extract the training section of the Danish DDT dataset from its zip file as a BIO file
    """
    process_conll(input_file="ddt.train.conllu",
                  output_file="data/ner/da_ddt.train.bio",
                  zip_file="extern_data/ner/da_ddt/ddt.zip")

if __name__ == '__main__':
    main()
60 | 


--------------------------------------------------------------------------------
/stanza/models/coref/predict.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | import json
 4 | import torch
 5 | from tqdm import tqdm
 6 | 
 7 | from stanza.models.coref.model import CorefModel
 8 | 
 9 | 
if __name__ == "__main__":
    # Command line entry point: load a coref model, run it over the
    # documents of the input file, and write the predicted clusters
    argparser = argparse.ArgumentParser()
    argparser.add_argument("experiment")
    argparser.add_argument("input_file")
    argparser.add_argument("output_file")
    argparser.add_argument("--config-file", default="config.toml")
    argparser.add_argument("--batch-size", type=int,
                           help="Adjust to override the config value if you're"
                                " experiencing out-of-memory issues")
    argparser.add_argument("--weights",
                           help="Path to file with weights to load."
                                " If not supplied, in the latest"
                                " weights of the experiment will be loaded;"
                                " if there aren't any, an error is raised.")
    args = argparser.parse_args()

    # the optimizer and scheduler state is only needed for training
    model = CorefModel.load_model(path=args.weights,
                                  map_location="cpu",
                                  ignore={"bert_optimizer", "general_optimizer",
                                          "bert_scheduler", "general_scheduler"})
    if args.batch_size:
        model.config.a_scoring_batch_size = args.batch_size
    model.training = False

    try:
        with open(args.input_file, encoding="utf-8") as fin:
            input_data = json.load(fin)
    except json.decoder.JSONDecodeError:
        # read the old jsonlines format if necessary
        with open(args.input_file, encoding="utf-8") as fin:
            text = "[" + ",\n".join(fin) + "]"
        input_data = json.loads(text)
    docs = [model.build_doc(doc) for doc in input_data]

    with torch.no_grad():
        for doc in tqdm(docs, unit="docs"):
            result = model.run(doc)
            doc["span_clusters"] = result.span_clusters
            doc["word_clusters"] = result.word_clusters

            # these keys are removed before the doc is written out
            for key in ("word2subword", "subwords", "word_id", "head2span"):
                del doc[key]

    # Write one document per line (jsonlines).  Without a separator the
    # documents run together and the file cannot be parsed again.
    # This layout is what the jsonlines fallback of the reader above parses.
    with open(args.output_file, mode="w", encoding="utf-8") as fout:
        for doc in docs:
            json.dump(doc, fout)
            fout.write("\n")
56 | 


--------------------------------------------------------------------------------
/scripts/config.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Set environment variables for the training and testing of stanza modules.
 4 | 
 5 | # Set UDBASE to the location of UD data folder
 6 | # The data should be CoNLL-U format
 7 | # For details, see
 8 | #   http://universaldependencies.org/conll18/data.html (CoNLL-18 UD data)
 9 | #   https://universaldependencies.org/
10 | # When rebuilding models based on Universal Dependencies, download the
11 | #   UD data to some directory, set UDBASE to that directory, and
12 | #   uncomment this line.  Alternatively, put UDBASE in your shell
13 | #   config, Windows env variables, etc as relevant.
14 | # export UDBASE=/path/to/UD
15 | 
16 | # Set NERBASE to the location of NER data folder
17 | # The data should be BIO format or convertable to that format
18 | # For details, see https://www.aclweb.org/anthology/W03-0419.pdf (CoNLL-03 NER paper)
19 | # There are other NER datasets, supported in
20 | #   stanza/utils/datasets/ner/prepare_ner_dataset.py
21 | # If rebuilding NER data, choose a location for the NER directory
22 | #   and set NERBASE to that variable.
23 | # export NERBASE=/path/to/NER
24 | 
25 | # Set CONSTITUENCY_BASE to the location of NER data folder
26 | # The data will be in some dataset-specific format
27 | # There is a conversion script which will turn this
28 | #   into a PTB style format
29 | #   stanza/utils/datasets/constituency/prepare_con_dataset.py
30 | # If processing constituency data, choose a location for the CON data
31 | #   and set CONSTITUENCY_BASE to that variable.
32 | # export CONSTITUENCY_BASE=/path/to/CON
33 | 
34 | # Set directories to store processed training/evaluation files
35 | # $DATA_ROOT is a default home for where all the outputs from the
36 | #   preparation scripts will go.  The training scripts will then look
37 | #   for the stanza formatted data in that directory.
38 | export DATA_ROOT=./data
39 | export TOKENIZE_DATA_DIR=$DATA_ROOT/tokenize
40 | export MWT_DATA_DIR=$DATA_ROOT/mwt
41 | export LEMMA_DATA_DIR=$DATA_ROOT/lemma
42 | export POS_DATA_DIR=$DATA_ROOT/pos
43 | export DEPPARSE_DATA_DIR=$DATA_ROOT/depparse
44 | export ETE_DATA_DIR=$DATA_ROOT/ete
45 | export NER_DATA_DIR=$DATA_ROOT/ner
46 | export CHARLM_DATA_DIR=$DATA_ROOT/charlm
47 | export CONSTITUENCY_DATA_DIR=$DATA_ROOT/constituency
48 | export SENTIMENT_DATA_DIR=$DATA_ROOT/sentiment
49 | 
50 | # Set directories to store external word vector data
51 | export WORDVEC_DIR=./extern_data/wordvec
52 | 


--------------------------------------------------------------------------------
/stanza/tests/datasets/ner/test_prepare_ner_file.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test some simple conversions of NER bio files
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | import json
 8 | 
 9 | from stanza.models.common.doc import Document
10 | from stanza.utils.datasets.ner.prepare_ner_file import process_dataset
11 | 
12 | BIO_1 = """
13 | Jennifer	B-PERSON
14 | Sh'reyan	I-PERSON
15 | has	O
16 | lovely	O
17 | antennae	O
18 | """.strip()
19 | 
20 | BIO_2 = """
21 | but	O
22 | I	O
23 | don't	O
24 | like	O
25 | the	O
26 | way	O
27 | Jennifer	B-PERSON
28 | treated	O
29 | Beckett	B-PERSON
30 | on	O
31 | the	O
32 | Cerritos	B-LOCATION
33 | """.strip()
34 | 
def check_json_file(doc, raw_text, expected_sentences, expected_tokens):
    """
    Check that a converted document agrees with the BIO text it was built from

    doc: the document to check, with .sentences, each of which has .tokens
    raw_text: the BIO text, with sentences separated by blank lines
    expected_sentences: how many sentences there should be
    expected_tokens: the expected length of each sentence.
      A single int is treated as a list with one length.

    Verifies the sentence count, the sentence lengths of both the raw text
    and the document, and the text and ner tag of each token.
    """
    raw_sentences = raw_text.strip().split("\n\n")
    assert len(raw_sentences) == expected_sentences
    if isinstance(expected_tokens, int):
        expected_tokens = [expected_tokens]

    # split each raw sentence into its token lines once, for use in both checks
    sentence_lines = [raw_sentence.strip().split("\n") for raw_sentence in raw_sentences]
    for lines, expected_len in zip(sentence_lines, expected_tokens):
        assert len(lines) == expected_len

    assert len(doc.sentences) == expected_sentences
    for sentence, expected_len in zip(doc.sentences, expected_tokens):
        assert len(sentence.tokens) == expected_len

    for sentence, lines in zip(doc.sentences, sentence_lines):
        for token, line in zip(sentence.tokens, lines):
            word, tag = line.strip().split()
            assert token.text == word
            assert token.ner == tag
51 | 
def write_and_convert(tmp_path, raw_text):
    """
    Write raw_text as a BIO file, convert it with process_dataset, and load the result

    tmp_path: a directory in which test.bio and json.bio are created
    raw_text: the BIO text to convert

    Returns a Document built from the converted json
    """
    bio_file = tmp_path / "test.bio"
    with open(bio_file, "w", encoding="utf-8") as fout:
        fout.write(raw_text)

    json_file = tmp_path / "json.bio"
    process_dataset(bio_file, json_file)

    # read with an explicit encoding, same as the write above, so the test
    # does not depend on the platform's default encoding
    # NOTE(review): presumably process_dataset writes utf-8 - confirm
    with open(json_file, encoding="utf-8") as fin:
        doc = Document(json.load(fin))

    return doc
64 | 
def run_test(tmp_path, raw_text, expected_sentences, expected_tokens):
    """
    Convert raw_text via a temporary file, then check the converted document against it
    """
    converted = write_and_convert(tmp_path, raw_text)
    check_json_file(converted, raw_text, expected_sentences, expected_tokens)
68 | 
def test_simple(tmp_path):
    """
    A single sentence of 5 tokens which starts with an entity
    """
    run_test(tmp_path, raw_text=BIO_1, expected_sentences=1, expected_tokens=5)
71 | 
def test_ner_at_end(tmp_path):
    """
    A single sentence of 12 tokens where the last token is an entity
    """
    run_test(tmp_path, raw_text=BIO_2, expected_sentences=1, expected_tokens=12)
74 | 
def test_two_sentences(tmp_path):
    """
    Both sample sentences in one file, separated by a blank line
    """
    combined = "\n\n".join([BIO_1, BIO_2])
    run_test(tmp_path, combined, expected_sentences=2, expected_tokens=[5, 12])
78 | 


--------------------------------------------------------------------------------