├── data ├── processed │ ├── mwt │ │ └── README.txt │ ├── ner │ │ └── README.txt │ ├── pos │ │ └── README.txt │ ├── depparse │ │ └── README.txt │ ├── lemma │ │ └── README.txt │ ├── tokenize │ │ └── README.txt │ └── charlm │ │ └── en │ │ └── test │ │ ├── README.txt │ │ ├── dev.txt │ │ ├── test.txt │ │ └── train │ │ ├── train-1.txt │ │ └── train-2.txt ├── udbase │ └── UD_English-TEST │ │ ├── en_test-ud-dev.txt │ │ ├── en_test-ud-test.txt │ │ ├── en_test-ud-train.txt │ │ ├── en_test-ud-dev.conllu │ │ ├── en_test-ud-test.conllu │ │ └── en_test-ud-train.conllu ├── wordvec │ └── word2vec │ │ └── English │ │ ├── en.vectors.xz │ │ └── en.vectors.txt └── nerbase │ └── English-SAMPLE │ ├── en_sample.dev.bio │ ├── en_sample.test.bio │ └── en_sample.train.bio ├── requirements.txt ├── config ├── config.sh └── xpos_vocab_factory.py ├── .gitignore └── README.md /data/processed/mwt/README.txt: -------------------------------------------------------------------------------- 1 | Training and test data generated by Stanza. -------------------------------------------------------------------------------- /data/processed/ner/README.txt: -------------------------------------------------------------------------------- 1 | Training and test data generated by Stanza. -------------------------------------------------------------------------------- /data/processed/pos/README.txt: -------------------------------------------------------------------------------- 1 | Training and test data generated by Stanza. -------------------------------------------------------------------------------- /data/processed/depparse/README.txt: -------------------------------------------------------------------------------- 1 | Training and test data generated by Stanza. -------------------------------------------------------------------------------- /data/processed/lemma/README.txt: -------------------------------------------------------------------------------- 1 | Training and test data generated by Stanza. 
-------------------------------------------------------------------------------- /data/processed/tokenize/README.txt: -------------------------------------------------------------------------------- 1 | Training and test data generated by Stanza. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | protobuf 3 | requests 4 | tqdm 5 | torch>=1.3.0 6 | -------------------------------------------------------------------------------- /data/processed/charlm/en/test/README.txt: -------------------------------------------------------------------------------- 1 | Training and test data for character language model. 2 | -------------------------------------------------------------------------------- /data/processed/charlm/en/test/dev.txt: -------------------------------------------------------------------------------- 1 | Barack Obama was born in Hawaii . He was elected president in 2008 . 2 | -------------------------------------------------------------------------------- /data/processed/charlm/en/test/test.txt: -------------------------------------------------------------------------------- 1 | Barack Obama was born in Hawaii . He was elected president in 2008 . 2 | -------------------------------------------------------------------------------- /data/udbase/UD_English-TEST/en_test-ud-dev.txt: -------------------------------------------------------------------------------- 1 | Barack Obama was born in Hawaii. He was elected president in 2008. 2 | -------------------------------------------------------------------------------- /data/udbase/UD_English-TEST/en_test-ud-test.txt: -------------------------------------------------------------------------------- 1 | Barack Obama was born in Hawaii. He was elected president in 2008. 
2 | -------------------------------------------------------------------------------- /data/processed/charlm/en/test/train/train-1.txt: -------------------------------------------------------------------------------- 1 | Barack Obama was born in Hawaii . He was elected president in 2008 . 2 | -------------------------------------------------------------------------------- /data/processed/charlm/en/test/train/train-2.txt: -------------------------------------------------------------------------------- 1 | Barack Obama was born in Hawaii . He was elected president in 2008 . 2 | -------------------------------------------------------------------------------- /data/udbase/UD_English-TEST/en_test-ud-train.txt: -------------------------------------------------------------------------------- 1 | Barack Obama was born in Hawaii. He was elected president in 2008. 2 | -------------------------------------------------------------------------------- /data/wordvec/word2vec/English/en.vectors.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/stanza-train/HEAD/data/wordvec/word2vec/English/en.vectors.xz -------------------------------------------------------------------------------- /data/nerbase/English-SAMPLE/en_sample.dev.bio: -------------------------------------------------------------------------------- 1 | Barack I-PER 2 | Obama I-PER 3 | was O 4 | born O 5 | in O 6 | Hawaii I-LOC 7 | . O 8 | 9 | He O 10 | was O 11 | elected O 12 | president O 13 | in O 14 | 2008 O 15 | . O 16 | -------------------------------------------------------------------------------- /data/nerbase/English-SAMPLE/en_sample.test.bio: -------------------------------------------------------------------------------- 1 | Barack I-PER 2 | Obama I-PER 3 | was O 4 | born O 5 | in O 6 | Hawaii I-LOC 7 | . O 8 | 9 | He O 10 | was O 11 | elected O 12 | president O 13 | in O 14 | 2008 O 15 | . 
O 16 | -------------------------------------------------------------------------------- /data/nerbase/English-SAMPLE/en_sample.train.bio: -------------------------------------------------------------------------------- 1 | Barack I-PER 2 | Obama I-PER 3 | was O 4 | born O 5 | in O 6 | Hawaii I-LOC 7 | . O 8 | 9 | He O 10 | was O 11 | elected O 12 | president O 13 | in O 14 | 2008 O 15 | . O 16 | -------------------------------------------------------------------------------- /data/wordvec/word2vec/English/en.vectors.txt: -------------------------------------------------------------------------------- 1 | 10 5 2 | Barack 0.01613954 0.00141043 -0.00869777 0.000911 0.01950155 3 | Obama -0.00907914 0.01053656 -0.00389627 -0.00673913 -0.00667982 4 | was -0.00046209 0.01675782 0.00450974 0.00875711 -0.00223494 5 | born -0.02178387 0.01755228 -0.00446462 0.00476047 0.02028277 6 | in 0.00124867 0.01410756 0.01728466 0.01355088 -0.00336146 7 | Hawaii 0.00582745 -0.01101075 -0.00198883 0.01841053 0.00072485 8 | . 0.00745728 -0.00108565 0.01947713 0.00447089 -0.01529367 9 | He -0.00628661 -0.0084458 0.00466739 -0.00817884 -0.02236676 10 | elected 0.00366836 -0.00218679 0.01713075 -0.0119266 -0.0078803 11 | president 0.0030667 0.01066898 -0.01944919 0.00631905 0.00310773 12 | -------------------------------------------------------------------------------- /data/udbase/UD_English-TEST/en_test-ud-dev.conllu: -------------------------------------------------------------------------------- 1 | # text = Barack Obama was born in Hawaii. 2 | 1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ _ 3 | 2 Obama Obama PROPN NNP Number=Sing 1 flat _ _ 4 | 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ _ 5 | 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ _ 6 | 5 in in ADP IN _ 6 case _ _ 7 | 6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ SpaceAfter=No 8 | 7 . . PUNCT . _ 4 punct _ _ 9 | 10 | # text = He was elected president in 2008. 
11 | 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ _ 12 | 2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ _ 13 | 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ _ 14 | 4 president president PROPN NNP Number=Sing 3 xcomp _ _ 15 | 5 in in ADP IN _ 6 case _ _ 16 | 6 2008 2008 NUM CD NumType=Card 3 obl _ SpaceAfter=No 17 | 7 . . PUNCT . _ 3 punct _ _ 18 | 19 | -------------------------------------------------------------------------------- /data/udbase/UD_English-TEST/en_test-ud-test.conllu: -------------------------------------------------------------------------------- 1 | # text = Barack Obama was born in Hawaii. 2 | 1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ _ 3 | 2 Obama Obama PROPN NNP Number=Sing 1 flat _ _ 4 | 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ _ 5 | 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ _ 6 | 5 in in ADP IN _ 6 case _ _ 7 | 6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ SpaceAfter=No 8 | 7 . . PUNCT . _ 4 punct _ _ 9 | 10 | # text = He was elected president in 2008. 11 | 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ _ 12 | 2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ _ 13 | 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ _ 14 | 4 president president PROPN NNP Number=Sing 3 xcomp _ _ 15 | 5 in in ADP IN _ 6 case _ _ 16 | 6 2008 2008 NUM CD NumType=Card 3 obl _ SpaceAfter=No 17 | 7 . . PUNCT . _ 3 punct _ _ 18 | 19 | -------------------------------------------------------------------------------- /data/udbase/UD_English-TEST/en_test-ud-train.conllu: -------------------------------------------------------------------------------- 1 | # text = Barack Obama was born in Hawaii. 
2 | 1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ _ 3 | 2 Obama Obama PROPN NNP Number=Sing 1 flat _ _ 4 | 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ _ 5 | 4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ _ 6 | 5 in in ADP IN _ 6 case _ _ 7 | 6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ SpaceAfter=No 8 | 7 . . PUNCT . _ 4 punct _ _ 9 | 10 | # text = He was elected president in 2008. 11 | 1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ _ 12 | 2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ _ 13 | 3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ _ 14 | 4 president president PROPN NNP Number=Sing 3 xcomp _ _ 15 | 5 in in ADP IN _ 6 case _ _ 16 | 6 2008 2008 NUM CD NumType=Card 3 obl _ SpaceAfter=No 17 | 7 . . PUNCT . _ 3 punct _ _ 18 | 19 | -------------------------------------------------------------------------------- /config/config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Set environment variables for the training and testing of stanza modules. 
4 | 5 | # Set UDBASE to the location of UD data folder 6 | # The data should be CoNLL-U format 7 | # For details, see http://universaldependencies.org/conll18/data.html (CoNLL-18 UD data) 8 | export UDBASE=../data/udbase 9 | 10 | # Set NERBASE to the location of NER data folder 11 | # The data should be BIO format 12 | # For details, see https://www.aclweb.org/anthology/W03-0419.pdf (CoNLL-03 NER paper) 13 | export NERBASE=../data/nerbase 14 | 15 | # Set directories to store processed training/evaluation files 16 | export DATA_ROOT=../data/processed 17 | export TOKENIZE_DATA_DIR=$DATA_ROOT/tokenize 18 | export MWT_DATA_DIR=$DATA_ROOT/mwt 19 | export LEMMA_DATA_DIR=$DATA_ROOT/lemma 20 | export POS_DATA_DIR=$DATA_ROOT/pos 21 | export DEPPARSE_DATA_DIR=$DATA_ROOT/depparse 22 | export ETE_DATA_DIR=$DATA_ROOT/ete 23 | export NER_DATA_DIR=$DATA_ROOT/ner 24 | export CHARLM_DATA_DIR=$DATA_ROOT/charlm 25 | 26 | # Set directories to store external word vector data 27 | export WORDVEC_DIR=../data/wordvec 28 | -------------------------------------------------------------------------------- /config/xpos_vocab_factory.py: -------------------------------------------------------------------------------- 1 | # This is the XPOS factory method generated automatically from models.pos.build_xpos_factory. 2 | # Please don't edit it! 
from stanza.models.pos.vocab import WordVocab, XPOSVocab


def xpos_vocab_factory(data, shorthand):
    """Build the XPOS vocabulary object registered for a treebank shorthand.

    The shorthand is looked up in an ordered table of known treebanks; the
    first group that contains it decides which vocabulary class is
    instantiated and with which extra keyword argument.

    Args:
        data: passed through unchanged as the first argument of the
            vocabulary constructor.
        shorthand: treebank identifier such as "en_ewt" or "fi_ftb".

    Returns:
        An ``XPOSVocab`` built with the group's ``sep`` value, or a
        ``WordVocab`` built with ``ignore=["_"]``, always with ``idx=2``.

    Raises:
        NotImplementedError: if ``shorthand`` is not in any known group.
    """
    # Each row: (treebank shorthands, vocabulary class, extra keyword arguments).
    # Rows are checked top to bottom, in the same order as the generated
    # if/elif chain this table replaces.
    # NOTE(review): `sep` is presumably the delimiter used to split a tag into
    # components -- confirm against XPOSVocab before relying on that.
    vocab_table = (
        (
            (
                "af_afribooms", "grc_perseus", "ar_padt", "bg_btb", "hr_set",
                "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut",
                "fr_partut", "gl_ctg", "it_isdt", "it_partut", "it_postwita",
                "it_twittiro", "it_vit", "ja_gsd", "lv_lvtb", "lt_alksnis",
                "ro_nonstandard", "ro_rrt", "gd_arcosg", "sr_set", "sk_snk",
                "sl_ssj", "ta_ttb", "uk_iu", "gl_treegal", "la_perseus",
                "sl_sst",
            ),
            XPOSVocab,
            {"sep": ""},
        ),
        (
            (
                "en_test", "grc_proiel", "hy_armtdp", "eu_bdt", "be_hse",
                "ca_ancora", "zh-hant_gsd", "zh-hans_gsdsimp", "lzh_kyoto",
                "cop_scriptorium", "da_ddt", "en_ewt", "en_gum", "et_edt",
                "fi_tdt", "fr_ftb", "fr_gsd", "fr_sequoia", "fr_spoken",
                "de_gsd", "de_hdt", "got_proiel", "el_gdt", "he_htb",
                "hi_hdtb", "hu_szeged", "ga_idt", "ja_bccwj", "la_proiel",
                "lt_hse", "mt_mudt", "mr_ufal", "nb_bokmaal", "nn_nynorsk",
                "nn_nynorsklia", "cu_proiel", "fro_srcmf", "orv_torot",
                "fa_seraji", "pt_bosque", "pt_gsd", "ru_gsd", "ru_syntagrus",
                "ru_taiga", "es_ancora", "es_gsd", "swl_sslc", "te_mtg",
                "tr_imst", "ug_udt", "vi_vtb", "wo_wtb", "bxr_bdt", "et_ewt",
                "kk_ktb", "kmr_mg", "olo_kkpp", "sme_giella", "hsb_ufal",
            ),
            WordVocab,
            {"ignore": ["_"]},
        ),
        (
            ("nl_alpino", "nl_lassysmall", "la_ittb", "sv_talbanken"),
            XPOSVocab,
            {"sep": "|"},
        ),
        (
            ("en_lines", "sv_lines", "ur_udtb"),
            XPOSVocab,
            {"sep": "-"},
        ),
        (
            ("fi_ftb",),
            XPOSVocab,
            {"sep": ","},
        ),
        (
            ("id_gsd", "ko_gsd", "ko_kaist"),
            XPOSVocab,
            {"sep": "+"},
        ),
        (
            ("pl_lfg", "pl_pdb"),
            XPOSVocab,
            {"sep": ":"},
        ),
    )

    for treebanks, vocab_cls, extra_kwargs in vocab_table:
        if shorthand in treebanks:
            return vocab_cls(data, shorthand, idx=2, **extra_kwargs)

    raise NotImplementedError('Language shorthand "{}" not found!'.format(shorthand))
| -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # kept from original 2 | .DS_Store 3 | *.tmp 4 | *.pkl 5 | *.conllu 6 | *.lem 7 | *.toklabels 8 | 9 | # standard github python project gitignore 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # 
According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | 126 | # Spyder project settings 127 | .spyderproject 128 | .spyproject 129 | 130 | # Rope project settings 131 | .ropeproject 132 | 133 | # mkdocs documentation 134 | /site 135 | 136 | # mypy 137 | .mypy_cache/ 138 | .dmypy.json 139 | dmypy.json 140 | 141 | # Pyre type checker 142 | .pyre/ 143 | 144 | # pytype static type analyzer 145 | .pytype/ 146 | 147 | # Cython debug symbols 148 | cython_debug/ 149 | 150 | 151 | # ignore the version of stanza we download into the training repo 152 | stanza/ 153 | 154 | # ignore the artifacts produced by preparing training data 155 | data/processed/mwt/en_test*json 156 | data/processed/ner/en_sample*json 157 | data/processed/tokenize/en_test*json 158 | data/processed/tokenize/en_test*txt 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
