├── ELMo ├── fine-tune-elmo │ ├── bilm │ │ ├── init.py │ │ ├── __pycache__ │ │ │ ├── data.cpython-36.pyc │ │ │ ├── elmo.cpython-36.pyc │ │ │ ├── model.cpython-36.pyc │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── training.cpython-36.pyc │ │ ├── __init__.py │ │ └── elmo.py │ ├── bilm.egg-info │ │ ├── not-zip-safe │ │ ├── top_level.txt │ │ ├── dependency_links.txt │ │ ├── SOURCES.txt │ │ └── PKG-INFO │ ├── dist │ │ └── bilm-0.1-py3.6.egg │ ├── tests │ │ ├── fixtures │ │ │ ├── model │ │ │ │ ├── lm_weights.hdf5 │ │ │ │ ├── lm_embeddings_0.hdf5 │ │ │ │ ├── lm_embeddings_1.hdf5 │ │ │ │ ├── lm_embeddings_2.hdf5 │ │ │ │ ├── options.json │ │ │ │ ├── lm_embeddings_sentences.json │ │ │ │ └── vocab_test.txt │ │ │ ├── train │ │ │ │ ├── vocab.txt │ │ │ │ └── data.txt │ │ │ └── data │ │ │ │ └── vocab_test.txt │ │ ├── __pycache__ │ │ │ ├── test_data.cpython-36.pyc │ │ │ ├── test_elmo.cpython-36.pyc │ │ │ ├── test_model.cpython-36.pyc │ │ │ └── test_training.cpython-36.pyc │ │ ├── test_elmo.py │ │ └── test_training.py │ ├── weight.sh │ ├── build │ │ └── lib │ │ │ └── bilm │ │ │ ├── __init__.py │ │ │ └── elmo.py │ ├── run_tests_before_shell.sh │ ├── train.sh │ ├── setup.py │ ├── bin │ │ ├── dump_weights.py │ │ ├── run_test.py │ │ ├── restart.py │ │ └── train_elmo.py │ ├── run.sh │ ├── Dockerfile │ ├── usage_cached.py │ ├── usage_character.py │ ├── log │ └── usage_token.py └── use_ELMo │ └── get_elmo_bin.py ├── biaffine-parser-elmo ├── data │ ├── __init__.py │ ├── __pycache__ │ │ ├── Vocab.cpython-36.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── Dataloader.cpython-36.pyc │ │ └── Dependency.cpython-36.pyc │ ├── Dataloader.py │ └── Dependency.py ├── driver │ ├── __init__.py │ ├── __pycache__ │ │ ├── MST.cpython-36.pyc │ │ ├── Config.cpython-36.pyc │ │ ├── Layer.cpython-36.pyc │ │ ├── Model.cpython-36.pyc │ │ ├── Parser.cpython-36.pyc │ │ └── __init__.cpython-36.pyc │ ├── Test.py │ ├── Parser.py │ └── Config.py ├── run.sh └── config.cfg ├── multi_task_learning ├── data │ ├── __init__.py │ ├── __pycache__ │ │ ├── Vocab.cpython-35.pyc │ │ ├── Vocab.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── Dataloader.cpython-35.pyc │ │ ├── Dataloader.cpython-36.pyc │ │ ├── Dependency.cpython-35.pyc │ │ └── Dependency.cpython-36.pyc │ ├── Dataloader.py │ ├── Dependency.py │ └── Vocab.py ├── driver │ ├── __init__.py │ ├── .Train.swp │ ├── __pycache__ │ │ ├── Layer.cpython-35.pyc │ │ ├── Layer.cpython-36.pyc │ │ ├── MST.cpython-35.pyc │ │ ├── MST.cpython-36.pyc │ │ ├── Model.cpython-35.pyc │ │ ├── Model.cpython-36.pyc │ │ ├── Config.cpython-35.pyc │ │ ├── Config.cpython-36.pyc │ │ ├── Parser.cpython-35.pyc │ │ ├── Parser.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ └── __init__.cpython-36.pyc │ ├── Test.py │ ├── Parser.py │ └── Config.py ├── run.sh └── config.cfg ├── biaffine-parser-concat ├── data │ ├── __init__.py │ ├── __pycache__ │ │ ├── Vocab.cpython-35.pyc │ │ ├── Vocab.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── Dataloader.cpython-35.pyc │ │ ├── Dataloader.cpython-36.pyc │ │ ├── Dependency.cpython-35.pyc │ │ └── Dependency.cpython-36.pyc │ ├── Dataloader.py │ └── Dependency.py ├── driver │ ├── __init__.py │ ├── __pycache__ │ │ ├── MST.cpython-35.pyc │ │ ├── MST.cpython-36.pyc │ │ ├── Config.cpython-35.pyc │ │ ├── Config.cpython-36.pyc │ │ ├── Layer.cpython-35.pyc │ │ ├── Layer.cpython-36.pyc │ │ ├── Model.cpython-35.pyc │ │ ├── Model.cpython-36.pyc │ │ ├── Parser.cpython-35.pyc │ │ ├── Parser.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc 
│ │ └── __init__.cpython-36.pyc │ ├── Test.py │ ├── Parser.py │ ├── Model.py │ └── Config.py ├── run.sh └── config.cfg ├── biaffine-parser-domain-embedding ├── data │ ├── __init__.py │ ├── __pycache__ │ │ ├── Vocab.cpython-35.pyc │ │ ├── Vocab.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── Dataloader.cpython-35.pyc │ │ ├── Dataloader.cpython-36.pyc │ │ ├── Dependency.cpython-35.pyc │ │ └── Dependency.cpython-36.pyc │ ├── Dataloader.py │ └── Dependency.py ├── driver │ ├── __init__.py │ ├── __pycache__ │ │ ├── MST.cpython-35.pyc │ │ ├── MST.cpython-36.pyc │ │ ├── Config.cpython-35.pyc │ │ ├── Config.cpython-36.pyc │ │ ├── Layer.cpython-35.pyc │ │ ├── Layer.cpython-36.pyc │ │ ├── Model.cpython-35.pyc │ │ ├── Model.cpython-36.pyc │ │ ├── Parser.cpython-35.pyc │ │ ├── Parser.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ └── __init__.cpython-36.pyc │ ├── Test.py │ ├── Parser.py │ ├── Model.py │ └── Config.py ├── run.sh └── config.cfg └── readme.md /ELMo/fine-tune-elmo/bilm/init.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multi_task_learning/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multi_task_learning/driver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | bilm 2 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/multi_task_learning/driver/.Train.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/.Train.swp -------------------------------------------------------------------------------- /multi_task_learning/run.sh: -------------------------------------------------------------------------------- 1 | exe=../driver/Train.py 2 | nohup python3 -u $exe --config_file=config.cfg > log.train 2>&1 & 3 | 4 | 5 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/dist/bilm-0.1-py3.6.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/dist/bilm-0.1-py3.6.egg -------------------------------------------------------------------------------- /biaffine-parser-elmo/run.sh: -------------------------------------------------------------------------------- 1 | exe=../driver/Train-corpus-weighting.py 2 | nohup python3 $exe --config_file=config.cfg > log.train 2>&1 & 3 | 4 | 5 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/run.sh: -------------------------------------------------------------------------------- 1 | exe=../driver/Train-corpus-weighting.py 2 | nohup python3 -u $exe --config_file=config.cfg > log.train 2>&1 & 3 | 4 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm/__pycache__/data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/data.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm/__pycache__/elmo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/elmo.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/model/lm_weights.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_weights.hdf5 -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/run.sh: -------------------------------------------------------------------------------- 1 | #exe=../driver/Train.py 2 | exe=../driver/Train-corpus-weighting.py 3 | nohup python3 $exe --config_file=config.cfg > log.train 2>&1 & 4 | 5 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/data/__pycache__/Vocab.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/Vocab.cpython-36.pyc 
-------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/__pycache__/MST.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/MST.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/Vocab.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Vocab.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/Vocab.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Vocab.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Layer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Layer.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Layer.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/MST.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/MST.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/MST.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/MST.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Model.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Model.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Model.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
/ELMo/fine-tune-elmo/bilm/__pycache__/training.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/training.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_0.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_0.hdf5 -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_1.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_1.hdf5 -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_2.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_2.hdf5 -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/Vocab.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Vocab.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/Vocab.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Vocab.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/MST.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/MST.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/MST.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/MST.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-elmo/data/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/__pycache__/Config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Config.cpython-36.pyc -------------------------------------------------------------------------------- 
/biaffine-parser-elmo/driver/__pycache__/Layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Layer.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/__pycache__/Model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Model.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/__pycache__/Parser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Parser.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Config.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Config.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Config.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Parser.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Parser.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/Parser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Parser.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/__pycache__/test_data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_data.cpython-36.pyc -------------------------------------------------------------------------------- 
/ELMo/fine-tune-elmo/tests/__pycache__/test_elmo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_elmo.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/__pycache__/test_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_model.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/Config.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Config.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/Config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Config.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/Layer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Layer.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/Layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Layer.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/Model.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Model.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/Model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Model.cpython-36.pyc -------------------------------------------------------------------------------- 
/biaffine-parser-concat/driver/__pycache__/Parser.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Parser.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/Parser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Parser.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-elmo/data/__pycache__/Dataloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/Dataloader.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-elmo/data/__pycache__/Dependency.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/Dependency.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/Dataloader.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dataloader.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/Dataloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dataloader.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/Dependency.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dependency.cpython-35.pyc -------------------------------------------------------------------------------- /multi_task_learning/data/__pycache__/Dependency.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dependency.cpython-36.pyc -------------------------------------------------------------------------------- /multi_task_learning/driver/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- 
/multi_task_learning/driver/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/__pycache__/test_training.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_training.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/Dataloader.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dataloader.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/Dataloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dataloader.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/Dependency.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dependency.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/data/__pycache__/Dependency.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dependency.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-36.pyc 
-------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-36.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/weight.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=6 2 | nohup python ./bin/dump_weights.py \ 3 | --save_dir 
./domain_fine_tune_iter8/checkpoint\ 4 | --outfile='./checkpoint/weights.hdf5' 2>&1 & 5 | 6 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data import Batcher, TokenBatcher 3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \ 4 | dump_bilm_embeddings 5 | from .elmo import weight_layers 6 | 7 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/build/lib/bilm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data import Batcher, TokenBatcher 3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \ 4 | dump_bilm_embeddings 5 | from .elmo import weight_layers 6 | 7 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/run_tests_before_shell.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Verify environment by running tests 4 | # before dropping into a shell. 5 | set -e 6 | PYTHONDONTWRITEBYTECODE=1 python -m unittest discover tests/ 7 | echo "Tests passed!" 8 | 9 | /bin/bash -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/train.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6 2 | python ./bin/train_elmo.py \ 3 | --train_prefix='/data1/yli/giga-train-data/split/sens.train.split.*' \ 4 | --vocab_file '/data1/yli/giga-train-data/dict.train'\ 5 | --save_dir /data1/yli/elmo-chinese-model/checkpoint 6 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | bilm/__init__.py 4 | bilm/data.py 5 | bilm/elmo.py 6 | bilm/model.py 7 | bilm/training.py 8 | bilm.egg-info/PKG-INFO 9 | bilm.egg-info/SOURCES.txt 10 | bilm.egg-info/dependency_links.txt 11 | bilm.egg-info/not-zip-safe 12 | bilm.egg-info/top_level.txt -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: bilm 3 | Version: 0.1 4 | Summary: UNKNOWN 5 | Home-page: http://github.com/allenai/bilm-tf 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description-Content-Type: UNKNOWN 10 | Description: UNKNOWN 11 | Platform: UNKNOWN 12 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import setuptools 3 | 4 | setuptools.setup( 5 | name='bilm', 6 | version='0.1', 7 | url='http://github.com/allenai/bilm-tf', 8 | packages=setuptools.find_packages(), 9 | tests_require=[], 10 | zip_safe=False, 11 | entry_points='', 12 | ) 13 | 14 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/model/options.json: -------------------------------------------------------------------------------- 1 | {"lstm": {"cell_clip": 3, "use_skip_connections": false, "n_layers": 2, "proj_clip": 3, 
"projection_dim": 16, "dim": 64}, "char_cnn": {"embedding": {"dim": 4}, "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], "n_highway": 2, "n_characters": 262, "max_characters_per_token": 50, "activation": "relu"}} 2 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/train/vocab.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | of 5 | the 6 | neural 7 | . 8 | words 9 | , 10 | in 11 | language 12 | models 13 | to 14 | a 15 | vocabulary 16 | embeddings 17 | use 18 | - 19 | are 20 | or 21 | make 22 | and 23 | increases 24 | as 25 | exponentially 26 | problem 27 | sequences 28 | continuous 29 | networks 30 | net 31 | number 32 | larger 33 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bin/dump_weights.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import argparse 4 | import sys 5 | sys.path.extend(["../../","../","./"]) 6 | 7 | from training import dump_weights as dw 8 | 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--save_dir', help='Location of checkpoint files') 13 | parser.add_argument('--outfile', help='Output hdf5 file with weights') 14 | 15 | args = parser.parse_args() 16 | dw(args.save_dir, args.outfile) 17 | 18 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/run.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=5,6,7 2 | nohup python -u /data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/bin/restart.py \ 3 | --train_prefix='/data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/experiment/fine-tune-giga1100-elmo/4domain/data/4domain-trainFile/train_elmo_data.file.*'\ 4 | --vocab_file '/data/xpeng/elmo/Train_ELMo/bilm-tf/elmo-chinese-model/dict.train'\ 5 | --save_dir '/data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/experiment/fine-tune-giga1100-elmo/4domain/4domain_fine_tune_iter8/checkpoint'\ 6 | > /data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/experiment/fine-tune-giga1100-elmo/4domain/4domain_fine_tune_iter8/log.4domain-fine-tune-giga1100-model-iter8 2>&1 & 7 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/train/data.txt: -------------------------------------------------------------------------------- 1 | neural language models use continuous representations or embeddings of words to make their predictions . 2 | these models make use of neural networks . 3 | continuous space embeddings help to alleviate the curse of dimensionality in language modeling : as language models are trained on larger and larger texts , the number of unique words ( the vocabulary ) increases and the number of possible sequences of words increases exponentially with the size of the vocabulary , causing a data sparsity problem because for each of the exponentially many sequences , statistics are needed to properly estimate probabilities . 4 | neural networks avoid this problem by representing words in a distributed way , as non - linear combinations of weights in a neural net . 5 | the neural net architecture might be feed - forward or recurrent . 
-------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/data/vocab_test.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | the 5 | , 6 | . 7 | to 8 | of 9 | and 10 | a 11 | in 12 | " 13 | 's 14 | that 15 | for 16 | on 17 | is 18 | The 19 | was 20 | with 21 | said 22 | as 23 | at 24 | it 25 | by 26 | from 27 | be 28 | have 29 | he 30 | has 31 | his 32 | are 33 | an 34 | ) 35 | not 36 | ( 37 | will 38 | who 39 | I 40 | had 41 | their 42 | -- 43 | were 44 | they 45 | but 46 | been 47 | this 48 | which 49 | more 50 | or 51 | its 52 | would 53 | about 54 | : 55 | after 56 | up 57 | $ 58 | one 59 | than 60 | also 61 | 't 62 | out 63 | her 64 | you 65 | year 66 | when 67 | It 68 | two 69 | people 70 | - 71 | all 72 | can 73 | over 74 | last 75 | first 76 | But 77 | into 78 | ' 79 | He 80 | A 81 | we 82 | In 83 | she 84 | other 85 | new 86 | years 87 | could 88 | there 89 | ? 90 | time 91 | some 92 | them 93 | if 94 | no 95 | percent 96 | so 97 | what 98 | only 99 | government 100 | million 101 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | python >= 3.6 3 | pytorch == 0.3.0 4 | ## Dataset 5 | | |BC |PB |ZX | 6 | |:--------:|:-----:|:-----: |:----:| 7 | |train | 52,433|5,140 |1,649 | 8 | |dev | 998 |1,300 |500 | 9 | |test |1,995 |2,600 | 1,100| 10 | |unlabeled | - | 326,981|32,492| 11 | 12 | ## Performance 13 | Final results on the test data: 14 | 15 | | | PB(uas) | PB(las) |ZX(uas) | ZX(las)| 16 | |:------------:|:----------:|:-------:|:-------:|:-------:| 17 | |BC-train |67.55 |61.01 |**68.44**|59.55 | 18 | |PB-train |**74.52** |**69.02** |51.62 |40.36 | 19 | |ZX-train |52.24 |42.76 |68.14 |**61.71**| 20 | | MTL | 75.39 |69.69 |72.11 |65.66 | 21 | | concat | 77.49 | 72.16 |76.80 |70.85 | 22 | | DOEMB | 78.24 |72.81 |77.96 |72.01 | 23 | | +ELMo |77.62 |72.35 |78.50 |72.49 | 24 | | +Fine-tuning | **82.05** |**77.216**|**80.44**|**75.11**| 25 | 26 | ## Usage 27 | 28 | 29 | -------------------------------------------------------------------------------- /biaffine-parser-concat/config.cfg: -------------------------------------------------------------------------------- 1 | [Data] 2 | pretrained_embeddings_file = ./data/giga.100.txt 3 | data_dir =./data 4 | train_file = %(data_dir)s/cdt-train.conll 5 | dev_file = %(data_dir)s/comment-dev.conll 6 | test_file = %(data_dir)s/comment-test.conll 7 | min_occur_count = 2 8 | 9 | [Save] 10 | save_dir =./best-model 11 | config_file = %(save_dir)s/config.cfg 12 | save_model_path = %(save_dir)s/model 13 | save_vocab_path = %(save_dir)s/vocab 14 | load_dir = ./best-model 15 | load_model_path = %(load_dir)s/model 16 | load_vocab_path = %(load_dir)s/vocab 17 | 18 | [Network] 19 | lstm_layers = 2 20 | word_dims = 100 21 | tag_dims = 100 22 | dropout_emb = 0.33 23 | lstm_hiddens = 300 24 | dropout_lstm_input = 0.33 25 | dropout_lstm_hidden = 0.33 26 | mlp_arc_size = 200 27 | mlp_rel_size = 100 28 | dropout_mlp = 0.33 29 | 30 | [Optimizer] 31 | learning_rate = 2e-3 32 | decay = .75 33 | decay_steps = 5000 34 | beta_1 = .9 35 | beta_2 = .9 36 | epsilon = 1e-12 37 | clip = 5.0 38 | 39 | [Run] 40 | num_buckets_train = 40 41 | num_buckets_valid = 10 42 | num_buckets_test = 10 43 | train_iters = 50000 44 | train_batch_size = 50 45 | test_batch_size = 100 46 | validate_every = 165 47 | save_after = 50 48 | 
update_every = 4 49 | 50 | [corous-weighting] 51 | domain_num_corpus_weighting=4700 52 | cdt_domain_ration=1 53 | domain_file=./data/comment-train.conll 54 | 55 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/config.cfg: -------------------------------------------------------------------------------- 1 | [Data] 2 | pretrained_embeddings_file = ./data/giga.100.txt 3 | data_dir = ./treebank 4 | train_file = %(data_dir)s/PC-BC.conll 5 | dev_file = %(data_dir)s/PC/PC-Dev.conll 6 | test_file = %(data_dir)s/PC/PC-Test.conll 7 | min_occur_count = 2 8 | 9 | [Save] 10 | save_dir =./best-model 11 | config_file = %(save_dir)s/config.cfg 12 | save_model_path = %(save_dir)s/model 13 | save_vocab_path = %(save_dir)s/vocab 14 | load_dir = ./best-model 15 | load_model_path = %(load_dir)s/model 16 | load_vocab_path = %(load_dir)s/vocab 17 | 18 | [Network] 19 | lstm_layers = 2 20 | word_dims = 100 21 | tag_dims = 100 22 | dropout_emb = 0.33 23 | lstm_hiddens = 300 24 | dropout_lstm_input = 0.33 25 | dropout_lstm_hidden = 0.33 26 | mlp_arc_size = 200 27 | mlp_rel_size = 100 28 | dropout_mlp = 0.33 29 | 30 | [Optimizer] 31 | learning_rate = 2e-3 32 | decay = .75 33 | decay_steps = 5000 34 | beta_1 = .9 35 | beta_2 = .9 36 | epsilon = 1e-12 37 | clip = 5.0 38 | 39 | [Run] 40 | num_buckets_train = 40 41 | num_buckets_valid = 10 42 | num_buckets_test = 10 43 | train_iters = 50000 44 | train_batch_size = 50 45 | test_batch_size = 100 46 | validate_every = 165 47 | save_after = 50 48 | update_every = 4 49 | 50 | [corous-weighting] 51 | domain_num_corpus_weighting=1500 52 | cdt_domain_ration=11 53 | domain_data_size=6892 54 | 55 | [Treebankid] 56 | tbank_emb_size=12 57 | 58 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 2 | 3 | ENV LC_ALL=C.UTF-8 4 | ENV LANG=C.UTF-8 5 | ENV PATH /opt/conda/bin:$PATH 6 | 7 | WORKDIR /stage 8 | 9 | # Install base packages. 10 | RUN apt-get update --fix-missing && apt-get install -y \ 11 | bzip2 \ 12 | ca-certificates \ 13 | curl \ 14 | gcc \ 15 | git \ 16 | libc-dev \ 17 | libglib2.0-0 \ 18 | libsm6 \ 19 | libxext6 \ 20 | libxrender1 \ 21 | wget \ 22 | libevent-dev \ 23 | build-essential && \ 24 | rm -rf /var/lib/apt/lists/* 25 | 26 | # Install Anaconda. 27 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 28 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ 29 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 30 | rm ~/miniconda.sh 31 | 32 | # Use python 3.5 33 | RUN conda install python=3.5 34 | 35 | # Copy files. 36 | COPY bilm/ bilm/ 37 | COPY bin/ bin/ 38 | COPY tests/ tests/ 39 | COPY README.md README.md 40 | COPY setup.py setup.py 41 | COPY usage*.py ./ 42 | COPY run_tests_before_shell.sh run_tests_before_shell.sh 43 | 44 | # Install package. 45 | RUN pip install tensorflow-gpu==1.2 h5py 46 | RUN python setup.py install 47 | 48 | # Run tests to verify the Docker build 49 | # and then drop into a shell. 
50 | CMD ["/bin/bash", "run_tests_before_shell.sh"] 51 | -------------------------------------------------------------------------------- /multi_task_learning/config.cfg: -------------------------------------------------------------------------------- 1 | [Data] 2 | pretrained_embeddings_file = ./data/giga.100.txt 3 | data_dir = ./ 4 | train_file = %(data_dir)s/BC/train-tbank.conll 5 | cdt_dev_file = %(data_dir)s/BC/dev-tbank.conll 6 | cw_dev_file = %(data_dir)s/PB/dev-tbank.conll 7 | cdt_test_file = %(data_dir)s/BC/test-tbank.conll 8 | cw_test_file = %(data_dir)s/PC/test-tbank.conll 9 | min_occur_count = 2 10 | 11 | [Save] 12 | save_dir =./best-model 13 | config_file = %(save_dir)s/config.cfg 14 | save_model_path = %(save_dir)s/model 15 | save_vocab_path = %(save_dir)s/vocab 16 | load_dir = ./best-model 17 | load_model_path = %(load_dir)s/model 18 | load_vocab_path = %(load_dir)s/vocab 19 | 20 | [Network] 21 | lstm_layers = 2 22 | word_dims = 100 23 | tag_dims = 100 24 | dropout_emb = 0.33 25 | lstm_hiddens = 300 26 | dropout_lstm_input = 0.33 27 | dropout_lstm_hidden = 0.33 28 | mlp_arc_size = 200 29 | mlp_rel_size = 100 30 | dropout_mlp = 0.33 31 | 32 | [Optimizer] 33 | learning_rate = 2e-3 34 | decay = .75 35 | decay_steps = 5000 36 | beta_1 = .9 37 | beta_2 = .9 38 | epsilon = 1e-12 39 | clip = 5.0 40 | 41 | [Run] 42 | num_buckets_train = 40 43 | num_buckets_valid = 10 44 | num_buckets_test = 10 45 | train_iters = 50000 46 | train_batch_size = 50 47 | test_batch_size = 100 48 | validate_every = 165 49 | save_after = 50 50 | update_every = 4 51 | 52 | [corous-weighting] 53 | domain_num_corpus_weighting=3200 54 | cdt_domain_ration=11 55 | domain_file=./content/train-tbank.conll 56 | 57 | 58 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bin/run_test.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | 4 | from bilm.training import test, load_options_latest_checkpoint, load_vocab 5 | from bilm.data import LMDataset, BidirectionalLMDataset 6 | 7 | def main(args): 8 | options, ckpt_file = load_options_latest_checkpoint(args.save_dir) 9 | 10 | # load the vocab 11 | if 'char_cnn' in options: 12 | max_word_length = options['char_cnn']['max_characters_per_token'] 13 | else: 14 | max_word_length = None 15 | vocab = load_vocab(args.vocab_file, max_word_length) 16 | 17 | test_prefix = args.test_prefix 18 | 19 | kwargs = { 20 | 'test': True, 21 | 'shuffle_on_load': False, 22 | } 23 | 24 | if options.get('bidirectional'): 25 | data = BidirectionalLMDataset(test_prefix, vocab, **kwargs) 26 | else: 27 | data = LMDataset(test_prefix, vocab, **kwargs) 28 | 29 | test(options, ckpt_file, data, batch_size=args.batch_size) 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Compute test perplexity') 34 | parser.add_argument('--save_dir', help='Location of checkpoint files') 35 | parser.add_argument('--vocab_file', help='Vocabulary file') 36 | parser.add_argument('--test_prefix', help='Prefix for test files') 37 | parser.add_argument('--batch_size', 38 | type=int, default=256, 39 | help='Batch size') 40 | 41 | args = parser.parse_args() 42 | main(args) 43 | 44 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/usage_cached.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ELMo usage example to write biLM embeddings for an entire dataset to 3 | a file. 
4 | ''' 5 | 6 | import os 7 | import h5py 8 | from bilm import dump_bilm_embeddings 9 | 10 | # Our small dataset. 11 | raw_context = [ 12 | 'Pretrained biLMs compute representations useful for NLP tasks .', 13 | 'They give state of the art performance for many tasks .' 14 | ] 15 | tokenized_context = [sentence.split() for sentence in raw_context] 16 | tokenized_question = [ 17 | ['What', 'are', 'biLMs', 'useful', 'for', '?'], 18 | ] 19 | 20 | # Create the dataset file. 21 | dataset_file = 'dataset_file.txt' 22 | with open(dataset_file, 'w') as fout: 23 | for sentence in tokenized_context + tokenized_question: 24 | fout.write(' '.join(sentence) + '\n') 25 | 26 | 27 | # Location of pretrained LM. Here we use the test fixtures. 28 | datadir = os.path.join('tests', 'fixtures', 'model') 29 | vocab_file = os.path.join(datadir, 'vocab_test.txt') 30 | options_file = os.path.join(datadir, 'options.json') 31 | weight_file = os.path.join(datadir, 'lm_weights.hdf5') 32 | 33 | # Dump the embeddings to a file. Run this once for your dataset. 34 | embedding_file = 'elmo_embeddings.hdf5' 35 | dump_bilm_embeddings( 36 | vocab_file, dataset_file, options_file, weight_file, embedding_file 37 | ) 38 | 39 | # Load the embeddings from the file -- here the 2nd sentence. 40 | with h5py.File(embedding_file, 'r') as fin: 41 | second_sentence_embeddings = fin['1'][...] 42 | 43 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/config.cfg: -------------------------------------------------------------------------------- 1 | [Data] 2 | pretrained_embeddings_file = ./data/giga.100.txt 3 | data_dir = ./tbank-data 4 | train_file = %(data_dir)s/train-tbank.conll 5 | dev_file = %(data_dir)s/dev-tbank.conll 6 | test_file = %(data_dir)s/test-tbank.conll 7 | min_occur_count = 2 8 | 9 | [Save] 10 | save_dir = ./best-model 11 | config_file = %(save_dir)s/config.cfg 12 | save_model_path = %(save_dir)s/model 13 | save_vocab_path = %(save_dir)s/vocab 14 | load_dir =./best-model 15 | load_model_path = %(load_dir)s/model 16 | load_vocab_path = %(load_dir)s/vocab 17 | 18 | [Network] 19 | lstm_layers = 2 20 | word_dims = 100 21 | tag_dims = 100 22 | dropout_emb = 0.33 23 | lstm_hiddens = 300 24 | dropout_lstm_input = 0.33 25 | dropout_lstm_hidden = 0.33 26 | mlp_arc_size = 200 27 | mlp_rel_size = 100 28 | dropout_mlp = 0.33 29 | 30 | [Optimizer] 31 | learning_rate = 2e-3 32 | decay = .75 33 | decay_steps = 5000 34 | beta_1 = .9 35 | beta_2 = .9 36 | epsilon = 1e-12 37 | clip = 5.0 38 | 39 | [Run] 40 | num_buckets_train = 40 41 | num_buckets_valid = 10 42 | num_buckets_test = 10 43 | train_iters = 50000 44 | train_batch_size = 50 45 | test_batch_size = 100 46 | validate_every = 165 47 | save_after = 50 48 | update_every = 4 49 | 50 | [corous-weighting] 51 | domain_num_corpus_weighting=4700 52 | cdt_domain_ration=3 53 | domain_data_size = 6892 54 | 55 | [Elmo] 56 | train_elmo=./train.1024.bin 57 | dev_elmo=./dev.1024.bin 58 | test_elmo=./test.1024.bin 59 | elmo_dims=100 60 | 61 | [Treebankid] 62 | tbank_emb_size=12 63 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bin/restart.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import argparse 4 | import numpy as np 5 | import sys 6 | sys.path.extend(["../../","../","./"]) 7 | 8 | from bilm.training import train, load_options_latest_checkpoint, load_vocab 9 | from bilm.data import LMDataset, BidirectionalLMDataset 10 | 11 | def 
main(args): 12 | options, ckpt_file = load_options_latest_checkpoint(args.save_dir) 13 | 14 | if 'char_cnn' in options: 15 | max_word_length = options['char_cnn']['max_characters_per_token'] 16 | else: 17 | max_word_length = None 18 | vocab = load_vocab(args.vocab_file, max_word_length) 19 | 20 | prefix = args.train_prefix 21 | 22 | kwargs = { 23 | 'test': False, 24 | 'shuffle_on_load': True, 25 | } 26 | 27 | if options.get('bidirectional'): 28 | data = BidirectionalLMDataset(prefix, vocab, **kwargs) 29 | else: 30 | data = LMDataset(prefix, vocab, **kwargs) 31 | 32 | tf_save_dir = args.save_dir 33 | tf_log_dir = args.save_dir 34 | 35 | # set optional inputs 36 | if args.n_train_tokens > 0: 37 | options['n_train_tokens'] = args.n_train_tokens 38 | if args.n_epochs > 0: 39 | options['n_epochs'] = args.n_epochs 40 | if args.batch_size > 0: 41 | options['batch_size'] = args.batch_size 42 | 43 | train(options, data, args.n_gpus, tf_save_dir, tf_log_dir, 44 | restart_ckpt_file=ckpt_file) 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--save_dir', help='Location of checkpoint files') 50 | parser.add_argument('--vocab_file', help='Vocabulary file') 51 | parser.add_argument('--train_prefix', help='Prefix for train files') 52 | parser.add_argument('--n_gpus', type=int, default=3, 53 | help='Number of GPUs to use') 54 | parser.add_argument('--batch_size', type=int, default=64) 55 | parser.add_argument('--n_train_tokens', type=int, default=15622976) 56 | parser.add_argument('--n_epochs', type=int, default=8) 57 | 58 | args = parser.parse_args() 59 | main(args) 60 | 61 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bin/train_elmo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | sys.path.extend(["../../","../","./"]) 5 | from training import train, load_options_latest_checkpoint, load_vocab 6 | from data import BidirectionalLMDataset 7 | 8 | 9 | def main(args): 10 | # load the vocab 11 | vocab = load_vocab(args.vocab_file, 110) 12 | 13 | # define the options 14 | batch_size = 64 # 128batch size for each GPU 15 | n_gpus = 2 16 | 17 | # number of tokens in training data (this for 1B Word Benchmark) 18 | n_train_tokens = 42466511 19 | 20 | options = { 21 | 'bidirectional': True, 22 | 23 | 'char_cnn': {'activation': 'relu', 24 | 'embedding': {'dim': 16}, 25 | 'filters': [[1, 32], 26 | [2, 32], 27 | [3, 64], 28 | [4, 128], 29 | [5, 256], 30 | [6, 512], 31 | [7, 1024]], 32 | 'max_characters_per_token': 110, 33 | 'n_characters': 6612, 34 | 'n_highway': 2}, 35 | 36 | 'dropout': 0.1, 37 | 38 | 'lstm': { 39 | 'cell_clip': 3, 40 | 'dim': 4096, 41 | 'n_layers': 2, 42 | 'proj_clip': 3, 43 | 'projection_dim': 512, 44 | #'projection_dim': 50, 45 | 'use_skip_connections': True}, 46 | 47 | 'all_clip_norm_val': 10.0, 48 | 49 | 'n_epochs': 10, 50 | 'n_train_tokens': n_train_tokens, 51 | 'batch_size': batch_size, 52 | 'n_tokens_vocab': vocab.size, 53 | 'unroll_steps': 20, 54 | 'n_negative_samples_batch': 8192, 55 | } 56 | 57 | prefix = args.train_prefix 58 | data = BidirectionalLMDataset(prefix, vocab, test=False, 59 | shuffle_on_load=True) 60 | 61 | tf_save_dir = args.save_dir 62 | tf_log_dir = args.save_dir 63 | train(options, data, n_gpus, tf_save_dir, tf_log_dir) 64 | 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--save_dir', help='Location of 
checkpoint files') 69 | parser.add_argument('--vocab_file', help='Vocabulary file') 70 | parser.add_argument('--train_prefix', help='Prefix for train files') 71 | 72 | args = parser.parse_args() 73 | main(args) 74 | 75 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/usage_character.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ELMo usage example with character inputs. 3 | 4 | Below, we show usage for SQuAD where each input example consists of both 5 | a question and a paragraph of context. 6 | ''' 7 | 8 | import tensorflow as tf 9 | import os 10 | from bilm import Batcher, BidirectionalLanguageModel, weight_layers 11 | 12 | # Location of pretrained LM. Here we use the test fixtures. 13 | datadir = os.path.join('tests', 'fixtures', 'model') 14 | vocab_file = os.path.join(datadir, 'vocab_test.txt') 15 | options_file = os.path.join(datadir, 'options.json') 16 | weight_file = os.path.join(datadir, 'lm_weights.hdf5') 17 | 18 | # Create a Batcher to map text to character ids. 19 | batcher = Batcher(vocab_file, 50) 20 | 21 | # Input placeholders to the biLM. 22 | context_character_ids = tf.placeholder('int32', shape=(None, None, 50)) 23 | question_character_ids = tf.placeholder('int32', shape=(None, None, 50)) 24 | 25 | # Build the biLM graph. 26 | bilm = BidirectionalLanguageModel(options_file, weight_file) 27 | 28 | # Get ops to compute the LM embeddings. 29 | context_embeddings_op = bilm(context_character_ids) 30 | question_embeddings_op = bilm(question_character_ids) 31 | 32 | # Get an op to compute ELMo (weighted average of the internal biLM layers) 33 | # Our SQuAD model includes ELMo at both the input and output layers 34 | # of the taskKilled GRU, so we need 4x ELMo representations for the question 35 | # and context at each of the input and output. 36 | # We use the same ELMo weights for both the question and context 37 | # at each of the input and output. 38 | elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0) 39 | with tf.variable_scope('', reuse=True): 40 | # the reuse=True scope reuses weights from the context for the question 41 | elmo_question_input = weight_layers( 42 | 'input', question_embeddings_op, l2_coef=0.0 43 | ) 44 | 45 | elmo_context_output = weight_layers( 46 | 'output', context_embeddings_op, l2_coef=0.0 47 | ) 48 | with tf.variable_scope('', reuse=True): 49 | # the reuse=True scope reuses weights from the context for the question 50 | elmo_question_output = weight_layers( 51 | 'output', question_embeddings_op, l2_coef=0.0 52 | ) 53 | 54 | 55 | # Now we can compute embeddings. 56 | raw_context = [ 57 | 'Pretrained biLMs compute representations useful for NLP tasks .', 58 | 'They give state of the art performance for many tasks .' 59 | ] 60 | tokenized_context = [sentence.split() for sentence in raw_context] 61 | tokenized_question = [ 62 | ['What', 'are', 'biLMs', 'useful', 'for', '?'], 63 | ] 64 | 65 | with tf.Session() as sess: 66 | # It is necessary to initialize variables once before running inference. 67 | sess.run(tf.global_variables_initializer()) 68 | 69 | # Create batches of data. 70 | context_ids = batcher.batch_sentences(tokenized_context) 71 | question_ids = batcher.batch_sentences(tokenized_question) 72 | 73 | # Compute ELMo representations (here for the input only, for simplicity). 
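# Each dict returned by weight_layers (see bilm/elmo.py) holds a 'weighted_op'
# tensor of shape (batch_size, max_tokens, lm_dim); lm_dim is 1024 for the full
# pretrained biLM and much smaller for these test fixtures. The output-layer
# mixes defined above can be evaluated the same way, e.g. (sketch; the
# elmo_context_output_ name is illustrative only):
#   elmo_context_output_ = sess.run(elmo_context_output['weighted_op'],
#                                   feed_dict={context_character_ids: context_ids})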
74 | elmo_context_input_, elmo_question_input_ = sess.run( 75 | [elmo_context_input['weighted_op'], elmo_question_input['weighted_op']], 76 | feed_dict={context_character_ids: context_ids, 77 | question_character_ids: question_ids} 78 | ) 79 | 80 | -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/Test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.extend(["../../","../","./"]) 3 | import time 4 | import numpy as np 5 | import pickle 6 | import argparse 7 | from driver.Config import * 8 | from driver.Model import * 9 | from driver.Parser import * 10 | from data.Dataloader import * 11 | import pickle 12 | import random 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 14 | def evaluate(data, parser, vocab, outputFile): 15 | start = time.time() 16 | parser.model.eval() 17 | output = open(outputFile, 'w', encoding='utf-8') 18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0 19 | 20 | for onebatch in data_iter(data, config.test_batch_size, False): 21 | words, extwords, tags, heads, rels, lengths, masks = \ 22 | batch_data_variable(onebatch, vocab) 23 | count = 0 24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks) 25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab): 26 | printDepTree(output, tree) 27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count]) 28 | arc_total_test += arc_total 29 | arc_correct_test += arc_correct 30 | rel_total_test += rel_total 31 | rel_correct_test += rel_correct 32 | count += 1 33 | 34 | output.close() 35 | 36 | uas = arc_correct_test * 100.0 / arc_total_test 37 | las = rel_correct_test * 100.0 / rel_total_test 38 | 39 | 40 | end = time.time() 41 | during_time = float(end - start) 42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time)) 43 | 44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | random.seed(666) 50 | np.random.seed(666) 51 | torch.cuda.manual_seed(666) 52 | torch.manual_seed(666) 53 | 54 | ### gpu 55 | gpu = torch.cuda.is_available() 56 | print("GPU available: ", gpu) 57 | print("CuDNN: \n", torch.backends.cudnn.enabled) 58 | 59 | argparser = argparse.ArgumentParser() 60 | argparser.add_argument('--config_file', default='examples/default.cfg') 61 | argparser.add_argument('--model', default='BaseParser') 62 | argparser.add_argument('--thread', default=4, type=int, help='thread num') 63 | argparser.add_argument('--use-cuda', action='store_true', default=True) 64 | 65 | args, extra_args = argparser.parse_known_args() 66 | config = Configurable(args.config_file, extra_args) 67 | 68 | vocab = pickle.load(open(config.load_vocab_path, 'rb')) 69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file) 70 | 71 | 72 | args, extra_args = argparser.parse_known_args() 73 | config = Configurable(args.config_file, extra_args) 74 | torch.set_num_threads(args.thread) 75 | 76 | config.use_cuda = False 77 | if gpu and args.use_cuda: config.use_cuda = True 78 | print("\nGPU using status: ", config.use_cuda) 79 | # print(config.use_cuda) 80 | 81 | model = ParserModel(vocab, config, vec) 82 | model.load_state_dict(torch.load(config.load_model_path)) 83 | if config.use_cuda: 84 | torch.backends.cudnn.enabled = True 85 | model = model.cuda() 86 | 87 | parser = BiaffineParser(model, vocab.ROOT) 88 | 
test_data = read_corpus(config.test_file, vocab) 89 | 90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \ 91 | evaluate(test_data, parser, vocab, config.test_file + '.out') 92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \ 93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las)) 94 | 95 | 96 | -------------------------------------------------------------------------------- /multi_task_learning/driver/Test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.extend(["../../","../","./"]) 3 | import time 4 | import numpy as np 5 | import pickle 6 | import argparse 7 | from driver.Config import * 8 | from driver.Model import * 9 | from driver.Parser import * 10 | from data.Dataloader import * 11 | import pickle 12 | import random 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 14 | def evaluate(data, parser, vocab, outputFile): 15 | start = time.time() 16 | parser.model.eval() 17 | #output = open(outputFile, 'w', encoding='utf-8') 18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0 19 | 20 | for onebatch in data_iter(data, config.test_batch_size, False): 21 | words, extwords, tags, heads, rels, lengths, masks = \ 22 | batch_data_variable(onebatch, vocab) 23 | count = 0 24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks) 25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab): 26 | #printDepTree(output, tree) 27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count]) 28 | arc_total_test += arc_total 29 | arc_correct_test += arc_correct 30 | rel_total_test += rel_total 31 | rel_correct_test += rel_correct 32 | count += 1 33 | 34 | #output.close() 35 | 36 | uas = arc_correct_test * 100.0 / arc_total_test 37 | las = rel_correct_test * 100.0 / rel_total_test 38 | 39 | 40 | end = time.time() 41 | during_time = float(end - start) 42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time)) 43 | 44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | random.seed(666) 50 | np.random.seed(666) 51 | torch.cuda.manual_seed(666) 52 | torch.manual_seed(666) 53 | 54 | ### gpu 55 | gpu = torch.cuda.is_available() 56 | print("GPU available: ", gpu) 57 | print("CuDNN: \n", torch.backends.cudnn.enabled) 58 | 59 | argparser = argparse.ArgumentParser() 60 | argparser.add_argument('--config_file', default='examples/default.cfg') 61 | argparser.add_argument('--model', default='BaseParser') 62 | argparser.add_argument('--thread', default=4, type=int, help='thread num') 63 | argparser.add_argument('--use-cuda', action='store_true', default=True) 64 | 65 | args, extra_args = argparser.parse_known_args() 66 | config = Configurable(args.config_file, extra_args) 67 | 68 | vocab = pickle.load(open(config.load_vocab_path, 'rb')) 69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file) 70 | 71 | 72 | args, extra_args = argparser.parse_known_args() 73 | config = Configurable(args.config_file, extra_args) 74 | torch.set_num_threads(args.thread) 75 | 76 | config.use_cuda = False 77 | if gpu and args.use_cuda: config.use_cuda = True 78 | print("\nGPU using status: ", config.use_cuda) 79 | # print(config.use_cuda) 80 | 81 | model = ParserModel(vocab, config, vec) 82 | model.load_state_dict(torch.load(config.load_model_path)) 83 | if config.use_cuda: 84 | torch.backends.cudnn.enabled = True 
85 | model = model.cuda() 86 | 87 | parser = BiaffineParser(model, vocab.ROOT) 88 | test_data = read_corpus(config.test_file, vocab) 89 | 90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \ 91 | evaluate(test_data, parser, vocab, config.test_file + '.out') 92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \ 93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las)) 94 | 95 | 96 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/Test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.extend(["../../../../","../../../", "../../", "../", "./"]) 3 | import time 4 | import numpy as np 5 | import pickle 6 | import argparse 7 | from driver.Config import * 8 | from driver.Model import * 9 | from driver.Parser import * 10 | from data.Dataloader import * 11 | import pickle 12 | import random 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 14 | def evaluate(data, parser, vocab, outputFile): 15 | start = time.time() 16 | parser.model.eval() 17 | #output = open(outputFile, 'w', encoding='utf-8') 18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0 19 | 20 | for onebatch in data_iter(data, config.test_batch_size, False): 21 | words, extwords, tags, heads, rels, lengths, masks,tbank_ids = \ 22 | batch_data_variable(onebatch, vocab) 23 | count = 0 24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks,tbank_ids) 25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab): 26 | #printDepTree(output, tree) 27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count]) 28 | arc_total_test += arc_total 29 | arc_correct_test += arc_correct 30 | rel_total_test += rel_total 31 | rel_correct_test += rel_correct 32 | count += 1 33 | 34 | #output.close() 35 | 36 | uas = arc_correct_test * 100.0 / arc_total_test 37 | las = rel_correct_test * 100.0 / rel_total_test 38 | 39 | 40 | end = time.time() 41 | during_time = float(end - start) 42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time)) 43 | 44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | random.seed(666) 50 | np.random.seed(666) 51 | torch.cuda.manual_seed(666) 52 | torch.manual_seed(666) 53 | 54 | ### gpu 55 | gpu = torch.cuda.is_available() 56 | print("GPU available: ", gpu) 57 | print("CuDNN: \n", torch.backends.cudnn.enabled) 58 | 59 | argparser = argparse.ArgumentParser() 60 | argparser.add_argument('--config_file', default='examples/default.cfg') 61 | argparser.add_argument('--model', default='BaseParser') 62 | argparser.add_argument('--thread', default=4, type=int, help='thread num') 63 | argparser.add_argument('--use-cuda', action='store_true', default=True) 64 | 65 | args, extra_args = argparser.parse_known_args() 66 | config = Configurable(args.config_file, extra_args) 67 | 68 | vocab = pickle.load(open(config.load_vocab_path, 'rb')) 69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file) 70 | 71 | 72 | args, extra_args = argparser.parse_known_args() 73 | config = Configurable(args.config_file, extra_args) 74 | torch.set_num_threads(args.thread) 75 | 76 | config.use_cuda = False 77 | if gpu and args.use_cuda: config.use_cuda = True 78 | print("\nGPU using status: ", config.use_cuda) 79 | # print(config.use_cuda) 80 | 81 | model = 
ParserModel(vocab, config, vec) 82 | model.load_state_dict(torch.load(config.load_model_path)) 83 | if config.use_cuda: 84 | torch.backends.cudnn.enabled = True 85 | model = model.cuda() 86 | 87 | parser = BiaffineParser(model, vocab.ROOT) 88 | test_data = read_corpus(config.test_file, vocab) 89 | 90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \ 91 | evaluate(test_data, parser, vocab, config.test_file + '.out') 92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \ 93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las)) 94 | 95 | 96 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/Test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.extend(["../../","../","./"]) 3 | import time 4 | import numpy as np 5 | import pickle 6 | import argparse 7 | from driver.Config import * 8 | from driver.Model import * 9 | from driver.Parser import * 10 | from data.Dataloader import * 11 | import pickle 12 | import random 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "5" 14 | def evaluate(data, parser, vocab, outputFile,elmofile): 15 | start = time.time() 16 | parser.model.eval() 17 | #output = open(outputFile, 'w', encoding='utf-8') 18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0 19 | 20 | for onebatch in data_iter(data, config.test_batch_size, False): 21 | words, extwords, tags, heads, rels, lengths, masks, elmosens ,tbank_ids= \ 22 | batch_data_variable(onebatch, vocab) 23 | count = 0 24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks,elmosens,elmofile,tbank_ids) 25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab): 26 | #printDepTree(output, tree) 27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count]) 28 | arc_total_test += arc_total 29 | arc_correct_test += arc_correct 30 | rel_total_test += rel_total 31 | rel_correct_test += rel_correct 32 | count += 1 33 | 34 | #output.close() 35 | 36 | uas = arc_correct_test * 100.0 / arc_total_test 37 | las = rel_correct_test * 100.0 / rel_total_test 38 | 39 | 40 | end = time.time() 41 | during_time = float(end - start) 42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time)) 43 | 44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | random.seed(666) 50 | np.random.seed(666) 51 | torch.cuda.manual_seed(666) 52 | torch.manual_seed(666) 53 | 54 | ### gpu 55 | gpu = torch.cuda.is_available() 56 | print("GPU available: ", gpu) 57 | print("CuDNN: \n", torch.backends.cudnn.enabled) 58 | 59 | argparser = argparse.ArgumentParser() 60 | argparser.add_argument('--config_file', default='examples/default.cfg') 61 | argparser.add_argument('--model', default='BaseParser') 62 | argparser.add_argument('--thread', default=4, type=int, help='thread num') 63 | argparser.add_argument('--use-cuda', action='store_true', default=True) 64 | 65 | args, extra_args = argparser.parse_known_args() 66 | config = Configurable(args.config_file, extra_args) 67 | 68 | vocab = pickle.load(open(config.load_vocab_path, 'rb')) 69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file) 70 | 71 | 72 | args, extra_args = argparser.parse_known_args() 73 | config = Configurable(args.config_file, extra_args) 74 | torch.set_num_threads(args.thread) 75 | 76 | config.use_cuda = False 77 | if gpu 
and args.use_cuda: config.use_cuda = True 78 | print("\nGPU using status: ", config.use_cuda) 79 | # print(config.use_cuda) 80 | 81 | model = ParserModel(vocab, config, vec) 82 | model.load_state_dict(torch.load(config.load_model_path)) 83 | if config.use_cuda: 84 | torch.backends.cudnn.enabled = True 85 | model = model.cuda() 86 | 87 | parser = BiaffineParser(model, vocab.ROOT) 88 | test_data = read_corpus(config.test_file, vocab) 89 | 90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \ 91 | evaluate(test_data, parser, vocab, config.test_file + '.out',config.test_elmo) 92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \ 93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las)) 94 | 95 | 96 | -------------------------------------------------------------------------------- /ELMo/use_ELMo/get_elmo_bin.py: -------------------------------------------------------------------------------- 1 | 2 | import struct 3 | from allennlp.modules.elmo import Elmo,batch_to_ids 4 | 5 | class get_elmo: 6 | def __init__(self,options_file,weight_file,test_file,outfile): 7 | self.options_file=options_file 8 | self.weight_file = weight_file 9 | self.test_file = test_file 10 | self.outfile = outfile 11 | def get_elmo(self): 12 | test_file = open(self.test_file, "r") 13 | outfile = open(self.outfile, "w") 14 | elmo = Elmo(self.options_file, self.weight_file, 2, dropout=0) 15 | sentence = [] 16 | sen=[] 17 | sentencenum = 0 18 | for line in test_file: 19 | if line == "\n": 20 | sentencenum += 1 21 | print(sentencenum,flush=True) 22 | sen.append(sentence) 23 | character_ids = batch_to_ids(sen) 24 | print(character_ids) 25 | embeddings = elmo(character_ids) 26 | embs=embeddings['elmo_representations'][0] 27 | for char in range(embs.shape[1]): 28 | for layer in range(embs.shape[0]): 29 | for dim in range(embs.shape[2]): 30 | outfile.write(str(float(embs[layer][char][dim]))+' ') 31 | outfile.write('#') 32 | outfile.write('\n') 33 | outfile.write("\n") 34 | outfile.flush() 35 | 36 | sentence = [] 37 | sen=[] 38 | else: 39 | sentence.append(line.strip().split('\t')[1]) 40 | test_file.close() 41 | outfile.close() 42 | class write_bin: 43 | def __init__(self,inputfile,outputfile): 44 | self.inputfile = inputfile 45 | self.outputfile = outputfile 46 | def get_lines(self): 47 | f1 = open(self.inputfile, "r") 48 | word = 0 49 | sens = 0 50 | count = 0 51 | for line in f1: 52 | count += 1 53 | if line != '\n' and line != '\r\n': 54 | word += 1 55 | else: 56 | sens += 1 57 | print("count", count) 58 | print("word", word) 59 | print("sens", sens) 60 | 61 | def get_first_layer(self): 62 | infile = open(self.inputfile, "r") 63 | outfile = open(self.outputfile, "wb") 64 | 65 | file_embs = [] 66 | sentence_emb_first = [] 67 | sentence_emb_second = [] 68 | sentence_emb_third = [] 69 | sentencenum = 0 70 | for line in infile: 71 | if line != '\n': 72 | parts = line.strip().split('#') 73 | firstlayer = parts[0].split() 74 | if len(firstlayer) != 1024: 75 | print("first layer", len(firstlayer)) 76 | 77 | sentence_emb_first.append(firstlayer) 78 | else: 79 | sentencenum += 1 80 | for first in sentence_emb_first: 81 | for emb1 in first: 82 | emb = float(emb1) 83 | outfile.write(struct.pack('f', float(emb))) 84 | if sentencenum % 1000 == 0: 85 | print(sentencenum, flush=True) 86 | sentence_emb_first = [] 87 | sentence_emb_second = [] 88 | sentence_emb_third = [] 89 | print('sentencenum', sentencenum) 90 | if __name__ == "__main__": 91 | train_file = "./train.conll" 92 | train_outfile = "./train/train" 
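    # Sketch of the data flow (derived from the two classes above): get_elmo writes
    # one text line per token, holding the token's 1024 ELMo values followed by '#',
    # with a blank line between sentences; write_bin.get_first_layer then re-packs
    # those values as raw 4-byte floats (struct.pack('f', ...), native byte order)
    # into train.1024.bin, the binary format that the train_elmo / dev_elmo /
    # test_elmo entries in biaffine-parser-elmo/config.cfg point to.
    # (Assumption: the dev and test .bin files are produced by re-running this block
    # with the corresponding input and output paths.)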
93 | 94 | A = get_elmo("./options.json","./weights.hdf5",train_file,train_outfile+".elmo") 95 | A.get_elmo() 96 | B = write_bin(train_outfile+".elmo",train_outfile+".1024.bin") 97 | B.get_lines() 98 | B.get_first_layer() 99 | 100 | 101 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_sentences.json: -------------------------------------------------------------------------------- 1 | [["The U.S. Centers for Disease Control and Prevention initially advised school systems to close if outbreaks occurred , then reversed itself , saying the apparent mildness of the virus meant most schools and day care centers should stay open , even if they had confirmed cases of swine flu .", "When Ms. Winfrey invited Suzanne Somers to share her controversial views about bio-identical hormone treatment on her syndicated show in 2009 , it won Ms. Winfrey a rare dollop of unflattering press , including a Newsweek cover story titled \" Crazy Talk : Oprah , Wacky Cures & You . \"", "Elk calling -- a skill that hunters perfected long ago to lure game with the promise of a little romance -- is now its own sport .", "Don 't !", "Fish , ranked 98th in the world , fired 22 aces en route to a 6-3 , 6-7 ( 5 / 7 ) , 7-6 ( 7 / 4 ) win over seventh-seeded Argentinian David Nalbandian .", "Why does everything have to become such a big issue ?", "AMMAN ( Reuters ) - King Abdullah of Jordan will meet U.S. President Barack Obama in Washington on April 21 to lobby on behalf of Arab states for a stronger U.S. role in Middle East peacemaking , palace officials said on Sunday .", "To help keep traffic flowing the Congestion Charge will remain in operation through-out the strike and TfL will be suspending road works on major London roads wherever possible .", "If no candidate wins an absolute majority , there will be a runoff between the top two contenders , most likely in mid-October .", "Authorities previously served search warrants at Murray 's Las Vegas home and his businesses in Las Vegas and Houston ."], ["Brent North Sea crude for November delivery rose 84 cents to 68.88 dollars a barrel .", "That seems to have been their model up til now .", "Gordon will join Luol Deng on the GB team ; their respective NBA teams , the Detroit Pistons and the Chicago Bulls , play tonight .", "Nikam maintains the attacks were masterminded by the Muslim militant group Lashkar-e-Taiba .", "Last year , Williams was unseeded , ranked 81st and coming off one of her worst losses on tour -- in a Tier 4 event at Hobart -- yet she beat six seeded players en route to the title at Melbourne Park .", "It said that two officers involved in the case had been disciplined .", "\" There is more intelligence now being gathered , \" the official said , adding that such efforts would continue for some time .", "The majority will be of the standard 6X6 configuration for carrying personnel .", "\" Consequently , necessary actions may not be taken to reduce the risks to children of sexual exploitation and drug or alcohol misuse , \" the report said . \u2022 Almost two-thirds of inspected schools were good or outstanding , but the number of underperforming secondaries remained \" stubborn and persistent . 
\"", "What a World Cup ."], ["But , there have also been many cases of individuals and small groups of people protesting , as in the case of Rongye Adak , a nomad who called for the return of the Dalai Lama and for the freedom of Tibet during the Lithang Horse Racing Festival , in eastern Tibet .", "James Duncan , head of transportation at Bournemouth Borough Council , said : \" Our legal team is reviewing the entitlement of taxis to drop and pick up passengers at bus stops , only for as long as is absolutely necessary to fulfil that function and for no other reason .", "To Mo concerning the food log you kept -- Dr. Buchholz recommends the same thing .", "The CBO estimates that only 23 percent of that would be spent in 2009 and 2010 .", "Even so , Democrats slammed Bush as out of touch .", "An information campaign will be launched later to raise awareness of employment rights and how to enforce them .", "At the gallery the concept is less vague , as Ms. Piper cites specific instances of racial violence , political assassinations and the devastation of Hurricane Katrina .", "There have been some exceptions -- such as Medicare in 1965 .", "The government guidance will be reviewed early next year after a period of public comment .", "It wasn 't the most seaworthy of prizes ."]] -------------------------------------------------------------------------------- /biaffine-parser-concat/data/Dataloader.py: -------------------------------------------------------------------------------- 1 | from data.Vocab import * 2 | import numpy as np 3 | import torch 4 | from torch.autograd import Variable 5 | import random 6 | 7 | def read_corpus(file_path, vocab=None): 8 | data = [] 9 | with open(file_path, 'r') as infile: 10 | for sentence in readDepTree(infile, vocab): 11 | data.append(sentence) 12 | return data 13 | 14 | def sentences_numberize(sentences, vocab): 15 | for sentence in sentences: 16 | yield sentence2id(sentence, vocab) 17 | 18 | def sentence2id(sentence, vocab): 19 | result = [] 20 | for dep in sentence: 21 | wordid = vocab.word2id(dep.form) 22 | extwordid = vocab.extword2id(dep.form) 23 | tagid = vocab.tag2id(dep.tag) 24 | head = dep.head 25 | if head == -1: 26 | relid = -1 27 | elif head >= 0: 28 | relid = vocab.rel2id(dep.rel) 29 | result.append([wordid, extwordid, tagid, head, relid]) 30 | 31 | return result 32 | 33 | 34 | 35 | def batch_slice(data, batch_size): 36 | batch_num = int(np.ceil(len(data) / float(batch_size))) 37 | for i in range(batch_num): 38 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i 39 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)] 40 | 41 | yield sentences 42 | 43 | 44 | def data_iter(data, batch_size, shuffle=True): 45 | """ 46 | randomly permute data, then sort by source length, and partition into batches 47 | ensure that the length of sentences in each batch 48 | """ 49 | 50 | batched_data = [] 51 | if shuffle: np.random.shuffle(data) 52 | batched_data.extend(list(batch_slice(data, batch_size))) 53 | 54 | if shuffle: np.random.shuffle(batched_data) 55 | for batch in batched_data: 56 | yield batch 57 | 58 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration): 59 | batch_data1=random.sample(data1,domain_num_corpus_weighting) 60 | data2_size=domain_num_corpus_weighting*cdt_domain_ration 61 | batch_data2 = random.sample(data2, data2_size) 62 | batch_data=[] 63 | batch_data.extend(batch_data1) 64 | batch_data.extend(batch_data2) 65 | return batch_data 66 | 67 | def 
batch_data_variable(batch, vocab): 68 | length = len(batch[0]) 69 | batch_size = len(batch) 70 | for b in range(1, batch_size): 71 | if len(batch[b]) > length: length = len(batch[b]) 72 | 73 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 74 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 75 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 76 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False) 77 | heads = [] 78 | rels = [] 79 | lengths = [] 80 | 81 | b = 0 82 | for sentence in sentences_numberize(batch, vocab): 83 | index = 0 84 | length = len(sentence) 85 | lengths.append(length) 86 | head = np.zeros((length), dtype=np.int32) 87 | rel = np.zeros((length), dtype=np.int32) 88 | for dep in sentence: 89 | words[b, index] = dep[0] 90 | extwords[b, index] = dep[1] 91 | tags[b, index] = dep[2] 92 | head[index] = dep[3] 93 | rel[index] = dep[4] 94 | masks[b, index] = 1 95 | index += 1 96 | b += 1 97 | heads.append(head) 98 | rels.append(rel) 99 | 100 | return words, extwords, tags, heads, rels, lengths, masks 101 | 102 | def batch_variable_depTree(trees, heads, rels, lengths, vocab): 103 | for tree, head, rel, length in zip(trees, heads, rels, lengths): 104 | sentence = [] 105 | for idx in range(length): 106 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx]))) 107 | yield sentence 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/log: -------------------------------------------------------------------------------- 1 | Found 6 shards at /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.a* 2 | Loading data from: /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.ad 3 | Loaded 25214 sentences. 4 | Finished loading 5 | Found 6 shards at /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.a* 6 | Loading data from: /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.af 7 | Loaded 13350 sentences. 
8 | Finished loading 9 | USING SKIP CONNECTIONS 10 | USING SKIP CONNECTIONS 11 | USING SKIP CONNECTIONS 12 | [['global_step:0', TensorShape([])], 13 | ['lm/CNN/W_cnn_0:0', 14 | TensorShape([Dimension(1), Dimension(1), Dimension(16), Dimension(32)])], 15 | ['lm/CNN/W_cnn_1:0', 16 | TensorShape([Dimension(1), Dimension(2), Dimension(16), Dimension(32)])], 17 | ['lm/CNN/W_cnn_2:0', 18 | TensorShape([Dimension(1), Dimension(3), Dimension(16), Dimension(64)])], 19 | ['lm/CNN/W_cnn_3:0', 20 | TensorShape([Dimension(1), Dimension(4), Dimension(16), Dimension(128)])], 21 | ['lm/CNN/W_cnn_4:0', 22 | TensorShape([Dimension(1), Dimension(5), Dimension(16), Dimension(256)])], 23 | ['lm/CNN/W_cnn_5:0', 24 | TensorShape([Dimension(1), Dimension(6), Dimension(16), Dimension(512)])], 25 | ['lm/CNN/W_cnn_6:0', 26 | TensorShape([Dimension(1), Dimension(7), Dimension(16), Dimension(1024)])], 27 | ['lm/CNN/b_cnn_0:0', TensorShape([Dimension(32)])], 28 | ['lm/CNN/b_cnn_1:0', TensorShape([Dimension(32)])], 29 | ['lm/CNN/b_cnn_2:0', TensorShape([Dimension(64)])], 30 | ['lm/CNN/b_cnn_3:0', TensorShape([Dimension(128)])], 31 | ['lm/CNN/b_cnn_4:0', TensorShape([Dimension(256)])], 32 | ['lm/CNN/b_cnn_5:0', TensorShape([Dimension(512)])], 33 | ['lm/CNN/b_cnn_6:0', TensorShape([Dimension(1024)])], 34 | ['lm/CNN_high_0/W_carry:0', TensorShape([Dimension(2048), Dimension(2048)])], 35 | ['lm/CNN_high_0/W_transform:0', 36 | TensorShape([Dimension(2048), Dimension(2048)])], 37 | ['lm/CNN_high_0/b_carry:0', TensorShape([Dimension(2048)])], 38 | ['lm/CNN_high_0/b_transform:0', TensorShape([Dimension(2048)])], 39 | ['lm/CNN_high_1/W_carry:0', TensorShape([Dimension(2048), Dimension(2048)])], 40 | ['lm/CNN_high_1/W_transform:0', 41 | TensorShape([Dimension(2048), Dimension(2048)])], 42 | ['lm/CNN_high_1/b_carry:0', TensorShape([Dimension(2048)])], 43 | ['lm/CNN_high_1/b_transform:0', TensorShape([Dimension(2048)])], 44 | ['lm/CNN_proj/W_proj:0', TensorShape([Dimension(2048), Dimension(512)])], 45 | ['lm/CNN_proj/b_proj:0', TensorShape([Dimension(512)])], 46 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_0/lstm_cell/bias:0', 47 | TensorShape([Dimension(16384)])], 48 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_0/lstm_cell/kernel:0', 49 | TensorShape([Dimension(1024), Dimension(16384)])], 50 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_0/lstm_cell/projection/kernel:0', 51 | TensorShape([Dimension(4096), Dimension(512)])], 52 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_1/lstm_cell/bias:0', 53 | TensorShape([Dimension(16384)])], 54 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_1/lstm_cell/kernel:0', 55 | TensorShape([Dimension(1024), Dimension(16384)])], 56 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_1/lstm_cell/projection/kernel:0', 57 | TensorShape([Dimension(4096), Dimension(512)])], 58 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_0/lstm_cell/bias:0', 59 | TensorShape([Dimension(16384)])], 60 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_0/lstm_cell/kernel:0', 61 | TensorShape([Dimension(1024), Dimension(16384)])], 62 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_0/lstm_cell/projection/kernel:0', 63 | TensorShape([Dimension(4096), Dimension(512)])], 64 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_1/lstm_cell/bias:0', 65 | TensorShape([Dimension(16384)])], 66 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_1/lstm_cell/kernel:0', 67 | TensorShape([Dimension(1024), Dimension(16384)])], 68 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_1/lstm_cell/projection/kernel:0', 69 | TensorShape([Dimension(4096), Dimension(512)])], 70 | ['lm/char_embed:0', TensorShape([Dimension(261), Dimension(16)])], 71 | ['lm/softmax/W:0', 
TensorShape([Dimension(5422), Dimension(512)])], 72 | ['lm/softmax/b:0', TensorShape([Dimension(5422)])], 73 | ['train_perplexity:0', TensorShape([])]] 74 | Training for 10 epochs and 7850 batches 75 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/usage_token.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ELMo usage example with pre-computed and cached context independent 3 | token representations 4 | 5 | Below, we show usage for SQuAD where each input example consists of both 6 | a question and a paragraph of context. 7 | ''' 8 | 9 | import tensorflow as tf 10 | import os 11 | from bilm import TokenBatcher, BidirectionalLanguageModel, weight_layers, \ 12 | dump_token_embeddings 13 | 14 | # Our small dataset. 15 | raw_context = [ 16 | 'Pretrained biLMs compute representations useful for NLP tasks .', 17 | 'They give state of the art performance for many tasks .' 18 | ] 19 | tokenized_context = [sentence.split() for sentence in raw_context] 20 | tokenized_question = [ 21 | ['What', 'are', 'biLMs', 'useful', 'for', '?'], 22 | ] 23 | 24 | # Create the vocabulary file with all unique tokens and 25 | # the special , tokens (case sensitive). 26 | all_tokens = set(['', ''] + tokenized_question[0]) 27 | for context_sentence in tokenized_context: 28 | for token in context_sentence: 29 | all_tokens.add(token) 30 | vocab_file = 'vocab_small.txt' 31 | with open(vocab_file, 'w') as fout: 32 | fout.write('\n'.join(all_tokens)) 33 | 34 | # Location of pretrained LM. Here we use the test fixtures. 35 | datadir = os.path.join('tests', 'fixtures', 'model') 36 | options_file = os.path.join(datadir, 'options.json') 37 | weight_file = os.path.join(datadir, 'lm_weights.hdf5') 38 | 39 | # Dump the token embeddings to a file. Run this once for your dataset. 40 | token_embedding_file = 'elmo_token_embeddings.hdf5' 41 | dump_token_embeddings( 42 | vocab_file, options_file, weight_file, token_embedding_file 43 | ) 44 | tf.reset_default_graph() 45 | 46 | 47 | 48 | ## Now we can do inference. 49 | # Create a TokenBatcher to map text to token ids. 50 | batcher = TokenBatcher(vocab_file) 51 | 52 | # Input placeholders to the biLM. 53 | context_token_ids = tf.placeholder('int32', shape=(None, None)) 54 | question_token_ids = tf.placeholder('int32', shape=(None, None)) 55 | 56 | # Build the biLM graph. 57 | bilm = BidirectionalLanguageModel( 58 | options_file, 59 | weight_file, 60 | use_character_inputs=False, 61 | embedding_weight_file=token_embedding_file 62 | ) 63 | 64 | # Get ops to compute the LM embeddings. 65 | context_embeddings_op = bilm(context_token_ids) 66 | question_embeddings_op = bilm(question_token_ids) 67 | 68 | # Get an op to compute ELMo (weighted average of the internal biLM layers) 69 | # Our SQuAD model includes ELMo at both the input and output layers 70 | # of the task GRU, so we need 4x ELMo representations for the question 71 | # and context at each of the input and output. 72 | # We use the same ELMo weights for both the question and context 73 | # at each of the input and output. 
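# Note: each weight_layers call creates the trainable scalars '<name>_ELMo_W'
# (softmax-normalized layer weights) and '<name>_ELMo_gamma' (see bilm/elmo.py
# later in this repository), so the 'input' and 'output' mixes get independent
# parameters, while the reuse=True scopes below share them between context and
# question. Passing l2_coef=0.0 disables the regularization term.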
74 | elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0) 75 | with tf.variable_scope('', reuse=True): 76 | # the reuse=True scope reuses weights from the context for the question 77 | elmo_question_input = weight_layers( 78 | 'input', question_embeddings_op, l2_coef=0.0 79 | ) 80 | 81 | elmo_context_output = weight_layers( 82 | 'output', context_embeddings_op, l2_coef=0.0 83 | ) 84 | with tf.variable_scope('', reuse=True): 85 | # the reuse=True scope reuses weights from the context for the question 86 | elmo_question_output = weight_layers( 87 | 'output', question_embeddings_op, l2_coef=0.0 88 | ) 89 | 90 | 91 | with tf.Session() as sess: 92 | # It is necessary to initialize variables once before running inference. 93 | sess.run(tf.global_variables_initializer()) 94 | 95 | # Create batches of data. 96 | context_ids = batcher.batch_sentences(tokenized_context) 97 | question_ids = batcher.batch_sentences(tokenized_question) 98 | 99 | # Compute ELMo representations (here for the input only, for simplicity). 100 | elmo_context_input_, elmo_question_input_ = sess.run( 101 | [elmo_context_input['weighted_op'], elmo_question_input['weighted_op']], 102 | feed_dict={context_token_ids: context_ids, 103 | question_token_ids: question_ids} 104 | ) 105 | 106 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/Dataloader.py: -------------------------------------------------------------------------------- 1 | from data.Vocab import * 2 | import numpy as np 3 | import torch 4 | from torch.autograd import Variable 5 | import random 6 | 7 | def read_corpus(file_path, vocab=None): 8 | data = [] 9 | with open(file_path, 'r') as infile: 10 | for sentence in readDepTree(infile, vocab): 11 | data.append(sentence) 12 | return data 13 | 14 | def sentences_numberize(sentences, vocab): 15 | for sentence in sentences: 16 | yield sentence2id(sentence, vocab) 17 | 18 | def sentence2id(sentence, vocab): 19 | result = [] 20 | for dep in sentence: 21 | wordid = vocab.word2id(dep.form) 22 | extwordid = vocab.extword2id(dep.form) 23 | tagid = vocab.tag2id(dep.tag) 24 | head = dep.head 25 | if head == -1: 26 | relid = -1 27 | elif head >= 0: 28 | relid = vocab.rel2id(dep.rel) 29 | if dep.treebank_id in vocab._id2tbank: 30 | tbankid = vocab.tbank2id(dep.treebank_id) 31 | else: 32 | tbankid=vocab.tbank2id(dep.proxy_tbank) 33 | result.append([wordid, extwordid, tagid, head, relid,tbankid]) 34 | 35 | return result 36 | 37 | 38 | 39 | def batch_slice(data, batch_size): 40 | batch_num = int(np.ceil(len(data) / float(batch_size))) 41 | for i in range(batch_num): 42 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i 43 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)] 44 | 45 | yield sentences 46 | 47 | 48 | def data_iter(data, batch_size, shuffle=True): 49 | """ 50 | randomly permute data, then sort by source length, and partition into batches 51 | ensure that the length of sentences in each batch 52 | """ 53 | 54 | batched_data = [] 55 | if shuffle: np.random.shuffle(data) 56 | batched_data.extend(list(batch_slice(data, batch_size))) 57 | 58 | if shuffle: np.random.shuffle(batched_data) 59 | for batch in batched_data: 60 | yield batch 61 | 62 | 63 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration): 64 | batch_data1=random.sample(data1,domain_num_corpus_weighting) 65 | data2_size=domain_num_corpus_weighting*cdt_domain_ration 66 | batch_data2 = 
random.sample(data2, data2_size) 67 | batch_data=[] 68 | batch_data.extend(batch_data1) 69 | batch_data.extend(batch_data2) 70 | return batch_data 71 | 72 | def batch_data_variable(batch, vocab): 73 | length = len(batch[0]) 74 | batch_size = len(batch) 75 | for b in range(1, batch_size): 76 | if len(batch[b]) > length: length = len(batch[b]) 77 | 78 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 79 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 80 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 81 | tbank_ids = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 82 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False) 83 | heads = [] 84 | rels = [] 85 | lengths = [] 86 | 87 | b = 0 88 | for sentence in sentences_numberize(batch, vocab): 89 | index = 0 90 | length = len(sentence) 91 | lengths.append(length) 92 | head = np.zeros((length), dtype=np.int32) 93 | rel = np.zeros((length), dtype=np.int32) 94 | for dep in sentence: 95 | words[b, index] = dep[0] 96 | extwords[b, index] = dep[1] 97 | tags[b, index] = dep[2] 98 | tbank_ids[b,index] = dep[5] 99 | head[index] = dep[3] 100 | rel[index] = dep[4] 101 | 102 | masks[b, index] = 1 103 | index += 1 104 | b += 1 105 | heads.append(head) 106 | rels.append(rel) 107 | 108 | return words, extwords, tags, heads, rels, lengths, masks,tbank_ids 109 | 110 | def batch_variable_depTree(trees, heads, rels, lengths, vocab): 111 | for tree, head, rel, length in zip(trees, heads, rels, lengths): 112 | sentence = [] 113 | for idx in range(length): 114 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx]),tree[idx].treebank_id)) 115 | yield sentence 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/bilm/elmo.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def weight_layers(name, bilm_ops, l2_coef=None, 5 | use_top_only=False, do_layer_norm=False): 6 | ''' 7 | Weight the layers of a biLM with trainable scalar weights to 8 | compute ELMo representations. 9 | 10 | For each output layer, this returns two ops. The first computes 11 | a layer specific weighted average of the biLM layers, and 12 | the second the l2 regularizer loss term. 13 | The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES 14 | 15 | Input: 16 | name = a string prefix used for the trainable variable names 17 | bilm_ops = the tensorflow ops returned to compute internal 18 | representations from a biLM. This is the return value 19 | from BidirectionalLanguageModel(...)(ids_placeholder) 20 | l2_coef: the l2 regularization coefficient $\lambda$. 21 | Pass None or 0.0 for no regularization. 22 | use_top_only: if True, then only use the top layer. 
23 | do_layer_norm: if True, then apply layer normalization to each biLM 24 | layer before normalizing 25 | 26 | Output: 27 | { 28 | 'weighted_op': op to compute weighted average for output, 29 | 'regularization_op': op to compute regularization term 30 | } 31 | ''' 32 | def _l2_regularizer(weights): 33 | if l2_coef is not None: 34 | return l2_coef * tf.reduce_sum(tf.square(weights)) 35 | else: 36 | return 0.0 37 | 38 | # Get ops for computing LM embeddings and mask 39 | lm_embeddings = bilm_ops['lm_embeddings'] 40 | mask = bilm_ops['mask'] 41 | 42 | n_lm_layers = int(lm_embeddings.get_shape()[1]) 43 | lm_dim = int(lm_embeddings.get_shape()[3]) 44 | 45 | with tf.control_dependencies([lm_embeddings, mask]): 46 | # Cast the mask and broadcast for layer use. 47 | mask_float = tf.cast(mask, 'float32') 48 | broadcast_mask = tf.expand_dims(mask_float, axis=-1) 49 | 50 | def _do_ln(x): 51 | # do layer normalization excluding the mask 52 | x_masked = x * broadcast_mask 53 | N = tf.reduce_sum(mask_float) * lm_dim 54 | mean = tf.reduce_sum(x_masked) / N 55 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2 56 | ) / N 57 | return tf.nn.batch_normalization( 58 | x, mean, variance, None, None, 1E-12 59 | ) 60 | 61 | if use_top_only: 62 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 63 | # just the top layer 64 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) 65 | # no regularization 66 | reg = 0.0 67 | else: 68 | W = tf.get_variable( 69 | '{}_ELMo_W'.format(name), 70 | shape=(n_lm_layers, ), 71 | initializer=tf.zeros_initializer, 72 | regularizer=_l2_regularizer, 73 | trainable=True, 74 | ) 75 | 76 | # normalize the weights 77 | normed_weights = tf.split( 78 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers 79 | ) 80 | # split LM layers 81 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 82 | 83 | # compute the weighted, normalized LM activations 84 | pieces = [] 85 | for w, t in zip(normed_weights, layers): 86 | if do_layer_norm: 87 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) 88 | else: 89 | pieces.append(w * tf.squeeze(t, squeeze_dims=1)) 90 | sum_pieces = tf.add_n(pieces) 91 | 92 | # get the regularizer 93 | reg = [ 94 | r for r in tf.get_collection( 95 | tf.GraphKeys.REGULARIZATION_LOSSES) 96 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0 97 | ] 98 | if len(reg) != 1: 99 | raise ValueError 100 | 101 | # scale the weighted sum by gamma 102 | gamma = tf.get_variable( 103 | '{}_ELMo_gamma'.format(name), 104 | shape=(1, ), 105 | initializer=tf.ones_initializer, 106 | regularizer=None, 107 | trainable=True, 108 | ) 109 | weighted_lm_layers = sum_pieces * gamma 110 | 111 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg} 112 | 113 | return ret 114 | 115 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/build/lib/bilm/elmo.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def weight_layers(name, bilm_ops, l2_coef=None, 5 | use_top_only=False, do_layer_norm=False): 6 | ''' 7 | Weight the layers of a biLM with trainable scalar weights to 8 | compute ELMo representations. 9 | 10 | For each output layer, this returns two ops. The first computes 11 | a layer specific weighted average of the biLM layers, and 12 | the second the l2 regularizer loss term. 
13 | The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES 14 | 15 | Input: 16 | name = a string prefix used for the trainable variable names 17 | bilm_ops = the tensorflow ops returned to compute internal 18 | representations from a biLM. This is the return value 19 | from BidirectionalLanguageModel(...)(ids_placeholder) 20 | l2_coef: the l2 regularization coefficient $\lambda$. 21 | Pass None or 0.0 for no regularization. 22 | use_top_only: if True, then only use the top layer. 23 | do_layer_norm: if True, then apply layer normalization to each biLM 24 | layer before normalizing 25 | 26 | Output: 27 | { 28 | 'weighted_op': op to compute weighted average for output, 29 | 'regularization_op': op to compute regularization term 30 | } 31 | ''' 32 | def _l2_regularizer(weights): 33 | if l2_coef is not None: 34 | return l2_coef * tf.reduce_sum(tf.square(weights)) 35 | else: 36 | return 0.0 37 | 38 | # Get ops for computing LM embeddings and mask 39 | lm_embeddings = bilm_ops['lm_embeddings'] 40 | mask = bilm_ops['mask'] 41 | 42 | n_lm_layers = int(lm_embeddings.get_shape()[1]) 43 | lm_dim = int(lm_embeddings.get_shape()[3]) 44 | 45 | with tf.control_dependencies([lm_embeddings, mask]): 46 | # Cast the mask and broadcast for layer use. 47 | mask_float = tf.cast(mask, 'float32') 48 | broadcast_mask = tf.expand_dims(mask_float, axis=-1) 49 | 50 | def _do_ln(x): 51 | # do layer normalization excluding the mask 52 | x_masked = x * broadcast_mask 53 | N = tf.reduce_sum(mask_float) * lm_dim 54 | mean = tf.reduce_sum(x_masked) / N 55 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2 56 | ) / N 57 | return tf.nn.batch_normalization( 58 | x, mean, variance, None, None, 1E-12 59 | ) 60 | 61 | if use_top_only: 62 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 63 | # just the top layer 64 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) 65 | # no regularization 66 | reg = 0.0 67 | else: 68 | W = tf.get_variable( 69 | '{}_ELMo_W'.format(name), 70 | shape=(n_lm_layers, ), 71 | initializer=tf.zeros_initializer, 72 | regularizer=_l2_regularizer, 73 | trainable=True, 74 | ) 75 | 76 | # normalize the weights 77 | normed_weights = tf.split( 78 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers 79 | ) 80 | # split LM layers 81 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 82 | 83 | # compute the weighted, normalized LM activations 84 | pieces = [] 85 | for w, t in zip(normed_weights, layers): 86 | if do_layer_norm: 87 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) 88 | else: 89 | pieces.append(w * tf.squeeze(t, squeeze_dims=1)) 90 | sum_pieces = tf.add_n(pieces) 91 | 92 | # get the regularizer 93 | reg = [ 94 | r for r in tf.get_collection( 95 | tf.GraphKeys.REGULARIZATION_LOSSES) 96 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0 97 | ] 98 | if len(reg) != 1: 99 | raise ValueError 100 | 101 | # scale the weighted sum by gamma 102 | gamma = tf.get_variable( 103 | '{}_ELMo_gamma'.format(name), 104 | shape=(1, ), 105 | initializer=tf.ones_initializer, 106 | regularizer=None, 107 | trainable=True, 108 | ) 109 | weighted_lm_layers = sum_pieces * gamma 110 | 111 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg} 112 | 113 | return ret 114 | 115 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/data/Dataloader.py: -------------------------------------------------------------------------------- 1 | from data.Vocab import * 2 | import numpy as np 3 | import torch 4 | from 
torch.autograd import Variable 5 | import random 6 | 7 | def read_corpus(file_path, vocab=None): 8 | data = [] 9 | with open(file_path, 'r') as infile: 10 | for sentence in readDepTree(infile, vocab): 11 | data.append(sentence) 12 | return data 13 | 14 | def sentences_numberize(sentences, vocab): 15 | for sentence in sentences: 16 | yield sentence2id(sentence, vocab) 17 | 18 | def sentence2id(sentence, vocab): 19 | result = [] 20 | for dep in sentence: 21 | wordid = vocab.word2id(dep.form) 22 | extwordid = vocab.extword2id(dep.form) 23 | tagid = vocab.tag2id(dep.tag) 24 | head = dep.head 25 | if head == -1: 26 | relid = -1 27 | elif head >= 0: 28 | relid = vocab.rel2id(dep.rel) 29 | if dep.treebank_id in vocab._id2tbank: 30 | tbankid = vocab.tbank2id(dep.treebank_id) 31 | else: 32 | tbankid=vocab.tbank2id(dep.proxy_tbank) 33 | word = dep.form 34 | charid = dep.charid 35 | id = dep.id 36 | result.append([wordid, extwordid, tagid, head, relid,word,charid, id,tbankid]) 37 | 38 | return result 39 | 40 | def batch_slice(data, batch_size): 41 | batch_num = int(np.ceil(len(data) / float(batch_size))) 42 | for i in range(batch_num): 43 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i 44 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)] 45 | 46 | yield sentences 47 | 48 | 49 | def data_iter(data, batch_size, shuffle=True): 50 | """ 51 | randomly permute data, then sort by source length, and partition into batches 52 | ensure that the length of sentences in each batch 53 | """ 54 | 55 | batched_data = [] 56 | if shuffle: np.random.shuffle(data) 57 | batched_data.extend(list(batch_slice(data, batch_size))) 58 | 59 | if shuffle: np.random.shuffle(batched_data) 60 | for batch in batched_data: 61 | yield batch 62 | 63 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration): 64 | batch_data1=random.sample(data1,domain_num_corpus_weighting) 65 | data2_size=domain_num_corpus_weighting*cdt_domain_ration 66 | batch_data2 = random.sample(data2, data2_size) 67 | batch_data=[] 68 | batch_data.extend(batch_data1) 69 | batch_data.extend(batch_data2) 70 | return batch_data 71 | 72 | def batch_data_variable(batch, vocab): 73 | length = len(batch[0]) 74 | batch_size = len(batch) 75 | for b in range(1, batch_size): 76 | if len(batch[b]) > length: length = len(batch[b]) 77 | 78 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 79 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 80 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 81 | tbank_ids = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 82 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False) 83 | heads = [] 84 | rels = [] 85 | lengths = [] 86 | elmosens = [] 87 | 88 | b = 0 89 | for sentence in sentences_numberize(batch, vocab): 90 | index = 0 91 | elmosen = [] 92 | length = len(sentence) 93 | lengths.append(length) 94 | elmosen.append(length) 95 | head = np.zeros((length), dtype=np.int32) 96 | rel = np.zeros((length), dtype=np.int32) 97 | for dep in sentence: 98 | words[b, index] = dep[0] 99 | extwords[b, index] = dep[1] 100 | tags[b, index] = dep[2] 101 | tbank_ids[b,index] = dep[8] 102 | head[index] = dep[3] 103 | rel[index] = dep[4] 104 | masks[b, index] = 1 105 | index += 1 106 | if dep[7] == 1: 107 | startcharid = dep[6] 108 | elmosen.append(startcharid) 109 | b += 1 110 | heads.append(head) 111 | 
rels.append(rel) 112 | elmosens.append(elmosen) 113 | 114 | return words, extwords, tags, heads, rels, lengths, masks,elmosens,tbank_ids 115 | 116 | def batch_variable_depTree(trees, heads, rels, lengths, vocab): 117 | for tree, head, rel, length in zip(trees, heads, rels, lengths): 118 | sentence = [] 119 | for idx in range(length): 120 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx]),tree[idx].charid,tree[idx].treebank_id)) 121 | yield sentence 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /multi_task_learning/data/Dataloader.py: -------------------------------------------------------------------------------- 1 | from data.Vocab import * 2 | import numpy as np 3 | import torch 4 | from torch.autograd import Variable 5 | import random 6 | 7 | def read_corpus(file_path, vocab=None): 8 | data = [] 9 | tgt_type = parse_my_conll(file_path) 10 | with open(file_path, 'r') as infile: 11 | for sentence in readDepTree(infile,tgt_type, vocab): 12 | data.append(sentence) 13 | return data 14 | 15 | def sentences_numberize(sentences, vocab): 16 | for sentence in sentences: 17 | yield sentence2id(sentence, vocab) 18 | 19 | def sentence2id(sentence, vocab): 20 | result = [] 21 | for dep in sentence.sentence: 22 | wordid = vocab.word2id(dep.form) 23 | extwordid = vocab.extword2id(dep.form) 24 | tagid = vocab.tag2id(dep.tag) 25 | head = dep.head 26 | if head == -1: 27 | relid = -1 28 | elif head >= 0: 29 | relid = vocab.rel2id(dep.rel) 30 | result.append([wordid, extwordid, tagid, head, relid]) 31 | 32 | return result 33 | 34 | 35 | 36 | def batch_slice(data, batch_size): 37 | batch_num = int(np.ceil(len(data) / float(batch_size))) 38 | for i in range(batch_num): 39 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i 40 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)] 41 | 42 | yield sentences 43 | 44 | 45 | def data_iter(data, batch_size, shuffle=True): 46 | """ 47 | randomly permute data, then sort by source length, and partition into batches 48 | ensure that the length of sentences in each batch 49 | """ 50 | 51 | batched_data = [] 52 | if shuffle: np.random.shuffle(data) 53 | batched_data.extend(list(batch_slice(data, batch_size))) 54 | 55 | if shuffle: np.random.shuffle(batched_data) 56 | for batch in batched_data: 57 | yield batch 58 | 59 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration): 60 | batch_data1=random.sample(data1,domain_num_corpus_weighting) 61 | data2_size=domain_num_corpus_weighting*cdt_domain_ration 62 | batch_data2 = random.sample(data2, data2_size) 63 | return batch_data1,batch_data2 64 | 65 | def data_iter_from_two_data(data1, data2, batch_size, shuffle=True): 66 | type1 = data1[0].tgt_type 67 | type2 = data2[0].tgt_type 68 | assert(type1 != type2) 69 | data = data1 + data2 70 | batched_data = [] 71 | if shuffle: np.random.shuffle(data) 72 | batched_data.extend(list(batch_slice(data, batch_size))) 73 | 74 | if shuffle: np.random.shuffle(batched_data) 75 | for batch in batched_data: 76 | batch1 = [] 77 | batch2 = [] 78 | for instance in batch: 79 | if instance.tgt_type == type1: 80 | batch1.append(instance) 81 | elif instance.tgt_type == type2: 82 | batch2.append(instance) 83 | else: 84 | print("there is error in corpus") 85 | exit(0) 86 | yield batch1, batch2 87 | def batch_data_variable(batch, vocab): 88 | length = len(batch[0]) 89 | batch_size = len(batch) 90 | for b in 
range(1, batch_size): 91 | if len(batch[b]) > length: length = len(batch[b]) 92 | 93 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 94 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 95 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False) 96 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False) 97 | heads = [] 98 | rels = [] 99 | lengths = [] 100 | 101 | b = 0 102 | for sentence in sentences_numberize(batch, vocab): 103 | index = 0 104 | length = len(sentence) 105 | lengths.append(length) 106 | head = np.zeros((length), dtype=np.int32) 107 | rel = np.zeros((length), dtype=np.int32) 108 | for dep in sentence: 109 | words[b, index] = dep[0] 110 | extwords[b, index] = dep[1] 111 | tags[b, index] = dep[2] 112 | head[index] = dep[3] 113 | rel[index] = dep[4] 114 | masks[b, index] = 1 115 | index += 1 116 | b += 1 117 | heads.append(head) 118 | rels.append(rel) 119 | 120 | return words, extwords, tags, heads, rels, lengths, masks 121 | 122 | def batch_variable_depTree(trees, heads, rels, lengths, vocab): 123 | for Tree, head, rel, length in zip(trees, heads, rels, lengths): 124 | tree = Tree.sentence 125 | sentence = [] 126 | for idx in range(length): 127 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx]))) 128 | yield sentence 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /biaffine-parser-concat/data/Dependency.py: -------------------------------------------------------------------------------- 1 | 2 | class Dependency: #以词为单位 3 | def __init__(self, id, form, tag, head, rel): 4 | self.id = id #该词在该句话的id 5 | self.org_form = form #该词语 6 | self.form = form.lower() 7 | self.tag = tag #该词词性 8 | self.head = head #该词头节点 9 | self.rel = rel #该词的标签 10 | 11 | def __str__(self): 12 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_"] 13 | return '\t'.join(values) 14 | 15 | @property 16 | def pseudo(self): 17 | return self.id == 0 or self.form == '' 18 | 19 | 20 | class DepTree: #以句子为单位,判断是否为投影树 21 | def __init__(self, sentence): 22 | self.words = list(sentence) 23 | self.start = 1 24 | if sentence[0].id == 1: self.start = 0 25 | elif sentence[0].id == 0: self.start = 1 26 | else: self.start = len(self.words) 27 | 28 | def isProj(self): 29 | n = len(self.words) 30 | words = self.words 31 | if self.start > 1: return False 32 | if self.start == 0: words = [None] + words 33 | for i in range(1, n): 34 | hi = words[i].head 35 | for j in range(i+1, hi): 36 | hj = words[j].head 37 | if (hj - hi) * (hj - i) > 0: 38 | return False 39 | return True 40 | 41 | 42 | def evalDepTree(gold, predict): 43 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU'] 44 | PUNCT_TAGS = [] 45 | ignore_tags = set(PUNCT_TAGS) 46 | start_g = 0 47 | if gold[0].id == 0: start_g = 1 48 | start_p = 0 49 | if predict[0].id == 0: start_p = 1 50 | 51 | glength = len(gold) - start_g 52 | plength = len(predict) - start_p 53 | 54 | if glength != plength: 55 | raise Exception('gold length does not match predict length.') 56 | 57 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0 58 | for idx in range(glength): 59 | if gold[start_g + idx].pseudo: continue 60 | if gold[start_g + idx].tag in ignore_tags: continue 61 | if gold[start_g + idx].head == -1: continue 62 | arc_total += 1 63 | label_total += 1 64 | if gold[start_g + idx].head == 
predict[start_p + idx].head: 65 | arc_correct += 1 66 | if gold[start_g + idx].rel == predict[start_p + idx].rel: 67 | label_correct += 1 68 | 69 | return arc_total, arc_correct, label_total, label_correct 70 | 71 | 72 | def readDepTree(file, vocab=None): 73 | proj = 0 74 | total = 0 75 | min_count = 1 76 | if vocab is None: min_count = 0 77 | if vocab is None: sentence = [] 78 | else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)] 79 | for line in file: 80 | tok = line.strip().split('\t') 81 | if not tok or line.strip() == '' or line.strip().startswith('#'): 82 | if len(sentence) > min_count: #sententce至少有一个词 83 | if DepTree(sentence).isProj(): 84 | proj += 1 85 | total += 1 86 | yield sentence 87 | if vocab is None: 88 | sentence = [] 89 | else: 90 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)] 91 | elif len(tok) == 10: 92 | if tok[6] == '_': tok[6] = '-1' 93 | try: 94 | sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7])) 95 | except Exception: 96 | pass 97 | else: 98 | pass 99 | 100 | if len(sentence) > min_count: 101 | if DepTree(sentence).isProj(): 102 | proj += 1 103 | total += 1 104 | yield sentence 105 | 106 | print("Total num: ", total) 107 | print("Proj num: ", proj) 108 | 109 | 110 | def writeDepTree(filename, sentences): 111 | with open(filename, 'w') as file: 112 | for sentence in sentences: 113 | for entry in sentence: 114 | if not entry.pseudo: file.write(str(entry) + '\n') 115 | file.write('\n') 116 | 117 | def printDepTree(output, sentence, gold=None): 118 | if gold== None: 119 | for entry in sentence: 120 | if not entry.pseudo: output.write(str(entry) + '\n') 121 | output.write('\n') 122 | else: 123 | start_g = 0 124 | if gold[0].id == 0: start_g = 1 125 | start_p = 0 126 | if sentence[0].id == 0: start_p = 1 127 | glength = len(gold) - start_g 128 | plength = len(sentence) - start_p 129 | 130 | if glength != plength: 131 | raise Exception('gold length does not match predict length.') 132 | 133 | for idx in range(glength): 134 | if gold[start_g + idx].pseudo: continue 135 | values = [str(gold[start_g + idx].id), gold[start_g + idx].org_form, "_", gold[start_g + idx].tag, \ 136 | "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"] 137 | output.write('\t'.join(values) + '\n') 138 | 139 | output.write('\n') 140 | 141 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/test_elmo.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import os 4 | import json 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from bilm.model import BidirectionalLanguageModel 10 | from bilm.data import Batcher 11 | from bilm.elmo import weight_layers 12 | 13 | FIXTURES = 'tests/fixtures/model/' 14 | 15 | 16 | 17 | class TestWeightedLayers(unittest.TestCase): 18 | def tearDown(self): 19 | tf.reset_default_graph() 20 | self.sess.close() 21 | 22 | def setUp(self): 23 | self.sess = tf.Session() 24 | 25 | def _check_weighted_layer(self, l2_coef, do_layer_norm, use_top_only): 26 | # create the Batcher 27 | vocab_file = os.path.join(FIXTURES, 'vocab_test.txt') 28 | batcher = Batcher(vocab_file, 50) 29 | 30 | # load the model 31 | options_file = os.path.join(FIXTURES, 'options.json') 32 | weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5') 33 | character_ids = tf.placeholder('int32', (None, None, 50)) 34 | model = BidirectionalLanguageModel( 35 | 
options_file, weight_file, max_batch_size=4) 36 | bilm_ops = model(character_ids) 37 | 38 | weighted_ops = [] 39 | for k in range(2): 40 | ops = weight_layers(str(k), bilm_ops, l2_coef=l2_coef, 41 | do_layer_norm=do_layer_norm, 42 | use_top_only=use_top_only) 43 | weighted_ops.append(ops) 44 | 45 | # initialize 46 | self.sess.run(tf.global_variables_initializer()) 47 | 48 | n_expected_trainable_weights = 2 * (1 + int(not use_top_only)) 49 | self.assertEqual(len(tf.trainable_variables()), 50 | n_expected_trainable_weights) 51 | # and one regularizer per weighted layer 52 | n_expected_reg_losses = 2 * int(not use_top_only) 53 | self.assertEqual( 54 | len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)), 55 | n_expected_reg_losses, 56 | ) 57 | 58 | # Set the variables. 59 | weights = [[np.array([0.1, 0.3, 0.5]), np.array([1.1])], 60 | [np.array([0.2, 0.4, 0.6]), np.array([0.88])]] 61 | for k in range(2): 62 | with tf.variable_scope('', reuse=True): 63 | if not use_top_only: 64 | W = tf.get_variable('{}_ELMo_W'.format(k)) 65 | _ = self.sess.run([W.assign(weights[k][0])]) 66 | gamma = tf.get_variable('{}_ELMo_gamma'.format(k)) 67 | _ = self.sess.run([gamma.assign(weights[k][1])]) 68 | 69 | # make some data 70 | sentences = [ 71 | ['The', 'first', 'sentence', '.'], 72 | ['The', 'second'], 73 | ['Third'] 74 | ] 75 | X_chars = batcher.batch_sentences(sentences) 76 | 77 | ops = model(character_ids) 78 | lm_embeddings, mask, weighted0, weighted1 = self.sess.run( 79 | [ops['lm_embeddings'], ops['mask'], 80 | weighted_ops[0]['weighted_op'], weighted_ops[1]['weighted_op']], 81 | feed_dict={character_ids: X_chars} 82 | ) 83 | actual_elmo = [weighted0, weighted1] 84 | 85 | # check the mask first 86 | expected_mask = [[True, True, True, True], 87 | [True, True, False, False], 88 | [True, False, False, False]] 89 | self.assertTrue((expected_mask == mask).all()) 90 | 91 | # Now compute the actual weighted layers 92 | for k in range(2): 93 | normed_weights = np.exp(weights[k][0] + 1.0 / 3) / np.sum( 94 | np.exp(weights[k][0] + 1.0 / 3)) 95 | # masked layer normalization 96 | expected_elmo = np.zeros((3, 4, lm_embeddings.shape[-1])) 97 | if not use_top_only: 98 | for j in range(3): # number of LM layers 99 | if do_layer_norm: 100 | mean = np.mean(lm_embeddings[:, j, :, :][mask]) 101 | std = np.std(lm_embeddings[:, j, :, :][mask]) 102 | normed_lm_embed = (lm_embeddings[:, j, :, :] - mean) / ( 103 | std + 1E-12) 104 | expected_elmo += normed_weights[j] * normed_lm_embed 105 | else: 106 | expected_elmo += normed_weights[j] * lm_embeddings[ 107 | :, j, :, :] 108 | else: 109 | expected_elmo += lm_embeddings[:, -1, :, :] 110 | 111 | # the scale parameter 112 | expected_elmo *= weights[k][1] 113 | self.assertTrue( 114 | np.allclose(expected_elmo, actual_elmo[k], atol=1e-6) 115 | ) 116 | 117 | def test_weighted_layers(self): 118 | self._check_weighted_layer(1.0, do_layer_norm=True, use_top_only=False) 119 | 120 | def test_weighted_layers_no_norm(self): 121 | self._check_weighted_layer(1.0, do_layer_norm=False, use_top_only=False) 122 | 123 | def test_weighted_layers_top_only(self): 124 | self._check_weighted_layer(None, do_layer_norm=False, use_top_only=True) 125 | 126 | 127 | if __name__ == '__main__': 128 | unittest.main() 129 | 130 | 131 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/fixtures/model/vocab_test.txt: -------------------------------------------------------------------------------- 1 | Bournemouth 2 | concept 3 | care 4 | were 5 | it 
6 | vague 7 | Arab 8 | ago 9 | comment 10 | roads 11 | Newsweek 12 | Pistons 13 | on 14 | rare 15 | not 16 | have 17 | majority 18 | Suzanne 19 | return 20 | report 21 | Charge 22 | World 23 | also 24 | thing 25 | Reuters 26 | you 27 | confirmed 28 | school 29 | press 30 | suspending 31 | later 32 | states 33 | 21 34 | swine 35 | them 36 | period 37 | route 38 | reversed 39 | Medicare 40 | likely 41 | at 42 | only 43 | awareness 44 | 23 45 | including 46 | 81st 47 | controversial 48 | romance 49 | Talk 50 | campaign 51 | long 52 | called 53 | Mo 54 | drug 55 | Melbourne 56 | major 57 | worst 58 | stops 59 | systems 60 | personnel 61 | U.S. 62 | even 63 | absolute 64 | aces 65 | to 66 | should 67 | persistent 68 | official 69 | Lithang 70 | Argentinian 71 | seeded 72 | Williams 73 | Winfrey 74 | Abdullah 75 | attacks 76 | Festival 77 | Control 78 | be 79 | search 80 | up 81 | over 82 | misuse 83 | barrel 84 | Cup 85 | unseeded 86 | other 87 | advised 88 | eastern 89 | ranked 90 | game 91 | strike 92 | legal 93 | specific 94 | guidance 95 | - 96 | stay 97 | does 98 | team 99 | small 100 | Dr. 101 | an 102 | ! 103 | that 104 | such 105 | centers 106 | percent 107 | militant 108 | Murray 109 | initially 110 | off 111 | businesses 112 | sexual 113 | continue 114 | violence 115 | as 116 | fulfil 117 | touch 118 | time 119 | necessary 120 | Democrats 121 | What 122 | hormone 123 | • 124 | if 125 | Brent 126 | bus 127 | en 128 | Dalai 129 | of 130 | racial 131 | itself 132 | with 133 | lobby 134 | disciplined 135 | out 136 | risks 137 | gathered 138 | people 139 | little 140 | traffic 141 | Rongye 142 | virus 143 | Nalbandian 144 | cents 145 | 2010 146 | entitlement 147 | employment 148 | launched 149 | possible 150 | between 151 | groups 152 | issue 153 | no 154 | reviewed 155 | losses 156 | estimates 157 | . 
158 | North 159 | everything 160 | November 161 | after 162 | previously 163 | her 164 | become 165 | ( 166 | David 167 | world 168 | unflattering 169 | next 170 | 68.88 171 | Tibet 172 | alcohol 173 | story 174 | kept 175 | 2009 176 | 1965 177 | same 178 | contenders 179 | Wacky 180 | lure 181 | stronger 182 | in 183 | Katrina 184 | 6-7 185 | freedom 186 | recommends 187 | most 188 | group 189 | a 190 | London 191 | til 192 | standard 193 | gallery 194 | Sea 195 | through-out 196 | 's 197 | 6-3 198 | many 199 | apparent 200 | Piper 201 | peacemaking 202 | officers 203 | reviewing 204 | for 205 | some 206 | play 207 | how 208 | 209 | enforce 210 | spent 211 | exploitation 212 | number 213 | taken 214 | so 215 | 5 216 | But 217 | Even 218 | At 219 | , 220 | Our 221 | Gordon 222 | Chicago 223 | head 224 | case 225 | 7 226 | Jordan 227 | delivery 228 | concerning 229 | palace 230 | King 231 | Cures 232 | she 233 | mildness 234 | maintains 235 | devastation 236 | Consequently 237 | Crazy 238 | tonight 239 | Almost 240 | transportation 241 | seems 242 | remain 243 | they 244 | bio-identical 245 | warrants 246 | East 247 | own 248 | cites 249 | perfected 250 | individuals 251 | 22 252 | teams 253 | coming 254 | food 255 | ) 256 | efforts 257 | Fish 258 | 259 | configuration 260 | road 261 | adding 262 | outstanding 263 | secondaries 264 | children 265 | / 266 | Tier 267 | is 268 | will 269 | AMMAN 270 | taxis 271 | the 272 | crude 273 | meet 274 | Hurricane 275 | but 276 | syndicated 277 | underperforming 278 | occurred 279 | wherever 280 | Why 281 | runoff 282 | & 283 | was 284 | win 285 | GB 286 | Lashkar-e-Taiba 287 | tour 288 | its 289 | views 290 | top 291 | Somers 292 | saying 293 | early 294 | two-thirds 295 | exceptions 296 | candidate 297 | ; 298 | It 299 | Barack 300 | seaworthy 301 | Nikam 302 | Elk 303 | protesting 304 | raise 305 | by 306 | 98th 307 | wasn 308 | Vegas 309 | Prevention 310 | Bush 311 | had 312 | nomad 313 | The 314 | flowing 315 | -- 316 | government 317 | Disease 318 | served 319 | schools 320 | won 321 | instances 322 | yet 323 | more 324 | remained 325 | That 326 | and 327 | join 328 | Lama 329 | masterminded 330 | fired 331 | calling 332 | invited 333 | six 334 | TfL 335 | Washington 336 | dollop 337 | To 338 | reason 339 | said 340 | actions 341 | " 342 | Las 343 | officials 344 | Don 345 | function 346 | two 347 | respective 348 | An 349 | may 350 | April 351 | players 352 | good 353 | 7-6 354 | Buchholz 355 | 6X6 356 | home 357 | Centers 358 | Congestion 359 | public 360 | ? 361 | pick 362 | Horse 363 | Borough 364 | seventh-seeded 365 | passengers 366 | there 367 | hunters 368 | meant 369 | inspected 370 | during 371 | rights 372 | prizes 373 | Council 374 | one 375 | or 376 | role 377 | would 378 | model 379 | mid-October 380 | carrying 381 | Last 382 | : 383 | behalf 384 | less 385 | cases 386 | works 387 | When 388 | flu 389 | help 390 | Muslim 391 | dollars 392 | 't 393 | treatment 394 | James 395 | his 396 | titled 397 | about 398 | title 399 | drop 400 | 4 401 | Authorities 402 | operation 403 | assassinations 404 | Hobart 405 | There 406 | Sunday 407 | Middle 408 | CBO 409 | slammed 410 | event 411 | Detroit 412 | Duncan 413 | intelligence 414 | reduce 415 | their 416 | Racing 417 | outbreaks 418 | rose 419 | Ms. 
420 | wins 421 | keep 422 | You 423 | beat 424 | been 425 | stubborn 426 | cover 427 | now 428 | share 429 | open 430 | show 431 | Oprah 432 | close 433 | involved 434 | year 435 | day 436 | Adak 437 | big 438 | Park 439 | log 440 | skill 441 | being 442 | promise 443 | who 444 | sport 445 | then 446 | If 447 | Obama 448 | Bulls 449 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/data/Dependency.py: -------------------------------------------------------------------------------- 1 | 2 | class Dependency: #以词为单位 3 | def __init__(self, id, form, tag, head, rel, treebank_id,proxy_tbank="BC"): 4 | self.id = id #该词在该句话的id 5 | self.org_form = form #该词语 6 | self.form = form.lower() 7 | self.tag = tag #该词词性 8 | self.head = head #该词头节点 9 | self.rel = rel #该词的标签 10 | self.treebank_id=treebank_id 11 | self.proxy_tbank=proxy_tbank #代理treebank_id,默认cdt 12 | 13 | def __str__(self): 14 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_"] 15 | return '\t'.join(values) 16 | 17 | @property 18 | def pseudo(self): 19 | return self.id == 0 or self.form == '' 20 | 21 | 22 | class DepTree: #以句子为单位,判断是否为投影树 23 | def __init__(self, sentence): 24 | self.words = list(sentence) 25 | self.start = 1 26 | if sentence[0].id == 1: self.start = 0 27 | elif sentence[0].id == 0: self.start = 1 28 | else: self.start = len(self.words) 29 | 30 | def isProj(self): 31 | n = len(self.words) 32 | words = self.words 33 | if self.start > 1: return False 34 | if self.start == 0: words = [None] + words 35 | for i in range(1, n): 36 | hi = words[i].head 37 | for j in range(i+1, hi): 38 | hj = words[j].head 39 | if (hj - hi) * (hj - i) > 0: 40 | return False 41 | return True 42 | 43 | 44 | def evalDepTree(gold, predict): 45 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU'] 46 | PUNCT_TAGS = [] 47 | ignore_tags = set(PUNCT_TAGS) 48 | start_g = 0 49 | if gold[0].id == 0: start_g = 1 50 | start_p = 0 51 | if predict[0].id == 0: start_p = 1 52 | 53 | glength = len(gold) - start_g 54 | plength = len(predict) - start_p 55 | 56 | if glength != plength: 57 | raise Exception('gold length does not match predict length.') 58 | 59 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0 60 | for idx in range(glength): 61 | if gold[start_g + idx].pseudo: continue 62 | if gold[start_g + idx].tag in ignore_tags: continue 63 | if gold[start_g + idx].head == -1: continue 64 | arc_total += 1 65 | label_total += 1 66 | if gold[start_g + idx].head == predict[start_p + idx].head: 67 | arc_correct += 1 68 | if gold[start_g + idx].rel == predict[start_p + idx].rel: 69 | label_correct += 1 70 | 71 | return arc_total, arc_correct, label_total, label_correct 72 | 73 | 74 | def readDepTree(file, vocab=None): 75 | proj = 0 76 | total = 0 77 | min_count = 1 78 | last_tbank = "PC" 79 | if vocab is None: min_count = 0 80 | if vocab is None: sentence = [] 81 | else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,last_tbank)] 82 | for line in file: 83 | tok = line.strip().split('\t') 84 | if not tok or line.strip() == '' or line.strip().startswith('#'): 85 | if len(sentence) > min_count: #sententce至少有一个词 86 | if DepTree(sentence).isProj(): 87 | proj += 1 88 | total += 1 89 | yield sentence 90 | if vocab is None: 91 | sentence = [] 92 | else: 93 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,last_tbank)] 94 | elif len(tok) == 10: 95 | if tok[6] == '_': tok[6] = '-1' 96 | try: 97 | 
sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7],tok[9])) 98 | last_tbank = tok[9] 99 | except Exception: 100 | pass 101 | else: 102 | pass 103 | 104 | if len(sentence) > min_count: 105 | if DepTree(sentence).isProj(): 106 | proj += 1 107 | total += 1 108 | yield sentence 109 | 110 | print("Total num: ", total) 111 | print("Proj num: ", proj) 112 | 113 | 114 | # def writeDepTree(filename, sentences): 115 | # with open(filename, 'w') as file: 116 | # for sentence in sentences: 117 | # for entry in sentence: 118 | # if not entry.pseudo: file.write(str(entry) + '\n') 119 | # file.write('\n') 120 | 121 | # def printDepTree(output, sentence, gold=None): 122 | # if gold== None: 123 | # for entry in sentence: 124 | # if not entry.pseudo: output.write(str(entry) + '\n') 125 | # output.write('\n') 126 | # else: 127 | # start_g = 0 128 | # if gold[0].id == 0: start_g = 1 129 | # start_p = 0 130 | # if sentence[0].id == 0: start_p = 1 131 | # glength = len(gold) - start_g 132 | # plength = len(sentence) - start_p 133 | # 134 | # if glength != plength: 135 | # raise Exception('gold length does not match predict length.') 136 | # 137 | # for idx in range(glength): 138 | # if gold[start_g + idx].pseudo: continue 139 | # values = [str(gold[start_g + idx].id), gold[start_g + idx].org_form, "_", gold[start_g + idx].tag, \ 140 | # "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"] 141 | # output.write('\t'.join(values) + '\n') 142 | # 143 | # output.write('\n') 144 | 145 | -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/Parser.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from driver.MST import * 3 | import torch.optim.lr_scheduler 4 | from driver.Layer import * 5 | import numpy as np 6 | 7 | 8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64): 9 | lengths = [len(x) for x in xs] 10 | if length is None: 11 | length = max(lengths) 12 | y = np.array([np.pad(x.astype(dtype), (0, length - l), 13 | mode="constant", constant_values=padding) 14 | for x, l in zip(xs, lengths)]) 15 | return torch.from_numpy(y) 16 | 17 | def _model_var(model, x): 18 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 19 | if p.is_cuda: 20 | x = x.cuda(p.get_device()) 21 | return torch.autograd.Variable(x) 22 | 23 | 24 | class BiaffineParser(object): 25 | def __init__(self, model, root_id): 26 | self.model = model 27 | self.root = root_id 28 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 29 | self.use_cuda = p.is_cuda 30 | self.device = p.get_device() if self.use_cuda else None 31 | 32 | def forward(self, words, extwords, tags, masks): 33 | if self.use_cuda: 34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device), 35 | tags = tags.cuda(self.device) 36 | masks = masks.cuda(self.device) 37 | 38 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks) 39 | # cache 40 | self.arc_logits = arc_logits 41 | self.rel_logits = rel_logits 42 | 43 | 44 | def compute_loss(self, true_arcs, true_rels, lengths): 45 | b, l1, l2 = self.arc_logits.size() 46 | index_true_arcs = _model_var( 47 | self.model, 48 | pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64)) 49 | true_arcs = _model_var( 50 | self.model, 51 | pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64)) 52 | 53 | masks = [] 54 | for length in lengths: 55 | mask = torch.FloatTensor([0] * 
length + [-10000] * (l2 - length)) 56 | mask = _model_var(self.model, mask) 57 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1) 58 | masks.append(mask.transpose(0, 1)) 59 | length_mask = torch.stack(masks, 0) 60 | arc_logits = self.arc_logits + length_mask 61 | 62 | arc_loss = F.cross_entropy( 63 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1), 64 | ignore_index=-1) 65 | 66 | size = self.rel_logits.size() 67 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 68 | 69 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 70 | rel_probs = [] 71 | for i in range(l1): 72 | rel_probs.append(logits[i][int(arcs[i])]) 73 | rel_probs = torch.stack(rel_probs, dim=0) 74 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 75 | 76 | b, l1, d = output_logits.size() 77 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64)) 78 | 79 | rel_loss = F.cross_entropy( 80 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1) 81 | 82 | loss = arc_loss + rel_loss 83 | 84 | return loss 85 | 86 | def compute_accuracy(self, true_arcs, true_rels): 87 | b, l1, l2 = self.arc_logits.size() 88 | pred_arcs = self.arc_logits.data.max(2)[1].cpu() 89 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 90 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 91 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum() 92 | 93 | 94 | size = self.rel_logits.size() 95 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 96 | 97 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 98 | rel_probs = [] 99 | for i in range(l1): 100 | rel_probs.append(logits[i][arcs[i]]) 101 | rel_probs = torch.stack(rel_probs, dim=0) 102 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 103 | 104 | pred_rels = output_logits.data.max(2)[1].cpu() 105 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64) 106 | label_correct = pred_rels.eq(true_rels).cpu().sum() 107 | 108 | total_arcs = b * l1 - np.sum(true_arcs.cpu().numpy() == -1) 109 | 110 | return arc_correct, label_correct, total_arcs 111 | 112 | def parse(self, words, extwords, tags, lengths, masks): 113 | if words is not None: 114 | self.forward(words, extwords, tags, masks) 115 | ROOT = self.root 116 | arcs_batch, rels_batch = [], [] 117 | arc_logits = self.arc_logits.data.cpu().numpy() 118 | rel_logits = self.rel_logits.data.cpu().numpy() 119 | 120 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths): 121 | arc_probs = softmax2d(arc_logit, length, length) 122 | arc_pred = arc_argmax(arc_probs, length, False) 123 | 124 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred] 125 | rel_pred = rel_argmax(rel_probs, length, ROOT, False) 126 | 127 | arcs_batch.append(arc_pred) 128 | rels_batch.append(rel_pred) 129 | 130 | return arcs_batch, rels_batch 131 | -------------------------------------------------------------------------------- /multi_task_learning/driver/Parser.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from driver.MST import * 3 | import torch.optim.lr_scheduler 4 | from driver.Layer import * 5 | import numpy as np 6 | 7 | 8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64): 9 | lengths = [len(x) for x in xs] 10 | if length is None: 11 | length = max(lengths) 12 | y = np.array([np.pad(x.astype(dtype), (0, length - l), 13 | 
mode="constant", constant_values=padding) 14 | for x, l in zip(xs, lengths)]) 15 | return torch.from_numpy(y) 16 | 17 | def _model_var(model, x): 18 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 19 | if p.is_cuda: 20 | x = x.cuda(p.get_device()) 21 | return torch.autograd.Variable(x) 22 | 23 | 24 | class BiaffineParser(object): 25 | def __init__(self, model, root_id): 26 | self.model = model 27 | self.root = root_id 28 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 29 | self.use_cuda = p.is_cuda 30 | self.device = p.get_device() if self.use_cuda else None 31 | 32 | def forward(self, words, extwords, tags, masks,type): 33 | if self.use_cuda: 34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device), 35 | tags = tags.cuda(self.device) 36 | masks = masks.cuda(self.device) 37 | 38 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks,type) 39 | # cache 40 | self.arc_logits = arc_logits 41 | self.rel_logits = rel_logits 42 | 43 | 44 | def compute_loss(self, true_arcs, true_rels, lengths): 45 | b, l1, l2 = self.arc_logits.size() 46 | index_true_arcs = _model_var( 47 | self.model, 48 | pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64)) 49 | true_arcs = _model_var( 50 | self.model, 51 | pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64)) 52 | 53 | masks = [] 54 | for length in lengths: 55 | mask = torch.FloatTensor([0] * length + [-10000] * (l2 - length)) 56 | mask = _model_var(self.model, mask) 57 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1) 58 | masks.append(mask.transpose(0, 1)) 59 | length_mask = torch.stack(masks, 0) 60 | arc_logits = self.arc_logits + length_mask 61 | 62 | arc_loss = F.cross_entropy( 63 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1), 64 | ignore_index=-1) 65 | 66 | size = self.rel_logits.size() 67 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 68 | 69 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 70 | rel_probs = [] 71 | for i in range(l1): 72 | rel_probs.append(logits[i][int(arcs[i])]) 73 | rel_probs = torch.stack(rel_probs, dim=0) 74 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 75 | 76 | b, l1, d = output_logits.size() 77 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64)) 78 | 79 | rel_loss = F.cross_entropy( 80 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1) 81 | 82 | loss = arc_loss + rel_loss 83 | 84 | return loss 85 | 86 | def compute_accuracy(self, true_arcs, true_rels): 87 | b, l1, l2 = self.arc_logits.size() 88 | pred_arcs = self.arc_logits.data.max(2)[1].cpu() 89 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 90 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 91 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum() 92 | 93 | 94 | size = self.rel_logits.size() 95 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 96 | 97 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 98 | rel_probs = [] 99 | for i in range(l1): 100 | rel_probs.append(logits[i][arcs[i]]) 101 | rel_probs = torch.stack(rel_probs, dim=0) 102 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 103 | 104 | pred_rels = output_logits.data.max(2)[1].cpu() 105 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64) 106 | label_correct = pred_rels.eq(true_rels).cpu().sum() 107 | 108 | total_arcs = b * l1 - 
np.sum(true_arcs.cpu().numpy() == -1) 109 | 110 | return arc_correct, label_correct, total_arcs 111 | 112 | def parse(self, words, extwords, tags, lengths, masks,type): 113 | if words is not None: 114 | self.forward(words, extwords, tags, masks,type) 115 | ROOT = self.root 116 | arcs_batch, rels_batch = [], [] 117 | arc_logits = self.arc_logits.data.cpu().numpy() 118 | rel_logits = self.rel_logits.data.cpu().numpy() 119 | 120 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths): 121 | arc_probs = softmax2d(arc_logit, length, length) 122 | arc_pred = arc_argmax(arc_probs, length, False) 123 | 124 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred] 125 | rel_pred = rel_argmax(rel_probs, length, ROOT, False) 126 | 127 | arcs_batch.append(arc_pred) 128 | rels_batch.append(rel_pred) 129 | 130 | return arcs_batch, rels_batch 131 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/Parser.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from driver.MST import * 3 | import torch.optim.lr_scheduler 4 | from driver.Layer import * 5 | import numpy as np 6 | 7 | 8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64): 9 | lengths = [len(x) for x in xs] 10 | if length is None: 11 | length = max(lengths) 12 | y = np.array([np.pad(x.astype(dtype), (0, length - l), 13 | mode="constant", constant_values=padding) 14 | for x, l in zip(xs, lengths)]) 15 | return torch.from_numpy(y) 16 | 17 | def _model_var(model, x): 18 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 19 | if p.is_cuda: 20 | x = x.cuda(p.get_device()) 21 | return torch.autograd.Variable(x) 22 | 23 | 24 | class BiaffineParser(object): 25 | def __init__(self, model, root_id): 26 | self.model = model 27 | self.root = root_id 28 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 29 | self.use_cuda = p.is_cuda 30 | self.device = p.get_device() if self.use_cuda else None 31 | 32 | def forward(self, words, extwords, tags, masks,elmosens,elmofile,tbank_ids): 33 | if self.use_cuda: 34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device), 35 | tags = tags.cuda(self.device) 36 | masks = masks.cuda(self.device) 37 | tbank_ids=tbank_ids.cuda(self.device) 38 | 39 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks,elmosens,elmofile,tbank_ids) 40 | # cache 41 | self.arc_logits = arc_logits 42 | self.rel_logits = rel_logits 43 | 44 | 45 | def compute_loss(self, true_arcs, true_rels, lengths): 46 | b, l1, l2 = self.arc_logits.size() 47 | index_true_arcs = _model_var(self.model,pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64)) 48 | true_arcs = _model_var(self.model,pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64)) 49 | masks = [] 50 | for length in lengths: 51 | mask = torch.FloatTensor([0] * length + [-10000] * (l2 - length)) 52 | mask = _model_var(self.model, mask) 53 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1) 54 | masks.append(mask.transpose(0, 1)) 55 | length_mask = torch.stack(masks, 0) 56 | arc_logits = self.arc_logits + length_mask 57 | 58 | arc_loss = F.cross_entropy( 59 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1), 60 | ignore_index=-1) 61 | 62 | size = self.rel_logits.size() 63 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 64 | 65 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 66 | 
rel_probs = [] 67 | for i in range(l1): 68 | rel_probs.append(logits[i][int(arcs[i])]) 69 | rel_probs = torch.stack(rel_probs, dim=0) 70 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 71 | 72 | b, l1, d = output_logits.size() 73 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64)) 74 | 75 | rel_loss = F.cross_entropy( 76 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1) 77 | 78 | loss = arc_loss + rel_loss 79 | 80 | return loss 81 | 82 | def compute_accuracy(self, true_arcs, true_rels): 83 | b, l1, l2 = self.arc_logits.size() 84 | pred_arcs = self.arc_logits.data.max(2)[1].cpu() 85 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 86 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 87 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum() 88 | 89 | 90 | size = self.rel_logits.size() 91 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 92 | 93 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 94 | rel_probs = [] 95 | for i in range(l1): 96 | rel_probs.append(logits[i][arcs[i]]) 97 | rel_probs = torch.stack(rel_probs, dim=0) 98 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 99 | 100 | pred_rels = output_logits.data.max(2)[1].cpu() 101 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64) 102 | label_correct = pred_rels.eq(true_rels).cpu().sum() 103 | 104 | total_arcs = b * l1 - np.sum(true_arcs.cpu().numpy() == -1) 105 | 106 | return arc_correct, label_correct, total_arcs 107 | 108 | def parse(self, words, extwords, tags, lengths, masks,elmosens,elmofile,tbank_ids): 109 | if words is not None: 110 | self.forward(words, extwords, tags, masks,elmosens,elmofile,tbank_ids) 111 | ROOT = self.root 112 | arcs_batch, rels_batch = [], [] 113 | arc_logits = self.arc_logits.data.cpu().numpy() 114 | rel_logits = self.rel_logits.data.cpu().numpy() 115 | 116 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths): 117 | arc_probs = softmax2d(arc_logit, length, length) 118 | arc_pred = arc_argmax(arc_probs, length, False) 119 | 120 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred] 121 | rel_pred = rel_argmax(rel_probs, length, ROOT, False) 122 | 123 | arcs_batch.append(arc_pred) 124 | rels_batch.append(rel_pred) 125 | 126 | return arcs_batch, rels_batch 127 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/data/Dependency.py: -------------------------------------------------------------------------------- 1 | 2 | class Dependency: #以词为单位 3 | def __init__(self, id, form, tag, head, rel,charid,treebank_id,proxy_tbank="cdt"): 4 | self.id = id #该词在该句话的id 5 | self.org_form = form #该词语 6 | self.form = form.lower() 7 | self.tag = tag #该词词性 8 | self.head = head #该词头节点 9 | self.rel = rel #该词的标签 10 | self.charid = charid 11 | self.treebank_id=treebank_id 12 | self.proxy_tbank=proxy_tbank #代理treebank_id,默认cdt 13 | 14 | 15 | def __str__(self): 16 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_", str(self.charid)] 17 | return '\t'.join(values) 18 | 19 | @property 20 | def pseudo(self): 21 | return self.id == 0 or self.form == '' 22 | 23 | 24 | class DepTree: #以句子为单位,判断是否为投影树 25 | def __init__(self, sentence): 26 | self.words = list(sentence) 27 | self.start = 1 28 | if sentence[0].id == 1: self.start = 0 29 | elif sentence[0].id == 0: self.start = 1 30 | else: self.start = 
len(self.words) 31 | 32 | def isProj(self): 33 | n = len(self.words) 34 | words = self.words 35 | if self.start > 1: return False 36 | if self.start == 0: words = [None] + words 37 | for i in range(1, n): 38 | hi = words[i].head 39 | for j in range(i+1, hi): 40 | hj = words[j].head 41 | if (hj - hi) * (hj - i) > 0: 42 | return False 43 | return True 44 | 45 | 46 | def evalDepTree(gold, predict): 47 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU'] 48 | PUNCT_TAGS = [] 49 | ignore_tags = set(PUNCT_TAGS) 50 | start_g = 0 51 | if gold[0].id == 0: start_g = 1 52 | start_p = 0 53 | if predict[0].id == 0: start_p = 1 54 | 55 | glength = len(gold) - start_g 56 | plength = len(predict) - start_p 57 | 58 | if glength != plength: 59 | raise Exception('gold length does not match predict length.') 60 | 61 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0 62 | for idx in range(glength): 63 | if gold[start_g + idx].pseudo: continue 64 | if gold[start_g + idx].tag in ignore_tags: continue 65 | if gold[start_g + idx].head == -1: continue 66 | arc_total += 1 67 | label_total += 1 68 | if gold[start_g + idx].head == predict[start_p + idx].head: 69 | arc_correct += 1 70 | if gold[start_g + idx].rel == predict[start_p + idx].rel: 71 | label_correct += 1 72 | 73 | return arc_total, arc_correct, label_total, label_correct 74 | 75 | 76 | def readDepTree(file, vocab=None): 77 | proj = 0 78 | total = 0 79 | min_count = 1 80 | charid = 0 81 | if vocab is None: min_count = 0 82 | if vocab is None: sentence = [] 83 | # else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)] 84 | else: 85 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,-1,'cdt')] 86 | for line in file: 87 | tok = line.strip().split('\t') 88 | if not tok or line.strip() == '' or line.strip().startswith('#'): 89 | if len(sentence) > min_count: #sententce至少有一个词 90 | if DepTree(sentence).isProj(): 91 | proj += 1 92 | total += 1 93 | yield sentence 94 | if vocab is None: 95 | sentence = [] 96 | else: 97 | # sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)] 98 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,-1,'cdt')] 99 | elif len(tok) == 10: 100 | if tok[6] == '_': tok[6] = '-1' 101 | try: 102 | sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7],charid,tok[9])) 103 | charid += 1 104 | except Exception: 105 | pass 106 | else: 107 | pass 108 | 109 | if len(sentence) > min_count: 110 | if DepTree(sentence).isProj(): 111 | proj += 1 112 | total += 1 113 | yield sentence 114 | 115 | print("Total num: ", total) 116 | print("Proj num: ", proj) 117 | 118 | 119 | def writeDepTree(filename, sentences): 120 | with open(filename, 'w') as file: 121 | for sentence in sentences: 122 | for entry in sentence: 123 | if not entry.pseudo: file.write(str(entry) + '\n') 124 | file.write('\n') 125 | 126 | def printDepTree(output, sentence, gold=None): 127 | if gold== None: 128 | for entry in sentence: 129 | if not entry.pseudo: output.write(str(entry) + '\n') 130 | output.write('\n') 131 | else: 132 | start_g = 0 133 | if gold[0].id == 0: start_g = 1 134 | start_p = 0 135 | if sentence[0].id == 0: start_p = 1 136 | glength = len(gold) - start_g 137 | plength = len(sentence) - start_p 138 | 139 | if glength != plength: 140 | raise Exception('gold length does not match predict length.') 141 | 142 | for idx in range(glength): 143 | if gold[start_g + idx].pseudo: continue 144 | values = [str(gold[start_g + idx].id), gold[start_g + 
idx].org_form, "_", gold[start_g + idx].tag, \ 145 | "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"] 146 | output.write('\t'.join(values) + '\n') 147 | 148 | output.write('\n') 149 | 150 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/Parser.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from driver.MST import * 3 | import torch.optim.lr_scheduler 4 | from driver.Layer import * 5 | import numpy as np 6 | 7 | 8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64): 9 | lengths = [len(x) for x in xs] 10 | if length is None: 11 | length = max(lengths) 12 | y = np.array([np.pad(x.astype(dtype), (0, length - l), 13 | mode="constant", constant_values=padding) 14 | for x, l in zip(xs, lengths)]) 15 | return torch.from_numpy(y) 16 | 17 | def _model_var(model, x): 18 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 19 | if p.is_cuda: 20 | x = x.cuda(p.get_device()) 21 | return torch.autograd.Variable(x) 22 | 23 | 24 | class BiaffineParser(object): 25 | def __init__(self, model, root_id): 26 | self.model = model 27 | self.root = root_id 28 | p = next(filter(lambda p: p.requires_grad, model.parameters())) 29 | self.use_cuda = p.is_cuda 30 | self.device = p.get_device() if self.use_cuda else None 31 | 32 | def forward(self, words, extwords, tags, masks,tbank_ids): 33 | if self.use_cuda: 34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device) 35 | tags = tags.cuda(self.device) 36 | masks = masks.cuda(self.device) 37 | tbank_ids=tbank_ids.cuda(self.device) 38 | 39 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks,tbank_ids) 40 | # cache 41 | self.arc_logits = arc_logits 42 | self.rel_logits = rel_logits 43 | 44 | 45 | def compute_loss(self, true_arcs, true_rels, lengths): 46 | b, l1, l2 = self.arc_logits.size() 47 | index_true_arcs = _model_var( 48 | self.model, 49 | pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64)) 50 | true_arcs = _model_var( 51 | self.model, 52 | pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64)) 53 | 54 | masks = [] 55 | for length in lengths: 56 | mask = torch.FloatTensor([0] * length + [-10000] * (l2 - length)) 57 | mask = _model_var(self.model, mask) 58 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1) 59 | masks.append(mask.transpose(0, 1)) 60 | length_mask = torch.stack(masks, 0) 61 | arc_logits = self.arc_logits + length_mask 62 | 63 | arc_loss = F.cross_entropy( 64 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1), 65 | ignore_index=-1) 66 | 67 | size = self.rel_logits.size() 68 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 69 | 70 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 71 | rel_probs = [] 72 | for i in range(l1): 73 | rel_probs.append(logits[i][int(arcs[i])]) 74 | rel_probs = torch.stack(rel_probs, dim=0) 75 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 76 | 77 | b, l1, d = output_logits.size() 78 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64)) 79 | 80 | rel_loss = F.cross_entropy( 81 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1) 82 | 83 | loss = arc_loss + rel_loss 84 | 85 | return loss 86 | 87 | def compute_accuracy(self, true_arcs, true_rels): 88 | b, l1, l2 = self.arc_logits.size() 89 | pred_arcs = 
self.arc_logits.data.max(2)[1].cpu() 90 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 91 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64) 92 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum() 93 | 94 | 95 | size = self.rel_logits.size() 96 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3])) 97 | 98 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)): 99 | rel_probs = [] 100 | for i in range(l1): 101 | rel_probs.append(logits[i][arcs[i]]) 102 | rel_probs = torch.stack(rel_probs, dim=0) 103 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 104 | 105 | pred_rels = output_logits.data.max(2)[1].cpu() 106 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64) 107 | label_correct = pred_rels.eq(true_rels).cpu().sum() 108 | 109 | total_arcs = b * l1 - np.sum(true_arcs.cpu().numpy() == -1) 110 | 111 | return arc_correct, label_correct, total_arcs 112 | 113 | def parse(self, words, extwords, tags, lengths, masks,tbank_ids): 114 | if words is not None: 115 | self.forward(words, extwords, tags, masks,tbank_ids) 116 | ROOT = self.root 117 | arcs_batch, rels_batch = [], [] 118 | arc_logits = self.arc_logits.data.cpu().numpy() 119 | rel_logits = self.rel_logits.data.cpu().numpy() 120 | 121 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths): 122 | arc_probs = softmax2d(arc_logit, length, length) 123 | arc_pred = arc_argmax(arc_probs, length, False) 124 | 125 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred] 126 | rel_pred = rel_argmax(rel_probs, length, ROOT, False) 127 | 128 | arcs_batch.append(arc_pred) 129 | rels_batch.append(rel_pred) 130 | 131 | return arcs_batch, rels_batch 132 | -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/Model.py: -------------------------------------------------------------------------------- 1 | from driver.Layer import * 2 | from data.Vocab import * 3 | 4 | 5 | def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb): 6 | batch_size, seq_length, _ = word_embeddings.size() 7 | word_masks = word_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb) 8 | word_masks = Variable(torch.bernoulli(word_masks), requires_grad=False) 9 | tag_masks = tag_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb) 10 | tag_masks = Variable(torch.bernoulli(tag_masks), requires_grad=False) 11 | scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12) 12 | word_masks *= scale 13 | tag_masks *= scale 14 | word_masks = word_masks.unsqueeze(dim=2) 15 | tag_masks = tag_masks.unsqueeze(dim=2) 16 | word_embeddings = word_embeddings * word_masks 17 | tag_embeddings = tag_embeddings * tag_masks 18 | 19 | return word_embeddings, tag_embeddings 20 | 21 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True): 22 | if batch_first: 23 | inputs = inputs.transpose(0, 1) 24 | seq_length, batch_size, hidden_size = inputs.size() 25 | drop_masks = inputs.data.new(batch_size, hidden_size).fill_(1 - dropout) 26 | drop_masks = Variable(torch.bernoulli(drop_masks), requires_grad=False) 27 | drop_masks = drop_masks / (1 - dropout) 28 | drop_masks = torch.unsqueeze(drop_masks, dim=2).expand(-1, -1, seq_length).permute(2, 0, 1) 29 | inputs = inputs * drop_masks 30 | 31 | return inputs.transpose(1, 0) 32 | 33 | 34 | class ParserModel(nn.Module): 35 | def __init__(self, vocab, config, pretrained_embedding): 36 | super(ParserModel, self).__init__() 37 | 
self.config = config 38 | self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0) 39 | self.extword_embed = nn.Embedding(vocab.extvocab_size, config.word_dims, padding_idx=0) 40 | self.tag_embed = nn.Embedding(vocab.tag_size, config.tag_dims, padding_idx=0) 41 | 42 | word_init = np.zeros((vocab.vocab_size, config.word_dims), dtype=np.float32) 43 | self.word_embed.weight.data.copy_(torch.from_numpy(word_init)) 44 | 45 | tag_init = np.random.randn(vocab.tag_size, config.tag_dims).astype(np.float32) 46 | self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init)) 47 | 48 | self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding)) 49 | self.extword_embed.weight.requires_grad = False 50 | 51 | 52 | self.lstm = MyLSTM( 53 | input_size=config.word_dims + config.tag_dims, 54 | hidden_size=config.lstm_hiddens, 55 | num_layers=config.lstm_layers, 56 | batch_first=True, 57 | bidirectional=True, 58 | dropout_in = config.dropout_lstm_input, 59 | dropout_out=config.dropout_lstm_hidden, 60 | ) 61 | 62 | self.mlp_arc_dep = NonLinear( 63 | input_size = 2*config.lstm_hiddens, 64 | hidden_size = config.mlp_arc_size+config.mlp_rel_size, 65 | activation = nn.LeakyReLU(0.1)) 66 | self.mlp_arc_head = NonLinear( 67 | input_size = 2*config.lstm_hiddens, 68 | hidden_size = config.mlp_arc_size+config.mlp_rel_size, 69 | activation = nn.LeakyReLU(0.1)) 70 | 71 | self.total_num = int((config.mlp_arc_size+config.mlp_rel_size) / 100) 72 | self.arc_num = int(config.mlp_arc_size / 100) 73 | self.rel_num = int(config.mlp_rel_size / 100) 74 | 75 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, \ 76 | 1, bias=(True, False)) 77 | self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, \ 78 | vocab.rel_size, bias=(True, True)) 79 | 80 | def forward(self, words, extwords, tags, masks): 81 | # x = (batch size, sequence length, dimension of embedding) 82 | x_word_embed = self.word_embed(words) 83 | x_extword_embed = self.extword_embed(extwords) 84 | x_embed = x_word_embed + x_extword_embed 85 | x_tag_embed = self.tag_embed(tags) 86 | 87 | if self.training: 88 | x_embed, x_tag_embed = drop_input_independent(x_embed, x_tag_embed, self.config.dropout_emb) 89 | 90 | x_lexical = torch.cat((x_embed, x_tag_embed), dim=2) 91 | 92 | outputs, _ = self.lstm(x_lexical, masks, None) 93 | outputs = outputs.transpose(1, 0) 94 | 95 | if self.training: 96 | outputs = drop_sequence_sharedmask(outputs, self.config.dropout_mlp) 97 | 98 | x_all_dep = self.mlp_arc_dep(outputs) 99 | x_all_head = self.mlp_arc_head(outputs) 100 | 101 | if self.training: 102 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp) 103 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp) 104 | 105 | x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2) 106 | x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2) 107 | 108 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 109 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 110 | 111 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 112 | arc_logit = torch.squeeze(arc_logit, dim=3) 113 | 114 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2) 115 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2) 116 | 117 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head) 118 | return arc_logit, rel_logit_cond -------------------------------------------------------------------------------- 
/multi_task_learning/data/Dependency.py: -------------------------------------------------------------------------------- 1 | 2 | class Dependency: # one instance per word 3 | def __init__(self, id, form, tag, head, rel): 4 | self.id = id # index of the word within the sentence 5 | self.org_form = form # the original word form 6 | self.form = form.lower() 7 | self.tag = tag # the word's POS tag 8 | self.head = head # index of the word's head 9 | self.rel = rel # the word's dependency relation label 10 | 11 | def __str__(self): 12 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_"] 13 | return '\t'.join(values) 14 | 15 | @property 16 | def pseudo(self): 17 | return self.id == 0 or self.form == '' 18 | 19 | 20 | class DepTree: # one instance per sentence; used to check whether the tree is projective 21 | def __init__(self, sentence): 22 | self.words = list(sentence) 23 | self.start = 1 24 | if sentence[0].id == 1: self.start = 0 25 | elif sentence[0].id == 0: self.start = 1 26 | else: self.start = len(self.words) 27 | 28 | def isProj(self): 29 | n = len(self.words) 30 | words = self.words 31 | if self.start > 1: return False 32 | if self.start == 0: words = [None] + words 33 | for i in range(1, n): 34 | hi = words[i].head 35 | for j in range(i+1, hi): 36 | hj = words[j].head 37 | if (hj - hi) * (hj - i) > 0: 38 | return False 39 | return True 40 | 41 | 42 | def evalDepTree(gold, predict): 43 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU'] 44 | PUNCT_TAGS = [] 45 | ignore_tags = set(PUNCT_TAGS) 46 | start_g = 0 47 | if gold[0].id == 0: start_g = 1 48 | start_p = 0 49 | if predict[0].id == 0: start_p = 1 50 | 51 | glength = len(gold) - start_g 52 | plength = len(predict) - start_p 53 | 54 | if glength != plength: 55 | raise Exception('gold length does not match predict length.') 56 | 57 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0 58 | for idx in range(glength): 59 | if gold[start_g + idx].pseudo: continue 60 | if gold[start_g + idx].tag in ignore_tags: continue 61 | if gold[start_g + idx].head == -1: continue 62 | arc_total += 1 63 | label_total += 1 64 | if gold[start_g + idx].head == predict[start_p + idx].head: 65 | arc_correct += 1 66 | if gold[start_g + idx].rel == predict[start_p + idx].rel: 67 | label_correct += 1 68 | 69 | return arc_total, arc_correct, label_total, label_correct 70 | class Instance: 71 | def __init__(self, list_sentence, tgt_type): 72 | self.sentence = list_sentence 73 | self.tgt_type = tgt_type 74 | 75 | def __len__(self): 76 | return len(self.sentence) 77 | 78 | def parse_my_conll(file_path): 79 | file = open(file_path, mode="r", encoding="utf-8") 80 | for line in file: 81 | tokens = line.strip().split('\t') 82 | tgt_type = tokens[-1] 83 | file.close() 84 | return tgt_type 85 | 86 | def readDepTree(file,tgt_type, vocab=None): 87 | proj = 0 88 | total = 0 89 | min_count = 1 90 | if vocab is None: min_count = 0 91 | if vocab is None: sentence = [] 92 | else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)] 93 | for line in file: 94 | tok = line.strip().split('\t') 95 | if not tok or line.strip() == '' or line.strip().startswith('#'): 96 | if len(sentence) > min_count: # the sentence must contain at least one word 97 | if DepTree(sentence).isProj(): 98 | proj += 1 99 | total += 1 100 | yield Instance(sentence, tgt_type) 101 | if vocab is None: 102 | sentence = [] 103 | else: 104 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)] 105 | elif len(tok) == 10: 106 | if tok[6] == '_': tok[6] = '-1' 107 | try: 108 | sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7])) 109 | except Exception: 110 | pass 111 | else: 112 | pass 113 | 114 | if
len(sentence) > min_count: 115 | if DepTree(sentence).isProj(): 116 | proj += 1 117 | total += 1 118 | yield Instance(sentence, tgt_type) 119 | 120 | print("Total num: ", total) 121 | print("Proj num: ", proj) 122 | 123 | 124 | def writeDepTree(filename, sentences): 125 | with open(filename, 'w') as file: 126 | for sentence in sentences: 127 | for entry in sentence: 128 | if not entry.pseudo: file.write(str(entry) + '\n') 129 | file.write('\n') 130 | 131 | def printDepTree(output, sentence, gold=None): 132 | if gold== None: 133 | for entry in sentence: 134 | if not entry.pseudo: output.write(str(entry) + '\n') 135 | output.write('\n') 136 | else: 137 | start_g = 0 138 | if gold[0].id == 0: start_g = 1 139 | start_p = 0 140 | if sentence[0].id == 0: start_p = 1 141 | glength = len(gold) - start_g 142 | plength = len(sentence) - start_p 143 | 144 | if glength != plength: 145 | raise Exception('gold length does not match predict length.') 146 | 147 | for idx in range(glength): 148 | if gold[start_g + idx].pseudo: continue 149 | values = [str(gold[start_g + idx].id), gold[start_g + idx].org_form, "_", gold[start_g + idx].tag, \ 150 | "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"] 151 | output.write('\t'.join(values) + '\n') 152 | 153 | output.write('\n') 154 | 155 | -------------------------------------------------------------------------------- /ELMo/fine-tune-elmo/tests/test_training.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import unittest 4 | import os 5 | import shutil 6 | import tempfile 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | from bilm.training import train, test, load_vocab, \ 12 | load_options_latest_checkpoint 13 | from bilm.data import LMDataset, BidirectionalLMDataset 14 | 15 | FIXTURES = 'tests/fixtures/train/' 16 | 17 | class TestLanguageModel(unittest.TestCase): 18 | def tearDown(self): 19 | shutil.rmtree(self.tmp_dir) 20 | os.remove(self.tmp_file) 21 | tf.reset_default_graph() 22 | 23 | def setUp(self): 24 | self.tmp_dir = tempfile.mkdtemp() 25 | (_, self.tmp_file) = tempfile.mkstemp() 26 | 27 | def _get_data(self, bidirectional, use_chars, test=False): 28 | vocab_file = os.path.join(FIXTURES, 'vocab.txt') 29 | if use_chars: 30 | vocab = load_vocab(vocab_file, 10) 31 | else: 32 | vocab = load_vocab(vocab_file, None) 33 | 34 | prefix = os.path.join(FIXTURES, 'data.txt') 35 | 36 | if bidirectional: 37 | data = BidirectionalLMDataset(prefix, vocab, test=test) 38 | else: 39 | data = LMDataset(prefix, vocab, test=test, reverse=False) 40 | 41 | return data, vocab 42 | 43 | def _get_vocab_data_options(self, bidirectional, use_chars, 44 | share_embedding_softmax=False): 45 | data, vocab = self._get_data(bidirectional, use_chars) 46 | options = { 47 | 'n_tokens_vocab': vocab.size, 48 | 'n_negative_samples_batch': 16, 49 | 'n_train_tokens': 134, 50 | 'batch_size': 2, 51 | 'unroll_steps': 10, 52 | 'n_epochs': 50, 53 | 'all_clip_norm_val': 1.0, 54 | 'dropout': 0.1, 55 | 'lstm': {'dim': 16, 'projection_dim': 8, 'n_layers': 2}, 56 | 'bidirectional': bidirectional, 57 | } 58 | 59 | if use_chars: 60 | options['char_cnn'] = { 61 | 'n_characters': 261, 62 | 'max_characters_per_token': 10, 63 | 'filters': [ 64 | [1, 8], 65 | [2, 8], 66 | [3, 16], 67 | [4, 32], 68 | [5, 64], 69 | ], 70 | 'activation': 'tanh', 71 | 'embedding': {'dim': 4}, 72 | 'n_highway': 1, 73 | } 74 | 75 | if share_embedding_softmax: 76 | options['share_embedding_softmax'] = True 77 | 78 | return vocab, data, 
options 79 | 80 | def test_train_single_direction(self): 81 | vocab, data, options = self._get_vocab_data_options(False, False) 82 | train(options, data, 1, self.tmp_dir, self.tmp_dir) 83 | 84 | # now test 85 | tf.reset_default_graph() 86 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir) 87 | data_test, vocab_test = self._get_data(False, False, True) 88 | perplexity = test(options, ckpt_file, data_test, batch_size=1) 89 | self.assertTrue(perplexity < 20.0) 90 | 91 | def test_train_bilm_chars(self): 92 | vocab, data, options = self._get_vocab_data_options(True, True) 93 | train(options, data, 1, self.tmp_dir, self.tmp_dir) 94 | 95 | # now test 96 | tf.reset_default_graph() 97 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir) 98 | data_test, vocab_test = self._get_data(True, True, True) 99 | perplexity = test(options, ckpt_file, data_test, batch_size=1) 100 | self.assertTrue(perplexity < 20.0) 101 | 102 | def test_shared_variables(self): 103 | vocab, data, options = self._get_vocab_data_options(True, True) 104 | options['n_epochs'] = 1 105 | train(options, data, 2, self.tmp_dir, self.tmp_dir) 106 | self.assertEqual(len(tf.global_variables()), 64) 107 | 108 | def test_train_shared_softmax_embedding(self): 109 | bidirectional = True 110 | use_chars = False 111 | 112 | vocab, data, options = self._get_vocab_data_options( 113 | bidirectional, use_chars, share_embedding_softmax=True) 114 | train(options, data, 1, self.tmp_dir, self.tmp_dir) 115 | 116 | # now test 117 | tf.reset_default_graph() 118 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir) 119 | data_test, vocab_test = self._get_data( 120 | bidirectional, use_chars, test=True) 121 | perplexity = test(options, ckpt_file, data_test, batch_size=1) 122 | self.assertTrue(perplexity < 20.0) 123 | 124 | def test_train_shared_softmax_no_chars(self): 125 | bidirectional = True 126 | use_chars = True 127 | vocab, data, options = self._get_vocab_data_options( 128 | bidirectional, use_chars, share_embedding_softmax=True) 129 | # character inputs and sharing weights not suppported 130 | with self.assertRaises(ValueError): 131 | train(options, data, 1, self.tmp_dir, self.tmp_dir) 132 | 133 | def test_train_skip_connections(self): 134 | bidirectional = True 135 | use_chars = False 136 | vocab, data, options = self._get_vocab_data_options( 137 | bidirectional, use_chars) 138 | options['lstm']['use_skip_connections'] = True 139 | train(options, data, 1, self.tmp_dir, self.tmp_dir) 140 | 141 | # now test 142 | tf.reset_default_graph() 143 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir) 144 | data_test, vocab_test = self._get_data( 145 | bidirectional, use_chars, test=True) 146 | perplexity = test(options, ckpt_file, data_test, batch_size=1) 147 | self.assertTrue(perplexity < 20.0) 148 | 149 | 150 | if __name__ == '__main__': 151 | unittest.main() 152 | 153 | -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/Model.py: -------------------------------------------------------------------------------- 1 | from driver.Layer import * 2 | from data.Vocab import * 3 | 4 | 5 | def drop_input_independent(word_embeddings, tag_embeddings,tbank_embeddings, dropout_emb): 6 | batch_size, seq_length, _ = word_embeddings.size() 7 | word_masks = word_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb) 8 | word_masks = Variable(torch.bernoulli(word_masks), requires_grad=False) 9 | tag_masks = 
tag_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb) 10 | tag_masks = Variable(torch.bernoulli(tag_masks), requires_grad=False) 11 | tbank_masks = tbank_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb) 12 | tbank_masks = Variable(torch.bernoulli(tbank_masks), requires_grad=False) 13 | scale = 3.0 / (2.0 * word_masks + tag_masks + tbank_masks + 1e-12) 14 | word_masks *= scale 15 | tag_masks *= scale 16 | tbank_masks *= scale 17 | word_masks = word_masks.unsqueeze(dim=2) 18 | tag_masks = tag_masks.unsqueeze(dim=2) 19 | tbank_masks = tbank_masks.unsqueeze(dim=2) 20 | word_embeddings = word_embeddings * word_masks 21 | tag_embeddings = tag_embeddings * tag_masks 22 | tbank_embeddings = tbank_embeddings * tbank_masks 23 | 24 | return word_embeddings, tag_embeddings, tbank_embeddings 25 | 26 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True): 27 | if batch_first: 28 | inputs = inputs.transpose(0, 1) 29 | seq_length, batch_size, hidden_size = inputs.size() 30 | drop_masks = inputs.data.new(batch_size, hidden_size).fill_(1 - dropout) 31 | drop_masks = Variable(torch.bernoulli(drop_masks), requires_grad=False) 32 | drop_masks = drop_masks / (1 - dropout) 33 | drop_masks = torch.unsqueeze(drop_masks, dim=2).expand(-1, -1, seq_length).permute(2, 0, 1) 34 | inputs = inputs * drop_masks 35 | 36 | return inputs.transpose(1, 0) 37 | 38 | 39 | class ParserModel(nn.Module): 40 | def __init__(self, vocab, config, pretrained_embedding): 41 | super(ParserModel, self).__init__() 42 | self.config = config 43 | self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0) 44 | self.extword_embed = nn.Embedding(vocab.extvocab_size, config.word_dims, padding_idx=0) 45 | self.tag_embed = nn.Embedding(vocab.tag_size, config.tag_dims, padding_idx=0) 46 | self.tbank_embed = nn.Embedding(vocab.tbank_size, config.tbank_emb_size, padding_idx=0) 47 | 48 | word_init = np.zeros((vocab.vocab_size, config.word_dims), dtype=np.float32) 49 | self.word_embed.weight.data.copy_(torch.from_numpy(word_init)) 50 | 51 | tag_init = np.random.randn(vocab.tag_size, config.tag_dims).astype(np.float32) 52 | self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init)) 53 | 54 | self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding)) 55 | self.extword_embed.weight.requires_grad = False 56 | 57 | tbank_init = np.random.randn(vocab.tbank_size, config.tbank_emb_size).astype(np.float32) 58 | self.tbank_embed.weight.data.copy_(torch.from_numpy(tbank_init)) 59 | 60 | 61 | self.lstm = MyLSTM( 62 | input_size=config.word_dims + config.tag_dims + config.tbank_emb_size, 63 | hidden_size=config.lstm_hiddens, 64 | num_layers=config.lstm_layers, 65 | batch_first=True, 66 | bidirectional=True, 67 | dropout_in = config.dropout_lstm_input, 68 | dropout_out=config.dropout_lstm_hidden, 69 | ) 70 | 71 | self.mlp_arc_dep = NonLinear( 72 | input_size = 2*config.lstm_hiddens, 73 | hidden_size = config.mlp_arc_size+config.mlp_rel_size, 74 | activation = nn.LeakyReLU(0.1)) 75 | self.mlp_arc_head = NonLinear( 76 | input_size = 2*config.lstm_hiddens, 77 | hidden_size = config.mlp_arc_size+config.mlp_rel_size, 78 | activation = nn.LeakyReLU(0.1)) 79 | 80 | self.total_num = int((config.mlp_arc_size+config.mlp_rel_size) / 100) 81 | self.arc_num = int(config.mlp_arc_size / 100) 82 | self.rel_num = int(config.mlp_rel_size / 100) 83 | 84 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, \ 85 | 1, bias=(True, False)) 86 | self.rel_biaffine = 
Biaffine(config.mlp_rel_size, config.mlp_rel_size, \ 87 | vocab.rel_size, bias=(True, True)) 88 | 89 | def forward(self, words, extwords, tags, masks,tbank_ids): 90 | # x = (batch size, sequence length, dimension of embedding) 91 | x_word_embed = self.word_embed(words) 92 | x_extword_embed = self.extword_embed(extwords) 93 | x_embed = x_word_embed + x_extword_embed 94 | x_tag_embed = self.tag_embed(tags) 95 | x_tbank_embed = self.tbank_embed(tbank_ids) 96 | 97 | if self.training: 98 | x_embed, x_tag_embed,x_tbank_embed = drop_input_independent(x_embed, x_tag_embed,x_tbank_embed, self.config.dropout_emb) 99 | 100 | x_lexical = torch.cat((x_embed, x_tag_embed,x_tbank_embed), dim=2) 101 | 102 | outputs, _ = self.lstm(x_lexical, masks, None) 103 | outputs = outputs.transpose(1, 0) 104 | 105 | if self.training: 106 | outputs = drop_sequence_sharedmask(outputs, self.config.dropout_mlp) 107 | 108 | x_all_dep = self.mlp_arc_dep(outputs) 109 | x_all_head = self.mlp_arc_head(outputs) 110 | 111 | if self.training: 112 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp) 113 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp) 114 | 115 | x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2) 116 | x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2) 117 | 118 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 119 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 120 | 121 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 122 | arc_logit = torch.squeeze(arc_logit, dim=3) 123 | 124 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2) 125 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2) 126 | 127 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head) 128 | return arc_logit, rel_logit_cond -------------------------------------------------------------------------------- /biaffine-parser-domain-embedding/driver/Config.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | import sys, os 3 | 4 | sys.path.append('..') 5 | 6 | 7 | # import models 8 | 9 | class Configurable(object): 10 | def __init__(self, config_file, extra_args): 11 | config = ConfigParser() 12 | config.read(config_file) 13 | if extra_args: 14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])]) 15 | for section in config.sections(): 16 | for k, v in config.items(section): 17 | if k in extra_args: 18 | v = type(v)(extra_args[k]) 19 | config.set(section, k, v) 20 | self._config = config 21 | if not os.path.isdir(self.save_dir): 22 | os.mkdir(self.save_dir) 23 | config.write(open(self.config_file, 'w')) 24 | print('Loaded config file sucessfully.') 25 | for section in config.sections(): 26 | for k, v in config.items(section): 27 | print(k, v) 28 | 29 | @property 30 | def pretrained_embeddings_file(self): 31 | return self._config.get('Data', 'pretrained_embeddings_file') 32 | 33 | @property 34 | def data_dir(self): 35 | return self._config.get('Data', 'data_dir') 36 | 37 | @property 38 | def train_file(self): 39 | return self._config.get('Data', 'train_file') 40 | 41 | @property 42 | def train_file2(self): 43 | return self._config.get('Data', 'train_file2') 44 | 45 | @property 46 | def train_file3(self): 47 | return self._config.get('Data', 'train_file3') 48 | 49 | @property 50 | def dev_file(self): 51 | return self._config.get('Data', 'dev_file') 52 | 53 | @property 54 | def test_file(self): 55 | 
return self._config.get('Data', 'test_file') 56 | 57 | @property 58 | def min_occur_count(self): 59 | return self._config.getint('Data', 'min_occur_count') 60 | 61 | @property 62 | def save_dir(self): 63 | return self._config.get('Save', 'save_dir') 64 | 65 | @property 66 | def config_file(self): 67 | return self._config.get('Save', 'config_file') 68 | 69 | @property 70 | def save_model_path(self): 71 | return self._config.get('Save', 'save_model_path') 72 | 73 | @property 74 | def save_vocab_path(self): 75 | return self._config.get('Save', 'save_vocab_path') 76 | 77 | @property 78 | def load_dir(self): 79 | return self._config.get('Save', 'load_dir') 80 | 81 | @property 82 | def load_model_path(self): 83 | return self._config.get('Save', 'load_model_path') 84 | 85 | @property 86 | def load_vocab_path(self): 87 | return self._config.get('Save', 'load_vocab_path') 88 | 89 | @property 90 | def lstm_layers(self): 91 | return self._config.getint('Network', 'lstm_layers') 92 | 93 | @property 94 | def word_dims(self): 95 | return self._config.getint('Network', 'word_dims') 96 | 97 | @property 98 | def tag_dims(self): 99 | return self._config.getint('Network', 'tag_dims') 100 | 101 | @property 102 | def dropout_emb(self): 103 | return self._config.getfloat('Network', 'dropout_emb') 104 | 105 | @property 106 | def lstm_hiddens(self): 107 | return self._config.getint('Network', 'lstm_hiddens') 108 | 109 | @property 110 | def dropout_lstm_input(self): 111 | return self._config.getfloat('Network', 'dropout_lstm_input') 112 | 113 | @property 114 | def dropout_lstm_hidden(self): 115 | return self._config.getfloat('Network', 'dropout_lstm_hidden') 116 | 117 | @property 118 | def mlp_arc_size(self): 119 | return self._config.getint('Network', 'mlp_arc_size') 120 | 121 | @property 122 | def mlp_rel_size(self): 123 | return self._config.getint('Network', 'mlp_rel_size') 124 | 125 | @property 126 | def dropout_mlp(self): 127 | return self._config.getfloat('Network', 'dropout_mlp') 128 | 129 | @property 130 | def learning_rate(self): 131 | return self._config.getfloat('Optimizer', 'learning_rate') 132 | 133 | @property 134 | def decay(self): 135 | return self._config.getfloat('Optimizer', 'decay') 136 | 137 | @property 138 | def decay_steps(self): 139 | return self._config.getint('Optimizer', 'decay_steps') 140 | 141 | @property 142 | def beta_1(self): 143 | return self._config.getfloat('Optimizer', 'beta_1') 144 | 145 | @property 146 | def beta_2(self): 147 | return self._config.getfloat('Optimizer', 'beta_2') 148 | 149 | @property 150 | def epsilon(self): 151 | return self._config.getfloat('Optimizer', 'epsilon') 152 | 153 | @property 154 | def clip(self): 155 | return self._config.getfloat('Optimizer', 'clip') 156 | 157 | @property 158 | def num_buckets_train(self): 159 | return self._config.getint('Run', 'num_buckets_train') 160 | 161 | @property 162 | def num_buckets_valid(self): 163 | return self._config.getint('Run', 'num_buckets_valid') 164 | 165 | @property 166 | def num_buckets_test(self): 167 | return self._config.getint('Run', 'num_buckets_test') 168 | 169 | @property 170 | def train_iters(self): 171 | return self._config.getint('Run', 'train_iters') 172 | 173 | @property 174 | def train_batch_size(self): 175 | return self._config.getint('Run', 'train_batch_size') 176 | 177 | @property 178 | def test_batch_size(self): 179 | return self._config.getint('Run', 'test_batch_size') 180 | 181 | @property 182 | def validate_every(self): 183 | return self._config.getint('Run', 'validate_every') 184 | 185 
| @property 186 | def save_after(self): 187 | return self._config.getint('Run', 'save_after') 188 | 189 | @property 190 | def update_every(self): 191 | return self._config.getint('Run', 'update_every') 192 | 193 | @property 194 | def domain_num_corpus_weighting(self): 195 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting') 196 | 197 | @property 198 | def cdt_domain_ration(self): 199 | return self._config.getint('corous-weighting', 'cdt_domain_ration') 200 | 201 | @property 202 | def domain_data_size(self): 203 | return self._config.getint('corous-weighting', 'domain_data_size') 204 | 205 | @property 206 | def tbank_emb_size(self): 207 | return self._config.getint('Treebankid', 'tbank_emb_size') 208 | 209 | 210 | -------------------------------------------------------------------------------- /biaffine-parser-concat/driver/Config.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | import sys, os 3 | 4 | sys.path.append('..') 5 | 6 | 7 | # import models 8 | 9 | class Configurable(object): 10 | def __init__(self, config_file, extra_args): 11 | config = ConfigParser() 12 | config.read(config_file) 13 | if extra_args: 14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])]) 15 | for section in config.sections(): 16 | for k, v in config.items(section): 17 | if k in extra_args: 18 | v = type(v)(extra_args[k]) 19 | config.set(section, k, v) 20 | self._config = config 21 | if not os.path.isdir(self.save_dir): 22 | os.mkdir(self.save_dir) 23 | config.write(open(self.config_file, 'w')) 24 | print('Loaded config file sucessfully.') 25 | for section in config.sections(): 26 | for k, v in config.items(section): 27 | print(k, v) 28 | 29 | @property 30 | def pretrained_embeddings_file(self): 31 | return self._config.get('Data', 'pretrained_embeddings_file') 32 | 33 | @property 34 | def data_dir(self): 35 | return self._config.get('Data', 'data_dir') 36 | 37 | @property 38 | def train_file(self): 39 | return self._config.get('Data', 'train_file') 40 | 41 | @property 42 | def dev_file(self): 43 | return self._config.get('Data', 'dev_file') 44 | 45 | @property 46 | def test_file(self): 47 | return self._config.get('Data', 'test_file') 48 | 49 | @property 50 | def min_occur_count(self): 51 | return self._config.getint('Data', 'min_occur_count') 52 | 53 | @property 54 | def save_dir(self): 55 | return self._config.get('Save', 'save_dir') 56 | 57 | @property 58 | def config_file(self): 59 | return self._config.get('Save', 'config_file') 60 | 61 | @property 62 | def save_model_path(self): 63 | return self._config.get('Save', 'save_model_path') 64 | 65 | @property 66 | def save_vocab_path(self): 67 | return self._config.get('Save', 'save_vocab_path') 68 | 69 | @property 70 | def load_dir(self): 71 | return self._config.get('Save', 'load_dir') 72 | 73 | @property 74 | def load_model_path(self): 75 | return self._config.get('Save', 'load_model_path') 76 | 77 | @property 78 | def load_vocab_path(self): 79 | return self._config.get('Save', 'load_vocab_path') 80 | 81 | @property 82 | def lstm_layers(self): 83 | return self._config.getint('Network', 'lstm_layers') 84 | 85 | @property 86 | def word_dims(self): 87 | return self._config.getint('Network', 'word_dims') 88 | 89 | @property 90 | def tag_dims(self): 91 | return self._config.getint('Network', 'tag_dims') 92 | 93 | @property 94 | def dropout_emb(self): 95 | return self._config.getfloat('Network', 'dropout_emb') 96 | 97 | 
@property 98 | def lstm_hiddens(self): 99 | return self._config.getint('Network', 'lstm_hiddens') 100 | 101 | @property 102 | def dropout_lstm_input(self): 103 | return self._config.getfloat('Network', 'dropout_lstm_input') 104 | 105 | @property 106 | def dropout_lstm_hidden(self): 107 | return self._config.getfloat('Network', 'dropout_lstm_hidden') 108 | 109 | @property 110 | def mlp_arc_size(self): 111 | return self._config.getint('Network', 'mlp_arc_size') 112 | 113 | @property 114 | def mlp_rel_size(self): 115 | return self._config.getint('Network', 'mlp_rel_size') 116 | 117 | @property 118 | def dropout_mlp(self): 119 | return self._config.getfloat('Network', 'dropout_mlp') 120 | 121 | @property 122 | def learning_rate(self): 123 | return self._config.getfloat('Optimizer', 'learning_rate') 124 | 125 | @property 126 | def decay(self): 127 | return self._config.getfloat('Optimizer', 'decay') 128 | 129 | @property 130 | def decay_steps(self): 131 | return self._config.getint('Optimizer', 'decay_steps') 132 | 133 | @property 134 | def beta_1(self): 135 | return self._config.getfloat('Optimizer', 'beta_1') 136 | 137 | @property 138 | def beta_2(self): 139 | return self._config.getfloat('Optimizer', 'beta_2') 140 | 141 | @property 142 | def epsilon(self): 143 | return self._config.getfloat('Optimizer', 'epsilon') 144 | 145 | @property 146 | def clip(self): 147 | return self._config.getfloat('Optimizer', 'clip') 148 | 149 | @property 150 | def num_buckets_train(self): 151 | return self._config.getint('Run', 'num_buckets_train') 152 | 153 | @property 154 | def num_buckets_valid(self): 155 | return self._config.getint('Run', 'num_buckets_valid') 156 | 157 | @property 158 | def num_buckets_test(self): 159 | return self._config.getint('Run', 'num_buckets_test') 160 | 161 | @property 162 | def num_corpus_weighting1(self): 163 | return self._config.getint('Run', 'num_corpus_weighting1') 164 | 165 | @property 166 | def num_corpus_weighting2(self): 167 | return self._config.getint('Run', 'num_corpus_weighting2') 168 | 169 | @property 170 | def num_corpus_weighting3(self): 171 | return self._config.getint('Run', 'num_corpus_weighting3') 172 | 173 | @property 174 | def train_iters(self): 175 | return self._config.getint('Run', 'train_iters') 176 | 177 | @property 178 | def train_batch_size(self): 179 | return self._config.getint('Run', 'train_batch_size') 180 | 181 | @property 182 | def test_batch_size(self): 183 | return self._config.getint('Run', 'test_batch_size') 184 | 185 | @property 186 | def validate_every(self): 187 | return self._config.getint('Run', 'validate_every') 188 | 189 | @property 190 | def save_after(self): 191 | return self._config.getint('Run', 'save_after') 192 | 193 | @property 194 | def update_every(self): 195 | return self._config.getint('Run', 'update_every') 196 | 197 | @property 198 | def domain_num_corpus_weighting(self): 199 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting') 200 | 201 | @property 202 | def cdt_domain_ration(self): 203 | return self._config.getint('corous-weighting', 'cdt_domain_ration') 204 | 205 | @property 206 | def domain_file(self): 207 | return self._config.get('corous-weighting', 'domain_file') 208 | -------------------------------------------------------------------------------- /multi_task_learning/driver/Config.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | import sys, os 3 | 4 | sys.path.append('..') 5 | 6 | 7 | # import models 8 | 9 
| class Configurable(object): 10 | def __init__(self, config_file, extra_args): 11 | config = ConfigParser() 12 | config.read(config_file) 13 | if extra_args: 14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])]) 15 | for section in config.sections(): 16 | for k, v in config.items(section): 17 | if k in extra_args: 18 | v = type(v)(extra_args[k]) 19 | config.set(section, k, v) 20 | self._config = config 21 | if not os.path.isdir(self.save_dir): 22 | os.mkdir(self.save_dir) 23 | config.write(open(self.config_file, 'w')) 24 | print('Loaded config file sucessfully.') 25 | for section in config.sections(): 26 | for k, v in config.items(section): 27 | print(k, v) 28 | 29 | @property 30 | def pretrained_embeddings_file(self): 31 | return self._config.get('Data', 'pretrained_embeddings_file') 32 | 33 | @property 34 | def data_dir(self): 35 | return self._config.get('Data', 'data_dir') 36 | 37 | @property 38 | def train_file(self): 39 | return self._config.get('Data', 'train_file') 40 | 41 | @property 42 | def cdt_dev_file(self): 43 | return self._config.get('Data', 'cdt_dev_file') 44 | 45 | @property 46 | def cw_dev_file(self): 47 | return self._config.get('Data', 'cw_dev_file') 48 | 49 | @property 50 | def cdt_test_file(self): 51 | return self._config.get('Data', 'cdt_test_file') 52 | 53 | @property 54 | def cw_test_file(self): 55 | return self._config.get('Data', 'cw_test_file') 56 | 57 | @property 58 | def min_occur_count(self): 59 | return self._config.getint('Data', 'min_occur_count') 60 | 61 | @property 62 | def save_dir(self): 63 | return self._config.get('Save', 'save_dir') 64 | 65 | @property 66 | def config_file(self): 67 | return self._config.get('Save', 'config_file') 68 | 69 | @property 70 | def save_model_path(self): 71 | return self._config.get('Save', 'save_model_path') 72 | 73 | @property 74 | def save_vocab_path(self): 75 | return self._config.get('Save', 'save_vocab_path') 76 | 77 | @property 78 | def load_dir(self): 79 | return self._config.get('Save', 'load_dir') 80 | 81 | @property 82 | def load_model_path(self): 83 | return self._config.get('Save', 'load_model_path') 84 | 85 | @property 86 | def load_vocab_path(self): 87 | return self._config.get('Save', 'load_vocab_path') 88 | 89 | @property 90 | def lstm_layers(self): 91 | return self._config.getint('Network', 'lstm_layers') 92 | 93 | @property 94 | def word_dims(self): 95 | return self._config.getint('Network', 'word_dims') 96 | 97 | @property 98 | def tag_dims(self): 99 | return self._config.getint('Network', 'tag_dims') 100 | 101 | @property 102 | def dropout_emb(self): 103 | return self._config.getfloat('Network', 'dropout_emb') 104 | 105 | @property 106 | def lstm_hiddens(self): 107 | return self._config.getint('Network', 'lstm_hiddens') 108 | 109 | @property 110 | def dropout_lstm_input(self): 111 | return self._config.getfloat('Network', 'dropout_lstm_input') 112 | 113 | @property 114 | def dropout_lstm_hidden(self): 115 | return self._config.getfloat('Network', 'dropout_lstm_hidden') 116 | 117 | @property 118 | def mlp_arc_size(self): 119 | return self._config.getint('Network', 'mlp_arc_size') 120 | 121 | @property 122 | def mlp_rel_size(self): 123 | return self._config.getint('Network', 'mlp_rel_size') 124 | 125 | @property 126 | def dropout_mlp(self): 127 | return self._config.getfloat('Network', 'dropout_mlp') 128 | 129 | @property 130 | def learning_rate(self): 131 | return self._config.getfloat('Optimizer', 'learning_rate') 132 | 133 | @property 134 | def decay(self): 135 | 
return self._config.getfloat('Optimizer', 'decay') 136 | 137 | @property 138 | def decay_steps(self): 139 | return self._config.getint('Optimizer', 'decay_steps') 140 | 141 | @property 142 | def beta_1(self): 143 | return self._config.getfloat('Optimizer', 'beta_1') 144 | 145 | @property 146 | def beta_2(self): 147 | return self._config.getfloat('Optimizer', 'beta_2') 148 | 149 | @property 150 | def epsilon(self): 151 | return self._config.getfloat('Optimizer', 'epsilon') 152 | 153 | @property 154 | def clip(self): 155 | return self._config.getfloat('Optimizer', 'clip') 156 | 157 | @property 158 | def num_buckets_train(self): 159 | return self._config.getint('Run', 'num_buckets_train') 160 | 161 | @property 162 | def num_buckets_valid(self): 163 | return self._config.getint('Run', 'num_buckets_valid') 164 | 165 | @property 166 | def num_buckets_test(self): 167 | return self._config.getint('Run', 'num_buckets_test') 168 | 169 | @property 170 | def num_corpus_weighting1(self): 171 | return self._config.getint('Run', 'num_corpus_weighting1') 172 | 173 | @property 174 | def num_corpus_weighting2(self): 175 | return self._config.getint('Run', 'num_corpus_weighting2') 176 | 177 | @property 178 | def num_corpus_weighting3(self): 179 | return self._config.getint('Run', 'num_corpus_weighting3') 180 | 181 | @property 182 | def train_iters(self): 183 | return self._config.getint('Run', 'train_iters') 184 | 185 | @property 186 | def train_batch_size(self): 187 | return self._config.getint('Run', 'train_batch_size') 188 | 189 | @property 190 | def test_batch_size(self): 191 | return self._config.getint('Run', 'test_batch_size') 192 | 193 | @property 194 | def validate_every(self): 195 | return self._config.getint('Run', 'validate_every') 196 | 197 | @property 198 | def save_after(self): 199 | return self._config.getint('Run', 'save_after') 200 | 201 | @property 202 | def update_every(self): 203 | return self._config.getint('Run', 'update_every') 204 | 205 | @property 206 | def domain_num_corpus_weighting(self): 207 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting') 208 | 209 | @property 210 | def cdt_domain_ration(self): 211 | return self._config.getint('corous-weighting', 'cdt_domain_ration') 212 | 213 | @property 214 | def domain_file(self): 215 | return self._config.get('corous-weighting', 'domain_file') 216 | -------------------------------------------------------------------------------- /biaffine-parser-elmo/driver/Config.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | import sys, os 3 | 4 | sys.path.append('..') 5 | 6 | 7 | # import models 8 | 9 | class Configurable(object): 10 | def __init__(self, config_file, extra_args): 11 | config = ConfigParser() 12 | config.read(config_file) 13 | if extra_args: 14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])]) 15 | for section in config.sections(): 16 | for k, v in config.items(section): 17 | if k in extra_args: 18 | v = type(v)(extra_args[k]) 19 | config.set(section, k, v) 20 | self._config = config 21 | if not os.path.isdir(self.save_dir): 22 | os.mkdir(self.save_dir) 23 | config.write(open(self.config_file, 'w')) 24 | print('Loaded config file sucessfully.') 25 | for section in config.sections(): 26 | for k, v in config.items(section): 27 | print(k, v) 28 | 29 | @property 30 | def pretrained_embeddings_file(self): 31 | return self._config.get('Data', 'pretrained_embeddings_file') 32 | 33 | @property 34 | 
def data_dir(self): 35 | return self._config.get('Data', 'data_dir') 36 | 37 | @property 38 | def train_file(self): 39 | return self._config.get('Data', 'train_file') 40 | 41 | @property 42 | def train_file2(self): 43 | return self._config.get('Data', 'train_file2') 44 | 45 | @property 46 | def train_file3(self): 47 | return self._config.get('Data', 'train_file3') 48 | 49 | @property 50 | def dev_file(self): 51 | return self._config.get('Data', 'dev_file') 52 | 53 | @property 54 | def test_file(self): 55 | return self._config.get('Data', 'test_file') 56 | 57 | @property 58 | def min_occur_count(self): 59 | return self._config.getint('Data', 'min_occur_count') 60 | 61 | @property 62 | def save_dir(self): 63 | return self._config.get('Save', 'save_dir') 64 | 65 | @property 66 | def config_file(self): 67 | return self._config.get('Save', 'config_file') 68 | 69 | @property 70 | def save_model_path(self): 71 | return self._config.get('Save', 'save_model_path') 72 | 73 | @property 74 | def save_vocab_path(self): 75 | return self._config.get('Save', 'save_vocab_path') 76 | 77 | @property 78 | def load_dir(self): 79 | return self._config.get('Save', 'load_dir') 80 | 81 | @property 82 | def load_model_path(self): 83 | return self._config.get('Save', 'load_model_path') 84 | 85 | @property 86 | def load_vocab_path(self): 87 | return self._config.get('Save', 'load_vocab_path') 88 | 89 | @property 90 | def lstm_layers(self): 91 | return self._config.getint('Network', 'lstm_layers') 92 | 93 | @property 94 | def word_dims(self): 95 | return self._config.getint('Network', 'word_dims') 96 | 97 | @property 98 | def tag_dims(self): 99 | return self._config.getint('Network', 'tag_dims') 100 | 101 | @property 102 | def dropout_emb(self): 103 | return self._config.getfloat('Network', 'dropout_emb') 104 | 105 | @property 106 | def lstm_hiddens(self): 107 | return self._config.getint('Network', 'lstm_hiddens') 108 | 109 | @property 110 | def dropout_lstm_input(self): 111 | return self._config.getfloat('Network', 'dropout_lstm_input') 112 | 113 | @property 114 | def dropout_lstm_hidden(self): 115 | return self._config.getfloat('Network', 'dropout_lstm_hidden') 116 | 117 | @property 118 | def mlp_arc_size(self): 119 | return self._config.getint('Network', 'mlp_arc_size') 120 | 121 | @property 122 | def mlp_rel_size(self): 123 | return self._config.getint('Network', 'mlp_rel_size') 124 | 125 | @property 126 | def dropout_mlp(self): 127 | return self._config.getfloat('Network', 'dropout_mlp') 128 | 129 | @property 130 | def learning_rate(self): 131 | return self._config.getfloat('Optimizer', 'learning_rate') 132 | 133 | @property 134 | def decay(self): 135 | return self._config.getfloat('Optimizer', 'decay') 136 | 137 | @property 138 | def decay_steps(self): 139 | return self._config.getint('Optimizer', 'decay_steps') 140 | 141 | @property 142 | def beta_1(self): 143 | return self._config.getfloat('Optimizer', 'beta_1') 144 | 145 | @property 146 | def beta_2(self): 147 | return self._config.getfloat('Optimizer', 'beta_2') 148 | 149 | @property 150 | def epsilon(self): 151 | return self._config.getfloat('Optimizer', 'epsilon') 152 | 153 | @property 154 | def clip(self): 155 | return self._config.getfloat('Optimizer', 'clip') 156 | 157 | @property 158 | def num_buckets_train(self): 159 | return self._config.getint('Run', 'num_buckets_train') 160 | 161 | @property 162 | def num_buckets_valid(self): 163 | return self._config.getint('Run', 'num_buckets_valid') 164 | 165 | @property 166 | def num_buckets_test(self): 167 | 
return self._config.getint('Run', 'num_buckets_test') 168 | 169 | @property 170 | def num_corpus_weighting1(self): 171 | return self._config.getint('Run', 'num_corpus_weighting1') 172 | 173 | @property 174 | def num_corpus_weighting2(self): 175 | return self._config.getint('Run', 'num_corpus_weighting2') 176 | 177 | @property 178 | def num_corpus_weighting3(self): 179 | return self._config.getint('Run', 'num_corpus_weighting3') 180 | 181 | @property 182 | def train_iters(self): 183 | return self._config.getint('Run', 'train_iters') 184 | 185 | @property 186 | def train_batch_size(self): 187 | return self._config.getint('Run', 'train_batch_size') 188 | 189 | @property 190 | def test_batch_size(self): 191 | return self._config.getint('Run', 'test_batch_size') 192 | 193 | @property 194 | def validate_every(self): 195 | return self._config.getint('Run', 'validate_every') 196 | 197 | @property 198 | def save_after(self): 199 | return self._config.getint('Run', 'save_after') 200 | 201 | @property 202 | def update_every(self): 203 | return self._config.getint('Run', 'update_every') 204 | 205 | @property 206 | def train_elmo(self): 207 | return self._config.get('Elmo', 'train_elmo') 208 | 209 | @property 210 | def dev_elmo(self): 211 | return self._config.get('Elmo', 'dev_elmo') 212 | 213 | @property 214 | def test_elmo(self): 215 | return self._config.get('Elmo', 'test_elmo') 216 | 217 | @property 218 | def elmo_dims(self): 219 | return self._config.getint('Elmo', 'elmo_dims') 220 | 221 | @property 222 | def tbank_emb_size(self): 223 | return self._config.getint('Treebankid', 'tbank_emb_size') 224 | 225 | @property 226 | def domain_num_corpus_weighting(self): 227 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting') 228 | 229 | @property 230 | def cw(self): 231 | return self._config.getint('corous-weighting', 'cw') 232 | 233 | @property 234 | def cdt_domain_ration(self): 235 | return self._config.getint('corous-weighting', 'cdt_domain_ration') 236 | 237 | @property 238 | def domain_data_size(self): 239 | return self._config.getint('corous-weighting', 'domain_data_size') 240 | 241 | 242 | 243 | -------------------------------------------------------------------------------- /multi_task_learning/data/Vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from data.Dependency import * 3 | import numpy as np 4 | 5 | class Vocab(object): 6 | PAD, ROOT, UNK = 0, 1, 2 7 | def __init__(self, word_counter, tag_counter, rel_counter, relroot='root', min_occur_count = 2): 8 | self._root = relroot 9 | self._root_form = '<' + relroot.lower() + '>' 10 | self._id2word = ['', self._root_form, ''] 11 | self._wordid2freq = [10000, 10000, 10000] 12 | self._id2extword = ['', self._root_form, ''] 13 | self._id2tag = ['', relroot] 14 | self._id2rel = ['', relroot,'pred','subj-in','frag','punc','repet','exp'] 15 | for word, count in word_counter.most_common(): 16 | if count > min_occur_count: 17 | self._id2word.append(word) 18 | self._wordid2freq.append(count) 19 | 20 | for tag, count in tag_counter.most_common(): 21 | if tag != relroot: self._id2tag.append(tag) 22 | 23 | for rel, count in rel_counter.most_common(): 24 | if rel != relroot and rel not in self._id2rel: self._id2rel.append(rel) 25 | 26 | reverse = lambda x: dict(zip(x, range(len(x)))) 27 | self._word2id = reverse(self._id2word) 28 | if len(self._word2id) != len(self._id2word): 29 | print("serious bug: words dumplicated, please check!") 30 | 31 | 
self._tag2id = reverse(self._id2tag) 32 | if len(self._tag2id) != len(self._id2tag): 33 | print("serious bug: POS tags dumplicated, please check!") 34 | 35 | self._rel2id = reverse(self._id2rel) 36 | if len(self._rel2id) != len(self._id2rel): 37 | print("serious bug: relation labels dumplicated, please check!") 38 | 39 | print("Vocab info: #words %d, #tags %d, #rels %d" % (self.vocab_size, self.tag_size, self.rel_size)) 40 | 41 | def load_pretrained_embs(self, embfile): 42 | embedding_dim = -1 43 | word_count = 0 44 | with open(embfile, encoding='utf-8') as f: 45 | for line in f.readlines(): 46 | if word_count < 1: 47 | values = line.split() 48 | embedding_dim = len(values) - 1 49 | word_count += 1 50 | print('Total words: ' + str(word_count) + '\n') 51 | print('The dim of pretrained embeddings: ' + str(embedding_dim) + '\n') 52 | 53 | index = len(self._id2extword) 54 | embeddings = np.zeros((word_count + index, embedding_dim)) 55 | with open(embfile, encoding='utf-8') as f: 56 | for line in f.readlines(): 57 | values = line.split() 58 | self._id2extword.append(values[0]) 59 | vector = np.array(values[1:], dtype='float64') 60 | embeddings[self.UNK] += vector 61 | embeddings[index] = vector 62 | index += 1 63 | 64 | embeddings[self.UNK] = embeddings[self.UNK] / word_count 65 | embeddings = embeddings / np.std(embeddings) 66 | 67 | reverse = lambda x: dict(zip(x, range(len(x)))) 68 | self._extword2id = reverse(self._id2extword) 69 | 70 | if len(self._extword2id) != len(self._id2extword): 71 | print("serious bug: extern words dumplicated, please check!") 72 | 73 | return embeddings 74 | 75 | def create_pretrained_embs(self, embfile): 76 | embedding_dim = -1 77 | word_count = 0 78 | with open(embfile, encoding='utf-8') as f: 79 | for line in f.readlines(): 80 | if word_count < 1: 81 | values = line.split() 82 | embedding_dim = len(values) - 1 83 | word_count += 1 84 | print('Total words: ' + str(word_count) + '\n') 85 | print('The dim of pretrained embeddings: ' + str(embedding_dim) + '\n') 86 | 87 | index = len(self._id2extword) - word_count 88 | embeddings = np.zeros((word_count + index, embedding_dim)) 89 | with open(embfile, encoding='utf-8') as f: 90 | for line in f.readlines(): 91 | values = line.split() 92 | if self._extword2id.get(values[0], self.UNK) != index: 93 | print("Broken vocab or error embedding file, please check!") 94 | vector = np.array(values[1:], dtype='float64') 95 | embeddings[self.UNK] += vector 96 | embeddings[index] = vector 97 | index += 1 98 | 99 | embeddings[self.UNK] = embeddings[self.UNK] / word_count 100 | embeddings = embeddings / np.std(embeddings) 101 | 102 | return embeddings 103 | 104 | 105 | def word2id(self, xs): 106 | if isinstance(xs, list): 107 | return [self._word2id.get(x, self.UNK) for x in xs] 108 | return self._word2id.get(xs, self.UNK) 109 | 110 | def id2word(self, xs): 111 | if isinstance(xs, list): 112 | return [self._id2word[x] for x in xs] 113 | return self._id2word[xs] 114 | 115 | def wordid2freq(self, xs): 116 | if isinstance(xs, list): 117 | return [self._wordid2freq[x] for x in xs] 118 | return self._wordid2freq[xs] 119 | 120 | def extword2id(self, xs): 121 | if isinstance(xs, list): 122 | return [self._extword2id.get(x, self.UNK) for x in xs] 123 | return self._extword2id.get(xs, self.UNK) 124 | 125 | def id2extword(self, xs): 126 | if isinstance(xs, list): 127 | return [self._id2extword[x] for x in xs] 128 | return self._id2extword[xs] 129 | 130 | def rel2id(self, xs): 131 | if isinstance(xs, list): 132 | return [self._rel2id[x] for x 
in xs] 133 | return self._rel2id[xs] 134 | 135 | def id2rel(self, xs): 136 | if isinstance(xs, list): 137 | return [self._id2rel[x] for x in xs] 138 | return self._id2rel[xs] 139 | 140 | def tag2id(self, xs): 141 | if isinstance(xs, list): 142 | return [self._tag2id.get(x) for x in xs] 143 | return self._tag2id.get(xs) 144 | 145 | def id2tag(self, xs): 146 | if isinstance(xs, list): 147 | return [self._id2tag[x] for x in xs] 148 | return self._id2tag[xs] 149 | 150 | @property 151 | def vocab_size(self): 152 | return len(self._id2word) 153 | 154 | @property 155 | def extvocab_size(self): 156 | return len(self._id2extword) 157 | 158 | @property 159 | def tag_size(self): 160 | return len(self._id2tag) 161 | 162 | @property 163 | def rel_size(self): 164 | return len(self._id2rel) 165 | 166 | def creatVocab(corpusFile, min_occur_count): 167 | word_counter = Counter() 168 | tag_counter = Counter() 169 | rel_counter = Counter() 170 | root = '' 171 | tgt_type = parse_my_conll(corpusFile) 172 | with open(corpusFile, 'r') as infile: 173 | #tgt_type = infile[0].strip().split("\t")[-1] 174 | for sentence in readDepTree(infile,tgt_type): 175 | for dep in sentence.sentence: 176 | word_counter[dep.form] += 1 177 | tag_counter[dep.tag] += 1 178 | if dep.head == -1: # filled-in (padded) entry, skip it 179 | continue 180 | if dep.head != 0: 181 | rel_counter[dep.rel] += 1 182 | elif root == '': 183 | root = dep.rel 184 | rel_counter[dep.rel] += 1 185 | elif root != dep.rel: 186 | print('root = ' + root + ', rel for root = ' + dep.rel) 187 | 188 | return Vocab(word_counter, tag_counter, rel_counter, root, min_occur_count) 189 | --------------------------------------------------------------------------------
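As a closing illustration of how Vocab.py and Dependency.py fit together, here is a minimal sketch (not part of the repository): build a Vocab from a training treebank, then re-read the file and map one sentence to id sequences. The path 'train.conll' is hypothetical, and the snippet assumes it runs from a directory (e.g. multi_task_learning/) where the data package is importable and that every relation label in the file was seen during vocab construction.

from data.Dependency import parse_my_conll, readDepTree
from data.Vocab import creatVocab

train_file = 'train.conll'                    # hypothetical CoNLL-style file with a domain tag in its last column
vocab = creatVocab(train_file, 2)             # 2 matches the Vocab class default for min_occur_count

tgt_type = parse_my_conll(train_file)         # domain/task tag taken from the file's last column
with open(train_file, 'r', encoding='utf-8') as f:
    for inst in readDepTree(f, tgt_type, vocab):
        words = [dep.form for dep in inst.sentence]
        word_ids = vocab.word2id(words)                           # in-vocab ids (UNK for rare words)
        tag_ids = vocab.tag2id([dep.tag for dep in inst.sentence])
        heads = [dep.head for dep in inst.sentence]               # -1 marks the prepended pseudo-root (and '_' heads)
        rel_ids = vocab.rel2id([dep.rel for dep in inst.sentence])
        break                                                     # show only the first Instance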