├── ELMo
├── fine-tune-elmo
│ ├── bilm
│ │ ├── init.py
│ │ ├── __pycache__
│ │ │ ├── data.cpython-36.pyc
│ │ │ ├── elmo.cpython-36.pyc
│ │ │ ├── model.cpython-36.pyc
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ └── training.cpython-36.pyc
│ │ ├── __init__.py
│ │ └── elmo.py
│ ├── bilm.egg-info
│ │ ├── not-zip-safe
│ │ ├── top_level.txt
│ │ ├── dependency_links.txt
│ │ ├── SOURCES.txt
│ │ └── PKG-INFO
│ ├── dist
│ │ └── bilm-0.1-py3.6.egg
│ ├── tests
│ │ ├── fixtures
│ │ │ ├── model
│ │ │ │ ├── lm_weights.hdf5
│ │ │ │ ├── lm_embeddings_0.hdf5
│ │ │ │ ├── lm_embeddings_1.hdf5
│ │ │ │ ├── lm_embeddings_2.hdf5
│ │ │ │ ├── options.json
│ │ │ │ ├── lm_embeddings_sentences.json
│ │ │ │ └── vocab_test.txt
│ │ │ ├── train
│ │ │ │ ├── vocab.txt
│ │ │ │ └── data.txt
│ │ │ └── data
│ │ │ │ └── vocab_test.txt
│ │ ├── __pycache__
│ │ │ ├── test_data.cpython-36.pyc
│ │ │ ├── test_elmo.cpython-36.pyc
│ │ │ ├── test_model.cpython-36.pyc
│ │ │ └── test_training.cpython-36.pyc
│ │ ├── test_elmo.py
│ │ └── test_training.py
│ ├── weight.sh
│ ├── build
│ │ └── lib
│ │ │ └── bilm
│ │ │ │ ├── __init__.py
│ │ │ │ └── elmo.py
│ ├── run_tests_before_shell.sh
│ ├── train.sh
│ ├── setup.py
│ ├── bin
│ │ ├── dump_weights.py
│ │ ├── run_test.py
│ │ ├── restart.py
│ │ └── train_elmo.py
│ ├── run.sh
│ ├── Dockerfile
│ ├── usage_cached.py
│ ├── usage_character.py
│ ├── log
│ └── usage_token.py
└── use_ELMo
│ └── get_elmo_bin.py
├── biaffine-parser-elmo
├── data
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── Vocab.cpython-36.pyc
│ │ ├── __init__.cpython-36.pyc
│ │ ├── Dataloader.cpython-36.pyc
│ │ └── Dependency.cpython-36.pyc
│ ├── Dataloader.py
│ └── Dependency.py
├── driver
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── MST.cpython-36.pyc
│ │ ├── Config.cpython-36.pyc
│ │ ├── Layer.cpython-36.pyc
│ │ ├── Model.cpython-36.pyc
│ │ ├── Parser.cpython-36.pyc
│ │ └── __init__.cpython-36.pyc
│ ├── Test.py
│ ├── Parser.py
│ └── Config.py
├── run.sh
└── config.cfg
├── multi_task_learning
├── data
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── Vocab.cpython-35.pyc
│ │ ├── Vocab.cpython-36.pyc
│ │ ├── __init__.cpython-35.pyc
│ │ ├── __init__.cpython-36.pyc
│ │ ├── Dataloader.cpython-35.pyc
│ │ ├── Dataloader.cpython-36.pyc
│ │ ├── Dependency.cpython-35.pyc
│ │ └── Dependency.cpython-36.pyc
│ ├── Dataloader.py
│ ├── Dependency.py
│ └── Vocab.py
├── driver
│ ├── __init__.py
│ ├── .Train.swp
│ ├── __pycache__
│ │ ├── Layer.cpython-35.pyc
│ │ ├── Layer.cpython-36.pyc
│ │ ├── MST.cpython-35.pyc
│ │ ├── MST.cpython-36.pyc
│ │ ├── Model.cpython-35.pyc
│ │ ├── Model.cpython-36.pyc
│ │ ├── Config.cpython-35.pyc
│ │ ├── Config.cpython-36.pyc
│ │ ├── Parser.cpython-35.pyc
│ │ ├── Parser.cpython-36.pyc
│ │ ├── __init__.cpython-35.pyc
│ │ └── __init__.cpython-36.pyc
│ ├── Test.py
│ ├── Parser.py
│ └── Config.py
├── run.sh
└── config.cfg
├── biaffine-parser-concat
├── data
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── Vocab.cpython-35.pyc
│ │ ├── Vocab.cpython-36.pyc
│ │ ├── __init__.cpython-35.pyc
│ │ ├── __init__.cpython-36.pyc
│ │ ├── Dataloader.cpython-35.pyc
│ │ ├── Dataloader.cpython-36.pyc
│ │ ├── Dependency.cpython-35.pyc
│ │ └── Dependency.cpython-36.pyc
│ ├── Dataloader.py
│ └── Dependency.py
├── driver
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── MST.cpython-35.pyc
│ │ ├── MST.cpython-36.pyc
│ │ ├── Config.cpython-35.pyc
│ │ ├── Config.cpython-36.pyc
│ │ ├── Layer.cpython-35.pyc
│ │ ├── Layer.cpython-36.pyc
│ │ ├── Model.cpython-35.pyc
│ │ ├── Model.cpython-36.pyc
│ │ ├── Parser.cpython-35.pyc
│ │ ├── Parser.cpython-36.pyc
│ │ ├── __init__.cpython-35.pyc
│ │ └── __init__.cpython-36.pyc
│ ├── Test.py
│ ├── Parser.py
│ ├── Model.py
│ └── Config.py
├── run.sh
└── config.cfg
├── biaffine-parser-domain-embedding
├── data
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── Vocab.cpython-35.pyc
│ │ ├── Vocab.cpython-36.pyc
│ │ ├── __init__.cpython-35.pyc
│ │ ├── __init__.cpython-36.pyc
│ │ ├── Dataloader.cpython-35.pyc
│ │ ├── Dataloader.cpython-36.pyc
│ │ ├── Dependency.cpython-35.pyc
│ │ └── Dependency.cpython-36.pyc
│ ├── Dataloader.py
│ └── Dependency.py
├── driver
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── MST.cpython-35.pyc
│ │ ├── MST.cpython-36.pyc
│ │ ├── Config.cpython-35.pyc
│ │ ├── Config.cpython-36.pyc
│ │ ├── Layer.cpython-35.pyc
│ │ ├── Layer.cpython-36.pyc
│ │ ├── Model.cpython-35.pyc
│ │ ├── Model.cpython-36.pyc
│ │ ├── Parser.cpython-35.pyc
│ │ ├── Parser.cpython-36.pyc
│ │ ├── __init__.cpython-35.pyc
│ │ └── __init__.cpython-36.pyc
│ ├── Test.py
│ ├── Parser.py
│ ├── Model.py
│ └── Config.py
├── run.sh
└── config.cfg
└── readme.md
/ELMo/fine-tune-elmo/bilm/init.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/multi_task_learning/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/multi_task_learning/driver/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm.egg-info/not-zip-safe:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | bilm
2 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/multi_task_learning/driver/.Train.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/.Train.swp
--------------------------------------------------------------------------------
/multi_task_learning/run.sh:
--------------------------------------------------------------------------------
1 | exe=../driver/Train.py
2 | nohup python3 -u $exe --config_file=config.cfg > log.train 2>&1 &
3 |
4 |
5 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/dist/bilm-0.1-py3.6.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/dist/bilm-0.1-py3.6.egg
--------------------------------------------------------------------------------
/biaffine-parser-elmo/run.sh:
--------------------------------------------------------------------------------
1 | exe=../driver/Train-corpus-weighting.py
2 | nohup python3 $exe --config_file=config.cfg > log.train 2>&1 &
3 |
4 |
5 |
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/run.sh:
--------------------------------------------------------------------------------
1 | exe=../driver/Train-corpus-weighting.py
2 | nohup python3 -u $exe --config_file=config.cfg > log.train 2>&1 &
3 |
4 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm/__pycache__/data.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/data.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm/__pycache__/elmo.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/elmo.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/model/lm_weights.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_weights.hdf5
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/run.sh:
--------------------------------------------------------------------------------
1 | #exe=../driver/Train.py
2 | exe=../driver/Train-corpus-weighting.py
3 | nohup python3 $exe --config_file=config.cfg > log.train 2>&1 &
4 |
5 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/data/__pycache__/Vocab.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/Vocab.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/__pycache__/MST.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/MST.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/Vocab.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Vocab.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/Vocab.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Vocab.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Layer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Layer.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Layer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Layer.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/MST.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/MST.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/MST.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/MST.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Model.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Model.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Model.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm/__pycache__/training.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/bilm/__pycache__/training.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_0.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_0.hdf5
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_1.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_1.hdf5
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_2.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_2.hdf5
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/Vocab.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Vocab.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/Vocab.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Vocab.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/MST.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/MST.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/MST.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/MST.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/data/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/__pycache__/Config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Config.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/__pycache__/Layer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Layer.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/__pycache__/Model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Model.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/__pycache__/Parser.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/Parser.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Config.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Config.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Config.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Parser.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Parser.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/Parser.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/Parser.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/__pycache__/test_data.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_data.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/__pycache__/test_elmo.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_elmo.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/__pycache__/test_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_model.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Config.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Config.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Config.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Layer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Layer.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Layer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Layer.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Model.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Model.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Model.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Parser.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Parser.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/Parser.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/Parser.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/data/__pycache__/Dataloader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/Dataloader.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/data/__pycache__/Dependency.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/data/__pycache__/Dependency.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-elmo/driver/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/Dataloader.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dataloader.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/Dataloader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dataloader.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/Dependency.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dependency.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/data/__pycache__/Dependency.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/data/__pycache__/Dependency.cpython-36.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/multi_task_learning/driver/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/multi_task_learning/driver/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/__pycache__/test_training.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/ELMo/fine-tune-elmo/tests/__pycache__/test_training.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/Dataloader.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dataloader.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/Dataloader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dataloader.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/Dependency.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dependency.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/__pycache__/Dependency.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/data/__pycache__/Dependency.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-concat/driver/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Vocab.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/MST.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Config.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Layer.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Model.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/Parser.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dataloader.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/data/__pycache__/Dependency.cpython-36.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SUDA-LA/dep-cross-domain/HEAD/biaffine-parser-domain-embedding/driver/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/weight.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=6
2 | nohup python ./bin/dump_weights.py \
3 |     --save_dir ./domain_fine_tune_iter8/checkpoint \
4 |     --outfile='./checkpoint/weights.hdf5' 2>&1 &
5 |
6 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .data import Batcher, TokenBatcher
3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \
4 | dump_bilm_embeddings
5 | from .elmo import weight_layers
6 |
7 |
--------------------------------------------------------------------------------
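Note: this __init__.py exposes the same public API as upstream allenai/bilm-tf (the project setup.py points at). The fork's usage_token.py is listed in the tree but not dumped here, so the following is only a minimal token-level sketch in the style of the upstream usage examples; every file path below is a placeholder, and details may differ in this fork.

import tensorflow as tf
from bilm import TokenBatcher, BidirectionalLanguageModel, weight_layers, dump_token_embeddings

# Placeholder paths: point these at a checkpoint exported by weight.sh.
vocab_file = 'vocab.txt'
options_file = 'options.json'
weight_file = 'weights.hdf5'
token_embedding_file = 'token_embeddings.hdf5'

# One-off: pre-compute a biLM embedding for every vocabulary token.
dump_token_embeddings(vocab_file, options_file, weight_file, token_embedding_file)
tf.reset_default_graph()

# Graph: token ids -> biLM layer activations -> learned weighted sum (ELMo).
batcher = TokenBatcher(vocab_file)
token_ids = tf.placeholder('int32', shape=(None, None))
bilm = BidirectionalLanguageModel(
    options_file, weight_file,
    use_character_inputs=False,
    embedding_weight_file=token_embedding_file)
context_ops = bilm(token_ids)
elmo_input = weight_layers('input', context_ops, l2_coef=0.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ids = batcher.batch_sentences([['neural', 'language', 'models', '.']])
    vecs = sess.run(elmo_input['weighted_op'], feed_dict={token_ids: ids})
    print(vecs.shape)  # one vector per token, 2 * projection_dim wide
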
/ELMo/fine-tune-elmo/build/lib/bilm/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .data import Batcher, TokenBatcher
3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \
4 | dump_bilm_embeddings
5 | from .elmo import weight_layers
6 |
7 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/run_tests_before_shell.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Verify environment by running tests
4 | # before dropping into a shell.
5 | set -e
6 | PYTHONDONTWRITEBYTECODE=1 python -m unittest discover tests/
7 | echo "Tests passed!"
8 |
9 | /bin/bash
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/train.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6
2 | python ./bin/train_elmo.py \
3 |     --train_prefix='/data1/yli/giga-train-data/split/sens.train.split.*' \
4 |     --vocab_file '/data1/yli/giga-train-data/dict.train' \
5 |     --save_dir /data1/yli/elmo-chinese-model/checkpoint
6 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | README.md
2 | setup.py
3 | bilm/__init__.py
4 | bilm/data.py
5 | bilm/elmo.py
6 | bilm/model.py
7 | bilm/training.py
8 | bilm.egg-info/PKG-INFO
9 | bilm.egg-info/SOURCES.txt
10 | bilm.egg-info/dependency_links.txt
11 | bilm.egg-info/not-zip-safe
12 | bilm.egg-info/top_level.txt
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: bilm
3 | Version: 0.1
4 | Summary: UNKNOWN
5 | Home-page: http://github.com/allenai/bilm-tf
6 | Author: UNKNOWN
7 | Author-email: UNKNOWN
8 | License: UNKNOWN
9 | Description-Content-Type: UNKNOWN
10 | Description: UNKNOWN
11 | Platform: UNKNOWN
12 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import setuptools
3 |
4 | setuptools.setup(
5 | name='bilm',
6 | version='0.1',
7 | url='http://github.com/allenai/bilm-tf',
8 | packages=setuptools.find_packages(),
9 | tests_require=[],
10 | zip_safe=False,
11 | entry_points='',
12 | )
13 |
14 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/model/options.json:
--------------------------------------------------------------------------------
1 | {"lstm": {"cell_clip": 3, "use_skip_connections": false, "n_layers": 2, "proj_clip": 3, "projection_dim": 16, "dim": 64}, "char_cnn": {"embedding": {"dim": 4}, "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], "n_highway": 2, "n_characters": 262, "max_characters_per_token": 50, "activation": "relu"}}
2 |
--------------------------------------------------------------------------------
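These options follow the standard bilm-tf format. Below is a small sketch (not part of the repo) that loads this fixture and derives the sizes it implies, assuming the usual bilm-tf conventions: the char-CNN feature width is the sum of the filter counts, and each biLM layer is 2 * projection_dim wide.

import json

with open('tests/fixtures/model/options.json') as f:
    opts = json.load(f)

# 'filters' is a list of [width, n_filters] pairs for the character CNN.
char_cnn_width = sum(n for _, n in opts['char_cnn']['filters'])  # 4 + 8 + 16 + 32 + 64 = 124
proj_dim = opts['lstm']['projection_dim']                        # 16

print('char-CNN features per token:', char_cnn_width)
print('biLM layer size (fwd + bwd):', 2 * proj_dim)
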
/ELMo/fine-tune-elmo/tests/fixtures/train/vocab.txt:
--------------------------------------------------------------------------------
1 | <S>
2 | </S>
3 | <UNK>
4 | of
5 | the
6 | neural
7 | .
8 | words
9 | ,
10 | in
11 | language
12 | models
13 | to
14 | a
15 | vocabulary
16 | embeddings
17 | use
18 | -
19 | are
20 | or
21 | make
22 | and
23 | increases
24 | as
25 | exponentially
26 | problem
27 | sequences
28 | continuous
29 | networks
30 | net
31 | number
32 | larger
33 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bin/dump_weights.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import argparse
4 | import sys
5 | sys.path.extend(["../../","../","./"])
6 |
7 | from training import dump_weights as dw
8 |
9 |
10 | if __name__ == '__main__':
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('--save_dir', help='Location of checkpoint files')
13 |     parser.add_argument('--outfile', help='Output hdf5 file with weights')
14 |
15 |     args = parser.parse_args()
16 |     dw(args.save_dir, args.outfile)
17 |
18 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/run.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=5,6,7
2 | nohup python -u /data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/bin/restart.py \
3 |     --train_prefix='/data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/experiment/fine-tune-giga1100-elmo/4domain/data/4domain-trainFile/train_elmo_data.file.*' \
4 |     --vocab_file '/data/xpeng/elmo/Train_ELMo/bilm-tf/elmo-chinese-model/dict.train' \
5 |     --save_dir '/data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/experiment/fine-tune-giga1100-elmo/4domain/4domain_fine_tune_iter8/checkpoint' \
6 |     > /data/xpeng/elmo/Train_ELMo/bilm-tf/train-bilm-tf-new/experiment/fine-tune-giga1100-elmo/4domain/4domain_fine_tune_iter8/log.4domain-fine-tune-giga1100-model-iter8 2>&1 &
7 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/train/data.txt:
--------------------------------------------------------------------------------
1 | neural language models use continuous representations or embeddings of words to make their predictions .
2 | these models make use of neural networks .
3 | continuous space embeddings help to alleviate the curse of dimensionality in language modeling : as language models are trained on larger and larger texts , the number of unique words ( the vocabulary ) increases and the number of possible sequences of words increases exponentially with the size of the vocabulary , causing a data sparsity problem because for each of the exponentially many sequences , statistics are needed to properly estimate probabilities .
4 | neural networks avoid this problem by representing words in a distributed way , as non - linear combinations of weights in a neural net .
5 | the neural net architecture might be feed - forward or recurrent .
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/data/vocab_test.txt:
--------------------------------------------------------------------------------
1 | <S>
2 | </S>
3 | <UNK>
4 | the
5 | ,
6 | .
7 | to
8 | of
9 | and
10 | a
11 | in
12 | "
13 | 's
14 | that
15 | for
16 | on
17 | is
18 | The
19 | was
20 | with
21 | said
22 | as
23 | at
24 | it
25 | by
26 | from
27 | be
28 | have
29 | he
30 | has
31 | his
32 | are
33 | an
34 | )
35 | not
36 | (
37 | will
38 | who
39 | I
40 | had
41 | their
42 | --
43 | were
44 | they
45 | but
46 | been
47 | this
48 | which
49 | more
50 | or
51 | its
52 | would
53 | about
54 | :
55 | after
56 | up
57 | $
58 | one
59 | than
60 | also
61 | 't
62 | out
63 | her
64 | you
65 | year
66 | when
67 | It
68 | two
69 | people
70 | -
71 | all
72 | can
73 | over
74 | last
75 | first
76 | But
77 | into
78 | '
79 | He
80 | A
81 | we
82 | In
83 | she
84 | other
85 | new
86 | years
87 | could
88 | there
89 | ?
90 | time
91 | some
92 | them
93 | if
94 | no
95 | percent
96 | so
97 | what
98 | only
99 | government
100 | million
101 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | ## Requirements
2 | - python >= 3.6
3 | - pytorch == 0.3.0
4 | ## Dataset
5 | | |BC |PB |ZX |
6 | |:--------:|:-----:|:-----:|:----:|
7 | |train | 52,433|5,140 |1,649 |
8 | |dev | 998 |1,300 |500 |
9 | |test |1,995 |2,600 | 1,100|
10 | |unlabeled | - | 326,981|32,492|
11 |
12 | ## Performance
13 | Final results on the test data:
14 |
15 | | | PB (UAS) | PB (LAS) | ZX (UAS) | ZX (LAS) |
16 | |:------------:|:----------:|:-------:|:-------:|:-------:|
17 | |BC-train |67.55 |61.01 |**68.44**|59.55 |
18 | |PB-train |**74.52** |**69.02** |51.62 |40.36 |
19 | |ZX-train |52.24 |42.76 |68.14 |**61.71**|
20 | | MTL | 75.39 |69.69 |72.11 |65.66 |
21 | | concat | 77.49 | 72.16 |76.80 |70.85 |
22 | | DOEMB | 78.24 |72.81 |77.96 |72.01 |
23 | | +ELMo |77.62 |72.35 |78.50 |72.49 |
24 | | +Fine-tuning | **82.05** |**77.216**|**80.44**|**75.11**|
25 |
26 | ## Usage
27 |
28 |
29 |
--------------------------------------------------------------------------------
/biaffine-parser-concat/config.cfg:
--------------------------------------------------------------------------------
1 | [Data]
2 | pretrained_embeddings_file = ./data/giga.100.txt
3 | data_dir =./data
4 | train_file = %(data_dir)s/cdt-train.conll
5 | dev_file = %(data_dir)s/comment-dev.conll
6 | test_file = %(data_dir)s/comment-test.conll
7 | min_occur_count = 2
8 |
9 | [Save]
10 | save_dir =./best-model
11 | config_file = %(save_dir)s/config.cfg
12 | save_model_path = %(save_dir)s/model
13 | save_vocab_path = %(save_dir)s/vocab
14 | load_dir = ./best-model
15 | load_model_path = %(load_dir)s/model
16 | load_vocab_path = %(load_dir)s/vocab
17 |
18 | [Network]
19 | lstm_layers = 2
20 | word_dims = 100
21 | tag_dims = 100
22 | dropout_emb = 0.33
23 | lstm_hiddens = 300
24 | dropout_lstm_input = 0.33
25 | dropout_lstm_hidden = 0.33
26 | mlp_arc_size = 200
27 | mlp_rel_size = 100
28 | dropout_mlp = 0.33
29 |
30 | [Optimizer]
31 | learning_rate = 2e-3
32 | decay = .75
33 | decay_steps = 5000
34 | beta_1 = .9
35 | beta_2 = .9
36 | epsilon = 1e-12
37 | clip = 5.0
38 |
39 | [Run]
40 | num_buckets_train = 40
41 | num_buckets_valid = 10
42 | num_buckets_test = 10
43 | train_iters = 50000
44 | train_batch_size = 50
45 | test_batch_size = 100
46 | validate_every = 165
47 | save_after = 50
48 | update_every = 4
49 |
50 | [corous-weighting]
51 | domain_num_corpus_weighting=4700
52 | cdt_domain_ration=1
53 | domain_file=./data/comment-train.conll
54 |
55 |
--------------------------------------------------------------------------------
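The parser's Config.py is not dumped in this listing, but the file above is standard Python configparser syntax, so references such as %(data_dir)s, %(save_dir)s and %(load_dir)s expand from keys defined in the same section. A minimal reading sketch (an illustration only, not the repo's actual Config.py):

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read('config.cfg')

# %(data_dir)s interpolation resolves against the [Data] section itself.
print(cfg.get('Data', 'train_file'))               # ./data/cdt-train.conll
print(cfg.getint('Run', 'train_batch_size'))       # 50
print(cfg.getfloat('Optimizer', 'learning_rate'))  # 0.002
print(cfg.get('corous-weighting', 'domain_file'))  # ./data/comment-train.conll
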
/biaffine-parser-domain-embedding/config.cfg:
--------------------------------------------------------------------------------
1 | [Data]
2 | pretrained_embeddings_file = ./data/giga.100.txt
3 | data_dir = ./treebank
4 | train_file = %(data_dir)s/PC-BC.conll
5 | dev_file = %(data_dir)s/PC/PC-Dev.conll
6 | test_file = %(data_dir)s/PC/PC-Test.conll
7 | min_occur_count = 2
8 |
9 | [Save]
10 | save_dir =./best-model
11 | config_file = %(save_dir)s/config.cfg
12 | save_model_path = %(save_dir)s/model
13 | save_vocab_path = %(save_dir)s/vocab
14 | load_dir = ./best-model
15 | load_model_path = %(load_dir)s/model
16 | load_vocab_path = %(load_dir)s/vocab
17 |
18 | [Network]
19 | lstm_layers = 2
20 | word_dims = 100
21 | tag_dims = 100
22 | dropout_emb = 0.33
23 | lstm_hiddens = 300
24 | dropout_lstm_input = 0.33
25 | dropout_lstm_hidden = 0.33
26 | mlp_arc_size = 200
27 | mlp_rel_size = 100
28 | dropout_mlp = 0.33
29 |
30 | [Optimizer]
31 | learning_rate = 2e-3
32 | decay = .75
33 | decay_steps = 5000
34 | beta_1 = .9
35 | beta_2 = .9
36 | epsilon = 1e-12
37 | clip = 5.0
38 |
39 | [Run]
40 | num_buckets_train = 40
41 | num_buckets_valid = 10
42 | num_buckets_test = 10
43 | train_iters = 50000
44 | train_batch_size = 50
45 | test_batch_size = 100
46 | validate_every = 165
47 | save_after = 50
48 | update_every = 4
49 |
50 | [corous-weighting]
51 | domain_num_corpus_weighting=1500
52 | cdt_domain_ration=11
53 | domain_data_size=6892
54 |
55 | [Treebankid]
56 | tbank_emb_size=12
57 |
58 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
2 |
3 | ENV LC_ALL=C.UTF-8
4 | ENV LANG=C.UTF-8
5 | ENV PATH /opt/conda/bin:$PATH
6 |
7 | WORKDIR /stage
8 |
9 | # Install base packages.
10 | RUN apt-get update --fix-missing && apt-get install -y \
11 | bzip2 \
12 | ca-certificates \
13 | curl \
14 | gcc \
15 | git \
16 | libc-dev \
17 | libglib2.0-0 \
18 | libsm6 \
19 | libxext6 \
20 | libxrender1 \
21 | wget \
22 | libevent-dev \
23 | build-essential && \
24 | rm -rf /var/lib/apt/lists/*
25 |
26 | # Install Anaconda.
27 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \
28 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
29 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \
30 | rm ~/miniconda.sh
31 |
32 | # Use python 3.5
33 | RUN conda install python=3.5
34 |
35 | # Copy files.
36 | COPY bilm/ bilm/
37 | COPY bin/ bin/
38 | COPY tests/ tests/
39 | COPY README.md README.md
40 | COPY setup.py setup.py
41 | COPY usage*.py ./
42 | COPY run_tests_before_shell.sh run_tests_before_shell.sh
43 |
44 | # Install package.
45 | RUN pip install tensorflow-gpu==1.2 h5py
46 | RUN python setup.py install
47 |
48 | # Run tests to verify the Docker build
49 | # and then drop into a shell.
50 | CMD ["/bin/bash", "run_tests_before_shell.sh"]
51 |
--------------------------------------------------------------------------------
/multi_task_learning/config.cfg:
--------------------------------------------------------------------------------
1 | [Data]
2 | pretrained_embeddings_file = ./data/giga.100.txt
3 | data_dir = ./
4 | train_file = %(data_dir)s/BC/train-tbank.conll
5 | cdt_dev_file = %(data_dir)s/BC/dev-tbank.conll
6 | cw_dev_file = %(data_dir)s/PB/dev-tbank.conll
7 | cdt_test_file = %(data_dir)s/BC/test-tbank.conll
8 | cw_test_file = %(data_dir)s/PC/test-tbank.conll
9 | min_occur_count = 2
10 |
11 | [Save]
12 | save_dir =./best-model
13 | config_file = %(save_dir)s/config.cfg
14 | save_model_path = %(save_dir)s/model
15 | save_vocab_path = %(save_dir)s/vocab
16 | load_dir = ./best-model
17 | load_model_path = %(load_dir)s/model
18 | load_vocab_path = %(load_dir)s/vocab
19 |
20 | [Network]
21 | lstm_layers = 2
22 | word_dims = 100
23 | tag_dims = 100
24 | dropout_emb = 0.33
25 | lstm_hiddens = 300
26 | dropout_lstm_input = 0.33
27 | dropout_lstm_hidden = 0.33
28 | mlp_arc_size = 200
29 | mlp_rel_size = 100
30 | dropout_mlp = 0.33
31 |
32 | [Optimizer]
33 | learning_rate = 2e-3
34 | decay = .75
35 | decay_steps = 5000
36 | beta_1 = .9
37 | beta_2 = .9
38 | epsilon = 1e-12
39 | clip = 5.0
40 |
41 | [Run]
42 | num_buckets_train = 40
43 | num_buckets_valid = 10
44 | num_buckets_test = 10
45 | train_iters = 50000
46 | train_batch_size = 50
47 | test_batch_size = 100
48 | validate_every = 165
49 | save_after = 50
50 | update_every = 4
51 |
52 | [corous-weighting]
53 | domain_num_corpus_weighting=3200
54 | cdt_domain_ration=11
55 | domain_file=./content/train-tbank.conll
56 |
57 |
58 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bin/run_test.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 |
4 | from bilm.training import test, load_options_latest_checkpoint, load_vocab
5 | from bilm.data import LMDataset, BidirectionalLMDataset
6 |
7 | def main(args):
8 |     options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
9 |
10 |     # load the vocab
11 |     if 'char_cnn' in options:
12 |         max_word_length = options['char_cnn']['max_characters_per_token']
13 |     else:
14 |         max_word_length = None
15 |     vocab = load_vocab(args.vocab_file, max_word_length)
16 |
17 |     test_prefix = args.test_prefix
18 |
19 |     kwargs = {
20 |         'test': True,
21 |         'shuffle_on_load': False,
22 |     }
23 |
24 |     if options.get('bidirectional'):
25 |         data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
26 |     else:
27 |         data = LMDataset(test_prefix, vocab, **kwargs)
28 |
29 |     test(options, ckpt_file, data, batch_size=args.batch_size)
30 |
31 |
32 | if __name__ == '__main__':
33 |     parser = argparse.ArgumentParser(description='Compute test perplexity')
34 |     parser.add_argument('--save_dir', help='Location of checkpoint files')
35 |     parser.add_argument('--vocab_file', help='Vocabulary file')
36 |     parser.add_argument('--test_prefix', help='Prefix for test files')
37 |     parser.add_argument('--batch_size',
38 |                         type=int, default=256,
39 |                         help='Batch size')
40 |
41 |     args = parser.parse_args()
42 |     main(args)
43 |
44 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/usage_cached.py:
--------------------------------------------------------------------------------
1 | '''
2 | ELMo usage example to write biLM embeddings for an entire dataset to
3 | a file.
4 | '''
5 |
6 | import os
7 | import h5py
8 | from bilm import dump_bilm_embeddings
9 |
10 | # Our small dataset.
11 | raw_context = [
12 | 'Pretrained biLMs compute representations useful for NLP tasks .',
13 | 'They give state of the art performance for many tasks .'
14 | ]
15 | tokenized_context = [sentence.split() for sentence in raw_context]
16 | tokenized_question = [
17 | ['What', 'are', 'biLMs', 'useful', 'for', '?'],
18 | ]
19 |
20 | # Create the dataset file.
21 | dataset_file = 'dataset_file.txt'
22 | with open(dataset_file, 'w') as fout:
23 |     for sentence in tokenized_context + tokenized_question:
24 |         fout.write(' '.join(sentence) + '\n')
25 |
26 |
27 | # Location of pretrained LM. Here we use the test fixtures.
28 | datadir = os.path.join('tests', 'fixtures', 'model')
29 | vocab_file = os.path.join(datadir, 'vocab_test.txt')
30 | options_file = os.path.join(datadir, 'options.json')
31 | weight_file = os.path.join(datadir, 'lm_weights.hdf5')
32 |
33 | # Dump the embeddings to a file. Run this once for your dataset.
34 | embedding_file = 'elmo_embeddings.hdf5'
35 | dump_bilm_embeddings(
36 | vocab_file, dataset_file, options_file, weight_file, embedding_file
37 | )
38 |
39 | # Load the embeddings from the file -- here the 2nd sentence.
40 | with h5py.File(embedding_file, 'r') as fin:
41 | second_sentence_embeddings = fin['1'][...]
42 |
43 |
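dump_bilm_embeddings stores one HDF5 dataset per input line, keyed by the 0-based line index as a string (the snippet above reads key '1' for the second sentence). A small follow-up sketch that walks every stored sentence; the three axes are the biLM layers, the tokens and the embedding dimension:

# Iterate over all sentence embeddings written by dump_bilm_embeddings above.
import h5py

with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
    for key in sorted(fin.keys(), key=int):
        emb = fin[key][...]
        print(key, emb.shape)   # e.g. '0' (n_layers, n_tokens, lm_dim)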
--------------------------------------------------------------------------------
/biaffine-parser-elmo/config.cfg:
--------------------------------------------------------------------------------
1 | [Data]
2 | pretrained_embeddings_file = ./data/giga.100.txt
3 | data_dir = ./tbank-data
4 | train_file = %(data_dir)s/train-tbank.conll
5 | dev_file = %(data_dir)s/dev-tbank.conll
6 | test_file = %(data_dir)s/test-tbank.conll
7 | min_occur_count = 2
8 |
9 | [Save]
10 | save_dir = ./best-model
11 | config_file = %(save_dir)s/config.cfg
12 | save_model_path = %(save_dir)s/model
13 | save_vocab_path = %(save_dir)s/vocab
14 | load_dir = ./best-model
15 | load_model_path = %(load_dir)s/model
16 | load_vocab_path = %(load_dir)s/vocab
17 |
18 | [Network]
19 | lstm_layers = 2
20 | word_dims = 100
21 | tag_dims = 100
22 | dropout_emb = 0.33
23 | lstm_hiddens = 300
24 | dropout_lstm_input = 0.33
25 | dropout_lstm_hidden = 0.33
26 | mlp_arc_size = 200
27 | mlp_rel_size = 100
28 | dropout_mlp = 0.33
29 |
30 | [Optimizer]
31 | learning_rate = 2e-3
32 | decay = .75
33 | decay_steps = 5000
34 | beta_1 = .9
35 | beta_2 = .9
36 | epsilon = 1e-12
37 | clip = 5.0
38 |
39 | [Run]
40 | num_buckets_train = 40
41 | num_buckets_valid = 10
42 | num_buckets_test = 10
43 | train_iters = 50000
44 | train_batch_size = 50
45 | test_batch_size = 100
46 | validate_every = 165
47 | save_after = 50
48 | update_every = 4
49 |
50 | [corous-weighting]
51 | domain_num_corpus_weighting=4700
52 | cdt_domain_ration=3
53 | domain_data_size = 6892
54 |
55 | [Elmo]
56 | train_elmo=./train.1024.bin
57 | dev_elmo=./dev.1024.bin
58 | test_elmo=./test.1024.bin
59 | elmo_dims=100
60 |
61 | [Treebankid]
62 | tbank_emb_size=12
63 |
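The %(save_dir)s / %(data_dir)s / %(load_dir)s references are standard configparser basic interpolation, so each one resolves against a key in the same section. Assuming driver/Config.py's Configurable wraps the standard library parser (a common pattern, not verified here), the paths resolve as in this minimal sketch:

# Minimal sketch of how the %(...)s references in config.cfg resolve with
# Python's configparser basic interpolation; Configurable itself is assumed, not shown.
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read('config.cfg')
print(cfg.get('Save', 'save_model_path'))          # ./best-model/model
print(cfg.get('Data', 'train_file'))               # ./tbank-data/train-tbank.conll
print(cfg.getfloat('Optimizer', 'learning_rate'))  # 0.002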
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bin/restart.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import argparse
4 | import numpy as np
5 | import sys
6 | sys.path.extend(["../../","../","./"])
7 |
8 | from bilm.training import train, load_options_latest_checkpoint, load_vocab
9 | from bilm.data import LMDataset, BidirectionalLMDataset
10 |
11 | def main(args):
12 | options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
13 |
14 | if 'char_cnn' in options:
15 | max_word_length = options['char_cnn']['max_characters_per_token']
16 | else:
17 | max_word_length = None
18 | vocab = load_vocab(args.vocab_file, max_word_length)
19 |
20 | prefix = args.train_prefix
21 |
22 | kwargs = {
23 | 'test': False,
24 | 'shuffle_on_load': True,
25 | }
26 |
27 | if options.get('bidirectional'):
28 | data = BidirectionalLMDataset(prefix, vocab, **kwargs)
29 | else:
30 | data = LMDataset(prefix, vocab, **kwargs)
31 |
32 | tf_save_dir = args.save_dir
33 | tf_log_dir = args.save_dir
34 |
35 | # set optional inputs
36 | if args.n_train_tokens > 0:
37 | options['n_train_tokens'] = args.n_train_tokens
38 | if args.n_epochs > 0:
39 | options['n_epochs'] = args.n_epochs
40 | if args.batch_size > 0:
41 | options['batch_size'] = args.batch_size
42 |
43 | train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
44 | restart_ckpt_file=ckpt_file)
45 |
46 |
47 | if __name__ == '__main__':
48 | parser = argparse.ArgumentParser()
49 | parser.add_argument('--save_dir', help='Location of checkpoint files')
50 | parser.add_argument('--vocab_file', help='Vocabulary file')
51 | parser.add_argument('--train_prefix', help='Prefix for train files')
52 | parser.add_argument('--n_gpus', type=int, default=3,
53 | help='Number of GPUs to use')
54 | parser.add_argument('--batch_size', type=int, default=64)
55 | parser.add_argument('--n_train_tokens', type=int, default=15622976)
56 | parser.add_argument('--n_epochs', type=int, default=8)
57 |
58 | args = parser.parse_args()
59 | main(args)
60 |
61 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bin/train_elmo.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import sys
4 | sys.path.extend(["../../","../","./"])
5 | from bilm.training import train, load_options_latest_checkpoint, load_vocab
6 | from bilm.data import BidirectionalLMDataset
7 |
8 |
9 | def main(args):
10 | # load the vocab
11 | vocab = load_vocab(args.vocab_file, 110)
12 |
13 | # define the options
14 | batch_size = 64  # batch size for each GPU
15 | n_gpus = 2
16 |
17 | # number of tokens in the training data (here the fine-tuning corpus, not the upstream 1B Word Benchmark default)
18 | n_train_tokens = 42466511
19 |
20 | options = {
21 | 'bidirectional': True,
22 |
23 | 'char_cnn': {'activation': 'relu',
24 | 'embedding': {'dim': 16},
25 | 'filters': [[1, 32],
26 | [2, 32],
27 | [3, 64],
28 | [4, 128],
29 | [5, 256],
30 | [6, 512],
31 | [7, 1024]],
32 | 'max_characters_per_token': 110,
33 | 'n_characters': 6612,
34 | 'n_highway': 2},
35 |
36 | 'dropout': 0.1,
37 |
38 | 'lstm': {
39 | 'cell_clip': 3,
40 | 'dim': 4096,
41 | 'n_layers': 2,
42 | 'proj_clip': 3,
43 | 'projection_dim': 512,
44 | #'projection_dim': 50,
45 | 'use_skip_connections': True},
46 |
47 | 'all_clip_norm_val': 10.0,
48 |
49 | 'n_epochs': 10,
50 | 'n_train_tokens': n_train_tokens,
51 | 'batch_size': batch_size,
52 | 'n_tokens_vocab': vocab.size,
53 | 'unroll_steps': 20,
54 | 'n_negative_samples_batch': 8192,
55 | }
56 |
57 | prefix = args.train_prefix
58 | data = BidirectionalLMDataset(prefix, vocab, test=False,
59 | shuffle_on_load=True)
60 |
61 | tf_save_dir = args.save_dir
62 | tf_log_dir = args.save_dir
63 | train(options, data, n_gpus, tf_save_dir, tf_log_dir)
64 |
65 |
66 | if __name__ == '__main__':
67 | parser = argparse.ArgumentParser()
68 | parser.add_argument('--save_dir', help='Location of checkpoint files')
69 | parser.add_argument('--vocab_file', help='Vocabulary file')
70 | parser.add_argument('--train_prefix', help='Prefix for train files')
71 |
72 | args = parser.parse_args()
73 | main(args)
74 |
75 |
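The options dict fixes the number of tokens processed per optimisation step at batch_size * unroll_steps per GPU. A rough sketch of the resulting batch budget, assuming the upstream bilm-tf accounting that produces the "Training for N epochs and M batches" log line (treat the numbers as an estimate, not a guarantee):

# Back-of-the-envelope step count for the options defined above.
n_train_tokens = 42466511
batch_size, unroll_steps, n_gpus, n_epochs = 64, 20, 2, 10

tokens_per_step = batch_size * unroll_steps * n_gpus      # 2560 tokens per optimisation step
batches_per_epoch = n_train_tokens // tokens_per_step     # ~16588
print(n_epochs, 'epochs ->', n_epochs * batches_per_epoch, 'batches')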
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/usage_character.py:
--------------------------------------------------------------------------------
1 | '''
2 | ELMo usage example with character inputs.
3 |
4 | Below, we show usage for SQuAD where each input example consists of both
5 | a question and a paragraph of context.
6 | '''
7 |
8 | import tensorflow as tf
9 | import os
10 | from bilm import Batcher, BidirectionalLanguageModel, weight_layers
11 |
12 | # Location of pretrained LM. Here we use the test fixtures.
13 | datadir = os.path.join('tests', 'fixtures', 'model')
14 | vocab_file = os.path.join(datadir, 'vocab_test.txt')
15 | options_file = os.path.join(datadir, 'options.json')
16 | weight_file = os.path.join(datadir, 'lm_weights.hdf5')
17 |
18 | # Create a Batcher to map text to character ids.
19 | batcher = Batcher(vocab_file, 50)
20 |
21 | # Input placeholders to the biLM.
22 | context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
23 | question_character_ids = tf.placeholder('int32', shape=(None, None, 50))
24 |
25 | # Build the biLM graph.
26 | bilm = BidirectionalLanguageModel(options_file, weight_file)
27 |
28 | # Get ops to compute the LM embeddings.
29 | context_embeddings_op = bilm(context_character_ids)
30 | question_embeddings_op = bilm(question_character_ids)
31 |
32 | # Get an op to compute ELMo (weighted average of the internal biLM layers)
33 | # Our SQuAD model includes ELMo at both the input and output layers
34 | # of the task GRU, so we need 4x ELMo representations for the question
35 | # and context at each of the input and output.
36 | # We use the same ELMo weights for both the question and context
37 | # at each of the input and output.
38 | elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
39 | with tf.variable_scope('', reuse=True):
40 | # the reuse=True scope reuses weights from the context for the question
41 | elmo_question_input = weight_layers(
42 | 'input', question_embeddings_op, l2_coef=0.0
43 | )
44 |
45 | elmo_context_output = weight_layers(
46 | 'output', context_embeddings_op, l2_coef=0.0
47 | )
48 | with tf.variable_scope('', reuse=True):
49 | # the reuse=True scope reuses weights from the context for the question
50 | elmo_question_output = weight_layers(
51 | 'output', question_embeddings_op, l2_coef=0.0
52 | )
53 |
54 |
55 | # Now we can compute embeddings.
56 | raw_context = [
57 | 'Pretrained biLMs compute representations useful for NLP tasks .',
58 | 'They give state of the art performance for many tasks .'
59 | ]
60 | tokenized_context = [sentence.split() for sentence in raw_context]
61 | tokenized_question = [
62 | ['What', 'are', 'biLMs', 'useful', 'for', '?'],
63 | ]
64 |
65 | with tf.Session() as sess:
66 | # It is necessary to initialize variables once before running inference.
67 | sess.run(tf.global_variables_initializer())
68 |
69 | # Create batches of data.
70 | context_ids = batcher.batch_sentences(tokenized_context)
71 | question_ids = batcher.batch_sentences(tokenized_question)
72 |
73 | # Compute ELMo representations (here for the input only, for simplicity).
74 | elmo_context_input_, elmo_question_input_ = sess.run(
75 | [elmo_context_input['weighted_op'], elmo_question_input['weighted_op']],
76 | feed_dict={context_character_ids: context_ids,
77 | question_character_ids: question_ids}
78 | )
79 |
80 |
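As a quick follow-on inside the session above (not part of the original script), one might check the shapes of the returned arrays; the last axis is twice the projection_dim in options.json because forward and backward LSTM states are concatenated, and the Batcher's 50 should match max_characters_per_token there:

    # Appended inside the tf.Session() block above: purely a sanity check on shapes.
    print(elmo_context_input_.shape)    # (2, max_context_len, lm_dim)
    print(elmo_question_input_.shape)   # (1, max_question_len, lm_dim)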
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/Test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.extend(["../../","../","./"])
3 | import time
4 | import numpy as np
5 | import pickle
6 | import argparse
7 | from driver.Config import *
8 | from driver.Model import *
9 | from driver.Parser import *
10 | from data.Dataloader import *
11 | import pickle
12 | import random
13 | os.environ["CUDA_VISIBLE_DEVICES"] = "2"
14 | def evaluate(data, parser, vocab, outputFile):
15 | start = time.time()
16 | parser.model.eval()
17 | output = open(outputFile, 'w', encoding='utf-8')
18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0
19 |
20 | for onebatch in data_iter(data, config.test_batch_size, False):
21 | words, extwords, tags, heads, rels, lengths, masks = \
22 | batch_data_variable(onebatch, vocab)
23 | count = 0
24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks)
25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab):
26 | printDepTree(output, tree)
27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count])
28 | arc_total_test += arc_total
29 | arc_correct_test += arc_correct
30 | rel_total_test += rel_total
31 | rel_correct_test += rel_correct
32 | count += 1
33 |
34 | output.close()
35 |
36 | uas = arc_correct_test * 100.0 / arc_total_test
37 | las = rel_correct_test * 100.0 / rel_total_test
38 |
39 |
40 | end = time.time()
41 | during_time = float(end - start)
42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time))
43 |
44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las
45 |
46 |
47 |
48 | if __name__ == '__main__':
49 | random.seed(666)
50 | np.random.seed(666)
51 | torch.cuda.manual_seed(666)
52 | torch.manual_seed(666)
53 |
54 | ### gpu
55 | gpu = torch.cuda.is_available()
56 | print("GPU available: ", gpu)
57 | print("CuDNN: \n", torch.backends.cudnn.enabled)
58 |
59 | argparser = argparse.ArgumentParser()
60 | argparser.add_argument('--config_file', default='examples/default.cfg')
61 | argparser.add_argument('--model', default='BaseParser')
62 | argparser.add_argument('--thread', default=4, type=int, help='thread num')
63 | argparser.add_argument('--use-cuda', action='store_true', default=True)
64 |
65 | args, extra_args = argparser.parse_known_args()
66 | config = Configurable(args.config_file, extra_args)
67 |
68 | vocab = pickle.load(open(config.load_vocab_path, 'rb'))
69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file)
70 |
71 |
72 | args, extra_args = argparser.parse_known_args()
73 | config = Configurable(args.config_file, extra_args)
74 | torch.set_num_threads(args.thread)
75 |
76 | config.use_cuda = False
77 | if gpu and args.use_cuda: config.use_cuda = True
78 | print("\nGPU using status: ", config.use_cuda)
79 | # print(config.use_cuda)
80 |
81 | model = ParserModel(vocab, config, vec)
82 | model.load_state_dict(torch.load(config.load_model_path))
83 | if config.use_cuda:
84 | torch.backends.cudnn.enabled = True
85 | model = model.cuda()
86 |
87 | parser = BiaffineParser(model, vocab.ROOT)
88 | test_data = read_corpus(config.test_file, vocab)
89 |
90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \
91 | evaluate(test_data, parser, vocab, config.test_file + '.out')
92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \
93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las))
94 |
95 |
96 |
--------------------------------------------------------------------------------
/multi_task_learning/driver/Test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.extend(["../../","../","./"])
3 | import time
4 | import numpy as np
5 | import pickle
6 | import argparse
7 | from driver.Config import *
8 | from driver.Model import *
9 | from driver.Parser import *
10 | from data.Dataloader import *
11 | import pickle
12 | import random
13 | os.environ["CUDA_VISIBLE_DEVICES"] = "2"
14 | def evaluate(data, parser, vocab, outputFile):
15 | start = time.time()
16 | parser.model.eval()
17 | #output = open(outputFile, 'w', encoding='utf-8')
18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0
19 |
20 | for onebatch in data_iter(data, config.test_batch_size, False):
21 | words, extwords, tags, heads, rels, lengths, masks = \
22 | batch_data_variable(onebatch, vocab)
23 | count = 0
24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks)
25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab):
26 | #printDepTree(output, tree)
27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count])
28 | arc_total_test += arc_total
29 | arc_correct_test += arc_correct
30 | rel_total_test += rel_total
31 | rel_correct_test += rel_correct
32 | count += 1
33 |
34 | #output.close()
35 |
36 | uas = arc_correct_test * 100.0 / arc_total_test
37 | las = rel_correct_test * 100.0 / rel_total_test
38 |
39 |
40 | end = time.time()
41 | during_time = float(end - start)
42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time))
43 |
44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las
45 |
46 |
47 |
48 | if __name__ == '__main__':
49 | random.seed(666)
50 | np.random.seed(666)
51 | torch.cuda.manual_seed(666)
52 | torch.manual_seed(666)
53 |
54 | ### gpu
55 | gpu = torch.cuda.is_available()
56 | print("GPU available: ", gpu)
57 | print("CuDNN: \n", torch.backends.cudnn.enabled)
58 |
59 | argparser = argparse.ArgumentParser()
60 | argparser.add_argument('--config_file', default='examples/default.cfg')
61 | argparser.add_argument('--model', default='BaseParser')
62 | argparser.add_argument('--thread', default=4, type=int, help='thread num')
63 | argparser.add_argument('--use-cuda', action='store_true', default=True)
64 |
65 | args, extra_args = argparser.parse_known_args()
66 | config = Configurable(args.config_file, extra_args)
67 |
68 | vocab = pickle.load(open(config.load_vocab_path, 'rb'))
69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file)
70 |
71 |
72 | args, extra_args = argparser.parse_known_args()
73 | config = Configurable(args.config_file, extra_args)
74 | torch.set_num_threads(args.thread)
75 |
76 | config.use_cuda = False
77 | if gpu and args.use_cuda: config.use_cuda = True
78 | print("\nGPU using status: ", config.use_cuda)
79 | # print(config.use_cuda)
80 |
81 | model = ParserModel(vocab, config, vec)
82 | model.load_state_dict(torch.load(config.load_model_path))
83 | if config.use_cuda:
84 | torch.backends.cudnn.enabled = True
85 | model = model.cuda()
86 |
87 | parser = BiaffineParser(model, vocab.ROOT)
88 | test_data = read_corpus(config.test_file, vocab)
89 |
90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \
91 | evaluate(test_data, parser, vocab, config.test_file + '.out')
92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \
93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las))
94 |
95 |
96 |
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/Test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.extend(["../../../../","../../../", "../../", "../", "./"])
3 | import time
4 | import numpy as np
5 | import pickle
6 | import argparse
7 | from driver.Config import *
8 | from driver.Model import *
9 | from driver.Parser import *
10 | from data.Dataloader import *
11 | import pickle
12 | import random
13 | os.environ["CUDA_VISIBLE_DEVICES"] = "2"
14 | def evaluate(data, parser, vocab, outputFile):
15 | start = time.time()
16 | parser.model.eval()
17 | #output = open(outputFile, 'w', encoding='utf-8')
18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0
19 |
20 | for onebatch in data_iter(data, config.test_batch_size, False):
21 | words, extwords, tags, heads, rels, lengths, masks,tbank_ids = \
22 | batch_data_variable(onebatch, vocab)
23 | count = 0
24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks,tbank_ids)
25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab):
26 | #printDepTree(output, tree)
27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count])
28 | arc_total_test += arc_total
29 | arc_correct_test += arc_correct
30 | rel_total_test += rel_total
31 | rel_correct_test += rel_correct
32 | count += 1
33 |
34 | #output.close()
35 |
36 | uas = arc_correct_test * 100.0 / arc_total_test
37 | las = rel_correct_test * 100.0 / rel_total_test
38 |
39 |
40 | end = time.time()
41 | during_time = float(end - start)
42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time))
43 |
44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las
45 |
46 |
47 |
48 | if __name__ == '__main__':
49 | random.seed(666)
50 | np.random.seed(666)
51 | torch.cuda.manual_seed(666)
52 | torch.manual_seed(666)
53 |
54 | ### gpu
55 | gpu = torch.cuda.is_available()
56 | print("GPU available: ", gpu)
57 | print("CuDNN: \n", torch.backends.cudnn.enabled)
58 |
59 | argparser = argparse.ArgumentParser()
60 | argparser.add_argument('--config_file', default='examples/default.cfg')
61 | argparser.add_argument('--model', default='BaseParser')
62 | argparser.add_argument('--thread', default=4, type=int, help='thread num')
63 | argparser.add_argument('--use-cuda', action='store_true', default=True)
64 |
65 | args, extra_args = argparser.parse_known_args()
66 | config = Configurable(args.config_file, extra_args)
67 |
68 | vocab = pickle.load(open(config.load_vocab_path, 'rb'))
69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file)
70 |
71 |
72 | args, extra_args = argparser.parse_known_args()
73 | config = Configurable(args.config_file, extra_args)
74 | torch.set_num_threads(args.thread)
75 |
76 | config.use_cuda = False
77 | if gpu and args.use_cuda: config.use_cuda = True
78 | print("\nGPU using status: ", config.use_cuda)
79 | # print(config.use_cuda)
80 |
81 | model = ParserModel(vocab, config, vec)
82 | model.load_state_dict(torch.load(config.load_model_path))
83 | if config.use_cuda:
84 | torch.backends.cudnn.enabled = True
85 | model = model.cuda()
86 |
87 | parser = BiaffineParser(model, vocab.ROOT)
88 | test_data = read_corpus(config.test_file, vocab)
89 |
90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \
91 | evaluate(test_data, parser, vocab, config.test_file + '.out')
92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \
93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las))
94 |
95 |
96 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/Test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.extend(["../../","../","./"])
3 | import time
4 | import numpy as np
5 | import pickle
6 | import argparse
7 | from driver.Config import *
8 | from driver.Model import *
9 | from driver.Parser import *
10 | from data.Dataloader import *
11 | import pickle
12 | import random
13 | os.environ["CUDA_VISIBLE_DEVICES"] = "5"
14 | def evaluate(data, parser, vocab, outputFile,elmofile):
15 | start = time.time()
16 | parser.model.eval()
17 | #output = open(outputFile, 'w', encoding='utf-8')
18 | arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0
19 |
20 | for onebatch in data_iter(data, config.test_batch_size, False):
21 | words, extwords, tags, heads, rels, lengths, masks, elmosens ,tbank_ids= \
22 | batch_data_variable(onebatch, vocab)
23 | count = 0
24 | arcs_batch, rels_batch = parser.parse(words, extwords, tags, lengths, masks,elmosens,elmofile,tbank_ids)
25 | for tree in batch_variable_depTree(onebatch, arcs_batch, rels_batch, lengths, vocab):
26 | #printDepTree(output, tree)
27 | arc_total, arc_correct, rel_total, rel_correct = evalDepTree(predict=tree, gold=onebatch[count])
28 | arc_total_test += arc_total
29 | arc_correct_test += arc_correct
30 | rel_total_test += rel_total
31 | rel_correct_test += rel_correct
32 | count += 1
33 |
34 | #output.close()
35 |
36 | uas = arc_correct_test * 100.0 / arc_total_test
37 | las = rel_correct_test * 100.0 / rel_total_test
38 |
39 |
40 | end = time.time()
41 | during_time = float(end - start)
42 | print("sentence num: %d, parser time = %.2f " % (len(data), during_time))
43 |
44 | return arc_correct_test, rel_correct_test, arc_total_test, uas, las
45 |
46 |
47 |
48 | if __name__ == '__main__':
49 | random.seed(666)
50 | np.random.seed(666)
51 | torch.cuda.manual_seed(666)
52 | torch.manual_seed(666)
53 |
54 | ### gpu
55 | gpu = torch.cuda.is_available()
56 | print("GPU available: ", gpu)
57 | print("CuDNN: \n", torch.backends.cudnn.enabled)
58 |
59 | argparser = argparse.ArgumentParser()
60 | argparser.add_argument('--config_file', default='examples/default.cfg')
61 | argparser.add_argument('--model', default='BaseParser')
62 | argparser.add_argument('--thread', default=4, type=int, help='thread num')
63 | argparser.add_argument('--use-cuda', action='store_true', default=True)
64 |
65 | args, extra_args = argparser.parse_known_args()
66 | config = Configurable(args.config_file, extra_args)
67 |
68 | vocab = pickle.load(open(config.load_vocab_path, 'rb'))
69 | vec = vocab.create_pretrained_embs(config.pretrained_embeddings_file)
70 |
71 |
72 | args, extra_args = argparser.parse_known_args()
73 | config = Configurable(args.config_file, extra_args)
74 | torch.set_num_threads(args.thread)
75 |
76 | config.use_cuda = False
77 | if gpu and args.use_cuda: config.use_cuda = True
78 | print("\nGPU using status: ", config.use_cuda)
79 | # print(config.use_cuda)
80 |
81 | model = ParserModel(vocab, config, vec)
82 | model.load_state_dict(torch.load(config.load_model_path))
83 | if config.use_cuda:
84 | torch.backends.cudnn.enabled = True
85 | model = model.cuda()
86 |
87 | parser = BiaffineParser(model, vocab.ROOT)
88 | test_data = read_corpus(config.test_file, vocab)
89 |
90 | arc_correct, rel_correct, arc_total, test_uas, test_las = \
91 | evaluate(test_data, parser, vocab, config.test_file + '.out',config.test_elmo)
92 | print("Test: uas = %d/%d = %.2f, las = %d/%d =%.2f" % \
93 | (arc_correct, arc_total, test_uas, rel_correct, arc_total, test_las))
94 |
95 |
96 |
--------------------------------------------------------------------------------
/ELMo/use_ELMo/get_elmo_bin.py:
--------------------------------------------------------------------------------
1 |
2 | import struct
3 | from allennlp.modules.elmo import Elmo,batch_to_ids
4 |
5 | class get_elmo:
6 | def __init__(self,options_file,weight_file,test_file,outfile):
7 | self.options_file=options_file
8 | self.weight_file = weight_file
9 | self.test_file = test_file
10 | self.outfile = outfile
11 | def get_elmo(self):
12 | test_file = open(self.test_file, "r")
13 | outfile = open(self.outfile, "w")
14 | elmo = Elmo(self.options_file, self.weight_file, 2, dropout=0)
15 | sentence = []
16 | sen=[]
17 | sentencenum = 0
18 | for line in test_file:
19 | if line == "\n":
20 | sentencenum += 1
21 | print(sentencenum,flush=True)
22 | sen.append(sentence)
23 | character_ids = batch_to_ids(sen)
24 | print(character_ids)
25 | embeddings = elmo(character_ids)
26 | embs=embeddings['elmo_representations'][0]
27 | for char in range(embs.shape[1]):
28 | for layer in range(embs.shape[0]):
29 | for dim in range(embs.shape[2]):
30 | outfile.write(str(float(embs[layer][char][dim]))+' ')
31 | outfile.write('#')
32 | outfile.write('\n')
33 | outfile.write("\n")
34 | outfile.flush()
35 |
36 | sentence = []
37 | sen=[]
38 | else:
39 | sentence.append(line.strip().split('\t')[1])
40 | test_file.close()
41 | outfile.close()
42 | class write_bin:
43 | def __init__(self,inputfile,outputfile):
44 | self.inputfile = inputfile
45 | self.outputfile = outputfile
46 | def get_lines(self):
47 | f1 = open(self.inputfile, "r")
48 | word = 0
49 | sens = 0
50 | count = 0
51 | for line in f1:
52 | count += 1
53 | if line != '\n' and line != '\r\n':
54 | word += 1
55 | else:
56 | sens += 1
57 | print("count", count)
58 | print("word", word)
59 | print("sens", sens)
60 |
61 | def get_first_layer(self):
62 | infile = open(self.inputfile, "r")
63 | outfile = open(self.outputfile, "wb")
64 |
65 | file_embs = []
66 | sentence_emb_first = []
67 | sentence_emb_second = []
68 | sentence_emb_third = []
69 | sentencenum = 0
70 | for line in infile:
71 | if line != '\n':
72 | parts = line.strip().split('#')
73 | firstlayer = parts[0].split()
74 | if len(firstlayer) != 1024:
75 | print("first layer", len(firstlayer))
76 |
77 | sentence_emb_first.append(firstlayer)
78 | else:
79 | sentencenum += 1
80 | for first in sentence_emb_first:
81 | for emb1 in first:
82 | emb = float(emb1)
83 | outfile.write(struct.pack('f', float(emb)))
84 | if sentencenum % 1000 == 0:
85 | print(sentencenum, flush=True)
86 | sentence_emb_first = []
87 | sentence_emb_second = []
88 | sentence_emb_third = []
89 | print('sentencenum', sentencenum)
90 | if __name__ == "__main__":
91 | train_file = "./train.conll"
92 | train_outfile = "./train/train"
93 |
94 | A = get_elmo("./options.json","./weights.hdf5",train_file,train_outfile+".elmo")
95 | A.get_elmo()
96 | B = write_bin(train_outfile+".elmo",train_outfile+".1024.bin")
97 | B.get_lines()
98 | B.get_first_layer()
99 |
100 |
101 |
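get_first_layer writes a bare stream of native-endian float32 values, 1024 per token, with no header or sentence boundaries, so the matrix can be recovered directly with numpy. A minimal read-back sketch (the path mirrors train_outfile above):

# Read back the flat float32 stream written with struct.pack('f', ...);
# each consecutive block of 1024 values is one token's first-layer ELMo vector.
import numpy as np

embs = np.fromfile('./train/train.1024.bin', dtype=np.float32)
assert embs.size % 1024 == 0
embs = embs.reshape(-1, 1024)
print(embs.shape)   # (total tokens across all sentences, 1024)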
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/model/lm_embeddings_sentences.json:
--------------------------------------------------------------------------------
1 | [["The U.S. Centers for Disease Control and Prevention initially advised school systems to close if outbreaks occurred , then reversed itself , saying the apparent mildness of the virus meant most schools and day care centers should stay open , even if they had confirmed cases of swine flu .", "When Ms. Winfrey invited Suzanne Somers to share her controversial views about bio-identical hormone treatment on her syndicated show in 2009 , it won Ms. Winfrey a rare dollop of unflattering press , including a Newsweek cover story titled \" Crazy Talk : Oprah , Wacky Cures & You . \"", "Elk calling -- a skill that hunters perfected long ago to lure game with the promise of a little romance -- is now its own sport .", "Don 't !", "Fish , ranked 98th in the world , fired 22 aces en route to a 6-3 , 6-7 ( 5 / 7 ) , 7-6 ( 7 / 4 ) win over seventh-seeded Argentinian David Nalbandian .", "Why does everything have to become such a big issue ?", "AMMAN ( Reuters ) - King Abdullah of Jordan will meet U.S. President Barack Obama in Washington on April 21 to lobby on behalf of Arab states for a stronger U.S. role in Middle East peacemaking , palace officials said on Sunday .", "To help keep traffic flowing the Congestion Charge will remain in operation through-out the strike and TfL will be suspending road works on major London roads wherever possible .", "If no candidate wins an absolute majority , there will be a runoff between the top two contenders , most likely in mid-October .", "Authorities previously served search warrants at Murray 's Las Vegas home and his businesses in Las Vegas and Houston ."], ["Brent North Sea crude for November delivery rose 84 cents to 68.88 dollars a barrel .", "That seems to have been their model up til now .", "Gordon will join Luol Deng on the GB team ; their respective NBA teams , the Detroit Pistons and the Chicago Bulls , play tonight .", "Nikam maintains the attacks were masterminded by the Muslim militant group Lashkar-e-Taiba .", "Last year , Williams was unseeded , ranked 81st and coming off one of her worst losses on tour -- in a Tier 4 event at Hobart -- yet she beat six seeded players en route to the title at Melbourne Park .", "It said that two officers involved in the case had been disciplined .", "\" There is more intelligence now being gathered , \" the official said , adding that such efforts would continue for some time .", "The majority will be of the standard 6X6 configuration for carrying personnel .", "\" Consequently , necessary actions may not be taken to reduce the risks to children of sexual exploitation and drug or alcohol misuse , \" the report said . \u2022 Almost two-thirds of inspected schools were good or outstanding , but the number of underperforming secondaries remained \" stubborn and persistent . \"", "What a World Cup ."], ["But , there have also been many cases of individuals and small groups of people protesting , as in the case of Rongye Adak , a nomad who called for the return of the Dalai Lama and for the freedom of Tibet during the Lithang Horse Racing Festival , in eastern Tibet .", "James Duncan , head of transportation at Bournemouth Borough Council , said : \" Our legal team is reviewing the entitlement of taxis to drop and pick up passengers at bus stops , only for as long as is absolutely necessary to fulfil that function and for no other reason .", "To Mo concerning the food log you kept -- Dr. 
Buchholz recommends the same thing .", "The CBO estimates that only 23 percent of that would be spent in 2009 and 2010 .", "Even so , Democrats slammed Bush as out of touch .", "An information campaign will be launched later to raise awareness of employment rights and how to enforce them .", "At the gallery the concept is less vague , as Ms. Piper cites specific instances of racial violence , political assassinations and the devastation of Hurricane Katrina .", "There have been some exceptions -- such as Medicare in 1965 .", "The government guidance will be reviewed early next year after a period of public comment .", "It wasn 't the most seaworthy of prizes ."]]
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/Dataloader.py:
--------------------------------------------------------------------------------
1 | from data.Vocab import *
2 | import numpy as np
3 | import torch
4 | from torch.autograd import Variable
5 | import random
6 |
7 | def read_corpus(file_path, vocab=None):
8 | data = []
9 | with open(file_path, 'r') as infile:
10 | for sentence in readDepTree(infile, vocab):
11 | data.append(sentence)
12 | return data
13 |
14 | def sentences_numberize(sentences, vocab):
15 | for sentence in sentences:
16 | yield sentence2id(sentence, vocab)
17 |
18 | def sentence2id(sentence, vocab):
19 | result = []
20 | for dep in sentence:
21 | wordid = vocab.word2id(dep.form)
22 | extwordid = vocab.extword2id(dep.form)
23 | tagid = vocab.tag2id(dep.tag)
24 | head = dep.head
25 | if head == -1:
26 | relid = -1
27 | elif head >= 0:
28 | relid = vocab.rel2id(dep.rel)
29 | result.append([wordid, extwordid, tagid, head, relid])
30 |
31 | return result
32 |
33 |
34 |
35 | def batch_slice(data, batch_size):
36 | batch_num = int(np.ceil(len(data) / float(batch_size)))
37 | for i in range(batch_num):
38 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
39 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)]
40 |
41 | yield sentences
42 |
43 |
44 | def data_iter(data, batch_size, shuffle=True):
45 | """
46 | Randomly permute the data (when shuffle is True), partition it into
47 | batches of batch_size sentences, and shuffle the batch order.
48 | """
49 |
50 | batched_data = []
51 | if shuffle: np.random.shuffle(data)
52 | batched_data.extend(list(batch_slice(data, batch_size)))
53 |
54 | if shuffle: np.random.shuffle(batched_data)
55 | for batch in batched_data:
56 | yield batch
57 |
58 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration):
59 | batch_data1=random.sample(data1,domain_num_corpus_weighting)
60 | data2_size=domain_num_corpus_weighting*cdt_domain_ration
61 | batch_data2 = random.sample(data2, data2_size)
62 | batch_data=[]
63 | batch_data.extend(batch_data1)
64 | batch_data.extend(batch_data2)
65 | return batch_data
66 |
67 | def batch_data_variable(batch, vocab):
68 | length = len(batch[0])
69 | batch_size = len(batch)
70 | for b in range(1, batch_size):
71 | if len(batch[b]) > length: length = len(batch[b])
72 |
73 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
74 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
75 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
76 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False)
77 | heads = []
78 | rels = []
79 | lengths = []
80 |
81 | b = 0
82 | for sentence in sentences_numberize(batch, vocab):
83 | index = 0
84 | length = len(sentence)
85 | lengths.append(length)
86 | head = np.zeros((length), dtype=np.int32)
87 | rel = np.zeros((length), dtype=np.int32)
88 | for dep in sentence:
89 | words[b, index] = dep[0]
90 | extwords[b, index] = dep[1]
91 | tags[b, index] = dep[2]
92 | head[index] = dep[3]
93 | rel[index] = dep[4]
94 | masks[b, index] = 1
95 | index += 1
96 | b += 1
97 | heads.append(head)
98 | rels.append(rel)
99 |
100 | return words, extwords, tags, heads, rels, lengths, masks
101 |
102 | def batch_variable_depTree(trees, heads, rels, lengths, vocab):
103 | for tree, head, rel, length in zip(trees, heads, rels, lengths):
104 | sentence = []
105 | for idx in range(length):
106 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx])))
107 | yield sentence
108 |
109 |
110 |
111 |
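data_corpus_weighting and data_iter together define one sampling round of corpus-weighted training. A minimal sketch of how a training driver might combine them; config and vocab are assumed to come from driver/Config.py and data/Vocab.py, and treating the [corous-weighting] keys as config attributes is an assumption here, mirroring how the Test.py drivers access e.g. config.test_batch_size:

# Illustrative training-side use of the helpers defined above.
target_data = read_corpus(config.domain_file, vocab)   # small in-domain treebank
source_data = read_corpus(config.train_file, vocab)    # large out-of-domain treebank

mixed = data_corpus_weighting(target_data, source_data,
                              config.domain_num_corpus_weighting,
                              config.cdt_domain_ration)
for onebatch in data_iter(mixed, config.train_batch_size, shuffle=True):
    words, extwords, tags, heads, rels, lengths, masks = \
        batch_data_variable(onebatch, vocab)
    # ... run the parser's forward/backward pass on this batch ...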
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/log:
--------------------------------------------------------------------------------
1 | Found 6 shards at /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.a*
2 | Loading data from: /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.ad
3 | Loaded 25214 sentences.
4 | Finished loading
5 | Found 6 shards at /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.a*
6 | Loading data from: /home/cgong/MWS-shuffle-trainfiles/trainfiles.shuffle.af
7 | Loaded 13350 sentences.
8 | Finished loading
9 | USING SKIP CONNECTIONS
10 | USING SKIP CONNECTIONS
11 | USING SKIP CONNECTIONS
12 | [['global_step:0', TensorShape([])],
13 | ['lm/CNN/W_cnn_0:0',
14 | TensorShape([Dimension(1), Dimension(1), Dimension(16), Dimension(32)])],
15 | ['lm/CNN/W_cnn_1:0',
16 | TensorShape([Dimension(1), Dimension(2), Dimension(16), Dimension(32)])],
17 | ['lm/CNN/W_cnn_2:0',
18 | TensorShape([Dimension(1), Dimension(3), Dimension(16), Dimension(64)])],
19 | ['lm/CNN/W_cnn_3:0',
20 | TensorShape([Dimension(1), Dimension(4), Dimension(16), Dimension(128)])],
21 | ['lm/CNN/W_cnn_4:0',
22 | TensorShape([Dimension(1), Dimension(5), Dimension(16), Dimension(256)])],
23 | ['lm/CNN/W_cnn_5:0',
24 | TensorShape([Dimension(1), Dimension(6), Dimension(16), Dimension(512)])],
25 | ['lm/CNN/W_cnn_6:0',
26 | TensorShape([Dimension(1), Dimension(7), Dimension(16), Dimension(1024)])],
27 | ['lm/CNN/b_cnn_0:0', TensorShape([Dimension(32)])],
28 | ['lm/CNN/b_cnn_1:0', TensorShape([Dimension(32)])],
29 | ['lm/CNN/b_cnn_2:0', TensorShape([Dimension(64)])],
30 | ['lm/CNN/b_cnn_3:0', TensorShape([Dimension(128)])],
31 | ['lm/CNN/b_cnn_4:0', TensorShape([Dimension(256)])],
32 | ['lm/CNN/b_cnn_5:0', TensorShape([Dimension(512)])],
33 | ['lm/CNN/b_cnn_6:0', TensorShape([Dimension(1024)])],
34 | ['lm/CNN_high_0/W_carry:0', TensorShape([Dimension(2048), Dimension(2048)])],
35 | ['lm/CNN_high_0/W_transform:0',
36 | TensorShape([Dimension(2048), Dimension(2048)])],
37 | ['lm/CNN_high_0/b_carry:0', TensorShape([Dimension(2048)])],
38 | ['lm/CNN_high_0/b_transform:0', TensorShape([Dimension(2048)])],
39 | ['lm/CNN_high_1/W_carry:0', TensorShape([Dimension(2048), Dimension(2048)])],
40 | ['lm/CNN_high_1/W_transform:0',
41 | TensorShape([Dimension(2048), Dimension(2048)])],
42 | ['lm/CNN_high_1/b_carry:0', TensorShape([Dimension(2048)])],
43 | ['lm/CNN_high_1/b_transform:0', TensorShape([Dimension(2048)])],
44 | ['lm/CNN_proj/W_proj:0', TensorShape([Dimension(2048), Dimension(512)])],
45 | ['lm/CNN_proj/b_proj:0', TensorShape([Dimension(512)])],
46 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_0/lstm_cell/bias:0',
47 | TensorShape([Dimension(16384)])],
48 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_0/lstm_cell/kernel:0',
49 | TensorShape([Dimension(1024), Dimension(16384)])],
50 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_0/lstm_cell/projection/kernel:0',
51 | TensorShape([Dimension(4096), Dimension(512)])],
52 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_1/lstm_cell/bias:0',
53 | TensorShape([Dimension(16384)])],
54 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_1/lstm_cell/kernel:0',
55 | TensorShape([Dimension(1024), Dimension(16384)])],
56 | ['lm/RNN_0/rnn/multi_rnn_cell/cell_1/lstm_cell/projection/kernel:0',
57 | TensorShape([Dimension(4096), Dimension(512)])],
58 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_0/lstm_cell/bias:0',
59 | TensorShape([Dimension(16384)])],
60 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_0/lstm_cell/kernel:0',
61 | TensorShape([Dimension(1024), Dimension(16384)])],
62 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_0/lstm_cell/projection/kernel:0',
63 | TensorShape([Dimension(4096), Dimension(512)])],
64 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_1/lstm_cell/bias:0',
65 | TensorShape([Dimension(16384)])],
66 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_1/lstm_cell/kernel:0',
67 | TensorShape([Dimension(1024), Dimension(16384)])],
68 | ['lm/RNN_1/rnn/multi_rnn_cell/cell_1/lstm_cell/projection/kernel:0',
69 | TensorShape([Dimension(4096), Dimension(512)])],
70 | ['lm/char_embed:0', TensorShape([Dimension(261), Dimension(16)])],
71 | ['lm/softmax/W:0', TensorShape([Dimension(5422), Dimension(512)])],
72 | ['lm/softmax/b:0', TensorShape([Dimension(5422)])],
73 | ['train_perplexity:0', TensorShape([])]]
74 | Training for 10 epochs and 7850 batches
75 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/usage_token.py:
--------------------------------------------------------------------------------
1 | '''
2 | ELMo usage example with pre-computed and cached context independent
3 | token representations
4 |
5 | Below, we show usage for SQuAD where each input example consists of both
6 | a question and a paragraph of context.
7 | '''
8 |
9 | import tensorflow as tf
10 | import os
11 | from bilm import TokenBatcher, BidirectionalLanguageModel, weight_layers, \
12 | dump_token_embeddings
13 |
14 | # Our small dataset.
15 | raw_context = [
16 | 'Pretrained biLMs compute representations useful for NLP tasks .',
17 | 'They give state of the art performance for many tasks .'
18 | ]
19 | tokenized_context = [sentence.split() for sentence in raw_context]
20 | tokenized_question = [
21 | ['What', 'are', 'biLMs', 'useful', 'for', '?'],
22 | ]
23 |
24 | # Create the vocabulary file with all unique tokens and
25 | # the special <S>, </S> tokens (case sensitive).
26 | all_tokens = set(['<S>', '</S>'] + tokenized_question[0])
27 | for context_sentence in tokenized_context:
28 | for token in context_sentence:
29 | all_tokens.add(token)
30 | vocab_file = 'vocab_small.txt'
31 | with open(vocab_file, 'w') as fout:
32 | fout.write('\n'.join(all_tokens))
33 |
34 | # Location of pretrained LM. Here we use the test fixtures.
35 | datadir = os.path.join('tests', 'fixtures', 'model')
36 | options_file = os.path.join(datadir, 'options.json')
37 | weight_file = os.path.join(datadir, 'lm_weights.hdf5')
38 |
39 | # Dump the token embeddings to a file. Run this once for your dataset.
40 | token_embedding_file = 'elmo_token_embeddings.hdf5'
41 | dump_token_embeddings(
42 | vocab_file, options_file, weight_file, token_embedding_file
43 | )
44 | tf.reset_default_graph()
45 |
46 |
47 |
48 | ## Now we can do inference.
49 | # Create a TokenBatcher to map text to token ids.
50 | batcher = TokenBatcher(vocab_file)
51 |
52 | # Input placeholders to the biLM.
53 | context_token_ids = tf.placeholder('int32', shape=(None, None))
54 | question_token_ids = tf.placeholder('int32', shape=(None, None))
55 |
56 | # Build the biLM graph.
57 | bilm = BidirectionalLanguageModel(
58 | options_file,
59 | weight_file,
60 | use_character_inputs=False,
61 | embedding_weight_file=token_embedding_file
62 | )
63 |
64 | # Get ops to compute the LM embeddings.
65 | context_embeddings_op = bilm(context_token_ids)
66 | question_embeddings_op = bilm(question_token_ids)
67 |
68 | # Get an op to compute ELMo (weighted average of the internal biLM layers)
69 | # Our SQuAD model includes ELMo at both the input and output layers
70 | # of the task GRU, so we need 4x ELMo representations for the question
71 | # and context at each of the input and output.
72 | # We use the same ELMo weights for both the question and context
73 | # at each of the input and output.
74 | elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
75 | with tf.variable_scope('', reuse=True):
76 | # the reuse=True scope reuses weights from the context for the question
77 | elmo_question_input = weight_layers(
78 | 'input', question_embeddings_op, l2_coef=0.0
79 | )
80 |
81 | elmo_context_output = weight_layers(
82 | 'output', context_embeddings_op, l2_coef=0.0
83 | )
84 | with tf.variable_scope('', reuse=True):
85 | # the reuse=True scope reuses weights from the context for the question
86 | elmo_question_output = weight_layers(
87 | 'output', question_embeddings_op, l2_coef=0.0
88 | )
89 |
90 |
91 | with tf.Session() as sess:
92 | # It is necessary to initialize variables once before running inference.
93 | sess.run(tf.global_variables_initializer())
94 |
95 | # Create batches of data.
96 | context_ids = batcher.batch_sentences(tokenized_context)
97 | question_ids = batcher.batch_sentences(tokenized_question)
98 |
99 | # Compute ELMo representations (here for the input only, for simplicity).
100 | elmo_context_input_, elmo_question_input_ = sess.run(
101 | [elmo_context_input['weighted_op'], elmo_question_input['weighted_op']],
102 | feed_dict={context_token_ids: context_ids,
103 | question_token_ids: question_ids}
104 | )
105 |
106 |
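The token dump created above is an HDF5 file with a single dataset holding one row per vocabulary entry; in the upstream bilm-tf implementation that dataset is named 'embedding', which the sketch below assumes:

# Inspect the cached context-independent token embeddings; the 'embedding'
# dataset name follows the upstream bilm-tf convention and may differ elsewhere.
import h5py

with h5py.File('elmo_token_embeddings.hdf5', 'r') as fin:
    print(list(fin.keys()))
    emb = fin['embedding'][...]
    print(emb.shape)   # (vocabulary size, lm_dim)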
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/Dataloader.py:
--------------------------------------------------------------------------------
1 | from data.Vocab import *
2 | import numpy as np
3 | import torch
4 | from torch.autograd import Variable
5 | import random
6 |
7 | def read_corpus(file_path, vocab=None):
8 | data = []
9 | with open(file_path, 'r') as infile:
10 | for sentence in readDepTree(infile, vocab):
11 | data.append(sentence)
12 | return data
13 |
14 | def sentences_numberize(sentences, vocab):
15 | for sentence in sentences:
16 | yield sentence2id(sentence, vocab)
17 |
18 | def sentence2id(sentence, vocab):
19 | result = []
20 | for dep in sentence:
21 | wordid = vocab.word2id(dep.form)
22 | extwordid = vocab.extword2id(dep.form)
23 | tagid = vocab.tag2id(dep.tag)
24 | head = dep.head
25 | if head == -1:
26 | relid = -1
27 | elif head >= 0:
28 | relid = vocab.rel2id(dep.rel)
29 | if dep.treebank_id in vocab._id2tbank:
30 | tbankid = vocab.tbank2id(dep.treebank_id)
31 | else:
32 | tbankid=vocab.tbank2id(dep.proxy_tbank)
33 | result.append([wordid, extwordid, tagid, head, relid,tbankid])
34 |
35 | return result
36 |
37 |
38 |
39 | def batch_slice(data, batch_size):
40 | batch_num = int(np.ceil(len(data) / float(batch_size)))
41 | for i in range(batch_num):
42 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
43 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)]
44 |
45 | yield sentences
46 |
47 |
48 | def data_iter(data, batch_size, shuffle=True):
49 | """
50 | Randomly permute the data (when shuffle is True), partition it into
51 | batches of batch_size sentences, and shuffle the batch order.
52 | """
53 |
54 | batched_data = []
55 | if shuffle: np.random.shuffle(data)
56 | batched_data.extend(list(batch_slice(data, batch_size)))
57 |
58 | if shuffle: np.random.shuffle(batched_data)
59 | for batch in batched_data:
60 | yield batch
61 |
62 |
63 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration):
64 | batch_data1=random.sample(data1,domain_num_corpus_weighting)
65 | data2_size=domain_num_corpus_weighting*cdt_domain_ration
66 | batch_data2 = random.sample(data2, data2_size)
67 | batch_data=[]
68 | batch_data.extend(batch_data1)
69 | batch_data.extend(batch_data2)
70 | return batch_data
71 |
72 | def batch_data_variable(batch, vocab):
73 | length = len(batch[0])
74 | batch_size = len(batch)
75 | for b in range(1, batch_size):
76 | if len(batch[b]) > length: length = len(batch[b])
77 |
78 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
79 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
80 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
81 | tbank_ids = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
82 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False)
83 | heads = []
84 | rels = []
85 | lengths = []
86 |
87 | b = 0
88 | for sentence in sentences_numberize(batch, vocab):
89 | index = 0
90 | length = len(sentence)
91 | lengths.append(length)
92 | head = np.zeros((length), dtype=np.int32)
93 | rel = np.zeros((length), dtype=np.int32)
94 | for dep in sentence:
95 | words[b, index] = dep[0]
96 | extwords[b, index] = dep[1]
97 | tags[b, index] = dep[2]
98 | tbank_ids[b,index] = dep[5]
99 | head[index] = dep[3]
100 | rel[index] = dep[4]
101 |
102 | masks[b, index] = 1
103 | index += 1
104 | b += 1
105 | heads.append(head)
106 | rels.append(rel)
107 |
108 | return words, extwords, tags, heads, rels, lengths, masks,tbank_ids
109 |
110 | def batch_variable_depTree(trees, heads, rels, lengths, vocab):
111 | for tree, head, rel, length in zip(trees, heads, rels, lengths):
112 | sentence = []
113 | for idx in range(length):
114 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx]),tree[idx].treebank_id))
115 | yield sentence
116 |
117 |
118 |
119 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/bilm/elmo.py:
--------------------------------------------------------------------------------
1 |
2 | import tensorflow as tf
3 |
4 | def weight_layers(name, bilm_ops, l2_coef=None,
5 | use_top_only=False, do_layer_norm=False):
6 | '''
7 | Weight the layers of a biLM with trainable scalar weights to
8 | compute ELMo representations.
9 |
10 | For each output layer, this returns two ops. The first computes
11 | a layer specific weighted average of the biLM layers, and
12 | the second the l2 regularizer loss term.
13 | The regularization terms are also added to tf.GraphKeys.REGULARIZATION_LOSSES.
14 |
15 | Input:
16 | name = a string prefix used for the trainable variable names
17 | bilm_ops = the tensorflow ops returned to compute internal
18 | representations from a biLM. This is the return value
19 | from BidirectionalLanguageModel(...)(ids_placeholder)
20 | l2_coef: the l2 regularization coefficient $\lambda$.
21 | Pass None or 0.0 for no regularization.
22 | use_top_only: if True, then only use the top layer.
23 | do_layer_norm: if True, then apply layer normalization to each biLM
24 | layer before computing the weighted average.
25 |
26 | Output:
27 | {
28 | 'weighted_op': op to compute weighted average for output,
29 | 'regularization_op': op to compute regularization term
30 | }
31 | '''
32 | def _l2_regularizer(weights):
33 | if l2_coef is not None:
34 | return l2_coef * tf.reduce_sum(tf.square(weights))
35 | else:
36 | return 0.0
37 |
38 | # Get ops for computing LM embeddings and mask
39 | lm_embeddings = bilm_ops['lm_embeddings']
40 | mask = bilm_ops['mask']
41 |
42 | n_lm_layers = int(lm_embeddings.get_shape()[1])
43 | lm_dim = int(lm_embeddings.get_shape()[3])
44 |
45 | with tf.control_dependencies([lm_embeddings, mask]):
46 | # Cast the mask and broadcast for layer use.
47 | mask_float = tf.cast(mask, 'float32')
48 | broadcast_mask = tf.expand_dims(mask_float, axis=-1)
49 |
50 | def _do_ln(x):
51 | # do layer normalization excluding the mask
52 | x_masked = x * broadcast_mask
53 | N = tf.reduce_sum(mask_float) * lm_dim
54 | mean = tf.reduce_sum(x_masked) / N
55 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2
56 | ) / N
57 | return tf.nn.batch_normalization(
58 | x, mean, variance, None, None, 1E-12
59 | )
60 |
61 | if use_top_only:
62 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
63 | # just the top layer
64 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1)
65 | # no regularization
66 | reg = 0.0
67 | else:
68 | W = tf.get_variable(
69 | '{}_ELMo_W'.format(name),
70 | shape=(n_lm_layers, ),
71 | initializer=tf.zeros_initializer,
72 | regularizer=_l2_regularizer,
73 | trainable=True,
74 | )
75 |
76 | # normalize the weights
77 | normed_weights = tf.split(
78 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers
79 | )
80 | # split LM layers
81 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
82 |
83 | # compute the weighted, normalized LM activations
84 | pieces = []
85 | for w, t in zip(normed_weights, layers):
86 | if do_layer_norm:
87 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1)))
88 | else:
89 | pieces.append(w * tf.squeeze(t, squeeze_dims=1))
90 | sum_pieces = tf.add_n(pieces)
91 |
92 | # get the regularizer
93 | reg = [
94 | r for r in tf.get_collection(
95 | tf.GraphKeys.REGULARIZATION_LOSSES)
96 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0
97 | ]
98 | if len(reg) != 1:
99 | raise ValueError
100 |
101 | # scale the weighted sum by gamma
102 | gamma = tf.get_variable(
103 | '{}_ELMo_gamma'.format(name),
104 | shape=(1, ),
105 | initializer=tf.ones_initializer,
106 | regularizer=None,
107 | trainable=True,
108 | )
109 | weighted_lm_layers = sum_pieces * gamma
110 |
111 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg}
112 |
113 | return ret
114 |
115 |
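Downstream models typically add the returned regularization term to their task loss while consuming the weighted representations; a brief illustrative sketch (bilm_ops and task_loss are assumed to exist in the surrounding graph, and regularization_op is a one-element list unless use_top_only is set, per the code above):

# Illustrative only: fold the ELMo scalar-mixture L2 penalty into a task loss.
import tensorflow as tf

elmo = weight_layers('input', bilm_ops, l2_coef=0.001)
elmo_repr = elmo['weighted_op']    # (batch, time, lm_dim) weighted average
reg = elmo['regularization_op']    # list of one tensor, or 0.0 if use_top_only
reg_loss = tf.add_n(reg) if isinstance(reg, list) else reg
total_loss = task_loss + reg_loss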
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/build/lib/bilm/elmo.py:
--------------------------------------------------------------------------------
1 |
2 | import tensorflow as tf
3 |
4 | def weight_layers(name, bilm_ops, l2_coef=None,
5 | use_top_only=False, do_layer_norm=False):
6 | '''
7 | Weight the layers of a biLM with trainable scalar weights to
8 | compute ELMo representations.
9 |
10 | For each output layer, this returns two ops. The first computes
11 | a layer specific weighted average of the biLM layers, and
12 | the second the l2 regularizer loss term.
13 | The regularization terms are also added to tf.GraphKeys.REGULARIZATION_LOSSES.
14 |
15 | Input:
16 | name = a string prefix used for the trainable variable names
17 | bilm_ops = the tensorflow ops returned to compute internal
18 | representations from a biLM. This is the return value
19 | from BidirectionalLanguageModel(...)(ids_placeholder)
20 | l2_coef: the l2 regularization coefficient $\lambda$.
21 | Pass None or 0.0 for no regularization.
22 | use_top_only: if True, then only use the top layer.
23 | do_layer_norm: if True, then apply layer normalization to each biLM
24 | layer before computing the weighted average.
25 |
26 | Output:
27 | {
28 | 'weighted_op': op to compute weighted average for output,
29 | 'regularization_op': op to compute regularization term
30 | }
31 | '''
32 | def _l2_regularizer(weights):
33 | if l2_coef is not None:
34 | return l2_coef * tf.reduce_sum(tf.square(weights))
35 | else:
36 | return 0.0
37 |
38 | # Get ops for computing LM embeddings and mask
39 | lm_embeddings = bilm_ops['lm_embeddings']
40 | mask = bilm_ops['mask']
41 |
42 | n_lm_layers = int(lm_embeddings.get_shape()[1])
43 | lm_dim = int(lm_embeddings.get_shape()[3])
44 |
45 | with tf.control_dependencies([lm_embeddings, mask]):
46 | # Cast the mask and broadcast for layer use.
47 | mask_float = tf.cast(mask, 'float32')
48 | broadcast_mask = tf.expand_dims(mask_float, axis=-1)
49 |
50 | def _do_ln(x):
51 | # do layer normalization excluding the mask
52 | x_masked = x * broadcast_mask
53 | N = tf.reduce_sum(mask_float) * lm_dim
54 | mean = tf.reduce_sum(x_masked) / N
55 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2
56 | ) / N
57 | return tf.nn.batch_normalization(
58 | x, mean, variance, None, None, 1E-12
59 | )
60 |
61 | if use_top_only:
62 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
63 | # just the top layer
64 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1)
65 | # no regularization
66 | reg = 0.0
67 | else:
68 | W = tf.get_variable(
69 | '{}_ELMo_W'.format(name),
70 | shape=(n_lm_layers, ),
71 | initializer=tf.zeros_initializer,
72 | regularizer=_l2_regularizer,
73 | trainable=True,
74 | )
75 |
76 | # normalize the weights
77 | normed_weights = tf.split(
78 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers
79 | )
80 | # split LM layers
81 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
82 |
83 | # compute the weighted, normalized LM activations
84 | pieces = []
85 | for w, t in zip(normed_weights, layers):
86 | if do_layer_norm:
87 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1)))
88 | else:
89 | pieces.append(w * tf.squeeze(t, squeeze_dims=1))
90 | sum_pieces = tf.add_n(pieces)
91 |
92 | # get the regularizer
93 | reg = [
94 | r for r in tf.get_collection(
95 | tf.GraphKeys.REGULARIZATION_LOSSES)
96 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0
97 | ]
98 | if len(reg) != 1:
99 | raise ValueError
100 |
101 | # scale the weighted sum by gamma
102 | gamma = tf.get_variable(
103 | '{}_ELMo_gamma'.format(name),
104 | shape=(1, ),
105 | initializer=tf.ones_initializer,
106 | regularizer=None,
107 | trainable=True,
108 | )
109 | weighted_lm_layers = sum_pieces * gamma
110 |
111 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg}
112 |
113 | return ret
114 |
115 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/data/Dataloader.py:
--------------------------------------------------------------------------------
1 | from data.Vocab import *
2 | import numpy as np
3 | import torch
4 | from torch.autograd import Variable
5 | import random
6 |
7 | def read_corpus(file_path, vocab=None):
8 | data = []
9 | with open(file_path, 'r') as infile:
10 | for sentence in readDepTree(infile, vocab):
11 | data.append(sentence)
12 | return data
13 |
14 | def sentences_numberize(sentences, vocab):
15 | for sentence in sentences:
16 | yield sentence2id(sentence, vocab)
17 |
18 | def sentence2id(sentence, vocab):
19 | result = []
20 | for dep in sentence:
21 | wordid = vocab.word2id(dep.form)
22 | extwordid = vocab.extword2id(dep.form)
23 | tagid = vocab.tag2id(dep.tag)
24 | head = dep.head
25 | if head == -1:
26 | relid = -1
27 | elif head >= 0:
28 | relid = vocab.rel2id(dep.rel)
29 | if dep.treebank_id in vocab._id2tbank:
30 | tbankid = vocab.tbank2id(dep.treebank_id)
31 | else:
32 | tbankid=vocab.tbank2id(dep.proxy_tbank)
33 | word = dep.form
34 | charid = dep.charid
35 | id = dep.id
36 | result.append([wordid, extwordid, tagid, head, relid,word,charid, id,tbankid])
37 |
38 | return result
39 |
40 | def batch_slice(data, batch_size):
41 | batch_num = int(np.ceil(len(data) / float(batch_size)))
42 | for i in range(batch_num):
43 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
44 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)]
45 |
46 | yield sentences
47 |
48 |
49 | def data_iter(data, batch_size, shuffle=True):
50 | """
51 |     randomly permute the data, partition it into batches of up to batch_size
52 |     sentences, then shuffle the batch order before yielding
53 | """
54 |
55 | batched_data = []
56 | if shuffle: np.random.shuffle(data)
57 | batched_data.extend(list(batch_slice(data, batch_size)))
58 |
59 | if shuffle: np.random.shuffle(batched_data)
60 | for batch in batched_data:
61 | yield batch
62 |
63 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration):
64 |     batch_data1 = random.sample(data1, domain_num_corpus_weighting)
65 |     data2_size = int(domain_num_corpus_weighting * cdt_domain_ration)  # int() so random.sample gets an integer sample size
66 | batch_data2 = random.sample(data2, data2_size)
67 | batch_data=[]
68 | batch_data.extend(batch_data1)
69 | batch_data.extend(batch_data2)
70 | return batch_data
71 |
72 | def batch_data_variable(batch, vocab):
73 | length = len(batch[0])
74 | batch_size = len(batch)
75 | for b in range(1, batch_size):
76 | if len(batch[b]) > length: length = len(batch[b])
77 |
78 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
79 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
80 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
81 | tbank_ids = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
82 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False)
83 | heads = []
84 | rels = []
85 | lengths = []
86 | elmosens = []
87 |
88 | b = 0
89 | for sentence in sentences_numberize(batch, vocab):
90 | index = 0
91 | elmosen = []
92 | length = len(sentence)
93 | lengths.append(length)
94 | elmosen.append(length)
95 | head = np.zeros((length), dtype=np.int32)
96 | rel = np.zeros((length), dtype=np.int32)
97 | for dep in sentence:
98 | words[b, index] = dep[0]
99 | extwords[b, index] = dep[1]
100 | tags[b, index] = dep[2]
101 | tbank_ids[b,index] = dep[8]
102 | head[index] = dep[3]
103 | rel[index] = dep[4]
104 | masks[b, index] = 1
105 | index += 1
106 | if dep[7] == 1:
107 | startcharid = dep[6]
108 | elmosen.append(startcharid)
109 | b += 1
110 | heads.append(head)
111 | rels.append(rel)
112 | elmosens.append(elmosen)
113 |
114 | return words, extwords, tags, heads, rels, lengths, masks,elmosens,tbank_ids
115 |
116 | def batch_variable_depTree(trees, heads, rels, lengths, vocab):
117 | for tree, head, rel, length in zip(trees, heads, rels, lengths):
118 | sentence = []
119 | for idx in range(length):
120 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx]),tree[idx].charid,tree[idx].treebank_id))
121 | yield sentence
122 |
123 |
124 |
125 |
126 |
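A minimal sketch of how the helpers above are usually chained in a training loop; vocab, parser, elmofile and the corpus path are placeholders rather than names defined in this file:

train_data = read_corpus('train.conll', vocab)
for batch in data_iter(train_data, batch_size=32, shuffle=True):
    words, extwords, tags, heads, rels, lengths, masks, elmosens, tbank_ids = \
        batch_data_variable(batch, vocab)
    # parser.forward(words, extwords, tags, masks, elmosens, elmofile, tbank_ids)
    # loss = parser.compute_loss(heads, rels, lengths)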
--------------------------------------------------------------------------------
/multi_task_learning/data/Dataloader.py:
--------------------------------------------------------------------------------
1 | from data.Vocab import *
2 | import numpy as np
3 | import torch
4 | from torch.autograd import Variable
5 | import random
6 |
7 | def read_corpus(file_path, vocab=None):
8 | data = []
9 | tgt_type = parse_my_conll(file_path)
10 | with open(file_path, 'r') as infile:
11 | for sentence in readDepTree(infile,tgt_type, vocab):
12 | data.append(sentence)
13 | return data
14 |
15 | def sentences_numberize(sentences, vocab):
16 | for sentence in sentences:
17 | yield sentence2id(sentence, vocab)
18 |
19 | def sentence2id(sentence, vocab):
20 | result = []
21 | for dep in sentence.sentence:
22 | wordid = vocab.word2id(dep.form)
23 | extwordid = vocab.extword2id(dep.form)
24 | tagid = vocab.tag2id(dep.tag)
25 | head = dep.head
26 | if head == -1:
27 | relid = -1
28 | elif head >= 0:
29 | relid = vocab.rel2id(dep.rel)
30 | result.append([wordid, extwordid, tagid, head, relid])
31 |
32 | return result
33 |
34 |
35 |
36 | def batch_slice(data, batch_size):
37 | batch_num = int(np.ceil(len(data) / float(batch_size)))
38 | for i in range(batch_num):
39 | cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
40 | sentences = [data[i * batch_size + b] for b in range(cur_batch_size)]
41 |
42 | yield sentences
43 |
44 |
45 | def data_iter(data, batch_size, shuffle=True):
46 | """
47 |     randomly permute the data, partition it into batches of up to batch_size
48 |     sentences, then shuffle the batch order before yielding
49 | """
50 |
51 | batched_data = []
52 | if shuffle: np.random.shuffle(data)
53 | batched_data.extend(list(batch_slice(data, batch_size)))
54 |
55 | if shuffle: np.random.shuffle(batched_data)
56 | for batch in batched_data:
57 | yield batch
58 |
59 | def data_corpus_weighting(data1, data2, domain_num_corpus_weighting,cdt_domain_ration):
60 |     batch_data1 = random.sample(data1, domain_num_corpus_weighting)
61 |     data2_size = int(domain_num_corpus_weighting * cdt_domain_ration)  # int() so random.sample gets an integer sample size
62 | batch_data2 = random.sample(data2, data2_size)
63 | return batch_data1,batch_data2
64 |
65 | def data_iter_from_two_data(data1, data2, batch_size, shuffle=True):
66 | type1 = data1[0].tgt_type
67 | type2 = data2[0].tgt_type
68 | assert(type1 != type2)
69 | data = data1 + data2
70 | batched_data = []
71 | if shuffle: np.random.shuffle(data)
72 | batched_data.extend(list(batch_slice(data, batch_size)))
73 |
74 | if shuffle: np.random.shuffle(batched_data)
75 | for batch in batched_data:
76 | batch1 = []
77 | batch2 = []
78 | for instance in batch:
79 | if instance.tgt_type == type1:
80 | batch1.append(instance)
81 | elif instance.tgt_type == type2:
82 | batch2.append(instance)
83 | else:
84 |                 print("error in corpus: instance has an unexpected tgt_type")
85 | exit(0)
86 | yield batch1, batch2
87 | def batch_data_variable(batch, vocab):
88 | length = len(batch[0])
89 | batch_size = len(batch)
90 | for b in range(1, batch_size):
91 | if len(batch[b]) > length: length = len(batch[b])
92 |
93 | words = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
94 | extwords = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
95 | tags = Variable(torch.LongTensor(batch_size, length).zero_(), requires_grad=False)
96 | masks = Variable(torch.Tensor(batch_size, length).zero_(), requires_grad=False)
97 | heads = []
98 | rels = []
99 | lengths = []
100 |
101 | b = 0
102 | for sentence in sentences_numberize(batch, vocab):
103 | index = 0
104 | length = len(sentence)
105 | lengths.append(length)
106 | head = np.zeros((length), dtype=np.int32)
107 | rel = np.zeros((length), dtype=np.int32)
108 | for dep in sentence:
109 | words[b, index] = dep[0]
110 | extwords[b, index] = dep[1]
111 | tags[b, index] = dep[2]
112 | head[index] = dep[3]
113 | rel[index] = dep[4]
114 | masks[b, index] = 1
115 | index += 1
116 | b += 1
117 | heads.append(head)
118 | rels.append(rel)
119 |
120 | return words, extwords, tags, heads, rels, lengths, masks
121 |
122 | def batch_variable_depTree(trees, heads, rels, lengths, vocab):
123 | for Tree, head, rel, length in zip(trees, heads, rels, lengths):
124 | tree = Tree.sentence
125 | sentence = []
126 | for idx in range(length):
127 | sentence.append(Dependency(idx, tree[idx].org_form, tree[idx].tag, head[idx], vocab.id2rel(rel[idx])))
128 | yield sentence
129 |
130 |
131 |
132 |
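data_iter_from_two_data shuffles the two corpora together and then splits every batch back by tgt_type, so each step can feed the two tasks separately. An illustrative loop (the dataset and vocab names are placeholders):

for batch1, batch2 in data_iter_from_two_data(src_data, tgt_data, batch_size=32):
    if batch1:  # a mixed batch may contain instances of only one task
        tensors1 = batch_data_variable(batch1, vocab)
    if batch2:
        tensors2 = batch_data_variable(batch2, vocab)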
--------------------------------------------------------------------------------
/biaffine-parser-concat/data/Dependency.py:
--------------------------------------------------------------------------------
1 |
2 | class Dependency: # one word (token) per instance
3 |     def __init__(self, id, form, tag, head, rel):
4 |         self.id = id # index of the word within its sentence
5 |         self.org_form = form # original word form
6 |         self.form = form.lower()
7 |         self.tag = tag # POS tag of the word
8 |         self.head = head # index of the word's head
9 |         self.rel = rel # dependency relation label of the word
10 |
11 | def __str__(self):
12 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_"]
13 | return '\t'.join(values)
14 |
15 | @property
16 | def pseudo(self):
17 | return self.id == 0 or self.form == ''
18 |
19 |
20 | class DepTree: # sentence-level wrapper; checks whether the dependency tree is projective
21 | def __init__(self, sentence):
22 | self.words = list(sentence)
23 | self.start = 1
24 | if sentence[0].id == 1: self.start = 0
25 | elif sentence[0].id == 0: self.start = 1
26 | else: self.start = len(self.words)
27 |
28 | def isProj(self):
29 | n = len(self.words)
30 | words = self.words
31 | if self.start > 1: return False
32 | if self.start == 0: words = [None] + words
33 | for i in range(1, n):
34 | hi = words[i].head
35 | for j in range(i+1, hi):
36 | hj = words[j].head
37 | if (hj - hi) * (hj - i) > 0:
38 | return False
39 | return True
40 |
41 |
42 | def evalDepTree(gold, predict):
43 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU']
44 | PUNCT_TAGS = []
45 | ignore_tags = set(PUNCT_TAGS)
46 | start_g = 0
47 | if gold[0].id == 0: start_g = 1
48 | start_p = 0
49 | if predict[0].id == 0: start_p = 1
50 |
51 | glength = len(gold) - start_g
52 | plength = len(predict) - start_p
53 |
54 | if glength != plength:
55 | raise Exception('gold length does not match predict length.')
56 |
57 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0
58 | for idx in range(glength):
59 | if gold[start_g + idx].pseudo: continue
60 | if gold[start_g + idx].tag in ignore_tags: continue
61 | if gold[start_g + idx].head == -1: continue
62 | arc_total += 1
63 | label_total += 1
64 | if gold[start_g + idx].head == predict[start_p + idx].head:
65 | arc_correct += 1
66 | if gold[start_g + idx].rel == predict[start_p + idx].rel:
67 | label_correct += 1
68 |
69 | return arc_total, arc_correct, label_total, label_correct
70 |
71 |
72 | def readDepTree(file, vocab=None):
73 | proj = 0
74 | total = 0
75 | min_count = 1
76 | if vocab is None: min_count = 0
77 | if vocab is None: sentence = []
78 | else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)]
79 | for line in file:
80 | tok = line.strip().split('\t')
81 | if not tok or line.strip() == '' or line.strip().startswith('#'):
82 |             if len(sentence) > min_count: # the sentence has at least one word
83 | if DepTree(sentence).isProj():
84 | proj += 1
85 | total += 1
86 | yield sentence
87 | if vocab is None:
88 | sentence = []
89 | else:
90 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)]
91 | elif len(tok) == 10:
92 | if tok[6] == '_': tok[6] = '-1'
93 | try:
94 | sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7]))
95 | except Exception:
96 | pass
97 | else:
98 | pass
99 |
100 | if len(sentence) > min_count:
101 | if DepTree(sentence).isProj():
102 | proj += 1
103 | total += 1
104 | yield sentence
105 |
106 | print("Total num: ", total)
107 | print("Proj num: ", proj)
108 |
109 |
110 | def writeDepTree(filename, sentences):
111 | with open(filename, 'w') as file:
112 | for sentence in sentences:
113 | for entry in sentence:
114 | if not entry.pseudo: file.write(str(entry) + '\n')
115 | file.write('\n')
116 |
117 | def printDepTree(output, sentence, gold=None):
118 |     if gold is None:
119 | for entry in sentence:
120 | if not entry.pseudo: output.write(str(entry) + '\n')
121 | output.write('\n')
122 | else:
123 | start_g = 0
124 | if gold[0].id == 0: start_g = 1
125 | start_p = 0
126 | if sentence[0].id == 0: start_p = 1
127 | glength = len(gold) - start_g
128 | plength = len(sentence) - start_p
129 |
130 | if glength != plength:
131 | raise Exception('gold length does not match predict length.')
132 |
133 | for idx in range(glength):
134 | if gold[start_g + idx].pseudo: continue
135 | values = [str(gold[start_g + idx].id), gold[start_g + idx].org_form, "_", gold[start_g + idx].tag, \
136 | "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"]
137 | output.write('\t'.join(values) + '\n')
138 |
139 | output.write('\n')
140 |
141 |
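readDepTree expects tab-separated, 10-column CoNLL-style lines and only uses columns 1 (id), 2 (form), 4 (POS tag), 7 (head) and 8 (relation); a head of "_" is read as -1, and head 0 attaches the word to the pseudo root. A made-up three-token example (fields must be separated by tabs):

1   The      _   DT   _   _   2   det     _   _
2   cat      _   NN   _   _   3   nsubj   _   _
3   sleeps   _   VB   _   _   0   root    _   _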
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/test_elmo.py:
--------------------------------------------------------------------------------
1 |
2 | import unittest
3 | import os
4 | import json
5 |
6 | import numpy as np
7 | import tensorflow as tf
8 |
9 | from bilm.model import BidirectionalLanguageModel
10 | from bilm.data import Batcher
11 | from bilm.elmo import weight_layers
12 |
13 | FIXTURES = 'tests/fixtures/model/'
14 |
15 |
16 |
17 | class TestWeightedLayers(unittest.TestCase):
18 | def tearDown(self):
19 | tf.reset_default_graph()
20 | self.sess.close()
21 |
22 | def setUp(self):
23 | self.sess = tf.Session()
24 |
25 | def _check_weighted_layer(self, l2_coef, do_layer_norm, use_top_only):
26 | # create the Batcher
27 | vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
28 | batcher = Batcher(vocab_file, 50)
29 |
30 | # load the model
31 | options_file = os.path.join(FIXTURES, 'options.json')
32 | weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
33 | character_ids = tf.placeholder('int32', (None, None, 50))
34 | model = BidirectionalLanguageModel(
35 | options_file, weight_file, max_batch_size=4)
36 | bilm_ops = model(character_ids)
37 |
38 | weighted_ops = []
39 | for k in range(2):
40 | ops = weight_layers(str(k), bilm_ops, l2_coef=l2_coef,
41 | do_layer_norm=do_layer_norm,
42 | use_top_only=use_top_only)
43 | weighted_ops.append(ops)
44 |
45 | # initialize
46 | self.sess.run(tf.global_variables_initializer())
47 |
48 | n_expected_trainable_weights = 2 * (1 + int(not use_top_only))
49 | self.assertEqual(len(tf.trainable_variables()),
50 | n_expected_trainable_weights)
51 | # and one regularizer per weighted layer
52 | n_expected_reg_losses = 2 * int(not use_top_only)
53 | self.assertEqual(
54 | len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
55 | n_expected_reg_losses,
56 | )
57 |
58 | # Set the variables.
59 | weights = [[np.array([0.1, 0.3, 0.5]), np.array([1.1])],
60 | [np.array([0.2, 0.4, 0.6]), np.array([0.88])]]
61 | for k in range(2):
62 | with tf.variable_scope('', reuse=True):
63 | if not use_top_only:
64 | W = tf.get_variable('{}_ELMo_W'.format(k))
65 | _ = self.sess.run([W.assign(weights[k][0])])
66 | gamma = tf.get_variable('{}_ELMo_gamma'.format(k))
67 | _ = self.sess.run([gamma.assign(weights[k][1])])
68 |
69 | # make some data
70 | sentences = [
71 | ['The', 'first', 'sentence', '.'],
72 | ['The', 'second'],
73 | ['Third']
74 | ]
75 | X_chars = batcher.batch_sentences(sentences)
76 |
77 | ops = model(character_ids)
78 | lm_embeddings, mask, weighted0, weighted1 = self.sess.run(
79 | [ops['lm_embeddings'], ops['mask'],
80 | weighted_ops[0]['weighted_op'], weighted_ops[1]['weighted_op']],
81 | feed_dict={character_ids: X_chars}
82 | )
83 | actual_elmo = [weighted0, weighted1]
84 |
85 | # check the mask first
86 | expected_mask = [[True, True, True, True],
87 | [True, True, False, False],
88 | [True, False, False, False]]
89 | self.assertTrue((expected_mask == mask).all())
90 |
91 | # Now compute the actual weighted layers
92 | for k in range(2):
93 | normed_weights = np.exp(weights[k][0] + 1.0 / 3) / np.sum(
94 | np.exp(weights[k][0] + 1.0 / 3))
95 | # masked layer normalization
96 | expected_elmo = np.zeros((3, 4, lm_embeddings.shape[-1]))
97 | if not use_top_only:
98 | for j in range(3): # number of LM layers
99 | if do_layer_norm:
100 | mean = np.mean(lm_embeddings[:, j, :, :][mask])
101 | std = np.std(lm_embeddings[:, j, :, :][mask])
102 | normed_lm_embed = (lm_embeddings[:, j, :, :] - mean) / (
103 | std + 1E-12)
104 | expected_elmo += normed_weights[j] * normed_lm_embed
105 | else:
106 | expected_elmo += normed_weights[j] * lm_embeddings[
107 | :, j, :, :]
108 | else:
109 | expected_elmo += lm_embeddings[:, -1, :, :]
110 |
111 | # the scale parameter
112 | expected_elmo *= weights[k][1]
113 | self.assertTrue(
114 | np.allclose(expected_elmo, actual_elmo[k], atol=1e-6)
115 | )
116 |
117 | def test_weighted_layers(self):
118 | self._check_weighted_layer(1.0, do_layer_norm=True, use_top_only=False)
119 |
120 | def test_weighted_layers_no_norm(self):
121 | self._check_weighted_layer(1.0, do_layer_norm=False, use_top_only=False)
122 |
123 | def test_weighted_layers_top_only(self):
124 | self._check_weighted_layer(None, do_layer_norm=False, use_top_only=True)
125 |
126 |
127 | if __name__ == '__main__':
128 | unittest.main()
129 |
130 |
131 |
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/fixtures/model/vocab_test.txt:
--------------------------------------------------------------------------------
1 | Bournemouth
2 | concept
3 | care
4 | were
5 | it
6 | vague
7 | Arab
8 | ago
9 | comment
10 | roads
11 | Newsweek
12 | Pistons
13 | on
14 | rare
15 | not
16 | have
17 | majority
18 | Suzanne
19 | return
20 | report
21 | Charge
22 | World
23 | also
24 | thing
25 | Reuters
26 | you
27 | confirmed
28 | school
29 | press
30 | suspending
31 | later
32 | states
33 | 21
34 | swine
35 | them
36 | period
37 | route
38 | reversed
39 | Medicare
40 | likely
41 | at
42 | only
43 | awareness
44 | 23
45 | including
46 | 81st
47 | controversial
48 | romance
49 | Talk
50 | campaign
51 | long
52 | called
53 | Mo
54 | drug
55 | Melbourne
56 | major
57 | worst
58 | stops
59 | systems
60 | personnel
61 | U.S.
62 | even
63 | absolute
64 | aces
65 | to
66 | should
67 | persistent
68 | official
69 | Lithang
70 | Argentinian
71 | seeded
72 | Williams
73 | Winfrey
74 | Abdullah
75 | attacks
76 | Festival
77 | Control
78 | be
79 | search
80 | up
81 | over
82 | misuse
83 | barrel
84 | Cup
85 | unseeded
86 | other
87 | advised
88 | eastern
89 | ranked
90 | game
91 | strike
92 | legal
93 | specific
94 | guidance
95 | -
96 | stay
97 | does
98 | team
99 | small
100 | Dr.
101 | an
102 | !
103 | that
104 | such
105 | centers
106 | percent
107 | militant
108 | Murray
109 | initially
110 | off
111 | businesses
112 | sexual
113 | continue
114 | violence
115 | as
116 | fulfil
117 | touch
118 | time
119 | necessary
120 | Democrats
121 | What
122 | hormone
123 | •
124 | if
125 | Brent
126 | bus
127 | en
128 | Dalai
129 | of
130 | racial
131 | itself
132 | with
133 | lobby
134 | disciplined
135 | out
136 | risks
137 | gathered
138 | people
139 | little
140 | traffic
141 | Rongye
142 | virus
143 | Nalbandian
144 | cents
145 | 2010
146 | entitlement
147 | employment
148 | launched
149 | possible
150 | between
151 | groups
152 | issue
153 | no
154 | reviewed
155 | losses
156 | estimates
157 | .
158 | North
159 | everything
160 | November
161 | after
162 | previously
163 | her
164 | become
165 | (
166 | David
167 | world
168 | unflattering
169 | next
170 | 68.88
171 | Tibet
172 | alcohol
173 | story
174 | kept
175 | 2009
176 | 1965
177 | same
178 | contenders
179 | Wacky
180 | lure
181 | stronger
182 | in
183 | Katrina
184 | 6-7
185 | freedom
186 | recommends
187 | most
188 | group
189 | a
190 | London
191 | til
192 | standard
193 | gallery
194 | Sea
195 | through-out
196 | 's
197 | 6-3
198 | many
199 | apparent
200 | Piper
201 | peacemaking
202 | officers
203 | reviewing
204 | for
205 | some
206 | play
207 | how
208 |
209 | enforce
210 | spent
211 | exploitation
212 | number
213 | taken
214 | so
215 | 5
216 | But
217 | Even
218 | At
219 | ,
220 | Our
221 | Gordon
222 | Chicago
223 | head
224 | case
225 | 7
226 | Jordan
227 | delivery
228 | concerning
229 | palace
230 | King
231 | Cures
232 | she
233 | mildness
234 | maintains
235 | devastation
236 | Consequently
237 | Crazy
238 | tonight
239 | Almost
240 | transportation
241 | seems
242 | remain
243 | they
244 | bio-identical
245 | warrants
246 | East
247 | own
248 | cites
249 | perfected
250 | individuals
251 | 22
252 | teams
253 | coming
254 | food
255 | )
256 | efforts
257 | Fish
258 |
259 | configuration
260 | road
261 | adding
262 | outstanding
263 | secondaries
264 | children
265 | /
266 | Tier
267 | is
268 | will
269 | AMMAN
270 | taxis
271 | the
272 | crude
273 | meet
274 | Hurricane
275 | but
276 | syndicated
277 | underperforming
278 | occurred
279 | wherever
280 | Why
281 | runoff
282 | &
283 | was
284 | win
285 | GB
286 | Lashkar-e-Taiba
287 | tour
288 | its
289 | views
290 | top
291 | Somers
292 | saying
293 | early
294 | two-thirds
295 | exceptions
296 | candidate
297 | ;
298 | It
299 | Barack
300 | seaworthy
301 | Nikam
302 | Elk
303 | protesting
304 | raise
305 | by
306 | 98th
307 | wasn
308 | Vegas
309 | Prevention
310 | Bush
311 | had
312 | nomad
313 | The
314 | flowing
315 | --
316 | government
317 | Disease
318 | served
319 | schools
320 | won
321 | instances
322 | yet
323 | more
324 | remained
325 | That
326 | and
327 | join
328 | Lama
329 | masterminded
330 | fired
331 | calling
332 | invited
333 | six
334 | TfL
335 | Washington
336 | dollop
337 | To
338 | reason
339 | said
340 | actions
341 | "
342 | Las
343 | officials
344 | Don
345 | function
346 | two
347 | respective
348 | An
349 | may
350 | April
351 | players
352 | good
353 | 7-6
354 | Buchholz
355 | 6X6
356 | home
357 | Centers
358 | Congestion
359 | public
360 | ?
361 | pick
362 | Horse
363 | Borough
364 | seventh-seeded
365 | passengers
366 | there
367 | hunters
368 | meant
369 | inspected
370 | during
371 | rights
372 | prizes
373 | Council
374 | one
375 | or
376 | role
377 | would
378 | model
379 | mid-October
380 | carrying
381 | Last
382 | :
383 | behalf
384 | less
385 | cases
386 | works
387 | When
388 | flu
389 | help
390 | Muslim
391 | dollars
392 | 't
393 | treatment
394 | James
395 | his
396 | titled
397 | about
398 | title
399 | drop
400 | 4
401 | Authorities
402 | operation
403 | assassinations
404 | Hobart
405 | There
406 | Sunday
407 | Middle
408 | CBO
409 | slammed
410 | event
411 | Detroit
412 | Duncan
413 | intelligence
414 | reduce
415 | their
416 | Racing
417 | outbreaks
418 | rose
419 | Ms.
420 | wins
421 | keep
422 | You
423 | beat
424 | been
425 | stubborn
426 | cover
427 | now
428 | share
429 | open
430 | show
431 | Oprah
432 | close
433 | involved
434 | year
435 | day
436 | Adak
437 | big
438 | Park
439 | log
440 | skill
441 | being
442 | promise
443 | who
444 | sport
445 | then
446 | If
447 | Obama
448 | Bulls
449 |
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/data/Dependency.py:
--------------------------------------------------------------------------------
1 |
2 | class Dependency: # one word (token) per instance
3 |     def __init__(self, id, form, tag, head, rel, treebank_id,proxy_tbank="BC"):
4 |         self.id = id # index of the word within its sentence
5 |         self.org_form = form # original word form
6 |         self.form = form.lower()
7 |         self.tag = tag # POS tag of the word
8 |         self.head = head # index of the word's head
9 |         self.rel = rel # dependency relation label of the word
10 |         self.treebank_id=treebank_id
11 |         self.proxy_tbank=proxy_tbank # proxy treebank id, used as a fallback
12 |
13 | def __str__(self):
14 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_"]
15 | return '\t'.join(values)
16 |
17 | @property
18 | def pseudo(self):
19 | return self.id == 0 or self.form == ''
20 |
21 |
22 | class DepTree: # sentence-level wrapper; checks whether the dependency tree is projective
23 | def __init__(self, sentence):
24 | self.words = list(sentence)
25 | self.start = 1
26 | if sentence[0].id == 1: self.start = 0
27 | elif sentence[0].id == 0: self.start = 1
28 | else: self.start = len(self.words)
29 |
30 | def isProj(self):
31 | n = len(self.words)
32 | words = self.words
33 | if self.start > 1: return False
34 | if self.start == 0: words = [None] + words
35 | for i in range(1, n):
36 | hi = words[i].head
37 | for j in range(i+1, hi):
38 | hj = words[j].head
39 | if (hj - hi) * (hj - i) > 0:
40 | return False
41 | return True
42 |
43 |
44 | def evalDepTree(gold, predict):
45 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU']
46 | PUNCT_TAGS = []
47 | ignore_tags = set(PUNCT_TAGS)
48 | start_g = 0
49 | if gold[0].id == 0: start_g = 1
50 | start_p = 0
51 | if predict[0].id == 0: start_p = 1
52 |
53 | glength = len(gold) - start_g
54 | plength = len(predict) - start_p
55 |
56 | if glength != plength:
57 | raise Exception('gold length does not match predict length.')
58 |
59 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0
60 | for idx in range(glength):
61 | if gold[start_g + idx].pseudo: continue
62 | if gold[start_g + idx].tag in ignore_tags: continue
63 | if gold[start_g + idx].head == -1: continue
64 | arc_total += 1
65 | label_total += 1
66 | if gold[start_g + idx].head == predict[start_p + idx].head:
67 | arc_correct += 1
68 | if gold[start_g + idx].rel == predict[start_p + idx].rel:
69 | label_correct += 1
70 |
71 | return arc_total, arc_correct, label_total, label_correct
72 |
73 |
74 | def readDepTree(file, vocab=None):
75 | proj = 0
76 | total = 0
77 | min_count = 1
78 | last_tbank = "PC"
79 | if vocab is None: min_count = 0
80 | if vocab is None: sentence = []
81 | else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,last_tbank)]
82 | for line in file:
83 | tok = line.strip().split('\t')
84 | if not tok or line.strip() == '' or line.strip().startswith('#'):
85 |             if len(sentence) > min_count: # the sentence has at least one word
86 | if DepTree(sentence).isProj():
87 | proj += 1
88 | total += 1
89 | yield sentence
90 | if vocab is None:
91 | sentence = []
92 | else:
93 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,last_tbank)]
94 | elif len(tok) == 10:
95 | if tok[6] == '_': tok[6] = '-1'
96 | try:
97 | sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7],tok[9]))
98 | last_tbank = tok[9]
99 | except Exception:
100 | pass
101 | else:
102 | pass
103 |
104 | if len(sentence) > min_count:
105 | if DepTree(sentence).isProj():
106 | proj += 1
107 | total += 1
108 | yield sentence
109 |
110 | print("Total num: ", total)
111 | print("Proj num: ", proj)
112 |
113 |
114 | # def writeDepTree(filename, sentences):
115 | # with open(filename, 'w') as file:
116 | # for sentence in sentences:
117 | # for entry in sentence:
118 | # if not entry.pseudo: file.write(str(entry) + '\n')
119 | # file.write('\n')
120 |
121 | # def printDepTree(output, sentence, gold=None):
122 | # if gold== None:
123 | # for entry in sentence:
124 | # if not entry.pseudo: output.write(str(entry) + '\n')
125 | # output.write('\n')
126 | # else:
127 | # start_g = 0
128 | # if gold[0].id == 0: start_g = 1
129 | # start_p = 0
130 | # if sentence[0].id == 0: start_p = 1
131 | # glength = len(gold) - start_g
132 | # plength = len(sentence) - start_p
133 | #
134 | # if glength != plength:
135 | # raise Exception('gold length does not match predict length.')
136 | #
137 | # for idx in range(glength):
138 | # if gold[start_g + idx].pseudo: continue
139 | # values = [str(gold[start_g + idx].id), gold[start_g + idx].org_form, "_", gold[start_g + idx].tag, \
140 | # "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"]
141 | # output.write('\t'.join(values) + '\n')
142 | #
143 | # output.write('\n')
144 |
145 |
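In this variant readDepTree additionally reads the 10th column (tok[9]) as the sentence's treebank id and passes it to Dependency. A made-up tab-separated line, with the final field holding a treebank code such as the "BC"/"PC" defaults seen above:

1   The   _   DT   _   _   2   det   _   BC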
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/Parser.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | from driver.MST import *
3 | import torch.optim.lr_scheduler
4 | from driver.Layer import *
5 | import numpy as np
6 |
7 |
8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64):
9 | lengths = [len(x) for x in xs]
10 | if length is None:
11 | length = max(lengths)
12 | y = np.array([np.pad(x.astype(dtype), (0, length - l),
13 | mode="constant", constant_values=padding)
14 | for x, l in zip(xs, lengths)])
15 | return torch.from_numpy(y)
16 |
17 | def _model_var(model, x):
18 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
19 | if p.is_cuda:
20 | x = x.cuda(p.get_device())
21 | return torch.autograd.Variable(x)
22 |
23 |
24 | class BiaffineParser(object):
25 | def __init__(self, model, root_id):
26 | self.model = model
27 | self.root = root_id
28 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
29 | self.use_cuda = p.is_cuda
30 | self.device = p.get_device() if self.use_cuda else None
31 |
32 | def forward(self, words, extwords, tags, masks):
33 | if self.use_cuda:
34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device),
35 | tags = tags.cuda(self.device)
36 | masks = masks.cuda(self.device)
37 |
38 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks)
39 | # cache
40 | self.arc_logits = arc_logits
41 | self.rel_logits = rel_logits
42 |
43 |
44 | def compute_loss(self, true_arcs, true_rels, lengths):
45 | b, l1, l2 = self.arc_logits.size()
46 | index_true_arcs = _model_var(
47 | self.model,
48 | pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64))
49 | true_arcs = _model_var(
50 | self.model,
51 | pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64))
52 |
53 | masks = []
54 | for length in lengths:
55 | mask = torch.FloatTensor([0] * length + [-10000] * (l2 - length))
56 | mask = _model_var(self.model, mask)
57 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1)
58 | masks.append(mask.transpose(0, 1))
59 | length_mask = torch.stack(masks, 0)
60 | arc_logits = self.arc_logits + length_mask
61 |
62 | arc_loss = F.cross_entropy(
63 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1),
64 | ignore_index=-1)
65 |
66 | size = self.rel_logits.size()
67 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
68 |
69 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
70 | rel_probs = []
71 | for i in range(l1):
72 | rel_probs.append(logits[i][int(arcs[i])])
73 | rel_probs = torch.stack(rel_probs, dim=0)
74 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
75 |
76 | b, l1, d = output_logits.size()
77 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64))
78 |
79 | rel_loss = F.cross_entropy(
80 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1)
81 |
82 | loss = arc_loss + rel_loss
83 |
84 | return loss
85 |
86 | def compute_accuracy(self, true_arcs, true_rels):
87 | b, l1, l2 = self.arc_logits.size()
88 | pred_arcs = self.arc_logits.data.max(2)[1].cpu()
89 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
90 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
91 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum()
92 |
93 |
94 | size = self.rel_logits.size()
95 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
96 |
97 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
98 | rel_probs = []
99 | for i in range(l1):
100 | rel_probs.append(logits[i][arcs[i]])
101 | rel_probs = torch.stack(rel_probs, dim=0)
102 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
103 |
104 | pred_rels = output_logits.data.max(2)[1].cpu()
105 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64)
106 | label_correct = pred_rels.eq(true_rels).cpu().sum()
107 |
108 | total_arcs = b * l1 - np.sum(true_arcs.cpu().numpy() == -1)
109 |
110 | return arc_correct, label_correct, total_arcs
111 |
112 | def parse(self, words, extwords, tags, lengths, masks):
113 | if words is not None:
114 | self.forward(words, extwords, tags, masks)
115 | ROOT = self.root
116 | arcs_batch, rels_batch = [], []
117 | arc_logits = self.arc_logits.data.cpu().numpy()
118 | rel_logits = self.rel_logits.data.cpu().numpy()
119 |
120 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths):
121 | arc_probs = softmax2d(arc_logit, length, length)
122 | arc_pred = arc_argmax(arc_probs, length, False)
123 |
124 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred]
125 | rel_pred = rel_argmax(rel_probs, length, ROOT, False)
126 |
127 | arcs_batch.append(arc_pred)
128 | rels_batch.append(rel_pred)
129 |
130 | return arcs_batch, rels_batch
131 |
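A sketch of one training step with BiaffineParser; model, optimizer, vocab and the batch tensors are assumed to come from the surrounding training script, which is not included in this dump, and the root_id choice shown here is an assumption:

parser = BiaffineParser(model, root_id=vocab.rel2id(vocab._root))  # root_id: assumed to be the root relation id
parser.model.train()
parser.forward(words, extwords, tags, masks)
loss = parser.compute_loss(heads, rels, lengths)
loss.backward()
optimizer.step()
optimizer.zero_grad()
arc_correct, label_correct, total_arcs = parser.compute_accuracy(heads, rels)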
--------------------------------------------------------------------------------
/multi_task_learning/driver/Parser.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | from driver.MST import *
3 | import torch.optim.lr_scheduler
4 | from driver.Layer import *
5 | import numpy as np
6 |
7 |
8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64):
9 | lengths = [len(x) for x in xs]
10 | if length is None:
11 | length = max(lengths)
12 | y = np.array([np.pad(x.astype(dtype), (0, length - l),
13 | mode="constant", constant_values=padding)
14 | for x, l in zip(xs, lengths)])
15 | return torch.from_numpy(y)
16 |
17 | def _model_var(model, x):
18 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
19 | if p.is_cuda:
20 | x = x.cuda(p.get_device())
21 | return torch.autograd.Variable(x)
22 |
23 |
24 | class BiaffineParser(object):
25 | def __init__(self, model, root_id):
26 | self.model = model
27 | self.root = root_id
28 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
29 | self.use_cuda = p.is_cuda
30 | self.device = p.get_device() if self.use_cuda else None
31 |
32 | def forward(self, words, extwords, tags, masks,type):
33 | if self.use_cuda:
34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device),
35 | tags = tags.cuda(self.device)
36 | masks = masks.cuda(self.device)
37 |
38 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks,type)
39 | # cache
40 | self.arc_logits = arc_logits
41 | self.rel_logits = rel_logits
42 |
43 |
44 | def compute_loss(self, true_arcs, true_rels, lengths):
45 | b, l1, l2 = self.arc_logits.size()
46 | index_true_arcs = _model_var(
47 | self.model,
48 | pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64))
49 | true_arcs = _model_var(
50 | self.model,
51 | pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64))
52 |
53 | masks = []
54 | for length in lengths:
55 | mask = torch.FloatTensor([0] * length + [-10000] * (l2 - length))
56 | mask = _model_var(self.model, mask)
57 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1)
58 | masks.append(mask.transpose(0, 1))
59 | length_mask = torch.stack(masks, 0)
60 | arc_logits = self.arc_logits + length_mask
61 |
62 | arc_loss = F.cross_entropy(
63 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1),
64 | ignore_index=-1)
65 |
66 | size = self.rel_logits.size()
67 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
68 |
69 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
70 | rel_probs = []
71 | for i in range(l1):
72 | rel_probs.append(logits[i][int(arcs[i])])
73 | rel_probs = torch.stack(rel_probs, dim=0)
74 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
75 |
76 | b, l1, d = output_logits.size()
77 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64))
78 |
79 | rel_loss = F.cross_entropy(
80 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1)
81 |
82 | loss = arc_loss + rel_loss
83 |
84 | return loss
85 |
86 | def compute_accuracy(self, true_arcs, true_rels):
87 | b, l1, l2 = self.arc_logits.size()
88 | pred_arcs = self.arc_logits.data.max(2)[1].cpu()
89 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
90 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
91 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum()
92 |
93 |
94 | size = self.rel_logits.size()
95 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
96 |
97 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
98 | rel_probs = []
99 | for i in range(l1):
100 | rel_probs.append(logits[i][arcs[i]])
101 | rel_probs = torch.stack(rel_probs, dim=0)
102 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
103 |
104 | pred_rels = output_logits.data.max(2)[1].cpu()
105 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64)
106 | label_correct = pred_rels.eq(true_rels).cpu().sum()
107 |
108 | total_arcs = b * l1 - np.sum(true_arcs.cpu().numpy() == -1)
109 |
110 | return arc_correct, label_correct, total_arcs
111 |
112 | def parse(self, words, extwords, tags, lengths, masks,type):
113 | if words is not None:
114 | self.forward(words, extwords, tags, masks,type)
115 | ROOT = self.root
116 | arcs_batch, rels_batch = [], []
117 | arc_logits = self.arc_logits.data.cpu().numpy()
118 | rel_logits = self.rel_logits.data.cpu().numpy()
119 |
120 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths):
121 | arc_probs = softmax2d(arc_logit, length, length)
122 | arc_pred = arc_argmax(arc_probs, length, False)
123 |
124 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred]
125 | rel_pred = rel_argmax(rel_probs, length, ROOT, False)
126 |
127 | arcs_batch.append(arc_pred)
128 | rels_batch.append(rel_pred)
129 |
130 | return arcs_batch, rels_batch
131 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/Parser.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | from driver.MST import *
3 | import torch.optim.lr_scheduler
4 | from driver.Layer import *
5 | import numpy as np
6 |
7 |
8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64):
9 | lengths = [len(x) for x in xs]
10 | if length is None:
11 | length = max(lengths)
12 | y = np.array([np.pad(x.astype(dtype), (0, length - l),
13 | mode="constant", constant_values=padding)
14 | for x, l in zip(xs, lengths)])
15 | return torch.from_numpy(y)
16 |
17 | def _model_var(model, x):
18 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
19 | if p.is_cuda:
20 | x = x.cuda(p.get_device())
21 | return torch.autograd.Variable(x)
22 |
23 |
24 | class BiaffineParser(object):
25 | def __init__(self, model, root_id):
26 | self.model = model
27 | self.root = root_id
28 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
29 | self.use_cuda = p.is_cuda
30 | self.device = p.get_device() if self.use_cuda else None
31 |
32 | def forward(self, words, extwords, tags, masks,elmosens,elmofile,tbank_ids):
33 | if self.use_cuda:
34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device),
35 | tags = tags.cuda(self.device)
36 | masks = masks.cuda(self.device)
37 | tbank_ids=tbank_ids.cuda(self.device)
38 |
39 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks,elmosens,elmofile,tbank_ids)
40 | # cache
41 | self.arc_logits = arc_logits
42 | self.rel_logits = rel_logits
43 |
44 |
45 | def compute_loss(self, true_arcs, true_rels, lengths):
46 | b, l1, l2 = self.arc_logits.size()
47 | index_true_arcs = _model_var(self.model,pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64))
48 | true_arcs = _model_var(self.model,pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64))
49 | masks = []
50 | for length in lengths:
51 | mask = torch.FloatTensor([0] * length + [-10000] * (l2 - length))
52 | mask = _model_var(self.model, mask)
53 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1)
54 | masks.append(mask.transpose(0, 1))
55 | length_mask = torch.stack(masks, 0)
56 | arc_logits = self.arc_logits + length_mask
57 |
58 | arc_loss = F.cross_entropy(
59 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1),
60 | ignore_index=-1)
61 |
62 | size = self.rel_logits.size()
63 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
64 |
65 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
66 | rel_probs = []
67 | for i in range(l1):
68 | rel_probs.append(logits[i][int(arcs[i])])
69 | rel_probs = torch.stack(rel_probs, dim=0)
70 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
71 |
72 | b, l1, d = output_logits.size()
73 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64))
74 |
75 | rel_loss = F.cross_entropy(
76 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1)
77 |
78 | loss = arc_loss + rel_loss
79 |
80 | return loss
81 |
82 | def compute_accuracy(self, true_arcs, true_rels):
83 | b, l1, l2 = self.arc_logits.size()
84 | pred_arcs = self.arc_logits.data.max(2)[1].cpu()
85 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
86 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
87 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum()
88 |
89 |
90 | size = self.rel_logits.size()
91 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
92 |
93 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
94 | rel_probs = []
95 | for i in range(l1):
96 | rel_probs.append(logits[i][arcs[i]])
97 | rel_probs = torch.stack(rel_probs, dim=0)
98 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
99 |
100 | pred_rels = output_logits.data.max(2)[1].cpu()
101 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64)
102 | label_correct = pred_rels.eq(true_rels).cpu().sum()
103 |
104 | total_arcs = b * l1 - np.sum(true_arcs.cpu().numpy() == -1)
105 |
106 | return arc_correct, label_correct, total_arcs
107 |
108 | def parse(self, words, extwords, tags, lengths, masks,elmosens,elmofile,tbank_ids):
109 | if words is not None:
110 | self.forward(words, extwords, tags, masks,elmosens,elmofile,tbank_ids)
111 | ROOT = self.root
112 | arcs_batch, rels_batch = [], []
113 | arc_logits = self.arc_logits.data.cpu().numpy()
114 | rel_logits = self.rel_logits.data.cpu().numpy()
115 |
116 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths):
117 | arc_probs = softmax2d(arc_logit, length, length)
118 | arc_pred = arc_argmax(arc_probs, length, False)
119 |
120 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred]
121 | rel_pred = rel_argmax(rel_probs, length, ROOT, False)
122 |
123 | arcs_batch.append(arc_pred)
124 | rels_batch.append(rel_pred)
125 |
126 | return arcs_batch, rels_batch
127 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/data/Dependency.py:
--------------------------------------------------------------------------------
1 |
2 | class Dependency: # one word (token) per instance
3 |     def __init__(self, id, form, tag, head, rel,charid,treebank_id,proxy_tbank="cdt"):
4 |         self.id = id # index of the word within its sentence
5 |         self.org_form = form # original word form
6 |         self.form = form.lower()
7 |         self.tag = tag # POS tag of the word
8 |         self.head = head # index of the word's head
9 |         self.rel = rel # dependency relation label of the word
10 |         self.charid = charid
11 |         self.treebank_id=treebank_id
12 |         self.proxy_tbank=proxy_tbank # proxy treebank id, defaults to "cdt"
13 |
14 |
15 | def __str__(self):
16 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_", str(self.charid)]
17 | return '\t'.join(values)
18 |
19 | @property
20 | def pseudo(self):
21 | return self.id == 0 or self.form == ''
22 |
23 |
24 | class DepTree: # sentence-level wrapper; checks whether the dependency tree is projective
25 | def __init__(self, sentence):
26 | self.words = list(sentence)
27 | self.start = 1
28 | if sentence[0].id == 1: self.start = 0
29 | elif sentence[0].id == 0: self.start = 1
30 | else: self.start = len(self.words)
31 |
32 | def isProj(self):
33 | n = len(self.words)
34 | words = self.words
35 | if self.start > 1: return False
36 | if self.start == 0: words = [None] + words
37 | for i in range(1, n):
38 | hi = words[i].head
39 | for j in range(i+1, hi):
40 | hj = words[j].head
41 | if (hj - hi) * (hj - i) > 0:
42 | return False
43 | return True
44 |
45 |
46 | def evalDepTree(gold, predict):
47 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU']
48 | PUNCT_TAGS = []
49 | ignore_tags = set(PUNCT_TAGS)
50 | start_g = 0
51 | if gold[0].id == 0: start_g = 1
52 | start_p = 0
53 | if predict[0].id == 0: start_p = 1
54 |
55 | glength = len(gold) - start_g
56 | plength = len(predict) - start_p
57 |
58 | if glength != plength:
59 | raise Exception('gold length does not match predict length.')
60 |
61 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0
62 | for idx in range(glength):
63 | if gold[start_g + idx].pseudo: continue
64 | if gold[start_g + idx].tag in ignore_tags: continue
65 | if gold[start_g + idx].head == -1: continue
66 | arc_total += 1
67 | label_total += 1
68 | if gold[start_g + idx].head == predict[start_p + idx].head:
69 | arc_correct += 1
70 | if gold[start_g + idx].rel == predict[start_p + idx].rel:
71 | label_correct += 1
72 |
73 | return arc_total, arc_correct, label_total, label_correct
74 |
75 |
76 | def readDepTree(file, vocab=None):
77 | proj = 0
78 | total = 0
79 | min_count = 1
80 | charid = 0
81 | if vocab is None: min_count = 0
82 | if vocab is None: sentence = []
83 | # else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)]
84 | else:
85 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,-1,'cdt')]
86 | for line in file:
87 | tok = line.strip().split('\t')
88 | if not tok or line.strip() == '' or line.strip().startswith('#'):
89 |             if len(sentence) > min_count: # the sentence has at least one word
90 | if DepTree(sentence).isProj():
91 | proj += 1
92 | total += 1
93 | yield sentence
94 | if vocab is None:
95 | sentence = []
96 | else:
97 | # sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)]
98 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root,-1,'cdt')]
99 | elif len(tok) == 10:
100 | if tok[6] == '_': tok[6] = '-1'
101 | try:
102 | sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7],charid,tok[9]))
103 | charid += 1
104 | except Exception:
105 | pass
106 | else:
107 | pass
108 |
109 | if len(sentence) > min_count:
110 | if DepTree(sentence).isProj():
111 | proj += 1
112 | total += 1
113 | yield sentence
114 |
115 | print("Total num: ", total)
116 | print("Proj num: ", proj)
117 |
118 |
119 | def writeDepTree(filename, sentences):
120 | with open(filename, 'w') as file:
121 | for sentence in sentences:
122 | for entry in sentence:
123 | if not entry.pseudo: file.write(str(entry) + '\n')
124 | file.write('\n')
125 |
126 | def printDepTree(output, sentence, gold=None):
127 |     if gold is None:
128 | for entry in sentence:
129 | if not entry.pseudo: output.write(str(entry) + '\n')
130 | output.write('\n')
131 | else:
132 | start_g = 0
133 | if gold[0].id == 0: start_g = 1
134 | start_p = 0
135 | if sentence[0].id == 0: start_p = 1
136 | glength = len(gold) - start_g
137 | plength = len(sentence) - start_p
138 |
139 | if glength != plength:
140 | raise Exception('gold length does not match predict length.')
141 |
142 | for idx in range(glength):
143 | if gold[start_g + idx].pseudo: continue
144 | values = [str(gold[start_g + idx].id), gold[start_g + idx].org_form, "_", gold[start_g + idx].tag, \
145 | "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"]
146 | output.write('\t'.join(values) + '\n')
147 |
148 | output.write('\n')
149 |
150 |
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/Parser.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | from driver.MST import *
3 | import torch.optim.lr_scheduler
4 | from driver.Layer import *
5 | import numpy as np
6 |
7 |
8 | def pad_sequence(xs, length=None, padding=-1, dtype=np.float64):
9 | lengths = [len(x) for x in xs]
10 | if length is None:
11 | length = max(lengths)
12 | y = np.array([np.pad(x.astype(dtype), (0, length - l),
13 | mode="constant", constant_values=padding)
14 | for x, l in zip(xs, lengths)])
15 | return torch.from_numpy(y)
16 |
17 | def _model_var(model, x):
18 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
19 | if p.is_cuda:
20 | x = x.cuda(p.get_device())
21 | return torch.autograd.Variable(x)
22 |
23 |
24 | class BiaffineParser(object):
25 | def __init__(self, model, root_id):
26 | self.model = model
27 | self.root = root_id
28 | p = next(filter(lambda p: p.requires_grad, model.parameters()))
29 | self.use_cuda = p.is_cuda
30 | self.device = p.get_device() if self.use_cuda else None
31 |
32 | def forward(self, words, extwords, tags, masks,tbank_ids):
33 | if self.use_cuda:
34 | words, extwords = words.cuda(self.device), extwords.cuda(self.device)
35 | tags = tags.cuda(self.device)
36 | masks = masks.cuda(self.device)
37 | tbank_ids=tbank_ids.cuda(self.device)
38 |
39 | arc_logits, rel_logits = self.model.forward(words, extwords, tags, masks,tbank_ids)
40 | # cache
41 | self.arc_logits = arc_logits
42 | self.rel_logits = rel_logits
43 |
44 |
45 | def compute_loss(self, true_arcs, true_rels, lengths):
46 | b, l1, l2 = self.arc_logits.size()
47 | index_true_arcs = _model_var(
48 | self.model,
49 | pad_sequence(true_arcs, length=l1, padding=0, dtype=np.int64))
50 | true_arcs = _model_var(
51 | self.model,
52 | pad_sequence(true_arcs, length=l1, padding=-1, dtype=np.int64))
53 |
54 | masks = []
55 | for length in lengths:
56 | mask = torch.FloatTensor([0] * length + [-10000] * (l2 - length))
57 | mask = _model_var(self.model, mask)
58 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1)
59 | masks.append(mask.transpose(0, 1))
60 | length_mask = torch.stack(masks, 0)
61 | arc_logits = self.arc_logits + length_mask
62 |
63 | arc_loss = F.cross_entropy(
64 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1),
65 | ignore_index=-1)
66 |
67 | size = self.rel_logits.size()
68 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
69 |
70 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
71 | rel_probs = []
72 | for i in range(l1):
73 | rel_probs.append(logits[i][int(arcs[i])])
74 | rel_probs = torch.stack(rel_probs, dim=0)
75 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
76 |
77 | b, l1, d = output_logits.size()
78 | true_rels = _model_var(self.model, pad_sequence(true_rels, padding=-1, dtype=np.int64))
79 |
80 | rel_loss = F.cross_entropy(
81 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1)
82 |
83 | loss = arc_loss + rel_loss
84 |
85 | return loss
86 |
87 | def compute_accuracy(self, true_arcs, true_rels):
88 | b, l1, l2 = self.arc_logits.size()
89 | pred_arcs = self.arc_logits.data.max(2)[1].cpu()
90 | index_true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
91 | true_arcs = pad_sequence(true_arcs, padding=-1, dtype=np.int64)
92 | arc_correct = pred_arcs.eq(true_arcs).cpu().sum()
93 |
94 |
95 | size = self.rel_logits.size()
96 | output_logits = _model_var(self.model, torch.zeros(size[0], size[1], size[3]))
97 |
98 | for batch_index, (logits, arcs) in enumerate(zip(self.rel_logits, index_true_arcs)):
99 | rel_probs = []
100 | for i in range(l1):
101 | rel_probs.append(logits[i][arcs[i]])
102 | rel_probs = torch.stack(rel_probs, dim=0)
103 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
104 |
105 | pred_rels = output_logits.data.max(2)[1].cpu()
106 | true_rels = pad_sequence(true_rels, padding=-1, dtype=np.int64)
107 | label_correct = pred_rels.eq(true_rels).cpu().sum()
108 |
109 | total_arcs = b * l1 - np.sum(true_arcs.cpu().numpy() == -1)
110 |
111 | return arc_correct, label_correct, total_arcs
112 |
113 | def parse(self, words, extwords, tags, lengths, masks,tbank_ids):
114 | if words is not None:
115 | self.forward(words, extwords, tags, masks,tbank_ids)
116 | ROOT = self.root
117 | arcs_batch, rels_batch = [], []
118 | arc_logits = self.arc_logits.data.cpu().numpy()
119 | rel_logits = self.rel_logits.data.cpu().numpy()
120 |
121 | for arc_logit, rel_logit, length in zip(arc_logits, rel_logits, lengths):
122 | arc_probs = softmax2d(arc_logit, length, length)
123 | arc_pred = arc_argmax(arc_probs, length, False)
124 |
125 | rel_probs = rel_logit[np.arange(len(arc_pred)), arc_pred]
126 | rel_pred = rel_argmax(rel_probs, length, ROOT, False)
127 |
128 | arcs_batch.append(arc_pred)
129 | rels_batch.append(rel_pred)
130 |
131 | return arcs_batch, rels_batch
132 |
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/Model.py:
--------------------------------------------------------------------------------
1 | from driver.Layer import *
2 | from data.Vocab import *
3 |
4 |
5 | def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb):
6 | batch_size, seq_length, _ = word_embeddings.size()
7 | word_masks = word_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb)
8 | word_masks = Variable(torch.bernoulli(word_masks), requires_grad=False)
9 | tag_masks = tag_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb)
10 | tag_masks = Variable(torch.bernoulli(tag_masks), requires_grad=False)
11 | scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12)
12 | word_masks *= scale
13 | tag_masks *= scale
14 | word_masks = word_masks.unsqueeze(dim=2)
15 | tag_masks = tag_masks.unsqueeze(dim=2)
16 | word_embeddings = word_embeddings * word_masks
17 | tag_embeddings = tag_embeddings * tag_masks
18 |
19 | return word_embeddings, tag_embeddings
20 |
21 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True):
22 | if batch_first:
23 | inputs = inputs.transpose(0, 1)
24 | seq_length, batch_size, hidden_size = inputs.size()
25 | drop_masks = inputs.data.new(batch_size, hidden_size).fill_(1 - dropout)
26 | drop_masks = Variable(torch.bernoulli(drop_masks), requires_grad=False)
27 | drop_masks = drop_masks / (1 - dropout)
28 | drop_masks = torch.unsqueeze(drop_masks, dim=2).expand(-1, -1, seq_length).permute(2, 0, 1)
29 | inputs = inputs * drop_masks
30 |
31 | return inputs.transpose(1, 0)
32 |
33 |
34 | class ParserModel(nn.Module):
35 | def __init__(self, vocab, config, pretrained_embedding):
36 | super(ParserModel, self).__init__()
37 | self.config = config
38 | self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
39 | self.extword_embed = nn.Embedding(vocab.extvocab_size, config.word_dims, padding_idx=0)
40 | self.tag_embed = nn.Embedding(vocab.tag_size, config.tag_dims, padding_idx=0)
41 |
42 | word_init = np.zeros((vocab.vocab_size, config.word_dims), dtype=np.float32)
43 | self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
44 |
45 | tag_init = np.random.randn(vocab.tag_size, config.tag_dims).astype(np.float32)
46 | self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init))
47 |
48 | self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding))
49 | self.extword_embed.weight.requires_grad = False
50 |
51 |
52 | self.lstm = MyLSTM(
53 | input_size=config.word_dims + config.tag_dims,
54 | hidden_size=config.lstm_hiddens,
55 | num_layers=config.lstm_layers,
56 | batch_first=True,
57 | bidirectional=True,
58 | dropout_in = config.dropout_lstm_input,
59 | dropout_out=config.dropout_lstm_hidden,
60 | )
61 |
62 | self.mlp_arc_dep = NonLinear(
63 | input_size = 2*config.lstm_hiddens,
64 | hidden_size = config.mlp_arc_size+config.mlp_rel_size,
65 | activation = nn.LeakyReLU(0.1))
66 | self.mlp_arc_head = NonLinear(
67 | input_size = 2*config.lstm_hiddens,
68 | hidden_size = config.mlp_arc_size+config.mlp_rel_size,
69 | activation = nn.LeakyReLU(0.1))
70 |
71 | self.total_num = int((config.mlp_arc_size+config.mlp_rel_size) / 100)
72 | self.arc_num = int(config.mlp_arc_size / 100)
73 | self.rel_num = int(config.mlp_rel_size / 100)
74 |
75 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, \
76 | 1, bias=(True, False))
77 | self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, \
78 | vocab.rel_size, bias=(True, True))
79 |
80 | def forward(self, words, extwords, tags, masks):
81 | # x = (batch size, sequence length, dimension of embedding)
82 | x_word_embed = self.word_embed(words)
83 | x_extword_embed = self.extword_embed(extwords)
84 | x_embed = x_word_embed + x_extword_embed
85 | x_tag_embed = self.tag_embed(tags)
86 |
87 | if self.training:
88 | x_embed, x_tag_embed = drop_input_independent(x_embed, x_tag_embed, self.config.dropout_emb)
89 |
90 | x_lexical = torch.cat((x_embed, x_tag_embed), dim=2)
91 |
92 | outputs, _ = self.lstm(x_lexical, masks, None)
93 | outputs = outputs.transpose(1, 0)
94 |
95 | if self.training:
96 | outputs = drop_sequence_sharedmask(outputs, self.config.dropout_mlp)
97 |
98 | x_all_dep = self.mlp_arc_dep(outputs)
99 | x_all_head = self.mlp_arc_head(outputs)
100 |
101 | if self.training:
102 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
103 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)
104 |
105 | x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2)
106 | x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2)
107 |
108 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
109 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
110 |
111 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
112 | arc_logit = torch.squeeze(arc_logit, dim=3)
113 |
114 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
115 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)
116 |
117 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
118 | return arc_logit, rel_logit_cond
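Usage note (a hedged sketch, not code from this repository): the forward pass above takes four padded batch tensors and returns biaffine scores for heads and labels. MyLSTM, NonLinear and Biaffine come from driver/Layer.py; vocab, config and pretrained_embedding are assumed to be built by the driver code, and the toy shapes below are illustrative only.

    import torch

    # toy batch: 2 sentences, max length 5; index 0 is the padding/pseudo-root id
    words    = torch.zeros(2, 5, dtype=torch.long)
    extwords = torch.zeros(2, 5, dtype=torch.long)
    tags     = torch.zeros(2, 5, dtype=torch.long)
    masks    = torch.ones(2, 5)        # 1.0 for real tokens, 0.0 for padding

    model = ParserModel(vocab, config, pretrained_embedding)
    model.train()                      # enables the two dropout helpers above
    arc_logit, rel_logit_cond = model(words, extwords, tags, masks)
    # arc_logit:      (2, 5, 5)            score of head j for dependent i
    # rel_logit_cond: (2, 5, 5, rel_size)  label scores for each (dependent, head) pair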
--------------------------------------------------------------------------------
/multi_task_learning/data/Dependency.py:
--------------------------------------------------------------------------------
1 |
2 | class Dependency: # one object per word token
3 | def __init__(self, id, form, tag, head, rel):
4 | self.id = id # index of this word within its sentence
5 | self.org_form = form # the original (surface) form of the word
6 | self.form = form.lower()
7 | self.tag = tag # POS tag of this word
8 | self.head = head # index of this word's head
9 | self.rel = rel # dependency relation label of this word
10 |
11 | def __str__(self):
12 | values = [str(self.id), self.org_form, "_", self.tag, "_", "_", str(self.head), self.rel, "_", "_"]
13 | return '\t'.join(values)
14 |
15 | @property
16 | def pseudo(self):
17 | return self.id == 0 or self.form == ''
18 |
19 |
20 | class DepTree: # one object per sentence; used to check whether the tree is projective
21 | def __init__(self, sentence):
22 | self.words = list(sentence)
23 | self.start = 1
24 | if sentence[0].id == 1: self.start = 0
25 | elif sentence[0].id == 0: self.start = 1
26 | else: self.start = len(self.words)
27 |
28 | def isProj(self):
29 | n = len(self.words)
30 | words = self.words
31 | if self.start > 1: return False
32 | if self.start == 0: words = [None] + words
33 | for i in range(1, n):
34 | hi = words[i].head
35 | for j in range(i+1, hi):
36 | hj = words[j].head
37 | if (hj - hi) * (hj - i) > 0:
38 | return False
39 | return True
40 |
41 |
42 | def evalDepTree(gold, predict):
43 | #PUNCT_TAGS = ['``', "''", ':', ',', '.', 'PU']
44 | PUNCT_TAGS = []
45 | ignore_tags = set(PUNCT_TAGS)
46 | start_g = 0
47 | if gold[0].id == 0: start_g = 1
48 | start_p = 0
49 | if predict[0].id == 0: start_p = 1
50 |
51 | glength = len(gold) - start_g
52 | plength = len(predict) - start_p
53 |
54 | if glength != plength:
55 | raise Exception('gold length does not match predict length.')
56 |
57 | arc_total, arc_correct, label_total, label_correct = 0, 0, 0, 0
58 | for idx in range(glength):
59 | if gold[start_g + idx].pseudo: continue
60 | if gold[start_g + idx].tag in ignore_tags: continue
61 | if gold[start_g + idx].head == -1: continue
62 | arc_total += 1
63 | label_total += 1
64 | if gold[start_g + idx].head == predict[start_p + idx].head:
65 | arc_correct += 1
66 | if gold[start_g + idx].rel == predict[start_p + idx].rel:
67 | label_correct += 1
68 |
69 | return arc_total, arc_correct, label_total, label_correct
70 | class Instance:
71 | def __init__(self, list_sentence, tgt_type):
72 | self.sentence = list_sentence
73 | self.tgt_type = tgt_type
74 |
75 | def __len__(self):
76 | return len(self.sentence)
77 |
78 | def parse_my_conll(file_path):
79 | file = open(file_path, mode="r", encoding="utf-8")
80 | for line in file:
81 | tokens = line.strip().split('\t')
82 | if tokens[-1]: tgt_type = tokens[-1] # skip blank separator lines so a trailing newline does not overwrite the domain tag
83 | file.close()
84 | return tgt_type
85 |
86 | def readDepTree(file,tgt_type, vocab=None):
87 | proj = 0
88 | total = 0
89 | min_count = 1
90 | if vocab is None: min_count = 0
91 | if vocab is None: sentence = []
92 | else: sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)]
93 | for line in file:
94 | tok = line.strip().split('\t')
95 | if not tok or line.strip() == '' or line.strip().startswith('#'):
96 | if len(sentence) > min_count: # the sentence must contain at least one real word
97 | if DepTree(sentence).isProj():
98 | proj += 1
99 | total += 1
100 | yield Instance(sentence, tgt_type)
101 | if vocab is None:
102 | sentence = []
103 | else:
104 | sentence = [Dependency(0, vocab._root_form, vocab._root, -1, vocab._root)]
105 | elif len(tok) == 10:
106 | if tok[6] == '_': tok[6] = '-1'
107 | try:
108 | sentence.append(Dependency(int(tok[0]), tok[1], tok[3], int(tok[6]), tok[7]))
109 | except Exception:
110 | pass
111 | else:
112 | pass
113 |
114 | if len(sentence) > min_count:
115 | if DepTree(sentence).isProj():
116 | proj += 1
117 | total += 1
118 | yield Instance(sentence, tgt_type)
119 |
120 | print("Total num: ", total)
121 | print("Proj num: ", proj)
122 |
123 |
124 | def writeDepTree(filename, sentences):
125 | with open(filename, 'w') as file:
126 | for sentence in sentences:
127 | for entry in sentence:
128 | if not entry.pseudo: file.write(str(entry) + '\n')
129 | file.write('\n')
130 |
131 | def printDepTree(output, sentence, gold=None):
132 | if gold is None:
133 | for entry in sentence:
134 | if not entry.pseudo: output.write(str(entry) + '\n')
135 | output.write('\n')
136 | else:
137 | start_g = 0
138 | if gold[0].id == 0: start_g = 1
139 | start_p = 0
140 | if sentence[0].id == 0: start_p = 1
141 | glength = len(gold) - start_g
142 | plength = len(sentence) - start_p
143 |
144 | if glength != plength:
145 | raise Exception('gold length does not match predict length.')
146 |
147 | for idx in range(glength):
148 | if gold[start_g + idx].pseudo: continue
149 | values = [str(gold[start_g + idx].id), gold[start_g + idx].org_form, "_", gold[start_g + idx].tag, \
150 | "_", "_", str(sentence[start_p + idx].head), sentence[start_p + idx].rel, "_", "_"]
151 | output.write('\t'.join(values) + '\n')
152 |
153 | output.write('\n')
154 |
155 |
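Usage note (a hedged sketch, not code from this repository): one way to combine the helpers above to score a predicted CoNLL-style file against gold. The file names are placeholders, and both files are assumed to carry the same domain tag in their last column, as parse_my_conll expects.

    gold_path, pred_path = 'dev.gold.conll', 'dev.pred.conll'   # placeholder paths

    tgt_type = parse_my_conll(gold_path)
    with open(gold_path, encoding='utf-8') as fg, open(pred_path, encoding='utf-8') as fp:
        gold_insts = list(readDepTree(fg, tgt_type))
        pred_insts = list(readDepTree(fp, tgt_type))

    arc_total = arc_correct = label_total = label_correct = 0
    for g, p in zip(gold_insts, pred_insts):
        at, ac, lt, lc = evalDepTree(g.sentence, p.sentence)
        arc_total += at; arc_correct += ac
        label_total += lt; label_correct += lc

    print('UAS = %.2f, LAS = %.2f' % (100.0 * arc_correct / arc_total,
                                      100.0 * label_correct / label_total))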
--------------------------------------------------------------------------------
/ELMo/fine-tune-elmo/tests/test_training.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import unittest
4 | import os
5 | import shutil
6 | import tempfile
7 |
8 | import tensorflow as tf
9 | import numpy as np
10 |
11 | from bilm.training import train, test, load_vocab, \
12 | load_options_latest_checkpoint
13 | from bilm.data import LMDataset, BidirectionalLMDataset
14 |
15 | FIXTURES = 'tests/fixtures/train/'
16 |
17 | class TestLanguageModel(unittest.TestCase):
18 | def tearDown(self):
19 | shutil.rmtree(self.tmp_dir)
20 | os.remove(self.tmp_file)
21 | tf.reset_default_graph()
22 |
23 | def setUp(self):
24 | self.tmp_dir = tempfile.mkdtemp()
25 | (_, self.tmp_file) = tempfile.mkstemp()
26 |
27 | def _get_data(self, bidirectional, use_chars, test=False):
28 | vocab_file = os.path.join(FIXTURES, 'vocab.txt')
29 | if use_chars:
30 | vocab = load_vocab(vocab_file, 10)
31 | else:
32 | vocab = load_vocab(vocab_file, None)
33 |
34 | prefix = os.path.join(FIXTURES, 'data.txt')
35 |
36 | if bidirectional:
37 | data = BidirectionalLMDataset(prefix, vocab, test=test)
38 | else:
39 | data = LMDataset(prefix, vocab, test=test, reverse=False)
40 |
41 | return data, vocab
42 |
43 | def _get_vocab_data_options(self, bidirectional, use_chars,
44 | share_embedding_softmax=False):
45 | data, vocab = self._get_data(bidirectional, use_chars)
46 | options = {
47 | 'n_tokens_vocab': vocab.size,
48 | 'n_negative_samples_batch': 16,
49 | 'n_train_tokens': 134,
50 | 'batch_size': 2,
51 | 'unroll_steps': 10,
52 | 'n_epochs': 50,
53 | 'all_clip_norm_val': 1.0,
54 | 'dropout': 0.1,
55 | 'lstm': {'dim': 16, 'projection_dim': 8, 'n_layers': 2},
56 | 'bidirectional': bidirectional,
57 | }
58 |
59 | if use_chars:
60 | options['char_cnn'] = {
61 | 'n_characters': 261,
62 | 'max_characters_per_token': 10,
63 | 'filters': [
64 | [1, 8],
65 | [2, 8],
66 | [3, 16],
67 | [4, 32],
68 | [5, 64],
69 | ],
70 | 'activation': 'tanh',
71 | 'embedding': {'dim': 4},
72 | 'n_highway': 1,
73 | }
74 |
75 | if share_embedding_softmax:
76 | options['share_embedding_softmax'] = True
77 |
78 | return vocab, data, options
79 |
80 | def test_train_single_direction(self):
81 | vocab, data, options = self._get_vocab_data_options(False, False)
82 | train(options, data, 1, self.tmp_dir, self.tmp_dir)
83 |
84 | # now test
85 | tf.reset_default_graph()
86 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
87 | data_test, vocab_test = self._get_data(False, False, True)
88 | perplexity = test(options, ckpt_file, data_test, batch_size=1)
89 | self.assertTrue(perplexity < 20.0)
90 |
91 | def test_train_bilm_chars(self):
92 | vocab, data, options = self._get_vocab_data_options(True, True)
93 | train(options, data, 1, self.tmp_dir, self.tmp_dir)
94 |
95 | # now test
96 | tf.reset_default_graph()
97 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
98 | data_test, vocab_test = self._get_data(True, True, True)
99 | perplexity = test(options, ckpt_file, data_test, batch_size=1)
100 | self.assertTrue(perplexity < 20.0)
101 |
102 | def test_shared_variables(self):
103 | vocab, data, options = self._get_vocab_data_options(True, True)
104 | options['n_epochs'] = 1
105 | train(options, data, 2, self.tmp_dir, self.tmp_dir)
106 | self.assertEqual(len(tf.global_variables()), 64)
107 |
108 | def test_train_shared_softmax_embedding(self):
109 | bidirectional = True
110 | use_chars = False
111 |
112 | vocab, data, options = self._get_vocab_data_options(
113 | bidirectional, use_chars, share_embedding_softmax=True)
114 | train(options, data, 1, self.tmp_dir, self.tmp_dir)
115 |
116 | # now test
117 | tf.reset_default_graph()
118 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
119 | data_test, vocab_test = self._get_data(
120 | bidirectional, use_chars, test=True)
121 | perplexity = test(options, ckpt_file, data_test, batch_size=1)
122 | self.assertTrue(perplexity < 20.0)
123 |
124 | def test_train_shared_softmax_no_chars(self):
125 | bidirectional = True
126 | use_chars = True
127 | vocab, data, options = self._get_vocab_data_options(
128 | bidirectional, use_chars, share_embedding_softmax=True)
129 | # character inputs and sharing weights not supported
130 | with self.assertRaises(ValueError):
131 | train(options, data, 1, self.tmp_dir, self.tmp_dir)
132 |
133 | def test_train_skip_connections(self):
134 | bidirectional = True
135 | use_chars = False
136 | vocab, data, options = self._get_vocab_data_options(
137 | bidirectional, use_chars)
138 | options['lstm']['use_skip_connections'] = True
139 | train(options, data, 1, self.tmp_dir, self.tmp_dir)
140 |
141 | # now test
142 | tf.reset_default_graph()
143 | options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
144 | data_test, vocab_test = self._get_data(
145 | bidirectional, use_chars, test=True)
146 | perplexity = test(options, ckpt_file, data_test, batch_size=1)
147 | self.assertTrue(perplexity < 20.0)
148 |
149 |
150 | if __name__ == '__main__':
151 | unittest.main()
152 |
153 |
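Run note (a hedged sketch): the tests rely on the fixture files referenced by FIXTURES above (tests/fixtures/train/vocab.txt and data.txt) and are normally launched from the fine-tune-elmo root, for example with unittest discovery.

    # run only this module's tests; assumes the current directory is the project root
    import unittest

    suite = unittest.defaultTestLoader.discover('tests', pattern='test_training.py')
    unittest.TextTestRunner(verbosity=2).run(suite)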
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/Model.py:
--------------------------------------------------------------------------------
1 | from driver.Layer import *
2 | from data.Vocab import *
3 |
4 |
5 | def drop_input_independent(word_embeddings, tag_embeddings,tbank_embeddings, dropout_emb):
6 | batch_size, seq_length, _ = word_embeddings.size()
7 | word_masks = word_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb)
8 | word_masks = Variable(torch.bernoulli(word_masks), requires_grad=False)
9 | tag_masks = tag_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb)
10 | tag_masks = Variable(torch.bernoulli(tag_masks), requires_grad=False)
11 | tbank_masks = tbank_embeddings.data.new(batch_size, seq_length).fill_(1 - dropout_emb)
12 | tbank_masks = Variable(torch.bernoulli(tbank_masks), requires_grad=False)
13 | scale = 3.0 / (2.0 * word_masks + tag_masks + tbank_masks + 1e-12)
14 | word_masks *= scale
15 | tag_masks *= scale
16 | tbank_masks *= scale
17 | word_masks = word_masks.unsqueeze(dim=2)
18 | tag_masks = tag_masks.unsqueeze(dim=2)
19 | tbank_masks = tbank_masks.unsqueeze(dim=2)
20 | word_embeddings = word_embeddings * word_masks
21 | tag_embeddings = tag_embeddings * tag_masks
22 | tbank_embeddings = tbank_embeddings * tbank_masks
23 |
24 | return word_embeddings, tag_embeddings, tbank_embeddings
25 |
26 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True):
27 | if batch_first:
28 | inputs = inputs.transpose(0, 1)
29 | seq_length, batch_size, hidden_size = inputs.size()
30 | drop_masks = inputs.data.new(batch_size, hidden_size).fill_(1 - dropout)
31 | drop_masks = Variable(torch.bernoulli(drop_masks), requires_grad=False)
32 | drop_masks = drop_masks / (1 - dropout)
33 | drop_masks = torch.unsqueeze(drop_masks, dim=2).expand(-1, -1, seq_length).permute(2, 0, 1)
34 | inputs = inputs * drop_masks
35 |
36 | return inputs.transpose(1, 0)
37 |
38 |
39 | class ParserModel(nn.Module):
40 | def __init__(self, vocab, config, pretrained_embedding):
41 | super(ParserModel, self).__init__()
42 | self.config = config
43 | self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims, padding_idx=0)
44 | self.extword_embed = nn.Embedding(vocab.extvocab_size, config.word_dims, padding_idx=0)
45 | self.tag_embed = nn.Embedding(vocab.tag_size, config.tag_dims, padding_idx=0)
46 | self.tbank_embed = nn.Embedding(vocab.tbank_size, config.tbank_emb_size, padding_idx=0)
47 |
48 | word_init = np.zeros((vocab.vocab_size, config.word_dims), dtype=np.float32)
49 | self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
50 |
51 | tag_init = np.random.randn(vocab.tag_size, config.tag_dims).astype(np.float32)
52 | self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init))
53 |
54 | self.extword_embed.weight.data.copy_(torch.from_numpy(pretrained_embedding))
55 | self.extword_embed.weight.requires_grad = False
56 |
57 | tbank_init = np.random.randn(vocab.tbank_size, config.tbank_emb_size).astype(np.float32)
58 | self.tbank_embed.weight.data.copy_(torch.from_numpy(tbank_init))
59 |
60 |
61 | self.lstm = MyLSTM(
62 | input_size=config.word_dims + config.tag_dims + config.tbank_emb_size,
63 | hidden_size=config.lstm_hiddens,
64 | num_layers=config.lstm_layers,
65 | batch_first=True,
66 | bidirectional=True,
67 | dropout_in = config.dropout_lstm_input,
68 | dropout_out=config.dropout_lstm_hidden,
69 | )
70 |
71 | self.mlp_arc_dep = NonLinear(
72 | input_size = 2*config.lstm_hiddens,
73 | hidden_size = config.mlp_arc_size+config.mlp_rel_size,
74 | activation = nn.LeakyReLU(0.1))
75 | self.mlp_arc_head = NonLinear(
76 | input_size = 2*config.lstm_hiddens,
77 | hidden_size = config.mlp_arc_size+config.mlp_rel_size,
78 | activation = nn.LeakyReLU(0.1))
79 |
80 | self.total_num = int((config.mlp_arc_size+config.mlp_rel_size) / 100)
81 | self.arc_num = int(config.mlp_arc_size / 100)
82 | self.rel_num = int(config.mlp_rel_size / 100)
83 |
84 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, \
85 | 1, bias=(True, False))
86 | self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, \
87 | vocab.rel_size, bias=(True, True))
88 |
89 | def forward(self, words, extwords, tags, masks,tbank_ids):
90 | # x = (batch size, sequence length, dimension of embedding)
91 | x_word_embed = self.word_embed(words)
92 | x_extword_embed = self.extword_embed(extwords)
93 | x_embed = x_word_embed + x_extword_embed
94 | x_tag_embed = self.tag_embed(tags)
95 | x_tbank_embed = self.tbank_embed(tbank_ids)
96 |
97 | if self.training:
98 | x_embed, x_tag_embed,x_tbank_embed = drop_input_independent(x_embed, x_tag_embed,x_tbank_embed, self.config.dropout_emb)
99 |
100 | x_lexical = torch.cat((x_embed, x_tag_embed,x_tbank_embed), dim=2)
101 |
102 | outputs, _ = self.lstm(x_lexical, masks, None)
103 | outputs = outputs.transpose(1, 0)
104 |
105 | if self.training:
106 | outputs = drop_sequence_sharedmask(outputs, self.config.dropout_mlp)
107 |
108 | x_all_dep = self.mlp_arc_dep(outputs)
109 | x_all_head = self.mlp_arc_head(outputs)
110 |
111 | if self.training:
112 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
113 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)
114 |
115 | x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2)
116 | x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2)
117 |
118 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
119 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
120 |
121 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
122 | arc_logit = torch.squeeze(arc_logit, dim=3)
123 |
124 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
125 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)
126 |
127 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
128 | return arc_logit, rel_logit_cond
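Usage note (a hedged sketch, not code from this repository): compared with the plain parser model, forward() here takes one extra LongTensor of treebank/domain ids. The shape below is an assumption inferred from the embedding lookup above; typically every token of a sentence gets the same treebank id.

    import torch

    batch_size, seq_len = 2, 5
    tbank_ids = torch.ones(batch_size, seq_len, dtype=torch.long)   # id 0 is reserved for padding
    arc_logit, rel_logit_cond = model(words, extwords, tags, masks, tbank_ids)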
--------------------------------------------------------------------------------
/biaffine-parser-domain-embedding/driver/Config.py:
--------------------------------------------------------------------------------
1 | from configparser import ConfigParser
2 | import sys, os
3 |
4 | sys.path.append('..')
5 |
6 |
7 | # import models
8 |
9 | class Configurable(object):
10 | def __init__(self, config_file, extra_args):
11 | config = ConfigParser()
12 | config.read(config_file)
13 | if extra_args:
14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])])
15 | for section in config.sections():
16 | for k, v in config.items(section):
17 | if k in extra_args:
18 | v = type(v)(extra_args[k])
19 | config.set(section, k, v)
20 | self._config = config
21 | if not os.path.isdir(self.save_dir):
22 | os.mkdir(self.save_dir)
23 | config.write(open(self.config_file, 'w'))
24 | print('Loaded config file successfully.')
25 | for section in config.sections():
26 | for k, v in config.items(section):
27 | print(k, v)
28 |
29 | @property
30 | def pretrained_embeddings_file(self):
31 | return self._config.get('Data', 'pretrained_embeddings_file')
32 |
33 | @property
34 | def data_dir(self):
35 | return self._config.get('Data', 'data_dir')
36 |
37 | @property
38 | def train_file(self):
39 | return self._config.get('Data', 'train_file')
40 |
41 | @property
42 | def train_file2(self):
43 | return self._config.get('Data', 'train_file2')
44 |
45 | @property
46 | def train_file3(self):
47 | return self._config.get('Data', 'train_file3')
48 |
49 | @property
50 | def dev_file(self):
51 | return self._config.get('Data', 'dev_file')
52 |
53 | @property
54 | def test_file(self):
55 | return self._config.get('Data', 'test_file')
56 |
57 | @property
58 | def min_occur_count(self):
59 | return self._config.getint('Data', 'min_occur_count')
60 |
61 | @property
62 | def save_dir(self):
63 | return self._config.get('Save', 'save_dir')
64 |
65 | @property
66 | def config_file(self):
67 | return self._config.get('Save', 'config_file')
68 |
69 | @property
70 | def save_model_path(self):
71 | return self._config.get('Save', 'save_model_path')
72 |
73 | @property
74 | def save_vocab_path(self):
75 | return self._config.get('Save', 'save_vocab_path')
76 |
77 | @property
78 | def load_dir(self):
79 | return self._config.get('Save', 'load_dir')
80 |
81 | @property
82 | def load_model_path(self):
83 | return self._config.get('Save', 'load_model_path')
84 |
85 | @property
86 | def load_vocab_path(self):
87 | return self._config.get('Save', 'load_vocab_path')
88 |
89 | @property
90 | def lstm_layers(self):
91 | return self._config.getint('Network', 'lstm_layers')
92 |
93 | @property
94 | def word_dims(self):
95 | return self._config.getint('Network', 'word_dims')
96 |
97 | @property
98 | def tag_dims(self):
99 | return self._config.getint('Network', 'tag_dims')
100 |
101 | @property
102 | def dropout_emb(self):
103 | return self._config.getfloat('Network', 'dropout_emb')
104 |
105 | @property
106 | def lstm_hiddens(self):
107 | return self._config.getint('Network', 'lstm_hiddens')
108 |
109 | @property
110 | def dropout_lstm_input(self):
111 | return self._config.getfloat('Network', 'dropout_lstm_input')
112 |
113 | @property
114 | def dropout_lstm_hidden(self):
115 | return self._config.getfloat('Network', 'dropout_lstm_hidden')
116 |
117 | @property
118 | def mlp_arc_size(self):
119 | return self._config.getint('Network', 'mlp_arc_size')
120 |
121 | @property
122 | def mlp_rel_size(self):
123 | return self._config.getint('Network', 'mlp_rel_size')
124 |
125 | @property
126 | def dropout_mlp(self):
127 | return self._config.getfloat('Network', 'dropout_mlp')
128 |
129 | @property
130 | def learning_rate(self):
131 | return self._config.getfloat('Optimizer', 'learning_rate')
132 |
133 | @property
134 | def decay(self):
135 | return self._config.getfloat('Optimizer', 'decay')
136 |
137 | @property
138 | def decay_steps(self):
139 | return self._config.getint('Optimizer', 'decay_steps')
140 |
141 | @property
142 | def beta_1(self):
143 | return self._config.getfloat('Optimizer', 'beta_1')
144 |
145 | @property
146 | def beta_2(self):
147 | return self._config.getfloat('Optimizer', 'beta_2')
148 |
149 | @property
150 | def epsilon(self):
151 | return self._config.getfloat('Optimizer', 'epsilon')
152 |
153 | @property
154 | def clip(self):
155 | return self._config.getfloat('Optimizer', 'clip')
156 |
157 | @property
158 | def num_buckets_train(self):
159 | return self._config.getint('Run', 'num_buckets_train')
160 |
161 | @property
162 | def num_buckets_valid(self):
163 | return self._config.getint('Run', 'num_buckets_valid')
164 |
165 | @property
166 | def num_buckets_test(self):
167 | return self._config.getint('Run', 'num_buckets_test')
168 |
169 | @property
170 | def train_iters(self):
171 | return self._config.getint('Run', 'train_iters')
172 |
173 | @property
174 | def train_batch_size(self):
175 | return self._config.getint('Run', 'train_batch_size')
176 |
177 | @property
178 | def test_batch_size(self):
179 | return self._config.getint('Run', 'test_batch_size')
180 |
181 | @property
182 | def validate_every(self):
183 | return self._config.getint('Run', 'validate_every')
184 |
185 | @property
186 | def save_after(self):
187 | return self._config.getint('Run', 'save_after')
188 |
189 | @property
190 | def update_every(self):
191 | return self._config.getint('Run', 'update_every')
192 |
193 | @property
194 | def domain_num_corpus_weighting(self):
195 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting')
196 |
197 | @property
198 | def cdt_domain_ration(self):
199 | return self._config.getint('corous-weighting', 'cdt_domain_ration')
200 |
201 | @property
202 | def domain_data_size(self):
203 | return self._config.getint('corous-weighting', 'domain_data_size')
204 |
205 | @property
206 | def tbank_emb_size(self):
207 | return self._config.getint('Treebankid', 'tbank_emb_size')
208 |
209 |
210 |
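Configuration note (a hedged sketch): the properties above read the sections and keys shown below. This snippet writes a minimal config with placeholder values; the section names (including the 'corous-weighting' spelling) follow the code above, while the numbers are illustrative rather than recommended settings.

    from configparser import ConfigParser

    cfg = ConfigParser()
    cfg['Data'] = {'pretrained_embeddings_file': 'emb.txt', 'data_dir': 'data',
                   'train_file': 'train.conll', 'train_file2': 'train2.conll',
                   'train_file3': 'train3.conll', 'dev_file': 'dev.conll',
                   'test_file': 'test.conll', 'min_occur_count': '2'}
    cfg['Save'] = {'save_dir': 'saved', 'config_file': 'saved/config.cfg',
                   'save_model_path': 'saved/model', 'save_vocab_path': 'saved/vocab',
                   'load_dir': 'saved', 'load_model_path': 'saved/model',
                   'load_vocab_path': 'saved/vocab'}
    # mlp_arc_size / mlp_rel_size stay multiples of 100 because Model.py splits them into 100-dim chunks
    cfg['Network'] = {'lstm_layers': '3', 'word_dims': '100', 'tag_dims': '100',
                      'dropout_emb': '0.33', 'lstm_hiddens': '400',
                      'dropout_lstm_input': '0.33', 'dropout_lstm_hidden': '0.33',
                      'mlp_arc_size': '500', 'mlp_rel_size': '100', 'dropout_mlp': '0.33'}
    cfg['Optimizer'] = {'learning_rate': '2e-3', 'decay': '0.75', 'decay_steps': '5000',
                        'beta_1': '0.9', 'beta_2': '0.9', 'epsilon': '1e-12', 'clip': '5.0'}
    cfg['Run'] = {'num_buckets_train': '40', 'num_buckets_valid': '10', 'num_buckets_test': '10',
                  'train_iters': '50000', 'train_batch_size': '200', 'test_batch_size': '50',
                  'validate_every': '100', 'save_after': '5000', 'update_every': '4'}
    cfg['corous-weighting'] = {'domain_num_corpus_weighting': '2', 'cdt_domain_ration': '1',
                               'domain_data_size': '1000'}
    cfg['Treebankid'] = {'tbank_emb_size': '12'}

    with open('config.cfg', 'w') as f:
        cfg.write(f)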
--------------------------------------------------------------------------------
/biaffine-parser-concat/driver/Config.py:
--------------------------------------------------------------------------------
1 | from configparser import ConfigParser
2 | import sys, os
3 |
4 | sys.path.append('..')
5 |
6 |
7 | # import models
8 |
9 | class Configurable(object):
10 | def __init__(self, config_file, extra_args):
11 | config = ConfigParser()
12 | config.read(config_file)
13 | if extra_args:
14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])])
15 | for section in config.sections():
16 | for k, v in config.items(section):
17 | if k in extra_args:
18 | v = type(v)(extra_args[k])
19 | config.set(section, k, v)
20 | self._config = config
21 | if not os.path.isdir(self.save_dir):
22 | os.mkdir(self.save_dir)
23 | config.write(open(self.config_file, 'w'))
24 | print('Loaded config file successfully.')
25 | for section in config.sections():
26 | for k, v in config.items(section):
27 | print(k, v)
28 |
29 | @property
30 | def pretrained_embeddings_file(self):
31 | return self._config.get('Data', 'pretrained_embeddings_file')
32 |
33 | @property
34 | def data_dir(self):
35 | return self._config.get('Data', 'data_dir')
36 |
37 | @property
38 | def train_file(self):
39 | return self._config.get('Data', 'train_file')
40 |
41 | @property
42 | def dev_file(self):
43 | return self._config.get('Data', 'dev_file')
44 |
45 | @property
46 | def test_file(self):
47 | return self._config.get('Data', 'test_file')
48 |
49 | @property
50 | def min_occur_count(self):
51 | return self._config.getint('Data', 'min_occur_count')
52 |
53 | @property
54 | def save_dir(self):
55 | return self._config.get('Save', 'save_dir')
56 |
57 | @property
58 | def config_file(self):
59 | return self._config.get('Save', 'config_file')
60 |
61 | @property
62 | def save_model_path(self):
63 | return self._config.get('Save', 'save_model_path')
64 |
65 | @property
66 | def save_vocab_path(self):
67 | return self._config.get('Save', 'save_vocab_path')
68 |
69 | @property
70 | def load_dir(self):
71 | return self._config.get('Save', 'load_dir')
72 |
73 | @property
74 | def load_model_path(self):
75 | return self._config.get('Save', 'load_model_path')
76 |
77 | @property
78 | def load_vocab_path(self):
79 | return self._config.get('Save', 'load_vocab_path')
80 |
81 | @property
82 | def lstm_layers(self):
83 | return self._config.getint('Network', 'lstm_layers')
84 |
85 | @property
86 | def word_dims(self):
87 | return self._config.getint('Network', 'word_dims')
88 |
89 | @property
90 | def tag_dims(self):
91 | return self._config.getint('Network', 'tag_dims')
92 |
93 | @property
94 | def dropout_emb(self):
95 | return self._config.getfloat('Network', 'dropout_emb')
96 |
97 | @property
98 | def lstm_hiddens(self):
99 | return self._config.getint('Network', 'lstm_hiddens')
100 |
101 | @property
102 | def dropout_lstm_input(self):
103 | return self._config.getfloat('Network', 'dropout_lstm_input')
104 |
105 | @property
106 | def dropout_lstm_hidden(self):
107 | return self._config.getfloat('Network', 'dropout_lstm_hidden')
108 |
109 | @property
110 | def mlp_arc_size(self):
111 | return self._config.getint('Network', 'mlp_arc_size')
112 |
113 | @property
114 | def mlp_rel_size(self):
115 | return self._config.getint('Network', 'mlp_rel_size')
116 |
117 | @property
118 | def dropout_mlp(self):
119 | return self._config.getfloat('Network', 'dropout_mlp')
120 |
121 | @property
122 | def learning_rate(self):
123 | return self._config.getfloat('Optimizer', 'learning_rate')
124 |
125 | @property
126 | def decay(self):
127 | return self._config.getfloat('Optimizer', 'decay')
128 |
129 | @property
130 | def decay_steps(self):
131 | return self._config.getint('Optimizer', 'decay_steps')
132 |
133 | @property
134 | def beta_1(self):
135 | return self._config.getfloat('Optimizer', 'beta_1')
136 |
137 | @property
138 | def beta_2(self):
139 | return self._config.getfloat('Optimizer', 'beta_2')
140 |
141 | @property
142 | def epsilon(self):
143 | return self._config.getfloat('Optimizer', 'epsilon')
144 |
145 | @property
146 | def clip(self):
147 | return self._config.getfloat('Optimizer', 'clip')
148 |
149 | @property
150 | def num_buckets_train(self):
151 | return self._config.getint('Run', 'num_buckets_train')
152 |
153 | @property
154 | def num_buckets_valid(self):
155 | return self._config.getint('Run', 'num_buckets_valid')
156 |
157 | @property
158 | def num_buckets_test(self):
159 | return self._config.getint('Run', 'num_buckets_test')
160 |
161 | @property
162 | def num_corpus_weighting1(self):
163 | return self._config.getint('Run', 'num_corpus_weighting1')
164 |
165 | @property
166 | def num_corpus_weighting2(self):
167 | return self._config.getint('Run', 'num_corpus_weighting2')
168 |
169 | @property
170 | def num_corpus_weighting3(self):
171 | return self._config.getint('Run', 'num_corpus_weighting3')
172 |
173 | @property
174 | def train_iters(self):
175 | return self._config.getint('Run', 'train_iters')
176 |
177 | @property
178 | def train_batch_size(self):
179 | return self._config.getint('Run', 'train_batch_size')
180 |
181 | @property
182 | def test_batch_size(self):
183 | return self._config.getint('Run', 'test_batch_size')
184 |
185 | @property
186 | def validate_every(self):
187 | return self._config.getint('Run', 'validate_every')
188 |
189 | @property
190 | def save_after(self):
191 | return self._config.getint('Run', 'save_after')
192 |
193 | @property
194 | def update_every(self):
195 | return self._config.getint('Run', 'update_every')
196 |
197 | @property
198 | def domain_num_corpus_weighting(self):
199 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting')
200 |
201 | @property
202 | def cdt_domain_ration(self):
203 | return self._config.getint('corous-weighting', 'cdt_domain_ration')
204 |
205 | @property
206 | def domain_file(self):
207 | return self._config.get('corous-weighting', 'domain_file')
208 |
--------------------------------------------------------------------------------
/multi_task_learning/driver/Config.py:
--------------------------------------------------------------------------------
1 | from configparser import ConfigParser
2 | import sys, os
3 |
4 | sys.path.append('..')
5 |
6 |
7 | # import models
8 |
9 | class Configurable(object):
10 | def __init__(self, config_file, extra_args):
11 | config = ConfigParser()
12 | config.read(config_file)
13 | if extra_args:
14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])])
15 | for section in config.sections():
16 | for k, v in config.items(section):
17 | if k in extra_args:
18 | v = type(v)(extra_args[k])
19 | config.set(section, k, v)
20 | self._config = config
21 | if not os.path.isdir(self.save_dir):
22 | os.mkdir(self.save_dir)
23 | config.write(open(self.config_file, 'w'))
24 | print('Loaded config file successfully.')
25 | for section in config.sections():
26 | for k, v in config.items(section):
27 | print(k, v)
28 |
29 | @property
30 | def pretrained_embeddings_file(self):
31 | return self._config.get('Data', 'pretrained_embeddings_file')
32 |
33 | @property
34 | def data_dir(self):
35 | return self._config.get('Data', 'data_dir')
36 |
37 | @property
38 | def train_file(self):
39 | return self._config.get('Data', 'train_file')
40 |
41 | @property
42 | def cdt_dev_file(self):
43 | return self._config.get('Data', 'cdt_dev_file')
44 |
45 | @property
46 | def cw_dev_file(self):
47 | return self._config.get('Data', 'cw_dev_file')
48 |
49 | @property
50 | def cdt_test_file(self):
51 | return self._config.get('Data', 'cdt_test_file')
52 |
53 | @property
54 | def cw_test_file(self):
55 | return self._config.get('Data', 'cw_test_file')
56 |
57 | @property
58 | def min_occur_count(self):
59 | return self._config.getint('Data', 'min_occur_count')
60 |
61 | @property
62 | def save_dir(self):
63 | return self._config.get('Save', 'save_dir')
64 |
65 | @property
66 | def config_file(self):
67 | return self._config.get('Save', 'config_file')
68 |
69 | @property
70 | def save_model_path(self):
71 | return self._config.get('Save', 'save_model_path')
72 |
73 | @property
74 | def save_vocab_path(self):
75 | return self._config.get('Save', 'save_vocab_path')
76 |
77 | @property
78 | def load_dir(self):
79 | return self._config.get('Save', 'load_dir')
80 |
81 | @property
82 | def load_model_path(self):
83 | return self._config.get('Save', 'load_model_path')
84 |
85 | @property
86 | def load_vocab_path(self):
87 | return self._config.get('Save', 'load_vocab_path')
88 |
89 | @property
90 | def lstm_layers(self):
91 | return self._config.getint('Network', 'lstm_layers')
92 |
93 | @property
94 | def word_dims(self):
95 | return self._config.getint('Network', 'word_dims')
96 |
97 | @property
98 | def tag_dims(self):
99 | return self._config.getint('Network', 'tag_dims')
100 |
101 | @property
102 | def dropout_emb(self):
103 | return self._config.getfloat('Network', 'dropout_emb')
104 |
105 | @property
106 | def lstm_hiddens(self):
107 | return self._config.getint('Network', 'lstm_hiddens')
108 |
109 | @property
110 | def dropout_lstm_input(self):
111 | return self._config.getfloat('Network', 'dropout_lstm_input')
112 |
113 | @property
114 | def dropout_lstm_hidden(self):
115 | return self._config.getfloat('Network', 'dropout_lstm_hidden')
116 |
117 | @property
118 | def mlp_arc_size(self):
119 | return self._config.getint('Network', 'mlp_arc_size')
120 |
121 | @property
122 | def mlp_rel_size(self):
123 | return self._config.getint('Network', 'mlp_rel_size')
124 |
125 | @property
126 | def dropout_mlp(self):
127 | return self._config.getfloat('Network', 'dropout_mlp')
128 |
129 | @property
130 | def learning_rate(self):
131 | return self._config.getfloat('Optimizer', 'learning_rate')
132 |
133 | @property
134 | def decay(self):
135 | return self._config.getfloat('Optimizer', 'decay')
136 |
137 | @property
138 | def decay_steps(self):
139 | return self._config.getint('Optimizer', 'decay_steps')
140 |
141 | @property
142 | def beta_1(self):
143 | return self._config.getfloat('Optimizer', 'beta_1')
144 |
145 | @property
146 | def beta_2(self):
147 | return self._config.getfloat('Optimizer', 'beta_2')
148 |
149 | @property
150 | def epsilon(self):
151 | return self._config.getfloat('Optimizer', 'epsilon')
152 |
153 | @property
154 | def clip(self):
155 | return self._config.getfloat('Optimizer', 'clip')
156 |
157 | @property
158 | def num_buckets_train(self):
159 | return self._config.getint('Run', 'num_buckets_train')
160 |
161 | @property
162 | def num_buckets_valid(self):
163 | return self._config.getint('Run', 'num_buckets_valid')
164 |
165 | @property
166 | def num_buckets_test(self):
167 | return self._config.getint('Run', 'num_buckets_test')
168 |
169 | @property
170 | def num_corpus_weighting1(self):
171 | return self._config.getint('Run', 'num_corpus_weighting1')
172 |
173 | @property
174 | def num_corpus_weighting2(self):
175 | return self._config.getint('Run', 'num_corpus_weighting2')
176 |
177 | @property
178 | def num_corpus_weighting3(self):
179 | return self._config.getint('Run', 'num_corpus_weighting3')
180 |
181 | @property
182 | def train_iters(self):
183 | return self._config.getint('Run', 'train_iters')
184 |
185 | @property
186 | def train_batch_size(self):
187 | return self._config.getint('Run', 'train_batch_size')
188 |
189 | @property
190 | def test_batch_size(self):
191 | return self._config.getint('Run', 'test_batch_size')
192 |
193 | @property
194 | def validate_every(self):
195 | return self._config.getint('Run', 'validate_every')
196 |
197 | @property
198 | def save_after(self):
199 | return self._config.getint('Run', 'save_after')
200 |
201 | @property
202 | def update_every(self):
203 | return self._config.getint('Run', 'update_every')
204 |
205 | @property
206 | def domain_num_corpus_weighting(self):
207 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting')
208 |
209 | @property
210 | def cdt_domain_ration(self):
211 | return self._config.getint('corous-weighting', 'cdt_domain_ration')
212 |
213 | @property
214 | def domain_file(self):
215 | return self._config.get('corous-weighting', 'domain_file')
216 |
--------------------------------------------------------------------------------
/biaffine-parser-elmo/driver/Config.py:
--------------------------------------------------------------------------------
1 | from configparser import ConfigParser
2 | import sys, os
3 |
4 | sys.path.append('..')
5 |
6 |
7 | # import models
8 |
9 | class Configurable(object):
10 | def __init__(self, config_file, extra_args):
11 | config = ConfigParser()
12 | config.read(config_file)
13 | if extra_args:
14 | extra_args = dict([(k[2:], v) for k, v in zip(extra_args[0::2], extra_args[1::2])])
15 | for section in config.sections():
16 | for k, v in config.items(section):
17 | if k in extra_args:
18 | v = type(v)(extra_args[k])
19 | config.set(section, k, v)
20 | self._config = config
21 | if not os.path.isdir(self.save_dir):
22 | os.mkdir(self.save_dir)
23 | config.write(open(self.config_file, 'w'))
24 | print('Loaded config file successfully.')
25 | for section in config.sections():
26 | for k, v in config.items(section):
27 | print(k, v)
28 |
29 | @property
30 | def pretrained_embeddings_file(self):
31 | return self._config.get('Data', 'pretrained_embeddings_file')
32 |
33 | @property
34 | def data_dir(self):
35 | return self._config.get('Data', 'data_dir')
36 |
37 | @property
38 | def train_file(self):
39 | return self._config.get('Data', 'train_file')
40 |
41 | @property
42 | def train_file2(self):
43 | return self._config.get('Data', 'train_file2')
44 |
45 | @property
46 | def train_file3(self):
47 | return self._config.get('Data', 'train_file3')
48 |
49 | @property
50 | def dev_file(self):
51 | return self._config.get('Data', 'dev_file')
52 |
53 | @property
54 | def test_file(self):
55 | return self._config.get('Data', 'test_file')
56 |
57 | @property
58 | def min_occur_count(self):
59 | return self._config.getint('Data', 'min_occur_count')
60 |
61 | @property
62 | def save_dir(self):
63 | return self._config.get('Save', 'save_dir')
64 |
65 | @property
66 | def config_file(self):
67 | return self._config.get('Save', 'config_file')
68 |
69 | @property
70 | def save_model_path(self):
71 | return self._config.get('Save', 'save_model_path')
72 |
73 | @property
74 | def save_vocab_path(self):
75 | return self._config.get('Save', 'save_vocab_path')
76 |
77 | @property
78 | def load_dir(self):
79 | return self._config.get('Save', 'load_dir')
80 |
81 | @property
82 | def load_model_path(self):
83 | return self._config.get('Save', 'load_model_path')
84 |
85 | @property
86 | def load_vocab_path(self):
87 | return self._config.get('Save', 'load_vocab_path')
88 |
89 | @property
90 | def lstm_layers(self):
91 | return self._config.getint('Network', 'lstm_layers')
92 |
93 | @property
94 | def word_dims(self):
95 | return self._config.getint('Network', 'word_dims')
96 |
97 | @property
98 | def tag_dims(self):
99 | return self._config.getint('Network', 'tag_dims')
100 |
101 | @property
102 | def dropout_emb(self):
103 | return self._config.getfloat('Network', 'dropout_emb')
104 |
105 | @property
106 | def lstm_hiddens(self):
107 | return self._config.getint('Network', 'lstm_hiddens')
108 |
109 | @property
110 | def dropout_lstm_input(self):
111 | return self._config.getfloat('Network', 'dropout_lstm_input')
112 |
113 | @property
114 | def dropout_lstm_hidden(self):
115 | return self._config.getfloat('Network', 'dropout_lstm_hidden')
116 |
117 | @property
118 | def mlp_arc_size(self):
119 | return self._config.getint('Network', 'mlp_arc_size')
120 |
121 | @property
122 | def mlp_rel_size(self):
123 | return self._config.getint('Network', 'mlp_rel_size')
124 |
125 | @property
126 | def dropout_mlp(self):
127 | return self._config.getfloat('Network', 'dropout_mlp')
128 |
129 | @property
130 | def learning_rate(self):
131 | return self._config.getfloat('Optimizer', 'learning_rate')
132 |
133 | @property
134 | def decay(self):
135 | return self._config.getfloat('Optimizer', 'decay')
136 |
137 | @property
138 | def decay_steps(self):
139 | return self._config.getint('Optimizer', 'decay_steps')
140 |
141 | @property
142 | def beta_1(self):
143 | return self._config.getfloat('Optimizer', 'beta_1')
144 |
145 | @property
146 | def beta_2(self):
147 | return self._config.getfloat('Optimizer', 'beta_2')
148 |
149 | @property
150 | def epsilon(self):
151 | return self._config.getfloat('Optimizer', 'epsilon')
152 |
153 | @property
154 | def clip(self):
155 | return self._config.getfloat('Optimizer', 'clip')
156 |
157 | @property
158 | def num_buckets_train(self):
159 | return self._config.getint('Run', 'num_buckets_train')
160 |
161 | @property
162 | def num_buckets_valid(self):
163 | return self._config.getint('Run', 'num_buckets_valid')
164 |
165 | @property
166 | def num_buckets_test(self):
167 | return self._config.getint('Run', 'num_buckets_test')
168 |
169 | @property
170 | def num_corpus_weighting1(self):
171 | return self._config.getint('Run', 'num_corpus_weighting1')
172 |
173 | @property
174 | def num_corpus_weighting2(self):
175 | return self._config.getint('Run', 'num_corpus_weighting2')
176 |
177 | @property
178 | def num_corpus_weighting3(self):
179 | return self._config.getint('Run', 'num_corpus_weighting3')
180 |
181 | @property
182 | def train_iters(self):
183 | return self._config.getint('Run', 'train_iters')
184 |
185 | @property
186 | def train_batch_size(self):
187 | return self._config.getint('Run', 'train_batch_size')
188 |
189 | @property
190 | def test_batch_size(self):
191 | return self._config.getint('Run', 'test_batch_size')
192 |
193 | @property
194 | def validate_every(self):
195 | return self._config.getint('Run', 'validate_every')
196 |
197 | @property
198 | def save_after(self):
199 | return self._config.getint('Run', 'save_after')
200 |
201 | @property
202 | def update_every(self):
203 | return self._config.getint('Run', 'update_every')
204 |
205 | @property
206 | def train_elmo(self):
207 | return self._config.get('Elmo', 'train_elmo')
208 |
209 | @property
210 | def dev_elmo(self):
211 | return self._config.get('Elmo', 'dev_elmo')
212 |
213 | @property
214 | def test_elmo(self):
215 | return self._config.get('Elmo', 'test_elmo')
216 |
217 | @property
218 | def elmo_dims(self):
219 | return self._config.getint('Elmo', 'elmo_dims')
220 |
221 | @property
222 | def tbank_emb_size(self):
223 | return self._config.getint('Treebankid', 'tbank_emb_size')
224 |
225 | @property
226 | def domain_num_corpus_weighting(self):
227 | return self._config.getint('corous-weighting', 'domain_num_corpus_weighting')
228 |
229 | @property
230 | def cw(self):
231 | return self._config.getint('corous-weighting', 'cw')
232 |
233 | @property
234 | def cdt_domain_ration(self):
235 | return self._config.getint('corous-weighting', 'cdt_domain_ration')
236 |
237 | @property
238 | def domain_data_size(self):
239 | return self._config.getint('corous-weighting', 'domain_data_size')
240 |
241 |
242 |
243 |
--------------------------------------------------------------------------------
/multi_task_learning/data/Vocab.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from data.Dependency import *
3 | import numpy as np
4 |
5 | class Vocab(object):
6 | PAD, ROOT, UNK = 0, 1, 2
7 | def __init__(self, word_counter, tag_counter, rel_counter, relroot='root', min_occur_count = 2):
8 | self._root = relroot
9 | self._root_form = '<' + relroot.lower() + '>'
10 | self._id2word = ['', self._root_form, '']
11 | self._wordid2freq = [10000, 10000, 10000]
12 | self._id2extword = ['', self._root_form, '']
13 | self._id2tag = ['', relroot]
14 | self._id2rel = ['', relroot,'pred','subj-in','frag','punc','repet','exp']
15 | for word, count in word_counter.most_common():
16 | if count > min_occur_count:
17 | self._id2word.append(word)
18 | self._wordid2freq.append(count)
19 |
20 | for tag, count in tag_counter.most_common():
21 | if tag != relroot: self._id2tag.append(tag)
22 |
23 | for rel, count in rel_counter.most_common():
24 | if rel != relroot and rel not in self._id2rel: self._id2rel.append(rel)
25 |
26 | reverse = lambda x: dict(zip(x, range(len(x))))
27 | self._word2id = reverse(self._id2word)
28 | if len(self._word2id) != len(self._id2word):
29 | print("serious bug: words dumplicated, please check!")
30 |
31 | self._tag2id = reverse(self._id2tag)
32 | if len(self._tag2id) != len(self._id2tag):
33 | print("serious bug: POS tags dumplicated, please check!")
34 |
35 | self._rel2id = reverse(self._id2rel)
36 | if len(self._rel2id) != len(self._id2rel):
37 | print("serious bug: relation labels dumplicated, please check!")
38 |
39 | print("Vocab info: #words %d, #tags %d, #rels %d" % (self.vocab_size, self.tag_size, self.rel_size))
40 |
41 | def load_pretrained_embs(self, embfile):
42 | embedding_dim = -1
43 | word_count = 0
44 | with open(embfile, encoding='utf-8') as f:
45 | for line in f.readlines():
46 | if word_count < 1:
47 | values = line.split()
48 | embedding_dim = len(values) - 1
49 | word_count += 1
50 | print('Total words: ' + str(word_count) + '\n')
51 | print('The dim of pretrained embeddings: ' + str(embedding_dim) + '\n')
52 |
53 | index = len(self._id2extword)
54 | embeddings = np.zeros((word_count + index, embedding_dim))
55 | with open(embfile, encoding='utf-8') as f:
56 | for line in f.readlines():
57 | values = line.split()
58 | self._id2extword.append(values[0])
59 | vector = np.array(values[1:], dtype='float64')
60 | embeddings[self.UNK] += vector
61 | embeddings[index] = vector
62 | index += 1
63 |
64 | embeddings[self.UNK] = embeddings[self.UNK] / word_count
65 | embeddings = embeddings / np.std(embeddings)
66 |
67 | reverse = lambda x: dict(zip(x, range(len(x))))
68 | self._extword2id = reverse(self._id2extword)
69 |
70 | if len(self._extword2id) != len(self._id2extword):
71 | print("serious bug: extern words dumplicated, please check!")
72 |
73 | return embeddings
74 |
75 | def create_pretrained_embs(self, embfile):
76 | embedding_dim = -1
77 | word_count = 0
78 | with open(embfile, encoding='utf-8') as f:
79 | for line in f.readlines():
80 | if word_count < 1:
81 | values = line.split()
82 | embedding_dim = len(values) - 1
83 | word_count += 1
84 | print('Total words: ' + str(word_count) + '\n')
85 | print('The dim of pretrained embeddings: ' + str(embedding_dim) + '\n')
86 |
87 | index = len(self._id2extword) - word_count
88 | embeddings = np.zeros((word_count + index, embedding_dim))
89 | with open(embfile, encoding='utf-8') as f:
90 | for line in f.readlines():
91 | values = line.split()
92 | if self._extword2id.get(values[0], self.UNK) != index:
93 | print("Broken vocab or error embedding file, please check!")
94 | vector = np.array(values[1:], dtype='float64')
95 | embeddings[self.UNK] += vector
96 | embeddings[index] = vector
97 | index += 1
98 |
99 | embeddings[self.UNK] = embeddings[self.UNK] / word_count
100 | embeddings = embeddings / np.std(embeddings)
101 |
102 | return embeddings
103 |
104 |
105 | def word2id(self, xs):
106 | if isinstance(xs, list):
107 | return [self._word2id.get(x, self.UNK) for x in xs]
108 | return self._word2id.get(xs, self.UNK)
109 |
110 | def id2word(self, xs):
111 | if isinstance(xs, list):
112 | return [self._id2word[x] for x in xs]
113 | return self._id2word[xs]
114 |
115 | def wordid2freq(self, xs):
116 | if isinstance(xs, list):
117 | return [self._wordid2freq[x] for x in xs]
118 | return self._wordid2freq[xs]
119 |
120 | def extword2id(self, xs):
121 | if isinstance(xs, list):
122 | return [self._extword2id.get(x, self.UNK) for x in xs]
123 | return self._extword2id.get(xs, self.UNK)
124 |
125 | def id2extword(self, xs):
126 | if isinstance(xs, list):
127 | return [self._id2extword[x] for x in xs]
128 | return self._id2extword[xs]
129 |
130 | def rel2id(self, xs):
131 | if isinstance(xs, list):
132 | return [self._rel2id[x] for x in xs]
133 | return self._rel2id[xs]
134 |
135 | def id2rel(self, xs):
136 | if isinstance(xs, list):
137 | return [self._id2rel[x] for x in xs]
138 | return self._id2rel[xs]
139 |
140 | def tag2id(self, xs):
141 | if isinstance(xs, list):
142 | return [self._tag2id.get(x) for x in xs]
143 | return self._tag2id.get(xs)
144 |
145 | def id2tag(self, xs):
146 | if isinstance(xs, list):
147 | return [self._id2tag[x] for x in xs]
148 | return self._id2tag[xs]
149 |
150 | @property
151 | def vocab_size(self):
152 | return len(self._id2word)
153 |
154 | @property
155 | def extvocab_size(self):
156 | return len(self._id2extword)
157 |
158 | @property
159 | def tag_size(self):
160 | return len(self._id2tag)
161 |
162 | @property
163 | def rel_size(self):
164 | return len(self._id2rel)
165 |
166 | def creatVocab(corpusFile, min_occur_count):
167 | word_counter = Counter()
168 | tag_counter = Counter()
169 | rel_counter = Counter()
170 | root = ''
171 | tgt_type = parse_my_conll(corpusFile)
172 | with open(corpusFile, 'r') as infile:
173 | #tgt_type = infile[0].strip().split("\t")[-1]
174 | for sentence in readDepTree(infile,tgt_type):
175 | for dep in sentence.sentence:
176 | word_counter[dep.form] += 1
177 | tag_counter[dep.tag] += 1
178 | if dep.head == -1: # entries padded to complete the data (no gold head)
179 | continue
180 | if dep.head != 0:
181 | rel_counter[dep.rel] += 1
182 | elif root == '':
183 | root = dep.rel
184 | rel_counter[dep.rel] += 1
185 | elif root != dep.rel:
186 | print('root = ' + root + ', rel for root = ' + dep.rel)
187 |
188 | return Vocab(word_counter, tag_counter, rel_counter, root, min_occur_count)
189 |
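Usage note (a hedged sketch, not code from this repository): building the vocabulary and the external embedding matrix that the parser's extword_embed layer consumes. Paths are placeholders; in the repository this is done by the driver code.

    vocab = creatVocab('train.conll', 2)                          # placeholder path, min_occur_count = 2
    pretrained_embedding = vocab.load_pretrained_embs('emb.txt')  # placeholder path
    print(vocab.vocab_size, vocab.extvocab_size, vocab.tag_size, vocab.rel_size)
    print(pretrained_embedding.shape)                             # (extvocab_size, embedding_dim)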
--------------------------------------------------------------------------------