├── .gitignore ├── .idea ├── deployment.xml ├── encodings.xml ├── inspectionProfiles │ └── Project_Default.xml ├── misc.xml ├── modules.xml ├── ssc.iml └── vcs.xml ├── README.md ├── config_cnn.json ├── config_mlp.json ├── config_rnn.json ├── data ├── .gitignore └── sinanews.demo ├── doc ├── .gitignore ├── bn_dev.svg ├── bn_train.svg ├── col_bi-gru.png ├── col_bi-lstm.png ├── col_cnn.png ├── col_gru.png ├── col_lstm.png ├── col_mlp.png ├── dev_Accuracy.svg ├── dev_CORR.svg ├── dev_F1_macro.svg ├── dropout_dev.svg ├── dropout_train.svg ├── embed_dev.svg ├── embed_train.svg ├── self-attention_dev.svg ├── self-attention_train.svg └── train_Accuracy.svg ├── elmoformanylangs ├── __init__.py ├── __main__.py ├── biLM.py ├── dataloader.py ├── elmo.py ├── frontend.py ├── modules │ ├── __init__.py │ ├── classify_layer.py │ ├── elmo.py │ ├── embedding_layer.py │ ├── encoder_base.py │ ├── highway.py │ ├── lstm.py │ ├── lstm_cell_with_projection.py │ ├── token_embedder.py │ └── util.py └── utils.py ├── main.py ├── preprocess.py ├── requirements.txt ├── run_cnn.sh ├── run_mlp.sh ├── run_preprocess_elmo.sh ├── run_preprocess_word2vec.sh ├── run_rnn.sh └── save ├── bi-gru_1 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559405473.gpu-theta.8556.0 ├── bi-lstm_1 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559400227.gpu-theta.32041.0 ├── bi-lstm_2 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559438048.gpu-theta.14643.0 ├── bi-lstm_3 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559440265.gpu-theta.15460.0 ├── bi-lstm_4 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559441032.gpu-theta.21582.0 ├── cnn_1 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559399753.gpu-theta.10281.0 ├── cnn_2 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559409399.gpu-theta.22755.0 ├── cnn_3 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559409541.gpu-theta.30631.0 ├── cnn_4 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559437928.gpu-theta.6850.0 ├── cnn_5 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559437980.gpu-theta.10531.0 ├── cnn_6 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559443831.gpu-theta.16155.0 ├── cnn_7 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559462006.gpu-theta.17687.0 ├── cnn_8 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559462040.gpu-theta.19995.0 ├── gru_1 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559406000.gpu-theta.3523.0 ├── lstm_1 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559402877.gpu-theta.2069.0 ├── mlp_1 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559399750.gpu-theta.9979.0 ├── mlp_2 ├── config.json ├── log.txt └── runs │ └── events.out.tfevents.1559409302.gpu-theta.16485.0 └── mlp_3 ├── config.json ├── log.txt └── runs └── events.out.tfevents.1559438821.gpu-theta.15937.0 /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 3 | 4 | # User-specific stuff 5 | .idea/**/workspace.xml 6 | .idea/**/tasks.xml 7 | .idea/**/usage.statistics.xml 8 | .idea/**/dictionaries 9 | .idea/**/shelf 10 | 11 | # Generated files 12 | .idea/**/contentModel.xml 13 | 14 | # Sensitive or 
high-churn files 15 | .idea/**/dataSources/ 16 | .idea/**/dataSources.ids 17 | .idea/**/dataSources.local.xml 18 | .idea/**/sqlDataSources.xml 19 | .idea/**/dynamic.xml 20 | .idea/**/uiDesigner.xml 21 | .idea/**/dbnavigator.xml 22 | 23 | # Gradle 24 | .idea/**/gradle.xml 25 | .idea/**/libraries 26 | 27 | # Gradle and Maven with auto-import 28 | # When using Gradle or Maven with auto-import, you should exclude module files, 29 | # since they will be recreated, and may cause churn. Uncomment if using 30 | # auto-import. 31 | # .idea/modules.xml 32 | # .idea/*.iml 33 | # .idea/modules 34 | # *.iml 35 | # *.ipr 36 | 37 | # CMake 38 | cmake-build-*/ 39 | 40 | # Mongo Explorer plugin 41 | .idea/**/mongoSettings.xml 42 | 43 | # File-based project format 44 | *.iws 45 | 46 | # IntelliJ 47 | out/ 48 | 49 | # mpeltonen/sbt-idea plugin 50 | .idea_modules/ 51 | 52 | # JIRA plugin 53 | atlassian-ide-plugin.xml 54 | 55 | # Cursive Clojure plugin 56 | .idea/replstate.xml 57 | 58 | # Crashlytics plugin (for Android Studio and IntelliJ) 59 | com_crashlytics_export_strings.xml 60 | crashlytics.properties 61 | crashlytics-build.properties 62 | fabric.properties 63 | 64 | # Editor-based Rest Client 65 | .idea/httpRequests 66 | 67 | # Android studio 3.1+ serialized cache file 68 | .idea/caches/build_file_checksums.ser 69 | 70 | # Byte-compiled / optimized / DLL files 71 | __pycache__/ 72 | *.py[cod] 73 | *$py.class 74 | 75 | # C extensions 76 | *.so 77 | 78 | # Distribution / packaging 79 | .Python 80 | build/ 81 | develop-eggs/ 82 | dist/ 83 | downloads/ 84 | eggs/ 85 | .eggs/ 86 | lib/ 87 | lib64/ 88 | parts/ 89 | sdist/ 90 | var/ 91 | wheels/ 92 | pip-wheel-metadata/ 93 | share/python-wheels/ 94 | *.egg-info/ 95 | .installed.cfg 96 | *.egg 97 | MANIFEST 98 | 99 | # PyInstaller 100 | # Usually these files are written by a python script from a template 101 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 102 | *.manifest 103 | *.spec 104 | 105 | # Installer logs 106 | pip-log.txt 107 | pip-delete-this-directory.txt 108 | 109 | # Unit test / coverage reports 110 | htmlcov/ 111 | .tox/ 112 | .nox/ 113 | .coverage 114 | .coverage.* 115 | .cache 116 | nosetests.xml 117 | coverage.xml 118 | *.cover 119 | .hypothesis/ 120 | .pytest_cache/ 121 | 122 | # Translations 123 | *.mo 124 | *.pot 125 | 126 | # Django stuff: 127 | *.log 128 | local_settings.py 129 | db.sqlite3 130 | db.sqlite3-journal 131 | 132 | # Flask stuff: 133 | instance/ 134 | .webassets-cache 135 | 136 | # Scrapy stuff: 137 | .scrapy 138 | 139 | # Sphinx documentation 140 | docs/_build/ 141 | 142 | # PyBuilder 143 | target/ 144 | 145 | # Jupyter Notebook 146 | .ipynb_checkpoints 147 | 148 | # IPython 149 | profile_default/ 150 | ipython_config.py 151 | 152 | # pyenv 153 | .python-version 154 | 155 | # pipenv 156 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 157 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 158 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 159 | # install all needed dependencies. 
160 | #Pipfile.lock 161 | 162 | # celery beat schedule file 163 | celerybeat-schedule 164 | 165 | # SageMath parsed files 166 | *.sage.py 167 | 168 | # Environments 169 | .env 170 | .venv 171 | env/ 172 | venv/ 173 | ENV/ 174 | env.bak/ 175 | venv.bak/ 176 | 177 | # Spyder project settings 178 | .spyderproject 179 | .spyproject 180 | 181 | # Rope project settings 182 | .ropeproject 183 | 184 | # mkdocs documentation 185 | /site 186 | 187 | # mypy 188 | .mypy_cache/ 189 | .dmypy.json 190 | dmypy.json 191 | 192 | # Pyre type checker 193 | .pyre/ 194 | 195 | tmp/
-------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
-------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
-------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 136 |
-------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 |
-------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
-------------------------------------------------------------------------------- /.idea/ssc.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 |
-------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

# Simple Chinese text sentiment classification

A Chinese text sentiment classification network implemented in PyTorch. The code is fairly simple, the features are fairly rich, and several baseline models are included.

## Requirements

* python == 3.6
* torch == 1.1.0
* Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
* NVIDIA TITAN Xp

See `requirements.txt` for the rest.

## Usage

First preprocess the data with `./run_preprocess_word2vec.sh` or `./run_preprocess_elmo.sh 3` (3 is the GPU id).

Then run `python3 main.py --config_path config_cnn.json`.

## Preprocessing

Every word of the given text is converted into a pretrained word vector and written to a file. I tried these two embeddings:

* ELMo pretrained Chinese model, 1024d (https://github.com/HIT-SCIR/ELMoForManyLangs)
* Chinese-Word-Vectors, 300d (https://github.com/Embedding/Chinese-Word-Vectors)

Please download the corresponding model files into `data/word2vec/` or `data/zhs.model` yourself.

See `preprocess.py` for the details; to use your own dataset, just modify that file.

## Implemented models

### MLP (2 layers)

Linear + ReLU + Dropout + Linear + Softmax

### CNN (1 layer) + MLP (2 layers)

Conv1d + ReLU + Dropout + MaxPool1d + Linear + ReLU + Dropout + Linear + Softmax

See this paper: [https://www.aclweb.org/anthology/D14-1181](https://www.aclweb.org/anthology/D14-1181)

[1] Kim, Y. (2014). Convolutional Neural Networks for Sentence Classification. Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP 2014), 1746–1751.
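To make the CNN layer list concrete, here is a minimal PyTorch sketch wired up with the values from `config_cnn.json` (300-d embeddings, max length 512, 512 filters of width 3, 2/2 max pooling, hidden size 512). It only illustrates the architecture described above; it is not the repository's actual `main.py` code, and the class and argument names are my own:

```python
import torch
import torch.nn as nn


class TextCNN(nn.Module):
    """Conv1d + ReLU + Dropout + MaxPool1d + Linear + ReLU + Dropout + Linear.

    The final softmax is left to nn.CrossEntropyLoss, which applies it internally.
    """

    def __init__(self, embedding_size=300, max_length=512, num_filters=512,
                 kernel_size=3, hidden_size=512, num_labels=8, dropout=0.9):
        super().__init__()
        self.conv = nn.Conv1d(embedding_size, num_filters, kernel_size)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(dropout)
        pooled_len = (max_length - kernel_size + 1) // 2  # sequence length after conv and pooling
        self.fc1 = nn.Linear(num_filters * pooled_len, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        # x: (batch, max_length, embedding_size); Conv1d expects (batch, channels, length)
        x = self.pool(self.dropout(torch.relu(self.conv(x.transpose(1, 2)))))
        x = x.contiguous().view(x.size(0), -1)            # flatten the pooled feature maps
        x = self.dropout(torch.relu(self.fc1(x)))
        return self.fc2(x)                                # logits for the 8 emotion classes
```

The MLP baseline is essentially the same two-layer tail (`fc1`/`fc2`) applied to the flattened word vectors, without the convolution and pooling.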
### RNN (1 layer) + Self-Attention + MLP (2 layers)

RNN (GRU, LSTM, bi-GRU, or bi-LSTM) + Self-Attention + Linear + ReLU + Dropout + Linear + Softmax

For the self-attention, see this paper: [https://arxiv.org/pdf/1703.03130.pdf](https://arxiv.org/pdf/1703.03130.pdf)

[2] Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A structured self-attentive sentence embedding. In Proceedings of the International Conference on Learning Representations.
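The structured self-attention of [2] computes an attention matrix A = softmax(W_s2 tanh(W_s1 H^T)) over the RNN outputs H, builds the sentence embedding M = A H, and adds the penalty ||A A^T - I||_F^2 to the loss. Below is a minimal sketch using the `param_da`, `param_r`, and `p_coefficient` names from `config_rnn.json`; it shows the mechanism and is not a copy of the module used in this repository:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class StructuredSelfAttention(nn.Module):
    """A = softmax(W_s2 @ tanh(W_s1 @ H^T)); sentence embedding M = A @ H (Lin et al., 2017)."""

    def __init__(self, hidden_size, da=350, r=30):
        super().__init__()
        # both attention matrices drawn from a standard normal distribution here
        self.w_s1 = nn.Parameter(torch.randn(da, hidden_size))
        self.w_s2 = nn.Parameter(torch.randn(r, da))

    def forward(self, h):
        # h: (batch, seq_len, hidden_size), the RNN output sequence
        a = F.softmax(self.w_s2 @ torch.tanh(self.w_s1 @ h.transpose(1, 2)), dim=2)  # (batch, r, seq_len)
        m = a @ h                                               # (batch, r, hidden_size)
        # penalization term ||A A^T - I||_F^2 pushes the r attention hops to focus on different words
        aat = a @ a.transpose(1, 2)
        identity = torch.eye(a.size(1), device=a.device).expand_as(aat)
        penalty = ((aat - identity) ** 2).sum() / a.size(0)     # mean over the batch
        return m, penalty


# The flattened M then goes through the 2-layer MLP head, and the training loss becomes
#   loss = F.cross_entropy(logits, labels) + p_coefficient * penalty
```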
## Explanation of some parameters

* `seed`: `20000125` (to make the results reproducible)
* `gpu`: `false` (use the CPU), `true` (use an NVIDIA GPU; recommended)
* `output_path`: running a model writes its log file, TensorBoard files, and config file into this directory
* `loss`: `l1` (L1Loss), `mse` (MSELoss), `cross_entropy` (CrossEntropyLoss; recommended)
* `optimizer`: `sgd`, `adagrad` (Adagrad comes with L2 regularization; recommended)
* `embedding_size`: `1024` (ELMo), `300` (Chinese-Word-Vectors; smaller, recommended)
* `type`: `mlp`, `cnn`, `rnn`

See `config_mlp.json`, `config_cnn.json`, and `config_rnn.json` for the details.

## Dataset

The dataset is the one from assignment 3 of the "Introduction to Artificial Intelligence" course in the THU CS department, so it is not appropriate for me to publish the data or its full description here. In short: Chinese news articles, labeled with 8 emotion classes.

I have to say something about this dataset: it is quite small, the netizen-voted labels are not very accurate, and the training and test sets are not identically distributed (the training set is 2,342 news articles published in January and February 2012, the test set is 2,228 articles published in March and April 2012), so some models may not reach the performance you would expect.

I take the last 1/10 of the given training data as the dev set, evaluate on the dev set after every training epoch, and use the model from the epoch with the highest dev accuracy as the final model for testing.

## Results

Best results first (each model was trained for 300 epochs; the epoch with the highest dev accuracy was used to evaluate the test set and produce the table below; "total time" is the time to finish all 300 epochs):

| Model | Accuracy (%) | F1 (%) | CORR | Total time | Config |
| :------: | :---------: | :---------: | :--: | :--------------: | :--------------------------------------------------------: |
| ![](./doc/col_mlp.png) MLP | 59.4 | 21.5 | 0.28 | 7m44s | [save/mlp_1/config.json](./save/mlp_1/config.json) |
| ![](./doc/col_cnn.png) CNN | 62.4 | 30.2 | 0.41 | 9m56s | [save/cnn_5/config.json](./save/cnn_5/config.json) |
| ![](./doc/col_bi-lstm.png) bi-LSTM | 58.1 | 30.8 | 0.27 | 37m47s | [save/bi-lstm_1/config.json](./save/bi-lstm_1/config.json) |
| ![](./doc/col_bi-gru.png) bi-GRU | 57.3 | 26.3 | 0.31 | 34m47s | [save/bi-gru_1/config.json](./save/bi-gru_1/config.json) |
| ![](./doc/col_lstm.png) LSTM | 55.56 | 25.3 | 0.26 | 21m44s | [save/lstm_1/config.json](./save/lstm_1/config.json) |
| ![](./doc/col_gru.png) GRU | 51.3 | 25.3 | 0.26 | 20m41s | [save/gru_1/config.json](./save/gru_1/config.json) |
| MLP-ELMo | 58.1 | 21.9 | 0.21 | 18m26s | [save/mlp_3/config.json](./save/mlp_3/config.json) |
| CNN-ELMo | 59.8 | 30.1 | 0.34 | 14m23s | [save/cnn_6/config.json](./save/cnn_6/config.json) |

The figure below shows dev Accuracy.

![](./doc/dev_Accuracy.svg)

The figure below shows dev F1 (macro).

![](./doc/dev_F1_macro.svg)

The figure below shows dev CORR.

![](./doc/dev_CORR.svg)

The figure below shows train Accuracy.

![](./doc/train_Accuracy.svg)

To see more charts, run:

```
$ tensorboard --logdir MLP:save/mlp_1/runs/,\
CNN:save/cnn_5/runs/,\
bi-LSTM:save/bi-lstm_1/runs/,\
bi-GRU:save/bi-gru_1/runs/,\
LSTM:save/lstm_1/runs/,\
GRU:save/gru_1/runs/,\
MLP-ELMo:save/mlp_3/runs/,\
CNN-ELMo:save/cnn_6/runs/
```

## Model and hyperparameter comparison

From the curves above: the MLP converges fastest and starts overfitting earliest, with mediocre test results; the RNN-family models converge fairly quickly (except bi-GRU) but their test results are not great; the CNN converges more slowly, is very stable, and gives by far the best test results.

The MLP, despite being the entry-level model, works surprisingly well and even beats the RNN family, which is rather puzzling; the only explanations I can think of are that I did not find the right hyperparameters or that the dataset is simply bad. For the CNN, I once tried two convolutional layers, but it performed worse than a single layer, so I dropped that.

The optimizer is basically always `adagrad`; based on my tuning results I would no longer use `sgd` (it converges far too slowly).

```json
"optimizer": "adagrad",
"lr": 0.01,
"lr_decay": 0,
"weight_decay": 0.0001,
```

The `ELMo` word vectors did not work well in my tests. My guess is that the corpus the pretrained model was trained on differs too much in distribution from this dataset, and that I did not attach the pretrained model in front of my network and fine-tune it, which is why the results are poor. Comparing the curves of [save/cnn_5](./save/cnn_5/config.json) (Chinese-Word-Vectors, orange) and [save/cnn_6](./save/cnn_6/config.json) (ELMo, blue) shows that ELMo really does not perform well on this dataset.

| dev Accuracy | train Accuracy |
| :------------------------: | :--------------------------: |
| ![](./doc/embed_dev.svg) | ![](./doc/embed_train.svg) |

`Dropout` is there to prevent overfitting. Comparing [save/cnn_1](./save/cnn_1/config.json) (dropout = 0.5, orange) and [save/cnn_4](./save/cnn_4/config.json) (dropout = 0.9, blue) clearly shows that a larger dropout rate converges more slowly but reaches higher accuracy and resists overfitting better (the decoupling effect is stronger, after all).

| dev Accuracy | train Accuracy |
| :------------------------: | :--------------------------: |
| ![](./doc/dropout_dev.svg) | ![](./doc/dropout_train.svg) |

`Batch Normalization` also helps against overfitting. This is very clear when comparing the CNN with neither BN nor Dropout in the convolutional layer, [save/cnn_8](./save/cnn_8/config.json) (blue), against the CNN with BN but without Dropout in the convolutional layer, [save/cnn_7](./save/cnn_7/config.json) (Conv1d + BatchNorm1d + ReLU + MaxPool1d + Linear + ReLU + Dropout + Linear + Softmax, orange).

| dev Accuracy | train Accuracy |
| :------------------------: | :--------------------------: |
| ![](./doc/bn_dev.svg) | ![](./doc/bn_train.svg) |

`Self-Attention` adds a penalization term to the loss. I compared different coefficients for this term in the total loss, [save/bi-lstm_1](./save/bi-lstm_1/config.json) (coefficient = 1, orange) versus [save/bi-lstm_2](./save/bi-lstm_2/config.json) (coefficient = 0.3, blue), and found that it affects the convergence speed: with a higher coefficient, convergence is much faster, but accuracy drops a bit (probably still because I did not tune it carefully).

| dev Accuracy | train Accuracy |
| :------------------------: | :--------------------------: |
| ![](./doc/self-attention_dev.svg) | ![](./doc/self-attention_train.svg) |

There are simply too many hyperparameters; I really do not want to tune them any further.

## Discussion questions

##### 1) When is it best to stop training? Briefly describe your approach, and discuss the pros and cons of a fixed number of iterations versus tuning on a validation set.

Two approaches; I used the second one:

1. Set a threshold θ (and a patience count λ); stop when the best accuracy seen so far minus the current accuracy exceeds θ (or when this has happened more than λ times in total).
2. Estimate where overfitting begins (approach 1 can also be used for this estimate) and set a fixed number of iterations.

The advantage of a fixed number of iterations is that, with a capable machine (or a simple model), you can run a few extra epochs and then compare different models very directly from their curves; the drawback is that it takes longer.

The advantage of tuning on a validation set is that, with limited compute (or a complex model), a tight deadline, or when you just want to pick a model quickly, it saves a lot of time; the drawback is that it is harder to compare the characteristics of different models directly.
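For reference, here is a minimal sketch of approach 2 combined with the best-dev-epoch selection described in the Dataset section (train for a fixed number of epochs, keep the weights of the epoch with the highest dev accuracy). `train_one_epoch` and `evaluate` are placeholder callbacks, not functions from this repository:

```python
import copy


def fit(model, optimizer, train_one_epoch, evaluate, train_data, dev_data, num_epochs=300):
    """Fixed epoch budget; keep the weights from the epoch with the highest dev accuracy."""
    best_acc, best_epoch, best_state = 0.0, -1, None
    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, train_data)        # placeholder: one pass over the training set
        acc = evaluate(model, dev_data)                      # placeholder: returns dev accuracy
        if acc > best_acc:
            best_acc, best_epoch = acc, epoch
            best_state = copy.deepcopy(model.state_dict())   # snapshot of the best weights so far
    model.load_state_dict(best_state)                        # the final test run uses this checkpoint
    return best_epoch, best_acc
```

Approach 1 would add a patience counter inside the loop and break out once `best_acc - acc` has stayed above θ for λ consecutive epochs.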
##### 2) How did you initialize the parameters? Which methods suit which situations? (Common schemes: zero-mean initialization, Gaussian initialization, orthogonal initialization, etc.)

In my code, apart from the two self-attention matrices, which are initialized from a standard normal distribution (I also tried U(-sqrt(1/u), sqrt(1/u)) and U(-sqrt(1/da), sqrt(1/da)), with roughly the same results), all parameters use PyTorch's default initialization.

Weights and biases of linear and convolutional layers are usually initialized from a zero-mean uniform or normal distribution; depending on the scaling factor, these become the Xavier uniform/normal and Kaiming uniform/normal schemes. Xavier initialization suits tanh activations and is less suited to ReLU, whereas Kaiming initialization works very well for layers followed by ReLU. PyTorch initializes linear layers from the uniform distribution U(-sqrt(1/in), sqrt(1/in)) (a Kaiming distribution with a = sqrt(5), where "in" is the size of the second dimension of the weight matrix).

Orthogonal initialization suits the parameters of RNNs, where it helps against vanishing and exploding gradients in the recurrent computation.

##### 3) Overfitting is a common problem in deep learning. What methods can prevent training from overfitting?

Batch Normalization:

BN improves generalization, effectively counters overfitting, and reduces the influence of parameter initialization. The reason is that BN pulls the activations toward a standard normal distribution, reducing the internal covariate shift; this partly avoids exploding gradients, and because the central region of the distribution still has large gradients, it also partly avoids vanishing gradients. If the learning rate is poorly chosen, however, the model may converge very slowly.

Dropout:

Dropout is the all-purpose tool for improving generalization: by suppressing the outputs of some neurons and scaling up the rest, it reduces the coupling between neurons. Just add it.

Note, however, that combining BN with Dropout can work quite poorly.

##### 4) Discuss the strengths and weaknesses of CNNs, RNNs, and fully connected networks (MLPs).

| | MLP | CNN | RNN |
| :--: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
| Strengths | Simple structure, easy to understand and interpret, learns global information, can be accelerated with matrix operations | Simple structure, interpretable, learns local information and has a big advantage when local features are salient, can be accelerated with matrix operations, very fast | Simple structure, interpretable, accepts variable-length input, models sequential order, has memory |
| Weaknesses | Fixed input size; weak with few parameters, and with many parameters it no longer fits in memory | Fixed input size; limited capacity when shallow, cannot capture global information, usually needs an MLP on top | Prone to vanishing and exploding gradients; not a single overall matrix product, so it cannot be fully accelerated with matrix operations (apart from the multiplications inside each cell); slower; usually also needs an MLP on top |

## Reflections

I learned the relevant material in last semester's "Artificial Neural Networks" course, where the assignments had me implement MLP, CNN, and RNN at the source-code level, so this assignment was fairly easy (especially since here you can just call the libraries). Still, this was my first time building these models in PyTorch from scratch, so I learned a lot about PyTorch. I also explained the concepts and principles of MLP, CNN, and RNN to my roommates and other classmates, which consolidated my own knowledge. Finally, thanks to the TAs for answering questions in the WeChat group.

One small suggestion: overall, an assignment that combines the three basic neural network models is a good idea, but it is a pity that the dataset is not that great, which leads some students (myself included) to doubt their models and hyperparameters, to conclude that AI is pure black magic, and even to grow tired of the subject. I hope this can be improved next year.
-------------------------------------------------------------------------------- /config_cnn.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_path": "data/word2vec_temp", 3 | "output_path": "save/cnn_5", 4 | "gpu": true, 5 | "seed": 20000125, 6 | "display_per_batch": 5, 7 | "optimizer": "adagrad", 8 | "lr": 0.01, 9 | "lr_decay": 0, 10 | "weight_decay": 0.0001, 11 | "momentum": 0.985, 12 | "num_epochs": 300, 13 | "batch_size": 64, 14 | "num_labels": 8, 15 | "embedding_size": 300, 16 | "type": "cnn", 17 | "cnn": { 18 | "max_length": 512, 19 | "conv_1": { 20 | "size": 512, 21 | "kernel_size": 3, 22 | "dropout": 0.9 23 | }, 24 | "max_pool_1": { 25 | "kernel_size": 2, 26 | "stride": 2 27 | }, 28 | "fc": { 29 | "hidden_size": 512, 30 | "dropout": 0.9 31 | }, 32 | "loss": "cross_entropy" 33 | } 34 | } -------------------------------------------------------------------------------- /config_mlp.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_path": "data/word2vec_temp", 3 | "output_path": "save/mlp_1", 4 | "gpu": true, 5 | "seed": 20000125, 6 | "display_per_batch": 5, 7 | "optimizer": "adagrad", 8 | "lr": 0.01, 9 | "lr_decay": 0, 10 | "weight_decay": 0.0001, 11 | "momentum": 0.985, 12 | "num_epochs": 300, 13 | "batch_size": 64, 14 | "num_labels": 8, 15 | "embedding_size": 300, 16 | "type": "mlp", 17 | "mlp": { 18 | "max_length": 512, 19 
| "dropout": 0.5, 20 | "hidden_size": 512, 21 | "loss": "cross_entropy" 22 | } 23 | } -------------------------------------------------------------------------------- /config_rnn.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_path": "data/word2vec_temp", 3 | "output_path": "save/bi-lstm_1", 4 | "gpu": true, 5 | "seed": 20000125, 6 | "display_per_batch": 5, 7 | "optimizer": "adagrad", 8 | "lr": 0.01, 9 | "lr_decay": 0, 10 | "weight_decay": 0.0001, 11 | "momentum": 0.985, 12 | "num_epochs": 300, 13 | "batch_size": 64, 14 | "num_labels": 8, 15 | "embedding_size": 300, 16 | "type": "rnn", 17 | "rnn": { 18 | "type": "lstm", 19 | "bidirectional": true, 20 | "rnn_hidden_size": 256, 21 | "mlp_hidden_size": 512, 22 | "dropout": 0.5, 23 | "p_coefficient": 1, 24 | "num_layers": 1, 25 | "param_da": 350, 26 | "param_r": 30, 27 | "loss": "cross_entropy" 28 | } 29 | } -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | zhs.model 2 | word2vec 3 | word2vec_temp 4 | elmo_temp 5 | sinanews.test 6 | sinanews.train -------------------------------------------------------------------------------- /data/sinanews.demo: -------------------------------------------------------------------------------- 1 | 201201010000_18871 Total:2 感动:0 同情:0 无聊:2 愤怒:0 搞笑:0 难过:0 新奇:0 温馨:0 女生 发表 轻生 日志 获 网友 连夜 搜 救 本报 讯 永别 朋友 永别 世界 前天 晚上 清华 毕业 女生 小陈 网 主页 写下 日志 后 失踪 日志 中 表示 欲 寻短见 得知 消息 校友 途径 连夜 发起 搜 救 昨天 上午 记者 了解 民警 已经 找到 小陈 已经 危险 了解 小 陈 本科 研究生 分别 就读 中国 传媒 大学 清华大学 今年 毕业 后 进入 国家机关 下属 媒体 工作 前天 晚上 7点 44分 小陈 网 主页 发表 篇 题为 永别 朋友 日志 后 失踪 篇 日志 很快 引起 关注 当天 晚上 10点 关注 清华大学 微 博 协会 会长 刘若晴 微 博 发布 紧急 寻 人 信息 呼吁 网友 路 留 心眼 儿 救人 命 随后 网友 展开 搜 救 昨天 上午 条 寻找 小 陈 微 博 已 转发 8000 次 小陈 同学 寻 遍 宣武门 长椿 街 西单 传媒 大学 清华大学 不断 网上 交流 搜 救 信息 汇总 清华大学 微 博 协会 微 博 从前 晚 10点 昨天 凌晨 3点 小时 新 搜 救 信息 上面 发布 昨天 上午 北京市 公安局 官方 微 博 发布消息 称 北京 警方 积极 努力 开展 相关 工作 随后 市 公安局 勤务 指挥部 传来 好 消息 警方 连夜 工作 民警 丰台 六里桥 附近 找到 同学 同学 平安无事 现场 自杀 情况 危险 民警 正在 进一步 调查 中 2 | 201201010139_29492 Total:1 感动:0 同情:0 无聊:0 愤怒:0 搞笑:0 难过:0 新奇:0 温馨:1 客机 起飞 小时 后 机舱 冒烟 返航 昨天 上午 8点 25分 左右 首都 国际 机场 起飞 小时 架 美 航 AA186 次 航班 机舱 冒烟 折返 降落 事后 返航 乘客 安排 附近 酒店 下午 5点 乘客 接 航空公司 答复 改 今天 上午 10点 航班 再次 起飞 希望 次 顺利 到达 乘客 表示 机舱 冒烟 客机 返航 飞机 飞 飞 冒烟 北京 回来 昨天 上午 位 网友 微 博 透露 乘坐 美国 航空公司 AA186 次 航班 北京 芝加哥 飞机 起飞 后 客舱 突然 烟 最终 飞机 返航 元旦 北京 度过 真是 悲剧 飞机 起飞 紧急 返航 此刻 T3 位 网友 全 熊 小 猫 微 博 直播 次 遭遇 记者 联系 时 已 航空公司 安排 附近 酒店 姓 黄 美国 上学 留学生 次 芝加哥 中途 换乘 黄 女士 告诉 记者 事发 时 上午 8点 25分 飞机 已经 起飞 小时 突然 机舱 飘 股 浓浓的 异味 儿 有点 烟 味儿 广播 中 机长 声音 证实 一点 说 飞机 临时 出现 机械 故障 导致 机舱 冒烟 需要 返航 降落 记者 查询 发现 美 航 AA186 次 航班 首都 机场 正点 起飞 时间 7点 55分 留学生 临时 翻译 昨天 上午 8点 55分 AA186 次 航班 起飞 小时 后 重新 降落 首都 国际 机场 整个 过程 无 人 受伤 黄 女士 称 飞机 落地 之后 机 舱门 迟迟 未 打开 部分 乘客 着急 不断 身边 乘务员 询问 空姐 懂 中文 机舱 气氛 一度 些 混乱 懂 英文 乘客 帮忙 翻译 一下 机舱 紧张 气氛 平息 下来 黄 女士 称 帮忙 翻译 意思 说 飞机 已经 安全 落地 请 乘客 耐心 等待 地面 空勤 人员 正在 舷梯 接 一会儿 下 飞机 微 博 黄 女士 次 特殊 翻译 经历 晒 机上 空姐 讲 中文 找 翻译 体验 次 小 喇叭 广播 手 感觉 乘客 今天 上午 再 起程 事后 航空公司 AA186 全部 乘客 安排 酒店 架飞机 波音 777 满员 乘客 估计 300 人 外籍 乘客 希尔顿 酒店 国内 乘客 丽 酒店 黄 女士 透露 酒店 呆 长时间 航空公司 却 一直 无 人 出面 解释 答复 下 步 赔偿 问题 无从 谈 昨天 下午 记者 联系 丽 酒店 负责 协调 返航 乘客 负责人 称 受 美 航 公司 委托 负责 协调 乘客 飞机 再次 起飞 知晓 首都 机场 负责 乘客 改 签 工作 美 航 工作人员 透露 目前 尚未 接 上级 批 乘客 改 签 起飞 具体 通知 飞机 返航 具体 原因 赔偿 问题 记者 采访 美 航 北京 办事处 媒体 传媒 部 昨天 下午 记者 连续 拨打 办事处 电话 十几 次 无 人 接 听 首都 机场 美 航 公司 工作人员 拒绝 记者 转达 采访 意图 昨天 下午 5点 黄 女士 致电 记者 称 航空公司 通知 乘坐 今天 上午 10点 航班 再次 起飞 一大早 机场 等候 工作人员 带领 取 行李 希望 次 顺利 到达 黄 女士 提前 送 份 新年 祝福 晨报 96101 热线 新闻记者 岳 亦 雷 线索 祝 先生 ■ 新闻 链接 次 航班 曾 冒烟 返航 记录 2010年 12月 16日 上午 10时 许 美国 航空公司 北京 飞往 芝加哥 AA186 航班 起飞 
小时 后 客舱 突然 烟 机长 决定 返航 首都 机场 解释 称 返航 空调系统 故障 机场 运行 未 受 影响 美国 航空公司 北京 代表处 表示 故障 未 造成 人员 伤亡 3 | 201201010216_13984 Total:38 感动:3 同情:3 无聊:0 愤怒:16 搞笑:7 难过:8 新奇:1 温馨:0 男子 婚 房 房 贷 抢劫 彩票 店 本报 记者 赵云龙 2011年 12月 28日 晚 10时 17分 许 水 屯 路南 段 福彩 投 注 站 女 老板 林丽 化名 整理 钱 款 欲 关门 回家 低头 收拾时 男子 突然 快步 走 进 店 里 手 持 红色 U 形 锁 隔 柜台 头部 猛 砸 过去 砸 两 下 后 男子 绕 进 柜台 内侧 林丽连 砸 下 柜台 两 提包 抢走 砸 完 跑 店 门 店 里 一个 椅子 追 四五 米 骑 摩托车 跑 林丽 告诉 记者 砸 后 头部 留下 处 伤口 缝 20 多针 抢走 两 包 里 13000 余 元 现金 一些 银行 存折 部分 刮 刮 乐 彩票 案发 后 调取 监控 录像 警方 很快 掌握 嫌疑人 体貌 特征 本报 2011年 12月 30日 刊发 深夜 抢 彩票 站 狂 砸 女 老板 文 显示 嫌疑人 清晰 面容 监控 截 图 刊登 希望 读者 提供 相关 破案 线索 警 媒 联动 下 2011年 12月 30日 下午 2时 许 热心 市民 警方 提供 条 重要 线索 嫌疑人 居住 梁 府 庄 一带 民警 梁 府 庄 犯罪 嫌疑人 向某 抓获 缴获 抢劫 13000 余 元 现金 银行 存折 刮 刮 乐 彩票 物 今年 30 岁 已 失业 许久 2年 前 结婚 时 曾 妻子 老家 泰安 买 套 婚 房 目前 尚 万 元 房 贷 春节 向某 身上 钱 房 贷 近期 一直 为难 2011年 12月 28日 晚 6时 向某 朋友 东关 大街 酒店 吃饭 席间 喝 五六 瓶 啤酒 消愁 饭局 结束 后 向某 回家 途中 想 买 彩票 碰碰 运气 当晚 10时 09分 走 进 水 屯 路南 段 福彩 投 注 站 告知 开奖 时间 已 结束 该店 关门 无意间 向某 看到 该店 女 老板 林丽 正在 清点 钱 款 萌生 抢劫 念头 盯 林丽 直到 数 钱 结束 观察 店 情况 随后 持 车 锁 行 抢 4 | 201201010226_10658 Total:735 感动:27 同情:45 无聊:105 愤怒:44 搞笑:392 难过:56 新奇:60 温馨:6 99 岁 丈夫 发现 妻子 60年 前 出轨 欲 离婚 新华社 电 意大利 年 百 岁 夫妇 正 闹 离婚 原因 丈夫 发现 妻子 60年 前 相 好 律师 拒绝 公开 两 人 全 名 丈夫 名为 安东尼奥 现年 99 岁 妻子 罗莎 96 岁 安东尼奥 世纪 30年代 那不勒斯 服役 结识 罗莎 两 人 结婚 77年 孩子 12 孙 辈 重孙 今年 圣诞节 前 数 天 安东尼奥 翻 五斗橱 发现 一些 情书 显示 妻子 世纪 40年代 段 秘密 婚外恋 妻子 承认 写 信 请求 丈夫 原谅 婚外恋 过去 60 年 丈夫 立即 要求 离婚 5 | 201201010226_28346 Total:25 感动:1 同情:0 无聊:0 愤怒:18 搞笑:4 难过:2 新奇:0 温馨:0 商 户 举报 占 道 经营 遭 威胁 代 缴 罚款 本报 讯 记者 张太凌 郭公庄 路 商贩 冯 明 举报 同业 占用 机动车 道 摆摊 设点 想 城管 查处 后 挤出 块 好 路段 摆 摊 未 料 举报 电话 认出 遭到 举报者 围堵 家门 威胁 举报者 支付 罚款 扣 车 款 6500 余 元 事后 冯明 怀疑 电话 城管 泄露 要求 城管 赔偿 代 缴 罚款 损失 昨天 花乡 城管 分队 称 冯明 怀疑 纯属 臆断 丰台区 城管 大队 表示 城管 必要 泄露 举报者 信息 冯明 回忆 称 去年 12月 27日 中午 11时 左右 丰台区 花乡 城管 分队 电话 举报 郭公庄 路上 许多 鱼贩 占用 机动车 道 摆摊 设点 养鱼 水 洒 路面 结冰 影响 行走 冯 明说 约 20分钟 后 城管 执法 车 赶到 驱散 路边 商贩 过程 中 看到 城管 人员 商贩 一些 类似 举报 记录单 双方 交流 城管 执法 人员 走 后 商贩 径直 走 面前 称 已 人 认出 举报 电话 手机 号 威胁 说 城管 查抄 车 回来 找 冯 明说 冯明 称 次日 中午 12时 丰台 城管 再次 出动 查 扣 郭公庄 路 当天 出摊 10 辆 鱼贩 车 当晚 七八点 钟 鱼贩 找到 冯明 家 冯 明 邻居 刘 先生 证实 当晚 辆 面包车 载 10 余 人 堵 冯 明 家门口 车上 板 砖 钢管 堆 人 人 坏 办 冯明 房东 寇 先生 说 朋友 居中 调停 12月 29日 下午 冯明 鱼贩 城管 支付 罚款 扣 车 款 6500 余 元 后 领 回 扣 车辆 冯 明说 报警 家人 安全 息事宁人 担心 报警 使 后果 严重 车 取 回来 后 鱼 损失 700 元 昨日 中午 位 罚 鱼贩 表示 冯明 举报 避 口 谈 位 同样 罚 鱼贩 告诉 记者 听 瞎说 吃里扒外 一些 鱼贩 记者 透露 举报 挨 罚 鱼贩 确实 知道 举报者 同行 冯 明 吃里扒外 冯明 表示 经营 鱼 水果 举报 想 清理 经营 秩序 好 路段 挤出 块 地方 摆 摊 认为 城管 泄露 举报信息 导致 已 损失 前 日 冯明 前往 花乡 城管 分队 交涉 一年 挣 万 代 缴 罚款 等于 今年 干 冯明 表示 想 做出 臆断 昨日 花乡 城管 分队 工作人员 称 丰台 城管 大队 表示 冯 明 举报 后 城管 进行 执法 泄露 举报者 信息 举报 举报 泄露 电话 必要 冯明 称 城管 交涉 时 举报 记录显示 确实 举报者 北京市 翰盛 律师 事务所 律师 郭召利 认为 执法 部门 泄露 举报人 信息 带来 人身 安全 威胁 伤害 应 给予 一定 保护 相应 补偿 泄露 追 责 郭召利 称 事件 中 执法 部门 做出 罚款 决定 说明 违法 事实 存在 举报人 举报人 缴纳 罚款 说明 受到 某种 威胁 代 缴 罚款 私了 受 法律 保护 代 缴 罚款 举报人 处罚 人 追 讨 6 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | 实验三介绍PPT.pptx 2 | 实验三说明文档.pdf 3 | README.pdf -------------------------------------------------------------------------------- /doc/col_bi-gru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_bi-gru.png -------------------------------------------------------------------------------- /doc/col_bi-lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_bi-lstm.png 
-------------------------------------------------------------------------------- /doc/col_cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_cnn.png -------------------------------------------------------------------------------- /doc/col_gru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_gru.png -------------------------------------------------------------------------------- /doc/col_lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_lstm.png -------------------------------------------------------------------------------- /doc/col_mlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_mlp.png -------------------------------------------------------------------------------- /elmoformanylangs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from .elmo import Embedder 3 | -------------------------------------------------------------------------------- /elmoformanylangs/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import sys 6 | import codecs 7 | import argparse 8 | import logging 9 | import json 10 | import torch 11 | from .modules.embedding_layer import EmbeddingLayer 12 | from .utils import dict2namedtuple 13 | from .frontend import Model 14 | from .frontend import create_batches 15 | import numpy as np 16 | import h5py 17 | 18 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s') 19 | 20 | 21 | def read_corpus(path, max_chars=None): 22 | """ 23 | read raw text file. The format of the input is like, one sentence per line 24 | words are separated by '\t' 25 | 26 | :param path: 27 | :param max_chars: int, the number of maximum characters in a word, this 28 | parameter is used when the model is configured with CNN word encoder. 29 | :return: 30 | """ 31 | dataset = [] 32 | textset = [] 33 | with codecs.open(path, 'r', encoding='utf-8') as fin: 34 | for line in fin.read().strip().split('\n'): 35 | data = [''] 36 | text = [] 37 | for token in line.split('\t'): 38 | text.append(token) 39 | if max_chars is not None and len(token) + 2 > max_chars: 40 | token = token[:max_chars - 2] 41 | data.append(token) 42 | data.append('') 43 | dataset.append(data) 44 | textset.append(text) 45 | return dataset, textset 46 | 47 | 48 | def read_conll_corpus(path, max_chars=None): 49 | """ 50 | read text in CoNLL-U format. 
51 | 52 | :param path: 53 | :param max_chars: 54 | :return: 55 | """ 56 | dataset = [] 57 | textset = [] 58 | with codecs.open(path, 'r', encoding='utf-8') as fin: 59 | for payload in fin.read().strip().split('\n\n'): 60 | data = [''] 61 | text = [] 62 | lines = payload.splitlines() 63 | body = [line for line in lines if not line.startswith('#')] 64 | for line in body: 65 | fields = line.split('\t') 66 | num, token = fields[0], fields[1] 67 | if '-' in num or '.' in num: 68 | continue 69 | text.append(token) 70 | if max_chars is not None and len(token) + 2 > max_chars: 71 | token = token[:max_chars - 2] 72 | data.append(token) 73 | data.append('') 74 | dataset.append(data) 75 | textset.append(text) 76 | return dataset, textset 77 | 78 | 79 | def read_conll_char_corpus(path, max_chars=None): 80 | """ 81 | 82 | :param path: 83 | :param max_chars: 84 | :return: 85 | """ 86 | dataset = [] 87 | textset = [] 88 | with codecs.open(path, 'r', encoding='utf-8') as fin: 89 | for payload in fin.read().strip().split('\n\n'): 90 | data = [''] 91 | text = [] 92 | lines = payload.splitlines() 93 | body = [line for line in lines if not line.startswith('#')] 94 | for line in body: 95 | fields = line.split('\t') 96 | num, token = fields[0], fields[1] 97 | if '-' in num or '.' in num: 98 | continue 99 | for ch in token: 100 | text.append(ch) 101 | if max_chars is not None and len(ch) + 2 > max_chars: 102 | ch = ch[:max_chars - 2] 103 | data.append(ch) 104 | data.append('') 105 | dataset.append(data) 106 | textset.append(text) 107 | return dataset, textset 108 | 109 | 110 | def read_conll_char_vi_corpus(path, max_chars=None): 111 | """ 112 | 113 | :param path: 114 | :param max_chars: 115 | :return: 116 | """ 117 | dataset = [] 118 | textset = [] 119 | with codecs.open(path, 'r', encoding='utf-8') as fin: 120 | for payload in fin.read().strip().split('\n\n'): 121 | data = [''] 122 | text = [] 123 | lines = payload.splitlines() 124 | body = [line for line in lines if not line.startswith('#')] 125 | for line in body: 126 | fields = line.split('\t') 127 | num, token = fields[0], fields[1] 128 | if '-' in num or '.' in num: 129 | continue 130 | for ch in token.split(): 131 | text.append(ch) 132 | if max_chars is not None and len(ch) + 2 > max_chars: 133 | ch = ch[:max_chars - 2] 134 | data.append(ch) 135 | data.append('') 136 | dataset.append(data) 137 | textset.append(text) 138 | return dataset, textset 139 | 140 | 141 | def test_main(): 142 | # Configurations 143 | cmd = argparse.ArgumentParser('The testing components of') 144 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.') 145 | cmd.add_argument('--input_format', default='plain', choices=('plain', 'conll', 'conll_char', 'conll_char_vi'), 146 | help='the input format.') 147 | cmd.add_argument("--input", help="the path to the raw text file.") 148 | cmd.add_argument("--output_format", default='hdf5', help='the output format. Supported format includes (hdf5, txt).' 149 | ' Use comma to separate the format identifiers,' 150 | ' like \'--output_format=hdf5,plain\'') 151 | cmd.add_argument("--output_prefix", help='the prefix of the output file. The output file is in the format of ' 152 | '..') 153 | cmd.add_argument("--output_layer", help='the target layer to output. 
0 for the word encoder, 1 for the first LSTM ' 154 | 'hidden layer, 2 for the second LSTM hidden layer, -1 for an average' 155 | 'of 3 layers.') 156 | cmd.add_argument("--model", required=True, help="the path to the model.") 157 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.') 158 | args = cmd.parse_args(sys.argv[2:]) 159 | 160 | if args.gpu >= 0: 161 | torch.cuda.set_device(args.gpu) 162 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 163 | # load the model configurations 164 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config_rnn.json'), 'r', encoding='utf-8'))) 165 | 166 | with open(os.path.join(args.model, args2.config_path), 'r') as fin: 167 | config = json.load(fin) 168 | 169 | # For the model trained with character-based word encoder. 170 | if config['token_embedder']['char_dim'] > 0: 171 | char_lexicon = {} 172 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi: 173 | for line in fpi: 174 | tokens = line.strip().split('\t') 175 | if len(tokens) == 1: 176 | tokens.insert(0, '\u3000') 177 | token, i = tokens 178 | char_lexicon[token] = int(i) 179 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None) 180 | logging.info('char embedding size: ' + str(len(char_emb_layer.word2id))) 181 | else: 182 | char_lexicon = None 183 | char_emb_layer = None 184 | 185 | # For the model trained with word form word encoder. 186 | if config['token_embedder']['word_dim'] > 0: 187 | word_lexicon = {} 188 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi: 189 | for line in fpi: 190 | tokens = line.strip().split('\t') 191 | if len(tokens) == 1: 192 | tokens.insert(0, '\u3000') 193 | token, i = tokens 194 | word_lexicon[token] = int(i) 195 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None) 196 | logging.info('word embedding size: ' + str(len(word_emb_layer.word2id))) 197 | else: 198 | word_lexicon = None 199 | word_emb_layer = None 200 | 201 | # instantiate the model 202 | model = Model(config, word_emb_layer, char_emb_layer, use_cuda) 203 | 204 | if use_cuda: 205 | model.cuda() 206 | 207 | logging.info(str(model)) 208 | model.load_model(args.model) 209 | 210 | # read test data according to input format 211 | read_function = read_corpus if args.input_format == 'plain' else ( 212 | read_conll_corpus if args.input_format == 'conll' else ( 213 | read_conll_char_corpus if args.input_format == 'conll_char' else read_conll_char_vi_corpus)) 214 | 215 | if config['token_embedder']['name'].lower() == 'cnn': 216 | test, text = read_function(args.input, config['token_embedder']['max_characters_per_token']) 217 | else: 218 | test, text = read_function(args.input) 219 | 220 | # create test batches from the input data. 221 | test_w, test_c, test_lens, test_masks, test_text = create_batches( 222 | test, args.batch_size, word_lexicon, char_lexicon, config, text=text) 223 | 224 | # configure the model to evaluation mode. 
225 | model.eval() 226 | 227 | sent_set = set() 228 | cnt = 0 229 | 230 | output_formats = args.output_format.split(',') 231 | output_layers = map(int, args.output_layer.split(',')) 232 | 233 | handlers = {} 234 | for output_format in output_formats: 235 | if output_format not in ('hdf5', 'txt'): 236 | print('Unknown output_format: {0}'.format(output_format)) 237 | continue 238 | for output_layer in output_layers: 239 | filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format) 240 | handlers[output_format, output_layer] = \ 241 | h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w') 242 | 243 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 244 | output = model.forward(w, c, masks) 245 | for i, text in enumerate(texts): 246 | sent = '\t'.join(text) 247 | sent = sent.replace('.', '$period$') 248 | sent = sent.replace('/', '$backslash$') 249 | if sent in sent_set: 250 | continue 251 | sent_set.add(sent) 252 | if config['encoder']['name'].lower() == 'lstm': 253 | data = output[i, 1:lens[i]-1, :].data 254 | if use_cuda: 255 | data = data.cpu() 256 | data = data.numpy() 257 | elif config['encoder']['name'].lower() == 'elmo': 258 | data = output[:, i, 1:lens[i]-1, :].data 259 | if use_cuda: 260 | data = data.cpu() 261 | data = data.numpy() 262 | 263 | for (output_format, output_layer) in handlers: 264 | fout = handlers[output_format, output_layer] 265 | if output_layer == -1: 266 | payload = np.average(data, axis=0) 267 | else: 268 | payload = data[output_layer] 269 | if output_format == 'hdf5': 270 | fout.create_dataset(sent, payload.shape, dtype='float32', data=payload) 271 | else: 272 | for word, row in zip(text, payload): 273 | print('{0}\t{1}'.format(word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout) 274 | print('', file=fout) 275 | 276 | cnt += 1 277 | if cnt % 1000 == 0: 278 | logging.info('Finished {0} sentences.'.format(cnt)) 279 | for _, handler in handlers.items(): 280 | handler.close() 281 | 282 | 283 | if __name__ == "__main__": 284 | if len(sys.argv) > 1 and sys.argv[1] == 'test': 285 | test_main() 286 | else: 287 | print('Usage: {0} [test] [options]'.format(sys.argv[0]), file=sys.stderr) 288 | -------------------------------------------------------------------------------- /elmoformanylangs/biLM.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import errno 6 | import sys 7 | import codecs 8 | import argparse 9 | import time 10 | import random 11 | import logging 12 | import json 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | import torch.optim as optim 17 | from torch.autograd import Variable 18 | from .modules.elmo import ElmobiLm 19 | from .modules.lstm import LstmbiLm 20 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder 21 | from .modules.embedding_layer import EmbeddingLayer 22 | from .modules.classify_layer import SoftmaxLayer, CNNSoftmaxLayer, SampledSoftmaxLayer 23 | from .dataloader import load_embedding 24 | from .utils import dict2namedtuple 25 | from collections import Counter 26 | import numpy as np 27 | 28 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s') 29 | 30 | 31 | def divide(data, valid_size): 32 | valid_size = min(valid_size, len(data) // 10) 33 | random.shuffle(data) 34 | return 
data[valid_size:], data[:valid_size] 35 | 36 | 37 | def break_sentence(sentence, max_sent_len): 38 | """ 39 | For example, for a sentence with 70 words, supposing the the `max_sent_len' 40 | is 30, break it into 3 sentences. 41 | 42 | :param sentence: list[str] the sentence 43 | :param max_sent_len: 44 | :return: 45 | """ 46 | ret = [] 47 | cur = 0 48 | length = len(sentence) 49 | while cur < length: 50 | if cur + max_sent_len + 5 >= length: 51 | ret.append(sentence[cur: length]) 52 | break 53 | ret.append(sentence[cur: min(length, cur + max_sent_len)]) 54 | cur += max_sent_len 55 | return ret 56 | 57 | 58 | def read_corpus(path, max_chars=None, max_sent_len=20): 59 | """ 60 | read raw text file 61 | :param path: str 62 | :param max_chars: int 63 | :param max_sent_len: int 64 | :return: 65 | """ 66 | data = [] 67 | with codecs.open(path, 'r', encoding='utf-8') as fin: 68 | for line in fin: 69 | data.append('') 70 | for token in line.strip().split(): 71 | if max_chars is not None and len(token) + 2 > max_chars: 72 | token = token[:max_chars - 2] 73 | data.append(token) 74 | data.append('') 75 | dataset = break_sentence(data, max_sent_len) 76 | return dataset 77 | 78 | 79 | def create_one_batch(x, word2id, char2id, config, oov='', pad='', sort=True): 80 | """ 81 | 82 | :param x: 83 | :param word2id: dict 84 | :param char2id: dict 85 | :param config: 86 | :param oov: 87 | :param pad: 88 | :param sort: 89 | :return: 90 | """ 91 | batch_size = len(x) 92 | lst = list(range(batch_size)) 93 | if sort: 94 | lst.sort(key=lambda l: -len(x[l])) 95 | 96 | x = [x[i] for i in lst] 97 | lens = [len(x[i]) for i in lst] 98 | max_len = max(lens) 99 | 100 | if word2id is not None: 101 | oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None) 102 | assert oov_id is not None and pad_id is not None 103 | batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id) 104 | for i, x_i in enumerate(x): 105 | for j, x_ij in enumerate(x_i): 106 | batch_w[i][j] = word2id.get(x_ij, oov_id) 107 | else: 108 | batch_w = None 109 | 110 | if char2id is not None: 111 | bow_id, eow_id, oov_id, pad_id = char2id.get('', None), char2id.get('', None), char2id.get(oov, None), char2id.get(pad, None) 112 | 113 | assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None 114 | 115 | if config['token_embedder']['name'].lower() == 'cnn': 116 | max_chars = config['token_embedder']['max_characters_per_token'] 117 | assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars 118 | elif config['token_embedder']['name'].lower() == 'lstm': 119 | max_chars = max([len(w) for i in lst for w in x[i]]) + 2 # counting the and 120 | 121 | batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id) 122 | 123 | for i, x_i in enumerate(x): 124 | for j, x_ij in enumerate(x_i): 125 | batch_c[i][j][0] = bow_id 126 | if x_ij == '' or x_ij == '': 127 | batch_c[i][j][1] = char2id.get(x_ij) 128 | batch_c[i][j][2] = eow_id 129 | else: 130 | for k, c in enumerate(x_ij): 131 | batch_c[i][j][k + 1] = char2id.get(c, oov_id) 132 | batch_c[i][j][len(x_ij) + 1] = eow_id 133 | else: 134 | batch_c = None 135 | 136 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []] 137 | 138 | for i, x_i in enumerate(x): 139 | for j in range(len(x_i)): 140 | masks[0][i][j] = 1 141 | if j + 1 < len(x_i): 142 | masks[1].append(i * max_len + j) 143 | if j > 0: 144 | masks[2].append(i * max_len + j) 145 | 146 | assert len(masks[1]) <= batch_size * max_len 147 | assert len(masks[2]) <= batch_size * max_len 148 | 
149 | masks[1] = torch.LongTensor(masks[1]) 150 | masks[2] = torch.LongTensor(masks[2]) 151 | 152 | return batch_w, batch_c, lens, masks 153 | 154 | 155 | # shuffle training examples and create mini-batches 156 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, use_cuda=False): 157 | """ 158 | 159 | :param x: 160 | :param batch_size: 161 | :param word2id: 162 | :param char2id: 163 | :param config: 164 | :param perm: 165 | :param shuffle: 166 | :param sort: 167 | :param use_cuda: 168 | :return: 169 | """ 170 | lst = perm or list(range(len(x))) 171 | if shuffle: 172 | random.shuffle(lst) 173 | 174 | if sort: 175 | lst.sort(key=lambda l: -len(x[l])) 176 | 177 | x = [x[i] for i in lst] 178 | 179 | sum_len = 0.0 180 | batches_w, batches_c, batches_lens, batches_masks = [], [], [], [] 181 | size = batch_size 182 | nbatch = (len(x) - 1) // size + 1 183 | for i in range(nbatch): 184 | start_id, end_id = i * size, (i + 1) * size 185 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 186 | sum_len += sum(blens) 187 | batches_w.append(bw) 188 | batches_c.append(bc) 189 | batches_lens.append(blens) 190 | batches_masks.append(bmasks) 191 | 192 | if sort: 193 | perm = list(range(nbatch)) 194 | random.shuffle(perm) 195 | batches_w = [batches_w[i] for i in perm] 196 | batches_c = [batches_c[i] for i in perm] 197 | batches_lens = [batches_lens[i] for i in perm] 198 | batches_masks = [batches_masks[i] for i in perm] 199 | 200 | logging.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x))) 201 | return batches_w, batches_c, batches_lens, batches_masks 202 | 203 | 204 | class Model(nn.Module): 205 | def __init__(self, config, word_emb_layer, char_emb_layer, n_class, use_cuda=False): 206 | super(Model, self).__init__() 207 | self.use_cuda = use_cuda 208 | self.config = config 209 | 210 | if config['token_embedder']['name'].lower() == 'cnn': 211 | self.token_embedder = ConvTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda) 212 | elif config['token_embedder']['name'].lower() == 'lstm': 213 | self.token_embedder = LstmTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda) 214 | 215 | if config['encoder']['name'].lower() == 'elmo': 216 | self.encoder = ElmobiLm(config, use_cuda) 217 | elif config['encoder']['name'].lower() == 'lstm': 218 | self.encoder = LstmbiLm(config, use_cuda) 219 | 220 | self.output_dim = config['encoder']['projection_dim'] 221 | if config['classifier']['name'].lower() == 'softmax': 222 | self.classify_layer = SoftmaxLayer(self.output_dim, n_class) 223 | elif config['classifier']['name'].lower() == 'cnn_softmax': 224 | self.classify_layer = CNNSoftmaxLayer(self.token_embedder, self.output_dim, n_class, 225 | config['classifier']['n_samples'], config['classifier']['corr_dim'], 226 | use_cuda) 227 | elif config['classifier']['name'].lower() == 'sampled_softmax': 228 | self.classify_layer = SampledSoftmaxLayer(self.output_dim, n_class, config['classifier']['n_samples'], use_cuda) 229 | 230 | def forward(self, word_inp, chars_inp, mask_package): 231 | """ 232 | 233 | :param word_inp: 234 | :param chars_inp: 235 | :param mask_package: Tuple[] 236 | :return: 237 | """ 238 | classifier_name = self.config['classifier']['name'].lower() 239 | 240 | if self.training and classifier_name == 'cnn_softmax' or classifier_name == 'sampled_softmax': 241 | self.classify_layer.update_negative_samples(word_inp, chars_inp, mask_package[0]) 242 | 
self.classify_layer.update_embedding_matrix() 243 | 244 | token_embedding = self.token_embedder(word_inp, chars_inp, (mask_package[0].size(0), mask_package[0].size(1))) 245 | token_embedding = F.dropout(token_embedding, self.config['dropout'], self.training) 246 | 247 | encoder_name = self.config['encoder']['name'].lower() 248 | if encoder_name == 'elmo': 249 | mask = Variable(mask_package[0].cuda()).cuda() if self.use_cuda else Variable(mask_package[0]) 250 | encoder_output = self.encoder(token_embedding, mask) 251 | encoder_output = encoder_output[1] 252 | # [batch_size, len, hidden_size] 253 | elif encoder_name == 'lstm': 254 | encoder_output = self.encoder(token_embedding) 255 | else: 256 | raise ValueError('') 257 | 258 | encoder_output = F.dropout(encoder_output, self.config['dropout'], self.training) 259 | forward, backward = encoder_output.split(self.output_dim, 2) 260 | 261 | word_inp = Variable(word_inp) 262 | if self.use_cuda: 263 | word_inp = word_inp.cuda() 264 | 265 | mask1 = Variable(mask_package[1].cuda()).cuda() if self.use_cuda else Variable(mask_package[1]) 266 | mask2 = Variable(mask_package[2].cuda()).cuda() if self.use_cuda else Variable(mask_package[2]) 267 | 268 | forward_x = forward.contiguous().view(-1, self.output_dim).index_select(0, mask1) 269 | forward_y = word_inp.contiguous().view(-1).index_select(0, mask2) 270 | 271 | backward_x = backward.contiguous().view(-1, self.output_dim).index_select(0, mask2) 272 | backward_y = word_inp.contiguous().view(-1).index_select(0, mask1) 273 | 274 | return self.classify_layer(forward_x, forward_y), self.classify_layer(backward_x, backward_y) 275 | 276 | def save_model(self, path, save_classify_layer): 277 | torch.save(self.token_embedder.state_dict(), os.path.join(path, 'token_embedder.pkl')) 278 | torch.save(self.encoder.state_dict(), os.path.join(path, 'encoder.pkl')) 279 | if save_classify_layer: 280 | torch.save(self.classify_layer.state_dict(), os.path.join(path, 'classifier.pkl')) 281 | 282 | def load_model(self, path): 283 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'))) 284 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'))) 285 | self.classify_layer.load_state_dict(torch.load(os.path.join(path, 'classifier.pkl'))) 286 | 287 | 288 | def eval_model(model, valid): 289 | model.eval() 290 | if model.config['classifier']['name'].lower() == 'cnn_softmax' or \ 291 | model.config['classifier']['name'].lower() == 'sampled_softmax': 292 | model.classify_layer.update_embedding_matrix() 293 | total_loss, total_tag = 0.0, 0 294 | valid_w, valid_c, valid_lens, valid_masks = valid 295 | for w, c, lens, masks in zip(valid_w, valid_c, valid_lens, valid_masks): 296 | loss_forward, loss_backward = model.forward(w, c, masks) 297 | total_loss += loss_forward.data[0] 298 | n_tags = sum(lens) 299 | total_tag += n_tags 300 | model.train() 301 | return np.exp(total_loss / total_tag) 302 | 303 | 304 | def train_model(epoch, opt, model, optimizer, 305 | train, valid, test, best_train, best_valid, test_result): 306 | """ 307 | Training model for one epoch 308 | 309 | :param epoch: 310 | :param opt: 311 | :param model: 312 | :param optimizer: 313 | :param train: 314 | :param best_train: 315 | :param valid: 316 | :param best_valid: 317 | :param test: 318 | :param test_result: 319 | :return: 320 | """ 321 | model.train() 322 | 323 | total_loss, total_tag = 0.0, 0 324 | cnt = 0 325 | start_time = time.time() 326 | 327 | train_w, train_c, train_lens, train_masks = train 328 | 329 
| lst = list(range(len(train_w))) 330 | random.shuffle(lst) 331 | 332 | train_w = [train_w[l] for l in lst] 333 | train_c = [train_c[l] for l in lst] 334 | train_lens = [train_lens[l] for l in lst] 335 | train_masks = [train_masks[l] for l in lst] 336 | 337 | for w, c, lens, masks in zip(train_w, train_c, train_lens, train_masks): 338 | cnt += 1 339 | model.zero_grad() 340 | loss_forward, loss_backward = model.forward(w, c, masks) 341 | 342 | loss = (loss_forward + loss_backward) / 2.0 343 | total_loss += loss_forward.data[0] 344 | n_tags = sum(lens) 345 | total_tag += n_tags 346 | loss.backward() 347 | 348 | torch.nn.utils.clip_grad_norm(model.parameters(), opt.clip_grad) 349 | optimizer.step() 350 | if cnt * opt.batch_size % 1024 == 0: 351 | logging.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f} time={:.2f}s".format( 352 | epoch, cnt, optimizer.param_groups[0]['lr'], 353 | np.exp(total_loss / total_tag), time.time() - start_time 354 | )) 355 | start_time = time.time() 356 | 357 | if cnt % opt.eval_steps == 0 or cnt % len(train_w) == 0: 358 | if valid is None: 359 | train_ppl = np.exp(total_loss / total_tag) 360 | logging.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f}".format( 361 | epoch, cnt, optimizer.param_groups[0]['lr'], train_ppl)) 362 | if train_ppl < best_train: 363 | best_train = train_ppl 364 | logging.info("New record achieved on training dataset!") 365 | model.save_model(opt.model, opt.save_classify_layer) 366 | else: 367 | valid_ppl = eval_model(model, valid) 368 | logging.info("Epoch={} iter={} lr={:.6f} valid_ppl={:.6f}".format( 369 | epoch, cnt, optimizer.param_groups[0]['lr'], valid_ppl)) 370 | 371 | if valid_ppl < best_valid: 372 | model.save_model(opt.model, opt.save_classify_layer) 373 | best_valid = valid_ppl 374 | logging.info("New record achieved!") 375 | 376 | if test is not None: 377 | test_result = eval_model(model, test) 378 | logging.info("Epoch={} iter={} lr={:.6f} test_ppl={:.6f}".format( 379 | epoch, cnt, optimizer.param_groups[0]['lr'], test_result)) 380 | return best_train, best_valid, test_result 381 | 382 | 383 | def get_truncated_vocab(dataset, min_count): 384 | """ 385 | 386 | :param dataset: 387 | :param min_count: int 388 | :return: 389 | """ 390 | word_count = Counter() 391 | for sentence in dataset: 392 | word_count.update(sentence) 393 | 394 | word_count = list(word_count.items()) 395 | word_count.sort(key=lambda x: x[1], reverse=True) 396 | 397 | i = 0 398 | for word, count in word_count: 399 | if count < min_count: 400 | break 401 | i += 1 402 | 403 | logging.info('Truncated word count: {0}.'.format(sum([count for word, count in word_count[i:]]))) 404 | logging.info('Original vocabulary size: {0}.'.format(len(word_count))) 405 | return word_count[:i] 406 | 407 | 408 | def train(): 409 | cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve') 410 | cmd.add_argument('--seed', default=1, type=int, help='The random seed.') 411 | cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.') 412 | 413 | cmd.add_argument('--train_path', required=True, help='The path to the training file.') 414 | cmd.add_argument('--valid_path', help='The path to the development file.') 415 | cmd.add_argument('--test_path', help='The path to the testing file.') 416 | 417 | cmd.add_argument('--config_path', required=True, help='the path to the config file.') 418 | cmd.add_argument("--word_embedding", help="The path to word vectors.") 419 | 420 | cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'], 
421 | help='the type of optimizer: valid options=[sgd, adam, adagrad]') 422 | cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.') 423 | cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.') 424 | 425 | cmd.add_argument("--model", required=True, help="path to save model") 426 | 427 | cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.') 428 | cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.') 429 | 430 | cmd.add_argument("--clip_grad", type=float, default=5, help='the tense of clipped grad.') 431 | 432 | cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.') 433 | 434 | cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.') 435 | 436 | cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.') 437 | 438 | cmd.add_argument('--save_classify_layer', default=False, action='store_true', 439 | help="whether to save the classify layer") 440 | 441 | cmd.add_argument('--valid_size', type=int, default=0, help="size of validation dataset when there's no valid.") 442 | cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.') 443 | 444 | opt = cmd.parse_args(sys.argv[2:]) 445 | 446 | with open(opt.config_path, 'r') as fin: 447 | config = json.load(fin) 448 | 449 | # Dump configurations 450 | print(opt) 451 | print(config) 452 | 453 | # set seed. 454 | torch.manual_seed(opt.seed) 455 | random.seed(opt.seed) 456 | if opt.gpu >= 0: 457 | torch.cuda.set_device(opt.gpu) 458 | if opt.seed > 0: 459 | torch.cuda.manual_seed(opt.seed) 460 | 461 | use_cuda = opt.gpu >= 0 and torch.cuda.is_available() 462 | 463 | token_embedder_name = config['token_embedder']['name'].lower() 464 | token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None) 465 | if token_embedder_name == 'cnn': 466 | train_data = read_corpus(opt.train_path, token_embedder_max_chars, opt.max_sent_len) 467 | elif token_embedder_name == 'lstm': 468 | train_data = read_corpus(opt.train_path, opt.max_sent_len) 469 | else: 470 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name)) 471 | 472 | logging.info('training instance: {}, training tokens: {}.'.format(len(train_data), 473 | sum([len(s) - 1 for s in train_data]))) 474 | 475 | if opt.valid_path is not None: 476 | if token_embedder_name == 'cnn': 477 | valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len) 478 | elif token_embedder_name == 'lstm': 479 | valid_data = read_corpus(opt.valid_path, opt.max_sent_len) 480 | else: 481 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name)) 482 | logging.info('valid instance: {}, valid tokens: {}.'.format(len(valid_data), 483 | sum([len(s) - 1 for s in valid_data]))) 484 | elif opt.valid_size > 0: 485 | train_data, valid_data = divide(train_data, opt.valid_size) 486 | logging.info('training instance: {}, training tokens after division: {}.'.format( 487 | len(train_data), sum([len(s) - 1 for s in train_data]))) 488 | logging.info('valid instance: {}, valid tokens: {}.'.format( 489 | len(valid_data), sum([len(s) - 1 for s in valid_data]))) 490 | else: 491 | valid_data = None 492 | 493 | if opt.test_path is not None: 494 | if token_embedder_name == 'cnn': 495 | test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len) 496 | elif token_embedder_name == 'lstm': 
497 | test_data = read_corpus(opt.test_path, opt.max_sent_len) 498 | else: 499 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name)) 500 | logging.info('testing instance: {}, testing tokens: {}.'.format( 501 | len(test_data), sum([len(s) - 1 for s in test_data]))) 502 | else: 503 | test_data = None 504 | 505 | if opt.word_embedding is not None: 506 | embs = load_embedding(opt.word_embedding) 507 | word_lexicon = {word: i for i, word in enumerate(embs[0])} 508 | else: 509 | embs = None 510 | word_lexicon = {} 511 | 512 | # Maintain the vocabulary. vocabulary is used in either WordEmbeddingInput or softmax classification 513 | vocab = get_truncated_vocab(train_data, opt.min_count) 514 | 515 | # Ensure index of '' is 0 516 | for special_word in ['', '', '', '']: 517 | if special_word not in word_lexicon: 518 | word_lexicon[special_word] = len(word_lexicon) 519 | 520 | for word, _ in vocab: 521 | if word not in word_lexicon: 522 | word_lexicon[word] = len(word_lexicon) 523 | 524 | # Word Embedding 525 | if config['token_embedder']['word_dim'] > 0: 526 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=embs) 527 | logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id))) 528 | else: 529 | word_emb_layer = None 530 | logging.info('Vocabulary size: {0}'.format(len(word_lexicon))) 531 | 532 | # Character Lexicon 533 | if config['token_embedder']['char_dim'] > 0: 534 | char_lexicon = {} 535 | for sentence in train_data: 536 | for word in sentence: 537 | for ch in word: 538 | if ch not in char_lexicon: 539 | char_lexicon[ch] = len(char_lexicon) 540 | 541 | for special_char in ['', '', '', '', '', '']: 542 | if special_char not in char_lexicon: 543 | char_lexicon[special_char] = len(char_lexicon) 544 | 545 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False) 546 | logging.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id))) 547 | else: 548 | char_lexicon = None 549 | char_emb_layer = None 550 | 551 | train = create_batches( 552 | train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda) 553 | 554 | if opt.eval_steps is None: 555 | opt.eval_steps = len(train[0]) 556 | logging.info('Evaluate every {0} batches.'.format(opt.eval_steps)) 557 | 558 | if valid_data is not None: 559 | valid = create_batches( 560 | valid_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda) 561 | else: 562 | valid = None 563 | 564 | if test_data is not None: 565 | test = create_batches( 566 | test_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda) 567 | else: 568 | test = None 569 | 570 | label_to_ix = word_lexicon 571 | logging.info('vocab size: {0}'.format(len(label_to_ix))) 572 | 573 | nclasses = len(label_to_ix) 574 | 575 | model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda) 576 | logging.info(str(model)) 577 | if use_cuda: 578 | model = model.cuda() 579 | 580 | need_grad = lambda x: x.requires_grad 581 | if opt.optimizer.lower() == 'adam': 582 | optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr) 583 | elif opt.optimizer.lower() == 'sgd': 584 | optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr) 585 | elif opt.optimizer.lower() == 'adagrad': 586 | optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr) 587 | else: 588 | raise ValueError('Unknown 
optimizer {}'.format(opt.optimizer.lower())) 589 | 590 | try: 591 | os.makedirs(opt.model) 592 | except OSError as exception: 593 | if exception.errno != errno.EEXIST: 594 | raise 595 | 596 | if config['token_embedder']['char_dim'] > 0: 597 | with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo: 598 | for ch, i in char_emb_layer.word2id.items(): 599 | print('{0}\t{1}'.format(ch, i), file=fpo) 600 | 601 | with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo: 602 | for w, i in word_lexicon.items(): 603 | print('{0}\t{1}'.format(w, i), file=fpo) 604 | 605 | json.dump(vars(opt), codecs.open(os.path.join(opt.model, 'config_rnn.json'), 'w', encoding='utf-8')) 606 | 607 | best_train = 1e+8 608 | best_valid = 1e+8 609 | test_result = 1e+8 610 | 611 | for epoch in range(opt.max_epoch): 612 | best_train, best_valid, test_result = train_model(epoch, opt, model, optimizer, 613 | train, valid, test, best_train, best_valid, test_result) 614 | if opt.lr_decay > 0: 615 | optimizer.param_groups[0]['lr'] *= opt.lr_decay 616 | 617 | if valid_data is None: 618 | logging.info("best train ppl: {:.6f}.".format(best_train)) 619 | elif test_data is None: 620 | logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(best_train, best_valid)) 621 | else: 622 | logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(best_train, best_valid, test_result)) 623 | 624 | 625 | def test(): 626 | cmd = argparse.ArgumentParser('The testing components of') 627 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.') 628 | cmd.add_argument("--input", help="the path to the raw text file.") 629 | cmd.add_argument("--model", required=True, help="path to save model") 630 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.') 631 | args = cmd.parse_args(sys.argv[2:]) 632 | 633 | if args.gpu >= 0: 634 | torch.cuda.set_device(args.gpu) 635 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 636 | 637 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config_rnn.json'), 'r', encoding='utf-8'))) 638 | 639 | with open(args2.config_path, 'r') as fin: 640 | config = json.load(fin) 641 | 642 | if config['token_embedder']['char_dim'] > 0: 643 | char_lexicon = {} 644 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi: 645 | for line in fpi: 646 | tokens = line.strip().split('\t') 647 | if len(tokens) == 1: 648 | tokens.insert(0, '\u3000') 649 | token, i = tokens 650 | char_lexicon[token] = int(i) 651 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False) 652 | logging.info('char embedding size: ' + str(len(char_emb_layer.word2id))) 653 | else: 654 | char_lexicon = None 655 | char_emb_layer = None 656 | 657 | word_lexicon = {} 658 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi: 659 | for line in fpi: 660 | tokens = line.strip().split('\t') 661 | if len(tokens) == 1: 662 | tokens.insert(0, '\u3000') 663 | token, i = tokens 664 | word_lexicon[token] = int(i) 665 | 666 | if config['token_embedder']['word_dim'] > 0: 667 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None) 668 | logging.info('word embedding size: ' + str(len(word_emb_layer.word2id))) 669 | else: 670 | word_emb_layer = None 671 | 672 | model = Model(config, word_emb_layer, char_emb_layer, len(word_lexicon), 
use_cuda) 673 | 674 | if use_cuda: 675 | model.cuda() 676 | 677 | logging.info(str(model)) 678 | model.load_model(args.model) 679 | if config['token_embedder']['name'].lower() == 'cnn': 680 | test = read_corpus(args.input, config['token_embedder']['max_characters_per_token'], max_sent_len=10000) 681 | elif config['token_embedder']['name'].lower() == 'lstm': 682 | test = read_corpus(args.input, max_sent_len=10000) 683 | else: 684 | raise ValueError('') 685 | 686 | test_w, test_c, test_lens, test_masks = create_batches( 687 | test, args.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda) 688 | 689 | test_result = eval_model(model, (test_w, test_c, test_lens, test_masks)) 690 | 691 | logging.info("test_ppl={:.6f}".format(test_result)) 692 | 693 | 694 | if __name__ == "__main__": 695 | if len(sys.argv) > 1 and sys.argv[1] == 'train': 696 | train() 697 | elif len(sys.argv) > 1 and sys.argv[1] == 'test': 698 | test() 699 | else: 700 | print('Usage: {0} [train|test] [options]'.format(sys.argv[0]), file=sys.stderr) 701 | -------------------------------------------------------------------------------- /elmoformanylangs/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | import codecs 4 | import numpy as np 5 | 6 | 7 | def pad(sequences, pad_token='', pad_left=False): 8 | """ 9 | input sequences is a list of text sequence [[str]] 10 | pad each text sequence to the length of the longest 11 | 12 | :param sequences: 13 | :param pad_token: 14 | :param pad_left: 15 | :return: 16 | """ 17 | # max_len = max(5,max(len(seq) for seq in sequences)) 18 | max_len = max(len(seq) for seq in sequences) 19 | if pad_left: 20 | return [[pad_token]*(max_len-len(seq)) + seq for seq in sequences] 21 | return [seq + [pad_token]*(max_len-len(seq)) for seq in sequences] 22 | 23 | 24 | def load_embedding_npz(path): 25 | data = np.load(path) 26 | return [str(w) for w in data['words']], data['vals'] 27 | 28 | 29 | def load_embedding_txt(path): 30 | words = [] 31 | vals = [] 32 | with codecs.open(path, 'r', encoding='utf-8') as fin: 33 | fin.readline() 34 | for line in fin: 35 | line = line.strip() 36 | if line: 37 | parts = line.split() 38 | words.append(parts[0]) 39 | vals += [float(x) for x in parts[1:]] # equal to append 40 | return words, np.asarray(vals).reshape(len(words), -1) # reshape 41 | 42 | 43 | def load_embedding(path): 44 | if path.endswith(".npz"): 45 | return load_embedding_npz(path) 46 | else: 47 | return load_embedding_txt(path) 48 | -------------------------------------------------------------------------------- /elmoformanylangs/elmo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import codecs 6 | import random 7 | import logging 8 | import json 9 | import torch 10 | from .modules.embedding_layer import EmbeddingLayer 11 | from .utils import dict2namedtuple 12 | from .frontend import create_one_batch 13 | from .frontend import Model 14 | import numpy as np 15 | 16 | logging.basicConfig(level=logging.INFO, 17 | format='%(asctime)-15s %(levelname)s: %(message)s') 18 | 19 | 20 | def read_list(sents, max_chars=None): 21 | """ 22 | read raw text file. 
The format of the input is like, one sentence per line 23 | words are separated by '\t' 24 | 25 | :param path: 26 | :param max_chars: int, the number of maximum characters in a word, this 27 | parameter is used when the model is configured with CNN word encoder. 28 | :return: 29 | """ 30 | dataset = [] 31 | textset = [] 32 | for sent in sents: 33 | data = [''] 34 | text = [] 35 | for token in sent: 36 | text.append(token) 37 | if max_chars is not None and len(token) + 2 > max_chars: 38 | token = token[:max_chars - 2] 39 | data.append(token) 40 | data.append('') 41 | dataset.append(data) 42 | textset.append(text) 43 | return dataset, textset 44 | 45 | 46 | def recover(li, ind): 47 | # li[piv], ind = torch.sort(li[piv], dim=0, descending=(not unsort)) 48 | dummy = list(range(len(ind))) 49 | dummy.sort(key=lambda l: ind[l]) 50 | li = [li[i] for i in dummy] 51 | return li 52 | 53 | 54 | # shuffle training examples and create mini-batches 55 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=False, sort=True, text=None): 56 | ind = list(range(len(x))) 57 | lst = perm or list(range(len(x))) 58 | if shuffle: 59 | random.shuffle(lst) 60 | 61 | if sort: 62 | lst.sort(key=lambda l: -len(x[l])) 63 | 64 | x = [x[i] for i in lst] 65 | ind = [ind[i] for i in lst] 66 | if text is not None: 67 | text = [text[i] for i in lst] 68 | 69 | sum_len = 0.0 70 | batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], [] 71 | size = batch_size 72 | nbatch = (len(x) - 1) // size + 1 73 | for i in range(nbatch): 74 | start_id, end_id = i * size, (i + 1) * size 75 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 76 | sum_len += sum(blens) 77 | batches_w.append(bw) 78 | batches_c.append(bc) 79 | batches_lens.append(blens) 80 | batches_masks.append(bmasks) 81 | batches_ind.append(ind[start_id: end_id]) 82 | if text is not None: 83 | batches_text.append(text[start_id: end_id]) 84 | 85 | if sort: 86 | perm = list(range(nbatch)) 87 | random.shuffle(perm) 88 | batches_w = [batches_w[i] for i in perm] 89 | batches_c = [batches_c[i] for i in perm] 90 | batches_lens = [batches_lens[i] for i in perm] 91 | batches_masks = [batches_masks[i] for i in perm] 92 | batches_ind = [batches_ind[i] for i in perm] 93 | if text is not None: 94 | batches_text = [batches_text[i] for i in perm] 95 | 96 | logging.info("{} batches, avg len: {:.1f}".format( 97 | nbatch, sum_len / len(x))) 98 | recover_ind = [item for sublist in batches_ind for item in sublist] 99 | if text is not None: 100 | return batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind 101 | return batches_w, batches_c, batches_lens, batches_masks, recover_ind 102 | 103 | 104 | class Embedder(object): 105 | def __init__(self, model_dir, batch_size=4): 106 | self.model_dir = model_dir 107 | self.model, self.config = self.get_model() 108 | self.batch_size = batch_size 109 | 110 | def get_model(self): 111 | # torch.cuda.set_device(1) 112 | self.use_cuda = torch.cuda.is_available() 113 | # load the model configurations 114 | args2 = dict2namedtuple(json.load(codecs.open( 115 | os.path.join(self.model_dir, 'config_rnn.json'), 'r', encoding='utf-8'))) 116 | 117 | with open(os.path.join(self.model_dir, args2.config_path), 'r') as fin: 118 | config = json.load(fin) 119 | 120 | # For the model trained with character-based word encoder. 
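# Hedged aside (toy data, not part of the original source): create_batches() above
# sorts sentences by length before batching and returns recover_ind, and recover(li, ind)
# puts batched outputs back into the original input order, i.e. the element at
# position k ends up at position ind[k]. A minimal round trip:
items = ['a', 'bbb', 'cc']
order = sorted(range(len(items)), key=lambda i: -len(items[i]))  # [1, 2, 0], longest first
shuffled = [items[i] for i in order]                             # ['bbb', 'cc', 'a']
assert recover(shuffled, order) == items                         # original order restored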
121 | if config['token_embedder']['char_dim'] > 0: 122 | self.char_lexicon = {} 123 | with codecs.open(os.path.join(self.model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: 124 | for line in fpi: 125 | tokens = line.strip().split('\t') 126 | if len(tokens) == 1: 127 | tokens.insert(0, '\u3000') 128 | token, i = tokens 129 | self.char_lexicon[token] = int(i) 130 | char_emb_layer = EmbeddingLayer( 131 | config['token_embedder']['char_dim'], self.char_lexicon, fix_emb=False, embs=None) 132 | logging.info('char embedding size: ' + 133 | str(len(char_emb_layer.word2id))) 134 | else: 135 | self.char_lexicon = None 136 | char_emb_layer = None 137 | 138 | # For the model trained with word form word encoder. 139 | if config['token_embedder']['word_dim'] > 0: 140 | self.word_lexicon = {} 141 | with codecs.open(os.path.join(self.model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi: 142 | for line in fpi: 143 | tokens = line.strip().split('\t') 144 | if len(tokens) == 1: 145 | tokens.insert(0, '\u3000') 146 | token, i = tokens 147 | self.word_lexicon[token] = int(i) 148 | word_emb_layer = EmbeddingLayer( 149 | config['token_embedder']['word_dim'], self.word_lexicon, fix_emb=False, embs=None) 150 | logging.info('word embedding size: ' + 151 | str(len(word_emb_layer.word2id))) 152 | else: 153 | self.word_lexicon = None 154 | word_emb_layer = None 155 | 156 | # instantiate the model 157 | model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda) 158 | 159 | if self.use_cuda: 160 | model.cuda() 161 | 162 | logging.info(str(model)) 163 | model.load_model(self.model_dir) 164 | 165 | # read test data according to input format 166 | 167 | # configure the model to evaluation mode. 168 | model.eval() 169 | return model, config 170 | 171 | def sents2elmo(self, sents, output_layer=-1): 172 | read_function = read_list 173 | 174 | if self.config['token_embedder']['name'].lower() == 'cnn': 175 | test, text = read_function(sents, self.config['token_embedder']['max_characters_per_token']) 176 | else: 177 | test, text = read_function(sents) 178 | 179 | # create test batches from the input data. 
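# Hedged usage sketch (hypothetical model directory, not part of the original source):
# Embedder wraps the loading code above and turns tokenized sentences into per-token
# ELMo vectors. output_layer=-1 averages the layers, -2 returns all layers, and a
# non-negative value picks a single layer (see sents2elmo below).
from elmoformanylangs.elmo import Embedder

e = Embedder('/path/to/pretrained_model_dir')   # hypothetical path to a downloaded model
sents = [['今天', '天气', '真', '好'], ['这部', '电影', '很', '糟糕']]
vecs = e.sents2elmo(sents)                      # list of arrays, one per sentence
# vecs[i] has shape (len(sents[i]), 2 * projection_dim) for the default output_layer=-1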
180 | test_w, test_c, test_lens, test_masks, test_text, recover_ind = create_batches( 181 | test, self.batch_size, self.word_lexicon, self.char_lexicon, self.config, text=text) 182 | 183 | cnt = 0 184 | 185 | after_elmo = [] 186 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 187 | output = self.model.forward(w, c, masks) 188 | for i, text in enumerate(texts): 189 | 190 | if self.config['encoder']['name'].lower() == 'lstm': 191 | data = output[i, 1:lens[i]-1, :].data 192 | if self.use_cuda: 193 | data = data.cpu() 194 | data = data.numpy() 195 | elif self.config['encoder']['name'].lower() == 'elmo': 196 | data = output[:, i, 1:lens[i]-1, :].data 197 | if self.use_cuda: 198 | data = data.cpu() 199 | data = data.numpy() 200 | 201 | if output_layer == -1: 202 | payload = np.average(data, axis=0) 203 | elif output_layer == -2: 204 | payload = data 205 | else: 206 | payload = data[output_layer] 207 | after_elmo.append(payload) 208 | 209 | cnt += 1 210 | if cnt % 1000 == 0: 211 | logging.info('Finished {0} sentences.'.format(cnt)) 212 | 213 | after_elmo = recover(after_elmo, recover_ind) 214 | return after_elmo 215 | -------------------------------------------------------------------------------- /elmoformanylangs/frontend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import logging 7 | from torch.autograd import Variable 8 | from .modules.elmo import ElmobiLm 9 | from .modules.lstm import LstmbiLm 10 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder 11 | 12 | 13 | def create_one_batch(x, word2id, char2id, config, oov='', pad='', sort=True): 14 | """ 15 | Create one batch of input. 16 | 17 | :param x: List[List[str]] 18 | :param word2id: Dict | None 19 | :param char2id: Dict | None 20 | :param config: Dict 21 | :param oov: str, the form of OOV token. 22 | :param pad: str, the form of padding token. 23 | :param sort: bool, specify whether sorting the sentences by their lengths. 
24 | :return: 25 | """ 26 | batch_size = len(x) 27 | # lst represents the order of sentences 28 | lst = list(range(batch_size)) 29 | if sort: 30 | lst.sort(key=lambda l: -len(x[l])) 31 | 32 | # shuffle the sentences by 33 | x = [x[i] for i in lst] 34 | lens = [len(x[i]) for i in lst] 35 | max_len = max(lens) 36 | 37 | # get a batch of word id whose size is (batch x max_len) 38 | if word2id is not None: 39 | oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None) 40 | assert oov_id is not None and pad_id is not None 41 | batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id) 42 | for i, x_i in enumerate(x): 43 | for j, x_ij in enumerate(x_i): 44 | batch_w[i][j] = word2id.get(x_ij, oov_id) 45 | else: 46 | batch_w = None 47 | 48 | # get a batch of character id whose size is (batch x max_len x max_chars) 49 | if char2id is not None: 50 | bow_id, eow_id, oov_id, pad_id = [char2id.get(key, None) for key in ('', '', oov, pad)] 51 | 52 | assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None 53 | 54 | if config['token_embedder']['name'].lower() == 'cnn': 55 | max_chars = config['token_embedder']['max_characters_per_token'] 56 | assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars 57 | elif config['token_embedder']['name'].lower() == 'lstm': 58 | # counting the and 59 | max_chars = max([len(w) for i in lst for w in x[i]]) + 2 60 | else: 61 | raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name'])) 62 | 63 | batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id) 64 | 65 | for i, x_i in enumerate(x): 66 | for j, x_ij in enumerate(x_i): 67 | batch_c[i][j][0] = bow_id 68 | if x_ij == '' or x_ij == '': 69 | batch_c[i][j][1] = char2id.get(x_ij) 70 | batch_c[i][j][2] = eow_id 71 | else: 72 | for k, c in enumerate(x_ij): 73 | batch_c[i][j][k + 1] = char2id.get(c, oov_id) 74 | batch_c[i][j][len(x_ij) + 1] = eow_id 75 | else: 76 | batch_c = None 77 | 78 | # mask[0] is the matrix (batch x max_len) indicating whether 79 | # there is an id is valid (not a padding) in this batch. 
80 | # mask[1] stores the flattened ids indicating whether there is a valid 81 | # previous token 82 | # mask[2] stores the flattened ids indicating whether there is a valid 83 | # next token 84 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []] 85 | 86 | for i, x_i in enumerate(x): 87 | for j in range(len(x_i)): 88 | masks[0][i][j] = 1 89 | if j + 1 < len(x_i): 90 | masks[1].append(i * max_len + j) 91 | if j > 0: 92 | masks[2].append(i * max_len + j) 93 | 94 | assert len(masks[1]) <= batch_size * max_len 95 | assert len(masks[2]) <= batch_size * max_len 96 | 97 | masks[1] = torch.LongTensor(masks[1]) 98 | masks[2] = torch.LongTensor(masks[2]) 99 | 100 | return batch_w, batch_c, lens, masks 101 | 102 | 103 | # shuffle training examples and create mini-batches 104 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, text=None): 105 | """ 106 | 107 | :param x: List[List[str]] 108 | :param batch_size: 109 | :param word2id: 110 | :param char2id: 111 | :param config: 112 | :param perm: 113 | :param shuffle: 114 | :param sort: 115 | :param text: 116 | :return: 117 | """ 118 | lst = perm or list(range(len(x))) 119 | if shuffle: 120 | random.shuffle(lst) 121 | 122 | if sort: 123 | lst.sort(key=lambda l: -len(x[l])) 124 | 125 | x = [x[i] for i in lst] 126 | if text is not None: 127 | text = [text[i] for i in lst] 128 | 129 | sum_len = 0.0 130 | batches_w, batches_c, batches_lens, batches_masks, batches_text = [], [], [], [], [] 131 | size = batch_size 132 | nbatch = (len(x) - 1) // size + 1 133 | for i in range(nbatch): 134 | start_id, end_id = i * size, (i + 1) * size 135 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 136 | sum_len += sum(blens) 137 | batches_w.append(bw) 138 | batches_c.append(bc) 139 | batches_lens.append(blens) 140 | batches_masks.append(bmasks) 141 | if text is not None: 142 | batches_text.append(text[start_id: end_id]) 143 | 144 | if sort: 145 | perm = list(range(nbatch)) 146 | random.shuffle(perm) 147 | batches_w = [batches_w[i] for i in perm] 148 | batches_c = [batches_c[i] for i in perm] 149 | batches_lens = [batches_lens[i] for i in perm] 150 | batches_masks = [batches_masks[i] for i in perm] 151 | if text is not None: 152 | batches_text = [batches_text[i] for i in perm] 153 | 154 | logging.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x))) 155 | if text is not None: 156 | return batches_w, batches_c, batches_lens, batches_masks, batches_text 157 | return batches_w, batches_c, batches_lens, batches_masks 158 | 159 | 160 | class Model(nn.Module): 161 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 162 | super(Model, self).__init__() 163 | self.use_cuda = use_cuda 164 | self.config = config 165 | 166 | if config['token_embedder']['name'].lower() == 'cnn': 167 | self.token_embedder = ConvTokenEmbedder( 168 | config, word_emb_layer, char_emb_layer, use_cuda) 169 | elif config['token_embedder']['name'].lower() == 'lstm': 170 | self.token_embedder = LstmTokenEmbedder( 171 | config, word_emb_layer, char_emb_layer, use_cuda) 172 | 173 | if config['encoder']['name'].lower() == 'elmo': 174 | self.encoder = ElmobiLm(config, use_cuda) 175 | elif config['encoder']['name'].lower() == 'lstm': 176 | self.encoder = LstmbiLm(config, use_cuda) 177 | 178 | self.output_dim = config['encoder']['projection_dim'] 179 | 180 | def forward(self, word_inp, chars_package, mask_package): 181 | """ 182 | 183 | :param word_inp: 184 | :param 
chars_package: 185 | :param mask_package: 186 | :return: 187 | """ 188 | token_embedding = self.token_embedder(word_inp, chars_package, (mask_package[0].size(0), mask_package[0].size(1))) 189 | if self.config['encoder']['name'] == 'elmo': 190 | mask = Variable(mask_package[0]).cuda() if self.use_cuda else Variable(mask_package[0]) 191 | encoder_output = self.encoder(token_embedding, mask) 192 | sz = encoder_output.size() 193 | token_embedding = torch.cat( 194 | [token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) 195 | encoder_output = torch.cat( 196 | [token_embedding, encoder_output], dim=0) 197 | elif self.config['encoder']['name'] == 'lstm': 198 | encoder_output = self.encoder(token_embedding) 199 | else: 200 | raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name'])) 201 | 202 | return encoder_output 203 | 204 | def load_model(self, path): 205 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'), 206 | map_location=lambda storage, loc: storage)) 207 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'), 208 | map_location=lambda storage, loc: storage)) 209 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/elmoformanylangs/modules/__init__.py -------------------------------------------------------------------------------- /elmoformanylangs/modules/classify_layer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s') 9 | 10 | 11 | class SoftmaxLayer(nn.Module): 12 | """ Naive softmax-layer """ 13 | def __init__(self, output_dim, n_class): 14 | """ 15 | 16 | :param output_dim: int 17 | :param n_class: int 18 | """ 19 | super(SoftmaxLayer, self).__init__() 20 | self.hidden2tag = nn.Linear(output_dim, n_class) 21 | self.criterion = nn.CrossEntropyLoss(size_average=False) 22 | 23 | def forward(self, x, y): 24 | """ 25 | 26 | :param x: torch.Tensor 27 | :param y: torch.Tensor 28 | :return: 29 | """ 30 | tag_scores = self.hidden2tag(x) 31 | return self.criterion(tag_scores, y) 32 | 33 | 34 | class SampledSoftmaxLayer(nn.Module): 35 | """ 36 | 37 | """ 38 | def __init__(self, output_dim, n_class, n_samples, use_cuda): 39 | """ 40 | 41 | :param output_dim: 42 | :param n_class: 43 | :param n_samples: 44 | :param use_cuda: 45 | """ 46 | super(SampledSoftmaxLayer, self).__init__() 47 | self.n_samples = n_samples 48 | self.n_class = n_class 49 | self.use_cuda = use_cuda 50 | self.criterion = nn.CrossEntropyLoss(size_average=False) 51 | self.negative_samples = [] 52 | self.word_to_column = {0: 0} 53 | 54 | self.all_word = [] 55 | self.all_word_to_column = {0: 0} 56 | 57 | self.column_emb = nn.Embedding(n_class, output_dim) 58 | self.column_emb.weight.data.uniform_(-0.25, 0.25) 59 | 60 | self.column_bias = nn.Embedding(n_class, 1) 61 | self.column_bias.weight.data.uniform_(-0.25, 0.25) 62 | 63 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 64 | self.oov_column.data.uniform_(-0.25, 0.25) 65 | 66 | def forward(self, x, y): 67 | if self.training: 68 
| for i in range(y.size(0)): 69 | y[i] = self.word_to_column.get(y[i].tolist()) 70 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0) 71 | for word in self.negative_samples: 72 | samples[self.word_to_column[word]] = word 73 | else: 74 | for i in range(y.size(0)): 75 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 76 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 77 | for word in self.all_word: 78 | samples[self.all_word_to_column[word]] = word 79 | 80 | if self.use_cuda: 81 | samples = samples.cuda() 82 | 83 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 84 | (self.column_bias.forward(samples)).view(1, -1) 85 | return self.criterion(tag_scores, y) 86 | 87 | def update_embedding_matrix(self): 88 | word_inp, chars_inp = [], [] 89 | if self.training: 90 | columns = torch.LongTensor(len(self.negative_samples) + 1) 91 | samples = self.negative_samples 92 | for i, word in enumerate(samples): 93 | columns[self.word_to_column[word]] = word 94 | columns[0] = 0 95 | else: 96 | columns = torch.LongTensor(len(self.all_word) + 1) 97 | samples = self.all_word 98 | for i, word in enumerate(samples): 99 | columns[self.all_word_to_column[word]] = word 100 | columns[0] = 0 101 | 102 | if self.use_cuda: 103 | columns = columns.cuda() 104 | self.embedding_matrix = self.column_emb.forward(columns).transpose(0, 1) 105 | 106 | def update_negative_samples(self, word_inp, chars_inp, mask): 107 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 108 | in_batch = set() 109 | for i in range(batch_size): 110 | for j in range(seq_len): 111 | if mask[i][j] == 0: 112 | continue 113 | word = word_inp[i][j].tolist() 114 | in_batch.add(word) 115 | for i in range(batch_size): 116 | for j in range(seq_len): 117 | if mask[i][j] == 0: 118 | continue 119 | word = word_inp[i][j].tolist() 120 | if word not in self.all_word_to_column: 121 | self.all_word.append(word) 122 | self.all_word_to_column[word] = len(self.all_word_to_column) 123 | 124 | if word not in self.word_to_column: 125 | if len(self.negative_samples) < self.n_samples: 126 | self.negative_samples.append(word) 127 | self.word_to_column[word] = len(self.word_to_column) 128 | else: 129 | while self.negative_samples[0] in in_batch: 130 | self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]] 131 | self.word_to_column[word] = self.word_to_column.pop(self.negative_samples[0]) 132 | self.negative_samples = self.negative_samples[1:] + [word] 133 | 134 | 135 | class CNNSoftmaxLayer(nn.Module): 136 | def __init__(self, token_embedder, output_dim, n_class, n_samples, corr_dim, use_cuda): 137 | super(CNNSoftmaxLayer, self).__init__() 138 | self.token_embedder = token_embedder 139 | self.n_samples = n_samples 140 | self.use_cuda = use_cuda 141 | self.criterion = nn.CrossEntropyLoss(size_average=False) 142 | self.negative_samples = [] 143 | self.word_to_column = {0: 0} 144 | 145 | self.all_word = [] 146 | self.all_word_to_column = {0: 0} 147 | 148 | self.M = nn.Parameter(torch.Tensor(output_dim, corr_dim)) 149 | stdv = 1. 
/ math.sqrt(self.M.size(1)) 150 | self.M.data.uniform_(-stdv, stdv) 151 | 152 | self.corr = nn.Embedding(n_class, corr_dim) 153 | self.corr.weight.data.uniform_(-0.25, 0.25) 154 | 155 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 156 | self.oov_column.data.uniform_(-0.25, 0.25) 157 | 158 | def forward(self, x, y): 159 | if self.training: 160 | for i in range(y.size(0)): 161 | y[i] = self.word_to_column.get(y[i].tolist()) 162 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0) 163 | for package in self.negative_samples: 164 | samples[self.word_to_column[package[0]]] = package[0] 165 | else: 166 | for i in range(y.size(0)): 167 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 168 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 169 | for package in self.all_word: 170 | samples[self.all_word_to_column[package[0]]] = package[0] 171 | 172 | if self.use_cuda: 173 | samples = samples.cuda() 174 | 175 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 176 | (x.matmul(self.M).matmul(self.corr.forward(samples).transpose(0, 1))).view(y.size(0), -1) 177 | return self.criterion(tag_scores, y) 178 | 179 | def update_embedding_matrix(self): 180 | batch_size = 2048 181 | word_inp, chars_inp = [], [] 182 | if self.training: 183 | sub_matrices = [self.oov_column] 184 | samples = self.negative_samples 185 | id2pack = {} 186 | for i, package in enumerate(samples): 187 | id2pack[self.word_to_column[package[0]]] = i 188 | else: 189 | sub_matrices = [self.oov_column] 190 | samples = self.all_word 191 | id2pack = {} 192 | for i, package in enumerate(samples): 193 | id2pack[self.all_word_to_column[package[0]]] = i 194 | 195 | for i in range(len(samples)): 196 | # [n_samples, 1], [n_samples, 1, x], [n_samples, 1] 197 | word_inp.append(samples[id2pack[i + 1]][0]) 198 | chars_inp.append(samples[id2pack[i + 1]][1]) 199 | if len(word_inp) == batch_size or i == len(samples) - 1: 200 | sub_matrices.append(self.token_embedder.forward(torch.LongTensor(word_inp).view(len(word_inp), 1), 201 | None if chars_inp[0] is None else torch.LongTensor(chars_inp).view(len(word_inp), 1, len(package[1])), 202 | (len(word_inp), 1)).squeeze(1).transpose(0, 1)) 203 | if not self.training: 204 | sub_matrices[-1] = sub_matrices[-1].detach() 205 | word_inp, chars_inp = [], [] 206 | 207 | sum = 0 208 | for mat in sub_matrices: 209 | sum += mat.size(1) 210 | #print(sum, len(self.word_to_column)) 211 | self.embedding_matrix = torch.cat(sub_matrices, dim=1) 212 | 213 | def update_negative_samples(self, word_inp, chars_inp, mask): 214 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 215 | in_batch = set() 216 | for i in range(batch_size): 217 | for j in range(seq_len): 218 | if mask[i][j] == 0: 219 | continue 220 | word = word_inp[i][j].tolist() 221 | in_batch.add(word) 222 | for i in range(batch_size): 223 | for j in range(seq_len): 224 | if mask[i][j] == 0: 225 | continue 226 | package = (word_inp[i][j].tolist(), None if chars_inp is None else chars_inp[i][j].tolist()) 227 | if package[0] not in self.all_word_to_column: 228 | self.all_word.append(package) 229 | self.all_word_to_column[package[0]] = len(self.all_word_to_column) 230 | 231 | if package[0] not in self.word_to_column: 232 | if len(self.negative_samples) < self.n_samples: 233 | self.negative_samples.append(package) 234 | self.word_to_column[package[0]] = len(self.word_to_column) 235 | else: 236 | while self.negative_samples[0][0] in in_batch: 237 | self.negative_samples = self.negative_samples[1:] + 
[self.negative_samples[0]] 238 | self.word_to_column[package[0]] = self.word_to_column.pop(self.negative_samples[0][0]) 239 | self.negative_samples = self.negative_samples[1:] + [package] 240 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/elmo.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, List, Callable, Union 2 | 3 | import h5py 4 | import numpy 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence 9 | from torch.autograd import Variable 10 | 11 | from .encoder_base import _EncoderBase 12 | from .lstm_cell_with_projection import LstmCellWithProjection 13 | 14 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name 15 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name 16 | 17 | 18 | class ElmobiLm(_EncoderBase): 19 | def __init__(self, config, use_cuda=False): 20 | super(ElmobiLm, self).__init__(stateful=True) 21 | self.config = config 22 | self.use_cuda = use_cuda 23 | input_size = config['encoder']['projection_dim'] 24 | hidden_size = config['encoder']['projection_dim'] 25 | cell_size = config['encoder']['dim'] 26 | num_layers = config['encoder']['n_layers'] 27 | memory_cell_clip_value = config['encoder']['cell_clip'] 28 | state_projection_clip_value = config['encoder']['proj_clip'] 29 | recurrent_dropout_probability = config['dropout'] 30 | 31 | self.input_size = input_size 32 | self.hidden_size = hidden_size 33 | self.num_layers = num_layers 34 | self.cell_size = cell_size 35 | 36 | forward_layers = [] 37 | backward_layers = [] 38 | 39 | lstm_input_size = input_size 40 | go_forward = True 41 | for layer_index in range(num_layers): 42 | forward_layer = LstmCellWithProjection(lstm_input_size, 43 | hidden_size, 44 | cell_size, 45 | go_forward, 46 | recurrent_dropout_probability, 47 | memory_cell_clip_value, 48 | state_projection_clip_value) 49 | backward_layer = LstmCellWithProjection(lstm_input_size, 50 | hidden_size, 51 | cell_size, 52 | not go_forward, 53 | recurrent_dropout_probability, 54 | memory_cell_clip_value, 55 | state_projection_clip_value) 56 | lstm_input_size = hidden_size 57 | 58 | self.add_module('forward_layer_{}'.format(layer_index), forward_layer) 59 | self.add_module('backward_layer_{}'.format(layer_index), backward_layer) 60 | forward_layers.append(forward_layer) 61 | backward_layers.append(backward_layer) 62 | self.forward_layers = forward_layers 63 | self.backward_layers = backward_layers 64 | 65 | def forward(self, inputs, mask): 66 | batch_size, total_sequence_length = mask.size() 67 | stacked_sequence_output, final_states, restoration_indices = \ 68 | self.sort_and_run_forward(self._lstm_forward, inputs, mask) 69 | 70 | num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size() 71 | # Add back invalid rows which were removed in the call to sort_and_run_forward. 72 | if num_valid < batch_size: 73 | zeros = stacked_sequence_output.data.new(num_layers, 74 | batch_size - num_valid, 75 | returned_timesteps, 76 | encoder_dim).fill_(0) 77 | zeros = Variable(zeros) 78 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 1) 79 | 80 | # The states also need to have invalid rows added back. 
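# Hedged illustration of the step below (dummy tensors, not part of the original source):
# completely masked sequences are dropped before the LSTM runs, so zero rows are
# concatenated back along the batch dimension to restore the full batch size.
import torch
num_layers, batch_size, num_valid, timesteps, dim = 2, 5, 3, 7, 4
valid_out = torch.randn(num_layers, num_valid, timesteps, dim)
zeros = valid_out.new_zeros(num_layers, batch_size - num_valid, timesteps, dim)
full_out = torch.cat([valid_out, zeros], dim=1)
assert full_out.shape == (num_layers, batch_size, timesteps, dim)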
81 | new_states = [] 82 | for state in final_states: 83 | state_dim = state.size(-1) 84 | zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0) 85 | zeros = Variable(zeros) 86 | new_states.append(torch.cat([state, zeros], 1)) 87 | final_states = new_states 88 | 89 | # It's possible to need to pass sequences which are padded to longer than the 90 | # max length of the sequence to a Seq2StackEncoder. However, packing and unpacking 91 | # the sequences mean that the returned tensor won't include these dimensions, because 92 | # the RNN did not need to process them. We add them back on in the form of zeros here. 93 | sequence_length_difference = total_sequence_length - returned_timesteps 94 | if sequence_length_difference > 0: 95 | zeros = stacked_sequence_output.data.new(num_layers, 96 | batch_size, 97 | sequence_length_difference, 98 | stacked_sequence_output[0].size(-1)).fill_(0) 99 | zeros = Variable(zeros) 100 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 2) 101 | 102 | self._update_states(final_states, restoration_indices) 103 | 104 | # Restore the original indices and return the sequence. 105 | # Has shape (num_layers, batch_size, sequence_length, hidden_size) 106 | return stacked_sequence_output.index_select(1, restoration_indices) 107 | 108 | 109 | def _lstm_forward(self, 110 | inputs: PackedSequence, 111 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \ 112 | Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 113 | """ 114 | Parameters 115 | ---------- 116 | inputs : ``PackedSequence``, required. 117 | A batch first ``PackedSequence`` to run the stacked LSTM over. 118 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) 119 | A tuple (state, memory) representing the initial hidden state and memory 120 | of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and 121 | (num_layers, batch_size, 2 * cell_size) respectively. 122 | Returns 123 | ------- 124 | output_sequence : ``torch.FloatTensor`` 125 | The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size) 126 | final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` 127 | The per-layer final (state, memory) states of the LSTM, with shape 128 | (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size) 129 | respectively. The last dimension is duplicated because it contains the state/memory 130 | for both the forward and backward layers. 
131 | """ 132 | 133 | if initial_state is None: 134 | hidden_states: List[Optional[Tuple[torch.Tensor, 135 | torch.Tensor]]] = [None] * len(self.forward_layers) 136 | elif initial_state[0].size()[0] != len(self.forward_layers): 137 | raise Exception("Initial states were passed to forward() but the number of " 138 | "initial states does not match the number of layers.") 139 | else: 140 | hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0))) 141 | 142 | inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) 143 | forward_output_sequence = inputs 144 | backward_output_sequence = inputs 145 | 146 | final_states = [] 147 | sequence_outputs = [] 148 | for layer_index, state in enumerate(hidden_states): 149 | forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index)) 150 | backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index)) 151 | 152 | forward_cache = forward_output_sequence 153 | backward_cache = backward_output_sequence 154 | 155 | if state is not None: 156 | forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2) 157 | forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2) 158 | forward_state = (forward_hidden_state, forward_memory_state) 159 | backward_state = (backward_hidden_state, backward_memory_state) 160 | else: 161 | forward_state = None 162 | backward_state = None 163 | 164 | forward_output_sequence, forward_state = forward_layer(forward_output_sequence, 165 | batch_lengths, 166 | forward_state) 167 | backward_output_sequence, backward_state = backward_layer(backward_output_sequence, 168 | batch_lengths, 169 | backward_state) 170 | # Skip connections, just adding the input to the output. 171 | if layer_index != 0: 172 | forward_output_sequence += forward_cache 173 | backward_output_sequence += backward_cache 174 | 175 | sequence_outputs.append(torch.cat([forward_output_sequence, 176 | backward_output_sequence], -1)) 177 | # Append the state tuples in a list, so that we can return 178 | # the final states for all the layers. 179 | final_states.append((torch.cat([forward_state[0], backward_state[0]], -1), 180 | torch.cat([forward_state[1], backward_state[1]], -1))) 181 | 182 | stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs) 183 | # Stack the hidden state and memory for each layer into 2 tensors of shape 184 | # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size) 185 | # respectively. 
186 | final_hidden_states, final_memory_states = zip(*final_states) 187 | final_state_tuple: Tuple[torch.FloatTensor, 188 | torch.FloatTensor] = (torch.cat(final_hidden_states, 0), 189 | torch.cat(final_memory_states, 0)) 190 | return stacked_sequence_outputs, final_state_tuple -------------------------------------------------------------------------------- /elmoformanylangs/modules/embedding_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import logging 6 | 7 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s') 8 | 9 | 10 | class EmbeddingLayer(nn.Module): 11 | def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='', pad='', normalize=True): 12 | super(EmbeddingLayer, self).__init__() 13 | if embs is not None: 14 | embwords, embvecs = embs 15 | # for word in embwords: 16 | # assert word not in word2id, "Duplicate words in pre-trained embeddings" 17 | # word2id[word] = len(word2id) 18 | 19 | logging.info("{} pre-trained word embeddings loaded.".format(len(word2id))) 20 | if n_d != len(embvecs[0]): 21 | logging.warning("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.".format( 22 | n_d, len(embvecs[0]), len(embvecs[0]))) 23 | n_d = len(embvecs[0]) 24 | 25 | self.word2id = word2id 26 | self.id2word = {i: word for word, i in word2id.items()} 27 | self.n_V, self.n_d = len(word2id), n_d 28 | self.oovid = word2id[oov] 29 | self.padid = word2id[pad] 30 | self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid) 31 | self.embedding.weight.data.uniform_(-0.25, 0.25) 32 | 33 | if embs is not None: 34 | weight = self.embedding.weight 35 | weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs)) 36 | logging.info("embedding shape: {}".format(weight.size())) 37 | 38 | if normalize: 39 | weight = self.embedding.weight 40 | norms = weight.data.norm(2, 1) 41 | if norms.dim() == 1: 42 | norms = norms.unsqueeze(1) 43 | weight.data.div_(norms.expand_as(weight.data)) 44 | 45 | if fix_emb: 46 | self.embedding.weight.requires_grad = False 47 | 48 | def forward(self, input_): 49 | return self.embedding(input_) -------------------------------------------------------------------------------- /elmoformanylangs/modules/encoder_base.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union, Optional, Callable 2 | import torch 3 | from torch.autograd import Variable 4 | from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence 5 | 6 | from .util import get_lengths_from_binary_sequence_mask, sort_batch_by_length 7 | 8 | # We have two types here for the state, because storing the state in something 9 | # which is Iterable (like a tuple, below), is helpful for internal manipulation 10 | # - however, the states are consumed as either Tensors or a Tuple of Tensors, so 11 | # returning them in this format is unhelpful. 12 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name 13 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name 14 | 15 | 16 | class _EncoderBase(torch.nn.Module): 17 | # pylint: disable=abstract-method 18 | """ 19 | This abstract class serves as a base for the 3 ``Encoder`` abstractions in AllenNLP. 
20 | - :class:`~allennlp.modules.seq2seq_encoders.Seq2SeqEncoders` 21 | - :class:`~allennlp.modules.seq2vec_encoders.Seq2VecEncoders` 22 | Additionally, this class provides functionality for sorting sequences by length 23 | so they can be consumed by Pytorch RNN classes, which require their inputs to be 24 | sorted by length. Finally, it also provides optional statefulness to all of it's 25 | subclasses by allowing the caching and retrieving of the hidden states of RNNs. 26 | """ 27 | def __init__(self, stateful: bool = False) -> None: 28 | super(_EncoderBase, self).__init__() 29 | self.stateful = stateful 30 | self._states: Optional[RnnStateStorage] = None 31 | 32 | def sort_and_run_forward(self, 33 | module: Callable[[PackedSequence, Optional[RnnState]], 34 | Tuple[Union[PackedSequence, torch.Tensor], RnnState]], 35 | inputs: torch.Tensor, 36 | mask: torch.Tensor, 37 | hidden_state: Optional[RnnState] = None): 38 | """ 39 | This function exists because Pytorch RNNs require that their inputs be sorted 40 | before being passed as input. As all of our Seq2xxxEncoders use this functionality, 41 | it is provided in a base class. This method can be called on any module which 42 | takes as input a ``PackedSequence`` and some ``hidden_state``, which can either be a 43 | tuple of tensors or a tensor. 44 | As all of our Seq2xxxEncoders have different return types, we return `sorted` 45 | outputs from the module, which is called directly. Additionally, we return the 46 | indices into the batch dimension required to restore the tensor to it's correct, 47 | unsorted order and the number of valid batch elements (i.e the number of elements 48 | in the batch which are not completely masked). This un-sorting and re-padding 49 | of the module outputs is left to the subclasses because their outputs have different 50 | types and handling them smoothly here is difficult. 51 | Parameters 52 | ---------- 53 | module : ``Callable[[PackedSequence, Optional[RnnState]], 54 | Tuple[Union[PackedSequence, torch.Tensor], RnnState]]``, required. 55 | A function to run on the inputs. In most cases, this is a ``torch.nn.Module``. 56 | inputs : ``torch.Tensor``, required. 57 | A tensor of shape ``(batch_size, sequence_length, embedding_size)`` representing 58 | the inputs to the Encoder. 59 | mask : ``torch.Tensor``, required. 60 | A tensor of shape ``(batch_size, sequence_length)``, representing masked and 61 | non-masked elements of the sequence for each element in the batch. 62 | hidden_state : ``Optional[RnnState]``, (default = None). 63 | A single tensor of shape (num_layers, batch_size, hidden_size) representing the 64 | state of an RNN with or a tuple of 65 | tensors of shapes (num_layers, batch_size, hidden_size) and 66 | (num_layers, batch_size, memory_size), representing the hidden state and memory 67 | state of an LSTM-like RNN. 68 | Returns 69 | ------- 70 | module_output : ``Union[torch.Tensor, PackedSequence]``. 71 | A Tensor or PackedSequence representing the output of the Pytorch Module. 72 | The batch size dimension will be equal to ``num_valid``, as sequences of zero 73 | length are clipped off before the module is called, as Pytorch cannot handle 74 | zero length sequences. 75 | final_states : ``Optional[RnnState]`` 76 | A Tensor representing the hidden state of the Pytorch Module. This can either 77 | be a single tensor of shape (num_layers, num_valid, hidden_size), for instance in 78 | the case of a GRU, or a tuple of tensors, such as those required for an LSTM. 
79 | restoration_indices : ``torch.LongTensor`` 80 | A tensor of shape ``(batch_size,)``, describing the re-indexing required to transform 81 | the outputs back to their original batch order. 82 | """ 83 | # In some circumstances you may have sequences of zero length. ``pack_padded_sequence`` 84 | # requires all sequence lengths to be > 0, so remove sequences of zero length before 85 | # calling self._module, then fill with zeros. 86 | 87 | # First count how many sequences are empty. 88 | batch_size = mask.size(0) 89 | num_valid = torch.sum(mask[:, 0]).int().item() 90 | 91 | sequence_lengths = get_lengths_from_binary_sequence_mask(mask) 92 | sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices =\ 93 | sort_batch_by_length(inputs, sequence_lengths) 94 | 95 | # Now create a PackedSequence with only the non-empty, sorted sequences. 96 | packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :], 97 | sorted_sequence_lengths[:num_valid].data.tolist(), 98 | batch_first=True) 99 | # Prepare the initial states. 100 | if not self.stateful: 101 | if hidden_state is None: 102 | initial_states = hidden_state 103 | elif isinstance(hidden_state, tuple): 104 | initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :] 105 | for state in hidden_state] 106 | else: 107 | initial_states = hidden_state.index_select(1, sorting_indices)[:, :num_valid, :] 108 | 109 | else: 110 | initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices) 111 | 112 | # Actually call the module on the sorted PackedSequence. 113 | module_output, final_states = module(packed_sequence_input, initial_states) 114 | 115 | return module_output, final_states, restoration_indices 116 | 117 | def _get_initial_states(self, 118 | batch_size: int, 119 | num_valid: int, 120 | sorting_indices: torch.LongTensor) -> Optional[RnnState]: 121 | """ 122 | Returns an initial state for use in an RNN. Additionally, this method handles 123 | the batch size changing across calls by mutating the state to append initial states 124 | for new elements in the batch. Finally, it also handles sorting the states 125 | with respect to the sequence lengths of elements in the batch and removing rows 126 | which are completely padded. Importantly, this `mutates` the state if the 127 | current batch size is larger than when it was previously called. 128 | Parameters 129 | ---------- 130 | batch_size : ``int``, required. 131 | The batch size can change size across calls to stateful RNNs, so we need 132 | to know if we need to expand or shrink the states before returning them. 133 | Expanded states will be set to zero. 134 | num_valid : ``int``, required. 135 | The batch may contain completely padded sequences which get removed before 136 | the sequence is passed through the encoder. We also need to clip these off 137 | of the state too. 138 | sorting_indices ``torch.LongTensor``, required. 139 | Pytorch RNNs take sequences sorted by length. When we return the states to be 140 | used for a given call to ``module.forward``, we need the states to match up to 141 | the sorted sequences, so before returning them, we sort the states using the 142 | same indices used to sort the sequences. 143 | Returns 144 | ------- 145 | This method has a complex return type because it has to deal with the first time it 146 | is called, when it has no state, and the fact that types of RNN have heterogeneous 147 | states. 
148 | If it is the first time the module has been called, it returns ``None``, regardless 149 | of the type of the ``Module``. 150 | Otherwise, for LSTMs, it returns a tuple of ``torch.Tensors`` with shape 151 | ``(num_layers, num_valid, state_size)`` and ``(num_layers, num_valid, memory_size)`` 152 | respectively, or for GRUs, it returns a single ``torch.Tensor`` of shape 153 | ``(num_layers, num_valid, state_size)``. 154 | """ 155 | # We don't know the state sizes the first time calling forward, 156 | # so we let the module define what it's initial hidden state looks like. 157 | if self._states is None: 158 | return None 159 | 160 | # Otherwise, we have some previous states. 161 | if batch_size > self._states[0].size(1): 162 | # This batch is larger than the all previous states. 163 | # If so, resize the states. 164 | num_states_to_concat = batch_size - self._states[0].size(1) 165 | resized_states = [] 166 | # state has shape (num_layers, batch_size, hidden_size) 167 | for state in self._states: 168 | # This _must_ be inside the loop because some 169 | # RNNs have states with different last dimension sizes. 170 | zeros = state.data.new(state.size(0), 171 | num_states_to_concat, 172 | state.size(2)).fill_(0) 173 | zeros = Variable(zeros) 174 | resized_states.append(torch.cat([state, zeros], 1)) 175 | self._states = tuple(resized_states) 176 | correctly_shaped_states = self._states 177 | 178 | elif batch_size < self._states[0].size(1): 179 | # This batch is smaller than the previous one. 180 | correctly_shaped_states = tuple(state[:, :batch_size, :] for state in self._states) 181 | else: 182 | correctly_shaped_states = self._states 183 | 184 | # At this point, our states are of shape (num_layers, batch_size, hidden_size). 185 | # However, the encoder uses sorted sequences and additionally removes elements 186 | # of the batch which are fully padded. We need the states to match up to these 187 | # sorted and filtered sequences, so we do that in the next two blocks before 188 | # returning the state/s. 189 | if len(self._states) == 1: 190 | # GRUs only have a single state. This `unpacks` it from the 191 | # tuple and returns the tensor directly. 192 | correctly_shaped_state = correctly_shaped_states[0] 193 | sorted_state = correctly_shaped_state.index_select(1, sorting_indices) 194 | return sorted_state[:, :num_valid, :] 195 | else: 196 | # LSTMs have a state tuple of (state, memory). 197 | sorted_states = [state.index_select(1, sorting_indices) 198 | for state in correctly_shaped_states] 199 | return tuple(state[:, :num_valid, :] for state in sorted_states) 200 | 201 | def _update_states(self, 202 | final_states: RnnStateStorage, 203 | restoration_indices: torch.LongTensor) -> None: 204 | """ 205 | After the RNN has run forward, the states need to be updated. 206 | This method just sets the state to the updated new state, performing 207 | several pieces of book-keeping along the way - namely, unsorting the 208 | states and ensuring that the states of completely padded sequences are 209 | not updated. Finally, it also detatches the state variable from the 210 | computational graph, such that the graph can be garbage collected after 211 | each batch iteration. 212 | Parameters 213 | ---------- 214 | final_states : ``RnnStateStorage``, required. 215 | The hidden states returned as output from the RNN. 216 | restoration_indices : ``torch.LongTensor``, required. 
217 | The indices that invert the sorting used in ``sort_and_run_forward`` 218 | to order the states with respect to the lengths of the sequences in 219 | the batch. 220 | """ 221 | # TODO(Mark): seems weird to sort here, but append zeros in the subclasses. 222 | # which way around is best? 223 | new_unsorted_states = [state.index_select(1, restoration_indices) 224 | for state in final_states] 225 | 226 | if self._states is None: 227 | # We don't already have states, so just set the 228 | # ones we receive to be the current state. 229 | self._states = tuple([torch.autograd.Variable(state.data) 230 | for state in new_unsorted_states]) 231 | else: 232 | # Now we've sorted the states back so that they correspond to the original 233 | # indices, we need to figure out what states we need to update, because if we 234 | # didn't use a state for a particular row, we want to preserve its state. 235 | # Thankfully, the rows which are all zero in the state correspond exactly 236 | # to those which aren't used, so we create masks of shape (new_batch_size,), 237 | # denoting which states were used in the RNN computation. 238 | current_state_batch_size = self._states[0].size(1) 239 | new_state_batch_size = final_states[0].size(1) 240 | # Masks for the unused states of shape (1, new_batch_size, 1) 241 | used_new_rows_mask = [(state[0, :, :].sum(-1) 242 | != 0.0).float().view(1, new_state_batch_size, 1) 243 | for state in new_unsorted_states] 244 | new_states = [] 245 | if current_state_batch_size > new_state_batch_size: 246 | # The new state is smaller than the old one, 247 | # so just update the indices which we used. 248 | for old_state, new_state, used_mask in zip(self._states, 249 | new_unsorted_states, 250 | used_new_rows_mask): 251 | # zero out all rows in the previous state 252 | # which _were_ used in the current state. 253 | masked_old_state = old_state[:, :new_state_batch_size, :] * (1 - used_mask) 254 | # The old state is larger, so update the relevant parts of it. 255 | old_state[:, :new_state_batch_size, :] = new_state + masked_old_state 256 | # Detatch the Variable. 257 | new_states.append(torch.autograd.Variable(old_state.data)) 258 | else: 259 | # The states are the same size, so we just have to 260 | # deal with the possibility that some rows weren't used. 261 | new_states = [] 262 | for old_state, new_state, used_mask in zip(self._states, 263 | new_unsorted_states, 264 | used_new_rows_mask): 265 | # zero out all rows which _were_ used in the current state. 266 | masked_old_state = old_state * (1 - used_mask) 267 | # The old state is larger, so update the relevant parts of it. 268 | new_state += masked_old_state 269 | # Detatch the Variable. 270 | new_states.append(torch.autograd.Variable(new_state.data)) 271 | 272 | # It looks like there should be another case handled here - when 273 | # the current_state_batch_size < new_state_batch_size. However, 274 | # this never happens, because the states themeselves are mutated 275 | # by appending zeros when calling _get_inital_states, meaning that 276 | # the new states are either of equal size, or smaller, in the case 277 | # that there are some unused elements (zero-length) for the RNN computation. 
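# Hedged illustration of the masking trick above (dummy tensors, not part of the
# original source): rows of the new state that are all zero belong to sequences that
# were not used in this batch, so the matching rows of the old state are preserved.
import torch
old_state = torch.ones(1, 3, 4)                   # (num_layers, batch, hidden)
new_state = torch.zeros(1, 3, 4)
new_state[:, 0, :] = 2.0                          # only row 0 was actually updated
used_mask = (new_state[0, :, :].sum(-1) != 0.0).float().view(1, 3, 1)
merged = new_state + old_state * (1 - used_mask)  # row 0 from new, rows 1 and 2 from old
assert merged[0, 0, 0].item() == 2.0 and merged[0, 1, 0].item() == 1.0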
278 | self._states = tuple(new_states) 279 | 280 | def reset_states(self): 281 | self._states = None 282 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/highway.py: -------------------------------------------------------------------------------- 1 | """ 2 | A `Highway layer `_ that does a gated combination of a linear 3 | transformation and a non-linear transformation of its input. 4 | """ 5 | 6 | from typing import Callable 7 | 8 | import torch 9 | from overrides import overrides 10 | 11 | 12 | class Highway(torch.nn.Module): 13 | """ 14 | A `Highway layer `_ does a gated combination of a linear 15 | transformation and a non-linear transformation of its input. :math:`y = g * x + (1 - g) * 16 | f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise 17 | non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. 18 | This module will apply a fixed number of highway layers to its input, returning the final 19 | result. 20 | Parameters 21 | ---------- 22 | input_dim : ``int`` 23 | The dimensionality of :math:`x`. We assume the input has shape ``(batch_size, 24 | input_dim)``. 25 | num_layers : ``int``, optional (default=``1``) 26 | The number of highway layers to apply to the input. 27 | activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``) 28 | The non-linearity to use in the highway layers. 29 | """ 30 | def __init__(self, 31 | input_dim: int, 32 | num_layers: int = 1, 33 | activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None: 34 | super(Highway, self).__init__() 35 | self._input_dim = input_dim 36 | self._layers = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim * 2) 37 | for _ in range(num_layers)]) 38 | self._activation = activation 39 | for layer in self._layers: 40 | # We should bias the highway layer to just carry its input forward. We do that by 41 | # setting the bias on `B(x)` to be positive, because that means `g` will be biased to 42 | # be high, to we will carry the input forward. The bias on `B(x)` is the second half 43 | # of the bias vector in each Linear layer. 44 | layer.bias[input_dim:].data.fill_(1) 45 | 46 | @overrides 47 | def forward(self, inputs: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ 48 | current_input = inputs 49 | for layer in self._layers: 50 | projected_input = layer(current_input) 51 | linear_part = current_input 52 | # NOTE: if you modify this, think about whether you should modify the initialization 53 | # above, too. 
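# projected_input has shape (batch_size, 2 * input_dim); the two halves are
# used as follows (which is why __init__ fills only the second half of each
# layer's bias with 1, biasing the gate towards carrying the input through):
#   columns [0, input_dim)           -> pre-activation of the non-linear part f(A(x))
#   columns [input_dim, 2*input_dim) -> pre-activation of the gate g = sigmoid(B(x))
#
# Minimal usage sketch (hypothetical sizes):
#   highway = Highway(input_dim=128, num_layers=2)
#   y = highway(torch.randn(32, 128))   # y has shape (32, 128)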
54 | nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)] 55 | gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)] 56 | nonlinear_part = self._activation(nonlinear_part) 57 | gate = torch.sigmoid(gate) 58 | current_input = gate * linear_part + (1 - gate) * nonlinear_part 59 | return current_input 60 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import logging 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import copy 9 | 10 | 11 | class LstmbiLm(nn.Module): 12 | def __init__(self, config, use_cuda=False): 13 | super(LstmbiLm, self).__init__() 14 | self.config = config 15 | self.use_cuda = use_cuda 16 | 17 | self.encoder = nn.LSTM(self.config['encoder']['projection_dim'], 18 | self.config['encoder']['dim'], 19 | num_layers=self.config['encoder']['n_layers'], 20 | bidirectional=True, 21 | batch_first=True, 22 | dropout=self.config['dropout']) 23 | self.projection = nn.Linear(self.config['encoder']['dim'], self.config['encoder']['projection_dim'], bias=True) 24 | 25 | def forward(self, inputs): 26 | forward, backward = self.encoder(inputs)[0].split(self.config['encoder']['dim'], 2) 27 | return torch.cat([self.projection(forward), self.projection(backward)], dim=2) 28 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/lstm_cell_with_projection.py: -------------------------------------------------------------------------------- 1 | """ 2 | An LSTM with Recurrent Dropout, a hidden_state which is projected and 3 | clipping on both the hidden state and the memory state of the LSTM. 4 | """ 5 | 6 | from typing import Optional, Tuple, List 7 | 8 | import torch 9 | from torch.autograd import Variable 10 | 11 | from .util import block_orthogonal, get_dropout_mask 12 | 13 | class LstmCellWithProjection(torch.nn.Module): 14 | """ 15 | An LSTM with Recurrent Dropout and a projected and clipped hidden state and 16 | memory. Note: this implementation is slower than the native Pytorch LSTM because 17 | it cannot make use of CUDNN optimizations for stacked RNNs due to and 18 | variational dropout and the custom nature of the cell state. 19 | Parameters 20 | ---------- 21 | input_size : ``int``, required. 22 | The dimension of the inputs to the LSTM. 23 | hidden_size : ``int``, required. 24 | The dimension of the outputs of the LSTM. 25 | cell_size : ``int``, required. 26 | The dimension of the memory cell used for the LSTM. 27 | go_forward: ``bool``, optional (default = True) 28 | The direction in which the LSTM is applied to the sequence. 29 | Forwards by default, or backwards if False. 30 | recurrent_dropout_probability: ``float``, optional (default = 0.0) 31 | The dropout probability to be used in a dropout scheme as stated in 32 | `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks 33 | `_ . Implementation wise, this simply 34 | applies a fixed dropout mask per sequence to the recurrent connection of the 35 | LSTM. 36 | state_projection_clip_value: ``float``, optional, (default = None) 37 | The magnitude with which to clip the hidden_state after projecting it. 
38 | memory_cell_clip_value: ``float``, optional, (default = None) 39 | The magnitude with which to clip the memory cell. 40 | Returns 41 | ------- 42 | output_accumulator : ``torch.FloatTensor`` 43 | The outputs of the LSTM for each timestep. A tensor of shape 44 | (batch_size, max_timesteps, hidden_size) where for a given batch 45 | element, all outputs past the sequence length for that batch are 46 | zero tensors. 47 | final_state: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` 48 | The final (state, memory) states of the LSTM, with shape 49 | (1, batch_size, hidden_size) and (1, batch_size, cell_size) 50 | respectively. The first dimension is 1 in order to match the Pytorch 51 | API for returning stacked LSTM states. 52 | """ 53 | def __init__(self, 54 | input_size: int, 55 | hidden_size: int, 56 | cell_size: int, 57 | go_forward: bool = True, 58 | recurrent_dropout_probability: float = 0.0, 59 | memory_cell_clip_value: Optional[float] = None, 60 | state_projection_clip_value: Optional[float] = None) -> None: 61 | super(LstmCellWithProjection, self).__init__() 62 | # Required to be wrapped with a :class:`PytorchSeq2SeqWrapper`. 63 | self.input_size = input_size 64 | self.hidden_size = hidden_size 65 | self.cell_size = cell_size 66 | 67 | self.go_forward = go_forward 68 | self.state_projection_clip_value = state_projection_clip_value 69 | self.memory_cell_clip_value = memory_cell_clip_value 70 | self.recurrent_dropout_probability = recurrent_dropout_probability 71 | 72 | # We do the projections for all the gates all at once. 73 | self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, bias=False) 74 | self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, bias=True) 75 | 76 | # Additional projection matrix for making the hidden state smaller. 77 | self.state_projection = torch.nn.Linear(cell_size, hidden_size, bias=False) 78 | self.reset_parameters() 79 | 80 | def reset_parameters(self): 81 | # Use sensible default initializations for parameters. 82 | block_orthogonal(self.input_linearity.weight.data, [self.cell_size, self.input_size]) 83 | block_orthogonal(self.state_linearity.weight.data, [self.cell_size, self.hidden_size]) 84 | 85 | self.state_linearity.bias.data.fill_(0.0) 86 | # Initialize forget gate biases to 1.0 as per An Empirical 87 | # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015). 88 | self.state_linearity.bias.data[self.cell_size:2 * self.cell_size].fill_(1.0) 89 | 90 | def forward(self, # pylint: disable=arguments-differ 91 | inputs: torch.FloatTensor, 92 | batch_lengths: List[int], 93 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None): 94 | """ 95 | Parameters 96 | ---------- 97 | inputs : ``torch.FloatTensor``, required. 98 | A tensor of shape (batch_size, num_timesteps, input_size) 99 | to apply the LSTM over. 100 | batch_lengths : ``List[int]``, required. 101 | A list of length batch_size containing the lengths of the sequences in batch. 102 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) 103 | A tuple (state, memory) representing the initial hidden state and memory 104 | of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the 105 | ``memory`` has shape (1, batch_size, cell_size). 106 | Returns 107 | ------- 108 | output_accumulator : ``torch.FloatTensor`` 109 | The outputs of the LSTM for each timestep. 
A tensor of shape
110 | (batch_size, max_timesteps, hidden_size) where for a given batch
111 | element, all outputs past the sequence length for that batch are
112 | zero tensors.
113 | final_state : ``Tuple[torch.FloatTensor, torch.FloatTensor]``
114 | A tuple (state, memory) representing the final hidden state and memory
115 | of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
116 | ``memory`` has shape (1, batch_size, cell_size).
117 | """
118 | batch_size = inputs.size()[0]
119 | total_timesteps = inputs.size()[1]
120 |
121 | # We have to use this '.data.new().fill_' pattern to create tensors with the correct
122 | # type - forward has no knowledge of whether these are torch.Tensors or torch.cuda.Tensors.
123 | output_accumulator = Variable(inputs.data.new(batch_size,
124 | total_timesteps,
125 | self.hidden_size).fill_(0))
126 | if initial_state is None:
127 | full_batch_previous_memory = Variable(inputs.data.new(batch_size,
128 | self.cell_size).fill_(0))
129 | full_batch_previous_state = Variable(inputs.data.new(batch_size,
130 | self.hidden_size).fill_(0))
131 | else:
132 | full_batch_previous_state = initial_state[0].squeeze(0)
133 | full_batch_previous_memory = initial_state[1].squeeze(0)
134 |
135 | current_length_index = batch_size - 1 if self.go_forward else 0
136 | if self.recurrent_dropout_probability > 0.0 and self.training:
137 | dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
138 | full_batch_previous_state)
139 | else:
140 | dropout_mask = None
141 |
142 | for timestep in range(total_timesteps):
143 | # The index depends on which end we start.
144 | index = timestep if self.go_forward else total_timesteps - timestep - 1
145 |
146 | # What we are doing here is finding the index into the batch dimension
147 | # which we need to use for this timestep, because the sequences have
148 | # variable length, so once the index is greater than the length of this
149 | # particular batch sequence, we no longer need to do the computation for
150 | # this sequence. The key thing to recognise here is that the batch inputs
151 | # must be _ordered_ by length from longest (first in batch) to shortest
152 | # (last) so initially, we are going forwards with every sequence and as we
153 | # pass the index at which the shortest elements of the batch finish,
154 | # we stop picking them up for the computation.
155 | if self.go_forward:
156 | while batch_lengths[current_length_index] <= index:
157 | current_length_index -= 1
158 | # If we're going backwards, we are _picking up_ more indices.
159 | else:
160 | # First conditional: Are we already at the maximum number of elements in the batch?
161 | # Second conditional: Does the next shortest sequence beyond the current batch
162 | # index require computation at this timestep?
163 | while current_length_index < (len(batch_lengths) - 1) and \
164 | batch_lengths[current_length_index + 1] > index:
165 | current_length_index += 1
166 |
167 | # Actually get the slices of the batch which we
168 | # need for the computation at this timestep.
169 | # shape (batch_size, cell_size)
170 | previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone()
171 | # Shape (batch_size, hidden_size)
172 | previous_state = full_batch_previous_state[0: current_length_index + 1].clone()
173 | # Shape (batch_size, input_size)
174 | timestep_input = inputs[0: current_length_index + 1, index]
175 |
176 | # Do the projections for all the gates all at once.
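# The 4 * cell_size columns produced below are consumed in fixed chunks,
# matching the forget-gate bias initialised in reset_parameters():
#   [0*cell_size, 1*cell_size) -> input gate
#   [1*cell_size, 2*cell_size) -> forget gate
#   [2*cell_size, 3*cell_size) -> candidate memory (tanh)
#   [3*cell_size, 4*cell_size) -> output gate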
177 | # Both have shape (batch_size, 4 * cell_size) 178 | projected_input = self.input_linearity(timestep_input) 179 | projected_state = self.state_linearity(previous_state) 180 | 181 | # Main LSTM equations using relevant chunks of the big linear 182 | # projections of the hidden state and inputs. 183 | input_gate = torch.sigmoid(projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] + 184 | projected_state[:, (0 * self.cell_size):(1 * self.cell_size)]) 185 | forget_gate = torch.sigmoid(projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] + 186 | projected_state[:, (1 * self.cell_size):(2 * self.cell_size)]) 187 | memory_init = torch.tanh(projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] + 188 | projected_state[:, (2 * self.cell_size):(3 * self.cell_size)]) 189 | output_gate = torch.sigmoid(projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] + 190 | projected_state[:, (3 * self.cell_size):(4 * self.cell_size)]) 191 | memory = input_gate * memory_init + forget_gate * previous_memory 192 | 193 | # Here is the non-standard part of this LSTM cell; first, we clip the 194 | # memory cell, then we project the output of the timestep to a smaller size 195 | # and again clip it. 196 | 197 | if self.memory_cell_clip_value: 198 | # pylint: disable=invalid-unary-operand-type 199 | memory = torch.clamp(memory, -self.memory_cell_clip_value, self.memory_cell_clip_value) 200 | 201 | # shape (current_length_index, cell_size) 202 | pre_projection_timestep_output = output_gate * torch.tanh(memory) 203 | 204 | # shape (current_length_index, hidden_size) 205 | timestep_output = self.state_projection(pre_projection_timestep_output) 206 | if self.state_projection_clip_value: 207 | # pylint: disable=invalid-unary-operand-type 208 | timestep_output = torch.clamp(timestep_output, 209 | -self.state_projection_clip_value, 210 | self.state_projection_clip_value) 211 | 212 | # Only do dropout if the dropout prob is > 0.0 and we are in training mode. 213 | if dropout_mask is not None: 214 | timestep_output = timestep_output * dropout_mask[0: current_length_index + 1] 215 | 216 | # We've been doing computation with less than the full batch, so here we create a new 217 | # variable for the the whole batch at this timestep and insert the result for the 218 | # relevant elements of the batch into it. 219 | full_batch_previous_memory = Variable(full_batch_previous_memory.data.clone()) 220 | full_batch_previous_state = Variable(full_batch_previous_state.data.clone()) 221 | full_batch_previous_memory[0:current_length_index + 1] = memory 222 | full_batch_previous_state[0:current_length_index + 1] = timestep_output 223 | output_accumulator[0:current_length_index + 1, index] = timestep_output 224 | 225 | # Mimic the pytorch API by returning state in the following shape: 226 | # (num_layers * num_directions, batch_size, ...). As this 227 | # LSTM cell cannot be stacked, the first dimension here is just 1. 
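# Minimal usage sketch of this cell, with hypothetical sizes (sequences must
# be sorted by length, longest first, as assumed by the loop above):
#   cell = LstmCellWithProjection(input_size=100, hidden_size=50, cell_size=200)
#   inputs = torch.randn(3, 7, 100)                # (batch, max_timesteps, input_size)
#   outputs, (h, c) = cell(inputs, batch_lengths=[7, 5, 2])
#   # outputs: (3, 7, 50), h: (1, 3, 50), c: (1, 3, 200)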
228 | final_state = (full_batch_previous_state.unsqueeze(0), 229 | full_batch_previous_memory.unsqueeze(0)) 230 | 231 | return output_accumulator, final_state 232 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/token_embedder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import logging 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import copy 9 | from .highway import Highway 10 | 11 | 12 | class LstmTokenEmbedder(nn.Module): 13 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 14 | super(LstmTokenEmbedder, self).__init__() 15 | self.config = config 16 | self.use_cuda = use_cuda 17 | self.word_emb_layer = word_emb_layer 18 | self.char_emb_layer = char_emb_layer 19 | self.output_dim = config['encoder']['projection_dim'] 20 | emb_dim = 0 21 | if word_emb_layer is not None: 22 | emb_dim += word_emb_layer.n_d 23 | 24 | if char_emb_layer is not None: 25 | emb_dim += char_emb_layer.n_d * 2 26 | self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True, 27 | batch_first=True, dropout=config['dropout']) 28 | 29 | self.projection = nn.Linear(emb_dim, self.output_dim, bias=True) 30 | 31 | def forward(self, word_inp, chars_inp, shape): 32 | embs = [] 33 | batch_size, seq_len = shape 34 | if self.word_emb_layer is not None: 35 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 36 | embs.append(word_emb) 37 | 38 | if self.char_emb_layer is not None: 39 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 40 | chars_emb = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 41 | _, (chars_outputs, __) = self.char_lstm(chars_emb) 42 | chars_outputs = chars_outputs.contiguous().view(-1, self.config['token_embedder']['char_dim'] * 2) 43 | embs.append(chars_outputs) 44 | 45 | token_embedding = torch.cat(embs, dim=2) 46 | 47 | return self.projection(token_embedding) 48 | 49 | 50 | class ConvTokenEmbedder(nn.Module): 51 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda): 52 | super(ConvTokenEmbedder, self).__init__() 53 | self.config = config 54 | self.use_cuda = use_cuda 55 | 56 | self.word_emb_layer = word_emb_layer 57 | self.char_emb_layer = char_emb_layer 58 | 59 | self.output_dim = config['encoder']['projection_dim'] 60 | self.emb_dim = 0 61 | if word_emb_layer is not None: 62 | self.emb_dim += word_emb_layer.n_d 63 | 64 | if char_emb_layer is not None: 65 | self.convolutions = [] 66 | cnn_config = config['token_embedder'] 67 | filters = cnn_config['filters'] 68 | char_embed_dim = cnn_config['char_dim'] 69 | 70 | for i, (width, num) in enumerate(filters): 71 | conv = torch.nn.Conv1d( 72 | in_channels=char_embed_dim, 73 | out_channels=num, 74 | kernel_size=width, 75 | bias=True 76 | ) 77 | self.convolutions.append(conv) 78 | 79 | self.convolutions = nn.ModuleList(self.convolutions) 80 | 81 | self.n_filters = sum(f[1] for f in filters) 82 | self.n_highway = cnn_config['n_highway'] 83 | 84 | self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu) 85 | self.emb_dim += self.n_filters 86 | 87 | self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True) 88 | 89 | def forward(self, word_inp, chars_inp, shape): 90 | embs = [] 91 | 
batch_size, seq_len = shape 92 | if self.word_emb_layer is not None: 93 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 94 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 95 | embs.append(word_emb) 96 | 97 | if self.char_emb_layer is not None: 98 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 99 | 100 | character_embedding = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 101 | 102 | character_embedding = torch.transpose(character_embedding, 1, 2) 103 | 104 | cnn_config = self.config['token_embedder'] 105 | if cnn_config['activation'] == 'tanh': 106 | activation = torch.nn.functional.tanh 107 | elif cnn_config['activation'] == 'relu': 108 | activation = torch.nn.functional.relu 109 | else: 110 | raise Exception("Unknown activation") 111 | 112 | convs = [] 113 | for i in range(len(self.convolutions)): 114 | convolved = self.convolutions[i](character_embedding) 115 | # (batch_size * sequence_length, n_filters for this width) 116 | convolved, _ = torch.max(convolved, dim=-1) 117 | convolved = activation(convolved) 118 | convs.append(convolved) 119 | char_emb = torch.cat(convs, dim=-1) 120 | char_emb = self.highways(char_emb) 121 | 122 | embs.append(char_emb.view(batch_size, -1, self.n_filters)) 123 | 124 | token_embedding = torch.cat(embs, dim=2) 125 | 126 | return self.projection(token_embedding) 127 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Assorted utilities for working with neural networks in AllenNLP. 3 | """ 4 | from collections import defaultdict 5 | from typing import Dict, List, Optional, Any, Tuple, Callable 6 | import logging 7 | import itertools 8 | import math 9 | import torch 10 | from torch.autograd import Variable 11 | 12 | def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): 13 | """ 14 | Compute sequence lengths for each batch element in a tensor using a 15 | binary mask. 16 | Parameters 17 | ---------- 18 | mask : torch.Tensor, required. 19 | A 2D binary mask of shape (batch_size, sequence_length) to 20 | calculate the per-batch sequence lengths from. 21 | Returns 22 | ------- 23 | A torch.LongTensor of shape (batch_size,) representing the lengths 24 | of the sequences in the batch. 25 | """ 26 | return mask.long().sum(-1) 27 | 28 | 29 | def sort_batch_by_length(tensor: torch.autograd.Variable, 30 | sequence_lengths: torch.autograd.Variable): 31 | """ 32 | Sort a batch first tensor by some specified lengths. 33 | Parameters 34 | ---------- 35 | tensor : Variable(torch.FloatTensor), required. 36 | A batch first Pytorch tensor. 37 | sequence_lengths : Variable(torch.LongTensor), required. 38 | A tensor representing the lengths of some dimension of the tensor which 39 | we want to sort by. 40 | Returns 41 | ------- 42 | sorted_tensor : Variable(torch.FloatTensor) 43 | The original tensor sorted along the batch dimension with respect to sequence_lengths. 44 | sorted_sequence_lengths : Variable(torch.LongTensor) 45 | The original sequence_lengths sorted by decreasing size. 46 | restoration_indices : Variable(torch.LongTensor) 47 | Indices into the sorted_tensor such that 48 | ``sorted_tensor.index_select(0, restoration_indices) == original_tensor`` 49 | permuation_index : Variable(torch.LongTensor) 50 | The indices used to sort the tensor. 
This is useful if you want to sort many 51 | tensors using the same ordering. 52 | """ 53 | 54 | if not isinstance(tensor, Variable) or not isinstance(sequence_lengths, Variable): 55 | raise Exception("Both the tensor and sequence lengths must be torch.autograd.Variables.") 56 | 57 | sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True) 58 | sorted_tensor = tensor.index_select(0, permutation_index) 59 | 60 | # This is ugly, but required - we are creating a new variable at runtime, so we 61 | # must ensure it has the correct CUDA vs non-CUDA type. We do this by cloning and 62 | # refilling one of the inputs to the function. 63 | index_range = sequence_lengths.data.clone().copy_(torch.arange(0, len(sequence_lengths))) 64 | # This is the equivalent of zipping with index, sorting by the original 65 | # sequence lengths and returning the now sorted indices. 66 | index_range = Variable(index_range.long()) 67 | _, reverse_mapping = permutation_index.sort(0, descending=False) 68 | restoration_indices = index_range.index_select(0, reverse_mapping) 69 | return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index 70 | 71 | 72 | def get_final_encoder_states(encoder_outputs: torch.Tensor, 73 | mask: torch.Tensor, 74 | bidirectional: bool = False) -> torch.Tensor: 75 | """ 76 | Given the output from a ``Seq2SeqEncoder``, with shape ``(batch_size, sequence_length, 77 | encoding_dim)``, this method returns the final hidden state for each element of the batch, 78 | giving a tensor of shape ``(batch_size, encoding_dim)``. This is not as simple as 79 | ``encoder_outputs[:, -1]``, because the sequences could have different lengths. We use the 80 | mask (which has shape ``(batch_size, sequence_length)``) to find the final state for each batch 81 | instance. 82 | Additionally, if ``bidirectional`` is ``True``, we will split the final dimension of the 83 | ``encoder_outputs`` into two and assume that the first half is for the forward direction of the 84 | encoder and the second half is for the backward direction. We will concatenate the last state 85 | for each encoder dimension, giving ``encoder_outputs[:, -1, :encoding_dim/2]`` concated with 86 | ``encoder_outputs[:, 0, encoding_dim/2:]``. 87 | """ 88 | # These are the indices of the last words in the sequences (i.e. length sans padding - 1). We 89 | # are assuming sequences are right padded. 90 | # Shape: (batch_size,) 91 | last_word_indices = mask.sum(1).long() - 1 92 | batch_size, _, encoder_output_dim = encoder_outputs.size() 93 | expanded_indices = last_word_indices.view(-1, 1, 1).expand(batch_size, 1, encoder_output_dim) 94 | # Shape: (batch_size, 1, encoder_output_dim) 95 | final_encoder_output = encoder_outputs.gather(1, expanded_indices) 96 | final_encoder_output = final_encoder_output.squeeze(1) # (batch_size, encoder_output_dim) 97 | if bidirectional: 98 | final_forward_output = final_encoder_output[:, :(encoder_output_dim // 2)] 99 | final_backward_output = encoder_outputs[:, 0, (encoder_output_dim // 2):] 100 | final_encoder_output = torch.cat([final_forward_output, final_backward_output], dim=-1) 101 | return final_encoder_output 102 | 103 | 104 | def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.autograd.Variable): 105 | """ 106 | Computes and returns an element-wise dropout mask for a given tensor, where 107 | each element in the mask is dropped out with probability dropout_probability. 
108 | Note that the mask is NOT applied to the tensor - the tensor is passed to retain 109 | the correct CUDA tensor type for the mask. 110 | Parameters 111 | ---------- 112 | dropout_probability : float, required. 113 | Probability of dropping a dimension of the input. 114 | tensor_for_masking : torch.Variable, required. 115 | Returns 116 | ------- 117 | A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability). 118 | This scaling ensures expected values and variances of the output of applying this mask 119 | and the original tensor are the same. 120 | """ 121 | binary_mask = tensor_for_masking.clone() 122 | binary_mask.data.copy_(torch.rand(tensor_for_masking.size()) > dropout_probability) 123 | # Scale mask by 1/keep_prob to preserve output statistics. 124 | dropout_mask = binary_mask.float().div(1.0 - dropout_probability) 125 | return dropout_mask 126 | 127 | def block_orthogonal(tensor: torch.Tensor, 128 | split_sizes: List[int], 129 | gain: float = 1.0) -> None: 130 | """ 131 | An initializer which allows initializing model parameters in "blocks". This is helpful 132 | in the case of recurrent models which use multiple gates applied to linear projections, 133 | which can be computed efficiently if they are concatenated together. However, they are 134 | separate parameters which should be initialized independently. 135 | Parameters 136 | ---------- 137 | tensor : ``torch.Tensor``, required. 138 | A tensor to initialize. 139 | split_sizes : List[int], required. 140 | A list of length ``tensor.ndim()`` specifying the size of the 141 | blocks along that particular dimension. E.g. ``[10, 20]`` would 142 | result in the tensor being split into chunks of size 10 along the 143 | first dimension and 20 along the second. 144 | gain : float, optional (default = 1.0) 145 | The gain (scaling) applied to the orthogonal initialization. 146 | """ 147 | 148 | if isinstance(tensor, Variable): 149 | # in pytorch 4.0, Variable equals Tensor 150 | # block_orthogonal(tensor.data, split_sizes, gain) 151 | #else: 152 | sizes = list(tensor.size()) 153 | if any([a % b != 0 for a, b in zip(sizes, split_sizes)]): 154 | raise ConfigurationError("tensor dimensions must be divisible by their respective " 155 | "split_sizes. Found size: {} and split_sizes: {}".format(sizes, split_sizes)) 156 | indexes = [list(range(0, max_size, split)) 157 | for max_size, split in zip(sizes, split_sizes)] 158 | # Iterate over all possible blocks within the tensor. 159 | for block_start_indices in itertools.product(*indexes): 160 | # A list of tuples containing the index to start at for this block 161 | # and the appropriate step size (i.e split_size[i] for dimension i). 162 | index_and_step_tuples = zip(block_start_indices, split_sizes) 163 | # This is a tuple of slices corresponding to: 164 | # tensor[index: index + step_size, ...]. This is 165 | # required because we could have an arbitrary number 166 | # of dimensions. The actual slices we need are the 167 | # start_index: start_index + step for each dimension in the tensor. 
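# Example with hypothetical sizes: for a weight of size [20, 50] with
# split_sizes [10, 25], indexes is [[0, 10], [0, 25]], so this loop visits
# the block starts (0, 0), (0, 25), (10, 0) and (10, 25), giving each
# 10 x 25 block its own orthogonal initialisation.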
168 | block_slice = tuple([slice(start_index, start_index + step) 169 | for start_index, step in index_and_step_tuples]) 170 | tensor[block_slice] = torch.nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain) 171 | -------------------------------------------------------------------------------- /elmoformanylangs/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | import collections 4 | import itertools 5 | 6 | 7 | def flatten(lst): 8 | return list(itertools.chain.from_iterable(lst)) 9 | 10 | 11 | def deep_iter(x): 12 | if isinstance(x, list) or isinstance(x, tuple): 13 | for u in x: 14 | for v in deep_iter(u): 15 | yield v 16 | else: 17 | yield 18 | 19 | 20 | def dict2namedtuple(dic): 21 | return collections.namedtuple('Namespace', dic.keys())(**dic) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import logging 5 | import json 6 | import numpy as np 7 | import random 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.utils.tensorboard import SummaryWriter 13 | from scipy.stats import pearsonr 14 | from sklearn.metrics import f1_score, accuracy_score 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--config_path', type=str, required=True) 18 | args = parser.parse_args() 19 | with open(args.config_path, 'r') as f: 20 | args = json.load(f) 21 | 22 | os.makedirs(args['output_path'], exist_ok=True) 23 | 24 | logFormatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S') 25 | log = logging.getLogger() 26 | 27 | fileHandler = logging.FileHandler(os.path.join(args['output_path'], 'log.txt')) 28 | fileHandler.setFormatter(logFormatter) 29 | log.addHandler(fileHandler) 30 | 31 | consoleHandler = logging.StreamHandler(sys.stdout) 32 | consoleHandler.setFormatter(logFormatter) 33 | log.addHandler(consoleHandler) 34 | log.setLevel(logging.DEBUG) 35 | 36 | log.info('{}'.format(args)) 37 | 38 | if not torch.cuda.is_available() and args['gpu']: 39 | log.warning('Cannot use gpu. 
use cpu instead.') 40 | args['gpu'] = False 41 | is_gpu = args['gpu'] 42 | 43 | with open(os.path.join(args['output_path'], 'config.json'), 'w') as f: 44 | json.dump(args, f) 45 | 46 | random.seed(args['seed']) 47 | np.random.seed(args['seed']) 48 | torch.manual_seed(args['seed']) 49 | if is_gpu: 50 | torch.backends.cudnn.deterministic = True 51 | torch.backends.cudnn.benchmark = False 52 | 53 | writer = SummaryWriter(log_dir=os.path.join(args['output_path'], 'runs')) 54 | 55 | 56 | class Data: 57 | def __init__(self, text, label, training): 58 | self.text = text 59 | self.label = label 60 | self.training = training 61 | 62 | def split(self, size): 63 | return (Data(self.text[:-size], self.label[:-size], training=self.training), 64 | Data(self.text[-size:], self.label[-size:], training=False)) 65 | 66 | def __len__(self): 67 | return len(self.text) 68 | 69 | 70 | def load_data(input_path, max_length, training): 71 | text_file = os.path.join(input_path, 'train_text.npy' if training else 'test_text.npy') 72 | label_file = os.path.join(input_path, 'train_label.npy' if training else 'test_label.npy') 73 | log.info('load data from {}, {}, training: {}'.format(text_file, label_file, training)) 74 | text = np.load(text_file, allow_pickle=True) 75 | label = np.load(label_file, allow_pickle=True) 76 | label = torch.tensor([np.array(label[i]) / np.sum(label[i]) for i in range(len(label))], 77 | dtype=torch.float) 78 | if max_length == -1: 79 | text = [torch.tensor(t, dtype=torch.float) for t in text] 80 | else: 81 | text_temp = [] 82 | for t in text: 83 | b = torch.tensor(t, dtype=torch.float) 84 | zero = torch.zeros(max_length, b.size(1)) 85 | length = min(max_length, b.size(0)) 86 | zero[:length, :] = b[:length, :] 87 | text_temp.append(zero) 88 | text = text_temp 89 | log.info('loaded. 
total len: {}'.format(len(text))) 90 | return Data(text, label, training) 91 | 92 | 93 | class BatchGen: 94 | def __init__(self, data, batch_size): 95 | self.batch_size = batch_size 96 | self.data = data 97 | 98 | def __len__(self): 99 | return (len(self.data) + self.batch_size - 1) // self.batch_size 100 | 101 | def __iter__(self): 102 | size = len(self.data) 103 | ids = torch.randperm(size) 104 | if not self.data.training: 105 | ids = torch.arange(size) 106 | 107 | for i in range(len(self)): 108 | batch_idx = ids[self.batch_size * i: self.batch_size * (i + 1)] 109 | label = torch.index_select(self.data.label, 0, batch_idx) 110 | if is_gpu: 111 | label = label.cuda() 112 | text = [self.data.text[j].cuda() for j in batch_idx] 113 | else: 114 | text = [self.data.text[j] for j in batch_idx] 115 | yield (text, label) 116 | 117 | 118 | class Stat: 119 | def __init__(self, training): 120 | self.loss = [] 121 | self.gold_labels = [] 122 | self.norm_gold_labels = [] 123 | self.pred_labels = [] 124 | self.norm_pred_labels = [] 125 | self.training = training 126 | self.save = { 127 | 'acc': [], 128 | 'f1': [], 129 | 'corr': [], 130 | } 131 | 132 | def add(self, pred, gold, loss): 133 | gold_labels = torch.argmax(gold, dim=1).cpu().numpy() 134 | norm_gold_labels = gold.cpu().numpy() 135 | pred_labels = torch.argmax(pred, dim=1).cpu().numpy() 136 | norm_pred_labels = pred.cpu().numpy() 137 | self.loss.append(loss) 138 | self.gold_labels.extend(gold_labels) 139 | self.norm_gold_labels.extend(norm_gold_labels) 140 | self.pred_labels.extend(pred_labels) 141 | self.norm_pred_labels.extend(norm_pred_labels) 142 | 143 | def eval(self): 144 | acc = accuracy_score(self.gold_labels, self.pred_labels) * 100 145 | f1 = f1_score(self.gold_labels, self.pred_labels, average='macro') * 100 146 | norm_gold = np.asarray(self.norm_gold_labels).transpose((1, 0)) 147 | norm_pred = np.asarray(self.norm_pred_labels).transpose((1, 0)) 148 | corr = sum([pearsonr(norm_gold[i], norm_pred[i])[0] for i in range(len(norm_gold))]) / len(norm_gold) 149 | return acc, f1, corr 150 | 151 | def log(self, global_step, epoch, batch): 152 | acc, f1, corr = self.eval() 153 | if self.training: 154 | loss = sum(self.loss) / len(self.loss) 155 | log.info('step: {}, epoch: {}, batch: {}, loss: {}, acc: {}, f1: {}, r: {}'.format( 156 | global_step, epoch, batch, loss, acc, f1, corr)) 157 | writer.add_scalar('train_Loss', loss, global_step) 158 | writer.add_scalar('train_Accuracy', acc, global_step) 159 | writer.add_scalar('train_F1_macro', f1, global_step) 160 | writer.add_scalar('train_CORR', corr, global_step) 161 | else: 162 | log.info('step: {}, epoch: {}, acc: {}, f1: {}, r: {}'.format( 163 | global_step, epoch, acc, f1, corr)) 164 | writer.add_scalar('dev_Accuracy', acc, global_step) 165 | writer.add_scalar('dev_F1_macro', f1, global_step) 166 | writer.add_scalar('dev_CORR', corr, global_step) 167 | self.save['acc'].append(acc) 168 | self.save['f1'].append(f1) 169 | self.save['corr'].append(corr) 170 | self.loss = [] 171 | self.gold_labels = [] 172 | self.norm_gold_labels = [] 173 | self.pred_labels = [] 174 | self.norm_pred_labels = [] 175 | 176 | 177 | class MLP(nn.Module): 178 | """ 179 | b: batch_size, n: seq_len, d: embedding_size 180 | """ 181 | def __init__(self, config): 182 | super().__init__() 183 | opt = config['mlp'] 184 | self.max_length = opt['max_length'] 185 | dropout = opt['dropout'] 186 | u = opt['hidden_size'] 187 | self.mlp = nn.Sequential( 188 | nn.Linear(self.max_length * config['embedding_size'], u), 189 | 
nn.ReLU(), 190 | nn.Dropout(dropout), 191 | nn.Linear(u, config['num_labels']), 192 | ) 193 | self.loss_type = opt['loss'] 194 | if self.loss_type == 'l1': 195 | self.loss = nn.L1Loss() 196 | elif self.loss_type == 'mse': 197 | self.loss = nn.MSELoss() 198 | elif self.loss_type == 'cross_entropy': 199 | self.loss = nn.CrossEntropyLoss() 200 | else: 201 | log.fatal('Invalid loss type. Should be "l1" or "cross_entropy"') 202 | 203 | def forward(self, embedding, gold_labels=None): 204 | """ 205 | :param embedding: [b, n, d] 206 | :param gold_labels: [b, num_labels] 207 | :return: If training, return (loss, predicted labels). Else return predicted labels 208 | """ 209 | data = torch.stack(embedding) 210 | output = self.mlp(data.view(data.size(0), -1)) 211 | labels = F.softmax(output, dim=1) 212 | if not self.training: 213 | return labels.detach() 214 | if self.loss_type == 'cross_entropy': 215 | loss = self.loss(output, torch.argmax(gold_labels, dim=1)) 216 | else: 217 | loss = self.loss(labels, gold_labels) 218 | return loss, labels.detach() 219 | 220 | 221 | class CNN(nn.Module): 222 | def __init__(self, config): 223 | super().__init__() 224 | opt = config['cnn'] 225 | self.cnn_1 = nn.Sequential( 226 | nn.Conv1d(config['embedding_size'], opt['conv_1']['size'], opt['conv_1']['kernel_size'], 227 | padding=opt['conv_1']['kernel_size'] // 2), 228 | # nn.BatchNorm1d(opt['conv_1']['size']), 229 | nn.ReLU(), 230 | nn.Dropout(opt['conv_1']['dropout']), 231 | nn.MaxPool1d(opt['max_pool_1']['kernel_size'], opt['max_pool_1']['stride']), 232 | ) 233 | """ 234 | self.cnn_2 = nn.Sequential( 235 | nn.Conv1d(opt['conv_1']['size'], opt['conv_2']['size'], opt['conv_2']['kernel_size'], 236 | padding=opt['conv_2']['kernel_size'] // 2), 237 | nn.ReLU(), 238 | nn.Dropout(opt['conv_2']['dropout']), 239 | nn.MaxPool1d(opt['max_pool_2']['kernel_size'], opt['max_pool_2']['stride']), 240 | ) 241 | """ 242 | mlp_u = opt['fc']['hidden_size'] 243 | self.mlp = nn.Sequential( 244 | nn.Linear(opt['conv_1']['size'] * opt['max_length'] // 2, mlp_u), 245 | nn.ReLU(), 246 | nn.Dropout(opt['fc']['dropout']), 247 | nn.Linear(mlp_u, config['num_labels']), 248 | ) 249 | self.loss_type = opt['loss'] 250 | if self.loss_type == 'l1': 251 | self.loss = nn.L1Loss() 252 | elif self.loss_type == 'mse': 253 | self.loss = nn.MSELoss() 254 | elif self.loss_type == 'cross_entropy': 255 | self.loss = nn.CrossEntropyLoss() 256 | else: 257 | log.fatal('Invalid loss type. Should be "l1" or "cross_entropy"') 258 | 259 | def forward(self, embedding, gold_labels=None): 260 | """ 261 | :param embedding: [b, n, d] 262 | :param gold_labels: [b, num_labels] 263 | :return: If training, return (loss, predicted labels). 
Else return predicted labels 264 | """ 265 | data = torch.stack(embedding).transpose(1, 2) # [b, d, n] 266 | out_1 = self.cnn_1(data) 267 | # out_2 = self.cnn_2(out_1) 268 | # output = self.mlp(out_2.view(out_2.size(0), -1)) 269 | output = self.mlp(out_1.view(out_1.size(0), -1)) 270 | labels = F.softmax(output, dim=1) 271 | if not self.training: 272 | return labels.detach() 273 | if self.loss_type == 'cross_entropy': 274 | loss = self.loss(output, torch.argmax(gold_labels, dim=1)) 275 | else: 276 | loss = self.loss(labels, gold_labels) 277 | return loss, labels.detach() 278 | 279 | 280 | class RNN(nn.Module): 281 | """ 282 | b: batch_size, n: seq_len, u: rnn_hidden_size, da: param_da, r: param_r, d: embedding_size 283 | """ 284 | 285 | def __init__(self, config): 286 | super().__init__() 287 | opt = config['rnn'] 288 | u = opt['rnn_hidden_size'] 289 | da = opt['param_da'] 290 | r = opt['param_r'] 291 | d = config['embedding_size'] 292 | num_layers = opt['num_layers'] 293 | bidirectional = opt['bidirectional'] 294 | if opt['type'] == 'lstm': 295 | self.rnn = nn.LSTM(input_size=d, hidden_size=u, num_layers=num_layers, 296 | bidirectional=bidirectional, batch_first=True) 297 | elif opt['type'] == 'gru': 298 | self.rnn = nn.GRU(input_size=d, hidden_size=u, num_layers=num_layers, 299 | bidirectional=bidirectional, batch_first=True) 300 | else: 301 | log.fatal('Invalid rnn type. Should be "lstm" or "gru"') 302 | if bidirectional: 303 | u = u * 2 304 | mlp_u = opt['mlp_hidden_size'] 305 | self.mlp = nn.Sequential( 306 | nn.Linear(r * u, mlp_u), 307 | nn.ReLU(), 308 | nn.Dropout(opt['dropout']), 309 | nn.Linear(mlp_u, config['num_labels']), 310 | ) 311 | self.Ws1 = nn.Parameter(torch.randn(da, u)) 312 | self.Ws2 = nn.Parameter(torch.randn(r, da)) 313 | self.p_c = opt['p_coefficient'] 314 | self.loss_type = opt['loss'] 315 | if self.loss_type == 'l1': 316 | self.loss = nn.L1Loss() 317 | elif self.loss_type == 'mse': 318 | self.loss = nn.MSELoss() 319 | elif self.loss_type == 'cross_entropy': 320 | self.loss = nn.CrossEntropyLoss() 321 | else: 322 | log.fatal('Invalid loss type. Should be "l1" or "cross_entropy"') 323 | 324 | def forward(self, embedding, gold_labels=None): 325 | """ 326 | :param embedding: [b, n, d] 327 | :param gold_labels: [b, num_labels] 328 | :return: If training, return (loss, predicted labels). 
Else return predicted labels
329 | """
330 | padded = nn.utils.rnn.pad_sequence(embedding, batch_first=True) # [b, n, d]
331 | H = self.rnn(padded)[0] # [b, n, u]
332 | A = F.softmax(torch.matmul(self.Ws2, torch.tanh(torch.matmul(self.Ws1, H.transpose(1, 2)))), dim=2) # [b, r, n]
333 | M = torch.matmul(A, H) # [b, r, u]
334 | output = self.mlp(M.view(M.size(0), -1))
335 | labels = F.softmax(output, dim=1)
336 | if not self.training:
337 | return labels.detach()
338 | I = torch.eye(A.size(1))
339 | if is_gpu:
340 | I = I.cuda()
341 | tmp = torch.matmul(A, A.transpose(1, 2)) - I
342 | P = (tmp * tmp).sum() / A.size(0)
343 | loss = self.p_c * P  # self-attention penalization term, weighted by p_coefficient
344 | if self.loss_type == 'cross_entropy':
345 | loss = loss + self.loss(output, torch.argmax(gold_labels, dim=1))
346 | else:
347 | loss = loss + self.loss(labels, gold_labels)
348 | return loss, labels.detach()
349 |
350 |
351 | def main():
352 | log.info('Loading Train Data')
353 | batch_size = args['batch_size']
354 | train_data = load_data(args['input_path'],
355 | -1 if args['type'] == 'rnn' else args[args['type']]['max_length'],
356 | True)
357 | train_data, dev_data = train_data.split(len(train_data) // 10)
358 | log.info('Train: length: {}, total batch: {}, batch size: {}'.format(
359 | len(train_data), (len(train_data) + batch_size - 1) // batch_size, batch_size))
360 | log.info('Dev: length: {}, total batch: {}, batch size: {}'.format(
361 | len(dev_data), (len(dev_data) + batch_size - 1) // batch_size, batch_size))
362 |
363 | log.info('Loading model {}'.format(args['type']))
364 | model = None
365 | if args['type'] == 'rnn':
366 | model = RNN(args)
367 | elif args['type'] == 'cnn':
368 | model = CNN(args)
369 | elif args['type'] == 'mlp':
370 | model = MLP(args)
371 | else:
372 | log.fatal('Invalid type. Should be "rnn", "cnn" or "mlp"')
373 |
374 | if is_gpu:
375 | model.cuda()
376 |
377 | optimizer = None
378 | if args['optimizer'] == 'adagrad':
379 | optimizer = optim.Adagrad(model.parameters(), lr=args['lr'],
380 | lr_decay=args['lr_decay'], weight_decay=args['weight_decay'])
381 | elif args['optimizer'] == 'sgd':
382 | optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum'],
383 | weight_decay=args['weight_decay'])
384 | else:
385 | log.fatal('Invalid optimizer type. 
Should be "adagrad" or "sgd"')
386 |
387 | train_stat = Stat(True)
388 | eval_stat = Stat(False)
389 | best_epoch = -1
390 | best_state_dict = None
391 |
392 | for epoch in range(args['num_epochs']):
393 | log.info('*** epoch: {} ***'.format(epoch + 1))
394 |
395 | log.info('*** training ***')
396 | model.train()
397 | gen = BatchGen(train_data, batch_size)
398 | cnt = 0
399 | for batch, data in enumerate(gen):
400 | optimizer.zero_grad()
401 | loss, pred_labels = model(data[0], data[1])
402 | loss.backward()
403 | optimizer.step()
404 | cnt += 1
405 | if cnt == args['display_per_batch']:
406 | cnt = 0
407 | train_stat.add(pred_labels, data[1], loss.item())
408 | train_stat.log(epoch * len(gen) + batch + 1, epoch, batch)
409 |
410 | log.info('*** evaluating ***')
411 | model.eval()
412 | gen = BatchGen(dev_data, batch_size)
413 | for batch, data in enumerate(gen):
414 | with torch.no_grad():
415 | pred_labels = model(data[0])
416 | eval_stat.add(pred_labels, data[1], None)
417 | eval_stat.log(epoch + 1, epoch, None)
418 | if best_epoch == -1 or eval_stat.save['acc'][-1] > eval_stat.save['acc'][best_epoch]:
419 | best_epoch = epoch
420 | best_state_dict = model.state_dict()
421 |
422 | log.info('\n*** Best acc model ***\nepoch: {}\nacc: {}\nf1: {}\ncorr: {}'.format(
423 | best_epoch + 1, eval_stat.save['acc'][best_epoch], eval_stat.save['f1'][best_epoch],
424 | eval_stat.save['corr'][best_epoch]))
425 | writer.close()
426 |
427 | log.info('Loading Test Data')
428 | test_data = load_data(args['input_path'],
429 | -1 if args['type'] == 'rnn' else args[args['type']]['max_length'],
430 | False)
431 | log.info('Test: length: {}, total batch: {}, batch size: {}'.format(
432 | len(test_data), (len(test_data) + batch_size - 1) // batch_size, batch_size))
433 |
434 | model.load_state_dict(best_state_dict)
435 | model.eval()
436 | gen = BatchGen(test_data, batch_size)  # evaluate the best checkpoint on the test set
437 | for batch, data in enumerate(gen):
438 | with torch.no_grad():
439 | pred_labels = model(data[0])
440 | eval_stat.add(pred_labels, data[1], None)
441 | acc, f1, corr = eval_stat.eval()
442 | log.info('\n*** Test Result ***\nacc: {}\nf1: {}\ncorr: {}'.format(acc, f1, corr))
443 |
444 |
445 | if __name__ == '__main__':
446 | main()
447 | -------------------------------------------------------------------------------- /preprocess.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from elmoformanylangs import Embedder
4 | import argparse
5 | import logging
6 | import numpy as np
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--type', type=str, required=True)
10 | parser.add_argument('--elmo_model_path', type=str)
11 | parser.add_argument('--vector_path', type=str)
12 | parser.add_argument('--train_file', type=str, required=True)
13 | parser.add_argument('--test_file', type=str, required=True)
14 | parser.add_argument('--output_path', type=str, required=True)
15 | args = parser.parse_args()
16 |
17 | os.makedirs(args.output_path, exist_ok=True)
18 |
19 | logFormatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
20 | log = logging.getLogger()
21 |
22 | fileHandler = logging.FileHandler(os.path.join(args.output_path, 'log.txt'))
23 | fileHandler.setFormatter(logFormatter)
24 | log.addHandler(fileHandler)
25 |
26 | consoleHandler = logging.StreamHandler(sys.stdout)
27 | consoleHandler.setFormatter(logFormatter)
28 | log.addHandler(consoleHandler)
29 | log.setLevel(logging.DEBUG)
30 |
31 | 
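# Example invocation (a sketch; run_preprocess_elmo.sh and
# run_preprocess_word2vec.sh in the repository root wrap the same arguments):
#
#   python3 preprocess.py --type elmo \
#       --elmo_model_path data/zhs.model \
#       --train_file data/sinanews.train \
#       --test_file data/sinanews.test \
#       --output_path data/elmo_temp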
log.info('=====Pre-processing=====') 32 | 33 | log.info('{}'.format(args)) 34 | 35 | if args.type == 'elmo': 36 | e = Embedder(args.elmo_model_path, batch_size=2) 37 | 38 | 39 | def work(input_path, output_text_file, output_label_file): 40 | log.info('Loading data') 41 | 42 | label_list = [] 43 | text_list = [] 44 | 45 | with open(input_path, 'r') as f: 46 | for line in f.readlines(): 47 | data = line.strip().split('\t') 48 | data[1] = data[1].strip().split() 49 | label = [0 for i in range(8)] 50 | for i in range(0, 8): 51 | label[i] = int(data[1][1 + i].split(':')[1]) 52 | label_list.append(label) 53 | text_list.append(data[2].strip().split()) 54 | 55 | log.info('size: {}'.format(len(text_list))) 56 | 57 | seq_len = [len(x) for x in text_list] 58 | log.info('max seq len: {}'.format(max(seq_len))) 59 | log.info('ava seq len: {:.3f}'.format(sum(seq_len) / len(seq_len))) 60 | 61 | if args.type == 'elmo': 62 | log.info('Loading elmo model') 63 | log.info(' Loaded') 64 | log.info('Processing') 65 | text_embed_list = e.sents2elmo(text_list) 66 | log.info(' Done') 67 | elif args.type == 'word2vec': 68 | log.info('Loading word2vec model') 69 | 70 | # https://github.com/Embedding/Chinese-Word-Vectors/blob/master/evaluation/ana_eval_dense.py 71 | def read_vectors(path, topn): # read top n word vectors, i.e. top is 10000 72 | lines_num, dim = 0, 0 73 | vectors = {} 74 | with open(path, encoding='utf-8', errors='ignore') as f: 75 | first_line = True 76 | for l in f: 77 | if first_line: 78 | first_line = False 79 | dim = int(l.rstrip().split()[1]) 80 | continue 81 | lines_num += 1 82 | tokens = l.rstrip().split(' ') 83 | vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]]) 84 | if topn != 0 and lines_num >= topn: 85 | break 86 | return vectors, dim 87 | 88 | vct, dim = read_vectors(args.vector_path, 0) 89 | # https://github.com/Embedding/Chinese-Word-Vectors/issues/23 90 | avg = np.zeros(dim) 91 | ''' 92 | for v in vct.values(): 93 | avg += v 94 | avg /= len(vct) 95 | ''' 96 | log.info(' Loaded') 97 | log.info('Processing, dim: {}'.format(dim)) 98 | text_embed_list = [] 99 | for sen in text_list: 100 | sen_embed = [] 101 | for w in sen: 102 | if w in vct: 103 | w_embed = vct[w] 104 | else: 105 | w_embed = avg 106 | sen_embed.append(w_embed) 107 | text_embed_list.append(sen_embed) 108 | log.info(' Done') 109 | else: 110 | log.fatal('Invalid type. 
Should be "elmo" or "word2vec"') 111 | 112 | log.info('sample: \n{}'.format(text_embed_list[0][0])) 113 | np.save(output_text_file, text_embed_list) 114 | np.save(output_label_file, label_list) 115 | 116 | 117 | work(args.train_file, 118 | os.path.join(args.output_path, 'train_text.npy'), 119 | os.path.join(args.output_path, 'train_label.npy')) 120 | 121 | work(args.test_file, 122 | os.path.join(args.output_path, 'test_text.npy'), 123 | os.path.join(args.output_path, 'test_label.npy')) 124 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.7.1 2 | alabaster==0.7.12 3 | allennlp==0.8.3 4 | asn1crypto==0.24.0 5 | astor==0.8.0 6 | atomicwrites==1.3.0 7 | attrs==19.1.0 8 | aws-sam-translator==1.11.0 9 | aws-xray-sdk==2.4.2 10 | awscli==1.16.163 11 | Babel==2.6.0 12 | bert-serving-client==1.9.1 13 | bert-serving-server==1.9.1 14 | bleach==1.5.0 15 | blis==0.2.4 16 | boto==2.49.0 17 | boto3==1.9.153 18 | botocore==1.12.153 19 | certifi==2019.3.9 20 | cffi==1.12.3 21 | cfn-lint==0.21.0 22 | chardet==3.0.4 23 | Click==7.0 24 | colorama==0.3.9 25 | conllu==0.11 26 | cookies==2.2.1 27 | cryptography==2.6.1 28 | cycler==0.10.0 29 | cymem==2.0.2 30 | cytoolz==0.9.0.1 31 | dill==0.2.9 32 | docker==4.0.1 33 | docker-pycreds==0.4.0 34 | docutils==0.14 35 | ecdsa==0.13.3 36 | editdistance==0.5.3 37 | en-core-web-sm==2.0.0 38 | enum34==1.1.6 39 | flaky==3.5.3 40 | Flask==1.0.3 41 | Flask-Cors==3.0.7 42 | ftfy==5.5.1 43 | future==0.17.1 44 | gast==0.2.2 45 | gevent==1.4.0 46 | GPUtil==1.4.0 47 | greenlet==0.4.15 48 | grpcio==1.20.1 49 | h5py==2.9.0 50 | html5lib==0.9999999 51 | idna==2.7 52 | imagesize==1.1.0 53 | itsdangerous==1.1.0 54 | Jinja2==2.10.1 55 | jmespath==0.9.4 56 | joblib==0.13.2 57 | jsondiff==1.1.2 58 | jsonnet==0.12.1 59 | jsonpatch==1.23 60 | jsonpickle==1.1 61 | jsonpointer==2.0 62 | jsonschema==2.6.0 63 | Keras-Applications==1.0.7 64 | Keras-Preprocessing==1.0.9 65 | kiwisolver==1.1.0 66 | Markdown==3.1.1 67 | MarkupSafe==1.1.1 68 | matplotlib==3.1.0 69 | mock==3.0.5 70 | more-itertools==7.0.0 71 | moto==1.3.8 72 | msgpack==0.5.6 73 | msgpack-numpy==0.4.4.3 74 | msgpack-python==0.5.6 75 | murmurhash==1.0.2 76 | nltk==3.4.5 77 | numpy==1.16.3 78 | numpydoc==0.9.1 79 | overrides==1.9 80 | packaging==19.0 81 | pandas==0.24.2 82 | parsimonious==0.8.1 83 | pbkdf2==1.3 84 | pbr==5.2.0 85 | Pillow==6.2.0 86 | plac==0.9.6 87 | pluggy==0.11.0 88 | pprint==0.1 89 | preshed==2.0.1 90 | protobuf==3.7.1 91 | py==1.8.0 92 | pyaml==19.4.1 93 | pyasn1==0.4.5 94 | pycparser==2.19 95 | pycryptodome==3.8.1 96 | Pygments==2.4.0 97 | pyparsing==2.4.0 98 | pytest==4.5.0 99 | python-dateutil==2.8.0 100 | python-jose==3.0.1 101 | pytorch-pretrained-bert==0.6.2 102 | pytz==2019.1 103 | PyYAML==5.1 104 | pyzmq==18.0.1 105 | regex==2019.4.14 106 | requests==2.22.0 107 | responses==0.10.6 108 | rsa==3.4.2 109 | s3transfer==0.2.0 110 | scikit-learn==0.21.1 111 | scipy==1.3.0 112 | singledispatch==3.4.0.3 113 | six==1.12.0 114 | snowballstemmer==1.2.1 115 | spacy==2.1.4 116 | Sphinx==2.0.1 117 | sphinxcontrib-applehelp==1.0.1 118 | sphinxcontrib-devhelp==1.0.1 119 | sphinxcontrib-htmlhelp==1.0.2 120 | sphinxcontrib-jsmath==1.0.1 121 | sphinxcontrib-qthelp==1.0.2 122 | sphinxcontrib-serializinghtml==1.1.3 123 | sphinxcontrib-websupport==1.1.2 124 | sqlparse==0.3.0 125 | srsly==0.0.5 126 | tb-nightly==1.14.0a20190523 127 | tensorboard==1.13.1 128 | tensorboardX==1.7 129 | 
tensorflow-estimator==1.13.0 130 | tensorflow-gpu==1.15.0 131 | tensorflow-tensorboard==1.5.1 132 | termcolor==1.1.0 133 | thinc==7.0.4 134 | toolz==0.9.0 135 | torch==1.1.0 136 | tqdm==4.32.1 137 | ujson==1.35 138 | Unidecode==1.0.23 139 | urllib3==1.25.2 140 | wasabi==0.2.2 141 | wcwidth==0.1.7 142 | webencodings==0.5.1 143 | websocket-client==0.56.0 144 | Werkzeug==0.15.4 145 | word2number==1.1 146 | wrapt==1.11.1 147 | xmltodict==0.12.0 148 | -------------------------------------------------------------------------------- /run_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python='python3' 4 | 5 | CUDA_VISIBLE_DEVICES=$1 ${python} main.py --config_path config_cnn.json 6 | -------------------------------------------------------------------------------- /run_mlp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python='python3' 4 | 5 | CUDA_VISIBLE_DEVICES=$1 ${python} main.py --config_path config_mlp.json 6 | -------------------------------------------------------------------------------- /run_preprocess_elmo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python='python3' 4 | 5 | CUDA_VISIBLE_DEVICES=$1 ${python} preprocess.py \ 6 | --type elmo \ 7 | --elmo_model_path data/zhs.model \ 8 | --train_file data/sinanews.train \ 9 | --test_file data/sinanews.test \ 10 | --output_path data/elmo_temp 11 | -------------------------------------------------------------------------------- /run_preprocess_word2vec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python='python3' 4 | 5 | CUDA_VISIBLE_DEVICES=$1 ${python} preprocess.py \ 6 | --type word2vec \ 7 | --vector_path data/word2vec/sgns.sogounews.bigram-char \ 8 | --train_file data/sinanews.train \ 9 | --test_file data/sinanews.test \ 10 | --output_path data/word2vec_temp 11 | -------------------------------------------------------------------------------- /run_rnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python='python3' 4 | 5 | CUDA_VISIBLE_DEVICES=$1 ${python} main.py --config_path config_rnn.json 6 | -------------------------------------------------------------------------------- /save/bi-gru_1/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-gru_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "gru", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/bi-gru_1/runs/events.out.tfevents.1559405473.gpu-theta.8556.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-gru_1/runs/events.out.tfevents.1559405473.gpu-theta.8556.0 -------------------------------------------------------------------------------- /save/bi-lstm_1/config.json: 
-------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/bi-lstm_1/runs/events.out.tfevents.1559400227.gpu-theta.32041.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_1/runs/events.out.tfevents.1559400227.gpu-theta.32041.0 -------------------------------------------------------------------------------- /save/bi-lstm_2/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_2", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.9, "p_coefficient": 0.3, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/bi-lstm_2/runs/events.out.tfevents.1559438048.gpu-theta.14643.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_2/runs/events.out.tfevents.1559438048.gpu-theta.14643.0 -------------------------------------------------------------------------------- /save/bi-lstm_3/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_3", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 1024, "dropout": 0.9, "p_coefficient": 0.3, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/bi-lstm_3/runs/events.out.tfevents.1559440265.gpu-theta.15460.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_3/runs/events.out.tfevents.1559440265.gpu-theta.15460.0 -------------------------------------------------------------------------------- /save/bi-lstm_4/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_4", "gpu": 
true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 512, "mlp_hidden_size": 1024, "dropout": 0.5, "p_coefficient": 0.3, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/bi-lstm_4/runs/events.out.tfevents.1559441032.gpu-theta.21582.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_4/runs/events.out.tfevents.1559441032.gpu-theta.21582.0 -------------------------------------------------------------------------------- /save/cnn_1/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.5}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.5}, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/cnn_1/runs/events.out.tfevents.1559399753.gpu-theta.10281.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_1/runs/events.out.tfevents.1559399753.gpu-theta.10281.0 -------------------------------------------------------------------------------- /save/cnn_2/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_2", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.5}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.5}, "loss": "l1"}} -------------------------------------------------------------------------------- /save/cnn_2/runs/events.out.tfevents.1559409399.gpu-theta.22755.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_2/runs/events.out.tfevents.1559409399.gpu-theta.22755.0 -------------------------------------------------------------------------------- /save/cnn_3/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_3", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, 
"type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 1024, "kernel_size": 3, "dropout": 0.5}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.5}, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/cnn_3/runs/events.out.tfevents.1559409541.gpu-theta.30631.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_3/runs/events.out.tfevents.1559409541.gpu-theta.30631.0 -------------------------------------------------------------------------------- /save/cnn_4/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_4", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.9}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.9}, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/cnn_4/runs/events.out.tfevents.1559437928.gpu-theta.6850.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_4/runs/events.out.tfevents.1559437928.gpu-theta.6850.0 -------------------------------------------------------------------------------- /save/cnn_5/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_5", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.9}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.9}, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/cnn_5/runs/events.out.tfevents.1559437980.gpu-theta.10531.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_5/runs/events.out.tfevents.1559437980.gpu-theta.10531.0 -------------------------------------------------------------------------------- /save/cnn_6/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/elmo_temp", "output_path": "save/cnn_6", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 1024, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.9}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.9}, "loss": "cross_entropy"}} 
-------------------------------------------------------------------------------- /save/cnn_6/runs/events.out.tfevents.1559443831.gpu-theta.16155.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_6/runs/events.out.tfevents.1559443831.gpu-theta.16155.0 -------------------------------------------------------------------------------- /save/cnn_7/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_7", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.5}, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/cnn_7/runs/events.out.tfevents.1559462006.gpu-theta.17687.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_7/runs/events.out.tfevents.1559462006.gpu-theta.17687.0 -------------------------------------------------------------------------------- /save/cnn_8/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_8", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.5}, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/cnn_8/runs/events.out.tfevents.1559462040.gpu-theta.19995.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_8/runs/events.out.tfevents.1559462040.gpu-theta.19995.0 -------------------------------------------------------------------------------- /save/gru_1/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/gru_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "gru", "bidirectional": false, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/gru_1/runs/events.out.tfevents.1559406000.gpu-theta.3523.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/gru_1/runs/events.out.tfevents.1559406000.gpu-theta.3523.0 -------------------------------------------------------------------------------- /save/lstm_1/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/lstm_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": false, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/lstm_1/runs/events.out.tfevents.1559402877.gpu-theta.2069.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/lstm_1/runs/events.out.tfevents.1559402877.gpu-theta.2069.0 -------------------------------------------------------------------------------- /save/mlp_1/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/mlp_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "mlp", "mlp": {"max_length": 512, "dropout": 0.5, "hidden_size": 512, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/mlp_1/runs/events.out.tfevents.1559399750.gpu-theta.9979.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/mlp_1/runs/events.out.tfevents.1559399750.gpu-theta.9979.0 -------------------------------------------------------------------------------- /save/mlp_2/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/word2vec_temp", "output_path": "save/mlp_2", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "mlp", "mlp": {"max_length": 512, "dropout": 0.9, "hidden_size": 512, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/mlp_2/runs/events.out.tfevents.1559409302.gpu-theta.16485.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/mlp_2/runs/events.out.tfevents.1559409302.gpu-theta.16485.0 -------------------------------------------------------------------------------- /save/mlp_3/config.json: -------------------------------------------------------------------------------- 1 | {"input_path": "data/elmo_temp", "output_path": "save/mlp_3", "gpu": true, 
"seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 1024, "type": "mlp", "mlp": {"max_length": 512, "dropout": 0.5, "hidden_size": 512, "loss": "cross_entropy"}} -------------------------------------------------------------------------------- /save/mlp_3/runs/events.out.tfevents.1559438821.gpu-theta.15937.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/mlp_3/runs/events.out.tfevents.1559438821.gpu-theta.15937.0 --------------------------------------------------------------------------------