├── .gitignore
├── .idea
│   ├── deployment.xml
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── ssc.iml
│   └── vcs.xml
├── README.md
├── config_cnn.json
├── config_mlp.json
├── config_rnn.json
├── data
│   ├── .gitignore
│   └── sinanews.demo
├── doc
│   ├── .gitignore
│   ├── bn_dev.svg
│   ├── bn_train.svg
│   ├── col_bi-gru.png
│   ├── col_bi-lstm.png
│   ├── col_cnn.png
│   ├── col_gru.png
│   ├── col_lstm.png
│   ├── col_mlp.png
│   ├── dev_Accuracy.svg
│   ├── dev_CORR.svg
│   ├── dev_F1_macro.svg
│   ├── dropout_dev.svg
│   ├── dropout_train.svg
│   ├── embed_dev.svg
│   ├── embed_train.svg
│   ├── self-attention_dev.svg
│   ├── self-attention_train.svg
│   └── train_Accuracy.svg
├── elmoformanylangs
│   ├── __init__.py
│   ├── __main__.py
│   ├── biLM.py
│   ├── dataloader.py
│   ├── elmo.py
│   ├── frontend.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── classify_layer.py
│   │   ├── elmo.py
│   │   ├── embedding_layer.py
│   │   ├── encoder_base.py
│   │   ├── highway.py
│   │   ├── lstm.py
│   │   ├── lstm_cell_with_projection.py
│   │   ├── token_embedder.py
│   │   └── util.py
│   └── utils.py
├── main.py
├── preprocess.py
├── requirements.txt
├── run_cnn.sh
├── run_mlp.sh
├── run_preprocess_elmo.sh
├── run_preprocess_word2vec.sh
├── run_rnn.sh
└── save
    ├── bi-gru_1
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559405473.gpu-theta.8556.0
    ├── bi-lstm_1
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559400227.gpu-theta.32041.0
    ├── bi-lstm_2
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559438048.gpu-theta.14643.0
    ├── bi-lstm_3
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559440265.gpu-theta.15460.0
    ├── bi-lstm_4
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559441032.gpu-theta.21582.0
    ├── cnn_1
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559399753.gpu-theta.10281.0
    ├── cnn_2
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559409399.gpu-theta.22755.0
    ├── cnn_3
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559409541.gpu-theta.30631.0
    ├── cnn_4
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559437928.gpu-theta.6850.0
    ├── cnn_5
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559437980.gpu-theta.10531.0
    ├── cnn_6
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559443831.gpu-theta.16155.0
    ├── cnn_7
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559462006.gpu-theta.17687.0
    ├── cnn_8
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559462040.gpu-theta.19995.0
    ├── gru_1
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559406000.gpu-theta.3523.0
    ├── lstm_1
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559402877.gpu-theta.2069.0
    ├── mlp_1
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559399750.gpu-theta.9979.0
    ├── mlp_2
    │   ├── config.json
    │   ├── log.txt
    │   └── runs
    │       └── events.out.tfevents.1559409302.gpu-theta.16485.0
    └── mlp_3
        ├── config.json
        ├── log.txt
        └── runs
            └── events.out.tfevents.1559438821.gpu-theta.15937.0
/.gitignore:
--------------------------------------------------------------------------------
1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
3 |
4 | # User-specific stuff
5 | .idea/**/workspace.xml
6 | .idea/**/tasks.xml
7 | .idea/**/usage.statistics.xml
8 | .idea/**/dictionaries
9 | .idea/**/shelf
10 |
11 | # Generated files
12 | .idea/**/contentModel.xml
13 |
14 | # Sensitive or high-churn files
15 | .idea/**/dataSources/
16 | .idea/**/dataSources.ids
17 | .idea/**/dataSources.local.xml
18 | .idea/**/sqlDataSources.xml
19 | .idea/**/dynamic.xml
20 | .idea/**/uiDesigner.xml
21 | .idea/**/dbnavigator.xml
22 |
23 | # Gradle
24 | .idea/**/gradle.xml
25 | .idea/**/libraries
26 |
27 | # Gradle and Maven with auto-import
28 | # When using Gradle or Maven with auto-import, you should exclude module files,
29 | # since they will be recreated, and may cause churn. Uncomment if using
30 | # auto-import.
31 | # .idea/modules.xml
32 | # .idea/*.iml
33 | # .idea/modules
34 | # *.iml
35 | # *.ipr
36 |
37 | # CMake
38 | cmake-build-*/
39 |
40 | # Mongo Explorer plugin
41 | .idea/**/mongoSettings.xml
42 |
43 | # File-based project format
44 | *.iws
45 |
46 | # IntelliJ
47 | out/
48 |
49 | # mpeltonen/sbt-idea plugin
50 | .idea_modules/
51 |
52 | # JIRA plugin
53 | atlassian-ide-plugin.xml
54 |
55 | # Cursive Clojure plugin
56 | .idea/replstate.xml
57 |
58 | # Crashlytics plugin (for Android Studio and IntelliJ)
59 | com_crashlytics_export_strings.xml
60 | crashlytics.properties
61 | crashlytics-build.properties
62 | fabric.properties
63 |
64 | # Editor-based Rest Client
65 | .idea/httpRequests
66 |
67 | # Android studio 3.1+ serialized cache file
68 | .idea/caches/build_file_checksums.ser
69 |
70 | # Byte-compiled / optimized / DLL files
71 | __pycache__/
72 | *.py[cod]
73 | *$py.class
74 |
75 | # C extensions
76 | *.so
77 |
78 | # Distribution / packaging
79 | .Python
80 | build/
81 | develop-eggs/
82 | dist/
83 | downloads/
84 | eggs/
85 | .eggs/
86 | lib/
87 | lib64/
88 | parts/
89 | sdist/
90 | var/
91 | wheels/
92 | pip-wheel-metadata/
93 | share/python-wheels/
94 | *.egg-info/
95 | .installed.cfg
96 | *.egg
97 | MANIFEST
98 |
99 | # PyInstaller
100 | # Usually these files are written by a python script from a template
101 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
102 | *.manifest
103 | *.spec
104 |
105 | # Installer logs
106 | pip-log.txt
107 | pip-delete-this-directory.txt
108 |
109 | # Unit test / coverage reports
110 | htmlcov/
111 | .tox/
112 | .nox/
113 | .coverage
114 | .coverage.*
115 | .cache
116 | nosetests.xml
117 | coverage.xml
118 | *.cover
119 | .hypothesis/
120 | .pytest_cache/
121 |
122 | # Translations
123 | *.mo
124 | *.pot
125 |
126 | # Django stuff:
127 | *.log
128 | local_settings.py
129 | db.sqlite3
130 | db.sqlite3-journal
131 |
132 | # Flask stuff:
133 | instance/
134 | .webassets-cache
135 |
136 | # Scrapy stuff:
137 | .scrapy
138 |
139 | # Sphinx documentation
140 | docs/_build/
141 |
142 | # PyBuilder
143 | target/
144 |
145 | # Jupyter Notebook
146 | .ipynb_checkpoints
147 |
148 | # IPython
149 | profile_default/
150 | ipython_config.py
151 |
152 | # pyenv
153 | .python-version
154 |
155 | # pipenv
156 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
157 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
158 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
159 | # install all needed dependencies.
160 | #Pipfile.lock
161 |
162 | # celery beat schedule file
163 | celerybeat-schedule
164 |
165 | # SageMath parsed files
166 | *.sage.py
167 |
168 | # Environments
169 | .env
170 | .venv
171 | env/
172 | venv/
173 | ENV/
174 | env.bak/
175 | venv.bak/
176 |
177 | # Spyder project settings
178 | .spyderproject
179 | .spyproject
180 |
181 | # Rope project settings
182 | .ropeproject
183 |
184 | # mkdocs documentation
185 | /site
186 |
187 | # mypy
188 | .mypy_cache/
189 | .dmypy.json
190 | dmypy.json
191 |
192 | # Pyre type checker
193 | .pyre/
194 |
195 | tmp/
--------------------------------------------------------------------------------
/.idea/deployment.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/ssc.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Chinese Text Sentiment Classification
2 |
3 | A Chinese text sentiment classification network implemented in PyTorch. The code is fairly simple yet reasonably feature-rich, and it includes baselines for several kinds of models.
4 |
5 | ## Requirements
6 |
7 | * python == 3.6
8 | * torch == 1.1.0
9 | * Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
10 | * NVIDIA TITAN Xp
11 |
12 | See `requirements.txt` for the rest.
13 |
14 | ## Usage
15 |
16 | First run preprocessing: `./run_preprocess_word2vec.sh` or `./run_preprocess_elmo.sh 3` (the 3 is the GPU id).
17 |
18 | Then run `python3 main.py --config_path config_cnn.json`.
19 |
20 | ## Preprocessing
21 |
22 | Preprocessing converts every word of the given text into a word vector from a pre-trained model and stores the result in a file. I tried these two kinds of embeddings:
23 |
24 | * the pre-trained Chinese ELMo model, 1024d (https://github.com/HIT-SCIR/ELMoForManyLangs)
25 | * Chinese-Word-Vectors, 300d (https://github.com/Embedding/Chinese-Word-Vectors)
26 |
27 | Please download the corresponding model files into `data/word2vec/` or `data/zhs.model` yourself.
28 |
29 | See `preprocess.py` for the details; to use your own dataset, just modify that file.
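For the ELMo route, a minimal sketch of calling the bundled frontend (the `Embedder` class exported by `elmoformanylangs`); the sample sentence is made up, and `preprocess.py` remains the authoritative pipeline:

```python
from elmoformanylangs import Embedder

# Load the pre-trained Chinese ELMo model downloaded into data/zhs.model.
e = Embedder('data/zhs.model')

# A list of tokenized sentences in, a list of [sentence_len, 1024] arrays out.
vectors = e.sents2elmo([['今天', '天气', '真', '好']])
```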
30 |
31 | ## Implemented Models
32 |
33 | ### MLP (2 layer)
34 |
35 | Linear + ReLU + Dropout + Linear + Softmax
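A minimal PyTorch sketch of this stack, with dimensions from `config_mlp.json` (flattening the `max_length × embedding_size` input into one vector is my assumption; the real implementation lives in `main.py`):

```python
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, embedding_size=300, max_length=512,
                 hidden_size=512, num_labels=8, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_size * max_length, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_labels),
            # With the cross_entropy loss option the trailing Softmax is usually
            # dropped, since nn.CrossEntropyLoss applies log-softmax itself.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):             # x: [batch, max_length, embedding_size]
        return self.net(x.flatten(1))
```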
36 |
37 | ### CNN (1 layer) + MLP (2 layer)
38 |
39 | Conv1d + ReLU + Dropout + MaxPool1d + Linear + ReLU + Dropout + Linear + Softmax
40 |
41 | See this paper: [https://www.aclweb.org/anthology/D14-1181](https://www.aclweb.org/anthology/D14-1181)
42 |
43 | [1] Kim, Y. (2014). Convolutional Neural Networks for Sentence Classification. Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP 2014), 1746–1751.
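Under the same assumptions, a sketch of the convolutional variant with the sizes from `config_cnn.json` (again illustrative, not the literal `main.py` code):

```python
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, embedding_size=300, max_length=512, conv_size=512,
                 kernel_size=3, hidden_size=512, num_labels=8, dropout=0.9):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(embedding_size, conv_size, kernel_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )
        pooled_len = (max_length - kernel_size + 1) // 2  # length after conv + pool
        self.fc = nn.Sequential(
            nn.Linear(conv_size * pooled_len, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_labels),           # Softmax as in the MLP
        )

    def forward(self, x):                 # x: [batch, max_length, embedding_size]
        h = self.conv(x.transpose(1, 2))  # Conv1d expects [batch, channels, length]
        return self.fc(h.flatten(1))
```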
44 |
45 | ### RNN (1 layer) + Self-Attention + MLP (2 layer)
46 |
47 | RNN (GRU or LSTM or bi-GRU or bi-LSTM) + Self-Attention + Linear + ReLU + Dropout + Linear + Softmax
48 |
49 | For Self-Attention, see this paper: [https://arxiv.org/pdf/1703.03130.pdf](https://arxiv.org/pdf/1703.03130.pdf)
50 |
51 | [2] Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A structured self-attentive sentence embedding. In Proceedings of International Conference on Learning Representations.
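A sketch of that structured self-attention, with `param_da = 350` and `param_r = 30` from `config_rnn.json` (`rnn_dim = 512` is the bi-LSTM output size, 2 x `rnn_hidden_size`); the penalization term discussed later is `||AA^T - I||_F^2`:

```python
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, rnn_dim=512, da=350, r=30):
        super().__init__()
        self.W_s1 = nn.Linear(rnn_dim, da, bias=False)
        self.W_s2 = nn.Linear(da, r, bias=False)

    def forward(self, H):                  # H: [batch, seq_len, rnn_dim]
        # annotation matrix A = softmax(W_s2 tanh(W_s1 H^T)), softmax over seq_len
        A = torch.softmax(self.W_s2(torch.tanh(self.W_s1(H))), dim=1)
        A = A.transpose(1, 2)              # [batch, r, seq_len]
        M = A @ H                          # sentence embedding [batch, r, rnn_dim]
        I = torch.eye(A.size(1), device=A.device)
        penalty = ((A @ A.transpose(1, 2)) - I).pow(2).sum()  # ||AA^T - I||_F^2
        return M, penalty
```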
52 |
53 | ## Explanation of Selected Parameters
54 |
55 | * `seed`: `20000125` (ensures reproducible results)
56 | * `gpu`: `false` (use the CPU), `true` (use an NVIDIA GPU; recommended)
57 | * `output_path`: running a model writes its log file, TensorBoard files, and config file into this directory
58 | * `loss`: `l1` (L1Loss), `mse` (MSELoss), `cross_entropy` (CrossEntropyLoss; recommended)
59 | * `optimizer`: `sgd`, `adagrad` (Adagrad has built-in L2 regularization; recommended)
60 | * `embedding_size`: `1024` (ELMo), `300` (Chinese-Word-Vectors; smaller, recommended)
61 | * `type`: `mlp`, `cnn`, `rnn`
62 |
63 | For details see `config_mlp.json`, `config_cnn.json`, and `config_rnn.json`, as well as the sketch below.
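A hypothetical mapping from those config strings to torch classes (`main.py` is the source of truth; the `nn.Linear` here is just a stand-in model):

```python
import json
import torch.nn as nn
import torch.optim as optim

config = json.load(open('config_cnn.json'))
model = nn.Linear(300, 8)  # stand-in for the real network

LOSSES = {'l1': nn.L1Loss, 'mse': nn.MSELoss, 'cross_entropy': nn.CrossEntropyLoss}
criterion = LOSSES[config['cnn']['loss']]()  # -> nn.CrossEntropyLoss()

# Adagrad accepts lr_decay and weight_decay directly, matching the config keys.
optimizer = optim.Adagrad(model.parameters(), lr=config['lr'],
                          lr_decay=config['lr_decay'],
                          weight_decay=config['weight_decay'])
```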
64 |
65 | ## Dataset
66 |
67 | The dataset is from assignment 3 of THU's Introduction to Artificial Intelligence course, so it is not convenient for me to publish the data or a full description here. In short: Chinese news articles, classified into 8 emotion categories.
68 |
69 | A few words about this dataset: it is rather small, the crowd-sourced labels from netizens are not very accurate, and the training and test distributions differ (the training set is 2,342 news articles published in January and February 2012, the test set 2,228 articles from March and April 2012), so some models may fall short of expectations.
70 |
71 | I take the last 1/10 of the provided training data as a dev set, evaluate on it after every training epoch, and use the model from the epoch with the highest dev accuracy for the final test.
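Sketched as code, that selection loop looks roughly like this (`train_one_epoch` and `evaluate` are hypothetical callables, not the actual `main.py` functions):

```python
import copy

def select_best_epoch(model, train_samples, num_epochs, train_one_epoch, evaluate):
    split = len(train_samples) // 10                  # last 1/10 -> dev set
    train_part, dev_part = train_samples[:-split], train_samples[-split:]
    best_acc, best_state = 0.0, None
    for _ in range(num_epochs):
        train_one_epoch(model, train_part)
        acc = evaluate(model, dev_part)
        if acc > best_acc:                            # keep best dev-accuracy epoch
            best_acc, best_state = acc, copy.deepcopy(model.state_dict())
    return best_state                                 # used for the final test
```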
72 |
73 | ## Results
74 |
75 | Best results first (300 epochs of training in total; the table below reports test-set performance of the epoch with the highest dev accuracy, and "Total time" is the time to finish all 300 epochs):
76 |
77 | | Model | Accuracy (%) | F1 (%) | CORR | Total time | Config |
78 | | :------: | :---------: | :---------: | :--: | :--------------: | :--------------------------------------------------------: |
79 | | MLP | 59.4 | 21.5 | 0.28 | 7m44s | [save/mlp_1/config.json](./save/mlp_1/config.json) |
80 | | CNN | 62.4 | 30.2 | 0.41 | 9m56s | [save/cnn_5/config.json](./save/cnn_5/config.json) |
81 | | bi-LSTM | 58.1 | 30.8 | 0.27 | 37m47s | [save/bi-lstm_1/config.json](./save/bi-lstm_1/config.json) |
82 | | bi-GRU | 57.3 | 26.3 | 0.31 | 34m47s | [save/bi-gru_1/config.json](./save/bi-gru_1/config.json) |
83 | | LSTM | 55.56 | 25.3 | 0.26 | 21m44s | [save/lstm_1/config.json](./save/lstm_1/config.json) |
84 | | GRU | 51.3 | 25.3 | 0.26 | 20m41s | [save/gru_1/config.json](./save/gru_1/config.json) |
85 | | MLP-ELMo | 58.1 | 21.9 | 0.21 | 18m26s | [save/mlp_3/config.json](./save/mlp_3/config.json) |
86 | | CNN-ELMo | 59.8 | 30.1 | 0.34 | 14m23s | [save/cnn_6/config.json](./save/cnn_6/config.json) |
87 |
88 | The figure below shows dev Accuracy.
89 |
90 | ![dev Accuracy](doc/dev_Accuracy.svg)
91 |
92 | The figure below shows dev F1 (macro).
93 |
94 | ![dev F1 macro](doc/dev_F1_macro.svg)
95 |
96 | The figure below shows dev CORR.
97 |
98 | ![dev CORR](doc/dev_CORR.svg)
99 |
100 | The figure below shows train Accuracy.
101 |
102 | ![train Accuracy](doc/train_Accuracy.svg)
103 |
104 | To see more charts, run the following command:
105 |
106 | ```
107 | $ tensorboard --logdir MLP:save/mlp_1/runs/,\
108 | CNN:save/cnn_5/runs/,\
109 | bi-LSTM:save/bi-lstm_1/runs/,\
110 | bi-GRU:save/bi-gru_1/runs/,\
111 | LSTM:save/lstm_1/runs/,\
112 | GRU:save/gru_1/runs/,\
113 | MLP-ELMo:save/mlp_3/runs/,\
114 | CNN-ELMo:save/cnn_6/runs/
115 | ```
116 |
117 | ## Comparing Models and Hyperparameters
118 |
119 | From the figures above: MLP converges fastest and starts overfitting earliest, with mediocre test results; the RNN family converges fairly fast (except bi-GRU) but does not test very well; CNN converges more slowly, is remarkably stable, and tests best.
120 |
121 | MLP, supposedly an entry-level model, performs surprisingly well here and even beats the RNN family handily, which is rather puzzling; the only explanations I can think of are that my hyperparameters are off or that the dataset is problematic. For CNN I also tried 2 convolutional layers, but it did worse than 1 layer, so I dropped that idea.
122 |
123 | The optimizer is almost always `adagrad`; after tuning, `sgd` is basically out of the picture (it converges far too slowly).
124 |
125 | ```json
126 | "optimizer": "adagrad",
127 | "lr": 0.01,
128 | "lr_decay": 0,
129 | "weight_decay": 0.0001,
130 | ```
131 |
132 | The `ELMo` word vectors turned out not to work well in my tests. My guess is that the pre-trained model's training data differs too much in distribution from this dataset, and that not attaching the pre-trained model to the front of my network for fine-tuning made things worse. Comparing the curves of [save/cnn_5](./save/cnn_5/config.json) (Chinese-Word-Vectors, orange) and [save/cnn_6](./save/cnn_6/config.json) (ELMo, blue) shows that ELMo really does not do well on this dataset.
133 |
134 | | dev Accuracy | train Accuracy |
135 | | :------------------------: | :--------------------------: |
136 | | ![embed dev](doc/embed_dev.svg) | ![embed train](doc/embed_train.svg) |
137 |
138 | `Dropout` guards against overfitting. Comparing [save/cnn_1](./save/cnn_1/config.json) (dropout = 0.5, orange) with [save/cnn_4](./save/cnn_4/config.json) (dropout = 0.9, blue) shows clearly that a larger dropout converges more slowly but reaches higher accuracy and resists overfitting better (stronger decoupling of neurons, after all).
139 |
140 | | dev Accuracy | train Accuracy |
141 | | :------------------------: | :--------------------------: |
142 | | ![dropout dev](doc/dropout_dev.svg) | ![dropout train](doc/dropout_train.svg) |
143 |
144 | `Batch Normalization` also helps against overfitting. Comparing the CNN with neither BN nor Dropout in the conv layer, [save/cnn_8](./save/cnn_8/config.json) (blue), against the CNN with BN but no Dropout in the conv layer, [save/cnn_7](./save/cnn_7/config.json) (Conv1d + BatchNorm1d + ReLU + MaxPool1d + Linear + ReLU + Dropout + Linear + Softmax, orange), makes this very clear.
145 |
146 | | dev Accuracy | train Accuracy |
147 | | :------------------------: | :--------------------------: |
148 | | ![bn dev](doc/bn_dev.svg) | ![bn train](doc/bn_train.svg) |
149 |
150 | `Self-Attention` adds a penalization term to the loss. I compared different coefficients for it in the total loss: [save/bi-lstm_1](./save/bi-lstm_1/config.json) (coefficient = 1, orange) versus [save/bi-lstm_2](./save/bi-lstm_2/config.json) (coefficient = 0.3, blue). The coefficient affects convergence speed: a higher value converges much faster, though accuracy drops a bit (probably because I did not tune it carefully).
151 |
152 | | dev Accuracy | train Accuracy |
153 | | :------------------------: | :--------------------------: |
154 | | ![self-attention dev](doc/self-attention_dev.svg) | ![self-attention train](doc/self-attention_train.svg) |
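Schematically, the coefficient (`p_coefficient` in `config_rnn.json`) enters the total loss like this (a sketch, not the literal `main.py` code):

```python
def total_loss(criterion, logits, labels, penalty, p_coefficient=1.0):
    # classification loss plus the weighted self-attention penalization term
    return criterion(logits, labels) + p_coefficient * penalty
```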
155 |
156 | There are simply too many hyperparameters; I really did not want to tune any further.
157 |
158 | ## Discussion Questions
159 |
160 | ##### 1) When is it best to stop training? Briefly describe your implementation, and analyze the pros and cons of a fixed number of iterations versus validation-based adjustment.
161 |
162 | There are two approaches; I used the second (a sketch of rule 1 appears below):
163 |
164 | 1. Set a threshold θ (and an accumulation count λ); stop when the historical best accuracy exceeds the current accuracy by more than θ (or once that has accumulated more than λ times).
165 | 2. Estimate where overfitting begins (method 1 can also be used for the estimate) and fix the number of iterations accordingly.
166 |
167 | A fixed iteration count has the advantage that, on a well-equipped machine (or with a simple model), you can run more epochs and then compare different models' data (curves) very intuitively to judge which is better; the drawback is that it takes longer.
168 |
169 | Validation-based adjustment is time-efficient when the machine is weak (or the model complex), a deadline looms, or you want to choose a model quickly; the drawback is that it makes intuitive comparisons between models harder.
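A sketch of stopping rule 1 (the default θ and λ values here are illustrative, not tuned):

```python
def should_stop(dev_acc, theta=0.01, lam=10):
    """dev_acc: per-epoch dev accuracies so far. Stop once accuracy has stayed
    more than theta below the historical best for lam consecutive epochs."""
    best = max(dev_acc)
    stale = 0
    for acc in reversed(dev_acc):
        if best - acc > theta:
            stale += 1
        else:
            break
    return stale >= lam
```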
170 |
171 | ##### 2) How are the parameters initialized in your experiments? Which methods suit which situations? (Existing methods include zero-mean initialization, Gaussian initialization, orthogonal initialization, etc.)
172 |
173 | In my code, except for the two Self-Attention matrices, which use the standard normal distribution (I also tried U(-sqrt(1/u), sqrt(1/u)) and U(-sqrt(1/da), sqrt(1/da)), with similar results), all parameters use PyTorch's default initialization.
174 |
175 | Zero-mean uniform or normal distributions are generally used to initialize the weight matrices and biases of linear and convolutional layers; depending on the scaling factor, they come in Xavier uniform/normal and Kaiming uniform/normal variants. Xavier suits tanh activations and is a poor fit for ReLU, whereas Kaiming works very well wherever ReLU is used. PyTorch initializes linear layers with the uniform distribution U(-sqrt(1/in), sqrt(1/in)) (a Kaiming distribution with a = sqrt(5), where "in" is the second dimension of the weight matrix).
176 |
177 | Orthogonal initialization suits RNN parameters and helps counter the vanishing and exploding gradient problems of recurrent networks.
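The corresponding PyTorch initializers, for reference (a sketch; as stated above, my code mostly keeps the defaults):

```python
import math
import torch.nn as nn

layer = nn.Linear(512, 8)

# Xavier (Glorot): suited to tanh/sigmoid activations.
nn.init.xavier_uniform_(layer.weight)

# Kaiming (He): suited to ReLU activations.
nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')

# PyTorch's default for nn.Linear weights: Kaiming uniform with a = sqrt(5),
# equivalent to U(-sqrt(1/in_features), sqrt(1/in_features)).
nn.init.kaiming_uniform_(layer.weight, a=math.sqrt(5))

# Orthogonal: common for recurrent weight matrices, against vanishing and
# exploding gradients through time.
rnn = nn.LSTM(input_size=300, hidden_size=256)
for name, param in rnn.named_parameters():
    if 'weight_hh' in name:
        nn.init.orthogonal_(param)
```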
178 |
179 | ##### 3) Overfitting is a common problem in deep learning. What methods can keep the training process from overfitting?
180 |
181 | Batch Normalization:
182 |
183 | BN improves generalization, effectively curbs overfitting, and reduces the influence of parameter initialization. The reason is that BN pulls the data distribution toward a standard normal, reducing internal covariate shift; this limits exploding gradients to some extent, and since the central region still has large gradient values, it also mitigates vanishing gradients. With a badly chosen learning rate, though, the model may converge very slowly.
184 |
185 | Dropout:
186 |
187 | Dropout is the all-purpose tool for generalization: it suppresses some neurons' outputs and scales up the rest, reducing co-adaptation between neurons. Just add it.
188 |
189 | Note, however, that adding Dropout on top of BN can work out quite badly.
190 |
191 | ##### 4) Compare the strengths and weaknesses of CNNs, RNNs, and fully connected networks (MLPs).
192 |
193 | | | MLP | CNN | RNN |
194 | | :--: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
195 | | Pros | Simple structure, easy to understand, interpretable; learns global information; can be matrix-accelerated | Simple and interpretable; learns local information, a big advantage when local features are salient; matrix-accelerated, extremely fast | Simple and interpretable; variable-length input; sequential, with memory |
196 | | Cons | Fixed input size; weak capacity with few parameters, and too memory-hungry with many | Fixed input size; limited capacity when shallow; cannot capture global information; usually needs an MLP on top | Prone to vanishing and exploding gradients; the overall computation is not one matrix product (apart from the matrix ops inside each cell), so it cannot be matrix-accelerated and is slower; usually also needs an MLP on top |
197 |
198 | ## Reflections
199 |
200 | I studied this material in last semester's Artificial Neural Networks course, where the assignments had me implement MLP, CNN, and RNN at the source level, so this assignment was fairly easy (especially since calling the libraries suffices). Still, it was my first time building these models in PyTorch from scratch, so I learned a lot about PyTorch. I also explained the concepts and principles of MLP, CNN, and RNN to my roommates and other classmates, which consolidated my own knowledge. Finally, thanks to the TAs for answering questions in the WeChat group.
201 |
202 | One small suggestion: overall, an assignment combining the three basic neural network models is a good idea, but the one pity is that the dataset is not that great; it led some students (myself included) to doubt their own models and hyperparameters, to view artificial intelligence as pure alchemy, and even to grow weary of the field. I hope this can be improved next year.
203 |
--------------------------------------------------------------------------------
/config_cnn.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_path": "data/word2vec_temp",
3 | "output_path": "save/cnn_5",
4 | "gpu": true,
5 | "seed": 20000125,
6 | "display_per_batch": 5,
7 | "optimizer": "adagrad",
8 | "lr": 0.01,
9 | "lr_decay": 0,
10 | "weight_decay": 0.0001,
11 | "momentum": 0.985,
12 | "num_epochs": 300,
13 | "batch_size": 64,
14 | "num_labels": 8,
15 | "embedding_size": 300,
16 | "type": "cnn",
17 | "cnn": {
18 | "max_length": 512,
19 | "conv_1": {
20 | "size": 512,
21 | "kernel_size": 3,
22 | "dropout": 0.9
23 | },
24 | "max_pool_1": {
25 | "kernel_size": 2,
26 | "stride": 2
27 | },
28 | "fc": {
29 | "hidden_size": 512,
30 | "dropout": 0.9
31 | },
32 | "loss": "cross_entropy"
33 | }
34 | }
--------------------------------------------------------------------------------
/config_mlp.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_path": "data/word2vec_temp",
3 | "output_path": "save/mlp_1",
4 | "gpu": true,
5 | "seed": 20000125,
6 | "display_per_batch": 5,
7 | "optimizer": "adagrad",
8 | "lr": 0.01,
9 | "lr_decay": 0,
10 | "weight_decay": 0.0001,
11 | "momentum": 0.985,
12 | "num_epochs": 300,
13 | "batch_size": 64,
14 | "num_labels": 8,
15 | "embedding_size": 300,
16 | "type": "mlp",
17 | "mlp": {
18 | "max_length": 512,
19 | "dropout": 0.5,
20 | "hidden_size": 512,
21 | "loss": "cross_entropy"
22 | }
23 | }
--------------------------------------------------------------------------------
/config_rnn.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_path": "data/word2vec_temp",
3 | "output_path": "save/bi-lstm_1",
4 | "gpu": true,
5 | "seed": 20000125,
6 | "display_per_batch": 5,
7 | "optimizer": "adagrad",
8 | "lr": 0.01,
9 | "lr_decay": 0,
10 | "weight_decay": 0.0001,
11 | "momentum": 0.985,
12 | "num_epochs": 300,
13 | "batch_size": 64,
14 | "num_labels": 8,
15 | "embedding_size": 300,
16 | "type": "rnn",
17 | "rnn": {
18 | "type": "lstm",
19 | "bidirectional": true,
20 | "rnn_hidden_size": 256,
21 | "mlp_hidden_size": 512,
22 | "dropout": 0.5,
23 | "p_coefficient": 1,
24 | "num_layers": 1,
25 | "param_da": 350,
26 | "param_r": 30,
27 | "loss": "cross_entropy"
28 | }
29 | }
--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
1 | zhs.model
2 | word2vec
3 | word2vec_temp
4 | elmo_temp
5 | sinanews.test
6 | sinanews.train
--------------------------------------------------------------------------------
/data/sinanews.demo:
--------------------------------------------------------------------------------
1 | 201201010000_18871 Total:2 感动:0 同情:0 无聊:2 愤怒:0 搞笑:0 难过:0 新奇:0 温馨:0 女生 发表 轻生 日志 获 网友 连夜 搜 救 本报 讯 永别 朋友 永别 世界 前天 晚上 清华 毕业 女生 小陈 网 主页 写下 日志 后 失踪 日志 中 表示 欲 寻短见 得知 消息 校友 途径 连夜 发起 搜 救 昨天 上午 记者 了解 民警 已经 找到 小陈 已经 危险 了解 小 陈 本科 研究生 分别 就读 中国 传媒 大学 清华大学 今年 毕业 后 进入 国家机关 下属 媒体 工作 前天 晚上 7点 44分 小陈 网 主页 发表 篇 题为 永别 朋友 日志 后 失踪 篇 日志 很快 引起 关注 当天 晚上 10点 关注 清华大学 微 博 协会 会长 刘若晴 微 博 发布 紧急 寻 人 信息 呼吁 网友 路 留 心眼 儿 救人 命 随后 网友 展开 搜 救 昨天 上午 条 寻找 小 陈 微 博 已 转发 8000 次 小陈 同学 寻 遍 宣武门 长椿 街 西单 传媒 大学 清华大学 不断 网上 交流 搜 救 信息 汇总 清华大学 微 博 协会 微 博 从前 晚 10点 昨天 凌晨 3点 小时 新 搜 救 信息 上面 发布 昨天 上午 北京市 公安局 官方 微 博 发布消息 称 北京 警方 积极 努力 开展 相关 工作 随后 市 公安局 勤务 指挥部 传来 好 消息 警方 连夜 工作 民警 丰台 六里桥 附近 找到 同学 同学 平安无事 现场 自杀 情况 危险 民警 正在 进一步 调查 中
2 | 201201010139_29492 Total:1 感动:0 同情:0 无聊:0 愤怒:0 搞笑:0 难过:0 新奇:0 温馨:1 客机 起飞 小时 后 机舱 冒烟 返航 昨天 上午 8点 25分 左右 首都 国际 机场 起飞 小时 架 美 航 AA186 次 航班 机舱 冒烟 折返 降落 事后 返航 乘客 安排 附近 酒店 下午 5点 乘客 接 航空公司 答复 改 今天 上午 10点 航班 再次 起飞 希望 次 顺利 到达 乘客 表示 机舱 冒烟 客机 返航 飞机 飞 飞 冒烟 北京 回来 昨天 上午 位 网友 微 博 透露 乘坐 美国 航空公司 AA186 次 航班 北京 芝加哥 飞机 起飞 后 客舱 突然 烟 最终 飞机 返航 元旦 北京 度过 真是 悲剧 飞机 起飞 紧急 返航 此刻 T3 位 网友 全 熊 小 猫 微 博 直播 次 遭遇 记者 联系 时 已 航空公司 安排 附近 酒店 姓 黄 美国 上学 留学生 次 芝加哥 中途 换乘 黄 女士 告诉 记者 事发 时 上午 8点 25分 飞机 已经 起飞 小时 突然 机舱 飘 股 浓浓的 异味 儿 有点 烟 味儿 广播 中 机长 声音 证实 一点 说 飞机 临时 出现 机械 故障 导致 机舱 冒烟 需要 返航 降落 记者 查询 发现 美 航 AA186 次 航班 首都 机场 正点 起飞 时间 7点 55分 留学生 临时 翻译 昨天 上午 8点 55分 AA186 次 航班 起飞 小时 后 重新 降落 首都 国际 机场 整个 过程 无 人 受伤 黄 女士 称 飞机 落地 之后 机 舱门 迟迟 未 打开 部分 乘客 着急 不断 身边 乘务员 询问 空姐 懂 中文 机舱 气氛 一度 些 混乱 懂 英文 乘客 帮忙 翻译 一下 机舱 紧张 气氛 平息 下来 黄 女士 称 帮忙 翻译 意思 说 飞机 已经 安全 落地 请 乘客 耐心 等待 地面 空勤 人员 正在 舷梯 接 一会儿 下 飞机 微 博 黄 女士 次 特殊 翻译 经历 晒 机上 空姐 讲 中文 找 翻译 体验 次 小 喇叭 广播 手 感觉 乘客 今天 上午 再 起程 事后 航空公司 AA186 全部 乘客 安排 酒店 架飞机 波音 777 满员 乘客 估计 300 人 外籍 乘客 希尔顿 酒店 国内 乘客 丽 酒店 黄 女士 透露 酒店 呆 长时间 航空公司 却 一直 无 人 出面 解释 答复 下 步 赔偿 问题 无从 谈 昨天 下午 记者 联系 丽 酒店 负责 协调 返航 乘客 负责人 称 受 美 航 公司 委托 负责 协调 乘客 飞机 再次 起飞 知晓 首都 机场 负责 乘客 改 签 工作 美 航 工作人员 透露 目前 尚未 接 上级 批 乘客 改 签 起飞 具体 通知 飞机 返航 具体 原因 赔偿 问题 记者 采访 美 航 北京 办事处 媒体 传媒 部 昨天 下午 记者 连续 拨打 办事处 电话 十几 次 无 人 接 听 首都 机场 美 航 公司 工作人员 拒绝 记者 转达 采访 意图 昨天 下午 5点 黄 女士 致电 记者 称 航空公司 通知 乘坐 今天 上午 10点 航班 再次 起飞 一大早 机场 等候 工作人员 带领 取 行李 希望 次 顺利 到达 黄 女士 提前 送 份 新年 祝福 晨报 96101 热线 新闻记者 岳 亦 雷 线索 祝 先生 ■ 新闻 链接 次 航班 曾 冒烟 返航 记录 2010年 12月 16日 上午 10时 许 美国 航空公司 北京 飞往 芝加哥 AA186 航班 起飞 小时 后 客舱 突然 烟 机长 决定 返航 首都 机场 解释 称 返航 空调系统 故障 机场 运行 未 受 影响 美国 航空公司 北京 代表处 表示 故障 未 造成 人员 伤亡
3 | 201201010216_13984 Total:38 感动:3 同情:3 无聊:0 愤怒:16 搞笑:7 难过:8 新奇:1 温馨:0 男子 婚 房 房 贷 抢劫 彩票 店 本报 记者 赵云龙 2011年 12月 28日 晚 10时 17分 许 水 屯 路南 段 福彩 投 注 站 女 老板 林丽 化名 整理 钱 款 欲 关门 回家 低头 收拾时 男子 突然 快步 走 进 店 里 手 持 红色 U 形 锁 隔 柜台 头部 猛 砸 过去 砸 两 下 后 男子 绕 进 柜台 内侧 林丽连 砸 下 柜台 两 提包 抢走 砸 完 跑 店 门 店 里 一个 椅子 追 四五 米 骑 摩托车 跑 林丽 告诉 记者 砸 后 头部 留下 处 伤口 缝 20 多针 抢走 两 包 里 13000 余 元 现金 一些 银行 存折 部分 刮 刮 乐 彩票 案发 后 调取 监控 录像 警方 很快 掌握 嫌疑人 体貌 特征 本报 2011年 12月 30日 刊发 深夜 抢 彩票 站 狂 砸 女 老板 文 显示 嫌疑人 清晰 面容 监控 截 图 刊登 希望 读者 提供 相关 破案 线索 警 媒 联动 下 2011年 12月 30日 下午 2时 许 热心 市民 警方 提供 条 重要 线索 嫌疑人 居住 梁 府 庄 一带 民警 梁 府 庄 犯罪 嫌疑人 向某 抓获 缴获 抢劫 13000 余 元 现金 银行 存折 刮 刮 乐 彩票 物 今年 30 岁 已 失业 许久 2年 前 结婚 时 曾 妻子 老家 泰安 买 套 婚 房 目前 尚 万 元 房 贷 春节 向某 身上 钱 房 贷 近期 一直 为难 2011年 12月 28日 晚 6时 向某 朋友 东关 大街 酒店 吃饭 席间 喝 五六 瓶 啤酒 消愁 饭局 结束 后 向某 回家 途中 想 买 彩票 碰碰 运气 当晚 10时 09分 走 进 水 屯 路南 段 福彩 投 注 站 告知 开奖 时间 已 结束 该店 关门 无意间 向某 看到 该店 女 老板 林丽 正在 清点 钱 款 萌生 抢劫 念头 盯 林丽 直到 数 钱 结束 观察 店 情况 随后 持 车 锁 行 抢
4 | 201201010226_10658 Total:735 感动:27 同情:45 无聊:105 愤怒:44 搞笑:392 难过:56 新奇:60 温馨:6 99 岁 丈夫 发现 妻子 60年 前 出轨 欲 离婚 新华社 电 意大利 年 百 岁 夫妇 正 闹 离婚 原因 丈夫 发现 妻子 60年 前 相 好 律师 拒绝 公开 两 人 全 名 丈夫 名为 安东尼奥 现年 99 岁 妻子 罗莎 96 岁 安东尼奥 世纪 30年代 那不勒斯 服役 结识 罗莎 两 人 结婚 77年 孩子 12 孙 辈 重孙 今年 圣诞节 前 数 天 安东尼奥 翻 五斗橱 发现 一些 情书 显示 妻子 世纪 40年代 段 秘密 婚外恋 妻子 承认 写 信 请求 丈夫 原谅 婚外恋 过去 60 年 丈夫 立即 要求 离婚
5 | 201201010226_28346 Total:25 感动:1 同情:0 无聊:0 愤怒:18 搞笑:4 难过:2 新奇:0 温馨:0 商 户 举报 占 道 经营 遭 威胁 代 缴 罚款 本报 讯 记者 张太凌 郭公庄 路 商贩 冯 明 举报 同业 占用 机动车 道 摆摊 设点 想 城管 查处 后 挤出 块 好 路段 摆 摊 未 料 举报 电话 认出 遭到 举报者 围堵 家门 威胁 举报者 支付 罚款 扣 车 款 6500 余 元 事后 冯明 怀疑 电话 城管 泄露 要求 城管 赔偿 代 缴 罚款 损失 昨天 花乡 城管 分队 称 冯明 怀疑 纯属 臆断 丰台区 城管 大队 表示 城管 必要 泄露 举报者 信息 冯明 回忆 称 去年 12月 27日 中午 11时 左右 丰台区 花乡 城管 分队 电话 举报 郭公庄 路上 许多 鱼贩 占用 机动车 道 摆摊 设点 养鱼 水 洒 路面 结冰 影响 行走 冯 明说 约 20分钟 后 城管 执法 车 赶到 驱散 路边 商贩 过程 中 看到 城管 人员 商贩 一些 类似 举报 记录单 双方 交流 城管 执法 人员 走 后 商贩 径直 走 面前 称 已 人 认出 举报 电话 手机 号 威胁 说 城管 查抄 车 回来 找 冯 明说 冯明 称 次日 中午 12时 丰台 城管 再次 出动 查 扣 郭公庄 路 当天 出摊 10 辆 鱼贩 车 当晚 七八点 钟 鱼贩 找到 冯明 家 冯 明 邻居 刘 先生 证实 当晚 辆 面包车 载 10 余 人 堵 冯 明 家门口 车上 板 砖 钢管 堆 人 人 坏 办 冯明 房东 寇 先生 说 朋友 居中 调停 12月 29日 下午 冯明 鱼贩 城管 支付 罚款 扣 车 款 6500 余 元 后 领 回 扣 车辆 冯 明说 报警 家人 安全 息事宁人 担心 报警 使 后果 严重 车 取 回来 后 鱼 损失 700 元 昨日 中午 位 罚 鱼贩 表示 冯明 举报 避 口 谈 位 同样 罚 鱼贩 告诉 记者 听 瞎说 吃里扒外 一些 鱼贩 记者 透露 举报 挨 罚 鱼贩 确实 知道 举报者 同行 冯 明 吃里扒外 冯明 表示 经营 鱼 水果 举报 想 清理 经营 秩序 好 路段 挤出 块 地方 摆 摊 认为 城管 泄露 举报信息 导致 已 损失 前 日 冯明 前往 花乡 城管 分队 交涉 一年 挣 万 代 缴 罚款 等于 今年 干 冯明 表示 想 做出 臆断 昨日 花乡 城管 分队 工作人员 称 丰台 城管 大队 表示 冯 明 举报 后 城管 进行 执法 泄露 举报者 信息 举报 举报 泄露 电话 必要 冯明 称 城管 交涉 时 举报 记录显示 确实 举报者 北京市 翰盛 律师 事务所 律师 郭召利 认为 执法 部门 泄露 举报人 信息 带来 人身 安全 威胁 伤害 应 给予 一定 保护 相应 补偿 泄露 追 责 郭召利 称 事件 中 执法 部门 做出 罚款 决定 说明 违法 事实 存在 举报人 举报人 缴纳 罚款 说明 受到 某种 威胁 代 缴 罚款 私了 受 法律 保护 代 缴 罚款 举报人 处罚 人 追 讨
6 |
--------------------------------------------------------------------------------
/doc/.gitignore:
--------------------------------------------------------------------------------
1 | 实验三介绍PPT.pptx
2 | 实验三说明文档.pdf
3 | README.pdf
--------------------------------------------------------------------------------
/doc/col_bi-gru.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_bi-gru.png
--------------------------------------------------------------------------------
/doc/col_bi-lstm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_bi-lstm.png
--------------------------------------------------------------------------------
/doc/col_cnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_cnn.png
--------------------------------------------------------------------------------
/doc/col_gru.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_gru.png
--------------------------------------------------------------------------------
/doc/col_lstm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_lstm.png
--------------------------------------------------------------------------------
/doc/col_mlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/doc/col_mlp.png
--------------------------------------------------------------------------------
/elmoformanylangs/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from .elmo import Embedder
3 |
--------------------------------------------------------------------------------
/elmoformanylangs/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | from __future__ import unicode_literals
4 | import os
5 | import sys
6 | import codecs
7 | import argparse
8 | import logging
9 | import json
10 | import torch
11 | from .modules.embedding_layer import EmbeddingLayer
12 | from .utils import dict2namedtuple
13 | from .frontend import Model
14 | from .frontend import create_batches
15 | import numpy as np
16 | import h5py
17 |
18 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')
19 |
20 |
21 | def read_corpus(path, max_chars=None):
22 | """
23 |   read a raw text file. Input format: one sentence per line,
24 |   tokens separated by '\t'.
25 |
26 |   :param path:
27 |   :param max_chars: int, the maximum number of characters in a word; this
28 |     parameter is used when the model is configured with a CNN word encoder.
29 | :return:
30 | """
31 | dataset = []
32 | textset = []
33 | with codecs.open(path, 'r', encoding='utf-8') as fin:
34 | for line in fin.read().strip().split('\n'):
35 |       data = ['<bos>']
36 | text = []
37 | for token in line.split('\t'):
38 | text.append(token)
39 | if max_chars is not None and len(token) + 2 > max_chars:
40 | token = token[:max_chars - 2]
41 | data.append(token)
42 |       data.append('<eos>')
43 | dataset.append(data)
44 | textset.append(text)
45 | return dataset, textset
46 |
47 |
48 | def read_conll_corpus(path, max_chars=None):
49 | """
50 | read text in CoNLL-U format.
51 |
52 | :param path:
53 | :param max_chars:
54 | :return:
55 | """
56 | dataset = []
57 | textset = []
58 | with codecs.open(path, 'r', encoding='utf-8') as fin:
59 | for payload in fin.read().strip().split('\n\n'):
60 |       data = ['<bos>']
61 | text = []
62 | lines = payload.splitlines()
63 | body = [line for line in lines if not line.startswith('#')]
64 | for line in body:
65 | fields = line.split('\t')
66 | num, token = fields[0], fields[1]
67 | if '-' in num or '.' in num:
68 | continue
69 | text.append(token)
70 | if max_chars is not None and len(token) + 2 > max_chars:
71 | token = token[:max_chars - 2]
72 | data.append(token)
73 |       data.append('<eos>')
74 | dataset.append(data)
75 | textset.append(text)
76 | return dataset, textset
77 |
78 |
79 | def read_conll_char_corpus(path, max_chars=None):
80 | """
81 |
82 | :param path:
83 | :param max_chars:
84 | :return:
85 | """
86 | dataset = []
87 | textset = []
88 | with codecs.open(path, 'r', encoding='utf-8') as fin:
89 | for payload in fin.read().strip().split('\n\n'):
90 |       data = ['<bos>']
91 | text = []
92 | lines = payload.splitlines()
93 | body = [line for line in lines if not line.startswith('#')]
94 | for line in body:
95 | fields = line.split('\t')
96 | num, token = fields[0], fields[1]
97 | if '-' in num or '.' in num:
98 | continue
99 | for ch in token:
100 | text.append(ch)
101 | if max_chars is not None and len(ch) + 2 > max_chars:
102 | ch = ch[:max_chars - 2]
103 | data.append(ch)
104 |       data.append('<eos>')
105 | dataset.append(data)
106 | textset.append(text)
107 | return dataset, textset
108 |
109 |
110 | def read_conll_char_vi_corpus(path, max_chars=None):
111 | """
112 |
113 | :param path:
114 | :param max_chars:
115 | :return:
116 | """
117 | dataset = []
118 | textset = []
119 | with codecs.open(path, 'r', encoding='utf-8') as fin:
120 | for payload in fin.read().strip().split('\n\n'):
121 |       data = ['<bos>']
122 | text = []
123 | lines = payload.splitlines()
124 | body = [line for line in lines if not line.startswith('#')]
125 | for line in body:
126 | fields = line.split('\t')
127 | num, token = fields[0], fields[1]
128 | if '-' in num or '.' in num:
129 | continue
130 | for ch in token.split():
131 | text.append(ch)
132 | if max_chars is not None and len(ch) + 2 > max_chars:
133 | ch = ch[:max_chars - 2]
134 | data.append(ch)
135 |       data.append('<eos>')
136 | dataset.append(data)
137 | textset.append(text)
138 | return dataset, textset
139 |
140 |
141 | def test_main():
142 | # Configurations
143 | cmd = argparse.ArgumentParser('The testing components of')
144 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
145 | cmd.add_argument('--input_format', default='plain', choices=('plain', 'conll', 'conll_char', 'conll_char_vi'),
146 | help='the input format.')
147 | cmd.add_argument("--input", help="the path to the raw text file.")
148 | cmd.add_argument("--output_format", default='hdf5', help='the output format. Supported format includes (hdf5, txt).'
149 | ' Use comma to separate the format identifiers,'
150 | ' like \'--output_format=hdf5,plain\'')
151 | cmd.add_argument("--output_prefix", help='the prefix of the output file. The output file is in the format of '
152 | '..')
153 | cmd.add_argument("--output_layer", help='the target layer to output. 0 for the word encoder, 1 for the first LSTM '
154 | 'hidden layer, 2 for the second LSTM hidden layer, -1 for an average'
155 | 'of 3 layers.')
156 | cmd.add_argument("--model", required=True, help="the path to the model.")
157 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')
158 | args = cmd.parse_args(sys.argv[2:])
159 |
160 | if args.gpu >= 0:
161 | torch.cuda.set_device(args.gpu)
162 | use_cuda = args.gpu >= 0 and torch.cuda.is_available()
163 | # load the model configurations
164 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config_rnn.json'), 'r', encoding='utf-8')))
165 |
166 | with open(os.path.join(args.model, args2.config_path), 'r') as fin:
167 | config = json.load(fin)
168 |
169 | # For the model trained with character-based word encoder.
170 | if config['token_embedder']['char_dim'] > 0:
171 | char_lexicon = {}
172 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi:
173 | for line in fpi:
174 | tokens = line.strip().split('\t')
175 | if len(tokens) == 1:
176 | tokens.insert(0, '\u3000')
177 | token, i = tokens
178 | char_lexicon[token] = int(i)
179 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None)
180 | logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
181 | else:
182 | char_lexicon = None
183 | char_emb_layer = None
184 |
185 | # For the model trained with word form word encoder.
186 | if config['token_embedder']['word_dim'] > 0:
187 | word_lexicon = {}
188 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi:
189 | for line in fpi:
190 | tokens = line.strip().split('\t')
191 | if len(tokens) == 1:
192 | tokens.insert(0, '\u3000')
193 | token, i = tokens
194 | word_lexicon[token] = int(i)
195 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
196 | logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
197 | else:
198 | word_lexicon = None
199 | word_emb_layer = None
200 |
201 | # instantiate the model
202 | model = Model(config, word_emb_layer, char_emb_layer, use_cuda)
203 |
204 | if use_cuda:
205 | model.cuda()
206 |
207 | logging.info(str(model))
208 | model.load_model(args.model)
209 |
210 | # read test data according to input format
211 | read_function = read_corpus if args.input_format == 'plain' else (
212 | read_conll_corpus if args.input_format == 'conll' else (
213 | read_conll_char_corpus if args.input_format == 'conll_char' else read_conll_char_vi_corpus))
214 |
215 | if config['token_embedder']['name'].lower() == 'cnn':
216 | test, text = read_function(args.input, config['token_embedder']['max_characters_per_token'])
217 | else:
218 | test, text = read_function(args.input)
219 |
220 | # create test batches from the input data.
221 | test_w, test_c, test_lens, test_masks, test_text = create_batches(
222 | test, args.batch_size, word_lexicon, char_lexicon, config, text=text)
223 |
224 | # configure the model to evaluation mode.
225 | model.eval()
226 |
227 | sent_set = set()
228 | cnt = 0
229 |
230 | output_formats = args.output_format.split(',')
231 | output_layers = map(int, args.output_layer.split(','))
232 |
233 | handlers = {}
234 | for output_format in output_formats:
235 | if output_format not in ('hdf5', 'txt'):
236 | print('Unknown output_format: {0}'.format(output_format))
237 | continue
238 | for output_layer in output_layers:
239 | filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format)
240 | handlers[output_format, output_layer] = \
241 | h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w')
242 |
243 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text):
244 | output = model.forward(w, c, masks)
245 | for i, text in enumerate(texts):
246 | sent = '\t'.join(text)
247 | sent = sent.replace('.', '$period$')
248 | sent = sent.replace('/', '$backslash$')
249 | if sent in sent_set:
250 | continue
251 | sent_set.add(sent)
252 | if config['encoder']['name'].lower() == 'lstm':
253 | data = output[i, 1:lens[i]-1, :].data
254 | if use_cuda:
255 | data = data.cpu()
256 | data = data.numpy()
257 | elif config['encoder']['name'].lower() == 'elmo':
258 | data = output[:, i, 1:lens[i]-1, :].data
259 | if use_cuda:
260 | data = data.cpu()
261 | data = data.numpy()
262 |
263 | for (output_format, output_layer) in handlers:
264 | fout = handlers[output_format, output_layer]
265 | if output_layer == -1:
266 | payload = np.average(data, axis=0)
267 | else:
268 | payload = data[output_layer]
269 | if output_format == 'hdf5':
270 | fout.create_dataset(sent, payload.shape, dtype='float32', data=payload)
271 | else:
272 | for word, row in zip(text, payload):
273 | print('{0}\t{1}'.format(word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout)
274 | print('', file=fout)
275 |
276 | cnt += 1
277 | if cnt % 1000 == 0:
278 | logging.info('Finished {0} sentences.'.format(cnt))
279 | for _, handler in handlers.items():
280 | handler.close()
281 |
282 |
283 | if __name__ == "__main__":
284 | if len(sys.argv) > 1 and sys.argv[1] == 'test':
285 | test_main()
286 | else:
287 | print('Usage: {0} [test] [options]'.format(sys.argv[0]), file=sys.stderr)
288 |
--------------------------------------------------------------------------------
/elmoformanylangs/biLM.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | from __future__ import unicode_literals
4 | import os
5 | import errno
6 | import sys
7 | import codecs
8 | import argparse
9 | import time
10 | import random
11 | import logging
12 | import json
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | import torch.optim as optim
17 | from torch.autograd import Variable
18 | from .modules.elmo import ElmobiLm
19 | from .modules.lstm import LstmbiLm
20 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder
21 | from .modules.embedding_layer import EmbeddingLayer
22 | from .modules.classify_layer import SoftmaxLayer, CNNSoftmaxLayer, SampledSoftmaxLayer
23 | from .dataloader import load_embedding
24 | from .utils import dict2namedtuple
25 | from collections import Counter
26 | import numpy as np
27 |
28 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')
29 |
30 |
31 | def divide(data, valid_size):
32 | valid_size = min(valid_size, len(data) // 10)
33 | random.shuffle(data)
34 | return data[valid_size:], data[:valid_size]
35 |
36 |
37 | def break_sentence(sentence, max_sent_len):
38 | """
39 |   For example, for a sentence with 70 words, supposing the `max_sent_len`
40 | is 30, break it into 3 sentences.
41 |
42 | :param sentence: list[str] the sentence
43 | :param max_sent_len:
44 | :return:
45 | """
46 | ret = []
47 | cur = 0
48 | length = len(sentence)
49 | while cur < length:
50 | if cur + max_sent_len + 5 >= length:
51 | ret.append(sentence[cur: length])
52 | break
53 | ret.append(sentence[cur: min(length, cur + max_sent_len)])
54 | cur += max_sent_len
55 | return ret
56 |
57 |
58 | def read_corpus(path, max_chars=None, max_sent_len=20):
59 | """
60 | read raw text file
61 | :param path: str
62 | :param max_chars: int
63 | :param max_sent_len: int
64 | :return:
65 | """
66 | data = []
67 | with codecs.open(path, 'r', encoding='utf-8') as fin:
68 | for line in fin:
69 |       data.append('<bos>')
70 | for token in line.strip().split():
71 | if max_chars is not None and len(token) + 2 > max_chars:
72 | token = token[:max_chars - 2]
73 | data.append(token)
74 |       data.append('<eos>')
75 | dataset = break_sentence(data, max_sent_len)
76 | return dataset
77 |
78 |
79 | def create_one_batch(x, word2id, char2id, config, oov='<oov>', pad='<pad>', sort=True):
80 | """
81 |
82 | :param x:
83 | :param word2id: dict
84 | :param char2id: dict
85 | :param config:
86 | :param oov:
87 | :param pad:
88 | :param sort:
89 | :return:
90 | """
91 | batch_size = len(x)
92 | lst = list(range(batch_size))
93 | if sort:
94 | lst.sort(key=lambda l: -len(x[l]))
95 |
96 | x = [x[i] for i in lst]
97 | lens = [len(x[i]) for i in lst]
98 | max_len = max(lens)
99 |
100 | if word2id is not None:
101 | oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None)
102 | assert oov_id is not None and pad_id is not None
103 | batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
104 | for i, x_i in enumerate(x):
105 | for j, x_ij in enumerate(x_i):
106 | batch_w[i][j] = word2id.get(x_ij, oov_id)
107 | else:
108 | batch_w = None
109 |
110 | if char2id is not None:
111 |     bow_id, eow_id, oov_id, pad_id = char2id.get('<bow>', None), char2id.get('<eow>', None), char2id.get(oov, None), char2id.get(pad, None)
112 |
113 | assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None
114 |
115 | if config['token_embedder']['name'].lower() == 'cnn':
116 | max_chars = config['token_embedder']['max_characters_per_token']
117 | assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars
118 | elif config['token_embedder']['name'].lower() == 'lstm':
119 |       max_chars = max([len(w) for i in lst for w in x[i]]) + 2  # counting the <bow> and <eow>
120 |
121 | batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
122 |
123 | for i, x_i in enumerate(x):
124 | for j, x_ij in enumerate(x_i):
125 | batch_c[i][j][0] = bow_id
126 |         if x_ij == '<bos>' or x_ij == '<eos>':
127 | batch_c[i][j][1] = char2id.get(x_ij)
128 | batch_c[i][j][2] = eow_id
129 | else:
130 | for k, c in enumerate(x_ij):
131 | batch_c[i][j][k + 1] = char2id.get(c, oov_id)
132 | batch_c[i][j][len(x_ij) + 1] = eow_id
133 | else:
134 | batch_c = None
135 |
136 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []]
137 |
138 | for i, x_i in enumerate(x):
139 | for j in range(len(x_i)):
140 | masks[0][i][j] = 1
141 | if j + 1 < len(x_i):
142 | masks[1].append(i * max_len + j)
143 | if j > 0:
144 | masks[2].append(i * max_len + j)
145 |
146 | assert len(masks[1]) <= batch_size * max_len
147 | assert len(masks[2]) <= batch_size * max_len
148 |
149 | masks[1] = torch.LongTensor(masks[1])
150 | masks[2] = torch.LongTensor(masks[2])
151 |
152 | return batch_w, batch_c, lens, masks
153 |
154 |
155 | # shuffle training examples and create mini-batches
156 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, use_cuda=False):
157 | """
158 |
159 | :param x:
160 | :param batch_size:
161 | :param word2id:
162 | :param char2id:
163 | :param config:
164 | :param perm:
165 | :param shuffle:
166 | :param sort:
167 | :param use_cuda:
168 | :return:
169 | """
170 | lst = perm or list(range(len(x)))
171 | if shuffle:
172 | random.shuffle(lst)
173 |
174 | if sort:
175 | lst.sort(key=lambda l: -len(x[l]))
176 |
177 | x = [x[i] for i in lst]
178 |
179 | sum_len = 0.0
180 | batches_w, batches_c, batches_lens, batches_masks = [], [], [], []
181 | size = batch_size
182 | nbatch = (len(x) - 1) // size + 1
183 | for i in range(nbatch):
184 | start_id, end_id = i * size, (i + 1) * size
185 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort)
186 | sum_len += sum(blens)
187 | batches_w.append(bw)
188 | batches_c.append(bc)
189 | batches_lens.append(blens)
190 | batches_masks.append(bmasks)
191 |
192 | if sort:
193 | perm = list(range(nbatch))
194 | random.shuffle(perm)
195 | batches_w = [batches_w[i] for i in perm]
196 | batches_c = [batches_c[i] for i in perm]
197 | batches_lens = [batches_lens[i] for i in perm]
198 | batches_masks = [batches_masks[i] for i in perm]
199 |
200 | logging.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x)))
201 | return batches_w, batches_c, batches_lens, batches_masks
202 |
203 |
204 | class Model(nn.Module):
205 | def __init__(self, config, word_emb_layer, char_emb_layer, n_class, use_cuda=False):
206 | super(Model, self).__init__()
207 | self.use_cuda = use_cuda
208 | self.config = config
209 |
210 | if config['token_embedder']['name'].lower() == 'cnn':
211 | self.token_embedder = ConvTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda)
212 | elif config['token_embedder']['name'].lower() == 'lstm':
213 | self.token_embedder = LstmTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda)
214 |
215 | if config['encoder']['name'].lower() == 'elmo':
216 | self.encoder = ElmobiLm(config, use_cuda)
217 | elif config['encoder']['name'].lower() == 'lstm':
218 | self.encoder = LstmbiLm(config, use_cuda)
219 |
220 | self.output_dim = config['encoder']['projection_dim']
221 | if config['classifier']['name'].lower() == 'softmax':
222 | self.classify_layer = SoftmaxLayer(self.output_dim, n_class)
223 | elif config['classifier']['name'].lower() == 'cnn_softmax':
224 | self.classify_layer = CNNSoftmaxLayer(self.token_embedder, self.output_dim, n_class,
225 | config['classifier']['n_samples'], config['classifier']['corr_dim'],
226 | use_cuda)
227 | elif config['classifier']['name'].lower() == 'sampled_softmax':
228 | self.classify_layer = SampledSoftmaxLayer(self.output_dim, n_class, config['classifier']['n_samples'], use_cuda)
229 |
230 | def forward(self, word_inp, chars_inp, mask_package):
231 | """
232 |
233 | :param word_inp:
234 | :param chars_inp:
235 | :param mask_package: Tuple[]
236 | :return:
237 | """
238 | classifier_name = self.config['classifier']['name'].lower()
239 |
240 |     if self.training and classifier_name in ('cnn_softmax', 'sampled_softmax'):
241 | self.classify_layer.update_negative_samples(word_inp, chars_inp, mask_package[0])
242 | self.classify_layer.update_embedding_matrix()
243 |
244 | token_embedding = self.token_embedder(word_inp, chars_inp, (mask_package[0].size(0), mask_package[0].size(1)))
245 | token_embedding = F.dropout(token_embedding, self.config['dropout'], self.training)
246 |
247 | encoder_name = self.config['encoder']['name'].lower()
248 | if encoder_name == 'elmo':
249 | mask = Variable(mask_package[0].cuda()).cuda() if self.use_cuda else Variable(mask_package[0])
250 | encoder_output = self.encoder(token_embedding, mask)
251 | encoder_output = encoder_output[1]
252 | # [batch_size, len, hidden_size]
253 | elif encoder_name == 'lstm':
254 | encoder_output = self.encoder(token_embedding)
255 | else:
256 |       raise ValueError('unknown encoder name: {0}'.format(encoder_name))
257 |
258 | encoder_output = F.dropout(encoder_output, self.config['dropout'], self.training)
259 | forward, backward = encoder_output.split(self.output_dim, 2)
260 |
261 | word_inp = Variable(word_inp)
262 | if self.use_cuda:
263 | word_inp = word_inp.cuda()
264 |
265 | mask1 = Variable(mask_package[1].cuda()).cuda() if self.use_cuda else Variable(mask_package[1])
266 | mask2 = Variable(mask_package[2].cuda()).cuda() if self.use_cuda else Variable(mask_package[2])
267 |
268 | forward_x = forward.contiguous().view(-1, self.output_dim).index_select(0, mask1)
269 | forward_y = word_inp.contiguous().view(-1).index_select(0, mask2)
270 |
271 | backward_x = backward.contiguous().view(-1, self.output_dim).index_select(0, mask2)
272 | backward_y = word_inp.contiguous().view(-1).index_select(0, mask1)
273 |
274 | return self.classify_layer(forward_x, forward_y), self.classify_layer(backward_x, backward_y)
275 |
276 | def save_model(self, path, save_classify_layer):
277 | torch.save(self.token_embedder.state_dict(), os.path.join(path, 'token_embedder.pkl'))
278 | torch.save(self.encoder.state_dict(), os.path.join(path, 'encoder.pkl'))
279 | if save_classify_layer:
280 | torch.save(self.classify_layer.state_dict(), os.path.join(path, 'classifier.pkl'))
281 |
282 | def load_model(self, path):
283 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl')))
284 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl')))
285 | self.classify_layer.load_state_dict(torch.load(os.path.join(path, 'classifier.pkl')))
286 |
287 |
288 | def eval_model(model, valid):
289 | model.eval()
290 | if model.config['classifier']['name'].lower() == 'cnn_softmax' or \
291 | model.config['classifier']['name'].lower() == 'sampled_softmax':
292 | model.classify_layer.update_embedding_matrix()
293 | total_loss, total_tag = 0.0, 0
294 | valid_w, valid_c, valid_lens, valid_masks = valid
295 | for w, c, lens, masks in zip(valid_w, valid_c, valid_lens, valid_masks):
296 | loss_forward, loss_backward = model.forward(w, c, masks)
297 |     total_loss += loss_forward.item()
298 | n_tags = sum(lens)
299 | total_tag += n_tags
300 | model.train()
301 | return np.exp(total_loss / total_tag)
302 |
303 |
304 | def train_model(epoch, opt, model, optimizer,
305 | train, valid, test, best_train, best_valid, test_result):
306 | """
307 | Training model for one epoch
308 |
309 | :param epoch:
310 | :param opt:
311 | :param model:
312 | :param optimizer:
313 | :param train:
314 | :param best_train:
315 | :param valid:
316 | :param best_valid:
317 | :param test:
318 | :param test_result:
319 | :return:
320 | """
321 | model.train()
322 |
323 | total_loss, total_tag = 0.0, 0
324 | cnt = 0
325 | start_time = time.time()
326 |
327 | train_w, train_c, train_lens, train_masks = train
328 |
329 | lst = list(range(len(train_w)))
330 | random.shuffle(lst)
331 |
332 | train_w = [train_w[l] for l in lst]
333 | train_c = [train_c[l] for l in lst]
334 | train_lens = [train_lens[l] for l in lst]
335 | train_masks = [train_masks[l] for l in lst]
336 |
337 | for w, c, lens, masks in zip(train_w, train_c, train_lens, train_masks):
338 | cnt += 1
339 | model.zero_grad()
340 | loss_forward, loss_backward = model.forward(w, c, masks)
341 |
342 | loss = (loss_forward + loss_backward) / 2.0
343 |     total_loss += loss_forward.item()
344 | n_tags = sum(lens)
345 | total_tag += n_tags
346 | loss.backward()
347 |
348 | torch.nn.utils.clip_grad_norm(model.parameters(), opt.clip_grad)
349 | optimizer.step()
350 | if cnt * opt.batch_size % 1024 == 0:
351 | logging.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f} time={:.2f}s".format(
352 | epoch, cnt, optimizer.param_groups[0]['lr'],
353 | np.exp(total_loss / total_tag), time.time() - start_time
354 | ))
355 | start_time = time.time()
356 |
357 | if cnt % opt.eval_steps == 0 or cnt % len(train_w) == 0:
358 | if valid is None:
359 | train_ppl = np.exp(total_loss / total_tag)
360 | logging.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f}".format(
361 | epoch, cnt, optimizer.param_groups[0]['lr'], train_ppl))
362 | if train_ppl < best_train:
363 | best_train = train_ppl
364 | logging.info("New record achieved on training dataset!")
365 | model.save_model(opt.model, opt.save_classify_layer)
366 | else:
367 | valid_ppl = eval_model(model, valid)
368 | logging.info("Epoch={} iter={} lr={:.6f} valid_ppl={:.6f}".format(
369 | epoch, cnt, optimizer.param_groups[0]['lr'], valid_ppl))
370 |
371 | if valid_ppl < best_valid:
372 | model.save_model(opt.model, opt.save_classify_layer)
373 | best_valid = valid_ppl
374 | logging.info("New record achieved!")
375 |
376 | if test is not None:
377 | test_result = eval_model(model, test)
378 | logging.info("Epoch={} iter={} lr={:.6f} test_ppl={:.6f}".format(
379 | epoch, cnt, optimizer.param_groups[0]['lr'], test_result))
380 | return best_train, best_valid, test_result
381 |
382 |
383 | def get_truncated_vocab(dataset, min_count):
384 | """
385 |
386 | :param dataset:
387 | :param min_count: int
388 | :return:
389 | """
390 | word_count = Counter()
391 | for sentence in dataset:
392 | word_count.update(sentence)
393 |
394 | word_count = list(word_count.items())
395 | word_count.sort(key=lambda x: x[1], reverse=True)
396 |
397 | i = 0
398 | for word, count in word_count:
399 | if count < min_count:
400 | break
401 | i += 1
402 |
403 | logging.info('Truncated word count: {0}.'.format(sum([count for word, count in word_count[i:]])))
404 | logging.info('Original vocabulary size: {0}.'.format(len(word_count)))
405 | return word_count[:i]
406 |
407 |
408 | def train():
409 | cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
410 | cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
411 | cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')
412 |
413 | cmd.add_argument('--train_path', required=True, help='The path to the training file.')
414 | cmd.add_argument('--valid_path', help='The path to the development file.')
415 | cmd.add_argument('--test_path', help='The path to the testing file.')
416 |
417 | cmd.add_argument('--config_path', required=True, help='the path to the config file.')
418 | cmd.add_argument("--word_embedding", help="The path to word vectors.")
419 |
420 | cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'],
421 | help='the type of optimizer: valid options=[sgd, adam, adagrad]')
422 | cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
423 | cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')
424 |
425 | cmd.add_argument("--model", required=True, help="path to save model")
426 |
427 | cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
428 | cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.')
429 |
430 | cmd.add_argument("--clip_grad", type=float, default=5, help='the tense of clipped grad.')
431 |
432 | cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')
433 |
434 | cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.')
435 |
436 | cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.')
437 |
438 | cmd.add_argument('--save_classify_layer', default=False, action='store_true',
439 | help="whether to save the classify layer")
440 |
441 | cmd.add_argument('--valid_size', type=int, default=0, help="size of validation dataset when there's no valid.")
442 | cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.')
443 |
444 | opt = cmd.parse_args(sys.argv[2:])
445 |
446 | with open(opt.config_path, 'r') as fin:
447 | config = json.load(fin)
448 |
449 | # Dump configurations
450 | print(opt)
451 | print(config)
452 |
453 | # set seed.
454 | torch.manual_seed(opt.seed)
455 | random.seed(opt.seed)
456 | if opt.gpu >= 0:
457 | torch.cuda.set_device(opt.gpu)
458 | if opt.seed > 0:
459 | torch.cuda.manual_seed(opt.seed)
460 |
461 | use_cuda = opt.gpu >= 0 and torch.cuda.is_available()
462 |
463 | token_embedder_name = config['token_embedder']['name'].lower()
464 | token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None)
465 | if token_embedder_name == 'cnn':
466 | train_data = read_corpus(opt.train_path, token_embedder_max_chars, opt.max_sent_len)
467 | elif token_embedder_name == 'lstm':
468 |     train_data = read_corpus(opt.train_path, max_sent_len=opt.max_sent_len)
469 | else:
470 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
471 |
472 | logging.info('training instance: {}, training tokens: {}.'.format(len(train_data),
473 | sum([len(s) - 1 for s in train_data])))
474 |
475 | if opt.valid_path is not None:
476 | if token_embedder_name == 'cnn':
477 | valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len)
478 | elif token_embedder_name == 'lstm':
479 |       valid_data = read_corpus(opt.valid_path, max_sent_len=opt.max_sent_len)
480 | else:
481 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
482 | logging.info('valid instance: {}, valid tokens: {}.'.format(len(valid_data),
483 | sum([len(s) - 1 for s in valid_data])))
484 | elif opt.valid_size > 0:
485 | train_data, valid_data = divide(train_data, opt.valid_size)
486 | logging.info('training instance: {}, training tokens after division: {}.'.format(
487 | len(train_data), sum([len(s) - 1 for s in train_data])))
488 | logging.info('valid instance: {}, valid tokens: {}.'.format(
489 | len(valid_data), sum([len(s) - 1 for s in valid_data])))
490 | else:
491 | valid_data = None
492 |
493 | if opt.test_path is not None:
494 | if token_embedder_name == 'cnn':
495 | test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len)
496 | elif token_embedder_name == 'lstm':
497 | test_data = read_corpus(opt.test_path, opt.max_sent_len)
498 | else:
499 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
500 | logging.info('testing instance: {}, testing tokens: {}.'.format(
501 | len(test_data), sum([len(s) - 1 for s in test_data])))
502 | else:
503 | test_data = None
504 |
505 | if opt.word_embedding is not None:
506 | embs = load_embedding(opt.word_embedding)
507 | word_lexicon = {word: i for i, word in enumerate(embs[0])}
508 | else:
509 | embs = None
510 | word_lexicon = {}
511 |
512 | # Maintain the vocabulary. The vocabulary is used in both the word-embedding input and the softmax classification layer.
513 | vocab = get_truncated_vocab(train_data, opt.min_count)
514 |
515 | # Ensure index of '<oov>' is 0
516 | for special_word in ['<oov>', '<pad>', '<bos>', '<eos>']:
517 | if special_word not in word_lexicon:
518 | word_lexicon[special_word] = len(word_lexicon)
519 |
520 | for word, _ in vocab:
521 | if word not in word_lexicon:
522 | word_lexicon[word] = len(word_lexicon)
523 |
524 | # Word Embedding
525 | if config['token_embedder']['word_dim'] > 0:
526 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=embs)
527 | logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))
528 | else:
529 | word_emb_layer = None
530 | logging.info('Vocabulary size: {0}'.format(len(word_lexicon)))
531 |
532 | # Character Lexicon
533 | if config['token_embedder']['char_dim'] > 0:
534 | char_lexicon = {}
535 | for sentence in train_data:
536 | for word in sentence:
537 | for ch in word:
538 | if ch not in char_lexicon:
539 | char_lexicon[ch] = len(char_lexicon)
540 |
541 | for special_char in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
542 | if special_char not in char_lexicon:
543 | char_lexicon[special_char] = len(char_lexicon)
544 |
545 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
546 | logging.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id)))
547 | else:
548 | char_lexicon = None
549 | char_emb_layer = None
550 |
551 | train = create_batches(
552 | train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda)
553 |
554 | if opt.eval_steps is None:
555 | opt.eval_steps = len(train[0])
556 | logging.info('Evaluate every {0} batches.'.format(opt.eval_steps))
557 |
558 | if valid_data is not None:
559 | valid = create_batches(
560 | valid_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
561 | else:
562 | valid = None
563 |
564 | if test_data is not None:
565 | test = create_batches(
566 | test_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
567 | else:
568 | test = None
569 |
570 | label_to_ix = word_lexicon
571 | logging.info('vocab size: {0}'.format(len(label_to_ix)))
572 |
573 | nclasses = len(label_to_ix)
574 |
575 | model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda)
576 | logging.info(str(model))
577 | if use_cuda:
578 | model = model.cuda()
579 |
580 | need_grad = lambda x: x.requires_grad
581 | if opt.optimizer.lower() == 'adam':
582 | optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
583 | elif opt.optimizer.lower() == 'sgd':
584 | optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)
585 | elif opt.optimizer.lower() == 'adagrad':
586 | optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr)
587 | else:
588 | raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower()))
589 |
590 | try:
591 | os.makedirs(opt.model)
592 | except OSError as exception:
593 | if exception.errno != errno.EEXIST:
594 | raise
595 |
596 | if config['token_embedder']['char_dim'] > 0:
597 | with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo:
598 | for ch, i in char_emb_layer.word2id.items():
599 | print('{0}\t{1}'.format(ch, i), file=fpo)
600 |
601 | with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
602 | for w, i in word_lexicon.items():
603 | print('{0}\t{1}'.format(w, i), file=fpo)
604 |
605 | json.dump(vars(opt), codecs.open(os.path.join(opt.model, 'config_rnn.json'), 'w', encoding='utf-8'))
606 |
607 | best_train = 1e+8
608 | best_valid = 1e+8
609 | test_result = 1e+8
610 |
611 | for epoch in range(opt.max_epoch):
612 | best_train, best_valid, test_result = train_model(epoch, opt, model, optimizer,
613 | train, valid, test, best_train, best_valid, test_result)
614 | if opt.lr_decay > 0:
615 | optimizer.param_groups[0]['lr'] *= opt.lr_decay
616 |
617 | if valid_data is None:
618 | logging.info("best train ppl: {:.6f}.".format(best_train))
619 | elif test_data is None:
620 | logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(best_train, best_valid))
621 | else:
622 | logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(best_train, best_valid, test_result))
623 |
624 |
625 | def test():
626 | cmd = argparse.ArgumentParser('The testing component of the language model.')
627 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
628 | cmd.add_argument("--input", help="the path to the raw text file.")
629 | cmd.add_argument("--model", required=True, help="path to save model")
630 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')
631 | args = cmd.parse_args(sys.argv[2:])
632 |
633 | if args.gpu >= 0:
634 | torch.cuda.set_device(args.gpu)
635 | use_cuda = args.gpu >= 0 and torch.cuda.is_available()
636 |
637 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config_rnn.json'), 'r', encoding='utf-8')))
638 |
639 | with open(args2.config_path, 'r') as fin:
640 | config = json.load(fin)
641 |
642 | if config['token_embedder']['char_dim'] > 0:
643 | char_lexicon = {}
644 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi:
645 | for line in fpi:
646 | tokens = line.strip().split('\t')
647 | if len(tokens) == 1:
648 | tokens.insert(0, '\u3000')
649 | token, i = tokens
650 | char_lexicon[token] = int(i)
651 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
652 | logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
653 | else:
654 | char_lexicon = None
655 | char_emb_layer = None
656 |
657 | word_lexicon = {}
658 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi:
659 | for line in fpi:
660 | tokens = line.strip().split('\t')
661 | if len(tokens) == 1:
662 | tokens.insert(0, '\u3000')
663 | token, i = tokens
664 | word_lexicon[token] = int(i)
665 |
666 | if config['token_embedder']['word_dim'] > 0:
667 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
668 | logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
669 | else:
670 | word_emb_layer = None
671 |
672 | model = Model(config, word_emb_layer, char_emb_layer, len(word_lexicon), use_cuda)
673 |
674 | if use_cuda:
675 | model.cuda()
676 |
677 | logging.info(str(model))
678 | model.load_model(args.model)
679 | if config['token_embedder']['name'].lower() == 'cnn':
680 | test = read_corpus(args.input, config['token_embedder']['max_characters_per_token'], max_sent_len=10000)
681 | elif config['token_embedder']['name'].lower() == 'lstm':
682 | test = read_corpus(args.input, max_sent_len=10000)
683 | else:
684 | raise ValueError('Unknown token embedder name: {}'.format(config['token_embedder']['name']))
685 |
686 | test_w, test_c, test_lens, test_masks = create_batches(
687 | test, args.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
688 |
689 | test_result = eval_model(model, (test_w, test_c, test_lens, test_masks))
690 |
691 | logging.info("test_ppl={:.6f}".format(test_result))
692 |
693 |
694 | if __name__ == "__main__":
695 | if len(sys.argv) > 1 and sys.argv[1] == 'train':
696 | train()
697 | elif len(sys.argv) > 1 and sys.argv[1] == 'test':
698 | test()
699 | else:
700 | print('Usage: {0} [train|test] [options]'.format(sys.argv[0]), file=sys.stderr)
701 |
--------------------------------------------------------------------------------
/elmoformanylangs/dataloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals
3 | import codecs
4 | import numpy as np
5 |
6 |
7 | def pad(sequences, pad_token='<pad>', pad_left=False):
8 | """
9 | input sequences is a list of text sequence [[str]]
10 | pad each text sequence to the length of the longest
11 |
12 | :param sequences:
13 | :param pad_token:
14 | :param pad_left:
15 | :return:
16 | """
17 | # max_len = max(5,max(len(seq) for seq in sequences))
18 | max_len = max(len(seq) for seq in sequences)
19 | if pad_left:
20 | return [[pad_token]*(max_len-len(seq)) + seq for seq in sequences]
21 | return [seq + [pad_token]*(max_len-len(seq)) for seq in sequences]
22 |
23 |
24 | def load_embedding_npz(path):
25 | data = np.load(path)
26 | return [str(w) for w in data['words']], data['vals']
27 |
28 |
29 | def load_embedding_txt(path):
30 | words = []
31 | vals = []
32 | with codecs.open(path, 'r', encoding='utf-8') as fin:
33 | fin.readline()
34 | for line in fin:
35 | line = line.strip()
36 | if line:
37 | parts = line.split()
38 | words.append(parts[0])
39 | vals += [float(x) for x in parts[1:]] # extend the flat list of values
40 | return words, np.asarray(vals).reshape(len(words), -1) # reshape
41 |
42 |
43 | def load_embedding(path):
44 | if path.endswith(".npz"):
45 | return load_embedding_npz(path)
46 | else:
47 | return load_embedding_txt(path)
48 |
--------------------------------------------------------------------------------
/elmoformanylangs/elmo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | from __future__ import unicode_literals
4 | import os
5 | import codecs
6 | import random
7 | import logging
8 | import json
9 | import torch
10 | from .modules.embedding_layer import EmbeddingLayer
11 | from .utils import dict2namedtuple
12 | from .frontend import create_one_batch
13 | from .frontend import Model
14 | import numpy as np
15 |
16 | logging.basicConfig(level=logging.INFO,
17 | format='%(asctime)-15s %(levelname)s: %(message)s')
18 |
19 |
20 | def read_list(sents, max_chars=None):
21 | """
22 | read raw text file. The format of the input is like, one sentence per line
23 | words are separated by '\t'
24 |
25 | :param path:
26 | :param max_chars: int, the number of maximum characters in a word, this
27 | parameter is used when the model is configured with CNN word encoder.
28 | :return:
29 | """
30 | dataset = []
31 | textset = []
32 | for sent in sents:
33 | data = ['<bos>']
34 | text = []
35 | for token in sent:
36 | text.append(token)
37 | if max_chars is not None and len(token) + 2 > max_chars:
38 | token = token[:max_chars - 2]
39 | data.append(token)
40 | data.append('<eos>')
41 | dataset.append(data)
42 | textset.append(text)
43 | return dataset, textset
44 |
45 |
46 | def recover(li, ind):
47 | # li[piv], ind = torch.sort(li[piv], dim=0, descending=(not unsort))
48 | dummy = list(range(len(ind)))
49 | dummy.sort(key=lambda l: ind[l])
50 | li = [li[i] for i in dummy]
51 | return li
52 |
53 |
54 | # shuffle training examples and create mini-batches
55 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=False, sort=True, text=None):
56 | ind = list(range(len(x)))
57 | lst = perm or list(range(len(x)))
58 | if shuffle:
59 | random.shuffle(lst)
60 |
61 | if sort:
62 | lst.sort(key=lambda l: -len(x[l]))
63 |
64 | x = [x[i] for i in lst]
65 | ind = [ind[i] for i in lst]
66 | if text is not None:
67 | text = [text[i] for i in lst]
68 |
69 | sum_len = 0.0
70 | batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], []
71 | size = batch_size
72 | nbatch = (len(x) - 1) // size + 1
73 | for i in range(nbatch):
74 | start_id, end_id = i * size, (i + 1) * size
75 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort)
76 | sum_len += sum(blens)
77 | batches_w.append(bw)
78 | batches_c.append(bc)
79 | batches_lens.append(blens)
80 | batches_masks.append(bmasks)
81 | batches_ind.append(ind[start_id: end_id])
82 | if text is not None:
83 | batches_text.append(text[start_id: end_id])
84 |
85 | if sort:
86 | perm = list(range(nbatch))
87 | random.shuffle(perm)
88 | batches_w = [batches_w[i] for i in perm]
89 | batches_c = [batches_c[i] for i in perm]
90 | batches_lens = [batches_lens[i] for i in perm]
91 | batches_masks = [batches_masks[i] for i in perm]
92 | batches_ind = [batches_ind[i] for i in perm]
93 | if text is not None:
94 | batches_text = [batches_text[i] for i in perm]
95 |
96 | logging.info("{} batches, avg len: {:.1f}".format(
97 | nbatch, sum_len / len(x)))
98 | recover_ind = [item for sublist in batches_ind for item in sublist]
99 | if text is not None:
100 | return batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind
101 | return batches_w, batches_c, batches_lens, batches_masks, recover_ind
102 |
103 |
104 | class Embedder(object):
105 | def __init__(self, model_dir, batch_size=4):
106 | self.model_dir = model_dir
107 | self.model, self.config = self.get_model()
108 | self.batch_size = batch_size
109 |
110 | def get_model(self):
111 | # torch.cuda.set_device(1)
112 | self.use_cuda = torch.cuda.is_available()
113 | # load the model configurations
114 | args2 = dict2namedtuple(json.load(codecs.open(
115 | os.path.join(self.model_dir, 'config_rnn.json'), 'r', encoding='utf-8')))
116 |
117 | with open(os.path.join(self.model_dir, args2.config_path), 'r') as fin:
118 | config = json.load(fin)
119 |
120 | # For the model trained with character-based word encoder.
121 | if config['token_embedder']['char_dim'] > 0:
122 | self.char_lexicon = {}
123 | with codecs.open(os.path.join(self.model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
124 | for line in fpi:
125 | tokens = line.strip().split('\t')
126 | if len(tokens) == 1:
127 | tokens.insert(0, '\u3000')
128 | token, i = tokens
129 | self.char_lexicon[token] = int(i)
130 | char_emb_layer = EmbeddingLayer(
131 | config['token_embedder']['char_dim'], self.char_lexicon, fix_emb=False, embs=None)
132 | logging.info('char embedding size: ' +
133 | str(len(char_emb_layer.word2id)))
134 | else:
135 | self.char_lexicon = None
136 | char_emb_layer = None
137 |
138 | # For the model trained with word form word encoder.
139 | if config['token_embedder']['word_dim'] > 0:
140 | self.word_lexicon = {}
141 | with codecs.open(os.path.join(self.model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi:
142 | for line in fpi:
143 | tokens = line.strip().split('\t')
144 | if len(tokens) == 1:
145 | tokens.insert(0, '\u3000')
146 | token, i = tokens
147 | self.word_lexicon[token] = int(i)
148 | word_emb_layer = EmbeddingLayer(
149 | config['token_embedder']['word_dim'], self.word_lexicon, fix_emb=False, embs=None)
150 | logging.info('word embedding size: ' +
151 | str(len(word_emb_layer.word2id)))
152 | else:
153 | self.word_lexicon = None
154 | word_emb_layer = None
155 |
156 | # instantiate the model
157 | model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda)
158 |
159 | if self.use_cuda:
160 | model.cuda()
161 |
162 | logging.info(str(model))
163 | model.load_model(self.model_dir)
164 |
165 | # read test data according to input format
166 |
167 | # configure the model to evaluation mode.
168 | model.eval()
169 | return model, config
170 |
171 | def sents2elmo(self, sents, output_layer=-1):
172 | read_function = read_list
173 |
174 | if self.config['token_embedder']['name'].lower() == 'cnn':
175 | test, text = read_function(sents, self.config['token_embedder']['max_characters_per_token'])
176 | else:
177 | test, text = read_function(sents)
178 |
179 | # create test batches from the input data.
180 | test_w, test_c, test_lens, test_masks, test_text, recover_ind = create_batches(
181 | test, self.batch_size, self.word_lexicon, self.char_lexicon, self.config, text=text)
182 |
183 | cnt = 0
184 |
185 | after_elmo = []
186 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text):
187 | output = self.model.forward(w, c, masks)
188 | for i, text in enumerate(texts):
189 |
190 | if self.config['encoder']['name'].lower() == 'lstm':
191 | data = output[i, 1:lens[i]-1, :].data
192 | if self.use_cuda:
193 | data = data.cpu()
194 | data = data.numpy()
195 | elif self.config['encoder']['name'].lower() == 'elmo':
196 | data = output[:, i, 1:lens[i]-1, :].data
197 | if self.use_cuda:
198 | data = data.cpu()
199 | data = data.numpy()
200 |
201 | if output_layer == -1:
202 | payload = np.average(data, axis=0)
203 | elif output_layer == -2:
204 | payload = data
205 | else:
206 | payload = data[output_layer]
207 | after_elmo.append(payload)
208 |
209 | cnt += 1
210 | if cnt % 1000 == 0:
211 | logging.info('Finished {0} sentences.'.format(cnt))
212 |
213 | after_elmo = recover(after_elmo, recover_ind)
214 | return after_elmo
215 |
--------------------------------------------------------------------------------
/elmoformanylangs/frontend.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import random
4 | import torch
5 | import torch.nn as nn
6 | import logging
7 | from torch.autograd import Variable
8 | from .modules.elmo import ElmobiLm
9 | from .modules.lstm import LstmbiLm
10 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder
11 |
12 |
13 | def create_one_batch(x, word2id, char2id, config, oov='<oov>', pad='<pad>', sort=True):
14 | """
15 | Create one batch of input.
16 |
17 | :param x: List[List[str]]
18 | :param word2id: Dict | None
19 | :param char2id: Dict | None
20 | :param config: Dict
21 | :param oov: str, the form of OOV token.
22 | :param pad: str, the form of padding token.
23 | :param sort: bool, whether to sort the sentences by length.
24 | :return:
25 | """
26 | batch_size = len(x)
27 | # lst represents the order of sentences
28 | lst = list(range(batch_size))
29 | if sort:
30 | lst.sort(key=lambda l: -len(x[l]))
31 |
32 | # reorder the sentences according to lst
33 | x = [x[i] for i in lst]
34 | lens = [len(x[i]) for i in lst]
35 | max_len = max(lens)
36 |
37 | # get a batch of word id whose size is (batch x max_len)
38 | if word2id is not None:
39 | oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None)
40 | assert oov_id is not None and pad_id is not None
41 | batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
42 | for i, x_i in enumerate(x):
43 | for j, x_ij in enumerate(x_i):
44 | batch_w[i][j] = word2id.get(x_ij, oov_id)
45 | else:
46 | batch_w = None
47 |
48 | # get a batch of character id whose size is (batch x max_len x max_chars)
49 | if char2id is not None:
50 | bow_id, eow_id, oov_id, pad_id = [char2id.get(key, None) for key in ('<bow>', '<eow>', oov, pad)]
51 |
52 | assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None
53 |
54 | if config['token_embedder']['name'].lower() == 'cnn':
55 | max_chars = config['token_embedder']['max_characters_per_token']
56 | assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars
57 | elif config['token_embedder']['name'].lower() == 'lstm':
58 | # + 2 accounts for the <bow> and <eow> characters
59 | max_chars = max([len(w) for i in lst for w in x[i]]) + 2
60 | else:
61 | raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
62 |
63 | batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
64 |
65 | for i, x_i in enumerate(x):
66 | for j, x_ij in enumerate(x_i):
67 | batch_c[i][j][0] = bow_id
68 | if x_ij == '<bos>' or x_ij == '<eos>':
69 | batch_c[i][j][1] = char2id.get(x_ij)
70 | batch_c[i][j][2] = eow_id
71 | else:
72 | for k, c in enumerate(x_ij):
73 | batch_c[i][j][k + 1] = char2id.get(c, oov_id)
74 | batch_c[i][j][len(x_ij) + 1] = eow_id
75 | else:
76 | batch_c = None
77 |
78 | # masks[0] is a (batch x max_len) matrix indicating whether each
79 | # position holds a valid token (i.e. not padding) in this batch.
80 | # masks[1] stores the flattened positions that have a valid
81 | # next token.
82 | # masks[2] stores the flattened positions that have a valid
83 | # previous token.
84 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []]
85 |
86 | for i, x_i in enumerate(x):
87 | for j in range(len(x_i)):
88 | masks[0][i][j] = 1
89 | if j + 1 < len(x_i):
90 | masks[1].append(i * max_len + j)
91 | if j > 0:
92 | masks[2].append(i * max_len + j)
93 |
94 | assert len(masks[1]) <= batch_size * max_len
95 | assert len(masks[2]) <= batch_size * max_len
96 |
97 | masks[1] = torch.LongTensor(masks[1])
98 | masks[2] = torch.LongTensor(masks[2])
99 |
100 | return batch_w, batch_c, lens, masks
101 |
102 |
103 | # shuffle training examples and create mini-batches
104 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, text=None):
105 | """
106 |
107 | :param x: List[List[str]]
108 | :param batch_size:
109 | :param word2id:
110 | :param char2id:
111 | :param config:
112 | :param perm:
113 | :param shuffle:
114 | :param sort:
115 | :param text:
116 | :return:
117 | """
118 | lst = perm or list(range(len(x)))
119 | if shuffle:
120 | random.shuffle(lst)
121 |
122 | if sort:
123 | lst.sort(key=lambda l: -len(x[l]))
124 |
125 | x = [x[i] for i in lst]
126 | if text is not None:
127 | text = [text[i] for i in lst]
128 |
129 | sum_len = 0.0
130 | batches_w, batches_c, batches_lens, batches_masks, batches_text = [], [], [], [], []
131 | size = batch_size
132 | nbatch = (len(x) - 1) // size + 1
133 | for i in range(nbatch):
134 | start_id, end_id = i * size, (i + 1) * size
135 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort)
136 | sum_len += sum(blens)
137 | batches_w.append(bw)
138 | batches_c.append(bc)
139 | batches_lens.append(blens)
140 | batches_masks.append(bmasks)
141 | if text is not None:
142 | batches_text.append(text[start_id: end_id])
143 |
144 | if sort:
145 | perm = list(range(nbatch))
146 | random.shuffle(perm)
147 | batches_w = [batches_w[i] for i in perm]
148 | batches_c = [batches_c[i] for i in perm]
149 | batches_lens = [batches_lens[i] for i in perm]
150 | batches_masks = [batches_masks[i] for i in perm]
151 | if text is not None:
152 | batches_text = [batches_text[i] for i in perm]
153 |
154 | logging.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x)))
155 | if text is not None:
156 | return batches_w, batches_c, batches_lens, batches_masks, batches_text
157 | return batches_w, batches_c, batches_lens, batches_masks
158 |
159 |
160 | class Model(nn.Module):
161 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False):
162 | super(Model, self).__init__()
163 | self.use_cuda = use_cuda
164 | self.config = config
165 |
166 | if config['token_embedder']['name'].lower() == 'cnn':
167 | self.token_embedder = ConvTokenEmbedder(
168 | config, word_emb_layer, char_emb_layer, use_cuda)
169 | elif config['token_embedder']['name'].lower() == 'lstm':
170 | self.token_embedder = LstmTokenEmbedder(
171 | config, word_emb_layer, char_emb_layer, use_cuda)
172 |
173 | if config['encoder']['name'].lower() == 'elmo':
174 | self.encoder = ElmobiLm(config, use_cuda)
175 | elif config['encoder']['name'].lower() == 'lstm':
176 | self.encoder = LstmbiLm(config, use_cuda)
177 |
178 | self.output_dim = config['encoder']['projection_dim']
179 |
180 | def forward(self, word_inp, chars_package, mask_package):
181 | """
182 |
183 | :param word_inp:
184 | :param chars_package:
185 | :param mask_package:
186 | :return:
187 | """
188 | token_embedding = self.token_embedder(word_inp, chars_package, (mask_package[0].size(0), mask_package[0].size(1)))
189 | if self.config['encoder']['name'] == 'elmo':
190 | mask = Variable(mask_package[0]).cuda() if self.use_cuda else Variable(mask_package[0])
191 | encoder_output = self.encoder(token_embedding, mask)
192 | sz = encoder_output.size()
193 | token_embedding = torch.cat(
194 | [token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3])
195 | encoder_output = torch.cat(
196 | [token_embedding, encoder_output], dim=0)
197 | elif self.config['encoder']['name'] == 'lstm':
198 | encoder_output = self.encoder(token_embedding)
199 | else:
200 | raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name']))
201 |
202 | return encoder_output
203 |
204 | def load_model(self, path):
205 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'),
206 | map_location=lambda storage, loc: storage))
207 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'),
208 | map_location=lambda storage, loc: storage))
209 |
--------------------------------------------------------------------------------
/elmoformanylangs/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/elmoformanylangs/modules/__init__.py
--------------------------------------------------------------------------------
/elmoformanylangs/modules/classify_layer.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.autograd import Variable
7 |
8 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')
9 |
10 |
11 | class SoftmaxLayer(nn.Module):
12 | """ Naive softmax-layer """
13 | def __init__(self, output_dim, n_class):
14 | """
15 |
16 | :param output_dim: int
17 | :param n_class: int
18 | """
19 | super(SoftmaxLayer, self).__init__()
20 | self.hidden2tag = nn.Linear(output_dim, n_class)
21 | self.criterion = nn.CrossEntropyLoss(size_average=False)
22 |
23 | def forward(self, x, y):
24 | """
25 |
26 | :param x: torch.Tensor
27 | :param y: torch.Tensor
28 | :return:
29 | """
30 | tag_scores = self.hidden2tag(x)
31 | return self.criterion(tag_scores, y)
32 |
33 |
34 | class SampledSoftmaxLayer(nn.Module):
35 | """
36 |
37 | """
38 | def __init__(self, output_dim, n_class, n_samples, use_cuda):
39 | """
40 |
41 | :param output_dim:
42 | :param n_class:
43 | :param n_samples:
44 | :param use_cuda:
45 | """
46 | super(SampledSoftmaxLayer, self).__init__()
47 | self.n_samples = n_samples
48 | self.n_class = n_class
49 | self.use_cuda = use_cuda
50 | self.criterion = nn.CrossEntropyLoss(size_average=False)
51 | self.negative_samples = []
52 | self.word_to_column = {0: 0}
53 |
54 | self.all_word = []
55 | self.all_word_to_column = {0: 0}
56 |
57 | self.column_emb = nn.Embedding(n_class, output_dim)
58 | self.column_emb.weight.data.uniform_(-0.25, 0.25)
59 |
60 | self.column_bias = nn.Embedding(n_class, 1)
61 | self.column_bias.weight.data.uniform_(-0.25, 0.25)
62 |
63 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1))
64 | self.oov_column.data.uniform_(-0.25, 0.25)
65 |
66 | def forward(self, x, y):
67 | if self.training:
68 | for i in range(y.size(0)):
69 | y[i] = self.word_to_column.get(y[i].tolist())
70 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0)
71 | for word in self.negative_samples:
72 | samples[self.word_to_column[word]] = word
73 | else:
74 | for i in range(y.size(0)):
75 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0)
76 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0)
77 | for word in self.all_word:
78 | samples[self.all_word_to_column[word]] = word
79 |
80 | if self.use_cuda:
81 | samples = samples.cuda()
82 |
83 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \
84 | (self.column_bias.forward(samples)).view(1, -1)
85 | return self.criterion(tag_scores, y)
86 |
87 | def update_embedding_matrix(self):
88 | word_inp, chars_inp = [], []
89 | if self.training:
90 | columns = torch.LongTensor(len(self.negative_samples) + 1)
91 | samples = self.negative_samples
92 | for i, word in enumerate(samples):
93 | columns[self.word_to_column[word]] = word
94 | columns[0] = 0
95 | else:
96 | columns = torch.LongTensor(len(self.all_word) + 1)
97 | samples = self.all_word
98 | for i, word in enumerate(samples):
99 | columns[self.all_word_to_column[word]] = word
100 | columns[0] = 0
101 |
102 | if self.use_cuda:
103 | columns = columns.cuda()
104 | self.embedding_matrix = self.column_emb.forward(columns).transpose(0, 1)
105 |
106 | def update_negative_samples(self, word_inp, chars_inp, mask):
107 | batch_size, seq_len = word_inp.size(0), word_inp.size(1)
108 | in_batch = set()
109 | for i in range(batch_size):
110 | for j in range(seq_len):
111 | if mask[i][j] == 0:
112 | continue
113 | word = word_inp[i][j].tolist()
114 | in_batch.add(word)
115 | for i in range(batch_size):
116 | for j in range(seq_len):
117 | if mask[i][j] == 0:
118 | continue
119 | word = word_inp[i][j].tolist()
120 | if word not in self.all_word_to_column:
121 | self.all_word.append(word)
122 | self.all_word_to_column[word] = len(self.all_word_to_column)
123 |
124 | if word not in self.word_to_column:
125 | if len(self.negative_samples) < self.n_samples:
126 | self.negative_samples.append(word)
127 | self.word_to_column[word] = len(self.word_to_column)
128 | else:
129 | while self.negative_samples[0] in in_batch:
130 | self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]]
131 | self.word_to_column[word] = self.word_to_column.pop(self.negative_samples[0])
132 | self.negative_samples = self.negative_samples[1:] + [word]
133 |
134 |
135 | class CNNSoftmaxLayer(nn.Module):
136 | def __init__(self, token_embedder, output_dim, n_class, n_samples, corr_dim, use_cuda):
137 | super(CNNSoftmaxLayer, self).__init__()
138 | self.token_embedder = token_embedder
139 | self.n_samples = n_samples
140 | self.use_cuda = use_cuda
141 | self.criterion = nn.CrossEntropyLoss(size_average=False)
142 | self.negative_samples = []
143 | self.word_to_column = {0: 0}
144 |
145 | self.all_word = []
146 | self.all_word_to_column = {0: 0}
147 |
148 | self.M = nn.Parameter(torch.Tensor(output_dim, corr_dim))
149 | stdv = 1. / math.sqrt(self.M.size(1))
150 | self.M.data.uniform_(-stdv, stdv)
151 |
152 | self.corr = nn.Embedding(n_class, corr_dim)
153 | self.corr.weight.data.uniform_(-0.25, 0.25)
154 |
155 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1))
156 | self.oov_column.data.uniform_(-0.25, 0.25)
157 |
158 | def forward(self, x, y):
159 | if self.training:
160 | for i in range(y.size(0)):
161 | y[i] = self.word_to_column.get(y[i].tolist())
162 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0)
163 | for package in self.negative_samples:
164 | samples[self.word_to_column[package[0]]] = package[0]
165 | else:
166 | for i in range(y.size(0)):
167 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0)
168 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0)
169 | for package in self.all_word:
170 | samples[self.all_word_to_column[package[0]]] = package[0]
171 |
172 | if self.use_cuda:
173 | samples = samples.cuda()
174 |
175 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \
176 | (x.matmul(self.M).matmul(self.corr.forward(samples).transpose(0, 1))).view(y.size(0), -1)
177 | return self.criterion(tag_scores, y)
178 |
179 | def update_embedding_matrix(self):
180 | batch_size = 2048
181 | word_inp, chars_inp = [], []
182 | if self.training:
183 | sub_matrices = [self.oov_column]
184 | samples = self.negative_samples
185 | id2pack = {}
186 | for i, package in enumerate(samples):
187 | id2pack[self.word_to_column[package[0]]] = i
188 | else:
189 | sub_matrices = [self.oov_column]
190 | samples = self.all_word
191 | id2pack = {}
192 | for i, package in enumerate(samples):
193 | id2pack[self.all_word_to_column[package[0]]] = i
194 |
195 | for i in range(len(samples)):
196 | # [n_samples, 1], [n_samples, 1, x], [n_samples, 1]
197 | word_inp.append(samples[id2pack[i + 1]][0])
198 | chars_inp.append(samples[id2pack[i + 1]][1])
199 | if len(word_inp) == batch_size or i == len(samples) - 1:
200 | sub_matrices.append(self.token_embedder.forward(torch.LongTensor(word_inp).view(len(word_inp), 1),
201 | None if chars_inp[0] is None else torch.LongTensor(chars_inp).view(len(word_inp), 1, len(package[1])),
202 | (len(word_inp), 1)).squeeze(1).transpose(0, 1))
203 | if not self.training:
204 | sub_matrices[-1] = sub_matrices[-1].detach()
205 | word_inp, chars_inp = [], []
206 |
207 | total = 0
208 | for mat in sub_matrices:
209 | total += mat.size(1)
210 | # print(total, len(self.word_to_column))
211 | self.embedding_matrix = torch.cat(sub_matrices, dim=1)
212 |
213 | def update_negative_samples(self, word_inp, chars_inp, mask):
214 | batch_size, seq_len = word_inp.size(0), word_inp.size(1)
215 | in_batch = set()
216 | for i in range(batch_size):
217 | for j in range(seq_len):
218 | if mask[i][j] == 0:
219 | continue
220 | word = word_inp[i][j].tolist()
221 | in_batch.add(word)
222 | for i in range(batch_size):
223 | for j in range(seq_len):
224 | if mask[i][j] == 0:
225 | continue
226 | package = (word_inp[i][j].tolist(), None if chars_inp is None else chars_inp[i][j].tolist())
227 | if package[0] not in self.all_word_to_column:
228 | self.all_word.append(package)
229 | self.all_word_to_column[package[0]] = len(self.all_word_to_column)
230 |
231 | if package[0] not in self.word_to_column:
232 | if len(self.negative_samples) < self.n_samples:
233 | self.negative_samples.append(package)
234 | self.word_to_column[package[0]] = len(self.word_to_column)
235 | else:
236 | while self.negative_samples[0][0] in in_batch:
237 | self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]]
238 | self.word_to_column[package[0]] = self.word_to_column.pop(self.negative_samples[0][0])
239 | self.negative_samples = self.negative_samples[1:] + [package]
240 |
--------------------------------------------------------------------------------
/elmoformanylangs/modules/elmo.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple, List, Callable, Union
2 |
3 | import h5py
4 | import numpy
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence
9 | from torch.autograd import Variable
10 |
11 | from .encoder_base import _EncoderBase
12 | from .lstm_cell_with_projection import LstmCellWithProjection
13 |
14 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name
15 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name
16 |
17 |
18 | class ElmobiLm(_EncoderBase):
19 | def __init__(self, config, use_cuda=False):
20 | super(ElmobiLm, self).__init__(stateful=True)
21 | self.config = config
22 | self.use_cuda = use_cuda
23 | input_size = config['encoder']['projection_dim']
24 | hidden_size = config['encoder']['projection_dim']
25 | cell_size = config['encoder']['dim']
26 | num_layers = config['encoder']['n_layers']
27 | memory_cell_clip_value = config['encoder']['cell_clip']
28 | state_projection_clip_value = config['encoder']['proj_clip']
29 | recurrent_dropout_probability = config['dropout']
30 |
31 | self.input_size = input_size
32 | self.hidden_size = hidden_size
33 | self.num_layers = num_layers
34 | self.cell_size = cell_size
35 |
36 | forward_layers = []
37 | backward_layers = []
38 |
39 | lstm_input_size = input_size
40 | go_forward = True
41 | for layer_index in range(num_layers):
42 | forward_layer = LstmCellWithProjection(lstm_input_size,
43 | hidden_size,
44 | cell_size,
45 | go_forward,
46 | recurrent_dropout_probability,
47 | memory_cell_clip_value,
48 | state_projection_clip_value)
49 | backward_layer = LstmCellWithProjection(lstm_input_size,
50 | hidden_size,
51 | cell_size,
52 | not go_forward,
53 | recurrent_dropout_probability,
54 | memory_cell_clip_value,
55 | state_projection_clip_value)
56 | lstm_input_size = hidden_size
57 |
58 | self.add_module('forward_layer_{}'.format(layer_index), forward_layer)
59 | self.add_module('backward_layer_{}'.format(layer_index), backward_layer)
60 | forward_layers.append(forward_layer)
61 | backward_layers.append(backward_layer)
62 | self.forward_layers = forward_layers
63 | self.backward_layers = backward_layers
64 |
65 | def forward(self, inputs, mask):
66 | batch_size, total_sequence_length = mask.size()
67 | stacked_sequence_output, final_states, restoration_indices = \
68 | self.sort_and_run_forward(self._lstm_forward, inputs, mask)
69 |
70 | num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size()
71 | # Add back invalid rows which were removed in the call to sort_and_run_forward.
72 | if num_valid < batch_size:
73 | zeros = stacked_sequence_output.data.new(num_layers,
74 | batch_size - num_valid,
75 | returned_timesteps,
76 | encoder_dim).fill_(0)
77 | zeros = Variable(zeros)
78 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 1)
79 |
80 | # The states also need to have invalid rows added back.
81 | new_states = []
82 | for state in final_states:
83 | state_dim = state.size(-1)
84 | zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0)
85 | zeros = Variable(zeros)
86 | new_states.append(torch.cat([state, zeros], 1))
87 | final_states = new_states
88 |
89 | # It's possible to need to pass sequences which are padded to longer than the
90 | # max length of the sequence to a Seq2StackEncoder. However, packing and unpacking
91 | # the sequences mean that the returned tensor won't include these dimensions, because
92 | # the RNN did not need to process them. We add them back on in the form of zeros here.
93 | sequence_length_difference = total_sequence_length - returned_timesteps
94 | if sequence_length_difference > 0:
95 | zeros = stacked_sequence_output.data.new(num_layers,
96 | batch_size,
97 | sequence_length_difference,
98 | stacked_sequence_output[0].size(-1)).fill_(0)
99 | zeros = Variable(zeros)
100 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 2)
101 |
102 | self._update_states(final_states, restoration_indices)
103 |
104 | # Restore the original indices and return the sequence.
105 | # Has shape (num_layers, batch_size, sequence_length, hidden_size)
106 | return stacked_sequence_output.index_select(1, restoration_indices)
107 |
108 |
109 | def _lstm_forward(self,
110 | inputs: PackedSequence,
111 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \
112 | Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
113 | """
114 | Parameters
115 | ----------
116 | inputs : ``PackedSequence``, required.
117 | A batch first ``PackedSequence`` to run the stacked LSTM over.
118 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
119 | A tuple (state, memory) representing the initial hidden state and memory
120 | of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and
121 | (num_layers, batch_size, 2 * cell_size) respectively.
122 | Returns
123 | -------
124 | output_sequence : ``torch.FloatTensor``
125 | The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size)
126 | final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
127 | The per-layer final (state, memory) states of the LSTM, with shape
128 | (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size)
129 | respectively. The last dimension is duplicated because it contains the state/memory
130 | for both the forward and backward layers.
131 | """
132 |
133 | if initial_state is None:
134 | hidden_states: List[Optional[Tuple[torch.Tensor,
135 | torch.Tensor]]] = [None] * len(self.forward_layers)
136 | elif initial_state[0].size()[0] != len(self.forward_layers):
137 | raise Exception("Initial states were passed to forward() but the number of "
138 | "initial states does not match the number of layers.")
139 | else:
140 | hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0)))
141 |
142 | inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
143 | forward_output_sequence = inputs
144 | backward_output_sequence = inputs
145 |
146 | final_states = []
147 | sequence_outputs = []
148 | for layer_index, state in enumerate(hidden_states):
149 | forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index))
150 | backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index))
151 |
152 | forward_cache = forward_output_sequence
153 | backward_cache = backward_output_sequence
154 |
155 | if state is not None:
156 | forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2)
157 | forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2)
158 | forward_state = (forward_hidden_state, forward_memory_state)
159 | backward_state = (backward_hidden_state, backward_memory_state)
160 | else:
161 | forward_state = None
162 | backward_state = None
163 |
164 | forward_output_sequence, forward_state = forward_layer(forward_output_sequence,
165 | batch_lengths,
166 | forward_state)
167 | backward_output_sequence, backward_state = backward_layer(backward_output_sequence,
168 | batch_lengths,
169 | backward_state)
170 | # Skip connections, just adding the input to the output.
171 | if layer_index != 0:
172 | forward_output_sequence += forward_cache
173 | backward_output_sequence += backward_cache
174 |
175 | sequence_outputs.append(torch.cat([forward_output_sequence,
176 | backward_output_sequence], -1))
177 | # Append the state tuples in a list, so that we can return
178 | # the final states for all the layers.
179 | final_states.append((torch.cat([forward_state[0], backward_state[0]], -1),
180 | torch.cat([forward_state[1], backward_state[1]], -1)))
181 |
182 | stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)
183 | # Stack the hidden state and memory for each layer into 2 tensors of shape
184 | # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size)
185 | # respectively.
186 | final_hidden_states, final_memory_states = zip(*final_states)
187 | final_state_tuple: Tuple[torch.FloatTensor,
188 | torch.FloatTensor] = (torch.cat(final_hidden_states, 0),
189 | torch.cat(final_memory_states, 0))
190 | return stacked_sequence_outputs, final_state_tuple
--------------------------------------------------------------------------------
/elmoformanylangs/modules/embedding_layer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import logging
6 |
7 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')
8 |
9 |
10 | class EmbeddingLayer(nn.Module):
11 | def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='<oov>', pad='<pad>', normalize=True):
12 | super(EmbeddingLayer, self).__init__()
13 | if embs is not None:
14 | embwords, embvecs = embs
15 | # for word in embwords:
16 | # assert word not in word2id, "Duplicate words in pre-trained embeddings"
17 | # word2id[word] = len(word2id)
18 |
19 | logging.info("{} pre-trained word embeddings loaded.".format(len(word2id)))
20 | if n_d != len(embvecs[0]):
21 | logging.warning("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.".format(
22 | n_d, len(embvecs[0]), len(embvecs[0])))
23 | n_d = len(embvecs[0])
24 |
25 | self.word2id = word2id
26 | self.id2word = {i: word for word, i in word2id.items()}
27 | self.n_V, self.n_d = len(word2id), n_d
28 | self.oovid = word2id[oov]
29 | self.padid = word2id[pad]
30 | self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid)
31 | self.embedding.weight.data.uniform_(-0.25, 0.25)
32 |
33 | if embs is not None:
34 | weight = self.embedding.weight
35 | weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs))
36 | logging.info("embedding shape: {}".format(weight.size()))
37 |
38 | if normalize:
39 | weight = self.embedding.weight
40 | norms = weight.data.norm(2, 1)
41 | if norms.dim() == 1:
42 | norms = norms.unsqueeze(1)
43 | weight.data.div_(norms.expand_as(weight.data))
44 |
45 | if fix_emb:
46 | self.embedding.weight.requires_grad = False
47 |
48 | def forward(self, input_):
49 | return self.embedding(input_)
--------------------------------------------------------------------------------
/elmoformanylangs/modules/encoder_base.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, Union, Optional, Callable
2 | import torch
3 | from torch.autograd import Variable
4 | from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence
5 |
6 | from .util import get_lengths_from_binary_sequence_mask, sort_batch_by_length
7 |
8 | # We have two types here for the state, because storing the state in something
9 | # which is Iterable (like a tuple, below), is helpful for internal manipulation
10 | # - however, the states are consumed as either Tensors or a Tuple of Tensors, so
11 | # returning them in this format is unhelpful.
12 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name
13 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name
14 |
15 |
16 | class _EncoderBase(torch.nn.Module):
17 | # pylint: disable=abstract-method
18 | """
19 | This abstract class serves as a base for the ``Encoder`` abstractions in AllenNLP.
20 | - :class:`~allennlp.modules.seq2seq_encoders.Seq2SeqEncoders`
21 | - :class:`~allennlp.modules.seq2vec_encoders.Seq2VecEncoders`
22 | Additionally, this class provides functionality for sorting sequences by length
23 | so they can be consumed by Pytorch RNN classes, which require their inputs to be
24 | sorted by length. Finally, it also provides optional statefulness to all of its
25 | subclasses by allowing the caching and retrieving of the hidden states of RNNs.
26 | """
27 | def __init__(self, stateful: bool = False) -> None:
28 | super(_EncoderBase, self).__init__()
29 | self.stateful = stateful
30 | self._states: Optional[RnnStateStorage] = None
31 |
32 | def sort_and_run_forward(self,
33 | module: Callable[[PackedSequence, Optional[RnnState]],
34 | Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
35 | inputs: torch.Tensor,
36 | mask: torch.Tensor,
37 | hidden_state: Optional[RnnState] = None):
38 | """
39 | This function exists because Pytorch RNNs require that their inputs be sorted
40 | before being passed as input. As all of our Seq2xxxEncoders use this functionality,
41 | it is provided in a base class. This method can be called on any module which
42 | takes as input a ``PackedSequence`` and some ``hidden_state``, which can either be a
43 | tuple of tensors or a tensor.
44 | As all of our Seq2xxxEncoders have different return types, we return `sorted`
45 | outputs from the module, which is called directly. Additionally, we return the
46 | indices into the batch dimension required to restore the tensor to its correct,
47 | unsorted order and the number of valid batch elements (i.e. the number of elements
48 | in the batch which are not completely masked). This un-sorting and re-padding
49 | of the module outputs is left to the subclasses because their outputs have different
50 | types and handling them smoothly here is difficult.
51 | Parameters
52 | ----------
53 | module : ``Callable[[PackedSequence, Optional[RnnState]],
54 | Tuple[Union[PackedSequence, torch.Tensor], RnnState]]``, required.
55 | A function to run on the inputs. In most cases, this is a ``torch.nn.Module``.
56 | inputs : ``torch.Tensor``, required.
57 | A tensor of shape ``(batch_size, sequence_length, embedding_size)`` representing
58 | the inputs to the Encoder.
59 | mask : ``torch.Tensor``, required.
60 | A tensor of shape ``(batch_size, sequence_length)``, representing masked and
61 | non-masked elements of the sequence for each element in the batch.
62 | hidden_state : ``Optional[RnnState]``, (default = None).
63 | A single tensor of shape (num_layers, batch_size, hidden_size) representing the
64 | state of an RNN, or a tuple of
65 | tensors of shapes (num_layers, batch_size, hidden_size) and
66 | (num_layers, batch_size, memory_size), representing the hidden state and memory
67 | state of an LSTM-like RNN.
68 | Returns
69 | -------
70 | module_output : ``Union[torch.Tensor, PackedSequence]``.
71 | A Tensor or PackedSequence representing the output of the Pytorch Module.
72 | The batch size dimension will be equal to ``num_valid``, as sequences of zero
73 | length are clipped off before the module is called, as Pytorch cannot handle
74 | zero length sequences.
75 | final_states : ``Optional[RnnState]``
76 | A Tensor representing the hidden state of the Pytorch Module. This can either
77 | be a single tensor of shape (num_layers, num_valid, hidden_size), for instance in
78 | the case of a GRU, or a tuple of tensors, such as those required for an LSTM.
79 | restoration_indices : ``torch.LongTensor``
80 | A tensor of shape ``(batch_size,)``, describing the re-indexing required to transform
81 | the outputs back to their original batch order.
82 | """
83 | # In some circumstances you may have sequences of zero length. ``pack_padded_sequence``
84 | # requires all sequence lengths to be > 0, so remove sequences of zero length before
85 | # calling self._module, then fill with zeros.
86 |
87 | # First count how many sequences are empty.
88 | batch_size = mask.size(0)
89 | num_valid = torch.sum(mask[:, 0]).int().item()
90 |
91 | sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
92 | sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices =\
93 | sort_batch_by_length(inputs, sequence_lengths)
94 |
95 | # Now create a PackedSequence with only the non-empty, sorted sequences.
96 | packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
97 | sorted_sequence_lengths[:num_valid].data.tolist(),
98 | batch_first=True)
99 | # Prepare the initial states.
100 | if not self.stateful:
101 | if hidden_state is None:
102 | initial_states = hidden_state
103 | elif isinstance(hidden_state, tuple):
104 | initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :]
105 | for state in hidden_state]
106 | else:
107 | initial_states = hidden_state.index_select(1, sorting_indices)[:, :num_valid, :]
108 |
109 | else:
110 | initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices)
111 |
112 | # Actually call the module on the sorted PackedSequence.
113 | module_output, final_states = module(packed_sequence_input, initial_states)
114 |
115 | return module_output, final_states, restoration_indices
116 |
117 | def _get_initial_states(self,
118 | batch_size: int,
119 | num_valid: int,
120 | sorting_indices: torch.LongTensor) -> Optional[RnnState]:
121 | """
122 | Returns an initial state for use in an RNN. Additionally, this method handles
123 | the batch size changing across calls by mutating the state to append initial states
124 | for new elements in the batch. Finally, it also handles sorting the states
125 | with respect to the sequence lengths of elements in the batch and removing rows
126 | which are completely padded. Importantly, this `mutates` the state if the
127 | current batch size is larger than when it was previously called.
128 | Parameters
129 | ----------
130 | batch_size : ``int``, required.
131 | The batch size can change size across calls to stateful RNNs, so we need
132 | to know if we need to expand or shrink the states before returning them.
133 | Expanded states will be set to zero.
134 | num_valid : ``int``, required.
135 | The batch may contain completely padded sequences which get removed before
136 | the sequence is passed through the encoder. We also need to clip these off
137 | of the state too.
138 | sorting_indices ``torch.LongTensor``, required.
139 | Pytorch RNNs take sequences sorted by length. When we return the states to be
140 | used for a given call to ``module.forward``, we need the states to match up to
141 | the sorted sequences, so before returning them, we sort the states using the
142 | same indices used to sort the sequences.
143 | Returns
144 | -------
145 | This method has a complex return type because it has to deal with the first time it
146 | is called, when it has no state, and the fact that types of RNN have heterogeneous
147 | states.
148 | If it is the first time the module has been called, it returns ``None``, regardless
149 | of the type of the ``Module``.
150 | Otherwise, for LSTMs, it returns a tuple of ``torch.Tensors`` with shape
151 | ``(num_layers, num_valid, state_size)`` and ``(num_layers, num_valid, memory_size)``
152 | respectively, or for GRUs, it returns a single ``torch.Tensor`` of shape
153 | ``(num_layers, num_valid, state_size)``.
154 | """
155 | # We don't know the state sizes the first time calling forward,
156 | # so we let the module define what its initial hidden state looks like.
157 | if self._states is None:
158 | return None
159 |
160 | # Otherwise, we have some previous states.
161 | if batch_size > self._states[0].size(1):
162 | # This batch is larger than all previous states,
163 | # so resize the states accordingly.
164 | num_states_to_concat = batch_size - self._states[0].size(1)
165 | resized_states = []
166 | # state has shape (num_layers, batch_size, hidden_size)
167 | for state in self._states:
168 | # This _must_ be inside the loop because some
169 | # RNNs have states with different last dimension sizes.
170 | zeros = state.data.new(state.size(0),
171 | num_states_to_concat,
172 | state.size(2)).fill_(0)
173 | zeros = Variable(zeros)
174 | resized_states.append(torch.cat([state, zeros], 1))
175 | self._states = tuple(resized_states)
176 | correctly_shaped_states = self._states
177 |
178 | elif batch_size < self._states[0].size(1):
179 | # This batch is smaller than the previous one.
180 | correctly_shaped_states = tuple(state[:, :batch_size, :] for state in self._states)
181 | else:
182 | correctly_shaped_states = self._states
183 |
184 | # At this point, our states are of shape (num_layers, batch_size, hidden_size).
185 | # However, the encoder uses sorted sequences and additionally removes elements
186 | # of the batch which are fully padded. We need the states to match up to these
187 | # sorted and filtered sequences, so we do that in the next two blocks before
188 | # returning the state/s.
189 | if len(self._states) == 1:
190 | # GRUs only have a single state. This `unpacks` it from the
191 | # tuple and returns the tensor directly.
192 | correctly_shaped_state = correctly_shaped_states[0]
193 | sorted_state = correctly_shaped_state.index_select(1, sorting_indices)
194 | return sorted_state[:, :num_valid, :]
195 | else:
196 | # LSTMs have a state tuple of (state, memory).
197 | sorted_states = [state.index_select(1, sorting_indices)
198 | for state in correctly_shaped_states]
199 | return tuple(state[:, :num_valid, :] for state in sorted_states)
200 |
201 | def _update_states(self,
202 | final_states: RnnStateStorage,
203 | restoration_indices: torch.LongTensor) -> None:
204 | """
205 | After the RNN has run forward, the states need to be updated.
206 | This method just sets the state to the updated new state, performing
207 | several pieces of book-keeping along the way - namely, unsorting the
208 | states and ensuring that the states of completely padded sequences are
209 | not updated. Finally, it also detaches the state variable from the
210 | computational graph, such that the graph can be garbage collected after
211 | each batch iteration.
212 | Parameters
213 | ----------
214 | final_states : ``RnnStateStorage``, required.
215 | The hidden states returned as output from the RNN.
216 | restoration_indices : ``torch.LongTensor``, required.
217 | The indices that invert the sorting used in ``sort_and_run_forward``
218 | to order the states with respect to the lengths of the sequences in
219 | the batch.
220 | """
221 | # TODO(Mark): seems weird to sort here, but append zeros in the subclasses.
222 | # which way around is best?
223 | new_unsorted_states = [state.index_select(1, restoration_indices)
224 | for state in final_states]
225 |
226 | if self._states is None:
227 | # We don't already have states, so just set the
228 | # ones we receive to be the current state.
229 | self._states = tuple([torch.autograd.Variable(state.data)
230 | for state in new_unsorted_states])
231 | else:
232 | # Now we've sorted the states back so that they correspond to the original
233 | # indices, we need to figure out what states we need to update, because if we
234 | # didn't use a state for a particular row, we want to preserve its state.
235 | # Thankfully, the rows which are all zero in the state correspond exactly
236 | # to those which aren't used, so we create masks of shape (new_batch_size,),
237 | # denoting which states were used in the RNN computation.
238 | current_state_batch_size = self._states[0].size(1)
239 | new_state_batch_size = final_states[0].size(1)
240 |             # Masks of shape (1, new_batch_size, 1) marking which rows of the new states were used.
241 | used_new_rows_mask = [(state[0, :, :].sum(-1)
242 | != 0.0).float().view(1, new_state_batch_size, 1)
243 | for state in new_unsorted_states]
244 | new_states = []
245 | if current_state_batch_size > new_state_batch_size:
246 | # The new state is smaller than the old one,
247 | # so just update the indices which we used.
248 | for old_state, new_state, used_mask in zip(self._states,
249 | new_unsorted_states,
250 | used_new_rows_mask):
251 | # zero out all rows in the previous state
252 | # which _were_ used in the current state.
253 | masked_old_state = old_state[:, :new_state_batch_size, :] * (1 - used_mask)
254 | # The old state is larger, so update the relevant parts of it.
255 | old_state[:, :new_state_batch_size, :] = new_state + masked_old_state
256 |                     # Detach the Variable.
257 | new_states.append(torch.autograd.Variable(old_state.data))
258 | else:
259 | # The states are the same size, so we just have to
260 | # deal with the possibility that some rows weren't used.
261 | new_states = []
262 | for old_state, new_state, used_mask in zip(self._states,
263 | new_unsorted_states,
264 | used_new_rows_mask):
265 | # zero out all rows which _were_ used in the current state.
266 | masked_old_state = old_state * (1 - used_mask)
267 |                     # The states are the same size, so add back the rows which weren't used.
268 | new_state += masked_old_state
269 |                     # Detach the Variable.
270 | new_states.append(torch.autograd.Variable(new_state.data))
271 |
272 | # It looks like there should be another case handled here - when
273 | # the current_state_batch_size < new_state_batch_size. However,
274 |         # this never happens, because the states themselves are mutated
275 |         # by appending zeros when calling _get_initial_states, meaning that
276 | # the new states are either of equal size, or smaller, in the case
277 | # that there are some unused elements (zero-length) for the RNN computation.
278 | self._states = tuple(new_states)
279 |
280 | def reset_states(self):
281 | self._states = None
282 |
--------------------------------------------------------------------------------
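
The masked update in `_update_states` above hinges on one invariant: rows of the returned state that are entirely zero belong to batch elements the RNN never ran over, so the previous state must survive there. A minimal standalone sketch of that merge, assuming a single GRU-style state of shape (num_layers, batch_size, hidden_size):

    import torch

    old_state = torch.randn(1, 4, 8)           # state carried over from the last batch
    new_state = old_state.clone() + 1.0
    new_state[:, 2:, :] = 0.0                  # pretend rows 2 and 3 were fully padded

    # 1.0 where the RNN actually produced a state, 0.0 where the row is all zeros
    used_mask = (new_state[0].sum(-1) != 0.0).float().view(1, -1, 1)

    # keep the freshly computed rows, fall back to the old state elsewhere
    merged = new_state + old_state * (1 - used_mask)
    assert torch.equal(merged[:, 2:, :], old_state[:, 2:, :])
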
/elmoformanylangs/modules/highway.py:
--------------------------------------------------------------------------------
1 | """
2 | A `Highway layer <https://arxiv.org/abs/1505.00387>`_ that does a gated combination of a linear
3 | transformation and a non-linear transformation of its input.
4 | """
5 |
6 | from typing import Callable
7 |
8 | import torch
9 | from overrides import overrides
10 |
11 |
12 | class Highway(torch.nn.Module):
13 | """
14 |     A `Highway layer <https://arxiv.org/abs/1505.00387>`_ does a gated combination of a linear
15 | transformation and a non-linear transformation of its input. :math:`y = g * x + (1 - g) *
16 | f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise
17 | non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`.
18 | This module will apply a fixed number of highway layers to its input, returning the final
19 | result.
20 | Parameters
21 | ----------
22 | input_dim : ``int``
23 | The dimensionality of :math:`x`. We assume the input has shape ``(batch_size,
24 | input_dim)``.
25 | num_layers : ``int``, optional (default=``1``)
26 | The number of highway layers to apply to the input.
27 | activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``)
28 | The non-linearity to use in the highway layers.
29 | """
30 | def __init__(self,
31 | input_dim: int,
32 | num_layers: int = 1,
33 | activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None:
34 | super(Highway, self).__init__()
35 | self._input_dim = input_dim
36 | self._layers = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim * 2)
37 | for _ in range(num_layers)])
38 | self._activation = activation
39 | for layer in self._layers:
40 | # We should bias the highway layer to just carry its input forward. We do that by
41 | # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
42 |         # be high, so we will carry the input forward. The bias on `B(x)` is the second half
43 | # of the bias vector in each Linear layer.
44 | layer.bias[input_dim:].data.fill_(1)
45 |
46 | @overrides
47 | def forward(self, inputs: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ
48 | current_input = inputs
49 | for layer in self._layers:
50 | projected_input = layer(current_input)
51 | linear_part = current_input
52 | # NOTE: if you modify this, think about whether you should modify the initialization
53 | # above, too.
54 | nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)]
55 | gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)]
56 | nonlinear_part = self._activation(nonlinear_part)
57 | gate = torch.sigmoid(gate)
58 | current_input = gate * linear_part + (1 - gate) * nonlinear_part
59 | return current_input
60 |
--------------------------------------------------------------------------------
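
Since `Highway` preserves its input dimension at every layer, it drops into a network without any shape bookkeeping. A minimal usage sketch (random values, shapes only):

    import torch

    highway = Highway(input_dim=6, num_layers=2)
    x = torch.randn(3, 6)                      # (batch_size, input_dim)
    y = highway(x)
    assert y.shape == (3, 6)

With the second half of each bias initialised to 1, the gate starts near sigmoid(1) ≈ 0.73, so a freshly initialised layer mostly carries its input through unchanged.
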
/elmoformanylangs/modules/lstm.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import unicode_literals
3 | import logging
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import copy
9 |
10 |
11 | class LstmbiLm(nn.Module):
12 | def __init__(self, config, use_cuda=False):
13 | super(LstmbiLm, self).__init__()
14 | self.config = config
15 | self.use_cuda = use_cuda
16 |
17 | self.encoder = nn.LSTM(self.config['encoder']['projection_dim'],
18 | self.config['encoder']['dim'],
19 | num_layers=self.config['encoder']['n_layers'],
20 | bidirectional=True,
21 | batch_first=True,
22 | dropout=self.config['dropout'])
23 | self.projection = nn.Linear(self.config['encoder']['dim'], self.config['encoder']['projection_dim'], bias=True)
24 |
25 | def forward(self, inputs):
26 | forward, backward = self.encoder(inputs)[0].split(self.config['encoder']['dim'], 2)
27 | return torch.cat([self.projection(forward), self.projection(backward)], dim=2)
28 |
--------------------------------------------------------------------------------
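
`LstmbiLm` reads every size from a nested config dict; the sketch below shows the keys its constructor and `forward` actually touch (the values are illustrative, not the project's shipped configs):

    import torch

    config = {
        'dropout': 0.1,
        'encoder': {'projection_dim': 16, 'dim': 32, 'n_layers': 2},
    }
    model = LstmbiLm(config)
    inputs = torch.randn(4, 7, 16)             # (batch, seq_len, projection_dim)
    out = model(inputs)
    assert out.shape == (4, 7, 32)             # forward and backward halves, each projected to 16
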
/elmoformanylangs/modules/lstm_cell_with_projection.py:
--------------------------------------------------------------------------------
1 | """
2 | An LSTM with Recurrent Dropout, a hidden_state which is projected and
3 | clipping on both the hidden state and the memory state of the LSTM.
4 | """
5 |
6 | from typing import Optional, Tuple, List
7 |
8 | import torch
9 | from torch.autograd import Variable
10 |
11 | from .util import block_orthogonal, get_dropout_mask
12 |
13 | class LstmCellWithProjection(torch.nn.Module):
14 | """
15 | An LSTM with Recurrent Dropout and a projected and clipped hidden state and
16 | memory. Note: this implementation is slower than the native Pytorch LSTM because
17 |     it cannot make use of CUDNN optimizations for stacked RNNs due to the
18 | variational dropout and the custom nature of the cell state.
19 | Parameters
20 | ----------
21 | input_size : ``int``, required.
22 | The dimension of the inputs to the LSTM.
23 | hidden_size : ``int``, required.
24 | The dimension of the outputs of the LSTM.
25 | cell_size : ``int``, required.
26 | The dimension of the memory cell used for the LSTM.
27 | go_forward: ``bool``, optional (default = True)
28 | The direction in which the LSTM is applied to the sequence.
29 | Forwards by default, or backwards if False.
30 | recurrent_dropout_probability: ``float``, optional (default = 0.0)
31 | The dropout probability to be used in a dropout scheme as stated in
32 | `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks
33 |     <https://arxiv.org/abs/1512.05287>`_ . Implementation-wise, this simply
34 | applies a fixed dropout mask per sequence to the recurrent connection of the
35 | LSTM.
36 | state_projection_clip_value: ``float``, optional, (default = None)
37 | The magnitude with which to clip the hidden_state after projecting it.
38 | memory_cell_clip_value: ``float``, optional, (default = None)
39 | The magnitude with which to clip the memory cell.
40 | Returns
41 | -------
42 | output_accumulator : ``torch.FloatTensor``
43 | The outputs of the LSTM for each timestep. A tensor of shape
44 | (batch_size, max_timesteps, hidden_size) where for a given batch
45 | element, all outputs past the sequence length for that batch are
46 | zero tensors.
47 | final_state: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
48 | The final (state, memory) states of the LSTM, with shape
49 | (1, batch_size, hidden_size) and (1, batch_size, cell_size)
50 | respectively. The first dimension is 1 in order to match the Pytorch
51 | API for returning stacked LSTM states.
52 | """
53 | def __init__(self,
54 | input_size: int,
55 | hidden_size: int,
56 | cell_size: int,
57 | go_forward: bool = True,
58 | recurrent_dropout_probability: float = 0.0,
59 | memory_cell_clip_value: Optional[float] = None,
60 | state_projection_clip_value: Optional[float] = None) -> None:
61 | super(LstmCellWithProjection, self).__init__()
62 | # Required to be wrapped with a :class:`PytorchSeq2SeqWrapper`.
63 | self.input_size = input_size
64 | self.hidden_size = hidden_size
65 | self.cell_size = cell_size
66 |
67 | self.go_forward = go_forward
68 | self.state_projection_clip_value = state_projection_clip_value
69 | self.memory_cell_clip_value = memory_cell_clip_value
70 | self.recurrent_dropout_probability = recurrent_dropout_probability
71 |
72 | # We do the projections for all the gates all at once.
73 | self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, bias=False)
74 | self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, bias=True)
75 |
76 | # Additional projection matrix for making the hidden state smaller.
77 | self.state_projection = torch.nn.Linear(cell_size, hidden_size, bias=False)
78 | self.reset_parameters()
79 |
80 | def reset_parameters(self):
81 | # Use sensible default initializations for parameters.
82 | block_orthogonal(self.input_linearity.weight.data, [self.cell_size, self.input_size])
83 | block_orthogonal(self.state_linearity.weight.data, [self.cell_size, self.hidden_size])
84 |
85 | self.state_linearity.bias.data.fill_(0.0)
86 | # Initialize forget gate biases to 1.0 as per An Empirical
87 | # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015).
88 | self.state_linearity.bias.data[self.cell_size:2 * self.cell_size].fill_(1.0)
89 |
90 | def forward(self, # pylint: disable=arguments-differ
91 | inputs: torch.FloatTensor,
92 | batch_lengths: List[int],
93 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
94 | """
95 | Parameters
96 | ----------
97 | inputs : ``torch.FloatTensor``, required.
98 | A tensor of shape (batch_size, num_timesteps, input_size)
99 | to apply the LSTM over.
100 | batch_lengths : ``List[int]``, required.
101 | A list of length batch_size containing the lengths of the sequences in batch.
102 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
103 | A tuple (state, memory) representing the initial hidden state and memory
104 | of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
105 | ``memory`` has shape (1, batch_size, cell_size).
106 | Returns
107 | -------
108 | output_accumulator : ``torch.FloatTensor``
109 | The outputs of the LSTM for each timestep. A tensor of shape
110 | (batch_size, max_timesteps, hidden_size) where for a given batch
111 | element, all outputs past the sequence length for that batch are
112 | zero tensors.
113 |         final_state : ``Tuple[torch.FloatTensor, torch.FloatTensor]``
114 |             A tuple (state, memory) representing the final hidden state and memory
115 | of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
116 | ``memory`` has shape (1, batch_size, cell_size).
117 | """
118 | batch_size = inputs.size()[0]
119 | total_timesteps = inputs.size()[1]
120 |
121 | # We have to use this '.data.new().fill_' pattern to create tensors with the correct
122 | # type - forward has no knowledge of whether these are torch.Tensors or torch.cuda.Tensors.
123 | output_accumulator = Variable(inputs.data.new(batch_size,
124 | total_timesteps,
125 | self.hidden_size).fill_(0))
126 | if initial_state is None:
127 | full_batch_previous_memory = Variable(inputs.data.new(batch_size,
128 | self.cell_size).fill_(0))
129 | full_batch_previous_state = Variable(inputs.data.new(batch_size,
130 | self.hidden_size).fill_(0))
131 | else:
132 | full_batch_previous_state = initial_state[0].squeeze(0)
133 | full_batch_previous_memory = initial_state[1].squeeze(0)
134 |
135 | current_length_index = batch_size - 1 if self.go_forward else 0
136 | if self.recurrent_dropout_probability > 0.0 and self.training:
137 | dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
138 | full_batch_previous_state)
139 | else:
140 | dropout_mask = None
141 |
142 | for timestep in range(total_timesteps):
143 | # The index depends on which end we start.
144 | index = timestep if self.go_forward else total_timesteps - timestep - 1
145 |
146 | # What we are doing here is finding the index into the batch dimension
147 | # which we need to use for this timestep, because the sequences have
148 | # variable length, so once the index is greater than the length of this
149 | # particular batch sequence, we no longer need to do the computation for
150 | # this sequence. The key thing to recognise here is that the batch inputs
151 | # must be _ordered_ by length from longest (first in batch) to shortest
152 | # (last) so initially, we are going forwards with every sequence and as we
153 | # pass the index at which the shortest elements of the batch finish,
154 | # we stop picking them up for the computation.
155 | if self.go_forward:
156 | while batch_lengths[current_length_index] <= index:
157 | current_length_index -= 1
158 | # If we're going backwards, we are _picking up_ more indices.
159 | else:
160 | # First conditional: Are we already at the maximum number of elements in the batch?
161 | # Second conditional: Does the next shortest sequence beyond the current batch
162 |                 # index require computation at this timestep?
163 | while current_length_index < (len(batch_lengths) - 1) and \
164 | batch_lengths[current_length_index + 1] > index:
165 | current_length_index += 1
166 |
167 | # Actually get the slices of the batch which we
168 | # need for the computation at this timestep.
169 | # shape (batch_size, cell_size)
170 | previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone()
171 | # Shape (batch_size, hidden_size)
172 | previous_state = full_batch_previous_state[0: current_length_index + 1].clone()
173 | # Shape (batch_size, input_size)
174 | timestep_input = inputs[0: current_length_index + 1, index]
175 |
176 | # Do the projections for all the gates all at once.
177 | # Both have shape (batch_size, 4 * cell_size)
178 | projected_input = self.input_linearity(timestep_input)
179 | projected_state = self.state_linearity(previous_state)
180 |
181 | # Main LSTM equations using relevant chunks of the big linear
182 | # projections of the hidden state and inputs.
183 | input_gate = torch.sigmoid(projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] +
184 | projected_state[:, (0 * self.cell_size):(1 * self.cell_size)])
185 | forget_gate = torch.sigmoid(projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] +
186 | projected_state[:, (1 * self.cell_size):(2 * self.cell_size)])
187 | memory_init = torch.tanh(projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] +
188 | projected_state[:, (2 * self.cell_size):(3 * self.cell_size)])
189 | output_gate = torch.sigmoid(projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] +
190 | projected_state[:, (3 * self.cell_size):(4 * self.cell_size)])
191 | memory = input_gate * memory_init + forget_gate * previous_memory
192 |
193 | # Here is the non-standard part of this LSTM cell; first, we clip the
194 | # memory cell, then we project the output of the timestep to a smaller size
195 | # and again clip it.
196 |
197 | if self.memory_cell_clip_value:
198 | # pylint: disable=invalid-unary-operand-type
199 | memory = torch.clamp(memory, -self.memory_cell_clip_value, self.memory_cell_clip_value)
200 |
201 | # shape (current_length_index, cell_size)
202 | pre_projection_timestep_output = output_gate * torch.tanh(memory)
203 |
204 | # shape (current_length_index, hidden_size)
205 | timestep_output = self.state_projection(pre_projection_timestep_output)
206 | if self.state_projection_clip_value:
207 | # pylint: disable=invalid-unary-operand-type
208 | timestep_output = torch.clamp(timestep_output,
209 | -self.state_projection_clip_value,
210 | self.state_projection_clip_value)
211 |
212 | # Only do dropout if the dropout prob is > 0.0 and we are in training mode.
213 | if dropout_mask is not None:
214 | timestep_output = timestep_output * dropout_mask[0: current_length_index + 1]
215 |
216 | # We've been doing computation with less than the full batch, so here we create a new
217 |             # variable for the whole batch at this timestep and insert the result for the
218 | # relevant elements of the batch into it.
219 | full_batch_previous_memory = Variable(full_batch_previous_memory.data.clone())
220 | full_batch_previous_state = Variable(full_batch_previous_state.data.clone())
221 | full_batch_previous_memory[0:current_length_index + 1] = memory
222 | full_batch_previous_state[0:current_length_index + 1] = timestep_output
223 | output_accumulator[0:current_length_index + 1, index] = timestep_output
224 |
225 | # Mimic the pytorch API by returning state in the following shape:
226 | # (num_layers * num_directions, batch_size, ...). As this
227 | # LSTM cell cannot be stacked, the first dimension here is just 1.
228 | final_state = (full_batch_previous_state.unsqueeze(0),
229 | full_batch_previous_memory.unsqueeze(0))
230 |
231 | return output_accumulator, final_state
232 |
--------------------------------------------------------------------------------
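
A minimal sketch of driving `LstmCellWithProjection` directly; as the inline comments above stress, the batch must already be sorted by decreasing length:

    import torch

    cell = LstmCellWithProjection(input_size=10, hidden_size=8, cell_size=12)
    inputs = torch.randn(3, 5, 10)             # (batch, max_timesteps, input_size)
    batch_lengths = [5, 4, 2]                  # longest sequence first
    outputs, (state, memory) = cell(inputs, batch_lengths)
    assert outputs.shape == (3, 5, 8)          # timesteps past each length stay zero
    assert state.shape == (1, 3, 8) and memory.shape == (1, 3, 12)
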
/elmoformanylangs/modules/token_embedder.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import unicode_literals
3 | import logging
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import copy
9 | from .highway import Highway
10 |
11 |
12 | class LstmTokenEmbedder(nn.Module):
13 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False):
14 | super(LstmTokenEmbedder, self).__init__()
15 | self.config = config
16 | self.use_cuda = use_cuda
17 | self.word_emb_layer = word_emb_layer
18 | self.char_emb_layer = char_emb_layer
19 | self.output_dim = config['encoder']['projection_dim']
20 | emb_dim = 0
21 | if word_emb_layer is not None:
22 | emb_dim += word_emb_layer.n_d
23 |
24 | if char_emb_layer is not None:
25 | emb_dim += char_emb_layer.n_d * 2
26 | self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True,
27 | batch_first=True, dropout=config['dropout'])
28 |
29 | self.projection = nn.Linear(emb_dim, self.output_dim, bias=True)
30 |
31 | def forward(self, word_inp, chars_inp, shape):
32 | embs = []
33 | batch_size, seq_len = shape
34 | if self.word_emb_layer is not None:
35 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp))
36 | embs.append(word_emb)
37 |
38 | if self.char_emb_layer is not None:
39 | chars_inp = chars_inp.view(batch_size * seq_len, -1)
40 | chars_emb = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp))
41 | _, (chars_outputs, __) = self.char_lstm(chars_emb)
42 |             chars_outputs = chars_outputs.permute(1, 0, 2).contiguous().view(batch_size, seq_len, -1)
43 |             embs.append(chars_outputs)  # (batch_size, seq_len, 2 * char_dim), so it can be concatenated along dim 2
44 |
45 | token_embedding = torch.cat(embs, dim=2)
46 |
47 | return self.projection(token_embedding)
48 |
49 |
50 | class ConvTokenEmbedder(nn.Module):
51 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda):
52 | super(ConvTokenEmbedder, self).__init__()
53 | self.config = config
54 | self.use_cuda = use_cuda
55 |
56 | self.word_emb_layer = word_emb_layer
57 | self.char_emb_layer = char_emb_layer
58 |
59 | self.output_dim = config['encoder']['projection_dim']
60 | self.emb_dim = 0
61 | if word_emb_layer is not None:
62 | self.emb_dim += word_emb_layer.n_d
63 |
64 | if char_emb_layer is not None:
65 | self.convolutions = []
66 | cnn_config = config['token_embedder']
67 | filters = cnn_config['filters']
68 | char_embed_dim = cnn_config['char_dim']
69 |
70 | for i, (width, num) in enumerate(filters):
71 | conv = torch.nn.Conv1d(
72 | in_channels=char_embed_dim,
73 | out_channels=num,
74 | kernel_size=width,
75 | bias=True
76 | )
77 | self.convolutions.append(conv)
78 |
79 | self.convolutions = nn.ModuleList(self.convolutions)
80 |
81 | self.n_filters = sum(f[1] for f in filters)
82 | self.n_highway = cnn_config['n_highway']
83 |
84 | self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu)
85 | self.emb_dim += self.n_filters
86 |
87 | self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True)
88 |
89 | def forward(self, word_inp, chars_inp, shape):
90 | embs = []
91 | batch_size, seq_len = shape
92 | if self.word_emb_layer is not None:
93 | batch_size, seq_len = word_inp.size(0), word_inp.size(1)
94 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp))
95 | embs.append(word_emb)
96 |
97 | if self.char_emb_layer is not None:
98 | chars_inp = chars_inp.view(batch_size * seq_len, -1)
99 |
100 | character_embedding = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp))
101 |
102 | character_embedding = torch.transpose(character_embedding, 1, 2)
103 |
104 | cnn_config = self.config['token_embedder']
105 | if cnn_config['activation'] == 'tanh':
106 |                 activation = torch.tanh  # F.tanh is deprecated in favour of torch.tanh
107 | elif cnn_config['activation'] == 'relu':
108 | activation = torch.nn.functional.relu
109 | else:
110 | raise Exception("Unknown activation")
111 |
112 | convs = []
113 | for i in range(len(self.convolutions)):
114 | convolved = self.convolutions[i](character_embedding)
115 | # (batch_size * sequence_length, n_filters for this width)
116 | convolved, _ = torch.max(convolved, dim=-1)
117 | convolved = activation(convolved)
118 | convs.append(convolved)
119 | char_emb = torch.cat(convs, dim=-1)
120 | char_emb = self.highways(char_emb)
121 |
122 | embs.append(char_emb.view(batch_size, -1, self.n_filters))
123 |
124 | token_embedding = torch.cat(embs, dim=2)
125 |
126 | return self.projection(token_embedding)
127 |
--------------------------------------------------------------------------------
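
`ConvTokenEmbedder` follows the usual ELMo character-CNN recipe: convolve over the character embeddings, max-pool over the character axis so each filter yields one value per token, then concatenate across filter widths. One filter width in isolation:

    import torch

    char_dim, n_filters, width = 16, 32, 3
    conv = torch.nn.Conv1d(in_channels=char_dim, out_channels=n_filters, kernel_size=width)

    # (batch_size * seq_len, char_dim, num_chars), matching the transpose in forward()
    chars = torch.randn(20, char_dim, 7)
    convolved = conv(chars)                    # (20, n_filters, 7 - width + 1)
    pooled, _ = torch.max(convolved, dim=-1)   # (20, n_filters)
    assert pooled.shape == (20, n_filters)
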
/elmoformanylangs/modules/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Assorted utilities for working with neural networks in AllenNLP.
3 | """
4 | from collections import defaultdict
5 | from typing import Dict, List, Optional, Any, Tuple, Callable
6 | import logging
7 | import itertools
8 | import math
9 | import torch
10 | from torch.autograd import Variable
11 |
12 | def get_lengths_from_binary_sequence_mask(mask: torch.Tensor):
13 | """
14 | Compute sequence lengths for each batch element in a tensor using a
15 | binary mask.
16 | Parameters
17 | ----------
18 | mask : torch.Tensor, required.
19 | A 2D binary mask of shape (batch_size, sequence_length) to
20 | calculate the per-batch sequence lengths from.
21 | Returns
22 | -------
23 | A torch.LongTensor of shape (batch_size,) representing the lengths
24 | of the sequences in the batch.
25 | """
26 | return mask.long().sum(-1)
27 |
28 |
29 | def sort_batch_by_length(tensor: torch.autograd.Variable,
30 | sequence_lengths: torch.autograd.Variable):
31 | """
32 | Sort a batch first tensor by some specified lengths.
33 | Parameters
34 | ----------
35 | tensor : Variable(torch.FloatTensor), required.
36 | A batch first Pytorch tensor.
37 | sequence_lengths : Variable(torch.LongTensor), required.
38 | A tensor representing the lengths of some dimension of the tensor which
39 | we want to sort by.
40 | Returns
41 | -------
42 | sorted_tensor : Variable(torch.FloatTensor)
43 | The original tensor sorted along the batch dimension with respect to sequence_lengths.
44 | sorted_sequence_lengths : Variable(torch.LongTensor)
45 | The original sequence_lengths sorted by decreasing size.
46 | restoration_indices : Variable(torch.LongTensor)
47 | Indices into the sorted_tensor such that
48 | ``sorted_tensor.index_select(0, restoration_indices) == original_tensor``
49 |     permutation_index : Variable(torch.LongTensor)
50 | The indices used to sort the tensor. This is useful if you want to sort many
51 | tensors using the same ordering.
52 | """
53 |
54 | if not isinstance(tensor, Variable) or not isinstance(sequence_lengths, Variable):
55 | raise Exception("Both the tensor and sequence lengths must be torch.autograd.Variables.")
56 |
57 | sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True)
58 | sorted_tensor = tensor.index_select(0, permutation_index)
59 |
60 | # This is ugly, but required - we are creating a new variable at runtime, so we
61 | # must ensure it has the correct CUDA vs non-CUDA type. We do this by cloning and
62 | # refilling one of the inputs to the function.
63 | index_range = sequence_lengths.data.clone().copy_(torch.arange(0, len(sequence_lengths)))
64 | # This is the equivalent of zipping with index, sorting by the original
65 | # sequence lengths and returning the now sorted indices.
66 | index_range = Variable(index_range.long())
67 | _, reverse_mapping = permutation_index.sort(0, descending=False)
68 | restoration_indices = index_range.index_select(0, reverse_mapping)
69 | return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index
70 |
71 |
72 | def get_final_encoder_states(encoder_outputs: torch.Tensor,
73 | mask: torch.Tensor,
74 | bidirectional: bool = False) -> torch.Tensor:
75 | """
76 | Given the output from a ``Seq2SeqEncoder``, with shape ``(batch_size, sequence_length,
77 | encoding_dim)``, this method returns the final hidden state for each element of the batch,
78 | giving a tensor of shape ``(batch_size, encoding_dim)``. This is not as simple as
79 | ``encoder_outputs[:, -1]``, because the sequences could have different lengths. We use the
80 | mask (which has shape ``(batch_size, sequence_length)``) to find the final state for each batch
81 | instance.
82 | Additionally, if ``bidirectional`` is ``True``, we will split the final dimension of the
83 | ``encoder_outputs`` into two and assume that the first half is for the forward direction of the
84 | encoder and the second half is for the backward direction. We will concatenate the last state
85 | for each encoder dimension, giving ``encoder_outputs[:, -1, :encoding_dim/2]`` concated with
86 | ``encoder_outputs[:, 0, encoding_dim/2:]``.
87 | """
88 | # These are the indices of the last words in the sequences (i.e. length sans padding - 1). We
89 | # are assuming sequences are right padded.
90 | # Shape: (batch_size,)
91 | last_word_indices = mask.sum(1).long() - 1
92 | batch_size, _, encoder_output_dim = encoder_outputs.size()
93 | expanded_indices = last_word_indices.view(-1, 1, 1).expand(batch_size, 1, encoder_output_dim)
94 | # Shape: (batch_size, 1, encoder_output_dim)
95 | final_encoder_output = encoder_outputs.gather(1, expanded_indices)
96 | final_encoder_output = final_encoder_output.squeeze(1) # (batch_size, encoder_output_dim)
97 | if bidirectional:
98 | final_forward_output = final_encoder_output[:, :(encoder_output_dim // 2)]
99 | final_backward_output = encoder_outputs[:, 0, (encoder_output_dim // 2):]
100 | final_encoder_output = torch.cat([final_forward_output, final_backward_output], dim=-1)
101 | return final_encoder_output
102 |
103 |
104 | def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.autograd.Variable):
105 | """
106 | Computes and returns an element-wise dropout mask for a given tensor, where
107 | each element in the mask is dropped out with probability dropout_probability.
108 | Note that the mask is NOT applied to the tensor - the tensor is passed to retain
109 | the correct CUDA tensor type for the mask.
110 | Parameters
111 | ----------
112 | dropout_probability : float, required.
113 | Probability of dropping a dimension of the input.
114 | tensor_for_masking : torch.Variable, required.
115 | Returns
116 | -------
117 | A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability).
118 | This scaling ensures expected values and variances of the output of applying this mask
119 | and the original tensor are the same.
120 | """
121 | binary_mask = tensor_for_masking.clone()
122 | binary_mask.data.copy_(torch.rand(tensor_for_masking.size()) > dropout_probability)
123 | # Scale mask by 1/keep_prob to preserve output statistics.
124 | dropout_mask = binary_mask.float().div(1.0 - dropout_probability)
125 | return dropout_mask
126 |
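# The 1 / (1 - dropout_probability) scaling above keeps the expected value of the
# masked tensor equal to that of the original. A quick check of that claim
# (illustrative numbers, not part of the module):
#
#     x = torch.ones(10000)
#     mask = get_dropout_mask(0.3, Variable(x))
#     # mask entries are either 0 or 1 / 0.7, so the mean stays close to 1
#     assert abs((x * mask).mean().item() - 1.0) < 0.05
#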
127 | def block_orthogonal(tensor: torch.Tensor,
128 | split_sizes: List[int],
129 | gain: float = 1.0) -> None:
130 | """
131 | An initializer which allows initializing model parameters in "blocks". This is helpful
132 | in the case of recurrent models which use multiple gates applied to linear projections,
133 | which can be computed efficiently if they are concatenated together. However, they are
134 | separate parameters which should be initialized independently.
135 | Parameters
136 | ----------
137 | tensor : ``torch.Tensor``, required.
138 | A tensor to initialize.
139 | split_sizes : List[int], required.
140 | A list of length ``tensor.ndim()`` specifying the size of the
141 | blocks along that particular dimension. E.g. ``[10, 20]`` would
142 | result in the tensor being split into chunks of size 10 along the
143 | first dimension and 20 along the second.
144 | gain : float, optional (default = 1.0)
145 | The gain (scaling) applied to the orthogonal initialization.
146 | """
147 |
148 |     if isinstance(tensor, Variable):
149 |         # In PyTorch >= 0.4, Variable and Tensor are merged, so this branch
150 |         # always runs; the isinstance check is kept only for compatibility
151 |         # with older call sites that pass Variables explicitly.
152 |         sizes = list(tensor.size())
153 |         if any([a % b != 0 for a, b in zip(sizes, split_sizes)]):
154 |             raise ValueError("tensor dimensions must be divisible by their respective "
155 |                              "split_sizes. Found size: {} and split_sizes: {}".format(sizes, split_sizes))
156 | indexes = [list(range(0, max_size, split))
157 | for max_size, split in zip(sizes, split_sizes)]
158 | # Iterate over all possible blocks within the tensor.
159 | for block_start_indices in itertools.product(*indexes):
160 | # A list of tuples containing the index to start at for this block
161 | # and the appropriate step size (i.e split_size[i] for dimension i).
162 | index_and_step_tuples = zip(block_start_indices, split_sizes)
163 | # This is a tuple of slices corresponding to:
164 | # tensor[index: index + step_size, ...]. This is
165 | # required because we could have an arbitrary number
166 | # of dimensions. The actual slices we need are the
167 | # start_index: start_index + step for each dimension in the tensor.
168 | block_slice = tuple([slice(start_index, start_index + step)
169 | for start_index, step in index_and_step_tuples])
170 | tensor[block_slice] = torch.nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain)
171 |
--------------------------------------------------------------------------------
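
The `restoration_indices` returned by `sort_batch_by_length` invert the sort, which is exactly what `encoder_base.py` relies on when unsorting the final RNN states. A quick round-trip check (the function insists on `Variable` inputs):

    import torch
    from torch.autograd import Variable

    tensor = Variable(torch.randn(4, 5, 3))
    lengths = Variable(torch.LongTensor([2, 5, 3, 4]))
    sorted_t, sorted_lens, restore, perm = sort_batch_by_length(tensor, lengths)
    assert torch.equal(sorted_t.index_select(0, restore), tensor)
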
/elmoformanylangs/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals
3 | import collections
4 | import itertools
5 |
6 |
7 | def flatten(lst):
8 | return list(itertools.chain.from_iterable(lst))
9 |
10 |
11 | def deep_iter(x):
12 | if isinstance(x, list) or isinstance(x, tuple):
13 | for u in x:
14 | for v in deep_iter(u):
15 | yield v
16 | else:
17 |         yield x
18 |
19 |
20 | def dict2namedtuple(dic):
21 | return collections.namedtuple('Namespace', dic.keys())(**dic)
--------------------------------------------------------------------------------
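
A few one-line checks for the helpers above (`deep_iter` yields each leaf element of an arbitrarily nested list):

    ns = dict2namedtuple({'lr': 0.01, 'epochs': 3})
    assert ns.lr == 0.01
    assert flatten([[1, 2], [3]]) == [1, 2, 3]
    assert list(deep_iter([1, [2, [3]]])) == [1, 2, 3]
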
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import logging
5 | import json
6 | import numpy as np
7 | import random
8 | import torch
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | import torch.optim as optim
12 | from torch.utils.tensorboard import SummaryWriter
13 | from scipy.stats import pearsonr
14 | from sklearn.metrics import f1_score, accuracy_score
15 |
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument('--config_path', type=str, required=True)
18 | args = parser.parse_args()
19 | with open(args.config_path, 'r') as f:
20 | args = json.load(f)
21 |
22 | os.makedirs(args['output_path'], exist_ok=True)
23 |
24 | logFormatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S')
25 | log = logging.getLogger()
26 |
27 | fileHandler = logging.FileHandler(os.path.join(args['output_path'], 'log.txt'))
28 | fileHandler.setFormatter(logFormatter)
29 | log.addHandler(fileHandler)
30 |
31 | consoleHandler = logging.StreamHandler(sys.stdout)
32 | consoleHandler.setFormatter(logFormatter)
33 | log.addHandler(consoleHandler)
34 | log.setLevel(logging.DEBUG)
35 |
36 | log.info('{}'.format(args))
37 |
38 | if not torch.cuda.is_available() and args['gpu']:
39 |     log.warning('Cannot use GPU; using CPU instead.')
40 | args['gpu'] = False
41 | is_gpu = args['gpu']
42 |
43 | with open(os.path.join(args['output_path'], 'config.json'), 'w') as f:
44 | json.dump(args, f)
45 |
46 | random.seed(args['seed'])
47 | np.random.seed(args['seed'])
48 | torch.manual_seed(args['seed'])
49 | if is_gpu:
50 | torch.backends.cudnn.deterministic = True
51 | torch.backends.cudnn.benchmark = False
52 |
53 | writer = SummaryWriter(log_dir=os.path.join(args['output_path'], 'runs'))
54 |
55 |
56 | class Data:
57 | def __init__(self, text, label, training):
58 | self.text = text
59 | self.label = label
60 | self.training = training
61 |
62 | def split(self, size):
63 | return (Data(self.text[:-size], self.label[:-size], training=self.training),
64 | Data(self.text[-size:], self.label[-size:], training=False))
65 |
66 | def __len__(self):
67 | return len(self.text)
68 |
69 |
70 | def load_data(input_path, max_length, training):
71 | text_file = os.path.join(input_path, 'train_text.npy' if training else 'test_text.npy')
72 | label_file = os.path.join(input_path, 'train_label.npy' if training else 'test_label.npy')
73 | log.info('load data from {}, {}, training: {}'.format(text_file, label_file, training))
74 | text = np.load(text_file, allow_pickle=True)
75 | label = np.load(label_file, allow_pickle=True)
76 | label = torch.tensor([np.array(label[i]) / np.sum(label[i]) for i in range(len(label))],
77 | dtype=torch.float)
78 | if max_length == -1:
79 | text = [torch.tensor(t, dtype=torch.float) for t in text]
80 | else:
81 | text_temp = []
82 | for t in text:
83 | b = torch.tensor(t, dtype=torch.float)
84 | zero = torch.zeros(max_length, b.size(1))
85 | length = min(max_length, b.size(0))
86 | zero[:length, :] = b[:length, :]
87 | text_temp.append(zero)
88 | text = text_temp
89 | log.info('loaded. total len: {}'.format(len(text)))
90 | return Data(text, label, training)
91 |
92 |
93 | class BatchGen:
94 | def __init__(self, data, batch_size):
95 | self.batch_size = batch_size
96 | self.data = data
97 |
98 | def __len__(self):
99 | return (len(self.data) + self.batch_size - 1) // self.batch_size
100 |
101 | def __iter__(self):
102 | size = len(self.data)
103 | ids = torch.randperm(size)
104 | if not self.data.training:
105 | ids = torch.arange(size)
106 |
107 | for i in range(len(self)):
108 | batch_idx = ids[self.batch_size * i: self.batch_size * (i + 1)]
109 | label = torch.index_select(self.data.label, 0, batch_idx)
110 | if is_gpu:
111 | label = label.cuda()
112 | text = [self.data.text[j].cuda() for j in batch_idx]
113 | else:
114 | text = [self.data.text[j] for j in batch_idx]
115 | yield (text, label)
116 |
117 |
118 | class Stat:
119 | def __init__(self, training):
120 | self.loss = []
121 | self.gold_labels = []
122 | self.norm_gold_labels = []
123 | self.pred_labels = []
124 | self.norm_pred_labels = []
125 | self.training = training
126 | self.save = {
127 | 'acc': [],
128 | 'f1': [],
129 | 'corr': [],
130 | }
131 |
132 | def add(self, pred, gold, loss):
133 | gold_labels = torch.argmax(gold, dim=1).cpu().numpy()
134 | norm_gold_labels = gold.cpu().numpy()
135 | pred_labels = torch.argmax(pred, dim=1).cpu().numpy()
136 | norm_pred_labels = pred.cpu().numpy()
137 | self.loss.append(loss)
138 | self.gold_labels.extend(gold_labels)
139 | self.norm_gold_labels.extend(norm_gold_labels)
140 | self.pred_labels.extend(pred_labels)
141 | self.norm_pred_labels.extend(norm_pred_labels)
142 |
143 | def eval(self):
144 | acc = accuracy_score(self.gold_labels, self.pred_labels) * 100
145 | f1 = f1_score(self.gold_labels, self.pred_labels, average='macro') * 100
146 | norm_gold = np.asarray(self.norm_gold_labels).transpose((1, 0))
147 | norm_pred = np.asarray(self.norm_pred_labels).transpose((1, 0))
148 | corr = sum([pearsonr(norm_gold[i], norm_pred[i])[0] for i in range(len(norm_gold))]) / len(norm_gold)
149 | return acc, f1, corr
150 |
151 | def log(self, global_step, epoch, batch):
152 | acc, f1, corr = self.eval()
153 | if self.training:
154 | loss = sum(self.loss) / len(self.loss)
155 | log.info('step: {}, epoch: {}, batch: {}, loss: {}, acc: {}, f1: {}, r: {}'.format(
156 | global_step, epoch, batch, loss, acc, f1, corr))
157 | writer.add_scalar('train_Loss', loss, global_step)
158 | writer.add_scalar('train_Accuracy', acc, global_step)
159 | writer.add_scalar('train_F1_macro', f1, global_step)
160 | writer.add_scalar('train_CORR', corr, global_step)
161 | else:
162 | log.info('step: {}, epoch: {}, acc: {}, f1: {}, r: {}'.format(
163 | global_step, epoch, acc, f1, corr))
164 | writer.add_scalar('dev_Accuracy', acc, global_step)
165 | writer.add_scalar('dev_F1_macro', f1, global_step)
166 | writer.add_scalar('dev_CORR', corr, global_step)
167 | self.save['acc'].append(acc)
168 | self.save['f1'].append(f1)
169 | self.save['corr'].append(corr)
170 | self.loss = []
171 | self.gold_labels = []
172 | self.norm_gold_labels = []
173 | self.pred_labels = []
174 | self.norm_pred_labels = []
175 |
176 |
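# The CORR metric computed in Stat.eval above is the Pearson correlation taken per
# label dimension and then averaged. A sketch of that computation in isolation
# (toy numbers with 2 labels instead of the dataset's 8):
#
#     norm_gold = np.asarray([[0.7, 0.3], [0.2, 0.8], [0.5, 0.5]]).transpose((1, 0))
#     norm_pred = np.asarray([[0.6, 0.4], [0.1, 0.9], [0.4, 0.6]]).transpose((1, 0))
#     corr = sum(pearsonr(norm_gold[i], norm_pred[i])[0]
#                for i in range(len(norm_gold))) / len(norm_gold)
#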
177 | class MLP(nn.Module):
178 | """
179 | b: batch_size, n: seq_len, d: embedding_size
180 | """
181 | def __init__(self, config):
182 | super().__init__()
183 | opt = config['mlp']
184 | self.max_length = opt['max_length']
185 | dropout = opt['dropout']
186 | u = opt['hidden_size']
187 | self.mlp = nn.Sequential(
188 | nn.Linear(self.max_length * config['embedding_size'], u),
189 | nn.ReLU(),
190 | nn.Dropout(dropout),
191 | nn.Linear(u, config['num_labels']),
192 | )
193 | self.loss_type = opt['loss']
194 | if self.loss_type == 'l1':
195 | self.loss = nn.L1Loss()
196 | elif self.loss_type == 'mse':
197 | self.loss = nn.MSELoss()
198 | elif self.loss_type == 'cross_entropy':
199 | self.loss = nn.CrossEntropyLoss()
200 | else:
201 |             log.fatal('Invalid loss type. Should be "l1", "mse" or "cross_entropy"')
202 |
203 | def forward(self, embedding, gold_labels=None):
204 | """
205 | :param embedding: [b, n, d]
206 | :param gold_labels: [b, num_labels]
207 | :return: If training, return (loss, predicted labels). Else return predicted labels
208 | """
209 | data = torch.stack(embedding)
210 | output = self.mlp(data.view(data.size(0), -1))
211 | labels = F.softmax(output, dim=1)
212 | if not self.training:
213 | return labels.detach()
214 | if self.loss_type == 'cross_entropy':
215 | loss = self.loss(output, torch.argmax(gold_labels, dim=1))
216 | else:
217 | loss = self.loss(labels, gold_labels)
218 | return loss, labels.detach()
219 |
220 |
221 | class CNN(nn.Module):
222 | def __init__(self, config):
223 | super().__init__()
224 | opt = config['cnn']
225 | self.cnn_1 = nn.Sequential(
226 | nn.Conv1d(config['embedding_size'], opt['conv_1']['size'], opt['conv_1']['kernel_size'],
227 | padding=opt['conv_1']['kernel_size'] // 2),
228 | # nn.BatchNorm1d(opt['conv_1']['size']),
229 | nn.ReLU(),
230 | nn.Dropout(opt['conv_1']['dropout']),
231 | nn.MaxPool1d(opt['max_pool_1']['kernel_size'], opt['max_pool_1']['stride']),
232 | )
233 | """
234 | self.cnn_2 = nn.Sequential(
235 | nn.Conv1d(opt['conv_1']['size'], opt['conv_2']['size'], opt['conv_2']['kernel_size'],
236 | padding=opt['conv_2']['kernel_size'] // 2),
237 | nn.ReLU(),
238 | nn.Dropout(opt['conv_2']['dropout']),
239 | nn.MaxPool1d(opt['max_pool_2']['kernel_size'], opt['max_pool_2']['stride']),
240 | )
241 | """
242 | mlp_u = opt['fc']['hidden_size']
243 | self.mlp = nn.Sequential(
244 | nn.Linear(opt['conv_1']['size'] * opt['max_length'] // 2, mlp_u),
245 | nn.ReLU(),
246 | nn.Dropout(opt['fc']['dropout']),
247 | nn.Linear(mlp_u, config['num_labels']),
248 | )
249 | self.loss_type = opt['loss']
250 | if self.loss_type == 'l1':
251 | self.loss = nn.L1Loss()
252 | elif self.loss_type == 'mse':
253 | self.loss = nn.MSELoss()
254 | elif self.loss_type == 'cross_entropy':
255 | self.loss = nn.CrossEntropyLoss()
256 | else:
257 |             log.fatal('Invalid loss type. Should be "l1", "mse" or "cross_entropy"')
258 |
259 | def forward(self, embedding, gold_labels=None):
260 | """
261 | :param embedding: [b, n, d]
262 | :param gold_labels: [b, num_labels]
263 | :return: If training, return (loss, predicted labels). Else return predicted labels
264 | """
265 | data = torch.stack(embedding).transpose(1, 2) # [b, d, n]
266 | out_1 = self.cnn_1(data)
267 | # out_2 = self.cnn_2(out_1)
268 | # output = self.mlp(out_2.view(out_2.size(0), -1))
269 | output = self.mlp(out_1.view(out_1.size(0), -1))
270 | labels = F.softmax(output, dim=1)
271 | if not self.training:
272 | return labels.detach()
273 | if self.loss_type == 'cross_entropy':
274 | loss = self.loss(output, torch.argmax(gold_labels, dim=1))
275 | else:
276 | loss = self.loss(labels, gold_labels)
277 | return loss, labels.detach()
278 |
279 |
280 | class RNN(nn.Module):
281 | """
282 | b: batch_size, n: seq_len, u: rnn_hidden_size, da: param_da, r: param_r, d: embedding_size
283 | """
284 |
285 | def __init__(self, config):
286 | super().__init__()
287 | opt = config['rnn']
288 | u = opt['rnn_hidden_size']
289 | da = opt['param_da']
290 | r = opt['param_r']
291 | d = config['embedding_size']
292 | num_layers = opt['num_layers']
293 | bidirectional = opt['bidirectional']
294 | if opt['type'] == 'lstm':
295 | self.rnn = nn.LSTM(input_size=d, hidden_size=u, num_layers=num_layers,
296 | bidirectional=bidirectional, batch_first=True)
297 | elif opt['type'] == 'gru':
298 | self.rnn = nn.GRU(input_size=d, hidden_size=u, num_layers=num_layers,
299 | bidirectional=bidirectional, batch_first=True)
300 | else:
301 | log.fatal('Invalid rnn type. Should be "lstm" or "gru"')
302 | if bidirectional:
303 | u = u * 2
304 | mlp_u = opt['mlp_hidden_size']
305 | self.mlp = nn.Sequential(
306 | nn.Linear(r * u, mlp_u),
307 | nn.ReLU(),
308 | nn.Dropout(opt['dropout']),
309 | nn.Linear(mlp_u, config['num_labels']),
310 | )
311 | self.Ws1 = nn.Parameter(torch.randn(da, u))
312 | self.Ws2 = nn.Parameter(torch.randn(r, da))
313 | self.p_c = opt['p_coefficient']
314 | self.loss_type = opt['loss']
315 | if self.loss_type == 'l1':
316 | self.loss = nn.L1Loss()
317 | elif self.loss_type == 'mse':
318 | self.loss = nn.MSELoss()
319 | elif self.loss_type == 'cross_entropy':
320 | self.loss = nn.CrossEntropyLoss()
321 | else:
322 |             log.fatal('Invalid loss type. Should be "l1", "mse" or "cross_entropy"')
323 |
324 | def forward(self, embedding, gold_labels=None):
325 | """
326 | :param embedding: [b, n, d]
327 | :param gold_labels: [b, num_labels]
328 | :return: If training, return (loss, predicted labels). Else return predicted labels
329 | """
330 | padded = nn.utils.rnn.pad_sequence(embedding, batch_first=True) # [b, n, d]
331 | H = self.rnn(padded)[0] # [b, n, u]
332 | A = F.softmax(torch.matmul(self.Ws2, torch.tanh(torch.matmul(self.Ws1, H.transpose(1, 2)))), dim=2) # [b, r, n]
333 | M = torch.matmul(A, H) # [b, r, u]
334 | output = self.mlp(M.view(M.size(0), -1))
335 | labels = F.softmax(output, dim=1)
336 | if not self.training:
337 | return labels.detach()
338 | I = torch.eye(A.size(1))
339 | if is_gpu:
340 | I = I.cuda()
341 | tmp = torch.matmul(A, A.transpose(1, 2)) - I
342 | P = (tmp * tmp).sum() / A.size(0)
343 |         loss = self.p_c * P  # attention penalization term; the classification loss is added on top
344 |         if self.loss_type == 'cross_entropy':
345 |             loss = loss + self.loss(output, torch.argmax(gold_labels, dim=1))
346 |         else:
347 |             loss = loss + self.loss(labels, gold_labels)
348 | return loss, labels.detach()
349 |
350 |
351 | def main():
352 | log.info('Loading Train Data')
353 | batch_size = args['batch_size']
354 | train_data = load_data(args['input_path'],
355 | -1 if args['type'] == 'rnn' else args[args['type']]['max_length'],
356 | True)
357 | train_data, dev_data = train_data.split(len(train_data) // 10)
358 | log.info('Train: length: {}, total batch: {}, batch size: {}'.format(
359 | len(train_data), (len(train_data) + batch_size - 1) // batch_size, batch_size))
360 | log.info('Dev: length: {}, total batch: {}, batch size: {}'.format(
361 | len(dev_data), (len(dev_data) + batch_size - 1) // batch_size, batch_size))
362 |
363 | log.info('Loading model {}'.format(args['type']))
364 | model = None
365 | if args['type'] == 'rnn':
366 | model = RNN(args)
367 | elif args['type'] == 'cnn':
368 | model = CNN(args)
369 | elif args['type'] == 'mlp':
370 | model = MLP(args)
371 | else:
372 | log.fatal('Invalid type. Should be "rnn", "cnn" or "mlp"')
373 |
374 | if is_gpu:
375 | model.cuda()
376 |
377 | optimizer = None
378 | if args['optimizer'] == 'adagrad':
379 | optimizer = optim.Adagrad(model.parameters(), lr=args['lr'],
380 | lr_decay=args['lr_decay'], weight_decay=args['weight_decay'])
381 | elif args['optimizer'] == 'sgd':
382 | optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum'],
383 | weight_decay=args['weight_decay'])
384 | else:
385 | log.fatal('Invalid optimizer type. Should be "adagrad" or "sgd"')
386 |
387 | train_stat = Stat(True)
388 | eval_stat = Stat(False)
389 | best_epoch = -1
390 | best_state_dict = None
391 |
392 | for epoch in range(args['num_epochs']):
393 | log.info('*** epoch: {} ***'.format(epoch + 1))
394 |
395 | log.info('*** training ***')
396 | model.train()
397 | gen = BatchGen(train_data, batch_size)
398 | cnt = 0
399 | for batch, data in enumerate(gen):
400 | optimizer.zero_grad()
401 | loss, pred_labels = model(data[0], data[1])
402 | loss.backward()
403 | optimizer.step()
404 | cnt += 1
405 | if cnt == args['display_per_batch']:
406 | cnt = 0
407 | train_stat.add(pred_labels, data[1], loss.item())
408 | train_stat.log(epoch * len(gen) + batch + 1, epoch, batch)
409 |
410 | log.info('*** evaluating ***')
411 | model.eval()
412 | gen = BatchGen(dev_data, batch_size)
413 | for batch, data in enumerate(gen):
414 | with torch.no_grad():
415 | pred_labels = model(data[0])
416 | eval_stat.add(pred_labels, data[1], None)
417 | eval_stat.log(epoch + 1, epoch, None)
418 | if best_epoch == -1 or eval_stat.save['acc'][-1] > eval_stat.save['acc'][best_epoch]:
419 | best_epoch = epoch
420 | best_state_dict = model.state_dict()
421 |
422 | log.info('\n*** Best acc model ***\nepoch: {}\nacc: {}\nf1: {}\ncorr: {}'.format(
423 | best_epoch + 1, eval_stat.save['acc'][best_epoch], eval_stat.save['f1'][best_epoch],
424 | eval_stat.save['corr'][best_epoch]))
425 | writer.close()
426 |
427 | log.info('Loading Test Data')
428 | test_data = load_data(args['input_path'],
429 | -1 if args['type'] == 'rnn' else args[args['type']]['max_length'],
430 | False)
431 | log.info('Test: length: {}, total batch: {}, batch size: {}'.format(
432 | len(test_data), (len(test_data) + batch_size - 1) // batch_size, batch_size))
433 |
434 | model.load_state_dict(best_state_dict)
435 | model.eval()
436 |     gen = BatchGen(test_data, batch_size)  # evaluate the best model on the test set
437 | for batch, data in enumerate(gen):
438 | with torch.no_grad():
439 | pred_labels = model(data[0])
440 | eval_stat.add(pred_labels, data[1], None)
441 | acc, f1, corr = eval_stat.eval()
442 | log.info('\n*** Test Result ***\nacc: {}\nf1: {}\ncorr: {}'.format(acc, f1, corr))
443 |
444 |
445 | if __name__ == '__main__':
446 | main()
447 |
--------------------------------------------------------------------------------
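
The `P` term in `RNN.forward` is the attention penalization from the structured self-attentive sentence embedding approach (Lin et al., 2017): it pushes the `r` attention distributions toward attending to different positions by penalising the squared Frobenius norm of `AA^T - I`. The computation in isolation:

    import torch
    import torch.nn.functional as F

    b, r, n = 2, 4, 9                          # batch, attention hops, seq_len
    A = F.softmax(torch.randn(b, r, n), dim=2) # each row is a distribution over positions
    tmp = torch.matmul(A, A.transpose(1, 2)) - torch.eye(r)
    P = (tmp * tmp).sum() / b                  # ||AA^T - I||_F^2, averaged over the batch
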
/preprocess.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from elmoformanylangs import Embedder
4 | import argparse
5 | import logging
6 | import numpy as np
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--type', type=str, required=True)
10 | parser.add_argument('--elmo_model_path', type=str)
11 | parser.add_argument('--vector_path', type=str)
12 | parser.add_argument('--train_file', type=str, required=True)
13 | parser.add_argument('--test_file', type=str, required=True)
14 | parser.add_argument('--output_path', type=str, required=True)
15 | args = parser.parse_args()
16 |
17 | os.makedirs(args.output_path, exist_ok=True)
18 |
19 | logFormatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S')
20 | log = logging.getLogger()
21 |
22 | fileHandler = logging.FileHandler(os.path.join(args.output_path, 'log.txt'))
23 | fileHandler.setFormatter(logFormatter)
24 | log.addHandler(fileHandler)
25 |
26 | consoleHandler = logging.StreamHandler(sys.stdout)
27 | consoleHandler.setFormatter(logFormatter)
28 | log.addHandler(consoleHandler)
29 | log.setLevel(logging.DEBUG)
30 |
31 | log.info('=====Pre-processing=====')
32 |
33 | log.info('{}'.format(args))
34 |
35 | if args.type == 'elmo':
36 | e = Embedder(args.elmo_model_path, batch_size=2)
37 |
38 |
39 | def work(input_path, output_text_file, output_label_file):
40 | log.info('Loading data')
41 |
42 | label_list = []
43 | text_list = []
44 |
45 | with open(input_path, 'r') as f:
46 | for line in f.readlines():
47 | data = line.strip().split('\t')
48 | data[1] = data[1].strip().split()
49 | label = [0 for i in range(8)]
50 | for i in range(0, 8):
51 | label[i] = int(data[1][1 + i].split(':')[1])
52 | label_list.append(label)
53 | text_list.append(data[2].strip().split())
54 |
55 | log.info('size: {}'.format(len(text_list)))
56 |
57 | seq_len = [len(x) for x in text_list]
58 | log.info('max seq len: {}'.format(max(seq_len)))
59 |     log.info('avg seq len: {:.3f}'.format(sum(seq_len) / len(seq_len)))
60 |
61 | if args.type == 'elmo':
62 | log.info('Loading elmo model')
63 | log.info(' Loaded')
64 | log.info('Processing')
65 | text_embed_list = e.sents2elmo(text_list)
66 | log.info(' Done')
67 | elif args.type == 'word2vec':
68 | log.info('Loading word2vec model')
69 |
70 | # https://github.com/Embedding/Chinese-Word-Vectors/blob/master/evaluation/ana_eval_dense.py
71 |     def read_vectors(path, topn):  # read the top-n word vectors; topn=0 reads all
72 | lines_num, dim = 0, 0
73 | vectors = {}
74 | with open(path, encoding='utf-8', errors='ignore') as f:
75 | first_line = True
76 | for l in f:
77 | if first_line:
78 | first_line = False
79 | dim = int(l.rstrip().split()[1])
80 | continue
81 | lines_num += 1
82 | tokens = l.rstrip().split(' ')
83 | vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
84 | if topn != 0 and lines_num >= topn:
85 | break
86 | return vectors, dim
87 |
88 | vct, dim = read_vectors(args.vector_path, 0)
89 | # https://github.com/Embedding/Chinese-Word-Vectors/issues/23
90 | avg = np.zeros(dim)
91 | '''
92 | for v in vct.values():
93 | avg += v
94 | avg /= len(vct)
95 | '''
96 | log.info(' Loaded')
97 | log.info('Processing, dim: {}'.format(dim))
98 | text_embed_list = []
99 | for sen in text_list:
100 | sen_embed = []
101 | for w in sen:
102 | if w in vct:
103 | w_embed = vct[w]
104 | else:
105 | w_embed = avg
106 | sen_embed.append(w_embed)
107 | text_embed_list.append(sen_embed)
108 | log.info(' Done')
109 | else:
110 | log.fatal('Invalid type. Should be "elmo" or "word2vec"')
111 |
112 | log.info('sample: \n{}'.format(text_embed_list[0][0]))
113 | np.save(output_text_file, text_embed_list)
114 | np.save(output_label_file, label_list)
115 |
116 |
117 | work(args.train_file,
118 | os.path.join(args.output_path, 'train_text.npy'),
119 | os.path.join(args.output_path, 'train_label.npy'))
120 |
121 | work(args.test_file,
122 | os.path.join(args.output_path, 'test_text.npy'),
123 | os.path.join(args.output_path, 'test_label.npy'))
124 |
--------------------------------------------------------------------------------
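
Each line of the SinaNews files is tab-separated: a leading field the script ignores, a vote field holding a total followed by eight `emotion:count` pairs, and the segmented text. A sketch of the parsing `work()` performs, on a made-up line (the `e1`..`e8` names are placeholders, not the real emotion labels):

    line = 'id_001\tTotal:10 e1:3 e2:0 e3:5 e4:0 e5:1 e6:0 e7:1 e8:0\tw1 w2 w3'
    data = line.strip().split('\t')
    votes = data[1].strip().split()
    label = [int(votes[1 + i].split(':')[1]) for i in range(8)]
    text = data[2].strip().split()
    assert label == [3, 0, 5, 0, 1, 0, 1, 0] and len(text) == 3
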
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.7.1
2 | alabaster==0.7.12
3 | allennlp==0.8.3
4 | asn1crypto==0.24.0
5 | astor==0.8.0
6 | atomicwrites==1.3.0
7 | attrs==19.1.0
8 | aws-sam-translator==1.11.0
9 | aws-xray-sdk==2.4.2
10 | awscli==1.16.163
11 | Babel==2.6.0
12 | bert-serving-client==1.9.1
13 | bert-serving-server==1.9.1
14 | bleach==1.5.0
15 | blis==0.2.4
16 | boto==2.49.0
17 | boto3==1.9.153
18 | botocore==1.12.153
19 | certifi==2019.3.9
20 | cffi==1.12.3
21 | cfn-lint==0.21.0
22 | chardet==3.0.4
23 | Click==7.0
24 | colorama==0.3.9
25 | conllu==0.11
26 | cookies==2.2.1
27 | cryptography==2.6.1
28 | cycler==0.10.0
29 | cymem==2.0.2
30 | cytoolz==0.9.0.1
31 | dill==0.2.9
32 | docker==4.0.1
33 | docker-pycreds==0.4.0
34 | docutils==0.14
35 | ecdsa==0.13.3
36 | editdistance==0.5.3
37 | en-core-web-sm==2.0.0
38 | enum34==1.1.6
39 | flaky==3.5.3
40 | Flask==1.0.3
41 | Flask-Cors==3.0.7
42 | ftfy==5.5.1
43 | future==0.17.1
44 | gast==0.2.2
45 | gevent==1.4.0
46 | GPUtil==1.4.0
47 | greenlet==0.4.15
48 | grpcio==1.20.1
49 | h5py==2.9.0
50 | html5lib==0.9999999
51 | idna==2.7
52 | imagesize==1.1.0
53 | itsdangerous==1.1.0
54 | Jinja2==2.10.1
55 | jmespath==0.9.4
56 | joblib==0.13.2
57 | jsondiff==1.1.2
58 | jsonnet==0.12.1
59 | jsonpatch==1.23
60 | jsonpickle==1.1
61 | jsonpointer==2.0
62 | jsonschema==2.6.0
63 | Keras-Applications==1.0.7
64 | Keras-Preprocessing==1.0.9
65 | kiwisolver==1.1.0
66 | Markdown==3.1.1
67 | MarkupSafe==1.1.1
68 | matplotlib==3.1.0
69 | mock==3.0.5
70 | more-itertools==7.0.0
71 | moto==1.3.8
72 | msgpack==0.5.6
73 | msgpack-numpy==0.4.4.3
74 | msgpack-python==0.5.6
75 | murmurhash==1.0.2
76 | nltk==3.4.5
77 | numpy==1.16.3
78 | numpydoc==0.9.1
79 | overrides==1.9
80 | packaging==19.0
81 | pandas==0.24.2
82 | parsimonious==0.8.1
83 | pbkdf2==1.3
84 | pbr==5.2.0
85 | Pillow==6.2.0
86 | plac==0.9.6
87 | pluggy==0.11.0
88 | pprint==0.1
89 | preshed==2.0.1
90 | protobuf==3.7.1
91 | py==1.8.0
92 | pyaml==19.4.1
93 | pyasn1==0.4.5
94 | pycparser==2.19
95 | pycryptodome==3.8.1
96 | Pygments==2.4.0
97 | pyparsing==2.4.0
98 | pytest==4.5.0
99 | python-dateutil==2.8.0
100 | python-jose==3.0.1
101 | pytorch-pretrained-bert==0.6.2
102 | pytz==2019.1
103 | PyYAML==5.1
104 | pyzmq==18.0.1
105 | regex==2019.4.14
106 | requests==2.22.0
107 | responses==0.10.6
108 | rsa==3.4.2
109 | s3transfer==0.2.0
110 | scikit-learn==0.21.1
111 | scipy==1.3.0
112 | singledispatch==3.4.0.3
113 | six==1.12.0
114 | snowballstemmer==1.2.1
115 | spacy==2.1.4
116 | Sphinx==2.0.1
117 | sphinxcontrib-applehelp==1.0.1
118 | sphinxcontrib-devhelp==1.0.1
119 | sphinxcontrib-htmlhelp==1.0.2
120 | sphinxcontrib-jsmath==1.0.1
121 | sphinxcontrib-qthelp==1.0.2
122 | sphinxcontrib-serializinghtml==1.1.3
123 | sphinxcontrib-websupport==1.1.2
124 | sqlparse==0.3.0
125 | srsly==0.0.5
126 | tb-nightly==1.14.0a20190523
127 | tensorboard==1.13.1
128 | tensorboardX==1.7
129 | tensorflow-estimator==1.13.0
130 | tensorflow-gpu==1.15.0
131 | tensorflow-tensorboard==1.5.1
132 | termcolor==1.1.0
133 | thinc==7.0.4
134 | toolz==0.9.0
135 | torch==1.1.0
136 | tqdm==4.32.1
137 | ujson==1.35
138 | Unidecode==1.0.23
139 | urllib3==1.25.2
140 | wasabi==0.2.2
141 | wcwidth==0.1.7
142 | webencodings==0.5.1
143 | websocket-client==0.56.0
144 | Werkzeug==0.15.4
145 | word2number==1.1
146 | wrapt==1.11.1
147 | xmltodict==0.12.0
148 |
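
The exact `==` pins above suggest a `pip freeze` of the original training environment; assuming a compatible interpreter (the pinned tensorflow-gpu 1.15.0 targets Python 3.6 or 3.7), it can typically be recreated with `pip install -r requirements.txt` inside a fresh virtual environment.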
--------------------------------------------------------------------------------
/run_cnn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Train the CNN model defined by config_cnn.json; the first argument selects the GPU, e.g. ./run_cnn.sh 0
3 |
4 | python='python3'
5 |
6 | CUDA_VISIBLE_DEVICES=$1 ${python} main.py --config_path config_cnn.json
7 |
--------------------------------------------------------------------------------
/run_mlp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Train the MLP model defined by config_mlp.json; the first argument selects the GPU, e.g. ./run_mlp.sh 0
3 |
4 | python='python3'
5 |
6 | CUDA_VISIBLE_DEVICES=$1 ${python} main.py --config_path config_mlp.json
7 |
--------------------------------------------------------------------------------
/run_preprocess_elmo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build ELMo features for the train/test splits; the first argument selects the GPU.
3 | # Expects the pre-trained ELMo model at data/zhs.model; output arrays go to data/elmo_temp.
4 |
5 | python='python3'
6 |
7 | CUDA_VISIBLE_DEVICES=$1 ${python} preprocess.py \
8 |     --type elmo \
9 |     --elmo_model_path data/zhs.model \
10 |     --train_file data/sinanews.train \
11 |     --test_file data/sinanews.test \
12 |     --output_path data/elmo_temp
13 |
--------------------------------------------------------------------------------
/run_preprocess_word2vec.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build word2vec features for the train/test splits; the first argument selects the GPU.
3 | # Expects pre-trained vectors at data/word2vec/sgns.sogounews.bigram-char; output arrays go to data/word2vec_temp.
4 |
5 | python='python3'
6 |
7 | CUDA_VISIBLE_DEVICES=$1 ${python} preprocess.py \
8 |     --type word2vec \
9 |     --vector_path data/word2vec/sgns.sogounews.bigram-char \
10 |     --train_file data/sinanews.train \
11 |     --test_file data/sinanews.test \
12 |     --output_path data/word2vec_temp
13 |
--------------------------------------------------------------------------------
/run_rnn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Train the RNN model (LSTM/GRU variants, per the config) defined by config_rnn.json;
3 | # the first argument selects the GPU, e.g. ./run_rnn.sh 0
4 |
5 | python='python3'
6 |
7 | CUDA_VISIBLE_DEVICES=$1 ${python} main.py --config_path config_rnn.json
8 |
--------------------------------------------------------------------------------
/save/bi-gru_1/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-gru_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "gru", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/bi-gru_1/runs/events.out.tfevents.1559405473.gpu-theta.8556.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-gru_1/runs/events.out.tfevents.1559405473.gpu-theta.8556.0
--------------------------------------------------------------------------------
/save/bi-lstm_1/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/bi-lstm_1/runs/events.out.tfevents.1559400227.gpu-theta.32041.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_1/runs/events.out.tfevents.1559400227.gpu-theta.32041.0
--------------------------------------------------------------------------------
/save/bi-lstm_2/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_2", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.9, "p_coefficient": 0.3, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/bi-lstm_2/runs/events.out.tfevents.1559438048.gpu-theta.14643.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_2/runs/events.out.tfevents.1559438048.gpu-theta.14643.0
--------------------------------------------------------------------------------
/save/bi-lstm_3/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_3", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 256, "mlp_hidden_size": 1024, "dropout": 0.9, "p_coefficient": 0.3, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/bi-lstm_3/runs/events.out.tfevents.1559440265.gpu-theta.15460.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_3/runs/events.out.tfevents.1559440265.gpu-theta.15460.0
--------------------------------------------------------------------------------
/save/bi-lstm_4/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/bi-lstm_4", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": true, "rnn_hidden_size": 512, "mlp_hidden_size": 1024, "dropout": 0.5, "p_coefficient": 0.3, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/bi-lstm_4/runs/events.out.tfevents.1559441032.gpu-theta.21582.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/bi-lstm_4/runs/events.out.tfevents.1559441032.gpu-theta.21582.0
--------------------------------------------------------------------------------
/save/cnn_1/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.5}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.5}, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/cnn_1/runs/events.out.tfevents.1559399753.gpu-theta.10281.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_1/runs/events.out.tfevents.1559399753.gpu-theta.10281.0
--------------------------------------------------------------------------------
/save/cnn_2/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_2", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.5}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.5}, "loss": "l1"}}
--------------------------------------------------------------------------------
/save/cnn_2/runs/events.out.tfevents.1559409399.gpu-theta.22755.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_2/runs/events.out.tfevents.1559409399.gpu-theta.22755.0
--------------------------------------------------------------------------------
/save/cnn_3/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_3", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 1024, "kernel_size": 3, "dropout": 0.5}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.5}, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/cnn_3/runs/events.out.tfevents.1559409541.gpu-theta.30631.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_3/runs/events.out.tfevents.1559409541.gpu-theta.30631.0
--------------------------------------------------------------------------------
/save/cnn_4/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_4", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.9}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 2000, "dropout": 0.9}, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/cnn_4/runs/events.out.tfevents.1559437928.gpu-theta.6850.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_4/runs/events.out.tfevents.1559437928.gpu-theta.6850.0
--------------------------------------------------------------------------------
/save/cnn_5/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_5", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.9}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.9}, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/cnn_5/runs/events.out.tfevents.1559437980.gpu-theta.10531.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_5/runs/events.out.tfevents.1559437980.gpu-theta.10531.0
--------------------------------------------------------------------------------
/save/cnn_6/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/elmo_temp", "output_path": "save/cnn_6", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 1024, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3, "dropout": 0.9}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.9}, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/cnn_6/runs/events.out.tfevents.1559443831.gpu-theta.16155.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_6/runs/events.out.tfevents.1559443831.gpu-theta.16155.0
--------------------------------------------------------------------------------
/save/cnn_7/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_7", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.5}, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/cnn_7/runs/events.out.tfevents.1559462006.gpu-theta.17687.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_7/runs/events.out.tfevents.1559462006.gpu-theta.17687.0
--------------------------------------------------------------------------------
/save/cnn_8/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/cnn_8", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "cnn", "cnn": {"max_length": 512, "conv_1": {"size": 512, "kernel_size": 3}, "max_pool_1": {"kernel_size": 2, "stride": 2}, "fc": {"hidden_size": 512, "dropout": 0.5}, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/cnn_8/runs/events.out.tfevents.1559462040.gpu-theta.19995.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/cnn_8/runs/events.out.tfevents.1559462040.gpu-theta.19995.0
--------------------------------------------------------------------------------
/save/gru_1/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/gru_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "gru", "bidirectional": false, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/gru_1/runs/events.out.tfevents.1559406000.gpu-theta.3523.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/gru_1/runs/events.out.tfevents.1559406000.gpu-theta.3523.0
--------------------------------------------------------------------------------
/save/lstm_1/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/lstm_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "rnn", "rnn": {"type": "lstm", "bidirectional": false, "rnn_hidden_size": 256, "mlp_hidden_size": 512, "dropout": 0.5, "p_coefficient": 1, "num_layers": 1, "param_da": 350, "param_r": 30, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/lstm_1/runs/events.out.tfevents.1559402877.gpu-theta.2069.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/lstm_1/runs/events.out.tfevents.1559402877.gpu-theta.2069.0
--------------------------------------------------------------------------------
/save/mlp_1/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/mlp_1", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "mlp", "mlp": {"max_length": 512, "dropout": 0.5, "hidden_size": 512, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/mlp_1/runs/events.out.tfevents.1559399750.gpu-theta.9979.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/mlp_1/runs/events.out.tfevents.1559399750.gpu-theta.9979.0
--------------------------------------------------------------------------------
/save/mlp_2/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/word2vec_temp", "output_path": "save/mlp_2", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 300, "type": "mlp", "mlp": {"max_length": 512, "dropout": 0.9, "hidden_size": 512, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/mlp_2/runs/events.out.tfevents.1559409302.gpu-theta.16485.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/mlp_2/runs/events.out.tfevents.1559409302.gpu-theta.16485.0
--------------------------------------------------------------------------------
/save/mlp_3/config.json:
--------------------------------------------------------------------------------
1 | {"input_path": "data/elmo_temp", "output_path": "save/mlp_3", "gpu": true, "seed": 20000125, "display_per_batch": 5, "optimizer": "adagrad", "lr": 0.01, "lr_decay": 0, "weight_decay": 0.0001, "momentum": 0.985, "num_epochs": 300, "batch_size": 64, "num_labels": 8, "embedding_size": 1024, "type": "mlp", "mlp": {"max_length": 512, "dropout": 0.5, "hidden_size": 512, "loss": "cross_entropy"}}
--------------------------------------------------------------------------------
/save/mlp_3/runs/events.out.tfevents.1559438821.gpu-theta.15937.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xalanq/chinese-sentiment-classification/f6b92d206b35e1904449c3d56b111092511bd065/save/mlp_3/runs/events.out.tfevents.1559438821.gpu-theta.15937.0
--------------------------------------------------------------------------------