├── models ├── LSTMStack.py ├── ConvS2S.py ├── QuantumCNN.py ├── LSTMTree.py ├── BiBloSA.py ├── DiSAN.py ├── FastText.py ├── __init__.py ├── CNNMultiLayer.py ├── LSTMBI.py ├── RNN_CNN.py ├── CNNText.py ├── LSTMwithAttention.py ├── SelfAttention.py ├── LSTM.py ├── RCNN.py ├── MLP.py ├── MemoryNetwork.py ├── CNNInception.py ├── CNN_Inception.py ├── CNNBasic.py ├── Capsule.py ├── CNNKim.py ├── CNN.py └── Transformer.py ├── config └── imdb.ini ├── push.bash ├── search.sh ├── docs ├── windows_torch.md ├── windows_torch_en.md ├── data_config.md └── data_config_en.md ├── LICENSE.txt ├── dataloader ├── __init__.py ├── glove.py ├── torch_text_demo │ ├── imdb.py │ ├── trec.py │ └── sst.py ├── imdb.py ├── sst.py ├── ag.py ├── mr.py └── Dataset.py ├── main.py ├── README.md ├── trandition.py ├── parameter_search.py ├── opts.py ├── utils.py └── dataHelper.py /models/LSTMStack.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config/imdb.ini: -------------------------------------------------------------------------------- 1 | [COMMON] 2 | dataset = imdb 3 | 4 | -------------------------------------------------------------------------------- /models/ConvS2S.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /models/QuantumCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /models/LSTMTree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # https://github.com/dasguptar/treelstm.pytorch -------------------------------------------------------------------------------- /models/BiBloSA.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #https://github.com/galsang/BiBloSA-pytorch/blob/master/model/model.py 4 | 5 | -------------------------------------------------------------------------------- /models/DiSAN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # https://github.com/taoshen58/DiSAN/blob/master/SST_disan/src/model/model_disan.py -------------------------------------------------------------------------------- /push.bash: -------------------------------------------------------------------------------- 1 | git add *.py 2 | git add models/*.py 3 | git add dataloader/*.py 4 | git commit -m $1 5 | git pull 6 | git push 7 | 8 | -------------------------------------------------------------------------------- /search.sh: -------------------------------------------------------------------------------- 1 | echo "use gpu with multiple processes"; 2 | for((i=0;i<=8;i++)) 3 | do 4 | { 5 | echo "use gpu" +$i ; 6 | echo CUDA_VISIBLE_DEVICES=$i python parameter_search.py --gpu $i --config config/imdb.ini; 7 | CUDA_VISIBLE_DEVICES=$i python parameter_search.py --gpu $i --config config/imdb.ini; 8 | 9 | }& 10 | done 11 | wait -------------------------------------------------------------------------------- /docs/windows_torch.md: -------------------------------------------------------------------------------- 1 | # Windows 平台安装 PyTorch 2 | 3 | 如果是Linux,Mac安装直接移步pytorch[主页](http://pytorch.org/), 再安装TorchText 4 | 5 | ## Python安装 6 | 建议直接安装anaconda的[安装包](https://repo.continuum.io/archive/Anaconda3-5.0.1-Windows-x86_64.exe) 7 | 8 | ## Pytorch安装 9 | 在[百度网盘](https://pan.baidu.com/s/1dF6ayLr#list/path=%2Fpytorch)下载一个 离线安装包 , 0.3版本或者是0.2版本均可 10 | 如果是whl安装包 11 |
pip install torch0.3XXX.whl
12 | 如果是一个conda安装包(压缩文件后缀)
13 | conda install --offline torch0.3XXX.tar.bz
14 |
15 | ## TorchText 安装
16 |
17 | 前提是有git和pip,如果没有需要下载git,并将其放到Path环境变量里
18 | pip install git+https://github.com/pytorch/text.git
19 |
20 | 还需要有代理的话
21 |
22 |
23 |
24 | pip install git+https://github.com/pytorch/text.git --proxy proxy.xx.com:8080
25 |
26 |
27 | 参考链接
28 | https://zhuanlan.zhihu.com/p/31747695
29 |
--------------------------------------------------------------------------------
/docs/windows_torch_en.md:
--------------------------------------------------------------------------------
1 | # Windows Platform Installation for PyTorch
2 |
3 | On Linux or macOS, install PyTorch directly from the [homepage](http://pytorch.org/), then install TorchText.
4 |
5 | ## Python installation
6 | Please install anaconda directly: [installation package](https://repo.continuum.io/archive/Anaconda3-5.0.1-Windows-x86_64.exe)
7 |
8 | ## Pytorch installation
9 | Download an offline installation package (version 0.3 or 0.2 both work) from [Baidu Network Disk](https://pan.baidu.com/s/1dF6ayLr#list/path=%2Fpytorch). If it is a wheel (`.whl`) package:
10 | pip install torch0.3XXX.whl
11 |
12 | If it is a conda package (a compressed archive):
13 | conda install --offline torch0.3XXX.tar.bz
14 |
15 | ## TorchText installation
16 |
17 | This assumes git and pip are already installed; if git is missing, download it and add it to the PATH environment variable.
18 | pip install git+https://github.com/pytorch/text.git
19 |
20 | If you need a proxy,
21 | pip install git+https://github.com/pytorch/text.git --proxy proxy.xx.com:8080
22 |
23 |
24 | Reference Link:
25 | https://zhuanlan.zhihu.com/p/31747695
26 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Barun Patra
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/dataloader/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from .imdb import IMDBDataset
5 | from .mr import MRDataset
6 | from .glove import Glove
7 | from .sst import SSTDataset
8 | from .ag import AGDataset
9 |
10 | from .Dataset import Dataset
def getDataset(opt):
    """Instantiate the dataset wrapper selected by ``opt.dataset``.

    Args:
        opt: options object; ``opt.dataset`` names the dataset and the whole
            object is forwarded to the chosen dataset class.

    Returns:
        The constructed dataset object.

    Raises:
        Exception: if ``opt.dataset`` is not one of the supported names.
    """
    name = opt.dataset

    # Datasets with a dedicated wrapper class are matched first.
    if name == "imdb":
        return IMDBDataset(opt)
    if name == "mr":
        return MRDataset(opt)
    if name == "sst":
        return SSTDataset(opt)
    if name == "ag":
        return AGDataset(opt)

    # Everything else goes through the generic Dataset class. "mr" is listed
    # here as well but can never reach this point (it is handled above).
    if name in ["cr","mpqa","mr","sst1","sst2","subj","trec"]:
        return Dataset(opt)

    raise Exception("dataset not supported: {}".format(name))
27 |
def getEmbedding(opt):
    """Build the embedding loader described by ``opt.embedding_file``.

    Only GloVe is supported. The value must look like ``glove.<corpus>.<dim>``
    (three dot-separated parts); ``<corpus>`` and ``<dim>`` are passed to
    ``Glove`` as strings.

    Args:
        opt: options object providing ``embedding_file``.

    Returns:
        A ``Glove`` instance.

    Raises:
        AssertionError: if a glove name does not have exactly three parts.
        Exception: if the embedding is not a glove embedding.
    """
    embedding_file = opt.embedding_file

    if not embedding_file.startswith("glove"):
        # Report the value that was actually inspected. The previous message
        # read opt.embedding_type, which is not the attribute checked here and
        # could turn this error into an AttributeError.
        raise Exception("embedding not supported: {}".format(embedding_file))

    parts = embedding_file.split(".")
    assert len(parts) == 3, "embedding_type format wrong"
    _, corpus, dim = parts
    return Glove(corpus, dim, opt)
35 |
36 |
--------------------------------------------------------------------------------
/dataloader/glove.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 |
4 | from .Dataset import Dataset
class Glove(Dataset):
    """Dataset-style wrapper that fetches a pre-trained GloVe archive.

    Args:
        corpus: GloVe corpus key, matched case-insensitively against
            ``42b``, ``840b``, ``twitter.27b`` and ``6b``.
        dim: embedding dimension; accepted for the caller's convenience but
            not used inside this class.
        opt: options object forwarded to ``Dataset.__init__``.
        **kwargs: extra keyword arguments forwarded to ``Dataset.__init__``.

    Raises:
        KeyError: if ``corpus`` is not one of the known corpus keys.
    """

    def __init__(self, corpus, dim, opt=None, **kwargs):
        super(Glove, self).__init__(opt, **kwargs)

        # Directory name for the downloaded vectors.
        self.root = ".vector_cache"

        archive_by_corpus = {
            '42b': 'http://nlp.stanford.edu/data/glove.42B.300d.zip',
            '840b': 'http://nlp.stanford.edu/data/glove.840B.300d.zip',
            'twitter.27b': 'http://nlp.stanford.edu/data/glove.twitter.27B.zip',
            '6b': 'http://nlp.stanford.edu/data/glove.6B.zip',
        }
        archive_url = archive_by_corpus[corpus.lower()]

        self.urls = [archive_url]
        print(self.urls)
        self.name = corpus

    def process(self):
        """Run the inherited ``download()`` and return whatever it returns."""
        return self.download()

    def getFilename(self):
        """Return the result of ``process()``.

        NOTE(review): presumably the location of the downloaded vectors;
        confirm against ``Dataset.download``.
        """
        return self.process()
34 |
if __name__ == "__main__":
    # Manual smoke test: parse the command-line options, build the embedding
    # loader they describe, and print what getFilename() reports.
    import opts
    import dataloader

    opt = opts.parse_opt()
    glove = dataloader.getEmbedding(opt)
    print(glove.getFilename())
43 |
44 |
--------------------------------------------------------------------------------
/docs/data_config.md:
--------------------------------------------------------------------------------
1 | # 数据配置
2 |
3 |
4 | ## 第一步先支持[torchtext](https://github.com/pytorch/text)本来支持的数据集合
5 |
6 |
7 | The datasets module currently contains:
8 |
9 | - Sentiment analysis: SST and IMDb
10 | - Question classification: TREC
11 | - Entailment: SNLI
12 | - Language modeling: WikiText-2
13 | - Machine translation: Multi30k, IWSLT, WMT14
14 |
15 | Others are planned or a work in progress:
16 |
17 | - Question answering: SQuAD
18 |
19 | 目前需要配置的数据集合
20 |
21 | ### Glove 下载到项目根目录的 .vector_cache 文件夹下
22 |
23 | - [42B](http://nlp.stanford.edu/data/glove.42B.300d.zip)
24 | - [840B](http://nlp.stanford.edu/data/glove.840B.300d.zip)
25 | - [twitter.27B](http://nlp.stanford.edu/data/glove.twitter.27B.zip)
26 | - [6B](http://nlp.stanford.edu/data/glove.6B.zip)
27 |
28 | ###分类数据集下载配置
29 |
30 | - [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)数据集下载到 .data/imdb
31 | - [SST](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)数据集下载到.data/sst
32 | - TREC [1](http://cogcomp.org/Data/QA/QC/train_5500.label) [2](http://cogcomp.org/Data/QA/QC/TREC_10.label) 问题分类数据集下载到.data/trec
33 |
34 | ###文件结构示例如下
35 |
36 | - TextClassificationBenchmark
37 | - .data
38 | - imdb
39 | - aclImdb_v1.tar.gz
40 | - sst
41 | - trainDevTestTrees_PTB.zip
42 | - trec
43 | - train_5500.label
44 | - TREC_10.label
45 | - .vector_cache
46 | - glove.42B.300d.zip
47 | - glove.840B.300d.zip
48 | - glove.twitter.27B.zip
49 | - glove.6B.zip
50 |
51 |
52 |
53 | ##更多的数据集请等待我们进一步更新
--------------------------------------------------------------------------------
/dataloader/torch_text_demo/imdb.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 |
5 | from torchtext import data
6 | from torchtext import datasets
7 | from torchtext.vocab import GloVe
8 | import torch
# Pick the device index handed to the torchtext iterators: 0 (first GPU) when
# CUDA is available, -1 otherwise. The original condition was inverted, and the
# result was never used because -1 was hard-coded below. This now follows the
# same convention as the sibling trec.py demo.
device = 0 if torch.cuda.is_available() else -1

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=3, device=device)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
train_iter, test_iter = datasets.IMDB.iters(batch_size=4, device=device)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
--------------------------------------------------------------------------------
/dataloader/torch_text_demo/trec.py:
--------------------------------------------------------------------------------
1 | from torchtext import data
2 | from torchtext import datasets
3 | from torchtext.vocab import GloVe, CharNGram
4 | import torch
# Device index passed to torchtext: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Approach 1:
# declare the text and label fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

# load the TREC train/test splits with fine-grained labels
train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)

# describe the training split
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# vocabulary backed by 300-d GloVe 6B vectors
glove_6b = GloVe(name='6B', dim=300)
TEXT.build_vocab(train, vectors=glove_6b)
LABEL.build_vocab(train)

# describe the vocabulary
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# bucketed iterators over both splits
splits = (train, test)
train_iter, test_iter = data.BucketIterator.splits(splits, batch_size=3, device=device)

# show the first training batch
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
# rebuild the vocabulary from GloVe 840B plus character n-gram vectors
combined_vectors = [GloVe(name='840B', dim='300'), CharNGram()]
TEXT.build_vocab(train, vectors=combined_vectors, device=device)
LABEL.build_vocab(train)

train_iter, test_iter = datasets.TREC.iters(batch_size=4)

# show the first training batch
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
--------------------------------------------------------------------------------
/models/FastText.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import torch as t
4 |
5 | import numpy as np
6 | from torch import nn
7 | from collections import OrderedDict
class FastText(nn.Module):
    """Averaged-embedding classifier followed by a small MLP.

    Reads from ``opt``: ``vocab_size``, ``embedding_dim``, ``label_size``,
    optionally ``linear_hidden_size`` (default 2000), and optionally
    ``embeddings`` together with ``embedding_training`` to load a
    pre-trained embedding matrix.
    """

    def __init__(self, opt):
        super(FastText, self).__init__()
        self.model_name = 'FastText'

        hidden_units = getattr(opt, "linear_hidden_size", 2000)

        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        pretrained = opt.__dict__.get("embeddings", None)
        if pretrained is not None:
            print('load embedding')
            self.encoder.weight = nn.Parameter(
                pretrained, requires_grad=opt.embedding_training)

        # Linear -> BatchNorm -> ReLU -> Linear; Sequential numbers the layers
        # 0..3 exactly as when they are passed inline.
        layers = [
            nn.Linear(opt.embedding_dim, hidden_units),
            nn.BatchNorm1d(hidden_units),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_units, opt.label_size),
        ]
        self.content_fc = nn.Sequential(*layers)

    def forward(self, content):
        """Return class scores for a batch of token-id rows.

        Embeddings are averaged over dimension 1 before classification.
        """
        pooled = self.encoder(content).mean(dim=1)
        flat = pooled.view(pooled.size(0), -1)
        return self.content_fc(flat)
if __name__ == '__main__':
    # Manual smoke test: build the model from the parsed options and push a
    # 10 x 250 batch of token ids through it.
    import sys
    sys.path.append(r"..")
    import opts

    opt = opts.parse_opt()
    opt.vocab_size = 2501
    opt.label_size = 3
    model = FastText(opt)

    token_ids = t.autograd.Variable(t.arange(0, 2500).view(10, 250)).long()
    scores = model(token_ids)
    print(scores.size())
--------------------------------------------------------------------------------
/docs/data_config_en.md:
--------------------------------------------------------------------------------
1 | # Data configuration
2 |
3 | **Install [torchtext](https://github.com/pytorch/text) for data processing**
4 |
5 | The datasets module currently contains:
6 |
7 | - Sentiment analysis: SST and IMDb
8 | - Question classification: TREC
9 | - Entailment: SNLI
10 | - Language modeling: WikiText-2
11 | - Machine translation: Multi30k, IWSLT, WMT14
12 |
13 | Others are planned or a work in progress:
14 |
15 | - Question answering: SQuAD
16 |
17 | Datasets that currently need to be configured:
18 |
19 | ### Glove
20 |
21 | Download into the `.vector_cache` folder in the project's root directory:
22 |
23 | - [42B](http://nlp.stanford.edu/data/glove.42B.300d.zip)
24 | - [840B](http://nlp.stanford.edu/data/glove.840B.300d.zip)
25 | - [twitter.27B](http://nlp.stanford.edu/data/glove.twitter.27B.zip)
26 | - [6B](http://nlp.stanford.edu/data/glove.6B.zip)
27 |
28 | ### Classification Datasets
29 |
30 | - Download [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) dataset to .data/imdb
31 | - Download [SST](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip) dataset to .data/sst
32 | - Download the TREC question classification dataset ([train](http://cogcomp.org/Data/QA/QC/train_5500.label), [test](http://cogcomp.org/Data/QA/QC/TREC_10.label)) to .data/trec
33 |
34 | ### File Structure
35 |
36 | - TextClassificationBenchmark
37 | - .data
38 | - imdb
39 | - aclImdb_v1.tar.gz
40 | - sst
41 | - trainDevTestTrees_PTB.zip
42 | - trec
43 | - train_5500.label
44 | - TREC_10.label
45 | - .vector_cache
46 | - glove.42B.300d.zip
47 | - glove.840B.300d.zip
48 | - glove.twitter.27B.zip
49 | - glove.6B.zip
50 |
51 |
52 |
53 | ## More datasets are coming soon; stay tuned for further updates
54 |
--------------------------------------------------------------------------------
/dataloader/imdb.py:
--------------------------------------------------------------------------------
1 | from .Dataset import Dataset
2 | import os
3 | import pandas as pd
4 | from codecs import open
5 |
class IMDBDataset(Dataset):
    """IMDB sentiment dataset: downloads the archive and writes TSV files.

    Args:
        opt: options object forwarded to ``Dataset.__init__``.
        **kwargs: extra keyword arguments forwarded to ``Dataset.__init__``.
    """

    def __init__(self, opt=None, **kwargs):
        super(IMDBDataset, self).__init__(opt, **kwargs)
        self.urls = ['http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz']

    def process(self):
        """Convert the raw review files into one tab-separated file per split.

        For both ``train`` and ``test``, every file under ``pos`` and ``neg``
        is read as UTF-8 and written as a ``text<TAB>label`` row (label 1 for
        ``pos``, 0 for ``neg``) to ``<saved_path>/<split>.csv`` without a
        header.

        Returns:
            list: paths of the written files, train first, then test.
        """
        # NOTE(review): download() and saved_path come from the Dataset base
        # class, which is not visible here; download() is assumed to return
        # the directory the archive was extracted into.
        root = self.download()
        root = os.path.join(root, "aclImdb")
        print("processing into: " + root)
        if not os.path.exists(self.saved_path):
            print("mkdir " + self.saved_path)
            os.makedirs(self.saved_path)

        datafiles = []
        for data_folder in ("train", "test"):
            data = []
            for polarity in ("pos", "neg"):
                dirname = os.path.join(root, data_folder, polarity)
                for rt, _dirs, files in os.walk(dirname):
                    for f in files:
                        filename = os.path.join(rt, f)
                        # Close each review file promptly; there are tens of
                        # thousands of them and the old code leaked every handle.
                        with open(filename, encoding="utf-8") as review:
                            text = review.read().strip()
                        data.append({"text": text, "label": int(polarity == "pos")})

            # Explicit columns fix the column order and keep an empty split
            # from raising KeyError on column selection.
            df = pd.DataFrame(data, columns=["text", "label"])
            saved_filename = os.path.join(self.saved_path, data_folder + ".csv")

            df.to_csv(saved_filename, index=False, header=None, sep="\t", encoding="utf-8")
            print("finished %s" % saved_filename)
            datafiles.append(saved_filename)
        print("processing into formated files over")

        return datafiles
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 |
9 | import numpy as np
10 |
11 |
12 |
13 | from .LSTM import LSTMClassifier
14 | from .CNNBasic import BasicCNN1D,BasicCNN2D
15 | from .CNNKim import KIMCNN1D,KIMCNN2D
16 | from .CNNMultiLayer import MultiLayerCNN
17 | from .CNNInception import InceptionCNN
18 | from .FastText import FastText
19 | from .Capsule import CapsuleNet
20 | from .RCNN import RCNN
21 | from .RNN_CNN import RNN_CNN
22 | from .LSTMBI import LSTMBI
23 | from .Transformer import AttentionIsAllYouNeed
24 | from .SelfAttention import SelfAttention
25 | from .LSTMwithAttention import LSTMAttention
def setup(opt):
    """Construct the model named by ``opt.model``.

    Args:
        opt: options object; ``opt.model`` selects the architecture and the
            whole object is passed to the model constructor.

    Returns:
        The constructed model instance.

    Raises:
        Exception: if ``opt.model`` is not a supported model name.
    """
    # Model name -> constructor. The keys are the exact command-line
    # spellings, including the historical 'baisc_cnn' ones.
    registry = {
        'lstm': LSTMClassifier,
        'baisc_cnn': BasicCNN1D,
        "cnn": BasicCNN1D,
        'baisc_cnn_2d': BasicCNN2D,
        'kim_cnn': KIMCNN1D,
        'kim_cnn_2d': KIMCNN2D,
        'multi_cnn': MultiLayerCNN,
        'inception_cnn': InceptionCNN,
        'fasttext': FastText,
        'capsule': CapsuleNet,
        'rnn_cnn': RNN_CNN,
        'rcnn': RCNN,
        'bilstm': LSTMBI,
        "transformer": AttentionIsAllYouNeed,
        "selfattention": SelfAttention,
        "lstm_attention": LSTMAttention,
    }

    builder = registry.get(opt.model)
    if builder is None:
        raise Exception("model not supported: {}".format(opt.model))
    return builder(opt)
61 |
--------------------------------------------------------------------------------
/models/CNNMultiLayer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Zhang/model.py
class MultiLayerCNN(nn.Module):
    """Six-layer 1-D CNN text classifier.

    The embedded batch is fed to ``Conv1d`` as-is, so the sequence positions
    (``opt.max_seq_len``) act as input channels and the convolutions slide
    along the embedding dimension.

    Reads from ``opt``: ``vocab_size``, ``embedding_dim``, ``max_seq_len``,
    ``label_size``, and optionally ``embeddings`` with ``embedding_training``.

    Raises:
        ValueError: if ``opt.embedding_dim`` is too small to survive the
            convolution/pooling stack.
    """

    # (conv kernel size, max-pool size) of conv1..conv6; a pool size of 1 means
    # the layer has no pooling. Used only to size the final linear layer.
    _LAYER_SPECS = ((7, 3), (7, 3), (3, 1), (3, 1), (3, 1), (3, 3))

    def __init__(self, opt):
        super(MultiLayerCNN, self).__init__()
        self.embed = nn.Embedding(opt.vocab_size + 1, opt.embedding_dim)

        if opt.__dict__.get("embeddings", None) is not None:
            self.embed.weight = nn.Parameter(opt.embeddings, requires_grad=opt.embedding_training)

        self.conv1 = nn.Sequential(
            nn.Conv1d(opt.max_seq_len, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )

        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )

        self.conv3 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv4 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv5 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv6 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )

        # The flattened feature size depends on the embedding width. It used
        # to be hard-coded to 256*7, which is only right for widths near 300.
        out_len = self._conv_output_length(opt.embedding_dim)
        if out_len < 1:
            raise ValueError(
                "embedding_dim={} is too small for the MultiLayerCNN "
                "convolution stack".format(opt.embedding_dim))
        self.fc = nn.Linear(256 * out_len, opt.label_size)

    @classmethod
    def _conv_output_length(cls, length):
        """Return the length left after conv1..conv6, or 0 if it collapses."""
        for kernel, pool in cls._LAYER_SPECS:
            length = length - kernel + 1      # stride-1 convolution, no padding
            if length < 1:
                return 0
            if pool > 1:
                length //= pool               # max-pool with kernel == stride
                if length < 1:
                    return 0
        return length

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, label_size)``."""
        x = self.embed(x)  # (batch_size, max_seq_len, embedding_dim)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)

        # collapse channels and positions into one feature vector per sample
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        # Explicit dim: the implicit choice is deprecated; for this 2-D input
        # it resolved to dim 1 as well.
        return F.log_softmax(x, dim=1)
67 |
--------------------------------------------------------------------------------
/dataloader/torch_text_demo/sst.py:
--------------------------------------------------------------------------------
1 | from torchtext import data
2 | from torchtext import datasets
3 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText
4 |
5 |
def _print_vocab_info():
    """Print the size of the TEXT vocabulary and of its vector table."""
    print('len(TEXT.vocab)', len(TEXT.vocab))
    print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())


def _print_batch(batch):
    """Print the text and label tensors of one batch."""
    print(batch.text)
    print(batch.label)


# Approach 1:
# declare the text and label fields
TEXT = data.Field()
LABEL = data.Field(sequential=False)

# load the SST splits: fine-grained labels, subtrees in train, neutral dropped
train, val, test = datasets.SST.splits(
    TEXT,
    LABEL,
    fine_grained=True,
    train_subtrees=True,
    filter_pred=lambda ex: ex.label != 'neutral',
)

# describe the training split
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# vocabulary backed by the fastText "wiki.simple" vectors
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.build_vocab(train, vectors=Vectors('wiki.simple.vec', url=url))
LABEL.build_vocab(train)

_print_vocab_info()

# bucketed iterators over the three splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3, device=0)

batch = next(iter(train_iter))
_print_batch(batch)

# Approach 2:
# rebuild the vocabulary from several vector sources at once
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
LABEL.build_vocab(train)

_print_vocab_info()

train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)

batch = next(iter(train_iter))
_print_batch(batch)

# Approach 3:
# build from FastText vectors, then extend the vocabulary with them
f = FastText()
TEXT.build_vocab(train, vectors=f)
TEXT.vocab.extend(f)
LABEL.build_vocab(train)

_print_vocab_info()

train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)

batch = next(iter(train_iter))
_print_batch(batch)
--------------------------------------------------------------------------------
/models/LSTMBI.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import torch
6 | from torch.autograd import Variable
7 | #from memory_profiler import profile
8 |
class LSTMBI(nn.Module):
    """Bidirectional multi-layer LSTM classifier.

    Reads from ``opt``: ``hidden_dim`` (total width, split evenly between the
    two directions), ``batch_size``, ``vocab_size``, ``embedding_dim``,
    ``lstm_layers``, ``keep_dropout``, ``label_size``, optionally
    ``embeddings`` with ``embedding_training``, and optionally ``lstm_mean``.

    ``lstm_mean`` selects the pooling over time: ``True`` (the default) or
    ``"mean"`` averages all time steps; any other value uses the last step.
    """

    def __init__(self, opt):
        super(LSTMBI, self).__init__()
        self.opt = opt
        self.hidden_dim = opt.hidden_dim
        self.batch_size = opt.batch_size
        self.use_gpu = torch.cuda.is_available()

        self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        # Load pre-trained vectors only when they are provided, matching the
        # other models in this package, instead of failing without them.
        if opt.__dict__.get("embeddings", None) is not None:
            self.word_embeddings.weight = nn.Parameter(
                opt.embeddings, requires_grad=opt.embedding_training)

        self.lstm_layers = opt.lstm_layers
        # NOTE(review): opt.keep_dropout is passed as nn.LSTM's *drop*
        # probability; confirm that is the meaning the option intends.
        self.dropout = opt.keep_dropout
        self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2,
                              num_layers=self.lstm_layers, dropout=self.dropout,
                              bidirectional=True)
        self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size)
        self.hidden = self.init_hidden()
        self.mean = opt.__dict__.get("lstm_mean", True)

    def init_hidden(self, batch_size=None):
        """Return zero-filled ``(h0, c0)`` states for ``batch_size`` sequences.

        Each state has shape ``(2 * lstm_layers, batch_size, hidden_dim // 2)``
        and is moved to the GPU when CUDA is available. ``batch_size`` defaults
        to ``opt.batch_size``.
        """
        if batch_size is None:
            batch_size = self.batch_size

        shape = (2 * self.lstm_layers, batch_size, self.hidden_dim // 2)
        if self.use_gpu:
            h0 = Variable(torch.zeros(*shape).cuda())
            c0 = Variable(torch.zeros(*shape).cuda())
        else:
            h0 = Variable(torch.zeros(*shape))
            c0 = Variable(torch.zeros(*shape))
        return (h0, c0)

    def forward(self, sentence):
        """Return class scores of shape ``(batch, label_size)``.

        ``sentence`` is a batch-first tensor of token ids.
        """
        embeds = self.word_embeddings(sentence)

        # nn.LSTM defaults to batch_first=False, so put the time axis first.
        x = embeds.permute(1, 0, 2)
        # Fresh zero state sized to the actual batch, which may be smaller
        # than opt.batch_size.
        self.hidden = self.init_hidden(sentence.size()[0])
        lstm_out, self.hidden = self.bilstm(x, self.hidden)

        # The default value True used to be compared only against the string
        # "mean", so mean pooling was never selected by default.
        if self.mean in (True, "mean"):
            final = torch.mean(lstm_out.permute(1, 0, 2), 1)
        else:
            final = lstm_out[-1]
        return self.hidden2label(final)
56 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import torch
8 | from torch.autograd import Variable
9 | import torch.optim as optim
10 | import numpy as np
11 |
12 | from six.moves import cPickle
13 |
14 | import opts
15 | import models
16 | import torch.nn as nn
17 | import utils
18 | import torch.nn.functional as F
19 | from torchtext import data
20 | from torchtext import datasets
21 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText
22 | from torch.nn.modules.loss import NLLLoss,MultiLabelSoftMarginLoss,MultiLabelMarginLoss,BCELoss
23 | import dataHelper
24 | import time,os
25 |
26 |
# Training entry point: parse options, load data, build the model, then train
# for opt.max_epoch epochs and evaluate on the test iterator after each epoch.
from_torchtext = False

opt = opts.parse_opt()
#opt.proxy="http://xxxx.xxxx.com:8080"

# Respect an externally chosen GPU; otherwise use the one from the options.
# Environment values must be strings, so convert in case opt.gpu is an int.
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(opt.gpu)
#opt.model ='lstm'
#opt.model ='capsule'

if from_torchtext:
    train_iter, test_iter = utils.loadData(opt)
else:
    train_iter, test_iter = dataHelper.loadData(opt)

opt.lstm_layers = 2

model = models.setup(opt)
if torch.cuda.is_available():
    model.cuda()
model.train()
print("# parameters:", sum(param.numel() for param in model.parameters() if param.requires_grad))
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=opt.learning_rate)
loss_fun = F.cross_entropy

#print(utils.evaluation(model,test_iter))
# NOTE: in the log messages below "iteration" is the epoch index (i) and
# "epoch" is the batch index within that epoch; the wording is kept as-is.
for i in range(opt.max_epoch):
    # NOTE(review): utils.evaluation is not visible here; if it switches the
    # model to eval mode, this puts it back into training mode each epoch.
    model.train()
    for epoch, batch in enumerate(train_iter):
        start = time.time()

        text = batch.text[0] if from_torchtext else batch.text

        # Clear the gradients of the previous batch. Previously zero_grad()
        # ran only once before the loop, so gradients accumulated across
        # every batch.
        optimizer.zero_grad()
        predicted = model(text)

        loss = loss_fun(predicted, batch.label)

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        if epoch % 100 == 0:
            # .cpu() is a no-op for CPU tensors, so one code path serves both
            # devices; reshape(-1) copes with 0-d and 1-element loss tensors.
            loss_value = float(loss.data.cpu().numpy().reshape(-1)[0])
            print("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i, epoch, loss_value, time.time() - start))

    percision = utils.evaluation(model, test_iter, from_torchtext)
    print("%d iteration with percision %.4f" % (i, percision))
82 |
83 |
84 |
--------------------------------------------------------------------------------
/models/RNN_CNN.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | import torch
4 | from torch.autograd import Variable
5 | #from memory_profiler import profile
6 |
class RNN_CNN(nn.Module):
    """Single-layer LSTM whose outputs are summarised by one 1-D convolution.

    Reads from ``opt``: ``hidden_dim``, ``batch_size``, ``vocab_size``,
    ``embedding_dim``, ``embeddings``, ``embedding_training`` and
    ``label_size``.
    """

    def __init__(self, opt):
        super(RNN_CNN, self).__init__()
        self.opt = opt
        self.hidden_dim = opt.hidden_dim
        self.batch_size = opt.batch_size
        self.use_gpu = torch.cuda.is_available()

        # Embedding table, replaced immediately by the provided matrix.
        self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        self.word_embeddings.weight = nn.Parameter(
            opt.embeddings, requires_grad=opt.embedding_training)

        self.lstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim)
        self.hidden = self.init_hidden()

        # The convolution runs over the time axis of the LSTM output.
        # NOTE(review): hidden2label expects content_dim features, so the
        # convolution must yield exactly one position per sequence — verify
        # the sequence length against kernel_size and stride.
        self.content_dim = 256
        self.conv = nn.Conv1d(
            in_channels=opt.hidden_dim,
            out_channels=self.content_dim,
            kernel_size=opt.hidden_dim * 2,
            stride=opt.embedding_dim,
        )
        self.hidden2label = nn.Linear(self.content_dim, opt.label_size)

    def init_hidden(self, batch_size=None):
        """Return zero ``(h0, c0)`` states of shape ``(1, batch, hidden_dim)``.

        ``batch_size`` defaults to ``opt.batch_size``; the states are moved to
        the GPU when CUDA is available.
        """
        if batch_size is None:
            batch_size = self.batch_size

        def _zero_state():
            state = torch.zeros(1, batch_size, self.hidden_dim)
            return Variable(state.cuda() if self.use_gpu else state)

        h0 = _zero_state()
        c0 = _zero_state()
        return (h0, c0)

    def forward(self, sentence):
        """Return class scores for a batch-first tensor of token ids."""
        embedded = self.word_embeddings(sentence)   # (batch, seq, emb)
        seq_first = embedded.permute(1, 0, 2)       # (seq, batch, emb)

        # Fresh zero state sized to the actual batch.
        self.hidden = self.init_hidden(sentence.size()[0])
        lstm_out, self.hidden = self.lstm(seq_first, self.hidden)  # (seq, batch, hidden)

        # Conv1d wants (batch, channels, length): hidden units become channels.
        features = self.conv(lstm_out.permute(1, 2, 0))
        flattened = features.view(features.size()[0], -1)
        return self.hidden2label(flattened)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text Classification Benchmark
2 | A Benchmark of Text Classification in PyTorch
3 |
4 |
5 | ## Motivation
6 |
7 | We are trying to build a Benchmark for Text Classification including
8 |
9 |
10 | >Many Text Classification **Datasets**, covering Sentiment/Topic Classification in popular languages (e.g. English and Chinese). A basic word embedding is also provided.
11 |
12 | >Implementations of many popular and state-of-the-art **Models**, especially deep neural networks.
13 |
14 | ## Have done
15 | We have implemented the following datasets and models.
16 | ### Dataset done
17 | - IMDB
18 | - SST
19 | - Trec
20 |
21 | ### Models done
22 | - FastText
23 | - BasicCNN (KimCNN,MultiLayerCNN, Multi-perspective CNN)
24 | - InceptionCNN
25 | - LSTM (BILSTM, StackLSTM)
26 | - LSTM with Attention (Self Attention / Quantum Attention)
27 | - Hybrids between CNN and RNN (RCNN, C-LSTM)
28 | - Transformer - Attention is all you need
29 | - ConvS2S
30 | - Capsule
31 | - Quantum-inspired NN
32 |
33 | ## Library
34 |
35 | You should have installed [these libraries](docs/windows_torch_en.md)
36 | 37 | python3 38 | torch 39 | torchtext (optional) 40 |41 | 42 | ## Dataset 43 | Datasets will be automatically configured in the current path; alternatively, download your data manually by following [Dataset](docs/data_config_en.md) step by step. 44 | 45 | including 46 |
47 | GloVe embedding 48 | Sentiment classification dataset IMDB 49 |50 | 51 | 52 | ## Usage 53 | 54 | 55 | Run with the default settings 56 |
python main.py
57 |
58 | CNN
59 | python main.py --model cnn
60 |
61 | LSTM
62 | python main.py --model lstm
63 |
64 | ## Road Map
65 | - [X] Data preprocessing framework
66 | - [X] Models modules
67 | - [ ] Loss, Estimator and hyper-parameter tuning.
68 | - [ ] Test modules
69 | - [ ] More Dataset
70 | - [ ] More models
71 |
72 |
73 |
74 | ## Organisation of the repository
75 | The core of this repository is models and dataset.
76 |
77 |
78 | * ```dataloader/```: loading all dataset such as ```IMDB```, ```SST```
79 |
80 | * ```models/```: creating all models such as ```FastText```, ```LSTM```,```CNN```,```Capsule```,```QuantumCNN``` ,```Multi-Head Attention```
81 |
82 | * ```opts.py```: Parameter and config info.
83 |
84 | * ```utils.py```: tools.
85 |
86 | * ```dataHelper```: data helper
87 |
88 |
89 |
90 |
91 | ## Contributor
92 | - [@Allenzhai](https://github.com/zhaizheng)
93 | - [@JaredWei](https://github.com/jacobwei)
94 | - [@AlexMeng](https://github.com/EdwardLorenz)
95 | - [@Lilianwang](https://github.com/WangLilian)
96 | - [@ZhanSu](https://github.com/shuishen112)
97 | - [@Wabywang](https://github.com/Wabyking)
98 |
99 | Issues and contributions are welcome!
100 |
101 |
--------------------------------------------------------------------------------
/models/CNNText.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch as t
3 | import numpy as np
4 | from torch import nn
5 |
class CNNText(nn.Module):
    """Single-layer 1-D CNN text classifier with max-over-time pooling.

    ``opt`` must provide vocab_size, embedding_dim, max_seq_len and
    label_size. Optional entries read from ``opt.__dict__``: content_dim
    (default 256), kernel_size (default 3) and embeddings; when embeddings is
    present, embedding_training is required as well.
    """

    def __init__(self, opt):
        super(CNNText, self).__init__()
        self.model_name = 'CNNText'
        self.opt = opt

        settings = opt.__dict__
        self.content_dim = settings.get("content_dim", 256)
        self.kernel_size = settings.get("kernel_size", 3)

        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        pretrained = settings.get("embeddings", None)
        if pretrained is not None:
            # Swap the random table for the supplied embedding matrix.
            self.encoder.weight = nn.Parameter(
                pretrained, requires_grad=opt.embedding_training)

        # The pooling window equals the convolution output length for inputs
        # of exactly max_seq_len tokens, leaving one value per channel.
        pool_width = opt.max_seq_len - self.kernel_size + 1
        conv = nn.Conv1d(in_channels=opt.embedding_dim,
                         out_channels=self.content_dim,
                         kernel_size=self.kernel_size)
        self.content_conv = nn.Sequential(
            conv,
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=pool_width),
        )

        self.fc = nn.Linear(self.content_dim, opt.label_size)

    def forward(self, content):
        """Map token ids of shape (batch, seq_len) to class logits."""
        embedded = self.encoder(content)
        # Conv1d expects channels (embedding dim) on axis 1.
        pooled = self.content_conv(embedded.permute(0, 2, 1))
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)
39 |
40 | import argparse
41 |
def parse_opt():
    """Build the option namespace used by this module's self-test.

    Command-line flags: --hidden_dim, --batch_size, --embedding_dim,
    --learning_rate, --grad_clip and --model. After parsing, a fixed set of
    demo values is written onto the namespace (this overrides any
    --embedding_dim given on the command line with 300).

    Returns:
        argparse.Namespace with the parsed flags plus embedding_dim,
        vocab_size, kernel_size, num_classes, label_size, content_dim and
        max_seq_len.
    """
    parser = argparse.ArgumentParser()
    # Data input settings
    parser.add_argument('--hidden_dim', type=int, default=128,
                        help='hidden_dim')

    parser.add_argument('--batch_size', type=int, default=64,
                        help='batch_size')
    parser.add_argument('--embedding_dim', type=int, default=300,
                        help='embedding_dim')
    parser.add_argument('--learning_rate', type=float, default=4e-4,
                        help='learning_rate')
    parser.add_argument('--grad_clip', type=float, default=1e-1,
                        help='grad_clip')
    parser.add_argument('--model', type=str, default="lstm",
                        help='model name')

    args = parser.parse_args()
    args.embedding_dim = 300
    args.vocab_size = 10000
    args.kernel_size = 3
    args.num_classes = 3
    # CNNText reads ``opt.label_size``; without this the demo below fails
    # with AttributeError. ``num_classes`` is kept for backward compatibility.
    args.label_size = args.num_classes
    args.content_dim = 256
    args.max_seq_len = 50

    return args
76 |
if __name__ == '__main__':
    # Smoke test: push 64 dummy sentences of 50 token ids through the model
    # and print the shape of the resulting logits.
    opt = parse_opt()
    m = CNNText(opt)
    token_ids = t.arange(0, 3200).view(-1, 50)
    content = t.autograd.Variable(token_ids).long()
    o = m(content)
    print(o.size())
85 |
86 |
--------------------------------------------------------------------------------
/models/LSTMwithAttention.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import torch
4 | import numpy as np
5 | import torch.nn as nn
6 | from sklearn.utils import shuffle
7 | from torch.autograd import Variable
8 |
class LSTMAttention(torch.nn.Module):
    """Bidirectional LSTM classifier with dot-product attention.

    The final hidden state of the last LSTM layer is used as the query over
    the per-step LSTM outputs; the attention-weighted sum of the outputs is
    fed to a linear layer.

    ``opt`` must provide: hidden_dim, batch_size, vocab_size, embedding_dim,
    embeddings, embedding_training, lstm_layers, keep_dropout and label_size.
    ``lstm_mean`` is optional (default True).
    """

    def __init__(self,opt):
        self.opt=opt
        super(LSTMAttention, self).__init__()
        self.hidden_dim = opt.hidden_dim
        self.batch_size = opt.batch_size
        self.use_gpu = torch.cuda.is_available()

        self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training)

        self.num_layers = opt.lstm_layers
        # NOTE(review): the option is called keep_dropout but nn.LSTM treats
        # this value as the probability of *dropping* a unit -- confirm.
        self.dropout = opt.keep_dropout
        # Each direction gets hidden_dim // 2 units so the concatenated
        # output has hidden_dim features (for even hidden_dim).
        self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, batch_first=True,num_layers=self.num_layers, dropout=self.dropout, bidirectional=True)
        self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size)
        self.hidden = self.init_hidden()
        self.mean = opt.__dict__.get("lstm_mean",True)
        # Not used in forward(); kept so existing checkpoints and parameter
        # initialisation order stay unchanged.
        self.attn_fc = torch.nn.Linear(opt.embedding_dim, 1)

    def init_hidden(self,batch_size=None):
        """Return zero (h0, c0) of shape (2 * num_layers, batch, hidden_dim // 2).

        Uses the configured batch size when ``batch_size`` is None and moves
        the tensors to the GPU when CUDA is available.
        """
        if batch_size is None:
            batch_size= self.batch_size

        if self.use_gpu:
            h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda())
            c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda())
        else:
            h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2))
            c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2))
        return (h0, c0)

    def attention(self, rnn_out, state):
        """Attend over ``rnn_out`` using the last layer's final hidden state.

        Args:
            rnn_out: LSTM outputs, (batch, seq_len, hidden).
            state: final hidden state h_n, (num_layers * 2, batch, hidden // 2).

        Returns:
            Attention-weighted sum of ``rnn_out``, shape (batch, hidden).
        """
        # h_n is ordered layer by layer, so the last two entries are the
        # forward and backward states of the top layer. Using only those
        # keeps the query width equal to the output width for any number of
        # layers (concatenating every layer broke the bmm below).
        merged_state = torch.cat([s for s in state[-2:]],1)
        # (batch, hidden) -> (batch, hidden, 1); no squeeze(0) here, which
        # used to drop the batch axis for a batch of one.
        merged_state = merged_state.unsqueeze(2)
        # (batch, seq_len, cell_size) * (batch, cell_size, 1) = (batch, seq_len, 1)
        weights = torch.bmm(rnn_out, merged_state)
        # Normalise over the time axis explicitly.
        weights = torch.nn.functional.softmax(weights.squeeze(2), dim=1).unsqueeze(2)
        # (batch, cell_size, seq_len) * (batch, seq_len, 1) = (batch, cell_size, 1)
        return torch.bmm(torch.transpose(rnn_out, 1, 2), weights).squeeze(2)

    def forward(self, X):
        """Map token ids of shape (batch, seq_len) to class logits."""
        embedded = self.word_embeddings(X)
        # Zero initial state sized to the actual batch.
        hidden= self.init_hidden(X.size()[0])
        rnn_out, hidden = self.bilstm(embedded, hidden)
        h_n, c_n = hidden
        attn_out = self.attention(rnn_out, h_n)
        logits = self.hidden2label(attn_out)
        return logits
--------------------------------------------------------------------------------
/models/SelfAttention.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-#
2 | # https://arxiv.org/pdf/1703.03130.pdf
3 | # A Structured Self-attentive Sentence Embedding
4 | # https://github.com/nn116003/self-attention-classification/blob/master/imdb_attn.py
5 |
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | import torch
9 | from torch.autograd import Variable
10 | #from memory_profiler import profile
11 |
class SelfAttention(nn.Module):
    """Bidirectional LSTM classifier with additive self-attention pooling.

    A small MLP scores every time step of the LSTM output; the softmax of
    those scores weights the outputs, and the weighted sum is classified by a
    linear layer.

    ``opt`` must provide: hidden_dim, batch_size, vocab_size, embedding_dim,
    embeddings, embedding_training, keep_dropout and label_size.
    """

    # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu
    def __init__(self,opt):
        self.opt=opt
        super(SelfAttention, self).__init__()
        self.hidden_dim = opt.hidden_dim
        self.batch_size = opt.batch_size
        self.use_gpu = torch.cuda.is_available()

        self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training)

        self.num_layers = 1
        # NOTE(review): the option is called keep_dropout but nn.LSTM treats
        # this value as the probability of *dropping* a unit -- confirm.
        self.dropout = opt.keep_dropout
        self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, num_layers=self.num_layers, dropout=self.dropout, bidirectional=True)
        self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size)
        self.hidden = self.init_hidden()
        # Scores one scalar "energy" per time step.
        self.self_attention = nn.Sequential(
            nn.Linear(opt.hidden_dim, 24),
            nn.ReLU(True),
            nn.Linear(24,1)
        )

    def init_hidden(self,batch_size=None):
        """Return zero (h0, c0) of shape (2 * num_layers, batch, hidden_dim // 2).

        Uses the configured batch size when ``batch_size`` is None and moves
        the tensors to the GPU when CUDA is available.
        """
        if batch_size is None:
            batch_size= self.batch_size

        if self.use_gpu:
            h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda())
            c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda())
        else:
            h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2))
            c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2))
        return (h0, c0)

    def forward(self, sentence):
        """Map token ids of shape (batch, seq_len) to class logits."""
        # Use the real batch size: the last batch of an epoch is usually
        # smaller than the configured one.
        batch_size = sentence.size()[0]
        embeds = self.word_embeddings(sentence)

        x=embeds.permute(1,0,2)                            # (seq, batch, emb)
        self.hidden= self.init_hidden(batch_size)
        lstm_out, self.hidden = self.bilstm(x, self.hidden)
        final =lstm_out.permute(1,0,2)                     # (batch, seq, hidden)
        attn_ene = self.self_attention(final)              # (batch, seq, 1)
        # Normalise over time, then restore the trailing axis so the weights
        # broadcast across the hidden dimension.
        attns = F.softmax(attn_ene.view(batch_size, -1), dim=1).unsqueeze(2)
        feats = (final * attns).sum(dim=1)                 # (batch, hidden)
        y = self.hidden2label(feats)

        return y
--------------------------------------------------------------------------------
/trandition.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from sklearn.feature_extraction.text import CountVectorizer
6 | from sklearn.feature_extraction.text import TfidfTransformer
7 | from sklearn.naive_bayes import MultinomialNB
8 | from sklearn.pipeline import Pipeline
9 | from sklearn.pipeline import make_pipeline
10 | from sklearn.linear_model import SGDClassifier
11 | from sklearn import metrics
12 | from sklearn.model_selection import train_test_split
13 | from sklearn.model_selection import cross_val_score
14 | import numpy as np
15 | import opts
16 | import dataHelper
17 | #refer to "https://zhuanlan.zhihu.com/p/26729228"
opt = opts.parse_opt()
import dataHelper as helper
# With embedding=False the loader is unpacked below as (texts, labels) pairs
# rather than batch iterators -- TODO confirm against dataHelper.loadData.
train_iter, test_iter = dataHelper.loadData(opt,embedding=False)
#categories = ['good', 'bad', 'mid']
x_train,y_train=train_iter
x_test,y_test = test_iter

#opt.model ="haha"
if opt.model == "bayes":
    # Naive Bayes classifier.
    # A scikit-learn Pipeline chains bag-of-words counting, TF-IDF weighting
    # and the classifier into a single estimator.
    bayes_clf = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB())
                          ])
    bayes_clf.fit(x_train, y_train)
    # Predict the test dataset using Naive Bayes.
    predicted = bayes_clf.predict(x_test)
    print('Naive Bayes correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
    # Report F1, precision, recall, etc.
    # print(metrics.classification_report(y_test, predicted, target_names=categories))
elif opt.model == "svm":
    # Linear SVM (hinge loss) trained with SGD.
    svm_clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
                        ])
    svm_clf.fit(x_train, y_train)
    predicted = svm_clf.predict(x_test)
    print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
    # print(metrics.classification_report(y_test, predicted, target_names=categories))

else:
    # 10-fold cross-validation of both classifiers.
    # NOTE(review): the folds are drawn from the *test* split only; confirm
    # that this is intended rather than the training split or the full data.
    clf_b = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
    # SGDClassifier's ``n_iter`` argument no longer exists in scikit-learn;
    # ``max_iter=5`` with ``tol=None`` runs the same fixed 5 epochs.
    clf_s= make_pipeline(CountVectorizer(), TfidfTransformer(), SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, tol=None, random_state=42))

    bayes_10_fold = cross_val_score(clf_b, x_test, y_test, cv=10)
    svm_10_fold = cross_val_score(clf_s, x_test, y_test, cv=10)

    print('Naives Bayes 10-fold correct prediction: {:4.4f}'.format(np.mean(bayes_10_fold)))
    print('SVM 10-fold correct prediction: {:4.4f}'.format(np.mean(svm_10_fold)))
# Print the confusion matrix
#print("Confusion Matrix:")
#print(metrics.confusion_matrix(y_test, predicted))
#print('\n')
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/models/LSTM.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import torch
6 | from torch.autograd import Variable
7 | #from memory_profiler import profile
8 |
class LSTMClassifier(nn.Module):
    """Single-layer unidirectional LSTM text classifier.

    ``opt`` must provide: hidden_dim, batch_size, vocab_size, embedding_dim,
    embeddings, embedding_training and label_size. ``lstm_mean`` is optional:
    the string "mean" averages the LSTM outputs over time, any other value
    (including the default True) uses the last time step.
    """

    def __init__(self, opt):
        super(LSTMClassifier, self).__init__()
        self.opt = opt
        self.hidden_dim = opt.hidden_dim
        self.batch_size = opt.batch_size
        self.use_gpu = torch.cuda.is_available()

        # Embedding table whose weight is replaced by the supplied tensor.
        self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        self.word_embeddings.weight = nn.Parameter(
            opt.embeddings, requires_grad=opt.embedding_training)

        self.lstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim)
        self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size)
        self.hidden = self.init_hidden()
        # NOTE(review): the fallback is True but forward() compares against
        # the string "mean", so a missing option selects last-step pooling.
        self.mean = opt.__dict__.get("lstm_mean", True)

    def init_hidden(self, batch_size=None):
        """Return zero-filled (h0, c0), each of shape (1, batch, hidden_dim).

        Falls back to the configured batch size when ``batch_size`` is None
        and places the tensors on the GPU when CUDA is available.
        """
        n = self.batch_size if batch_size is None else batch_size
        h0 = torch.zeros(1, n, self.hidden_dim)
        c0 = torch.zeros(1, n, self.hidden_dim)
        if self.use_gpu:
            h0, c0 = h0.cuda(), c0.cuda()
        return (Variable(h0), Variable(c0))

    def forward(self, sentence):
        """Map token ids of shape (batch, seq_len) to class scores."""
        embedded = self.word_embeddings(sentence)      # (batch, seq, emb)
        seq_first = embedded.permute(1, 0, 2)          # (seq, batch, emb)
        # Fresh zero state sized to the actual batch.
        self.hidden = self.init_hidden(sentence.size()[0])
        lstm_out, self.hidden = self.lstm(seq_first, self.hidden)

        if self.mean == "mean":
            # Average over the time axis.
            final = torch.mean(lstm_out.permute(1, 0, 2), 1)
        else:
            # Output at the last time step.
            final = lstm_out[-1]
        return self.hidden2label(final)
52 | # def forward1(self, sentence):
53 | #
54 | # return torch.zeros(sentence.size()[0], self.opt.label_size)
55 | ## def __call__(self, **args):
56 | ## self.forward(args)
57 | # def test():
58 | #
59 | # import numpy as np
60 | #
61 | # word_embeddings = nn.Embedding(10000, 300)
62 | # lstm = nn.LSTM(300, 100)
63 | # h0 = Variable(torch.zeros(1, 128, 100))
64 | # c0 = Variable(torch.zeros(1, 128, 100))
65 | # hidden=(h0, c0)
66 | # sentence = Variable(torch.LongTensor(np.zeros((128,30),dtype=np.int64)))
67 | # embeds = word_embeddings(sentence)
68 | # torch.tile(sentence)
69 | # sentence.size()[0]
70 | #
71 | #
72 | #
73 | ## x= Variable(torch.zeros(30, 128, 300))
74 | # x = embeds.view(sentence.size()[1], self.batch_size, -1)
75 | # embeds=embeds.permute(1,0,2)
76 | # lstm_out, hidden = lstm(embeds, hidden)
77 | ##
--------------------------------------------------------------------------------
/dataloader/sst.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .Dataset import Dataset
4 | import os
5 | import pandas as pd
6 | import numpy as np
7 | from codecs import open
8 |
class SSTDataset(Dataset):
    """Dataset wrapper that writes tab-separated train/test CSV files.

    NOTE(review): the download URL points at the Stanford Sentiment Treebank
    archive, yet process() reads ``rt-polaritydata/rt-polarity.{neg,pos}``,
    the same layout the MR dataset uses -- confirm the archive really
    contains these files.
    """

    def __init__(self, opt=None, **kwargs):
        super(SSTDataset, self).__init__(opt, **kwargs)
        self.urls = ['http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip']

    def process(self):
        """Download the data, shuffle it, split 80/20 and save both parts.

        Returns:
            [train_filename, test_filename], both located in self.saved_path.
        """
        root = os.path.join(self.download(), "rt-polaritydata")
        print("processing into: "+ root)
        if not os.path.exists(self.saved_path):
            print("mkdir " + self.saved_path)
            os.makedirs(self.saved_path)    # creates intermediate dirs too

        # One DataFrame per polarity file; label 1 = positive, 0 = negative.
        datas = []
        for polarity in ("neg", "pos"):
            label = 1 if polarity == "pos" else 0
            filename = os.path.join(root, "rt-polarity." + polarity)
            records = []
            with open(filename, encoding="utf-8", errors="replace") as f:
                for i, line in enumerate(f):
                    # NOTE(review): echoes every line to stdout; looks like
                    # leftover debugging output.
                    print(i)
                    print(line)
                    records.append({"text": line.strip(), "label": label})
            datas.append(pd.DataFrame(records))

        df = pd.concat(datas)
        from sklearn.utils import shuffle
        df = shuffle(df).reset_index()

        # First 80% of the shuffled rows go to train, the rest to test.
        n_train = int(len(df) * 0.8)
        split_index = [True] * n_train + [False] * (len(df) - n_train)
        train = df[split_index]
        test = df[~np.array(split_index)]

        train_filename = os.path.join(self.saved_path, "train.csv")
        test_filename = os.path.join(self.saved_path, "test.csv")
        columns = ["text", "label"]
        train[columns].to_csv(train_filename, encoding="utf-8", sep="\t", index=False, header=None)
        test[columns].to_csv(test_filename, encoding="utf-8", sep="\t", index=False, header=None)

        print("processing into formated files over")

        return [train_filename, test_filename]
70 |
if __name__ == "__main__":
    # Manual entry point: build the SST dataset from the parsed options and
    # run its preprocessing once.
    import opts
    opt = opts.parse_opt()
    opt.dataset = "sst"
    import dataloader
    dataloader.getDataset(opt).process()
78 |
79 |
80 |
--------------------------------------------------------------------------------
/dataloader/ag.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .Dataset import Dataset
4 | import os
5 | import pandas as pd
6 | import numpy as np
7 | from codecs import open
8 |
class AGDataset(Dataset):
    """Dataset wrapper for the AG news corpus (conversion not implemented)."""

    def __init__(self, opt=None, **kwargs):
        super(AGDataset, self).__init__(opt, **kwargs)
        self.urls = ['http://www.di.unipi.it/~gulli/newsSpace.bz2']

    def process(self):
        """Download the archive; no train/test files are produced yet.

        Returns None, unlike the MR/SST datasets which return the paths of
        the files they write.
        """
        # TODO: parse the downloaded data and write train.csv / test.csv
        # (tab-separated "text", "label") into self.saved_path, following the
        # shuffle + 80/20 split done by MRDataset.process.
        self.download()
        print("processing into formated files over")
70 |
if __name__ == "__main__":
    # Manual entry point: build the AG dataset from the parsed options and
    # run its preprocessing once.
    import opts
    opt = opts.parse_opt()
    opt.dataset = "ag"
    import dataloader
    dataloader.getDataset(opt).process()
78 |
79 |
80 |
--------------------------------------------------------------------------------
/dataloader/mr.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .Dataset import Dataset
4 | import os
5 | import pandas as pd
6 | import numpy as np
7 | from codecs import open
8 |
class MRDataset(Dataset):
    """Movie-review polarity dataset (rt-polaritydata) converted to CSV."""

    def __init__(self, opt=None, **kwargs):
        super(MRDataset, self).__init__(opt, **kwargs)
        self.urls = ['https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz']

    def process(self):
        """Download the data, shuffle it, split 80/20 and save both parts.

        Returns:
            [train_filename, test_filename], both located in self.saved_path.
        """
        root = os.path.join(self.download(), "rt-polaritydata")
        print("processing into: "+ root)
        if not os.path.exists(self.saved_path):
            print("mkdir " + self.saved_path)
            os.makedirs(self.saved_path)    # creates intermediate dirs too

        # One DataFrame per polarity file; label 1 = positive, 0 = negative.
        datas = []
        for polarity in ("neg", "pos"):
            label = 1 if polarity == "pos" else 0
            filename = os.path.join(root, "rt-polarity." + polarity)
            records = []
            with open(filename, encoding="utf-8", errors="replace") as f:
                for i, line in enumerate(f):
                    # NOTE(review): echoes every line to stdout; looks like
                    # leftover debugging output.
                    print(i)
                    print(line)
                    records.append({"text": line.strip(), "label": label})
            datas.append(pd.DataFrame(records))

        df = pd.concat(datas)
        from sklearn.utils import shuffle
        df = shuffle(df).reset_index()

        # First 80% of the shuffled rows go to train, the rest to test.
        n_train = int(len(df) * 0.8)
        split_index = [True] * n_train + [False] * (len(df) - n_train)
        train = df[split_index]
        test = df[~np.array(split_index)]

        train_filename = os.path.join(self.saved_path, "train.csv")
        test_filename = os.path.join(self.saved_path, "test.csv")
        columns = ["text", "label"]
        train[columns].to_csv(train_filename, encoding="utf-8", sep="\t", index=False, header=None)
        test[columns].to_csv(test_filename, encoding="utf-8", sep="\t", index=False, header=None)

        print("processing into formated files over")

        return [train_filename, test_filename]
70 |
if __name__ == "__main__":
    # Manual entry point: build the MR dataset from the parsed options and
    # run its preprocessing once.
    import opts
    opt = opts.parse_opt()
    opt.dataset = "mr"
    import dataloader
    dataloader.getDataset(opt).process()
78 |
79 |
80 |
--------------------------------------------------------------------------------
/models/RCNN.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | import torch
4 | from torch.autograd import Variable
5 | #from memory_profiler import profile
6 |
7 | """
8 | Lai S, Xu L, Liu K, et al. Recurrent Convolutional Neural Networks for Text Classification[C]//AAAI. 2015, 333: 2267-2273.
9 | """
10 |
class RCNN(nn.Module):
    """Recurrent-convolutional text classifier built on a bidirectional LSTM.

    Each token is represented by its forward context, its embedding and its
    backward context; the tanh of that representation is max-pooled along the
    sequence and the last pooled position is classified.

    ``opt`` must provide: hidden_dim, batch_size, vocab_size, embedding_dim,
    embeddings, embedding_training, keep_dropout and label_size.
    """

    def __init__(self, opt):
        super(RCNN, self).__init__()
        self.opt = opt
        self.hidden_dim = opt.hidden_dim
        self.batch_size = opt.batch_size
        self.use_gpu = torch.cuda.is_available()

        # Embedding table whose weight is replaced by the supplied tensor.
        self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        self.word_embeddings.weight = nn.Parameter(
            opt.embeddings, requires_grad=opt.embedding_training)

        self.num_layers = 1
        self.dropout = opt.keep_dropout
        self.bilstm = nn.LSTM(
            input_size=opt.embedding_dim,
            hidden_size=opt.hidden_dim // 2,
            num_layers=self.num_layers,
            dropout=self.dropout,
            bidirectional=True,
        )

        self.hidden = self.init_hidden()

        self.max_pooling = nn.MaxPool1d(kernel_size=3, stride=2)

        self.content_dim = 256
        # Input width = both LSTM directions plus the raw embedding.
        feature_dim = 2 * opt.hidden_dim // 2 + opt.embedding_dim
        self.hidden2label = nn.Linear(feature_dim, opt.label_size)

    def init_hidden(self, batch_size=None):
        """Return zero (h0, c0) of shape (2 * num_layers, batch, hidden_dim // 2).

        Falls back to the configured batch size when ``batch_size`` is None
        and places the tensors on the GPU when CUDA is available.
        """
        n = self.batch_size if batch_size is None else batch_size
        h0 = torch.zeros(2 * self.num_layers, n, self.hidden_dim // 2)
        c0 = torch.zeros(2 * self.num_layers, n, self.hidden_dim // 2)
        if self.use_gpu:
            h0, c0 = h0.cuda(), c0.cuda()
        return (Variable(h0), Variable(c0))

    def forward(self, sentence):
        """Map token ids of shape (batch, seq_len) to class scores."""
        embeds = self.word_embeddings(sentence)            # (batch, seq, emb)
        # Fresh zero state sized to the actual batch.
        self.hidden = self.init_hidden(sentence.size()[0])
        lstm_out, self.hidden = self.bilstm(embeds.permute(1, 0, 2), self.hidden)

        context = lstm_out.permute(1, 0, 2)                # (batch, seq, hidden)
        half = int(context.size()[2] / 2)
        left = context[:, :, 0:half]       # forward-direction features
        right = context[:, :, half:]       # backward-direction features
        xi = torch.cat((left, embeds, right), 2)
        yi = torch.tanh(xi.permute(0, 2, 1))               # (batch, feat, seq)

        # NOTE(review): only the last pooled window is classified, not a max
        # over the whole sequence -- confirm this is intended.
        pooled = self.max_pooling(yi).permute(2, 0, 1)     # (windows, batch, feat)
        return self.hidden2label(pooled[-1])
--------------------------------------------------------------------------------
/models/MLP.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.init as init
7 | from torch.autograd import Variable
8 |
9 | # https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py
10 |
def position_encoding(sentence_size, embedding_dim):
    """Return a float32 position-encoding matrix of shape (sentence_size, embedding_dim).

    Entry (j, i), counted from 1, equals
    1 + 4 * (i - (embedding_dim + 1) / 2) * (j - (sentence_size + 1) / 2)
          / embedding_dim / sentence_size,
    except for the last sentence position, whose row is all ones.
    """
    # Centred 1-based offsets along the embedding and sentence axes.
    dim_offsets = np.arange(1, embedding_dim + 1) - (embedding_dim + 1) / 2
    pos_offsets = np.arange(1, sentence_size + 1) - (sentence_size + 1) / 2
    # The outer product gives every (embedding index, position) pair at once.
    encoding = np.outer(dim_offsets, pos_offsets).astype(np.float32)
    encoding = 1 + 4 * encoding / embedding_dim / sentence_size
    # Make position encoding of time words identity to avoid modifying them
    encoding[:, -1] = 1.0
    return np.transpose(encoding)
22 |
class AttrProxy(object):
    """
    Expose numbered attributes of an object through index syntax.

    ``proxy[i]`` returns ``getattr(module, prefix + str(i))``, which lets a
    set of sub-modules registered under names such as ``C_0``, ``C_1`` be
    used like a list.
    see https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/2
    """

    def __init__(self, module, prefix):
        self.module = module
        self.prefix = prefix

    def __getitem__(self, i):
        # Build the attribute name from the prefix and the index.
        attr_name = self.prefix + str(i)
        return getattr(self.module, attr_name)
35 |
36 |
class MemN2N(nn.Module):
    """End-to-end memory network with ``max_hops`` attention hops.

    ``opt`` is a mapping with the keys use_cuda, num_vocab, embedding_dim,
    sentence_size and max_hops. ``max_hops + 1`` embedding tables are
    created; hop ``k`` uses table ``k`` for addressing and table ``k + 1``
    for the output memory.
    """

    def __init__(self, opt):
        super(MemN2N, self).__init__()

        use_cuda = opt["use_cuda"]
        num_vocab = opt["num_vocab"]
        embedding_dim = opt["embedding_dim"]
        sentence_size = opt["sentence_size"]
        self.max_hops = opt["max_hops"]

        # Register the tables as C_0 ... C_{max_hops}; AttrProxy then lets
        # them be addressed as self.C[k].
        for hop in range(self.max_hops + 1):
            table = nn.Embedding(num_vocab, embedding_dim, padding_idx=0)
            table.weight.data.normal_(0, 0.1)
            self.add_module("C_{}".format(hop), table)
        self.C = AttrProxy(self, "C_")

        self.softmax = nn.Softmax()
        # Fixed (non-trainable) position-encoding weights.
        weights = torch.FloatTensor(position_encoding(sentence_size, embedding_dim))
        self.encoding = Variable(weights, requires_grad=False)

        if use_cuda:
            self.encoding = self.encoding.cuda()

    def forward(self, story, query):
        """Run all hops and return (scores, softmax(scores)) over the vocabulary.

        Assumes story is (batch, memories, sentence_size) token ids and query
        is (batch, sentence_size) -- TODO confirm against the caller.
        """
        story_size = story.size()

        # Initial controller state: position-weighted sum of query embeddings.
        query_embed = self.C[0](query)
        query_weights = self.encoding.unsqueeze(0).expand_as(query_embed)
        u = [torch.sum(query_embed * query_weights, 1)]

        for hop in range(self.max_hops):
            flat_story = story.view(story.size(0), -1)

            # Addressing memory for this hop.
            embed_A = self.C[hop](flat_story)
            embed_A = embed_A.view(story_size + (embed_A.size(-1),))
            encoding = self.encoding.unsqueeze(0).unsqueeze(1).expand_as(embed_A)
            m_A = torch.sum(embed_A * encoding, 2)

            # Match the current state against every memory slot.
            u_temp = u[-1].unsqueeze(1).expand_as(m_A)
            prob = self.softmax(torch.sum(m_A * u_temp, 2))

            # Output memory comes from the next table.
            embed_C = self.C[hop + 1](flat_story)
            embed_C = embed_C.view(story_size + (embed_C.size(-1),))
            m_C = torch.sum(embed_C * encoding, 2)

            prob = prob.unsqueeze(2).expand_as(m_C)
            o_k = torch.sum(m_C * prob, 1)

            u.append(u[-1] + o_k)

        # Score every vocabulary entry with the last table's weights.
        a_hat = u[-1] @ self.C[self.max_hops].weight.transpose(0, 1)
        return a_hat, self.softmax(a_hat)
--------------------------------------------------------------------------------
/models/MemoryNetwork.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.init as init
7 | from torch.autograd import Variable
8 |
def position_encoding(sentence_size, embedding_dim):
    """Return the MemN2N position-encoding weights.

    The weight for 1-based embedding index ``i`` and sentence position ``j``
    is ``1 + 4 * (i - (embedding_dim + 1) / 2) * (j - (sentence_size + 1) / 2)
    / embedding_dim / sentence_size``.  The last sentence position is then
    set to 1.0 for every embedding index.

    Returns:
        A float32 numpy array of shape ``(sentence_size, embedding_dim)``.
    """
    half_dim = (embedding_dim + 1) / 2
    half_len = (sentence_size + 1) / 2
    # Column vector of embedding offsets times row vector of position offsets
    # gives the same products as the nested loop over (i, j).
    products = np.outer(np.arange(1, embedding_dim + 1) - half_dim,
                        np.arange(1, sentence_size + 1) - half_len)

    table = np.ones((embedding_dim, sentence_size), dtype=np.float32)
    table[...] = products
    table = 1 + 4 * table / embedding_dim / sentence_size
    # Make position encoding of time words identity to avoid modifying them
    table[:, -1] = 1.0
    return np.transpose(table)
20 |
class AttrProxy(object):
    """
    Maps ``proxy[i]`` onto ``getattr(module, prefix + str(i))``.

    Used so that a set of numbered sub-modules stored as attributes of an
    nn.Module can be indexed like a list; see
    https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/2
    """

    def __init__(self, module, prefix):
        self.module, self.prefix = module, prefix

    def __getitem__(self, i):
        target = self.prefix + str(i)
        return getattr(self.module, target)
33 |
34 |
class MemN2N(nn.Module):
    """Memory network adapted for text classification (query doubles as story).

    ``settings`` is a mapping providing ``use_cuda``, ``num_vocab``,
    ``embedding_dim``, ``sentence_size`` and ``max_hops``.  ``max_hops + 1``
    embedding tables ``C_0 .. C_max_hops`` are registered on the module.
    """

    def __init__(self, settings):
        super(MemN2N, self).__init__()

        use_cuda = settings["use_cuda"]
        num_vocab = settings["num_vocab"]
        embedding_dim = settings["embedding_dim"]
        sentence_size = settings["sentence_size"]
        self.max_hops = settings["max_hops"]

        for hop in range(self.max_hops + 1):
            C = nn.Embedding(num_vocab, embedding_dim, padding_idx=0)
            C.weight.data.normal_(0, 0.1)
            self.add_module("C_{}".format(hop), C)
        # Index-style access (self.C[k]) to the registered C_k modules.
        self.C = AttrProxy(self, "C_")

        # Both softmax inputs in forward() are 2-D, so normalise over dim 1
        # explicitly instead of relying on the deprecated implicit choice.
        self.softmax = nn.Softmax(dim=1)
        self.encoding = Variable(torch.FloatTensor(
            position_encoding(sentence_size, embedding_dim)), requires_grad=False)

        if use_cuda:
            self.encoding = self.encoding.cuda()

    def forward(self, query):
        """Run the memory hops using the input text as its own single memory.

        Args:
            query: index tensor treated as (batch, sentence_size).

        Returns:
            ``(a_hat, softmax(a_hat))`` where ``a_hat`` has one score per row
            of the last embedding table.
        """
        # For text classification the text itself is the story.  The hop loop
        # below needs a (batch, memory, sentence) layout -- it expands a 4-D
        # encoding onto the embedded story -- so add a singleton memory axis.
        # Aliasing the 2-D query directly made expand_as() raise on hop 0.
        # NOTE(review): with a single memory slot the attention weight is
        # always 1; confirm this is the intended adaptation.
        story = query.unsqueeze(1)

        story_size = story.size()

        u = list()
        query_embed = self.C[0](query)
        # Weighted sum over the sentence axis (position-encoded bag of words).
        encoding = self.encoding.unsqueeze(0).expand_as(query_embed)
        u.append(torch.sum(query_embed * encoding, 1))

        for hop in range(self.max_hops):
            # Flatten memories so the embedding sees a 2-D index tensor,
            # then restore the (batch, memory, sentence, dim) layout.
            embed_A = self.C[hop](story.view(story.size(0), -1))
            embed_A = embed_A.view(story_size + (embed_A.size(-1),))

            encoding = self.encoding.unsqueeze(0).unsqueeze(1).expand_as(embed_A)
            m_A = torch.sum(embed_A * encoding, 2)

            # Attention of the current controller state over the memories.
            u_temp = u[-1].unsqueeze(1).expand_as(m_A)
            prob = self.softmax(torch.sum(m_A * u_temp, 2))

            embed_C = self.C[hop + 1](story.view(story.size(0), -1))
            embed_C = embed_C.view(story_size + (embed_C.size(-1),))
            m_C = torch.sum(embed_C * encoding, 2)

            prob = prob.unsqueeze(2).expand_as(m_C)
            o_k = torch.sum(m_C * prob, 1)

            u_k = u[-1] + o_k
            u.append(u_k)

        # Score against the transposed weights of the last embedding table.
        a_hat = u[-1] @ self.C[self.max_hops].weight.transpose(0, 1)
        return a_hat, self.softmax(a_hat)
--------------------------------------------------------------------------------
/models/CNNInception.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import torch as t
5 | import torch
6 | import numpy as np
7 | from torch import nn
8 | from collections import OrderedDict
9 |
class Inception(nn.Module):
    """1-D inception block with four parallel convolution branches.

    Every branch maps ``cin`` channels to ``co / 4`` channels with padding
    chosen so the sequence length is unchanged.  The branch outputs are
    concatenated on the channel axis and passed through an optional
    BatchNorm1d and ReLU.

    Args:
        cin: number of input channels.
        co: number of output channels; must be divisible by 4.
        relu: append a ReLU after the concatenation.
        norm: append a BatchNorm1d after the concatenation.
    """

    def __init__(self, cin, co, relu=True, norm=True):
        super(Inception, self).__init__()
        assert(co%4==0)
        width = int(co / 4)
        cos = [width, width, width, width]

        # Post-concatenation stage; stays an empty Sequential when both
        # flags are off.
        self.activa = nn.Sequential()
        if norm:
            self.activa.add_module('norm', nn.BatchNorm1d(co))
        if relu:
            self.activa.add_module('relu', nn.ReLU(True))

        # Branch 1: a single 1x1 convolution.
        layers1 = [('conv1', nn.Conv1d(cin, cos[0], 1, stride=1))]
        self.branch1 = nn.Sequential(OrderedDict(layers1))

        # Branch 2: 1x1 convolution followed by a width-3 convolution.
        layers2 = [
            ('conv1', nn.Conv1d(cin, cos[1], 1)),
            ('norm1', nn.BatchNorm1d(cos[1])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[1], cos[1], 3, stride=1, padding=1)),
        ]
        self.branch2 = nn.Sequential(OrderedDict(layers2))

        # Branch 3: width-3 convolution followed by a width-5 convolution.
        layers3 = [
            ('conv1', nn.Conv1d(cin, cos[2], 3, padding=1)),
            ('norm1', nn.BatchNorm1d(cos[2])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[2], cos[2], 5, stride=1, padding=2)),
        ]
        self.branch3 = nn.Sequential(OrderedDict(layers3))

        # Branch 4: a single width-3 convolution.
        layers4 = [('conv3', nn.Conv1d(cin, cos[3], 3, stride=1, padding=1))]
        self.branch4 = nn.Sequential(OrderedDict(layers4))

    def forward(self, x):
        """Apply all branches to ``x`` and merge them on the channel axis."""
        outputs = (
            self.branch1(x),
            self.branch2(x),
            self.branch3(x),
            self.branch4(x),
        )
        merged = torch.cat(outputs, 1)
        return self.activa(merged)
class InceptionCNN(nn.Module):
    """Text classifier: embedding -> two Inception blocks -> max-pool -> MLP.

    Reads ``vocab_size``, ``embedding_dim``, ``max_seq_len``, ``label_size``
    and ``embedding_type`` from ``opt``; ``inception_dim`` (default 512),
    ``linear_hidden_size`` (default 2000) and ``embeddings`` are optional.
    """

    def __init__(self, opt):
        super(InceptionCNN, self).__init__()
        incept_dim = getattr(opt, "inception_dim", 512)
        hidden_size = getattr(opt, "linear_hidden_size", 2000)
        self.model_name = 'CNNText_inception'
        self.opt = opt
        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)

        # Both Inception blocks keep the sequence length; the final pool uses
        # a window of max_seq_len.
        # NOTE(review): the flatten in forward() only yields incept_dim
        # features when the pooled length is 1 -- confirm inputs are padded
        # to max_seq_len.
        self.content_conv = nn.Sequential(
            Inception(opt.embedding_dim, incept_dim),
            Inception(incept_dim, incept_dim),
            nn.MaxPool1d(opt.max_seq_len)
        )
        self.fc = nn.Sequential(
            nn.Linear(incept_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, opt.label_size)
        )
        if opt.__dict__.get("embeddings", None) is not None:
            self.encoder.weight = nn.Parameter(opt.embeddings)

    def forward(self, content):
        """Return the output of the final linear layer for ``content``."""
        content = self.encoder(content)
        if self.opt.embedding_type == "static":
            # detach() takes no argument; the previous detach(0) raised a
            # TypeError whenever static embeddings were requested.
            content = content.detach()

        # Conv1d expects channels first: (batch, embedding_dim, seq_len).
        content_out = self.content_conv(content.permute(0, 2, 1))
        out = content_out.view(content_out.size(0), -1)
        out = self.fc(out)
        return out
77 |
if __name__ == '__main__':
    # Smoke test: build the model from the default options and push one
    # batch of token ids through it.
    import sys
    sys.path.append(r"..")
    import opts
    opt = opts.parse_opt()
    opt.vocab_size = 2501
    opt.label_size = 3
    # The class defined in this module is InceptionCNN; the previous name
    # (CNNText_inception) does not exist here and raised a NameError.
    m = InceptionCNN(opt)

    content = t.autograd.Variable(t.arange(0, 2500).view(10, 250)).long()
    o = m(content)
    print(o.size())
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
--------------------------------------------------------------------------------
/models/CNN_Inception.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | import torch as t
5 | import torch
6 | import numpy as np
7 | from torch import nn
8 | from collections import OrderedDict
9 |
class Inception(nn.Module):
    """1-D inception block with four parallel convolution branches.

    Every branch maps ``cin`` channels to ``co // 4`` channels with padding
    chosen so the sequence length is unchanged.  The branch outputs are
    concatenated on the channel axis and passed through an optional
    BatchNorm1d and ReLU.

    Args:
        cin: number of input channels.
        co: number of output channels; must be divisible by 4.
        relu: append a ReLU after the concatenation.
        norm: append a BatchNorm1d after the concatenation.
    """

    def __init__(self, cin, co, relu=True, norm=True):
        super(Inception, self).__init__()
        assert(co%4==0)
        # Integer division: co/4 is a float under Python 3 and nn.Conv1d
        # rejects non-integer channel counts.
        cos = [co // 4] * 4
        self.activa = nn.Sequential()
        if norm:
            self.activa.add_module('norm', nn.BatchNorm1d(co))
        if relu:
            self.activa.add_module('relu', nn.ReLU(True))
        # Branch 1: a single 1x1 convolution.
        self.branch1 = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin, cos[0], 1, stride=1)),
        ]))
        # Branch 2: 1x1 convolution followed by a width-3 convolution.
        self.branch2 = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin, cos[1], 1)),
            ('norm1', nn.BatchNorm1d(cos[1])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[1], cos[1], 3, stride=1, padding=1)),
        ]))
        # Branch 3: width-3 convolution followed by a width-5 convolution.
        self.branch3 = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin, cos[2], 3, padding=1)),
            ('norm1', nn.BatchNorm1d(cos[2])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[2], cos[2], 5, stride=1, padding=2)),
        ]))
        # Branch 4: a single width-3 convolution.
        self.branch4 = nn.Sequential(OrderedDict([
            ('conv3', nn.Conv1d(cin, cos[3], 3, stride=1, padding=1)),
        ]))

    def forward(self, x):
        """Apply all branches to ``x`` and merge them on the channel axis."""
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)
        result = self.activa(torch.cat((branch1, branch2, branch3, branch4), 1))
        return result
class CNNText_inception(nn.Module):
    """Text classifier: embedding -> two Inception blocks -> max-pool -> MLP.

    Reads ``vocab_size``, ``embedding_dim``, ``max_seq_len`` and
    ``label_size`` from ``opt``; ``inception_dim`` (default 512),
    ``linear_hidden_size`` (default 2000), ``embeddings`` and ``static`` /
    ``embedding_type`` are optional.
    """

    def __init__(self, opt):
        super(CNNText_inception, self).__init__()
        incept_dim = getattr(opt, "inception_dim", 512)
        hidden_size = getattr(opt, "linear_hidden_size", 2000)
        self.model_name = 'CNNText_inception'
        self.opt = opt
        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)

        # Both Inception blocks keep the sequence length; the final pool uses
        # a window of max_seq_len.
        # NOTE(review): the flatten in forward() only yields incept_dim
        # features when the pooled length is 1 -- confirm inputs are padded
        # to max_seq_len.
        self.content_conv = nn.Sequential(
            Inception(opt.embedding_dim, incept_dim),
            Inception(incept_dim, incept_dim),
            nn.MaxPool1d(opt.max_seq_len)
        )
        self.fc = nn.Sequential(
            nn.Linear(incept_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, opt.label_size)
        )
        pretrained = opt.__dict__.get("embeddings", None)
        if pretrained is not None:
            print('load embedding')
            # Accept a numpy array (original behaviour) or an existing tensor.
            if isinstance(pretrained, np.ndarray):
                pretrained = t.from_numpy(pretrained)
            self.encoder.weight.data.copy_(pretrained)

    def forward(self, content):
        """Return the output of the final linear layer for ``content``."""
        content = self.encoder(content)
        # Honour an explicit ``static`` flag when present; otherwise fall back
        # to ``embedding_type == "static"`` so options without a ``static``
        # attribute no longer raise AttributeError.
        static = getattr(self.opt, "static",
                         getattr(self.opt, "embedding_type", None) == "static")
        if static:
            # detach() takes no argument; the previous detach(0) raised.
            content = content.detach()

        # Conv1d expects channels first: (batch, embedding_dim, seq_len).
        content_out = self.content_conv(content.permute(0, 2, 1))
        out = content_out.view(content_out.size(0), -1)
        out = self.fc(out)
        return out
78 |
if __name__ == '__main__':
    # Smoke test: build the model from the default options and push one
    # batch of 10 sequences of 250 token ids through it.
    import sys
    sys.path.append(r"..")
    import opts

    opt = opts.parse_opt()
    opt.vocab_size = 2501
    opt.label_size = 3
    m = CNNText_inception(opt)

    token_ids = t.arange(0, 2500).view(10, 250)
    content = t.autograd.Variable(token_ids).long()
    o = m(content)
    print(o.size())
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
--------------------------------------------------------------------------------
/parameter_search.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from six.moves import cPickle
10 | import time,os,random
11 | import itertools
12 |
13 | import torch
14 | from torch.autograd import Variable
15 | import torch.optim as optim
16 | import torch.nn as nn
17 | import torch.nn.functional as F
18 | from torch.nn.modules.loss import NLLLoss,MultiLabelSoftMarginLoss,MultiLabelMarginLoss,BCELoss
19 |
20 | import opts
21 | import models
22 | import utils
23 |
24 |
# Local-time stamp (second resolution) used to name this run's result file.
timeArray = time.localtime(int(time.time()) )
timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray)
performance_log_file =timeStamp+ "result.csv"


# Options and data iterators are created once at import time and shared by
# every train() call below.
opt = opts.parse_opt()
train_iter, test_iter = utils.loadData(opt)
32 |
33 |
def train(opt,train_iter, test_iter,verbose=True):
    """Train one model configuration and record its best test score.

    Builds the model described by ``opt``, trains it for ``opt.max_epoch``
    passes over ``train_iter``, evaluates on ``test_iter`` after each pass and
    writes the best score into ``performance_log_file`` (row: the option
    summary string, column: ``opt.dataset``).

    Args:
        opt: option namespace; reads ``optimizer``, ``learning_rate``,
            ``lr_scheduler``, ``max_epoch``, ``from_torchtext``,
            ``grad_clip`` and ``dataset``.
        train_iter: iterable of batches exposing ``text`` and ``label``.
        test_iter: iterable handed to ``utils.evaluation``.
        verbose: log per-batch loss and per-pass score when true.
    """
    global_start = time.time()
    logger = utils.getLogger()
    model = models.setup(opt)
    if torch.cuda.is_available():
        model.cuda()
    params = [param for param in model.parameters() if param.requires_grad]

    model_info = "; ".join(
        [str(k) + " : " + str(v) for k, v in opt.__dict__.items()
         if type(v) in (str, int, float, list, bool)])
    logger.info("# parameters:" + str(sum(param.numel() for param in params)))
    logger.info(model_info)

    optimizer = utils.getOptimizer(params, name=opt.optimizer, lr=opt.learning_rate, scheduler=opt.lr_scheduler)
    loss_fun = F.cross_entropy

    percisions = []
    # NOTE: in the log messages the outer index is called "iteration" and the
    # inner batch index "epoch"; the strings are kept as they were.
    for i in range(opt.max_epoch):
        # Re-enter training mode every pass in case evaluation switched the
        # model to eval mode -- TODO confirm what utils.evaluation does.
        model.train()
        for epoch, batch in enumerate(train_iter):
            start = time.time()

            text = batch.text[0] if opt.from_torchtext else batch.text

            # Clear gradients for every batch; previously zero_grad() ran
            # once before the loop so gradients accumulated across batches.
            optimizer.zero_grad()
            predicted = model(text)

            loss = loss_fun(predicted, batch.label)

            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()

            if verbose:
                # reshape(-1)[0] works whether the loss is 0-dim or has one
                # element, and .cpu() is a no-op for CPU tensors.
                loss_value = float(loss.data.cpu().numpy().reshape(-1)[0])
                logger.info("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i, epoch, loss_value, time.time() - start))

        percision = utils.evaluation(model, test_iter, opt.from_torchtext)
        percisions.append(percision)
        if verbose:
            logger.info("%d iteration with percision %.4f" % (i, percision))

    # max() of an empty list raises; report NaN when no pass was run.
    best = max(percisions) if percisions else float("nan")

    df = pd.read_csv(performance_log_file, index_col=0, sep="\t")
    df.loc[model_info, opt.dataset] = best
    df.to_csv(performance_log_file, sep="\t")
    summary = model_info + " with time :" + str(time.time() - global_start) + " ->" + str(best)
    logger.info(summary)
    print(summary)
83 |
if __name__=="__main__":

    # Create the result table with its header on first use.
    if not os.path.exists(performance_log_file):
        with open(performance_log_file, "w") as f:
            f.write("argument\n")
    print("gpu : %d" % opt.gpu)

    # Grid of option values to search; every combination is one run.
    # NOTE(review): train_iter/test_iter were built before these overrides,
    # so changing batch_size here may not affect the iterators -- confirm
    # against utils.loadData.
    parameter_pools = {
        "model": ["lstm", "cnn", "kim_cnn", "fasttext"],
        "keep_dropout": [0.1, 0.5, 0.8, 0.9, 1.0],
        "batch_size": [32, 64, 128],
        "learning_rate": [100, 10, 1, 1e-1, 1e-2, 1e-3],
        "optimizer": ["adam"],
        "lr_scheduler": [None]
    }

    pool = list(itertools.product(*parameter_pools.values()))
    # random.shuffle works in place and returns None, so its result must not
    # be assigned.  A fixed seed gives every worker process the same order,
    # which keeps the modulo split below a true partition.
    random.Random(0).shuffle(pool)
    args = [arg for i, arg in enumerate(pool) if i % 8 == opt.gpu]

    for arg in args:
        for k, v in zip(parameter_pools.keys(), arg):
            setattr(opt, k, v)
        train(opt, train_iter, test_iter, verbose=True)
110 |
--------------------------------------------------------------------------------
/dataloader/Dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os,urllib
class Dataset(object):
    """Base class for datasets that are downloaded and then pre-processed.

    Subclasses are expected to fill ``urls`` and override ``process``.
    Attributes set here: ``name``, ``dirname``, ``http_proxy`` (the string
    ``"null"`` means no proxy), ``urls``, ``root``, ``saved_path`` and
    ``formated_files``.
    """

    def __init__(self, opt=None):
        if opt is not None:
            self.setup(opt)
            self.http_proxy = opt.__dict__.get("proxy", "null")
        else:
            self.name = "demo"
            self.dirname = "demo"
            self.http_proxy = "null"

        self.urls = []
        self.root = ".data"
        self.saved_path = os.path.join(os.path.join(self.root, "clean"), self.name)
        self.formated_files = None

    def setup(self, opt):
        """Copy the dataset name and proxy setting from ``opt``."""
        self.name = opt.dataset
        self.dirname = opt.dataset
        self.http_proxy = opt.__dict__.get("proxy", "null")

    def process(self):
        """Download the raw data, then fail: subclasses must override this.

        Raises:
            NotImplementedError: always, after the download step.
        """
        dirname = self.download()
        print("processing dirname: " + dirname)
        # The message used to reference an undefined ``opt`` and therefore
        # raised NameError instead of the intended error.
        raise NotImplementedError(
            "method in father class have been called in processing: {} dataset".format(self.name))

    def getFormatedData(self):
        """Return the processed files, running ``process`` only if needed."""
        if self.formated_files is not None:
            return self.formated_files

        if os.path.exists(self.saved_path):
            return [os.path.join(self.saved_path, filename) for filename in os.listdir(self.saved_path)]
        self.formated_files = self.process()
        return self.formated_files

    def download_from_url(self, url, path, schedule=None):
        """Fetch ``url`` into the file ``path`` and return ``path``.

        Args:
            url: address to download.
            path: destination file name.
            schedule: optional progress callback, passed as ``reporthook``.
        """
        # ``import urllib`` alone does not load the ``request`` submodule, so
        # import the needed names explicitly (with a Python 2 fallback).
        try:
            from urllib.request import (
                ProxyHandler, build_opener, install_opener, urlretrieve)
        except ImportError:  # Python 2
            from urllib2 import ProxyHandler, build_opener, install_opener
            from urllib import urlretrieve

        if self.http_proxy != "null":
            proxy = ProxyHandler({'http': self.http_proxy, 'https': self.http_proxy})
            # construct a new opener using the proxy settings
            opener = build_opener(proxy)
            # install the opener on the module level
            install_opener(opener)
            print("proxy in %s" % self.http_proxy)

        # Let download errors propagate; the old bare ``except`` hid them
        # behind a fallback to a function that does not exist.
        urlretrieve(url, path, reporthook=schedule)
        return path

    def download(self, check=None):
        """Download and unzip an online archive (.zip, .gz, .tgz or .bz2).

        Arguments:
            check (str or None): Folder whose existence indicates
                that the dataset has already been downloaded, or
                None to check the existence of root/{self.name}.

        Returns:
            dataset_path (str): Path to extracted dataset.
        """
        import zipfile, tarfile

        path = os.path.join(self.root, self.name)
        check = path if check is None else check
        if not os.path.isdir(check):
            for url in self.urls:
                if isinstance(url, tuple):
                    url, filename = url
                else:
                    filename = os.path.basename(url)
                zpath = os.path.join(path, filename)
                if not os.path.isfile(zpath):
                    if not os.path.exists(os.path.dirname(zpath)):
                        os.makedirs(os.path.dirname(zpath))
                    print('downloading {}'.format(filename))

                    self.download_from_url(url, zpath)
                ext = os.path.splitext(filename)[-1]
                if ext == '.zip':
                    with zipfile.ZipFile(zpath, 'r') as zfile:
                        print('extracting')
                        zfile.extractall(path)
                elif ext in ['.gz', '.tgz', ".bz2"]:
                    # 'r:*' lets tarfile detect the compression; the former
                    # 'r:gz' could not open .bz2 archives.
                    # NOTE(review): extractall trusts member paths; only use
                    # with archives from trusted sources.
                    with tarfile.open(zpath, 'r:*') as tar:
                        tar.extractall(path=path)
        else:
            print("%s do not need to be downloaded" % path)
        return path
103 |
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
/opts.py:
--------------------------------------------------------------------------------
1 | import argparse,os
2 | import configparser
def _coerce_config_value(current, raw):
    """Convert the ini string ``raw`` to the type of the option it replaces."""
    if isinstance(current, bool):
        return raw.strip().lower() == "true"
    if isinstance(current, int):
        return int(raw)
    if isinstance(current, float):
        return float(raw)
    return raw


def parse_opt(argv=None):
    """Parse command-line options, then apply config-file overrides.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv`` as before.

    Returns:
        The argparse namespace with ``kernel_sizes`` / ``kernel_nums`` turned
        into lists of ints, the ``debug`` / ``embedding_training`` /
        ``from_torchtext`` strings turned into bools, ``position`` derived
        from the model name, and ``proxy`` replaced by the content of
        ``proxy.config`` when that file exists.  Also sets
        ``CUDA_VISIBLE_DEVICES`` from ``--gpu`` when it is not already set.
    """
    parser = argparse.ArgumentParser()
    # Data input settings

    parser.add_argument('--config', type=str, default="no_file_exists",
                        help='ini file whose [COMMON] section overrides options')

    parser.add_argument('--hidden_dim', type=int, default=128,
                        help='hidden_dim')

    parser.add_argument('--max_seq_len', type=int, default=200,
                        help='max_seq_len')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='batch_size')
    parser.add_argument('--embedding_dim', type=int, default=100,
                        help='embedding_dim')
    parser.add_argument('--learning_rate', type=float, default=2e-5,
                        help='learning_rate')
    parser.add_argument('--grad_clip', type=float, default=1e-1,
                        help='grad_clip')

    parser.add_argument('--model', type=str, default="bilstm",
                        help='model name')

    parser.add_argument('--dataset', type=str, default="imdb",
                        help='dataset')
    # The parsed value is overwritten below based on the model name.
    parser.add_argument('--position', type=bool, default=False,
                        help='position flag (derived from --model)')

    parser.add_argument('--keep_dropout', type=float, default=0.8,
                        help='keep_dropout')
    parser.add_argument('--max_epoch', type=int, default=20,
                        help='max_epoch')
    parser.add_argument('--embedding_file', type=str, default="glove.6b.300",
                        help='glove or w2v')
    parser.add_argument('--embedding_training', type=str, default="false",
                        help='embedding_training')
    # kim CNN
    parser.add_argument('--kernel_sizes', type=str, default="1,2,3,5",
                        help='kernel_sizes')
    parser.add_argument('--kernel_nums', type=str, default="256,256,256,256",
                        help='kernel_nums')
    parser.add_argument('--embedding_type', type=str, default="non-static",
                        help='embedding_type')
    parser.add_argument('--lstm_mean', type=str, default="mean",  # last
                        help='lstm_mean')
    parser.add_argument('--lstm_layers', type=int, default=1,  # last
                        help='lstm_layers')
    parser.add_argument('--gpu', type=int, default=0,
                        help='gpu number')
    parser.add_argument('--proxy', type=str, default="null",
                        help='http://proxy.xx.com:8080')
    parser.add_argument('--debug', type=str, default="true",
                        help='debug mode, "true" or "false"')

    parser.add_argument('--embedding_dir', type=str, default=".glove/glove.6B.300d.txt",
                        help='embedding_dir')
    parser.add_argument('--from_torchtext', type=str, default="false",
                        help='from torchtext or native data loader')

    args = parser.parse_args(argv)

    if args.config != "no_file_exists":
        if os.path.exists(args.config):
            config = configparser.ConfigParser()
            config.read(args.config)
            config_common = config['COMMON']
            for key in config_common.keys():
                # configparser yields strings; convert to the type of the
                # option being overridden so numeric options stay numeric.
                args.__dict__[key] = _coerce_config_value(
                    args.__dict__.get(key), config_common[key])
        else:
            print("config file named %s does not exist" % args.config)

    args.kernel_sizes = [int(i) for i in args.kernel_sizes.split(",")]
    args.kernel_nums = [int(i) for i in args.kernel_nums.split(",")]

    if "CUDA_VISIBLE_DEVICES" not in os.environ.keys():
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    args.position = (args.model == "transformer")
    args.debug = (args.debug.lower() == "true")
    args.embedding_training = (args.embedding_training.lower() == "true")
    args.from_torchtext = (args.from_torchtext.lower() == "true")

    if os.path.exists("proxy.config"):
        with open("proxy.config") as f:
            # Strip the trailing newline so the value is usable as a URL.
            args.proxy = f.read().strip()
            print(args.proxy)

    return args
115 |
--------------------------------------------------------------------------------
/models/CNNBasic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch as t
3 | import numpy as np
4 | from torch import nn
5 |
class BasicCNN1D(nn.Module):
    """Single-layer 1-D CNN classifier.

    Pipeline: embedding -> Conv1d -> ReLU -> max-pool over the whole
    convolved sequence -> linear layer.  Reads ``vocab_size``,
    ``embedding_dim``, ``max_seq_len`` and ``label_size`` from ``opt``;
    ``content_dim`` (default 256), ``kernel_size`` (default 3) and
    ``embeddings`` are optional (``embedding_training`` is read only when
    ``embeddings`` is given).
    """

    def __init__(self, opt):
        super(BasicCNN1D, self).__init__()
        self.model_name = 'CNNText'
        self.opt = opt
        options = opt.__dict__
        self.content_dim = options.get("content_dim", 256)
        self.kernel_size = options.get("kernel_size", 3)

        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        if options.get("embeddings", None) is not None:
            self.encoder.weight = nn.Parameter(
                opt.embeddings, requires_grad=opt.embedding_training)

        # An unpadded convolution shortens the sequence by kernel_size - 1;
        # pooling over exactly that length leaves one value per channel.
        conv = nn.Conv1d(in_channels=opt.embedding_dim,
                         out_channels=self.content_dim,
                         kernel_size=self.kernel_size)
        pool_width = opt.max_seq_len - self.kernel_size + 1
        self.content_conv = nn.Sequential(
            conv,
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=pool_width),
        )
        self.fc = nn.Linear(self.content_dim, opt.label_size)

    def forward(self, content):
        """Return the output of the linear layer for the token ids ``content``."""
        embedded = self.encoder(content)
        # Conv1d expects channels first, so move the embedding axis forward.
        features = self.content_conv(embedded.permute(0, 2, 1))
        flat = features.view(features.size(0), -1)
        return self.fc(flat)
class BasicCNN2D(nn.Module):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by parallel 2-D convolutions (one per
    kernel size), max-pooling over time, dropout and a log-softmax layer.

    Reads ``embedding_dim``, ``vocab_size``, ``label_size``, ``keep_dropout``,
    ``kernel_nums`` and ``kernel_sizes`` from ``args``; ``embeddings`` and
    ``static`` / ``embedding_type`` are optional.
    """
    def __init__(self, args):
        super(BasicCNN2D, self).__init__()
        # The body used an undefined name ``opt``; bind it to the parameter.
        opt = args
        self.opt = opt

        self.embedding_dim = opt.embedding_dim
        self.vocab_size = opt.vocab_size
        self.label_size = opt.label_size
        self.keep_dropout = opt.keep_dropout
        in_channel = 1
        self.kernel_nums = opt.kernel_nums
        self.kernel_sizes = opt.kernel_sizes

        self.embed = nn.Embedding(self.vocab_size + 1, self.embedding_dim)

        if opt.__dict__.get("embeddings", None) is not None:
            self.embed.weight = nn.Parameter(opt.embeddings)

        self.conv = nn.ModuleList(
            [nn.Conv2d(in_channel, out_channel, (K, self.embedding_dim))
             for K, out_channel in zip(self.kernel_sizes, self.kernel_nums)])

        # NOTE(review): nn.Dropout takes the probability of zeroing an
        # element, while the option is named keep_dropout -- confirm intent.
        self.dropout = nn.Dropout(self.keep_dropout)
        # Each convolution contributes its own number of feature maps, so the
        # classifier input is their sum (self.out_channel never existed).
        self.fc = nn.Linear(sum(self.kernel_nums), self.label_size)

    def forward(self, input_x):
        """
        :param input_x: LongTensor of token ids, (batch_size, max_seq_len)
        :return: (batch_size, label_size) tensor of log-probabilities
        """
        # This file imports neither ``torch`` nor ``torch.nn.functional as F``
        # by those names; use the names that are in scope (``t`` and ``nn``).
        F = nn.functional

        # Embedding
        x = self.embed(input_x)  # dim: (batch_size, max_seq_len, embedding_size)

        # Static embeddings: stop gradients from reaching the embedding
        # table.  Falls back to embedding_type when no ``static`` flag exists.
        static = getattr(self.opt, "static",
                         getattr(self.opt, "embedding_type", None) == "static")
        if static:
            x = x.detach()

        # Conv & max pool
        x = x.unsqueeze(1)  # dim: (batch_size, 1, max_seq_len, embedding_size)

        # one tensor per kernel size K: (batch, num_kernels, max_seq_len-K+1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]

        # dim: [(batch_size, num_kernels), ...]*len(kernel_sizes)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = t.cat(x, 1)

        # Dropout & output
        x = self.dropout(x)  # (batch_size, sum(kernel_nums))
        logit = F.log_softmax(self.fc(x), dim=1)  # (batch_size, label_size)

        return logit
90 | import argparse
91 |
def parse_opt(argv=None):
    """Build the demo options used by this module's ``__main__`` block.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv`` as before.

    Returns:
        The parsed namespace with fixed demo values added for
        ``embedding_dim``, ``vocab_size``, ``kernel_size``, ``num_classes``,
        ``label_size``, ``content_dim`` and ``max_seq_len``.
    """
    parser = argparse.ArgumentParser()
    # Data input settings
    parser.add_argument('--hidden_dim', type=int, default=128,
                        help='hidden_dim')

    parser.add_argument('--batch_size', type=int, default=64,
                        help='batch_size')
    parser.add_argument('--embedding_dim', type=int, default=300,
                        help='embedding_dim')
    parser.add_argument('--learning_rate', type=float, default=4e-4,
                        help='learning_rate')
    parser.add_argument('--grad_clip', type=float, default=1e-1,
                        help='grad_clip')
    # Registered once: adding the same option string twice makes argparse
    # raise a conflict error before any parsing happens.
    parser.add_argument('--model', type=str, default="lstm",
                        help='model name')

    args = parser.parse_args(argv)
    args.embedding_dim = 300
    args.vocab_size = 10000
    args.kernel_size = 3
    args.num_classes = 3
    # The model classes in this module read ``label_size``.
    args.label_size = args.num_classes
    args.content_dim = 256
    args.max_seq_len = 50

    return args
128 |
if __name__ == '__main__':

    # Smoke test: run one batch of token ids through the 1-D CNN.
    opt = parse_opt()
    # The model reads ``label_size``; derive it from ``num_classes`` when the
    # options do not already carry it.
    if not hasattr(opt, "label_size"):
        opt.label_size = opt.num_classes
    # ``CNNText`` is not defined in this module; BasicCNN1D is the model whose
    # model_name is 'CNNText'.
    m = BasicCNN1D(opt)
    content = t.autograd.Variable(t.arange(0, 3200).view(-1, 50)).long()
    o = m(content)
    print(o.size())
136 |
137 |
--------------------------------------------------------------------------------
/models/Capsule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # paper
3 |
4 |
5 | #
6 |
7 |
8 |
9 | import torch
10 | import torch.nn.functional as F
11 | from torch import nn
12 | import numpy as np
13 |
# Training constants; neither is referenced by the classes visible in this
# module.
BATCH_SIZE = 100

NUM_EPOCHS = 500
# Default number of routing iterations for CapsuleLayer.
NUM_ROUTING_ITERATIONS = 3

# True when a CUDA device is available at import time.
cuda = torch.cuda.is_available()
20 |
def softmax(input, dim=1):
    """Return the softmax of ``input`` taken along dimension ``dim``.

    Delegates to ``F.softmax`` with an explicit ``dim`` instead of
    transposing, flattening and relying on the deprecated implicit dimension.
    """
    return F.softmax(input, dim=dim)
25 |
26 |
27 |
28 |
29 |
class CapsuleLayer(nn.Module):
    """Capsule layer operating in one of two modes.

    * ``num_route_nodes != -1``: routing layer.  Holds ``route_weights`` of
      shape (num_capsules, num_route_nodes, in_channels, out_channels) and
      runs ``num_iterations`` rounds of routing in ``forward``.
    * ``num_route_nodes == -1``: primary layer.  Holds ``num_capsules`` Conv1d
      modules (stride 2) whose kernel sizes are the first entries of a fixed
      list of odd sizes.

    ``kernel_size``, ``stride`` and ``padding`` are accepted but not used.
    """

    def __init__(self, num_capsules, num_route_nodes, in_channels, out_channels, kernel_size=None, stride=None,
                 num_iterations=NUM_ROUTING_ITERATIONS, padding=0):
        super(CapsuleLayer, self).__init__()

        self.num_route_nodes = num_route_nodes
        self.num_iterations = num_iterations

        self.num_capsules = num_capsules

        if num_route_nodes != -1:
            self.route_weights = nn.Parameter(torch.randn(num_capsules, num_route_nodes, in_channels, out_channels))
        else:
            # Odd kernel sizes, one per capsule (the list is named ``prime``
            # but also contains 9); padding (k-1)/2 centres each kernel.
            prime = [3, 5, 7, 9, 11, 13, 17, 19, 23]
            sizes = prime[:self.num_capsules]
            self.capsules = nn.ModuleList(
                [nn.Conv1d(in_channels, out_channels, kernel_size=i, stride=2, padding=int((i - 1) / 2)) for i in sizes])

    def squash(self, tensor, dim=-1):
        """Scale vectors along ``dim`` to length |v|^2 / (1 + |v|^2).

        NOTE(review): an all-zero vector divides by zero here.
        """
        squared_norm = (tensor ** 2).sum(dim=dim, keepdim=True)
        scale = squared_norm / (1 + squared_norm)
        return scale * tensor / torch.sqrt(squared_norm)

    def forward(self, x):
        """Return routed capsule outputs or squashed primary capsules."""
        if self.num_route_nodes != -1:
            # Prediction vectors for every (capsule, batch, route node) triple.
            priors = torch.matmul(x[None, :, :, None, :], self.route_weights[:, None, :, :, :])

            # Allocate the logits from ``priors`` so they share its device and
            # type; previously they were moved to CUDA whenever a GPU existed,
            # even if the inputs were on the CPU.
            logits = torch.autograd.Variable(priors.data.new(priors.size()).zero_())
            for i in range(self.num_iterations):
                probs = softmax(logits, dim=2)
                outputs = self.squash((torch.mul(probs, priors)).sum(dim=2, keepdim=True))

                if i != self.num_iterations - 1:
                    # Agreement between predictions and the current output.
                    delta_logits = (torch.mul(priors, outputs)).sum(dim=-1, keepdim=True)
                    logits = logits + delta_logits
        else:
            # One flattened convolution output per capsule, stacked on a new
            # last axis.
            outputs = [capsule(x).view(x.size(0), -1, 1) for capsule in self.capsules]
            outputs = torch.cat(outputs, dim=-1)
            outputs = self.squash(outputs)

        return outputs
77 |
78 |
class CapsuleNet(nn.Module):
    """Capsule network text classifier.

    Pipeline: embedding -> convolution over the token sequence -> primary
    capsules -> one routed capsule per label.  The class scores are the
    softmax of the capsule lengths.  Reads ``label_size``, ``vocab_size``,
    ``embedding_dim`` and ``max_seq_len`` from ``opt``; ``embeddings`` and
    ``embedding_training`` are optional.  Sets ``opt.cnn_dim = 1``.
    """

    def __init__(self, opt):
        super(CapsuleNet, self).__init__()
        self.opt = opt
        self.label_size = opt.label_size
        self.embed = nn.Embedding(opt.vocab_size + 1, opt.embedding_dim)
        self.opt.cnn_dim = 1
        self.kernel_size = 3
        self.kernel_size_primary = 3
        if opt.__dict__.get("embeddings", None) is not None:
            self.embed.weight = nn.Parameter(opt.embeddings, requires_grad=opt.embedding_training)

        self.primary_capsules = CapsuleLayer(num_capsules=8, num_route_nodes=-1, in_channels=256, out_channels=32)
        # NOTE(review): the route-node count assumes the primary layer halves
        # the sequence length exactly -- confirm max_seq_len is even.
        self.digit_capsules = CapsuleLayer(num_capsules=opt.label_size, num_route_nodes=int(32 * opt.max_seq_len / 2), in_channels=8,
                                           out_channels=16)
        if self.opt.cnn_dim == 2:
            self.conv_2d = nn.Conv2d(in_channels=1, out_channels=256, kernel_size=(self.kernel_size, opt.embedding_dim), stride=(1, opt.embedding_dim), padding=(int((self.kernel_size - 1) / 2), 0))
        else:
            # The embedded sequence is flattened to one channel; a kernel of
            # kernel_size * embedding_dim with stride embedding_dim therefore
            # slides token by token.
            self.conv_1d = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=opt.embedding_dim * self.kernel_size, stride=opt.embedding_dim, padding=opt.embedding_dim * int((self.kernel_size - 1) / 2))

        # NOTE(review): the reconstruction target size is hard-coded to 784.
        self.decoder = nn.Sequential(
            nn.Linear(16 * self.label_size, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 784),
            nn.Sigmoid()
        )

    def forward(self, x, y=None, reconstruct=False):
        """Classify token ids ``x``.

        Args:
            x: LongTensor of token ids.
            y: optional one-hot labels used to mask the capsules for
                reconstruction; defaults to the predicted class.
            reconstruct: when true, also return the decoder output.

        Returns:
            ``classes``, or ``(classes, reconstructions)`` when
            ``reconstruct`` is true.
        """
        x = self.embed(x)
        if self.opt.cnn_dim == 1:
            x = x.view(x.size(0), 1, x.size(-1) * x.size(-2))
            x_conv = F.relu(self.conv_1d(x), inplace=True)
        else:
            x = x.unsqueeze(1)
            x_conv = F.relu(self.conv_2d(x), inplace=True).squeeze(3)

        x = self.primary_capsules(x_conv)
        # Remove only the two singleton routing axes.  A bare squeeze() also
        # dropped the batch axis when the batch held a single example.
        x = self.digit_capsules(x).squeeze(3).squeeze(2).transpose(0, 1)

        # Capsule length, then softmax across the label axis.
        classes = (x ** 2).sum(dim=-1) ** 0.5
        classes = F.softmax(classes, dim=-1)
        if not reconstruct:
            return classes
        if y is None:
            # In all batches, get the most active capsule.
            _, max_length_indices = classes.max(dim=1)
            # ``Variable`` was used without being imported and the identity
            # matrix was moved to CUDA whenever a GPU existed; build it next
            # to ``classes`` instead.
            eye = torch.eye(self.label_size)
            if classes.is_cuda:
                eye = eye.cuda()
            y = torch.autograd.Variable(eye).index_select(dim=0, index=max_length_indices.data)
        reconstructions = self.decoder((x * y[:, :, None]).view(x.size(0), -1))

        return classes, reconstructions
137 |
--------------------------------------------------------------------------------
/models/CNNKim.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
class KIMCNN1D(nn.Module):
    """Kim-style sentence CNN implemented with 1-D convolutions.

    The embedded sentence is flattened to one long vector per channel; every
    convolution uses a kernel of ``kernel_size * embedding_dim`` and a stride
    of ``embedding_dim`` so that it slides word by word.
    """

    def __init__(self, opt):
        """Build the model from ``opt``.

        Reads ``embedding_type``, ``batch_size``, ``max_seq_len``,
        ``embedding_dim``, ``vocab_size``, ``label_size``, ``kernel_sizes``,
        ``kernel_nums``, ``keep_dropout`` and, for the pre-trained embedding
        types, ``embeddings``.
        """
        super(KIMCNN1D, self).__init__()

        self.embedding_type = opt.embedding_type
        self.batch_size = opt.batch_size
        self.max_seq_len = opt.max_seq_len
        self.embedding_dim = opt.embedding_dim
        self.vocab_size = opt.vocab_size
        self.label_size = opt.label_size
        self.kernel_sizes = opt.kernel_sizes
        self.kernel_nums = opt.kernel_nums
        self.keep_dropout = opt.keep_dropout
        self.in_channel = 1

        assert (len(self.kernel_sizes) == len(self.kernel_nums))

        # one for UNK and one for zero padding
        self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1)
        if self.embedding_type in ("static", "non-static", "multichannel"):
            self.embedding.weight = nn.Parameter(opt.embeddings)
            if self.embedding_type == "static":
                self.embedding.weight.requires_grad = False
            elif self.embedding_type == "multichannel":
                self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1)
                # nn.Parameter shares storage with the tensor it wraps; clone so
                # that training the first channel does not silently change the
                # frozen one.
                self.embedding2.weight = nn.Parameter(opt.embeddings.clone())
                self.embedding2.weight.requires_grad = False
                self.in_channel = 2

        self.convs = nn.ModuleList([
            nn.Conv1d(self.in_channel, num, self.embedding_dim * size, stride=self.embedding_dim)
            for size, num in zip(self.kernel_sizes, self.kernel_nums)
        ])
        self.fc = nn.Linear(sum(self.kernel_nums), self.label_size)

    def get_conv(self, i):
        """Return the ``i``-th convolution.

        The convolutions live in ``self.convs``; the ``conv_%d`` attributes
        this accessor used to read are no longer created.
        """
        return self.convs[i]

    def forward(self, inp):
        """Return the class logits for a batch of token ids.

        :param inp: token ids; every sample must hold ``max_seq_len`` ids
            because the embeddings are reshaped to that fixed length.
        """
        flat_len = self.embedding_dim * self.max_seq_len
        x = self.embedding(inp).view(-1, 1, flat_len)
        if self.embedding_type == "multichannel":
            x2 = self.embedding2(inp).view(-1, 1, flat_len)
            x = torch.cat((x, x2), 1)

        # Max-pool over every position the kernel can take
        # (max_seq_len - size + 1 of them), leaving one value per filter.
        conv_results = [
            F.max_pool1d(F.relu(conv(x)), self.max_seq_len - size + 1).view(-1, num)
            for conv, size, num in zip(self.convs, self.kernel_sizes, self.kernel_nums)
        ]

        x = torch.cat(conv_results, 1)
        x = F.dropout(x, p=self.keep_dropout, training=self.training)
        x = self.fc(x)
        return x
65 |
66 |
67 |
68 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Yoon/model.py
class KIMCNN2D(nn.Module):
    """Kim-style sentence CNN implemented with 2-D convolutions.

    Every convolution spans ``kernel_size`` words and the whole embedding
    width; its output is max-pooled over time and the pooled features of all
    kernel sizes are concatenated before the linear classifier.
    """

    def __init__(self, opt):
        """Build the model from ``opt``.

        Reads ``embedding_type``, ``batch_size``, ``max_seq_len``,
        ``embedding_dim``, ``vocab_size``, ``label_size``, ``kernel_sizes``,
        ``kernel_nums``, ``keep_dropout`` and, for the pre-trained embedding
        types, ``embeddings``.
        """
        super(KIMCNN2D, self).__init__()
        self.opt = opt
        self.embedding_type = opt.embedding_type
        self.batch_size = opt.batch_size
        self.max_seq_len = opt.max_seq_len
        self.embedding_dim = opt.embedding_dim
        self.vocab_size = opt.vocab_size
        self.label_size = opt.label_size
        self.kernel_sizes = opt.kernel_sizes
        self.kernel_nums = opt.kernel_nums
        self.keep_dropout = opt.keep_dropout
        self.in_channel = 1

        self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1)
        if self.embedding_type in ("static", "non-static", "multichannel"):
            self.embedding.weight = nn.Parameter(opt.embeddings)
            if self.embedding_type == "static":
                self.embedding.weight.requires_grad = False
            elif self.embedding_type == "multichannel":
                self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1)
                # Clone: nn.Parameter shares storage with the wrapped tensor, so
                # without it the frozen channel would follow the trained one.
                self.embedding2.weight = nn.Parameter(opt.embeddings.clone())
                self.embedding2.weight.requires_grad = False
                self.in_channel = 2

        # The second embedding used to be created but never fed to the
        # convolutions (they were hard-wired to one input channel); the channel
        # count now follows ``self.in_channel`` as in KIMCNN1D.
        self.convs1 = nn.ModuleList([
            nn.Conv2d(self.in_channel, num, (size, opt.embedding_dim))
            for size, num in zip(opt.kernel_sizes, opt.kernel_nums)
        ])
        self.dropout = nn.Dropout(opt.keep_dropout)
        self.fc = nn.Linear(sum(opt.kernel_nums), opt.label_size)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU to ``x`` and max-pool over the time axis."""
        x = F.relu(conv(x)).squeeze(3)  # (N,Co,W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        """Return the class logits for a batch of token ids."""
        tokens = x
        x = self.embedding(tokens).unsqueeze(1)  # (N,1,W,D)
        if self.embedding_type == "multichannel":
            x = torch.cat((x, self.embedding2(tokens).unsqueeze(1)), 1)  # (N,2,W,D)

        pooled = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N,Co), ...]*len(Ks)
        x = torch.cat(pooled, 1)

        x = self.dropout(x)  # (N,len(Ks)*Co)
        logit = self.fc(x)  # (N,C)
        return logit
135 |
136 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch
3 | import torch.nn.functional as F
4 | from torchtext import data
5 | from torchtext import datasets
6 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText
7 | import numpy as np
8 | from functools import wraps
9 | import time
10 | import sys
11 | import logging
12 | import os
13 |
def log_time_delta(func):
    """Decorator that prints how long each call of ``func`` took.

    The wrapped function keeps its metadata (via ``functools.wraps``) and its
    return value is passed through unchanged.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        began = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - began
        print( "%s runed %.2f seconds"% (func.__name__,elapsed))
        return result
    return timed_call
24 |
def clip_gradient(optimizer, grad_clip):
    """Clamp every gradient held by ``optimizer`` to ``[-grad_clip, grad_clip]``.

    Parameters without a gradient, or that do not require one, are skipped.
    The gradients are modified in place.
    """
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is None or not param.requires_grad:
                continue
            param.grad.data.clamp_(-grad_clip, grad_clip)
30 |
31 |
def loadData(opt):
    """Build the train/test iterators described by ``opt``.

    When ``opt.from_torchtext`` is false the work is delegated to
    ``dataHelper.loadData``.  Otherwise one of the torchtext datasets
    (``imdb``, ``sst`` or ``trec``) is loaded with 300-d GloVe vectors.

    Side effects: sets ``opt.label_size``, ``opt.vocab_size``,
    ``opt.embedding_dim`` and ``opt.embeddings``.

    :param opt: options object; reads ``from_torchtext``, ``dataset``,
        ``max_seq_len`` and ``batch_size``.
    :returns: ``(train_iter, test_iter)``.
    :raises ValueError: if ``opt.dataset`` is not supported.
    """
    if not opt.from_torchtext:
        import dataHelper as helper
        return helper.loadData(opt)

    # Validate up front: an unknown name used to be only printed, after which
    # the function failed later with an unbound ``train`` variable.
    supported = ("imdb", "sst", "trec")
    if opt.dataset not in supported:
        raise ValueError("does not support this dataset: %r (expected one of %s)"
                         % (opt.dataset, ", ".join(supported)))

    device = 0 if torch.cuda.is_available() else -1

    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True, fix_length=opt.max_seq_len)
    LABEL = data.Field(sequential=False)
    if opt.dataset == "imdb":
        train, test = datasets.IMDB.splits(TEXT, LABEL)
    elif opt.dataset == "sst":
        # The validation split is loaded but not used.
        train, _val, test = datasets.SST.splits(TEXT, LABEL, fine_grained=True, train_subtrees=True,
                                                filter_pred=lambda ex: ex.label != 'neutral')
    else:  # "trec"
        train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)

    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)
    # print vocab information
    print('len(TEXT.vocab)', len(TEXT.vocab))
    print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

    train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=opt.batch_size,
                                                       device=device, repeat=False, shuffle=True)

    opt.label_size = len(LABEL.vocab)
    opt.vocab_size = len(TEXT.vocab)
    opt.embedding_dim = TEXT.vocab.vectors.size()[1]
    opt.embeddings = TEXT.vocab.vectors

    return train_iter, test_iter
64 |
65 |
def evaluation(model, test_iter, from_torchtext=True):
    """Return the mean per-batch accuracy of ``model`` over ``test_iter``.

    The model is switched to eval mode for the duration of the call and is
    always switched back to train mode afterwards, even if a batch fails.

    Note that the result is the mean of the batch accuracies, so batches of
    different sizes carry the same weight.

    :param model: callable module returning class scores per sample.
    :param test_iter: iterable of batches exposing ``text`` and ``label``.
    :param from_torchtext: when true ``batch.text`` is a tuple whose first
        item holds the token ids; otherwise it is the ids themselves.
    :returns: value of ``numpy.mean`` over the batch accuracies.
    """
    model.eval()
    accuracy = []
    try:
        with torch.no_grad():  # no graph needed while scoring
            for batch in test_iter:
                text = batch.text[0] if from_torchtext else batch.text
                predicted = model(text)
                _, idx = torch.max(predicted, 1)
                batch_accuracy = (idx == batch.label).float().mean()
                # ``.item()`` handles the 0-dim result on any device; the old
                # ``.numpy()[0]`` raised IndexError on 0-dim tensors.
                accuracy.append(batch_accuracy.item())
    finally:
        model.train()
    return np.mean(accuracy)
82 |
83 |
84 |
def getOptimizer(params, name="adam", lr=1, momentum=None, scheduler=None):
    """Create an optimizer (optionally wrapped in an LR scheduler).

    ``lr`` is a multiplier applied to a per-optimizer base learning rate
    (for example ``0.1 * lr`` for SGD and Adam, ``0.01 * lr`` for Adagrad).

    :param params: parameters (or parameter groups) to optimize.
    :param name: optimizer name, case-insensitive; unknown names fall back
        to Adam after printing a notice.
    :param lr: multiplier for the optimizer's base learning rate.
    :param momentum: momentum for ``sgd`` and ``rmsprop``; ``None`` keeps
        the previous behaviour (momentum ``0``).
    :param scheduler: ``None`` or one of ``"lambdalr"``, ``"steplr"``,
        ``"multisteplr"``, ``"reducelronplateau"``.
    :returns: the optimizer when ``scheduler`` is ``None`` or unknown,
        otherwise the scheduler (the optimizer is then reachable through
        ``scheduler.optimizer``).
    """
    name = name.lower().strip()
    momentum_value = 0 if momentum is None else momentum
    optim = torch.optim

    # Lazy factories: only the selected optimizer is ever constructed.
    factories = {
        # The old code called ``.param_groups()`` here, which raises
        # TypeError because ``param_groups`` is a list, not a method.
        "adadelta": lambda: optim.Adadelta(params, lr=1.0*lr, rho=0.9, eps=1e-06, weight_decay=0),
        "adagrad": lambda: optim.Adagrad(params, lr=0.01*lr, lr_decay=0, weight_decay=0),
        "sparseadam": lambda: optim.SparseAdam(params, lr=0.001*lr, betas=(0.9, 0.999), eps=1e-08),
        "adamax": lambda: optim.Adamax(params, lr=0.002*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0),
        "asgd": lambda: optim.ASGD(params, lr=0.01*lr, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=0),
        "lbfgs": lambda: optim.LBFGS(params, lr=1*lr, max_iter=20, max_eval=None, tolerance_grad=1e-05,
                                     tolerance_change=1e-09, history_size=100, line_search_fn=None),
        "rmsprop": lambda: optim.RMSprop(params, lr=0.01*lr, alpha=0.99, eps=1e-08, weight_decay=0,
                                         momentum=momentum_value, centered=False),
        "rprop": lambda: optim.Rprop(params, lr=0.01*lr, etas=(0.5, 1.2), step_sizes=(1e-06, 50)),
        "sgd": lambda: optim.SGD(params, lr=0.1*lr, momentum=momentum_value, dampening=0, weight_decay=0,
                                 nesterov=False),
        "adam": lambda: optim.Adam(params, lr=0.1*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0),
    }
    factory = factories.get(name)
    if factory is None:
        print("undefined optimizer, use adam in default")
        factory = factories["adam"]
    optimizer = factory()

    if scheduler is None:
        return optimizer

    scheduler = scheduler.lower().strip()
    schedulers = torch.optim.lr_scheduler
    if scheduler == "lambdalr":
        decay = lambda epoch: 0.95 ** epoch
        if len(optimizer.param_groups) == 2:
            # Original pair of schedules, one per parameter group.
            lr_lambda = [lambda epoch: epoch // 30, decay]
        else:
            # LambdaLR requires one function per group; a two-item list
            # crashes for any other group count, so share the decay.
            lr_lambda = decay
        return schedulers.LambdaLR(optimizer, lr_lambda=lr_lambda)
    if scheduler == "steplr":
        return schedulers.StepLR(optimizer, step_size=30, gamma=0.1)
    if scheduler == "multisteplr":
        return schedulers.MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)
    if scheduler == "reducelronplateau":
        return schedulers.ReduceLROnPlateau(optimizer, 'min')

    # Unknown scheduler names used to make the function return None.
    print("undefined scheduler, return the optimizer without scheduler")
    return optimizer
def getLogger():
    """Configure file logging under ``log/<YYYYMMDD>/`` and return a logger.

    The log file name combines the current timestamp with a random number so
    that runs started within the same second do not share a file.  The
    returned logger is named after the running program (``sys.argv[0]``).
    """
    import random
    random_str = str(random.randint(1, 10000))

    timeArray = time.localtime(int(time.time()))
    timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray)
    log_filename = "log/" + time.strftime("%Y%m%d", timeArray)

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    # Several processes may start at once (see search.sh), so "check, then
    # mkdir" can race; create the directory and tolerate it already existing.
    for directory in ("log", log_filename):
        try:
            os.mkdir(directory)
        except OSError:
            if not os.path.isdir(directory):
                raise

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=log_filename + '/qa' + timeStamp + "_" + random_str + '.log',
                        filemode='w')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    return logger
def is_writeable(path, check_parent=False):
    '''
    Tell whether the current user may write to ``path``.

    :param path: The path to check
    :param check_parent: When ``path`` does not exist, answer for its parent
                         directory instead of returning False
    :returns: True or False
    '''
    if os.access(path, os.F_OK):
        # The path exists: its own write permission decides.
        return os.access(path, os.W_OK)

    if check_parent is False:
        # The path is missing and we may not look at its parent.
        return False

    # The path is missing: the parent must exist and be writeable.
    parent_dir = os.path.dirname(path)
    return os.access(parent_dir, os.F_OK) and os.access(parent_dir, os.W_OK)
def is_readable(path):
    '''
    Tell whether ``path`` exists and the current user may read it.

    :param path: The path to check
    :returns: True or False
    '''
    return os.access(path, os.F_OK) and os.access(path, os.R_OK)
189 |
190 |
--------------------------------------------------------------------------------
/models/CNN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
class CNN(nn.Module):
    """Sentence CNN built from 1-D convolutions over flattened embeddings.

    Each convolution uses a kernel of ``filter_size * embedding_dim`` and a
    stride of ``embedding_dim`` so that it moves one word at a time.
    """

    def __init__(self, opt):
        """Build the model from ``opt``.

        Reads ``embedding_type``, ``batch_size``, ``max_sent_len``,
        ``embedding_dim``, ``vocab_size``, ``label_size``, ``keep_dropout``
        and the entries ``FILTERS``, ``FILTER_NUM`` and (for pre-trained
        embedding types) ``WV_MATRIX``, a numpy array.
        """
        super(CNN, self).__init__()

        self.embedding_type = opt.embedding_type
        self.batch_size = opt.batch_size
        self.max_sent_len = opt.max_sent_len
        self.embedding_dim = opt.embedding_dim
        self.vocab_size = opt.vocab_size
        self.CLASS_SIZE = opt.label_size
        self.FILTERS = self._option(opt, "FILTERS")
        self.FILTER_NUM = self._option(opt, "FILTER_NUM")
        self.keep_dropout = opt.keep_dropout
        self.IN_CHANNEL = 1

        assert (len(self.FILTERS) == len(self.FILTER_NUM))

        # one for UNK and one for zero padding
        self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1)
        if self.embedding_type in ("static", "non-static", "multichannel"):
            self.WV_MATRIX = self._option(opt, "WV_MATRIX")
            self.embedding.weight.data.copy_(torch.from_numpy(self.WV_MATRIX))
            if self.embedding_type == "static":
                self.embedding.weight.requires_grad = False
            elif self.embedding_type == "multichannel":
                # Was ``self.VOCAB_SIZE``, an attribute that is never defined.
                self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1)
                self.embedding2.weight.data.copy_(torch.from_numpy(self.WV_MATRIX))
                self.embedding2.weight.requires_grad = False
                self.IN_CHANNEL = 2

        for i, (size, num) in enumerate(zip(self.FILTERS, self.FILTER_NUM)):
            # Stride was ``self.WORD_DIM`` (undefined); one word is
            # ``embedding_dim`` values wide.
            conv = nn.Conv1d(self.IN_CHANNEL, num, self.embedding_dim * size, stride=self.embedding_dim)
            setattr(self, 'conv_%d' % i, conv)

        # Was ``self.label_size`` (undefined); the label count is CLASS_SIZE.
        self.fc = nn.Linear(sum(self.FILTER_NUM), self.CLASS_SIZE)

    @staticmethod
    def _option(opt, key):
        """Read ``key`` from ``opt`` by item access, else by attribute."""
        try:
            return opt[key]
        except TypeError:
            # ``opt`` is not subscriptable (e.g. an argparse namespace).
            return getattr(opt, key)

    def get_conv(self, i):
        """Return the ``i``-th convolution (stored as attribute ``conv_<i>``)."""
        return getattr(self, 'conv_%d' % i)

    def forward(self, inp):
        """Return the class logits for a batch of token ids.

        :param inp: token ids; every sample must hold ``max_sent_len`` ids
            because the embeddings are reshaped to that fixed length.
        """
        flat_len = self.embedding_dim * self.max_sent_len
        x = self.embedding(inp).view(-1, 1, flat_len)
        if self.embedding_type == "multichannel":
            x2 = self.embedding2(inp).view(-1, 1, flat_len)
            x = torch.cat((x, x2), 1)

        conv_results = [
            F.max_pool1d(F.relu(self.get_conv(i)(x)), self.max_sent_len - size + 1).view(-1, num)
            for i, (size, num) in enumerate(zip(self.FILTERS, self.FILTER_NUM))
        ]

        x = torch.cat(conv_results, 1)
        x = F.dropout(x, p=self.keep_dropout, training=self.training)
        x = self.fc(x)
        return x
60 |
61 |
62 |
63 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Yoon/model.py
class CNN1(nn.Module):
    """Kim-style sentence CNN with 2-D convolutions and max-over-time pooling."""

    def __init__(self, opt):
        """Build the model from ``opt``.

        Reads ``vocab_size``, ``embedding_dim``, ``label_size``,
        ``kernel_num``, ``kernel_sizes`` and ``dropout``; ``static`` is
        optional and read at forward time.
        """
        super(CNN1, self).__init__()
        self.opt = opt

        V = opt.vocab_size
        D = opt.embedding_dim
        C = opt.label_size
        Ci = 1
        Co = opt.kernel_num
        Ks = opt.kernel_sizes

        self.embed = nn.Embedding(V, D)
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(opt.dropout)
        self.fc1 = nn.Linear(len(Ks) * Co, C)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU to ``x`` and max-pool over the time axis."""
        x = F.relu(conv(x)).squeeze(3)  # (N,Co,W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        """Return the class logits for a batch of token ids."""
        x = self.embed(x)  # (N,W,D)

        # The old code read ``self.args`` (never set; the options live in
        # ``self.opt``) and wrapped ``x`` in an un-imported ``Variable``.
        # Detaching is what keeps the embeddings out of the gradient graph.
        if getattr(self.opt, "static", False):
            x = x.detach()

        x = x.unsqueeze(1)  # (N,Ci,W,D)

        x = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)

        x = self.dropout(x)  # (N,len(Ks)*Co)
        logit = self.fc1(x)  # (N,C)
        return logit
118 |
119 | import torch.nn as nn
120 |
121 |
122 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Zhang/model.py
class CNN2(nn.Module):
    """Six-layer 1-D convolutional classifier ending in a log-softmax.

    NOTE(review): the embedded batch is fed to ``Conv1d`` as is, so the
    sequence axis acts as the channel axis (``opt.l0`` channels) and the
    convolutions slide over the embedding axis -- confirm this is intended.
    The final linear layer expects exactly 256 features, i.e. a length of 1
    after the last pooling step.
    """

    def __init__(self, opt):
        """Build the model; reads ``vocab_size``, ``embedding_dim``, ``l0``
        and ``label_size`` from ``opt``."""
        super(CNN2, self).__init__()
        self.embed = nn.Embedding(opt.vocab_size + 1, opt.embedding_dim)

        self.conv1 = self._conv_block(opt.l0, kernel_size=7, pool=True)
        self.conv2 = self._conv_block(256, kernel_size=7, pool=True)
        self.conv3 = self._conv_block(256, kernel_size=3, pool=False)
        self.conv4 = self._conv_block(256, kernel_size=3, pool=False)
        self.conv5 = self._conv_block(256, kernel_size=3, pool=False)
        self.conv6 = self._conv_block(256, kernel_size=3, pool=True)

        self.fc = nn.Linear(256, opt.label_size)

    @staticmethod
    def _conv_block(in_channels, kernel_size, pool):
        """Return Conv1d(->256) + ReLU, optionally followed by MaxPool1d(3, 3)."""
        layers = [nn.Conv1d(in_channels, 256, kernel_size=kernel_size, stride=1), nn.ReLU()]
        if pool:
            layers.append(nn.MaxPool1d(kernel_size=3, stride=3))
        return nn.Sequential(*layers)

    def forward(self, x_input):
        """Return per-class log-probabilities for a batch of token ids."""
        # Embedding
        x = self.embed(x_input)  # dim: (batch_size, max_seq_len, embedding_size)
        for block in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5, self.conv6):
            x = block(x)

        # collapse
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        # Explicit dim: relying on the implicit dimension is deprecated.
        return F.log_softmax(x, dim=1)
class CNN3(nn.Module):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(self, args):
        """Build the model; reads ``embed_dim``, ``num_features``,
        ``class_num``, ``kernel_num``, ``kernel_sizes`` and ``dropout`` from
        ``args`` (``static`` is optional and read at forward time)."""
        super(CNN3, self).__init__()
        self.args = args

        embedding_dim = args.embed_dim
        embedding_num = args.num_features
        class_number = args.class_num
        in_channel = 1
        out_channel = args.kernel_num
        kernel_sizes = args.kernel_sizes

        self.embed = nn.Embedding(embedding_num + 1, embedding_dim)
        self.conv = nn.ModuleList([nn.Conv2d(in_channel, out_channel, (K, embedding_dim)) for K in kernel_sizes])

        self.dropout = nn.Dropout(args.dropout)
        self.fc = nn.Linear(len(kernel_sizes) * out_channel, class_number)

    def forward(self, input_x):
        """
        :param input_x: a list size having the number of batch_size elements with the same length
        :return: batch_size X num_aspects tensor
        """
        # Embedding
        x = self.embed(input_x)  # dim: (batch_size, max_seq_len, embedding_size)

        if getattr(self.args, "static", False):
            # ``F.Variable`` does not exist, and the old line replaced the
            # embeddings with the raw ids.  Detaching keeps the embedded
            # values while stopping gradients from reaching the embedding.
            x = x.detach()

        # Conv & max pool
        x = x.unsqueeze(1)  # dim: (batch_size, 1, max_seq_len, embedding_size)

        # turns to be a list: [ti : i \in kernel_sizes] where ti: tensor of dim([batch, num_kernels, max_seq_len-i+1])
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]

        # dim: [(batch_size, num_kernels), ...]*len(kernel_sizes)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)

        # Dropout & output
        x = self.dropout(x)  # (batch_size,len(kernel_sizes)*num_kernels)
        # Explicit dim: relying on the implicit dimension is deprecated.
        logit = F.log_softmax(self.fc(x), dim=1)  # (batch_size, num_aspects)

        return logit
--------------------------------------------------------------------------------
/dataHelper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import numpy as np
5 | import string
6 | from collections import Counter
7 | import pandas as pd
8 | from tqdm import tqdm
9 | import random
10 | import time
11 | from utils import log_time_delta
12 | from tqdm import tqdm
13 | from dataloader import Dataset
14 | import torch
15 | from torch.autograd import Variable
16 | from codecs import open
17 | try:
18 | import cPickle as pickle
19 | except ImportError:
20 | import pickle
class Alphabet(dict):
    """Mapping from items to consecutive integer ids.

    ``fid`` holds the id the next new item will receive.  A ``"text"``
    alphabet starts with the special tokens ``[PADDING]``, ``[UNK]`` and
    ``[END]`` and exposes their ids as ``padding_token``, ``unknow_token``
    and ``end_token``.
    """

    def __init__(self, start_feature_id = 1, alphabet_type="text"):
        self.fid = start_feature_id
        if alphabet_type == "text":
            # Registration order fixes the ids of the special tokens.
            self.addAll(('[PADDING]', '[UNK]', '[END]'))
            self.unknow_token = self.get('[UNK]')
            self.end_token = self.get('[END]')
            self.padding_token = self.get('[PADDING]')

    def add(self, item):
        """Return the id of ``item``, assigning the next free id if it is new."""
        known = self.get(item, None)
        if known is not None:
            return known
        assigned = self.fid
        self[item] = assigned
        self.fid = assigned + 1
        return assigned

    def addAll(self, words):
        """Register every item of ``words`` in iteration order."""
        for word in words:
            self.add(word)

    def dump(self, fname, path="temp"):
        """Write ``item<TAB>id`` lines, sorted by item, to ``path/fname``.

        ``path`` is created when it does not exist.
        """
        if not os.path.exists(path):
            os.mkdir(path)
        target = os.path.join(path, fname)
        with open(target, "w", encoding="utf-8") as out:
            for key in sorted(self.keys()):
                out.write("{}\t{}\n".format(key, self[key]))
51 |
class DottableDict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super(DottableDict, self).__init__(*args, **kwargs)
        self.allowDotting()

    def allowDotting(self, state=True):
        """Turn attribute-style access on (``state`` true) or off.

        When on, the instance ``__dict__`` is the dictionary itself, so
        attributes and items are the same storage; when off, it is replaced
        by a fresh empty dict.
        """
        self.__dict__ = self if state else dict()
62 |
class BucketIterator(object):
    """Iterate over a pandas DataFrame in batches of tensors.

    The frame must provide a ``text`` column (one sequence of token ids per
    row) and a ``label`` column.  Every batch is a ``DottableDict`` with the
    keys ``text`` and ``label``; tensors are moved to CUDA when a GPU is
    available.
    """

    def __init__(self, data, opt=None, batch_size=2, shuffle=True, test=False, position=False):
        """
        :param data: DataFrame with ``text`` and ``label`` columns.
        :param opt: optional options object; when given, ``setup`` overrides
            ``batch_size``, ``shuffle``, ``position`` and ``padding_token``.
        :param batch_size: number of rows per batch.
        :param shuffle: reshuffle the rows at the start of every iteration.
        :param test: forwarded as ``volatile`` to the position tensor.
        :param position: also return a tensor of 1-based token positions.
        """
        self.shuffle = shuffle
        self.data = data
        self.batch_size = batch_size
        self.test = test
        # These two were only assigned in ``setup``; iterating without ``opt``
        # then failed with AttributeError.
        self.position = position
        self.padding_token = None
        if opt is not None:
            self.setup(opt)

    def setup(self, opt):
        """Take batch size, shuffle/position flags and padding id from ``opt``."""
        self.batch_size = opt.batch_size
        self.shuffle = opt.__dict__.get("shuffle", self.shuffle)
        self.position = opt.__dict__.get("position", False)
        self.padding_token = opt.alphabet.padding_token

    def transform(self, data):
        """Convert a slice of the frame into a batch of tensors."""
        text = torch.LongTensor(data.text.tolist())
        # Labels may be stored as strings; the CPU branch used to skip the
        # ``int`` conversion that the GPU branch applied.
        label = torch.LongTensor([int(i) for i in data.label.tolist()])
        if torch.cuda.is_available():
            text, label = text.cuda(), label.cuda()
        text, label = Variable(text), Variable(label)
        if self.position:
            position_tensor = self.get_position(data.text)
            return DottableDict({"text": (text, position_tensor), "label": label})
        return DottableDict({"text": text, "label": label})

    def get_position(self, inst_data):
        """Return 1-based token positions, with 0 at padding tokens."""
        inst_position = np.array([
            [pos_i + 1 if w_i != self.padding_token else 0 for pos_i, w_i in enumerate(inst)]
            for inst in inst_data
        ])
        inst_position_tensor = Variable(torch.LongTensor(inst_position), volatile=self.test)
        if torch.cuda.is_available():
            inst_position_tensor = inst_position_tensor.cuda()
        return inst_position_tensor

    def __iter__(self):
        """Yield the batches of one pass over the data.

        Full batches come first.  When rows are left over, one extra batch
        made of the *last* ``batch_size`` rows is yielded, so that batch is
        full too whenever the data holds at least ``batch_size`` rows, at the
        price of repeating a few rows.  When the row count is an exact
        multiple of ``batch_size`` no extra batch is produced (it used to
        duplicate the final batch).
        """
        if self.shuffle:
            self.data = self.data.sample(frac=1).reset_index(drop=True)
        total = len(self.data)
        for start in range(0, total - self.batch_size + 1, self.batch_size):
            yield self.transform(self.data[start:start + self.batch_size])
        if total % self.batch_size:
            yield self.transform(self.data[-1 * self.batch_size:])
106 |
107 |
108 |
109 |
@log_time_delta
def vectors_lookup(vectors,vocab,dim):
    """Build the embedding matrix for ``vocab`` from pre-trained ``vectors``.

    Row ``vocab[word]`` holds the pre-trained vector of ``word`` when one is
    available and a uniform random vector in ``[-0.5, 0.5)`` otherwise.

    :param vectors: mapping word -> vector (sequence of ``dim`` numbers).
    :param vocab: mapping word -> row index.
    :param dim: embedding size (an int, or a string holding one).
    :returns: numpy array with one row per index up to the largest index in
        ``vocab`` (at least ``len(vocab)`` rows).
    """
    dim = int(dim)  # may arrive as text read from an embedding file header
    rows = len(vocab)
    if rows:
        # Indices need not start at 0 (Alphabet starts at 1 by default), so
        # make room for the largest one instead of assuming 0..len-1.
        rows = max(rows, max(vocab.values()) + 1)
    embedding = np.zeros((rows, dim))

    count = 0  # the counter used to start at 1 and over-report by one
    for word in vocab:
        if word in vectors:
            count += 1
            embedding[vocab[word]] = vectors[word]
        else:
            embedding[vocab[word]] = np.random.uniform(-0.5, +0.5, dim)
    print( 'word in embedding',count)
    return embedding
122 |
@log_time_delta
def load_text_vec(alphabet,filename="",embedding_size=-1):
    """Read the vectors of the words in ``alphabet`` from a text embedding file.

    Each line holds a word followed by its space-separated components.  An
    optional header line ``<vocab_size> <embedding_size>`` is recognised.

    :param alphabet: container of the words to keep.
    :param filename: path of the embedding file (UTF-8).
    :param embedding_size: known embedding size, or ``-1`` to take it from
        the header or, failing that, from the first vector found.
    :returns: ``(vectors, embedding_size)`` where ``vectors`` maps each found
        word to its list of component strings and ``embedding_size`` is an int.
    :raises ValueError: if no vector was found and the size is unknown.
    """
    vectors = {}
    with open(filename,encoding='utf-8') as f:
        for line in tqdm(f):
            items = line.strip().split(' ')
            # Header: exactly two fields, the second being an integer.
            if len(items) == 2 and items[1].isdigit():
                # Convert now: the size used to be returned as a string,
                # which cannot be used as an array dimension.
                vocab_size, embedding_size = items[0], int(items[1])
                print( 'embedding_size',embedding_size)
                print( 'vocab_size in pretrained embedding',vocab_size)
            else:
                word = items[0]
                if word in alphabet:
                    vectors[word] = items[1:]
    print( 'words need to be found ',len(alphabet))
    print( 'words found in wor2vec embedding ',len(vectors.keys()))

    if embedding_size == -1:
        if not vectors:
            raise ValueError("no word of the alphabet was found in %r, "
                             "cannot infer the embedding size" % (filename,))
        embedding_size = len(next(iter(vectors.values())))
    return vectors,embedding_size
143 |
def getEmbeddingFile(opt):
    """Return the path of the embedding file selected by ``opt``.

    ``opt.embedding`` (default ``"glove_6b_300"``) picks the embedding: names
    starting with ``"glove"`` map to the cached 300-d GloVe file, anything
    else to ``opt.embedding_dir``.
    """
    #"glove"  "w2v"
    chosen = opt.__dict__.get("embedding","glove_6b_300")
    if not chosen.startswith("glove"):
        # please refer to   https://pypi.python.org/pypi/torchwordemb/0.0.7
        return opt.embedding_dir
    return os.path.join( ".vector_cache","glove.6B.300d.txt")
@log_time_delta
def getSubVectors(opt,alphabet):
    """Return the embedding matrix for ``alphabet``.

    A pickled matrix in ``temp/<dataset>.vec`` is reused when it exists and
    ``opt.debug`` is false.  Otherwise the matrix is rebuilt from the
    embedding file; in debug mode the out-of-vocabulary words are written to
    ``temp/oov.txt`` and the matrix is pickled to the cache file.

    :param opt: options object; reads ``dataset`` and ``debug``.
    :param alphabet: mapping word -> row index.
    """
    pickle_filename = "temp/"+opt.dataset+".vec"
    if os.path.exists(pickle_filename) and not opt.debug:
        print("load cache for SubVector")
        # NOTE(review): pickle.load runs arbitrary code from the file; this is
        # only safe because the cache is produced locally by this function.
        with open(pickle_filename,"rb") as cache:
            return pickle.load(cache)

    glove_file = getEmbeddingFile(opt)
    wordset = set(alphabet.keys())   # python 2.7
    loaded_vectors,embedding_size = load_text_vec(wordset,glove_file)

    vectors = vectors_lookup(loaded_vectors,alphabet,embedding_size)
    if opt.debug:
        if not os.path.exists("temp"):
            os.mkdir("temp")
        # Keyword ``encoding`` works with both codecs.open and builtin open.
        with open("temp/oov.txt","w",encoding="utf-8") as f:
            unknown_set = set(alphabet.keys()) - set(loaded_vectors.keys())
            f.write("\n".join( unknown_set))
        # Close the cache file deterministically instead of leaking the handle.
        with open(pickle_filename,"wb") as cache:
            pickle.dump(vectors,cache)
    return vectors
174 |
def getDataSet(opt):
    """Return the formatted data of the dataset selected by ``opt``.

    The dataset object is obtained from ``dataloader.getDataset(opt)``; the
    import is local to this function.
    """
    import dataloader
    return dataloader.getDataset(opt).getFormatedData()
181 |
182 | #data_dir = os.path.join(".data/clean",opt.dataset)
183 | #if not os.path.exists(data_dir):
184 | # import dataloader
185 | # dataset= dataloader.getDataset(opt)
186 | # return dataset.getFormatedData()
187 | #else:
188 | # for root, dirs, files in os.walk(data_dir):
189 | # for file in files:
190 | # yield os.path.join(root,file)
191 |
192 |
193 | # files=[os.path.join(data_dir,data_name) for data_name in ['train.txt','test.txt','dev.txt']]
194 |
195 | import re
196 | def clean(text):
197 | # text="'tycoon.