├── LICENSE.txt ├── README.md ├── config └── imdb.ini ├── dataHelper.py ├── dataloader ├── Dataset.py ├── __init__.py ├── ag.py ├── glove.py ├── imdb.py ├── mr.py ├── sst.py └── torch_text_demo │ ├── imdb.py │ ├── sst.py │ └── trec.py ├── docs ├── data_config.md ├── data_config_en.md ├── windows_torch.md └── windows_torch_en.md ├── main.py ├── models ├── BiBloSA.py ├── CNN.py ├── CNNBasic.py ├── CNNInception.py ├── CNNKim.py ├── CNNMultiLayer.py ├── CNNText.py ├── CNN_Inception.py ├── Capsule.py ├── ConvS2S.py ├── DiSAN.py ├── FastText.py ├── LSTM.py ├── LSTMBI.py ├── LSTMStack.py ├── LSTMTree.py ├── LSTMwithAttention.py ├── MLP.py ├── MemoryNetwork.py ├── QuantumCNN.py ├── RCNN.py ├── RNN_CNN.py ├── SelfAttention.py ├── Transformer.py └── __init__.py ├── opts.py ├── parameter_search.py ├── push.bash ├── search.sh ├── trandition.py └── utils.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Barun Patra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Classification Benchmark 2 | A Benchmark of Text Classification in PyTorch 3 | 4 | 5 | ## Motivation 6 | 7 | We are building a benchmark for text classification that includes: 8 | 9 | 10 | >Many text classification **datasets**, covering sentiment/topic classification in popular languages (e.g. English and Chinese). A basic word embedding is provided as well. 11 | 12 | >Implementations of many popular and state-of-the-art **models**, especially deep neural networks. 13 | 14 | ## Done so far 15 | The following datasets and models are already supported. 16 | ### Datasets 17 | - IMDB 18 | - SST 19 | - Trec 20 | 21 | ### Models 22 | - FastText 23 | - BasicCNN (KimCNN, MultiLayerCNN, Multi-perspective CNN) 24 | - InceptionCNN 25 | - LSTM (BiLSTM, StackLSTM) 26 | - LSTM with Attention (Self Attention / Quantum Attention) 27 | - Hybrids between CNN and RNN (RCNN, C-LSTM) 28 | - Transformer - Attention is all you need 29 | - ConvS2S 30 | - Capsule 31 | - Quantum-inspired NN 32 | 33 | ## Libraries 34 | 35 | You should have [these libraries](docs/windows_torch_en.md) installed: 36 | <pre>
 37 | python3
 38 | torch
 39 | torchtext (optional)
 40 | 
41 | 42 | ## Dataset 43 | Datasets are downloaded and configured automatically under the current path, or you can download the data manually by following the step-by-step guide in [Dataset](docs/data_config_en.md). 44 | 45 | This includes: 46 | <pre>
 47 | Glove embedding<br>
 48 | Sentiment classification dataset IMDB<br>
 49 | 
50 | 51 | 52 | ## Usage 53 | 54 | 55 | Run with the default settings: 56 | <pre>
python main.py
57 | 58 | CNN 59 |
python main.py --model cnn
60 | 61 | LSTM 62 |
python main.py --model lstm
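The model flag can be combined with the other hyper-parameters defined in ```opts.py``` (for example ```--batch_size``` and ```--learning_rate```); see ```opts.py``` for the full list of options:
<pre>python main.py --model cnn --batch_size 64 --learning_rate 4e-4</pre>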
 63 | 64 | ## Road Map 65 | - [X] Data preprocessing framework 66 | - [X] Model modules 67 | - [ ] Loss, estimator and hyper-parameter tuning. 68 | - [ ] Test modules 69 | - [ ] More datasets 70 | - [ ] More models 71 | 72 | 73 | 74 | ## Organisation of the repository 75 | The core of this repository is the models and the datasets. 76 | 77 | 78 | * ```dataloader/```: loads all datasets, such as ```IMDB``` and ```SST``` 79 | 80 | * ```models/```: builds all models, such as ```FastText```, ```LSTM```, ```CNN```, ```Capsule```, ```QuantumCNN```, ```Multi-Head Attention``` 81 | 82 | * ```opts.py```: parameters and configuration info. 83 | 84 | * ```utils.py```: utility functions. 85 | 86 | * ```dataHelper.py```: data loading and preprocessing helpers 87 | 88 | 89 | 90 | 91 | ## Contributor 92 | - [@Allenzhai](https://github.com/zhaizheng) 93 | - [@JaredWei](https://github.com/jacobwei) 94 | - [@AlexMeng](https://github.com/EdwardLorenz) 95 | - [@Lilianwang](https://github.com/WangLilian) 96 | - [@ZhanSu](https://github.com/shuishen112) 97 | - [@Wabywang](https://github.com/Wabyking) 98 | 99 | Issues and contributions are welcome! 100 | 101 | -------------------------------------------------------------------------------- /config/imdb.ini: -------------------------------------------------------------------------------- 1 | [COMMON] 2 | dataset = imdb 3 | 4 | -------------------------------------------------------------------------------- /dataHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import string 6 | from collections import Counter 7 | import pandas as pd 8 | from tqdm import tqdm 9 | import random 10 | import time 11 | from utils import log_time_delta 12 | from tqdm import tqdm 13 | from dataloader import Dataset 14 | import torch 15 | from torch.autograd import Variable 16 | from codecs import open 17 | try: 18 | import cPickle as pickle 19 | except ImportError: 20 | import pickle 21 | class Alphabet(dict): 22 | def __init__(self, start_feature_id = 1, alphabet_type="text"): 23 | self.fid = start_feature_id 24 | if alphabet_type=="text": 25 | self.add('[PADDING]') 26 | self.add('[UNK]') 27 | self.add('[END]') 28 | self.unknow_token = self.get('[UNK]') 29 | self.end_token = self.get('[END]') 30 | self.padding_token = self.get('[PADDING]') 31 | 32 | def add(self, item): 33 | idx = self.get(item, None) 34 | if idx is None: 35 | idx = self.fid 36 | self[item] = idx 37 | # self[idx] = item 38 | self.fid += 1 39 | return idx 40 | 41 | def addAll(self,words): 42 | for word in words: 43 | self.add(word) 44 | 45 | def dump(self, fname,path="temp"): 46 | if not os.path.exists(path): 47 | os.mkdir(path) 48 | with open(os.path.join(path,fname), "w",encoding="utf-8") as out: 49 | for k in sorted(self.keys()): 50 | out.write("{}\t{}\n".format(k, self[k])) 51 | 52 | class DottableDict(dict): 53 | def __init__(self, *args, **kwargs): 54 | dict.__init__(self, *args, **kwargs) 55 | self.__dict__ = self 56 | self.allowDotting() 57 | def allowDotting(self, state=True): 58 | if state: 59 | self.__dict__ = self 60 | else: 61 | self.__dict__ = dict() 62 | 63 | class BucketIterator(object): 64 | def __init__(self,data,opt=None,batch_size=2,shuffle=True,test=False,position=False): 65 | self.shuffle=shuffle 66 | self.data=data 67 | self.batch_size=batch_size 68 | self.test=test 69 | if opt is not None: 70 | self.setup(opt) 71 | def setup(self,opt): 72 | 73 | self.batch_size=opt.batch_size 74 | self.shuffle=opt.__dict__.get("shuffle",self.shuffle) 75 | 
self.position=opt.__dict__.get("position",False) 76 | self.padding_token = opt.alphabet.padding_token 77 | 78 | def transform(self,data): 79 | if torch.cuda.is_available(): 80 | data=data.reset_index() 81 | text= Variable(torch.LongTensor(data.text).cuda()) 82 | label= Variable(torch.LongTensor([int(i) for i in data.label.tolist()]).cuda()) 83 | else: 84 | data=data.reset_index() 85 | text= Variable(torch.LongTensor(data.text)) 86 | label= Variable(torch.LongTensor(data.label.tolist())) 87 | if self.position: 88 | position_tensor = self.get_position(data.text) 89 | return DottableDict({"text":(text,position_tensor),"label":label}) 90 | return DottableDict({"text":text,"label":label}) 91 | 92 | def get_position(self,inst_data): 93 | inst_position = np.array([[pos_i+1 if w_i != self.padding_token else 0 for pos_i, w_i in enumerate(inst)] for inst in inst_data]) 94 | inst_position_tensor = Variable( torch.LongTensor(inst_position), volatile=self.test) 95 | if torch.cuda.is_available(): 96 | inst_position_tensor=inst_position_tensor.cuda() 97 | return inst_position_tensor 98 | 99 | def __iter__(self): 100 | if self.shuffle: 101 | self.data = self.data.sample(frac=1).reset_index(drop=True) 102 | batch_nums = int(len(self.data)/self.batch_size) 103 | for i in range(batch_nums): 104 | yield self.transform(self.data[i*self.batch_size:(i+1)*self.batch_size]) 105 | yield self.transform(self.data[-1*self.batch_size:]) 106 | 107 | 108 | 109 | 110 | @log_time_delta 111 | def vectors_lookup(vectors,vocab,dim): 112 | embedding = np.zeros((len(vocab),dim)) 113 | count = 1 114 | for word in vocab: 115 | if word in vectors: 116 | count += 1 117 | embedding[vocab[word]]= vectors[word] 118 | else: 119 | embedding[vocab[word]]= np.random.uniform(-0.5,+0.5,dim)#vectors['[UNKNOW]'] #.tolist() 120 | print( 'word in embedding',count) 121 | return embedding 122 | 123 | @log_time_delta 124 | def load_text_vec(alphabet,filename="",embedding_size=-1): 125 | vectors = {} 126 | with open(filename,encoding='utf-8') as f: 127 | for line in tqdm(f): 128 | items = line.strip().split(' ') 129 | if len(items) == 2: 130 | vocab_size, embedding_size= items[0],items[1] 131 | print( 'embedding_size',embedding_size) 132 | print( 'vocab_size in pretrained embedding',vocab_size) 133 | else: 134 | word = items[0] 135 | if word in alphabet: 136 | vectors[word] = items[1:] 137 | print( 'words need to be found ',len(alphabet)) 138 | print( 'words found in wor2vec embedding ',len(vectors.keys())) 139 | 140 | if embedding_size==-1: 141 | embedding_size = len(vectors[list(vectors.keys())[0]]) 142 | return vectors,embedding_size 143 | 144 | def getEmbeddingFile(opt): 145 | #"glove" "w2v" 146 | embedding_name = opt.__dict__.get("embedding","glove_6b_300") 147 | if embedding_name.startswith("glove"): 148 | return os.path.join( ".vector_cache","glove.6B.300d.txt") 149 | else: 150 | return opt.embedding_dir 151 | # please refer to https://pypi.python.org/pypi/torchwordemb/0.0.7 152 | return 153 | @log_time_delta 154 | def getSubVectors(opt,alphabet): 155 | pickle_filename = "temp/"+opt.dataset+".vec" 156 | if not os.path.exists(pickle_filename) or opt.debug: 157 | glove_file = getEmbeddingFile(opt) 158 | wordset= set(alphabet.keys()) # python 2.7 159 | loaded_vectors,embedding_size = load_text_vec(wordset,glove_file) 160 | 161 | vectors = vectors_lookup(loaded_vectors,alphabet,embedding_size) 162 | if opt.debug: 163 | if not os.path.exists("temp"): 164 | os.mkdir("temp") 165 | with open("temp/oov.txt","w","utf-8") as f: 166 | unknown_set = 
set(alphabet.keys()) - set(loaded_vectors.keys()) 167 | f.write("\n".join( unknown_set)) 168 | if opt.debug: 169 | pickle.dump(vectors,open(pickle_filename,"wb")) 170 | return vectors 171 | else: 172 | print("load cache for SubVector") 173 | return pickle.load(open(pickle_filename,"rb")) 174 | 175 | def getDataSet(opt): 176 | import dataloader 177 | dataset= dataloader.getDataset(opt) 178 | # files=[os.path.join(data_dir,data_name) for data_name in ['train.txt','test.txt','dev.txt']] 179 | 180 | return dataset.getFormatedData() 181 | 182 | #data_dir = os.path.join(".data/clean",opt.dataset) 183 | #if not os.path.exists(data_dir): 184 | # import dataloader 185 | # dataset= dataloader.getDataset(opt) 186 | # return dataset.getFormatedData() 187 | #else: 188 | # for root, dirs, files in os.walk(data_dir): 189 | # for file in files: 190 | # yield os.path.join(root,file) 191 | 192 | 193 | # files=[os.path.join(data_dir,data_name) for data_name in ['train.txt','test.txt','dev.txt']] 194 | 195 | import re 196 | def clean(text): 197 | # text="'tycoon.","
","+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ",text) 201 | 202 | # print("%s $$$$$ %s" %(pre,text)) 203 | 204 | return text.lower().split() 205 | @log_time_delta 206 | def get_clean_datas(opt): 207 | pickle_filename = "temp/"+opt.dataset+".data" 208 | if not os.path.exists(pickle_filename) or opt.debug: 209 | datas = [] 210 | for filename in getDataSet(opt): 211 | df = pd.read_csv(filename,header = None,sep="\t",names=["text","label"]).fillna('0') 212 | 213 | # df["text"]= df["text"].apply(clean).str.lower().str.split() #replace("[\",:#]"," ") 214 | df["text"]= df["text"].apply(clean) 215 | datas.append(df) 216 | if opt.debug: 217 | if not os.path.exists("temp"): 218 | os.mkdir("temp") 219 | pickle.dump(datas,open(pickle_filename,"wb")) 220 | return datas 221 | else: 222 | print("load cache for data") 223 | return pickle.load(open(pickle_filename,"rb")) 224 | 225 | 226 | def loadData(opt,embedding=True): 227 | if embedding==False: 228 | return loadDataWithoutEmbedding(opt) 229 | 230 | datas =get_clean_datas(opt) 231 | 232 | alphabet = Alphabet(start_feature_id = 0) 233 | label_alphabet= Alphabet(start_feature_id = 0,alphabet_type="label") 234 | 235 | 236 | df=pd.concat(datas) 237 | df.to_csv("demo.text",sep="\t",index=False) 238 | label_set = set(df["label"]) 239 | label_alphabet.addAll(label_set) 240 | 241 | word_set=set() 242 | [word_set.add(word) for l in df["text"] if l is not None for word in l ] 243 | # from functools import reduce 244 | # word_set=set(reduce(lambda x,y :x+y,df["text"])) 245 | 246 | alphabet.addAll(word_set) 247 | 248 | vectors = getSubVectors(opt,alphabet) 249 | 250 | if opt.max_seq_len==-1: 251 | opt.max_seq_len = df.apply(lambda row: row["text"].__len__(),axis=1).max() 252 | opt.vocab_size= len(alphabet) 253 | opt.label_size= len(label_alphabet) 254 | opt.embedding_dim= vectors.shape[-1] 255 | opt.embeddings = torch.FloatTensor(vectors) 256 | opt.alphabet=alphabet 257 | # alphabet.dump(opt.dataset+".alphabet") 258 | for data in datas: 259 | data["text"]= data["text"].apply(lambda text: [alphabet.get(word,alphabet.unknow_token) for word in text[:opt.max_seq_len]] + [alphabet.padding_token] *int(opt.max_seq_len-len(text)) ) 260 | data["label"]=data["label"].apply(lambda text: label_alphabet.get(text)) 261 | 262 | return map(lambda x:BucketIterator(x,opt),datas)#map(BucketIterator,datas) # 263 | 264 | def loadDataWithoutEmbedding(opt): 265 | datas=[] 266 | for filename in getDataSet(opt): 267 | df = pd.read_csv(filename,header = None,sep="\t",names=["text","label"]).fillna('0') 268 | df["text"]= df["text"].str.lower() 269 | datas.append((df["text"],df["label"])) 270 | return datas 271 | 272 | 273 | 274 | 275 | 276 | if __name__ =="__main__": 277 | import opts 278 | opt = opts.parse_opt() 279 | opt.max_seq_len=-1 280 | import dataloader 281 | dataset= dataloader.getDataset(opt) 282 | datas=loadData(opt) 283 | 284 | 285 | -------------------------------------------------------------------------------- /dataloader/Dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os,urllib 3 | class Dataset(object): 4 | def __init__(self,opt=None): 5 | if opt is not None: 6 | self.setup(opt) 7 | self.http_proxy= opt.__dict__.get("proxy","null") 8 | 9 | else: 10 | self.name="demo" 11 | self.dirname="demo" 12 | self.http_proxy="null" 13 | 14 | self.urls=[] 15 | self.root=".data" 16 | self.saved_path= os.path.join(os.path.join(self.root,"clean"),self.name) 17 | self.formated_files=None 18 | 19 | 20 | 21 
| def setup(self,opt): 22 | 23 | self.name=opt.dataset 24 | self.dirname=opt.dataset 25 | self.http_proxy= opt.__dict__.get("proxy","null") 26 | 27 | 28 | def process(self): 29 | dirname=self.download() 30 | print("processing dirname: "+ dirname) 31 | raise Exception("method in father class have been called in processing: {} dataset".format(opt.dataset)) 32 | return dirname 33 | 34 | 35 | def getFormatedData(self): 36 | 37 | if self.formated_files is not None: 38 | return self.formated_files 39 | 40 | if os.path.exists(self.saved_path): 41 | return [os.path.join(self.saved_path,filename) for filename in os.listdir(self.saved_path)] 42 | self.formated_files = self.process() 43 | return self.formated_files 44 | 45 | def download_from_url(self,url, path, schedule=None): 46 | #if schedule is None: 47 | # schedule=lambda a,b,c : print("%.1f"%(100.0 * a * b / c), end='\r',flush=True) if (int(a * b / c)*100)%10==0 else None 48 | if self.http_proxy != "null": 49 | proxy = urllib.request.ProxyHandler({'http': self.http_proxy,'https': self.http_proxy}) 50 | # construct a new opener using your proxy settings 51 | opener = urllib.request.build_opener(proxy) 52 | # install the openen on the module-level 53 | urllib.request.install_opener(opener) 54 | print("proxy in %s" % self.http_proxy) 55 | # urllib.request.urlretrieve(url,path,lambda a,b,c : print("%.1f"%(100.0 * a * b / c), end='\r',flush=True) if (int(a * b / c)*1000)%100==0 else None )a 56 | try: 57 | urllib.request.urlretrieve(url,path ) 58 | except: 59 | import urllib2 60 | urllib2.urlretrieve(url,path ) 61 | return path 62 | 63 | def download(self,check=None): 64 | """Download and unzip an online archive (.zip, .gz, or .tgz). 65 | 66 | Arguments: 67 | check (str or None): Folder whose existence indicates 68 | that the dataset has already been downloaded, or 69 | None to check the existence of root/{cls.name}. 70 | 71 | Returns: 72 | dataset_path (str): Path to extracted dataset. 
73 | """ 74 | import zipfile,tarfile 75 | 76 | path = os.path.join(self.root, self.name) 77 | check = path if check is None else check 78 | if not os.path.isdir(check): 79 | for url in self.urls: 80 | if isinstance(url, tuple): 81 | url, filename = url 82 | else: 83 | filename = os.path.basename(url) 84 | zpath = os.path.join(path, filename) 85 | if not os.path.isfile(zpath): 86 | if not os.path.exists(os.path.dirname(zpath)): 87 | os.makedirs(os.path.dirname(zpath)) 88 | print('downloading {}'.format(filename)) 89 | 90 | self.download_from_url(url, zpath) 91 | ext = os.path.splitext(filename)[-1] 92 | if ext == '.zip': 93 | with zipfile.ZipFile(zpath, 'r') as zfile: 94 | print('extracting') 95 | zfile.extractall(path) 96 | elif ext in ['.gz', '.tgz',".bz2"]: 97 | with tarfile.open(zpath, 'r:gz') as tar: 98 | dirs = [member for member in tar.getmembers()] 99 | tar.extractall(path=path, members=dirs) 100 | else: 101 | print("%s do not need to be downloaded" % path) 102 | return path 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from .imdb import IMDBDataset 5 | from .mr import MRDataset 6 | from .glove import Glove 7 | from .sst import SSTDataset 8 | from .ag import AGDataset 9 | 10 | from .Dataset import Dataset 11 | def getDataset(opt): 12 | if opt.dataset=="imdb": 13 | dataset = IMDBDataset(opt) 14 | elif opt.dataset=="mr": 15 | dataset = MRDataset(opt) 16 | elif opt.dataset=="sst": 17 | dataset =SSTDataset(opt) 18 | elif opt.dataset == "ag": 19 | dataset =AGDataset(opt) 20 | elif opt.dataset in ["cr","mpqa","mr","sst1","sst2","subj","trec"]: 21 | dataset =Dataset(opt) 22 | 23 | 24 | else: 25 | raise Exception("dataset not supported: {}".format(opt.dataset)) 26 | return dataset 27 | 28 | def getEmbedding(opt): 29 | if opt.embedding_file.startswith("glove"): 30 | assert len(opt.embedding_file.split(".")) ==3 , "embedding_type format wrong" 31 | _,corpus,dim=opt.embedding_file.split(".") 32 | return Glove(corpus,dim,opt) 33 | else: 34 | raise Exception("embedding not supported: {}".format(opt.embedding_type)) 35 | 36 | -------------------------------------------------------------------------------- /dataloader/ag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Dataset import Dataset 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from codecs import open 8 | 9 | class AGDataset(Dataset): 10 | def __init__(self,opt=None,**kwargs): 11 | super(AGDataset,self).__init__(opt,**kwargs) 12 | self.urls=['http://www.di.unipi.it/~gulli/newsSpace.bz2'] 13 | 14 | 15 | def process(self): 16 | 17 | root=self.download() 18 | # root = os.path.join(root,"rt-polaritydata") 19 | # print("processing into: "+ root) 20 | ### root = "D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 21 | # if not os.path.exists(self.saved_path): 22 | # print("mkdir " + self.saved_path) 23 | # os.makedirs(self.saved_path) # better than os.mkdir 24 | ## 25 | # datas=[] 26 | # for polarity in ("neg","pos"): 27 | # filename = os.path.join(root,"rt-polarity."+polarity) 28 | # records=[] 29 | # with open(filename,encoding="utf-8",errors="replace") as f: 30 | # for i,line in enumerate(f): 31 | # print(i) 32 | # print(line) 33 | # records.append({"text":line.strip(),"label": 1 if polarity == "pos" else 0}) 34 | # 
datas.append(pd.DataFrame(records)) 35 | # 36 | # 37 | # 38 | # df = pd.concat(datas) 39 | # from sklearn.utils import shuffle 40 | # df = shuffle(df).reset_index() 41 | # 42 | # split_index = [True] * int (len(df) *0.8) + [False] *(len(df)-int (len(df) *0.8)) 43 | ## train=df.sample(frac=0.8) 44 | # train = df[split_index] 45 | # test = df[~np.array(split_index)] 46 | # 47 | # train_filename=os.path.join(self.saved_path,"train.csv") 48 | # test_filename = os.path.join(self.saved_path,"test.csv") 49 | # train[["text","label"]].to_csv(train_filename,encoding="utf-8",sep="\t",index=False,header=None) 50 | # test[["text","label"]].to_csv(test_filename,encoding="utf-8",sep="\t",index=False,header=None) 51 | # 52 | 53 | # 54 | # for data_folder in ("train","test"): 55 | # data = [] 56 | # for polarity in ("pos","neg"): 57 | # diranme=os.path.join( os.path.join(root,data_folder), polarity) 58 | # for rt, dirs, files in os.walk(diranme): 59 | # for f in files: 60 | # filename= os.path.join(rt,f) 61 | # data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 62 | # df=pd.DataFrame(data) 63 | # saved_filename=os.path.join(self.saved_path,data_folder+".csv") 64 | # 65 | # df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 66 | # print("finished %s"%saved_filename) 67 | print("processing into formated files over") 68 | 69 | # return [train_filename,test_filename] 70 | 71 | if __name__=="__main__": 72 | import opts 73 | opt = opts.parse_opt() 74 | opt.dataset="ag" 75 | import dataloader 76 | dataset= dataloader.getDataset(opt) 77 | dataset.process() 78 | 79 | 80 | -------------------------------------------------------------------------------- /dataloader/glove.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from .Dataset import Dataset 5 | class Glove(Dataset): 6 | def __init__(self,corpus,dim,opt=None,**kwargs): 7 | super(Glove,self).__init__(opt,**kwargs) 8 | 9 | self.root = ".vector_cache" 10 | 11 | # if not os.path.exists(self.root): 12 | # os.makedirs(self.root) 13 | 14 | embeding_urls = { 15 | '42b': 'http://nlp.stanford.edu/data/glove.42B.300d.zip', 16 | '840b': 'http://nlp.stanford.edu/data/glove.840B.300d.zip', 17 | 'twitter.27b': 'http://nlp.stanford.edu/data/glove.twitter.27B.zip', 18 | '6b': 'http://nlp.stanford.edu/data/glove.6B.zip', 19 | } 20 | 21 | 22 | self.urls= [ embeding_urls[corpus.lower()] ] 23 | print(self.urls) 24 | self.name = corpus 25 | 26 | 27 | def process(self): 28 | 29 | root=self.download() 30 | 31 | return root 32 | def getFilename(self): 33 | return self.process() 34 | 35 | if __name__ =="__main__": 36 | import opts 37 | opt = opts.parse_opt() 38 | 39 | 40 | import dataloader 41 | glove=dataloader.getEmbedding(opt) 42 | print(glove.getFilename()) 43 | 44 | -------------------------------------------------------------------------------- /dataloader/imdb.py: -------------------------------------------------------------------------------- 1 | from .Dataset import Dataset 2 | import os 3 | import pandas as pd 4 | from codecs import open 5 | 6 | class IMDBDataset(Dataset): 7 | def __init__(self,opt=None,**kwargs): 8 | super(IMDBDataset,self).__init__(opt,**kwargs) 9 | self.urls=['http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'] 10 | 11 | 12 | def process(self): 13 | 14 | root=self.download() 15 | root = os.path.join(root,"aclImdb") 16 | print("processing into: "+ root) 17 | # root = 
"D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 18 | if not os.path.exists(self.saved_path): 19 | print("mkdir " + self.saved_path) 20 | os.makedirs(self.saved_path) # better than os.mkdir 21 | 22 | datafiles=[] 23 | 24 | for data_folder in ("train","test"): 25 | data = [] 26 | for polarity in ("pos","neg"): 27 | diranme=os.path.join( os.path.join(root,data_folder), polarity) 28 | for rt, dirs, files in os.walk(diranme): 29 | for f in files: 30 | filename= os.path.join(rt,f) 31 | data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 32 | df=pd.DataFrame(data) 33 | saved_filename=os.path.join(self.saved_path,data_folder+".csv") 34 | 35 | df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 36 | print("finished %s"%saved_filename) 37 | datafiles.append(saved_filename) 38 | print("processing into formated files over") 39 | 40 | 41 | return datafiles 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /dataloader/mr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Dataset import Dataset 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from codecs import open 8 | 9 | class MRDataset(Dataset): 10 | def __init__(self,opt=None,**kwargs): 11 | super(MRDataset,self).__init__(opt,**kwargs) 12 | self.urls=['https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'] 13 | 14 | 15 | def process(self): 16 | 17 | root=self.download() 18 | root = os.path.join(root,"rt-polaritydata") 19 | print("processing into: "+ root) 20 | ## root = "D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 21 | if not os.path.exists(self.saved_path): 22 | print("mkdir " + self.saved_path) 23 | os.makedirs(self.saved_path) # better than os.mkdir 24 | # 25 | datas=[] 26 | for polarity in ("neg","pos"): 27 | filename = os.path.join(root,"rt-polarity."+polarity) 28 | records=[] 29 | with open(filename,encoding="utf-8",errors="replace") as f: 30 | for i,line in enumerate(f): 31 | print(i) 32 | print(line) 33 | records.append({"text":line.strip(),"label": 1 if polarity == "pos" else 0}) 34 | datas.append(pd.DataFrame(records)) 35 | 36 | 37 | 38 | df = pd.concat(datas) 39 | from sklearn.utils import shuffle 40 | df = shuffle(df).reset_index() 41 | 42 | split_index = [True] * int (len(df) *0.8) + [False] *(len(df)-int (len(df) *0.8)) 43 | # train=df.sample(frac=0.8) 44 | train = df[split_index] 45 | test = df[~np.array(split_index)] 46 | 47 | train_filename=os.path.join(self.saved_path,"train.csv") 48 | test_filename = os.path.join(self.saved_path,"test.csv") 49 | train[["text","label"]].to_csv(train_filename,encoding="utf-8",sep="\t",index=False,header=None) 50 | test[["text","label"]].to_csv(test_filename,encoding="utf-8",sep="\t",index=False,header=None) 51 | 52 | 53 | # 54 | # for data_folder in ("train","test"): 55 | # data = [] 56 | # for polarity in ("pos","neg"): 57 | # diranme=os.path.join( os.path.join(root,data_folder), polarity) 58 | # for rt, dirs, files in os.walk(diranme): 59 | # for f in files: 60 | # filename= os.path.join(rt,f) 61 | # data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 62 | # df=pd.DataFrame(data) 63 | # saved_filename=os.path.join(self.saved_path,data_folder+".csv") 64 | # 65 | # 
df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 66 | # print("finished %s"%saved_filename) 67 | print("processing into formated files over") 68 | 69 | return [train_filename,test_filename] 70 | 71 | if __name__=="__main__": 72 | import opts 73 | opt = opts.parse_opt() 74 | opt.dataset="mr" 75 | import dataloader 76 | dataset= dataloader.getDataset(opt) 77 | dataset.process() 78 | 79 | 80 | -------------------------------------------------------------------------------- /dataloader/sst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Dataset import Dataset 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from codecs import open 8 | 9 | class SSTDataset(Dataset): 10 | def __init__(self,opt=None,**kwargs): 11 | super(SSTDataset,self).__init__(opt,**kwargs) 12 | self.urls=['http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip'] 13 | 14 | 15 | def process(self): 16 | 17 | root=self.download() 18 | root = os.path.join(root,"rt-polaritydata") 19 | print("processing into: "+ root) 20 | ## root = "D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 21 | if not os.path.exists(self.saved_path): 22 | print("mkdir " + self.saved_path) 23 | os.makedirs(self.saved_path) # better than os.mkdir 24 | # 25 | datas=[] 26 | for polarity in ("neg","pos"): 27 | filename = os.path.join(root,"rt-polarity."+polarity) 28 | records=[] 29 | with open(filename,encoding="utf-8",errors="replace") as f: 30 | for i,line in enumerate(f): 31 | print(i) 32 | print(line) 33 | records.append({"text":line.strip(),"label": 1 if polarity == "pos" else 0}) 34 | datas.append(pd.DataFrame(records)) 35 | 36 | 37 | 38 | df = pd.concat(datas) 39 | from sklearn.utils import shuffle 40 | df = shuffle(df).reset_index() 41 | 42 | split_index = [True] * int (len(df) *0.8) + [False] *(len(df)-int (len(df) *0.8)) 43 | # train=df.sample(frac=0.8) 44 | train = df[split_index] 45 | test = df[~np.array(split_index)] 46 | 47 | train_filename=os.path.join(self.saved_path,"train.csv") 48 | test_filename = os.path.join(self.saved_path,"test.csv") 49 | train[["text","label"]].to_csv(train_filename,encoding="utf-8",sep="\t",index=False,header=None) 50 | test[["text","label"]].to_csv(test_filename,encoding="utf-8",sep="\t",index=False,header=None) 51 | 52 | 53 | # 54 | # for data_folder in ("train","test"): 55 | # data = [] 56 | # for polarity in ("pos","neg"): 57 | # diranme=os.path.join( os.path.join(root,data_folder), polarity) 58 | # for rt, dirs, files in os.walk(diranme): 59 | # for f in files: 60 | # filename= os.path.join(rt,f) 61 | # data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 62 | # df=pd.DataFrame(data) 63 | # saved_filename=os.path.join(self.saved_path,data_folder+".csv") 64 | # 65 | # df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 66 | # print("finished %s"%saved_filename) 67 | print("processing into formated files over") 68 | 69 | return [train_filename,test_filename] 70 | 71 | if __name__=="__main__": 72 | import opts 73 | opt = opts.parse_opt() 74 | opt.dataset="sst" 75 | import dataloader 76 | dataset= dataloader.getDataset(opt) 77 | dataset.process() 78 | 79 | 80 | -------------------------------------------------------------------------------- /dataloader/torch_text_demo/imdb.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | 3 | 4 | 5 | from torchtext import data 6 | from torchtext import datasets 7 | from torchtext.vocab import GloVe 8 | import torch 9 | if torch.cuda.is_available() : 10 | device = -1 11 | else: 12 | device = 0 13 | # Approach 1: 14 | # set up fields 15 | TEXT = data.Field(lower=True, include_lengths=True, batch_first=True) 16 | LABEL = data.Field(sequential=False) 17 | 18 | 19 | # make splits for data 20 | train, test = datasets.IMDB.splits(TEXT, LABEL) 21 | 22 | # print information about the data 23 | print('train.fields', train.fields) 24 | print('len(train)', len(train)) 25 | print('vars(train[0])', vars(train[0])) 26 | 27 | # build the vocabulary 28 | TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) 29 | LABEL.build_vocab(train) 30 | 31 | # print vocab information 32 | print('len(TEXT.vocab)', len(TEXT.vocab)) 33 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 34 | 35 | # make iterator for splits 36 | #train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=3, device=0) 37 | train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=3,device=-1) 38 | # print batch information 39 | batch = next(iter(train_iter)) 40 | print(batch.text) 41 | print(batch.label) 42 | 43 | # Approach 2: 44 | train_iter, test_iter = datasets.IMDB.iters(batch_size=4,device=-1) 45 | 46 | # print batch information 47 | batch = next(iter(train_iter)) 48 | print(batch.text) 49 | print(batch.label) -------------------------------------------------------------------------------- /dataloader/torch_text_demo/sst.py: -------------------------------------------------------------------------------- 1 | from torchtext import data 2 | from torchtext import datasets 3 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText 4 | 5 | 6 | # Approach 1: 7 | # set up fields 8 | TEXT = data.Field() 9 | LABEL = data.Field(sequential=False) 10 | 11 | # make splits for data 12 | train, val, test = datasets.SST.splits( 13 | TEXT, LABEL, fine_grained=True, train_subtrees=True, 14 | filter_pred=lambda ex: ex.label != 'neutral') 15 | 16 | # print information about the data 17 | print('train.fields', train.fields) 18 | print('len(train)', len(train)) 19 | print('vars(train[0])', vars(train[0])) 20 | 21 | # build the vocabulary 22 | url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec' 23 | TEXT.build_vocab(train, vectors=Vectors('wiki.simple.vec', url=url)) 24 | LABEL.build_vocab(train) 25 | 26 | # print vocab information 27 | print('len(TEXT.vocab)', len(TEXT.vocab)) 28 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 29 | 30 | # make iterator for splits 31 | train_iter, val_iter, test_iter = data.BucketIterator.splits( 32 | (train, val, test), batch_size=3, device=0) 33 | 34 | # print batch information 35 | batch = next(iter(train_iter)) 36 | print(batch.text) 37 | print(batch.label) 38 | 39 | # Approach 2: 40 | TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()]) 41 | LABEL.build_vocab(train) 42 | 43 | # print vocab information 44 | print('len(TEXT.vocab)', len(TEXT.vocab)) 45 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 46 | 47 | train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4) 48 | 49 | # print batch information 50 | batch = next(iter(train_iter)) 51 | print(batch.text) 52 | print(batch.label) 53 | 54 | # Approach 3: 55 | f = FastText() 56 | TEXT.build_vocab(train, vectors=f) 57 | TEXT.vocab.extend(f) 58 | LABEL.build_vocab(train) 
59 | 60 | # print vocab information 61 | print('len(TEXT.vocab)', len(TEXT.vocab)) 62 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 63 | 64 | train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4) 65 | 66 | # print batch information 67 | batch = next(iter(train_iter)) 68 | print(batch.text) 69 | print(batch.label) -------------------------------------------------------------------------------- /dataloader/torch_text_demo/trec.py: -------------------------------------------------------------------------------- 1 | from torchtext import data 2 | from torchtext import datasets 3 | from torchtext.vocab import GloVe, CharNGram 4 | import torch 5 | if not torch.cuda.is_available() : 6 | device = -1 7 | else: 8 | device = 0 9 | 10 | # Approach 1: 11 | # set up fields 12 | TEXT = data.Field(lower=True, include_lengths=True, batch_first=True) 13 | LABEL = data.Field(sequential=False) 14 | 15 | 16 | # make splits for data 17 | train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True) 18 | 19 | # print information about the data 20 | print('train.fields', train.fields) 21 | print('len(train)', len(train)) 22 | print('vars(train[0])', vars(train[0])) 23 | 24 | # build the vocabulary 25 | TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) 26 | LABEL.build_vocab(train) 27 | 28 | # print vocab information 29 | print('len(TEXT.vocab)', len(TEXT.vocab)) 30 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 31 | 32 | # make iterator for splits 33 | train_iter, test_iter = data.BucketIterator.splits( 34 | (train, test), batch_size=3, device=device) 35 | 36 | # print batch information 37 | batch = next(iter(train_iter)) 38 | print(batch.text) 39 | print(batch.label) 40 | 41 | # Approach 2: 42 | TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram()],device=device) 43 | LABEL.build_vocab(train) 44 | 45 | train_iter, test_iter = datasets.TREC.iters(batch_size=4) 46 | 47 | # print batch information 48 | batch = next(iter(train_iter)) 49 | print(batch.text) 50 | print(batch.label) -------------------------------------------------------------------------------- /docs/data_config.md: -------------------------------------------------------------------------------- 1 | # 数据配置 2 | 3 | 4 | ##第一步先支持[torchtext](https://github.com/pytorch/text)本来支持的数据集合 5 | 6 | 7 | The datasets module currently contains: 8 | 9 | - Sentiment analysis: SST and IMDb 10 | - Question classification: TREC 11 | - Entailment: SNLI 12 | - Language modeling: WikiText-2 13 | - Machine translation: Multi30k, IWSLT, WMT14 14 | 15 | Others are planned or a work in progress: 16 | 17 | - Question answering: SQuAD 18 | 19 | 目前需要配置的数据集合 20 | 21 | ###Glove的下载到项目的根目录 ..vector_cache文件夹下 22 | 23 | - [42B](http://nlp.stanford.edu/data/glove.42B.300d.zip) 24 | - [840B](http://nlp.stanford.edu/data/glove.840B.300d.zip) 25 | - [twitter.27B](http://nlp.stanford.edu/data/glove.twitter.27B.zip) 26 | - [6B](http://nlp.stanford.edu/data/glove.6B.zip) 27 | 28 | ###分类数据集下载配置 29 | 30 | - [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)数据集下载到 .data/imdb 31 | - [SST](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)数据集下载到.data/sst 32 | - TREC [1](http://cogcomp.org/Data/QA/QC/train_5500.label) [2](http://cogcomp.org/Data/QA/QC/TREC_10.label) 问题分类数据集下载到.data/imdb 33 | 34 | ###文件结构示例如下 35 | 36 | - TextClassificationBenchmark 37 | - .data 38 | - imdb 39 | - aclImdb_v1.tar.gz 40 | - sst 41 | - trainDevTestTrees_PTB.zip 42 | - trec 43 | - train_5500.label 44 | - 
TREC_10.label 45 | - .vector_cache 46 | - glove.42B.300d.zip 47 | - glove.840B.300d.zip 48 | - glove.twitter.27B.zip 49 | - glove.6B.zip 50 | 51 | 52 | 53 | ##更多的数据集请等待我们进一步更新 -------------------------------------------------------------------------------- /docs/data_config_en.md: -------------------------------------------------------------------------------- 1 | # Data configuration 2 | 3 | **Install [torchtext](https://github.com/pytorch/text) for data processing** 4 | 5 | The datasets module currently contains: 6 | 7 | - Sentiment analysis: SST and IMDb 8 | - Question classification: TREC 9 | - Entailment: SNLI 10 | - Language modeling: WikiText-2 11 | - Machine translation: Multi30k, IWSLT, WMT14 12 | 13 | Others are planned or a work in progress: 14 | 15 | - Question answering: SQuAD 16 | 17 | The current need to configure the data collection 18 | 19 | ### Glove 20 | 21 | Download to the project's root directory under the folder vector_cache 22 | 23 | - [42B](http://nlp.stanford.edu/data/glove.42B.300d.zip) 24 | - [840B](http://nlp.stanford.edu/data/glove.840B.300d.zip) 25 | - [twitter.27B](http://nlp.stanford.edu/data/glove.twitter.27B.zip) 26 | - [6B](http://nlp.stanford.edu/data/glove.6B.zip) 27 | 28 | ### Classification Datasets 29 | 30 | - Download [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) dataset to .data/imdb 31 | - Download [SST](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip) dataset to .data/sst 32 | - Download TREC [Question Classification ](http://cogcomp.org/Data/QA/QC/train_5500.label) [2](http://cogcomp.org/Data/QA/QC/TREC_10.label) dataset to .data/imdb 33 | 34 | ### File Structure 35 | 36 | - TextClassificationBenchmark 37 | - .data 38 | - imdb 39 | - aclImdb_v1.tar.gz 40 | - sst 41 | - trainDevTestTrees_PTB.zip 42 | - trec 43 | - train_5500.label 44 | - TREC_10.label 45 | - .vector_cache 46 | - glove.42B.300d.zip 47 | - glove.840B.300d.zip 48 | - glove.twitter.27B.zip 49 | - glove.6B.zip 50 | 51 | 52 | 53 | ## More datasets and updates coming soon, please wait for us to update further 54 | -------------------------------------------------------------------------------- /docs/windows_torch.md: -------------------------------------------------------------------------------- 1 | # Windows 平台安装 PyTorch 2 | 3 | 如果是Linux,Mac安装直接移步pytorch[主页](http://pytorch.org/), 再安装TorchText 4 | 5 | ## Python安装 6 | 建议直接安装anaconda的[安装包](https://repo.continuum.io/archive/Anaconda3-5.0.1-Windows-x86_64.exe) 7 | 8 | ## Pytorch安装 9 | 在[百度网盘](https://pan.baidu.com/s/1dF6ayLr#list/path=%2Fpytorch)下载一个 离线安装包 , 0.3版本或者是0.2版本均可 10 | 如果是whl安装包 11 |
pip install torch0.3XXX.whl
12 | 如果是一个conda安装包(压缩文件后缀) 13 |
conda install --offline  torch0.3XXX.tar.bz
14 | 15 | ## TorchText 安装 16 | 17 | 前提是有git和pip,如果没有需要下载git,并将其放到Path环境变量里 18 |
pip install git+https://github.com/pytorch/text.git 
19 | 20 | 还需要有代理的话 21 | 22 | 23 | 24 |
pip install git+https://github.com/pytorch/text.git --proxy proxy.xx.com:8080 
 25 | 26 | 27 | 参考链接 28 | https://zhuanlan.zhihu.com/p/31747695 29 | -------------------------------------------------------------------------------- /docs/windows_torch_en.md: -------------------------------------------------------------------------------- 1 | # Windows Platform Installation for PyTorch 2 | 3 | On Linux or Mac, install PyTorch directly from the [homepage](http://pytorch.org/), then install TorchText 4 | 5 | ## Python installation 6 | Please install Anaconda directly: [installation package](https://repo.continuum.io/archive/Anaconda3-5.0.1-Windows-x86_64.exe) 7 | 8 | ## Pytorch installation 9 | Download an offline package (a version 0.3 or 0.2 wheel) from [Baidu Network Disk](https://pan.baidu.com/s/1dF6ayLr#list/path=%2Fpytorch). If it is a whl package: 10 | <pre>
pip install torch0.3XXX.whl
 11 | 12 | If it is a conda package (a compressed archive): 13 | <pre>
conda install --offline  torch0.3XXX.tar.bz
 14 | 15 | ## TorchText installation 16 | 17 | This assumes you have git and pip; if you do not, download git and add it to the Path environment variable. 18 | <pre>
pip install git+https://github.com/pytorch/text.git 
19 | 20 | If you need a proxy, 21 |
pip install git+https://github.com/pytorch/text.git --proxy proxy.xx.com:8080 
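To verify the installation, a quick check such as the following should import both packages without errors:
<pre>python -c "import torch; import torchtext; print(torch.__version__)"</pre>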
22 | 23 | 24 | Reference Link: 25 | https://zhuanlan.zhihu.com/p/31747695 26 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import torch 8 | from torch.autograd import Variable 9 | import torch.optim as optim 10 | import numpy as np 11 | 12 | from six.moves import cPickle 13 | 14 | import opts 15 | import models 16 | import torch.nn as nn 17 | import utils 18 | import torch.nn.functional as F 19 | from torchtext import data 20 | from torchtext import datasets 21 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText 22 | from torch.nn.modules.loss import NLLLoss,MultiLabelSoftMarginLoss,MultiLabelMarginLoss,BCELoss 23 | import dataHelper 24 | import time,os 25 | 26 | 27 | from_torchtext = False 28 | 29 | opt = opts.parse_opt() 30 | #opt.proxy="http://xxxx.xxxx.com:8080" 31 | 32 | 33 | if "CUDA_VISIBLE_DEVICES" not in os.environ.keys(): 34 | os.environ["CUDA_VISIBLE_DEVICES"] =opt.gpu 35 | #opt.model ='lstm' 36 | #opt.model ='capsule' 37 | 38 | if from_torchtext: 39 | train_iter, test_iter = utils.loadData(opt) 40 | else: 41 | import dataHelper as helper 42 | train_iter, test_iter = dataHelper.loadData(opt) 43 | 44 | opt.lstm_layers=2 45 | 46 | model=models.setup(opt) 47 | if torch.cuda.is_available(): 48 | model.cuda() 49 | model.train() 50 | print("# parameters:", sum(param.numel() for param in model.parameters() if param.requires_grad)) 51 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=opt.learning_rate) 52 | optimizer.zero_grad() 53 | loss_fun = F.cross_entropy 54 | 55 | #batch = next(iter(train_iter)) 56 | 57 | #x=batch.text[0] 58 | 59 | #x=batch.text[0] #64x200 60 | 61 | #print(utils.evaluation(model,test_iter)) 62 | for i in range(opt.max_epoch): 63 | for epoch,batch in enumerate(train_iter): 64 | start= time.time() 65 | 66 | text = batch.text[0] if from_torchtext else batch.text 67 | predicted = model(text) 68 | 69 | loss= loss_fun(predicted,batch.label) 70 | 71 | loss.backward() 72 | utils.clip_gradient(optimizer, opt.grad_clip) 73 | optimizer.step() 74 | if epoch% 100==0: 75 | if torch.cuda.is_available(): 76 | print("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i,epoch,loss.cpu().data.numpy()[0],time.time()-start)) 77 | else: 78 | print("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i,epoch,loss.data.numpy()[0],time.time()-start)) 79 | 80 | percision=utils.evaluation(model,test_iter,from_torchtext) 81 | print("%d iteration with percision %.4f" % (i,percision)) 82 | 83 | 84 | -------------------------------------------------------------------------------- /models/BiBloSA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #https://github.com/galsang/BiBloSA-pytorch/blob/master/model/model.py 4 | 5 | -------------------------------------------------------------------------------- /models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class CNN(nn.Module): 7 | def __init__(self, opt): 8 | super(CNN, self).__init__() 9 | 10 | self.embedding_type = opt.embedding_type 11 | self.batch_size = opt.batch_size 12 | self.max_sent_len = 
opt.max_sent_len 13 | self.embedding_dim = opt.embedding_dim 14 | self.vocab_size = opt.vocab_size 15 | self.CLASS_SIZE = opt.label_size 16 | self.FILTERS = opt["FILTERS"] 17 | self.FILTER_NUM = opt["FILTER_NUM"] 18 | self.keep_dropout = opt.keep_dropout 19 | self.IN_CHANNEL = 1 20 | 21 | assert (len(self.FILTERS) == len(self.FILTER_NUM)) 22 | 23 | # one for UNK and one for zero padding 24 | self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1) 25 | if self.embedding_type == "static" or self.embedding_type == "non-static" or self.embedding_type == "multichannel": 26 | self.WV_MATRIX = opt["WV_MATRIX"] 27 | self.embedding.weight.data.copy_(torch.from_numpy(self.WV_MATRIX)) 28 | if self.embedding_type == "static": 29 | self.embedding.weight.requires_grad = False 30 | elif self.embedding_type == "multichannel": 31 | self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.VOCAB_SIZE + 1) 32 | self.embedding2.weight.data.copy_(torch.from_numpy(self.WV_MATRIX)) 33 | self.embedding2.weight.requires_grad = False 34 | self.IN_CHANNEL = 2 35 | 36 | for i in range(len(self.FILTERS)): 37 | conv = nn.Conv1d(self.IN_CHANNEL, self.FILTER_NUM[i], self.embedding_dim * self.FILTERS[i], stride=self.WORD_DIM) 38 | setattr(self, 'conv_%d'%i, conv) 39 | 40 | self.fc = nn.Linear(sum(self.FILTER_NUM), self.label_size) 41 | 42 | def get_conv(self, i): 43 | return getattr(self, 'conv_%d'%i) 44 | 45 | def forward(self, inp): 46 | x = self.embedding(inp).view(-1, 1, self.embedding_dim * self.max_sent_len) 47 | if self.embedding_type == "multichannel": 48 | x2 = self.embedding2(inp).view(-1, 1, self.embedding_dim * self.max_sent_len) 49 | x = torch.cat((x, x2), 1) 50 | 51 | conv_results = [ 52 | F.max_pool1d(F.relu(self.get_conv(i)(x)), self.max_sent_len - self.FILTERS[i] + 1) 53 | .view(-1, self.FILTER_NUM[i]) 54 | for i in range(len(self.FILTERS))] 55 | 56 | x = torch.cat(conv_results, 1) 57 | x = F.dropout(x, p=self.keep_dropout, training=self.training) 58 | x = self.fc(x) 59 | return x 60 | 61 | 62 | 63 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Yoon/model.py 64 | class CNN1(nn.Module): 65 | 66 | def __init__(self, opt): 67 | super(CNN1,self).__init__() 68 | self.opt = opt 69 | 70 | V = opt.vocab_size 71 | D = opt.embedding_dim 72 | C = opt.label_size 73 | Ci = 1 74 | Co = opt.kernel_num 75 | Ks = opt.kernel_sizes 76 | 77 | self.embed = nn.Embedding(V, D) 78 | #self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] 79 | self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) 80 | ''' 81 | self.conv13 = nn.Conv2d(Ci, Co, (3, D)) 82 | self.conv14 = nn.Conv2d(Ci, Co, (4, D)) 83 | self.conv15 = nn.Conv2d(Ci, Co, (5, D)) 84 | ''' 85 | self.dropout = nn.Dropout(opt.dropout) 86 | self.fc1 = nn.Linear(len(Ks)*Co, C) 87 | 88 | def conv_and_pool(self, x, conv): 89 | x = F.relu(conv(x)).squeeze(3) #(N,Co,W) 90 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 91 | return x 92 | 93 | 94 | def forward(self, x): 95 | x = self.embed(x) # (N,W,D) 96 | 97 | if self.args.static: 98 | x = Variable(x) 99 | 100 | x = x.unsqueeze(1) # (N,Ci,W,D) 101 | 102 | x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks) 103 | 104 | 105 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks) 106 | 107 | x = torch.cat(x, 1) 108 | 109 | ''' 110 | x1 = self.conv_and_pool(x,self.conv13) #(N,Co) 111 | x2 = self.conv_and_pool(x,self.conv14) #(N,Co) 112 | x3 = 
self.conv_and_pool(x,self.conv15) #(N,Co) 113 | x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) 114 | ''' 115 | x = self.dropout(x) # (N,len(Ks)*Co) 116 | logit = self.fc1(x) # (N,C) 117 | return logit 118 | 119 | import torch.nn as nn 120 | 121 | 122 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Zhang/model.py 123 | class CNN2(nn.Module): 124 | def __init__(self, opt): 125 | super(CNN2, self).__init__() 126 | self.embed = nn.Embedding(opt.vocab_size + 1, opt.embedding_dim) 127 | 128 | self.conv1 = nn.Sequential( 129 | nn.Conv1d(opt.l0, 256, kernel_size=7, stride=1), 130 | nn.ReLU(), 131 | nn.MaxPool1d(kernel_size=3, stride=3) 132 | ) 133 | 134 | self.conv2 = nn.Sequential( 135 | nn.Conv1d(256, 256, kernel_size=7, stride=1), 136 | nn.ReLU(), 137 | nn.MaxPool1d(kernel_size=3, stride=3) 138 | ) 139 | 140 | self.conv3 = nn.Sequential( 141 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 142 | nn.ReLU() 143 | ) 144 | 145 | self.conv4 = nn.Sequential( 146 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 147 | nn.ReLU() 148 | ) 149 | 150 | self.conv5 = nn.Sequential( 151 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 152 | nn.ReLU() 153 | ) 154 | 155 | self.conv6 = nn.Sequential( 156 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 157 | nn.ReLU(), 158 | nn.MaxPool1d(kernel_size=3, stride=3) 159 | ) 160 | 161 | self.fc = nn.Linear(256, opt.label_size) 162 | 163 | def forward(self, x_input): 164 | # Embedding 165 | x = self.embed(x_input) # dim: (batch_size, max_seq_len, embedding_size) 166 | x = self.conv1(x) 167 | x = self.conv2(x) 168 | x = self.conv3(x) 169 | x = self.conv4(x) 170 | x = self.conv5(x) 171 | x = self.conv6(x) 172 | 173 | # collapse 174 | x = x.view(x.size(0), -1) 175 | x = self.fc(x) 176 | 177 | return F.log_softmax(x) 178 | class CNN3(nn.Module): 179 | """ 180 | A CNN for text classification. 181 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 
182 | """ 183 | def __init__(self, args): 184 | super(CNN3, self).__init__() 185 | self.args = args 186 | 187 | embedding_dim = args.embed_dim 188 | embedding_num = args.num_features 189 | class_number = args.class_num 190 | in_channel = 1 191 | out_channel = args.kernel_num 192 | kernel_sizes = args.kernel_sizes 193 | 194 | self.embed = nn.Embedding(embedding_num+1, embedding_dim) 195 | self.conv = nn.ModuleList([nn.Conv2d(in_channel, out_channel, (K, embedding_dim)) for K in kernel_sizes]) 196 | 197 | self.dropout = nn.Dropout(args.dropout) 198 | self.fc = nn.Linear(len(kernel_sizes) * out_channel, class_number) 199 | 200 | 201 | def forward(self, input_x): 202 | """ 203 | :param input_x: a list size having the number of batch_size elements with the same length 204 | :return: batch_size X num_aspects tensor 205 | """ 206 | # Embedding 207 | x = self.embed(input_x) # dim: (batch_size, max_seq_len, embedding_size) 208 | 209 | if self.args.static: 210 | x = F.Variable(input_x) 211 | 212 | # Conv & max pool 213 | x = x.unsqueeze(1) # dim: (batch_size, 1, max_seq_len, embedding_size) 214 | 215 | # turns to be a list: [ti : i \in kernel_sizes] where ti: tensor of dim([batch, num_kernels, max_seq_len-i+1]) 216 | x = [F.relu(conv(x)).squeeze(3) for conv in self.conv] 217 | 218 | # dim: [(batch_size, num_kernels), ...]*len(kernel_sizes) 219 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] 220 | x = torch.cat(x, 1) 221 | 222 | # Dropout & output 223 | x = self.dropout(x) # (batch_size,len(kernel_sizes)*num_kernels) 224 | logit = F.log_softmax(self.fc(x)) # (batch_size, num_aspects) 225 | 226 | return logit -------------------------------------------------------------------------------- /models/CNNBasic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | import numpy as np 4 | from torch import nn 5 | 6 | class BasicCNN1D(nn.Module): 7 | def __init__(self, opt ): 8 | super(BasicCNN1D, self).__init__() 9 | self.model_name = 'CNNText' 10 | self.opt=opt 11 | self.content_dim=opt.__dict__.get("content_dim",256) 12 | self.kernel_size=opt.__dict__.get("kernel_size",3) 13 | 14 | 15 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 16 | if opt.__dict__.get("embeddings",None) is not None: 17 | self.encoder.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | 19 | self.content_conv = nn.Sequential( 20 | nn.Conv1d(in_channels = opt.embedding_dim, 21 | out_channels = self.content_dim, #256 22 | kernel_size = self.kernel_size), #3 23 | nn.ReLU(), 24 | nn.MaxPool1d(kernel_size = (opt.max_seq_len - self.kernel_size + 1)) 25 | # nn.AdaptiveMaxPool1d() 26 | ) 27 | self.fc = nn.Linear(self.content_dim, opt.label_size) 28 | 29 | def forward(self, content): 30 | 31 | content = self.encoder(content) #64x200x300 32 | content_out = self.content_conv(content.permute(0,2,1)) #64x256x1 33 | reshaped = content_out.view(content_out.size(0), -1) #64x256 34 | logits = self.fc(reshaped) #64x3 35 | return logits 36 | class BasicCNN2D(nn.Module): 37 | """ 38 | A CNN for text classification. 39 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 
40 | """ 41 | def __init__(self, args): 42 | super(BasicCNN2D, self).__init__() 43 | self.opt = opt 44 | 45 | self.embedding_dim = opt.embedding_dim 46 | self.vocab_size = opt.vocab_size 47 | self.label_size = opt.label_size 48 | self.keep_dropout = opt.keep_dropout 49 | in_channel = 1 50 | self.kernel_nums = opt.kernel_nums 51 | self.kernel_sizes = opt.kernel_sizes 52 | 53 | self.embed = nn.Embedding(self.vocab_size+1, self.embedding_dim) 54 | 55 | if opt.__dict__.get("embeddings",None) is not None: 56 | self.embed.weight=nn.Parameter(opt.embeddings) 57 | 58 | self.conv = nn.ModuleList([nn.Conv2d(in_channel, out_channel, (K, self.embedding_dim)) for K,out_channel in zip(self.kernel_sizes,self.kernel_nums)]) 59 | 60 | self.dropout = nn.Dropout(self.keep_dropout) 61 | self.fc = nn.Linear(len(self.kernel_sizes) * self.out_channel, self.label_size) 62 | 63 | 64 | def forward(self, input_x): 65 | """ 66 | :param input_x: a list size having the number of batch_size elements with the same length 67 | :return: batch_size X num_aspects tensor 68 | """ 69 | # Embedding 70 | x = self.embed(input_x) # dim: (batch_size, max_seq_len, embedding_size) 71 | 72 | if self.opt.static: 73 | x = F.Variable(input_x) 74 | 75 | # Conv & max pool 76 | x = x.unsqueeze(1) # dim: (batch_size, 1, max_seq_len, embedding_size) 77 | 78 | # turns to be a list: [ti : i \in kernel_sizes] where ti: tensor of dim([batch, num_kernels, max_seq_len-i+1]) 79 | x = [F.relu(conv(x)).squeeze(3) for conv in self.conv] 80 | 81 | # dim: [(batch_size, num_kernels), ...]*len(kernel_sizes) 82 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] 83 | x = torch.cat(x, 1) 84 | 85 | # Dropout & output 86 | x = self.dropout(x) # (batch_size,len(kernel_sizes)*num_kernels) 87 | logit = F.log_softmax(self.fc(x)) # (batch_size, num_aspects) 88 | 89 | return logit 90 | import argparse 91 | 92 | def parse_opt(): 93 | parser = argparse.ArgumentParser() 94 | # Data input settings 95 | parser.add_argument('--hidden_dim', type=int, default=128, 96 | help='hidden_dim') 97 | 98 | 99 | parser.add_argument('--batch_size', type=int, default=64, 100 | help='batch_size') 101 | parser.add_argument('--embedding_dim', type=int, default=300, 102 | help='embedding_dim') 103 | parser.add_argument('--learning_rate', type=float, default=4e-4, 104 | help='learning_rate') 105 | parser.add_argument('--grad_clip', type=float, default=1e-1, 106 | help='grad_clip') 107 | parser.add_argument('--model', type=str, default="lstm", 108 | help='model name') 109 | parser.add_argument('--model', type=str, default="lstm", 110 | help='model name') 111 | 112 | 113 | # 114 | args = parser.parse_args() 115 | args.embedding_dim=300 116 | args.vocab_size=10000 117 | args.kernel_size=3 118 | args.num_classes=3 119 | args.content_dim=256 120 | args.max_seq_len=50 121 | 122 | # 123 | # # Check if args are valid 124 | # assert args.rnn_size > 0, "rnn_size should be greater than 0" 125 | 126 | 127 | return args 128 | 129 | if __name__ == '__main__': 130 | 131 | opt = parse_opt() 132 | m = CNNText(opt) 133 | content = t.autograd.Variable(t.arange(0,3200).view(-1,50)).long() 134 | o = m(content) 135 | print(o.size()) 136 | 137 | -------------------------------------------------------------------------------- /models/CNNInception.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import torch as t 5 | import torch 6 | import numpy as np 7 | from torch import nn 8 | from collections import OrderedDict 9 | 10 | class 
Inception(nn.Module): 11 | def __init__(self,cin,co,relu=True,norm=True): 12 | super(Inception, self).__init__() 13 | assert(co%4==0) 14 | cos=[int(co/4)]*4 15 | self.activa=nn.Sequential() 16 | if norm:self.activa.add_module('norm',nn.BatchNorm1d(co)) 17 | if relu:self.activa.add_module('relu',nn.ReLU(True)) 18 | self.branch1 =nn.Sequential(OrderedDict([ 19 | ('conv1', nn.Conv1d(cin,cos[0], 1,stride=1)), 20 | ])) 21 | self.branch2 =nn.Sequential(OrderedDict([ 22 | ('conv1', nn.Conv1d(cin,cos[1], 1)), 23 | ('norm1', nn.BatchNorm1d(cos[1])), 24 | ('relu1', nn.ReLU(inplace=True)), 25 | ('conv3', nn.Conv1d(cos[1],cos[1], 3,stride=1,padding=1)), 26 | ])) 27 | self.branch3 =nn.Sequential(OrderedDict([ 28 | ('conv1', nn.Conv1d(cin,cos[2], 3,padding=1)), 29 | ('norm1', nn.BatchNorm1d(cos[2])), 30 | ('relu1', nn.ReLU(inplace=True)), 31 | ('conv3', nn.Conv1d(cos[2],cos[2], 5,stride=1,padding=2)), 32 | ])) 33 | self.branch4 =nn.Sequential(OrderedDict([ 34 | #('pool',nn.MaxPool1d(2)), 35 | ('conv3', nn.Conv1d(cin,cos[3], 3,stride=1,padding=1)), 36 | ])) 37 | def forward(self,x): 38 | branch1=self.branch1(x) 39 | branch2=self.branch2(x) 40 | branch3=self.branch3(x) 41 | branch4=self.branch4(x) 42 | result=self.activa(torch.cat((branch1,branch2,branch3,branch4),1)) 43 | return result 44 | class InceptionCNN(nn.Module): 45 | def __init__(self, opt ): 46 | super(InceptionCNN, self).__init__() 47 | incept_dim=getattr(opt,"inception_dim",512) 48 | self.model_name = 'CNNText_inception' 49 | self.opt=opt 50 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 51 | 52 | self.content_conv=nn.Sequential( 53 | Inception(opt.embedding_dim,incept_dim),#(batch_size,64,opt.content_seq_len)->(batch_size,64,(opt.content_seq_len)/2) 54 | #Inception(incept_dim,incept_dim),#(batch_size,64,opt.content_seq_len/2)->(batch_size,32,(opt.content_seq_len)/4) 55 | Inception(incept_dim,incept_dim), 56 | nn.MaxPool1d(opt.max_seq_len) 57 | ) 58 | self.fc = nn.Sequential( 59 | nn.Linear(incept_dim,getattr(opt,"linear_hidden_size",2000)), 60 | nn.BatchNorm1d(getattr(opt,"linear_hidden_size",2000)), 61 | nn.ReLU(inplace=True), 62 | nn.Linear(getattr(opt,"linear_hidden_size",2000) ,opt.label_size) 63 | ) 64 | if opt.__dict__.get("embeddings",None) is not None: 65 | self.encoder.weight=nn.Parameter(opt.embeddings) 66 | 67 | def forward(self,content): 68 | 69 | content=self.encoder(content) 70 | if self.opt.embedding_type=="static": 71 | content=content.detach(0) 72 | 73 | content_out=self.content_conv(content.permute(0,2,1)) 74 | out=content_out.view(content_out.size(0), -1) 75 | out=self.fc(out) 76 | return out 77 | 78 | if __name__ == '__main__': 79 | import sys 80 | sys.path.append(r"..") 81 | import opts 82 | opt=opts.parse_opt() 83 | opt.vocab_size=2501 84 | opt.label_size=3 85 | m = CNNText_inception(opt) 86 | 87 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 88 | o = m(content) 89 | print(o.size()) 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /models/CNNKim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class KIMCNN1D(nn.Module): 7 | def __init__(self, opt): 8 | super(KIMCNN1D, self).__init__() 9 | 10 | self.embedding_type = opt.embedding_type 11 | self.batch_size = opt.batch_size 12 | self.max_seq_len = opt.max_seq_len 13 | 
self.embedding_dim = opt.embedding_dim 14 | self.vocab_size = opt.vocab_size 15 | self.label_size = opt.label_size 16 | self.kernel_sizes = opt.kernel_sizes 17 | self.kernel_nums = opt.kernel_nums 18 | self.keep_dropout = opt.keep_dropout 19 | self.in_channel = 1 20 | 21 | assert (len(self.kernel_sizes) == len(self.kernel_nums)) 22 | 23 | # one for UNK and one for zero padding 24 | self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1) 25 | if self.embedding_type == "static" or self.embedding_type == "non-static" or self.embedding_type == "multichannel": 26 | self.embedding.weight=nn.Parameter(opt.embeddings) 27 | if self.embedding_type == "static": 28 | self.embedding.weight.requires_grad = False 29 | elif self.embedding_type == "multichannel": 30 | self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1) 31 | self.embedding2.weight=nn.Parameter(opt.embeddings) 32 | self.embedding2.weight.requires_grad = False 33 | self.in_channel = 2 34 | else: 35 | pass 36 | # 37 | # for i in range(len(self.kernel_sizes)): 38 | # conv = nn.Conv1d(self.in_channel, self.kernel_nums[i], self.embedding_dim * self.kernel_sizes[i], stride=self.embedding_dim) 39 | # setattr(self, 'conv_%d'%i, conv) 40 | self.convs = nn.ModuleList([nn.Conv1d(self.in_channel, num, self.embedding_dim * size, stride=self.embedding_dim) for size,num in zip(opt.kernel_sizes,opt.kernel_nums)]) 41 | self.fc = nn.Linear(sum(self.kernel_nums), self.label_size) 42 | 43 | def get_conv(self, i): 44 | return getattr(self, 'conv_%d'%i) 45 | 46 | def forward(self, inp): 47 | x = self.embedding(inp).view(-1, 1, self.embedding_dim * self.max_seq_len) 48 | if self.embedding_type == "multichannel": 49 | x2 = self.embedding2(inp).view(-1, 1, self.embedding_dim * self.max_seq_len) 50 | x = torch.cat((x, x2), 1) 51 | 52 | # conv_results = [ 53 | # F.max_pool1d(F.relu(self.get_conv(i)(x)), self.max_seq_len - self.kernel_sizes[i] + 1) 54 | # .view(-1, self.kernel_nums[i]) 55 | # for i in range(len(self.kernel_sizes))] 56 | conv_results = [ 57 | F.max_pool1d(F.relu(self.convs[i](x)), self.max_seq_len - self.kernel_sizes[i] + 1) 58 | .view(-1, self.kernel_nums[i]) 59 | for i in range(len(self.convs))] 60 | 61 | x = torch.cat(conv_results, 1) 62 | x = F.dropout(x, p=self.keep_dropout, training=self.training) 63 | x = self.fc(x) 64 | return x 65 | 66 | 67 | 68 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Yoon/model.py 69 | class KIMCNN2D(nn.Module): 70 | 71 | def __init__(self, opt): 72 | super(KIMCNN2D,self).__init__() 73 | self.opt = opt 74 | self.embedding_type = opt.embedding_type 75 | self.batch_size = opt.batch_size 76 | self.max_seq_len = opt.max_seq_len 77 | self.embedding_dim = opt.embedding_dim 78 | self.vocab_size = opt.vocab_size 79 | self.label_size = opt.label_size 80 | self.kernel_sizes = opt.kernel_sizes 81 | self.kernel_nums = opt.kernel_nums 82 | self.keep_dropout = opt.keep_dropout 83 | 84 | self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1) 85 | if self.embedding_type == "static" or self.embedding_type == "non-static" or self.embedding_type == "multichannel": 86 | self.embedding.weight=nn.Parameter(opt.embeddings) 87 | if self.embedding_type == "static": 88 | self.embedding.weight.requires_grad = False 89 | elif self.embedding_type == "multichannel": 90 | self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1) 91 
| self.embedding2.weight=nn.Parameter(opt.embeddings) 92 | self.embedding2.weight.requires_grad = False 93 | self.in_channel = 2 94 | else: 95 | pass 96 | #self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] 97 | self.convs1 = nn.ModuleList([nn.Conv2d(1, num, (size, opt.embedding_dim)) for size,num in zip(opt.kernel_sizes,opt.kernel_nums)]) 98 | ''' 99 | self.conv13 = nn.Conv2d(Ci, Co, (3, D)) 100 | self.conv14 = nn.Conv2d(Ci, Co, (4, D)) 101 | self.conv15 = nn.Conv2d(Ci, Co, (5, D)) 102 | ''' 103 | self.dropout = nn.Dropout(opt.keep_dropout) 104 | self.fc = nn.Linear(sum(opt.kernel_nums), opt.label_size) 105 | 106 | def conv_and_pool(self, x, conv): 107 | x = F.relu(conv(x)).squeeze(3) #(N,Co,W) 108 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 109 | return x 110 | 111 | 112 | def forward(self, x): 113 | x = self.embedding(x) # (N,W,D) 114 | 115 | 116 | 117 | x = x.unsqueeze(1) # (N,Ci,W,D) 118 | 119 | x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks) 120 | 121 | 122 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks) 123 | 124 | x = torch.cat(x, 1) 125 | 126 | ''' 127 | x1 = self.conv_and_pool(x,self.conv13) #(N,Co) 128 | x2 = self.conv_and_pool(x,self.conv14) #(N,Co) 129 | x3 = self.conv_and_pool(x,self.conv15) #(N,Co) 130 | x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) 131 | ''' 132 | x = self.dropout(x) # (N,len(Ks)*Co) 133 | logit = self.fc(x) # (N,C) 134 | return logit 135 | 136 | -------------------------------------------------------------------------------- /models/CNNMultiLayer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Zhang/model.py 9 | class MultiLayerCNN(nn.Module): 10 | def __init__(self, opt): 11 | super(MultiLayerCNN, self).__init__() 12 | self.embed = nn.Embedding(opt.vocab_size + 1, opt.embedding_dim) 13 | 14 | if opt.__dict__.get("embeddings",None) is not None: 15 | self.embed.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 16 | 17 | self.conv1 = nn.Sequential( 18 | nn.Conv1d(opt.max_seq_len, 256, kernel_size=7, stride=1), 19 | nn.ReLU(), 20 | nn.MaxPool1d(kernel_size=3, stride=3) 21 | ) 22 | 23 | self.conv2 = nn.Sequential( 24 | nn.Conv1d(256, 256, kernel_size=7, stride=1), 25 | nn.ReLU(), 26 | nn.MaxPool1d(kernel_size=3, stride=3) 27 | ) 28 | 29 | self.conv3 = nn.Sequential( 30 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 31 | nn.ReLU() 32 | ) 33 | 34 | self.conv4 = nn.Sequential( 35 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 36 | nn.ReLU() 37 | ) 38 | 39 | self.conv5 = nn.Sequential( 40 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 41 | nn.ReLU() 42 | ) 43 | 44 | self.conv6 = nn.Sequential( 45 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 46 | nn.ReLU(), 47 | nn.MaxPool1d(kernel_size=3, stride=3) 48 | ) 49 | 50 | self.fc = nn.Linear(256*7, opt.label_size) 51 | 52 | def forward(self, x): 53 | # Embedding 54 | x = self.embed(x) # dim: (batch_size, max_seq_len, embedding_size) 55 | x = self.conv1(x) 56 | x = self.conv2(x) 57 | x = self.conv3(x) 58 | x = self.conv4(x) 59 | x = self.conv5(x) 60 | x = self.conv6(x) 61 | 62 | # collapse 63 | x = x.view(x.size(0), -1) 64 | x = self.fc(x) 65 | 66 | return F.log_softmax(x) 67 | -------------------------------------------------------------------------------- /models/CNNText.py: 
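A minimal shape-check sketch of the Conv1d-over-embeddings pattern shared by BasicCNN1D above and the CNNText model below. The batch size, sequence length and dimensions here are illustrative assumptions (chosen to match inline comments such as 64x200x300), not values read from the repository config.

import torch
from torch import nn

batch_size, max_seq_len, embedding_dim = 64, 200, 300   # illustrative sizes
content_dim, kernel_size = 256, 3

# stand-in for the output of nn.Embedding: (batch, seq_len, embedding_dim)
embedded = torch.randn(batch_size, max_seq_len, embedding_dim)

content_conv = nn.Sequential(
    nn.Conv1d(in_channels=embedding_dim, out_channels=content_dim, kernel_size=kernel_size),
    nn.ReLU(),
    # pool over the whole convolved sequence: max_seq_len - kernel_size + 1 positions
    nn.MaxPool1d(kernel_size=max_seq_len - kernel_size + 1),
)

out = content_conv(embedded.permute(0, 2, 1))   # Conv1d expects (batch, channels, seq_len)
print(out.shape)                                # torch.Size([64, 256, 1]); flattened, then nn.Linear(content_dim, label_size)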
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | import numpy as np 4 | from torch import nn 5 | 6 | class CNNText(nn.Module): 7 | def __init__(self, opt ): 8 | super(CNNText, self).__init__() 9 | self.model_name = 'CNNText' 10 | self.opt=opt 11 | self.content_dim=opt.__dict__.get("content_dim",256) 12 | self.kernel_size=opt.__dict__.get("kernel_size",3) 13 | 14 | 15 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 16 | if opt.__dict__.get("embeddings",None) is not None: 17 | self.encoder.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | 19 | 20 | self.content_conv = nn.Sequential( 21 | nn.Conv1d(in_channels = opt.embedding_dim, 22 | out_channels = self.content_dim, 23 | kernel_size = self.kernel_size), 24 | nn.ReLU(), 25 | nn.MaxPool1d(kernel_size = (opt.max_seq_len - self.kernel_size + 1)) 26 | # nn.AdaptiveMaxPool1d() 27 | ) 28 | 29 | self.fc = nn.Linear(self.content_dim, opt.label_size) 30 | 31 | 32 | def forward(self, content): 33 | 34 | content = self.encoder(content) 35 | content_out = self.content_conv(content.permute(0,2,1)) 36 | reshaped = content_out.view(content_out.size(0), -1) 37 | logits = self.fc(reshaped) 38 | return logits 39 | 40 | import argparse 41 | 42 | def parse_opt(): 43 | parser = argparse.ArgumentParser() 44 | # Data input settings 45 | parser.add_argument('--hidden_dim', type=int, default=128, 46 | help='hidden_dim') 47 | 48 | 49 | parser.add_argument('--batch_size', type=int, default=64, 50 | help='batch_size') 51 | parser.add_argument('--embedding_dim', type=int, default=300, 52 | help='embedding_dim') 53 | parser.add_argument('--learning_rate', type=float, default=4e-4, 54 | help='learning_rate') 55 | parser.add_argument('--grad_clip', type=float, default=1e-1, 56 | help='grad_clip') 57 | parser.add_argument('--model', type=str, default="lstm", 58 | help='model name') 59 | 60 | 61 | # 62 | args = parser.parse_args() 63 | args.embedding_dim=300 64 | args.vocab_size=10000 65 | args.kernel_size=3 66 | args.num_classes=3 67 | args.content_dim=256 68 | args.max_seq_len=50 69 | 70 | # 71 | # # Check if args are valid 72 | # assert args.rnn_size > 0, "rnn_size should be greater than 0" 73 | 74 | 75 | return args 76 | 77 | if __name__ == '__main__': 78 | 79 | 80 | opt = parse_opt() 81 | m = CNNText(opt) 82 | content = t.autograd.Variable(t.arange(0,3200).view(-1,50)).long() 83 | o = m(content) 84 | print(o.size()) 85 | 86 | -------------------------------------------------------------------------------- /models/CNN_Inception.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import torch as t 5 | import torch 6 | import numpy as np 7 | from torch import nn 8 | from collections import OrderedDict 9 | 10 | class Inception(nn.Module): 11 | def __init__(self,cin,co,relu=True,norm=True): 12 | super(Inception, self).__init__() 13 | assert(co%4==0) 14 | cos=[co/4]*4 15 | self.activa=nn.Sequential() 16 | if norm:self.activa.add_module('norm',nn.BatchNorm1d(co)) 17 | if relu:self.activa.add_module('relu',nn.ReLU(True)) 18 | self.branch1 =nn.Sequential(OrderedDict([ 19 | ('conv1', nn.Conv1d(cin,cos[0], 1,stride=1)), 20 | ])) 21 | self.branch2 =nn.Sequential(OrderedDict([ 22 | ('conv1', nn.Conv1d(cin,cos[1], 1)), 23 | ('norm1', nn.BatchNorm1d(cos[1])), 24 | ('relu1', nn.ReLU(inplace=True)), 25 | ('conv3', nn.Conv1d(cos[1],cos[1], 3,stride=1,padding=1)), 26 | ])) 27 | 
self.branch3 =nn.Sequential(OrderedDict([ 28 | ('conv1', nn.Conv1d(cin,cos[2], 3,padding=1)), 29 | ('norm1', nn.BatchNorm1d(cos[2])), 30 | ('relu1', nn.ReLU(inplace=True)), 31 | ('conv3', nn.Conv1d(cos[2],cos[2], 5,stride=1,padding=2)), 32 | ])) 33 | self.branch4 =nn.Sequential(OrderedDict([ 34 | #('pool',nn.MaxPool1d(2)), 35 | ('conv3', nn.Conv1d(cin,cos[3], 3,stride=1,padding=1)), 36 | ])) 37 | def forward(self,x): 38 | branch1=self.branch1(x) 39 | branch2=self.branch2(x) 40 | branch3=self.branch3(x) 41 | branch4=self.branch4(x) 42 | result=self.activa(torch.cat((branch1,branch2,branch3,branch4),1)) 43 | return result 44 | class CNNText_inception(nn.Module): 45 | def __init__(self, opt ): 46 | super(CNNText_inception, self).__init__() 47 | incept_dim=getattr(opt,"inception_dim",512) 48 | self.model_name = 'CNNText_inception' 49 | self.opt=opt 50 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 51 | 52 | self.content_conv=nn.Sequential( 53 | Inception(opt.embedding_dim,incept_dim),#(batch_size,64,opt.content_seq_len)->(batch_size,64,(opt.content_seq_len)/2) 54 | #Inception(incept_dim,incept_dim),#(batch_size,64,opt.content_seq_len/2)->(batch_size,32,(opt.content_seq_len)/4) 55 | Inception(incept_dim,incept_dim), 56 | nn.MaxPool1d(opt.max_seq_len) 57 | ) 58 | self.fc = nn.Sequential( 59 | nn.Linear(incept_dim,getattr(opt,"linear_hidden_size",2000)), 60 | nn.BatchNorm1d(getattr(opt,"linear_hidden_size",2000)), 61 | nn.ReLU(inplace=True), 62 | nn.Linear(getattr(opt,"linear_hidden_size",2000) ,opt.label_size) 63 | ) 64 | if opt.__dict__.get("embeddings",None) is not None: 65 | print('load embedding') 66 | self.encoder.weight.data.copy_(t.from_numpy(opt.embeddings)) 67 | 68 | def forward(self,content): 69 | 70 | content=self.encoder(content) 71 | if self.opt.static: 72 | content=content.detach(0) 73 | 74 | content_out=self.content_conv(content.permute(0,2,1)) 75 | out=content_out.view(content_out.size(0), -1) 76 | out=self.fc(out) 77 | return out 78 | 79 | if __name__ == '__main__': 80 | import sys 81 | sys.path.append(r"..") 82 | import opts 83 | opt=opts.parse_opt() 84 | opt.vocab_size=2501 85 | opt.label_size=3 86 | m = CNNText_inception(opt) 87 | 88 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 89 | o = m(content) 90 | print(o.size()) 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /models/Capsule.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # paper 3 | 4 | 5 | # 6 | 7 | 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | from torch import nn 12 | import numpy as np 13 | 14 | BATCH_SIZE = 100 15 | 16 | NUM_EPOCHS = 500 17 | NUM_ROUTING_ITERATIONS = 3 18 | 19 | cuda = torch.cuda.is_available() 20 | 21 | def softmax(input, dim=1): 22 | transposed_input = input.transpose(dim, len(input.size()) - 1) 23 | softmaxed_output = F.softmax(transposed_input.contiguous().view(-1, transposed_input.size(-1))) 24 | return softmaxed_output.view(*transposed_input.size()).transpose(dim, len(input.size()) - 1) 25 | 26 | 27 | 28 | 29 | 30 | class CapsuleLayer(nn.Module): 31 | def __init__(self, num_capsules, num_route_nodes, in_channels, out_channels, kernel_size=None, stride=None, 32 | num_iterations=NUM_ROUTING_ITERATIONS,padding=0): 33 | super(CapsuleLayer, self).__init__() 34 | 35 | self.num_route_nodes = num_route_nodes 36 | self.num_iterations = 
num_iterations 37 | 38 | self.num_capsules = num_capsules 39 | 40 | 41 | 42 | if num_route_nodes != -1: 43 | self.route_weights = nn.Parameter(torch.randn(num_capsules, num_route_nodes, in_channels, out_channels)) 44 | else: 45 | prime=[3,5,7,9,11,13,17,19,23] 46 | sizes=prime[:self.num_capsules] 47 | self.capsules = nn.ModuleList( 48 | [nn.Conv1d(in_channels, out_channels, kernel_size=i, stride=2, padding=int((i-1)/2)) for i in sizes]) 49 | 50 | def squash(self, tensor, dim=-1): 51 | squared_norm = (tensor ** 2).sum(dim=dim, keepdim=True) 52 | scale = squared_norm / (1 + squared_norm) 53 | return scale * tensor / torch.sqrt(squared_norm) 54 | 55 | def forward(self, x): 56 | 57 | if self.num_route_nodes != -1: 58 | priors =torch.matmul( x[None, :, :, None, :],self.route_weights[:, None, :, :, :]) 59 | 60 | if torch.cuda.is_available(): 61 | logits = torch.autograd.Variable(torch.zeros(priors.size())).cuda() 62 | else: 63 | logits = torch.autograd.Variable(torch.zeros(priors.size())) 64 | for i in range(self.num_iterations): 65 | probs = softmax(logits, dim=2) 66 | outputs = self.squash((torch.mul(probs , priors)).sum(dim=2, keepdim=True)) 67 | 68 | if i != self.num_iterations - 1: 69 | delta_logits = (torch.mul(priors , outputs)).sum(dim=-1, keepdim=True) 70 | logits = logits + delta_logits 71 | else: 72 | outputs = [capsule(x).view(x.size(0), -1, 1) for capsule in self.capsules] 73 | outputs = torch.cat(outputs, dim=-1) 74 | outputs = self.squash(outputs) 75 | 76 | return outputs 77 | 78 | 79 | class CapsuleNet(nn.Module): 80 | def __init__(self,opt): 81 | super(CapsuleNet, self).__init__() 82 | self.opt=opt #300*300 83 | self.label_size=opt.label_size 84 | self.embed = nn.Embedding(opt.vocab_size+1, opt.embedding_dim) 85 | self.opt.cnn_dim = 1 86 | self.kernel_size = 3 87 | self.kernel_size_primary=3 88 | if opt.__dict__.get("embeddings",None) is not None: 89 | self.embed.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 90 | 91 | self.primary_capsules = CapsuleLayer(num_capsules=8, num_route_nodes=-1, in_channels=256, out_channels=32) 92 | self.digit_capsules = CapsuleLayer(num_capsules=opt.label_size, num_route_nodes=int(32 * opt.max_seq_len/2), in_channels=8, 93 | out_channels=16) 94 | if self.opt.cnn_dim == 2: 95 | self.conv_2d = nn.Conv2d(in_channels=1, out_channels=256, kernel_size=(self.kernel_size,opt.embedding_dim), stride=(1,opt.embedding_dim),padding=(int((self.kernel_size-1)/2),0)) 96 | else: 97 | self.conv_1d = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=opt.embedding_dim * self.kernel_size, stride=opt.embedding_dim, padding=opt.embedding_dim* int((self.kernel_size-1)/2) ) 98 | 99 | self.decoder = nn.Sequential( 100 | nn.Linear(16 * self.label_size, 512), 101 | nn.ReLU(inplace=True), 102 | nn.Linear(512, 1024), 103 | nn.ReLU(inplace=True), 104 | nn.Linear(1024, 784), 105 | nn.Sigmoid() 106 | ) 107 | 108 | def forward(self, x, y=None,reconstruct=False): 109 | #x = next(iter(train_iter)).text[0] 110 | 111 | x= self.embed(x) 112 | if self.opt.cnn_dim == 1: 113 | x=x.view(x.size(0),1,x.size(-1)*x.size(-2)) 114 | x_conv = F.relu(self.conv_1d(x), inplace=True) 115 | else: 116 | 117 | x=x.unsqueeze(1) 118 | x_conv = F.relu(self.conv_2d(x), inplace=True).squeeze(3) 119 | 120 | x = self.primary_capsules(x_conv) 121 | x = self.digit_capsules(x).squeeze().transpose(0, 1) 122 | 123 | classes = (x ** 2).sum(dim=-1) ** 0.5 124 | classes = F.softmax(classes) 125 | if not reconstruct: 126 | return classes 127 | if y is None: 128 | # In all batches, get the 
most active capsule. 129 | _, max_length_indices = classes.max(dim=1) 130 | if torch.cuda.is_available(): 131 | y = Variable(torch.sparse.torch.eye(self.label_size)).cuda().index_select(dim=0, index=max_length_indices.data) 132 | else: 133 | y = Variable(torch.sparse.torch.eye(self.label_size)).index_select(dim=0, index=max_length_indices.data) 134 | reconstructions = self.decoder((x * y[:, :, None]).view(x.size(0), -1)) 135 | 136 | return classes, reconstructions 137 | -------------------------------------------------------------------------------- /models/ConvS2S.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /models/DiSAN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # https://github.com/taoshen58/DiSAN/blob/master/SST_disan/src/model/model_disan.py -------------------------------------------------------------------------------- /models/FastText.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch as t 4 | 5 | import numpy as np 6 | from torch import nn 7 | from collections import OrderedDict 8 | class FastText(nn.Module): 9 | def __init__(self, opt ): 10 | super(FastText, self).__init__() 11 | self.model_name = 'FastText' 12 | 13 | linear_hidden_size=getattr(opt,"linear_hidden_size",2000) 14 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 15 | if opt.__dict__.get("embeddings",None) is not None: 16 | print('load embedding') 17 | self.encoder.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | 19 | 20 | self.content_fc = nn.Sequential( 21 | nn.Linear(opt.embedding_dim,linear_hidden_size), 22 | nn.BatchNorm1d(linear_hidden_size), 23 | nn.ReLU(inplace=True), 24 | # nn.Linear(opt.linear_hidden_size,opt.linear_hidden_size), 25 | # nn.BatchNorm1d(opt.linear_hidden_size), 26 | # nn.ReLU(inplace=True), 27 | nn.Linear(linear_hidden_size,opt.label_size) 28 | ) 29 | 30 | 31 | def forward(self,content): 32 | 33 | content_=t.mean(self.encoder(content),dim=1) 34 | 35 | 36 | out=self.content_fc(content_.view(content_.size(0),-1)) 37 | 38 | return out 39 | if __name__ == '__main__': 40 | import sys 41 | sys.path.append(r"..") 42 | import opts 43 | opt=opts.parse_opt() 44 | opt.vocab_size=2501 45 | opt.label_size=3 46 | m = FastText(opt) 47 | 48 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 49 | o = m(content) 50 | print(o.size()) -------------------------------------------------------------------------------- /models/LSTM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch 6 | from torch.autograd import Variable 7 | #from memory_profiler import profile 8 | 9 | class LSTMClassifier(nn.Module): 10 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 11 | def __init__(self,opt): 12 | self.opt=opt 13 | super(LSTMClassifier, self).__init__() 14 | self.hidden_dim = opt.hidden_dim 15 | self.batch_size = opt.batch_size 16 | self.use_gpu = torch.cuda.is_available() 17 | 18 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 19 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 20 | # 
self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 21 | self.lstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim) 22 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 23 | self.hidden = self.init_hidden() 24 | self.mean = opt.__dict__.get("lstm_mean",True) 25 | 26 | def init_hidden(self,batch_size=None): 27 | if batch_size is None: 28 | batch_size= self.batch_size 29 | 30 | if self.use_gpu: 31 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 32 | c0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 33 | else: 34 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim)) 35 | c0 = Variable(torch.zeros(1,batch_size, self.hidden_dim)) 36 | return (h0, c0) 37 | # @profile 38 | def forward(self, sentence): 39 | embeds = self.word_embeddings(sentence) #64x200x300 40 | 41 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 42 | x=embeds.permute(1,0,2) #200x64x300 43 | self.hidden= self.init_hidden(sentence.size()[0]) #1x64x128 44 | lstm_out, self.hidden = self.lstm(x, self.hidden) #200x64x128 45 | if self.mean=="mean": 46 | out = lstm_out.permute(1,0,2) 47 | final = torch.mean(out,1) 48 | else: 49 | final=lstm_out[-1] 50 | y = self.hidden2label(final) #64x3 51 | return y 52 | # def forward1(self, sentence): 53 | # 54 | # return torch.zeros(sentence.size()[0], self.opt.label_size) 55 | ## def __call__(self, **args): 56 | ## self.forward(args) 57 | # def test(): 58 | # 59 | # import numpy as np 60 | # 61 | # word_embeddings = nn.Embedding(10000, 300) 62 | # lstm = nn.LSTM(300, 100) 63 | # h0 = Variable(torch.zeros(1, 128, 100)) 64 | # c0 = Variable(torch.zeros(1, 128, 100)) 65 | # hidden=(h0, c0) 66 | # sentence = Variable(torch.LongTensor(np.zeros((128,30),dtype=np.int64))) 67 | # embeds = word_embeddings(sentence) 68 | # torch.tile(sentence) 69 | # sentence.size()[0] 70 | # 71 | # 72 | # 73 | ## x= Variable(torch.zeros(30, 128, 300)) 74 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 75 | # embeds=embeds.permute(1,0,2) 76 | # lstm_out, hidden = lstm(embeds, hidden) 77 | ## -------------------------------------------------------------------------------- /models/LSTMBI.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch 6 | from torch.autograd import Variable 7 | #from memory_profiler import profile 8 | 9 | class LSTMBI(nn.Module): 10 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 11 | def __init__(self,opt): 12 | self.opt=opt 13 | super(LSTMBI, self).__init__() 14 | self.hidden_dim = opt.hidden_dim 15 | self.batch_size = opt.batch_size 16 | self.use_gpu = torch.cuda.is_available() 17 | 18 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 19 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 20 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 21 | 22 | self.lstm_layers = opt.lstm_layers 23 | #self.bidirectional = True 24 | self.dropout = opt.keep_dropout 25 | self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, num_layers=self.lstm_layers, dropout=self.dropout, bidirectional=True) 26 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 27 | self.hidden = self.init_hidden() 28 | self.mean = opt.__dict__.get("lstm_mean",True) 29 | 30 | def init_hidden(self,batch_size=None): 31 | if batch_size is None: 32 | batch_size= 
self.batch_size 33 | 34 | if self.use_gpu: 35 | h0 = Variable(torch.zeros(2*self.lstm_layers, batch_size, self.hidden_dim // 2).cuda()) 36 | c0 = Variable(torch.zeros(2*self.lstm_layers, batch_size, self.hidden_dim // 2).cuda()) 37 | else: 38 | h0 = Variable(torch.zeros(2*self.lstm_layers, batch_size, self.hidden_dim // 2)) 39 | c0 = Variable(torch.zeros(2*self.lstm_layers, batch_size, self.hidden_dim // 2)) 40 | return (h0, c0) 41 | # @profile 42 | def forward(self, sentence): 43 | embeds = self.word_embeddings(sentence) 44 | 45 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 46 | x=embeds.permute(1,0,2) # we do this because the default parameter of lstm is False 47 | self.hidden= self.init_hidden(sentence.size()[0]) #2x64x64 48 | lstm_out, self.hidden = self.bilstm(x, self.hidden) #lstm_out:200x64x128 49 | if self.mean=="mean": 50 | out = lstm_out.permute(1,0,2) 51 | final = torch.mean(out,1) 52 | else: 53 | final=lstm_out[-1] 54 | y = self.hidden2label(final) #64x3 #lstm_out[-1] 55 | return y 56 | -------------------------------------------------------------------------------- /models/LSTMStack.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DrJZhou/TextClassificationBenchmark/b5ac116c74493c28f2af2541f21385df7c73ef93/models/LSTMStack.py -------------------------------------------------------------------------------- /models/LSTMTree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # https://github.com/dasguptar/treelstm.pytorch -------------------------------------------------------------------------------- /models/LSTMwithAttention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import numpy as np 5 | import torch.nn as nn 6 | from sklearn.utils import shuffle 7 | from torch.autograd import Variable 8 | 9 | class LSTMAttention(torch.nn.Module): 10 | def __init__(self,opt): 11 | self.opt=opt 12 | super(LSTMAttention, self).__init__() 13 | self.hidden_dim = opt.hidden_dim 14 | self.batch_size = opt.batch_size 15 | self.use_gpu = torch.cuda.is_available() 16 | 17 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 18 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 19 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 20 | 21 | self.num_layers = opt.lstm_layers 22 | #self.bidirectional = True 23 | self.dropout = opt.keep_dropout 24 | self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, batch_first=True,num_layers=self.num_layers, dropout=self.dropout, bidirectional=True) 25 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 26 | self.hidden = self.init_hidden() 27 | self.mean = opt.__dict__.get("lstm_mean",True) 28 | self.attn_fc = torch.nn.Linear(opt.embedding_dim, 1) 29 | def init_hidden(self,batch_size=None): 30 | if batch_size is None: 31 | batch_size= self.batch_size 32 | 33 | if self.use_gpu: 34 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 35 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 36 | else: 37 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 38 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 39 | return (h0, c0) 40 | 41 | 42 | def attention(self, rnn_out, state): 43 
| merged_state = torch.cat([s for s in state],1) 44 | merged_state = merged_state.squeeze(0).unsqueeze(2) 45 | # (batch, seq_len, cell_size) * (batch, cell_size, 1) = (batch, seq_len, 1) 46 | weights = torch.bmm(rnn_out, merged_state) 47 | weights = torch.nn.functional.softmax(weights.squeeze(2)).unsqueeze(2) 48 | # (batch, cell_size, seq_len) * (batch, seq_len, 1) = (batch, cell_size, 1) 49 | return torch.bmm(torch.transpose(rnn_out, 1, 2), weights).squeeze(2) 50 | # end method attention 51 | 52 | 53 | def forward(self, X): 54 | embedded = self.word_embeddings(X) 55 | hidden= self.init_hidden(X.size()[0]) # 56 | rnn_out, hidden = self.bilstm(embedded, hidden) 57 | h_n, c_n = hidden 58 | attn_out = self.attention(rnn_out, h_n) 59 | logits = self.hidden2label(attn_out) 60 | return logits -------------------------------------------------------------------------------- /models/MLP.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.init as init 7 | from torch.autograd import Variable 8 | 9 | # https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py 10 | 11 | def position_encoding(sentence_size, embedding_dim): 12 | encoding = np.ones((embedding_dim, sentence_size), dtype=np.float32) 13 | ls = sentence_size + 1 14 | le = embedding_dim + 1 15 | for i in range(1, le): 16 | for j in range(1, ls): 17 | encoding[i-1, j-1] = (i - (embedding_dim+1)/2) * (j - (sentence_size+1)/2) 18 | encoding = 1 + 4 * encoding / embedding_dim / sentence_size 19 | # Make position encoding of time words identity to avoid modifying them 20 | encoding[:, -1] = 1.0 21 | return np.transpose(encoding) 22 | 23 | class AttrProxy(object): 24 | """ 25 | Translates index lookups into attribute lookups. 
26 | To implement some trick which able to use list of nn.Module in a nn.Module 27 | see https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/2 28 | """ 29 | def __init__(self, module, prefix): 30 | self.module = module 31 | self.prefix = prefix 32 | 33 | def __getitem__(self, i): 34 | return getattr(self.module, self.prefix + str(i)) 35 | 36 | 37 | class MemN2N(nn.Module): 38 | def __init__(self, opt): 39 | super(MemN2N, self).__init__() 40 | 41 | use_cuda = opt["use_cuda"] 42 | num_vocab = opt["num_vocab"] 43 | embedding_dim = opt["embedding_dim"] 44 | sentence_size = opt["sentence_size"] 45 | self.max_hops = opt["max_hops"] 46 | 47 | for hop in range(self.max_hops+1): 48 | C = nn.Embedding(num_vocab, embedding_dim, padding_idx=0) 49 | C.weight.data.normal_(0, 0.1) 50 | self.add_module("C_{}".format(hop), C) 51 | self.C = AttrProxy(self, "C_") 52 | 53 | self.softmax = nn.Softmax() 54 | self.encoding = Variable(torch.FloatTensor( 55 | position_encoding(sentence_size, embedding_dim)), requires_grad=False) 56 | 57 | if use_cuda: 58 | self.encoding = self.encoding.cuda() 59 | 60 | def forward(self, story, query): 61 | story_size = story.size() 62 | 63 | u = list() 64 | query_embed = self.C[0](query) 65 | # weired way to perform reduce_dot 66 | encoding = self.encoding.unsqueeze(0).expand_as(query_embed) 67 | u.append(torch.sum(query_embed*encoding, 1)) 68 | 69 | for hop in range(self.max_hops): 70 | embed_A = self.C[hop](story.view(story.size(0), -1)) 71 | embed_A = embed_A.view(story_size+(embed_A.size(-1),)) 72 | 73 | encoding = self.encoding.unsqueeze(0).unsqueeze(1).expand_as(embed_A) 74 | m_A = torch.sum(embed_A*encoding, 2) 75 | 76 | u_temp = u[-1].unsqueeze(1).expand_as(m_A) 77 | prob = self.softmax(torch.sum(m_A*u_temp, 2)) 78 | 79 | embed_C = self.C[hop+1](story.view(story.size(0), -1)) 80 | embed_C = embed_C.view(story_size+(embed_C.size(-1),)) 81 | m_C = torch.sum(embed_C*encoding, 2) 82 | 83 | prob = prob.unsqueeze(2).expand_as(m_C) 84 | o_k = torch.sum(m_C*prob, 1) 85 | 86 | u_k = u[-1] + o_k 87 | u.append(u_k) 88 | 89 | a_hat = u[-1]@self.C[self.max_hops].weight.transpose(0, 1) 90 | return a_hat, self.softmax(a_hat) -------------------------------------------------------------------------------- /models/MemoryNetwork.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.init as init 7 | from torch.autograd import Variable 8 | 9 | def position_encoding(sentence_size, embedding_dim): 10 | encoding = np.ones((embedding_dim, sentence_size), dtype=np.float32) 11 | ls = sentence_size + 1 12 | le = embedding_dim + 1 13 | for i in range(1, le): 14 | for j in range(1, ls): 15 | encoding[i-1, j-1] = (i - (embedding_dim+1)/2) * (j - (sentence_size+1)/2) 16 | encoding = 1 + 4 * encoding / embedding_dim / sentence_size 17 | # Make position encoding of time words identity to avoid modifying them 18 | encoding[:, -1] = 1.0 19 | return np.transpose(encoding) 20 | 21 | class AttrProxy(object): 22 | """ 23 | Translates index lookups into attribute lookups. 
24 | To implement some trick which able to use list of nn.Module in a nn.Module 25 | see https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/2 26 | """ 27 | def __init__(self, module, prefix): 28 | self.module = module 29 | self.prefix = prefix 30 | 31 | def __getitem__(self, i): 32 | return getattr(self.module, self.prefix + str(i)) 33 | 34 | 35 | class MemN2N(nn.Module): 36 | def __init__(self, settings): 37 | super(MemN2N, self).__init__() 38 | 39 | use_cuda = settings["use_cuda"] 40 | num_vocab = settings["num_vocab"] 41 | embedding_dim = settings["embedding_dim"] 42 | sentence_size = settings["sentence_size"] 43 | self.max_hops = settings["max_hops"] 44 | 45 | for hop in range(self.max_hops+1): 46 | C = nn.Embedding(num_vocab, embedding_dim, padding_idx=0) 47 | C.weight.data.normal_(0, 0.1) 48 | self.add_module("C_{}".format(hop), C) 49 | self.C = AttrProxy(self, "C_") 50 | 51 | self.softmax = nn.Softmax() 52 | self.encoding = Variable(torch.FloatTensor( 53 | position_encoding(sentence_size, embedding_dim)), requires_grad=False) 54 | 55 | if use_cuda: 56 | self.encoding = self.encoding.cuda() 57 | 58 | def forward(self, query): 59 | 60 | story=query # for text classfication 61 | 62 | story_size = story.size() 63 | 64 | u = list() 65 | query_embed = self.C[0](query) 66 | # weired way to perform reduce_dot 67 | encoding = self.encoding.unsqueeze(0).expand_as(query_embed) 68 | u.append(torch.sum(query_embed*encoding, 1)) 69 | 70 | for hop in range(self.max_hops): 71 | embed_A = self.C[hop](story.view(story.size(0), -1)) 72 | embed_A = embed_A.view(story_size+(embed_A.size(-1),)) 73 | 74 | encoding = self.encoding.unsqueeze(0).unsqueeze(1).expand_as(embed_A) 75 | m_A = torch.sum(embed_A*encoding, 2) 76 | 77 | u_temp = u[-1].unsqueeze(1).expand_as(m_A) 78 | prob = self.softmax(torch.sum(m_A*u_temp, 2)) 79 | 80 | embed_C = self.C[hop+1](story.view(story.size(0), -1)) 81 | embed_C = embed_C.view(story_size+(embed_C.size(-1),)) 82 | m_C = torch.sum(embed_C*encoding, 2) 83 | 84 | prob = prob.unsqueeze(2).expand_as(m_C) 85 | o_k = torch.sum(m_C*prob, 1) 86 | 87 | u_k = u[-1] + o_k 88 | u.append(u_k) 89 | 90 | a_hat = u[-1]@self.C[self.max_hops].weight.transpose(0, 1) 91 | return a_hat, self.softmax(a_hat) -------------------------------------------------------------------------------- /models/QuantumCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /models/RCNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | #from memory_profiler import profile 6 | 7 | """ 8 | Lai S, Xu L, Liu K, et al. Recurrent Convolutional Neural Networks for Text Classification[C]//AAAI. 2015, 333: 2267-2273. 
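    In the forward pass below, each token's embedding is concatenated with its left and
    right BiLSTM context states, passed through tanh, max-pooled along the sequence
    (kernel size 3, stride 2), and the last pooled position is fed to the linear classifier.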
9 | """ 10 | 11 | class RCNN(nn.Module): 12 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 13 | def __init__(self,opt): 14 | self.opt=opt 15 | super(RCNN, self).__init__() 16 | self.hidden_dim = opt.hidden_dim 17 | self.batch_size = opt.batch_size 18 | self.use_gpu = torch.cuda.is_available() 19 | 20 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 21 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 22 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 23 | 24 | self.num_layers = 1 25 | #self.bidirectional = True 26 | self.dropout = opt.keep_dropout 27 | self.bilstm = nn.LSTM(input_size=opt.embedding_dim, hidden_size=opt.hidden_dim // 2, num_layers=self.num_layers, dropout=self.dropout, bidirectional=True) 28 | 29 | ###self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 30 | self.hidden = self.init_hidden() 31 | 32 | self.max_pooling = nn.MaxPool1d(kernel_size=3, stride=2) 33 | 34 | self.content_dim = 256 35 | #self.conv = nn.Conv1d(opt.hidden_dim, self.content_dim, opt.hidden_dim * 2, stride=opt.embedding_dim) 36 | self.hidden2label = nn.Linear( (2*opt.hidden_dim // 2+opt.embedding_dim), opt.label_size) 37 | 38 | def init_hidden(self,batch_size=None): 39 | if batch_size is None: 40 | batch_size= self.batch_size 41 | 42 | if self.use_gpu: 43 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 44 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 45 | else: 46 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 47 | c0 = Variable(torch.zeros(2*self.num_layers,batch_size, self.hidden_dim // 2)) 48 | return (h0, c0) 49 | # @profile 50 | def forward(self, sentence): 51 | embeds = self.word_embeddings(sentence) #64x200x300 52 | 53 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 54 | x=embeds.permute(1,0,2) #200x64x300 55 | self.hidden= self.init_hidden(sentence.size()[0]) #2x64x128 56 | lstm_out, self.hidden = self.bilstm(x, self.hidden) ###input (seq_len, batch, input_size) #Outupts:output, (h_n, c_n) output:(seq_len, batch, hidden_size * num_directions) 57 | #lstm_out 200x64x128 58 | 59 | c_lr = lstm_out.permute(1,0,2) #64x200x128 60 | xi = torch.cat((c_lr[:,:,0:int(c_lr.size()[2]/2)],embeds,c_lr[:,:,int(c_lr.size()[2]/2):]),2) #64x200x428 61 | yi = torch.tanh(xi.permute(0,2,1)) #64x428x200 62 | y = self.max_pooling(yi) #64x428x99 63 | y = y.permute(2,0,1) 64 | 65 | ##y = self.conv(lstm_out.permute(1,2,0)) ###64x256x1 66 | 67 | y = self.hidden2label(y[-1]) 68 | #y = self.hidden2label(y[:,-1,:].view(y[:,-1,:].size()[0],-1)) 69 | return y -------------------------------------------------------------------------------- /models/RNN_CNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | #from memory_profiler import profile 6 | 7 | class RNN_CNN(nn.Module): 8 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 9 | def __init__(self,opt): 10 | self.opt=opt 11 | super(RNN_CNN, self).__init__() 12 | self.hidden_dim = opt.hidden_dim 13 | self.batch_size = opt.batch_size 14 | self.use_gpu = torch.cuda.is_available() 15 | 16 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 17 | self.word_embeddings.weight = 
nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 19 | self.lstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim) 20 | ###self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 21 | self.hidden = self.init_hidden() 22 | 23 | self.content_dim = 256 24 | self.conv = nn.Conv1d(in_channels=opt.hidden_dim, out_channels=self.content_dim, kernel_size=opt.hidden_dim * 2, stride=opt.embedding_dim) 25 | self.hidden2label = nn.Linear(self.content_dim, opt.label_size) 26 | 27 | def init_hidden(self,batch_size=None): 28 | if batch_size is None: 29 | batch_size= self.batch_size 30 | 31 | if self.use_gpu: 32 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 33 | c0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 34 | else: 35 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim)) 36 | c0 = Variable(torch.zeros(1,batch_size, self.hidden_dim)) 37 | return (h0, c0) 38 | # @profile 39 | def forward(self, sentence): 40 | embeds = self.word_embeddings(sentence) #64x200x300 41 | 42 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 43 | x=embeds.permute(1,0,2) #200x64x300 44 | self.hidden= self.init_hidden(sentence.size()[0]) #1x64x128 45 | lstm_out, self.hidden = self.lstm(x, self.hidden) ###input (seq_len, batch, input_size) #Outupts:output, (h_n, c_n) output:(seq_len, batch, hidden_size * num_directions) 46 | #lstm_out 200x64x128 lstm_out.permute(1,2,0):64x128x200 47 | y = self.conv(lstm_out.permute(1,2,0)) ###64x256x1 48 | ###y = self.conv(lstm_out.permute(1,2,0).contiguous().view(self.batch_size,128,-1)) 49 | #y = self.hidden2label(y.view(sentence.size()[0],-1)) 50 | y = self.hidden2label(y.view(y.size()[0],-1)) #64x3 51 | return y -------------------------------------------------------------------------------- /models/SelfAttention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-# 2 | # https://arxiv.org/pdf/1703.03130.pdf 3 | # A Structured Self-attentive Sentence Embedding 4 | # https://github.com/nn116003/self-attention-classification/blob/master/imdb_attn.py 5 | 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch 9 | from torch.autograd import Variable 10 | #from memory_profiler import profile 11 | 12 | class SelfAttention(nn.Module): 13 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 14 | def __init__(self,opt): 15 | self.opt=opt 16 | super(SelfAttention, self).__init__() 17 | self.hidden_dim = opt.hidden_dim 18 | self.batch_size = opt.batch_size 19 | self.use_gpu = torch.cuda.is_available() 20 | 21 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 22 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 23 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 24 | 25 | self.num_layers = 1 26 | #self.bidirectional = True 27 | self.dropout = opt.keep_dropout 28 | self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, num_layers=self.num_layers, dropout=self.dropout, bidirectional=True) 29 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 30 | self.hidden = self.init_hidden() 31 | self.self_attention = nn.Sequential( 32 | nn.Linear(opt.hidden_dim, 24), 33 | nn.ReLU(True), 34 | nn.Linear(24,1) 35 | ) 36 | def init_hidden(self,batch_size=None): 37 | if batch_size is None: 38 | batch_size= self.batch_size 39 | 40 | if self.use_gpu: 41 
| h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 42 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 43 | else: 44 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 45 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 46 | return (h0, c0) 47 | # @profile 48 | def forward(self, sentence): 49 | embeds = self.word_embeddings(sentence) 50 | 51 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 52 | x=embeds.permute(1,0,2) 53 | self.hidden= self.init_hidden(sentence.size()[0]) #2x64x64 54 | lstm_out, self.hidden = self.bilstm(x, self.hidden) #lstm_out:200x64x128 55 | final =lstm_out.permute(1,0,2)#torch.mean(,1) 56 | attn_ene = self.self_attention(final) 57 | attns =F.softmax(attn_ene.view(self.batch_size, -1)) 58 | feats = (final * attns).sum(dim=1) 59 | y = self.hidden2label(feats) #64x3 60 | 61 | return y -------------------------------------------------------------------------------- /models/Transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' Define the Transformer model ''' 4 | import torch 5 | import torch.nn as nn 6 | import numpy as np 7 | import torch.nn.init as init 8 | 9 | 10 | 11 | __author__ = "Yu-Hsiang Huang" 12 | #refer to "https://github.com/jadore801120/attention-is-all-you-need-pytorch" 13 | 14 | class ConstantsClass(): 15 | def __init__(self): 16 | self.PAD = 0 17 | self.UNK = 1 18 | self.BOS = 2 19 | self.EOS = 3 20 | self.PAD_WORD = '' 21 | self.UNK_WORD = '' 22 | self.BOS_WORD = '' 23 | self.EOS_WORD = '' 24 | Constants =ConstantsClass() 25 | 26 | class Linear(nn.Module): 27 | ''' Simple Linear layer with xavier init ''' 28 | def __init__(self, d_in, d_out, bias=True): 29 | super(Linear, self).__init__() 30 | self.linear = nn.Linear(d_in, d_out, bias=bias) 31 | init.xavier_normal(self.linear.weight) 32 | 33 | def forward(self, x): 34 | return self.linear(x) 35 | 36 | class Bottle(nn.Module): 37 | ''' Perform the reshape routine before and after an operation ''' 38 | 39 | def forward(self, input): 40 | if len(input.size()) <= 2: 41 | return super(Bottle, self).forward(input) 42 | size = input.size()[:2] 43 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 44 | return out.view(size[0], size[1], -1) 45 | 46 | class BottleLinear(Bottle, Linear): 47 | ''' Perform the reshape routine before and after a linear projection ''' 48 | pass 49 | 50 | class BottleSoftmax(Bottle, nn.Softmax): 51 | ''' Perform the reshape routine before and after a softmax operation''' 52 | pass 53 | 54 | class LayerNormalization(nn.Module): 55 | ''' Layer normalization module ''' 56 | 57 | def __init__(self, d_hid, eps=1e-3): 58 | super(LayerNormalization, self).__init__() 59 | 60 | self.eps = eps 61 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 62 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 63 | 64 | def forward(self, z): 65 | if z.size(1) == 1: 66 | return z 67 | 68 | mu = torch.mean(z, keepdim=True, dim=-1) 69 | sigma = torch.std(z, keepdim=True, dim=-1) 70 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 71 | ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out) 72 | 73 | return ln_out 74 | 75 | class BatchBottle(nn.Module): 76 | ''' Perform the reshape routine before and after an operation ''' 77 | 78 | def forward(self, input): 79 | if len(input.size()) <= 2: 80 | 
return super(BatchBottle, self).forward(input) 81 | size = input.size()[1:] 82 | out = super(BatchBottle, self).forward(input.view(-1, size[0]*size[1])) 83 | return out.view(-1, size[0], size[1]) 84 | 85 | class BottleLayerNormalization(BatchBottle, LayerNormalization): 86 | ''' Perform the reshape routine before and after a layer normalization''' 87 | pass 88 | 89 | class ScaledDotProductAttention(nn.Module): 90 | ''' Scaled Dot-Product Attention ''' 91 | 92 | def __init__(self, d_model, attn_dropout=0.1): 93 | super(ScaledDotProductAttention, self).__init__() 94 | self.temper = np.power(d_model, 0.5) 95 | self.dropout = nn.Dropout(attn_dropout) 96 | self.softmax = BottleSoftmax() 97 | 98 | def forward(self, q, k, v, attn_mask=None): 99 | 100 | attn = torch.bmm(q, k.transpose(1, 2)) / self.temper 101 | 102 | if attn_mask is not None: 103 | 104 | assert attn_mask.size() == attn.size(), \ 105 | 'Attention mask shape {} mismatch ' \ 106 | 'with Attention logit tensor shape ' \ 107 | '{}.'.format(attn_mask.size(), attn.size()) 108 | 109 | attn.data.masked_fill_(attn_mask, -float('inf')) 110 | 111 | attn = self.softmax(attn) 112 | attn = self.dropout(attn) 113 | output = torch.bmm(attn, v) 114 | 115 | return output, attn 116 | class MultiHeadAttention(nn.Module): 117 | ''' Multi-Head Attention module ''' 118 | 119 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 120 | super(MultiHeadAttention, self).__init__() 121 | 122 | self.n_head = n_head 123 | self.d_k = d_k 124 | self.d_v = d_v 125 | 126 | self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k)) 127 | self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k)) 128 | self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v)) 129 | 130 | self.attention = ScaledDotProductAttention(d_model) 131 | self.layer_norm = LayerNormalization(d_model) 132 | self.proj = Linear(n_head*d_v, d_model) 133 | 134 | self.dropout = nn.Dropout(dropout) 135 | 136 | init.xavier_normal(self.w_qs) 137 | init.xavier_normal(self.w_ks) 138 | init.xavier_normal(self.w_vs) 139 | 140 | def forward(self, q, k, v, attn_mask=None): 141 | 142 | d_k, d_v = self.d_k, self.d_v 143 | n_head = self.n_head 144 | 145 | residual = q 146 | 147 | mb_size, len_q, d_model = q.size() 148 | mb_size, len_k, d_model = k.size() 149 | mb_size, len_v, d_model = v.size() 150 | 151 | # treat as a (n_head) size batch 152 | q_s = q.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_q) x d_model 153 | k_s = k.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_k) x d_model 154 | v_s = v.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_v) x d_model 155 | 156 | # treat the result as a (n_head * mb_size) size batch 157 | q_s = torch.bmm(q_s, self.w_qs).view(-1, len_q, d_k) # (n_head*mb_size) x len_q x d_k 158 | k_s = torch.bmm(k_s, self.w_ks).view(-1, len_k, d_k) # (n_head*mb_size) x len_k x d_k 159 | v_s = torch.bmm(v_s, self.w_vs).view(-1, len_v, d_v) # (n_head*mb_size) x len_v x d_v 160 | 161 | # perform attention, result size = (n_head * mb_size) x len_q x d_v 162 | outputs, attns = self.attention(q_s, k_s, v_s, attn_mask=attn_mask.repeat(n_head, 1, 1)) 163 | 164 | # back to original mb_size batch, result size = mb_size x len_q x (n_head*d_v) 165 | outputs = torch.cat(torch.split(outputs, mb_size, dim=0), dim=-1) 166 | 167 | # project back to residual size 168 | outputs = self.proj(outputs) 169 | outputs = self.dropout(outputs) 170 | 171 | return self.layer_norm(outputs + residual), attns 172 | 173 | 
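# A brief shape note for the attention blocks above (the sizes in the example are
# illustrative assumptions, not values used elsewhere in this repository):
#   ScaledDotProductAttention: attn = softmax(q @ k^T / sqrt(d_model)), output = attn @ v
#       q, k, v: (batch, len_q, d_k) / (batch, len_k, d_k) / (batch, len_v, d_v), with len_k == len_v
#       e.g. attention = ScaledDotProductAttention(d_model=512)
#            q = k = v = torch.randn(16, 10, 64)
#            out, attn = attention(q, k, v)        # out: (16, 10, 64), attn: (16, 10, 10)
#   MultiHeadAttention: repeats q/k/v n_head times, projects them with w_qs/w_ks/w_vs,
#       applies the scaled dot-product attention per head, concatenates the heads, projects
#       back to d_model, and returns layer_norm(output + residual) plus the attention weights.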
class PositionwiseFeedForward(nn.Module): 174 | ''' A two-feed-forward-layer module ''' 175 | 176 | def __init__(self, d_hid, d_inner_hid, dropout=0.1): 177 | super(PositionwiseFeedForward, self).__init__() 178 | self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1) # position-wise 179 | self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1) # position-wise 180 | self.layer_norm = LayerNormalization(d_hid) 181 | self.dropout = nn.Dropout(dropout) 182 | self.relu = nn.ReLU() 183 | 184 | def forward(self, x): 185 | residual = x 186 | output = self.relu(self.w_1(x.transpose(1, 2))) 187 | output = self.w_2(output).transpose(2, 1) 188 | output = self.dropout(output) 189 | return self.layer_norm(output + residual) 190 | class EncoderLayer(nn.Module): 191 | ''' Compose with two layers ''' 192 | 193 | def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1): 194 | super(EncoderLayer, self).__init__() 195 | self.slf_attn = MultiHeadAttention( 196 | n_head, d_model, d_k, d_v, dropout=dropout) 197 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout) 198 | 199 | def forward(self, enc_input, slf_attn_mask=None): 200 | enc_output, enc_slf_attn = self.slf_attn( 201 | enc_input, enc_input, enc_input, attn_mask=slf_attn_mask) 202 | enc_output = self.pos_ffn(enc_output) 203 | return enc_output, enc_slf_attn 204 | 205 | class DecoderLayer(nn.Module): 206 | ''' Compose with three layers ''' 207 | 208 | def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1): 209 | super(DecoderLayer, self).__init__() 210 | self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 211 | self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 212 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout) 213 | 214 | def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None): 215 | dec_output, dec_slf_attn = self.slf_attn( 216 | dec_input, dec_input, dec_input, attn_mask=slf_attn_mask) 217 | dec_output, dec_enc_attn = self.enc_attn( 218 | dec_output, enc_output, enc_output, attn_mask=dec_enc_attn_mask) 219 | dec_output = self.pos_ffn(dec_output) 220 | 221 | return dec_output, dec_slf_attn, dec_enc_attn 222 | 223 | def position_encoding_init(n_position, d_pos_vec): 224 | ''' Init the sinusoid position encoding table ''' 225 | 226 | # keep dim 0 for padding token position encoding zero vector 227 | position_enc = np.array([ 228 | [pos / np.power(10000, 2 * (j // 2) / d_pos_vec) for j in range(d_pos_vec)] 229 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) 230 | 231 | position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i 232 | position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 233 | return torch.from_numpy(position_enc).type(torch.FloatTensor) 234 | 235 | def get_attn_padding_mask(seq_q, seq_k): 236 | ''' Indicate the padding-related part to mask ''' 237 | assert seq_q.dim() == 2 and seq_k.dim() == 2 238 | mb_size, len_q = seq_q.size() 239 | mb_size, len_k = seq_k.size() 240 | pad_attn_mask = seq_k.data.eq(Constants.PAD).unsqueeze(1) # bx1xsk 241 | pad_attn_mask = pad_attn_mask.expand(mb_size, len_q, len_k) # bxsqxsk 242 | return pad_attn_mask 243 | 244 | def get_attn_subsequent_mask(seq): 245 | ''' Get an attention mask to avoid using the subsequent info.''' 246 | assert seq.dim() == 2 247 | attn_shape = (seq.size(0), seq.size(1), seq.size(1)) 248 | subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') 249 | subsequent_mask = 
torch.from_numpy(subsequent_mask) 250 | if seq.is_cuda: 251 | subsequent_mask = subsequent_mask.cuda() 252 | return subsequent_mask 253 | 254 | class Encoder(nn.Module): 255 | ''' A encoder model with self attention mechanism. ''' 256 | 257 | def __init__( 258 | self, n_src_vocab, n_max_seq, n_layers=6, n_head=8, d_k=64, d_v=64, 259 | d_word_vec=512, d_model=512, d_inner_hid=1024, dropout=0.1): 260 | 261 | super(Encoder, self).__init__() 262 | 263 | n_position = n_max_seq + 1 264 | self.n_max_seq = n_max_seq 265 | self.d_model = d_model 266 | 267 | self.position_enc = nn.Embedding(n_position, d_word_vec, padding_idx=Constants.PAD) 268 | self.position_enc.weight.data = position_encoding_init(n_position, d_word_vec) 269 | 270 | self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=Constants.PAD) 271 | 272 | self.layer_stack = nn.ModuleList([ 273 | EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout=dropout) 274 | for _ in range(n_layers)]) 275 | 276 | def forward(self, src_seq, src_pos, return_attns=False): 277 | # Word embedding look up 278 | enc_input = self.src_word_emb(src_seq) 279 | 280 | # Position Encoding addition 281 | enc_input += self.position_enc(src_pos) 282 | if return_attns: 283 | enc_slf_attns = [] 284 | 285 | enc_output = enc_input 286 | enc_slf_attn_mask = get_attn_padding_mask(src_seq, src_seq) 287 | for enc_layer in self.layer_stack: 288 | enc_output, enc_slf_attn = enc_layer( 289 | enc_output, slf_attn_mask=enc_slf_attn_mask) 290 | if return_attns: 291 | enc_slf_attns += [enc_slf_attn] 292 | 293 | if return_attns: 294 | return enc_output, enc_slf_attns 295 | else: 296 | return enc_output 297 | 298 | class Decoder(nn.Module): 299 | ''' A decoder model with self attention mechanism. ''' 300 | def __init__( 301 | self, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8, d_k=64, d_v=64, 302 | d_word_vec=512, d_model=512, d_inner_hid=1024, dropout=0.1): 303 | 304 | super(Decoder, self).__init__() 305 | n_position = n_max_seq + 1 306 | self.n_max_seq = n_max_seq 307 | self.d_model = d_model 308 | 309 | self.position_enc = nn.Embedding( 310 | n_position, d_word_vec, padding_idx=Constants.PAD) 311 | self.position_enc.weight.data = position_encoding_init(n_position, d_word_vec) 312 | 313 | self.tgt_word_emb = nn.Embedding( 314 | n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD) 315 | self.dropout = nn.Dropout(dropout) 316 | 317 | self.layer_stack = nn.ModuleList([ 318 | DecoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout=dropout) 319 | for _ in range(n_layers)]) 320 | 321 | def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False): 322 | # Word embedding look up 323 | dec_input = self.tgt_word_emb(tgt_seq) 324 | 325 | # Position Encoding addition 326 | dec_input += self.position_enc(tgt_pos) 327 | 328 | # Decode 329 | dec_slf_attn_pad_mask = get_attn_padding_mask(tgt_seq, tgt_seq) 330 | dec_slf_attn_sub_mask = get_attn_subsequent_mask(tgt_seq) 331 | dec_slf_attn_mask = torch.gt(dec_slf_attn_pad_mask + dec_slf_attn_sub_mask, 0) 332 | 333 | dec_enc_attn_pad_mask = get_attn_padding_mask(tgt_seq, src_seq) 334 | 335 | if return_attns: 336 | dec_slf_attns, dec_enc_attns = [], [] 337 | 338 | dec_output = dec_input 339 | for dec_layer in self.layer_stack: 340 | dec_output, dec_slf_attn, dec_enc_attn = dec_layer( 341 | dec_output, enc_output, 342 | slf_attn_mask=dec_slf_attn_mask, 343 | dec_enc_attn_mask=dec_enc_attn_pad_mask) 344 | 345 | if return_attns: 346 | dec_slf_attns += [dec_slf_attn] 347 | dec_enc_attns += [dec_enc_attn] 348 | 349 
| if return_attns: 350 | return dec_output, dec_slf_attns, dec_enc_attns 351 | else: 352 | return dec_output, 353 | 354 | class Transformer(nn.Module): 355 | ''' A sequence to sequence model with attention mechanism. ''' 356 | 357 | def __init__( 358 | self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8, 359 | d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64, 360 | dropout=0.1, proj_share_weight=True, embs_share_weight=True): 361 | 362 | super(Transformer, self).__init__() 363 | self.encoder = Encoder( 364 | n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head, 365 | d_word_vec=d_word_vec, d_model=d_model, 366 | d_inner_hid=d_inner_hid, dropout=dropout) 367 | self.decoder = Decoder( 368 | n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head, 369 | d_word_vec=d_word_vec, d_model=d_model, 370 | d_inner_hid=d_inner_hid, dropout=dropout) 371 | self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False) 372 | self.dropout = nn.Dropout(dropout) 373 | 374 | assert d_model == d_word_vec, \ 375 | 'To facilitate the residual connections, \ 376 | the dimensions of all module output shall be the same.' 377 | 378 | if proj_share_weight: 379 | # Share the weight matrix between tgt word embedding/projection 380 | assert d_model == d_word_vec 381 | self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight 382 | 383 | if embs_share_weight: 384 | # Share the weight matrix between src/tgt word embeddings 385 | # assume the src/tgt word vec size are the same 386 | assert n_src_vocab == n_tgt_vocab, \ 387 | "To share word embedding table, the vocabulary size of src/tgt shall be the same." 388 | self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight 389 | 390 | def get_trainable_parameters(self): 391 | ''' Avoid updating the position encoding ''' 392 | enc_freezed_param_ids = set(map(id, self.encoder.position_enc.parameters())) 393 | dec_freezed_param_ids = set(map(id, self.decoder.position_enc.parameters())) 394 | freezed_param_ids = enc_freezed_param_ids | dec_freezed_param_ids 395 | return (p for p in self.parameters() if id(p) not in freezed_param_ids) 396 | 397 | def forward(self, src, tgt): 398 | src_seq, src_pos = src 399 | tgt_seq, tgt_pos = tgt 400 | 401 | tgt_seq = tgt_seq[:, :-1] 402 | tgt_pos = tgt_pos[:, :-1] 403 | 404 | enc_output, _ = self.encoder(src_seq, src_pos) 405 | dec_output, _ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output) 406 | seq_logit = self.tgt_word_proj(dec_output) 407 | 408 | return seq_logit.view(-1, seq_logit.size(2)) 409 | 410 | class AttentionIsAllYouNeed(nn.Module): 411 | def __init__(self, opt, n_layers=6, n_head=8, 412 | d_word_vec=128, d_model=128, d_inner_hid=256, d_k=32, d_v=32, 413 | dropout=0.1, proj_share_weight=True, embs_share_weight=True): 414 | # self, opt, n_layers=6, n_head=8, d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64, 415 | 416 | super(AttentionIsAllYouNeed, self).__init__() 417 | self.encoder = Encoder( 418 | opt.vocab_size, opt.max_seq_len, n_layers=n_layers, n_head=n_head, 419 | d_word_vec=d_word_vec, d_model=d_model, 420 | d_inner_hid=d_inner_hid, dropout=dropout) 421 | self.hidden2label = nn.Linear(opt.max_seq_len*d_model, opt.label_size) 422 | self.batch_size=opt.batch_size 423 | def forward(self, inp): 424 | 425 | src_seq,src_pos = inp 426 | # enc_output, *_ = self.encoder(src_seq, src_pos) #64x200x512 427 | enc_output = self.encoder(src_seq, src_pos) #64x200x512 428 | return self.hidden2label(enc_output.view((self.batch_size,-1))) 429 | 430 | 431 | 
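# A minimal usage sketch for the classifier defined above (illustrative only: the
# sizes follow the defaults in opts.py, and the construction of src_pos is an
# assumption based on position_encoding_init, which reserves index 0 for padding).
# On the older, Variable-based PyTorch API used throughout this repository:
#
#     opt = opts.parse_opt()                  # vocab_size / label_size are filled in by the data loader
#     model = AttentionIsAllYouNeed(opt)
#     src_seq = Variable(torch.ones(opt.batch_size, opt.max_seq_len).long())   # word indices, 0 = PAD
#     src_pos = Variable(torch.arange(1, opt.max_seq_len + 1).long()
#                        .unsqueeze(0).repeat(opt.batch_size, 1))              # 1-based positions
#     logits = model((src_seq, src_pos))      # batch_size x opt.label_size
#
# Because forward() reshapes with self.batch_size, every batch fed to this model
# must contain exactly opt.batch_size examples.  Note also that
# MultiHeadAttention.forward always calls attn_mask.repeat(...), so the padding
# mask produced by get_attn_padding_mask is required rather than optional.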
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | 
7 | 
8 | 
9 | import numpy as np
10 | 
11 | 
12 | 
13 | from .LSTM import LSTMClassifier
14 | from .CNNBasic import BasicCNN1D,BasicCNN2D
15 | from .CNNKim import KIMCNN1D,KIMCNN2D
16 | from .CNNMultiLayer import MultiLayerCNN
17 | from .CNNInception import InceptionCNN
18 | from .FastText import FastText
19 | from .Capsule import CapsuleNet
20 | from .RCNN import RCNN
21 | from .RNN_CNN import RNN_CNN
22 | from .LSTMBI import LSTMBI
23 | from .Transformer import AttentionIsAllYouNeed
24 | from .SelfAttention import SelfAttention
25 | from .LSTMwithAttention import LSTMAttention
26 | def setup(opt):
27 | 
28 |     if opt.model == 'lstm':
29 |         model = LSTMClassifier(opt)
30 |     elif opt.model in ('basic_cnn', 'baisc_cnn', 'cnn'):  # 'baisc_cnn' kept as a backward-compatible alias
31 |         model = BasicCNN1D(opt)
32 |     elif opt.model in ('basic_cnn_2d', 'baisc_cnn_2d'):
33 |         model = BasicCNN2D(opt)
34 |     elif opt.model == 'kim_cnn':
35 |         model = KIMCNN1D(opt)
36 |     elif opt.model == 'kim_cnn_2d':
37 |         model = KIMCNN2D(opt)
38 |     elif opt.model == 'multi_cnn':
39 |         model = MultiLayerCNN(opt)
40 |     elif opt.model == 'inception_cnn':
41 |         model = InceptionCNN(opt)
42 |     elif opt.model == 'fasttext':
43 |         model = FastText(opt)
44 |     elif opt.model == 'capsule':
45 |         model = CapsuleNet(opt)
46 |     elif opt.model == 'rnn_cnn':
47 |         model = RNN_CNN(opt)
48 |     elif opt.model == 'rcnn':
49 |         model = RCNN(opt)
50 |     elif opt.model == 'bilstm':
51 |         model = LSTMBI(opt)
52 |     elif opt.model == "transformer":
53 |         model = AttentionIsAllYouNeed(opt)
54 |     elif opt.model == "selfattention":
55 |         model = SelfAttention(opt)
56 |     elif opt.model == "lstm_attention":
57 |         model = LSTMAttention(opt)
58 |     else:
59 |         raise Exception("model not supported: {}".format(opt.model))
60 |     return model
61 | 
--------------------------------------------------------------------------------
/opts.py:
--------------------------------------------------------------------------------
1 | import argparse,os
2 | import configparser
3 | def parse_opt():
4 |     parser = argparse.ArgumentParser()
5 |     # Data input settings
6 | 
7 |     parser.add_argument('--config', type=str, default="no_file_exists",
8 |                         help='path of an optional .ini config file; its [COMMON] section overrides these defaults')
9 | 
10 | 
11 |     parser.add_argument('--hidden_dim', type=int, default=128,
12 |                         help='hidden_dim')
13 | 
14 |     parser.add_argument('--max_seq_len', type=int, default=200,
15 |                         help='max_seq_len')
16 |     parser.add_argument('--batch_size', type=int, default=64,
17 |                         help='batch_size')
18 |     parser.add_argument('--embedding_dim', type=int, default=100,
19 |                         help='embedding_dim')
20 |     parser.add_argument('--learning_rate', type=float, default=2e-5,
21 |                         help='learning_rate')
22 |     parser.add_argument('--grad_clip', type=float, default=1e-1,
23 |                         help='grad_clip')
24 | 
25 |     parser.add_argument('--model', type=str, default="bilstm",
26 |                         help='model name')
27 | 
28 |     parser.add_argument('--dataset', type=str, default="imdb",
29 | 
30 |                         help='dataset')
31 |     parser.add_argument('--position', type=bool, default=False,
32 |                         help='use position information (set to True automatically for the transformer model)')
33 | 
34 |     parser.add_argument('--keep_dropout', type=float, default=0.8,
35 |                         help='keep_dropout')
36 |     parser.add_argument('--max_epoch', type=int, default=20,
37 |                         help='max_epoch')
38 |     parser.add_argument('--embedding_file', type=str, default="glove.6b.300",
39 |                         help='glove or w2v')
40 |     parser.add_argument('--embedding_training', type=str, default="false",
41 |                         help='embedding_training')
42 |     #kim CNN
43 |     parser.add_argument('--kernel_sizes', type=str, default="1,2,3,5",
44 |                         help='kernel_sizes')
45 |     parser.add_argument('--kernel_nums', type=str, default="256,256,256,256",
46 |                         help='kernel_nums')
47 |     parser.add_argument('--embedding_type', type=str, default="non-static",
48 |                         help='embedding_type')
49 |     parser.add_argument('--lstm_mean', type=str, default="mean",# last
50 |                         help='lstm_mean')
51 |     parser.add_argument('--lstm_layers', type=int, default=1,# last
52 |                         help='lstm_layers')
53 |     parser.add_argument('--gpu', type=int, default=0,
54 |                         help='gpu number')
55 |     parser.add_argument('--proxy', type=str, default="null",
56 |                         help='http://proxy.xx.com:8080')
57 |     parser.add_argument('--debug', type=str, default="true",
58 |                         help='debug mode ("true"/"false")')
59 | 
60 |     parser.add_argument('--embedding_dir', type=str, default=".glove/glove.6B.300d.txt",
61 |                         help='embedding_dir')
62 |     parser.add_argument('--from_torchtext', type=str, default="false",
63 |                         help='from torchtext or native data loader')
64 |     #
65 |     args = parser.parse_args()
66 | 
67 |     if args.config != "no_file_exists":
68 |         if os.path.exists(args.config):
69 |             config = configparser.ConfigParser()
70 |             config_file_path=args.config
71 |             config.read(config_file_path)
72 |             config_common = config['COMMON']
73 |             for key in config_common.keys():
74 |                 args.__dict__[key]=config_common[key]
75 |         else:
76 |             print("config file named %s does not exist" % args.config)
77 | 
78 |     args.kernel_sizes = [int(i) for i in args.kernel_sizes.split(",")]
79 |     args.kernel_nums = [int(i) for i in args.kernel_nums.split(",")]
80 |     #
81 |     # # Check if args are valid
82 |     # assert args.rnn_size > 0, "rnn_size should be greater than 0"
83 | 
84 |     if "CUDA_VISIBLE_DEVICES" not in os.environ.keys():
85 |         os.environ["CUDA_VISIBLE_DEVICES"] =str(args.gpu)
86 | 
87 |     if args.model=="transformer":
88 |         args.position=True
89 |     else:
90 |         args.position=False
91 |     if args.debug.lower() =="true":
92 |         args.debug = True
93 |     else:
94 |         args.debug = False
95 | 
96 |     if args.embedding_training.lower() =="true":
97 |         args.embedding_training = True
98 |     else:
99 |         args.embedding_training = False
100 |     if args.from_torchtext.lower() =="true":
101 |         args.from_torchtext = True
102 |     else:
103 |         args.from_torchtext = False
104 | 
105 | 
106 |     if os.path.exists("proxy.config"):
107 |         with open("proxy.config") as f:
108 | 
109 |             args.proxy = f.read()
110 |             print(args.proxy)
111 | 
112 | 
113 | 
114 |     return args
115 | 
--------------------------------------------------------------------------------
/parameter_search.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | from six.moves import cPickle
10 | import time,os,random
11 | import itertools
12 | 
13 | import torch
14 | from torch.autograd import Variable
15 | import torch.optim as optim
16 | import torch.nn as nn
17 | import torch.nn.functional as F
18 | from torch.nn.modules.loss import NLLLoss,MultiLabelSoftMarginLoss,MultiLabelMarginLoss,BCELoss
19 | 
20 | import opts
21 | import models
22 | import utils
23 | 
24 | 
25 | timeArray = time.localtime(int(time.time()) )
26 | timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray)
27 | performance_log_file = timeStamp + "result.csv"
28 | 
29 | 
30 | opt = opts.parse_opt()
31 | train_iter, test_iter = utils.loadData(opt)
32 | 
33 | 
34 | def train(opt,train_iter, test_iter,verbose=True):
35 |     global_start= time.time()
36 |     logger = utils.getLogger()
37 |     model=models.setup(opt)
38 |     if torch.cuda.is_available():
39 |         model.cuda()
40 |     params = [param for param in model.parameters() if param.requires_grad] #filter(lambda p: p.requires_grad, model.parameters())
41 | 
42 |     model_info ="; ".join( [str(k)+" : "+ str(v) for k,v in opt.__dict__.items() if type(v) in (str,int,float,list,bool)])
43 |     logger.info("# parameters:" + str(sum(param.numel() for param in params)))
44 |     logger.info(model_info)
45 | 
46 | 
47 |     model.train()
48 |     optimizer = utils.getOptimizer(params,name=opt.optimizer, lr=opt.learning_rate,scheduler=opt.lr_scheduler)
49 |     optimizer.zero_grad()
50 |     loss_fun = F.cross_entropy
51 | 
52 |     precisions=[]
53 |     for i in range(opt.max_epoch):
54 |         for epoch,batch in enumerate(train_iter):
55 |             start= time.time()
56 | 
57 |             text = batch.text[0] if opt.from_torchtext else batch.text
58 |             predicted = model(text)
59 | 
60 |             loss= loss_fun(predicted,batch.label)
61 |             optimizer.zero_grad()  # clear gradients before each backward pass; they accumulate otherwise
62 |             loss.backward()
63 |             utils.clip_gradient(optimizer, opt.grad_clip)
64 |             optimizer.step()
65 | 
66 |             if verbose:
67 |                 if torch.cuda.is_available():
68 |                     logger.info("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i,epoch,loss.cpu().data.numpy()[0],time.time()-start))
69 |                 else:
70 |                     logger.info("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i,epoch,loss.data.numpy()[0],time.time()-start))
71 | 
72 |         precision=utils.evaluation(model,test_iter,opt.from_torchtext)
73 |         precisions.append(precision)
74 |         if verbose:
75 |             logger.info("%d iteration with precision %.4f" % (i,precision))
76 | 
77 |     # while(utils.is_writeable(performance_log_file)):
78 |     df = pd.read_csv(performance_log_file,index_col=0,sep="\t")
79 |     df.loc[model_info,opt.dataset] = max(precisions)
80 |     df.to_csv(performance_log_file,sep="\t")
81 |     logger.info(model_info +" with time :"+ str( time.time()-global_start)+" ->" +str( max(precisions) ) )
82 |     print(model_info +" with time :"+ str( time.time()-global_start)+" ->" +str( max(precisions) ) )
83 | 
84 | if __name__=="__main__":
85 | 
86 |     if not os.path.exists(performance_log_file):
87 |         with open(performance_log_file,"w") as f:
88 |             f.write("argument\n")
89 |             f.close()
90 |     print("gpu : %d" % opt.gpu)
91 | 
92 | 
93 |     parameter_pools={
94 |         "model":["lstm","cnn","kim_cnn","fasttext"],
95 |         "keep_dropout":[0.1,0.5,0.8,0.9,1.0],
96 |         "batch_size":[32,64,128],
97 |         "learning_rate":[100,10,1,1e-1,1e-2,1e-3],
98 |         "optimizer":["adam"],
99 |         "lr_scheduler":[None]
100 |     }
101 | 
102 |     pool =[ arg for arg in itertools.product(*parameter_pools.values())]
103 |     random.shuffle(pool)  # shuffle in place; random.shuffle returns None
104 |     args=[arg for i,arg in enumerate(pool) if i%8==opt.gpu]
105 | 
106 |     for arg in args:
107 |         for k,v in zip(parameter_pools.keys(),arg):
108 |             opt.__setattr__(k,v)
109 |         train(opt,train_iter, test_iter,verbose=True)
110 | 
--------------------------------------------------------------------------------
/push.bash:
--------------------------------------------------------------------------------
1 | git add *.py
2 | git add models/*.py
3 | git add dataloader/*.py
4 | git commit -m "$1"
5 | git pull
6 | git push
7 | 
8 | 
--------------------------------------------------------------------------------
/search.sh:
--------------------------------------------------------------------------------
1 | echo "use gpu with multiple processes";
2 | for((i=0;i<8;i++))   # one process per GPU (0-7); parameter_search.py picks its share with i%8
3 | do
4 | {
5 |     echo "use gpu $i";
6 |     echo CUDA_VISIBLE_DEVICES=$i python parameter_search.py --gpu $i --config config/imdb.ini;
7 |     CUDA_VISIBLE_DEVICES=$i python parameter_search.py --gpu $i --config config/imdb.ini;
8 | 
9 | }&
10 | done
11 | wait
--------------------------------------------------------------------------------
/trandition.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from sklearn.feature_extraction.text import CountVectorizer
6 | from sklearn.feature_extraction.text import TfidfTransformer
7 | from sklearn.naive_bayes import MultinomialNB
8 | from sklearn.pipeline import Pipeline
9 | from sklearn.pipeline import make_pipeline
10 | from sklearn.linear_model import SGDClassifier
11 | from sklearn import metrics
12 | from sklearn.model_selection import train_test_split
13 | from sklearn.model_selection import cross_val_score
14 | import numpy as np
15 | import opts
16 | import dataHelper
17 | #refer to "https://zhuanlan.zhihu.com/p/26729228"
18 | opt = opts.parse_opt()
19 | import dataHelper as helper
20 | train_iter, test_iter = dataHelper.loadData(opt,embedding=False)
21 | #categories = ['good', 'bad', 'mid']
22 | x_train,y_train=train_iter
23 | x_test,y_test = test_iter
24 | 
25 | #opt.model ="haha"
26 | if opt.model == "bayes":
27 |     """ Naive Bayes classifier """
28 |     # sklearn's Pipeline offers a mature workflow for quickly assembling a machine-learning model
29 |     bayes_clf = Pipeline([('vect', CountVectorizer()),
30 |                           ('tfidf', TfidfTransformer()),
31 |                           ('clf', MultinomialNB())
32 |                           ])
33 |     bayes_clf.fit(x_train, y_train)
34 |     """ Predict the test dataset using Naive Bayes"""
35 |     predicted = bayes_clf.predict(x_test)
36 |     print('Naive Bayes correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
37 |     # print F1, precision, recall and other metrics
38 |     # print(metrics.classification_report(y_test, predicted, target_names=categories))
39 | elif opt.model == "svm":
40 | 
41 |     """ Support Vector Machine (SVM) classifier"""
42 |     svm_clf = Pipeline([('vect', CountVectorizer()),
43 |                         ('tfidf', TfidfTransformer()),
44 |                         ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
45 |                         ])
46 |     svm_clf.fit(x_train, y_train)
47 |     predicted = svm_clf.predict(x_test)
48 |     print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
49 |     # print(metrics.classification_report(y_test, predicted, target_names=categories))
50 | 
51 | else:
52 |     """ 10-fold cross-validation """
53 |     clf_b = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
54 |     clf_s= make_pipeline(CountVectorizer(), TfidfTransformer(), SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter= 5, random_state=42))
55 | 
56 |     bayes_10_fold = cross_val_score(clf_b, x_test, y_test, cv=10)
57 |     svm_10_fold = cross_val_score(clf_s, x_test, y_test, cv=10)
58 | 
59 |     print('Naive Bayes 10-fold correct prediction: {:4.4f}'.format(np.mean(bayes_10_fold)))
60 |     print('SVM 10-fold correct prediction: {:4.4f}'.format(np.mean(svm_10_fold)))
61 |     # print the confusion matrix
62 |     #print("Confusion Matrix:")
63 |     #print(metrics.confusion_matrix(y_test, predicted))
64 |     #print('\n')
65 | 
66 | 
67 | 
68 | 
69 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch
3 | import torch.nn.functional as F
4 | from torchtext import data
5 | from torchtext import datasets
6 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText
7 | import numpy as np
8 | from functools import wraps
9 | import time
10 | import sys
11 | import logging
12 | import os
13 | 
14 | def log_time_delta(func):
15 |     @wraps(func)
16 |     def _deco(*args, **kwargs):
17 |         start = time.time()
18 |         ret = func(*args, **kwargs)
19 |         end = time.time()
20 |         delta = end - start
21 |         print( "%s ran %.2f seconds"% (func.__name__,delta))
22 |         return ret
23 |     return _deco
24 | 
25 | def clip_gradient(optimizer, grad_clip):
26 |     for group in optimizer.param_groups:
27 |         for param in group['params']:
28 |             if param.grad is not None and param.requires_grad:
29 |                 param.grad.data.clamp_(-grad_clip, grad_clip)
30 | 
31 | 
32 | def loadData(opt):
33 |     if not opt.from_torchtext:
34 |         import dataHelper as helper
35 |         return helper.loadData(opt)
36 |     device = 0 if torch.cuda.is_available() else -1
37 | 
38 |     TEXT = data.Field(lower=True, include_lengths=True, batch_first=True,fix_length=opt.max_seq_len)
39 |     LABEL = data.Field(sequential=False)
40 |     if opt.dataset=="imdb":
41 |         train, test = datasets.IMDB.splits(TEXT, LABEL)
42 |     elif opt.dataset=="sst":
43 |         train, val, test = datasets.SST.splits( TEXT, LABEL, fine_grained=True, train_subtrees=True,
44 |                                                 filter_pred=lambda ex: ex.label != 'neutral')
45 |     elif opt.dataset=="trec":
46 |         train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
47 |     else:
48 |         print("does not support this dataset")
49 | 
50 |     TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
51 |     LABEL.build_vocab(train)
52 |     # print vocab information
53 |     print('len(TEXT.vocab)', len(TEXT.vocab))
54 |     print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
55 | 
56 |     train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=opt.batch_size,device=device,repeat=False,shuffle=True)
57 | 
58 |     opt.label_size= len(LABEL.vocab)
59 |     opt.vocab_size = len(TEXT.vocab)
60 |     opt.embedding_dim= TEXT.vocab.vectors.size()[1]
61 |     opt.embeddings = TEXT.vocab.vectors
62 | 
63 |     return train_iter, test_iter
64 | 
65 | 
66 | def evaluation(model,test_iter,from_torchtext=True):
67 |     model.eval()
68 |     accuracy=[]
69 |     # batch= next(iter(test_iter))
70 |     for index,batch in enumerate( test_iter):
71 |         text = batch.text[0] if from_torchtext else batch.text
72 |         predicted = model(text)
73 |         prob, idx = torch.max(predicted, 1)
74 |         precision=(idx== batch.label).float().mean()
75 | 
76 |         if torch.cuda.is_available():
77 |             accuracy.append(precision.data.cpu().numpy()[0] )
78 |         else:
79 |             accuracy.append(precision.data.numpy()[0] )
80 |     model.train()
81 |     return np.mean(accuracy)
82 | 
83 | 
84 | 
85 | def getOptimizer(params,name="adam",lr=1,momentum=None,scheduler=None):
86 | 
87 |     name = name.lower().strip()
88 | 
89 |     if name=="adadelta":
90 |         optimizer=torch.optim.Adadelta(params, lr=1.0*lr, rho=0.9, eps=1e-06, weight_decay=0)
91 |     elif name == "adagrad":
92 |         optimizer=torch.optim.Adagrad(params, lr=0.01*lr, lr_decay=0, weight_decay=0)
93 |     elif name == "sparseadam":
94 |         optimizer=torch.optim.SparseAdam(params, lr=0.001*lr, betas=(0.9, 0.999), eps=1e-08)
95 |     elif name =="adamax":
96 |         optimizer=torch.optim.Adamax(params, lr=0.002*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
97 |     elif name =="asgd":
98 |         optimizer=torch.optim.ASGD(params, lr=0.01*lr, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=0)
99 |     elif name == "lbfgs":
100 |         optimizer=torch.optim.LBFGS(params, lr=1*lr, max_iter=20, max_eval=None, tolerance_grad=1e-05, tolerance_change=1e-09, history_size=100, line_search_fn=None)
101 |     elif name == "rmsprop":
102 |         optimizer=torch.optim.RMSprop(params, lr=0.01*lr, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
103 |     elif name =="rprop":
104 |         optimizer=torch.optim.Rprop(params, lr=0.01*lr, etas=(0.5, 1.2), step_sizes=(1e-06, 50))
105 |     elif name =="sgd":
106 |         optimizer=torch.optim.SGD(params, lr=0.1*lr, momentum=0, dampening=0, weight_decay=0, nesterov=False)
107 |     elif name =="adam":
108 |         optimizer=torch.optim.Adam(params, lr=0.1*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
109 |     else:
110 |         print("undefined optimizer, use adam in default")
111 |         optimizer=torch.optim.Adam(params, lr=0.1*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
112 | 
113 |     if scheduler is not None:
114 |         if scheduler == "lambdalr":
115 |             lambda1 = lambda epoch: epoch // 30
116 |             lambda2 = lambda epoch: 0.95 ** epoch
117 |             return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])
118 |         elif scheduler=="steplr":
119 |             return torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
120 |         elif scheduler =="multisteplr":
121 |             return torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)
122 |         elif scheduler =="reducelronplateau":
123 |             return torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
124 |         else:
125 |             return optimizer  # unknown scheduler name: fall back to the plain optimizer instead of returning None
126 | 
127 |     else:
128 |         return optimizer
129 | 
130 | 
131 |     return optimizer
132 | def getLogger():
133 |     import random
134 |     random_str = str(random.randint(1,10000))
135 | 
136 |     now = int(time.time())
137 |     timeArray = time.localtime(now)
138 |     timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray)
139 |     log_filename = "log/" +time.strftime("%Y%m%d", timeArray)
140 | 
141 |     program = os.path.basename(sys.argv[0])
142 |     logger = logging.getLogger(program)
143 |     if not os.path.exists("log"):
144 |         os.mkdir("log")
145 |     if not os.path.exists(log_filename):
146 |         os.mkdir(log_filename)
147 |     logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',datefmt='%a, %d %b %Y %H:%M:%S',filename=log_filename+'/qa'+timeStamp+"_"+ random_str+'.log',filemode='w')
148 |     logging.root.setLevel(level=logging.INFO)
149 |     logger.info("running %s" % ' '.join(sys.argv))
150 | 
151 |     return logger
152 | def is_writeable(path, check_parent=False):
153 |     '''
154 |     Check if a given path is writeable by the current user.
155 |     :param path: The path to check
156 |     :param check_parent: If the path to check does not exist, check for the
157 |     ability to write to the parent directory instead
158 |     :returns: True or False
159 |     '''
160 |     if os.access(path, os.F_OK) and os.access(path, os.W_OK):
161 |         # The path exists and is writeable
162 |         return True
163 |     if os.access(path, os.F_OK) and not os.access(path, os.W_OK):
164 |         # The path exists and is not writeable
165 |         return False
166 |     # The path does not exist or is not writeable
167 |     if check_parent is False:
168 |         # We're not allowed to check the parent directory of the provided path
169 |         return False
170 |     # Let's get the parent directory of the provided path
171 |     parent_dir = os.path.dirname(path)
172 |     if not os.access(parent_dir, os.F_OK):
173 |         # Parent directory does not exist
174 |         return False
175 |     # Finally, return if we're allowed to write in the parent directory of the
176 |     # provided path
177 |     return os.access(parent_dir, os.W_OK)
178 | def is_readable(path):
179 |     '''
180 |     Check if a given path is readable by the current user.
181 | :param path: The path to check 182 | :returns: True or False 183 | ''' 184 | if os.access(path, os.F_OK) and os.access(path, os.R_OK): 185 | # The path exists and is readable 186 | return True 187 | # The path does not exist 188 | return False 189 | 190 | --------------------------------------------------------------------------------
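The pieces above fit together as shown in the following minimal end-to-end sketch. It is not a verbatim excerpt from the repository: it simply follows the calls made in parameter_search.py and the defaults in opts.py, and it assumes the torchtext loading path with the GloVe vectors available.

    import torch
    import torch.nn.functional as F
    import opts, models, utils

    opt = opts.parse_opt()                       # defaults: bilstm on imdb
    opt.from_torchtext = True
    train_iter, test_iter = utils.loadData(opt)  # also fills opt.vocab_size, opt.label_size, opt.embeddings

    model = models.setup(opt)
    if torch.cuda.is_available():
        model.cuda()
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = utils.getOptimizer(params, name="adam", lr=opt.learning_rate)

    for epoch in range(opt.max_epoch):
        for batch in train_iter:
            optimizer.zero_grad()
            text = batch.text[0] if opt.from_torchtext else batch.text
            loss = F.cross_entropy(model(text), batch.label)
            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
        print("accuracy after epoch %d: %.4f" % (epoch, utils.evaluation(model, test_iter, opt.from_torchtext)))

Note that getOptimizer rescales the supplied learning rate per optimizer (Adam, for example, uses 0.1*lr), so the learning_rate values in parameter_search.py's pool are base values rather than the final step sizes.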