├── LICENSE.txt ├── README.md ├── config ├── grid_search_cnn.ini └── imdb.ini ├── dataHelper.py ├── dataloader ├── Dataset.py ├── __init__.py ├── ag.py ├── glove.py ├── imdb.py ├── mr.py ├── sst.py └── torch_text_demo │ ├── imdb.py │ ├── sst.py │ └── trec.py ├── docs ├── data_config.md ├── data_config_en.md ├── windows_torch.md └── windows_torch_en.md ├── main.py ├── models ├── BERTFast.py ├── BaseModel.py ├── BiBloSA.py ├── CNN.py ├── CNNBasic.py ├── CNNInception.py ├── CNNKim.py ├── CNNMultiLayer.py ├── CNNText.py ├── CNN_Inception.py ├── Capsule.py ├── ConvS2S.py ├── DiSAN.py ├── FastText.py ├── GPTModel.py ├── LSTM.py ├── LSTMBI.py ├── LSTMStack.py ├── LSTMTree.py ├── LSTMwithAttention.py ├── MLP.py ├── MemoryNetwork.py ├── QuantumCNN.py ├── RCNN.py ├── RNN_CNN.py ├── SelfAttention.py ├── Transformer.py ├── XLNetTransformer.py ├── __init__.py └── ensemble_strategy.py ├── opts.py ├── push.bash ├── search.sh ├── trandition.py └── utils.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Barun Patra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Classification Benchmark 2 | A Benchmark of Text Classification in PyTorch 3 | 4 | 5 | ## Motivation 6 | 7 | We are trying to build a benchmark for text classification, including: 8 | 9 | 10 | >Many text classification **Datasets**, covering sentiment/topic classification in popular languages (e.g. English and Chinese). Meanwhile, basic word embeddings are provided. 11 | 12 | >Implementations of many popular and state-of-the-art **Models**, especially deep neural networks. 13 | 14 | ## Progress 15 | We have already covered the following datasets and models. 16 | ### Datasets 17 | - IMDB 18 | - SST 19 | - Trec 20 | 21 | ### Models 22 | - FastText 23 | - BasicCNN (KimCNN, MultiLayerCNN, Multi-perspective CNN) 24 | - InceptionCNN 25 | - LSTM (BiLSTM, StackLSTM) 26 | - LSTM with Attention (Self Attention / Quantum Attention) 27 | - Hybrids between CNN and RNN (RCNN, C-LSTM) 28 | - Transformer - Attention Is All You Need 29 | - ConvS2S 30 | - Capsule 31 | - Quantum-inspired NN 32 | 33 | ## Libraries 34 | 35 | You should have installed [these libraries](docs/windows_torch_en.md) 36 |
<pre>
 37 | python3
 38 | torch
 39 | torchtext (optional)
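pytorch_pretrained_bert (optional, only needed for the BERT-based models)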
 40 | 
41 | </pre> 42 | ## Dataset 43 | Datasets are configured automatically under the current path; alternatively, download your data manually by following the step-by-step guide in [Dataset](docs/data_config_en.md). 44 | 45 | including 46 |
<pre>
 47 | Glove embedding
 48 | Sentiment classification dataset IMDB
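Loaders for MR, SST and AG News are also provided in dataloader/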
 49 | 
50 | </pre> 51 | 52 | ## Usage 53 | 54 | 55 | Run with the default settings 56 |
<pre>
python main.py</pre>
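Each run records its configuration and best test accuracy in a tab-separated results file under ```log/``` (see ```performance_log_file``` in ```main.py```).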
57 | 58 | CNN 59 |
python main.py --model cnn</pre>
60 | 61 | LSTM 62 |
python main.py --model lstm</pre>
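Other models in ```models/``` and other datasets in ```dataloader/``` are selected the same way. The exact flag names live in ```opts.py```, so the line below is an illustrative guess rather than a tested command
<pre>python main.py --model fasttext --dataset sst</pre>

Hyper-parameter grid search is configured in ```config/grid_search_cnn.ini```, where each key lists its candidate values separated by semicolons (e.g. ```batch_size=64;32;128```); ```main.py``` expands these into a parameter pool via ```utils.parse_grid_parameters```. That helper is not shown in this listing, so the snippet below is only a minimal sketch of the idea, assuming a plain ```configparser```-readable file:
<pre>
import configparser
import itertools

config = configparser.ConfigParser()
config.read("config/grid_search_cnn.ini")

# every value such as "64;32;128" becomes a list of candidate settings
pools = {key: value.split(";") for key, value in config["COMMON"].items()}

# iterate over the Cartesian product of all candidate settings,
# which is what main.py does with itertools.product as well
for combo in itertools.product(*pools.values()):
    print(dict(zip(pools.keys(), combo)))
</pre>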
63 | 64 | ## Road Map 65 | - [X] Data preprossing framework 66 | - [X] Models modules 67 | - [ ] Loss, Estimator and hyper-paramter tuning. 68 | - [ ] Test modules 69 | - [ ] More Dataset 70 | - [ ] More models 71 | 72 | 73 | 74 | ## Organisation of the repository 75 | The core of this repository is models and dataset. 76 | 77 | 78 | * ```dataloader/```: loading all dataset such as ```IMDB```, ```SST``` 79 | 80 | * ```models/```: creating all models such as ```FastText```, ```LSTM```,```CNN```,```Capsule```,```QuantumCNN``` ,```Multi-Head Attention``` 81 | 82 | * ```opts.py```: Parameter and config info. 83 | 84 | * ```utils.py```: tools. 85 | 86 | * ```dataHelper```: data helper 87 | 88 | 89 | 90 | 91 | ## Contributor 92 | - [@Allenzhai](https://github.com/zhaizheng) 93 | - [@JaredWei](https://github.com/jacobwei) 94 | - [@AlexMeng](https://github.com/EdwardLorenz) 95 | - [@Lilianwang](https://github.com/WangLilian) 96 | - [@ZhanSu](https://github.com/shuishen112) 97 | - [@Wabywang](https://github.com/Wabyking) 98 | 99 | Welcome your issues and contribution!!! 100 | 101 | -------------------------------------------------------------------------------- /config/grid_search_cnn.ini: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | [COMMON] 4 | model = bilstm 5 | keep_dropout=0.8;0.9 6 | batch_size=64;32;128 7 | learning_rate=10;1;0.1 8 | optimizer = adam;rmsprop 9 | dataset = imdb -------------------------------------------------------------------------------- /config/imdb.ini: -------------------------------------------------------------------------------- 1 | [COMMON] 2 | dataset = imdb;sst 3 | 4 | -------------------------------------------------------------------------------- /dataHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import string 6 | from collections import Counter 7 | import pandas as pd 8 | from tqdm import tqdm 9 | import random 10 | import time 11 | from utils import log_time_delta 12 | from dataloader import Dataset 13 | import torch 14 | from torch.autograd import Variable 15 | from codecs import open 16 | try: 17 | import cPickle as pickle 18 | except ImportError: 19 | import pickle 20 | class Alphabet(dict): 21 | def __init__(self, start_feature_id = 1, alphabet_type="text"): 22 | self.fid = start_feature_id 23 | if alphabet_type=="text": 24 | self.add('[PADDING]') 25 | self.add('[UNK]') 26 | self.add('[END]') 27 | self.unknow_token = self.get('[UNK]') 28 | self.end_token = self.get('[END]') 29 | self.padding_token = self.get('[PADDING]') 30 | 31 | def add(self, item): 32 | idx = self.get(item, None) 33 | if idx is None: 34 | idx = self.fid 35 | self[item] = idx 36 | # self[idx] = item 37 | self.fid += 1 38 | return idx 39 | 40 | def addAll(self,words): 41 | for word in words: 42 | self.add(word) 43 | 44 | def dump(self, fname,path="temp"): 45 | if not os.path.exists(path): 46 | os.mkdir(path) 47 | with open(os.path.join(path,fname), "w",encoding="utf-8") as out: 48 | for k in sorted(self.keys()): 49 | out.write("{}\t{}\n".format(k, self[k])) 50 | 51 | class DottableDict(dict): 52 | def __init__(self, *args, **kwargs): 53 | dict.__init__(self, *args, **kwargs) 54 | self.__dict__ = self 55 | self.allowDotting() 56 | def allowDotting(self, state=True): 57 | if state: 58 | self.__dict__ = self 59 | else: 60 | self.__dict__ = dict() 61 | 62 | class BucketIterator(object): 63 | def 
__init__(self,data,opt=None,batch_size=2,shuffle=True,test=False,position=False): 64 | self.shuffle=shuffle 65 | self.data=data 66 | self.batch_size=batch_size 67 | self.test=test 68 | if opt is not None: 69 | self.setup(opt) 70 | def setup(self,opt): 71 | 72 | self.batch_size=opt.batch_size 73 | self.shuffle=opt.__dict__.get("shuffle",self.shuffle) 74 | self.position=opt.__dict__.get("position",False) 75 | if self.position: 76 | self.padding_token = opt.alphabet.padding_token 77 | 78 | def transform(self,data): 79 | if torch.cuda.is_available(): 80 | data=data.reset_index() 81 | text= Variable(torch.LongTensor(data.text).cuda()) 82 | label= Variable(torch.LongTensor([int(i) for i in data.label.tolist()]).cuda()) 83 | else: 84 | data=data.reset_index() 85 | text= Variable(torch.LongTensor(data.text)) 86 | label= Variable(torch.LongTensor(data.label.tolist())) 87 | if self.position: 88 | position_tensor = self.get_position(data.text) 89 | return DottableDict({"text":(text,position_tensor),"label":label}) 90 | return DottableDict({"text":text,"label":label}) 91 | 92 | def get_position(self,inst_data): 93 | inst_position = np.array([[pos_i+1 if w_i != self.padding_token else 0 for pos_i, w_i in enumerate(inst)] for inst in inst_data]) 94 | inst_position_tensor = Variable( torch.LongTensor(inst_position), volatile=self.test) 95 | if torch.cuda.is_available(): 96 | inst_position_tensor=inst_position_tensor.cuda() 97 | return inst_position_tensor 98 | 99 | def __iter__(self): 100 | if self.shuffle: 101 | self.data = self.data.sample(frac=1).reset_index(drop=True) 102 | batch_nums = int(len(self.data)/self.batch_size) 103 | for i in range(batch_nums): 104 | yield self.transform(self.data[i*self.batch_size:(i+1)*self.batch_size]) 105 | yield self.transform(self.data[-1*self.batch_size:]) 106 | 107 | 108 | 109 | 110 | @log_time_delta 111 | def vectors_lookup(vectors,vocab,dim): 112 | embedding = np.zeros((len(vocab),dim)) 113 | count = 1 114 | for word in vocab: 115 | if word in vectors: 116 | count += 1 117 | embedding[vocab[word]]= vectors[word] 118 | else: 119 | embedding[vocab[word]]= np.random.uniform(-0.5,+0.5,dim)#vectors['[UNKNOW]'] #.tolist() 120 | print( 'word in embedding',count) 121 | return embedding 122 | 123 | @log_time_delta 124 | def load_text_vec(alphabet,filename="",embedding_size=-1): 125 | vectors = {} 126 | with open(filename,encoding='utf-8') as f: 127 | for line in tqdm(f): 128 | items = line.strip().split(' ') 129 | if len(items) == 2: 130 | vocab_size, embedding_size= items[0],items[1] 131 | print( 'embedding_size',embedding_size) 132 | print( 'vocab_size in pretrained embedding',vocab_size) 133 | else: 134 | word = items[0] 135 | if word in alphabet: 136 | vectors[word] = items[1:] 137 | print( 'words need to be found ',len(alphabet)) 138 | print( 'words found in wor2vec embedding ',len(vectors.keys())) 139 | 140 | if embedding_size==-1: 141 | embedding_size = len(vectors[list(vectors.keys())[0]]) 142 | return vectors,embedding_size 143 | 144 | def getEmbeddingFile(opt): 145 | #"glove" "w2v" 146 | embedding_name = opt.__dict__.get("embedding","glove_6b_300") 147 | if embedding_name.startswith("glove"): 148 | return os.path.join( ".vector_cache","glove.6B.300d.txt") 149 | else: 150 | return opt.embedding_dir 151 | # please refer to https://pypi.python.org/pypi/torchwordemb/0.0.7 152 | return 153 | @log_time_delta 154 | def getSubVectors(opt,alphabet): 155 | pickle_filename = "temp/"+opt.dataset+".vec" 156 | if not os.path.exists(pickle_filename) or opt.debug: 157 | 
glove_file = getEmbeddingFile(opt) 158 | wordset= set(alphabet.keys()) # python 2.7 159 | loaded_vectors,embedding_size = load_text_vec(wordset,glove_file) 160 | 161 | vectors = vectors_lookup(loaded_vectors,alphabet,embedding_size) 162 | if opt.debug: 163 | if not os.path.exists("temp"): 164 | os.mkdir("temp") 165 | with open("temp/oov.txt","w","utf-8") as f: 166 | unknown_set = set(alphabet.keys()) - set(loaded_vectors.keys()) 167 | f.write("\n".join( unknown_set)) 168 | if opt.debug: 169 | pickle.dump(vectors,open(pickle_filename,"wb")) 170 | return vectors 171 | else: 172 | print("load cache for SubVector") 173 | return pickle.load(open(pickle_filename,"rb")) 174 | 175 | def getDataSet(opt): 176 | import dataloader 177 | dataset= dataloader.getDataset(opt) 178 | # files=[os.path.join(data_dir,data_name) for data_name in ['train.txt','test.txt','dev.txt']] 179 | 180 | return dataset.getFormatedData() 181 | 182 | #data_dir = os.path.join(".data/clean",opt.dataset) 183 | #if not os.path.exists(data_dir): 184 | # import dataloader 185 | # dataset= dataloader.getDataset(opt) 186 | # return dataset.getFormatedData() 187 | #else: 188 | # for root, dirs, files in os.walk(data_dir): 189 | # for file in files: 190 | # yield os.path.join(root,file) 191 | 192 | 193 | # files=[os.path.join(data_dir,data_name) for data_name in ['train.txt','test.txt','dev.txt']] 194 | 195 | import re 196 | def clean(text): 197 | # text="'tycoon.","
","+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ",text) 201 | 202 | # print("%s $$$$$ %s" %(pre,text)) 203 | 204 | return text.lower().split() 205 | @log_time_delta 206 | def get_clean_datas(opt): 207 | pickle_filename = "temp/"+opt.dataset+".data" 208 | if not os.path.exists(pickle_filename) or opt.debug: 209 | datas = [] 210 | for filename in getDataSet(opt): 211 | df = pd.read_csv(filename,header = None,sep="\t",names=["text","label"]).fillna('0') 212 | 213 | # df["text"]= df["text"].apply(clean).str.lower().str.split() #replace("[\",:#]"," ") 214 | df["text"]= df["text"].apply(clean) 215 | datas.append(df) 216 | if opt.debug: 217 | if not os.path.exists("temp"): 218 | os.mkdir("temp") 219 | pickle.dump(datas,open(pickle_filename,"wb")) 220 | return datas 221 | else: 222 | print("load cache for data") 223 | return pickle.load(open(pickle_filename,"rb")) 224 | 225 | 226 | 227 | 228 | 229 | def load_vocab_from_bert(bert_base): 230 | 231 | 232 | bert_vocab_dir = os.path.join(bert_base,"vocab.txt") 233 | alphabet = Alphabet(start_feature_id = 0,alphabet_type="bert") 234 | 235 | from pytorch_pretrained_bert import BertTokenizer 236 | 237 | # Load pre-trained model tokenizer (vocabulary) 238 | tokenizer = BertTokenizer.from_pretrained(bert_vocab_dir) 239 | for index,word in tokenizer.ids_to_tokens.items(): 240 | alphabet.add(word) 241 | return alphabet,tokenizer 242 | 243 | 244 | def process_with_bert(text,tokenizer,max_seq_len) : 245 | tokens =tokenizer.convert_tokens_to_ids( tokenizer.tokenize(" ".join(text[:max_seq_len]))) 246 | 247 | return tokens[:max_seq_len] + [0] *int(max_seq_len-len(tokens)) 248 | 249 | def loadData(opt,embedding=True): 250 | if embedding==False: 251 | return loadDataWithoutEmbedding(opt) 252 | 253 | datas =get_clean_datas(opt) 254 | 255 | alphabet = Alphabet(start_feature_id = 0) 256 | label_alphabet= Alphabet(start_feature_id = 0,alphabet_type="label") 257 | 258 | df=pd.concat(datas) 259 | df.to_csv("demo.text",sep="\t",index=False) 260 | label_set = set(df["label"]) 261 | label_alphabet.addAll(label_set) 262 | opt.label_size= len(label_alphabet) 263 | if opt.max_seq_len==-1: 264 | opt.max_seq_len = df.apply(lambda row: row["text"].__len__(),axis=1).max() 265 | 266 | if "bert" not in opt.model.lower(): 267 | 268 | 269 | word_set=set() 270 | [word_set.add(word) for l in df["text"] if l is not None for word in l ] 271 | # from functools import reduce 272 | # word_set=set(reduce(lambda x,y :x+y,df["text"])) 273 | 274 | alphabet.addAll(word_set) 275 | 276 | vectors = getSubVectors(opt,alphabet) 277 | 278 | opt.vocab_size= len(alphabet) 279 | # opt.label_size= len(label_alphabet) 280 | opt.embedding_dim= vectors.shape[-1] 281 | opt.embeddings = torch.FloatTensor(vectors) 282 | 283 | else: 284 | alphabet,tokenizer = load_vocab_from_bert(opt.bert_dir) 285 | 286 | opt.alphabet=alphabet 287 | 288 | # alphabet.dump(opt.dataset+".alphabet") 289 | for data in datas: 290 | if "bert" not in opt.model.lower(): 291 | data["text"]= data["text"].apply(lambda text: [alphabet.get(word,alphabet.unknow_token) for word in text[:opt.max_seq_len]] + [alphabet.padding_token] *int(opt.max_seq_len-len(text)) ) 292 | else : 293 | data["text"]= data["text"].apply(process_with_bert,tokenizer=tokenizer,max_seq_len = opt.max_seq_len) 294 | data["label"]=data["label"].apply(lambda text: label_alphabet.get(text)) 295 | 296 | return map(lambda x:BucketIterator(x,opt),datas)#map(BucketIterator,datas) # 297 | 298 | def loadDataWithoutEmbedding(opt): 299 | datas=[] 300 | for filename in 
getDataSet(opt): 301 | df = pd.read_csv(filename,header = None,sep="\t",names=["text","label"]).fillna('0') 302 | df["text"]= df["text"].str.lower() 303 | datas.append((df["text"],df["label"])) 304 | return datas 305 | 306 | 307 | 308 | 309 | 310 | if __name__ =="__main__": 311 | import opts 312 | opt = opts.parse_opt() 313 | opt.max_seq_len=-1 314 | import dataloader 315 | dataset= dataloader.getDataset(opt) 316 | datas=loadData(opt) 317 | 318 | 319 | -------------------------------------------------------------------------------- /dataloader/Dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os,urllib 3 | class Dataset(object): 4 | def __init__(self,opt=None): 5 | if opt is not None: 6 | self.setup(opt) 7 | self.http_proxy= opt.__dict__.get("proxy","null") 8 | 9 | else: 10 | self.name="demo" 11 | self.dirname="demo" 12 | self.http_proxy="null" 13 | 14 | self.urls=[] 15 | self.root=".data" 16 | self.saved_path= os.path.join(os.path.join(self.root,"clean"),self.name) 17 | self.formated_files=None 18 | 19 | 20 | 21 | def setup(self,opt): 22 | 23 | self.name=opt.dataset 24 | self.dirname=opt.dataset 25 | self.http_proxy= opt.__dict__.get("proxy","null") 26 | 27 | 28 | def process(self): 29 | dirname=self.download() 30 | print("processing dirname: "+ dirname) 31 | raise Exception("method in father class have been called in processing: {} dataset".format(opt.dataset)) 32 | return dirname 33 | 34 | 35 | def getFormatedData(self): 36 | 37 | if self.formated_files is not None: 38 | return self.formated_files 39 | 40 | if os.path.exists(self.saved_path): 41 | return [os.path.join(self.saved_path,filename) for filename in os.listdir(self.saved_path)] 42 | self.formated_files = self.process() 43 | return self.formated_files 44 | 45 | def download_from_url(self,url, path, schedule=None): 46 | #if schedule is None: 47 | # schedule=lambda a,b,c : print("%.1f"%(100.0 * a * b / c), end='\r',flush=True) if (int(a * b / c)*100)%10==0 else None 48 | if self.http_proxy != "null": 49 | proxy = urllib.request.ProxyHandler({'http': self.http_proxy,'https': self.http_proxy}) 50 | # construct a new opener using your proxy settings 51 | opener = urllib.request.build_opener(proxy) 52 | # install the openen on the module-level 53 | urllib.request.install_opener(opener) 54 | print("proxy in %s" % self.http_proxy) 55 | # urllib.request.urlretrieve(url,path,lambda a,b,c : print("%.1f"%(100.0 * a * b / c), end='\r',flush=True) if (int(a * b / c)*1000)%100==0 else None )a 56 | try: 57 | urllib.request.urlretrieve(url,path ) 58 | except: 59 | import urllib2 60 | urllib2.urlretrieve(url,path ) 61 | return path 62 | 63 | def download(self,check=None): 64 | """Download and unzip an online archive (.zip, .gz, or .tgz). 65 | 66 | Arguments: 67 | check (str or None): Folder whose existence indicates 68 | that the dataset has already been downloaded, or 69 | None to check the existence of root/{cls.name}. 70 | 71 | Returns: 72 | dataset_path (str): Path to extracted dataset. 
73 | """ 74 | import zipfile,tarfile 75 | 76 | path = os.path.join(self.root, self.name) 77 | check = path if check is None else check 78 | if not os.path.isdir(check): 79 | for url in self.urls: 80 | if isinstance(url, tuple): 81 | url, filename = url 82 | else: 83 | filename = os.path.basename(url) 84 | zpath = os.path.join(path, filename) 85 | if not os.path.isfile(zpath): 86 | if not os.path.exists(os.path.dirname(zpath)): 87 | os.makedirs(os.path.dirname(zpath)) 88 | print('downloading {}'.format(filename)) 89 | 90 | self.download_from_url(url, zpath) 91 | ext = os.path.splitext(filename)[-1] 92 | if ext == '.zip': 93 | with zipfile.ZipFile(zpath, 'r') as zfile: 94 | print('extracting') 95 | zfile.extractall(path) 96 | elif ext in ['.gz', '.tgz',".bz2"]: 97 | with tarfile.open(zpath, 'r:gz') as tar: 98 | dirs = [member for member in tar.getmembers()] 99 | tar.extractall(path=path, members=dirs) 100 | else: 101 | print("%s do not need to be downloaded" % path) 102 | return path 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from .imdb import IMDBDataset 5 | from .mr import MRDataset 6 | from .glove import Glove 7 | from .sst import SSTDataset 8 | from .ag import AGDataset 9 | 10 | from .Dataset import Dataset 11 | def getDataset(opt): 12 | if opt.dataset=="imdb": 13 | dataset = IMDBDataset(opt) 14 | elif opt.dataset=="mr": 15 | dataset = MRDataset(opt) 16 | elif opt.dataset=="sst": 17 | dataset =SSTDataset(opt) 18 | elif opt.dataset == "ag": 19 | dataset =AGDataset(opt) 20 | elif opt.dataset in ["cr","mpqa","mr","sst1","sst2","subj","trec"]: 21 | dataset =Dataset(opt) 22 | 23 | 24 | else: 25 | raise Exception("dataset not supported: {}".format(opt.dataset)) 26 | return dataset 27 | 28 | def getEmbedding(opt): 29 | if opt.embedding_file.startswith("glove"): 30 | assert len(opt.embedding_file.split(".")) ==3 , "embedding_type format wrong" 31 | _,corpus,dim=opt.embedding_file.split(".") 32 | return Glove(corpus,dim,opt) 33 | else: 34 | raise Exception("embedding not supported: {}".format(opt.embedding_type)) 35 | 36 | -------------------------------------------------------------------------------- /dataloader/ag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Dataset import Dataset 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from codecs import open 8 | 9 | class AGDataset(Dataset): 10 | def __init__(self,opt=None,**kwargs): 11 | super(AGDataset,self).__init__(opt,**kwargs) 12 | self.urls=['http://www.di.unipi.it/~gulli/newsSpace.bz2'] 13 | 14 | 15 | def process(self): 16 | 17 | root=self.download() 18 | # root = os.path.join(root,"rt-polaritydata") 19 | # print("processing into: "+ root) 20 | ### root = "D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 21 | # if not os.path.exists(self.saved_path): 22 | # print("mkdir " + self.saved_path) 23 | # os.makedirs(self.saved_path) # better than os.mkdir 24 | ## 25 | # datas=[] 26 | # for polarity in ("neg","pos"): 27 | # filename = os.path.join(root,"rt-polarity."+polarity) 28 | # records=[] 29 | # with open(filename,encoding="utf-8",errors="replace") as f: 30 | # for i,line in enumerate(f): 31 | # print(i) 32 | # print(line) 33 | # records.append({"text":line.strip(),"label": 1 if polarity == "pos" else 0}) 34 | # 
datas.append(pd.DataFrame(records)) 35 | # 36 | # 37 | # 38 | # df = pd.concat(datas) 39 | # from sklearn.utils import shuffle 40 | # df = shuffle(df).reset_index() 41 | # 42 | # split_index = [True] * int (len(df) *0.8) + [False] *(len(df)-int (len(df) *0.8)) 43 | ## train=df.sample(frac=0.8) 44 | # train = df[split_index] 45 | # test = df[~np.array(split_index)] 46 | # 47 | # train_filename=os.path.join(self.saved_path,"train.csv") 48 | # test_filename = os.path.join(self.saved_path,"test.csv") 49 | # train[["text","label"]].to_csv(train_filename,encoding="utf-8",sep="\t",index=False,header=None) 50 | # test[["text","label"]].to_csv(test_filename,encoding="utf-8",sep="\t",index=False,header=None) 51 | # 52 | 53 | # 54 | # for data_folder in ("train","test"): 55 | # data = [] 56 | # for polarity in ("pos","neg"): 57 | # diranme=os.path.join( os.path.join(root,data_folder), polarity) 58 | # for rt, dirs, files in os.walk(diranme): 59 | # for f in files: 60 | # filename= os.path.join(rt,f) 61 | # data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 62 | # df=pd.DataFrame(data) 63 | # saved_filename=os.path.join(self.saved_path,data_folder+".csv") 64 | # 65 | # df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 66 | # print("finished %s"%saved_filename) 67 | print("processing into formated files over") 68 | 69 | # return [train_filename,test_filename] 70 | 71 | if __name__=="__main__": 72 | import opts 73 | opt = opts.parse_opt() 74 | opt.dataset="ag" 75 | import dataloader 76 | dataset= dataloader.getDataset(opt) 77 | dataset.process() 78 | 79 | 80 | -------------------------------------------------------------------------------- /dataloader/glove.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from .Dataset import Dataset 5 | class Glove(Dataset): 6 | def __init__(self,corpus,dim,opt=None,**kwargs): 7 | super(Glove,self).__init__(opt,**kwargs) 8 | 9 | self.root = ".vector_cache" 10 | 11 | # if not os.path.exists(self.root): 12 | # os.makedirs(self.root) 13 | 14 | embeding_urls = { 15 | '42b': 'http://nlp.stanford.edu/data/glove.42B.300d.zip', 16 | '840b': 'http://nlp.stanford.edu/data/glove.840B.300d.zip', 17 | 'twitter.27b': 'http://nlp.stanford.edu/data/glove.twitter.27B.zip', 18 | '6b': 'http://nlp.stanford.edu/data/glove.6B.zip', 19 | } 20 | 21 | 22 | self.urls= [ embeding_urls[corpus.lower()] ] 23 | print(self.urls) 24 | self.name = corpus 25 | 26 | 27 | def process(self): 28 | 29 | root=self.download() 30 | 31 | return root 32 | def getFilename(self): 33 | return self.process() 34 | 35 | if __name__ =="__main__": 36 | import opts 37 | opt = opts.parse_opt() 38 | 39 | 40 | import dataloader 41 | glove=dataloader.getEmbedding(opt) 42 | print(glove.getFilename()) 43 | 44 | -------------------------------------------------------------------------------- /dataloader/imdb.py: -------------------------------------------------------------------------------- 1 | from .Dataset import Dataset 2 | import os 3 | import pandas as pd 4 | from codecs import open 5 | 6 | class IMDBDataset(Dataset): 7 | def __init__(self,opt=None,**kwargs): 8 | super(IMDBDataset,self).__init__(opt,**kwargs) 9 | self.urls=['http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'] 10 | 11 | 12 | def process(self): 13 | 14 | root=self.download() 15 | root = os.path.join(root,"aclImdb") 16 | print("processing into: "+ root) 17 | # root = 
"D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 18 | if not os.path.exists(self.saved_path): 19 | print("mkdir " + self.saved_path) 20 | os.makedirs(self.saved_path) # better than os.mkdir 21 | 22 | datafiles=[] 23 | 24 | for data_folder in ("train","test"): 25 | data = [] 26 | for polarity in ("pos","neg"): 27 | diranme=os.path.join( os.path.join(root,data_folder), polarity) 28 | for rt, dirs, files in os.walk(diranme): 29 | for f in files: 30 | filename= os.path.join(rt,f) 31 | data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 32 | df=pd.DataFrame(data) 33 | saved_filename=os.path.join(self.saved_path,data_folder+".csv") 34 | 35 | df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 36 | print("finished %s"%saved_filename) 37 | datafiles.append(saved_filename) 38 | print("processing into formated files over") 39 | 40 | 41 | return datafiles 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /dataloader/mr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Dataset import Dataset 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from codecs import open 8 | 9 | class MRDataset(Dataset): 10 | def __init__(self,opt=None,**kwargs): 11 | super(MRDataset,self).__init__(opt,**kwargs) 12 | self.urls=['https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'] 13 | 14 | 15 | def process(self): 16 | 17 | root=self.download() 18 | root = os.path.join(root,"rt-polaritydata") 19 | print("processing into: "+ root) 20 | ## root = "D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 21 | if not os.path.exists(self.saved_path): 22 | print("mkdir " + self.saved_path) 23 | os.makedirs(self.saved_path) # better than os.mkdir 24 | # 25 | datas=[] 26 | for polarity in ("neg","pos"): 27 | filename = os.path.join(root,"rt-polarity."+polarity) 28 | records=[] 29 | with open(filename,encoding="utf-8",errors="replace") as f: 30 | for i,line in enumerate(f): 31 | print(i) 32 | print(line) 33 | records.append({"text":line.strip(),"label": 1 if polarity == "pos" else 0}) 34 | datas.append(pd.DataFrame(records)) 35 | 36 | 37 | 38 | df = pd.concat(datas) 39 | from sklearn.utils import shuffle 40 | df = shuffle(df).reset_index() 41 | 42 | split_index = [True] * int (len(df) *0.8) + [False] *(len(df)-int (len(df) *0.8)) 43 | # train=df.sample(frac=0.8) 44 | train = df[split_index] 45 | test = df[~np.array(split_index)] 46 | 47 | train_filename=os.path.join(self.saved_path,"train.csv") 48 | test_filename = os.path.join(self.saved_path,"test.csv") 49 | train[["text","label"]].to_csv(train_filename,encoding="utf-8",sep="\t",index=False,header=None) 50 | test[["text","label"]].to_csv(test_filename,encoding="utf-8",sep="\t",index=False,header=None) 51 | 52 | 53 | # 54 | # for data_folder in ("train","test"): 55 | # data = [] 56 | # for polarity in ("pos","neg"): 57 | # diranme=os.path.join( os.path.join(root,data_folder), polarity) 58 | # for rt, dirs, files in os.walk(diranme): 59 | # for f in files: 60 | # filename= os.path.join(rt,f) 61 | # data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 62 | # df=pd.DataFrame(data) 63 | # saved_filename=os.path.join(self.saved_path,data_folder+".csv") 64 | # 65 | # 
df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 66 | # print("finished %s"%saved_filename) 67 | print("processing into formated files over") 68 | 69 | return [train_filename,test_filename] 70 | 71 | if __name__=="__main__": 72 | import opts 73 | opt = opts.parse_opt() 74 | opt.dataset="mr" 75 | import dataloader 76 | dataset= dataloader.getDataset(opt) 77 | dataset.process() 78 | 79 | 80 | -------------------------------------------------------------------------------- /dataloader/sst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Dataset import Dataset 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from codecs import open 8 | 9 | class SSTDataset(Dataset): 10 | def __init__(self,opt=None,**kwargs): 11 | super(SSTDataset,self).__init__(opt,**kwargs) 12 | self.urls=['http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip'] 13 | 14 | 15 | def process(self): 16 | 17 | root=self.download() 18 | root = os.path.join(root,"rt-polaritydata") 19 | print("processing into: "+ root) 20 | ## root = "D:\code\git\TextClassificationBenchmark\.data_waby\\imdb\\aclImdb" 21 | if not os.path.exists(self.saved_path): 22 | print("mkdir " + self.saved_path) 23 | os.makedirs(self.saved_path) # better than os.mkdir 24 | # 25 | datas=[] 26 | for polarity in ("neg","pos"): 27 | filename = os.path.join(root,"rt-polarity."+polarity) 28 | records=[] 29 | with open(filename,encoding="utf-8",errors="replace") as f: 30 | for i,line in enumerate(f): 31 | print(i) 32 | print(line) 33 | records.append({"text":line.strip(),"label": 1 if polarity == "pos" else 0}) 34 | datas.append(pd.DataFrame(records)) 35 | 36 | 37 | 38 | df = pd.concat(datas) 39 | from sklearn.utils import shuffle 40 | df = shuffle(df).reset_index() 41 | 42 | split_index = [True] * int (len(df) *0.8) + [False] *(len(df)-int (len(df) *0.8)) 43 | # train=df.sample(frac=0.8) 44 | train = df[split_index] 45 | test = df[~np.array(split_index)] 46 | 47 | train_filename=os.path.join(self.saved_path,"train.csv") 48 | test_filename = os.path.join(self.saved_path,"test.csv") 49 | train[["text","label"]].to_csv(train_filename,encoding="utf-8",sep="\t",index=False,header=None) 50 | test[["text","label"]].to_csv(test_filename,encoding="utf-8",sep="\t",index=False,header=None) 51 | 52 | 53 | # 54 | # for data_folder in ("train","test"): 55 | # data = [] 56 | # for polarity in ("pos","neg"): 57 | # diranme=os.path.join( os.path.join(root,data_folder), polarity) 58 | # for rt, dirs, files in os.walk(diranme): 59 | # for f in files: 60 | # filename= os.path.join(rt,f) 61 | # data.append( {"text": open(filename,encoding="utf-8").read().strip(),"label":int(polarity=="pos")}) 62 | # df=pd.DataFrame(data) 63 | # saved_filename=os.path.join(self.saved_path,data_folder+".csv") 64 | # 65 | # df[["text","label"]].to_csv(saved_filename,index=False,header=None,sep="\t",encoding="utf-8") 66 | # print("finished %s"%saved_filename) 67 | print("processing into formated files over") 68 | 69 | return [train_filename,test_filename] 70 | 71 | if __name__=="__main__": 72 | import opts 73 | opt = opts.parse_opt() 74 | opt.dataset="sst" 75 | import dataloader 76 | dataset= dataloader.getDataset(opt) 77 | dataset.process() 78 | 79 | 80 | -------------------------------------------------------------------------------- /dataloader/torch_text_demo/imdb.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | 3 | 4 | 5 | from torchtext import data 6 | from torchtext import datasets 7 | from torchtext.vocab import GloVe 8 | import torch 9 | if torch.cuda.is_available() : 10 | device = -1 11 | else: 12 | device = 0 13 | # Approach 1: 14 | # set up fields 15 | TEXT = data.Field(lower=True, include_lengths=True, batch_first=True) 16 | LABEL = data.Field(sequential=False) 17 | 18 | 19 | # make splits for data 20 | train, test = datasets.IMDB.splits(TEXT, LABEL) 21 | 22 | # print information about the data 23 | print('train.fields', train.fields) 24 | print('len(train)', len(train)) 25 | print('vars(train[0])', vars(train[0])) 26 | 27 | # build the vocabulary 28 | TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) 29 | LABEL.build_vocab(train) 30 | 31 | # print vocab information 32 | print('len(TEXT.vocab)', len(TEXT.vocab)) 33 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 34 | 35 | # make iterator for splits 36 | #train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=3, device=0) 37 | train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=3,device=-1) 38 | # print batch information 39 | batch = next(iter(train_iter)) 40 | print(batch.text) 41 | print(batch.label) 42 | 43 | # Approach 2: 44 | train_iter, test_iter = datasets.IMDB.iters(batch_size=4,device=-1) 45 | 46 | # print batch information 47 | batch = next(iter(train_iter)) 48 | print(batch.text) 49 | print(batch.label) -------------------------------------------------------------------------------- /dataloader/torch_text_demo/sst.py: -------------------------------------------------------------------------------- 1 | from torchtext import data 2 | from torchtext import datasets 3 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText 4 | 5 | 6 | # Approach 1: 7 | # set up fields 8 | TEXT = data.Field() 9 | LABEL = data.Field(sequential=False) 10 | 11 | # make splits for data 12 | train, val, test = datasets.SST.splits( 13 | TEXT, LABEL, fine_grained=True, train_subtrees=True, 14 | filter_pred=lambda ex: ex.label != 'neutral') 15 | 16 | # print information about the data 17 | print('train.fields', train.fields) 18 | print('len(train)', len(train)) 19 | print('vars(train[0])', vars(train[0])) 20 | 21 | # build the vocabulary 22 | url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec' 23 | TEXT.build_vocab(train, vectors=Vectors('wiki.simple.vec', url=url)) 24 | LABEL.build_vocab(train) 25 | 26 | # print vocab information 27 | print('len(TEXT.vocab)', len(TEXT.vocab)) 28 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 29 | 30 | # make iterator for splits 31 | train_iter, val_iter, test_iter = data.BucketIterator.splits( 32 | (train, val, test), batch_size=3, device=0) 33 | 34 | # print batch information 35 | batch = next(iter(train_iter)) 36 | print(batch.text) 37 | print(batch.label) 38 | 39 | # Approach 2: 40 | TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()]) 41 | LABEL.build_vocab(train) 42 | 43 | # print vocab information 44 | print('len(TEXT.vocab)', len(TEXT.vocab)) 45 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 46 | 47 | train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4) 48 | 49 | # print batch information 50 | batch = next(iter(train_iter)) 51 | print(batch.text) 52 | print(batch.label) 53 | 54 | # Approach 3: 55 | f = FastText() 56 | TEXT.build_vocab(train, vectors=f) 57 | TEXT.vocab.extend(f) 58 | LABEL.build_vocab(train) 
59 | 60 | # print vocab information 61 | print('len(TEXT.vocab)', len(TEXT.vocab)) 62 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 63 | 64 | train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4) 65 | 66 | # print batch information 67 | batch = next(iter(train_iter)) 68 | print(batch.text) 69 | print(batch.label) -------------------------------------------------------------------------------- /dataloader/torch_text_demo/trec.py: -------------------------------------------------------------------------------- 1 | from torchtext import data 2 | from torchtext import datasets 3 | from torchtext.vocab import GloVe, CharNGram 4 | import torch 5 | if not torch.cuda.is_available() : 6 | device = -1 7 | else: 8 | device = 0 9 | 10 | # Approach 1: 11 | # set up fields 12 | TEXT = data.Field(lower=True, include_lengths=True, batch_first=True) 13 | LABEL = data.Field(sequential=False) 14 | 15 | 16 | # make splits for data 17 | train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True) 18 | 19 | # print information about the data 20 | print('train.fields', train.fields) 21 | print('len(train)', len(train)) 22 | print('vars(train[0])', vars(train[0])) 23 | 24 | # build the vocabulary 25 | TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) 26 | LABEL.build_vocab(train) 27 | 28 | # print vocab information 29 | print('len(TEXT.vocab)', len(TEXT.vocab)) 30 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 31 | 32 | # make iterator for splits 33 | train_iter, test_iter = data.BucketIterator.splits( 34 | (train, test), batch_size=3, device=device) 35 | 36 | # print batch information 37 | batch = next(iter(train_iter)) 38 | print(batch.text) 39 | print(batch.label) 40 | 41 | # Approach 2: 42 | TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram()],device=device) 43 | LABEL.build_vocab(train) 44 | 45 | train_iter, test_iter = datasets.TREC.iters(batch_size=4) 46 | 47 | # print batch information 48 | batch = next(iter(train_iter)) 49 | print(batch.text) 50 | print(batch.label) -------------------------------------------------------------------------------- /docs/data_config.md: -------------------------------------------------------------------------------- 1 | # 数据配置 2 | 3 | 4 | ##第一步先支持[torchtext](https://github.com/pytorch/text)本来支持的数据集合 5 | 6 | 7 | The datasets module currently contains: 8 | 9 | - Sentiment analysis: SST and IMDb 10 | - Question classification: TREC 11 | - Entailment: SNLI 12 | - Language modeling: WikiText-2 13 | - Machine translation: Multi30k, IWSLT, WMT14 14 | 15 | Others are planned or a work in progress: 16 | 17 | - Question answering: SQuAD 18 | 19 | 目前需要配置的数据集合 20 | 21 | ###Glove的下载到项目的根目录 ..vector_cache文件夹下 22 | 23 | - [42B](http://nlp.stanford.edu/data/glove.42B.300d.zip) 24 | - [840B](http://nlp.stanford.edu/data/glove.840B.300d.zip) 25 | - [twitter.27B](http://nlp.stanford.edu/data/glove.twitter.27B.zip) 26 | - [6B](http://nlp.stanford.edu/data/glove.6B.zip) 27 | 28 | ###分类数据集下载配置 29 | 30 | - [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)数据集下载到 .data/imdb 31 | - [SST](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)数据集下载到.data/sst 32 | - TREC [1](http://cogcomp.org/Data/QA/QC/train_5500.label) [2](http://cogcomp.org/Data/QA/QC/TREC_10.label) 问题分类数据集下载到.data/imdb 33 | 34 | ###文件结构示例如下 35 | 36 | - TextClassificationBenchmark 37 | - .data 38 | - imdb 39 | - aclImdb_v1.tar.gz 40 | - sst 41 | - trainDevTestTrees_PTB.zip 42 | - trec 43 | - train_5500.label 44 | - 
TREC_10.label 45 | - .vector_cache 46 | - glove.42B.300d.zip 47 | - glove.840B.300d.zip 48 | - glove.twitter.27B.zip 49 | - glove.6B.zip 50 | 51 | 52 | 53 | ##更多的数据集请等待我们进一步更新 -------------------------------------------------------------------------------- /docs/data_config_en.md: -------------------------------------------------------------------------------- 1 | # Data configuration 2 | 3 | **Install [torchtext](https://github.com/pytorch/text) for data processing** 4 | 5 | The datasets module currently contains: 6 | 7 | - Sentiment analysis: SST and IMDb 8 | - Question classification: TREC 9 | - Entailment: SNLI 10 | - Language modeling: WikiText-2 11 | - Machine translation: Multi30k, IWSLT, WMT14 12 | 13 | Others are planned or a work in progress: 14 | 15 | - Question answering: SQuAD 16 | 17 | The current need to configure the data collection 18 | 19 | ### Glove 20 | 21 | Download to the project's root directory under the folder vector_cache 22 | 23 | - [42B](http://nlp.stanford.edu/data/glove.42B.300d.zip) 24 | - [840B](http://nlp.stanford.edu/data/glove.840B.300d.zip) 25 | - [twitter.27B](http://nlp.stanford.edu/data/glove.twitter.27B.zip) 26 | - [6B](http://nlp.stanford.edu/data/glove.6B.zip) 27 | 28 | ### Classification Datasets 29 | 30 | - Download [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) dataset to .data/imdb 31 | - Download [SST](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip) dataset to .data/sst 32 | - Download TREC [Question Classification ](http://cogcomp.org/Data/QA/QC/train_5500.label) [2](http://cogcomp.org/Data/QA/QC/TREC_10.label) dataset to .data/imdb 33 | 34 | ### File Structure 35 | 36 | - TextClassificationBenchmark 37 | - .data 38 | - imdb 39 | - aclImdb_v1.tar.gz 40 | - sst 41 | - trainDevTestTrees_PTB.zip 42 | - trec 43 | - train_5500.label 44 | - TREC_10.label 45 | - .vector_cache 46 | - glove.42B.300d.zip 47 | - glove.840B.300d.zip 48 | - glove.twitter.27B.zip 49 | - glove.6B.zip 50 | 51 | 52 | 53 | ## More datasets and updates coming soon, please wait for us to update further 54 | -------------------------------------------------------------------------------- /docs/windows_torch.md: -------------------------------------------------------------------------------- 1 | # Windows 平台安装 PyTorch 2 | 3 | 如果是Linux,Mac安装直接移步pytorch[主页](http://pytorch.org/), 再安装TorchText 4 | 5 | ## Python安装 6 | 建议直接安装anaconda的[安装包](https://repo.continuum.io/archive/Anaconda3-5.0.1-Windows-x86_64.exe) 7 | 8 | ## Pytorch安装 9 | 在[百度网盘](https://pan.baidu.com/s/1dF6ayLr#list/path=%2Fpytorch)下载一个 离线安装包 , 0.3版本或者是0.2版本均可 10 | 如果是whl安装包 11 |
pip install torch0.3XXX.whl</pre>
12 | 如果是一个conda安装包(压缩文件后缀) 13 |
conda install --offline  torch0.3XXX.tar.bz</pre>
14 | 15 | ## TorchText 安装 16 | 17 | 前提是有git和pip,如果没有需要下载git,并将其放到Path环境变量里 18 |
pip install git+https://github.com/pytorch/text.git</pre>
19 | 20 | 还需要有代理的话 21 | 22 | 23 | 24 |
pip install git+https://github.com/pytorch/text.git --proxy proxy.xx.com:8080</pre>
25 | 26 | 27 | 参考链接 28 | https://zhuanlan.zhihu.com/p/31747695 29 | -------------------------------------------------------------------------------- /docs/windows_torch_en.md: -------------------------------------------------------------------------------- 1 | # Windows Platform Installation for PyTorch 2 | 3 | On Linux or Mac, simply install PyTorch from the [homepage](http://pytorch.org/) and then install TorchText 4 | 5 | ## Python installation 6 | Please install Anaconda directly: [installation package](https://repo.continuum.io/archive/Anaconda3-5.0.1-Windows-x86_64.exe) 7 | 8 | ## PyTorch installation 9 | Download an offline package (version 0.3 or 0.2) from [Baidu Network Disk](https://pan.baidu.com/s/1dF6ayLr#list/path=%2Fpytorch). If it is a whl package, install it with pip: 10 |
<pre>
pip install torch0.3XXX.whl</pre>
11 | 12 | If it is a conda package (a compressed archive), install it offline: 13 |
<pre>
conda install --offline  torch0.3XXX.tar.bz</pre>
14 | 15 | ## TorchText installation 16 | 17 | This assumes that you have git and pip; if not, install git and add it to the Path environment variable. 18 |
<pre>
pip install git+https://github.com/pytorch/text.git</pre>
19 | 20 | If you need a proxy, 21 |
pip install git+https://github.com/pytorch/text.git --proxy proxy.xx.com:8080</pre>
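To check that both packages import correctly, you can run:
<pre>python -c "import torch, torchtext; print(torch.__version__)"</pre>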
22 | 23 | 24 | Reference Link: 25 | https://zhuanlan.zhihu.com/p/31747695 26 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from six.moves import cPickle 10 | import time,os,random 11 | import itertools 12 | 13 | import torch 14 | from torch.autograd import Variable 15 | import torch.optim as optim 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch.nn.modules.loss import NLLLoss,MultiLabelSoftMarginLoss,MultiLabelMarginLoss,BCELoss 19 | 20 | import opts 21 | import models 22 | import utils 23 | 24 | 25 | timeStamp = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time()) )) 26 | performance_log_file = os.path.join("log","result"+timeStamp+ ".csv") 27 | if not os.path.exists(performance_log_file): 28 | with open(performance_log_file,"w") as f: 29 | f.write("argument\n") 30 | f.close() 31 | 32 | 33 | def train(opt,train_iter, test_iter,verbose=True): 34 | global_start= time.time() 35 | logger = utils.getLogger() 36 | model=models.setup(opt) 37 | if torch.cuda.is_available(): 38 | model.cuda() 39 | params = [param for param in model.parameters() if param.requires_grad] #filter(lambda p: p.requires_grad, model.parameters()) 40 | 41 | model_info =";".join( [str(k)+":"+ str(v) for k,v in opt.__dict__.items() if type(v) in (str,int,float,list,bool)]) 42 | logger.info("# parameters:" + str(sum(param.numel() for param in params))) 43 | logger.info(model_info) 44 | 45 | 46 | model.train() 47 | optimizer = utils.getOptimizer(params,name=opt.optimizer, lr=opt.learning_rate,scheduler= utils.get_lr_scheduler(opt.lr_scheduler)) 48 | 49 | loss_fun = F.cross_entropy 50 | 51 | filename = None 52 | percisions=[] 53 | for i in range(opt.max_epoch): 54 | for epoch,batch in enumerate(train_iter): 55 | optimizer.zero_grad() 56 | start= time.time() 57 | 58 | text = batch.text[0] if opt.from_torchtext else batch.text 59 | predicted = model(text) 60 | 61 | loss= loss_fun(predicted,batch.label) 62 | 63 | loss.backward() 64 | utils.clip_gradient(optimizer, opt.grad_clip) 65 | optimizer.step() 66 | 67 | if verbose: 68 | if torch.cuda.is_available(): 69 | logger.info("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i,epoch,loss.cpu().data.numpy(),time.time()-start)) 70 | else: 71 | logger.info("%d iteration %d epoch with loss : %.5f in %.4f seconds" % (i,epoch,loss.data.numpy()[0],time.time()-start)) 72 | 73 | percision=utils.evaluation(model,test_iter,opt.from_torchtext) 74 | if verbose: 75 | logger.info("%d iteration with percision %.4f" % (i,percision)) 76 | if len(percisions)==0 or percision > max(percisions): 77 | if filename: 78 | os.remove(filename) 79 | filename = model.save(metric=percision) 80 | percisions.append(percision) 81 | 82 | # while(utils.is_writeable(performance_log_file)): 83 | df = pd.read_csv(performance_log_file,index_col=0,sep="\t") 84 | df.loc[model_info,opt.dataset] = max(percisions) 85 | df.to_csv(performance_log_file,sep="\t") 86 | logger.info(model_info +" with time :"+ str( time.time()-global_start)+" ->" +str( max(percisions) ) ) 87 | print(model_info +" with time :"+ str( time.time()-global_start)+" ->" +str( max(percisions) ) ) 88 | 89 | 90 | if __name__=="__main__": 91 | parameter_pools = 
utils.parse_grid_parameters("config/grid_search_cnn.ini") 92 | 93 | # parameter_pools={ 94 | # "model":["lstm","cnn","fasttext"], 95 | # "keep_dropout":[0.8,0.9,1.0], 96 | # "batch_size":[32,64,128], 97 | # "learning_rate":[100,10,1,1e-1,1e-2,1e-3], 98 | # "optimizer":["adam"], 99 | # "lr_scheduler":[None] 100 | # } 101 | opt = opts.parse_opt() 102 | if "CUDA_VISIBLE_DEVICES" not in os.environ.keys(): 103 | os.environ["CUDA_VISIBLE_DEVICES"] =opt.gpu 104 | train_iter, test_iter = utils.loadData(opt) 105 | # if from_torchtext: 106 | # train_iter, test_iter = utils.loadData(opt) 107 | # else: 108 | # import dataHelper 109 | # train_iter, test_iter = dataHelper.loadData(opt) 110 | if False: 111 | model=models.setup(opt) 112 | print(opt.model) 113 | if torch.cuda.is_available(): 114 | model.cuda() 115 | train(opt,train_iter, test_iter) 116 | else: 117 | 118 | pool =[ arg for arg in itertools.product(*parameter_pools.values())] 119 | random.shuffle(pool) 120 | args=[arg for i,arg in enumerate(pool) if i%opt.gpu_num==opt.gpu] 121 | 122 | for arg in args: 123 | olddataset = opt.dataset 124 | for k,v in zip(parameter_pools.keys(),arg): 125 | opt.__setattr__(k,v) 126 | if "dataset" in parameter_pools and olddataset != opt.dataset: 127 | train_iter, test_iter = utils.loadData(opt) 128 | train(opt,train_iter, test_iter,verbose=False) 129 | -------------------------------------------------------------------------------- /models/BERTFast.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | import numpy as np 4 | from torch import nn 5 | from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM 6 | from models.BaseModel import BaseModel 7 | class BERTFast(BaseModel): 8 | def __init__(self, opt ): 9 | super(BERTFast, self).__init__(opt) 10 | 11 | self.bert_model = BertModel.from_pretrained('bert-base-uncased') 12 | for param in self.bert_model.parameters(): 13 | param.requires_grad=self.opt.bert_trained 14 | self.hidden2label = nn.Linear(768, opt.label_size) 15 | self.properties.update( 16 | {"bert_trained":self.opt.bert_trained 17 | }) 18 | 19 | 20 | def forward(self, content): 21 | encoded, _ = self.bert_model(content) 22 | encoded_doc = t.mean(encoded[-1],dim=1) 23 | logits = self.hidden2label(encoded_doc) 24 | return logits 25 | 26 | import argparse 27 | 28 | def parse_opt(): 29 | parser = argparse.ArgumentParser() 30 | # Data input settings 31 | parser.add_argument('--hidden_dim', type=int, default=128, 32 | help='hidden_dim') 33 | 34 | 35 | parser.add_argument('--batch_size', type=int, default=64, 36 | help='batch_size') 37 | parser.add_argument('--embedding_dim', type=int, default=300, 38 | help='embedding_dim') 39 | parser.add_argument('--learning_rate', type=float, default=4e-4, 40 | help='learning_rate') 41 | parser.add_argument('--grad_clip', type=float, default=1e-1, 42 | help='grad_clip') 43 | parser.add_argument('--model', type=str, default="lstm", 44 | help='model name') 45 | parser.add_argument('--label_size', type=str, default=2, 46 | help='label_size') 47 | 48 | 49 | # 50 | args = parser.parse_args() 51 | args.embedding_dim=300 52 | args.vocab_size=10000 53 | args.kernel_size=3 54 | args.num_classes=3 55 | args.content_dim=256 56 | args.max_seq_len=50 57 | 58 | # 59 | # # Check if args are valid 60 | # assert args.rnn_size > 0, "rnn_size should be greater than 0" 61 | 62 | 63 | return args 64 | 65 | if __name__ == '__main__': 66 | 67 | 68 | opt = parse_opt() 69 | m = BERTFast(opt) 
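# smoke test: build a dummy batch of 64 sequences of 50 token ids, run it through the model and print the logits shape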
70 | content = t.autograd.Variable(t.arange(0,3200).view(-1,50)).long() 71 | o = m(content) 72 | print(o.size()) 73 | 74 | -------------------------------------------------------------------------------- /models/BaseModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch as t 4 | 5 | import numpy as np 6 | from torch import nn 7 | from collections import OrderedDict 8 | import os 9 | class BaseModel(nn.Module): 10 | def __init__(self, opt ): 11 | super(BaseModel, self).__init__() 12 | self.model_name = 'BaseModel' 13 | self.opt=opt 14 | 15 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 16 | if opt.__dict__.get("embeddings",None) is not None: 17 | self.encoder.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | self.fc = nn.Linear(opt.embedding_dim, opt.label_size) 19 | 20 | 21 | self.properties = {"model_name":self.__class__.__name__, 22 | # "embedding_dim":self.opt.embedding_dim, 23 | # "embedding_training":self.opt.embedding_training, 24 | # "max_seq_len":self.opt.max_seq_len, 25 | "batch_size":self.opt.batch_size, 26 | "learning_rate":self.opt.learning_rate, 27 | "keep_dropout":self.opt.keep_dropout, 28 | } 29 | 30 | def forward(self,content): 31 | content_=t.mean(self.encoder(content),dim=1) 32 | out=self.fc(content_.view(content_.size(0),-1)) 33 | return out 34 | 35 | 36 | 37 | def save(self,save_dir="saved_model",metric=None): 38 | if not os.path.exists(save_dir): 39 | os.mkdir(save_dir) 40 | self.model_info = "__".join([k+"_"+str(v) if type(v)!=list else k+"_"+str(v)[1:-1].replace(",","_").replace(",","") for k,v in self.properties.items() ]) 41 | if metric: 42 | path = os.path.join(save_dir, str(metric)[2:] +"_"+ self.model_info) 43 | else: 44 | path = os.path.join(save_dir,self.model_info) 45 | t.save(self,path) 46 | return path 47 | 48 | 49 | 50 | if __name__ == '__main__': 51 | import sys 52 | sys.path.append(r"..") 53 | import opts 54 | opt=opts.parse_opt() 55 | opt.vocab_size=2501 56 | opt.embedding_dim=300 57 | opt.label_size=3 58 | m = BaseModel(opt) 59 | 60 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 61 | o = m(content) 62 | print(o.size()) 63 | path = m.save() -------------------------------------------------------------------------------- /models/BiBloSA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #https://github.com/galsang/BiBloSA-pytorch/blob/master/model/model.py 4 | 5 | -------------------------------------------------------------------------------- /models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from models.BaseModel import BaseModel 6 | class CNN(BaseModel): 7 | def __init__(self, opt): 8 | super(CNN, self).__init__(opt) 9 | 10 | self.embedding_type = opt.embedding_type 11 | self.batch_size = opt.batch_size 12 | self.max_sent_len = opt.max_sent_len 13 | self.embedding_dim = opt.embedding_dim 14 | self.vocab_size = opt.vocab_size 15 | self.CLASS_SIZE = opt.label_size 16 | self.FILTERS = opt["FILTERS"] 17 | self.FILTER_NUM = opt["FILTER_NUM"] 18 | self.keep_dropout = opt.keep_dropout 19 | self.IN_CHANNEL = 1 20 | 21 | assert (len(self.FILTERS) == len(self.FILTER_NUM)) 22 | 23 | # one for UNK and one for zero padding 24 | self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim, 
padding_idx=self.vocab_size + 1) 25 | if self.embedding_type == "static" or self.embedding_type == "non-static" or self.embedding_type == "multichannel": 26 | self.WV_MATRIX = opt["WV_MATRIX"] 27 | self.embedding.weight.data.copy_(torch.from_numpy(self.WV_MATRIX)) 28 | if self.embedding_type == "static": 29 | self.embedding.weight.requires_grad = False 30 | elif self.embedding_type == "multichannel": 31 | self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.VOCAB_SIZE + 1) 32 | self.embedding2.weight.data.copy_(torch.from_numpy(self.WV_MATRIX)) 33 | self.embedding2.weight.requires_grad = False 34 | self.IN_CHANNEL = 2 35 | 36 | for i in range(len(self.FILTERS)): 37 | conv = nn.Conv1d(self.IN_CHANNEL, self.FILTER_NUM[i], self.embedding_dim * self.FILTERS[i], stride=self.WORD_DIM) 38 | setattr(self, 'conv_%d'%i, conv) 39 | 40 | self.fc = nn.Linear(sum(self.FILTER_NUM), self.label_size) 41 | 42 | self.properties.update( 43 | {"FILTER_NUM":self.FILTER_NUM, 44 | "FILTERS":self.FILTERS, 45 | }) 46 | 47 | def get_conv(self, i): 48 | return getattr(self, 'conv_%d'%i) 49 | 50 | def forward(self, inp): 51 | x = self.embedding(inp).view(-1, 1, self.embedding_dim * self.max_sent_len) 52 | if self.embedding_type == "multichannel": 53 | x2 = self.embedding2(inp).view(-1, 1, self.embedding_dim * self.max_sent_len) 54 | x = torch.cat((x, x2), 1) 55 | 56 | conv_results = [ 57 | F.max_pool1d(F.relu(self.get_conv(i)(x)), self.max_sent_len - self.FILTERS[i] + 1) 58 | .view(-1, self.FILTER_NUM[i]) 59 | for i in range(len(self.FILTERS))] 60 | 61 | x = torch.cat(conv_results, 1) 62 | x = F.dropout(x, p=self.keep_dropout, training=self.training) 63 | x = self.fc(x) 64 | return x 65 | 66 | 67 | 68 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Yoon/model.py 69 | from models.BaseModel import BaseModel 70 | class CNN1(BaseModel): 71 | 72 | def __init__(self, opt): 73 | super(CNN1,self).__init__(opt) 74 | 75 | V = opt.vocab_size 76 | D = opt.embedding_dim 77 | C = opt.label_size 78 | Ci = 1 79 | Co = opt.kernel_num 80 | Ks = opt.kernel_sizes 81 | 82 | self.embed = nn.Embedding(V, D) 83 | #self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] 84 | self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks]) 85 | ''' 86 | self.conv13 = nn.Conv2d(Ci, Co, (3, D)) 87 | self.conv14 = nn.Conv2d(Ci, Co, (4, D)) 88 | self.conv15 = nn.Conv2d(Ci, Co, (5, D)) 89 | ''' 90 | self.dropout = nn.Dropout(opt.dropout) 91 | self.fc1 = nn.Linear(len(Ks)*Co, C) 92 | self.properties.update( 93 | {"kernel_num":opt.kernel_num, 94 | "kernel_sizes":opt.kernel_sizes, 95 | }) 96 | 97 | def conv_and_pool(self, x, conv): 98 | x = F.relu(conv(x)).squeeze(3) #(N,Co,W) 99 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 100 | return x 101 | 102 | 103 | def forward(self, x): 104 | x = self.embed(x) # (N,W,D) 105 | 106 | if self.args.static: 107 | x = Variable(x) 108 | 109 | x = x.unsqueeze(1) # (N,Ci,W,D) 110 | 111 | x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks) 112 | 113 | 114 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks) 115 | 116 | x = torch.cat(x, 1) 117 | 118 | ''' 119 | x1 = self.conv_and_pool(x,self.conv13) #(N,Co) 120 | x2 = self.conv_and_pool(x,self.conv14) #(N,Co) 121 | x3 = self.conv_and_pool(x,self.conv15) #(N,Co) 122 | x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) 123 | ''' 124 | x = self.dropout(x) # (N,len(Ks)*Co) 125 | logit = self.fc1(x) # (N,C) 126 | return logit 127 | 128 | import torch.nn as nn 129 
| 130 | 131 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Zhang/model.py 132 | from models.BaseModel import BaseModel 133 | class CNN2(BaseModel): 134 | def __init__(self, opt): 135 | super(CNN2, self).__init__(opt) 136 | self.embed = nn.Embedding(opt.vocab_size + 1, opt.embedding_dim) 137 | 138 | self.conv1 = nn.Sequential( 139 | nn.Conv1d(opt.l0, 256, kernel_size=7, stride=1), 140 | nn.ReLU(), 141 | nn.MaxPool1d(kernel_size=3, stride=3) 142 | ) 143 | 144 | self.conv2 = nn.Sequential( 145 | nn.Conv1d(256, 256, kernel_size=7, stride=1), 146 | nn.ReLU(), 147 | nn.MaxPool1d(kernel_size=3, stride=3) 148 | ) 149 | 150 | self.conv3 = nn.Sequential( 151 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 152 | nn.ReLU() 153 | ) 154 | 155 | self.conv4 = nn.Sequential( 156 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 157 | nn.ReLU() 158 | ) 159 | 160 | self.conv5 = nn.Sequential( 161 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 162 | nn.ReLU() 163 | ) 164 | 165 | self.conv6 = nn.Sequential( 166 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 167 | nn.ReLU(), 168 | nn.MaxPool1d(kernel_size=3, stride=3) 169 | ) 170 | 171 | self.fc = nn.Linear(256, opt.label_size) 172 | self.properties.update( 173 | {}) 174 | 175 | def forward(self, x_input): 176 | # Embedding 177 | x = self.embed(x_input) # dim: (batch_size, max_seq_len, embedding_size) 178 | x = self.conv1(x) 179 | x = self.conv2(x) 180 | x = self.conv3(x) 181 | x = self.conv4(x) 182 | x = self.conv5(x) 183 | x = self.conv6(x) 184 | 185 | # collapse 186 | x = x.view(x.size(0), -1) 187 | x = self.fc(x) 188 | 189 | return F.log_softmax(x) 190 | 191 | from models.BaseModel import BaseModel 192 | class CNN3(BaseModel): 193 | """ 194 | A CNN for text classification. 195 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 
196 | """ 197 | def __init__(self, args): 198 | super(CNN3, self).__init__(opt) 199 | self.args = args 200 | 201 | embedding_dim = args.embed_dim 202 | embedding_num = args.num_features 203 | class_number = args.class_num 204 | in_channel = 1 205 | out_channel = args.kernel_num 206 | kernel_sizes = args.kernel_sizes 207 | 208 | self.embed = nn.Embedding(embedding_num+1, embedding_dim) 209 | self.conv = nn.ModuleList([nn.Conv2d(in_channel, out_channel, (K, embedding_dim)) for K in kernel_sizes]) 210 | 211 | self.dropout = nn.Dropout(args.dropout) 212 | self.fc = nn.Linear(len(kernel_sizes) * out_channel, class_number) 213 | self.properties.update( 214 | {"kernel_sizes":kernel_sizes 215 | }) 216 | 217 | def forward(self, input_x): 218 | """ 219 | :param input_x: a list size having the number of batch_size elements with the same length 220 | :return: batch_size X num_aspects tensor 221 | """ 222 | # Embedding 223 | x = self.embed(input_x) # dim: (batch_size, max_seq_len, embedding_size) 224 | 225 | if self.args.static: 226 | x = F.Variable(input_x) 227 | 228 | # Conv & max pool 229 | x = x.unsqueeze(1) # dim: (batch_size, 1, max_seq_len, embedding_size) 230 | 231 | # turns to be a list: [ti : i \in kernel_sizes] where ti: tensor of dim([batch, num_kernels, max_seq_len-i+1]) 232 | x = [F.relu(conv(x)).squeeze(3) for conv in self.conv] 233 | 234 | # dim: [(batch_size, num_kernels), ...]*len(kernel_sizes) 235 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] 236 | x = torch.cat(x, 1) 237 | 238 | # Dropout & output 239 | x = self.dropout(x) # (batch_size,len(kernel_sizes)*num_kernels) 240 | logit = F.log_softmax(self.fc(x)) # (batch_size, num_aspects) 241 | 242 | return logit -------------------------------------------------------------------------------- /models/CNNBasic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | import numpy as np 4 | from torch import nn 5 | from models.BaseModel import BaseModel 6 | class BasicCNN1D(BaseModel): 7 | def __init__(self, opt ): 8 | super(BasicCNN1D, self).__init__(opt) 9 | 10 | self.content_dim=opt.__dict__.get("content_dim",256) 11 | self.kernel_size=opt.__dict__.get("kernel_size",3) 12 | 13 | 14 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 15 | if opt.__dict__.get("embeddings",None) is not None: 16 | self.encoder.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 17 | 18 | self.content_conv = nn.Sequential( 19 | nn.Conv1d(in_channels = opt.embedding_dim, 20 | out_channels = self.content_dim, #256 21 | kernel_size = self.kernel_size), #3 22 | nn.ReLU(), 23 | nn.MaxPool1d(kernel_size = (opt.max_seq_len - self.kernel_size + 1)) 24 | # nn.AdaptiveMaxPool1d() 25 | ) 26 | self.fc = nn.Linear(self.content_dim, opt.label_size) 27 | self.properties.update( 28 | {"content_dim":self.content_dim, 29 | "kernel_size":self.kernel_size, 30 | }) 31 | 32 | def forward(self, content): 33 | 34 | content = self.encoder(content) #64x200x300 35 | content_out = self.content_conv(content.permute(0,2,1)) #64x256x1 36 | reshaped = content_out.view(content_out.size(0), -1) #64x256 37 | logits = self.fc(reshaped) #64x3 38 | return logits 39 | 40 | from models.BaseModel import BaseModel 41 | class BasicCNN2D(BaseModel): 42 | """ 43 | A CNN for text classification. 44 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 
45 | """ 46 | def __init__(self, args): 47 | super(BasicCNN2D, self).__init__(opt) 48 | 49 | self.embedding_dim = opt.embedding_dim 50 | self.vocab_size = opt.vocab_size 51 | self.label_size = opt.label_size 52 | self.keep_dropout = opt.keep_dropout 53 | in_channel = 1 54 | self.kernel_nums = opt.kernel_nums 55 | self.kernel_sizes = opt.kernel_sizes 56 | 57 | self.embed = nn.Embedding(self.vocab_size+1, self.embedding_dim) 58 | 59 | if opt.__dict__.get("embeddings",None) is not None: 60 | self.embed.weight=nn.Parameter(opt.embeddings) 61 | 62 | self.conv = nn.ModuleList([nn.Conv2d(in_channel, out_channel, (K, self.embedding_dim)) for K,out_channel in zip(self.kernel_sizes,self.kernel_nums)]) 63 | 64 | self.dropout = nn.Dropout(self.keep_dropout) 65 | self.fc = nn.Linear(len(self.kernel_sizes) * self.out_channel, self.label_size) 66 | 67 | self.properties.update( 68 | {"kernel_nums":self.kernel_nums, 69 | "kernel_sizes":self.kernel_sizes, 70 | }) 71 | 72 | def forward(self, input_x): 73 | """ 74 | :param input_x: a list size having the number of batch_size elements with the same length 75 | :return: batch_size X num_aspects tensor 76 | """ 77 | # Embedding 78 | x = self.embed(input_x) # dim: (batch_size, max_seq_len, embedding_size) 79 | 80 | if self.opt.static: 81 | x = F.Variable(input_x) 82 | 83 | # Conv & max pool 84 | x = x.unsqueeze(1) # dim: (batch_size, 1, max_seq_len, embedding_size) 85 | 86 | # turns to be a list: [ti : i \in kernel_sizes] where ti: tensor of dim([batch, num_kernels, max_seq_len-i+1]) 87 | x = [F.relu(conv(x)).squeeze(3) for conv in self.conv] 88 | 89 | # dim: [(batch_size, num_kernels), ...]*len(kernel_sizes) 90 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] 91 | x = torch.cat(x, 1) 92 | 93 | # Dropout & output 94 | x = self.dropout(x) # (batch_size,len(kernel_sizes)*num_kernels) 95 | logit = F.log_softmax(self.fc(x)) # (batch_size, num_aspects) 96 | 97 | return logit 98 | import argparse 99 | 100 | def parse_opt(): 101 | parser = argparse.ArgumentParser() 102 | # Data input settings 103 | parser.add_argument('--hidden_dim', type=int, default=128, 104 | help='hidden_dim') 105 | 106 | 107 | parser.add_argument('--batch_size', type=int, default=64, 108 | help='batch_size') 109 | parser.add_argument('--embedding_dim', type=int, default=300, 110 | help='embedding_dim') 111 | parser.add_argument('--learning_rate', type=float, default=4e-4, 112 | help='learning_rate') 113 | parser.add_argument('--grad_clip', type=float, default=1e-1, 114 | help='grad_clip') 115 | parser.add_argument('--model', type=str, default="lstm", 116 | help='model name') 117 | parser.add_argument('--model', type=str, default="lstm", 118 | help='model name') 119 | 120 | 121 | # 122 | args = parser.parse_args() 123 | args.embedding_dim=300 124 | args.vocab_size=10000 125 | args.kernel_size=3 126 | args.num_classes=3 127 | args.content_dim=256 128 | args.max_seq_len=50 129 | 130 | # 131 | # # Check if args are valid 132 | # assert args.rnn_size > 0, "rnn_size should be greater than 0" 133 | 134 | 135 | return args 136 | 137 | if __name__ == '__main__': 138 | 139 | opt = parse_opt() 140 | m = CNNText(opt) 141 | content = t.autograd.Variable(t.arange(0,3200).view(-1,50)).long() 142 | o = m(content) 143 | print(o.size()) 144 | 145 | -------------------------------------------------------------------------------- /models/CNNInception.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import torch as t 5 | import torch 6 
| import numpy as np 7 | from torch import nn 8 | from collections import OrderedDict 9 | 10 | class Inception(nn.Module): 11 | def __init__(self,cin,co,relu=True,norm=True): 12 | super(Inception, self).__init__() 13 | assert(co%4==0) 14 | cos=[int(co/4)]*4 15 | self.activa=nn.Sequential() 16 | if norm:self.activa.add_module('norm',nn.BatchNorm1d(co)) 17 | if relu:self.activa.add_module('relu',nn.ReLU(True)) 18 | self.branch1 =nn.Sequential(OrderedDict([ 19 | ('conv1', nn.Conv1d(cin,cos[0], 1,stride=1)), 20 | ])) 21 | self.branch2 =nn.Sequential(OrderedDict([ 22 | ('conv1', nn.Conv1d(cin,cos[1], 1)), 23 | ('norm1', nn.BatchNorm1d(cos[1])), 24 | ('relu1', nn.ReLU(inplace=True)), 25 | ('conv3', nn.Conv1d(cos[1],cos[1], 3,stride=1,padding=1)), 26 | ])) 27 | self.branch3 =nn.Sequential(OrderedDict([ 28 | ('conv1', nn.Conv1d(cin,cos[2], 3,padding=1)), 29 | ('norm1', nn.BatchNorm1d(cos[2])), 30 | ('relu1', nn.ReLU(inplace=True)), 31 | ('conv3', nn.Conv1d(cos[2],cos[2], 5,stride=1,padding=2)), 32 | ])) 33 | self.branch4 =nn.Sequential(OrderedDict([ 34 | #('pool',nn.MaxPool1d(2)), 35 | ('conv3', nn.Conv1d(cin,cos[3], 3,stride=1,padding=1)), 36 | ])) 37 | def forward(self,x): 38 | branch1=self.branch1(x) 39 | branch2=self.branch2(x) 40 | branch3=self.branch3(x) 41 | branch4=self.branch4(x) 42 | result=self.activa(torch.cat((branch1,branch2,branch3,branch4),1)) 43 | return result 44 | 45 | from models.BaseModel import BaseModel 46 | class InceptionCNN(BaseModel): 47 | def __init__(self, opt ): 48 | super(InceptionCNN, self).__init__(opt) 49 | incept_dim=getattr(opt,"inception_dim",512) 50 | self.model_name = 'CNNText_inception' 51 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 52 | 53 | self.content_conv=nn.Sequential( 54 | Inception(opt.embedding_dim,incept_dim),#(batch_size,64,opt.content_seq_len)->(batch_size,64,(opt.content_seq_len)/2) 55 | #Inception(incept_dim,incept_dim),#(batch_size,64,opt.content_seq_len/2)->(batch_size,32,(opt.content_seq_len)/4) 56 | Inception(incept_dim,incept_dim), 57 | nn.MaxPool1d(opt.max_seq_len) 58 | ) 59 | linear_hidden_size = getattr(opt,"linear_hidden_size",2000) 60 | self.fc = nn.Sequential( 61 | nn.Linear(incept_dim,linear_hidden_size), 62 | nn.BatchNorm1d(linear_hidden_size), 63 | nn.ReLU(inplace=True), 64 | nn.Linear(linear_hidden_size ,opt.label_size) 65 | ) 66 | if opt.__dict__.get("embeddings",None) is not None: 67 | self.encoder.weight=nn.Parameter(opt.embeddings) 68 | self.properties.update( 69 | {"linear_hidden_size":linear_hidden_size, 70 | "incept_dim":incept_dim, 71 | }) 72 | 73 | def forward(self,content): 74 | 75 | content=self.encoder(content) 76 | if self.opt.embedding_type=="static": 77 | content=content.detach(0) 78 | 79 | content_out=self.content_conv(content.permute(0,2,1)) 80 | out=content_out.view(content_out.size(0), -1) 81 | out=self.fc(out) 82 | return out 83 | 84 | if __name__ == '__main__': 85 | import sys 86 | sys.path.append(r"..") 87 | import opts 88 | opt=opts.parse_opt() 89 | opt.vocab_size=2501 90 | opt.label_size=3 91 | m = CNNText_inception(opt) 92 | 93 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 94 | o = m(content) 95 | print(o.size()) 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /models/CNNKim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import 
torch.nn.functional as F 4 | 5 | from models.BaseModel import BaseModel 6 | class KIMCNN1D(BaseModel): 7 | def __init__(self, opt): 8 | super(KIMCNN1D, self).__init__(opt) 9 | 10 | self.embedding_type = opt.embedding_type 11 | self.batch_size = opt.batch_size 12 | self.max_seq_len = opt.max_seq_len 13 | self.embedding_dim = opt.embedding_dim 14 | self.vocab_size = opt.vocab_size 15 | self.label_size = opt.label_size 16 | self.kernel_sizes = opt.kernel_sizes 17 | self.kernel_nums = opt.kernel_nums 18 | self.keep_dropout = opt.keep_dropout 19 | self.in_channel = 1 20 | 21 | assert (len(self.kernel_sizes) == len(self.kernel_nums)) 22 | 23 | # one for UNK and one for zero padding 24 | self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim) #, padding_idx=self.vocab_size + 1 25 | if self.embedding_type == "static" or self.embedding_type == "non-static" or self.embedding_type == "multichannel": 26 | self.embedding.weight=nn.Parameter(opt.embeddings) 27 | if self.embedding_type == "static": 28 | self.embedding.weight.requires_grad = False 29 | elif self.embedding_type == "multichannel": 30 | self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1) 31 | self.embedding2.weight=nn.Parameter(opt.embeddings) 32 | self.embedding2.weight.requires_grad = False 33 | self.in_channel = 2 34 | else: 35 | pass 36 | # 37 | # for i in range(len(self.kernel_sizes)): 38 | # conv = nn.Conv1d(self.in_channel, self.kernel_nums[i], self.embedding_dim * self.kernel_sizes[i], stride=self.embedding_dim) 39 | # setattr(self, 'conv_%d'%i, conv) 40 | self.convs = nn.ModuleList([nn.Conv1d(self.in_channel, num, self.embedding_dim * size, stride=self.embedding_dim) for size,num in zip(opt.kernel_sizes,opt.kernel_nums)]) 41 | self.fc = nn.Linear(sum(self.kernel_nums), self.label_size) 42 | self.properties.update( 43 | {"kernel_sizes":self.kernel_sizes, 44 | "kernel_nums":self.kernel_nums, 45 | }) 46 | def get_conv(self, i): 47 | return getattr(self, 'conv_%d'%i) 48 | 49 | def forward(self, inp): 50 | x = self.embedding(inp).view(-1, 1, self.embedding_dim * self.max_seq_len) 51 | if self.embedding_type == "multichannel": 52 | x2 = self.embedding2(inp).view(-1, 1, self.embedding_dim * self.max_seq_len) 53 | x = torch.cat((x, x2), 1) 54 | 55 | # conv_results = [ 56 | # F.max_pool1d(F.relu(self.get_conv(i)(x)), self.max_seq_len - self.kernel_sizes[i] + 1) 57 | # .view(-1, self.kernel_nums[i]) 58 | # for i in range(len(self.kernel_sizes))] 59 | conv_results = [ 60 | F.max_pool1d(F.relu(self.convs[i](x)), self.max_seq_len - self.kernel_sizes[i] + 1) 61 | .view(-1, self.kernel_nums[i]) 62 | for i in range(len(self.convs))] 63 | 64 | x = torch.cat(conv_results, 1) 65 | x = F.dropout(x, p=self.keep_dropout, training=self.training) 66 | x = self.fc(x) 67 | return x 68 | 69 | 70 | 71 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Yoon/model.py 72 | class KIMCNN2D(nn.Module): 73 | 74 | def __init__(self, opt): 75 | super(KIMCNN2D,self).__init__() 76 | self.opt = opt 77 | self.embedding_type = opt.embedding_type 78 | self.batch_size = opt.batch_size 79 | self.max_seq_len = opt.max_seq_len 80 | self.embedding_dim = opt.embedding_dim 81 | self.vocab_size = opt.vocab_size 82 | self.label_size = opt.label_size 83 | self.kernel_sizes = opt.kernel_sizes 84 | self.kernel_nums = opt.kernel_nums 85 | self.keep_dropout = opt.keep_dropout 86 | 87 | self.embedding = nn.Embedding(self.vocab_size + 2, self.embedding_dim) # padding_idx=self.vocab_size + 1 88 | if 
self.embedding_type == "static" or self.embedding_type == "non-static" or self.embedding_type == "multichannel": 89 | self.embedding.weight=nn.Parameter(opt.embeddings) 90 | if self.embedding_type == "static": 91 | self.embedding.weight.requires_grad = False 92 | elif self.embedding_type == "multichannel": 93 | self.embedding2 = nn.Embedding(self.vocab_size + 2, self.embedding_dim, padding_idx=self.vocab_size + 1) 94 | self.embedding2.weight=nn.Parameter(opt.embeddings) 95 | self.embedding2.weight.requires_grad = False 96 | self.in_channel = 2 97 | else: 98 | pass 99 | #self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] 100 | self.convs1 = nn.ModuleList([nn.Conv2d(1, num, (size, opt.embedding_dim)) for size,num in zip(opt.kernel_sizes,opt.kernel_nums)]) 101 | ''' 102 | self.conv13 = nn.Conv2d(Ci, Co, (3, D)) 103 | self.conv14 = nn.Conv2d(Ci, Co, (4, D)) 104 | self.conv15 = nn.Conv2d(Ci, Co, (5, D)) 105 | ''' 106 | self.dropout = nn.Dropout(opt.keep_dropout) 107 | self.fc = nn.Linear(sum(opt.kernel_nums), opt.label_size) 108 | 109 | def conv_and_pool(self, x, conv): 110 | x = F.relu(conv(x)).squeeze(3) #(N,Co,W) 111 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 112 | return x 113 | 114 | 115 | def forward(self, x): 116 | x = self.embedding(x) # (N,W,D) 117 | 118 | 119 | 120 | x = x.unsqueeze(1) # (N,Ci,W,D) 121 | 122 | x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks) 123 | 124 | 125 | x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks) 126 | 127 | x = torch.cat(x, 1) 128 | 129 | ''' 130 | x1 = self.conv_and_pool(x,self.conv13) #(N,Co) 131 | x2 = self.conv_and_pool(x,self.conv14) #(N,Co) 132 | x3 = self.conv_and_pool(x,self.conv15) #(N,Co) 133 | x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co) 134 | ''' 135 | x = self.dropout(x) # (N,len(Ks)*Co) 136 | logit = self.fc(x) # (N,C) 137 | return logit 138 | 139 | if __name__ == '__main__': 140 | import sys 141 | sys.path.append(r"..") 142 | import opts 143 | import torch as t 144 | opt=opts.parse_opt() 145 | import dataHelper 146 | train_iter, test_iter = dataHelper.loadData(opt) 147 | m = KIMCNN2D(opt) 148 | 149 | 150 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 151 | o = m(content) 152 | print(o.size()) 153 | path = m.save() -------------------------------------------------------------------------------- /models/CNNMultiLayer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | #https://github.com/zachAlbus/pyTorch-text-classification/blob/master/Zhang/model.py 9 | from models.BaseModel import BaseModel 10 | class MultiLayerCNN(BaseModel): 11 | def __init__(self, opt): 12 | super(MultiLayerCNN, self).__init__(opt) 13 | self.embed = nn.Embedding(opt.vocab_size + 1, opt.embedding_dim) 14 | 15 | if opt.__dict__.get("embeddings",None) is not None: 16 | self.embed.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 17 | 18 | self.conv1 = nn.Sequential( 19 | nn.Conv1d(opt.max_seq_len, 256, kernel_size=7, stride=1), 20 | nn.ReLU(), 21 | nn.MaxPool1d(kernel_size=3, stride=3) 22 | ) 23 | 24 | self.conv2 = nn.Sequential( 25 | nn.Conv1d(256, 256, kernel_size=7, stride=1), 26 | nn.ReLU(), 27 | nn.MaxPool1d(kernel_size=3, stride=3) 28 | ) 29 | 30 | self.conv3 = nn.Sequential( 31 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 32 | nn.ReLU() 33 | ) 34 | 35 | self.conv4 = nn.Sequential( 36 | 
nn.Conv1d(256, 256, kernel_size=3, stride=1), 37 | nn.ReLU() 38 | ) 39 | 40 | self.conv5 = nn.Sequential( 41 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 42 | nn.ReLU() 43 | ) 44 | 45 | self.conv6 = nn.Sequential( 46 | nn.Conv1d(256, 256, kernel_size=3, stride=1), 47 | nn.ReLU(), 48 | nn.MaxPool1d(kernel_size=3, stride=3) 49 | ) 50 | 51 | self.fc = nn.Linear(256*7, opt.label_size) 52 | 53 | def forward(self, x): 54 | # Embedding 55 | x = self.embed(x) # dim: (batch_size, max_seq_len, embedding_size) 56 | x = self.conv1(x) 57 | x = self.conv2(x) 58 | x = self.conv3(x) 59 | x = self.conv4(x) 60 | x = self.conv5(x) 61 | x = self.conv6(x) 62 | 63 | # collapse 64 | x = x.view(x.size(0), -1) 65 | x = self.fc(x) 66 | 67 | return F.log_softmax(x) 68 | -------------------------------------------------------------------------------- /models/CNNText.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | import numpy as np 4 | from torch import nn 5 | from models.BaseModel import BaseModel 6 | class CNNText(BaseModel): 7 | def __init__(self, opt ): 8 | super(CNNText, self).__init__(opt) 9 | 10 | 11 | self.content_dim=opt.__dict__.get("content_dim",256) 12 | self.kernel_size=opt.__dict__.get("kernel_size",3) 13 | 14 | 15 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 16 | if opt.__dict__.get("embeddings",None) is not None: 17 | self.encoder.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | 19 | 20 | self.content_conv = nn.Sequential( 21 | nn.Conv1d(in_channels = opt.embedding_dim, 22 | out_channels = self.content_dim, 23 | kernel_size = self.kernel_size), 24 | nn.ReLU(), 25 | nn.MaxPool1d(kernel_size = (opt.max_seq_len - self.kernel_size + 1)) 26 | # nn.AdaptiveMaxPool1d() 27 | ) 28 | 29 | self.fc = nn.Linear(self.content_dim, opt.label_size) 30 | self.properties.update( 31 | {"content_dim":self.content_dim, 32 | "kernel_size":self.kernel_size, 33 | }) 34 | 35 | def forward(self, content): 36 | 37 | content = self.encoder(content) 38 | content_out = self.content_conv(content.permute(0,2,1)) 39 | reshaped = content_out.view(content_out.size(0), -1) 40 | logits = self.fc(reshaped) 41 | return logits 42 | 43 | import argparse 44 | 45 | def parse_opt(): 46 | parser = argparse.ArgumentParser() 47 | # Data input settings 48 | parser.add_argument('--hidden_dim', type=int, default=128, 49 | help='hidden_dim') 50 | 51 | 52 | parser.add_argument('--batch_size', type=int, default=64, 53 | help='batch_size') 54 | parser.add_argument('--embedding_dim', type=int, default=300, 55 | help='embedding_dim') 56 | parser.add_argument('--learning_rate', type=float, default=4e-4, 57 | help='learning_rate') 58 | parser.add_argument('--grad_clip', type=float, default=1e-1, 59 | help='grad_clip') 60 | parser.add_argument('--model', type=str, default="lstm", 61 | help='model name') 62 | 63 | 64 | # 65 | args = parser.parse_args() 66 | args.embedding_dim=300 67 | args.vocab_size=10000 68 | args.kernel_size=3 69 | args.num_classes=3 70 | args.content_dim=256 71 | args.max_seq_len=50 72 | 73 | # 74 | # # Check if args are valid 75 | # assert args.rnn_size > 0, "rnn_size should be greater than 0" 76 | 77 | 78 | return args 79 | 80 | if __name__ == '__main__': 81 | 82 | 83 | opt = parse_opt() 84 | m = CNNText(opt) 85 | content = t.autograd.Variable(t.arange(0,3200).view(-1,50)).long() 86 | o = m(content) 87 | print(o.size()) 88 | 89 | 
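For reference, the shape flow that CNNText implements can be checked in isolation. The following is a minimal sketch with hypothetical sizes (vocab 10000, 300-d embeddings, 50-token sequences, 3 labels); it is not part of the repository and only mirrors the embed -> permute -> Conv1d -> ReLU -> MaxPool1d -> Linear pipeline used in CNNText.forward above.

import torch
import torch.nn as nn

# Hypothetical hyper-parameters, chosen only to make the shapes concrete.
vocab_size, embedding_dim, max_seq_len = 10000, 300, 50
content_dim, kernel_size, label_size = 256, 3, 3

embed = nn.Embedding(vocab_size, embedding_dim)
content_conv = nn.Sequential(
    nn.Conv1d(embedding_dim, content_dim, kernel_size),       # (B, 300, 50) -> (B, 256, 48)
    nn.ReLU(),
    nn.MaxPool1d(kernel_size=max_seq_len - kernel_size + 1),  # pool the 48 positions -> (B, 256, 1)
)
fc = nn.Linear(content_dim, label_size)

tokens = torch.randint(0, vocab_size, (4, max_seq_len))       # batch of 4 token-id sequences
features = content_conv(embed(tokens).permute(0, 2, 1))       # Conv1d expects (B, channels, length)
logits = fc(features.view(features.size(0), -1))              # (4, 256) -> (4, 3)
print(logits.shape)                                           # torch.Size([4, 3])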
-------------------------------------------------------------------------------- /models/CNN_Inception.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import torch as t 5 | import torch 6 | import numpy as np 7 | from torch import nn 8 | from collections import OrderedDict 9 | from models.BaseModel import BaseModel 10 | class Inception(nn.Module): 11 | def __init__(self,cin,co,relu=True,norm=True): 12 | super(Inception, self).__init__() 13 | assert(co%4==0) 14 | cos=[co/4]*4 15 | self.activa=nn.Sequential() 16 | if norm:self.activa.add_module('norm',nn.BatchNorm1d(co)) 17 | if relu:self.activa.add_module('relu',nn.ReLU(True)) 18 | self.branch1 =nn.Sequential(OrderedDict([ 19 | ('conv1', nn.Conv1d(cin,cos[0], 1,stride=1)), 20 | ])) 21 | self.branch2 =nn.Sequential(OrderedDict([ 22 | ('conv1', nn.Conv1d(cin,cos[1], 1)), 23 | ('norm1', nn.BatchNorm1d(cos[1])), 24 | ('relu1', nn.ReLU(inplace=True)), 25 | ('conv3', nn.Conv1d(cos[1],cos[1], 3,stride=1,padding=1)), 26 | ])) 27 | self.branch3 =nn.Sequential(OrderedDict([ 28 | ('conv1', nn.Conv1d(cin,cos[2], 3,padding=1)), 29 | ('norm1', nn.BatchNorm1d(cos[2])), 30 | ('relu1', nn.ReLU(inplace=True)), 31 | ('conv3', nn.Conv1d(cos[2],cos[2], 5,stride=1,padding=2)), 32 | ])) 33 | self.branch4 =nn.Sequential(OrderedDict([ 34 | #('pool',nn.MaxPool1d(2)), 35 | ('conv3', nn.Conv1d(cin,cos[3], 3,stride=1,padding=1)), 36 | ])) 37 | def forward(self,x): 38 | branch1=self.branch1(x) 39 | branch2=self.branch2(x) 40 | branch3=self.branch3(x) 41 | branch4=self.branch4(x) 42 | result=self.activa(torch.cat((branch1,branch2,branch3,branch4),1)) 43 | return result 44 | class CNNText_inception(BaseModel): 45 | def __init__(self, opt ): 46 | super(CNNText_inception, self).__init__(opt) 47 | incept_dim=getattr(opt,"inception_dim",512) 48 | 49 | 50 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 51 | 52 | self.content_conv=nn.Sequential( 53 | Inception(opt.embedding_dim,incept_dim),#(batch_size,64,opt.content_seq_len)->(batch_size,64,(opt.content_seq_len)/2) 54 | #Inception(incept_dim,incept_dim),#(batch_size,64,opt.content_seq_len/2)->(batch_size,32,(opt.content_seq_len)/4) 55 | Inception(incept_dim,incept_dim), 56 | nn.MaxPool1d(opt.max_seq_len) 57 | ) 58 | opt.hidden_size = getattr(opt,"linear_hidden_size",2000) 59 | self.fc = nn.Sequential( 60 | nn.Linear(incept_dim,opt.hidden_size), 61 | nn.BatchNorm1d(opt.hidden_size), 62 | nn.ReLU(inplace=True), 63 | nn.Linear(opt.hidden_size ,opt.label_size) 64 | ) 65 | if opt.__dict__.get("embeddings",None) is not None: 66 | print('load embedding') 67 | self.encoder.weight.data.copy_(t.from_numpy(opt.embeddings)) 68 | self.properties.update( 69 | {"inception_dim":incept_dim, 70 | "hidden_size":opt.hidden_size, 71 | }) 72 | 73 | def forward(self,content): 74 | 75 | content=self.encoder(content) 76 | if self.opt.static: 77 | content=content.detach(0) 78 | 79 | content_out=self.content_conv(content.permute(0,2,1)) 80 | out=content_out.view(content_out.size(0), -1) 81 | out=self.fc(out) 82 | return out 83 | 84 | if __name__ == '__main__': 85 | import sys 86 | sys.path.append(r"..") 87 | import opts 88 | opt=opts.parse_opt() 89 | opt.vocab_size=2501 90 | opt.label_size=3 91 | m = CNNText_inception(opt) 92 | 93 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 94 | o = m(content) 95 | print(o.size()) 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 
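A quick way to see the Inception block's channel arithmetic is to run one block on a dummy batch. The sketch below is only illustrative: it imports the Inception class defined in models/CNNInception.py (whose branch widths are already cast with int(co/4)), and the sizes are hypothetical. Each of the four parallel branches emits co/4 channels over the full sequence length, so co must be divisible by 4 and the concatenated output has co channels.

import torch
from models.CNNInception import Inception  # variant with int(co/4) branch widths

block = Inception(cin=300, co=512)   # four parallel branches of 512/4 = 128 channels each
x = torch.randn(8, 300, 50)          # (batch, embedding_dim, seq_len)
y = block(x)                         # branch outputs are concatenated on the channel axis
print(y.shape)                       # torch.Size([8, 512, 50]); the padded convolutions keep the length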
-------------------------------------------------------------------------------- /models/Capsule.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # paper 3 | 4 | 5 | # 6 | 7 | 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | from torch import nn 12 | import numpy as np 13 | 14 | BATCH_SIZE = 100 15 | 16 | NUM_EPOCHS = 500 17 | NUM_ROUTING_ITERATIONS = 3 18 | 19 | cuda = torch.cuda.is_available() 20 | 21 | def softmax(input, dim=1): 22 | transposed_input = input.transpose(dim, len(input.size()) - 1) 23 | softmaxed_output = F.softmax(transposed_input.contiguous().view(-1, transposed_input.size(-1))) 24 | return softmaxed_output.view(*transposed_input.size()).transpose(dim, len(input.size()) - 1) 25 | 26 | 27 | 28 | 29 | 30 | class CapsuleLayer(nn.Module): 31 | def __init__(self, num_capsules, num_route_nodes, in_channels, out_channels, kernel_size=None, stride=None, 32 | num_iterations=NUM_ROUTING_ITERATIONS,padding=0): 33 | super(CapsuleLayer, self).__init__() 34 | 35 | self.num_route_nodes = num_route_nodes 36 | self.num_iterations = num_iterations 37 | 38 | self.num_capsules = num_capsules 39 | 40 | 41 | 42 | if num_route_nodes != -1: 43 | self.route_weights = nn.Parameter(torch.randn(num_capsules, num_route_nodes, in_channels, out_channels)) 44 | else: 45 | prime=[3,5,7,9,11,13,17,19,23] 46 | sizes=prime[:self.num_capsules] 47 | self.capsules = nn.ModuleList( 48 | [nn.Conv1d(in_channels, out_channels, kernel_size=i, stride=2, padding=int((i-1)/2)) for i in sizes]) 49 | 50 | def squash(self, tensor, dim=-1): 51 | squared_norm = (tensor ** 2).sum(dim=dim, keepdim=True) 52 | scale = squared_norm / (1 + squared_norm) 53 | return scale * tensor / torch.sqrt(squared_norm) 54 | 55 | def forward(self, x): 56 | 57 | if self.num_route_nodes != -1: 58 | priors =torch.matmul( x[None, :, :, None, :],self.route_weights[:, None, :, :, :]) 59 | 60 | if torch.cuda.is_available(): 61 | logits = torch.autograd.Variable(torch.zeros(priors.size())).cuda() 62 | else: 63 | logits = torch.autograd.Variable(torch.zeros(priors.size())) 64 | for i in range(self.num_iterations): 65 | probs = softmax(logits, dim=2) 66 | outputs = self.squash((torch.mul(probs , priors)).sum(dim=2, keepdim=True)) 67 | 68 | if i != self.num_iterations - 1: 69 | delta_logits = (torch.mul(priors , outputs)).sum(dim=-1, keepdim=True) 70 | logits = logits + delta_logits 71 | else: 72 | outputs = [capsule(x).view(x.size(0), -1, 1) for capsule in self.capsules] 73 | outputs = torch.cat(outputs, dim=-1) 74 | outputs = self.squash(outputs) 75 | 76 | return outputs 77 | 78 | from models.BaseModel import BaseModel 79 | class CapsuleNet(BaseModel): 80 | def __init__(self,opt): 81 | super(CapsuleNet, self).__init__(opt) 82 | 83 | self.label_size=opt.label_size 84 | self.embed = nn.Embedding(opt.vocab_size+1, opt.embedding_dim) 85 | self.opt.cnn_dim = 1 86 | self.kernel_size = 3 87 | self.kernel_size_primary=3 88 | if opt.__dict__.get("embeddings",None) is not None: 89 | self.embed.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 90 | 91 | self.primary_capsules = CapsuleLayer(num_capsules=8, num_route_nodes=-1, in_channels=256, out_channels=32) 92 | self.digit_capsules = CapsuleLayer(num_capsules=opt.label_size, num_route_nodes=int(32 * opt.max_seq_len/2), in_channels=8, 93 | out_channels=16) 94 | if self.opt.cnn_dim == 2: 95 | self.conv_2d = nn.Conv2d(in_channels=1, out_channels=256, kernel_size=(self.kernel_size,opt.embedding_dim), 
stride=(1,opt.embedding_dim),padding=(int((self.kernel_size-1)/2),0)) 96 | else: 97 | self.conv_1d = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=opt.embedding_dim * self.kernel_size, stride=opt.embedding_dim, padding=opt.embedding_dim* int((self.kernel_size-1)/2) ) 98 | 99 | self.decoder = nn.Sequential( 100 | nn.Linear(16 * self.label_size, 512), 101 | nn.ReLU(inplace=True), 102 | nn.Linear(512, 1024), 103 | nn.ReLU(inplace=True), 104 | nn.Linear(1024, 784), 105 | nn.Sigmoid() 106 | ) 107 | 108 | def forward(self, x, y=None,reconstruct=False): 109 | #x = next(iter(train_iter)).text[0] 110 | 111 | x= self.embed(x) 112 | if self.opt.cnn_dim == 1: 113 | x=x.view(x.size(0),1,x.size(-1)*x.size(-2)) 114 | x_conv = F.relu(self.conv_1d(x), inplace=True) 115 | else: 116 | 117 | x=x.unsqueeze(1) 118 | x_conv = F.relu(self.conv_2d(x), inplace=True).squeeze(3) 119 | 120 | x = self.primary_capsules(x_conv) 121 | x = self.digit_capsules(x).squeeze().transpose(0, 1) 122 | 123 | classes = (x ** 2).sum(dim=-1) ** 0.5 124 | classes = F.softmax(classes) 125 | if not reconstruct: 126 | return classes 127 | if y is None: 128 | # In all batches, get the most active capsule. 129 | _, max_length_indices = classes.max(dim=1) 130 | if torch.cuda.is_available(): 131 | y = Variable(torch.sparse.torch.eye(self.label_size)).cuda().index_select(dim=0, index=max_length_indices.data) 132 | else: 133 | y = Variable(torch.sparse.torch.eye(self.label_size)).index_select(dim=0, index=max_length_indices.data) 134 | reconstructions = self.decoder((x * y[:, :, None]).view(x.size(0), -1)) 135 | 136 | return classes, reconstructions 137 | -------------------------------------------------------------------------------- /models/ConvS2S.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /models/DiSAN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # https://github.com/taoshen58/DiSAN/blob/master/SST_disan/src/model/model_disan.py -------------------------------------------------------------------------------- /models/FastText.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch as t 4 | 5 | import numpy as np 6 | from torch import nn 7 | from collections import OrderedDict 8 | from models.BaseModel import BaseModel 9 | class FastText(BaseModel): 10 | def __init__(self, opt ): 11 | super(FastText, self).__init__(opt) 12 | 13 | linear_hidden_size=getattr(opt,"linear_hidden_size",2000) 14 | self.encoder = nn.Embedding(opt.vocab_size,opt.embedding_dim) 15 | if opt.__dict__.get("embeddings",None) is not None: 16 | print('load embedding') 17 | self.encoder.weight=nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | 19 | 20 | self.content_fc = nn.Sequential( 21 | nn.Linear(opt.embedding_dim,linear_hidden_size), 22 | nn.BatchNorm1d(linear_hidden_size), 23 | nn.ReLU(inplace=True), 24 | # nn.Linear(opt.linear_hidden_size,opt.linear_hidden_size), 25 | # nn.BatchNorm1d(opt.linear_hidden_size), 26 | # nn.ReLU(inplace=True), 27 | nn.Linear(linear_hidden_size,opt.label_size) 28 | ) 29 | # self.fc = nn.Linear(300, opt.label_size) 30 | self.properties.update( 31 | {"linear_hidden_size":linear_hidden_size 32 | }) 33 | 34 | def forward(self,content): 35 | 36 | content_=t.mean(self.encoder(content),dim=1) 37 | 38 | 39 | 
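        # content_ is the bag-of-words sentence representation: token embeddings
        # averaged over the sequence dimension, one embedding_dim-sized vector per
        # example, which the MLP head below maps to label scores.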
out=self.content_fc(content_.view(content_.size(0),-1)) 40 | 41 | return out 42 | if __name__ == '__main__': 43 | import sys 44 | sys.path.append(r"..") 45 | import opts 46 | opt=opts.parse_opt() 47 | opt.vocab_size=2501 48 | opt.label_size=3 49 | m = FastText(opt) 50 | 51 | content = t.autograd.Variable(t.arange(0,2500).view(10,250)).long() 52 | o = m(content) 53 | print(o.size()) -------------------------------------------------------------------------------- /models/GPTModel.py: -------------------------------------------------------------------------------- 1 | # In this code we have used GPT-2 Model for classification of IMDB Dataset: 2 | 3 | # Code is as followed: 4 | 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import accuracy_score 8 | from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.base import BaseEstimator, TransformerMixin 11 | import torch 12 | import re 13 | 14 | 15 | # Defining cleaning function: 16 | def clean(text): 17 | for token in ["
", "
", "
"]: 18 | text = re.sub(token, " ", text) 19 | 20 | text = re.sub("[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text) 21 | 22 | return text.lower() 23 | 24 | # Loading Data: 25 | def load_imdb_dataset(data_path, nrows=100): 26 | df = pd.read_csv(data_path, nrows=nrows) 27 | texts = df['review'].apply(clean) 28 | labels = df['sentiment'] 29 | return texts, labels 30 | 31 | # Class for GPT-Transformer 32 | class GPT2Transformer(BaseEstimator, TransformerMixin): 33 | def __init__(self, tokenizer, max_length=2): 34 | self.tokenizer = tokenizer 35 | self.max_length = max_length 36 | 37 | def fit(self, X, y=None): 38 | return self 39 | 40 | def transform(self, X): 41 | input_ids = [] 42 | for text in X: 43 | encoded_text = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length, truncation=True) 44 | input_ids.append(encoded_text) 45 | return input_ids 46 | 47 | # Class for the classifier 48 | class GPT2Classifier(BaseEstimator): 49 | def __init__(self, model): 50 | self.model = model 51 | 52 | def fit(self, X, y): 53 | return self 54 | 55 | def predict(self, X): 56 | # Finding the maximum sequence length 57 | max_length = max(len(seq) for seq in X) 58 | 59 | # Pading sequences to the maximum length 60 | padded_input_ids = [seq + [0] * (max_length - len(seq)) for seq in X] 61 | 62 | # Convert input to tensors 63 | input_ids = torch.tensor(padded_input_ids) 64 | 65 | # Move input tensors to the appropriate device (GPU if available) 66 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 67 | input_ids = input_ids.to(device) 68 | 69 | # Moving the model to the appropriate device 70 | self.model.to(device) 71 | 72 | # Predicting logits 73 | with torch.no_grad(): 74 | logits = self.model(input_ids)[0] 75 | 76 | # Moving logits back to CPU if necessary 77 | logits = logits.cpu() 78 | 79 | # Converting logits to class labels 80 | predicted_labels = torch.argmax(logits, dim=1).tolist() 81 | 82 | # Converting predicted labels to original label format 83 | label_map = {1: 'positive', 0: 'negative'} 84 | predicted_labels = [label_map[label] for label in predicted_labels] 85 | 86 | return predicted_labels 87 | 88 | def main(): 89 | data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv" 90 | texts, labels = load_imdb_dataset(data_path, nrows=20000) # Load only the top 100 rows 91 | train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42) 92 | 93 | # Initializing tokenizer and model 94 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 95 | 96 | # Loading GPT2 configuration 97 | config = GPT2Config.from_pretrained('gpt2') 98 | config.pad_token_id = config.eos_token_id # Set the padding token ID to the end-of-sequence token ID 99 | 100 | # Initializing model with updated configuration 101 | model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=config) 102 | 103 | # Defining the pipeline for text classification 104 | pipeline = Pipeline([ 105 | ('transformer', GPT2Transformer(tokenizer, max_length=2)), 106 | ('clf', GPT2Classifier(model)), 107 | ]) 108 | 109 | # Training the classifier 110 | pipeline.fit(train_texts, train_labels) 111 | 112 | # Predicting on the test set 113 | predicted_labels = pipeline.predict(test_texts) 114 | 115 | # Calculating accuracy 116 | accuracy = accuracy_score(test_labels, predicted_labels) 117 | print("GPT2 Accuracy:", accuracy) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | 122 | 123 | # Got 
Accuracy of 0.48825. 124 | -------------------------------------------------------------------------------- /models/LSTM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch 6 | from torch.autograd import Variable 7 | #from memory_profiler import profile 8 | from models.BaseModel import BaseModel 9 | class LSTMClassifier(BaseModel): 10 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 11 | def __init__(self,opt): 12 | 13 | super(LSTMClassifier, self).__init__(opt) 14 | self.hidden_dim = opt.hidden_dim 15 | self.batch_size = opt.batch_size 16 | self.use_gpu = torch.cuda.is_available() 17 | 18 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 19 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 20 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 21 | self.lstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim) 22 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 23 | self.hidden = self.init_hidden() 24 | self.lsmt_reduce_by_mean = opt.__dict__.get("lstm_mean",True) 25 | 26 | def init_hidden(self,batch_size=None): 27 | if batch_size is None: 28 | batch_size= self.batch_size 29 | 30 | if self.use_gpu: 31 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 32 | c0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 33 | else: 34 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim)) 35 | c0 = Variable(torch.zeros(1,batch_size, self.hidden_dim)) 36 | return (h0, c0) 37 | # @profile 38 | def forward(self, sentence): 39 | embeds = self.word_embeddings(sentence) #64x200x300 40 | 41 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 42 | x=embeds.permute(1,0,2) #200x64x300 43 | self.hidden= self.init_hidden(sentence.size()[0]) #1x64x128 44 | lstm_out, self.hidden = self.lstm(x, self.hidden) #200x64x128 45 | if self.lsmt_reduce_by_mean=="mean": 46 | out = lstm_out.permute(1,0,2) 47 | final = torch.mean(out,1) 48 | else: 49 | final=lstm_out[-1] 50 | y = self.hidden2label(final) #64x3 51 | return y 52 | 53 | # def forward1(self, sentence): 54 | # 55 | # return torch.zeros(sentence.size()[0], self.opt.label_size) 56 | ## def __call__(self, **args): 57 | ## self.forward(args) 58 | # def test(): 59 | # 60 | # import numpy as np 61 | # 62 | # word_embeddings = nn.Embedding(10000, 300) 63 | # lstm = nn.LSTM(300, 100) 64 | # h0 = Variable(torch.zeros(1, 128, 100)) 65 | # c0 = Variable(torch.zeros(1, 128, 100)) 66 | # hidden=(h0, c0) 67 | # sentence = Variable(torch.LongTensor(np.zeros((128,30),dtype=np.int64))) 68 | # embeds = word_embeddings(sentence) 69 | # torch.tile(sentence) 70 | # sentence.size()[0] 71 | # 72 | # 73 | # 74 | ## x= Variable(torch.zeros(30, 128, 300)) 75 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 76 | # embeds=embeds.permute(1,0,2) 77 | # lstm_out, hidden = lstm(embeds, hidden) 78 | ## -------------------------------------------------------------------------------- /models/LSTMBI.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch 6 | from torch.autograd import Variable 7 | #from memory_profiler import profile 8 | from models.BaseModel import BaseModel 9 | class LSTMBI(BaseModel): 10 | # embedding_dim, hidden_dim, 
vocab_size, label_size, batch_size, use_gpu 11 | def __init__(self,opt): 12 | super(LSTMBI, self).__init__(opt) 13 | 14 | 15 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 16 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 17 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 18 | 19 | 20 | #self.bidirectional = True 21 | 22 | self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, num_layers=self.opt.lstm_layers, dropout=self.opt.keep_dropout, bidirectional=self.opt.bidirectional) 23 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 24 | self.hidden = self.init_hidden() 25 | self.lsmt_reduce_by_mean = opt.__dict__.get("lstm_mean",True) 26 | 27 | 28 | self.properties.update( 29 | {"hidden_dim":self.opt.hidden_dim, 30 | "lstm_mean":self.lsmt_reduce_by_mean, 31 | "lstm_layers":self.opt.lstm_layers, 32 | # "bidirectional":str(self.opt.bidirectional) 33 | }) 34 | 35 | def init_hidden(self,batch_size=None): 36 | if batch_size is None: 37 | batch_size= self.opt.batch_size 38 | 39 | if torch.cuda.is_available(): 40 | h0 = Variable(torch.zeros(2*self.opt.lstm_layers, batch_size, self.opt.hidden_dim // 2).cuda()) 41 | c0 = Variable(torch.zeros(2*self.opt.lstm_layers, batch_size, self.opt.hidden_dim // 2).cuda()) 42 | else: 43 | h0 = Variable(torch.zeros(2*self.opt.lstm_layers, batch_size, self.opt.hidden_dim // 2)) 44 | c0 = Variable(torch.zeros(2*self.opt.lstm_layers, batch_size, self.opt.hidden_dim // 2)) 45 | return (h0, c0) 46 | # @profile 47 | def forward(self, sentence): 48 | embeds = self.word_embeddings(sentence) 49 | 50 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 51 | x=embeds.permute(1,0,2) # we do this because the default parameter of lstm is False 52 | self.hidden= self.init_hidden(sentence.size()[0]) #2x64x64 53 | lstm_out, self.hidden = self.bilstm(x, self.hidden) #lstm_out:200x64x128 54 | if self.lsmt_reduce_by_mean=="mean": 55 | out = lstm_out.permute(1,0,2) 56 | final = torch.mean(out,1) 57 | else: 58 | final=lstm_out[-1] 59 | y = self.hidden2label(final) #64x3 #lstm_out[-1] 60 | return y 61 | 62 | -------------------------------------------------------------------------------- /models/LSTMStack.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreedomIntelligence/TextClassificationBenchmark/c3dfd2622794e6f9d7d91e75506795ceb31f57b9/models/LSTMStack.py -------------------------------------------------------------------------------- /models/LSTMTree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # https://github.com/dasguptar/treelstm.pytorch -------------------------------------------------------------------------------- /models/LSTMwithAttention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import numpy as np 5 | import torch.nn as nn 6 | from sklearn.utils import shuffle 7 | from torch.autograd import Variable 8 | from models.BaseModel import BaseModel 9 | class LSTMAttention(torch.nn.Module): 10 | def __init__(self,opt): 11 | 12 | super(LSTMAttention, self).__init__() 13 | self.hidden_dim = opt.hidden_dim 14 | self.batch_size = opt.batch_size 15 | self.use_gpu = torch.cuda.is_available() 16 | 17 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 18 | self.word_embeddings.weight = 
nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 19 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 20 | 21 | self.num_layers = opt.lstm_layers 22 | #self.bidirectional = True 23 | self.dropout = opt.keep_dropout 24 | self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, batch_first=True,num_layers=self.num_layers, dropout=self.dropout, bidirectional=True) 25 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 26 | self.hidden = self.init_hidden() 27 | self.mean = opt.__dict__.get("lstm_mean",True) 28 | self.attn_fc = torch.nn.Linear(opt.embedding_dim, 1) 29 | def init_hidden(self,batch_size=None): 30 | if batch_size is None: 31 | batch_size= self.batch_size 32 | 33 | if self.use_gpu: 34 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 35 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 36 | else: 37 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 38 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 39 | return (h0, c0) 40 | 41 | 42 | def attention(self, rnn_out, state): 43 | merged_state = torch.cat([s for s in state],1) 44 | merged_state = merged_state.squeeze(0).unsqueeze(2) 45 | # (batch, seq_len, cell_size) * (batch, cell_size, 1) = (batch, seq_len, 1) 46 | weights = torch.bmm(rnn_out, merged_state) 47 | weights = torch.nn.functional.softmax(weights.squeeze(2)).unsqueeze(2) 48 | # (batch, cell_size, seq_len) * (batch, seq_len, 1) = (batch, cell_size, 1) 49 | return torch.bmm(torch.transpose(rnn_out, 1, 2), weights).squeeze(2) 50 | # end method attention 51 | 52 | 53 | def forward(self, X): 54 | embedded = self.word_embeddings(X) 55 | hidden= self.init_hidden(X.size()[0]) # 56 | rnn_out, hidden = self.bilstm(embedded, hidden) 57 | h_n, c_n = hidden 58 | attn_out = self.attention(rnn_out, h_n) 59 | logits = self.hidden2label(attn_out) 60 | return logits -------------------------------------------------------------------------------- /models/MLP.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.init as init 7 | from torch.autograd import Variable 8 | 9 | # https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py 10 | 11 | def position_encoding(sentence_size, embedding_dim): 12 | encoding = np.ones((embedding_dim, sentence_size), dtype=np.float32) 13 | ls = sentence_size + 1 14 | le = embedding_dim + 1 15 | for i in range(1, le): 16 | for j in range(1, ls): 17 | encoding[i-1, j-1] = (i - (embedding_dim+1)/2) * (j - (sentence_size+1)/2) 18 | encoding = 1 + 4 * encoding / embedding_dim / sentence_size 19 | # Make position encoding of time words identity to avoid modifying them 20 | encoding[:, -1] = 1.0 21 | return np.transpose(encoding) 22 | 23 | class AttrProxy(object): 24 | """ 25 | Translates index lookups into attribute lookups. 
26 | To implement some trick which able to use list of nn.Module in a nn.Module 27 | see https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/2 28 | """ 29 | def __init__(self, module, prefix): 30 | self.module = module 31 | self.prefix = prefix 32 | 33 | def __getitem__(self, i): 34 | return getattr(self.module, self.prefix + str(i)) 35 | 36 | 37 | class MemN2N(nn.Module): 38 | def __init__(self, opt): 39 | super(MemN2N, self).__init__() 40 | 41 | use_cuda = opt["use_cuda"] 42 | num_vocab = opt["num_vocab"] 43 | embedding_dim = opt["embedding_dim"] 44 | sentence_size = opt["sentence_size"] 45 | self.max_hops = opt["max_hops"] 46 | 47 | for hop in range(self.max_hops+1): 48 | C = nn.Embedding(num_vocab, embedding_dim, padding_idx=0) 49 | C.weight.data.normal_(0, 0.1) 50 | self.add_module("C_{}".format(hop), C) 51 | self.C = AttrProxy(self, "C_") 52 | 53 | self.softmax = nn.Softmax() 54 | self.encoding = Variable(torch.FloatTensor( 55 | position_encoding(sentence_size, embedding_dim)), requires_grad=False) 56 | 57 | if use_cuda: 58 | self.encoding = self.encoding.cuda() 59 | 60 | def forward(self, story, query): 61 | story_size = story.size() 62 | 63 | u = list() 64 | query_embed = self.C[0](query) 65 | # weired way to perform reduce_dot 66 | encoding = self.encoding.unsqueeze(0).expand_as(query_embed) 67 | u.append(torch.sum(query_embed*encoding, 1)) 68 | 69 | for hop in range(self.max_hops): 70 | embed_A = self.C[hop](story.view(story.size(0), -1)) 71 | embed_A = embed_A.view(story_size+(embed_A.size(-1),)) 72 | 73 | encoding = self.encoding.unsqueeze(0).unsqueeze(1).expand_as(embed_A) 74 | m_A = torch.sum(embed_A*encoding, 2) 75 | 76 | u_temp = u[-1].unsqueeze(1).expand_as(m_A) 77 | prob = self.softmax(torch.sum(m_A*u_temp, 2)) 78 | 79 | embed_C = self.C[hop+1](story.view(story.size(0), -1)) 80 | embed_C = embed_C.view(story_size+(embed_C.size(-1),)) 81 | m_C = torch.sum(embed_C*encoding, 2) 82 | 83 | prob = prob.unsqueeze(2).expand_as(m_C) 84 | o_k = torch.sum(m_C*prob, 1) 85 | 86 | u_k = u[-1] + o_k 87 | u.append(u_k) 88 | 89 | a_hat = u[-1]@self.C[self.max_hops].weight.transpose(0, 1) 90 | return a_hat, self.softmax(a_hat) -------------------------------------------------------------------------------- /models/MemoryNetwork.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #https://github.com/nmhkahn/MemN2N-pytorch/blob/master/memn2n/model.py 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.init as init 7 | from torch.autograd import Variable 8 | 9 | def position_encoding(sentence_size, embedding_dim): 10 | encoding = np.ones((embedding_dim, sentence_size), dtype=np.float32) 11 | ls = sentence_size + 1 12 | le = embedding_dim + 1 13 | for i in range(1, le): 14 | for j in range(1, ls): 15 | encoding[i-1, j-1] = (i - (embedding_dim+1)/2) * (j - (sentence_size+1)/2) 16 | encoding = 1 + 4 * encoding / embedding_dim / sentence_size 17 | # Make position encoding of time words identity to avoid modifying them 18 | encoding[:, -1] = 1.0 19 | return np.transpose(encoding) 20 | 21 | class AttrProxy(object): 22 | """ 23 | Translates index lookups into attribute lookups. 
24 | To implement some trick which able to use list of nn.Module in a nn.Module 25 | see https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/2 26 | """ 27 | def __init__(self, module, prefix): 28 | self.module = module 29 | self.prefix = prefix 30 | 31 | def __getitem__(self, i): 32 | return getattr(self.module, self.prefix + str(i)) 33 | 34 | 35 | class MemN2N(nn.Module): 36 | def __init__(self, settings): 37 | super(MemN2N, self).__init__() 38 | 39 | use_cuda = settings["use_cuda"] 40 | num_vocab = settings["num_vocab"] 41 | embedding_dim = settings["embedding_dim"] 42 | sentence_size = settings["sentence_size"] 43 | self.max_hops = settings["max_hops"] 44 | 45 | for hop in range(self.max_hops+1): 46 | C = nn.Embedding(num_vocab, embedding_dim, padding_idx=0) 47 | C.weight.data.normal_(0, 0.1) 48 | self.add_module("C_{}".format(hop), C) 49 | self.C = AttrProxy(self, "C_") 50 | 51 | self.softmax = nn.Softmax() 52 | self.encoding = Variable(torch.FloatTensor( 53 | position_encoding(sentence_size, embedding_dim)), requires_grad=False) 54 | 55 | if use_cuda: 56 | self.encoding = self.encoding.cuda() 57 | 58 | def forward(self, query): 59 | 60 | story=query # for text classfication 61 | 62 | story_size = story.size() 63 | 64 | u = list() 65 | query_embed = self.C[0](query) 66 | # weired way to perform reduce_dot 67 | encoding = self.encoding.unsqueeze(0).expand_as(query_embed) 68 | u.append(torch.sum(query_embed*encoding, 1)) 69 | 70 | for hop in range(self.max_hops): 71 | embed_A = self.C[hop](story.view(story.size(0), -1)) 72 | embed_A = embed_A.view(story_size+(embed_A.size(-1),)) 73 | 74 | encoding = self.encoding.unsqueeze(0).unsqueeze(1).expand_as(embed_A) 75 | m_A = torch.sum(embed_A*encoding, 2) 76 | 77 | u_temp = u[-1].unsqueeze(1).expand_as(m_A) 78 | prob = self.softmax(torch.sum(m_A*u_temp, 2)) 79 | 80 | embed_C = self.C[hop+1](story.view(story.size(0), -1)) 81 | embed_C = embed_C.view(story_size+(embed_C.size(-1),)) 82 | m_C = torch.sum(embed_C*encoding, 2) 83 | 84 | prob = prob.unsqueeze(2).expand_as(m_C) 85 | o_k = torch.sum(m_C*prob, 1) 86 | 87 | u_k = u[-1] + o_k 88 | u.append(u_k) 89 | 90 | a_hat = u[-1]@self.C[self.max_hops].weight.transpose(0, 1) 91 | return a_hat, self.softmax(a_hat) -------------------------------------------------------------------------------- /models/QuantumCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /models/RCNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | #from memory_profiler import profile 6 | 7 | """ 8 | Lai S, Xu L, Liu K, et al. Recurrent Convolutional Neural Networks for Text Classification[C]//AAAI. 2015, 333: 2267-2273. 
9 | """ 10 | from models.BaseModel import BaseModel 11 | class RCNN(BaseModel): 12 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 13 | def __init__(self,opt): 14 | 15 | super(RCNN, self).__init__(opt) 16 | self.hidden_dim = opt.hidden_dim 17 | self.batch_size = opt.batch_size 18 | self.use_gpu = torch.cuda.is_available() 19 | 20 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 21 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 22 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 23 | 24 | self.num_layers = 1 25 | #self.bidirectional = True 26 | self.dropout = opt.keep_dropout 27 | self.bilstm = nn.LSTM(input_size=opt.embedding_dim, hidden_size=opt.hidden_dim // 2, num_layers=self.num_layers, dropout=self.dropout, bidirectional=True) 28 | 29 | ###self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 30 | self.hidden = self.init_hidden() 31 | 32 | self.max_pooling = nn.MaxPool1d(kernel_size=3, stride=2) 33 | 34 | self.content_dim = 256 35 | #self.conv = nn.Conv1d(opt.hidden_dim, self.content_dim, opt.hidden_dim * 2, stride=opt.embedding_dim) 36 | self.hidden2label = nn.Linear( (2*opt.hidden_dim // 2+opt.embedding_dim), opt.label_size) 37 | 38 | 39 | def init_hidden(self,batch_size=None): 40 | if batch_size is None: 41 | batch_size= self.batch_size 42 | 43 | if self.use_gpu: 44 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 45 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 46 | else: 47 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 48 | c0 = Variable(torch.zeros(2*self.num_layers,batch_size, self.hidden_dim // 2)) 49 | return (h0, c0) 50 | # @profile 51 | def forward(self, sentence): 52 | embeds = self.word_embeddings(sentence) #64x200x300 53 | 54 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 55 | x=embeds.permute(1,0,2) #200x64x300 56 | self.hidden= self.init_hidden(sentence.size()[0]) #2x64x128 57 | lstm_out, self.hidden = self.bilstm(x, self.hidden) ###input (seq_len, batch, input_size) #Outupts:output, (h_n, c_n) output:(seq_len, batch, hidden_size * num_directions) 58 | #lstm_out 200x64x128 59 | 60 | c_lr = lstm_out.permute(1,0,2) #64x200x128 61 | xi = torch.cat((c_lr[:,:,0:int(c_lr.size()[2]/2)],embeds,c_lr[:,:,int(c_lr.size()[2]/2):]),2) #64x200x428 62 | yi = torch.tanh(xi.permute(0,2,1)) #64x428x200 63 | y = self.max_pooling(yi) #64x428x99 64 | y = y.permute(2,0,1) 65 | 66 | ##y = self.conv(lstm_out.permute(1,2,0)) ###64x256x1 67 | 68 | y = self.hidden2label(y[-1]) 69 | #y = self.hidden2label(y[:,-1,:].view(y[:,-1,:].size()[0],-1)) 70 | return y -------------------------------------------------------------------------------- /models/RNN_CNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | #from memory_profiler import profile 6 | from models.BaseModel import BaseModel 7 | class RNN_CNN(BaseModel): 8 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 9 | def __init__(self,opt): 10 | 11 | super(RNN_CNN, self).__init__(opt) 12 | self.hidden_dim = opt.hidden_dim 13 | self.batch_size = opt.batch_size 14 | self.use_gpu = torch.cuda.is_available() 15 | 16 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 17 | 
self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 18 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 19 | self.lstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim) 20 | ###self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 21 | self.hidden = self.init_hidden() 22 | 23 | self.content_dim = 256 24 | self.conv = nn.Conv1d(in_channels=opt.hidden_dim, out_channels=self.content_dim, kernel_size=opt.hidden_dim * 2, stride=opt.embedding_dim) 25 | self.hidden2label = nn.Linear(self.content_dim, opt.label_size) 26 | self.properties.update( 27 | {"content_dim":self.content_dim, 28 | }) 29 | 30 | def init_hidden(self,batch_size=None): 31 | if batch_size is None: 32 | batch_size= self.batch_size 33 | 34 | if self.use_gpu: 35 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 36 | c0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda()) 37 | else: 38 | h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim)) 39 | c0 = Variable(torch.zeros(1,batch_size, self.hidden_dim)) 40 | return (h0, c0) 41 | # @profile 42 | def forward(self, sentence): 43 | embeds = self.word_embeddings(sentence) #64x200x300 44 | 45 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 46 | x=embeds.permute(1,0,2) #200x64x300 47 | self.hidden= self.init_hidden(sentence.size()[0]) #1x64x128 48 | lstm_out, self.hidden = self.lstm(x, self.hidden) ###input (seq_len, batch, input_size) #Outupts:output, (h_n, c_n) output:(seq_len, batch, hidden_size * num_directions) 49 | #lstm_out 200x64x128 lstm_out.permute(1,2,0):64x128x200 50 | y = self.conv(lstm_out.permute(1,2,0)) ###64x256x1 51 | ###y = self.conv(lstm_out.permute(1,2,0).contiguous().view(self.batch_size,128,-1)) 52 | #y = self.hidden2label(y.view(sentence.size()[0],-1)) 53 | y = self.hidden2label(y.view(y.size()[0],-1)) #64x3 54 | return y -------------------------------------------------------------------------------- /models/SelfAttention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-# 2 | # https://arxiv.org/pdf/1703.03130.pdf 3 | # A Structured Self-attentive Sentence Embedding 4 | # https://github.com/nn116003/self-attention-classification/blob/master/imdb_attn.py 5 | 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch 9 | from torch.autograd import Variable 10 | #from memory_profiler import profile 11 | 12 | class SelfAttention(nn.Module): 13 | # embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu 14 | def __init__(self,opt): 15 | self.opt=opt 16 | super(SelfAttention, self).__init__() 17 | self.hidden_dim = opt.hidden_dim 18 | self.batch_size = opt.batch_size 19 | self.use_gpu = torch.cuda.is_available() 20 | 21 | self.word_embeddings = nn.Embedding(opt.vocab_size, opt.embedding_dim) 22 | self.word_embeddings.weight = nn.Parameter(opt.embeddings,requires_grad=opt.embedding_training) 23 | # self.word_embeddings.weight.data.copy_(torch.from_numpy(opt.embeddings)) 24 | 25 | self.num_layers = 1 26 | #self.bidirectional = True 27 | self.dropout = opt.keep_dropout 28 | self.bilstm = nn.LSTM(opt.embedding_dim, opt.hidden_dim // 2, num_layers=self.num_layers, dropout=self.dropout, bidirectional=True) 29 | self.hidden2label = nn.Linear(opt.hidden_dim, opt.label_size) 30 | self.hidden = self.init_hidden() 31 | self.self_attention = nn.Sequential( 32 | nn.Linear(opt.hidden_dim, 24), 33 | nn.ReLU(True), 34 | nn.Linear(24,1) 35 | ) 36 | def 
init_hidden(self,batch_size=None): 37 | if batch_size is None: 38 | batch_size= self.batch_size 39 | 40 | if self.use_gpu: 41 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 42 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2).cuda()) 43 | else: 44 | h0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 45 | c0 = Variable(torch.zeros(2*self.num_layers, batch_size, self.hidden_dim // 2)) 46 | return (h0, c0) 47 | # @profile 48 | def forward(self, sentence): 49 | embeds = self.word_embeddings(sentence) 50 | 51 | # x = embeds.view(sentence.size()[1], self.batch_size, -1) 52 | x=embeds.permute(1,0,2) 53 | self.hidden= self.init_hidden(sentence.size()[0]) #2x64x64 54 | lstm_out, self.hidden = self.bilstm(x, self.hidden) #lstm_out:200x64x128 55 | final =lstm_out.permute(1,0,2)#torch.mean(,1) 56 | attn_ene = self.self_attention(final) 57 | attns =F.softmax(attn_ene.view(self.batch_size, -1)) 58 | feats = (final * attns).sum(dim=1) 59 | y = self.hidden2label(feats) #64x3 60 | 61 | return y -------------------------------------------------------------------------------- /models/Transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ''' Define the Transformer model ''' 4 | import torch 5 | import torch.nn as nn 6 | import numpy as np 7 | import torch.nn.init as init 8 | 9 | 10 | 11 | __author__ = "Yu-Hsiang Huang" 12 | #refer to "https://github.com/jadore801120/attention-is-all-you-need-pytorch" 13 | 14 | class ConstantsClass(): 15 | def __init__(self): 16 | self.PAD = 0 17 | self.UNK = 1 18 | self.BOS = 2 19 | self.EOS = 3 20 | self.PAD_WORD = '' 21 | self.UNK_WORD = '' 22 | self.BOS_WORD = '' 23 | self.EOS_WORD = '' 24 | Constants =ConstantsClass() 25 | 26 | class Linear(nn.Module): 27 | ''' Simple Linear layer with xavier init ''' 28 | def __init__(self, d_in, d_out, bias=True): 29 | super(Linear, self).__init__() 30 | self.linear = nn.Linear(d_in, d_out, bias=bias) 31 | init.xavier_normal(self.linear.weight) 32 | 33 | def forward(self, x): 34 | return self.linear(x) 35 | 36 | class Bottle(nn.Module): 37 | ''' Perform the reshape routine before and after an operation ''' 38 | 39 | def forward(self, input): 40 | if len(input.size()) <= 2: 41 | return super(Bottle, self).forward(input) 42 | size = input.size()[:2] 43 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 44 | return out.view(size[0], size[1], -1) 45 | 46 | class BottleLinear(Bottle, Linear): 47 | ''' Perform the reshape routine before and after a linear projection ''' 48 | pass 49 | 50 | class BottleSoftmax(Bottle, nn.Softmax): 51 | ''' Perform the reshape routine before and after a softmax operation''' 52 | pass 53 | 54 | class LayerNormalization(nn.Module): 55 | ''' Layer normalization module ''' 56 | 57 | def __init__(self, d_hid, eps=1e-3): 58 | super(LayerNormalization, self).__init__() 59 | 60 | self.eps = eps 61 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 62 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 63 | 64 | def forward(self, z): 65 | if z.size(1) == 1: 66 | return z 67 | 68 | mu = torch.mean(z, keepdim=True, dim=-1) 69 | sigma = torch.std(z, keepdim=True, dim=-1) 70 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 71 | ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out) 72 | 73 | return ln_out 74 | 75 | class BatchBottle(nn.Module): 76 | ''' Perform the 
reshape routine before and after an operation ''' 77 | 78 | def forward(self, input): 79 | if len(input.size()) <= 2: 80 | return super(BatchBottle, self).forward(input) 81 | size = input.size()[1:] 82 | out = super(BatchBottle, self).forward(input.view(-1, size[0]*size[1])) 83 | return out.view(-1, size[0], size[1]) 84 | 85 | class BottleLayerNormalization(BatchBottle, LayerNormalization): 86 | ''' Perform the reshape routine before and after a layer normalization''' 87 | pass 88 | 89 | class ScaledDotProductAttention(nn.Module): 90 | ''' Scaled Dot-Product Attention ''' 91 | 92 | def __init__(self, d_model, attn_dropout=0.1): 93 | super(ScaledDotProductAttention, self).__init__() 94 | self.temper = np.power(d_model, 0.5) 95 | self.dropout = nn.Dropout(attn_dropout) 96 | self.softmax = BottleSoftmax() 97 | 98 | def forward(self, q, k, v, attn_mask=None): 99 | 100 | attn = torch.bmm(q, k.transpose(1, 2)) / self.temper 101 | 102 | if attn_mask is not None: 103 | 104 | assert attn_mask.size() == attn.size(), \ 105 | 'Attention mask shape {} mismatch ' \ 106 | 'with Attention logit tensor shape ' \ 107 | '{}.'.format(attn_mask.size(), attn.size()) 108 | 109 | attn.data.masked_fill_(attn_mask, -float('inf')) 110 | 111 | attn = self.softmax(attn) 112 | attn = self.dropout(attn) 113 | output = torch.bmm(attn, v) 114 | 115 | return output, attn 116 | class MultiHeadAttention(nn.Module): 117 | ''' Multi-Head Attention module ''' 118 | 119 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 120 | super(MultiHeadAttention, self).__init__() 121 | 122 | self.n_head = n_head 123 | self.d_k = d_k 124 | self.d_v = d_v 125 | 126 | self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k)) 127 | self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k)) 128 | self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v)) 129 | 130 | self.attention = ScaledDotProductAttention(d_model) 131 | self.layer_norm = LayerNormalization(d_model) 132 | self.proj = Linear(n_head*d_v, d_model) 133 | 134 | self.dropout = nn.Dropout(dropout) 135 | 136 | init.xavier_normal(self.w_qs) 137 | init.xavier_normal(self.w_ks) 138 | init.xavier_normal(self.w_vs) 139 | 140 | def forward(self, q, k, v, attn_mask=None): 141 | 142 | d_k, d_v = self.d_k, self.d_v 143 | n_head = self.n_head 144 | 145 | residual = q 146 | 147 | mb_size, len_q, d_model = q.size() 148 | mb_size, len_k, d_model = k.size() 149 | mb_size, len_v, d_model = v.size() 150 | 151 | # treat as a (n_head) size batch 152 | q_s = q.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_q) x d_model 153 | k_s = k.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_k) x d_model 154 | v_s = v.repeat(n_head, 1, 1).view(n_head, -1, d_model) # n_head x (mb_size*len_v) x d_model 155 | 156 | # treat the result as a (n_head * mb_size) size batch 157 | q_s = torch.bmm(q_s, self.w_qs).view(-1, len_q, d_k) # (n_head*mb_size) x len_q x d_k 158 | k_s = torch.bmm(k_s, self.w_ks).view(-1, len_k, d_k) # (n_head*mb_size) x len_k x d_k 159 | v_s = torch.bmm(v_s, self.w_vs).view(-1, len_v, d_v) # (n_head*mb_size) x len_v x d_v 160 | 161 | # perform attention, result size = (n_head * mb_size) x len_q x d_v 162 | outputs, attns = self.attention(q_s, k_s, v_s, attn_mask=attn_mask.repeat(n_head, 1, 1)) 163 | 164 | # back to original mb_size batch, result size = mb_size x len_q x (n_head*d_v) 165 | outputs = torch.cat(torch.split(outputs, mb_size, dim=0), dim=-1) 166 | 167 | # project back to residual size 168 | outputs = 
self.proj(outputs) 169 | outputs = self.dropout(outputs) 170 | 171 | return self.layer_norm(outputs + residual), attns 172 | 173 | class PositionwiseFeedForward(nn.Module): 174 | ''' A two-feed-forward-layer module ''' 175 | 176 | def __init__(self, d_hid, d_inner_hid, dropout=0.1): 177 | super(PositionwiseFeedForward, self).__init__() 178 | self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1) # position-wise 179 | self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1) # position-wise 180 | self.layer_norm = LayerNormalization(d_hid) 181 | self.dropout = nn.Dropout(dropout) 182 | self.relu = nn.ReLU() 183 | 184 | def forward(self, x): 185 | residual = x 186 | output = self.relu(self.w_1(x.transpose(1, 2))) 187 | output = self.w_2(output).transpose(2, 1) 188 | output = self.dropout(output) 189 | return self.layer_norm(output + residual) 190 | class EncoderLayer(nn.Module): 191 | ''' Compose with two layers ''' 192 | 193 | def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1): 194 | super(EncoderLayer, self).__init__() 195 | self.slf_attn = MultiHeadAttention( 196 | n_head, d_model, d_k, d_v, dropout=dropout) 197 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout) 198 | 199 | def forward(self, enc_input, slf_attn_mask=None): 200 | enc_output, enc_slf_attn = self.slf_attn( 201 | enc_input, enc_input, enc_input, attn_mask=slf_attn_mask) 202 | enc_output = self.pos_ffn(enc_output) 203 | return enc_output, enc_slf_attn 204 | 205 | class DecoderLayer(nn.Module): 206 | ''' Compose with three layers ''' 207 | 208 | def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1): 209 | super(DecoderLayer, self).__init__() 210 | self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 211 | self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 212 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout) 213 | 214 | def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None): 215 | dec_output, dec_slf_attn = self.slf_attn( 216 | dec_input, dec_input, dec_input, attn_mask=slf_attn_mask) 217 | dec_output, dec_enc_attn = self.enc_attn( 218 | dec_output, enc_output, enc_output, attn_mask=dec_enc_attn_mask) 219 | dec_output = self.pos_ffn(dec_output) 220 | 221 | return dec_output, dec_slf_attn, dec_enc_attn 222 | 223 | def position_encoding_init(n_position, d_pos_vec): 224 | ''' Init the sinusoid position encoding table ''' 225 | 226 | # keep dim 0 for padding token position encoding zero vector 227 | position_enc = np.array([ 228 | [pos / np.power(10000, 2 * (j // 2) / d_pos_vec) for j in range(d_pos_vec)] 229 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) 230 | 231 | position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i 232 | position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 233 | return torch.from_numpy(position_enc).type(torch.FloatTensor) 234 | 235 | def get_attn_padding_mask(seq_q, seq_k): 236 | ''' Indicate the padding-related part to mask ''' 237 | assert seq_q.dim() == 2 and seq_k.dim() == 2 238 | mb_size, len_q = seq_q.size() 239 | mb_size, len_k = seq_k.size() 240 | pad_attn_mask = seq_k.data.eq(Constants.PAD).unsqueeze(1) # bx1xsk 241 | pad_attn_mask = pad_attn_mask.expand(mb_size, len_q, len_k) # bxsqxsk 242 | return pad_attn_mask 243 | 244 | def get_attn_subsequent_mask(seq): 245 | ''' Get an attention mask to avoid using the subsequent info.''' 246 | assert seq.dim() == 2 247 | attn_shape = (seq.size(0), 
seq.size(1), seq.size(1)) 248 | subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') 249 | subsequent_mask = torch.from_numpy(subsequent_mask) 250 | if seq.is_cuda: 251 | subsequent_mask = subsequent_mask.cuda() 252 | return subsequent_mask 253 | 254 | class Encoder(nn.Module): 255 | ''' A encoder model with self attention mechanism. ''' 256 | 257 | def __init__( 258 | self, n_src_vocab, n_max_seq, n_layers=6, n_head=8, d_k=64, d_v=64, 259 | d_word_vec=512, d_model=512, d_inner_hid=1024, dropout=0.1): 260 | 261 | super(Encoder, self).__init__() 262 | 263 | n_position = n_max_seq + 1 264 | self.n_max_seq = n_max_seq 265 | self.d_model = d_model 266 | 267 | self.position_enc = nn.Embedding(n_position, d_word_vec, padding_idx=Constants.PAD) 268 | self.position_enc.weight.data = position_encoding_init(n_position, d_word_vec) 269 | 270 | self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=Constants.PAD) 271 | 272 | self.layer_stack = nn.ModuleList([ 273 | EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout=dropout) 274 | for _ in range(n_layers)]) 275 | 276 | def forward(self, src_seq, src_pos, return_attns=False): 277 | # Word embedding look up 278 | enc_input = self.src_word_emb(src_seq) 279 | 280 | # Position Encoding addition 281 | enc_input += self.position_enc(src_pos) 282 | if return_attns: 283 | enc_slf_attns = [] 284 | 285 | enc_output = enc_input 286 | enc_slf_attn_mask = get_attn_padding_mask(src_seq, src_seq) 287 | for enc_layer in self.layer_stack: 288 | enc_output, enc_slf_attn = enc_layer( 289 | enc_output, slf_attn_mask=enc_slf_attn_mask) 290 | if return_attns: 291 | enc_slf_attns += [enc_slf_attn] 292 | 293 | if return_attns: 294 | return enc_output, enc_slf_attns 295 | else: 296 | return enc_output 297 | 298 | class Decoder(nn.Module): 299 | ''' A decoder model with self attention mechanism. 
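Each DecoderLayer applies masked self-attention over the target sequence, attention over the encoder output, and a position-wise feed-forward sublayer.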
''' 300 | def __init__( 301 | self, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8, d_k=64, d_v=64, 302 | d_word_vec=512, d_model=512, d_inner_hid=1024, dropout=0.1): 303 | 304 | super(Decoder, self).__init__() 305 | n_position = n_max_seq + 1 306 | self.n_max_seq = n_max_seq 307 | self.d_model = d_model 308 | 309 | self.position_enc = nn.Embedding( 310 | n_position, d_word_vec, padding_idx=Constants.PAD) 311 | self.position_enc.weight.data = position_encoding_init(n_position, d_word_vec) 312 | 313 | self.tgt_word_emb = nn.Embedding( 314 | n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD) 315 | self.dropout = nn.Dropout(dropout) 316 | 317 | self.layer_stack = nn.ModuleList([ 318 | DecoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout=dropout) 319 | for _ in range(n_layers)]) 320 | 321 | def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False): 322 | # Word embedding look up 323 | dec_input = self.tgt_word_emb(tgt_seq) 324 | 325 | # Position Encoding addition 326 | dec_input += self.position_enc(tgt_pos) 327 | 328 | # Decode 329 | dec_slf_attn_pad_mask = get_attn_padding_mask(tgt_seq, tgt_seq) 330 | dec_slf_attn_sub_mask = get_attn_subsequent_mask(tgt_seq) 331 | dec_slf_attn_mask = torch.gt(dec_slf_attn_pad_mask + dec_slf_attn_sub_mask, 0) 332 | 333 | dec_enc_attn_pad_mask = get_attn_padding_mask(tgt_seq, src_seq) 334 | 335 | if return_attns: 336 | dec_slf_attns, dec_enc_attns = [], [] 337 | 338 | dec_output = dec_input 339 | for dec_layer in self.layer_stack: 340 | dec_output, dec_slf_attn, dec_enc_attn = dec_layer( 341 | dec_output, enc_output, 342 | slf_attn_mask=dec_slf_attn_mask, 343 | dec_enc_attn_mask=dec_enc_attn_pad_mask) 344 | 345 | if return_attns: 346 | dec_slf_attns += [dec_slf_attn] 347 | dec_enc_attns += [dec_enc_attn] 348 | 349 | if return_attns: 350 | return dec_output, dec_slf_attns, dec_enc_attns 351 | else: 352 | return dec_output, 353 | 354 | class Transformer(nn.Module): 355 | ''' A sequence to sequence model with attention mechanism. ''' 356 | 357 | def __init__( 358 | self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8, 359 | d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64, 360 | dropout=0.1, proj_share_weight=True, embs_share_weight=True): 361 | 362 | super(Transformer, self).__init__() 363 | self.encoder = Encoder( 364 | n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head, 365 | d_word_vec=d_word_vec, d_model=d_model, 366 | d_inner_hid=d_inner_hid, dropout=dropout) 367 | self.decoder = Decoder( 368 | n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head, 369 | d_word_vec=d_word_vec, d_model=d_model, 370 | d_inner_hid=d_inner_hid, dropout=dropout) 371 | self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False) 372 | self.dropout = nn.Dropout(dropout) 373 | 374 | assert d_model == d_word_vec, \ 375 | 'To facilitate the residual connections, \ 376 | the dimensions of all module output shall be the same.' 377 | 378 | if proj_share_weight: 379 | # Share the weight matrix between tgt word embedding/projection 380 | assert d_model == d_word_vec 381 | self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight 382 | 383 | if embs_share_weight: 384 | # Share the weight matrix between src/tgt word embeddings 385 | # assume the src/tgt word vec size are the same 386 | assert n_src_vocab == n_tgt_vocab, \ 387 | "To share word embedding table, the vocabulary size of src/tgt shall be the same." 
388 | self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight 389 | 390 | def get_trainable_parameters(self): 391 | ''' Avoid updating the position encoding ''' 392 | enc_freezed_param_ids = set(map(id, self.encoder.position_enc.parameters())) 393 | dec_freezed_param_ids = set(map(id, self.decoder.position_enc.parameters())) 394 | freezed_param_ids = enc_freezed_param_ids | dec_freezed_param_ids 395 | return (p for p in self.parameters() if id(p) not in freezed_param_ids) 396 | 397 | def forward(self, src, tgt): 398 | src_seq, src_pos = src 399 | tgt_seq, tgt_pos = tgt 400 | 401 | tgt_seq = tgt_seq[:, :-1] 402 | tgt_pos = tgt_pos[:, :-1] 403 | 404 | enc_output, _ = self.encoder(src_seq, src_pos) 405 | dec_output, _ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output) 406 | seq_logit = self.tgt_word_proj(dec_output) 407 | 408 | return seq_logit.view(-1, seq_logit.size(2)) 409 | 410 | class AttentionIsAllYouNeed(nn.Module): 411 | def __init__(self, opt, n_layers=6, n_head=8, 412 | d_word_vec=128, d_model=128, d_inner_hid=256, d_k=32, d_v=32, 413 | dropout=0.1, proj_share_weight=True, embs_share_weight=True): 414 | # self, opt, n_layers=6, n_head=8, d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64, 415 | 416 | super(AttentionIsAllYouNeed, self).__init__() 417 | self.encoder = Encoder( 418 | opt.vocab_size, opt.max_seq_len, n_layers=n_layers, n_head=n_head, 419 | d_word_vec=d_word_vec, d_model=d_model, 420 | d_inner_hid=d_inner_hid, dropout=dropout) 421 | self.hidden2label = nn.Linear(opt.max_seq_len*d_model, opt.label_size) 422 | self.batch_size=opt.batch_size 423 | def forward(self, inp): 424 | 425 | src_seq,src_pos = inp 426 | # enc_output, *_ = self.encoder(src_seq, src_pos) #64x200x512 427 | enc_output = self.encoder(src_seq, src_pos) #64x200x512 428 | return self.hidden2label(enc_output.view((self.batch_size,-1))) 429 | 430 | 431 | -------------------------------------------------------------------------------- /models/XLNetTransformer.py: -------------------------------------------------------------------------------- 1 | # This python code applies XLNetTransformer for classification of IMDB Dataset: 2 | 3 | 4 | # Importing Neccessary Libraries 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import accuracy_score 8 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.base import BaseEstimator, TransformerMixin 11 | import torch 12 | import re 13 | 14 | # Defining cleaning Text function 15 | def clean(text): 16 | for token in ["
", "
", "
"]: 17 | text = re.sub(token, " ", text) 18 | 19 | text = re.sub("[\s+\.\!\/_,$%^*()\(\)<>+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text) 20 | 21 | return text.lower() 22 | 23 | # Loading Dataset 24 | def load_imdb_dataset(data_path, nrows=100): 25 | df = pd.read_csv(data_path, nrows=nrows) # Limit to the first 100 rows 26 | texts = df['review'].apply(clean) 27 | labels = df['sentiment'] 28 | return texts, labels 29 | 30 | # Class for XLNET-Transformer 31 | class XLNetTransformer(BaseEstimator, TransformerMixin): 32 | def __init__(self, tokenizer, max_length=256): 33 | self.tokenizer = tokenizer 34 | self.max_length = max_length 35 | 36 | def fit(self, X, y=None): 37 | return self 38 | 39 | def transform(self, X): 40 | input_ids = [] 41 | for text in X: 42 | encoded_text = self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length, truncation=True) 43 | input_ids.append(encoded_text) 44 | return input_ids 45 | 46 | # Class for the classifier: 47 | class XLNetClassifier(BaseEstimator): 48 | def __init__(self, model): 49 | self.model = model 50 | 51 | def fit(self, X, y): 52 | # No training required for this example 53 | return self 54 | 55 | def predict(self, X): 56 | # Finding the maximum sequence length 57 | max_length = max(len(seq) for seq in X) 58 | 59 | # Pading the sequences to the maximum length 60 | padded_input_ids = [seq + [0] * (max_length - len(seq)) for seq in X] 61 | 62 | # Converting input to tensors 63 | input_ids = torch.tensor(padded_input_ids) 64 | 65 | # Moving input tensors to the appropriate device (GPU if available) 66 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 67 | input_ids = input_ids.to(device) 68 | 69 | # Moving the model to the appropriate device 70 | self.model.to(device) 71 | 72 | # Predicting logits 73 | with torch.no_grad(): 74 | logits = self.model(input_ids)[0] 75 | 76 | # Moveing logits back to CPU if necessary 77 | logits = logits.cpu() 78 | 79 | # Converting logits to class labels 80 | predicted_labels = torch.argmax(logits, dim=1).tolist() 81 | 82 | # Converting predicted labels to original label format 83 | label_map = {1: 'positive', 0: 'negative'} 84 | predicted_labels = [label_map[label] for label in predicted_labels] 85 | 86 | return predicted_labels 87 | 88 | def main(): 89 | data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv" 90 | texts, labels = load_imdb_dataset(data_path, nrows=1500) # Load only the top 100 rows 91 | train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42) 92 | 93 | # Initializing tokenizer and model 94 | tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') 95 | model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased') 96 | 97 | # Defining the pipeline for text classification 98 | pipeline = Pipeline([ 99 | ('transformer', XLNetTransformer(tokenizer, max_length=256)), 100 | ('clf', XLNetClassifier(model)), 101 | ]) 102 | 103 | # Trainig the classifier 104 | pipeline.fit(train_texts, train_labels) 105 | 106 | # Predicting on the test set 107 | predicted_labels = pipeline.predict(test_texts) 108 | 109 | # Calculating accuracy 110 | accuracy = accuracy_score(test_labels, predicted_labels) 111 | print("XLNet Accuracy:", accuracy) 112 | 113 | if __name__ == "__main__": 114 | main() 115 | 116 | 117 | #XLNet Accuracy: 0.5166666666666667 -------------------------------------------------------------------------------- /models/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | 9 | import numpy as np 10 | 11 | 12 | 13 | from models.LSTM import LSTMClassifier 14 | from models.CNNBasic import BasicCNN1D,BasicCNN2D 15 | from models.CNNKim import KIMCNN1D,KIMCNN2D 16 | from models.CNNMultiLayer import MultiLayerCNN 17 | from models.CNNInception import InceptionCNN 18 | from models.FastText import FastText 19 | from models.Capsule import CapsuleNet 20 | from models.RCNN import RCNN 21 | from models.RNN_CNN import RNN_CNN 22 | from models.LSTMBI import LSTMBI 23 | from models.Transformer import AttentionIsAllYouNeed 24 | from models.SelfAttention import SelfAttention 25 | from models.LSTMwithAttention import LSTMAttention 26 | from models.BERTFast import BERTFast 27 | def setup(opt): 28 | 29 | if opt.model == 'lstm': 30 | model = LSTMClassifier(opt) 31 | elif opt.model == 'basic_cnn' or opt.model == "cnn": 32 | model = BasicCNN1D(opt) 33 | elif opt.model == 'baisc_cnn_2d' : 34 | model = BasicCNN2D(opt) 35 | elif opt.model == 'kim_cnn' : 36 | model = KIMCNN1D(opt) 37 | elif opt.model == 'kim_cnn_2d': 38 | model = KIMCNN2D(opt) 39 | elif opt.model == 'multi_cnn': 40 | model = MultiLayerCNN(opt) 41 | elif opt.model == 'inception_cnn': 42 | model = InceptionCNN(opt) 43 | elif opt.model == 'fasttext': 44 | model = FastText(opt) 45 | elif opt.model == 'capsule': 46 | model = CapsuleNet(opt) 47 | elif opt.model == 'rnn_cnn': 48 | model = RNN_CNN(opt) 49 | elif opt.model == 'rcnn': 50 | model = RCNN(opt) 51 | elif opt.model == 'bilstm': 52 | model = LSTMBI(opt) 53 | elif opt.model == "transformer": 54 | model = AttentionIsAllYouNeed(opt) 55 | elif opt.model == "selfattention": 56 | model = SelfAttention(opt) 57 | elif opt.model == "lstm_attention": 58 | model =LSTMAttention(opt) 59 | elif opt.model == "bert": 60 | model =BERTFast(opt) 61 | else: 62 | raise Exception("model not supported: {}".format(opt.model)) 63 | return model 64 | -------------------------------------------------------------------------------- /models/ensemble_strategy.py: -------------------------------------------------------------------------------- 1 | # This file contains the ensemble methods like Random Forest, Gradient Boosting, AdaBoost Accuracy and Bagging Accuracy 2 | # This has been applied on the data of IMDB. 3 | # Following code would include whole process: 4 | 5 | 6 | 7 | # Importing Neccessary Modules: 8 | import re 9 | import pandas as pd 10 | import os 11 | import pickle 12 | import numpy as np 13 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 14 | from sklearn.pipeline import Pipeline 15 | from sklearn.ensemble import GradientBoostingClassifier 16 | from sklearn.model_selection import train_test_split 17 | from sklearn import metrics 18 | import pandas as pd 19 | from sklearn.model_selection import train_test_split 20 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 21 | from sklearn.pipeline import Pipeline 22 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier 23 | from sklearn.metrics import accuracy_score 24 | import re 25 | 26 | 27 | 28 | # Data Preprocessing Function: 29 | 30 | def clean(text): 31 | for token in ["
", "
", "+\"\[\]\-\?;:\'{}`]+|[+——!,。?、~@#¥%……&*()]+", " ", text) 35 | 36 | return text.lower() 37 | 38 | 39 | # Data Loading Function: 40 | 41 | def load_imdb_dataset(data_path): 42 | df = pd.read_csv(data_path) 43 | 44 | texts = df['review'].apply(clean) 45 | labels = df['sentiment'] 46 | 47 | 48 | 49 | # Main function to load data and implementation of ensemble Methods: 50 | 51 | 52 | def main(): 53 | data_path = "/content/drive/MyDrive/DATASETS/IMDB Dataset.csv" 54 | 55 | texts, labels = load_imdb_dataset(data_path) 56 | 57 | train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42) 58 | 59 | # Defining classifiers for each ensemble method 60 | classifiers = { 61 | "Random Forest": RandomForestClassifier(), 62 | "Gradient Boosting": GradientBoostingClassifier(), 63 | "AdaBoost": AdaBoostClassifier(), 64 | "Bagging": BaggingClassifier(), 65 | } 66 | 67 | 68 | for name, classifier in classifiers.items(): 69 | # Defining the pipeline for text classification 70 | pipeline = Pipeline([ 71 | ('vect', CountVectorizer()), 72 | ('tfidf', TfidfTransformer()), 73 | ('clf', classifier), 74 | ]) 75 | 76 | 77 | pipeline.fit(train_texts, train_labels) 78 | 79 | 80 | predicted_labels = pipeline.predict(test_texts) 81 | 82 | # Calculating accuracy 83 | accuracy = accuracy_score(test_labels, predicted_labels) 84 | print(f"{name} Accuracy:", accuracy) 85 | 86 | if __name__ == "__main__": 87 | main() 88 | 89 | 90 | 91 | # Random Forest Accuracy: 0.8457 92 | # Gradient Boosting Accuracy: 0.8162 93 | # AdaBoost Accuracy: 0.807 94 | # Bagging Accuracy: 0.7833 -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse,os,re 2 | import configparser 3 | 4 | class Params(object): 5 | def __init__(self): 6 | parser = argparse.ArgumentParser() 7 | # Data input settings 8 | 9 | parser.add_argument('--config', type=str, default="no_file_exists", 10 | help='gpu number') 11 | 12 | 13 | parser.add_argument('--hidden_dim', type=int, default=128, 14 | help='hidden_dim') 15 | 16 | parser.add_argument('--max_seq_len', type=int, default=200, 17 | help='max_seq_len') 18 | parser.add_argument('--batch_size', type=int, default=64, 19 | help='batch_size') 20 | parser.add_argument('--embedding_dim', type=int, default=-1, 21 | help='embedding_dim') 22 | parser.add_argument('--learning_rate', type=float, default=2e-5, 23 | help='learning_rate') 24 | parser.add_argument('--grad_clip', type=float, default=1e-1, 25 | help='grad_clip') 26 | 27 | parser.add_argument('--model', type=str, default="cnn", 28 | help='model name') 29 | 30 | parser.add_argument('--dataset', type=str, default="imdb", 31 | 32 | help='dataset') 33 | parser.add_argument('--position', type=bool, default=False, 34 | help='gpu number') 35 | 36 | parser.add_argument('--keep_dropout', type=float, default=0.8, 37 | help='keep_dropout') 38 | parser.add_argument('--max_epoch', type=int, default=20, 39 | help='max_epoch') 40 | parser.add_argument('--embedding_file', type=str, default="glove.6b.300", 41 | help='glove or w2v') 42 | parser.add_argument('--embedding_training', type=str, default="false", 43 | help='embedding_training') 44 | #kim CNN 45 | parser.add_argument('--kernel_sizes', type=str, default="1,2,3,5", 46 | help='kernel_sizes') 47 | parser.add_argument('--kernel_nums', type=str, default="256,256,256,256", 48 | help='kernel_nums') 49 | parser.add_argument('--embedding_type', 
type=str, default="non-static", 50 | help='embedding_type') 51 | parser.add_argument('--lstm_mean', type=str, default="mean",# last 52 | help='lstm_mean') 53 | parser.add_argument('--lstm_layers', type=int, default=1,# last 54 | help='lstm_layers') 55 | parser.add_argument('--gpu', type=int, default=0, 56 | help='gpu number') 57 | parser.add_argument('--proxy', type=str, default="null", 58 | help='http://proxy.xx.com:8080') 59 | parser.add_argument('--debug', type=str, default="true", 60 | help='gpu number') 61 | 62 | parser.add_argument('--embedding_dir', type=str, default=".glove/glove.6B.300d.txt", 63 | help='embedding_dir') 64 | 65 | parser.add_argument('--bert_dir', type=str, default="D:/dataset/bert/uncased_L-12_H-768_A-12", 66 | help='bert dir') 67 | parser.add_argument('--bert_trained', type=str, default="false", 68 | help='fine tune the bert or not') 69 | 70 | parser.add_argument('--from_torchtext', type=str, default="false", 71 | help='from torchtext or native data loader') 72 | # 73 | args = parser.parse_args() 74 | 75 | if args.config != "no_file_exists": 76 | if os.path.exists(args.config): 77 | config = configparser.ConfigParser() 78 | config_file_path=args.config 79 | config.read(config_file_path) 80 | config_common = config['COMMON'] 81 | for key in config_common.keys(): 82 | args.__dict__[key]=config_common[key] 83 | else: 84 | print("config file named %s does not exist" % args.config) 85 | 86 | # args.kernel_sizes = [int(i) for i in args.kernel_sizes.split(",")] 87 | # args.kernel_nums = [int(i) for i in args.kernel_nums.split(",")] 88 | # 89 | # # Check if args are valid 90 | # assert args.rnn_size > 0, "rnn_size should be greater than 0" 91 | 92 | if "CUDA_VISIBLE_DEVICES" not in os.environ.keys(): 93 | os.environ["CUDA_VISIBLE_DEVICES"] =str(args.gpu) 94 | 95 | if args.model=="transformer": 96 | args.position=True 97 | else: 98 | args.position=False 99 | 100 | # process the type for bool and list 101 | for arg in args.__dict__.keys(): 102 | if type(args.__dict__[arg])==str: 103 | if args.__dict__[arg].lower()=="true": 104 | args.__dict__[arg]=True 105 | elif args.__dict__[arg].lower()=="false": 106 | args.__dict__[arg]=False 107 | elif "," in args.__dict__[arg]: 108 | args.__dict__[arg]= [int(i) for i in args.__dict__[arg].split(",")] 109 | else: 110 | pass 111 | 112 | 113 | if os.path.exists("proxy.config"): 114 | with open("proxy.config") as f: 115 | 116 | args.proxy = f.read() 117 | print(args.proxy) 118 | 119 | return args 120 | 121 | def parse_config(self, config_file_path): 122 | config = configparser.ConfigParser() 123 | config.read(config_file_path) 124 | config_common = config['COMMON'] 125 | is_numberic = re.compile(r'^[-+]?[0-9.]+$') 126 | for key,value in config_common.items(): 127 | result = is_numberic.match(value) 128 | if result: 129 | if type(eval(value)) == int: 130 | value= int(value) 131 | else : 132 | value= float(value) 133 | 134 | self.__dict__.__setitem__(key,value) 135 | 136 | def export_to_config(self, config_file_path): 137 | config = configparser.ConfigParser() 138 | config['COMMON'] = {} 139 | config_common = config['COMMON'] 140 | for k,v in self.__dict__.items(): 141 | if not k == 'lookup_table': 142 | config_common[k] = str(v) 143 | 144 | with open(config_file_path, 'w') as configfile: 145 | config.write(configfile) 146 | 147 | def parseArgs(self): 148 | #required arguments: 149 | parser = argparse.ArgumentParser(description='running the complex embedding network') 150 | parser.add_argument('-config', action = 'store', dest = 
'config_file_path', help = 'The configuration file path.') 151 | args = parser.parse_args() 152 | self.parse_config(args.config_file_path) 153 | 154 | def setup(self,parameters): 155 | for k, v in parameters: 156 | self.__dict__.__setitem__(k,v) 157 | def get_parameter_list(self): 158 | info=[] 159 | for k, v in self.__dict__.items(): 160 | if k in ["validation_split","batch_size","dropout_rate","hidden_unit_num","hidden_unit_num_second","cell_type","contatenate","model"]: 161 | info.append("%s-%s"%(k,str(v))) 162 | return info 163 | 164 | def to_string(self): 165 | return "_".join(self.get_parameter_list()) 166 | 167 | 168 | def parse_opt(): 169 | 170 | parser = argparse.ArgumentParser() 171 | # Data input settings 172 | 173 | parser.add_argument('--config', type=str, default="no_file_exists", 174 | help='gpu number') 175 | 176 | 177 | parser.add_argument('--hidden_dim', type=int, default=128, 178 | help='hidden_dim') 179 | 180 | parser.add_argument('--max_seq_len', type=int, default=200, 181 | help='max_seq_len') 182 | parser.add_argument('--batch_size', type=int, default=64, 183 | help='batch_size') 184 | parser.add_argument('--embedding_dim', type=int, default=-1, 185 | help='embedding_dim') 186 | 187 | 188 | parser.add_argument('--learning_rate', type=float, default=2e-5, 189 | help='learning_rate') 190 | parser.add_argument('--lr_scheduler', type=str, default="none", 191 | help='lr_scheduler') 192 | parser.add_argument('--optimizer', type=str, default="adam", 193 | help='optimizer') 194 | parser.add_argument('--grad_clip', type=float, default=1e-1, 195 | help='grad_clip') 196 | 197 | parser.add_argument('--model', type=str, default="bilstm", 198 | help='model name') 199 | 200 | parser.add_argument('--dataset', type=str, default="imdb", 201 | 202 | help='dataset') 203 | parser.add_argument('--position', type=bool, default=False, 204 | help='gpu number') 205 | 206 | parser.add_argument('--keep_dropout', type=float, default=0.8, 207 | help='keep_dropout') 208 | parser.add_argument('--max_epoch', type=int, default=20, 209 | help='max_epoch') 210 | parser.add_argument('--embedding_file', type=str, default="glove.6b.300", 211 | help='glove or w2v') 212 | parser.add_argument('--embedding_training', type=str, default="false", 213 | help='embedding_training') 214 | #kim CNN 215 | parser.add_argument('--kernel_sizes', type=str, default="1,2,3,5", 216 | help='kernel_sizes') 217 | parser.add_argument('--kernel_nums', type=str, default="256,256,256,256", 218 | help='kernel_nums') 219 | parser.add_argument('--embedding_type', type=str, default="non-static", 220 | help='embedding_type') 221 | parser.add_argument('--lstm_mean', type=str, default="mean",# last 222 | help='lstm_mean') 223 | parser.add_argument('--lstm_layers', type=int, default=1,# last 224 | help='lstm_layers') 225 | parser.add_argument('--gpu', type=int, default=0, 226 | help='gpu number') 227 | parser.add_argument('--gpu_num', type=int, default=1, 228 | help='gpu number') 229 | parser.add_argument('--proxy', type=str, default="null", 230 | help='http://proxy.xx.com:8080') 231 | parser.add_argument('--debug', type=str, default="true", 232 | help='gpu number') 233 | parser.add_argument('--bidirectional', type=str, default="true", 234 | help='bidirectional') 235 | 236 | parser.add_argument('--embedding_dir', type=str, default=".glove/glove.6B.300d.txt", 237 | help='embedding_dir') 238 | 239 | parser.add_argument('--bert_dir', type=str, default="D:/dataset/bert/uncased_L-12_H-768_A-12", 240 | help='bert dir') 241 | 
help='bert dir') 241 |
parser.add_argument('--bert_trained', type=str, default="false", 242 | help='fine tune the bert or not') 243 | 244 | parser.add_argument('--from_torchtext', type=str, default="false", 245 | help='from torchtext or native data loader') 246 | # 247 | args = parser.parse_args() 248 | 249 | if args.config != "no_file_exists": 250 | if os.path.exists(args.config): 251 | config = configparser.ConfigParser() 252 | config_file_path=args.config 253 | config.read(config_file_path) 254 | config_common = config['COMMON'] 255 | for key in config_common.keys(): 256 | args.__dict__[key]=config_common[key] 257 | else: 258 | print("config file named %s does not exist" % args.config) 259 | 260 | # args.kernel_sizes = [int(i) for i in args.kernel_sizes.split(",")] 261 | # args.kernel_nums = [int(i) for i in args.kernel_nums.split(",")] 262 | # 263 | # # Check if args are valid 264 | # assert args.rnn_size > 0, "rnn_size should be greater than 0" 265 | 266 | if "CUDA_VISIBLE_DEVICES" not in os.environ.keys(): 267 | os.environ["CUDA_VISIBLE_DEVICES"] =str(args.gpu) 268 | 269 | if args.model=="transformer": 270 | args.position=True 271 | else: 272 | args.position=False 273 | 274 | # process the type for bool and list 275 | for arg in args.__dict__.keys(): 276 | if type(args.__dict__[arg])==str: 277 | if args.__dict__[arg].lower()=="true": 278 | args.__dict__[arg]=True 279 | elif args.__dict__[arg].lower()=="false": 280 | args.__dict__[arg]=False 281 | elif "," in args.__dict__[arg]: 282 | args.__dict__[arg]= [int(i) for i in args.__dict__[arg].split(",")] 283 | else: 284 | pass 285 | 286 | 287 | if os.path.exists("proxy.config"): 288 | with open("proxy.config") as f: 289 | 290 | args.proxy = f.read() 291 | print(args.proxy) 292 | 293 | return args -------------------------------------------------------------------------------- /push.bash: -------------------------------------------------------------------------------- 1 | git add *.py 2 | git add models/*.py 3 | git add dataloader/*.py 4 | git commit -m $1 5 | git pull 6 | git push 7 | 8 | -------------------------------------------------------------------------------- /search.sh: -------------------------------------------------------------------------------- 1 | echo "use gpu with multiple processes"; 2 | for((i=0;i<=8;i++)) 3 | do 4 | { 5 | echo "use gpu" +$i ; 6 | echo CUDA_VISIBLE_DEVICES=$i python parameter_search.py --gpu $i --config config/imdb.ini; 7 | CUDA_VISIBLE_DEVICES=$i python parameter_search.py --gpu $i --config config/imdb.ini; 8 | 9 | }& 10 | done 11 | wait -------------------------------------------------------------------------------- /trandition.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.feature_extraction.text import TfidfTransformer 7 | from sklearn.naive_bayes import MultinomialNB 8 | from sklearn.pipeline import Pipeline 9 | from sklearn.pipeline import make_pipeline 10 | from sklearn.linear_model import SGDClassifier 11 | from sklearn import metrics 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.model_selection import cross_val_score 14 | import numpy as np 15 | import opts 16 | import dataHelper 17 | #refer to "https://zhuanlan.zhihu.com/p/26729228" 18 | opt = opts.parse_opt() 19 | import dataHelper as helper 20 | train_iter, test_iter = 
dataHelper.loadData(opt,embedding=False) 21 | #categories = ['good', 'bad', 'mid'] 22 | x_train,y_train=train_iter 23 | x_test,y_test = test_iter 24 | 25 | #opt.model ="haha" 26 | if opt.model == "bayes": 27 | """ Naive Bayes classifier """ 28 | # sklearn有一套很成熟的管道流程Pipeline,快速搭建机器学习模型神器 29 | bayes_clf = Pipeline([('vect', CountVectorizer()), 30 | ('tfidf', TfidfTransformer()), 31 | ('clf', MultinomialNB()) 32 | ]) 33 | bayes_clf.fit(x_train, y_train) 34 | """ Predict the test dataset using Naive Bayes""" 35 | predicted = bayes_clf.predict(x_test) 36 | print('Naive Bayes correct prediction: {:4.4f}'.format(np.mean(predicted == y_test))) 37 | # 输出f1分数,准确率,召回率等指标 38 | # print(metrics.classification_report(y_test, predicted, target_names=categories)) 39 | elif opt.model == "svm": 40 | 41 | """ Support Vector Machine (SVM) classifier""" 42 | svm_clf = Pipeline([('vect', CountVectorizer()), 43 | ('tfidf', TfidfTransformer()), 44 | ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)), 45 | ]) 46 | svm_clf.fit(x_train, y_train) 47 | predicted = svm_clf.predict(x_test) 48 | print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test))) 49 | # print(metrics.classification_report(y_test, predicted, target_names=categories)) 50 | 51 | else: 52 | """ 10-折交叉验证 """ 53 | clf_b = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB()) 54 | clf_s= make_pipeline(CountVectorizer(), TfidfTransformer(), SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter= 5, random_state=42)) 55 | 56 | bayes_10_fold = cross_val_score(clf_b, x_test, y_test, cv=10) 57 | svm_10_fold = cross_val_score(clf_s, x_test, y_test, cv=10) 58 | 59 | print('Naives Bayes 10-fold correct prediction: {:4.4f}'.format(np.mean(bayes_10_fold))) 60 | print('SVM 10-fold correct prediction: {:4.4f}'.format(np.mean(svm_10_fold))) 61 | # 输出混淆矩阵 62 | #print("Confusion Matrix:") 63 | #print(metrics.confusion_matrix(y_test, predicted)) 64 | #print('\n') 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.nn.functional as F 4 | from torchtext import data 5 | from torchtext import datasets 6 | from torchtext.vocab import Vectors, GloVe, CharNGram, FastText 7 | import numpy as np 8 | from functools import wraps 9 | import time 10 | import sys 11 | import logging 12 | import os,configparser,re 13 | 14 | def log_time_delta(func): 15 | @wraps(func) 16 | def _deco(*args, **kwargs): 17 | start = time.time() 18 | ret = func(*args, **kwargs) 19 | end = time.time() 20 | delta = end - start 21 | print( "%s runed %.2f seconds"% (func.__name__,delta)) 22 | return ret 23 | return _deco 24 | 25 | def clip_gradient(optimizer, grad_clip): 26 | for group in optimizer.param_groups: 27 | for param in group['params']: 28 | if param.grad is not None and param.requires_grad: 29 | param.grad.data.clamp_(-grad_clip, grad_clip) 30 | 31 | 32 | def loadData(opt): 33 | if not opt.from_torchtext: 34 | import dataHelper as helper 35 | return helper.loadData(opt) 36 | device = 0 if torch.cuda.is_available() else -1 37 | 38 | TEXT = data.Field(lower=True, include_lengths=True, batch_first=True,fix_length=opt.max_seq_len) 39 | LABEL = data.Field(sequential=False) 40 | if opt.dataset=="imdb": 41 | train, test = datasets.IMDB.splits(TEXT, LABEL) 42 | elif opt.dataset=="sst": 43 | train, val, test = datasets.SST.splits( TEXT, 
LABEL, fine_grained=True, train_subtrees=True, 44 | filter_pred=lambda ex: ex.label != 'neutral') 45 | elif opt.dataset=="trec": 46 | train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True) 47 | else: 48 | print("does not support this datset") 49 | 50 | TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) 51 | LABEL.build_vocab(train) 52 | # print vocab information 53 | print('len(TEXT.vocab)', len(TEXT.vocab)) 54 | print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size()) 55 | 56 | train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=opt.batch_size,device=device,repeat=False,shuffle=True) 57 | 58 | opt.label_size= len(LABEL.vocab) 59 | opt.vocab_size = len(TEXT.vocab) 60 | opt.embedding_dim= TEXT.vocab.vectors.size()[1] 61 | opt.embeddings = TEXT.vocab.vectors 62 | 63 | return train_iter, test_iter 64 | 65 | 66 | def evaluation(model,test_iter,from_torchtext=True): 67 | model.eval() 68 | accuracy=[] 69 | # batch= next(iter(test_iter)) 70 | for index,batch in enumerate( test_iter): 71 | text = batch.text[0] if from_torchtext else batch.text 72 | predicted = model(text) 73 | prob, idx = torch.max(predicted, 1) 74 | percision=(idx== batch.label).float().mean() 75 | 76 | if torch.cuda.is_available(): 77 | accuracy.append(percision.data.item() ) 78 | else: 79 | accuracy.append(percision.data.numpy()[0] ) 80 | model.train() 81 | return np.mean(accuracy) 82 | 83 | 84 | 85 | def getOptimizer(params,name="adam",lr=1,momentum=None,scheduler=None): 86 | 87 | name = name.lower().strip() 88 | 89 | if name=="adadelta": 90 | optimizer=torch.optim.Adadelta(params, lr=1.0*lr, rho=0.9, eps=1e-06, weight_decay=0).param_groups() 91 | elif name == "adagrad": 92 | optimizer=torch.optim.Adagrad(params, lr=0.01*lr, lr_decay=0, weight_decay=0) 93 | elif name == "sparseadam": 94 | optimizer=torch.optim.SparseAdam(params, lr=0.001*lr, betas=(0.9, 0.999), eps=1e-08) 95 | elif name =="adamax": 96 | optimizer=torch.optim.Adamax(params, lr=0.002*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0) 97 | elif name =="asgd": 98 | optimizer=torch.optim.ASGD(params, lr=0.01*lr, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=0) 99 | elif name == "lbfgs": 100 | optimizer=torch.optim.LBFGS(params, lr=1*lr, max_iter=20, max_eval=None, tolerance_grad=1e-05, tolerance_change=1e-09, history_size=100, line_search_fn=None) 101 | elif name == "rmsprop": 102 | optimizer=torch.optim.RMSprop(params, lr=0.01*lr, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False) 103 | elif name =="rprop": 104 | optimizer=torch.optim.Rprop(params, lr=0.01*lr, etas=(0.5, 1.2), step_sizes=(1e-06, 50)) 105 | elif name =="sgd": 106 | optimizer=torch.optim.SGD(params, lr=0.1*lr, momentum=0, dampening=0, weight_decay=0, nesterov=False) 107 | elif name =="adam": 108 | optimizer=torch.optim.Adam(params, lr=0.1*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0) 109 | else: 110 | print("undefined optimizer, use adam in default") 111 | optimizer=torch.optim.Adam(params, lr=0.1*lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0) 112 | 113 | if scheduler is not None: 114 | if scheduler == "lambdalr": 115 | lambda1 = lambda epoch: epoch // 30 116 | lambda2 = lambda epoch: 0.95 ** epoch 117 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1, lambda2]) 118 | elif scheduler=="steplr": 119 | return torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1) 120 | elif scheduler =="multisteplr": 121 | return torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,80], 
gamma=0.1) 122 | elif scheduler =="reducelronplateau": 123 | return torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min') 124 | else: 125 | pass 126 | 127 | else: 128 | return optimizer 129 | return 130 | 131 | def get_lr_scheduler(name): 132 | # todo 133 | return None 134 | 135 | 136 | 137 | def getLogger(): 138 | import random 139 | random_str = str(random.randint(1,10000)) 140 | 141 | now = int(time.time()) 142 | timeArray = time.localtime(now) 143 | timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray) 144 | log_filename = "log/" +time.strftime("%Y%m%d", timeArray) 145 | 146 | program = os.path.basename(sys.argv[0]) 147 | logger = logging.getLogger(program) 148 | if not os.path.exists("log"): 149 | os.mkdir("log") 150 | if not os.path.exists(log_filename): 151 | os.mkdir(log_filename) 152 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',datefmt='%a, %d %b %Y %H:%M:%S',filename=log_filename+'/qa'+timeStamp+"_"+ random_str+'.log',filemode='w') 153 | logging.root.setLevel(level=logging.INFO) 154 | logger.info("running %s" % ' '.join(sys.argv)) 155 | 156 | return logger 157 | 158 | def parse_grid_parameters(file_path): 159 | config = configparser.ConfigParser() 160 | config.read(file_path) 161 | config_common = config['COMMON'] 162 | dictionary = {} 163 | for key,value in config_common.items(): 164 | array = value.split(';') 165 | is_numberic = re.compile(r'^[-+]?[0-9.]+$') 166 | new_array = [] 167 | 168 | for value in array: 169 | value = value.strip() 170 | result = is_numberic.match(value) 171 | if result: 172 | if type(eval(value)) == int: 173 | value= int(value) 174 | else : 175 | value= float(value) 176 | new_array.append(value) 177 | dictionary[key] = new_array 178 | return dictionary 179 | 180 | def is_writeable(path, check_parent=False): 181 | ''' 182 | Check if a given path is writeable by the current user. 183 | :param path: The path to check 184 | :param check_parent: If the path to check does not exist, check for the 185 | ability to write to the parent directory instead 186 | :returns: True or False 187 | ''' 188 | if os.access(path, os.F_OK) and os.access(path, os.W_OK): 189 | # The path exists and is writeable 190 | return True 191 | if os.access(path, os.F_OK) and not os.access(path, os.W_OK): 192 | # The path exists and is not writeable 193 | return False 194 | # The path does not exists or is not writeable 195 | if check_parent is False: 196 | # We're not allowed to check the parent directory of the provided path 197 | return False 198 | # Lets get the parent directory of the provided path 199 | parent_dir = os.path.dirname(path) 200 | if not os.access(parent_dir, os.F_OK): 201 | # Parent directory does not exit 202 | return False 203 | # Finally, return if we're allowed to write in the parent directory of the 204 | # provided path 205 | return os.access(parent_dir, os.W_OK) 206 | def is_readable(path): 207 | ''' 208 | Check if a given path is readable by the current user. 209 | :param path: The path to check 210 | :returns: True or False 211 | ''' 212 | if os.access(path, os.F_OK) and os.access(path, os.R_OK): 213 | # The path exists and is readable 214 | return True 215 | # The path does not exist 216 | return False 217 | 218 | --------------------------------------------------------------------------------
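main.py (listed in the repository tree but not reproduced in this dump) is the actual entry point. The following is only a minimal sketch of how the pieces shown above fit together, using the interfaces visible in opts.py, utils.py and models/__init__.py; the cross-entropy loss and the plain epoch loop are assumptions for illustration, not a copy of main.py:

import torch
import opts
import utils
import models

opt = opts.parse_opt()                           # parse CLI flags and optional .ini config
train_iter, test_iter = utils.loadData(opt)      # also fills opt.vocab_size, opt.label_size, opt.embeddings
model = models.setup(opt)                        # pick a model by opt.model ("cnn", "bilstm", "transformer", ...)
if torch.cuda.is_available():
    model.cuda()

criterion = torch.nn.CrossEntropyLoss()          # assumed loss; the models return raw logits from hidden2label
optimizer = utils.getOptimizer(model.parameters(), name=opt.optimizer, lr=opt.learning_rate)
# note: getOptimizer rescales the learning rate internally (e.g. 0.1*lr for adam)

for epoch in range(opt.max_epoch):
    for batch in train_iter:
        text = batch.text[0] if opt.from_torchtext else batch.text
        optimizer.zero_grad()
        logits = model(text)
        loss = criterion(logits, batch.label)
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)    # clamp gradients to +/- opt.grad_clip
        optimizer.step()
    acc = utils.evaluation(model, test_iter, opt.from_torchtext)
    print("epoch %d: test accuracy %.4f" % (epoch, acc))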