├── .gitattributes ├── .gitignore ├── LICENSE ├── MNIST ├── __init__.py ├── train_LSTM.py ├── train_MetaLSTM.py ├── train_MetaRNN.py ├── train_RNN.py ├── train_base_LSTM.py └── train_base_RNN.py ├── Modules ├── MetaNormLSTM.py ├── MetaRNNCells.py ├── MetaRNNs.py ├── NormLSTM.py ├── RNNCells.py ├── RNNs.py └── __init__.py ├── NER ├── Module │ ├── __init__.py │ ├── char.py │ ├── cove_encoder.py │ ├── crf.py │ ├── encoder.py │ └── ner.py ├── Tests │ ├── __init__.py │ ├── assist.py │ ├── cellulars │ │ ├── __init__.py │ │ ├── train_BioNLP13CG.py │ │ ├── train_BioNLP13PC.py │ │ ├── train_CRAFT.py │ │ └── train_jointCellulars.py │ ├── embeddings │ │ ├── __init__.py │ │ ├── train_cove_300d.py │ │ └── train_glove_300d.py │ ├── group │ │ ├── __init__.py │ │ ├── cc │ │ │ ├── __init__.py │ │ │ ├── train_BioNLP13CG-cc.py │ │ │ ├── train_BioNLP13PC-cc.py │ │ │ ├── train_CRAFT-cc.py │ │ │ └── train_joint-cc.py │ │ ├── cell │ │ │ ├── __init__.py │ │ │ ├── train_BioNLP13CG-cell.py │ │ │ ├── train_CRAFT-cell.py │ │ │ └── train_joint-cell.py │ │ ├── chem │ │ │ ├── __init__.py │ │ │ ├── train_BC4CHEMD.py │ │ │ ├── train_BC5CDR-chem.py │ │ │ ├── train_BioNLP11ID-chem.py │ │ │ ├── train_BioNLP13CG-chem.py │ │ │ ├── train_BioNLP13PC-chem.py │ │ │ ├── train_CRAFT-chem.py │ │ │ └── train_joint-chem.py │ │ ├── disease │ │ │ ├── __init__.py │ │ │ ├── train_BC5CDR-disease.py │ │ │ ├── train_NCBI-disease.py │ │ │ └── train_joint-disease.py │ │ └── species │ │ │ ├── __init__.py │ │ │ ├── train_BioNLP11ID-species.py │ │ │ ├── train_BioNLP13CG-species.py │ │ │ ├── train_CRAFT-species.py │ │ │ ├── train_joint-species.py │ │ │ └── train_linnaeus.py │ └── train_optimizers.py ├── __init__.py ├── train_BaseLSTM.py ├── train_BaseRNN.py ├── train_LSTM.py ├── train_MetaLSTM.py ├── train_MetaRNN.py ├── train_NormLSTM.py ├── train_RNN.py └── utils │ ├── __init__.py │ ├── alphabet.py │ ├── config.py │ ├── functions.py │ ├── helpers.py │ └── metric.py ├── README.md ├── images ├── base_LSTM_CoNLL-2003.PNG ├── 
base_LSTM_MNIST.PNG ├── base_RNN_CoNLL-2003.PNG ├── base_RNN_MNIST.PNG ├── catnlp_logo.png ├── cellulars │ ├── cellulars1.PNG │ ├── cellulars2.PNG │ └── cellulars3.PNG ├── embeddings │ └── embedding_glove_cove-300.PNG ├── meta_LSTM_CoNLL-2003.PNG ├── meta_RNN_LSTM_CoNLL-2003.PNG ├── meta_RNN_LSTM_MNIST.PNG └── optimizers │ ├── dev_optimizers.png │ ├── loss_optimizers.png │ ├── test_optimizers.png │ └── train_optimizers.png ├── requirements.txt ├── test_MetaRNNs.ipynb ├── test_RNNs.ipynb └── train_MultiMetaLSTM.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # Environments 82 | .env 83 | .venv 84 | env/ 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # mkdocs documentation 96 | /site 97 | 98 | # mypy 99 | .mypy_cache/ 100 | 101 | # pycharm 102 | .idea 103 | *.iml 104 | gen/ 105 | out/ 106 | 107 | # project 108 | data/ 109 | models/ 110 | .torch/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 cat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions 
of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MNIST/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/11 15:24 6 | ''' -------------------------------------------------------------------------------- /MNIST/train_LSTM.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/4/26 11:49 6 | ''' 7 | from Modules.RNNs import LSTM 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torchvision.datasets as dsets 12 | import torchvision.transforms as transforms 13 | from torch.autograd import Variable 14 | import numpy as np 15 | import visdom 16 | import math 17 | 18 | torch.manual_seed(100) 19 | 20 | # Hyper Parameters 21 | sequence_length = 28 22 | input_size = 28 23 | hidden_size = 128 24 | num_layers = 2 25 | num_classes = 10 26 | batch_size = 100 27 | num_epochs = 100 28 | learning_rate = 0.01 29 | 30 | # MNIST Dataset 31 | train_dataset = dsets.MNIST(root='../data/', train=True, transform=transforms.ToTensor(), download=True) 32 | test_dataset = dsets.MNIST(root='../data/', train=False, transform=transforms.ToTensor()) 33 | 34 | # Data Loader (Input Pipeline) 35 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, 
# RNN Model (Many-to-One): custom LSTM (Modules.RNNs.LSTM) over a 28x28 MNIST
# image treated as a length-28 sequence of 28-dim rows; the last time step is
# classified with a linear layer.
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, bias=True):
        """Build the recurrent encoder and the linear classifier head.

        input_size/hidden_size/num_layers configure the custom LSTM stack;
        num_classes sizes the output logits; bias toggles both modules' biases.
        """
        super(LSTMModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.rnn = LSTM(input_size, hidden_size, num_layers=num_layers, bias=bias)
        self.fc = nn.Linear(hidden_size, num_classes, bias=bias)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize every parameter uniformly in [-1/sqrt(H), 1/sqrt(H)]."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, x):
        """x: (batch, seq_len, input_size) -> logits (batch, num_classes)."""
        # out: (batch, seq_len, hidden) -- assumes the custom LSTM is
        # batch-first like its callers here use it; TODO confirm in Modules.RNNs.
        out, _ = self.rnn(x)
        out = out[:, -1, :]  # keep only the last time step
        # FIX: the original evaluated `out.view(-1, self.hidden_size)` without
        # binding the result -- a pure no-op that has been removed.
        out = self.fc(out)
        return out.view(-1, self.num_classes)
# Train the Model: Adam optimizer, visdom loss/accuracy curves, and a
# best-checkpoint save whenever the test accuracy improves.
def train(model, model_name, save_path):
    """Train `model` for num_epochs over train_loader.

    model_name -- prefix for the visdom window titles
    save_path  -- where the best (by test accuracy) state_dict is written
    """
    vis = visdom.Visdom()
    best_accuracy = 0
    losses = []
    accuracy_curve = []
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        model.train(True)
        for step, (images, labels) in enumerate(train_loader):
            images = Variable(images.view(-1, sequence_length, input_size))
            labels = Variable(labels)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            sample_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            if (step + 1) % 100 == 0:
                # draw the loss line.
                # FIX: the X-axis comprehension reused the loop variable name
                # `i`; under Python 2 comprehension variables leak and would
                # clobber the step counter. np.arange is also clearer.
                losses.append(sample_loss)
                vis.line(np.array(losses), X=np.arange(len(losses)),
                         win=model_name + '_loss',
                         opts={'title': model_name + '_loss', 'legend': ['loss']})
                print('Epoch [%d], Step [%d], Loss: %.4f' % (epoch + 1, step + 1, sample_loss))
        model.train(False)
        current_accuracy = evaluate(model)

        # draw the accuracy line
        accuracy_curve.append(current_accuracy)
        vis.line(np.array(accuracy_curve), X=np.arange(len(accuracy_curve)),
                 win=model_name + '_accuracy',
                 opts={'title': model_name + '_accuracy', 'legend': ['accuracy']})
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            torch.save(model.state_dict(), save_path)
        print('Best Accuracy of the model on the 10000 test images: %.2f %%' % best_accuracy)

train(model, 'LSTM', '../models/LSTM.pkl')
# RNN Model (Many-to-One): custom RNN (Modules.RNNs.RNN) over the image rows;
# the state returned by the stack feeds a linear classifier.
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, bias=True, grad_clip=None):
        """Build the recurrent stack (with optional gradient clipping) and the
        linear classifier head."""
        super(RNNModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.rnn = RNN(input_size, hidden_size, num_layers=num_layers,
                       bias=bias, grad_clip=grad_clip)
        self.fc = nn.Linear(hidden_size, num_classes, bias=bias)

    def forward(self, x):
        # One zero initial state per layer; the batch size comes from the input.
        init_states = [
            Variable(torch.zeros(x.size(0), self.hidden_size))
            for _ in range(self.num_layers)
        ]

        # forward propagate RNN and classify the returned state
        _, final_state = self.rnn(x, init_states)
        logits = self.fc(final_state)
        return logits.view(-1, self.num_classes)
# Train the Model
def train(model, model_name, save_path):
    """Run the full training loop, plotting loss/accuracy to visdom and
    checkpointing the best test-accuracy weights to save_path."""
    vis = visdom.Visdom()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    best_accuracy = 0
    losses, accuracy = [], []
    for epoch in range(num_epochs):
        model.train(True)
        for batch_idx, (images, labels) in enumerate(train_loader):
            images = Variable(images.view(-1, sequence_length, input_size))
            labels = Variable(labels)

            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            sample_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            if (batch_idx + 1) % 100 != 0:
                continue
            # draw the loss line
            losses.append(sample_loss)
            vis.line(np.array(losses), X=np.array(list(range(len(losses)))),
                     win=model_name + '_loss',
                     opts={'title': model_name + '_loss', 'legend': ['loss']})
            print('Epoch [%d], Step [%d], Loss: %.4f' % (epoch + 1, batch_idx + 1, sample_loss))
        model.train(False)
        current_accuracy = evaluate(model)

        # draw the accuracy line
        accuracy.append(current_accuracy)
        vis.line(np.array(accuracy), X=np.array(list(range(len(accuracy)))),
                 win=model_name + '_accuracy',
                 opts={'title': model_name + '_accuracy', 'legend': ['accuracy']})
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            torch.save(model.state_dict(), save_path)
        print('Best Accuracy of the model on the 10000 test images: %.2f %%' % best_accuracy)

train(model, 'RNN', '../models/RNN.pkl')
# RNN Model (Many-to-One): stock nn.LSTM baseline over the 28 image rows;
# the last time step's hidden state feeds a linear classifier.
class base_LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, bias=True):
        """input_size/hidden_size/num_layers configure nn.LSTM (batch_first);
        num_classes sizes the final linear layer."""
        super(base_LSTMModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=num_layers, bias=bias, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes, bias=bias)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize every parameter uniformly in [-1/sqrt(H), 1/sqrt(H)],
        overriding PyTorch's default LSTM init."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, x):
        """x: (batch, seq_len, input_size) -> logits (batch, num_classes)."""
        out, _ = self.rnn(x)   # (batch, seq_len, hidden) because batch_first=True
        out = out[:, -1, :]    # keep only the last time step
        # FIX: the original evaluated `out.view(-1, self.hidden_size)` without
        # binding the result -- a pure no-op that has been removed.
        out = self.fc(out)
        return out.view(-1, self.num_classes)
vis.line(np.array(losses), X=np.array([i for i in range(len(losses))]), 118 | win=model_name+'_loss', opts={'title': model_name+'_loss', 'legend': ['loss']}) 119 | print('Epoch [%d], Step [%d], Loss: %.4f' % (epoch+1, i+1, sample_loss)) 120 | model.train(False) 121 | current_accuracy = evaluate(model) 122 | 123 | # draw the accuracy line 124 | accuracy.append(current_accuracy) 125 | vis.line(np.array(accuracy), X=np.array([i for i in range(len(accuracy))]), 126 | win=model_name+'_accuracy', opts={'title': model_name+'_accuracy', 'legend': ['accuracy']}) 127 | if(current_accuracy > best_accuracy): 128 | best_accuracy = current_accuracy 129 | torch.save(model.state_dict(), save_path) 130 | print('Best Accuracy of the model on the 10000 test images: %.2f %%' % best_accuracy) 131 | 132 | train(base_model, 'base_LSTM', '../models/base_LSTM.pkl') -------------------------------------------------------------------------------- /MNIST/train_base_RNN.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/4/27 19:07 6 | ''' 7 | import torch 8 | import torch.nn as nn 9 | import torchvision.datasets as dsets 10 | import torchvision.transforms as transforms 11 | from torch.autograd import Variable 12 | import numpy as np 13 | import visdom 14 | import math 15 | 16 | torch.manual_seed(100) 17 | 18 | # Hyper Parameters 19 | sequence_length = 28 20 | input_size = 28 21 | hidden_size = 128 22 | num_layers = 2 23 | num_classes = 10 24 | batch_size = 100 25 | num_epochs = 100 26 | learning_rate = 0.01 27 | 28 | # MNIST Dataset 29 | train_dataset = dsets.MNIST(root='../data/', train=True, transform=transforms.ToTensor(), download=True) 30 | test_dataset = dsets.MNIST(root='../data/', train=False, transform=transforms.ToTensor()) 31 | 32 | # Data Loader (Input Pipeline) 33 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, 
# RNN Model (Many-to-One): stock nn.RNN baseline over the 28 image rows;
# the last time step's hidden state feeds a linear classifier.
class base_RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, bias=True):
        """input_size/hidden_size/num_layers configure nn.RNN (batch_first);
        num_classes sizes the final linear layer."""
        super(base_RNNModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, bias=bias, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes, bias=bias)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize every parameter uniformly in [-1/sqrt(H), 1/sqrt(H)],
        overriding PyTorch's default RNN init."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, x):
        """x: (batch, seq_len, input_size) -> logits (batch, num_classes)."""
        out, _ = self.rnn(x)   # (batch, seq_len, hidden) because batch_first=True
        out = out[:, -1, :]    # keep only the last time step
        # FIX: the original evaluated `out.view(-1, self.hidden_size)` without
        # binding the result -- a pure no-op that has been removed.
        out = self.fc(out)
        return out.view(-1, self.num_classes)
# Train the Model
def train(model, model_name, save_path):
    """Train for num_epochs, pushing loss/accuracy curves to visdom and
    saving the state_dict whenever the test accuracy sets a new best."""
    vis = visdom.Visdom()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    best_accuracy = 0
    losses = []
    accuracy = []
    for epoch in range(num_epochs):
        model.train(True)
        for step, batch in enumerate(train_loader):
            images, labels = batch
            images = Variable(images.view(-1, sequence_length, input_size))
            labels = Variable(labels)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            sample_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            if (step + 1) % 100 == 0:
                # draw the loss line
                losses.append(sample_loss)
                xs = np.array([k for k in range(len(losses))])
                vis.line(np.array(losses), X=xs,
                         win=model_name + '_loss',
                         opts={'title': model_name + '_loss', 'legend': ['loss']})
                print('Epoch [%d], Step [%d], Loss: %.4f' % (epoch + 1, step + 1, sample_loss))
        model.train(False)
        current_accuracy = evaluate(model)

        # draw the accuracy line
        accuracy.append(current_accuracy)
        xs = np.array([k for k in range(len(accuracy))])
        vis.line(np.array(accuracy), X=xs,
                 win=model_name + '_accuracy',
                 opts={'title': model_name + '_accuracy', 'legend': ['accuracy']})
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            torch.save(model.state_dict(), save_path)
        print('Best Accuracy of the model on the 10000 test images: %.2f %%' % best_accuracy)

train(base_model, 'base_RNN', '../models/base_RNN.pkl')
import math

import torch
from torch.nn import Module, Parameter
import torch.nn.functional as F


class RNNCellBase(Module):
    """Shared pretty-printing base for the hand-rolled recurrent cells."""

    def __repr__(self):
        # FIX: the parameter was misspelled `selfs` while the body used
        # `self`, so any repr() call raised NameError.
        s = '{name}({input_size}, {hidden_size}'
        if 'bias' in self.__dict__ and self.bias is not True:
            s += ', bias={bias}'
        if 'nonlinearity' in self.__dict__ and self.nonlinearity != 'tanh':
            s += ', nonlinearity={nonlinearity}'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)

'''
@Math: h' = relu(w_{ih}x + w_{hh}h + b)
(the original header advertised tanh, but the forward pass has always
applied ReLU; the doc is corrected to match the behavior)
'''
class RNNCell(RNNCellBase):
    def __init__(self, input_size, hidden_size, bias=True):
        """Single Elman-style cell with ReLU nonlinearity.

        bias=False registers `bias` as None instead of a Parameter.
        """
        super(RNNCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.weight_ih = Parameter(torch.Tensor(hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(hidden_size, hidden_size))
        if bias:
            self.bias = Parameter(torch.Tensor(hidden_size))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        """Uniform init in [-1/sqrt(H), 1/sqrt(H)] for every parameter."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, input, h):
        # FIX: the bias used to be added unconditionally, so bias=False
        # (self.bias is None) crashed; F.linear accepts a None bias.
        output = F.linear(input, self.weight_ih, self.bias) + F.linear(h, self.weight_hh)
        output = F.relu(output)

        return output

'''
i = sigmoid(W_{ii}x + W_{hi}h + b_i)
f = sigmoid(W_{if}x + W_{hf}h + b_f)
g = tanh(W_{ig}x + W_{hg}h + b_g)
o = sigmoid(W_{io}x + W_{ho}h + b_o)
c' = f * c + i * g
h' = o * tanh(c')
'''
class LSTMCell(RNNCellBase):
    def __init__(self, input_size, hidden_size, bias=True):
        """Single LSTM cell; the four gates are packed into one 4H weight."""
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size))
        if bias:
            self.bias = Parameter(torch.Tensor(4 * hidden_size))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        """Uniform init in [-1/sqrt(H), 1/sqrt(H)] for every parameter."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, input, hx):
        """hx is the (h, c) pair; returns the updated (h, c)."""
        h, c = hx

        # FIX: same None-bias crash as RNNCell -- fold the bias into F.linear.
        # Also use torch.sigmoid/torch.tanh instead of the deprecated
        # F.sigmoid/F.tanh functional aliases (identical math).
        pre = F.linear(input, self.weight_ih, self.bias) + F.linear(h, self.weight_hh)

        i = torch.sigmoid(pre[:, : self.hidden_size])
        f = torch.sigmoid(pre[:, self.hidden_size: self.hidden_size * 2])
        g = torch.tanh(pre[:, self.hidden_size * 2: self.hidden_size * 3])
        o = torch.sigmoid(pre[:, self.hidden_size * 3:])
        c = f * c + i * g
        h = o * torch.tanh(c)
        return h, c
def random_embedding(self, vocab_size, embedding_dim):
    """Build a (vocab_size, embedding_dim) float64 matrix whose rows are drawn
    uniformly from [-scale, scale), scale = sqrt(3 / embedding_dim) -- the
    usual variance-scaled init for embeddings."""
    scale = np.sqrt(3.0 / embedding_dim)
    # One uniform draw per row, in row order (keeps the RNG sequence
    # identical to a per-index loop under a fixed seed).
    rows = [np.random.uniform(-scale, scale, [1, embedding_dim])
            for _ in range(vocab_size)]
    if not rows:
        return np.empty([0, embedding_dim])
    return np.vstack(rows)
def forward(self, input, seq_lengths):
    """Char feature extractor entry point.

    Returns the full per-timestep hidden sequence (delegates to
    get_all_hiddens); callers wanting one vector per word use
    get_last_hiddens directly.
    """
    hidden_seq = self.get_all_hiddens(input, seq_lengths)
    return hidden_seq
return total_loss, tag_seq 31 | 32 | def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): 33 | # decode_seq = self.encoder(word_inputs, word_seq_lengths, mask) 34 | # return decode_seq 35 | outs = self.encoder.get_output_score(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 36 | scores, tag_seq = self.crf.viterbi_decode(outs, mask) 37 | return tag_seq 38 | 39 | def get_word_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 40 | return self.encoder.get_word_features(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 41 | -------------------------------------------------------------------------------- /NER/Tests/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/15 18:35 6 | ''' -------------------------------------------------------------------------------- /NER/Tests/assist.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/18 12:18 6 | ''' 7 | import os 8 | 9 | def addContent(target, file, tag): 10 | print(file) 11 | beginTag = '<' + tag + '>' 12 | endTag = '' 13 | with open(file) as src: 14 | lines = src.read() 15 | lines = lines.replace('\n\n', '\n'+endTag+'\tS-'+tag+'\n\n'+beginTag+'\tS-'+tag+'\n') 16 | lines = beginTag + '\tS-'+tag+'\n' + lines + endTag + '\tS-'+tag+'\n\n' 17 | target.write(lines) 18 | 19 | def make_jointCorpus(dataset, name, dirList): 20 | print('---make joint corpus---') 21 | if dataset[-1] == '/': 22 | dataset = dataset[0: -1] 23 | 24 | joint_dir = dataset + '/' + name 25 | if not os.path.exists(joint_dir): 26 | os.makedirs(joint_dir) 27 | 28 | trainF = joint_dir + '/train.tsv' 29 | develF = joint_dir + '/devel.tsv' 30 | 
testF = joint_dir + '/test.tsv' 31 | 32 | with open(trainF, 'w') as trainF, open(develF, 'w') as develF, open(testF, 'w') as testF: 33 | for dir in dirList: 34 | tag = dir 35 | dir = dataset + '/' + dir + '-IOBES' 36 | addContent(trainF, dir+'/train.tsv', tag) 37 | addContent(develF, dir+'/devel.tsv', tag) 38 | addContent(testF, dir+'/test.tsv', tag) 39 | 40 | if __name__ == "__main__": 41 | dataset = '../../data/group/species' 42 | name = 'joint-species' 43 | dirList = ['BioNLP11ID-species', 'BioNLP13CG-species', 'CRAFT-species', 'linnaeus'] 44 | make_jointCorpus(dataset, name, dirList) -------------------------------------------------------------------------------- /NER/Tests/cellulars/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/17 14:56 6 | ''' -------------------------------------------------------------------------------- /NER/Tests/cellulars/train_BioNLP13CG.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/17 14:57 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | 
parser.add_argument('--savemodel', default='../../../models/multiDatasets/cellulars/BioNLP13CG') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../models/multiDatasets/cellulars/BioNLP13CG.dset') # catnlp 31 | parser.add_argument('--train', default='../../../data/cellular/BioNLP13CG-IOBES/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../data/cellular/BioNLP13CG-IOBES/devel.tsv') # catnlp 33 | parser.add_argument('--test', default='../../../data/cellular/BioNLP13CG-IOBES/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char 
Embedding: ', char_emb_file) 83 | 84 | name = 'LSTM' # catnlp 85 | config = Config() 86 | config.layers = 2 87 | config.optim = 'Adam' 88 | config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 103 | if char_emb_file != 'none': 104 | print('load char emb file...norm: ', config.norm_char_emb) 105 | config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'BioNLP13CG' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! 
(train/test/decode)') 122 | -------------------------------------------------------------------------------- /NER/Tests/cellulars/train_BioNLP13PC.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/17 15:15 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '2' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../models/multiDatasets/cellulars/BioNLP13PC') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../models/multiDatasets/cellulars/BioNLP13PC.dset') # catnlp 31 | parser.add_argument('--train', default='../../../data/cellular/BioNLP13PC-IOBES/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../data/cellular/BioNLP13PC-IOBES/devel.tsv') # catnlp 33 | parser.add_argument('--test', default='../../../data/cellular/BioNLP13PC-IOBES/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = 
args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'LSTM' # catnlp 85 | config = Config() 86 | config.optim = 'Adam' 87 | config.char_features = 'CNN' 88 | config.lr = 0.015 89 | config.hidden_dim = 200 90 | config.bid_flag = True 91 | config.number_normalized = True 92 | data_initialization(config, train_file, dev_file, test_file) 93 | config.gpu = gpu 94 | config.word_features = name 95 | print('Word features: ', config.word_features) 96 | config.generate_instance(train_file, 'train') 97 | config.generate_instance(dev_file, 'dev') 98 | config.generate_instance(test_file, 'test') 99 | if emb_file: 100 | print('load word emb file...norm: ', config.norm_word_emb) 101 | config.build_word_pretain_emb(emb_file) 102 | if char_emb_file != 'none': 103 | print('load char emb file...norm: ', config.norm_char_emb) 104 | config.build_char_pretrain_emb(char_emb_file) 105 | 106 | name = 'BioNLP13PC' # catnlp 
107 | train(config, name, dset_dir, save_model_dir, seg) 108 | elif status == 'test': 109 | data = load_data_setting(dset_dir) 110 | data.generate_instance(dev_file, 'dev') 111 | load_model_decode(model_dir, data, 'dev', gpu, seg) 112 | data.generate_instance(test_file, 'test') 113 | load_model_decode(model_dir, data, 'test', gpu, seg) 114 | elif status == 'decode': 115 | data = load_data_setting(dset_dir) 116 | data.generate_instance(raw_file, 'raw') 117 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 118 | data.write_decoded_results(output_file, decode_results, 'raw') 119 | else: 120 | print('Invalid argument! Please use valid arguments! (train/test/decode)') 121 | -------------------------------------------------------------------------------- /NER/Tests/cellulars/train_CRAFT.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/17 15:21 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '2' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../models/multiDatasets/cellulars/CRAFT') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../models/multiDatasets/cellulars/CRAFT.dset') # 
catnlp 31 | parser.add_argument('--train', default='../../../data/cellular/CRAFT-IOBES/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../data/cellular/CRAFT-IOBES/devel.tsv') # catnlp 33 | parser.add_argument('--test', default='../../../data/cellular/CRAFT-IOBES/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'LSTM' # catnlp 85 | config = Config() 86 | config.optim = 'Adam' 87 | config.char_features = 'CNN' 88 | config.lr = 0.015 89 | config.hidden_dim = 200 90 | config.bid_flag = True 91 | config.number_normalized = True 92 | 
data_initialization(config, train_file, dev_file, test_file) 93 | config.gpu = gpu 94 | config.word_features = name 95 | print('Word features: ', config.word_features) 96 | config.generate_instance(train_file, 'train') 97 | config.generate_instance(dev_file, 'dev') 98 | config.generate_instance(test_file, 'test') 99 | if emb_file: 100 | print('load word emb file...norm: ', config.norm_word_emb) 101 | config.build_word_pretain_emb(emb_file) 102 | if char_emb_file != 'none': 103 | print('load char emb file...norm: ', config.norm_char_emb) 104 | config.build_char_pretrain_emb(char_emb_file) 105 | 106 | name = 'CRAFT' # catnlp 107 | train(config, name, dset_dir, save_model_dir, seg) 108 | elif status == 'test': 109 | data = load_data_setting(dset_dir) 110 | data.generate_instance(dev_file, 'dev') 111 | load_model_decode(model_dir, data, 'dev', gpu, seg) 112 | data.generate_instance(test_file, 'test') 113 | load_model_decode(model_dir, data, 'test', gpu, seg) 114 | elif status == 'decode': 115 | data = load_data_setting(dset_dir) 116 | data.generate_instance(raw_file, 'raw') 117 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 118 | data.write_decoded_results(output_file, decode_results, 'raw') 119 | else: 120 | print('Invalid argument! Please use valid arguments! 
(train/test/decode)') 121 | -------------------------------------------------------------------------------- /NER/Tests/cellulars/train_jointCellulars.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/18 12:05 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../models/multiDatasets/cellulars/jointCellulars') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../models/multiDatasets/cellulars/jointCellulars.dset') # catnlp 31 | parser.add_argument('--train', default='../../../data/cellular/jointCellulars/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../data/cellular/jointCellulars/devel.tsv') # catnlp 33 | parser.add_argument('--test', default='../../../data/cellular/jointCellulars/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = 
args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'LSTM' # catnlp 85 | config = Config() 86 | config.optim = 'Adam' 87 | config.char_features = 'CNN' 88 | config.lr = 0.015 89 | config.hidden_dim = 200 90 | config.bid_flag = True 91 | config.number_normalized = True 92 | data_initialization(config, train_file, dev_file, test_file) 93 | config.gpu = gpu 94 | config.word_features = name 95 | print('Word features: ', config.word_features) 96 | config.generate_instance(train_file, 'train') 97 | config.generate_instance(dev_file, 'dev') 98 | config.generate_instance(test_file, 'test') 99 | if emb_file: 100 | print('load word emb file...norm: ', config.norm_word_emb) 101 | config.build_word_pretain_emb(emb_file) 102 | if char_emb_file != 'none': 103 | print('load char emb file...norm: ', config.norm_char_emb) 104 | config.build_char_pretrain_emb(char_emb_file) 105 | 106 | name = 'jointCellulars' # 
catnlp 107 | ignore = True 108 | train(config, name, dset_dir, save_model_dir, seg, ignore) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! (train/test/decode)') -------------------------------------------------------------------------------- /NER/Tests/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/19 19:33 6 | ''' -------------------------------------------------------------------------------- /NER/Tests/embeddings/train_cove_300d.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/21 12:50 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove300d') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | 
parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../models/multiEmbeddings/cove300d') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../models/multiEmbeddings/cove300d.dset') # catnlp 31 | parser.add_argument('--train', default='../../../data/conll2003/train.bmes') # catnlp 32 | parser.add_argument('--dev', default='../../../data/conll2003/dev.bmes') # catnlp 33 | parser.add_argument('--test', default='../../../data/conll2003/test.bmes') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../data/embedding/glove.6B.100d.txt' 79 | elif emb == 'glove300d': 80 | emb_file = 
'../../../data/embedding/glove.840B.300d.txt' 81 | else: 82 | emb_file = None 83 | char_emb_file = args.charemb.lower() 84 | print('Char Embedding: ', char_emb_file) 85 | 86 | name = 'LSTM' # catnlp 87 | config = Config() 88 | config.layers = 2 89 | config.optim = 'Adam' 90 | config.char_features = 'CNN' 91 | config.word_emb_dim = 300 92 | config.hidden_dim = 600 93 | config.bid_flag = True 94 | config.number_normalized = True 95 | data_initialization(config, train_file, dev_file, test_file) 96 | config.gpu = gpu 97 | config.word_features = name 98 | print('Word features: ', config.word_features) 99 | config.generate_instance(train_file, 'train') 100 | config.generate_instance(dev_file, 'dev') 101 | config.generate_instance(test_file, 'test') 102 | if emb_file: 103 | print('load word emb file...norm: ', config.norm_word_emb) 104 | config.build_word_pretain_emb(emb_file) 105 | if char_emb_file != 'none': 106 | print('load char emb file...norm: ', config.norm_char_emb) 107 | config.build_char_pretrain_emb(char_emb_file) 108 | 109 | name = 'cove300d' # catnlp 110 | train(config, name, dset_dir, save_model_dir, seg, cove_flag=True) 111 | elif status == 'test': 112 | data = load_data_setting(dset_dir) 113 | data.generate_instance(dev_file, 'dev') 114 | load_model_decode(model_dir, data, 'dev', gpu, seg) 115 | data.generate_instance(test_file, 'test') 116 | load_model_decode(model_dir, data, 'test', gpu, seg) 117 | elif status == 'decode': 118 | data = load_data_setting(dset_dir) 119 | data.generate_instance(raw_file, 'raw') 120 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 121 | data.write_decoded_results(output_file, decode_results, 'raw') 122 | else: 123 | print('Invalid argument! Please use valid arguments! 
(train/test/decode)') 124 | -------------------------------------------------------------------------------- /NER/Tests/embeddings/train_glove_300d.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/19 19:33 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove300d') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../models/multiEmbeddings/glove300d') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../models/multiEmbeddings/glove300d.dset') # catnlp 31 | parser.add_argument('--train', default='../../../data/conll2003/train.bmes') # catnlp 32 | parser.add_argument('--dev', default='../../../data/conll2003/dev.bmes') # catnlp 33 | parser.add_argument('--test', default='../../../data/conll2003/test.bmes') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | 
raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../data/embedding/glove.6B.100d.txt' 79 | elif emb == 'glove300d': 80 | emb_file = '../../../data/embedding/glove.840B.300d.txt' 81 | else: 82 | emb_file = None 83 | char_emb_file = args.charemb.lower() 84 | print('Char Embedding: ', char_emb_file) 85 | 86 | name = 'LSTM' # catnlp 87 | config = Config() 88 | config.layers = 2 89 | config.optim = 'Adam' 90 | config.char_features = 'CNN' 91 | config.word_emb_dim = 300 92 | config.hidden_dim = 600 93 | config.bid_flag = True 94 | config.number_normalized = True 95 | data_initialization(config, train_file, dev_file, test_file) 96 | config.gpu = gpu 97 | config.word_features = name 98 | print('Word features: ', config.word_features) 99 | config.generate_instance(train_file, 'train') 100 | config.generate_instance(dev_file, 'dev') 101 | config.generate_instance(test_file, 'test') 102 | if emb_file: 103 | print('load word emb file...norm: ', config.norm_word_emb) 104 | config.build_word_pretain_emb(emb_file) 105 | if char_emb_file != 'none': 106 | print('load char emb file...norm: ', config.norm_char_emb) 107 | 
# encoding:utf-8
'''
Training/evaluation entry point for the BioNLP13CG-cc NER dataset
(cellular-component group) using the BaseLSTM word-level model.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 16:53
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG (python, torch, numpy) up front so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/cc/BioNLP13CG-cc')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/cc/BioNLP13CG-cc.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/cc/BioNLP13CG-cc-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/cc/BioNLP13CG-cc-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/cc/BioNLP13CG-cc-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # String-valued flags: only the exact (case-insensitive) word 'true' enables seg.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # --gpu 'false' forces CPU; anything else defers to actual CUDA availability.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lower-cased the whole path (args.charemb.lower()),
        # which breaks case-sensitive filesystem paths. Keep the path as given and
        # case-fold only the 'none' sentinel comparison below.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            # NOTE: 'pretain' is the (misspelled) name of the Config API method.
            config.build_word_pretain_emb(emb_file)
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BioNLP13CG-cc'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'BaseLSTM' # catnlp 85 | config = Config() 86 | config.layers = 2 87 | config.optim = 'Adam' 88 | config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 
# encoding:utf-8
'''
Training/evaluation entry point for the CRAFT-cc NER dataset
(cellular-component group) using the BaseLSTM word-level model.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 17:00
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG (python, torch, numpy) up front so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/cc/CRAFT-cc')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/cc/CRAFT-cc.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/cc/CRAFT-cc-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/cc/CRAFT-cc-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/cc/CRAFT-cc-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # String-valued flags: only the exact (case-insensitive) word 'true' enables seg.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # --gpu 'false' forces CPU; anything else defers to actual CUDA availability.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lower-cased the whole path (args.charemb.lower()),
        # which breaks case-sensitive filesystem paths. Keep the path as given and
        # case-fold only the 'none' sentinel comparison below.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            # NOTE: 'pretain' is the (misspelled) name of the Config API method.
            config.build_word_pretain_emb(emb_file)
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'CRAFT-cc'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'BaseLSTM' # catnlp 85 | config = Config() 86 | config.layers = 2 87 | config.optim = 'Adam' 88 | config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 103 | if char_emb_file != 'none': 104 | print('load char emb file...norm: ', config.norm_char_emb) 105 | config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'joint-cc' # catnlp 
# encoding:utf-8
'''
Training/evaluation entry point for the BioNLP13CG-cell NER dataset
(cell group) using the BaseLSTM word-level model.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 18:59
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG (python, torch, numpy) up front so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/cell/BioNLP13CG-cell')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/cell/BioNLP13CG-cell.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/cell/BioNLP13CG-cell-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/cell/BioNLP13CG-cell-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/cell/BioNLP13CG-cell-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # String-valued flags: only the exact (case-insensitive) word 'true' enables seg.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # --gpu 'false' forces CPU; anything else defers to actual CUDA availability.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lower-cased the whole path (args.charemb.lower()),
        # which breaks case-sensitive filesystem paths. Keep the path as given and
        # case-fold only the 'none' sentinel comparison below.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            # NOTE: 'pretain' is the (misspelled) name of the Config API method.
            config.build_word_pretain_emb(emb_file)
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BioNLP13CG-cell'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'BaseLSTM' # catnlp 85 | config = Config() 86 | config.layers = 2 87 | config.optim = 'Adam' 88 | config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 103 | if char_emb_file != 'none': 104 | print('load char emb file...norm: ', config.norm_char_emb) 105 | 
config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'CRAFT-cell' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! (train/test/decode)') -------------------------------------------------------------------------------- /NER/Tests/group/cell/train_joint-cell.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/21 19:01 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/cell/joint-cell') # catnlp 30 | parser.add_argument('--savedset', 
help='Dir of saved data setting', default='../../../../models/multiDatasets/group/cell/joint-cell.dset') # catnlp 31 | parser.add_argument('--train', default='../../../../data/group/cell/joint-cell/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../../data/group/cell/joint-cell/devel.tsv') # catnlp 33 | parser.add_argument('--test', default='../../../../data/group/cell/joint-cell/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'BaseLSTM' # catnlp 85 | config = Config() 86 | config.layers = 2 87 | config.optim = 'Adam' 88 | 
config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 103 | if char_emb_file != 'none': 104 | print('load char emb file...norm: ', config.norm_char_emb) 105 | config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'joint-cell' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg, ignore=True) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! 
# encoding:utf-8
'''
Train/evaluate a BaseLSTM NER model on the BC4CHEMD (chemical) dataset.

Run with --status train|test|decode; data, model and data-setting (.dset)
paths are configurable through the CLI flags below.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:10
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG source so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin to the first GPU

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/chem/BC4CHEMD')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/chem/BC4CHEMD.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/chem/BC4CHEMD-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/chem/BC4CHEMD-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/chem/BC4CHEMD-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # Flags are string-valued ('True'/'False') to keep the original CLI contract.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # Use the GPU only when requested AND actually available.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lowercased the whole path (args.charemb.lower()),
        # which breaks on case-sensitive file systems; only the 'None' sentinel
        # comparison should be case-insensitive.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True  # bidirectional encoder
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE: 'pretain' typo is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BC4CHEMD-chem'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse `choices` already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
Train/evaluate a BaseLSTM NER model on the BC5CDR-chem (chemical) dataset.

Run with --status train|test|decode; data, model and data-setting (.dset)
paths are configurable through the CLI flags below.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:11
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG source so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin to the first GPU

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/chem/BC5CDR-chem')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/chem/BC5CDR-chem.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/chem/BC5CDR-chem-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/chem/BC5CDR-chem-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/chem/BC5CDR-chem-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # Flags are string-valued ('True'/'False') to keep the original CLI contract.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # Use the GPU only when requested AND actually available.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lowercased the whole path (args.charemb.lower()),
        # which breaks on case-sensitive file systems; only the 'None' sentinel
        # comparison should be case-insensitive.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True  # bidirectional encoder
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE: 'pretain' typo is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BC5CDR-chem'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse `choices` already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
Train/evaluate a BaseLSTM NER model on the BioNLP11ID-chem (chemical) dataset.

Run with --status train|test|decode; data, model and data-setting (.dset)
paths are configurable through the CLI flags below.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:12
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG source so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin to the first GPU

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/chem/BioNLP11ID-chem')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/chem/BioNLP11ID-chem.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/chem/BioNLP11ID-chem-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/chem/BioNLP11ID-chem-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/chem/BioNLP11ID-chem-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # Flags are string-valued ('True'/'False') to keep the original CLI contract.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # Use the GPU only when requested AND actually available.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lowercased the whole path (args.charemb.lower()),
        # which breaks on case-sensitive file systems; only the 'None' sentinel
        # comparison should be case-insensitive.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True  # bidirectional encoder
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE: 'pretain' typo is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BioNLP11ID-chem'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse `choices` already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
Train/evaluate a BaseLSTM NER model on the BioNLP13CG-chem (chemical) dataset.

Run with --status train|test|decode; data, model and data-setting (.dset)
paths are configurable through the CLI flags below.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:12
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG source so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin to the first GPU

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/chem/BioNLP13CG-chem')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/chem/BioNLP13CG-chem.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/chem/BioNLP13CG-chem-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/chem/BioNLP13CG-chem-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/chem/BioNLP13CG-chem-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # Flags are string-valued ('True'/'False') to keep the original CLI contract.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # Use the GPU only when requested AND actually available.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lowercased the whole path (args.charemb.lower()),
        # which breaks on case-sensitive file systems; only the 'None' sentinel
        # comparison should be case-insensitive.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True  # bidirectional encoder
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE: 'pretain' typo is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BioNLP13CG-chem'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse `choices` already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
Train/evaluate a BaseLSTM NER model on the BioNLP13PC-chem (chemical) dataset.

Run with --status train|test|decode; data, model and data-setting (.dset)
paths are configurable through the CLI flags below.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:13
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG source so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin to the first GPU

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/chem/BioNLP13PC-chem')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/chem/BioNLP13PC-chem.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/chem/BioNLP13PC-chem-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/chem/BioNLP13PC-chem-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/chem/BioNLP13PC-chem-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # Flags are string-valued ('True'/'False') to keep the original CLI contract.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # Use the GPU only when requested AND actually available.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lowercased the whole path (args.charemb.lower()),
        # which breaks on case-sensitive file systems; only the 'None' sentinel
        # comparison should be case-insensitive.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True  # bidirectional encoder
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE: 'pretain' typo is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BioNLP13PC-chem'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse `choices` already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
Train/evaluate a BaseLSTM NER model on the CRAFT-chem (chemical) dataset.

Run with --status train|test|decode; data, model and data-setting (.dset)
paths are configurable through the CLI flags below.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:13
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG source so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin to the first GPU

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/chem/CRAFT-chem')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/chem/CRAFT-chem.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/chem/CRAFT-chem-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/chem/CRAFT-chem-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/chem/CRAFT-chem-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # Flags are string-valued ('True'/'False') to keep the original CLI contract.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # Use the GPU only when requested AND actually available.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lowercased the whole path (args.charemb.lower()),
        # which breaks on case-sensitive file systems; only the 'None' sentinel
        # comparison should be case-insensitive.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True  # bidirectional encoder
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE: 'pretain' typo is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'CRAFT-chem'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse `choices` already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
Train/evaluate a BaseLSTM NER model on the joint chemical dataset (the union
of the individual chem corpora).

Run with --status train|test|decode; data, model and data-setting (.dset)
paths are configurable through the CLI flags below.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:14
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG source so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin to the first GPU

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/chem/joint-chem')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/chem/joint-chem.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/chem/joint-chem/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/chem/joint-chem/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/chem/joint-chem/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # Flags are string-valued ('True'/'False') to keep the original CLI contract.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()

    save_model_dir = args.savemodel
    # Use the GPU only when requested AND actually available.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lowercased the whole path (args.charemb.lower()),
        # which breaks on case-sensitive file systems; only the 'None' sentinel
        # comparison should be case-insensitive.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True  # bidirectional encoder
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE: 'pretain' typo is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'joint-chem'  # catnlp
        # ignore=True: joint corpus mixes label sets, so unmatched tags are skipped.
        train(config, name, dset_dir, save_model_dir, seg, ignore=True)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse `choices` already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
(train/test/decode)') -------------------------------------------------------------------------------- /NER/Tests/group/disease/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/21 16:52 6 | ''' -------------------------------------------------------------------------------- /NER/Tests/group/disease/train_BC5CDR-disease.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/21 19:29 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/disease/BC5CDR-disease') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/disease/BC5CDR-disease.dset') # catnlp 31 | parser.add_argument('--train', default='../../../../data/group/disease/BC5CDR-disease-IOBES/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../../data/group/disease/BC5CDR-disease-IOBES/devel.tsv') # catnlp 33 | parser.add_argument('--test', 
default='../../../../data/group/disease/BC5CDR-disease-IOBES/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'BaseLSTM' # catnlp 85 | config = Config() 86 | config.layers = 2 87 | config.optim = 'Adam' 88 | config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | 
config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 103 | if char_emb_file != 'none': 104 | print('load char emb file...norm: ', config.norm_char_emb) 105 | config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'BC5CDR-disease' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! 
# ---- NER/Tests/group/disease/train_NCBI-disease.py ----
# encoding:utf-8
'''
Train/test/decode a BaseLSTM NER model on the NCBI-disease corpus.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:29
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG so experiment runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/disease/NCBI-disease') # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/disease/NCBI-disease.dset') # catnlp
    parser.add_argument('--train', default='../../../../data/group/disease/NCBI-disease-IOBES/train.tsv') # catnlp
    parser.add_argument('--dev', default='../../../../data/group/disease/NCBI-disease-IOBES/devel.tsv') # catnlp
    parser.add_argument('--test', default='../../../../data/group/disease/NCBI-disease-IOBES/test.tsv') # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # String flags: any casing of 'true' enables the option.
    if args.seg.lower() == 'true':
        seg = True
    else:
        seg = False
    status = args.status.lower()

    save_model_dir = args.savemodel
    # '--gpu False' forces CPU; otherwise CUDA is used only when present.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lower-cased the whole --charemb value, which
        # corrupts any path containing upper-case characters.  Keep the path
        # as given; lower-case only for the 'None' sentinel comparison below.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM' # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE(review): 'pretain' spelling is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'NCBI-disease' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! (train/test/decode)') -------------------------------------------------------------------------------- /NER/Tests/group/disease/train_joint-disease.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/21 19:30 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/disease/joint-disease') # catnlp 30 | 
parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/disease/joint-disease.dset') # catnlp 31 | parser.add_argument('--train', default='../../../../data/group/disease/joint-disease/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../../data/group/disease/joint-disease/devel.tsv') # catnlp 33 | parser.add_argument('--test', default='../../../../data/group/disease/joint-disease/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'BaseLSTM' # catnlp 85 | config = Config() 86 
| config.layers = 2 87 | config.optim = 'Adam' 88 | config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 103 | if char_emb_file != 'none': 104 | print('load char emb file...norm: ', config.norm_char_emb) 105 | config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'joint-disease' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg, ignore=True) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! 
# ---- NER/Tests/group/species/__init__.py ----
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 16:52
'''

# ---- NER/Tests/group/species/train_BioNLP11ID-species.py ----
# encoding:utf-8
'''
Train/test/decode a BaseLSTM NER model on the BioNLP11ID-species corpus.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:35
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG so experiment runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/species/BioNLP11ID-species') # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/species/BioNLP11ID-species.dset') # catnlp
    parser.add_argument('--train', default='../../../../data/group/species/BioNLP11ID-species-IOBES/train.tsv') # catnlp
    parser.add_argument('--dev', default='../../../../data/group/species/BioNLP11ID-species-IOBES/devel.tsv') # catnlp
    parser.add_argument('--test', default='../../../../data/group/species/BioNLP11ID-species-IOBES/test.tsv') # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # String flags: any casing of 'true' enables the option.
    if args.seg.lower() == 'true':
        seg = True
    else:
        seg = False
    status = args.status.lower()

    save_model_dir = args.savemodel
    # '--gpu False' forces CPU; otherwise CUDA is used only when present.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lower-cased the whole --charemb value, which
        # corrupts any path containing upper-case characters.  Keep the path
        # as given; lower-case only for the 'None' sentinel comparison below.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM' # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE(review): 'pretain' spelling is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'BioNLP11ID-species' # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# ---- NER/Tests/group/species/train_BioNLP13CG-species.py ----
# encoding:utf-8
'''
Train/test/decode a BaseLSTM NER model on the BioNLP13CG-species corpus.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:35
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG so experiment runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/species/BioNLP13CG-species') # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/species/BioNLP13CG-species.dset') # catnlp
    parser.add_argument('--train', default='../../../../data/group/species/BioNLP13CG-species-IOBES/train.tsv') # catnlp
    parser.add_argument('--dev', default='../../../../data/group/species/BioNLP13CG-species-IOBES/devel.tsv') # catnlp
    parser.add_argument('--test', default='../../../../data/group/species/BioNLP13CG-species-IOBES/test.tsv') # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # String flags: any casing of 'true' enables the option.
    if args.seg.lower() == 'true':
        seg = True
    else:
        seg = False
    status = args.status.lower()

    save_model_dir = args.savemodel
    # '--gpu False' forces CPU; otherwise CUDA is used only when present.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lower-cased the whole --charemb value, which
        # corrupts any path containing upper-case characters.  Keep the path
        # as given; lower-case only for the 'None' sentinel comparison below.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM' # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE(review): 'pretain' spelling is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
', config.norm_char_emb) 105 | config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'BioNLP13CG-species' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! (train/test/decode)') -------------------------------------------------------------------------------- /NER/Tests/group/species/train_CRAFT-species.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/21 19:35 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/species/CRAFT-species') 
# catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/species/CRAFT-species.dset') # catnlp 31 | parser.add_argument('--train', default='../../../../data/group/species/CRAFT-species-IOBES/train.tsv') # catnlp 32 | parser.add_argument('--dev', default='../../../../data/group/species/CRAFT-species-IOBES/devel.tsv') # catnlp 33 | parser.add_argument('--test', default='../../../../data/group/species/CRAFT-species-IOBES/test.tsv') # catnlp 34 | parser.add_argument('--gpu', default='True') 35 | parser.add_argument('--seg', default='True') 36 | parser.add_argument('--extendalphabet', default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../../../../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'BaseLSTM' # 
catnlp 85 | config = Config() 86 | config.layers = 2 87 | config.optim = 'Adam' 88 | config.char_features = 'CNN' 89 | config.lr = 0.015 90 | config.hidden_dim = 200 91 | config.bid_flag = True 92 | config.number_normalized = True 93 | data_initialization(config, train_file, dev_file, test_file) 94 | config.gpu = gpu 95 | config.word_features = name 96 | print('Word features: ', config.word_features) 97 | config.generate_instance(train_file, 'train') 98 | config.generate_instance(dev_file, 'dev') 99 | config.generate_instance(test_file, 'test') 100 | if emb_file: 101 | print('load word emb file...norm: ', config.norm_word_emb) 102 | config.build_word_pretain_emb(emb_file) 103 | if char_emb_file != 'none': 104 | print('load char emb file...norm: ', config.norm_char_emb) 105 | config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'CRAFT-species' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! 
# ---- NER/Tests/group/species/train_joint-species.py ----
# encoding:utf-8
'''
Train/test/decode a BaseLSTM NER model on the joint species corpus.

@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:36
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix every RNG so experiment runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/species/joint-species') # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/species/joint-species.dset') # catnlp
    parser.add_argument('--train', default='../../../../data/group/species/joint-species/train.tsv') # catnlp
    parser.add_argument('--dev', default='../../../../data/group/species/joint-species/devel.tsv') # catnlp
    parser.add_argument('--test', default='../../../../data/group/species/joint-species/test.tsv') # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # String flags: any casing of 'true' enables the option.
    if args.seg.lower() == 'true':
        seg = True
    else:
        seg = False
    status = args.status.lower()

    save_model_dir = args.savemodel
    # '--gpu False' forces CPU; otherwise CUDA is used only when present.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../../../../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        # BUG FIX: the original lower-cased the whole --charemb value, which
        # corrupts any path containing upper-case characters.  Keep the path
        # as given; lower-case only for the 'None' sentinel comparison below.
        char_emb_file = args.charemb
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM' # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True
        config.number_normalized = True
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)  # NOTE(review): 'pretain' spelling is Config's API name
        if char_emb_file.lower() != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
config.build_char_pretrain_emb(char_emb_file) 106 | 107 | name = 'joint-species' # catnlp 108 | train(config, name, dset_dir, save_model_dir, seg, ignore=True) 109 | elif status == 'test': 110 | data = load_data_setting(dset_dir) 111 | data.generate_instance(dev_file, 'dev') 112 | load_model_decode(model_dir, data, 'dev', gpu, seg) 113 | data.generate_instance(test_file, 'test') 114 | load_model_decode(model_dir, data, 'test', gpu, seg) 115 | elif status == 'decode': 116 | data = load_data_setting(dset_dir) 117 | data.generate_instance(raw_file, 'raw') 118 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 119 | data.write_decoded_results(output_file, decode_results, 'raw') 120 | else: 121 | print('Invalid argument! Please use valid arguments! (train/test/decode)') -------------------------------------------------------------------------------- /NER/Tests/group/species/train_linnaeus.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/21 19:36 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/species/linnaeus') # catnlp 30 | 
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/21 19:36

Train / evaluate / decode a BaseLSTM NER tagger on the linnaeus (IOBES) corpus.
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG we rely on so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin the job to GPU 0

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    # FIX: help text previously read 'update algorithm'; --status is the run mode.
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='running mode (train/test/decode)', default='train')
    parser.add_argument('--savemodel', default='../../../../models/multiDatasets/group/species/linnaeus')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../../../../models/multiDatasets/group/species/linnaeus.dset')  # catnlp
    parser.add_argument('--train', default='../../../../data/group/species/linnaeus-IOBES/train.tsv')  # catnlp
    parser.add_argument('--dev', default='../../../../data/group/species/linnaeus-IOBES/devel.tsv')  # catnlp
    parser.add_argument('--test', default='../../../../data/group/species/linnaeus-IOBES/test.tsv')  # catnlp
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')        # raw text input (decode mode)
    parser.add_argument('--loadmodel')  # checkpoint to load (test/decode modes)
    parser.add_argument('--output')     # decoded-results destination (decode mode)
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg is true only for the literal 'true'; --gpu False forces CPU,
    # anything else defers to torch's own detection.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()
    save_model_dir = args.savemodel
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        emb_file = '../../../../data/embedding/glove.6B.100d.txt' if emb == 'glove' else None
        char_emb_file = args.charemb.lower()
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.layers = 2
        config.optim = 'Adam'
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True           # bidirectional word-level LSTM
        config.number_normalized = True  # normalize digits before lookup
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)
        if char_emb_file != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)

        name = 'linnaeus-species'  # catnlp
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse 'choices' already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/15 21:24

Benchmark different optimizers for the BaseLSTM tagger on CoNLL-2003.
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG we rely on so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # pin the job to GPU 0

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--train', default='../../data/conll2003/train.bmes')
    parser.add_argument('--dev', default='../../data/conll2003/dev.bmes')
    parser.add_argument('--test', default='../../data/conll2003/test.bmes')
    parser.add_argument('--gpu', default='True')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test

    # --gpu False forces CPU; anything else defers to torch's own detection.
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    sys.stdout.flush()

    emb = args.wordemb.lower()
    print('Word Embedding: ', emb)
    emb_file = '../../data/embedding/glove.6B.100d.txt' if emb == 'glove' else None
    char_emb_file = args.charemb.lower()
    print('Char Embedding: ', char_emb_file)

    name = 'BaseLSTM'  # catnlp
    config = Config()
    config.lr = 0.015
    config.hidden_dim = 200
    config.number_normalized = True  # normalize digits before lookup
    data_initialization(config, train_file, dev_file, test_file)
    config.gpu = gpu
    config.word_features = name
    print('Word features: ', config.word_features)
    config.generate_instance(train_file, 'train')
    config.generate_instance(dev_file, 'dev')
    config.generate_instance(test_file, 'test')
    if emb_file:
        print('load word emb file...norm: ', config.norm_word_emb)
        config.build_word_pretain_emb(emb_file)
    if char_emb_file != 'none':
        print('load char emb file...norm: ', config.norm_char_emb)
        config.build_char_pretrain_emb(char_emb_file)
    # Sweep the optimizer choices defined by the helper.
    test_optimizer(config)
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/2 14:14

Train / evaluate / decode the BaseLSTM tagger on CoNLL-2003.
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG we rely on so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'  # pin the job to GPU 2

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    # FIX: help text previously read 'update algorithm'; --status is the run mode.
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='running mode (train/test/decode)', default='train')
    parser.add_argument('--savemodel', default='../models/conll2003/BaseLSTM')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../models/conll2003/BaseLSTM.dset')  # catnlp
    parser.add_argument('--train', default='../data/conll2003/train.bmes')
    parser.add_argument('--dev', default='../data/conll2003/dev.bmes')
    parser.add_argument('--test', default='../data/conll2003/test.bmes')
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')        # raw text input (decode mode)
    parser.add_argument('--loadmodel')  # checkpoint to load (test/decode modes)
    parser.add_argument('--output')     # decoded-results destination (decode mode)
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg is true only for the literal 'true'; --gpu False forces CPU,
    # anything else defers to torch's own detection.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()
    save_model_dir = args.savemodel
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        emb_file = '../data/embedding/glove.6B.100d.txt' if emb == 'glove' else None
        char_emb_file = args.charemb.lower()
        print('Char Embedding: ', char_emb_file)

        name = 'BaseLSTM'  # catnlp
        config = Config()
        config.lr = 0.015
        config.hidden_dim = 200
        config.number_normalized = True  # normalize digits before lookup
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)
        if char_emb_file != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse 'choices' already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/2 14:08

Train / evaluate / decode the BaseRNN tagger on CoNLL-2003.
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG we rely on so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'  # pin the job to GPU 2

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    # FIX: help text previously read 'update algorithm'; --status is the run mode.
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='running mode (train/test/decode)', default='train')
    parser.add_argument('--savemodel', default='../models/conll2003/BaseRNN')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../models/conll2003/BaseRNN.dset')  # catnlp
    parser.add_argument('--train', default='../data/conll2003/train.bmes')
    parser.add_argument('--dev', default='../data/conll2003/dev.bmes')
    parser.add_argument('--test', default='../data/conll2003/test.bmes')
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')        # raw text input (decode mode)
    parser.add_argument('--loadmodel')  # checkpoint to load (test/decode modes)
    parser.add_argument('--output')     # decoded-results destination (decode mode)
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg is true only for the literal 'true'; --gpu False forces CPU,
    # anything else defers to torch's own detection.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()
    save_model_dir = args.savemodel
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        emb_file = '../data/embedding/glove.6B.100d.txt' if emb == 'glove' else None
        char_emb_file = args.charemb.lower()
        print('Char Embedding: ', char_emb_file)

        name = 'BaseRNN'  # catnlp
        config = Config()
        config.lr = 0.0015               # vanilla-RNN run uses a smaller LR
        config.number_normalized = True  # normalize digits before lookup
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)
        if char_emb_file != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse 'choices' already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/2 14:14

Train / evaluate / decode the LSTM (+CNN char features) tagger on CoNLL-2003.
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG we rely on so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'  # pin the job to GPU 2

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    # FIX: help text previously read 'update algorithm'; --status is the run mode.
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='running mode (train/test/decode)', default='train')
    parser.add_argument('--savemodel', default='../models/conll2003/LSTM')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../models/conll2003/LSTM.dset')  # catnlp
    parser.add_argument('--train', default='../data/conll2003/train.bmes')
    parser.add_argument('--dev', default='../data/conll2003/dev.bmes')
    parser.add_argument('--test', default='../data/conll2003/test.bmes')
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')        # raw text input (decode mode)
    parser.add_argument('--loadmodel')  # checkpoint to load (test/decode modes)
    parser.add_argument('--output')     # decoded-results destination (decode mode)
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg is true only for the literal 'true'; --gpu False forces CPU,
    # anything else defers to torch's own detection.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()
    save_model_dir = args.savemodel
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        emb_file = '../data/embedding/glove.6B.100d.txt' if emb == 'glove' else None
        char_emb_file = args.charemb.lower()
        print('Char Embedding: ', char_emb_file)

        name = 'LSTM'  # catnlp
        config = Config()
        config.char_features = 'CNN'
        config.lr = 0.015
        config.hidden_dim = 200
        config.bid_flag = True           # bidirectional word-level LSTM
        config.number_normalized = True  # normalize digits before lookup
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)
        if char_emb_file != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse 'choices' already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/2 14:14

Train / evaluate / decode the MetaLSTM tagger on CoNLL-2003.
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG we rely on so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'  # pin the job to GPU 2

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    # FIX: help text previously read 'update algorithm'; --status is the run mode.
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='running mode (train/test/decode)', default='train')
    parser.add_argument('--savemodel', default='../models/conll2003/MetaLSTM')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../models/conll2003/MetaLSTM.dset')  # catnlp
    parser.add_argument('--train', default='../data/conll2003/train.bmes')
    parser.add_argument('--dev', default='../data/conll2003/dev.bmes')
    parser.add_argument('--test', default='../data/conll2003/test.bmes')
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')        # raw text input (decode mode)
    parser.add_argument('--loadmodel')  # checkpoint to load (test/decode modes)
    parser.add_argument('--output')     # decoded-results destination (decode mode)
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg is true only for the literal 'true'; --gpu False forces CPU,
    # anything else defers to torch's own detection.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()
    save_model_dir = args.savemodel
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        emb_file = '../data/embedding/glove.6B.100d.txt' if emb == 'glove' else None
        char_emb_file = args.charemb.lower()
        print('Char Embedding: ', char_emb_file)

        name = 'MetaLSTM'  # catnlp
        config = Config()
        config.lr = 0.015
        config.hidden_dim = 200
        config.iteration = 200           # longer schedule than the base runs
        config.bid_flag = True           # bidirectional word-level LSTM
        config.number_normalized = True  # normalize digits before lookup
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)
        if char_emb_file != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)
        # Checkpoints/logs for this variant are tagged with the SGD run label.
        name = 'MetaLSTM_SGD_z'
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse 'choices' already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/2 14:14

Train / evaluate / decode the MetaRNN tagger on CoNLL-2003.
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Seed every RNG we rely on so runs are reproducible.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'  # pin the job to GPU 2

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    # FIX: help text previously read 'update algorithm'; --status is the run mode.
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='running mode (train/test/decode)', default='train')
    parser.add_argument('--savemodel', default='../models/conll2003/MetaRNN')  # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../models/conll2003/MetaRNN.dset')  # catnlp
    parser.add_argument('--train', default='../data/conll2003/train.bmes')
    parser.add_argument('--dev', default='../data/conll2003/dev.bmes')
    parser.add_argument('--test', default='../data/conll2003/test.bmes')
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')
    parser.add_argument('--raw')        # raw text input (decode mode)
    parser.add_argument('--loadmodel')  # checkpoint to load (test/decode modes)
    parser.add_argument('--output')     # decoded-results destination (decode mode)
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg is true only for the literal 'true'; --gpu False forces CPU,
    # anything else defers to torch's own detection.
    seg = args.seg.lower() == 'true'
    status = args.status.lower()
    save_model_dir = args.savemodel
    gpu = False if args.gpu.lower() == 'false' else torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        emb_file = '../data/embedding/glove.6B.100d.txt' if emb == 'glove' else None
        char_emb_file = args.charemb.lower()
        print('Char Embedding: ', char_emb_file)

        name = 'MetaRNN'  # catnlp
        config = Config()
        config.lr = 0.0015               # RNN variants use a smaller LR
        config.number_normalized = True  # normalize digits before lookup
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)
        if char_emb_file != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        # Unreachable in practice: argparse 'choices' already restricts --status.
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
default='True') 37 | parser.add_argument('--raw') 38 | parser.add_argument('--loadmodel') 39 | parser.add_argument('--output') 40 | args = parser.parse_args() 41 | 42 | train_file = args.train 43 | dev_file = args.dev 44 | test_file = args.test 45 | raw_file = args.raw 46 | model_dir = args.loadmodel 47 | dset_dir = args.savedset 48 | output_file = args.output 49 | if args.seg.lower() == 'true': 50 | seg = True 51 | else: 52 | seg = False 53 | status = args.status.lower() 54 | 55 | save_model_dir = args.savemodel 56 | if args.gpu.lower() == 'false': 57 | gpu = False 58 | else: 59 | gpu = torch.cuda.is_available() 60 | 61 | print('Seed num: ', seed_num) 62 | print('GPU available: ', gpu) 63 | print('Status: ', status) 64 | 65 | print('Seg: ', seg) 66 | print('Train file: ', train_file) 67 | print('Dev file: ', dev_file) 68 | print('Test file: ', test_file) 69 | print('Raw file: ', raw_file) 70 | if status == 'train': 71 | print('Model saved to: ', save_model_dir) 72 | sys.stdout.flush() 73 | 74 | if status == 'train': 75 | emb = args.wordemb.lower() 76 | print('Word Embedding: ', emb) 77 | if emb == 'glove': 78 | emb_file = '../data/embedding/glove.6B.100d.txt' 79 | else: 80 | emb_file = None 81 | char_emb_file = args.charemb.lower() 82 | print('Char Embedding: ', char_emb_file) 83 | 84 | name = 'NormLSTM' # catnlp 85 | config = Config() 86 | config.lr = 0.0015 87 | config.number_normalized = True 88 | data_initialization(config, train_file, dev_file, test_file) 89 | config.gpu = gpu 90 | config.word_features = name 91 | print('Word features: ', config.word_features) 92 | config.generate_instance(train_file, 'train') 93 | config.generate_instance(dev_file, 'dev') 94 | config.generate_instance(test_file, 'test') 95 | if emb_file: 96 | print('load word emb file...norm: ', config.norm_word_emb) 97 | config.build_word_pretain_emb(emb_file) 98 | if char_emb_file != 'none': 99 | print('load char emb file...norm: ', config.norm_char_emb) 100 | 
config.build_char_pretrain_emb(char_emb_file) 101 | train(config, name, dset_dir, save_model_dir, seg) 102 | elif status == 'test': 103 | data = load_data_setting(dset_dir) 104 | data.generate_instance(dev_file, 'dev') 105 | load_model_decode(model_dir, data, 'dev', gpu, seg) 106 | data.generate_instance(test_file, 'test') 107 | load_model_decode(model_dir, data, 'test', gpu, seg) 108 | elif status == 'decode': 109 | data = load_data_setting(dset_dir) 110 | data.generate_instance(raw_file, 'raw') 111 | decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg) 112 | data.write_decoded_results(output_file, decode_results, 'raw') 113 | else: 114 | print('Invalid argument! Please use valid arguments! (train/test/decode)') 115 | -------------------------------------------------------------------------------- /NER/train_RNN.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | ''' 3 | @Author: catnlp 4 | @Email: wk_nlp@163.com 5 | @Time: 2018/5/2 14:13 6 | ''' 7 | from NER.utils.config import Config 8 | from NER.utils.helpers import * 9 | 10 | import sys 11 | import argparse 12 | import random 13 | import torch 14 | import numpy as np 15 | 16 | seed_num = 100 17 | random.seed(seed_num) 18 | torch.manual_seed(seed_num) 19 | np.random.seed(seed_num) 20 | 21 | import os 22 | os.environ["CUDA_VISIBLE_DEVICES"] = '2' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser(description='Tuning with NER') 26 | parser.add_argument('--wordemb', help='Embedding for words', default='glove') 27 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 28 | parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train') 29 | parser.add_argument('--savemodel', default='../models/conll2003/RNN') # catnlp 30 | parser.add_argument('--savedset', help='Dir of saved data setting', default='../models/conll2003/RNN.dset') # catnlp 31 | 
# encoding:utf-8
'''
@Author: catnlp
@Email: wk_nlp@163.com
@Time: 2018/5/2 14:13
'''
from NER.utils.config import Config
from NER.utils.helpers import *

import sys
import argparse
import random
import torch
import numpy as np

# Fix all RNG seeds up front so runs are repeatable.
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'  # pin this process to GPU #2

if __name__ == '__main__':
    # Command-line driver for the plain-RNN NER tagger: parse arguments, then
    # dispatch on --status to train, evaluate (test), or decode raw text.
    # data_initialization/train/load_data_setting/load_model_decode come from
    # the star-import of NER.utils.helpers above.
    parser = argparse.ArgumentParser(description='Tuning with NER')
    parser.add_argument('--wordemb', help='Embedding for words', default='glove')
    parser.add_argument('--charemb', help='Embedding for chars', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--savemodel', default='../models/conll2003/RNN') # catnlp
    parser.add_argument('--savedset', help='Dir of saved data setting', default='../models/conll2003/RNN.dset') # catnlp
    parser.add_argument('--train', default='../data/conll2003/train.bmes')
    parser.add_argument('--dev', default='../data/conll2003/dev.bmes')
    parser.add_argument('--test', default='../data/conll2003/test.bmes')
    parser.add_argument('--gpu', default='True')
    parser.add_argument('--seg', default='True')
    parser.add_argument('--extendalphabet', default='True')  # NOTE(review): parsed but never read below
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    args = parser.parse_args()

    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg is a string flag; anything other than 'true' (case-insensitive) disables it.
    if args.seg.lower() == 'true':
        seg = True
    else:
        seg = False
    status = args.status.lower()

    save_model_dir = args.savemodel
    # GPU is used only when requested AND actually available on this machine.
    if args.gpu.lower() == 'false':
        gpu = False
    else:
        gpu = torch.cuda.is_available()

    print('Seed num: ', seed_num)
    print('GPU available: ', gpu)
    print('Status: ', status)

    print('Seg: ', seg)
    print('Train file: ', train_file)
    print('Dev file: ', dev_file)
    print('Test file: ', test_file)
    print('Raw file: ', raw_file)
    if status == 'train':
        print('Model saved to: ', save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        # Only 'glove' maps to a bundled embedding file; any other value trains
        # without pretrained word vectors.
        emb = args.wordemb.lower()
        print('Word Embedding: ', emb)
        if emb == 'glove':
            emb_file = '../data/embedding/glove.6B.100d.txt'
        else:
            emb_file = None
        char_emb_file = args.charemb.lower()
        print('Char Embedding: ', char_emb_file)

        name = 'RNN' # catnlp
        config = Config()
        config.lr = 0.0015
        config.number_normalized = True
        # Helper from NER.utils.helpers — presumably builds the alphabets from
        # all three splits before instances are generated; TODO confirm.
        data_initialization(config, train_file, dev_file, test_file)
        config.gpu = gpu
        config.word_features = name
        print('Word features: ', config.word_features)
        config.generate_instance(train_file, 'train')
        config.generate_instance(dev_file, 'dev')
        config.generate_instance(test_file, 'test')
        if emb_file:
            print('load word emb file...norm: ', config.norm_word_emb)
            config.build_word_pretain_emb(emb_file)
        if char_emb_file != 'none':
            print('load char emb file...norm: ', config.norm_char_emb)
            config.build_char_pretrain_emb(char_emb_file)
        train(config, name, dset_dir, save_model_dir, seg)
    elif status == 'test':
        # Reload the saved data setting and evaluate the stored model on dev and test.
        data = load_data_setting(dset_dir)
        data.generate_instance(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        # Tag raw input with the stored model and write results to --output.
        data = load_data_setting(dset_dir)
        data.generate_instance(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        print('Invalid argument! Please use valid arguments! (train/test/decode)')
class Alphabet:
    """Bidirectional mapping between instances (words/chars/labels) and integer ids.

    Index 0 is reserved as the default/none index; real instances start at 1.
    While ``keep_growing`` is True, unseen instances are added on lookup;
    after ``close()`` unseen instances map to the UNKNOWN id instead.
    """

    def __init__(self, name, label=False, keep_growing=True):
        self.__name = name
        # NOTE(review): both special markers are empty strings here (they look
        # like '</unk>'/'</pad>' tags stripped by an export); as written they
        # collapse into a single alphabet entry — confirm against upstream.
        self.UNKNOWN = ''
        self.PAD = '' ## catnlp add pad
        self.label = label          # label alphabets get no PAD/UNKNOWN entries
        self.instance2index = {}    # instance -> 1-based id
        self.instances = []         # id - 1 -> instance
        self.keep_growing = keep_growing

        self.default_index = 0
        self.next_index = 1
        if not self.label:
            self.add(self.PAD) ## catnlp_add pad
            self.add(self.UNKNOWN)

    def clear(self, keep_growing=True):
        """Drop all entries and reset the id counter."""
        self.instance2index = {}
        self.instances = []
        self.keep_growing = keep_growing

        self.default_index = 0
        self.next_index = 1

    def add(self, instance):
        """Register *instance* if unseen; no-op (keeps old id) otherwise."""
        if instance not in self.instance2index:
            self.instances.append(instance)
            self.instance2index[instance] = self.next_index
            self.next_index += 1

    def get_index(self, instance):
        """Return the id for *instance*, growing the alphabet or falling back to UNKNOWN."""
        try:
            return self.instance2index[instance]
        except KeyError:
            if self.keep_growing:
                index = self.next_index
                self.add(instance)
                return index
            else:
                return self.instance2index[self.UNKNOWN]

    def get_instance(self, index):
        """Inverse lookup; index 0 maps to None, out-of-range falls back to the first entry."""
        if index == 0:
            return None
        try:
            return self.instances[index - 1]
        except IndexError:
            print('WARNING: Alphabet get_instance, unknown instance, return the first label.')
            return self.instances[0]

    def size(self):
        # +1 accounts for the reserved 0 index.
        return len(self.instances) + 1

    def iteritems(self):
        """Iterate (instance, id) pairs."""
        return self.instance2index.items() # catnlp

    def enumerate_items(self, start=1):
        """Yield (id, instance) pairs from *start* (1-based) onward."""
        if start < 1 or start >= self.size():
            raise IndexError('Enumerate is allowed between [1: size of the alphabet]')
        return zip(range(start, len(self.instances) + 1), self.instances[start - 1:])

    def close(self):
        self.keep_growing = False

    def open(self):
        self.keep_growing = True

    def get_content(self):
        """Serializable snapshot of the alphabet state."""
        return {'instance2index': self.instance2index, 'instances': self.instances}

    def from_json(self, data):
        """Restore state from a ``get_content()``-shaped dict (id counter not restored)."""
        self.instances = data['instances']
        self.instance2index = data['instance2index']

    def save(self, output_directory, name=None):
        """Dump the alphabet to <output_directory>/<name>.json; failures are logged, not raised.

        Fixed: the original handler used ``'...: ' % repr(e)`` with no %s
        placeholder, which itself raised TypeError; also close the file handle.
        """
        saving_name = name if name else self.__name
        try:
            with open(os.path.join(output_directory, saving_name + '.json'), 'w') as fp:
                json.dump(self.get_content(), fp)
        except Exception as e:
            print('Exception: Alphabet is not saved: %s' % repr(e))

    def load(self, input_directory, name=None):
        """Restore the alphabet from <input_directory>/<name>.json (handle now closed)."""
        loading_name = name if name else self.__name
        with open(os.path.join(input_directory, loading_name + '.json')) as fp:
            self.from_json(json.load(fp))
def normalize_word(word):
    """Return *word* with every digit character replaced by '0' (e.g. 'b12' -> 'b00')."""
    return ''.join('0' if char.isdigit() else char for char in word)

def read_instance(input_file, word_alphabet, char_alphabet, label_alphabet, number_normalized, max_sent_length, char_padding_size=-1, char_padding_symbol=''):
    """Read a column-format NER file into parallel text/id instance lists.

    Token lines look like ``word ... label`` (first column is the word, last
    column the label); a line of <= 2 characters (i.e. blank) ends a sentence.
    Each kept sentence contributes ``[words, chars, labels]`` to
    ``instance_texts`` and ``[word_ids, char_ids, label_ids]`` to
    ``instance_ids`` (ids come from the given alphabets' ``get_index``).
    Sentences with max_sent_length or more tokens are dropped when
    max_sent_length >= 0. With char_padding_size > 0, char lists shorter than
    that are right-padded with char_padding_symbol.

    NOTE(review): char_padding_symbol defaults to '' here — looks like a
    stripped '</pad>' tag; confirm against upstream.
    """
    # Fixed: the original leaked the file handle (open(...).readlines()).
    with open(input_file, 'r') as fp:
        in_lines = fp.readlines()
    instance_texts = []
    instance_ids = []
    words = []
    chars = []
    labels = []
    word_ids = []
    char_ids = []
    labels_ids = []
    for line in in_lines:
        if len(line) > 2:
            pairs = line.strip().split()
            word = pairs[0] # catnlp
            if number_normalized:
                word = normalize_word(word)
            label = pairs[-1]
            words.append(word)
            labels.append(label)
            word_ids.append(word_alphabet.get_index(word))
            labels_ids.append(label_alphabet.get_index(label))
            char_list = list(word)
            if char_padding_size > 0:
                char_number = len(char_list)
                if char_number < char_padding_size:
                    char_list = char_list + [char_padding_symbol] * (char_padding_size - char_number)
                assert (len(char_list) == char_padding_size)
            char_id = [char_alphabet.get_index(char) for char in char_list]
            chars.append(char_list)
            char_ids.append(char_id)
        else:
            # Fixed: guard on non-empty — consecutive blank lines used to emit
            # empty [[], [], []] instances.
            if words and ((max_sent_length < 0) or (len(words) < max_sent_length)):
                instance_texts.append([words, chars, labels])
                instance_ids.append([word_ids, char_ids, labels_ids])

            words = []
            chars = []
            labels = []
            word_ids = []
            char_ids = []
            labels_ids = []
    # Fixed: a trailing sentence with no terminating blank line was silently
    # dropped; keep it, subject to the same length filter.
    if words and ((max_sent_length < 0) or (len(words) < max_sent_length)):
        instance_texts.append([words, chars, labels])
        instance_ids.append([word_ids, char_ids, labels_ids])
    return instance_texts, instance_ids
def build_pretrain_embedding(embedding_path, word_alphabet, embed_dim=100, norm=True):
    """Build a [alphabet_size, embed_dim] lookup table from pretrained vectors.

    Words found in the embedding file keep their pretrained vector (exact
    match first, then lowercase match), optionally L2-normalized via
    ``norm2one``; out-of-vocabulary words get a uniform random vector in
    [-scale, scale] with scale = sqrt(3 / embed_dim). Row 0 — never produced
    by Alphabet, whose ids start at 1 — stays all-zero.

    Returns (pretrain_emb, embed_dim); embed_dim is taken from the file when
    embedding_path is given, otherwise the argument is used.
    """
    embed_dict = dict()
    if embedding_path is not None:  # fixed: identity test instead of '!= None'
        embed_dict, embed_dim = load_pretrain_emb(embedding_path)
    alphabet_size = word_alphabet.size()
    scale = np.sqrt(3.0 / embed_dim)
    # Fixed: np.empty left row 0 (the reserved pad/default index) as
    # uninitialized memory; zero-initialize the whole table instead.
    pretrain_emb = np.zeros([alphabet_size, embed_dim])
    perfect_match = 0
    case_match = 0
    not_match = 0
    for word, index in word_alphabet.iteritems():
        if word in embed_dict:
            if norm:
                pretrain_emb[index, :] = norm2one(embed_dict[word])
            else:
                pretrain_emb[index, :] = embed_dict[word]
            perfect_match += 1
        elif word.lower() in embed_dict:
            if norm:
                pretrain_emb[index, :] = norm2one(embed_dict[word.lower()])
            else:
                pretrain_emb[index, :] = embed_dict[word.lower()]
            case_match += 1
        else:
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embed_dim])
            not_match += 1
    pretrain_size = len(embed_dict)
    print('Embedding:\n\tpretrain word:%s, perfect match:%s, case_match:%s, oov:%s, oov%%:%s'
          % (pretrain_size, perfect_match, case_match, not_match, (not_match+0.)/alphabet_size))
    return pretrain_emb, embed_dim
def norm2one(vec):
    """Scale *vec* to unit Euclidean (L2) length."""
    return vec / np.sqrt(np.square(vec).sum())

def load_pretrain_emb(embedding_path):
    """Parse a whitespace-separated embedding file into a name -> vector dict.

    The vector dimension is inferred from the first non-empty line
    (token count minus one); any leading tokens beyond the trailing
    ``embed_dim`` floats are joined with single spaces to form the entry
    name, so multi-word keys like "new york" survive. Each value is a
    (1, embed_dim) float array. Returns (embed_dict, embed_dim);
    embed_dim is -1 for a file with no non-empty lines.
    """
    embed_dict = {}
    embed_dim = -1
    with open(embedding_path, 'r') as fh:
        for raw in fh:
            stripped = raw.strip()
            if not stripped:
                continue
            tokens = stripped.split()
            if embed_dim < 0:
                # First data line fixes the dimension for the whole file.
                embed_dim = len(tokens) - 1
            vector = np.empty([1, embed_dim])
            vector[:] = tokens[len(tokens) - embed_dim:]
            extra = len(tokens) - embed_dim - 1
            key = ' '.join(tokens[:extra + 1])
            embed_dict[key] = vector
    return embed_dict, embed_dim
https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/base_LSTM_MNIST.PNG -------------------------------------------------------------------------------- /images/base_RNN_CoNLL-2003.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/base_RNN_CoNLL-2003.PNG -------------------------------------------------------------------------------- /images/base_RNN_MNIST.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/base_RNN_MNIST.PNG -------------------------------------------------------------------------------- /images/catnlp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/catnlp_logo.png -------------------------------------------------------------------------------- /images/cellulars/cellulars1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/cellulars/cellulars1.PNG -------------------------------------------------------------------------------- /images/cellulars/cellulars2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/cellulars/cellulars2.PNG -------------------------------------------------------------------------------- /images/cellulars/cellulars3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/cellulars/cellulars3.PNG 
-------------------------------------------------------------------------------- /images/embeddings/embedding_glove_cove-300.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/embeddings/embedding_glove_cove-300.PNG -------------------------------------------------------------------------------- /images/meta_LSTM_CoNLL-2003.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/meta_LSTM_CoNLL-2003.PNG -------------------------------------------------------------------------------- /images/meta_RNN_LSTM_CoNLL-2003.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/meta_RNN_LSTM_CoNLL-2003.PNG -------------------------------------------------------------------------------- /images/meta_RNN_LSTM_MNIST.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/meta_RNN_LSTM_MNIST.PNG -------------------------------------------------------------------------------- /images/optimizers/dev_optimizers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/optimizers/dev_optimizers.png -------------------------------------------------------------------------------- /images/optimizers/loss_optimizers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/optimizers/loss_optimizers.png 
-------------------------------------------------------------------------------- /images/optimizers/test_optimizers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/optimizers/test_optimizers.png -------------------------------------------------------------------------------- /images/optimizers/train_optimizers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catnlp/metaLSTM/f477f49d6435f0fbf30a848efc72b67fa34a3f9f/images/optimizers/train_optimizers.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.14.2 2 | torch==0.3.1 3 | visdom==0.1.8.5 4 | torchvision==0.2.0 5 | --------------------------------------------------------------------------------