├── .gitignore ├── README.md ├── aion ├── embeddings │ ├── cove.py │ ├── doc2vec.py │ ├── document_embeddings.py │ ├── elmo.py │ ├── embeddings.py │ ├── glove.py │ ├── infersent.py │ ├── infersent_lib │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── data.py │ │ ├── dataset │ │ │ ├── get_data.bash │ │ │ └── tokenizer.sed │ │ ├── encoder │ │ │ ├── demo.ipynb │ │ │ ├── extract_features.py │ │ │ ├── models.py │ │ │ └── samples.txt │ │ ├── models.py │ │ ├── mutils.py │ │ └── train_nli.py │ ├── sentence_embeddings.py │ ├── skip_thoughts.py │ └── word_embeddings.py ├── helper │ ├── __init__.py │ └── file_helper.py └── util │ ├── __init__.py │ └── spell_check.py └── sample ├── embeddings └── nlp-embeddings-document-doc2vec.ipynb ├── nlp-3_basic_distance_measurement_in_text_mining.ipynb ├── nlp-bag_of_words.ipynb ├── nlp-character_embedding.ipynb ├── nlp-distance-edit_distance.ipynb ├── nlp-embeddings-sentence-elmo.ipynb ├── nlp-embeddings-sentence-infersent.ipynb ├── nlp-embeddings-word-cove.ipynb ├── nlp-lsa_lda.ipynb ├── nlp-model_interpretation-201808.ipynb ├── nlp-model_interpretation.ipynb ├── nlp-model_interpretation_anchor.ipynb ├── nlp-model_interpretation_shap.ipynb ├── nlp-named_entity_recognition.ipynb ├── nlp-part_of_speech.ipynb ├── nlp-sentence_tokenization.ipynb ├── nlp-skip_thoughts.ipynb ├── nlp-stemming.ipynb ├── nlp-stop_words.ipynb ├── nlp-text_summarization_extractive.ipynb ├── nlp-word_embedding.ipynb ├── nlp-word_mover_distance.ipynb ├── nlp-word_tokenization.ipynb ├── nlp_lemmatization.ipynb ├── preprocessing └── nlp-preprocessing-string_matching-fuzzywuzzy.ipynb ├── resources └── LSI and LDA.pptx └── util ├── nlp-util-spell_corrector.ipynb └── nlp-util-symspell.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc -------------------------------------------------------------------------------- /aion/embeddings/cove.py: -------------------------------------------------------------------------------- 1 | import keras 2 | 3 | from .word_embeddings import WordEmbeddings 4 | from .glove import GloVeEmbeddings 5 | 6 | ''' 7 | Source: https://github.com/rgsachin/CoVe 8 | ''' 9 | 10 | 11 | class CoVeEmbeddings(WordEmbeddings): 12 | COVE_MODEL_KERAS_URL = 'https://github.com/rgsachin/CoVe/raw/master/Keras_CoVe.h5' 13 | 14 | def __init__(self, 15 | word_embeddings_dir, 16 | handle_oov=True, oov_vector_type='random', 17 | padding=True, pad_vector_type='random', 18 | max_sequence_length=50, tokenizer=None, 19 | verbose=0): 20 | super().__init__(verbose=verbose) 21 | 22 | if tokenizer is None: 23 | self.tokenizer = self._tokenizer_space 24 | 25 | self.word_embeddings_dir = word_embeddings_dir 26 | self.handle_oov = handle_oov 27 | self.oov_vector_type = oov_vector_type 28 | self.padding = padding 29 | self.pad_vector_type = pad_vector_type 30 | self.max_sequence_length = max_sequence_length 31 | 32 | def load_model(self, dest_dir, src=None, trainable=True, verbose=0): 33 | if src is None: 34 | src = self.COVE_MODEL_KERAS_URL 35 | 36 | file_path = self.download( 37 | src=src, dest_dir=dest_dir, dest_file=None, uncompress=False) 38 | 39 | self.model = keras.models.load_model(file_path) 40 | 41 | self.word_embs_model = GloVeEmbeddings( 42 | handle_oov=self.handle_oov, oov_vector_type=self.oov_vector_type, 43 | padding=self.padding, pad_vector_type=self.pad_vector_type, 44 | max_sequence_length=self.max_sequence_length) 45 | self.word_embs_model.load_model(dest_dir=self.word_embeddings_dir, process=False, verbose=verbose) 
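    # Usage sketch (illustrative only; the directory paths below are placeholders, not values
    # taken from this repository): the class is wired so that load_model() is called first to
    # fetch Keras_CoVe.h5 and the backing GloVe vectors, after which encode() maps raw
    # sentences to CoVe context vectors.
    #
    #   cove = CoVeEmbeddings(word_embeddings_dir='./model/glove/', max_sequence_length=50)
    #   cove.load_model(dest_dir='./model/cove/')
    #   vectors = cove.encode(['the quick brown fox'])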
46 | 47 | def encode(self, x, tokenize=True): 48 | if tokenize: 49 | tokens = [self.tokenizer(sentence) for sentence in x] 50 | else: 51 | tokens = x 52 | 53 | x_embs = self.word_embs_model.encode(tokens) 54 | 55 | return self.model.predict(x_embs) 56 | 57 | -------------------------------------------------------------------------------- /aion/embeddings/doc2vec.py: -------------------------------------------------------------------------------- 1 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 2 | 3 | from .document_embeddings import DocumentEmbeddings 4 | 5 | 6 | class Doc2VecEmbeddings(DocumentEmbeddings): 7 | def __init__(self, 8 | merge_mode="concat", algorithms="dm", 9 | word_dimension=300, min_word_count=1, 10 | word_window=10, n_job=4, 11 | train_epoch=10, infer_epoch=5, 12 | infer_aplha=0.1, infer_min_alpha=0.0001, 13 | verbose=0): 14 | super().__init__(verbose=verbose) 15 | 16 | self.merge_mode = merge_mode 17 | if merge_mode == 'concat': 18 | self.dm_concat = 1 19 | self.dm_mean = None 20 | elif merge_mode == 'mean': 21 | self.dm_concat = None 22 | self.dm_mean = 1 23 | else: 24 | raise Exception('merge_mode only allows either concat or mean') 25 | 26 | self.algorithms = algorithms 27 | if algorithms == 'dm': 28 | self.dm = 1 29 | elif algorithms == 'dbow': 30 | self.dm = 0 31 | 32 | self.word_dimension = word_dimension 33 | self.min_word_count = min_word_count 34 | self.word_window = word_window 35 | self.n_job = n_job 36 | self.train_epoch = train_epoch 37 | self.infer_epoch = infer_epoch 38 | self.infer_alpha = infer_aplha 39 | self.infer_min_alpha = infer_min_alpha 40 | 41 | self.vocab_size = 0 42 | self.word2idx = {} 43 | 44 | def build_vocab(self, documents, training=True, tokenize=True): 45 | if tokenize: 46 | docs = [self._tokenizer_space(document) for document in documents] 47 | else: 48 | docs = documents 49 | 50 | vocab = {} 51 | for words in docs: 52 | for word in words: 53 | if word not in vocab: 54 | vocab[word] = 1 55 | 56 | if training: 57 | self.vocab_size = len(vocab) 58 | 59 | 60 | docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)] 61 | return docs 62 | 63 | def train(self, documents): 64 | self.model = Doc2Vec( 65 | documents, dm_concat=self.dm_concat, dm_mean=self.dm_mean, 66 | dm=self.dm, vector_size=self.word_dimension, 67 | window=self.word_window, min_count=self.min_word_count, 68 | workers=self.n_job) 69 | 70 | self.model.train( 71 | documents, total_words=self.vocab_size, 72 | epochs=self.train_epoch) 73 | 74 | def encode(self, documents, tokenize=True): 75 | if tokenize: 76 | docs = [self._tokenizer_space(document) for document in documents] 77 | else: 78 | docs = documents 79 | 80 | docs = [ 81 | self.model.infer_vector( 82 | document, alpha=self.infer_alpha, 83 | min_alpha=self.infer_min_alpha, 84 | steps=self.infer_epoch) 85 | for document in docs 86 | ] 87 | 88 | return docs 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /aion/embeddings/document_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Edward Ma. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os, datetime 17 | 18 | from .embeddings import Embeddings 19 | 20 | 21 | class DocumentEmbeddings(Embeddings): 22 | def __init__(self, verbose=0): 23 | self.verbose = verbose -------------------------------------------------------------------------------- /aion/embeddings/elmo.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import tensorflow as tf 4 | import tensorflow_hub as tf_hub 5 | 6 | from .word_embeddings import WordEmbeddings 7 | 8 | 9 | class ELMoEmbeddings(WordEmbeddings): 10 | ELMO_MODEL_V2_URL = "https://tfhub.dev/google/elmo/2" 11 | 12 | def __init__(self, layer, verbose=0): 13 | super().__init__(verbose=verbose) 14 | self.layer = layer 15 | 16 | def _set_tf_log_level(self, verbose): 17 | if verbose >= 30: 18 | tf.logging.set_verbosity(tf.logging.INFO) 19 | elif verbose >= 20: 20 | tf.logging.set_verbosity(tf.logging.WARN) 21 | elif verbose >= 10: 22 | tf.logging.set_verbosity(tf.logging.DEBUG) 23 | else: 24 | tf.logging.set_verbosity(tf.logging.ERROR) 25 | 26 | def load(self, src=None, dest_dir=None, trainable=True, verbose=0): 27 | self._log_time(status='LOADING', msg='file', verbose=verbose) 28 | self._set_tf_log_level(verbose) 29 | 30 | if src == None: 31 | src = self.ELMO_MODEL_V2_URL 32 | 33 | if dest_dir is not None: 34 | os.environ["TFHUB_CACHE_DIR"] = dest_dir 35 | 36 | self.model = tf_hub.Module(src, trainable=trainable) 37 | 38 | self._log_time(status='LOADED', msg='', verbose=verbose) 39 | 40 | return self.model 41 | 42 | def to_keras_layer(self, x): 43 | # Source: https://github.com/strongio/keras-elmo/blob/master/Elmo%20Keras.ipynb 44 | ''' 45 | For signature and layer parameters, you can visit https://alpha.tfhub.dev/google/elmo/2 46 | ''' 47 | return self.model( 48 | tf.squeeze(tf.cast(x, tf.string)), 49 | signature="default", as_dict=True)[self.layer] 50 | 51 | 52 | # import operator 53 | # import datetime 54 | # import re 55 | 56 | # from bilm.data import Vocabulary 57 | 58 | # class ELMoEmbeddings: 59 | # def __init__(self, tokenizer=None, verbose=0): 60 | # self.verbose = verbose 61 | 62 | # self.tokenizer = self.get_tokenizer(tokenizer) 63 | 64 | # def _space_tokenizer(self, sentence): 65 | # # There is some unicode from source data 66 | # # return [t.encode('ascii', 'ignore').decode('ascii') for t in sentence.encode('ascii', 'ignore').decode('ascii').split(' ') if t != ''] 67 | # # return [t.encode('ascii', 'ignore').decode('ascii') for t in sentence.split(' ') if t != ''] 68 | # return [t for t in sentence.split(' ') if t != ''] 69 | 70 | # def _spacy_tokenizer(self, sentence, model=None): 71 | # if model is None: 72 | # import spacy 73 | # model = spacy.load('en') 74 | 75 | # return [t.text.encode('ascii', 'ignore') for t in model(str(sentence)) if t.text != ''] 76 | 77 | # def get_tokenizer(self, tokenizer): 78 | # if tokenizer is None or tokenizer == 'space': 79 | # tokenizer = self._space_tokenizer 80 | # elif tokenizer == 'spacy': 81 | 
# tokenizer = self._spacy_tokenizer 82 | 83 | # return tokenizer 84 | 85 | # def preprocess(self, sentence): 86 | # normalized_space = sentence.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ') 87 | # normalized_unicode = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', ' ', normalized_space) 88 | 89 | # normalized_text = re.sub(' +',' ', normalized_unicode) 90 | 91 | # return normalized_text 92 | 93 | # def get_basic_elements(self, mode): 94 | # if mode == 'build': 95 | # return ['', '', ''] 96 | # elif mode == 'train': 97 | # return ['', ''] 98 | # return [] 99 | 100 | # def build_vocab(self, sentences, mode, vocab_file_path): 101 | # word_dict = {} 102 | 103 | # basic_elements = self.get_basic_elements(mode) 104 | 105 | # for sentence in sentences: 106 | # sentence = self.preprocess(sentence) 107 | # for w in self.tokenizer(sentence): 108 | 109 | # if w not in word_dict: 110 | # word_dict[w] = 0 111 | # word_dict[w] += 1 112 | 113 | # word_dict = sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True) 114 | # print('Total Word: %d' % (len(word_dict))) 115 | 116 | # with open(vocab_file_path, 'w') as f: 117 | # for item in basic_elements: 118 | # f.write("%s\n" % item) 119 | 120 | # for word, count in word_dict: 121 | # # Ximenez, characters <-- finding these word to check unicode issue 122 | # # print([word]) 123 | # if word != '': 124 | # f.write("%s\n" % word) 125 | 126 | # def build_data(self, sentences, data_file_path): 127 | # with open(data_file_path, 'w') as f: 128 | # for sentence in sentences: 129 | # sentence = self.preprocess(sentence) 130 | # tokens = self.tokenizer(sentence) 131 | # if len(tokens) > 0: 132 | # f.write("%s\n" % ' '.join(str(tokens))) -------------------------------------------------------------------------------- /aion/embeddings/embeddings.py: -------------------------------------------------------------------------------- 1 | import datetime, os, urllib, zipfile 2 | 3 | 4 | class Embeddings: 5 | def __init__(self, verbose=0): 6 | self.verbose = verbose 7 | self.model = {} 8 | self.model_path = '' 9 | 10 | def _log_time(self, status, msg, verbose): 11 | if self.verbose >= 10 or verbose >= 10: 12 | print('%s. 
[%s] %s' % (datetime.datetime.now(), status, msg)) 13 | 14 | def download(self, src, dest_dir, dest_file, uncompress, housekeep=False, verbose=0): 15 | if not os.path.exists(dest_dir): 16 | os.makedirs(dest_dir) 17 | 18 | if dest_file is None: 19 | dest_file = os.path.basename(src) 20 | 21 | if not self.is_file_exist(dest_dir + dest_file): 22 | self._log_time(status='DOWNLOAD', msg='From '+src+' to '+dest_dir+dest_file, verbose=verbose) 23 | file = urllib.request.urlopen(src) 24 | with open(dest_dir + dest_file,'wb') as output: 25 | output.write(file.read()) 26 | else: 27 | self._log_time(status='FOUND', msg=dest_file+' in '+dest_dir, verbose=verbose) 28 | 29 | if uncompress: 30 | self.uncompress(dest_dir + dest_file) 31 | 32 | if uncompress and housekeep: 33 | self.housekeep(dest_dir + dest_file) 34 | 35 | 36 | return dest_dir + dest_file 37 | 38 | """ 39 | File related 40 | """ 41 | 42 | def uncompress(self): 43 | raise NotImplemented() 44 | 45 | def unzip(self, file_path): 46 | dest_dir = os.path.dirname(file_path) 47 | with zipfile.ZipFile(file_path, "r") as zip_ref: 48 | zip_ref.extractall(dest_dir) 49 | 50 | def housekeep(self, file_path): 51 | os.remove(file_path) 52 | 53 | def is_file_exist(self, file_path): 54 | return os.path.exists(file_path) 55 | 56 | def save(self): 57 | raise NotImplemented() 58 | 59 | def load(self): 60 | raise NotImplemented() 61 | 62 | """ 63 | Model related 64 | """ 65 | 66 | def get_model(self): 67 | return self.model 68 | 69 | def set_model(self, model): 70 | self.model = model 71 | 72 | def load(self, src=None, dest_dir=None, trainable=True, verbose=0): 73 | raise NotImplemented() 74 | 75 | """ 76 | Vocabulary realted 77 | """ 78 | 79 | def load_vocab(self, **kwargs): 80 | raise NotImplemented() 81 | 82 | def build_vocab(self): 83 | raise NotImplemented() 84 | 85 | def get_vocab(self): 86 | raise NotImplemented() 87 | 88 | def _tokenizer_space(self, sentence): 89 | return sentence.split(' ') 90 | 91 | """ 92 | Vector related 93 | """ 94 | 95 | def train(self): 96 | raise NotImplemented() 97 | 98 | def encode(self, sentences): 99 | raise NotImplemented() 100 | 101 | def visualize(self): 102 | raise NotImplemented() 103 | 104 | """ 105 | Netowrk realted 106 | """ 107 | 108 | def to_numpy_layer(self): 109 | raise NotImplemented() 110 | 111 | def to_keras_layer(self): 112 | raise NotImplemented() 113 | 114 | def to_tensorflow_layer(self): 115 | raise NotImplemented() 116 | 117 | def to_pytorch_layer(self): 118 | raise NotImplemented() -------------------------------------------------------------------------------- /aion/embeddings/glove.py: -------------------------------------------------------------------------------- 1 | import datetime, os, zipfile 2 | import numpy as np 3 | 4 | from .word_embeddings import WordEmbeddings 5 | 6 | 7 | class GloVeEmbeddings(WordEmbeddings): 8 | GLOVE_COMMON_CRAWL_MODEL_URL = 'http://nlp.stanford.edu/data/glove.42B.300d.zip' 9 | 10 | def __init__(self, 11 | handle_oov=True, oov_vector=None, oov_vector_type='zero', 12 | padding=True, pad_vector=None, pad_vector_type='zero', 13 | max_sequence_length=10, dimension=300, 14 | verbose=0): 15 | super().__init__( 16 | handle_oov=handle_oov, oov_vector=oov_vector, oov_vector_type=oov_vector_type, 17 | padding=padding, pad_vector=pad_vector, pad_vector_type=pad_vector_type, 18 | max_sequence_length=max_sequence_length, dimension=dimension, 19 | verbose=verbose) 20 | 21 | def load_model(self, dest_dir, src=None, trainable=True, process=True, verbose=0): 22 | if src is None: 23 | src 
= self.GLOVE_COMMON_CRAWL_MODEL_URL 24 | 25 | dest_file = os.path.basename(src) 26 | 27 | file_path = self.download( 28 | src=src, dest_dir=dest_dir, dest_file=None, 29 | uncompress=True, housekeep=False, verbose=verbose) 30 | 31 | self.model_path = dest_dir + dest_file 32 | 33 | dest_file = dest_file.replace('.zip', '.txt') 34 | 35 | if process and not self.is_file_exist(dest_dir + dest_file): 36 | with open(dest_dir + dest_file, encoding="utf8" ) as f: 37 | lines = f.readlines() 38 | 39 | for line in lines: 40 | line_contents = line.split() 41 | word = line_contents[0] 42 | self.model[word] = np.array([float(val) for val in line_contents[1:]]) 43 | 44 | return self.model 45 | 46 | def uncompress(self, file_path): 47 | self.unzip(file_path) 48 | 49 | def encode(self, sentences): 50 | preds = np.empty([len(sentences), self.max_sequence_length, self.dimension]) 51 | 52 | for i, words in enumerate(sentences): 53 | pred = np.empty([self.max_sequence_length, self.dimension]) 54 | cnt = 0 55 | 56 | for word in words: 57 | if self.is_vector_exist(word): 58 | pred[cnt] = self.model[word] 59 | cnt += 1 60 | elif self.handle_oov: 61 | pred[cnt] = self.oov_vector 62 | cnt += 1 63 | 64 | if cnt + 1 >= self.max_sequence_length: 65 | break 66 | 67 | if self.padding and (cnt + 1 < self.max_sequence_length): 68 | for i in range(0, self.max_sequence_length - cnt): 69 | pred[cnt] = self.pad_vector 70 | cnt += 1 71 | 72 | preds[i] = pred 73 | 74 | 75 | return preds -------------------------------------------------------------------------------- /aion/embeddings/infersent.py: -------------------------------------------------------------------------------- 1 | import datetime, os, zipfile 2 | import numpy as np 3 | import torch 4 | import subprocess 5 | 6 | from .glove import GloVeEmbeddings 7 | from .sentence_embeddings import SentenceEmbeddings 8 | 9 | # InferSent (as of Sep 2018) is not a a library (https://github.com/facebookresearch/InferSent/issues/76), Cloned from https://github.com/facebookresearch/InferSent 10 | from .infersent_lib.models import InferSent 11 | 12 | 13 | class InferSentEmbeddings(SentenceEmbeddings): 14 | INFERSENT_GLOVE_MODEL_URL = 'https://s3.amazonaws.com/senteval/infersent/infersent1.pkl' 15 | INFERSENT_FASTTEXT_MODEL_URL = 'https://s3.amazonaws.com/senteval/infersent/infersent2.pkl' 16 | 17 | def __init__(self, 18 | word_embeddings_dir, 19 | batch_size=64, word_dimension=300, encoder_lstm_dimension=2048, 20 | pooling_type='max', model_version=1, dropout=0.0, 21 | verbose=0): 22 | super().__init__(verbose=verbose) 23 | 24 | self.word_embeddings_dir = word_embeddings_dir 25 | self.batch_size = batch_size 26 | self.word_dimension = word_dimension 27 | self.encoder_lstm_dimension = encoder_lstm_dimension 28 | self.pooling_type = pooling_type 29 | self.dropout = dropout 30 | self.model_version = model_version 31 | 32 | def get_params(self): 33 | return { 34 | 'bsize': self.batch_size, 35 | 'word_emb_dim': self.word_dimension, 36 | 'enc_lstm_dim': self.encoder_lstm_dimension, 37 | 'pool_type': self.pooling_type, 38 | 'dpout_model': self.dropout, 39 | 'version': self.model_version 40 | } 41 | 42 | def load_model(self, dest_dir, src=None, trainable=True, verbose=0): 43 | # TODO: Support V2 model 44 | if src is None: 45 | src = InferSentEmbeddings.INFERSENT_GLOVE_MODEL_URL 46 | 47 | dest_file = os.path.basename(src) 48 | file_path = self.download( 49 | src=src, dest_dir=dest_dir, dest_file=dest_file, 50 | uncompress=False, housekeep=False, verbose=verbose) 51 | 52 | self.model = 
InferSent(self.get_params()) 53 | self.model.load_state_dict(torch.load(dest_dir + dest_file)) 54 | 55 | # TODO: support different glove model and fasttext model 56 | word_embs = GloVeEmbeddings() 57 | word_embs.load_model(dest_dir=self.word_embeddings_dir, process=False, verbose=verbose) 58 | 59 | self.model.set_w2v_path(word_embs.model_path) 60 | 61 | def build_vocab(self, sentences, tokenize=True): 62 | return self.model.build_vocab(sentences, tokenize=tokenize) 63 | 64 | def encode(self, sentences, tokenize=True): 65 | return self.model.encode(sentences, tokenize=tokenize) 66 | 67 | def visualize(self, sentence, tokenize=True): 68 | self.model.visualize(sentence, tokenize=tokenize) -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/.gitignore: -------------------------------------------------------------------------------- 1 | dataset/GloVe 2 | dataset/MultiNLI 3 | dataset/SNLI 4 | encoder/infersent.allnli.pickle 5 | 6 | *.swp 7 | -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/README.md: -------------------------------------------------------------------------------- 1 | # InferSent 2 | 3 | *InferSent* is a *sentence embeddings* method that provides semantic representations for English sentences. It is trained on natural language inference data and generalizes well to many different tasks. 4 | 5 | We provide our pre-trained English sentence encoder [our paper](https://arxiv.org/abs/1705.02364) and our [SentEval](https://github.com/facebookresearch/SentEval) evaluation toolkit. 6 | 7 | **Recent changes**: Added infersent2 model trained on fastText vectors and added max-pool option. 8 | 9 | ## Dependencies 10 | 11 | This code is written in python. Dependencies include: 12 | 13 | * Python 2/3 14 | * [Pytorch](http://pytorch.org/) (recent version) 15 | * NLTK >= 3 16 | 17 | ## Download datasets 18 | To get SNLI and MultiNLI, run (in dataset/): 19 | ```bash 20 | ./get_data.bash 21 | ``` 22 | This will download and preprocess SNLI/MultiNLI datasets. For MacOS, you may have to use *p7zip* instead of *unzip*. 23 | 24 | 25 | Download [GloVe](https://nlp.stanford.edu/projects/glove/) (V1) or [fastText](https://fasttext.cc/docs/en/english-vectors.html) (V2) vectors: 26 | ```bash 27 | mkdir dataset/GloVe 28 | curl -Lo dataset/GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip 29 | unzip dataset/GloVe/glove.840B.300d.zip -d dataset/GloVe/ 30 | mkdir dataset/fastText 31 | curl -Lo dataset/fastText/crawl-300d-2M.vec.zip https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip 32 | unzip dataset/fastText/crawl-300d-2M.vec.zip -d dataset/fastText/ 33 | ``` 34 | 35 | ## Use our sentence encoder 36 | We provide a simple interface to encode English sentences. 
**See [**encoder/demo.ipynb**](https://github.com/facebookresearch/InferSent/blob/master/encoder/demo.ipynb) 37 | for a practical example.** Get started with the following steps: 38 | 39 | *0.0) Download our InferSent models (V1 trained with GloVe, V2 trained with fastText)[147MB]:* 40 | ```bash 41 | curl -Lo encoder/infersent1.pkl https://s3.amazonaws.com/senteval/infersent/infersent1.pkl 42 | curl -Lo encoder/infersent2.pkl https://s3.amazonaws.com/senteval/infersent/infersent2.pkl 43 | ``` 44 | Note that infersent1 is trained with GloVe (which have been trained on text preprocessed with the PTB tokenizer) and infersent2 is trained with fastText (which have been trained on text preprocessed with the MOSES tokenizer). The latter also removes the padding of zeros with max-pooling which was inconvenient when embedding sentences outside of their batches. 45 | 46 | *0.1) Make sure you have the NLTK tokenizer by running the following once:* 47 | ```python 48 | import nltk 49 | nltk.download('punkt') 50 | ``` 51 | 52 | *1) [Load our pre-trained model](https://github.com/facebookresearch/InferSent/blob/master/encoder/demo.ipynb) (in encoder/):* 53 | ```python 54 | from models import InferSent 55 | V = 2 56 | MODEL_PATH = 'encoder/infersent%s.pkl' % V 57 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 58 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': V} 59 | infersent = InferSent(params_model) 60 | infersent.load_state_dict(torch.load(MODEL_PATH)) 61 | ``` 62 | 63 | *2) Set word vector path for the model:* 64 | ```python 65 | W2V_PATH = 'fastText/crawl-300d-2M.vec' 66 | infersent.set_w2v_path(W2V_PATH) 67 | ``` 68 | 69 | *3) Build the vocabulary of word vectors (i.e keep only those needed):* 70 | ```python 71 | infersent.build_vocab(sentences, tokenize=True) 72 | ``` 73 | where *sentences* is your list of **n** sentences. You can update your vocabulary using *infersent.update_vocab(sentences)*, or directly load the **K** most common English words with *infersent.build_vocab_k_words(K=100000)*. 74 | If **tokenize** is True (by default), sentences will be tokenized using NTLK. 75 | 76 | *4) Encode your sentences (list of *n* sentences):* 77 | ```python 78 | embeddings = infersent.encode(sentences, tokenize=True) 79 | ``` 80 | This outputs a numpy array with *n* vectors of dimension **4096**. Speed is around *1000 sentences per second* with batch size 128 on a single GPU. 81 | 82 | *5) Visualize the importance that our model attributes to each word:* 83 | 84 | We provide a function to visualize the importance of each word in the encoding of a sentence: 85 | ```python 86 | infersent.visualize('A man plays an instrument.', tokenize=True) 87 | ``` 88 | ![Model](https://s3.amazonaws.com/senteval/infersent/visualization.png) 89 | 90 | 91 | ## Train model on Natural Language Inference (SNLI) 92 | To reproduce our results on [SNLI](https://nlp.stanford.edu/projects/snli/), run: 93 | ```bash 94 | python train_nli.py --word_emb_path '' 95 | ``` 96 | You should obtain a dev accuracy of 85 and a test accuracy of **[84.5](https://nlp.stanford.edu/projects/snli/)** with the default setting. 97 | 98 | ## Evaluate the encoder on transfer tasks 99 | To evaluate the model on transfer tasks, see [SentEval](https://github.com/facebookresearch/SentEval/tree/master/examples). Be mindful to choose the same tokenization used for training the encoder. 
You should obtain the following test results for the baselines and the InferSent models: 100 | 101 | Model | MR | CR | SUBJ | MPQA | STS14 | [STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results) | SICK Relatedness | SICK Entailment | SST | TREC | MRPC 102 | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: 103 | `InferSent1` | **81.1** | **86.3** | 92.4 | **90.2** | **.68/.65** | 75.8/75.5 | 0.884 | 86.1 | **84.6** | 88.2 | **76.2**/83.1 104 | `InferSent2` | 79.7 | 84.2 | 92.7 | 89.4 | **.68/.66** | **78.4/78.4** | **0.888** | **86.3** | 84.3 | **90.8** | 76.0/**83.8** 105 | `SkipThought` | 79.4 | 83.1 | **93.7** | 89.3 | .44/.45 | 72.1/70.2| 0.858 | 79.5 | 82.9 | 88.4 | - 106 | `fastText-BoV` | 78.2 | 80.2 | 91.8 | 88.0 | .65/.63 | 70.2/68.3 | 0.823 | 78.9 | 82.3 | 83.4 | 74.4/82.4 107 | 108 | ## Reference 109 | 110 | Please consider citing [[1]](https://arxiv.org/abs/1705.02364) if you found this code useful. 111 | 112 | ### Supervised Learning of Universal Sentence Representations from Natural Language Inference Data (EMNLP 2017) 113 | 114 | [1] A. Conneau, D. Kiela, H. Schwenk, L. Barrault, A. Bordes, [*Supervised Learning of Universal Sentence Representations from Natural Language Inference Data*](https://arxiv.org/abs/1705.02364) 115 | 116 | ``` 117 | @InProceedings{conneau-EtAl:2017:EMNLP2017, 118 | author = {Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and Barrault, Lo\"{i}c and Bordes, Antoine}, 119 | title = {Supervised Learning of Universal Sentence Representations from Natural Language Inference Data}, 120 | booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing}, 121 | month = {September}, 122 | year = {2017}, 123 | address = {Copenhagen, Denmark}, 124 | publisher = {Association for Computational Linguistics}, 125 | pages = {670--680}, 126 | url = {https://www.aclweb.org/anthology/D17-1070} 127 | } 128 | ``` 129 | 130 | ### Related work 131 | * [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726) 132 | * [S. Arora, Y. Liang, T. Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx) 133 | * [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207) 134 | * [A. Conneau, D. Kiela - SentEval: An Evaluation Toolkit for Universal Sentence Representations, LREC 2018](https://arxiv.org/abs/1803.05449) 135 | * [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079) 136 | * [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334) 137 | * [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175) 138 | * [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070) 139 | * [A. Wang, A. Singh, J. Michael, F. Hill, O. Levy, S. 
Bowman - GLUE: A Multi-Task Benchmark and Analysis Platform 140 | for Natural Language Understanding](https://arxiv.org/abs/1804.07461) 141 | -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import os 9 | import numpy as np 10 | import torch 11 | 12 | 13 | def get_batch(batch, word_vec, emb_dim=300): 14 | # sent in batch in decreasing order of lengths (bsize, max_len, word_dim) 15 | lengths = np.array([len(x) for x in batch]) 16 | max_len = np.max(lengths) 17 | embed = np.zeros((max_len, len(batch), emb_dim)) 18 | 19 | for i in range(len(batch)): 20 | for j in range(len(batch[i])): 21 | embed[j, i, :] = word_vec[batch[i][j]] 22 | 23 | return torch.from_numpy(embed).float(), lengths 24 | 25 | 26 | def get_word_dict(sentences): 27 | # create vocab of words 28 | word_dict = {} 29 | for sent in sentences: 30 | for word in sent.split(): 31 | if word not in word_dict: 32 | word_dict[word] = '' 33 | word_dict[''] = '' 34 | word_dict[''] = '' 35 | word_dict['
<p>
'] = '' 36 | return word_dict 37 | 38 | 39 | def get_glove(word_dict, glove_path): 40 | # create word_vec with glove vectors 41 | word_vec = {} 42 | with open(glove_path) as f: 43 | for line in f: 44 | word, vec = line.split(' ', 1) 45 | if word in word_dict: 46 | word_vec[word] = np.array(list(map(float, vec.split()))) 47 | print('Found {0}(/{1}) words with glove vectors'.format( 48 | len(word_vec), len(word_dict))) 49 | return word_vec 50 | 51 | 52 | def build_vocab(sentences, glove_path): 53 | word_dict = get_word_dict(sentences) 54 | word_vec = get_glove(word_dict, glove_path) 55 | print('Vocab size : {0}'.format(len(word_vec))) 56 | return word_vec 57 | 58 | 59 | def get_nli(data_path): 60 | s1 = {} 61 | s2 = {} 62 | target = {} 63 | 64 | dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2} 65 | 66 | for data_type in ['train', 'dev', 'test']: 67 | s1[data_type], s2[data_type], target[data_type] = {}, {}, {} 68 | s1[data_type]['path'] = os.path.join(data_path, 's1.' + data_type) 69 | s2[data_type]['path'] = os.path.join(data_path, 's2.' + data_type) 70 | target[data_type]['path'] = os.path.join(data_path, 71 | 'labels.' + data_type) 72 | 73 | s1[data_type]['sent'] = [line.rstrip() for line in 74 | open(s1[data_type]['path'], 'r')] 75 | s2[data_type]['sent'] = [line.rstrip() for line in 76 | open(s2[data_type]['path'], 'r')] 77 | target[data_type]['data'] = np.array([dico_label[line.rstrip('\n')] 78 | for line in open(target[data_type]['path'], 'r')]) 79 | 80 | assert len(s1[data_type]['sent']) == len(s2[data_type]['sent']) == \ 81 | len(target[data_type]['data']) 82 | 83 | print('** {0} DATA : Found {1} pairs of {2} sentences.'.format( 84 | data_type.upper(), len(s1[data_type]['sent']), data_type)) 85 | 86 | train = {'s1': s1['train']['sent'], 's2': s2['train']['sent'], 87 | 'label': target['train']['data']} 88 | dev = {'s1': s1['dev']['sent'], 's2': s2['dev']['sent'], 89 | 'label': target['dev']['data']} 90 | test = {'s1': s1['test']['sent'], 's2': s2['test']['sent'], 91 | 'label': target['test']['data']} 92 | return train, dev, test 93 | -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/dataset/get_data.bash: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | preprocess_exec="sed -f tokenizer.sed" 9 | 10 | SNLI='https://nlp.stanford.edu/projects/snli/snli_1.0.zip' 11 | MultiNLI='https://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip' 12 | 13 | 14 | ZIPTOOL="unzip" 15 | 16 | #if [ "$OSTYPE" == "darwin"* ]; then 17 | # # unzip can't handle large files on some MacOS versions 18 | # ZIPTOOL="7za x" 19 | #fi 20 | 21 | 22 | ### download SNLI 23 | mkdir SNLI 24 | curl -Lo SNLI/snli_1.0.zip $SNLI 25 | $ZIPTOOL SNLI/snli_1.0.zip -d SNLI 26 | rm SNLI/snli_1.0.zip 27 | rm -r SNLI/__MACOSX 28 | 29 | for split in train dev test 30 | do 31 | fpath=SNLI/$split.snli.txt 32 | awk '{ if ( $1 != "-" ) { print $0; } }' SNLI/snli_1.0/snli_1.0_$split.txt | cut -f 1,6,7 | sed '1d' > $fpath 33 | cut -f1 $fpath > SNLI/labels.$split 34 | cut -f2 $fpath | $preprocess_exec > SNLI/s1.$split 35 | cut -f3 $fpath | $preprocess_exec > SNLI/s2.$split 36 | rm $fpath 37 | done 38 | rm -r SNLI/snli_1.0 39 | 40 | 41 | # MultiNLI 42 | # Test set not available yet : we define dev set as the "matched" set and the test set as the "mismatched" 43 | mkdir MultiNLI 44 | curl -Lo MultiNLI/multinli_0.9.zip $MultiNLI 45 | $ZIPTOOL MultiNLI/multinli_0.9.zip -d MultiNLI 46 | rm MultiNLI/multinli_0.9.zip 47 | rm -r MultiNLI/__MACOSX 48 | 49 | 50 | mv MultiNLI/multinli_0.9/multinli_0.9_train.txt MultiNLI/train.multinli.txt 51 | mv MultiNLI/multinli_0.9/multinli_0.9_dev_matched.txt MultiNLI/dev.matched.multinli.txt 52 | mv MultiNLI/multinli_0.9/multinli_0.9_dev_mismatched.txt MultiNLI/dev.mismatched.multinli.txt 53 | 54 | rm -r MultiNLI/multinli_0.9 55 | 56 | for split in train dev.matched dev.mismatched 57 | do 58 | fpath=MultiNLI/$split.multinli.txt 59 | awk '{ if ( $1 != "-" ) { print $0; } }' $fpath | cut -f 1,6,7 | sed '1d' > $fpath.tok 60 | cut -f1 $fpath.tok > MultiNLI/labels.$split 61 | cut -f2 $fpath.tok | $preprocess_exec > MultiNLI/s1.$split 62 | cut -f3 $fpath.tok | $preprocess_exec > MultiNLI/s2.$split 63 | rm $fpath $fpath.tok 64 | done 65 | 66 | -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/dataset/tokenizer.sed: -------------------------------------------------------------------------------- 1 | # Sed script to produce Penn Treebank tokenization on arbitrary raw text. 2 | # Yeah, sure. 3 | 4 | # expected input: raw text with ONE SENTENCE TOKEN PER LINE 5 | 6 | # by Robert MacIntyre, University of Pennsylvania, late 1995. 7 | 8 | # If this wasn't such a trivial program, I'd include all that stuff about 9 | # no warrantee, free use, etc. from the GNU General Public License. If you 10 | # want to be picky, assume that all of its terms apply. Okay? 11 | 12 | # attempt to get correct directional quotes 13 | s=^"=`` =g 14 | s=\([ ([{<]\)"=\1 `` =g 15 | # close quotes handled at end 16 | 17 | s=\.\.\.= ... =g 18 | s=[,;:@#$%&]= & =g 19 | 20 | # Assume sentence tokenization has been done first, so split FINAL periods 21 | # only. 22 | s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g 23 | # however, we may as well split ALL question marks and exclamation points, 24 | # since they shouldn't have the abbrev.-marker ambiguity problem 25 | s=[?!]= & =g 26 | 27 | # parentheses, brackets, etc. 28 | s=[][(){}<>]= & =g 29 | # Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file 30 | # version of these symbols. 31 | # UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST. 
32 | # s/(/-LRB-/g 33 | # s/)/-RRB-/g 34 | # s/\[/-LSB-/g 35 | # s/\]/-RSB-/g 36 | # s/{/-LCB-/g 37 | # s/}/-RCB-/g 38 | 39 | s=--= -- =g 40 | 41 | # NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since 42 | # you might someday want to know how the words originally fit together -- 43 | # but it's too late to make a better system now, given the millions of 44 | # words we've already done "wrong". 45 | 46 | # First off, add a space to the beginning and end of each line, to reduce 47 | # necessary number of regexps. 48 | s=$= = 49 | s=^= = 50 | 51 | s="= '' =g 52 | # possessive or close-single-quote 53 | s=\([^']\)' =\1 ' =g 54 | # as in it's, I'm, we'd 55 | s='\([sSmMdD]\) = '\1 =g 56 | s='ll = 'll =g 57 | s='re = 're =g 58 | s='ve = 've =g 59 | s=n't = n't =g 60 | s='LL = 'LL =g 61 | s='RE = 'RE =g 62 | s='VE = 'VE =g 63 | s=N'T = N'T =g 64 | 65 | s= \([Cc]\)annot = \1an not =g 66 | s= \([Dd]\)'ye = \1' ye =g 67 | s= \([Gg]\)imme = \1im me =g 68 | s= \([Gg]\)onna = \1on na =g 69 | s= \([Gg]\)otta = \1ot ta =g 70 | s= \([Ll]\)emme = \1em me =g 71 | s= \([Mm]\)ore'n = \1ore 'n =g 72 | s= '\([Tt]\)is = '\1 is =g 73 | s= '\([Tt]\)was = '\1 was =g 74 | s= \([Ww]\)anna = \1an na =g 75 | # s= \([Ww]\)haddya = \1ha dd ya =g 76 | # s= \([Ww]\)hatcha = \1ha t cha =g 77 | 78 | # clean out extra spaces 79 | s= *= =g 80 | s=^ *==g 81 | -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/encoder/extract_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import torch 4 | import argparse 5 | 6 | import numpy as np 7 | from models import InferSent 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser( 12 | prog='extract-features', 13 | description='Extract features from pretrained InferSent model') 14 | 15 | parser.add_argument('-g', '--w2v_path', type=str, required=True, 16 | help='Path to word vector file') 17 | parser.add_argument('-v', '--version', type=int, required=True, 18 | help='Version of InferSent (GloVe-V1 or fastText-V2)') 19 | parser.add_argument('-f', '--model_path', type=str, required=True, 20 | help='Path to pretrained .pkl model file') 21 | parser.add_argument('-t', '--tokenize', action='store_true', 22 | help='Passes tokenize=True to build_vocab()') 23 | parser.add_argument('-o', '--out-dir', type=str, required=True, 24 | help='Output folder to save feature files') 25 | parser.add_argument('-c', '--cpu', action='store_true', 26 | help='Use CPU instead of GPU.') 27 | parser.add_argument('-b', '--batch-size', type=int, default=64, 28 | help='Batch size (default: 64)') 29 | parser.add_argument('files', nargs='+', 30 | help='List of files to extract sentence embeddings') 31 | 32 | args = parser.parse_args() 33 | 34 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 35 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': args.version} 36 | model = InferSent(params_model) 37 | model.load_state_dict(torch.load(args.model_path)) 38 | 39 | if not args.cpu: 40 | model = model.cuda() 41 | 42 | model.set_w2v_path(args.w2v_path) 43 | 44 | # Ensure directory 45 | if not os.path.exists(args.out_dir): 46 | os.makedirs(args.out_dir) 47 | 48 | # Read files and extract features 49 | for fpath in args.files: 50 | print('Reading file {}'.format(fpath)) 51 | sents = [] 52 | with open(fpath) as f: 53 | for line in f: 54 | line = line.strip() 55 | assert line, 'Empty line in {}'.format(fpath) 56 | 
sents.append(line) 57 | 58 | # Set output file name 59 | out_name = os.path.join( 60 | args.out_dir, "{}.embs.npy".format(os.path.basename(fpath))) 61 | 62 | # Build vocab 63 | print('Building vocabulary') 64 | model.build_vocab(sents, args.tokenize) 65 | 66 | # Get embeddings 67 | embs = model.encode(sents, tokenize=args.tokenize, 68 | verbose=True, bsize=args.batch_size) 69 | 70 | print('Saving to {}'.format(out_name)) 71 | np.save(out_name, embs) 72 | -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/encoder/models.py: -------------------------------------------------------------------------------- 1 | ../models.py -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/mutils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import re 9 | import inspect 10 | from torch import optim 11 | 12 | 13 | def get_optimizer(s): 14 | """ 15 | Parse optimizer parameters. 16 | Input should be of the form: 17 | - "sgd,lr=0.01" 18 | - "adagrad,lr=0.1,lr_decay=0.05" 19 | """ 20 | if "," in s: 21 | method = s[:s.find(',')] 22 | optim_params = {} 23 | for x in s[s.find(',') + 1:].split(','): 24 | split = x.split('=') 25 | assert len(split) == 2 26 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None 27 | optim_params[split[0]] = float(split[1]) 28 | else: 29 | method = s 30 | optim_params = {} 31 | 32 | if method == 'adadelta': 33 | optim_fn = optim.Adadelta 34 | elif method == 'adagrad': 35 | optim_fn = optim.Adagrad 36 | elif method == 'adam': 37 | optim_fn = optim.Adam 38 | elif method == 'adamax': 39 | optim_fn = optim.Adamax 40 | elif method == 'asgd': 41 | optim_fn = optim.ASGD 42 | elif method == 'rmsprop': 43 | optim_fn = optim.RMSprop 44 | elif method == 'rprop': 45 | optim_fn = optim.Rprop 46 | elif method == 'sgd': 47 | optim_fn = optim.SGD 48 | assert 'lr' in optim_params 49 | else: 50 | raise Exception('Unknown optimization method: "%s"' % method) 51 | 52 | # check that we give good parameters to the optimizer 53 | expected_args = inspect.getargspec(optim_fn.__init__)[0] 54 | assert expected_args[:2] == ['self', 'params'] 55 | if not all(k in expected_args[2:] for k in optim_params.keys()): 56 | raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( 57 | str(expected_args[2:]), str(optim_params.keys()))) 58 | 59 | return optim_fn, optim_params 60 | 61 | 62 | """ 63 | Importing batcher and prepare for SentEval 64 | """ 65 | 66 | 67 | def batcher(batch, params): 68 | # batch contains list of words 69 | batch = [[''] + s + [''] for s in batch] 70 | sentences = [' '.join(s) for s in batch] 71 | embeddings = params.infersent.encode(sentences, bsize=params.batch_size, 72 | tokenize=False) 73 | 74 | return embeddings 75 | 76 | 77 | def prepare(params, samples): 78 | params.infersent.build_vocab([' '.join(s) for s in samples], 79 | params.glove_path, tokenize=False) 80 | 81 | 82 | class dotdict(dict): 83 | """ dot.notation access to dictionary attributes """ 84 | __getattr__ = dict.get 85 | __setattr__ = dict.__setitem__ 86 | __delattr__ = dict.__delitem__ 87 | -------------------------------------------------------------------------------- /aion/embeddings/infersent_lib/train_nli.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import os 9 | import sys 10 | import time 11 | import argparse 12 | 13 | import numpy as np 14 | 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | 19 | from data import get_nli, get_batch, build_vocab 20 | from mutils import get_optimizer 21 | from models import NLINet 22 | 23 | 24 | parser = argparse.ArgumentParser(description='NLI training') 25 | # paths 26 | parser.add_argument("--nlipath", type=str, default='dataset/SNLI/', help="NLI data path (SNLI or MultiNLI)") 27 | parser.add_argument("--outputdir", type=str, default='savedir/', help="Output directory") 28 | parser.add_argument("--outputmodelname", type=str, default='model.pickle') 29 | parser.add_argument("--word_emb_path", type=str, default="dataset/GloVe/glove.840B.300d.txt", help="word embedding file path") 30 | 31 | # training 32 | parser.add_argument("--n_epochs", type=int, default=20) 33 | parser.add_argument("--batch_size", type=int, default=64) 34 | parser.add_argument("--dpout_model", type=float, default=0., help="encoder dropout") 35 | parser.add_argument("--dpout_fc", type=float, default=0., help="classifier dropout") 36 | parser.add_argument("--nonlinear_fc", type=float, default=0, help="use nonlinearity in fc") 37 | parser.add_argument("--optimizer", type=str, default="sgd,lr=0.1", help="adam or sgd,lr=0.1") 38 | parser.add_argument("--lrshrink", type=float, default=5, help="shrink factor for sgd") 39 | parser.add_argument("--decay", type=float, default=0.99, help="lr decay") 40 | parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr") 41 | parser.add_argument("--max_norm", type=float, default=5., help="max norm (grad clipping)") 42 | 43 | # model 44 | parser.add_argument("--encoder_type", type=str, default='InferSentV1', help="see list of encoders") 45 | parser.add_argument("--enc_lstm_dim", type=int, default=2048, help="encoder nhid dimension") 46 | parser.add_argument("--n_enc_layers", type=int, default=1, help="encoder num layers") 47 | parser.add_argument("--fc_dim", type=int, default=512, help="nhid of fc layers") 48 | parser.add_argument("--n_classes", type=int, default=3, help="entailment/neutral/contradiction") 49 | parser.add_argument("--pool_type", type=str, default='max', help="max or mean") 50 | 51 | # gpu 52 | parser.add_argument("--gpu_id", type=int, default=3, help="GPU ID") 53 | parser.add_argument("--seed", type=int, default=1234, help="seed") 54 | 55 | # data 56 | parser.add_argument("--word_emb_dim", type=int, default=300, help="word embedding dimension") 57 | 58 | params, _ = parser.parse_known_args() 59 | 60 | # set gpu device 61 | torch.cuda.set_device(params.gpu_id) 62 | 63 | # print parameters passed, and all parameters 64 | print('\ntogrep : {0}\n'.format(sys.argv[1:])) 65 | print(params) 66 | 67 | 68 | """ 69 | SEED 70 | """ 71 | np.random.seed(params.seed) 72 | torch.manual_seed(params.seed) 73 | torch.cuda.manual_seed(params.seed) 74 | 75 | """ 76 | DATA 77 | """ 78 | train, valid, test = get_nli(params.nlipath) 79 | word_vec = build_vocab(train['s1'] + train['s2'] + 80 | valid['s1'] + valid['s2'] + 81 | test['s1'] + test['s2'], params.word_emb_path) 82 | 83 | for split in ['s1', 's2']: 84 | for data_type in ['train', 'valid', 
'test']: 85 | eval(data_type)[split] = np.array([[''] + 86 | [word for word in sent.split() if word in word_vec] + 87 | [''] for sent in eval(data_type)[split]]) 88 | 89 | 90 | """ 91 | MODEL 92 | """ 93 | # model config 94 | config_nli_model = { 95 | 'n_words' : len(word_vec) , 96 | 'word_emb_dim' : params.word_emb_dim , 97 | 'enc_lstm_dim' : params.enc_lstm_dim , 98 | 'n_enc_layers' : params.n_enc_layers , 99 | 'dpout_model' : params.dpout_model , 100 | 'dpout_fc' : params.dpout_fc , 101 | 'fc_dim' : params.fc_dim , 102 | 'bsize' : params.batch_size , 103 | 'n_classes' : params.n_classes , 104 | 'pool_type' : params.pool_type , 105 | 'nonlinear_fc' : params.nonlinear_fc , 106 | 'encoder_type' : params.encoder_type , 107 | 'use_cuda' : True , 108 | 109 | } 110 | 111 | # model 112 | encoder_types = ['InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder', 113 | 'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder', 114 | 'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder'] 115 | assert params.encoder_type in encoder_types, "encoder_type must be in " + \ 116 | str(encoder_types) 117 | nli_net = NLINet(config_nli_model) 118 | print(nli_net) 119 | 120 | # loss 121 | weight = torch.FloatTensor(params.n_classes).fill_(1) 122 | loss_fn = nn.CrossEntropyLoss(weight=weight) 123 | loss_fn.size_average = False 124 | 125 | # optimizer 126 | optim_fn, optim_params = get_optimizer(params.optimizer) 127 | optimizer = optim_fn(nli_net.parameters(), **optim_params) 128 | 129 | # cuda by default 130 | nli_net.cuda() 131 | loss_fn.cuda() 132 | 133 | 134 | """ 135 | TRAIN 136 | """ 137 | val_acc_best = -1e10 138 | adam_stop = False 139 | stop_training = False 140 | lr = optim_params['lr'] if 'sgd' in params.optimizer else None 141 | 142 | 143 | def trainepoch(epoch): 144 | print('\nTRAINING : Epoch ' + str(epoch)) 145 | nli_net.train() 146 | all_costs = [] 147 | logs = [] 148 | words_count = 0 149 | 150 | last_time = time.time() 151 | correct = 0. 
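    # One training epoch: the SNLI pairs are shuffled, each batch is embedded with get_batch()
    # and passed through nli_net, and gradients are divided by the actual batch size; when the
    # global gradient norm exceeds --max_norm the learning rate is shrunk for that single
    # optimizer step only. Progress is logged every 100 batches and the epoch's training
    # accuracy is returned.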
152 | # shuffle the data 153 | permutation = np.random.permutation(len(train['s1'])) 154 | 155 | s1 = train['s1'][permutation] 156 | s2 = train['s2'][permutation] 157 | target = train['label'][permutation] 158 | 159 | 160 | optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * params.decay if epoch>1\ 161 | and 'sgd' in params.optimizer else optimizer.param_groups[0]['lr'] 162 | print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr'])) 163 | 164 | for stidx in range(0, len(s1), params.batch_size): 165 | # prepare batch 166 | s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size], 167 | word_vec, params.word_emb_dim) 168 | s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size], 169 | word_vec, params.word_emb_dim) 170 | s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda()) 171 | tgt_batch = Variable(torch.LongTensor(target[stidx:stidx + params.batch_size])).cuda() 172 | k = s1_batch.size(1) # actual batch size 173 | 174 | # model forward 175 | output = nli_net((s1_batch, s1_len), (s2_batch, s2_len)) 176 | 177 | pred = output.data.max(1)[1] 178 | correct += pred.long().eq(tgt_batch.data.long()).cpu().sum() 179 | assert len(pred) == len(s1[stidx:stidx + params.batch_size]) 180 | 181 | # loss 182 | loss = loss_fn(output, tgt_batch) 183 | all_costs.append(loss.data[0]) 184 | words_count += (s1_batch.nelement() + s2_batch.nelement()) / params.word_emb_dim 185 | 186 | # backward 187 | optimizer.zero_grad() 188 | loss.backward() 189 | 190 | # gradient clipping (off by default) 191 | shrink_factor = 1 192 | total_norm = 0 193 | 194 | for p in nli_net.parameters(): 195 | if p.requires_grad: 196 | p.grad.data.div_(k) # divide by the actual batch size 197 | total_norm += p.grad.data.norm() ** 2 198 | total_norm = np.sqrt(total_norm) 199 | 200 | if total_norm > params.max_norm: 201 | shrink_factor = params.max_norm / total_norm 202 | current_lr = optimizer.param_groups[0]['lr'] # current lr (no external "lr", for adam) 203 | optimizer.param_groups[0]['lr'] = current_lr * shrink_factor # just for update 204 | 205 | # optimizer step 206 | optimizer.step() 207 | optimizer.param_groups[0]['lr'] = current_lr 208 | 209 | if len(all_costs) == 100: 210 | logs.append('{0} ; loss {1} ; sentence/s {2} ; words/s {3} ; accuracy train : {4}'.format( 211 | stidx, round(np.mean(all_costs), 2), 212 | int(len(all_costs) * params.batch_size / (time.time() - last_time)), 213 | int(words_count * 1.0 / (time.time() - last_time)), 214 | round(100.*correct/(stidx+k), 2))) 215 | print(logs[-1]) 216 | last_time = time.time() 217 | words_count = 0 218 | all_costs = [] 219 | train_acc = round(100 * correct/len(s1), 2) 220 | print('results : epoch {0} ; mean accuracy train : {1}' 221 | .format(epoch, train_acc)) 222 | return train_acc 223 | 224 | 225 | def evaluate(epoch, eval_type='valid', final_eval=False): 226 | nli_net.eval() 227 | correct = 0. 
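    # Evaluation pass: computes accuracy on the dev ('valid') or test split. On the dev split
    # an improved accuracy saves the model to --outputdir; otherwise the SGD learning rate is
    # divided by --lrshrink (training stops once it falls below --minlr), while for Adam the
    # second drop in dev accuracy triggers early stopping.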
228 | global val_acc_best, lr, stop_training, adam_stop 229 | 230 | if eval_type == 'valid': 231 | print('\nVALIDATION : Epoch {0}'.format(epoch)) 232 | 233 | s1 = valid['s1'] if eval_type == 'valid' else test['s1'] 234 | s2 = valid['s2'] if eval_type == 'valid' else test['s2'] 235 | target = valid['label'] if eval_type == 'valid' else test['label'] 236 | 237 | for i in range(0, len(s1), params.batch_size): 238 | # prepare batch 239 | s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec, params.word_emb_dim) 240 | s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec, params.word_emb_dim) 241 | s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda()) 242 | tgt_batch = Variable(torch.LongTensor(target[i:i + params.batch_size])).cuda() 243 | 244 | # model forward 245 | output = nli_net((s1_batch, s1_len), (s2_batch, s2_len)) 246 | 247 | pred = output.data.max(1)[1] 248 | correct += pred.long().eq(tgt_batch.data.long()).cpu().sum() 249 | 250 | # save model 251 | eval_acc = round(100 * correct / len(s1), 2) 252 | if final_eval: 253 | print('finalgrep : accuracy {0} : {1}'.format(eval_type, eval_acc)) 254 | else: 255 | print('togrep : results : epoch {0} ; mean accuracy {1} :\ 256 | {2}'.format(epoch, eval_type, eval_acc)) 257 | 258 | if eval_type == 'valid' and epoch <= params.n_epochs: 259 | if eval_acc > val_acc_best: 260 | print('saving model at epoch {0}'.format(epoch)) 261 | if not os.path.exists(params.outputdir): 262 | os.makedirs(params.outputdir) 263 | torch.save(nli_net.state_dict(), os.path.join(params.outputdir, 264 | params.outputmodelname)) 265 | val_acc_best = eval_acc 266 | else: 267 | if 'sgd' in params.optimizer: 268 | optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / params.lrshrink 269 | print('Shrinking lr by : {0}. New lr = {1}' 270 | .format(params.lrshrink, 271 | optimizer.param_groups[0]['lr'])) 272 | if optimizer.param_groups[0]['lr'] < params.minlr: 273 | stop_training = True 274 | if 'adam' in params.optimizer: 275 | # early stopping (at 2nd decrease in accuracy) 276 | stop_training = adam_stop 277 | adam_stop = True 278 | return eval_acc 279 | 280 | 281 | """ 282 | Train model on Natural Language Inference task 283 | """ 284 | epoch = 1 285 | 286 | while not stop_training and epoch <= params.n_epochs: 287 | train_acc = trainepoch(epoch) 288 | eval_acc = evaluate(epoch, 'valid') 289 | epoch += 1 290 | 291 | # Run best model on test set. 292 | nli_net.load_state_dict(torch.load(os.path.join(params.outputdir, params.outputmodelname)), map_location={'cuda:1' : 'cuda:0', 'cuda:2' : 'cuda:0'}) 293 | 294 | print('\nTEST : Epoch {0}'.format(epoch)) 295 | evaluate(1e6, 'valid', True) 296 | evaluate(0, 'test', True) 297 | 298 | # Save encoder instead of full model 299 | torch.save(nli_net.encoder.state_dict(), os.path.join(params.outputdir, params.outputmodelname + '.encoder.pkl')) 300 | -------------------------------------------------------------------------------- /aion/embeddings/sentence_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Edward Ma. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os, datetime 17 | 18 | from .embeddings import Embeddings 19 | 20 | 21 | class SentenceEmbeddings(Embeddings): 22 | def __init__(self, verbose=0): 23 | self.verbose = verbose -------------------------------------------------------------------------------- /aion/embeddings/skip_thoughts.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Edward Ma. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os, datetime 17 | 18 | class SkipThoughtsEmbeddingsTorch: 19 | DICTIONARY_URL = "http://www.cs.toronto.edu/~rkiros/models/dictionary.txt" 20 | UNISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/utable.npy" 21 | BISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/btable.npy" 22 | UNISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz" 23 | BISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz" 24 | UNISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl" 25 | BISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl" 26 | 27 | def __init__(self, model_dir, algorithm='uniskip', tokenizer=None, verbose=0): 28 | super().__init__(verbose=verbose) 29 | 30 | from torch import LongTensor 31 | from torch.autograd import Variable 32 | from skipthoughts import UniSkip, BiSkip 33 | 34 | self.model_dir = model_dir 35 | self.algorithm = algorithm 36 | self.vocab = {} 37 | self.vocabs = [] 38 | if tokenizer is None: 39 | self.tokenizer = self._tokenizer_space 40 | else: 41 | self.tokenizer = tokenizer 42 | self.max_sentence_len = -1 43 | 44 | def downloads(self, dest_dir, sources=None): 45 | if sources is None: 46 | sources = [self.DICTIONARY_URL, self.UNISKIP_URL, self.BISKIP_URL, 47 | self.UNISKIPS_URL, self.BISKIPS_URL, self.UNISKIPS_PKL_URL, 48 | self.BISKIPS_PKL_URL] 49 | 50 | for src in sources: 51 | self.download(src=src, dest_dir=dest_dir, dest_file=None, unzip=False) 52 | 53 | def build_vocab(self, sentences, clear_vocab=True, max_sentence_len=-1): 54 | if clear_vocab: 55 | self.vocab = {} 56 | 57 | self.max_sentence_len = max_sentence_len 58 | 59 | for sentence in sentences: 60 | words = self.tokenizer(sentence) 61 | if max_sentence_len == -1: 62 | self.max_sentence_len = max(self.max_sentence_len, len(words)) 63 | 64 | for word in words: 65 | if word not in self.vocab: 66 | 
self.vocabs.append(word) 67 | # Reserve the first one for padding 68 | self.vocab[word] = len(self.vocab) + 1 69 | 70 | def process(self, sentences): 71 | word_id_sentences = [] 72 | for sentence in sentences: 73 | word_ids = [self.vocab[w] for w in self.tokenizer(sentence) if w in self.vocab] 74 | 75 | if self.max_sentence_len > len(word_ids): 76 | for i in range(0, self.max_sentence_len-len(word_ids)): 77 | word_ids.append(0) 78 | elif self.max_sentence_len < len(word_ids): 79 | word_ids = word_ids[:self.max_sentence_len] 80 | 81 | word_id_sentences.append(word_ids) 82 | 83 | return word_id_sentences 84 | 85 | def get_algorithm(self, words, model_dir=None): 86 | if model_dir is None: 87 | model_dir = self.model_dir 88 | 89 | if self.algorithm == 'uniskip': 90 | return UniSkip(model_dir, words) 91 | else: 92 | return BiSkip(model_dir, words) 93 | 94 | def to_numpy_layer(self, layer): 95 | return layer.detach().numpy() 96 | 97 | def encode(self, sentences, output_format='torch'): 98 | transformed_sentences = self.process(sentences) 99 | 100 | algo = self.get_algorithm(self.vocabs) 101 | inputs = Variable(LongTensor(transformed_sentences)) 102 | outpus = algo(inputs, lengths=[len(words) for words in transformed_sentences]) 103 | 104 | if output_format == 'np': 105 | return self.to_numpy_layer(outpus) 106 | elif output_format == 'torch': 107 | return outpus 108 | 109 | def predict_batch(self, sentences, output_format='torch', batch_size=1000): 110 | batches = [sentences[i * batch_size:(i + 1) * batch_size] for i in range((len(sentences) + batch_size-1) // batch_size)] 111 | 112 | results = [] 113 | for batch in batches: 114 | results.append(skip_thoughts_emb.predict(sentences=batch, output_format=output_format)) 115 | 116 | if output_format == 'np': 117 | return np.concatenate(results, axis=0) 118 | elif output_format == 'torch': 119 | return torch.cat(results, 0) -------------------------------------------------------------------------------- /aion/embeddings/word_embeddings.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | 4 | from .embeddings import Embeddings 5 | 6 | 7 | class WordEmbeddings(Embeddings): 8 | 9 | def __init__(self, 10 | handle_oov=True, oov_vector=None, oov_vector_type='zero', 11 | padding=True, pad_vector=None, pad_vector_type='zero', 12 | max_sequence_length=10, dimension=300, 13 | verbose=0): 14 | super().__init__(verbose=verbose) 15 | self.handle_oov = handle_oov 16 | self.oov_vector_type = oov_vector_type 17 | if handle_oov and oov_vector is None: 18 | if oov_vector_type == 'zero': 19 | self.oov_vector = np.zeros(dimension) 20 | elif oov_vector_type == 'random': 21 | self.oov_vector = np.random.rand(dimension) 22 | else: 23 | self.oov_vector = oov_vector 24 | 25 | self.padding = padding 26 | self.pad_vector_type = pad_vector_type 27 | if padding and pad_vector is None: 28 | if pad_vector_type == 'zero': 29 | self.pad_vector = np.zeros(dimension) 30 | elif pad_vector_type == 'random': 31 | self.pad_vector = np.random.rand(dimension) 32 | else: 33 | self.pad_vector = pad_vector 34 | 35 | self.max_sequence_length = max_sequence_length 36 | self.dimension = dimension 37 | 38 | def get_oov_vector(self): 39 | return self.oov_vector 40 | 41 | def set_oov_vector(self, oov_vector): 42 | self.oov_vector = oov_vector 43 | 44 | def get_pad_vector(self): 45 | return self.pad_vector 46 | 47 | def set_pad_vector(self, pad_vector): 48 | self.pad_vector = pad_vector 49 | 50 | def is_vector_exist(self, 
word): 51 | return word in self.model -------------------------------------------------------------------------------- /aion/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlp/2f12277b952dca39d8a392fb14e5f086a562d269/aion/helper/__init__.py -------------------------------------------------------------------------------- /aion/helper/file_helper.py: -------------------------------------------------------------------------------- 1 | import datetime, os, urllib.request, zipfile 2 | 3 | 4 | class FileHelper: 5 | def __init__(self, verbose=0): 6 | self.verbose = verbose 7 | 8 | def _log_time(self, status, msg, verbose): 9 | if self.verbose >= 0 or verbose >= 0: 10 | print('%s. [%s] %s' % (datetime.datetime.now(), status, msg)) 11 | 12 | def is_file_exist(self, file_path): 13 | return os.path.exists(file_path) 14 | 15 | def download(self, src, dest_dir, dest_file, uncompress=False, housekeep=False, force_download=False, verbose=0): 16 | if not os.path.exists(dest_dir): 17 | os.makedirs(dest_dir) 18 | 19 | # print('dest_dir:', dest_dir) 20 | 21 | if dest_file is None: 22 | dest_file = os.path.basename(src) 23 | 24 | # print('dest_file:', dest_file) 25 | 26 | if not self.is_file_exist(dest_dir + dest_file) or force_download: 27 | self._log_time(status='DOWNLOAD', msg='From '+src+' to '+dest_dir+dest_file, verbose=verbose) 28 | file = urllib.request.urlopen(src) 29 | with open(dest_dir + dest_file,'wb') as output: 30 | output.write(file.read()) 31 | else: 32 | self._log_time(status='FOUND', msg=dest_file+' in '+dest_dir, verbose=verbose) 33 | 34 | # if uncompress: 35 | # self.uncompress(dest_dir + dest_file) 36 | 37 | # if uncompress and housekeep: 38 | # self.housekeep(dest_dir + dest_file) 39 | 40 | return dest_dir + dest_file -------------------------------------------------------------------------------- /aion/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlp/2f12277b952dca39d8a392fb14e5f086a562d269/aion/util/__init__.py -------------------------------------------------------------------------------- /aion/util/spell_check.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | from collections import Counter 3 | from symspellpy.symspellpy import SymSpell as SymSpellPy, Verbosity 4 | 5 | class SpellCheck: 6 | def __init__(self, dictionary=None, verbose=0): 7 | self.verbose = verbose 8 | self.dictionary = dictionary 9 | 10 | def correction(self, text): 11 | return '' 12 | 13 | 14 | ''' 15 | Source: https://norvig.com/spell-correct.html 16 | ''' 17 | class SpellCorrector(SpellCheck): 18 | def __init__(self, dictionary, verbose=0): 19 | super().__init__(dictionary=dictionary, verbose=verbose) 20 | 21 | def words(text): 22 | return re.findall(r'\w+', text.lower()) 23 | 24 | def P(self, word): 25 | "Probability of `word`." 26 | N = sum(self.dictionary.values()) 27 | return self.dictionary[word] / N 28 | 29 | def correction(self, word): 30 | "Most probable spelling correction for word." 31 | return max(self.candidates(word), key=self.P) 32 | 33 | def candidates(self, word, verbose=0): 34 | "Generate possible spelling corrections for word." 
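        # Note on priority: an exact dictionary hit wins; otherwise fall back to
        # candidates one edit away, then two edits away, and finally return the
        # input word itself unchanged.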
35 | 36 | known_result = self.known([word]) 37 | edit1_result = self.known(self.edits1(word)) 38 | edit2_result = self.known(self.edits2(word)) 39 | 40 | if self.verbose > 0 or verbose > 0: 41 | print('Known Result: ', known_result) 42 | print('Edit1 Result: ', edit1_result) 43 | print('Edit2 Result: ', edit2_result) 44 | 45 | return (known_result or edit1_result or edit2_result or [word]) 46 | 47 | def known(self, words): 48 | "The subset of `words` that appear in the dictionary of WORDS." 49 | return set(w for w in words if w in self.dictionary) 50 | 51 | def edits1(self, word): 52 | "All edits that are one edit away from `word`." 53 | letters = 'abcdefghijklmnopqrstuvwxyz' 54 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 55 | deletes = [L + R[1:] for L, R in splits if R] 56 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] 57 | replaces = [L + c + R[1:] for L, R in splits if R for c in letters] 58 | inserts = [L + c + R for L, R in splits for c in letters] 59 | return set(deletes + transposes + replaces + inserts) 60 | 61 | def edits2(self, word): 62 | "All edits that are two edits away from `word`." 63 | return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1)) 64 | 65 | 66 | class SymSpell(SpellCheck): 67 | def __init__(self, dictionary_file_path='', dictionary=None, verbose=0): 68 | super().__init__(dictionary=dictionary, verbose=verbose) 69 | 70 | self.dictionary_file_path = dictionary_file_path 71 | self.model = None 72 | 73 | def load_vocab(self, corpus_file_path, max_edit_distance_dictionary=2, prefix_length=5): 74 | #initial_capacity = len(corpus) 75 | 76 | #sym_spell = SymSpellPy( 77 | # initial_capacity, max_edit_distance_dictionary, 78 | # prefix_length) 79 | self.model = SymSpellPy( 80 | max_dictionary_edit_distance=max_edit_distance_dictionary, 81 | prefix_length=prefix_length) 82 | 83 | term_index = 0 # column of the term in the dictionary text file 84 | count_index = 1 # column of the term frequency in the dictionary text file 85 | if not self.model.load_dictionary(corpus_file_path, term_index, count_index): 86 | print("Dictionary file not found") 87 | 88 | def build_vocab(self, dictionary, file_dir, file_name, verbose=0): 89 | if not os.path.exists(file_dir): 90 | os.makedirs(file_dir) 91 | 92 | """ 93 | Data format: 94 | token, frequency 95 | Example: 96 | edward 154 97 | edwards 50 98 | ... 
99 | """ 100 | if self.verbose > 3 or verbose > 3: 101 | print('Size of dictionary: %d' % len(dictionary)) 102 | 103 | with open(file_dir + file_name, "w") as text_file: 104 | for token, count in dictionary.items(): 105 | text_file.write(token + ' ' + str(count)) 106 | text_file.write('\n') 107 | 108 | def correction(self, word, max_edit_distance_lookup=2, mode='cloest'): 109 | if mode == 'cloest': 110 | suggestion_verbosity = Verbosity.CLOSEST 111 | elif mode == 'top': 112 | suggestion_verbosity = Verbosity.TOP 113 | elif mode == 'all': 114 | suggestion_verbosity = Verbosity.ALL 115 | 116 | results = self.model.lookup( 117 | word, suggestion_verbosity, max_edit_distance_lookup) 118 | 119 | results = [{'word': suggestion.term, 'count': suggestion.count, 'distance': suggestion.distance} for suggestion in results] 120 | return results 121 | 122 | def corrections(self, sentence, max_edit_distance_lookup=2): 123 | normalized_sentence = (sentence.lower()) 124 | results = self.model.lookup_compound( 125 | normalized_sentence, max_edit_distance_lookup) 126 | 127 | results = [{'word': suggestion.term, 'distance': suggestion.distance} for suggestion in results] 128 | return results 129 | -------------------------------------------------------------------------------- /sample/embeddings/nlp-embeddings-document-doc2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Ingestion" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Train: 2031\n", 20 | "Val: 226\n", 21 | "Test: 1502\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import numpy as np\n", 27 | "from sklearn.datasets import fetch_20newsgroups\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "\n", 30 | "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n", 31 | "\n", 32 | "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n", 33 | "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n", 34 | "\n", 35 | "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n", 36 | "x_test = np.array(test_raw_df.data)\n", 37 | "y_test = test_raw_df.target\n", 38 | "\n", 39 | "# x_train = [x_train[:200] for x in x_train]\n", 40 | "\n", 41 | "print('Train:', len(x_train))\n", 42 | "print('Val:', len(x_val))\n", 43 | "print('Test:', len(x_test))" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "/data/jupyter/common\n", 56 | "Added /data/jupyter/common into sys.path.\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "%reload_ext autoreload\n", 62 | "%autoreload 2\n", 63 | "\n", 64 | "import sys, os\n", 65 | "def add_aion(curr_path=None):\n", 66 | " if curr_path is None:\n", 67 | " dir_path = os.getcwd()\n", 68 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n", 69 | " print(target_path)\n", 70 | " if target_path not in sys.path:\n", 71 | " print('Added %s into sys.path.' 
% (target_path))\n", 72 | " sys.path.insert(0, target_path)\n", 73 | " \n", 74 | "add_aion()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# Model" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from aion.embeddings.doc2vec import Doc2VecEmbeddings" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "2018-10-08 22:52:10.269082 start\n", 105 | "2018-10-08 22:53:30.387969 end\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "doc2vec_embs = Doc2VecEmbeddings()\n", 111 | "x_train_tokens = doc2vec_embs.build_vocab(documents=x_train)\n", 112 | "doc2vec_embs.train(x_train_tokens)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 8, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "x_train_t = doc2vec_embs.encode(documents=x_train)\n", 124 | "x_test_t = doc2vec_embs.encode(documents=x_test)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 9, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "from sklearn.linear_model import LogisticRegression\n", 136 | "\n", 137 | "model = LogisticRegression(solver='newton-cg', max_iter=1000)\n", 138 | "model.fit(x_train_t, y_train)\n", 139 | "\n", 140 | "y_pred = model.predict(x_test_t)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 10, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Accuracy:52.80%\n", 153 | "Classification Report:\n", 154 | " precision recall f1-score support\n", 155 | "\n", 156 | " 0 0.56 0.17 0.26 319\n", 157 | " 1 0.82 0.63 0.72 389\n", 158 | " 2 0.85 0.31 0.45 396\n", 159 | " 3 0.38 0.93 0.54 398\n", 160 | "\n", 161 | "avg / total 0.66 0.53 0.50 1502\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "from sklearn.metrics import accuracy_score\n", 168 | "from sklearn.metrics import classification_report\n", 169 | "\n", 170 | "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n", 171 | "print('Classification Report:')\n", 172 | "print(classification_report(y_test, y_pred))" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.5.2" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /sample/nlp-distance-edit_distance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Edit Distance" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Edit Distance for \"edward\" and 
\"edwin\" is 3\n", 20 | "Edit Distance for \"Edward\" and \"edwin\" is 4\n" 21 | ] 22 | } 23 | ], 24 | "source": [ 25 | "import editdistance\n", 26 | "\n", 27 | "data = ['edward', 'Edward']\n", 28 | "\n", 29 | "for record in data:\n", 30 | " dist = editdistance.eval(record, 'edwin')\n", 31 | " print('Edit Distance for \"%s\" and \"%s\" is %d' % (record, 'edwin', dist))" 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.5.2" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /sample/nlp-embeddings-sentence-infersent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Ingestion" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Train: 2031\n", 20 | "Val: 226\n", 21 | "Test: 1502\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import numpy as np\n", 27 | "from sklearn.datasets import fetch_20newsgroups\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "\n", 30 | "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n", 31 | "\n", 32 | "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n", 33 | "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n", 34 | "\n", 35 | "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n", 36 | "x_test = np.array(test_raw_df.data)\n", 37 | "y_test = test_raw_df.target\n", 38 | "\n", 39 | "# x_train = [x_train[:200] for x in x_train]\n", 40 | "\n", 41 | "print('Train:', len(x_train))\n", 42 | "print('Val:', len(x_val))\n", 43 | "print('Test:', len(x_test))" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Added /data/jupyter/common into sys.path.\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "%reload_ext autoreload\n", 61 | "%autoreload 2\n", 62 | "\n", 63 | "import sys, os\n", 64 | "def add_aion(curr_path=None):\n", 65 | " if curr_path is None:\n", 66 | " dir_path = os.getcwd()\n", 67 | " target_path = os.path.dirname(dir_path)\n", 68 | " if target_path not in sys.path:\n", 69 | " print('Added %s into sys.path.' 
% (target_path))\n", 70 | " sys.path.insert(0, target_path)\n", 71 | " \n", 72 | "add_aion()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# Model" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "[nltk_data] Downloading package punkt to /home/dscoe/nltk_data...\n", 92 | "[nltk_data] Package punkt is already up-to-date!\n" 93 | ] 94 | }, 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "True" 99 | ] 100 | }, 101 | "execution_count": 3, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "import nltk\n", 108 | "nltk.download('punkt')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "from aion.embeddings.infersent import InferSentEmbeddings\n", 118 | "\n", 119 | "infer_sent_embs = InferSentEmbeddings(word_embeddings_dir='../model/text/stanford/glove/', verbose=20)\n", 120 | "infer_sent_embs.load_model(dest_dir='../model/text/facebook/infersent/')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "Found 22119(/46170) words with w2v vectors\n", 133 | "Vocab size : 22119\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "infer_sent_embs.build_vocab(x_train, tokenize=True)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stderr", 148 | "output_type": "stream", 149 | "text": [ 150 | "/data/jupyter/common/aion/embeddings/infersent_lib/models.py:222: UserWarning: volatile was removed and now has no effect. 
Use `with torch.no_grad():` instead.\n", 151 | " sentences[stidx:stidx + bsize]), volatile=True)\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "x_train_t = infer_sent_embs.encode(x_train, tokenize=True)\n", 157 | "x_test_t = infer_sent_embs.encode(x_test, tokenize=True)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 7, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "/anaconda/envs/py35/lib/python3.5/site-packages/scipy/optimize/linesearch.py:313: LineSearchWarning: The line search algorithm did not converge\n", 170 | " warn('The line search algorithm did not converge', LineSearchWarning)\n", 171 | "/anaconda/envs/py35/lib/python3.5/site-packages/sklearn/utils/optimize.py:195: UserWarning: Line Search failed\n", 172 | " warnings.warn('Line Search failed')\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "from sklearn.linear_model import LogisticRegression\n", 178 | "\n", 179 | "model = LogisticRegression(solver='newton-cg', max_iter=1000)\n", 180 | "model.fit(x_train_t, y_train)\n", 181 | "\n", 182 | "y_pred = model.predict(x_test_t)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "Accuracy:86.55%\n", 195 | "Classification Report:\n", 196 | " precision recall f1-score support\n", 197 | "\n", 198 | " 0 0.85 0.76 0.80 319\n", 199 | " 1 0.86 0.95 0.91 389\n", 200 | " 2 0.95 0.79 0.86 396\n", 201 | " 3 0.82 0.94 0.87 398\n", 202 | "\n", 203 | "avg / total 0.87 0.87 0.86 1502\n", 204 | "\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "from sklearn.metrics import accuracy_score\n", 210 | "from sklearn.metrics import classification_report\n", 211 | "\n", 212 | "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n", 213 | "print('Classification Report:')\n", 214 | "print(classification_report(y_test, y_pred))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": true 222 | }, 223 | "outputs": [], 224 | "source": [] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": "Python 3", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.5.2" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 2 248 | } 249 | -------------------------------------------------------------------------------- /sample/nlp-embeddings-word-cove.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Ingestion" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Train: 2031\n", 20 | "Val: 226\n", 21 | "Test: 1502\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import numpy as np\n", 27 | "from sklearn.datasets import fetch_20newsgroups\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "\n", 30 | "categories = ['alt.atheism', 'soc.religion.christian', 
'comp.graphics', 'sci.med']\n", 31 | "\n", 32 | "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n", 33 | "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n", 34 | "\n", 35 | "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n", 36 | "x_test = np.array(test_raw_df.data)\n", 37 | "y_test = test_raw_df.target\n", 38 | "\n", 39 | "# x_train = [x_train[:200] for x in x_train]\n", 40 | "\n", 41 | "print('Train:', len(x_train))\n", 42 | "print('Val:', len(x_val))\n", 43 | "print('Test:', len(x_test))" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Added /data/jupyter/common into sys.path.\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "%reload_ext autoreload\n", 61 | "%autoreload 2\n", 62 | "\n", 63 | "import sys, os\n", 64 | "def add_aion(curr_path=None):\n", 65 | " if curr_path is None:\n", 66 | " dir_path = os.getcwd()\n", 67 | " target_path = os.path.dirname(dir_path)\n", 68 | " if target_path not in sys.path:\n", 69 | " print('Added %s into sys.path.' % (target_path))\n", 70 | " sys.path.insert(0, target_path)\n", 71 | " \n", 72 | "add_aion()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# Model (Keras)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "max_sequence_length = 200" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stderr", 100 | "output_type": "stream", 101 | "text": [ 102 | "Using TensorFlow backend.\n" 103 | ] 104 | }, 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "2018-10-06 16:04:42.665310. [FOUND] Keras_CoVe.h5 in ../model/text/salesforce/cove/\n" 110 | ] 111 | }, 112 | { 113 | "name": "stderr", 114 | "output_type": "stream", 115 | "text": [ 116 | "/anaconda/envs/py35/lib/python3.5/site-packages/keras/engine/saving.py:269: UserWarning: No training configuration found in save file: the model was *not* compiled. 
Compile it manually.\n", 117 | " warnings.warn('No training configuration found in save file: '\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "from aion.embeddings.cove import CoVeEmbeddings\n", 123 | "\n", 124 | "cove_embs = CoVeEmbeddings(\n", 125 | " word_embeddings_dir='../model/text/stanford/glove/', \n", 126 | " max_sequence_length=max_sequence_length, verbose=20)\n", 127 | "tmp = cove_embs.load_model(dest_dir='../model/text/salesforce/cove/')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "x_train_t = cove_embs.encode(x_train)\n", 137 | "x_test_t = cove_embs.encode(x_test)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "x_train_t2 = x_train_t.reshape(len(x_train_t), max_sequence_length*600)\n", 149 | "x_test_t2 = x_test_t.reshape(len(x_test_t), max_sequence_length*600)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stderr", 159 | "output_type": "stream", 160 | "text": [ 161 | "/anaconda/envs/py35/lib/python3.5/site-packages/scipy/optimize/linesearch.py:313: LineSearchWarning: The line search algorithm did not converge\n", 162 | " warn('The line search algorithm did not converge', LineSearchWarning)\n", 163 | "/anaconda/envs/py35/lib/python3.5/site-packages/sklearn/utils/optimize.py:195: UserWarning: Line Search failed\n", 164 | " warnings.warn('Line Search failed')\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "from sklearn.linear_model import LogisticRegression\n", 170 | "\n", 171 | "model = LogisticRegression(solver='newton-cg')\n", 172 | "model.fit(x_train_t2, y_train)\n", 173 | "\n", 174 | "y_pred = model.predict(x_test_t2)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "For sake of easier demonstration, I did not do any data preprocessing. It leads lots of OOV and causing the result bad." 
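A minimal sketch of the kind of cleanup that could cut down OOV before encoding. It is not part of the original notebook; the clean() helper and its rules (lowercasing, keeping letters only) are illustrative assumptions rather than the author's pipeline.

import re

def clean(texts):
    # Lowercase, keep letters only and collapse whitespace so that more tokens
    # match the GloVe vocabulary backing CoVe.
    cleaned = []
    for text in texts:
        text = text.lower()
        text = re.sub(r'[^a-z\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        cleaned.append(text)
    return cleaned

# Hypothetical usage before encoding:
# x_train_t = cove_embs.encode(clean(x_train))
# x_test_t = cove_embs.encode(clean(x_test))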
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "Accuracy:32.56%\n", 194 | "Classification Report:\n", 195 | " precision recall f1-score support\n", 196 | "\n", 197 | " 0 0.26 0.03 0.05 319\n", 198 | " 1 0.34 0.61 0.43 389\n", 199 | " 2 0.33 0.05 0.08 396\n", 200 | " 3 0.32 0.56 0.41 398\n", 201 | "\n", 202 | "avg / total 0.31 0.33 0.25 1502\n", 203 | "\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "from sklearn.metrics import accuracy_score\n", 209 | "from sklearn.metrics import classification_report\n", 210 | "\n", 211 | "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n", 212 | "print('Classification Report:')\n", 213 | "print(classification_report(y_test, y_pred))" 214 | ] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.5.2" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /sample/nlp-lsa_lda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 2 latent methods for dimension reduction and topic modeling\n", 8 | "\n", 9 | "![](https://cdn.pixabay.com/photo/2015/11/07/11/17/golden-gate-bridge-1030999_960_720.jpg)\n", 10 | "Photo: https://pixabay.com/en/golden-gate-bridge-women-back-1030999/\n", 11 | "\n", 12 | "Before the state-of-the-art word embedding technique, Latent Semantic Analysis (LSA) and Latent Dirichlet Allocation (LDA) area good approaches to deal with NLP problems. Both LSA and LDA have same input which is Bag of words in matrix format. LSA focus on reducing matrix dimension while LDA solves topic modeling problems.\n", 13 | "\n", 14 | "I will not go through mathematical detail and as there is lot of great material for that. You may check it from reference. For the sake of keeping it easy to understand, I did not do pre-processing such as stopwords removal. It is critical part when you use LSA, LSI and LDA. After reading this article, you will know:\n", 15 | "- Latent Semantic Analysis (LSA)\n", 16 | "- Latent Dirichlet Allocation (LDA)\n", 17 | "- Take Away" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "from sklearn.datasets import fetch_20newsgroups\n", 29 | "train_raw = fetch_20newsgroups(subset='train')\n", 30 | "test_raw = fetch_20newsgroups(subset='test')\n", 31 | "\n", 32 | "x_train = train_raw.data\n", 33 | "y_train = train_raw.target\n", 34 | "\n", 35 | "x_test = test_raw.data\n", 36 | "y_test = test_raw.target" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# Latent Senmantic Analysis (LSA)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "The idea is that words will occurs in similar pieces of text if they have similar meaning. 
People usually use Latent Semantic Indexing (LSI) as an alternative name in NLP field.\n", 51 | "\n", 52 | "First of all, we have m documents and n words as input. An m * n matrix can be constructed while column and row are document and word respectively. You can use count occurrence or TF-IDF score. However, TF-IDF is better than count occurrence in most of the time as high frequency do not account for better classification.\n", 53 | "\n", 54 | "![](https://1.bp.blogspot.com/-tnzPA6dDtTU/Vw6EWm_PjCI/AAAAAAABDwI/JatHtUJb4fsce9E-Ns5t02_nakFtGrsugCLcB/s1600/%25E8%259E%25A2%25E5%25B9%2595%25E5%25BF%25AB%25E7%2585%25A7%2B2016-04-14%2B%25E4%25B8%258A%25E5%258D%25881.39.07.png)\n", 55 | "Photo: http://mropengate.blogspot.com/2016/04/tf-idf-in-r-language.html\n", 56 | "\n", 57 | "The idea of TF-IDF is that high frequency may not able to provide much information gain. In another word, rare words contribute more weights to the model. Word importance will be increased if the number of occurrence within same document (i.e. training record). On the other hand, it will be decreased if it occurs in corpus (i.e. other training records). For detail, you may check this [blog](https://towardsdatascience.com/3-basic-approaches-in-bag-of-words-which-are-better-than-word-embeddings-c2cbc7398016).\n", 58 | "\n", 59 | "The challenge is that the matrix is very sparse (or high dimension) and noisy (or include lots of low frequency word). So truncated SVD is adopted to reduce dimension.\n", 60 | "\n", 61 | "![]()\n", 62 | "\n", 63 | "The idea of SVD is finding the most valuable information and using lower dimension t to represent same thing." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 26, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "TF-IDF output shape: (11314, 130107)\n", 76 | "LSA output shape: (11314, 50)\n", 77 | "Sum of explained variance ratio: 8%\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 83 | "from sklearn.decomposition import TruncatedSVD\n", 84 | "\n", 85 | "def build_lsa(x_train, x_test, dim=50):\n", 86 | " tfidf_vec = TfidfVectorizer(use_idf=True, norm='l2')\n", 87 | " svd = TruncatedSVD(n_components=dim)\n", 88 | " \n", 89 | " transformed_x_train = tfidf_vec.fit_transform(x_train)\n", 90 | " transformed_x_test = tfidf_vec.transform(x_test)\n", 91 | " \n", 92 | " print('TF-IDF output shape:', transformed_x_train.shape)\n", 93 | " \n", 94 | " x_train_svd = svd.fit_transform(transformed_x_train)\n", 95 | " x_test_svd = svd.transform(transformed_x_test)\n", 96 | " \n", 97 | " print('LSA output shape:', x_train_svd.shape)\n", 98 | " \n", 99 | " explained_variance = svd.explained_variance_ratio_.sum()\n", 100 | " print(\"Sum of explained variance ratio: %d%%\" % (int(explained_variance * 100)))\n", 101 | " \n", 102 | " return tfidf_vec, svd, x_train_svd, x_test_svd\n", 103 | "\n", 104 | "\n", 105 | "tfidf_vec, svd, x_train_lda, x_test_lda = build_lsa(x_train, x_test)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "We can see that the dimension reduces from 130k to 50 only." 
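As a hedged aside (not in the original notebook): instead of hard-coding dim=50, one could fit a single larger TruncatedSVD and keep the smallest dimension whose cumulative explained variance clears a target. The 1000-component cap and 0.4 target below are illustrative assumptions.

import numpy as np
from sklearn.decomposition import TruncatedSVD

def pick_svd_dim(tfidf_matrix, max_dim=1000, target_ratio=0.4):
    # Fit one large SVD, then take the smallest number of components whose
    # cumulative explained variance reaches the target ratio.
    svd = TruncatedSVD(n_components=max_dim)
    svd.fit(tfidf_matrix)
    cumulative = np.cumsum(svd.explained_variance_ratio_)
    return int(min(np.searchsorted(cumulative, target_ratio) + 1, max_dim))

# Hypothetical usage with the TF-IDF matrix from build_lsa:
# dim = pick_svd_dim(tfidf_vec.fit_transform(x_train))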
113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 27, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Accuracy: 0.6511 (+/- 0.0201)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "from sklearn.linear_model import LogisticRegression\n", 130 | "from sklearn.model_selection import cross_val_score, KFold\n", 131 | "\n", 132 | "lr_model = LogisticRegression(solver='newton-cg',n_jobs=-1)\n", 133 | "lr_model.fit(x_train_svd, y_train)\n", 134 | "\n", 135 | "cv = KFold(n_splits=5, shuffle=True)\n", 136 | " \n", 137 | "scores = cross_val_score(lr_model, x_test_svd, y_test, cv=cv, scoring='accuracy')\n", 138 | "print(\"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Latent Dirichlet Allocation (LDA)\n", 146 | "\n", 147 | "LDA is introduced by David Blei, Andrew Ng and Michael O. Jordan in 2003. It is unsupervised learning and topic model is the typical example. The assumption is that each document mix with various topics and every topic mix with various words.\n", 148 | "\n", 149 | "![]()\n", 150 | "\n", 151 | "Intuitively, you can image that we have two layer of aggregations. First layer is the distribution of categories. For example, we have finance news, weather news and political news. Second layer is distribution of words within the category. For instance, we can find \"sunny\" and \"cloud\" in weather news while \"money\" and \"stock\" exists in finance news. \n", 152 | "\n", 153 | "However, \"a\", \"with\" and \"can\" do not contribute on topic modeling problem. Those words exist among documents and will have roughly same probability between categories. Therefore, stopwords removal is a critical step to achieve a better result.\n", 154 | "\n", 155 | "![]()\n", 156 | "\n", 157 | "For particular document d, we get the topic distribution which is θ. From this distribution(θ), topic t will be chosen and selecting corresponding word from ϕ." 
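The build_lda demo below keeps stopwords for simplicity. As a hedged variant (an illustrative tweak, not the author's code), the vectorizer can drop English stopwords so that words like "a", "with" and "can" stop dominating every topic.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def build_lda_without_stopwords(x_train, num_of_topic=10):
    # stop_words='english' removes high-frequency function words before
    # fitting, which usually yields more interpretable topics.
    vec = CountVectorizer(stop_words='english')
    transformed_x_train = vec.fit_transform(x_train)

    lda = LatentDirichletAllocation(
        n_components=num_of_topic, max_iter=5,
        learning_method='online', random_state=0)
    lda.fit(transformed_x_train)

    return lda, vec, vec.get_feature_names()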
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 44, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "Topic 0:\n", 170 | "['the', 'for', 'and', 'to', 'edu']\n", 171 | "Topic 1:\n", 172 | "['c_', 'w7', 'hz', 'mv', 'ck']\n", 173 | "Topic 2:\n", 174 | "['space', 'nasa', 'cmu', 'science', 'edu']\n", 175 | "Topic 3:\n", 176 | "['the', 'to', 'of', 'for', 'and']\n", 177 | "Topic 4:\n", 178 | "['the', 'to', 'of', 'and', 'in']\n", 179 | "Topic 5:\n", 180 | "['the', 'of', 'and', 'in', 'were']\n", 181 | "Topic 6:\n", 182 | "['edu', 'team', 'he', 'game', '10']\n", 183 | "Topic 7:\n", 184 | "['ax', 'max', 'g9v', 'b8f', 'a86']\n", 185 | "Topic 8:\n", 186 | "['db', 'bike', 'ac', 'image', 'dod']\n", 187 | "Topic 9:\n", 188 | "['nec', 'mil', 'navy', 'sg', 'behanna']\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "from sklearn.feature_extraction.text import CountVectorizer\n", 194 | "from sklearn.decomposition import LatentDirichletAllocation\n", 195 | "\n", 196 | "def build_lda(x_train, num_of_topic=10):\n", 197 | " vec = CountVectorizer()\n", 198 | " transformed_x_train = vec.fit_transform(x_train)\n", 199 | " feature_names = vec.get_feature_names()\n", 200 | "\n", 201 | " lda = LatentDirichletAllocation(\n", 202 | " n_components=num_of_topic, max_iter=5, \n", 203 | " learning_method='online', random_state=0)\n", 204 | " lda.fit(transformed_x_train)\n", 205 | "\n", 206 | " return lda, vec, feature_names\n", 207 | "\n", 208 | "def display_word_distribution(model, feature_names, n_word):\n", 209 | " for topic_idx, topic in enumerate(model.components_):\n", 210 | " print(\"Topic %d:\" % (topic_idx))\n", 211 | " words = []\n", 212 | " for i in topic.argsort()[:-n_word - 1:-1]:\n", 213 | " words.append(feature_names[i])\n", 214 | " print(words)\n", 215 | "\n", 216 | "lda_model, vec, feature_names = build_lda(x_train)\n", 217 | "display_word_distribution(\n", 218 | " model=lda_model, feature_names=feature_names, \n", 219 | " n_word=5)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "# Take Away\n", 227 | "- Both of them use __Bag-of-words as input matrix__\n", 228 | "- The challenge of SVD is that we are __hard to determine the optimal number of dimension__. In general, low dimension consume less resource but we may not able to distinguish opposite meaning words while high dimension overcome it but consuming more resource." 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "# About Me\n", 236 | "I am Data Scientist in Bay Area. Focusing on state-of-the-art in Data Science, Artificial Intelligence , especially in NLP and platform related. You can reach me from [Medium Blog](https://medium.com/@makcedward) or [Github](https://github.com/makcedward)." 
237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "# Reference\n", 244 | "- [1] SVD Tutorial: https://cs.fit.edu/~dmitra/SciComp/Resources/singular-value-decomposition-fast-track-tutorial.pdf\n", 245 | "- [2] CUHK LSI Tutorial: http://www1.se.cuhk.edu.hk/~seem5680/lecture/LSI-Eg.pdf\n", 246 | "- [3] Stanford LSI Tutorial: https://nlp.stanford.edu/IR-book/pdf/18lsi.pdf\n", 247 | "- [4] LSA and LDA Explanation: https://cs.stanford.edu/~ppasupat/a9online/1140.html" 248 | ] 249 | } 250 | ], 251 | "metadata": { 252 | "kernelspec": { 253 | "display_name": "Python 3", 254 | "language": "python", 255 | "name": "python3" 256 | }, 257 | "language_info": { 258 | "codemirror_mode": { 259 | "name": "ipython", 260 | "version": 3 261 | }, 262 | "file_extension": ".py", 263 | "mimetype": "text/x-python", 264 | "name": "python", 265 | "nbconvert_exporter": "python", 266 | "pygments_lexer": "ipython3", 267 | "version": "3.5.2" 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 2 272 | } 273 | -------------------------------------------------------------------------------- /sample/nlp-named_entity_recognition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Named Entity Recognition](https://cdn-images-1.medium.com/max/800/0*6qNBX5v1XFr1pMvr.jpg)\n", 8 | "Source: https://hackernoon.com/named-entity-recognition-applications-and-use-cases-c2ef0904e9fe" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "ner_dir = '/stanford/ner/'" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Copy from https://en.wikipedia.org/wiki/Stanford_University\n", 31 | "\n", 32 | "article = \"The university was founded in 1885 by Leland and Jane Stanford in memory of \\\n", 33 | "their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous \\\n", 34 | "year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. \\\n", 35 | "The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "article2 = 'New York, New York , NY N.Y. 
new york'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Stanford NER" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "NTLK Version: 3.2.5\n" 66 | ] 67 | }, 68 | { 69 | "name": "stderr", 70 | "output_type": "stream", 71 | "text": [ 72 | "/anaconda/envs/py35/lib/python3.5/site-packages/nltk/tag/stanford.py:183: DeprecationWarning: \n", 73 | "The StanfordTokenizer will be deprecated in version 3.2.5.\n", 74 | "Please use \u001b[91mnltk.tag.corenlp.CoreNLPPOSTagger\u001b[0m or \u001b[91mnltk.tag.corenlp.CoreNLPNERTagger\u001b[0m instead.\n", 75 | " super(StanfordNERTagger, self).__init__(*args, **kwargs)\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "import nltk\n", 81 | "print('NTLK Version: %s' % nltk.__version__)\n", 82 | "\n", 83 | "from nltk.tag import StanfordNERTagger\n", 84 | "\n", 85 | "stanford_ner_tagger = StanfordNERTagger(\n", 86 | " ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz',\n", 87 | " ner_dir + 'stanford-ner-3.9.1.jar'\n", 88 | ")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 11, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "results = stanford_ner_tagger.tag(article.split())" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 22, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n", 112 | "\n", 113 | "Type: LOCATION, Value: New\n", 114 | "Type: LOCATION, Value: York\n", 115 | "Type: LOCATION, Value: NY\n", 116 | "Type: LOCATION, Value: N.Y.\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "print('Original Sentence: %s' % (article))\n", 122 | "print()\n", 123 | "for result in results:\n", 124 | " tag_value = result[0]\n", 125 | " tag_type = result[1]\n", 126 | " if tag_type != 'O':\n", 127 | " print('Type: %s, Value: %s' % (tag_type, tag_value))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 14, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "results = stanford_ner_tagger.tag(article2.split())" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 21, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "Original Sentence: New York, New York , NY N.Y. 
new york\n", 151 | "\n", 152 | "Type: LOCATION, Value: New\n", 153 | "Type: LOCATION, Value: York\n", 154 | "Type: LOCATION, Value: NY\n", 155 | "Type: LOCATION, Value: N.Y.\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "print('Original Sentence: %s' % (article2))\n", 161 | "print()\n", 162 | "for result in results:\n", 163 | " tag_value = result[0]\n", 164 | " tag_type = result[1]\n", 165 | " if tag_type != 'O':\n", 166 | " print('Type: %s, Value: %s' % (tag_type, tag_value))" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "# NLTK NE" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 25, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "NTLK version: 3.2.5\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "import nltk\n", 191 | "\n", 192 | "print('NTLK version: %s' % (nltk.__version__))\n", 193 | "\n", 194 | "from nltk import word_tokenize, pos_tag, ne_chunk\n", 195 | "\n", 196 | "nltk.download('words')\n", 197 | "nltk.download('averaged_perceptron_tagger')\n", 198 | "nltk.download('punkt')\n", 199 | "nltk.download('maxent_ne_chunker')" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 43, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "results = ne_chunk(pos_tag(word_tokenize(article)))" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 44, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n", 223 | "\n", 224 | " (GPE Leland/NNP)\n", 225 | " (PERSON Jane/NNP Stanford/NNP)\n", 226 | " (GPE Leland/NNP)\n", 227 | " Stanford/NNP\n", 228 | " Jr./NNP\n", 229 | " (PERSON Stanford/NNP)\n", 230 | " Governor/NNP\n", 231 | " (GPE California/NNP)\n", 232 | " (GPE U.S/NNP)\n", 233 | " Senator/NNP\n", 234 | " October/NNP\n", 235 | " ]/NNP\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print('Original Sentence: %s' % (article))\n", 241 | "print()\n", 242 | "for x in str(results).split('\\n'):\n", 243 | " if '/NNP' in x:\n", 244 | " print(x)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 45, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "results = ne_chunk(pos_tag(word_tokenize(article2)))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 46, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Original Sentence: New York, New York , NY N.Y. 
new york\n", 268 | "\n", 269 | " (GPE New/NNP York/NNP)\n", 270 | " (GPE New/NNP York/NNP)\n", 271 | " (ORGANIZATION NY/NNP)\n", 272 | " N.Y./NNP\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "print('Original Sentence: %s' % (article2))\n", 278 | "print()\n", 279 | "for x in str(results).split('\\n'):\n", 280 | " if '/NNP' in x:\n", 281 | " print(x)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "# Spacy" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 7, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "spaCy: 2.0.11\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "import spacy\n", 306 | "\n", 307 | "print('spaCy: %s' % (spacy.__version__))" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 8, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "spacy_nlp = spacy.load('en')" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 20, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n", 331 | "\n", 332 | "Type: DATE, Value: 1885\n", 333 | "Type: GPE, Value: Leland\n", 334 | "Type: PERSON, Value: Jane Stanford\n", 335 | "Type: PERSON, Value: Leland Stanford Jr.\n", 336 | "Type: DATE, Value: age 15 the previous year\n", 337 | "Type: ORG, Value: Stanford\n", 338 | "Type: GPE, Value: California\n", 339 | "Type: GPE, Value: U.S.\n", 340 | "Type: ORDINAL, Value: first\n", 341 | "Type: DATE, Value: October 1, 1891,[2][3\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "document = spacy_nlp(article)\n", 347 | "\n", 348 | "print('Original Sentence: %s' % (article))\n", 349 | "print()\n", 350 | "for element in document.ents:\n", 351 | " print('Type: %s, Value: %s' % (element.label_, element))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 24, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "Original Sentence: New York, New York , NY N.Y. 
new york\n", 364 | "\n", 365 | "Type: GPE, Value: New York\n", 366 | "Type: GPE, Value: New York\n", 367 | "Type: GPE, Value: NY N.Y.\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "document = spacy_nlp(article2)\n", 373 | "\n", 374 | "print('Original Sentence: %s' % (article2))\n", 375 | "print()\n", 376 | "for element in document.ents:\n", 377 | " print('Type: %s, Value: %s' % (element.label_, element))" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 7, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": "Python 3", 402 | "language": "python", 403 | "name": "python3" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.5.2" 416 | } 417 | }, 418 | "nbformat": 4, 419 | "nbformat_minor": 2 420 | } 421 | -------------------------------------------------------------------------------- /sample/nlp-part_of_speech.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Part of Speech](https://dailygenius.com/wp-content/uploads/2014/09/handwriting1.jpg)\n", 8 | "\n", 9 | "Source: https://dailygenius.com/handwriting-helps-learn-graphic/" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Part of Speech\n", 17 | "\n", 18 | "Part of Speech, aka POS, is referring to category of words. Same category of words can represent similar behavior. For example, \"Word\" is a noun while \"Run\" is a verb. To have a better understanding on article, we have to know the POS. \n", 19 | "\n", 20 | "In NLP, POS is an important part but we may not always deal with it directly. Lemmanization and Stemming process relies on POS but some libraries (e.g. spaCy) is very nice that helped us to tackle it.\n", 21 | "\n", 22 | "In English, we have noun, adjective, conjunction etc. Sometimes, same word can have both verb and noun. In Chinese, two major categories are Content Word and Function words which including noun, adverb, conjunction as well. \n", 23 | "This article includes how we can do it for English (via spaCy) and Chinese (via jieba)." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# Catpure from https://en.wikipedia.org/wiki/Part_of_speech\n", 35 | "\n", 36 | "article = 'In traditional grammar, a part of speech (abbreviated form: PoS or POS) is \\\n", 37 | "a category of words (or, more generally, of lexical items) which have similar grammatical properties. 
'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# Catpure from https://zh.wikipedia.org/wiki/%E8%A9%9E%E9%A1%9E\n", 49 | "\n", 50 | "article2 = '詞類是一個語言學術語,是一種語言中詞的語法分類,是以語法特徵\\\n", 51 | "(包括句法功能和形態變化)為主要依據、兼顧詞彙意義對詞進行劃分的結果。'" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### spaCy" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "spaCy Version: 2.0.11\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "import spacy\n", 76 | "print('spaCy Version: %s' % (spacy.__version__))\n", 77 | "spacy_nlp = spacy.load('en_core_web_sm')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "\"a\" is DT which means deteminer. \"part\" is NN which is noun while \"of\" is IN which is preposition." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": { 91 | "scrolled": true 92 | }, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Original Article: In traditional grammar, a part of speech (abbreviated form: PoS or POS) is a category of words (or, more generally, of lexical items) which have similar grammatical properties. \n", 99 | "\n", 100 | "Word: In, POS: IN\n", 101 | "Word: traditional, POS: JJ\n", 102 | "Word: grammar, POS: NN\n", 103 | "Word: ,, POS: ,\n", 104 | "Word: a, POS: DT\n", 105 | "Word: part, POS: NN\n", 106 | "Word: of, POS: IN\n", 107 | "Word: speech, POS: NN\n", 108 | "Word: (, POS: -LRB-\n", 109 | "Word: abbreviated, POS: VBN\n", 110 | "Word: form, POS: NN\n", 111 | "Word: :, POS: :\n", 112 | "Word: PoS, POS: NNP\n", 113 | "Word: or, POS: CC\n", 114 | "Word: POS, POS: NNP\n", 115 | "Word: ), POS: -RRB-\n", 116 | "Word: is, POS: VBZ\n", 117 | "Word: a, POS: DT\n", 118 | "Word: category, POS: NN\n", 119 | "Word: of, POS: IN\n", 120 | "Word: words, POS: NNS\n", 121 | "Word: (, POS: -LRB-\n", 122 | "Word: or, POS: CC\n", 123 | "Word: ,, POS: ,\n", 124 | "Word: more, POS: RBR\n", 125 | "Word: generally, POS: RB\n", 126 | "Word: ,, POS: ,\n", 127 | "Word: of, POS: IN\n", 128 | "Word: lexical, POS: JJ\n", 129 | "Word: items, POS: NNS\n", 130 | "Word: ), POS: -RRB-\n", 131 | "Word: which, POS: WDT\n", 132 | "Word: have, POS: VBP\n", 133 | "Word: similar, POS: JJ\n", 134 | "Word: grammatical, POS: JJ\n", 135 | "Word: properties, POS: NNS\n", 136 | "Word: ., POS: .\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "doc = spacy_nlp(article)\n", 142 | "tokens = [token.text for token in doc if not token.is_stop]\n", 143 | "\n", 144 | "print('Original Article: %s' % (article))\n", 145 | "print()\n", 146 | "for token in doc:\n", 147 | " print('Word: %s, POS: %s' % (token.text, token.tag_))" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### jieba" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "jieba Version: 0.39\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "import jieba\n", 172 | "print('jieba Version: %s' % jieba.__version__)\n", 173 | "\n", 174 | "import jieba.posseg as jieba_pos_tagger" 175 | ] 176 | }, 177 | { 178 | "cell_type": 
"markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "\"詞類\" is noun while \"是\" is verb." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": { 188 | "scrolled": true 189 | }, 190 | "outputs": [ 191 | { 192 | "name": "stderr", 193 | "output_type": "stream", 194 | "text": [ 195 | "Building prefix dict from the default dictionary ...\n", 196 | "Loading model from cache /tmp/jieba.cache\n" 197 | ] 198 | }, 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "Original Article: 詞類是一個語言學術語,是一種語言中詞的語法分類,是以語法特徵(包括句法功能和形態變化)為主要依據、兼顧詞彙意義對詞進行劃分的結果。\n", 204 | "\n" 205 | ] 206 | }, 207 | { 208 | "name": "stderr", 209 | "output_type": "stream", 210 | "text": [ 211 | "Loading model cost 1.159 seconds.\n", 212 | "Prefix dict has been built succesfully.\n" 213 | ] 214 | }, 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Word: 詞類, POS: n\n", 220 | "Word: 是, POS: v\n", 221 | "Word: 一個, POS: m\n", 222 | "Word: 語言, POS: n\n", 223 | "Word: 學術, POS: n\n", 224 | "Word: 語, POS: n\n", 225 | "Word: ,, POS: x\n", 226 | "Word: 是, POS: v\n", 227 | "Word: 一種, POS: m\n", 228 | "Word: 語, POS: n\n", 229 | "Word: 言中, POS: nr\n", 230 | "Word: 詞, POS: n\n", 231 | "Word: 的, POS: uj\n", 232 | "Word: 語法, POS: n\n", 233 | "Word: 分類, POS: vn\n", 234 | "Word: ,, POS: x\n", 235 | "Word: 是, POS: v\n", 236 | "Word: 以, POS: p\n", 237 | "Word: 語, POS: n\n", 238 | "Word: 法特, POS: ns\n", 239 | "Word: 徵, POS: zg\n", 240 | "Word: (, POS: x\n", 241 | "Word: 包括, POS: v\n", 242 | "Word: 句法, POS: n\n", 243 | "Word: 功能, POS: n\n", 244 | "Word: 和, POS: c\n", 245 | "Word: 形態, POS: n\n", 246 | "Word: 變化, POS: vn\n", 247 | "Word: ), POS: x\n", 248 | "Word: 為, POS: zg\n", 249 | "Word: 主要, POS: b\n", 250 | "Word: 依據, POS: p\n", 251 | "Word: 、, POS: x\n", 252 | "Word: 兼顧, POS: v\n", 253 | "Word: 詞, POS: n\n", 254 | "Word: 彙, POS: zg\n", 255 | "Word: 意, POS: ng\n", 256 | "Word: 義, POS: nt\n", 257 | "Word: 對, POS: p\n", 258 | "Word: 詞, POS: n\n", 259 | "Word: 進, POS: v\n", 260 | "Word: 行, POS: v\n", 261 | "Word: 劃, POS: v\n", 262 | "Word: 分, POS: q\n", 263 | "Word: 的, POS: uj\n", 264 | "Word: 結, POS: v\n", 265 | "Word: 果, POS: ng\n", 266 | "Word: 。, POS: x\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "print('Original Article: %s' % (article2))\n", 272 | "print()\n", 273 | "\n", 274 | "words = jieba_pos_tagger.cut(article2)\n", 275 | "\n", 276 | "for word in words:\n", 277 | " print('Word: %s, POS: %s' % (word.word, word.flag))" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "# Conclusion\n", 285 | "\n", 286 | "POS helps a lot on text pre-processing. For example, we have to know the POS of word in order to perform lemmanization, stemming and stop word removal. These three pre-processing will be discussed in later article. Stay tuned." 
287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "# Reference\n", 294 | "\n", 295 | "Standard Syntactic Categories: https://cs.nyu.edu/grishman/jet/guide/PennPOS.html" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "kernelspec": { 301 | "display_name": "Python 3", 302 | "language": "python", 303 | "name": "python3" 304 | }, 305 | "language_info": { 306 | "codemirror_mode": { 307 | "name": "ipython", 308 | "version": 3 309 | }, 310 | "file_extension": ".py", 311 | "mimetype": "text/x-python", 312 | "name": "python", 313 | "nbconvert_exporter": "python", 314 | "pygments_lexer": "ipython3", 315 | "version": "3.5.2" 316 | } 317 | }, 318 | "nbformat": 4, 319 | "nbformat_minor": 2 320 | } 321 | -------------------------------------------------------------------------------- /sample/nlp-sentence_tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "![Sentence Tokenization](http://www.digitalmeetsculture.net/wp-content/uploads/2015/04/article.jpg)\n", 10 | "\n", 11 | "Source: http://www.digitalmeetsculture.net/article/article-about-preforma-published-in-archival-science/" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "source": [ 20 | "# Sentence Tokenization\n", 21 | "\n", 22 | "In previous article, word tokenization is introduced. What if we want to tokenize sentence? In general, we can easily split sentence by some punctuation such ., ? and !. However, there are lots of exception if we splitting article by those punctuation only.\n", 23 | "In this article, you will go through why we need to use sentence tokenization and how can we use it." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "source": [ 32 | "# Why?\n", 33 | "According to researchers, about 86% of article include the importance sentence in first one or two sentences. Believe that it is one of the reason why textsum model use first 2 sentences for training\n", 34 | "When I am in school, teacher teaches how we should write an article. The importance sentence will be placed in the first sentence most of the time. It may exists in last sentence sometimes." 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "# How?\n", 42 | "So how can we tokenize sentence? You can use the following simple python script to do that or using library such as nltk and spacy" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# Capture from https://en.wikipedia.org/wiki/Lexical_analysis\n", 54 | "\n", 55 | "article = 'In computer science, lexical analysis, lexing or tokenization is the process of \\\n", 56 | "converting a sequence of characters (such as in a computer program or web page) into a \\\n", 57 | "sequence of tokens (strings with an assigned and thus identified meaning). A program that \\\n", 58 | "performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner \\\n", 59 | "is also a term for the first stage of a lexer. 
A lexer is generally combined with a parser, \\\n", 60 | "which together analyze the syntax of programming languages, web pages, and so forth.'\n", 61 | "\n", 62 | "article2 = 'ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456'\n", 63 | "\n", 64 | "article3 = 'It is a great moment from 10 a.m. to 1 p.m. every weekend.'" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Self build" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 2, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n", 84 | "\n", 85 | "-->Sentence 0: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning)\n", 86 | "-->Sentence 1: .\n", 87 | "-->Sentence 2: A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer\n", 88 | "-->Sentence 3: .\n", 89 | "-->Sentence 4: A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth\n", 90 | "-->Sentence 5: .\n", 91 | "-->Sentence 6: \n", 92 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n", 93 | "\n", 94 | "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_\n", 95 | "-->Sentence 1: !\n", 96 | "-->Sentence 2: @# \n", 97 | "-->Sentence 3: !\n", 98 | "-->Sentence 4: @#$%^&*()_+ 0123456\n", 99 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n", 100 | "\n", 101 | "-->Sentence 0: It is a great moment from 10 a\n", 102 | "-->Sentence 1: .\n", 103 | "-->Sentence 2: m\n", 104 | "-->Sentence 3: .\n", 105 | "-->Sentence 4: to 1 p\n", 106 | "-->Sentence 5: .\n", 107 | "-->Sentence 6: m\n", 108 | "-->Sentence 7: .\n", 109 | "-->Sentence 8: every weekend\n", 110 | "-->Sentence 9: .\n", 111 | "-->Sentence 10: \n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "import re\n", 117 | "\n", 118 | "for doc in [article, article2, article3]:\n", 119 | " print('Original Article: %s' % (doc))\n", 120 | " print()\n", 121 | "\n", 122 | " sentences = re.split('(\\.|!|\\?)', doc)\n", 123 | " \n", 124 | " for i, s in enumerate(sentences):\n", 125 | " print('-->Sentence %d: %s' % (i, s))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "You can see that, \"a.m.\" should treat as a \"word\". Of course, we can enhance the above regular expression to do it. 
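For instance, one possible enhancement is to split only when the punctuation is followed by whitespace and an upper-case letter, keeping the terminator attached to its sentence. This is only a rough sketch (it uses a slightly extended version of article3 so there is actually something to split, and it would still fail on sentences that begin with a capitalised abbreviation):

```python
import re

article3 = 'It is a great moment from 10 a.m. to 1 p.m. every weekend. Next weekend is booked already.'

# Split after . ! or ? only when whitespace and a capital letter follow,
# so abbreviations such as "a.m." are less likely to trigger a split.
sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', article3)

for i, s in enumerate(sentences):
    print('-->Sentence %d: %s' % (i, s))
```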
But I will go for library rather than build the wheel again" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "### spaCy" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 3, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "spaCy Version: 2.0.11\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "import spacy\n", 157 | "print('spaCy Version: %s' % spacy.__version__)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 4, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "spacy_nlp = spacy.load('en_core_web_sm')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 5, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n", 181 | "\n", 182 | "-->Sentence 0: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning).\n", 183 | "-->Sentence 1: A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.\n", 184 | "-->Sentence 2: A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n", 185 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n", 186 | "\n", 187 | "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_!@# !\n", 188 | "-->Sentence 1: @#$%^&*()_+ 0123456\n", 189 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n", 190 | "\n", 191 | "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "for article in [article, article2, article3]:\n", 197 | " print('Original Article: %s' % (article))\n", 198 | " print()\n", 199 | " doc = spacy_nlp(article)\n", 200 | " for i, token in enumerate(doc.sents):\n", 201 | " print('-->Sentence %d: %s' % (i, token.text))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "Can see that spacy handled \"a.m.\" somehow." 
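Part of the reason is tokenization: spaCy keeps "a.m." and "p.m." as single tokens (they are covered by the English tokenizer exceptions), so the embedded periods never look like sentence terminators, and the parser-based boundary detection handles the rest. A quick way to inspect this, again assuming the `en_core_web_sm` model:

```python
import spacy

spacy_nlp = spacy.load('en_core_web_sm')
doc = spacy_nlp('It is a great moment from 10 a.m. to 1 p.m. every weekend.')

# "a.m." and "p.m." survive as single tokens, so their periods
# are not candidates for sentence boundaries.
print([token.text for token in doc])
print([token.is_sent_start for token in doc])
```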
209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "### NLTK" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 6, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "NTLK Version: 3.2.5\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "import nltk\n", 233 | "from nltk.tokenize import sent_tokenize\n", 234 | "print('NTLK Version: %s' % nltk.__version__)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 8, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# nltk.download('punkt')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 9, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n", 256 | "\n", 257 | "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n", 258 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n", 259 | "\n", 260 | "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_!\n", 261 | "-->Sentence 1: @# !\n", 262 | "-->Sentence 2: @#$%^&*()_+ 0123456\n", 263 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n", 264 | "\n", 265 | "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "for article in [article, article2, article3]:\n", 271 | " print('Original Article: %s' % (article))\n", 272 | " print()\n", 273 | "\n", 274 | " doc = sent_tokenize(article)\n", 275 | " for i, token in enumerate(doc):\n", 276 | " print('-->Sentence %d: %s' % (i, token))" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "# Conclusion\n", 284 | "So far both NLTK and spacy provides similar behavior so it depends on which library do you use in performing other preprocessing. \n", 285 | "Recently, I works on text mining related project which is classifying news category. Of course, I can build a ML model to classify it but I go for a simple approach. Only focus on the first sentence for every news and performing simple key word searching to build a baseline model. The result is not bad but it is a very quick way to deliver an initial version." 
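A minimal sketch of that kind of baseline, with hypothetical categories and keyword lists (a real system would curate these), using NLTK's sent_tokenize from the section above:

```python
from nltk.tokenize import sent_tokenize
# Requires the punkt data: nltk.download('punkt'), as noted earlier.

# Hypothetical keyword lists per news category.
category_keywords = {
    'technology': ['ai', 'software', 'chip'],
    'finance': ['stock', 'market', 'bank'],
}

def classify_headline(news_text):
    # Use only the first sentence, where the key message usually sits.
    first_sentence = sent_tokenize(news_text)[0].lower()
    for category, keywords in category_keywords.items():
        if any(keyword in first_sentence for keyword in keywords):
            return category
    return 'other'

print(classify_headline('Stock market rallies after bank earnings. Analysts remain cautious.'))
```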
286 | ] 287 | } 288 | ], 289 | "metadata": { 290 | "kernelspec": { 291 | "display_name": "Python 3", 292 | "language": "python", 293 | "name": "python3" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.5.2" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /sample/nlp-stemming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Stemming](https://i1.wp.com/s3-eu-west-1.amazonaws.com/leadersandco/wp-content/uploads/2017/05/31224050/Diary-writing-is-an-old-human-art.jpg?fit=800%2C600&ssl=1)\n", 8 | "\n", 9 | "Source: https://www.thisdaylive.com/index.php/2017/05/31/death-of-the-diary/" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Stemming\n", 17 | "\n", 18 | "After tokenized word, we may want a root form rather than the original input form for post processing or modelling such as topic classification. The root word does not necessarily a word itself. For example, \"reduc\" is a root word of \"reduce\", \"suffici\" is a root word of \"sufficient\".\n", 19 | "\n", 20 | "There are lots of stemming algorithm in NLTK. Porter Stemmer and Snowball Stemmer (aka Porter2) will be selected for demonstration because they are the most popular." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "# Copy from https://en.wikipedia.org/wiki/Stemming\n", 32 | "\n", 33 | "article = 'In linguistic morphology and information retrieval, stemming is the process of \\\n", 34 | "reducing inflected (or sometimes derived) words to their word stem, base or root \\\n", 35 | "form—generally a written word form. The stem need not be identical to the morphological \\\n", 36 | "root of the word; it is usually sufficient that related words map to the same stem, even \\\n", 37 | "if this stem is not in itself a valid root.'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Porter Stemmer" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "NLTK Version: 3.2.5\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "import nltk \n", 62 | "print('NLTK Version: %s' % (nltk.__version__))\n", 63 | "\n", 64 | "porter_stemmer = nltk.stem.PorterStemmer()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Original Article: In linguistic morphology and information retrieval, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. 
The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root.\n", 77 | "\n", 78 | "Original : linguistic, New: linguist\n", 79 | "Original : morphology, New: morpholog\n", 80 | "Original : information, New: inform\n", 81 | "Original : retrieval, New: retriev\n", 82 | "Original : stemming, New: stem\n", 83 | "Original : reducing, New: reduc\n", 84 | "Original : inflected, New: inflect\n", 85 | "Original : sometimes, New: sometim\n", 86 | "Original : derived, New: deriv\n", 87 | "Original : words, New: word\n", 88 | "Original : form—generally, New: form—gener\n", 89 | "Original : The, New: the\n", 90 | "Original : identical, New: ident\n", 91 | "Original : morphological, New: morpholog\n", 92 | "Original : usually, New: usual\n", 93 | "Original : sufficient, New: suffici\n", 94 | "Original : related, New: relat\n", 95 | "Original : words, New: word\n", 96 | "Original : this, New: thi\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "tokens = nltk.word_tokenize(article)\n", 102 | "\n", 103 | "print('Original Article: %s' % (article))\n", 104 | "print()\n", 105 | "\n", 106 | "for token in tokens:\n", 107 | " stemmed_token = porter_stemmer.stem(token)\n", 108 | " \n", 109 | " if token != stemmed_token:\n", 110 | " print('Original : %s, New: %s' % (token, stemmed_token))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Snowball Stemmer" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "NLTK Version: 3.2.5\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "import nltk \n", 135 | "print('NLTK Version: %s' % (nltk.__version__))\n", 136 | "\n", 137 | "snowball_stemmer = nltk.stem.SnowballStemmer('english')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Original Article: In linguistic morphology and information retrieval, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. 
The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root.\n", 150 | "\n", 151 | "Original : In, New: in\n", 152 | "Original : linguistic, New: linguist\n", 153 | "Original : morphology, New: morpholog\n", 154 | "Original : information, New: inform\n", 155 | "Original : retrieval, New: retriev\n", 156 | "Original : stemming, New: stem\n", 157 | "Original : reducing, New: reduc\n", 158 | "Original : inflected, New: inflect\n", 159 | "Original : sometimes, New: sometim\n", 160 | "Original : derived, New: deriv\n", 161 | "Original : words, New: word\n", 162 | "Original : form—generally, New: form—gener\n", 163 | "Original : The, New: the\n", 164 | "Original : identical, New: ident\n", 165 | "Original : morphological, New: morpholog\n", 166 | "Original : usually, New: usual\n", 167 | "Original : sufficient, New: suffici\n", 168 | "Original : related, New: relat\n", 169 | "Original : words, New: word\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "tokens = nltk.word_tokenize(article)\n", 175 | "\n", 176 | "print('Original Article: %s' % (article))\n", 177 | "print()\n", 178 | "\n", 179 | "for token in tokens:\n", 180 | " stemmed_token = snowball_stemmer.stem(token)\n", 181 | " \n", 182 | " if token != stemmed_token:\n", 183 | " print('Original : %s, New: %s' % (token, stemmed_token))" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Except \"In\", the result of Snowball Stemmer are same as Porter Stemmer." 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "# Conclusion\n", 198 | "\n", 199 | "\n", 200 | "Snowball Stemmer not only support English, but also Germanic and other languages as well. For detail, you may check on the Snowball website. \n", 201 | "\n", 202 | "Snowball Stemmer: http://snowballstem.org/algorithms/\n", 203 | "\n", 204 | "Besides Porter Stemmer and Snowball Stemmer, reader may also have on look on other stemmer algorithm such as Hunspell\n", 205 | "\n", 206 | "Hunspell Stemmer: https://github.com/hunspell/hunspell" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.5.2" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /sample/nlp-stop_words.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![title](https://www.channelone.com/wp-content/uploads/2015/03/bigstock-Pile-Of-Words-1896131-crop.jpg)\n", 8 | "\n", 9 | "Source: https://www.channelone.com/blog_post/web-tools-for-studying-vocabulary-words/" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Stop Words\n", 17 | "\n", 18 | "When we deal with text problem in Natural Language Processing, stop words removal process is a one of the important step to have a better input for any models. 
Stop words means that it is a very common words in a language (e.g. a, an, the in English. 的, 了 in Chinese. え, も in Japanese). It does not help on most of NLP problem such as semantic analysis, classification etc.\n", 19 | "\n", 20 | "In this article, we will look into using multi libraries pre-defined stop words, third party pre-defined stop words as well as domain specific stop words. Definition of stop words (capture from wiki) will be used to demonstrate the result after removing stop words.\n", 21 | "\n", 22 | "Word tokenization and lemmatization arethe essential part for removing stop words. You may refer to this article to understand word tokenization and lemmatization.\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "# Capture from https://en.wikipedia.org/wiki/Stop_words\n", 34 | "\n", 35 | "article = 'In computing, stop words are words which are filtered out before or \\\n", 36 | "after processing of natural language data (text).[1] Though \"stop words\" usually \\\n", 37 | "refers to the most common words in a language, there is no single universal list of \\\n", 38 | "stop words used by all natural language processing tools, and indeed not all tools \\\n", 39 | "even use such a list. Some tools specifically avoid removing these stop words to \\\n", 40 | "support phrase search.'" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "# Catpure from https://zh.wikipedia.org/wiki/%E5%81%9C%E7%94%A8%E8%AF%8D\n", 52 | "\n", 53 | "article2 = '在信息檢索中,為節省存儲空間和提高搜索效率,在處理自然語言數據(或文本)之前或之後會自動過濾掉某些字或詞,\\\n", 54 | "這些字或詞即被稱為Stop Words(停用詞)。不要把停用詞與安全口令混淆。 這些停用詞都是人工輸入、非自動化生成的,\\\n", 55 | "生成後的停用詞會形成一個停用詞表。但是,並沒有一個明確的停用詞表能夠適用於所有的工具。\\\n", 56 | "甚至有一些工具是明確地避免使用停用詞來支持短語搜索的。'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### spaCy" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "spaCy Version: 2.0.11\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "import spacy\n", 81 | "print('spaCy Version: %s' % (spacy.__version__))\n", 82 | "spacy_nlp = spacy.load('en_core_web_sm')" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "Check pre-defined English stop words" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Number of stop words: 305\n", 102 | "First ten stop words: ['from', 'i', 'cannot', 'seeming', 'seemed', 'him', 'them', 'hundred', 'whoever', 'few']\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS\n", 108 | "\n", 109 | "print('Number of stop words: %d' % len(spacy_stopwords))\n", 110 | "print('First ten stop words: %s' % list(spacy_stopwords)[:10])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Remove stop words" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "Original Article: In computing, stop words are words 
which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.\n", 130 | "\n", 131 | "['In', 'computing', ',', 'stop', 'words', 'words', 'filtered', 'processing', 'natural', 'language', 'data', '(', 'text).[1', ']', 'Though', '\"', 'stop', 'words', '\"', 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'natural', 'language', 'processing', 'tools', ',', 'tools', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "doc = spacy_nlp(article)\n", 137 | "tokens = [token.text for token in doc if not token.is_stop]\n", 138 | "\n", 139 | "print('Original Article: %s' % (article))\n", 140 | "print()\n", 141 | "print(tokens)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Add customize stop words" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 6, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. 
Some tools specifically avoid removing these stop words to support phrase search.\n", 161 | "\n", 162 | "['In', ',', 'stop', 'words', 'words', 'processing', 'natural', 'language', 'data', '(', 'text).[1', ']', 'Though', '\"', 'stop', 'words', '\"', 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'natural', 'language', 'processing', 'tools', ',', 'tools', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "customize_stop_words = [\n", 168 | " 'computing', 'filtered'\n", 169 | "]\n", 170 | "\n", 171 | "for w in customize_stop_words:\n", 172 | " spacy_nlp.vocab[w].is_stop = True\n", 173 | "\n", 174 | "\n", 175 | "doc = spacy_nlp(article)\n", 176 | "tokens = [token.text for token in doc if not token.is_stop]\n", 177 | "\n", 178 | "print('Original Article: %s' % (article))\n", 179 | "print()\n", 180 | "print(tokens)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### NLTK" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "NLTK Version: 3.2.5\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "import nltk \n", 205 | "print('NLTK Version: %s' % (nltk.__version__))\n", 206 | "\n", 207 | "nltk.download('stopwords')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 8, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Number of stop words: 179\n", 220 | "First ten stop words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\"]\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "nltk_stopwords = nltk.corpus.stopwords.words('english')\n", 226 | "\n", 227 | "print('Number of stop words: %d' % len(nltk_stopwords))\n", 228 | "print('First ten stop words: %s' % list(nltk_stopwords)[:10])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "General words such as \"are\", \"the\" are removed as well. For example, \"indeed\" is removed in NLTK but not spaCy. On the other hand, \"used\" are removed in spaCy but not NLTK" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 9, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. 
Some tools specifically avoid removing these stop words to support phrase search.\n", 248 | "\n", 249 | "['In', 'computing', ',', 'stop', 'words', 'words', 'filtered', 'processing', 'natural', 'language', 'data', '(', 'text', ')', '.', '[', '1', ']', 'Though', '``', 'stop', 'words', \"''\", 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'used', 'natural', 'language', 'processing', 'tools', ',', 'indeed', 'tools', 'even', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "tokens = nltk.tokenize.word_tokenize(article)\n", 255 | "tokens = [token for token in tokens if not token in nltk_stopwords]\n", 256 | "\n", 257 | "print('Original Article: %s' % (article))\n", 258 | "print()\n", 259 | "print(tokens)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### jieba\n", 267 | "For Chinese word, we use the similar ideas to filter out words if it is stop words." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 10, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "jieba Version: 0.39\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "import jieba\n", 285 | "print('jieba Version: %s' % jieba.__version__)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 11, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "# Capture from https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt\n", 297 | "\n", 298 | "jieba_stop_words = [\n", 299 | " '的', '了', '和', '是', '就', '都', '而', '及', '與', \n", 300 | " '著', '或', '一個', '沒有', '我們', '你們', '妳們', \n", 301 | " '他們', '她們', '是否'\n", 302 | "]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Different from English, word will not be removed if stop words belongs to part of word. For example, \"是\" is defined as stop words but \"但是\" still exist as \"但是\" is a kind of \"single word\". Therefore, word tokenization is very important for stop word removal." 
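Instead of hard-coding a small subset as above, the full list referenced at the start of this section can be loaded from a local copy of jieba's extra_dict/stop_words.txt (the file path here is an assumption; point it at wherever the file was downloaded):

```python
# Assumes stop_words.txt was downloaded from
# https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
with open('stop_words.txt', encoding='utf-8') as f:
    jieba_stop_words = set(line.strip() for line in f if line.strip())

print('Number of stop words: %d' % len(jieba_stop_words))
```

The filtering loop below works the same way with this larger set.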
310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 12, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stderr", 319 | "output_type": "stream", 320 | "text": [ 321 | "Building prefix dict from the default dictionary ...\n", 322 | "Loading model from cache /tmp/jieba.cache\n" 323 | ] 324 | }, 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "Original Article: 在信息檢索中,為節省存儲空間和提高搜索效率,在處理自然語言數據(或文本)之前或之後會自動過濾掉某些字或詞,這些字或詞即被稱為Stop Words(停用詞)。不要把停用詞與安全口令混淆。 這些停用詞都是人工輸入、非自動化生成的,生成後的停用詞會形成一個停用詞表。但是,並沒有一個明確的停用詞表能夠適用於所有的工具。甚至有一些工具是明確地避免使用停用詞來支持短語搜索的。\n", 330 | "\n" 331 | ] 332 | }, 333 | { 334 | "name": "stderr", 335 | "output_type": "stream", 336 | "text": [ 337 | "Loading model cost 1.118 seconds.\n", 338 | "Prefix dict has been built succesfully.\n" 339 | ] 340 | }, 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "['在', '信息', '檢索', '中', ',', '為節', '省存', '儲空間', '提高', '搜索', '效率', ',', '在', '處理', '自然', '語言數', '據', '(', '文本', ')', '之前', '之後會', '自動', '過濾', '掉', '某些', '字', '詞', ',', '這些', '字', '詞', '即', '被', '稱', '為', 'Stop', ' ', 'Words', '(', '停用', '詞', ')', '。', '不要', '把', '停用', '詞', '安全', '口令', '混淆', '。', ' ', '這些', '停用', '詞', '人工', '輸入', '、', '非自動', '化生成', ',', '生成', '後', '停用', '詞會', '形成', '停用', '詞表', '。', '但是', ',', '並沒有', '明確', '停用', '詞表能夠', '適用', '於', '所有', '工具', '。', '甚至', '有', '一些', '工具', '明確', '地', '避免', '使用', '停用', '詞來', '支持', '短語', '搜索', '。']\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "print('Original Article: %s' % (article2))\n", 351 | "print()\n", 352 | "words = jieba.cut(article2, cut_all=False)\n", 353 | "words = [str(word) for word in words if not str(word) in jieba_stop_words]\n", 354 | "print(words)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "source": [ 363 | "# Conclusion" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "The procedure of removing stop words is similar across libraries so the most importance is defining your own stop words. In initial phase, pre-defined stop words can be adopted but more and more words should be added into stop word list later on. \n", 371 | "\n", 372 | "So besides, using spaCy or NLTK pre-defined stop words, we can use other words which are defined by other party such as Stanford NLP and Rank NL. 
You may check out the stop list from \n", 373 | "\n", 374 | "Stanford NLP: https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt\n", 375 | "\n", 376 | "Rank NL: https://www.ranks.nl/stopwords\n", 377 | "\n", 378 | "jieba: https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt" 379 | ] 380 | } 381 | ], 382 | "metadata": { 383 | "kernelspec": { 384 | "display_name": "Python 3", 385 | "language": "python", 386 | "name": "python3" 387 | }, 388 | "language_info": { 389 | "codemirror_mode": { 390 | "name": "ipython", 391 | "version": 3 392 | }, 393 | "file_extension": ".py", 394 | "mimetype": "text/x-python", 395 | "name": "python", 396 | "nbconvert_exporter": "python", 397 | "pygments_lexer": "ipython3", 398 | "version": "3.5.2" 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 2 403 | } 404 | -------------------------------------------------------------------------------- /sample/nlp-word_mover_distance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Word Mover's Distance\n", 8 | "\n", 9 | "![](https://cdn.pixabay.com/photo/2017/10/23/23/41/hong-kong-2883036_960_720.jpg)\n", 10 | "Photo: https://pixabay.com/en/hong-kong-harbor-boats-water-night-2883036/\n", 11 | "\n", 12 | "Word Mover's Distance (WMD) is proposed fro distance measurement between 2 documents (or sentences). It leverages Word Embeddings power to overcome those basic distance measurement limitations. \n", 13 | "\n", 14 | "WMD[1] was introduced by Kusner et al. in 2015. Instead of using Euclidean Distance and other bag-of-words based distance measurement, they proposed to use word embeddings to calculate the similarities. To be precise, it uses normalized [Bag-of-Words](https://towardsdatascience.com/3-basic-approaches-in-bag-of-words-which-are-better-than-word-embeddings-c2cbc7398016) and [Word Embeddings](https://medium.com/towards-data-science/3-silver-bullets-of-word-embedding-in-nlp-10fa8f50cc5a) to calculate the distance between documents.\n", 15 | "\n", 16 | "After reading this article, you will understand:\n", 17 | "- Earth Mover Distance (EMD)\n", 18 | "- Word Mover's Distance (WMD)\n", 19 | "- Relaxed Word Moving Distance (RWMD)\n", 20 | "- WMD Implementation\n", 21 | "- Take Away" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Earth Mover Distance (EMD)\n", 29 | "Before introducing WMD, I have to share the idea of Earth Mover Distance (EMD) first because the core part of WMD is EMD.\n", 30 | "\n", 31 | "EMD [2] solves transportation problem. For instance, we have m and n while m and n denote a set of suppliers and warehouses. The target is going to minimize transportation cost such that shipping all goods from m to n. Given that there are constraints:\n", 32 | "\n", 33 | "\n", 34 | "\n", 35 | "\n", 36 | "\n", 37 | "- Only allowing transport from m to n. 
Not allowing transport from n to m\n", 38 | "- Total number of sending cargoes cannot exceed total capacity of m\n", 39 | "- Total number of receiving cargoes cannot exceed total capacity of n\n", 40 | "- Maximum number of transportation is the minimum between total cargoes in m and total cargoes in n\n", 41 | "\n", 42 | "The denotations are:\n", 43 | "- p: Set of origin\n", 44 | "- q: Set of destination\n", 45 | "- f(i,j): flow from i to j\n", 46 | "- m: Number of origin\n", 47 | "- n: Number of destination\n", 48 | "- w(i, j): Number of cargo transport from i to j\n", 49 | "\n", 50 | "To optimal flow F, the linear formula is\n", 51 | "\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | "# Word Mover's Distance (WMD)\n", 60 | "In the previous blog, I shared how we can use simple way to find the \"similarity\" between two documents (or sentences). At that time, Euclidean Distance, Cosine Distance and Jaccard Similarity are introduced but it has some limitations. WMD is designed to __overcome synonym problem__.\n", 61 | "\n", 62 | "The typical example is \n", 63 | "- Sentence 1: Obama speaks to the media in Illinois\n", 64 | "- Sentence 2: The president greets the press in Chicago\n", 65 | "\n", 66 | "Except the stop words, there is no common words among two sentences but both of them are taking about same topic (at that time).\n", 67 | "\n", 68 | "\n", 69 | "\n", 70 | "\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "\n", 76 | "WMD use word embeddings to calculate the distance so that it can calculate even though there is no common word. The assumption is that similar words should have similar vectors.\n", 77 | "\n", 78 | "First of all, lower case and removing stopwords is an essential step to reduce complexity and preventing misleading. \n", 79 | "- Sentence 1: obama speaks media illinois\n", 80 | "- Sentence 2: president greets press chicago\n", 81 | "\n", 82 | "Retrieve vectors from any pre-trained word embeddings models. It can be GloVe, word2vec, fasttext or custom vectors. After that it using normalized bag-of-words (nBOW) to represent the weight or importance. It assumes that higher frequency implies that it is more important.\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n", 88 | "\n", 89 | "It allows transfer every word from sentence 1 to sentence 2 because algorithm does not know \"obama\" should transfer to \"president\". At the end it will choose the minimum transportation cost to transport every word from sentence 1 to sentence 2.\n", 90 | "\n", 91 | "# Relaxed Word Moving Distance (RWMD)\n", 92 | "The best average time of solving WMD is about O(p³ log p) while p is number of unique word. It is a little bit slow so there are two approaches to improve the reduce computation time. First one is __Word Centroid Distance (WCD)__ which is summarizing the lower bound distance between. Second approach is __Relaxed Word Moving Distance (RWMD)__ which is using the closet distance without considering there are multiple words transforming to single words.\n", 93 | "\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "Taking the previous sentence as an example. Assuming that shortest word in sentence of all word in sentence 1 is \"president\", it will use summarize these score instead of pairing one by one. 
So that the time complexity reduce to O(p²).\n", 98 | "\n", 99 | "\n", 100 | "\n", 101 | "\n", 102 | "\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "# WMD Implementation\n", 110 | "By using gensim, we only need to provide two list of tokens then it will take the rest of calculation" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 60, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "\"\"\"\n", 122 | " News headline get from \n", 123 | " \n", 124 | " https://www.reuters.com/article/us-musk-tunnel/elon-musks-boring-co-to-build-high-speed-airport-link-in-chicago-idUSKBN1JA224\n", 125 | " http://money.cnn.com/2018/06/14/technology/elon-musk-boring-company-chicago/index.html\n", 126 | " https://www.theverge.com/2018/6/13/17462496/elon-musk-boring-company-approved-tunnel-chicago\n", 127 | "\n", 128 | "\"\"\"\n", 129 | "\n", 130 | "news_headline1 = \"Elon Musk's Boring Co to build high-speed airport link in Chicago\"\n", 131 | "news_headline2 = \"Elon Musk's Boring Company to build high-speed Chicago airport link\"\n", 132 | "news_headline3 = \"Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\"\n", 133 | "news_headline4 = \"Both apple and orange are fruit\"\n", 134 | "\n", 135 | "news_headlines = [news_headline1, news_headline2, news_headline3, news_headline4]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 65, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "gensim version: 3.4.0\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "# Load Word Embedding Model\n", 153 | "import gensim\n", 154 | "print('gensim version: %s' % gensim.__version__)\n", 155 | "glove_model = gensim.models.KeyedVectors.load_word2vec_format('../model/text/stanford/glove/glove.6B.50d.vec')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 66, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "[['elon', 'musk', \"'s\", 'boring', 'co', 'build', 'high', '-', 'speed', 'airport', 'link', 'chicago'], ['elon', 'musk', \"'s\", 'boring', 'company', 'build', 'high', '-', 'speed', 'chicago', 'airport', 'link'], ['elon', 'musk', '’s', 'boring', 'company', 'approved', 'build', 'high', '-', 'speed', 'transit', 'downtown', 'chicago', 'o’hare', 'airport'], ['both', 'apple', 'orange', 'fruit']]\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "# Remove stopwords\n", 173 | "import spacy\n", 174 | "spacy_nlp = spacy.load('en')\n", 175 | "\n", 176 | "headline_tokens = []\n", 177 | "for news_headline in news_headlines:\n", 178 | " headline_tokens.append([token.text.lower() for token in spacy_nlp(news_headline) if not token.is_stop])\n", 179 | "\n", 180 | "print(headline_tokens)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 67, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Headline: Elon Musk's Boring Co to build high-speed airport link in Chicago\n", 193 | "==================================================\n", 194 | "\n", 195 | "--------------------------------------------------\n", 196 | "Comparing to: Elon Musk's Boring Co to build high-speed airport link in Chicago\n", 197 | "distance = 0.0000\n", 198 | 
"--------------------------------------------------\n", 199 | "Comparing to: Elon Musk's Boring Company to build high-speed Chicago airport link\n", 200 | "distance = 0.3589\n", 201 | "--------------------------------------------------\n", 202 | "Comparing to: Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\n", 203 | "distance = 1.9456\n", 204 | "--------------------------------------------------\n", 205 | "Comparing to: Both apple and orange are fruit\n", 206 | "distance = 5.4350\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "subject_headline = news_headlines[0]\n", 212 | "subject_token = headline_tokens[0]\n", 213 | "\n", 214 | "print('Headline: ', subject_headline)\n", 215 | "print('=' * 50)\n", 216 | "print()\n", 217 | "\n", 218 | "for token, headline in zip(headline_tokens, news_headlines):\n", 219 | " print('-' * 50)\n", 220 | " print('Comparing to:', headline)\n", 221 | " distance = glove_model.wmdistance(subject_token, token)\n", 222 | " print('distance = %.4f' % distance)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "In gensim implementation, OOV will be removed so that it will not throw an exception or using random vector." 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "# Take Away\n", 237 | "For source code, you may check out from my github repo.\n", 238 | "- The advantage of WMD are __hyper-parameter free and overcoming synonym problem__.\n", 239 | "- Same as those simple approaches, WMD __does not consider ordering__.\n", 240 | "- The __time complexity is an issue__. The original version is O(p³ log p) while the enhanced version is still O(p²).\n", 241 | "- __Pre-train vectors may not apply to all scenario__.\n", 242 | "\n", 243 | "# Reference\n", 244 | "[1] Kusner Matt J., Sun Yu, Kolkin Nicholas I., Weinberger Kilian Q. From Word Embeedings To Document Distance. 2015. http://proceedings.mlr.press/v37/kusnerb15.pdf\n", 245 | "[2] EMD Theory: https://en.wikipedia.org/wiki/Earth_mover%27s_distance" 246 | ] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "Python 3", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.5.2" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 2 270 | } 271 | -------------------------------------------------------------------------------- /sample/nlp-word_tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Word Tokenization](http://youthvoices.net/sites/default/files/image/69585/sep/persuasive-landing-pages-words-have-power.jpg)\n", 8 | "\n", 9 | "Source: http://youthvoices.net/discussion/will-you-1-powerful-words" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Word Tokenization\n", 17 | "To tackle text related problem in Machine Learning area, tokenization is one of the common pre-processing. 
In this article, we will go through how we can handle work toeknization and sentence tokenization by using three libraries which are spaCy, NLTK and jieba (for Chinese word)." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "# Capture from https://en.wikipedia.org/wiki/Lexical_analysis\n", 29 | "\n", 30 | "article = 'In computer science, lexical analysis, lexing or tokenization is the process of \\\n", 31 | "converting a sequence of characters (such as in a computer program or web page) into a \\\n", 32 | "sequence of tokens (strings with an assigned and thus identified meaning). A program that \\\n", 33 | "performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner \\\n", 34 | "is also a term for the first stage of a lexer. A lexer is generally combined with a parser, \\\n", 35 | "which together analyze the syntax of programming languages, web pages, and so forth.'" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "article2 = 'ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "article3 = '你的姿態 你的青睞 我存在在你的存在 你以為愛 就是被愛'" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "# Capture from https://zh.wikipedia.org/wiki/%E8%AF%8D%E6%B3%95%E5%88%86%E6%9E%90\n", 69 | "\n", 70 | "article4 = '词法分析是计算机科学中将字符序列转换为标记序列的过程。进行词法分析的程序或者函数叫作词法分析器,也叫扫描器。词法分析器一般以函数的形式存在,供语法分析器调用。'" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# spaCy" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "spaCy Version: 2.0.11\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "import spacy\n", 95 | "print('spaCy Version: %s' % spacy.__version__)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 8, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "spacy_nlp = spacy.load('en_core_web_sm')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 9, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. 
A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n", 119 | "\n", 120 | "['In', 'computer', 'science', ',', 'lexical', 'analysis', ',', 'lexing', 'or', 'tokenization', 'is', 'the', 'process', 'of', 'converting', 'a', 'sequence', 'of', 'characters', '(', 'such', 'as', 'in', 'a', 'computer', 'program', 'or', 'web', 'page', ')', 'into', 'a', 'sequence', 'of', 'tokens', '(', 'strings', 'with', 'an', 'assigned', 'and', 'thus', 'identified', 'meaning', ')', '.', 'A', 'program', 'that', 'performs', 'lexical', 'analysis', 'may', 'be', 'termed', 'a', 'lexer', ',', 'tokenizer,[1', ']', 'or', 'scanner', ',', 'though', 'scanner', 'is', 'also', 'a', 'term', 'for', 'the', 'first', 'stage', 'of', 'a', 'lexer', '.', 'A', 'lexer', 'is', 'generally', 'combined', 'with', 'a', 'parser', ',', 'which', 'together', 'analyze', 'the', 'syntax', 'of', 'programming', 'languages', ',', 'web', 'pages', ',', 'and', 'so', 'forth', '.']\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "print('Original Article: %s' % (article))\n", 126 | "print()\n", 127 | "doc = spacy_nlp(article)\n", 128 | "tokens = [token.text for token in doc]\n", 129 | "print(tokens)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "Not all special character will be seperated." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 8, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n", 149 | "\n", 150 | "['ConcateStringAnd123', 'ConcateSepcialCharacter_!@', '#', '!', '@#$%^&*()_+', '0123456']\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "print('Original Article: %s' % (article2))\n", 156 | "print()\n", 157 | "doc = spacy_nlp(article2)\n", 158 | "tokens = [token.text for token in doc]\n", 159 | "print(tokens)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "First step of spaCy separates word by space and then applying some guidelines such as exception rule, prefix, suffix etc." 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "# NLTK" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 9, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "NTLK Version: 3.2.5\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "import nltk\n", 191 | "print('NTLK Version: %s' % nltk.__version__)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 10, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. 
A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n", 204 | "\n", 205 | "['In', 'computer', 'science', ',', 'lexical', 'analysis', ',', 'lexing', 'or', 'tokenization', 'is', 'the', 'process', 'of', 'converting', 'a', 'sequence', 'of', 'characters', '(', 'such', 'as', 'in', 'a', 'computer', 'program', 'or', 'web', 'page', ')', 'into', 'a', 'sequence', 'of', 'tokens', '(', 'strings', 'with', 'an', 'assigned', 'and', 'thus', 'identified', 'meaning', ')', '.', 'A', 'program', 'that', 'performs', 'lexical', 'analysis', 'may', 'be', 'termed', 'a', 'lexer', ',', 'tokenizer', ',', '[', '1', ']', 'or', 'scanner', ',', 'though', 'scanner', 'is', 'also', 'a', 'term', 'for', 'the', 'first', 'stage', 'of', 'a', 'lexer', '.', 'A', 'lexer', 'is', 'generally', 'combined', 'with', 'a', 'parser', ',', 'which', 'together', 'analyze', 'the', 'syntax', 'of', 'programming', 'languages', ',', 'web', 'pages', ',', 'and', 'so', 'forth', '.']\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "print('Original Article: %s' % (article))\n", 211 | "print()\n", 212 | "print(nltk.word_tokenize(article))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Some special characters (e.g. \"_\") will not be separated." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 11, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n", 232 | "\n", 233 | "['ConcateStringAnd123', 'ConcateSepcialCharacter_', '!', '@', '#', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_+', '0123456']\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "print('Original Article: %s' % (article2))\n", 239 | "print()\n", 240 | "print(nltk.word_tokenize(article2))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "The behavior is a little different from spaCy. NLTK treats most special characters as separate \"words\", with the exception of \"_\". Numbers are tokenized as well. 
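If you need explicit control over how special characters are split, a regex-based tokenizer makes the rule visible. The sketch below is purely illustrative (the pattern is an arbitrary choice, not the behaviour of nltk.word_tokenize or spaCy):

```python
# A minimal sketch: keep runs of word characters together, emit every other
# non-space character as its own token. Note that '_' counts as a word
# character in regex, so it stays attached, mirroring NLTK's behaviour above.
from nltk.tokenize import RegexpTokenizer

regex_tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
print(regex_tokenizer.tokenize('ConcateStringAnd123 ConcateSepcialCharacter_!@#'))
# ['ConcateStringAnd123', 'ConcateSepcialCharacter_', '!', '@', '#']
```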
248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "# jieba" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 12, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "jieba Version: 0.39\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "import jieba\n", 272 | "print('jieba Version: %s' % jieba.__version__)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 13, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stderr", 282 | "output_type": "stream", 283 | "text": [ 284 | "Building prefix dict from the default dictionary ...\n", 285 | "Loading model from cache /tmp/jieba.cache\n" 286 | ] 287 | }, 288 | { 289 | "name": "stdout", 290 | "output_type": "stream", 291 | "text": [ 292 | "Original Article: 你的姿態 你的青睞 我存在在你的存在 你以為愛 就是被愛\n", 293 | "\n" 294 | ] 295 | }, 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "Loading model cost 1.086 seconds.\n", 301 | "Prefix dict has been built succesfully.\n" 302 | ] 303 | }, 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "['你', '的', '姿態', ' ', '你', '的', '青睞', ' ', '我', '存在', '在', '你', '的', '存在', ' ', '你', '以', '為', '愛', ' ', '就是', '被', '愛']\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "print('Original Article: %s' % (article3))\n", 314 | "print()\n", 315 | "\n", 316 | "words = jieba.cut(article3, cut_all=False)\n", 317 | "words = [str(word) for word in words]\n", 318 | "print(words)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 14, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "Original Article: 词法分析是计算机科学中将字符序列转换为标记序列的过程。进行词法分析的程序或者函数叫作词法分析器,也叫扫描器。词法分析器一般以函数的形式存在,供语法分析器调用。\n", 331 | "\n", 332 | "['词法', '分析', '是', '计算机科学', '中将', '字符', '序列', '转换', '为', '标记', '序列', '的', '过程', '。', '进行', '词法', '分析', '的', '程序', '或者', '函数', '叫作', '词法', '分析器', ',', '也', '叫', '扫描器', '。', '词法', '分析器', '一般', '以', '函数', '的', '形式', '存在', ',', '供', '语法分析', '器', '调用', '。']\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "print('Original Article: %s' % (article4))\n", 338 | "print()\n", 339 | "\n", 340 | "words = jieba.cut(article4, cut_all=False)\n", 341 | "words = [str(word) for word in words]\n", 342 | "print(words)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "source": [ 351 | "jieba does a great job of tokenizing Chinese words (both Simplified and Traditional Chinese)." 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "# Conclusion" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "spaCy tokenizes more intelligently and performs better than NLTK in this comparison. If you need to tokenize Chinese, jieba is a good choice. I also studied the spaCy (version 2.x) Chinese language implementation: it simply wraps the jieba library, as shown in the excerpt 
from lang/zh/__init__.py below." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "# copy from spaCy/lang/zh/__init__.py\n", 377 | "class Chinese(Language):\n", 378 | " lang = 'zh'\n", 379 | " Defaults = ChineseDefaults # override defaults\n", 380 | "\n", 381 | " def make_doc(self, text):\n", 382 | " try:\n", 383 | " import jieba\n", 384 | " except ImportError:\n", 385 | " raise ImportError(\"The Chinese tokenizer requires the Jieba library: \"\n", 386 | " \"https://github.com/fxsjy/jieba\")\n", 387 | " words = list(jieba.cut(text, cut_all=False))\n", 388 | " words = [x for x in words if x]\n", 389 | " return Doc(self.vocab, words=words, spaces=[False]*len(words))" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "Apart from these libraries, Stanford NLP has also released a word tokenization library that supports multiple languages, including English and Chinese. You may visit the official website if you are interested.\n", 397 | "\n", 398 | "URL: https://nlp.stanford.edu/software/tokenizer.html" 399 | ] 400 | } 401 | ], 402 | "metadata": { 403 | "kernelspec": { 404 | "display_name": "Python 3", 405 | "language": "python", 406 | "name": "python3" 407 | }, 408 | "language_info": { 409 | "codemirror_mode": { 410 | "name": "ipython", 411 | "version": 3 412 | }, 413 | "file_extension": ".py", 414 | "mimetype": "text/x-python", 415 | "name": "python", 416 | "nbconvert_exporter": "python", 417 | "pygments_lexer": "ipython3", 418 | "version": "3.5.2" 419 | } 420 | }, 421 | "nbformat": 4, 422 | "nbformat_minor": 2 423 | } 424 | -------------------------------------------------------------------------------- /sample/nlp_lemmatization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Lemmatization](https://www.tell-a-tale.com/wp-content/uploads/2018/03/time-for-a-change-2015164_960_720_compressed-810x539.jpg)\n", 8 | "\n", 9 | "Source: https://www.tell-a-tale.com/unbox-idea-social-open-mic-tell-a-story-to-change-world/" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Lemmatization\n", 17 | "\n", 18 | "In English (and other languages as well), the same word may appear in different forms such as \"affected\", \"affects\" and \"affect\". \n", 19 | "To keep the vocabulary smaller and the representation better for NLP problems, we often want a single word to represent forms such as \"affected\" and \"affects\" in some scenarios. In this article, we will go through some libraries that perform lemmatization. 
20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Capture from https://en.wikipedia.org/wiki/Lemmatisation\n", 31 | "\n", 32 | "article = \"Lemmatisation (or lemmatization) in linguistics is the process of grouping together \\\n", 33 | "the inflected forms of a word so they can be analysed as a single item, identified by the word's \\\n", 34 | "lemma, or dictionary form.\"" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### spaCy" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "spaCy Version: 2.0.11\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "import spacy\n", 59 | "print('spaCy Version: %s' % (spacy.__version__))\n", 60 | "spacy_nlp = spacy.load('en_core_web_sm')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Original Article: Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.\n", 73 | "\n", 74 | "Original : Lemmatisation, New: lemmatisation\n", 75 | "Original : linguistics, New: linguistic\n", 76 | "Original : is, New: be\n", 77 | "Original : grouping, New: group\n", 78 | "Original : inflected, New: inflect\n", 79 | "Original : forms, New: form\n", 80 | "Original : they, New: -PRON-\n", 81 | "Original : analysed, New: analyse\n", 82 | "Original : identified, New: identify\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "doc = spacy_nlp(article)\n", 88 | "tokens = [token.text for token in doc]\n", 89 | "\n", 90 | "print('Original Article: %s' % (article))\n", 91 | "print()\n", 92 | "\n", 93 | "for token in doc:\n", 94 | " if token.text != token.lemma_:\n", 95 | " print('Original : %s, New: %s' % (token.text, token.lemma_))" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "spaCy converts words to lower case and changes past tense and gerund forms (other tenses as well) to the base form. Also, \"they\" is normalized to \"-PRON-\", a placeholder for pronouns. 
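If the "-PRON-" placeholder is not desirable in your output, a minimal workaround (a sketch, not a spaCy recommendation) is to fall back to the lowercased surface form; `token.lemma_` and `token.lower_` are standard spaCy token attributes:

```python
# Keep pronouns readable: use the lowercased text whenever spaCy 2.x
# returns the '-PRON-' placeholder as the lemma.
doc = spacy_nlp(article)
lemmas = [t.lower_ if t.lemma_ == '-PRON-' else t.lemma_ for t in doc]
print(lemmas)
```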
103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### NLTK" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "NLTK Version: 3.2.5\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "import nltk \n", 127 | "print('NLTK Version: %s' % (nltk.__version__))\n", 128 | "\n", 129 | "nltk.download('wordnet')\n", 130 | "\n", 131 | "wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "Original Article: Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.\n", 144 | "\n", 145 | "Original : forms, New: form\n", 146 | "Original : as, New: a\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "tokens = nltk.word_tokenize(article)\n", 152 | "\n", 153 | "print('Original Article: %s' % (article))\n", 154 | "print()\n", 155 | "\n", 156 | "for token in tokens:\n", 157 | " lemmatized_token = wordnet_lemmatizer.lemmatize(token)\n", 158 | " \n", 159 | " if token != lemmatized_token:\n", 160 | " print('Original : %s, New: %s' % (token, lemmatized_token))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "The result is totally different from spaCy. Only two words are lemmatized, and one of them, \"as\", is strange. It seems that a trailing \"s\" is removed, so \"as\" is converted to \"a\"." 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Conclusion" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "The result of spaCy is better and closer to what we expect. Taking \"as\" as an example, spaCy has enough \"intelligence\" to avoid converting \"as\" into \"a\". I studied the source code further, and it turns out there are well-defined word lists and rules supporting lemmatization. 
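As a sketch, and under the assumption that spaCy 2.x still exposes its English lemma data directly (it did in 2.0.x via `spacy.lang.en`), the rule and exception machinery can be called without running the full pipeline:

```python
# A minimal sketch for spaCy 2.0.x: drive the lemmatizer directly with the
# bundled English index, exceptions and suffix rules.
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
print(lemmatizer('analysed', 'VERB'))    # suffix rules, e.g. ['analyse']
print(lemmatizer('aardwolves', 'NOUN'))  # irregular lookup, e.g. ['aardwolf']
```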
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "# Copy from spacy/lang/en/lemmatizer/_lemma_rules.py\n", 193 | "ADJECTIVE_RULES = [\n", 194 | " [\"er\", \"\"],\n", 195 | " [\"est\", \"\"],\n", 196 | " [\"er\", \"e\"],\n", 197 | " [\"est\", \"e\"]\n", 198 | "]\n", 199 | "# Copy from spacy/lang/en/lemmatizer/_nouns_irreg.py\n", 200 | "NOUNS_IRREG = {\n", 201 | " \"aardwolves\": (\"aardwolf\",),\n", 202 | " \"abaci\": (\"abacus\",),\n", 203 | " \"aboideaux\": (\"aboideau\",),\n", 204 | " \"aboiteaux\": (\"aboiteau\",),\n", 205 | " \"abscissae\": (\"abscissa\",),\n", 206 | " \"acanthi\": (\"acanthus\",),\n", 207 | " \"acari\": (\"acarus\",),\n", 208 | "# ...\n", 209 | "}" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 7, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "# Copy from spacy/lang/fr/lemmatizer.py\n", 221 | "LOOKUP = {\n", 222 | " \"Ap.\": \"après\",\n", 223 | " \"Apr.\": \"après\",\n", 224 | " \"Auxerroises\": \"Auxerrois\",\n", 225 | " \"Av.\": \"avenue\",\n", 226 | " \"Ave.\": \"avenue\",\n", 227 | " \"Avr.\": \"avril\",\n", 228 | " \"Bd.\": \"boulevard\",\n", 229 | " \"Boliviennes\": \"Bolivien\",\n", 230 | " \"Canadiennes\": \"Canadien\",\n", 231 | " \"Cannoises\": \"Cannois\",\n", 232 | "# ...\n", 233 | "}" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "TL;DR\n", 241 | "\n", 242 | "How does spaCy lemmatize English? From the source code, it first looks at the POS (Part-of-Speech) tag. Lemmatization is performed only if the word is a noun, verb, adjective or adverb. It then checks whether the word exists in the irregular list; if so, the lemma from that list is returned. Otherwise, it falls back to the pre-defined suffix rules. 
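A simplified sketch of that flow (an illustration of the steps described above, not spaCy's actual implementation):

```python
# Lemmatize only open-class words, consult the irregular table first,
# then fall back to simple suffix replacement rules.
def lemmatize(word, pos, irregulars, suffix_rules):
    if pos not in ('NOUN', 'VERB', 'ADJ', 'ADV'):
        return word
    if word in irregulars:
        return irregulars[word][0]
    for old_suffix, new_suffix in suffix_rules.get(pos, []):
        if word.endswith(old_suffix):
            return word[:len(word) - len(old_suffix)] + new_suffix
    return word

# Irregular lookup wins before any suffix rule is tried.
print(lemmatize('aardwolves', 'NOUN', NOUNS_IRREG, {'NOUN': [('s', '')]}))  # aardwolf
```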
243 | ] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.5.2" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /sample/preprocessing/nlp-preprocessing-string_matching-fuzzywuzzy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%reload_ext autoreload\n", 17 | "%autoreload 2\n", 18 | "\n", 19 | "import sys, os\n", 20 | "def add_aion(curr_path=None):\n", 21 | " if curr_path is None:\n", 22 | " dir_path = os.getcwd()\n", 23 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n", 24 | " if target_path not in sys.path:\n", 25 | " sys.path.insert(0, target_path)\n", 26 | " \n", 27 | "add_aion()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "2018-12-27 19:39:50.455930. [DOWNLOAD] From https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.csv to ../../data/location/country.csv\n", 40 | "0 : Afghanistan\n", 41 | "1 : Åland Islands\n", 42 | "2 : Albania\n", 43 | "3 : Algeria\n", 44 | "4 : American Samoa\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "from aion.helper.file_helper import FileHelper\n", 51 | "\n", 52 | "file_helper = FileHelper()\n", 53 | "countries_file_path = file_helper.download(\n", 54 | " src='https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.csv', \n", 55 | " dest_dir='../../data/location/', dest_file='country.csv', force_download=True)\n", 56 | "\n", 57 | "country_df = pd.read_csv(countries_file_path)\n", 58 | "countries = country_df['value'].tolist()\n", 59 | "\n", 60 | "for i, country in enumerate(countries[:5]):\n", 61 | " print(i, \":\", country)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Fuzzywuzzy" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from fuzzywuzzy import fuzz\n", 78 | "from fuzzywuzzy import process" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "[('Hong Kong SAR China', 90), ('Congo - Kinshasa', 57)]\n", 91 | "[('Japan', 60), ('Yemen', 60)]\n", 92 | "[('United States', 96), ('United Arab Emirates', 86)]\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# Default scorer is Weighed Ratio\n", 98 | "for location in ['Hong Kong', 'jepen', 'United tates']:\n", 99 | " result = process.extract(location, countries, limit=2)\n", 100 | " print(result)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 6, 106 | "metadata": {}, 107 | 
"outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "[('Edwards', 92), ('Edwards2', 86), ('drawdE', 50)]" 112 | ] 113 | }, 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "# Ratio\n", 121 | "process.extract('Edward', ['Edwards', 'Edwards2', 'drawdE'], scorer=fuzz.ratio)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "[('Hong Kong SAR China', 64), ('Congo - Kinshasa', 48), ('Mongolia', 47)]" 133 | ] 134 | }, 135 | "execution_count": 7, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# Partial Ratio\n", 142 | "process.extract('Hong Kong', countries, scorer=fuzz.QRatio, limit=3)" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.5.5" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /sample/resources/LSI and LDA.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlp/2f12277b952dca39d8a392fb14e5f086a562d269/sample/resources/LSI and LDA.pptx -------------------------------------------------------------------------------- /sample/util/nlp-util-spell_corrector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Ingestion" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from collections import Counter\n", 19 | "from sklearn.datasets import fetch_20newsgroups\n", 20 | "import re\n", 21 | "\n", 22 | "\n", 23 | "corpus = []\n", 24 | "for line in fetch_20newsgroups().data:\n", 25 | " line = line.replace('\\n', ' ').replace('\\t', ' ').lower()\n", 26 | " line = re.sub('[^a-z ]', ' ', line)\n", 27 | " tokens = line.split(' ')\n", 28 | " tokens = [token for token in tokens if len(token) > 0]\n", 29 | " corpus.extend(tokens)\n", 30 | "\n", 31 | "corpus = Counter(corpus)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "%reload_ext autoreload\n", 41 | "%autoreload 2\n", 42 | "\n", 43 | "import sys, os\n", 44 | "def add_aion(curr_path=None):\n", 45 | " if curr_path is None:\n", 46 | " dir_path = os.getcwd()\n", 47 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n", 48 | " if target_path not in sys.path:\n", 49 | "# print('Added %s into sys.path.' 
% (target_path))\n", 50 | " sys.path.insert(0, target_path)\n", 51 | " \n", 52 | "add_aion()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# SpellCorrector" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Known Result: set()\n", 72 | "Edit1 Result: {'edward', 'edwards'}\n", 73 | "Edit2 Result: {'gedwards', 'edward', 'eduard', 'edvard', 'tedward', 'edgardo', 'edwards', 'tedwards'}\n" 74 | ] 75 | }, 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "'edward'" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "from aion.util.spell_corrector import SpellCorrector\n", 89 | "\n", 90 | "spell_corrector = SpellCorrector(dictionary=corpus, verbose=1)\n", 91 | "spell_corrector.correction('edwardd')" 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.5.2" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /sample/util/nlp-util-symspell.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Ingestion" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from collections import Counter\n", 19 | "from sklearn.datasets import fetch_20newsgroups\n", 20 | "import re\n", 21 | "\n", 22 | "\n", 23 | "corpus = []\n", 24 | "for line in fetch_20newsgroups().data:\n", 25 | " line = line.replace('\\n', ' ').replace('\\t', ' ').lower()\n", 26 | " line = re.sub('[^a-z ]', ' ', line)\n", 27 | " tokens = line.split(' ')\n", 28 | " tokens = [token for token in tokens if len(token) > 0]\n", 29 | " corpus.extend(tokens)\n", 30 | "\n", 31 | "corpus = Counter(corpus)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "%reload_ext autoreload\n", 41 | "%autoreload 2\n", 42 | "\n", 43 | "import sys, os\n", 44 | "def add_aion(curr_path=None):\n", 45 | " if curr_path is None:\n", 46 | " dir_path = os.getcwd()\n", 47 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n", 48 | " if target_path not in sys.path:\n", 49 | "# print('Added %s into sys.path.' 
% (target_path))\n", 50 | " sys.path.insert(0, target_path)\n", 51 | " \n", 52 | "add_aion()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "source": [ 61 | "# Symspell" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "from aion.util.spell_check import SymSpell" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "Size of dictionary: 89038\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "corpus_dir = '../../data/'\n", 90 | "corpus_file_name = 'spell_check_dictionary.txt'\n", 91 | "\n", 92 | "symspell = SymSpell(verbose=10)\n", 93 | "symspell.build_vocab(\n", 94 | " dictionary=corpus, \n", 95 | " file_dir=corpus_dir, file_name=corpus_file_name)\n", 96 | "\n", 97 | "symspell.load_vocab(corpus_file_path=corpus_dir+corpus_file_name)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Correct single word" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "[{'word': 'edward', 'distance': 1, 'count': 154}, {'word': 'edwards', 'distance': 1, 'count': 50}]\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "results = symspell.correction(word='edwarda')\n", 122 | "print(results)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Correct sentence" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "[{'word': 'hello i am ed area', 'distance': 3}]\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "results = symspell.corrections(sentence='Hello I am Edarda')\n", 147 | "print(results)" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python 3", 154 | "language": "python", 155 | "name": "python3" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.5.2" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 2 172 | } 173 | --------------------------------------------------------------------------------