├── .gitignore
├── README.md
├── aion
    ├── embeddings
    │   ├── cove.py
    │   ├── doc2vec.py
    │   ├── document_embeddings.py
    │   ├── elmo.py
    │   ├── embeddings.py
    │   ├── glove.py
    │   ├── infersent.py
    │   ├── infersent_lib
    │   │   ├── .gitignore
    │   │   ├── LICENSE
    │   │   ├── README.md
    │   │   ├── data.py
    │   │   ├── dataset
    │   │   │   ├── get_data.bash
    │   │   │   └── tokenizer.sed
    │   │   ├── encoder
    │   │   │   ├── demo.ipynb
    │   │   │   ├── extract_features.py
    │   │   │   ├── models.py
    │   │   │   └── samples.txt
    │   │   ├── models.py
    │   │   ├── mutils.py
    │   │   └── train_nli.py
    │   ├── sentence_embeddings.py
    │   ├── skip_thoughts.py
    │   └── word_embeddings.py
    ├── helper
    │   ├── __init__.py
    │   └── file_helper.py
    └── util
    │   ├── __init__.py
    │   └── spell_check.py
└── sample
    ├── embeddings
        └── nlp-embeddings-document-doc2vec.ipynb
    ├── nlp-3_basic_distance_measurement_in_text_mining.ipynb
    ├── nlp-bag_of_words.ipynb
    ├── nlp-character_embedding.ipynb
    ├── nlp-distance-edit_distance.ipynb
    ├── nlp-embeddings-sentence-elmo.ipynb
    ├── nlp-embeddings-sentence-infersent.ipynb
    ├── nlp-embeddings-word-cove.ipynb
    ├── nlp-lsa_lda.ipynb
    ├── nlp-model_interpretation-201808.ipynb
    ├── nlp-model_interpretation.ipynb
    ├── nlp-model_interpretation_anchor.ipynb
    ├── nlp-model_interpretation_shap.ipynb
    ├── nlp-named_entity_recognition.ipynb
    ├── nlp-part_of_speech.ipynb
    ├── nlp-sentence_tokenization.ipynb
    ├── nlp-skip_thoughts.ipynb
    ├── nlp-stemming.ipynb
    ├── nlp-stop_words.ipynb
    ├── nlp-text_summarization_extractive.ipynb
    ├── nlp-word_embedding.ipynb
    ├── nlp-word_mover_distance.ipynb
    ├── nlp-word_tokenization.ipynb
    ├── nlp_lemmatization.ipynb
    ├── preprocessing
        └── nlp-preprocessing-string_matching-fuzzywuzzy.ipynb
    ├── resources
        └── LSI and LDA.pptx
    └── util
        ├── nlp-util-spell_corrector.ipynb
        └── nlp-util-symspell.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc


--------------------------------------------------------------------------------
/aion/embeddings/cove.py:
--------------------------------------------------------------------------------
 1 | import keras
 2 | 
 3 | from .word_embeddings import WordEmbeddings
 4 | from .glove import GloVeEmbeddings
 5 | 
 6 | '''
 7 |     Source: https://github.com/rgsachin/CoVe
 8 | '''
 9 | 
10 | 
11 | class CoVeEmbeddings(WordEmbeddings):
12 |     COVE_MODEL_KERAS_URL = 'https://github.com/rgsachin/CoVe/raw/master/Keras_CoVe.h5'
13 |     
14 |     def __init__(self, 
15 |                  word_embeddings_dir, 
16 |                  handle_oov=True, oov_vector_type='random', 
17 |                  padding=True, pad_vector_type='random', 
18 |                  max_sequence_length=50, tokenizer=None,
19 |                  verbose=0):
20 |         super().__init__(verbose=verbose)
21 |         
22 |         if tokenizer is None:
23 |             self.tokenizer = self._tokenizer_space
24 |         
25 |         self.word_embeddings_dir = word_embeddings_dir
26 |         self.handle_oov = handle_oov
27 |         self.oov_vector_type = oov_vector_type
28 |         self.padding = padding
29 |         self.pad_vector_type = pad_vector_type
30 |         self.max_sequence_length = max_sequence_length
31 |         
32 |     def load_model(self, dest_dir, src=None, trainable=True, verbose=0):
33 |         if src is None:
34 |             src = self.COVE_MODEL_KERAS_URL
35 |         
36 |         file_path = self.download(
37 |             src=src, dest_dir=dest_dir, dest_file=None, uncompress=False)
38 |     
39 |         self.model = keras.models.load_model(file_path)
40 |         
41 |         self.word_embs_model = GloVeEmbeddings(
42 |             handle_oov=self.handle_oov, oov_vector_type=self.oov_vector_type,
43 |             padding=self.padding, pad_vector_type=self.pad_vector_type, 
44 |             max_sequence_length=self.max_sequence_length)
45 |         self.word_embs_model.load_model(dest_dir=self.word_embeddings_dir, process=False, verbose=verbose)
46 |         
47 |     def encode(self, x, tokenize=True):
48 |         if tokenize:
49 |             tokens = [self.tokenizer(sentence) for sentence in x]
50 |         else:
51 |             tokens = x
52 |         
53 |         x_embs = self.word_embs_model.encode(tokens)
54 |         
55 |         return self.model.predict(x_embs)
56 |     
57 |         


--------------------------------------------------------------------------------
/aion/embeddings/doc2vec.py:
--------------------------------------------------------------------------------
 1 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 2 | 
 3 | from .document_embeddings import DocumentEmbeddings
 4 | 
 5 | 
 6 | class Doc2VecEmbeddings(DocumentEmbeddings):
 7 |     def __init__(self, 
 8 |                  merge_mode="concat", algorithms="dm", 
 9 |                  word_dimension=300, min_word_count=1, 
10 |                  word_window=10, n_job=4, 
11 |                  train_epoch=10, infer_epoch=5,
12 |                  infer_aplha=0.1, infer_min_alpha=0.0001,
13 |                  verbose=0):
14 |         super().__init__(verbose=verbose)
15 |         
16 |         self.merge_mode = merge_mode
17 |         if merge_mode == 'concat':
18 |             self.dm_concat = 1
19 |             self.dm_mean = None
20 |         elif merge_mode == 'mean':
21 |             self.dm_concat = None
22 |             self.dm_mean = 1
23 |         else:
24 |             raise Exception('merge_mode only allows either concat or mean')
25 |         
26 |         self.algorithms = algorithms
27 |         if algorithms == 'dm':
28 |             self.dm = 1
29 |         elif algorithms == 'dbow':
30 |             self.dm = 0
31 |             
32 |         self.word_dimension = word_dimension
33 |         self.min_word_count = min_word_count
34 |         self.word_window = word_window
35 |         self.n_job = n_job
36 |         self.train_epoch = train_epoch
37 |         self.infer_epoch = infer_epoch
38 |         self.infer_alpha = infer_aplha
39 |         self.infer_min_alpha = infer_min_alpha
40 | 
41 |         self.vocab_size = 0
42 |         self.word2idx = {}
43 |         
44 |     def build_vocab(self, documents, training=True, tokenize=True):
45 |         if tokenize:
46 |             docs = [self._tokenizer_space(document) for document in documents]
47 |         else:
48 |             docs = documents
49 | 
50 |         vocab = {}
51 |         for words in docs:
52 |             for word in words:
53 |                 if word not in vocab:
54 |                     vocab[word] = 1
55 | 
56 |         if training:
57 |             self.vocab_size = len(vocab)
58 | 
59 |         
60 |         docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
61 |         return docs
62 |         
63 |     def train(self, documents):
64 |         self.model = Doc2Vec(
65 |             documents, dm_concat=self.dm_concat, dm_mean=self.dm_mean, 
66 |             dm=self.dm, vector_size=self.word_dimension, 
67 |             window=self.word_window, min_count=self.min_word_count, 
68 |             workers=self.n_job)
69 |         
70 |         self.model.train(
71 |             documents, total_words=self.vocab_size, 
72 |             epochs=self.train_epoch)
73 |         
74 |     def encode(self, documents, tokenize=True):
75 |         if tokenize:
76 |             docs = [self._tokenizer_space(document) for document in documents]
77 |         else:
78 |             docs = documents
79 |             
80 |         docs = [
81 |             self.model.infer_vector(
82 |                 document, alpha=self.infer_alpha, 
83 |                 min_alpha=self.infer_min_alpha, 
84 |                 steps=self.infer_epoch)
85 |             for document in docs
86 |         ]
87 |             
88 |         return docs
89 |         
90 |         
91 |         
92 |         


--------------------------------------------------------------------------------
/aion/embeddings/document_embeddings.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Edward Ma. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import os, datetime
17 | 
18 | from .embeddings import Embeddings
19 | 
20 | 
21 | class DocumentEmbeddings(Embeddings):
22 |     def __init__(self, verbose=0):
23 |         self.verbose = verbose


--------------------------------------------------------------------------------
/aion/embeddings/elmo.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import os
  3 | import tensorflow as tf
  4 | import tensorflow_hub as tf_hub
  5 | 
  6 | from .word_embeddings import WordEmbeddings
  7 | 
  8 | 
  9 | class ELMoEmbeddings(WordEmbeddings):
 10 |     ELMO_MODEL_V2_URL = "https://tfhub.dev/google/elmo/2"
 11 | 
 12 |     def __init__(self, layer, verbose=0):
 13 |         super().__init__(verbose=verbose)
 14 |         self.layer = layer
 15 |         
 16 |     def _set_tf_log_level(self, verbose):
 17 |         if verbose >= 30:
 18 |             tf.logging.set_verbosity(tf.logging.INFO)
 19 |         elif verbose >= 20:
 20 |             tf.logging.set_verbosity(tf.logging.WARN)
 21 |         elif verbose >= 10:
 22 |             tf.logging.set_verbosity(tf.logging.DEBUG)
 23 |         else:
 24 |             tf.logging.set_verbosity(tf.logging.ERROR)
 25 |         
 26 |     def load(self, src=None, dest_dir=None, trainable=True, verbose=0):
 27 |         self._log_time(status='LOADING', msg='file', verbose=verbose)
 28 |         self._set_tf_log_level(verbose)
 29 |         
 30 |         if src == None:
 31 |             src = self.ELMO_MODEL_V2_URL
 32 |         
 33 |         if dest_dir is not None:
 34 |             os.environ["TFHUB_CACHE_DIR"] = dest_dir
 35 |         
 36 |         self.model = tf_hub.Module(src, trainable=trainable)
 37 |         
 38 |         self._log_time(status='LOADED', msg='', verbose=verbose)
 39 |         
 40 |         return self.model    
 41 |     
 42 |     def to_keras_layer(self, x):
 43 |         # Source: https://github.com/strongio/keras-elmo/blob/master/Elmo%20Keras.ipynb
 44 |         '''
 45 |             For signature and layer parameters, you can visit https://alpha.tfhub.dev/google/elmo/2
 46 |         '''        
 47 |         return self.model(
 48 |             tf.squeeze(tf.cast(x, tf.string)), 
 49 |             signature="default", as_dict=True)[self.layer]
 50 |     
 51 |     
 52 |     # import operator
 53 | # import datetime
 54 | # import re
 55 | 
 56 | # from bilm.data import Vocabulary
 57 | 
 58 | # class ELMoEmbeddings:
 59 | #     def __init__(self, tokenizer=None, verbose=0):
 60 | #         self.verbose = verbose
 61 |         
 62 | #         self.tokenizer = self.get_tokenizer(tokenizer)
 63 | 
 64 | #     def _space_tokenizer(self, sentence):
 65 | #         # There is some unicode from source data
 66 | # #         return [t.encode('ascii', 'ignore').decode('ascii') for t in sentence.encode('ascii', 'ignore').decode('ascii').split(' ') if t != '']
 67 | # #         return [t.encode('ascii', 'ignore').decode('ascii') for t in sentence.split(' ') if t != '']
 68 | #         return [t for t in sentence.split(' ') if t != '']
 69 | 
 70 | #     def _spacy_tokenizer(self, sentence, model=None):
 71 | #         if model is None:
 72 | #             import spacy
 73 | #             model = spacy.load('en')
 74 | 
 75 | #         return [t.text.encode('ascii', 'ignore') for t in model(str(sentence)) if t.text != '']
 76 | 
 77 | #     def get_tokenizer(self, tokenizer):
 78 | #         if tokenizer is None or tokenizer == 'space':
 79 | #             tokenizer = self._space_tokenizer
 80 | #         elif tokenizer == 'spacy':
 81 | #             tokenizer = self._spacy_tokenizer
 82 | 
 83 | #         return tokenizer
 84 |     
 85 | #     def preprocess(self, sentence):
 86 | #         normalized_space = sentence.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
 87 | #         normalized_unicode = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', ' ', normalized_space)
 88 | 
 89 | #         normalized_text = re.sub(' +',' ', normalized_unicode)
 90 |         
 91 | #         return normalized_text
 92 |     
 93 | #     def get_basic_elements(self, mode):
 94 | #         if mode == 'build':
 95 | #             return ['<S>', '</S>', '<UNK>']
 96 | #         elif mode == 'train':
 97 | #             return ['<S>', '</S>']
 98 | #         return []
 99 | 
100 | #     def build_vocab(self, sentences, mode, vocab_file_path):
101 | #         word_dict = {}
102 |         
103 | #         basic_elements = self.get_basic_elements(mode)
104 | 
105 | #         for sentence in sentences:
106 | #             sentence = self.preprocess(sentence)
107 | #             for w in self.tokenizer(sentence):
108 |                 
109 | #                 if w not in word_dict:
110 | #                     word_dict[w] = 0
111 | #                 word_dict[w] += 1
112 | 
113 | #         word_dict = sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True)
114 | #         print('Total Word: %d' % (len(word_dict)))
115 |         
116 | #         with open(vocab_file_path, 'w') as f:
117 | #             for item in basic_elements:
118 | #                 f.write("%s\n" % item)
119 |             
120 | #             for word, count in word_dict:
121 | #                 # Ximenez, characters <-- finding these word to check unicode issue
122 | # #                 print([word])
123 | #                 if word != '':
124 | #                     f.write("%s\n" % word)
125 |                 
126 | #     def build_data(self, sentences, data_file_path):
127 | #         with open(data_file_path, 'w') as f:
128 | #             for sentence in sentences:
129 | #                 sentence = self.preprocess(sentence)
130 | #                 tokens = self.tokenizer(sentence)
131 | #                 if len(tokens) > 0:
132 | #                     f.write("%s\n" % ' '.join(str(tokens)))


--------------------------------------------------------------------------------
/aion/embeddings/embeddings.py:
--------------------------------------------------------------------------------
  1 | import datetime, os, urllib, zipfile
  2 | 
  3 | 
  4 | class Embeddings:
  5 |     def __init__(self, verbose=0):
  6 |         self.verbose = verbose
  7 |         self.model = {}
  8 |         self.model_path = ''
  9 |         
 10 |     def _log_time(self, status, msg, verbose):
 11 |         if self.verbose >= 10 or verbose >= 10:
 12 |             print('%s. [%s] %s' % (datetime.datetime.now(), status, msg))
 13 |             
 14 |     def download(self, src, dest_dir, dest_file, uncompress, housekeep=False, verbose=0):
 15 |         if not os.path.exists(dest_dir):
 16 |             os.makedirs(dest_dir)
 17 |     
 18 |         if dest_file is None:
 19 |             dest_file = os.path.basename(src)
 20 |             
 21 |         if not self.is_file_exist(dest_dir + dest_file):
 22 |             self._log_time(status='DOWNLOAD', msg='From '+src+' to '+dest_dir+dest_file, verbose=verbose)
 23 |             file = urllib.request.urlopen(src)
 24 |             with open(dest_dir + dest_file,'wb') as output:
 25 |                 output.write(file.read())
 26 |         else:
 27 |             self._log_time(status='FOUND', msg=dest_file+' in '+dest_dir, verbose=verbose)
 28 |             
 29 |         if uncompress:
 30 |             self.uncompress(dest_dir + dest_file)
 31 |             
 32 |         if uncompress and housekeep:
 33 |             self.housekeep(dest_dir + dest_file)
 34 |             
 35 |             
 36 |         return dest_dir + dest_file
 37 |     
 38 |     """
 39 |         File related
 40 |     """
 41 |     
 42 |     def uncompress(self):
 43 |         raise NotImplemented()
 44 |     
 45 |     def unzip(self, file_path):
 46 |         dest_dir = os.path.dirname(file_path)
 47 |         with zipfile.ZipFile(file_path, "r") as zip_ref:
 48 |             zip_ref.extractall(dest_dir)
 49 |             
 50 |     def housekeep(self, file_path):
 51 |         os.remove(file_path)
 52 |         
 53 |     def is_file_exist(self, file_path):
 54 |         return os.path.exists(file_path)
 55 |     
 56 |     def save(self):
 57 |         raise NotImplemented()
 58 |         
 59 |     def load(self):
 60 |         raise NotImplemented()
 61 |     
 62 |     """
 63 |         Model related
 64 |     """
 65 |         
 66 |     def get_model(self):
 67 |         return self.model
 68 |     
 69 |     def set_model(self, model):
 70 |         self.model = model
 71 |     
 72 |     def load(self, src=None, dest_dir=None, trainable=True, verbose=0):
 73 |         raise NotImplemented()
 74 |         
 75 |     """
 76 |         Vocabulary realted
 77 |     """
 78 |     
 79 |     def load_vocab(self, **kwargs):
 80 |         raise NotImplemented()
 81 |         
 82 |     def build_vocab(self):
 83 |         raise NotImplemented()
 84 |     
 85 |     def get_vocab(self):
 86 |         raise NotImplemented()
 87 |         
 88 |     def _tokenizer_space(self, sentence):
 89 |         return sentence.split(' ')
 90 |         
 91 |     """
 92 |         Vector related
 93 |     """
 94 |     
 95 |     def train(self):
 96 |         raise NotImplemented()
 97 |         
 98 |     def encode(self, sentences):
 99 |         raise NotImplemented()
100 |         
101 |     def visualize(self):
102 |         raise NotImplemented()
103 |         
104 |     """
105 |         Netowrk realted
106 |     """
107 |     
108 |     def to_numpy_layer(self):
109 |         raise NotImplemented()
110 |         
111 |     def to_keras_layer(self):
112 |         raise NotImplemented()
113 |         
114 |     def to_tensorflow_layer(self):
115 |         raise NotImplemented()
116 |     
117 |     def to_pytorch_layer(self):
118 |         raise NotImplemented()


--------------------------------------------------------------------------------
/aion/embeddings/glove.py:
--------------------------------------------------------------------------------
 1 | import datetime, os, zipfile
 2 | import numpy as np
 3 | 
 4 | from .word_embeddings import WordEmbeddings
 5 | 
 6 | 
 7 | class GloVeEmbeddings(WordEmbeddings):
 8 |     GLOVE_COMMON_CRAWL_MODEL_URL = 'http://nlp.stanford.edu/data/glove.42B.300d.zip'
 9 |     
10 |     def __init__(self, 
11 |                  handle_oov=True, oov_vector=None, oov_vector_type='zero',
12 |                  padding=True, pad_vector=None, pad_vector_type='zero',
13 |                  max_sequence_length=10, dimension=300,
14 |                  verbose=0):
15 |         super().__init__(
16 |             handle_oov=handle_oov, oov_vector=oov_vector, oov_vector_type=oov_vector_type,
17 |             padding=padding, pad_vector=pad_vector, pad_vector_type=pad_vector_type,
18 |             max_sequence_length=max_sequence_length, dimension=dimension,
19 |             verbose=verbose)
20 |         
21 |     def load_model(self, dest_dir, src=None, trainable=True, process=True, verbose=0):
22 |         if src is None:
23 |             src = self.GLOVE_COMMON_CRAWL_MODEL_URL
24 |             
25 |         dest_file = os.path.basename(src)
26 |             
27 |         file_path = self.download(
28 |             src=src, dest_dir=dest_dir, dest_file=None, 
29 |             uncompress=True, housekeep=False, verbose=verbose)
30 |             
31 |         self.model_path = dest_dir + dest_file
32 |         
33 |         dest_file = dest_file.replace('.zip', '.txt')
34 |             
35 |         if process and not self.is_file_exist(dest_dir + dest_file):
36 |             with open(dest_dir + dest_file, encoding="utf8" ) as f:
37 |                 lines = f.readlines()
38 | 
39 |             for line in lines:
40 |                 line_contents = line.split()
41 |                 word = line_contents[0]
42 |                 self.model[word] = np.array([float(val) for val in line_contents[1:]])
43 |             
44 |         return self.model
45 |         
46 |     def uncompress(self, file_path):
47 |         self.unzip(file_path)
48 |         
49 |     def encode(self, sentences):
50 |         preds = np.empty([len(sentences), self.max_sequence_length, self.dimension])
51 |         
52 |         for i, words in enumerate(sentences):
53 |             pred = np.empty([self.max_sequence_length, self.dimension])
54 |             cnt = 0
55 |             
56 |             for word in words:
57 |                 if self.is_vector_exist(word):
58 |                     pred[cnt] = self.model[word]
59 |                     cnt += 1
60 |                 elif self.handle_oov:
61 |                     pred[cnt] = self.oov_vector
62 |                     cnt += 1
63 |                     
64 |                 if cnt + 1 >= self.max_sequence_length:
65 |                     break
66 |                     
67 |             if self.padding and (cnt + 1 < self.max_sequence_length):
68 |                 for i in range(0, self.max_sequence_length - cnt):
69 |                     pred[cnt] = self.pad_vector
70 |                     cnt += 1
71 |                     
72 |             preds[i] = pred
73 |             
74 |         
75 |         return preds


--------------------------------------------------------------------------------
/aion/embeddings/infersent.py:
--------------------------------------------------------------------------------
 1 | import datetime, os, zipfile
 2 | import numpy as np
 3 | import torch
 4 | import subprocess
 5 | 
 6 | from .glove import GloVeEmbeddings
 7 | from .sentence_embeddings import SentenceEmbeddings
 8 | 
 9 | # InferSent (as of Sep 2018) is not a a library (https://github.com/facebookresearch/InferSent/issues/76), Cloned from https://github.com/facebookresearch/InferSent
10 | from .infersent_lib.models import InferSent
11 | 
12 | 
13 | class InferSentEmbeddings(SentenceEmbeddings):
14 |     INFERSENT_GLOVE_MODEL_URL = 'https://s3.amazonaws.com/senteval/infersent/infersent1.pkl'
15 |     INFERSENT_FASTTEXT_MODEL_URL = 'https://s3.amazonaws.com/senteval/infersent/infersent2.pkl'
16 |     
17 |     def __init__(self, 
18 |                  word_embeddings_dir,
19 |                  batch_size=64, word_dimension=300, encoder_lstm_dimension=2048, 
20 |                  pooling_type='max', model_version=1, dropout=0.0, 
21 |                  verbose=0):
22 |         super().__init__(verbose=verbose)
23 |         
24 |         self.word_embeddings_dir = word_embeddings_dir
25 |         self.batch_size = batch_size
26 |         self.word_dimension = word_dimension
27 |         self.encoder_lstm_dimension = encoder_lstm_dimension
28 |         self.pooling_type = pooling_type
29 |         self.dropout = dropout
30 |         self.model_version = model_version
31 |         
32 |     def get_params(self):
33 |         return {
34 |             'bsize': self.batch_size, 
35 |             'word_emb_dim': self.word_dimension, 
36 |             'enc_lstm_dim': self.encoder_lstm_dimension,
37 |             'pool_type': self.pooling_type, 
38 |             'dpout_model': self.dropout, 
39 |             'version': self.model_version
40 |         }
41 |         
42 |     def load_model(self, dest_dir, src=None, trainable=True, verbose=0):
43 |         # TODO: Support V2 model
44 |         if src is None:
45 |             src = InferSentEmbeddings.INFERSENT_GLOVE_MODEL_URL
46 |             
47 |         dest_file = os.path.basename(src)
48 |         file_path = self.download(
49 |             src=src, dest_dir=dest_dir, dest_file=dest_file, 
50 |             uncompress=False, housekeep=False, verbose=verbose)
51 |             
52 |         self.model = InferSent(self.get_params())
53 |         self.model.load_state_dict(torch.load(dest_dir + dest_file))
54 |         
55 |         # TODO: support different glove model and fasttext model
56 |         word_embs = GloVeEmbeddings()
57 |         word_embs.load_model(dest_dir=self.word_embeddings_dir, process=False, verbose=verbose)
58 |         
59 |         self.model.set_w2v_path(word_embs.model_path)
60 |         
61 |     def build_vocab(self, sentences, tokenize=True):
62 |         return self.model.build_vocab(sentences, tokenize=tokenize)
63 |     
64 |     def encode(self, sentences, tokenize=True):
65 |         return self.model.encode(sentences, tokenize=tokenize)
66 |     
67 |     def visualize(self, sentence, tokenize=True):
68 |         self.model.visualize(sentence, tokenize=tokenize)    


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/.gitignore:
--------------------------------------------------------------------------------
1 | dataset/GloVe
2 | dataset/MultiNLI
3 | dataset/SNLI
4 | encoder/infersent.allnli.pickle
5 | 
6 | *.swp
7 | 


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/README.md:
--------------------------------------------------------------------------------
  1 | # InferSent
  2 | 
  3 | *InferSent* is a *sentence embeddings* method that provides semantic representations for English sentences. It is trained on natural language inference data and generalizes well to many different tasks.
  4 | 
  5 | We provide our pre-trained English sentence encoder [our paper](https://arxiv.org/abs/1705.02364) and our [SentEval](https://github.com/facebookresearch/SentEval) evaluation toolkit.
  6 | 
  7 | **Recent changes**: Added infersent2 model trained on fastText vectors and added max-pool option.
  8 | 
  9 | ## Dependencies
 10 | 
 11 | This code is written in python. Dependencies include:
 12 | 
 13 | * Python 2/3
 14 | * [Pytorch](http://pytorch.org/) (recent version)
 15 | * NLTK >= 3
 16 | 
 17 | ## Download datasets
 18 | To get SNLI and MultiNLI, run (in dataset/):
 19 | ```bash
 20 | ./get_data.bash
 21 | ```
 22 | This will download and preprocess SNLI/MultiNLI datasets. For MacOS, you may have to use *p7zip* instead of *unzip*.
 23 | 
 24 | 
 25 | Download [GloVe](https://nlp.stanford.edu/projects/glove/) (V1) or [fastText](https://fasttext.cc/docs/en/english-vectors.html) (V2) vectors:
 26 | ```bash
 27 | mkdir dataset/GloVe
 28 | curl -Lo dataset/GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
 29 | unzip dataset/GloVe/glove.840B.300d.zip -d dataset/GloVe/
 30 | mkdir dataset/fastText
 31 | curl -Lo dataset/fastText/crawl-300d-2M.vec.zip https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip
 32 | unzip dataset/fastText/crawl-300d-2M.vec.zip -d dataset/fastText/
 33 | ```
 34 | 
 35 | ## Use our sentence encoder
 36 | We provide a simple interface to encode English sentences. **See [**encoder/demo.ipynb**](https://github.com/facebookresearch/InferSent/blob/master/encoder/demo.ipynb)
 37 | for a practical example.** Get started with the following steps:
 38 | 
 39 | *0.0) Download our InferSent models (V1 trained with GloVe, V2 trained with fastText)[147MB]:*
 40 | ```bash
 41 | curl -Lo encoder/infersent1.pkl https://s3.amazonaws.com/senteval/infersent/infersent1.pkl
 42 | curl -Lo encoder/infersent2.pkl https://s3.amazonaws.com/senteval/infersent/infersent2.pkl
 43 | ```
 44 | Note that infersent1 is trained with GloVe (which have been trained on text preprocessed with the PTB tokenizer) and infersent2 is trained with fastText (which have been trained on text preprocessed with the MOSES tokenizer). The latter also removes the padding of zeros with max-pooling which was inconvenient when embedding sentences outside of their batches.
 45 | 
 46 | *0.1) Make sure you have the NLTK tokenizer by running the following once:*
 47 | ```python
 48 | import nltk
 49 | nltk.download('punkt')
 50 | ```
 51 | 
 52 | *1) [Load our pre-trained model](https://github.com/facebookresearch/InferSent/blob/master/encoder/demo.ipynb) (in encoder/):*
 53 | ```python
 54 | from models import InferSent
 55 | V = 2
 56 | MODEL_PATH = 'encoder/infersent%s.pkl' % V
 57 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
 58 |                 'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
 59 | infersent = InferSent(params_model)
 60 | infersent.load_state_dict(torch.load(MODEL_PATH))
 61 | ```
 62 | 
 63 | *2) Set word vector path for the model:*
 64 | ```python
 65 | W2V_PATH = 'fastText/crawl-300d-2M.vec'
 66 | infersent.set_w2v_path(W2V_PATH)
 67 | ```
 68 | 
 69 | *3) Build the vocabulary of word vectors (i.e keep only those needed):*
 70 | ```python
 71 | infersent.build_vocab(sentences, tokenize=True)
 72 | ```
 73 | where *sentences* is your list of **n** sentences. You can update your vocabulary using *infersent.update_vocab(sentences)*, or directly load the **K** most common English words with *infersent.build_vocab_k_words(K=100000)*.
 74 | If **tokenize** is True (by default), sentences will be tokenized using NTLK.
 75 | 
 76 | *4) Encode your sentences (list of *n* sentences):*
 77 | ```python
 78 | embeddings = infersent.encode(sentences, tokenize=True)
 79 | ```
 80 | This outputs a numpy array with *n* vectors of dimension **4096**. Speed is around *1000 sentences per second* with batch size 128 on a single GPU.
 81 | 
 82 | *5) Visualize the importance that our model attributes to each word:*
 83 | 
 84 | We provide a function to visualize the importance of each word in the encoding of a sentence:
 85 | ```python
 86 | infersent.visualize('A man plays an instrument.', tokenize=True)
 87 | ```
 88 | ![Model](https://s3.amazonaws.com/senteval/infersent/visualization.png)
 89 | 
 90 | 
 91 | ## Train model on Natural Language Inference (SNLI)
 92 | To reproduce our results on [SNLI](https://nlp.stanford.edu/projects/snli/), run:
 93 | ```bash
 94 | python train_nli.py --word_emb_path '<path to word embeddings>'
 95 | ```
 96 | You should obtain a dev accuracy of 85 and a test accuracy of **[84.5](https://nlp.stanford.edu/projects/snli/)** with the default setting.
 97 | 
 98 | ## Evaluate the encoder on transfer tasks
 99 | To evaluate the model on transfer tasks, see [SentEval](https://github.com/facebookresearch/SentEval/tree/master/examples). Be mindful to choose the same tokenization used for training the encoder. You should obtain the following test results for the baselines and the InferSent models:
100 | 
101 | Model | MR | CR | SUBJ | MPQA | STS14 | [STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results) | SICK Relatedness | SICK Entailment | SST | TREC | MRPC
102 | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---:
103 | `InferSent1` | **81.1** | **86.3** | 92.4 | **90.2** | **.68/.65** | 75.8/75.5 | 0.884 | 86.1 | **84.6** | 88.2 | **76.2**/83.1
104 | `InferSent2` | 79.7 | 84.2 | 92.7 | 89.4 | **.68/.66** | **78.4/78.4** | **0.888** | **86.3** | 84.3 | **90.8** | 76.0/**83.8**
105 | `SkipThought` | 79.4 | 83.1 | **93.7** | 89.3 | .44/.45 | 72.1/70.2| 0.858 | 79.5 | 82.9 | 88.4 | -
106 | `fastText-BoV` | 78.2 | 80.2 | 91.8 | 88.0 | .65/.63 | 70.2/68.3 | 0.823 | 78.9 | 82.3 | 83.4 | 74.4/82.4
107 | 
108 | ## Reference
109 | 
110 | Please consider citing [[1]](https://arxiv.org/abs/1705.02364) if you found this code useful.
111 | 
112 | ### Supervised Learning of Universal Sentence Representations from Natural Language Inference Data (EMNLP 2017)
113 | 
114 | [1] A. Conneau, D. Kiela, H. Schwenk, L. Barrault, A. Bordes, [*Supervised Learning of Universal Sentence Representations from Natural Language Inference Data*](https://arxiv.org/abs/1705.02364)
115 | 
116 | ```
117 | @InProceedings{conneau-EtAl:2017:EMNLP2017,
118 |   author    = {Conneau, Alexis  and  Kiela, Douwe  and  Schwenk, Holger  and  Barrault, Lo\"{i}c  and  Bordes, Antoine},
119 |   title     = {Supervised Learning of Universal Sentence Representations from Natural Language Inference Data},
120 |   booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
121 |   month     = {September},
122 |   year      = {2017},
123 |   address   = {Copenhagen, Denmark},
124 |   publisher = {Association for Computational Linguistics},
125 |   pages     = {670--680},
126 |   url       = {https://www.aclweb.org/anthology/D17-1070}
127 | }
128 | ```
129 | 
130 | ### Related work
131 | * [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726)
132 | * [S. Arora, Y. Liang, T. Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx)
133 | * [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207)
134 | * [A. Conneau, D. Kiela - SentEval: An Evaluation Toolkit for Universal Sentence Representations, LREC 2018](https://arxiv.org/abs/1803.05449)
135 | * [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079)
136 | * [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334)
137 | * [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175)
138 | * [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070)
139 | * [A. Wang, A. Singh, J. Michael, F. Hill, O. Levy, S. Bowman - GLUE: A Multi-Task Benchmark and Analysis Platform
140 | for Natural Language Understanding](https://arxiv.org/abs/1804.07461)
141 | 


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/data.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | #
 7 | 
 8 | import os
 9 | import numpy as np
10 | import torch
11 | 
12 | 
13 | def get_batch(batch, word_vec, emb_dim=300):
14 |     # sent in batch in decreasing order of lengths (bsize, max_len, word_dim)
15 |     lengths = np.array([len(x) for x in batch])
16 |     max_len = np.max(lengths)
17 |     embed = np.zeros((max_len, len(batch), emb_dim))
18 | 
19 |     for i in range(len(batch)):
20 |         for j in range(len(batch[i])):
21 |             embed[j, i, :] = word_vec[batch[i][j]]
22 | 
23 |     return torch.from_numpy(embed).float(), lengths
24 | 
25 | 
26 | def get_word_dict(sentences):
27 |     # create vocab of words
28 |     word_dict = {}
29 |     for sent in sentences:
30 |         for word in sent.split():
31 |             if word not in word_dict:
32 |                 word_dict[word] = ''
33 |     word_dict['<s>'] = ''
34 |     word_dict['</s>'] = ''
35 |     word_dict['<p>'] = ''
36 |     return word_dict
37 | 
38 | 
39 | def get_glove(word_dict, glove_path):
40 |     # create word_vec with glove vectors
41 |     word_vec = {}
42 |     with open(glove_path) as f:
43 |         for line in f:
44 |             word, vec = line.split(' ', 1)
45 |             if word in word_dict:
46 |                 word_vec[word] = np.array(list(map(float, vec.split())))
47 |     print('Found {0}(/{1}) words with glove vectors'.format(
48 |                 len(word_vec), len(word_dict)))
49 |     return word_vec
50 | 
51 | 
52 | def build_vocab(sentences, glove_path):
53 |     word_dict = get_word_dict(sentences)
54 |     word_vec = get_glove(word_dict, glove_path)
55 |     print('Vocab size : {0}'.format(len(word_vec)))
56 |     return word_vec
57 | 
58 | 
59 | def get_nli(data_path):
60 |     s1 = {}
61 |     s2 = {}
62 |     target = {}
63 | 
64 |     dico_label = {'entailment': 0,  'neutral': 1, 'contradiction': 2}
65 | 
66 |     for data_type in ['train', 'dev', 'test']:
67 |         s1[data_type], s2[data_type], target[data_type] = {}, {}, {}
68 |         s1[data_type]['path'] = os.path.join(data_path, 's1.' + data_type)
69 |         s2[data_type]['path'] = os.path.join(data_path, 's2.' + data_type)
70 |         target[data_type]['path'] = os.path.join(data_path,
71 |                                                  'labels.' + data_type)
72 | 
73 |         s1[data_type]['sent'] = [line.rstrip() for line in
74 |                                  open(s1[data_type]['path'], 'r')]
75 |         s2[data_type]['sent'] = [line.rstrip() for line in
76 |                                  open(s2[data_type]['path'], 'r')]
77 |         target[data_type]['data'] = np.array([dico_label[line.rstrip('\n')]
78 |                 for line in open(target[data_type]['path'], 'r')])
79 | 
80 |         assert len(s1[data_type]['sent']) == len(s2[data_type]['sent']) == \
81 |             len(target[data_type]['data'])
82 | 
83 |         print('** {0} DATA : Found {1} pairs of {2} sentences.'.format(
84 |                 data_type.upper(), len(s1[data_type]['sent']), data_type))
85 | 
86 |     train = {'s1': s1['train']['sent'], 's2': s2['train']['sent'],
87 |              'label': target['train']['data']}
88 |     dev = {'s1': s1['dev']['sent'], 's2': s2['dev']['sent'],
89 |            'label': target['dev']['data']}
90 |     test = {'s1': s1['test']['sent'], 's2': s2['test']['sent'],
91 |             'label': target['test']['data']}
92 |     return train, dev, test
93 | 


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/dataset/get_data.bash:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree. 
 6 | #
 7 | 
 8 | preprocess_exec="sed -f tokenizer.sed"
 9 | 
10 | SNLI='https://nlp.stanford.edu/projects/snli/snli_1.0.zip'
11 | MultiNLI='https://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip'
12 | 
13 | 
14 | ZIPTOOL="unzip"
15 | 
16 | #if [ "$OSTYPE" == "darwin"* ]; then
17 | #    # unzip can't handle large files on some MacOS versions
18 | #    ZIPTOOL="7za x"
19 | #fi
20 | 
21 | 
22 | ### download SNLI
23 | mkdir SNLI
24 | curl -Lo SNLI/snli_1.0.zip $SNLI
25 | $ZIPTOOL SNLI/snli_1.0.zip -d SNLI
26 | rm SNLI/snli_1.0.zip
27 | rm -r SNLI/__MACOSX
28 | 
29 | for split in train dev test
30 | do
31 |     fpath=SNLI/$split.snli.txt
32 |     awk '{ if ( $1 != "-" ) { print $0; } }' SNLI/snli_1.0/snli_1.0_$split.txt | cut -f 1,6,7 | sed '1d' > $fpath
33 |     cut -f1 $fpath > SNLI/labels.$split
34 |     cut -f2 $fpath | $preprocess_exec > SNLI/s1.$split
35 |     cut -f3 $fpath | $preprocess_exec > SNLI/s2.$split
36 |     rm $fpath
37 | done
38 | rm -r SNLI/snli_1.0
39 | 
40 | 
41 | # MultiNLI
42 | # Test set not available yet : we define dev set as the "matched" set and the test set as the "mismatched"
43 | mkdir MultiNLI
44 | curl -Lo MultiNLI/multinli_0.9.zip $MultiNLI
45 | $ZIPTOOL MultiNLI/multinli_0.9.zip -d MultiNLI
46 | rm MultiNLI/multinli_0.9.zip
47 | rm -r MultiNLI/__MACOSX
48 | 
49 | 
50 | mv MultiNLI/multinli_0.9/multinli_0.9_train.txt MultiNLI/train.multinli.txt
51 | mv MultiNLI/multinli_0.9/multinli_0.9_dev_matched.txt MultiNLI/dev.matched.multinli.txt
52 | mv MultiNLI/multinli_0.9/multinli_0.9_dev_mismatched.txt MultiNLI/dev.mismatched.multinli.txt
53 | 
54 | rm -r MultiNLI/multinli_0.9
55 | 
56 | for split in train dev.matched dev.mismatched
57 | do
58 |     fpath=MultiNLI/$split.multinli.txt
59 |     awk '{ if ( $1 != "-" ) { print $0; } }' $fpath | cut -f 1,6,7 | sed '1d' > $fpath.tok
60 |     cut -f1 $fpath.tok > MultiNLI/labels.$split
61 |     cut -f2 $fpath.tok | $preprocess_exec > MultiNLI/s1.$split
62 |     cut -f3 $fpath.tok | $preprocess_exec > MultiNLI/s2.$split
63 |     rm $fpath $fpath.tok 
64 | done
65 | 
66 | 


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/dataset/tokenizer.sed:
--------------------------------------------------------------------------------
 1 | # Sed script to produce Penn Treebank tokenization on arbitrary raw text.
 2 | # Yeah, sure.
 3 | 
 4 | # expected input: raw text with ONE SENTENCE TOKEN PER LINE
 5 | 
 6 | # by Robert MacIntyre, University of Pennsylvania, late 1995.
 7 | 
 8 | # If this wasn't such a trivial program, I'd include all that stuff about
 9 | # no warrantee, free use, etc. from the GNU General Public License.  If you
10 | # want to be picky, assume that all of its terms apply.  Okay?
11 | 
12 | # attempt to get correct directional quotes
13 | s=^"=`` =g
14 | s=\([ ([{<]\)"=\1 `` =g
15 | # close quotes handled at end
16 | 
17 | s=\.\.\.= ... =g
18 | s=[,;:@#$%&]= & =g
19 | 
20 | # Assume sentence tokenization has been done first, so split FINAL periods
21 | # only. 
22 | s=\([^.]\)\([.]\)\([])}>"']*\)[     ]*$=\1 \2\3 =g
23 | # however, we may as well split ALL question marks and exclamation points,
24 | # since they shouldn't have the abbrev.-marker ambiguity problem
25 | s=[?!]= & =g
26 | 
27 | # parentheses, brackets, etc.
28 | s=[][(){}<>]= & =g
29 | # Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
30 | # version of these symbols.
31 | # UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
32 | # s/(/-LRB-/g
33 | # s/)/-RRB-/g
34 | # s/\[/-LSB-/g
35 | # s/\]/-RSB-/g
36 | # s/{/-LCB-/g
37 | # s/}/-RCB-/g
38 | 
39 | s=--= -- =g
40 | 
41 | # NOTE THAT SPLIT WORDS ARE NOT MARKED.  Obviously this isn't great, since
42 | # you might someday want to know how the words originally fit together --
43 | # but it's too late to make a better system now, given the millions of
44 | # words we've already done "wrong".
45 | 
46 | # First off, add a space to the beginning and end of each line, to reduce
47 | # necessary number of regexps.
48 | s=$= =
49 | s=^= =
50 | 
51 | s="= '' =g
52 | # possessive or close-single-quote
53 | s=\([^']\)' =\1 ' =g
54 | # as in it's, I'm, we'd
55 | s='\([sSmMdD]\) = '\1 =g
56 | s='ll = 'll =g
57 | s='re = 're =g
58 | s='ve = 've =g
59 | s=n't = n't =g
60 | s='LL = 'LL =g
61 | s='RE = 'RE =g
62 | s='VE = 'VE =g
63 | s=N'T = N'T =g
64 | 
65 | s= \([Cc]\)annot = \1an not =g
66 | s= \([Dd]\)'ye = \1' ye =g
67 | s= \([Gg]\)imme = \1im me =g
68 | s= \([Gg]\)onna = \1on na =g
69 | s= \([Gg]\)otta = \1ot ta =g
70 | s= \([Ll]\)emme = \1em me =g
71 | s= \([Mm]\)ore'n = \1ore 'n =g
72 | s= '\([Tt]\)is = '\1 is =g
73 | s= '\([Tt]\)was = '\1 was =g
74 | s= \([Ww]\)anna = \1an na =g
75 | # s= \([Ww]\)haddya = \1ha dd ya =g
76 | # s= \([Ww]\)hatcha = \1ha t cha =g
77 | 
78 | # clean out extra spaces
79 | s=  *= =g
80 | s=^ *==g
81 | 


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/encoder/extract_features.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | import torch
 4 | import argparse
 5 | 
 6 | import numpy as np
 7 | from models import InferSent
 8 | 
 9 | if __name__ == '__main__':
10 | 
11 |     parser = argparse.ArgumentParser(
12 |         prog='extract-features',
13 |         description='Extract features from pretrained InferSent model')
14 | 
15 |     parser.add_argument('-g', '--w2v_path', type=str, required=True,
16 |                         help='Path to word vector file')
17 |     parser.add_argument('-v', '--version', type=int, required=True,
18 |                         help='Version of InferSent (GloVe-V1 or fastText-V2)')
19 |     parser.add_argument('-f', '--model_path', type=str, required=True,
20 |                         help='Path to pretrained .pkl model file')
21 |     parser.add_argument('-t', '--tokenize', action='store_true',
22 |                         help='Passes tokenize=True to build_vocab()')
23 |     parser.add_argument('-o', '--out-dir', type=str, required=True,
24 |                         help='Output folder to save feature files')
25 |     parser.add_argument('-c', '--cpu', action='store_true',
26 |                         help='Use CPU instead of GPU.')
27 |     parser.add_argument('-b', '--batch-size', type=int, default=64,
28 |                         help='Batch size (default: 64)')
29 |     parser.add_argument('files', nargs='+',
30 |                         help='List of files to extract sentence embeddings')
31 | 
32 |     args = parser.parse_args()
33 | 
34 |     params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
35 |                     'pool_type': 'max', 'dpout_model': 0.0, 'version': args.version}
36 |     model = InferSent(params_model)
37 |     model.load_state_dict(torch.load(args.model_path))
38 | 
39 |     if not args.cpu:
40 |         model = model.cuda()
41 | 
42 |     model.set_w2v_path(args.w2v_path)
43 | 
44 |     # Ensure directory
45 |     if not os.path.exists(args.out_dir):
46 |         os.makedirs(args.out_dir)
47 | 
48 |     # Read files and extract features
49 |     for fpath in args.files:
50 |         print('Reading file {}'.format(fpath))
51 |         sents = []
52 |         with open(fpath) as f:
53 |             for line in f:
54 |                 line = line.strip()
55 |                 assert line, 'Empty line in {}'.format(fpath)
56 |                 sents.append(line)
57 | 
58 |         # Set output file name
59 |         out_name = os.path.join(
60 |             args.out_dir, "{}.embs.npy".format(os.path.basename(fpath)))
61 | 
62 |         # Build vocab
63 |         print('Building vocabulary')
64 |         model.build_vocab(sents, args.tokenize)
65 | 
66 |         # Get embeddings
67 |         embs = model.encode(sents, tokenize=args.tokenize,
68 |                             verbose=True, bsize=args.batch_size)
69 | 
70 |         print('Saving to {}'.format(out_name))
71 |         np.save(out_name, embs)
72 | 


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/encoder/models.py:
--------------------------------------------------------------------------------
1 | ../models.py


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/mutils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | #
 7 | 
 8 | import re
 9 | import inspect
10 | from torch import optim
11 | 
12 | 
13 | def get_optimizer(s):
14 |     """
15 |     Parse optimizer parameters.
16 |     Input should be of the form:
17 |         - "sgd,lr=0.01"
18 |         - "adagrad,lr=0.1,lr_decay=0.05"
19 |     """
20 |     if "," in s:
21 |         method = s[:s.find(',')]
22 |         optim_params = {}
23 |         for x in s[s.find(',') + 1:].split(','):
24 |             split = x.split('=')
25 |             assert len(split) == 2
26 |             assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
27 |             optim_params[split[0]] = float(split[1])
28 |     else:
29 |         method = s
30 |         optim_params = {}
31 | 
32 |     if method == 'adadelta':
33 |         optim_fn = optim.Adadelta
34 |     elif method == 'adagrad':
35 |         optim_fn = optim.Adagrad
36 |     elif method == 'adam':
37 |         optim_fn = optim.Adam
38 |     elif method == 'adamax':
39 |         optim_fn = optim.Adamax
40 |     elif method == 'asgd':
41 |         optim_fn = optim.ASGD
42 |     elif method == 'rmsprop':
43 |         optim_fn = optim.RMSprop
44 |     elif method == 'rprop':
45 |         optim_fn = optim.Rprop
46 |     elif method == 'sgd':
47 |         optim_fn = optim.SGD
48 |         assert 'lr' in optim_params
49 |     else:
50 |         raise Exception('Unknown optimization method: "%s"' % method)
51 | 
52 |     # check that we give good parameters to the optimizer
53 |     expected_args = inspect.getargspec(optim_fn.__init__)[0]
54 |     assert expected_args[:2] == ['self', 'params']
55 |     if not all(k in expected_args[2:] for k in optim_params.keys()):
56 |         raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
57 |             str(expected_args[2:]), str(optim_params.keys())))
58 | 
59 |     return optim_fn, optim_params
60 | 
61 | 
62 | """
63 | Importing batcher and prepare for SentEval
64 | """
65 | 
66 | 
67 | def batcher(batch, params):
68 |     # batch contains list of words
69 |     batch = [['<s>'] + s + ['</s>'] for s in batch]
70 |     sentences = [' '.join(s) for s in batch]
71 |     embeddings = params.infersent.encode(sentences, bsize=params.batch_size,
72 |                                          tokenize=False)
73 | 
74 |     return embeddings
75 | 
76 | 
77 | def prepare(params, samples):
78 |     params.infersent.build_vocab([' '.join(s) for s in samples],
79 |                                  params.glove_path, tokenize=False)
80 | 
81 | 
82 | class dotdict(dict):
83 |     """ dot.notation access to dictionary attributes """
84 |     __getattr__ = dict.get
85 |     __setattr__ = dict.__setitem__
86 |     __delattr__ = dict.__delitem__
87 | 


--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/train_nli.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2017-present, Facebook, Inc.
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | #
  7 | 
  8 | import os
  9 | import sys
 10 | import time
 11 | import argparse
 12 | 
 13 | import numpy as np
 14 | 
 15 | import torch
 16 | from torch.autograd import Variable
 17 | import torch.nn as nn
 18 | 
 19 | from data import get_nli, get_batch, build_vocab
 20 | from mutils import get_optimizer
 21 | from models import NLINet
 22 | 
 23 | 
 24 | parser = argparse.ArgumentParser(description='NLI training')
 25 | # paths
 26 | parser.add_argument("--nlipath", type=str, default='dataset/SNLI/', help="NLI data path (SNLI or MultiNLI)")
 27 | parser.add_argument("--outputdir", type=str, default='savedir/', help="Output directory")
 28 | parser.add_argument("--outputmodelname", type=str, default='model.pickle')
 29 | parser.add_argument("--word_emb_path", type=str, default="dataset/GloVe/glove.840B.300d.txt", help="word embedding file path")
 30 | 
 31 | # training
 32 | parser.add_argument("--n_epochs", type=int, default=20)
 33 | parser.add_argument("--batch_size", type=int, default=64)
 34 | parser.add_argument("--dpout_model", type=float, default=0., help="encoder dropout")
 35 | parser.add_argument("--dpout_fc", type=float, default=0., help="classifier dropout")
 36 | parser.add_argument("--nonlinear_fc", type=float, default=0, help="use nonlinearity in fc")
 37 | parser.add_argument("--optimizer", type=str, default="sgd,lr=0.1", help="adam or sgd,lr=0.1")
 38 | parser.add_argument("--lrshrink", type=float, default=5, help="shrink factor for sgd")
 39 | parser.add_argument("--decay", type=float, default=0.99, help="lr decay")
 40 | parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr")
 41 | parser.add_argument("--max_norm", type=float, default=5., help="max norm (grad clipping)")
 42 | 
 43 | # model
 44 | parser.add_argument("--encoder_type", type=str, default='InferSentV1', help="see list of encoders")
 45 | parser.add_argument("--enc_lstm_dim", type=int, default=2048, help="encoder nhid dimension")
 46 | parser.add_argument("--n_enc_layers", type=int, default=1, help="encoder num layers")
 47 | parser.add_argument("--fc_dim", type=int, default=512, help="nhid of fc layers")
 48 | parser.add_argument("--n_classes", type=int, default=3, help="entailment/neutral/contradiction")
 49 | parser.add_argument("--pool_type", type=str, default='max', help="max or mean")
 50 | 
 51 | # gpu
 52 | parser.add_argument("--gpu_id", type=int, default=3, help="GPU ID")
 53 | parser.add_argument("--seed", type=int, default=1234, help="seed")
 54 | 
 55 | # data
 56 | parser.add_argument("--word_emb_dim", type=int, default=300, help="word embedding dimension")
 57 | 
 58 | params, _ = parser.parse_known_args()
 59 | 
 60 | # set gpu device
 61 | torch.cuda.set_device(params.gpu_id)
 62 | 
 63 | # print parameters passed, and all parameters
 64 | print('\ntogrep : {0}\n'.format(sys.argv[1:]))
 65 | print(params)
 66 | 
 67 | 
 68 | """
 69 | SEED
 70 | """
 71 | np.random.seed(params.seed)
 72 | torch.manual_seed(params.seed)
 73 | torch.cuda.manual_seed(params.seed)
 74 | 
 75 | """
 76 | DATA
 77 | """
 78 | train, valid, test = get_nli(params.nlipath)
 79 | word_vec = build_vocab(train['s1'] + train['s2'] +
 80 |                        valid['s1'] + valid['s2'] +
 81 |                        test['s1'] + test['s2'], params.word_emb_path)
 82 | 
 83 | for split in ['s1', 's2']:
 84 |     for data_type in ['train', 'valid', 'test']:
 85 |         eval(data_type)[split] = np.array([['<s>'] +
 86 |             [word for word in sent.split() if word in word_vec] +
 87 |             ['</s>'] for sent in eval(data_type)[split]])
 88 | 
 89 | 
 90 | """
 91 | MODEL
 92 | """
 93 | # model config
 94 | config_nli_model = {
 95 |     'n_words'        :  len(word_vec)          ,
 96 |     'word_emb_dim'   :  params.word_emb_dim   ,
 97 |     'enc_lstm_dim'   :  params.enc_lstm_dim   ,
 98 |     'n_enc_layers'   :  params.n_enc_layers   ,
 99 |     'dpout_model'    :  params.dpout_model    ,
100 |     'dpout_fc'       :  params.dpout_fc       ,
101 |     'fc_dim'         :  params.fc_dim         ,
102 |     'bsize'          :  params.batch_size     ,
103 |     'n_classes'      :  params.n_classes      ,
104 |     'pool_type'      :  params.pool_type      ,
105 |     'nonlinear_fc'   :  params.nonlinear_fc   ,
106 |     'encoder_type'   :  params.encoder_type   ,
107 |     'use_cuda'       :  True                  ,
108 | 
109 | }
110 | 
111 | # model
112 | encoder_types = ['InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
113 |                  'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
114 |                  'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder']
115 | assert params.encoder_type in encoder_types, "encoder_type must be in " + \
116 |                                              str(encoder_types)
117 | nli_net = NLINet(config_nli_model)
118 | print(nli_net)
119 | 
120 | # loss
121 | weight = torch.FloatTensor(params.n_classes).fill_(1)
122 | loss_fn = nn.CrossEntropyLoss(weight=weight)
123 | loss_fn.size_average = False
124 | 
125 | # optimizer
126 | optim_fn, optim_params = get_optimizer(params.optimizer)
127 | optimizer = optim_fn(nli_net.parameters(), **optim_params)
128 | 
129 | # cuda by default
130 | nli_net.cuda()
131 | loss_fn.cuda()
132 | 
133 | 
134 | """
135 | TRAIN
136 | """
137 | val_acc_best = -1e10
138 | adam_stop = False
139 | stop_training = False
140 | lr = optim_params['lr'] if 'sgd' in params.optimizer else None
141 | 
142 | 
143 | def trainepoch(epoch):
144 |     print('\nTRAINING : Epoch ' + str(epoch))
145 |     nli_net.train()
146 |     all_costs = []
147 |     logs = []
148 |     words_count = 0
149 | 
150 |     last_time = time.time()
151 |     correct = 0.
152 |     # shuffle the data
153 |     permutation = np.random.permutation(len(train['s1']))
154 | 
155 |     s1 = train['s1'][permutation]
156 |     s2 = train['s2'][permutation]
157 |     target = train['label'][permutation]
158 | 
159 | 
160 |     optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * params.decay if epoch>1\
161 |         and 'sgd' in params.optimizer else optimizer.param_groups[0]['lr']
162 |     print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))
163 | 
164 |     for stidx in range(0, len(s1), params.batch_size):
165 |         # prepare batch
166 |         s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size],
167 |                                      word_vec, params.word_emb_dim)
168 |         s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size],
169 |                                      word_vec, params.word_emb_dim)
170 |         s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
171 |         tgt_batch = Variable(torch.LongTensor(target[stidx:stidx + params.batch_size])).cuda()
172 |         k = s1_batch.size(1)  # actual batch size
173 | 
174 |         # model forward
175 |         output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
176 | 
177 |         pred = output.data.max(1)[1]
178 |         correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()
179 |         assert len(pred) == len(s1[stidx:stidx + params.batch_size])
180 | 
181 |         # loss
182 |         loss = loss_fn(output, tgt_batch)
183 |         all_costs.append(loss.data[0])
184 |         words_count += (s1_batch.nelement() + s2_batch.nelement()) / params.word_emb_dim
185 | 
186 |         # backward
187 |         optimizer.zero_grad()
188 |         loss.backward()
189 | 
190 |         # gradient clipping (off by default)
191 |         shrink_factor = 1
192 |         total_norm = 0
193 | 
194 |         for p in nli_net.parameters():
195 |             if p.requires_grad:
196 |                 p.grad.data.div_(k)  # divide by the actual batch size
197 |                 total_norm += p.grad.data.norm() ** 2
198 |         total_norm = np.sqrt(total_norm)
199 | 
200 |         if total_norm > params.max_norm:
201 |             shrink_factor = params.max_norm / total_norm
202 |         current_lr = optimizer.param_groups[0]['lr'] # current lr (no external "lr", for adam)
203 |         optimizer.param_groups[0]['lr'] = current_lr * shrink_factor # just for update
204 | 
205 |         # optimizer step
206 |         optimizer.step()
207 |         optimizer.param_groups[0]['lr'] = current_lr
208 | 
209 |         if len(all_costs) == 100:
210 |             logs.append('{0} ; loss {1} ; sentence/s {2} ; words/s {3} ; accuracy train : {4}'.format(
211 |                             stidx, round(np.mean(all_costs), 2),
212 |                             int(len(all_costs) * params.batch_size / (time.time() - last_time)),
213 |                             int(words_count * 1.0 / (time.time() - last_time)),
214 |                             round(100.*correct/(stidx+k), 2)))
215 |             print(logs[-1])
216 |             last_time = time.time()
217 |             words_count = 0
218 |             all_costs = []
219 |     train_acc = round(100 * correct/len(s1), 2)
220 |     print('results : epoch {0} ; mean accuracy train : {1}'
221 |           .format(epoch, train_acc))
222 |     return train_acc
223 | 
224 | 
225 | def evaluate(epoch, eval_type='valid', final_eval=False):
226 |     nli_net.eval()
227 |     correct = 0.
228 |     global val_acc_best, lr, stop_training, adam_stop
229 | 
230 |     if eval_type == 'valid':
231 |         print('\nVALIDATION : Epoch {0}'.format(epoch))
232 | 
233 |     s1 = valid['s1'] if eval_type == 'valid' else test['s1']
234 |     s2 = valid['s2'] if eval_type == 'valid' else test['s2']
235 |     target = valid['label'] if eval_type == 'valid' else test['label']
236 | 
237 |     for i in range(0, len(s1), params.batch_size):
238 |         # prepare batch
239 |         s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec, params.word_emb_dim)
240 |         s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec, params.word_emb_dim)
241 |         s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
242 |         tgt_batch = Variable(torch.LongTensor(target[i:i + params.batch_size])).cuda()
243 | 
244 |         # model forward
245 |         output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
246 | 
247 |         pred = output.data.max(1)[1]
248 |         correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()
249 | 
250 |     # save model
251 |     eval_acc = round(100 * correct / len(s1), 2)
252 |     if final_eval:
253 |         print('finalgrep : accuracy {0} : {1}'.format(eval_type, eval_acc))
254 |     else:
255 |         print('togrep : results : epoch {0} ; mean accuracy {1} :\
256 |               {2}'.format(epoch, eval_type, eval_acc))
257 | 
258 |     if eval_type == 'valid' and epoch <= params.n_epochs:
259 |         if eval_acc > val_acc_best:
260 |             print('saving model at epoch {0}'.format(epoch))
261 |             if not os.path.exists(params.outputdir):
262 |                 os.makedirs(params.outputdir)
263 |             torch.save(nli_net.state_dict(), os.path.join(params.outputdir,
264 |                        params.outputmodelname))
265 |             val_acc_best = eval_acc
266 |         else:
267 |             if 'sgd' in params.optimizer:
268 |                 optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / params.lrshrink
269 |                 print('Shrinking lr by : {0}. New lr = {1}'
270 |                       .format(params.lrshrink,
271 |                               optimizer.param_groups[0]['lr']))
272 |                 if optimizer.param_groups[0]['lr'] < params.minlr:
273 |                     stop_training = True
274 |             if 'adam' in params.optimizer:
275 |                 # early stopping (at 2nd decrease in accuracy)
276 |                 stop_training = adam_stop
277 |                 adam_stop = True
278 |     return eval_acc
279 | 
280 | 
281 | """
282 | Train model on Natural Language Inference task
283 | """
284 | epoch = 1
285 | 
286 | while not stop_training and epoch <= params.n_epochs:
287 |     train_acc = trainepoch(epoch)
288 |     eval_acc = evaluate(epoch, 'valid')
289 |     epoch += 1
290 | 
291 | # Run best model on test set.
292 | nli_net.load_state_dict(torch.load(os.path.join(params.outputdir, params.outputmodelname)), map_location={'cuda:1' : 'cuda:0', 'cuda:2' : 'cuda:0'})
293 | 
294 | print('\nTEST : Epoch {0}'.format(epoch))
295 | evaluate(1e6, 'valid', True)
296 | evaluate(0, 'test', True)
297 | 
298 | # Save encoder instead of full model
299 | torch.save(nli_net.encoder.state_dict(), os.path.join(params.outputdir, params.outputmodelname + '.encoder.pkl'))
300 | 


--------------------------------------------------------------------------------
/aion/embeddings/sentence_embeddings.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Edward Ma. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | import os, datetime
17 | 
18 | from .embeddings import Embeddings
19 | 
20 | 
21 | class SentenceEmbeddings(Embeddings):
22 |     def __init__(self, verbose=0):
23 |         self.verbose = verbose


--------------------------------------------------------------------------------
/aion/embeddings/skip_thoughts.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2018 Edward Ma. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import os, datetime
 17 | 
 18 | class SkipThoughtsEmbeddingsTorch:
 19 |     DICTIONARY_URL = "http://www.cs.toronto.edu/~rkiros/models/dictionary.txt"
 20 |     UNISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/utable.npy"
 21 |     BISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/btable.npy"
 22 |     UNISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz"
 23 |     BISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz"
 24 |     UNISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl"
 25 |     BISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl"
 26 |     
 27 |     def __init__(self, model_dir, algorithm='uniskip', tokenizer=None, verbose=0):
 28 |         super().__init__(verbose=verbose)
 29 |         
 30 |         from torch import LongTensor
 31 |         from torch.autograd import Variable
 32 |         from skipthoughts import UniSkip, BiSkip
 33 |         
 34 |         self.model_dir = model_dir
 35 |         self.algorithm = algorithm
 36 |         self.vocab = {}
 37 |         self.vocabs = []
 38 |         if tokenizer is None:
 39 |             self.tokenizer = self._tokenizer_space
 40 |         else:
 41 |             self.tokenizer = tokenizer
 42 |         self.max_sentence_len = -1
 43 |         
 44 |     def downloads(self, dest_dir, sources=None):
 45 |         if sources is None:
 46 |             sources = [self.DICTIONARY_URL, self.UNISKIP_URL, self.BISKIP_URL, 
 47 |                        self.UNISKIPS_URL, self.BISKIPS_URL, self.UNISKIPS_PKL_URL, 
 48 |                        self.BISKIPS_PKL_URL]
 49 |         
 50 |         for src in sources:
 51 |             self.download(src=src, dest_dir=dest_dir, dest_file=None, unzip=False)
 52 |         
 53 |     def build_vocab(self, sentences, clear_vocab=True, max_sentence_len=-1):
 54 |         if clear_vocab:
 55 |             self.vocab = {}
 56 |             
 57 |         self.max_sentence_len = max_sentence_len
 58 |         
 59 |         for sentence in sentences:
 60 |             words = self.tokenizer(sentence)
 61 |             if max_sentence_len == -1:
 62 |                 self.max_sentence_len = max(self.max_sentence_len, len(words))
 63 | 
 64 |             for word in words:
 65 |                 if word not in self.vocab:
 66 |                     self.vocabs.append(word)
 67 |                     # Reserve the first one for padding
 68 |                     self.vocab[word] = len(self.vocab) + 1
 69 | 
 70 |     def process(self, sentences):
 71 |         word_id_sentences = []
 72 |         for sentence in sentences:
 73 |             word_ids = [self.vocab[w] for w in self.tokenizer(sentence) if w in self.vocab]
 74 |             
 75 |             if self.max_sentence_len > len(word_ids):
 76 |                 for i in range(0, self.max_sentence_len-len(word_ids)):
 77 |                     word_ids.append(0)
 78 |             elif self.max_sentence_len < len(word_ids):
 79 |                 word_ids = word_ids[:self.max_sentence_len]
 80 |                     
 81 |             word_id_sentences.append(word_ids)
 82 |             
 83 |         return word_id_sentences
 84 |     
 85 |     def get_algorithm(self, words, model_dir=None):
 86 |         if model_dir is None:
 87 |             model_dir = self.model_dir
 88 |             
 89 |         if self.algorithm == 'uniskip':
 90 |             return UniSkip(model_dir, words)
 91 |         else:
 92 |             return BiSkip(model_dir, words)
 93 |         
 94 |     def to_numpy_layer(self, layer):
 95 |         return layer.detach().numpy()        
 96 | 
 97 |     def encode(self, sentences, output_format='torch'):
 98 |         transformed_sentences = self.process(sentences)
 99 |         
100 |         algo = self.get_algorithm(self.vocabs)
101 |         inputs = Variable(LongTensor(transformed_sentences))
102 |         outpus = algo(inputs, lengths=[len(words) for words in transformed_sentences])
103 |         
104 |         if output_format == 'np':
105 |             return self.to_numpy_layer(outpus)
106 |         elif output_format == 'torch':
107 |             return outpus
108 |         
109 |     def predict_batch(self, sentences, output_format='torch', batch_size=1000):
110 |         batches = [sentences[i * batch_size:(i + 1) * batch_size] for i in range((len(sentences) + batch_size-1) // batch_size)]
111 | 
112 |         results = []
113 |         for batch in batches:
114 |             results.append(skip_thoughts_emb.predict(sentences=batch, output_format=output_format))
115 | 
116 |         if output_format == 'np':
117 |             return np.concatenate(results, axis=0)
118 |         elif output_format == 'torch':
119 |             return torch.cat(results, 0)


--------------------------------------------------------------------------------
/aion/embeddings/word_embeddings.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import numpy as np
 3 | 
 4 | from .embeddings import Embeddings
 5 | 
 6 | 
 7 | class WordEmbeddings(Embeddings):
 8 |     
 9 |     def __init__(self, 
10 |                  handle_oov=True, oov_vector=None, oov_vector_type='zero',
11 |                  padding=True, pad_vector=None, pad_vector_type='zero',
12 |                  max_sequence_length=10, dimension=300,
13 |                  verbose=0):
14 |         super().__init__(verbose=verbose)
15 |         self.handle_oov = handle_oov
16 |         self.oov_vector_type = oov_vector_type
17 |         if handle_oov and oov_vector is None:
18 |             if oov_vector_type == 'zero':
19 |                 self.oov_vector = np.zeros(dimension)
20 |             elif oov_vector_type == 'random':
21 |                 self.oov_vector = np.random.rand(dimension)
22 |         else:
23 |             self.oov_vector = oov_vector
24 |             
25 |         self.padding = padding
26 |         self.pad_vector_type = pad_vector_type
27 |         if padding and pad_vector is None:
28 |             if pad_vector_type == 'zero':
29 |                 self.pad_vector = np.zeros(dimension)
30 |             elif pad_vector_type == 'random':
31 |                 self.pad_vector = np.random.rand(dimension)
32 |         else:
33 |             self.pad_vector = pad_vector
34 |         
35 |         self.max_sequence_length = max_sequence_length
36 |         self.dimension = dimension
37 |         
38 |     def get_oov_vector(self):
39 |         return self.oov_vector
40 |         
41 |     def set_oov_vector(self, oov_vector):
42 |         self.oov_vector = oov_vector
43 |         
44 |     def get_pad_vector(self):
45 |         return self.pad_vector
46 |         
47 |     def set_pad_vector(self, pad_vector):
48 |         self.pad_vector = pad_vector
49 |         
50 |     def is_vector_exist(self, word):
51 |         return word in self.model


--------------------------------------------------------------------------------
/aion/helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/makcedward/nlp/2f12277b952dca39d8a392fb14e5f086a562d269/aion/helper/__init__.py


--------------------------------------------------------------------------------
/aion/helper/file_helper.py:
--------------------------------------------------------------------------------
 1 | import datetime, os, urllib.request, zipfile
 2 | 
 3 | 
 4 | class FileHelper:
 5 |     def __init__(self, verbose=0):
 6 |         self.verbose = verbose
 7 |         
 8 |     def _log_time(self, status, msg, verbose):
 9 |         if self.verbose >= 0 or verbose >= 0:
10 |             print('%s. [%s] %s' % (datetime.datetime.now(), status, msg))
11 |             
12 |     def is_file_exist(self, file_path):
13 |         return os.path.exists(file_path)
14 |             
15 |     def download(self, src, dest_dir, dest_file, uncompress=False, housekeep=False, force_download=False, verbose=0):
16 |         if not os.path.exists(dest_dir):
17 |             os.makedirs(dest_dir)
18 |             
19 | #         print('dest_dir:', dest_dir)
20 |     
21 |         if dest_file is None:
22 |             dest_file = os.path.basename(src)
23 |             
24 | #         print('dest_file:', dest_file)
25 |             
26 |         if not self.is_file_exist(dest_dir + dest_file) or force_download:
27 |             self._log_time(status='DOWNLOAD', msg='From '+src+' to '+dest_dir+dest_file, verbose=verbose)
28 |             file = urllib.request.urlopen(src)
29 |             with open(dest_dir + dest_file,'wb') as output:
30 |                 output.write(file.read())
31 |         else:
32 |             self._log_time(status='FOUND', msg=dest_file+' in '+dest_dir, verbose=verbose)
33 |             
34 | #         if uncompress:
35 | #             self.uncompress(dest_dir + dest_file)
36 |             
37 | #         if uncompress and housekeep:
38 | #             self.housekeep(dest_dir + dest_file)
39 |             
40 |         return dest_dir + dest_file


--------------------------------------------------------------------------------
/aion/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/makcedward/nlp/2f12277b952dca39d8a392fb14e5f086a562d269/aion/util/__init__.py


--------------------------------------------------------------------------------
/aion/util/spell_check.py:
--------------------------------------------------------------------------------
  1 | import re, os
  2 | from collections import Counter
  3 | from symspellpy.symspellpy import SymSpell as SymSpellPy, Verbosity
  4 | 
  5 | class SpellCheck:
  6 |     def __init__(self, dictionary=None, verbose=0):
  7 |         self.verbose = verbose
  8 |         self.dictionary = dictionary
  9 |         
 10 |     def correction(self, text):
 11 |         return ''
 12 | 
 13 | 
 14 | '''
 15 |     Source: https://norvig.com/spell-correct.html
 16 | '''
 17 | class SpellCorrector(SpellCheck):
 18 |     def __init__(self, dictionary, verbose=0):
 19 |         super().__init__(dictionary=dictionary, verbose=verbose)
 20 | 
 21 |     def words(text):
 22 |         return re.findall(r'\w+', text.lower())
 23 | 
 24 |     def P(self, word): 
 25 |         "Probability of `word`."
 26 |         N = sum(self.dictionary.values())
 27 |         return self.dictionary[word] / N
 28 | 
 29 |     def correction(self, word): 
 30 |         "Most probable spelling correction for word."
 31 |         return max(self.candidates(word), key=self.P)
 32 | 
 33 |     def candidates(self, word, verbose=0): 
 34 |         "Generate possible spelling corrections for word."
 35 |         
 36 |         known_result = self.known([word])
 37 |         edit1_result = self.known(self.edits1(word))
 38 |         edit2_result = self.known(self.edits2(word))
 39 |         
 40 |         if self.verbose > 0 or verbose > 0:
 41 |             print('Known Result: ', known_result)
 42 |             print('Edit1 Result: ', edit1_result)
 43 |             print('Edit2 Result: ', edit2_result)
 44 |         
 45 |         return (known_result or edit1_result or edit2_result or [word])
 46 | 
 47 |     def known(self, words):
 48 |         "The subset of `words` that appear in the dictionary of WORDS."
 49 |         return set(w for w in words if w in self.dictionary)
 50 | 
 51 |     def edits1(self, word):
 52 |         "All edits that are one edit away from `word`."
 53 |         letters    = 'abcdefghijklmnopqrstuvwxyz'
 54 |         splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
 55 |         deletes    = [L + R[1:]               for L, R in splits if R]
 56 |         transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
 57 |         replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
 58 |         inserts    = [L + c + R               for L, R in splits for c in letters]
 59 |         return set(deletes + transposes + replaces + inserts)
 60 | 
 61 |     def edits2(self, word): 
 62 |         "All edits that are two edits away from `word`."
 63 |         return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))
 64 |     
 65 |     
 66 | class SymSpell(SpellCheck):
 67 |     def __init__(self, dictionary_file_path='', dictionary=None, verbose=0):
 68 |         super().__init__(dictionary=dictionary, verbose=verbose)
 69 |         
 70 |         self.dictionary_file_path = dictionary_file_path
 71 |         self.model = None
 72 |         
 73 |     def load_vocab(self, corpus_file_path, max_edit_distance_dictionary=2, prefix_length=5):
 74 |         #initial_capacity = len(corpus)
 75 |         
 76 |         #sym_spell = SymSpellPy(
 77 |         #    initial_capacity, max_edit_distance_dictionary, 
 78 |         #    prefix_length)
 79 |         self.model = SymSpellPy(
 80 |             max_dictionary_edit_distance=max_edit_distance_dictionary, 
 81 |             prefix_length=prefix_length)
 82 | 
 83 |         term_index = 0  # column of the term in the dictionary text file
 84 |         count_index = 1  # column of the term frequency in the dictionary text file
 85 |         if not self.model.load_dictionary(corpus_file_path, term_index, count_index):
 86 |             print("Dictionary file not found")
 87 |         
 88 |     def build_vocab(self, dictionary, file_dir, file_name, verbose=0):
 89 |         if not os.path.exists(file_dir):
 90 |             os.makedirs(file_dir)
 91 | 
 92 |         """
 93 |             Data format:
 94 |                 token, frequency
 95 |             Example:
 96 |                 edward 154
 97 |                 edwards 50
 98 |                 ...
 99 |         """ 
100 |         if self.verbose > 3 or verbose > 3:
101 |             print('Size of dictionary: %d' % len(dictionary))
102 | 
103 |         with open(file_dir + file_name, "w") as text_file:
104 |             for token, count in dictionary.items():
105 |                 text_file.write(token + ' ' + str(count))
106 |                 text_file.write('\n')
107 |         
108 |     def correction(self, word, max_edit_distance_lookup=2, mode='cloest'): 
109 |         if mode == 'cloest':
110 |             suggestion_verbosity = Verbosity.CLOSEST
111 |         elif mode == 'top':
112 |             suggestion_verbosity = Verbosity.TOP
113 |         elif mode == 'all':
114 |             suggestion_verbosity = Verbosity.ALL
115 |               
116 |         results = self.model.lookup(
117 |             word, suggestion_verbosity, max_edit_distance_lookup)
118 |         
119 |         results = [{'word': suggestion.term, 'count': suggestion.count, 'distance': suggestion.distance} for suggestion in results]
120 |         return results
121 |     
122 |     def corrections(self, sentence, max_edit_distance_lookup=2):
123 |         normalized_sentence = (sentence.lower())
124 |         results = self.model.lookup_compound(
125 |             normalized_sentence, max_edit_distance_lookup)
126 |         
127 |         results = [{'word': suggestion.term, 'distance': suggestion.distance} for suggestion in results]
128 |         return results
129 | 


--------------------------------------------------------------------------------
/sample/embeddings/nlp-embeddings-document-doc2vec.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Ingestion"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stdout",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "Train: 2031\n",
 20 |       "Val: 226\n",
 21 |       "Test: 1502\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "import numpy as np\n",
 27 |     "from sklearn.datasets import fetch_20newsgroups\n",
 28 |     "from sklearn.model_selection import train_test_split\n",
 29 |     "\n",
 30 |     "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
 31 |     "\n",
 32 |     "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n",
 33 |     "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n",
 34 |     "\n",
 35 |     "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n",
 36 |     "x_test = np.array(test_raw_df.data)\n",
 37 |     "y_test = test_raw_df.target\n",
 38 |     "\n",
 39 |     "# x_train = [x_train[:200] for x in x_train]\n",
 40 |     "\n",
 41 |     "print('Train:', len(x_train))\n",
 42 |     "print('Val:', len(x_val))\n",
 43 |     "print('Test:', len(x_test))"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 2,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "name": "stdout",
 53 |      "output_type": "stream",
 54 |      "text": [
 55 |       "/data/jupyter/common\n",
 56 |       "Added /data/jupyter/common into sys.path.\n"
 57 |      ]
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "%reload_ext  autoreload\n",
 62 |     "%autoreload 2\n",
 63 |     "\n",
 64 |     "import sys, os\n",
 65 |     "def add_aion(curr_path=None):\n",
 66 |     "    if curr_path is None:\n",
 67 |     "        dir_path = os.getcwd()\n",
 68 |     "        target_path = os.path.dirname(os.path.dirname(dir_path))\n",
 69 |     "        print(target_path)\n",
 70 |     "        if target_path not in sys.path:\n",
 71 |     "            print('Added %s into sys.path.' % (target_path))\n",
 72 |     "            sys.path.insert(0, target_path)\n",
 73 |     "            \n",
 74 |     "add_aion()"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "# Model"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 3,
 87 |    "metadata": {
 88 |     "collapsed": true
 89 |    },
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "from aion.embeddings.doc2vec import Doc2VecEmbeddings"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 5,
 98 |    "metadata": {},
 99 |    "outputs": [
100 |     {
101 |      "name": "stdout",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "2018-10-08 22:52:10.269082 start\n",
105 |       "2018-10-08 22:53:30.387969 end\n"
106 |      ]
107 |     }
108 |    ],
109 |    "source": [
110 |     "doc2vec_embs = Doc2VecEmbeddings()\n",
111 |     "x_train_tokens = doc2vec_embs.build_vocab(documents=x_train)\n",
112 |     "doc2vec_embs.train(x_train_tokens)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 8,
118 |    "metadata": {
119 |     "collapsed": true
120 |    },
121 |    "outputs": [],
122 |    "source": [
123 |     "x_train_t = doc2vec_embs.encode(documents=x_train)\n",
124 |     "x_test_t = doc2vec_embs.encode(documents=x_test)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 9,
130 |    "metadata": {
131 |     "collapsed": true
132 |    },
133 |    "outputs": [],
134 |    "source": [
135 |     "from sklearn.linear_model import LogisticRegression\n",
136 |     "\n",
137 |     "model = LogisticRegression(solver='newton-cg', max_iter=1000)\n",
138 |     "model.fit(x_train_t, y_train)\n",
139 |     "\n",
140 |     "y_pred = model.predict(x_test_t)"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 10,
146 |    "metadata": {},
147 |    "outputs": [
148 |     {
149 |      "name": "stdout",
150 |      "output_type": "stream",
151 |      "text": [
152 |       "Accuracy:52.80%\n",
153 |       "Classification Report:\n",
154 |       "             precision    recall  f1-score   support\n",
155 |       "\n",
156 |       "          0       0.56      0.17      0.26       319\n",
157 |       "          1       0.82      0.63      0.72       389\n",
158 |       "          2       0.85      0.31      0.45       396\n",
159 |       "          3       0.38      0.93      0.54       398\n",
160 |       "\n",
161 |       "avg / total       0.66      0.53      0.50      1502\n",
162 |       "\n"
163 |      ]
164 |     }
165 |    ],
166 |    "source": [
167 |     "from sklearn.metrics import accuracy_score\n",
168 |     "from sklearn.metrics import classification_report\n",
169 |     "\n",
170 |     "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n",
171 |     "print('Classification Report:')\n",
172 |     "print(classification_report(y_test, y_pred))"
173 |    ]
174 |   }
175 |  ],
176 |  "metadata": {
177 |   "kernelspec": {
178 |    "display_name": "Python 3",
179 |    "language": "python",
180 |    "name": "python3"
181 |   },
182 |   "language_info": {
183 |    "codemirror_mode": {
184 |     "name": "ipython",
185 |     "version": 3
186 |    },
187 |    "file_extension": ".py",
188 |    "mimetype": "text/x-python",
189 |    "name": "python",
190 |    "nbconvert_exporter": "python",
191 |    "pygments_lexer": "ipython3",
192 |    "version": "3.5.2"
193 |   }
194 |  },
195 |  "nbformat": 4,
196 |  "nbformat_minor": 2
197 | }
198 | 


--------------------------------------------------------------------------------
/sample/nlp-distance-edit_distance.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Edit Distance"
 8 |    ]
 9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {},
14 |    "outputs": [
15 |     {
16 |      "name": "stdout",
17 |      "output_type": "stream",
18 |      "text": [
19 |       "Edit Distance for \"edward\" and \"edwin\" is 3\n",
20 |       "Edit Distance for \"Edward\" and \"edwin\" is 4\n"
21 |      ]
22 |     }
23 |    ],
24 |    "source": [
25 |     "import editdistance\n",
26 |     "\n",
27 |     "data = ['edward', 'Edward']\n",
28 |     "\n",
29 |     "for record in data:\n",
30 |     "    dist = editdistance.eval(record, 'edwin')\n",
31 |     "    print('Edit Distance for \"%s\" and \"%s\" is %d' % (record, 'edwin', dist))"
32 |    ]
33 |   }
34 |  ],
35 |  "metadata": {
36 |   "kernelspec": {
37 |    "display_name": "Python 3",
38 |    "language": "python",
39 |    "name": "python3"
40 |   },
41 |   "language_info": {
42 |    "codemirror_mode": {
43 |     "name": "ipython",
44 |     "version": 3
45 |    },
46 |    "file_extension": ".py",
47 |    "mimetype": "text/x-python",
48 |    "name": "python",
49 |    "nbconvert_exporter": "python",
50 |    "pygments_lexer": "ipython3",
51 |    "version": "3.5.2"
52 |   }
53 |  },
54 |  "nbformat": 4,
55 |  "nbformat_minor": 2
56 | }
57 | 


--------------------------------------------------------------------------------
/sample/nlp-embeddings-sentence-infersent.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Ingestion"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stdout",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "Train: 2031\n",
 20 |       "Val: 226\n",
 21 |       "Test: 1502\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "import numpy as np\n",
 27 |     "from sklearn.datasets import fetch_20newsgroups\n",
 28 |     "from sklearn.model_selection import train_test_split\n",
 29 |     "\n",
 30 |     "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
 31 |     "\n",
 32 |     "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n",
 33 |     "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n",
 34 |     "\n",
 35 |     "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n",
 36 |     "x_test = np.array(test_raw_df.data)\n",
 37 |     "y_test = test_raw_df.target\n",
 38 |     "\n",
 39 |     "# x_train = [x_train[:200] for x in x_train]\n",
 40 |     "\n",
 41 |     "print('Train:', len(x_train))\n",
 42 |     "print('Val:', len(x_val))\n",
 43 |     "print('Test:', len(x_test))"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 2,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "name": "stdout",
 53 |      "output_type": "stream",
 54 |      "text": [
 55 |       "Added /data/jupyter/common into sys.path.\n"
 56 |      ]
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "%reload_ext  autoreload\n",
 61 |     "%autoreload 2\n",
 62 |     "\n",
 63 |     "import sys, os\n",
 64 |     "def add_aion(curr_path=None):\n",
 65 |     "    if curr_path is None:\n",
 66 |     "        dir_path = os.getcwd()\n",
 67 |     "        target_path = os.path.dirname(dir_path)\n",
 68 |     "        if target_path not in sys.path:\n",
 69 |     "            print('Added %s into sys.path.' % (target_path))\n",
 70 |     "            sys.path.insert(0, target_path)\n",
 71 |     "            \n",
 72 |     "add_aion()"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "# Model"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 3,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "[nltk_data] Downloading package punkt to /home/dscoe/nltk_data...\n",
 92 |       "[nltk_data]   Package punkt is already up-to-date!\n"
 93 |      ]
 94 |     },
 95 |     {
 96 |      "data": {
 97 |       "text/plain": [
 98 |        "True"
 99 |       ]
100 |      },
101 |      "execution_count": 3,
102 |      "metadata": {},
103 |      "output_type": "execute_result"
104 |     }
105 |    ],
106 |    "source": [
107 |     "import nltk\n",
108 |     "nltk.download('punkt')"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 4,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "from aion.embeddings.infersent import InferSentEmbeddings\n",
118 |     "\n",
119 |     "infer_sent_embs = InferSentEmbeddings(word_embeddings_dir='../model/text/stanford/glove/', verbose=20)\n",
120 |     "infer_sent_embs.load_model(dest_dir='../model/text/facebook/infersent/')"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 5,
126 |    "metadata": {},
127 |    "outputs": [
128 |     {
129 |      "name": "stdout",
130 |      "output_type": "stream",
131 |      "text": [
132 |       "Found 22119(/46170) words with w2v vectors\n",
133 |       "Vocab size : 22119\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "infer_sent_embs.build_vocab(x_train, tokenize=True)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 6,
144 |    "metadata": {},
145 |    "outputs": [
146 |     {
147 |      "name": "stderr",
148 |      "output_type": "stream",
149 |      "text": [
150 |       "/data/jupyter/common/aion/embeddings/infersent_lib/models.py:222: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
151 |       "  sentences[stidx:stidx + bsize]), volatile=True)\n"
152 |      ]
153 |     }
154 |    ],
155 |    "source": [
156 |     "x_train_t = infer_sent_embs.encode(x_train, tokenize=True)\n",
157 |     "x_test_t = infer_sent_embs.encode(x_test, tokenize=True)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 7,
163 |    "metadata": {},
164 |    "outputs": [
165 |     {
166 |      "name": "stderr",
167 |      "output_type": "stream",
168 |      "text": [
169 |       "/anaconda/envs/py35/lib/python3.5/site-packages/scipy/optimize/linesearch.py:313: LineSearchWarning: The line search algorithm did not converge\n",
170 |       "  warn('The line search algorithm did not converge', LineSearchWarning)\n",
171 |       "/anaconda/envs/py35/lib/python3.5/site-packages/sklearn/utils/optimize.py:195: UserWarning: Line Search failed\n",
172 |       "  warnings.warn('Line Search failed')\n"
173 |      ]
174 |     }
175 |    ],
176 |    "source": [
177 |     "from sklearn.linear_model import LogisticRegression\n",
178 |     "\n",
179 |     "model = LogisticRegression(solver='newton-cg', max_iter=1000)\n",
180 |     "model.fit(x_train_t, y_train)\n",
181 |     "\n",
182 |     "y_pred = model.predict(x_test_t)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 8,
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "name": "stdout",
192 |      "output_type": "stream",
193 |      "text": [
194 |       "Accuracy:86.55%\n",
195 |       "Classification Report:\n",
196 |       "             precision    recall  f1-score   support\n",
197 |       "\n",
198 |       "          0       0.85      0.76      0.80       319\n",
199 |       "          1       0.86      0.95      0.91       389\n",
200 |       "          2       0.95      0.79      0.86       396\n",
201 |       "          3       0.82      0.94      0.87       398\n",
202 |       "\n",
203 |       "avg / total       0.87      0.87      0.86      1502\n",
204 |       "\n"
205 |      ]
206 |     }
207 |    ],
208 |    "source": [
209 |     "from sklearn.metrics import accuracy_score\n",
210 |     "from sklearn.metrics import classification_report\n",
211 |     "\n",
212 |     "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n",
213 |     "print('Classification Report:')\n",
214 |     "print(classification_report(y_test, y_pred))"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {
221 |     "collapsed": true
222 |    },
223 |    "outputs": [],
224 |    "source": []
225 |   }
226 |  ],
227 |  "metadata": {
228 |   "kernelspec": {
229 |    "display_name": "Python 3",
230 |    "language": "python",
231 |    "name": "python3"
232 |   },
233 |   "language_info": {
234 |    "codemirror_mode": {
235 |     "name": "ipython",
236 |     "version": 3
237 |    },
238 |    "file_extension": ".py",
239 |    "mimetype": "text/x-python",
240 |    "name": "python",
241 |    "nbconvert_exporter": "python",
242 |    "pygments_lexer": "ipython3",
243 |    "version": "3.5.2"
244 |   }
245 |  },
246 |  "nbformat": 4,
247 |  "nbformat_minor": 2
248 | }
249 | 


--------------------------------------------------------------------------------
/sample/nlp-embeddings-word-cove.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Ingestion"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stdout",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "Train: 2031\n",
 20 |       "Val: 226\n",
 21 |       "Test: 1502\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "import numpy as np\n",
 27 |     "from sklearn.datasets import fetch_20newsgroups\n",
 28 |     "from sklearn.model_selection import train_test_split\n",
 29 |     "\n",
 30 |     "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
 31 |     "\n",
 32 |     "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n",
 33 |     "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n",
 34 |     "\n",
 35 |     "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n",
 36 |     "x_test = np.array(test_raw_df.data)\n",
 37 |     "y_test = test_raw_df.target\n",
 38 |     "\n",
 39 |     "# x_train = [x_train[:200] for x in x_train]\n",
 40 |     "\n",
 41 |     "print('Train:', len(x_train))\n",
 42 |     "print('Val:', len(x_val))\n",
 43 |     "print('Test:', len(x_test))"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 2,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "name": "stdout",
 53 |      "output_type": "stream",
 54 |      "text": [
 55 |       "Added /data/jupyter/common into sys.path.\n"
 56 |      ]
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "%reload_ext  autoreload\n",
 61 |     "%autoreload 2\n",
 62 |     "\n",
 63 |     "import sys, os\n",
 64 |     "def add_aion(curr_path=None):\n",
 65 |     "    if curr_path is None:\n",
 66 |     "        dir_path = os.getcwd()\n",
 67 |     "        target_path = os.path.dirname(dir_path)\n",
 68 |     "        if target_path not in sys.path:\n",
 69 |     "            print('Added %s into sys.path.' % (target_path))\n",
 70 |     "            sys.path.insert(0, target_path)\n",
 71 |     "            \n",
 72 |     "add_aion()"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "# Model (Keras)"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 3,
 85 |    "metadata": {
 86 |     "collapsed": true
 87 |    },
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "max_sequence_length = 200"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 4,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "name": "stderr",
100 |      "output_type": "stream",
101 |      "text": [
102 |       "Using TensorFlow backend.\n"
103 |      ]
104 |     },
105 |     {
106 |      "name": "stdout",
107 |      "output_type": "stream",
108 |      "text": [
109 |       "2018-10-06 16:04:42.665310. [FOUND] Keras_CoVe.h5 in ../model/text/salesforce/cove/\n"
110 |      ]
111 |     },
112 |     {
113 |      "name": "stderr",
114 |      "output_type": "stream",
115 |      "text": [
116 |       "/anaconda/envs/py35/lib/python3.5/site-packages/keras/engine/saving.py:269: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually.\n",
117 |       "  warnings.warn('No training configuration found in save file: '\n"
118 |      ]
119 |     }
120 |    ],
121 |    "source": [
122 |     "from aion.embeddings.cove import CoVeEmbeddings\n",
123 |     "\n",
124 |     "cove_embs = CoVeEmbeddings(\n",
125 |     "    word_embeddings_dir='../model/text/stanford/glove/', \n",
126 |     "    max_sequence_length=max_sequence_length, verbose=20)\n",
127 |     "tmp = cove_embs.load_model(dest_dir='../model/text/salesforce/cove/')"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 5,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "x_train_t = cove_embs.encode(x_train)\n",
137 |     "x_test_t = cove_embs.encode(x_test)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 6,
143 |    "metadata": {
144 |     "collapsed": true
145 |    },
146 |    "outputs": [],
147 |    "source": [
148 |     "x_train_t2 = x_train_t.reshape(len(x_train_t), max_sequence_length*600)\n",
149 |     "x_test_t2 = x_test_t.reshape(len(x_test_t), max_sequence_length*600)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 7,
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "name": "stderr",
159 |      "output_type": "stream",
160 |      "text": [
161 |       "/anaconda/envs/py35/lib/python3.5/site-packages/scipy/optimize/linesearch.py:313: LineSearchWarning: The line search algorithm did not converge\n",
162 |       "  warn('The line search algorithm did not converge', LineSearchWarning)\n",
163 |       "/anaconda/envs/py35/lib/python3.5/site-packages/sklearn/utils/optimize.py:195: UserWarning: Line Search failed\n",
164 |       "  warnings.warn('Line Search failed')\n"
165 |      ]
166 |     }
167 |    ],
168 |    "source": [
169 |     "from sklearn.linear_model import LogisticRegression\n",
170 |     "\n",
171 |     "model = LogisticRegression(solver='newton-cg')\n",
172 |     "model.fit(x_train_t2, y_train)\n",
173 |     "\n",
174 |     "y_pred = model.predict(x_test_t2)"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "For sake of easier demonstration, I did not do any data preprocessing. It leads lots of OOV and causing the result bad."
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 8,
187 |    "metadata": {},
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "Accuracy:32.56%\n",
194 |       "Classification Report:\n",
195 |       "             precision    recall  f1-score   support\n",
196 |       "\n",
197 |       "          0       0.26      0.03      0.05       319\n",
198 |       "          1       0.34      0.61      0.43       389\n",
199 |       "          2       0.33      0.05      0.08       396\n",
200 |       "          3       0.32      0.56      0.41       398\n",
201 |       "\n",
202 |       "avg / total       0.31      0.33      0.25      1502\n",
203 |       "\n"
204 |      ]
205 |     }
206 |    ],
207 |    "source": [
208 |     "from sklearn.metrics import accuracy_score\n",
209 |     "from sklearn.metrics import classification_report\n",
210 |     "\n",
211 |     "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n",
212 |     "print('Classification Report:')\n",
213 |     "print(classification_report(y_test, y_pred))"
214 |    ]
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "kernelspec": {
219 |    "display_name": "Python 3",
220 |    "language": "python",
221 |    "name": "python3"
222 |   },
223 |   "language_info": {
224 |    "codemirror_mode": {
225 |     "name": "ipython",
226 |     "version": 3
227 |    },
228 |    "file_extension": ".py",
229 |    "mimetype": "text/x-python",
230 |    "name": "python",
231 |    "nbconvert_exporter": "python",
232 |    "pygments_lexer": "ipython3",
233 |    "version": "3.5.2"
234 |   }
235 |  },
236 |  "nbformat": 4,
237 |  "nbformat_minor": 2
238 | }
239 | 


--------------------------------------------------------------------------------
/sample/nlp-lsa_lda.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 2 latent methods for dimension reduction and topic modeling\n",
  8 |     "\n",
  9 |     "![](https://cdn.pixabay.com/photo/2015/11/07/11/17/golden-gate-bridge-1030999_960_720.jpg)\n",
 10 |     "Photo: https://pixabay.com/en/golden-gate-bridge-women-back-1030999/\n",
 11 |     "\n",
 12 |     "Before the state-of-the-art word embedding technique, Latent Semantic Analysis (LSA) and Latent Dirichlet Allocation (LDA) area good approaches to deal with NLP problems. Both LSA and LDA have same input which is Bag of words in matrix format. LSA focus on reducing matrix dimension while LDA solves topic modeling problems.\n",
 13 |     "\n",
 14 |     "I will not go through mathematical detail and as there is lot of great material for that. You may check it from reference. For the sake of keeping it easy to understand, I did not do pre-processing such as stopwords removal. It is critical part when you use LSA, LSI and LDA. After reading this article, you will know:\n",
 15 |     "- Latent Semantic Analysis (LSA)\n",
 16 |     "- Latent Dirichlet Allocation (LDA)\n",
 17 |     "- Take Away"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 1,
 23 |    "metadata": {
 24 |     "collapsed": true
 25 |    },
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "from sklearn.datasets import fetch_20newsgroups\n",
 29 |     "train_raw = fetch_20newsgroups(subset='train')\n",
 30 |     "test_raw = fetch_20newsgroups(subset='test')\n",
 31 |     "\n",
 32 |     "x_train = train_raw.data\n",
 33 |     "y_train = train_raw.target\n",
 34 |     "\n",
 35 |     "x_test = test_raw.data\n",
 36 |     "y_test = test_raw.target"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "# Latent Senmantic Analysis (LSA)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "The idea is that words will occurs in similar pieces of text if they have similar meaning. People usually use Latent Semantic Indexing (LSI) as an alternative name in NLP field.\n",
 51 |     "\n",
 52 |     "First of all, we have m documents and n words as input. An m * n matrix can be constructed while column and row are document and word respectively. You can use count occurrence or TF-IDF score. However, TF-IDF is better than count occurrence in most of the time as high frequency do not account for better classification.\n",
 53 |     "\n",
 54 |     "![](https://1.bp.blogspot.com/-tnzPA6dDtTU/Vw6EWm_PjCI/AAAAAAABDwI/JatHtUJb4fsce9E-Ns5t02_nakFtGrsugCLcB/s1600/%25E8%259E%25A2%25E5%25B9%2595%25E5%25BF%25AB%25E7%2585%25A7%2B2016-04-14%2B%25E4%25B8%258A%25E5%258D%25881.39.07.png)\n",
 55 |     "Photo: http://mropengate.blogspot.com/2016/04/tf-idf-in-r-language.html\n",
 56 |     "\n",
 57 |     "The idea of TF-IDF is that high frequency may not able to provide much information gain. In another word, rare words contribute more weights to the model. Word importance will be increased if the number of occurrence within same document (i.e. training record). On the other hand, it will be decreased if it occurs in corpus (i.e. other training records). For detail, you may check this [blog](https://towardsdatascience.com/3-basic-approaches-in-bag-of-words-which-are-better-than-word-embeddings-c2cbc7398016).\n",
 58 |     "\n",
 59 |     "The challenge is that the matrix is very sparse (or high dimension) and noisy (or include lots of low frequency word). So truncated SVD is adopted to reduce dimension.\n",
 60 |     "\n",
 61 |     "![]()\n",
 62 |     "\n",
 63 |     "The idea of SVD is finding the most valuable information and using lower dimension t to represent same thing."
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 26,
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "name": "stdout",
 73 |      "output_type": "stream",
 74 |      "text": [
 75 |       "TF-IDF output shape: (11314, 130107)\n",
 76 |       "LSA output shape: (11314, 50)\n",
 77 |       "Sum of explained variance ratio: 8%\n"
 78 |      ]
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 83 |     "from sklearn.decomposition import TruncatedSVD\n",
 84 |     "\n",
 85 |     "def build_lsa(x_train, x_test, dim=50):\n",
 86 |     "    tfidf_vec = TfidfVectorizer(use_idf=True, norm='l2')\n",
 87 |     "    svd = TruncatedSVD(n_components=dim)\n",
 88 |     "    \n",
 89 |     "    transformed_x_train = tfidf_vec.fit_transform(x_train)\n",
 90 |     "    transformed_x_test = tfidf_vec.transform(x_test)\n",
 91 |     "    \n",
 92 |     "    print('TF-IDF output shape:', transformed_x_train.shape)\n",
 93 |     "    \n",
 94 |     "    x_train_svd = svd.fit_transform(transformed_x_train)\n",
 95 |     "    x_test_svd = svd.transform(transformed_x_test)\n",
 96 |     "    \n",
 97 |     "    print('LSA output shape:', x_train_svd.shape)\n",
 98 |     "    \n",
 99 |     "    explained_variance = svd.explained_variance_ratio_.sum()\n",
100 |     "    print(\"Sum of explained variance ratio: %d%%\" % (int(explained_variance * 100)))\n",
101 |     "    \n",
102 |     "    return tfidf_vec, svd, x_train_svd, x_test_svd\n",
103 |     "\n",
104 |     "\n",
105 |     "tfidf_vec, svd, x_train_lda, x_test_lda = build_lsa(x_train, x_test)"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "markdown",
110 |    "metadata": {},
111 |    "source": [
112 |     "We can see that the dimension reduces from 130k to 50 only."
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 27,
118 |    "metadata": {},
119 |    "outputs": [
120 |     {
121 |      "name": "stdout",
122 |      "output_type": "stream",
123 |      "text": [
124 |       "Accuracy: 0.6511 (+/- 0.0201)\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "from sklearn.linear_model import LogisticRegression\n",
130 |     "from sklearn.model_selection import cross_val_score, KFold\n",
131 |     "\n",
132 |     "lr_model = LogisticRegression(solver='newton-cg',n_jobs=-1)\n",
133 |     "lr_model.fit(x_train_svd, y_train)\n",
134 |     "\n",
135 |     "cv = KFold(n_splits=5, shuffle=True)\n",
136 |     "    \n",
137 |     "scores = cross_val_score(lr_model, x_test_svd, y_test, cv=cv, scoring='accuracy')\n",
138 |     "print(\"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2))"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {},
144 |    "source": [
145 |     "# Latent Dirichlet Allocation (LDA)\n",
146 |     "\n",
147 |     "LDA is introduced by David Blei, Andrew Ng and Michael O. Jordan in 2003. It is unsupervised learning and topic model is the typical example. The assumption is that each document mix with various topics and every topic mix with various words.\n",
148 |     "\n",
149 |     "![]()\n",
150 |     "\n",
151 |     "Intuitively, you can image that we have two layer of aggregations. First layer is the distribution of categories. For example, we have finance news, weather news and political news. Second layer is distribution of words within the category. For instance, we can find \"sunny\" and \"cloud\" in weather news while \"money\" and \"stock\" exists in finance news. \n",
152 |     "\n",
153 |     "However,  \"a\", \"with\" and \"can\" do not contribute on topic modeling problem. Those words exist among documents and will have roughly same probability between categories. Therefore, stopwords removal is a critical step to achieve a better result.\n",
154 |     "\n",
155 |     "![]()\n",
156 |     "\n",
157 |     "For particular document d, we get the topic distribution which is θ. From this distribution(θ), topic t will be chosen and selecting corresponding word from ϕ."
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 44,
163 |    "metadata": {},
164 |    "outputs": [
165 |     {
166 |      "name": "stdout",
167 |      "output_type": "stream",
168 |      "text": [
169 |       "Topic 0:\n",
170 |       "['the', 'for', 'and', 'to', 'edu']\n",
171 |       "Topic 1:\n",
172 |       "['c_', 'w7', 'hz', 'mv', 'ck']\n",
173 |       "Topic 2:\n",
174 |       "['space', 'nasa', 'cmu', 'science', 'edu']\n",
175 |       "Topic 3:\n",
176 |       "['the', 'to', 'of', 'for', 'and']\n",
177 |       "Topic 4:\n",
178 |       "['the', 'to', 'of', 'and', 'in']\n",
179 |       "Topic 5:\n",
180 |       "['the', 'of', 'and', 'in', 'were']\n",
181 |       "Topic 6:\n",
182 |       "['edu', 'team', 'he', 'game', '10']\n",
183 |       "Topic 7:\n",
184 |       "['ax', 'max', 'g9v', 'b8f', 'a86']\n",
185 |       "Topic 8:\n",
186 |       "['db', 'bike', 'ac', 'image', 'dod']\n",
187 |       "Topic 9:\n",
188 |       "['nec', 'mil', 'navy', 'sg', 'behanna']\n"
189 |      ]
190 |     }
191 |    ],
192 |    "source": [
193 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
194 |     "from sklearn.decomposition import LatentDirichletAllocation\n",
195 |     "\n",
196 |     "def build_lda(x_train, num_of_topic=10):\n",
197 |     "    vec = CountVectorizer()\n",
198 |     "    transformed_x_train = vec.fit_transform(x_train)\n",
199 |     "    feature_names = vec.get_feature_names()\n",
200 |     "\n",
201 |     "    lda = LatentDirichletAllocation(\n",
202 |     "        n_components=num_of_topic, max_iter=5, \n",
203 |     "        learning_method='online', random_state=0)\n",
204 |     "    lda.fit(transformed_x_train)\n",
205 |     "\n",
206 |     "    return lda, vec, feature_names\n",
207 |     "\n",
208 |     "def display_word_distribution(model, feature_names, n_word):\n",
209 |     "    for topic_idx, topic in enumerate(model.components_):\n",
210 |     "        print(\"Topic %d:\" % (topic_idx))\n",
211 |     "        words = []\n",
212 |     "        for i in topic.argsort()[:-n_word - 1:-1]:\n",
213 |     "            words.append(feature_names[i])\n",
214 |     "        print(words)\n",
215 |     "\n",
216 |     "lda_model, vec, feature_names = build_lda(x_train)\n",
217 |     "display_word_distribution(\n",
218 |     "    model=lda_model, feature_names=feature_names, \n",
219 |     "    n_word=5)"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "# Take Away\n",
227 |     "- Both of them use __Bag-of-words as input matrix__\n",
228 |     "- The challenge of SVD is that we are __hard to determine the optimal number of dimension__. In general, low dimension consume less resource but we may not able to distinguish opposite meaning words while high dimension overcome it but consuming more resource."
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "# About Me\n",
236 |     "I am Data Scientist in Bay Area. Focusing on state-of-the-art in Data Science, Artificial Intelligence , especially in NLP and platform related. You can reach me from [Medium Blog](https://medium.com/@makcedward) or [Github](https://github.com/makcedward)."
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "markdown",
241 |    "metadata": {},
242 |    "source": [
243 |     "# Reference\n",
244 |     "- [1] SVD Tutorial: https://cs.fit.edu/~dmitra/SciComp/Resources/singular-value-decomposition-fast-track-tutorial.pdf\n",
245 |     "- [2] CUHK LSI Tutorial: http://www1.se.cuhk.edu.hk/~seem5680/lecture/LSI-Eg.pdf\n",
246 |     "- [3] Stanford LSI Tutorial: https://nlp.stanford.edu/IR-book/pdf/18lsi.pdf\n",
247 |     "- [4] LSA and LDA Explanation: https://cs.stanford.edu/~ppasupat/a9online/1140.html"
248 |    ]
249 |   }
250 |  ],
251 |  "metadata": {
252 |   "kernelspec": {
253 |    "display_name": "Python 3",
254 |    "language": "python",
255 |    "name": "python3"
256 |   },
257 |   "language_info": {
258 |    "codemirror_mode": {
259 |     "name": "ipython",
260 |     "version": 3
261 |    },
262 |    "file_extension": ".py",
263 |    "mimetype": "text/x-python",
264 |    "name": "python",
265 |    "nbconvert_exporter": "python",
266 |    "pygments_lexer": "ipython3",
267 |    "version": "3.5.2"
268 |   }
269 |  },
270 |  "nbformat": 4,
271 |  "nbformat_minor": 2
272 | }
273 | 


--------------------------------------------------------------------------------
/sample/nlp-named_entity_recognition.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "![Named Entity Recognition](https://cdn-images-1.medium.com/max/800/0*6qNBX5v1XFr1pMvr.jpg)\n",
  8 |     "Source: https://hackernoon.com/named-entity-recognition-applications-and-use-cases-c2ef0904e9fe"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "metadata": {
 15 |     "collapsed": true
 16 |    },
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "ner_dir = '/stanford/ner/'"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {
 26 |     "collapsed": true
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "# Copy from https://en.wikipedia.org/wiki/Stanford_University\n",
 31 |     "\n",
 32 |     "article = \"The university was founded in 1885 by Leland and Jane Stanford in memory of \\\n",
 33 |     "their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous \\\n",
 34 |     "year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. \\\n",
 35 |     "The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\""
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 3,
 41 |    "metadata": {
 42 |     "collapsed": true
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "article2 = 'New York, New York , NY N.Y. new york'"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "markdown",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "# Stanford NER"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 4,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "name": "stdout",
 63 |      "output_type": "stream",
 64 |      "text": [
 65 |       "NTLK Version: 3.2.5\n"
 66 |      ]
 67 |     },
 68 |     {
 69 |      "name": "stderr",
 70 |      "output_type": "stream",
 71 |      "text": [
 72 |       "/anaconda/envs/py35/lib/python3.5/site-packages/nltk/tag/stanford.py:183: DeprecationWarning: \n",
 73 |       "The StanfordTokenizer will be deprecated in version 3.2.5.\n",
 74 |       "Please use \u001b[91mnltk.tag.corenlp.CoreNLPPOSTagger\u001b[0m or \u001b[91mnltk.tag.corenlp.CoreNLPNERTagger\u001b[0m instead.\n",
 75 |       "  super(StanfordNERTagger, self).__init__(*args, **kwargs)\n"
 76 |      ]
 77 |     }
 78 |    ],
 79 |    "source": [
 80 |     "import nltk\n",
 81 |     "print('NTLK Version: %s' % nltk.__version__)\n",
 82 |     "\n",
 83 |     "from nltk.tag import StanfordNERTagger\n",
 84 |     "\n",
 85 |     "stanford_ner_tagger = StanfordNERTagger(\n",
 86 |     "    ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz',\n",
 87 |     "    ner_dir + 'stanford-ner-3.9.1.jar'\n",
 88 |     ")"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 11,
 94 |    "metadata": {
 95 |     "collapsed": true
 96 |    },
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "results = stanford_ner_tagger.tag(article.split())"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 22,
105 |    "metadata": {},
106 |    "outputs": [
107 |     {
108 |      "name": "stdout",
109 |      "output_type": "stream",
110 |      "text": [
111 |       "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n",
112 |       "\n",
113 |       "Type: LOCATION, Value: New\n",
114 |       "Type: LOCATION, Value: York\n",
115 |       "Type: LOCATION, Value: NY\n",
116 |       "Type: LOCATION, Value: N.Y.\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "print('Original Sentence: %s' % (article))\n",
122 |     "print()\n",
123 |     "for result in results:\n",
124 |     "    tag_value = result[0]\n",
125 |     "    tag_type = result[1]\n",
126 |     "    if tag_type != 'O':\n",
127 |     "        print('Type: %s, Value: %s' % (tag_type, tag_value))"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 14,
133 |    "metadata": {
134 |     "collapsed": true
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "results = stanford_ner_tagger.tag(article2.split())"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 21,
144 |    "metadata": {},
145 |    "outputs": [
146 |     {
147 |      "name": "stdout",
148 |      "output_type": "stream",
149 |      "text": [
150 |       "Original Sentence: New York, New York , NY N.Y. new york\n",
151 |       "\n",
152 |       "Type: LOCATION, Value: New\n",
153 |       "Type: LOCATION, Value: York\n",
154 |       "Type: LOCATION, Value: NY\n",
155 |       "Type: LOCATION, Value: N.Y.\n"
156 |      ]
157 |     }
158 |    ],
159 |    "source": [
160 |     "print('Original Sentence: %s' % (article2))\n",
161 |     "print()\n",
162 |     "for result in results:\n",
163 |     "    tag_value = result[0]\n",
164 |     "    tag_type = result[1]\n",
165 |     "    if tag_type != 'O':\n",
166 |     "        print('Type: %s, Value: %s' % (tag_type, tag_value))"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "# NLTK NE"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 25,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "name": "stdout",
183 |      "output_type": "stream",
184 |      "text": [
185 |       "NTLK version: 3.2.5\n"
186 |      ]
187 |     }
188 |    ],
189 |    "source": [
190 |     "import nltk\n",
191 |     "\n",
192 |     "print('NTLK version: %s' % (nltk.__version__))\n",
193 |     "\n",
194 |     "from nltk import word_tokenize, pos_tag, ne_chunk\n",
195 |     "\n",
196 |     "nltk.download('words')\n",
197 |     "nltk.download('averaged_perceptron_tagger')\n",
198 |     "nltk.download('punkt')\n",
199 |     "nltk.download('maxent_ne_chunker')"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 43,
205 |    "metadata": {
206 |     "collapsed": true
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "results = ne_chunk(pos_tag(word_tokenize(article)))"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": 44,
216 |    "metadata": {},
217 |    "outputs": [
218 |     {
219 |      "name": "stdout",
220 |      "output_type": "stream",
221 |      "text": [
222 |       "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n",
223 |       "\n",
224 |       "  (GPE Leland/NNP)\n",
225 |       "  (PERSON Jane/NNP Stanford/NNP)\n",
226 |       "  (GPE Leland/NNP)\n",
227 |       "  Stanford/NNP\n",
228 |       "  Jr./NNP\n",
229 |       "  (PERSON Stanford/NNP)\n",
230 |       "  Governor/NNP\n",
231 |       "  (GPE California/NNP)\n",
232 |       "  (GPE U.S/NNP)\n",
233 |       "  Senator/NNP\n",
234 |       "  October/NNP\n",
235 |       "  ]/NNP\n"
236 |      ]
237 |     }
238 |    ],
239 |    "source": [
240 |     "print('Original Sentence: %s' % (article))\n",
241 |     "print()\n",
242 |     "for x in str(results).split('\\n'):\n",
243 |     "    if '/NNP' in x:\n",
244 |     "        print(x)"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 45,
250 |    "metadata": {
251 |     "collapsed": true
252 |    },
253 |    "outputs": [],
254 |    "source": [
255 |     "results = ne_chunk(pos_tag(word_tokenize(article2)))"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 46,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "name": "stdout",
265 |      "output_type": "stream",
266 |      "text": [
267 |       "Original Sentence: New York, New York , NY N.Y. new york\n",
268 |       "\n",
269 |       "  (GPE New/NNP York/NNP)\n",
270 |       "  (GPE New/NNP York/NNP)\n",
271 |       "  (ORGANIZATION NY/NNP)\n",
272 |       "  N.Y./NNP\n"
273 |      ]
274 |     }
275 |    ],
276 |    "source": [
277 |     "print('Original Sentence: %s' % (article2))\n",
278 |     "print()\n",
279 |     "for x in str(results).split('\\n'):\n",
280 |     "    if '/NNP' in x:\n",
281 |     "        print(x)"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "# Spacy"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 7,
294 |    "metadata": {},
295 |    "outputs": [
296 |     {
297 |      "name": "stdout",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "spaCy: 2.0.11\n"
301 |      ]
302 |     }
303 |    ],
304 |    "source": [
305 |     "import spacy\n",
306 |     "\n",
307 |     "print('spaCy: %s' % (spacy.__version__))"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": 8,
313 |    "metadata": {
314 |     "collapsed": true
315 |    },
316 |    "outputs": [],
317 |    "source": [
318 |     "spacy_nlp = spacy.load('en')"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 20,
324 |    "metadata": {},
325 |    "outputs": [
326 |     {
327 |      "name": "stdout",
328 |      "output_type": "stream",
329 |      "text": [
330 |       "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n",
331 |       "\n",
332 |       "Type: DATE, Value: 1885\n",
333 |       "Type: GPE, Value: Leland\n",
334 |       "Type: PERSON, Value: Jane Stanford\n",
335 |       "Type: PERSON, Value: Leland Stanford Jr.\n",
336 |       "Type: DATE, Value: age 15 the previous year\n",
337 |       "Type: ORG, Value: Stanford\n",
338 |       "Type: GPE, Value: California\n",
339 |       "Type: GPE, Value: U.S.\n",
340 |       "Type: ORDINAL, Value: first\n",
341 |       "Type: DATE, Value: October 1, 1891,[2][3\n"
342 |      ]
343 |     }
344 |    ],
345 |    "source": [
346 |     "document = spacy_nlp(article)\n",
347 |     "\n",
348 |     "print('Original Sentence: %s' % (article))\n",
349 |     "print()\n",
350 |     "for element in document.ents:\n",
351 |     "    print('Type: %s, Value: %s' % (element.label_, element))"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 24,
357 |    "metadata": {},
358 |    "outputs": [
359 |     {
360 |      "name": "stdout",
361 |      "output_type": "stream",
362 |      "text": [
363 |       "Original Sentence: New York, New York , NY N.Y. new york\n",
364 |       "\n",
365 |       "Type: GPE, Value: New York\n",
366 |       "Type: GPE, Value: New York\n",
367 |       "Type: GPE, Value: NY N.Y.\n"
368 |      ]
369 |     }
370 |    ],
371 |    "source": [
372 |     "document = spacy_nlp(article2)\n",
373 |     "\n",
374 |     "print('Original Sentence: %s' % (article2))\n",
375 |     "print()\n",
376 |     "for element in document.ents:\n",
377 |     "    print('Type: %s, Value: %s' % (element.label_, element))"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": 7,
383 |    "metadata": {
384 |     "collapsed": true
385 |    },
386 |    "outputs": [],
387 |    "source": []
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": null,
392 |    "metadata": {
393 |     "collapsed": true
394 |    },
395 |    "outputs": [],
396 |    "source": []
397 |   }
398 |  ],
399 |  "metadata": {
400 |   "kernelspec": {
401 |    "display_name": "Python 3",
402 |    "language": "python",
403 |    "name": "python3"
404 |   },
405 |   "language_info": {
406 |    "codemirror_mode": {
407 |     "name": "ipython",
408 |     "version": 3
409 |    },
410 |    "file_extension": ".py",
411 |    "mimetype": "text/x-python",
412 |    "name": "python",
413 |    "nbconvert_exporter": "python",
414 |    "pygments_lexer": "ipython3",
415 |    "version": "3.5.2"
416 |   }
417 |  },
418 |  "nbformat": 4,
419 |  "nbformat_minor": 2
420 | }
421 | 


--------------------------------------------------------------------------------
/sample/nlp-part_of_speech.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "![Part of Speech](https://dailygenius.com/wp-content/uploads/2014/09/handwriting1.jpg)\n",
  8 |     "\n",
  9 |     "Source: https://dailygenius.com/handwriting-helps-learn-graphic/"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Part of Speech\n",
 17 |     "\n",
 18 |     "Part of Speech, aka POS, is referring to category of words. Same category of words can represent similar behavior. For example, \"Word\" is a noun while \"Run\" is a verb. To have a better understanding on article, we have to know the POS. \n",
 19 |     "\n",
 20 |     "In NLP, POS is an important part but we may not always deal with it directly. Lemmanization and Stemming process relies on POS but some libraries (e.g. spaCy) is very nice that helped us to tackle it.\n",
 21 |     "\n",
 22 |     "In English, we have noun, adjective, conjunction etc. Sometimes, same word can have both verb and noun. In Chinese, two major categories are Content Word and Function words which including noun, adverb, conjunction as well. \n",
 23 |     "This article includes how we can do it for English (via spaCy) and Chinese (via jieba)."
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 1,
 29 |    "metadata": {
 30 |     "collapsed": true
 31 |    },
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# Catpure from https://en.wikipedia.org/wiki/Part_of_speech\n",
 35 |     "\n",
 36 |     "article = 'In traditional grammar, a part of speech (abbreviated form: PoS or POS) is \\\n",
 37 |     "a category of words (or, more generally, of lexical items) which have similar grammatical properties. '"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 2,
 43 |    "metadata": {
 44 |     "collapsed": true
 45 |    },
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# Catpure from https://zh.wikipedia.org/wiki/%E8%A9%9E%E9%A1%9E\n",
 49 |     "\n",
 50 |     "article2 = '詞類是一個語言學術語，是一種語言中詞的語法分類，是以語法特徵\\\n",
 51 |     "（包括句法功能和形態變化）為主要依據、兼顧詞彙意義對詞進行劃分的結果。'"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "### spaCy"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 3,
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "name": "stdout",
 68 |      "output_type": "stream",
 69 |      "text": [
 70 |       "spaCy Version: 2.0.11\n"
 71 |      ]
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "import spacy\n",
 76 |     "print('spaCy Version: %s' % (spacy.__version__))\n",
 77 |     "spacy_nlp = spacy.load('en_core_web_sm')"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {},
 83 |    "source": [
 84 |     "\"a\" is DT which means deteminer. \"part\" is NN which is noun while \"of\" is IN which is preposition."
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 4,
 90 |    "metadata": {
 91 |     "scrolled": true
 92 |    },
 93 |    "outputs": [
 94 |     {
 95 |      "name": "stdout",
 96 |      "output_type": "stream",
 97 |      "text": [
 98 |       "Original Article: In traditional grammar, a part of speech (abbreviated form: PoS or POS) is a category of words (or, more generally, of lexical items) which have similar grammatical properties. \n",
 99 |       "\n",
100 |       "Word: In, POS: IN\n",
101 |       "Word: traditional, POS: JJ\n",
102 |       "Word: grammar, POS: NN\n",
103 |       "Word: ,, POS: ,\n",
104 |       "Word: a, POS: DT\n",
105 |       "Word: part, POS: NN\n",
106 |       "Word: of, POS: IN\n",
107 |       "Word: speech, POS: NN\n",
108 |       "Word: (, POS: -LRB-\n",
109 |       "Word: abbreviated, POS: VBN\n",
110 |       "Word: form, POS: NN\n",
111 |       "Word: :, POS: :\n",
112 |       "Word: PoS, POS: NNP\n",
113 |       "Word: or, POS: CC\n",
114 |       "Word: POS, POS: NNP\n",
115 |       "Word: ), POS: -RRB-\n",
116 |       "Word: is, POS: VBZ\n",
117 |       "Word: a, POS: DT\n",
118 |       "Word: category, POS: NN\n",
119 |       "Word: of, POS: IN\n",
120 |       "Word: words, POS: NNS\n",
121 |       "Word: (, POS: -LRB-\n",
122 |       "Word: or, POS: CC\n",
123 |       "Word: ,, POS: ,\n",
124 |       "Word: more, POS: RBR\n",
125 |       "Word: generally, POS: RB\n",
126 |       "Word: ,, POS: ,\n",
127 |       "Word: of, POS: IN\n",
128 |       "Word: lexical, POS: JJ\n",
129 |       "Word: items, POS: NNS\n",
130 |       "Word: ), POS: -RRB-\n",
131 |       "Word: which, POS: WDT\n",
132 |       "Word: have, POS: VBP\n",
133 |       "Word: similar, POS: JJ\n",
134 |       "Word: grammatical, POS: JJ\n",
135 |       "Word: properties, POS: NNS\n",
136 |       "Word: ., POS: .\n"
137 |      ]
138 |     }
139 |    ],
140 |    "source": [
141 |     "doc = spacy_nlp(article)\n",
142 |     "tokens = [token.text for token in doc if not token.is_stop]\n",
143 |     "\n",
144 |     "print('Original Article: %s' % (article))\n",
145 |     "print()\n",
146 |     "for token in doc:\n",
147 |     "    print('Word: %s, POS: %s' % (token.text, token.tag_))"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "### jieba"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 5,
160 |    "metadata": {},
161 |    "outputs": [
162 |     {
163 |      "name": "stdout",
164 |      "output_type": "stream",
165 |      "text": [
166 |       "jieba Version: 0.39\n"
167 |      ]
168 |     }
169 |    ],
170 |    "source": [
171 |     "import jieba\n",
172 |     "print('jieba Version: %s' % jieba.__version__)\n",
173 |     "\n",
174 |     "import jieba.posseg as jieba_pos_tagger"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "\"詞類\" is noun while \"是\" is verb."
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 6,
187 |    "metadata": {
188 |     "scrolled": true
189 |    },
190 |    "outputs": [
191 |     {
192 |      "name": "stderr",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "Building prefix dict from the default dictionary ...\n",
196 |       "Loading model from cache /tmp/jieba.cache\n"
197 |      ]
198 |     },
199 |     {
200 |      "name": "stdout",
201 |      "output_type": "stream",
202 |      "text": [
203 |       "Original Article: 詞類是一個語言學術語，是一種語言中詞的語法分類，是以語法特徵（包括句法功能和形態變化）為主要依據、兼顧詞彙意義對詞進行劃分的結果。\n",
204 |       "\n"
205 |      ]
206 |     },
207 |     {
208 |      "name": "stderr",
209 |      "output_type": "stream",
210 |      "text": [
211 |       "Loading model cost 1.159 seconds.\n",
212 |       "Prefix dict has been built succesfully.\n"
213 |      ]
214 |     },
215 |     {
216 |      "name": "stdout",
217 |      "output_type": "stream",
218 |      "text": [
219 |       "Word: 詞類, POS: n\n",
220 |       "Word: 是, POS: v\n",
221 |       "Word: 一個, POS: m\n",
222 |       "Word: 語言, POS: n\n",
223 |       "Word: 學術, POS: n\n",
224 |       "Word: 語, POS: n\n",
225 |       "Word: ，, POS: x\n",
226 |       "Word: 是, POS: v\n",
227 |       "Word: 一種, POS: m\n",
228 |       "Word: 語, POS: n\n",
229 |       "Word: 言中, POS: nr\n",
230 |       "Word: 詞, POS: n\n",
231 |       "Word: 的, POS: uj\n",
232 |       "Word: 語法, POS: n\n",
233 |       "Word: 分類, POS: vn\n",
234 |       "Word: ，, POS: x\n",
235 |       "Word: 是, POS: v\n",
236 |       "Word: 以, POS: p\n",
237 |       "Word: 語, POS: n\n",
238 |       "Word: 法特, POS: ns\n",
239 |       "Word: 徵, POS: zg\n",
240 |       "Word: （, POS: x\n",
241 |       "Word: 包括, POS: v\n",
242 |       "Word: 句法, POS: n\n",
243 |       "Word: 功能, POS: n\n",
244 |       "Word: 和, POS: c\n",
245 |       "Word: 形態, POS: n\n",
246 |       "Word: 變化, POS: vn\n",
247 |       "Word: ）, POS: x\n",
248 |       "Word: 為, POS: zg\n",
249 |       "Word: 主要, POS: b\n",
250 |       "Word: 依據, POS: p\n",
251 |       "Word: 、, POS: x\n",
252 |       "Word: 兼顧, POS: v\n",
253 |       "Word: 詞, POS: n\n",
254 |       "Word: 彙, POS: zg\n",
255 |       "Word: 意, POS: ng\n",
256 |       "Word: 義, POS: nt\n",
257 |       "Word: 對, POS: p\n",
258 |       "Word: 詞, POS: n\n",
259 |       "Word: 進, POS: v\n",
260 |       "Word: 行, POS: v\n",
261 |       "Word: 劃, POS: v\n",
262 |       "Word: 分, POS: q\n",
263 |       "Word: 的, POS: uj\n",
264 |       "Word: 結, POS: v\n",
265 |       "Word: 果, POS: ng\n",
266 |       "Word: 。, POS: x\n"
267 |      ]
268 |     }
269 |    ],
270 |    "source": [
271 |     "print('Original Article: %s' % (article2))\n",
272 |     "print()\n",
273 |     "\n",
274 |     "words = jieba_pos_tagger.cut(article2)\n",
275 |     "\n",
276 |     "for word in words:\n",
277 |     "    print('Word: %s, POS: %s' % (word.word, word.flag))"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "markdown",
282 |    "metadata": {},
283 |    "source": [
284 |     "# Conclusion\n",
285 |     "\n",
286 |     "POS helps a lot on text pre-processing. For example, we have to know the POS of word in order to perform lemmanization, stemming and stop word removal. These three pre-processing will be discussed in later article. Stay tuned."
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "markdown",
291 |    "metadata": {},
292 |    "source": [
293 |     "# Reference\n",
294 |     "\n",
295 |     "Standard Syntactic Categories: https://cs.nyu.edu/grishman/jet/guide/PennPOS.html"
296 |    ]
297 |   }
298 |  ],
299 |  "metadata": {
300 |   "kernelspec": {
301 |    "display_name": "Python 3",
302 |    "language": "python",
303 |    "name": "python3"
304 |   },
305 |   "language_info": {
306 |    "codemirror_mode": {
307 |     "name": "ipython",
308 |     "version": 3
309 |    },
310 |    "file_extension": ".py",
311 |    "mimetype": "text/x-python",
312 |    "name": "python",
313 |    "nbconvert_exporter": "python",
314 |    "pygments_lexer": "ipython3",
315 |    "version": "3.5.2"
316 |   }
317 |  },
318 |  "nbformat": 4,
319 |  "nbformat_minor": 2
320 | }
321 | 


--------------------------------------------------------------------------------
/sample/nlp-sentence_tokenization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "![Sentence Tokenization](http://www.digitalmeetsculture.net/wp-content/uploads/2015/04/article.jpg)\n",
 10 |     "\n",
 11 |     "Source: http://www.digitalmeetsculture.net/article/article-about-preforma-published-in-archival-science/"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {
 17 |     "collapsed": true
 18 |    },
 19 |    "source": [
 20 |     "# Sentence Tokenization\n",
 21 |     "\n",
 22 |     "In previous article, word tokenization is introduced. What if we want to tokenize sentence? In general, we can easily split sentence by some punctuation such ., ? and !. However, there are lots of exception if we splitting article by those punctuation only.\n",
 23 |     "In this article, you will go through why we need to use sentence tokenization and how can we use it."
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {
 29 |     "collapsed": true
 30 |    },
 31 |    "source": [
 32 |     "# Why?\n",
 33 |     "According to researchers, about 86% of article include the importance sentence in first one or two sentences. Believe that it is one of the reason why textsum model use first 2 sentences for training\n",
 34 |     "When I am in school, teacher teaches how we should write an article. The importance sentence will be placed in the first sentence most of the time. It may exists in last sentence sometimes."
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# How?\n",
 42 |     "So how can we tokenize sentence? You can use the following simple python script to do that or using library such as nltk and spacy"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 1,
 48 |    "metadata": {
 49 |     "collapsed": true
 50 |    },
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "# Capture from https://en.wikipedia.org/wiki/Lexical_analysis\n",
 54 |     "\n",
 55 |     "article = 'In computer science, lexical analysis, lexing or tokenization is the process of \\\n",
 56 |     "converting a sequence of characters (such as in a computer program or web page) into a \\\n",
 57 |     "sequence of tokens (strings with an assigned and thus identified meaning). A program that \\\n",
 58 |     "performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner \\\n",
 59 |     "is also a term for the first stage of a lexer. A lexer is generally combined with a parser, \\\n",
 60 |     "which together analyze the syntax of programming languages, web pages, and so forth.'\n",
 61 |     "\n",
 62 |     "article2 = 'ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456'\n",
 63 |     "\n",
 64 |     "article3 = 'It is a great moment from 10 a.m. to 1 p.m. every weekend.'"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "### Self build"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 2,
 77 |    "metadata": {},
 78 |    "outputs": [
 79 |     {
 80 |      "name": "stdout",
 81 |      "output_type": "stream",
 82 |      "text": [
 83 |       "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
 84 |       "\n",
 85 |       "-->Sentence 0: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning)\n",
 86 |       "-->Sentence 1: .\n",
 87 |       "-->Sentence 2:  A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer\n",
 88 |       "-->Sentence 3: .\n",
 89 |       "-->Sentence 4:  A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth\n",
 90 |       "-->Sentence 5: .\n",
 91 |       "-->Sentence 6: \n",
 92 |       "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
 93 |       "\n",
 94 |       "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_\n",
 95 |       "-->Sentence 1: !\n",
 96 |       "-->Sentence 2: @# \n",
 97 |       "-->Sentence 3: !\n",
 98 |       "-->Sentence 4: @#$%^&*()_+ 0123456\n",
 99 |       "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
100 |       "\n",
101 |       "-->Sentence 0: It is a great moment from 10 a\n",
102 |       "-->Sentence 1: .\n",
103 |       "-->Sentence 2: m\n",
104 |       "-->Sentence 3: .\n",
105 |       "-->Sentence 4:  to 1 p\n",
106 |       "-->Sentence 5: .\n",
107 |       "-->Sentence 6: m\n",
108 |       "-->Sentence 7: .\n",
109 |       "-->Sentence 8:  every weekend\n",
110 |       "-->Sentence 9: .\n",
111 |       "-->Sentence 10: \n"
112 |      ]
113 |     }
114 |    ],
115 |    "source": [
116 |     "import re\n",
117 |     "\n",
118 |     "for doc in [article, article2, article3]:\n",
119 |     "    print('Original Article: %s' % (doc))\n",
120 |     "    print()\n",
121 |     "\n",
122 |     "    sentences = re.split('(\\.|!|\\?)', doc)\n",
123 |     "    \n",
124 |     "    for i, s in enumerate(sentences):\n",
125 |     "        print('-->Sentence %d: %s' % (i, s))"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "You can see that, \"a.m.\" should treat as a \"word\". Of course, we can enhance the above regular expression to do it. But I will go for library rather than build the wheel again"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "### spaCy"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 3,
145 |    "metadata": {},
146 |    "outputs": [
147 |     {
148 |      "name": "stdout",
149 |      "output_type": "stream",
150 |      "text": [
151 |       "spaCy Version: 2.0.11\n"
152 |      ]
153 |     }
154 |    ],
155 |    "source": [
156 |     "import spacy\n",
157 |     "print('spaCy Version: %s' % spacy.__version__)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 4,
163 |    "metadata": {
164 |     "collapsed": true
165 |    },
166 |    "outputs": [],
167 |    "source": [
168 |     "spacy_nlp = spacy.load('en_core_web_sm')"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 5,
174 |    "metadata": {},
175 |    "outputs": [
176 |     {
177 |      "name": "stdout",
178 |      "output_type": "stream",
179 |      "text": [
180 |       "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
181 |       "\n",
182 |       "-->Sentence 0: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning).\n",
183 |       "-->Sentence 1: A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.\n",
184 |       "-->Sentence 2: A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
185 |       "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
186 |       "\n",
187 |       "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_!@# !\n",
188 |       "-->Sentence 1: @#$%^&*()_+ 0123456\n",
189 |       "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
190 |       "\n",
191 |       "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n"
192 |      ]
193 |     }
194 |    ],
195 |    "source": [
196 |     "for article in [article, article2, article3]:\n",
197 |     "    print('Original Article: %s' % (article))\n",
198 |     "    print()\n",
199 |     "    doc = spacy_nlp(article)\n",
200 |     "    for i, token in enumerate(doc.sents):\n",
201 |     "        print('-->Sentence %d: %s' % (i, token.text))"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "markdown",
206 |    "metadata": {},
207 |    "source": [
208 |     "Can see that spacy handled \"a.m.\" somehow."
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "markdown",
213 |    "metadata": {},
214 |    "source": [
215 |     "### NLTK"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 6,
221 |    "metadata": {},
222 |    "outputs": [
223 |     {
224 |      "name": "stdout",
225 |      "output_type": "stream",
226 |      "text": [
227 |       "NTLK Version: 3.2.5\n"
228 |      ]
229 |     }
230 |    ],
231 |    "source": [
232 |     "import nltk\n",
233 |     "from nltk.tokenize import sent_tokenize\n",
234 |     "print('NTLK Version: %s' % nltk.__version__)"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 8,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "# nltk.download('punkt')"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": 9,
249 |    "metadata": {},
250 |    "outputs": [
251 |     {
252 |      "name": "stdout",
253 |      "output_type": "stream",
254 |      "text": [
255 |       "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
256 |       "\n",
257 |       "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
258 |       "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
259 |       "\n",
260 |       "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_!\n",
261 |       "-->Sentence 1: @# !\n",
262 |       "-->Sentence 2: @#$%^&*()_+ 0123456\n",
263 |       "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
264 |       "\n",
265 |       "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n"
266 |      ]
267 |     }
268 |    ],
269 |    "source": [
270 |     "for article in [article, article2, article3]:\n",
271 |     "    print('Original Article: %s' % (article))\n",
272 |     "    print()\n",
273 |     "\n",
274 |     "    doc = sent_tokenize(article)\n",
275 |     "    for i, token in enumerate(doc):\n",
276 |     "        print('-->Sentence %d: %s' % (i, token))"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {},
282 |    "source": [
283 |     "# Conclusion\n",
284 |     "So far both NLTK and spacy provides similar behavior so it depends on which library do you use in performing other preprocessing. \n",
285 |     "Recently, I works on text mining related project which is classifying news category. Of course, I can build a ML model to classify it but I go for a simple approach. Only focus on the first sentence for every news and performing simple key word searching to build a baseline model. The result is not bad but it is a very quick way to deliver an initial version."
286 |    ]
287 |   }
288 |  ],
289 |  "metadata": {
290 |   "kernelspec": {
291 |    "display_name": "Python 3",
292 |    "language": "python",
293 |    "name": "python3"
294 |   },
295 |   "language_info": {
296 |    "codemirror_mode": {
297 |     "name": "ipython",
298 |     "version": 3
299 |    },
300 |    "file_extension": ".py",
301 |    "mimetype": "text/x-python",
302 |    "name": "python",
303 |    "nbconvert_exporter": "python",
304 |    "pygments_lexer": "ipython3",
305 |    "version": "3.5.2"
306 |   }
307 |  },
308 |  "nbformat": 4,
309 |  "nbformat_minor": 2
310 | }
311 | 


--------------------------------------------------------------------------------
/sample/nlp-stemming.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "![Stemming](https://i1.wp.com/s3-eu-west-1.amazonaws.com/leadersandco/wp-content/uploads/2017/05/31224050/Diary-writing-is-an-old-human-art.jpg?fit=800%2C600&ssl=1)\n",
  8 |     "\n",
  9 |     "Source: https://www.thisdaylive.com/index.php/2017/05/31/death-of-the-diary/"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Stemming\n",
 17 |     "\n",
 18 |     "After tokenized word, we may want a root form rather than the original input form for post processing or modelling such as topic classification. The root word does not necessarily a word itself. For example, \"reduc\" is a root word of \"reduce\", \"suffici\" is a root word of \"sufficient\".\n",
 19 |     "\n",
 20 |     "There are lots of stemming algorithm in NLTK. Porter Stemmer and Snowball Stemmer (aka Porter2) will be selected for demonstration because they are the most popular."
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Copy from https://en.wikipedia.org/wiki/Stemming\n",
 32 |     "\n",
 33 |     "article = 'In linguistic morphology and information retrieval, stemming is the process of \\\n",
 34 |     "reducing inflected (or sometimes derived) words to their word stem, base or root \\\n",
 35 |     "form—generally a written word form. The stem need not be identical to the morphological \\\n",
 36 |     "root of the word; it is usually sufficient that related words map to the same stem, even \\\n",
 37 |     "if this stem is not in itself a valid root.'"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "### Porter Stemmer"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 2,
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "name": "stdout",
 54 |      "output_type": "stream",
 55 |      "text": [
 56 |       "NLTK Version: 3.2.5\n"
 57 |      ]
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "import nltk \n",
 62 |     "print('NLTK Version: %s' % (nltk.__version__))\n",
 63 |     "\n",
 64 |     "porter_stemmer = nltk.stem.PorterStemmer()"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 3,
 70 |    "metadata": {},
 71 |    "outputs": [
 72 |     {
 73 |      "name": "stdout",
 74 |      "output_type": "stream",
 75 |      "text": [
 76 |       "Original Article: In linguistic morphology and information retrieval, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root.\n",
 77 |       "\n",
 78 |       "Original : linguistic, New: linguist\n",
 79 |       "Original : morphology, New: morpholog\n",
 80 |       "Original : information, New: inform\n",
 81 |       "Original : retrieval, New: retriev\n",
 82 |       "Original : stemming, New: stem\n",
 83 |       "Original : reducing, New: reduc\n",
 84 |       "Original : inflected, New: inflect\n",
 85 |       "Original : sometimes, New: sometim\n",
 86 |       "Original : derived, New: deriv\n",
 87 |       "Original : words, New: word\n",
 88 |       "Original : form—generally, New: form—gener\n",
 89 |       "Original : The, New: the\n",
 90 |       "Original : identical, New: ident\n",
 91 |       "Original : morphological, New: morpholog\n",
 92 |       "Original : usually, New: usual\n",
 93 |       "Original : sufficient, New: suffici\n",
 94 |       "Original : related, New: relat\n",
 95 |       "Original : words, New: word\n",
 96 |       "Original : this, New: thi\n"
 97 |      ]
 98 |     }
 99 |    ],
100 |    "source": [
101 |     "tokens = nltk.word_tokenize(article)\n",
102 |     "\n",
103 |     "print('Original Article: %s' % (article))\n",
104 |     "print()\n",
105 |     "\n",
106 |     "for token in tokens:\n",
107 |     "    stemmed_token = porter_stemmer.stem(token)\n",
108 |     "    \n",
109 |     "    if token != stemmed_token:\n",
110 |     "        print('Original : %s, New: %s' % (token, stemmed_token))"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "### Snowball Stemmer"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 5,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "NLTK Version: 3.2.5\n"
130 |      ]
131 |     }
132 |    ],
133 |    "source": [
134 |     "import nltk \n",
135 |     "print('NLTK Version: %s' % (nltk.__version__))\n",
136 |     "\n",
137 |     "snowball_stemmer = nltk.stem.SnowballStemmer('english')"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 6,
143 |    "metadata": {},
144 |    "outputs": [
145 |     {
146 |      "name": "stdout",
147 |      "output_type": "stream",
148 |      "text": [
149 |       "Original Article: In linguistic morphology and information retrieval, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root.\n",
150 |       "\n",
151 |       "Original : In, New: in\n",
152 |       "Original : linguistic, New: linguist\n",
153 |       "Original : morphology, New: morpholog\n",
154 |       "Original : information, New: inform\n",
155 |       "Original : retrieval, New: retriev\n",
156 |       "Original : stemming, New: stem\n",
157 |       "Original : reducing, New: reduc\n",
158 |       "Original : inflected, New: inflect\n",
159 |       "Original : sometimes, New: sometim\n",
160 |       "Original : derived, New: deriv\n",
161 |       "Original : words, New: word\n",
162 |       "Original : form—generally, New: form—gener\n",
163 |       "Original : The, New: the\n",
164 |       "Original : identical, New: ident\n",
165 |       "Original : morphological, New: morpholog\n",
166 |       "Original : usually, New: usual\n",
167 |       "Original : sufficient, New: suffici\n",
168 |       "Original : related, New: relat\n",
169 |       "Original : words, New: word\n"
170 |      ]
171 |     }
172 |    ],
173 |    "source": [
174 |     "tokens = nltk.word_tokenize(article)\n",
175 |     "\n",
176 |     "print('Original Article: %s' % (article))\n",
177 |     "print()\n",
178 |     "\n",
179 |     "for token in tokens:\n",
180 |     "    stemmed_token = snowball_stemmer.stem(token)\n",
181 |     "    \n",
182 |     "    if token != stemmed_token:\n",
183 |     "        print('Original : %s, New: %s' % (token, stemmed_token))"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "markdown",
188 |    "metadata": {},
189 |    "source": [
190 |     "Except \"In\", the result of Snowball Stemmer are same as Porter Stemmer."
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "# Conclusion\n",
198 |     "\n",
199 |     "\n",
200 |     "Snowball Stemmer not only support English, but also Germanic and other languages as well. For detail, you may check on the Snowball website. \n",
201 |     "\n",
202 |     "Snowball Stemmer: http://snowballstem.org/algorithms/\n",
203 |     "\n",
204 |     "Besides Porter Stemmer and Snowball Stemmer, reader may also have on look on other stemmer algorithm such as Hunspell\n",
205 |     "\n",
206 |     "Hunspell Stemmer: https://github.com/hunspell/hunspell"
207 |    ]
208 |   }
209 |  ],
210 |  "metadata": {
211 |   "kernelspec": {
212 |    "display_name": "Python 3",
213 |    "language": "python",
214 |    "name": "python3"
215 |   },
216 |   "language_info": {
217 |    "codemirror_mode": {
218 |     "name": "ipython",
219 |     "version": 3
220 |    },
221 |    "file_extension": ".py",
222 |    "mimetype": "text/x-python",
223 |    "name": "python",
224 |    "nbconvert_exporter": "python",
225 |    "pygments_lexer": "ipython3",
226 |    "version": "3.5.2"
227 |   }
228 |  },
229 |  "nbformat": 4,
230 |  "nbformat_minor": 2
231 | }
232 | 


--------------------------------------------------------------------------------
/sample/nlp-stop_words.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "![title](https://www.channelone.com/wp-content/uploads/2015/03/bigstock-Pile-Of-Words-1896131-crop.jpg)\n",
  8 |     "\n",
  9 |     "Source: https://www.channelone.com/blog_post/web-tools-for-studying-vocabulary-words/"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Stop Words\n",
 17 |     "\n",
 18 |     "When we deal with text problem in Natural Language Processing, stop words removal process is a one of the important step to have a better input for any models. Stop words means that it is a very common words in a language (e.g. a, an, the in English. 的, 了 in Chinese. え, も in Japanese). It does not help on most of NLP problem such as semantic analysis, classification etc.\n",
 19 |     "\n",
 20 |     "In this article, we will look into using multi libraries pre-defined stop words, third party pre-defined stop words as well as domain specific stop words. Definition of stop words (capture from wiki) will be used to demonstrate the result after removing stop words.\n",
 21 |     "\n",
 22 |     "Word tokenization and lemmatization arethe essential part for removing stop words. You may refer to this article to understand word tokenization and lemmatization.\n"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 1,
 28 |    "metadata": {
 29 |     "collapsed": true
 30 |    },
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "# Capture from https://en.wikipedia.org/wiki/Stop_words\n",
 34 |     "\n",
 35 |     "article = 'In computing, stop words are words which are filtered out before or \\\n",
 36 |     "after processing of natural language data (text).[1] Though \"stop words\" usually \\\n",
 37 |     "refers to the most common words in a language, there is no single universal list of \\\n",
 38 |     "stop words used by all natural language processing tools, and indeed not all tools \\\n",
 39 |     "even use such a list. Some tools specifically avoid removing these stop words to \\\n",
 40 |     "support phrase search.'"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 2,
 46 |    "metadata": {
 47 |     "collapsed": true
 48 |    },
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "# Catpure from https://zh.wikipedia.org/wiki/%E5%81%9C%E7%94%A8%E8%AF%8D\n",
 52 |     "\n",
 53 |     "article2 = '在信息檢索中，為節省存儲空間和提高搜索效率，在處理自然語言數據（或文本）之前或之後會自動過濾掉某些字或詞，\\\n",
 54 |     "這些字或詞即被稱為Stop Words(停用詞)。不要把停用詞與安全口令混淆。 這些停用詞都是人工輸入、非自動化生成的，\\\n",
 55 |     "生成後的停用詞會形成一個停用詞表。但是，並沒有一個明確的停用詞表能夠適用於所有的工具。\\\n",
 56 |     "甚至有一些工具是明確地避免使用停用詞來支持短語搜索的。'"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "### spaCy"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 3,
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "name": "stdout",
 73 |      "output_type": "stream",
 74 |      "text": [
 75 |       "spaCy Version: 2.0.11\n"
 76 |      ]
 77 |     }
 78 |    ],
 79 |    "source": [
 80 |     "import spacy\n",
 81 |     "print('spaCy Version: %s' % (spacy.__version__))\n",
 82 |     "spacy_nlp = spacy.load('en_core_web_sm')"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "Check pre-defined English stop words"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 4,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "name": "stdout",
 99 |      "output_type": "stream",
100 |      "text": [
101 |       "Number of stop words: 305\n",
102 |       "First ten stop words: ['from', 'i', 'cannot', 'seeming', 'seemed', 'him', 'them', 'hundred', 'whoever', 'few']\n"
103 |      ]
104 |     }
105 |    ],
106 |    "source": [
107 |     "spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS\n",
108 |     "\n",
109 |     "print('Number of stop words: %d' % len(spacy_stopwords))\n",
110 |     "print('First ten stop words: %s' % list(spacy_stopwords)[:10])"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "Remove stop words"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 5,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.\n",
130 |       "\n",
131 |       "['In', 'computing', ',', 'stop', 'words', 'words', 'filtered', 'processing', 'natural', 'language', 'data', '(', 'text).[1', ']', 'Though', '\"', 'stop', 'words', '\"', 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'natural', 'language', 'processing', 'tools', ',', 'tools', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n"
132 |      ]
133 |     }
134 |    ],
135 |    "source": [
136 |     "doc = spacy_nlp(article)\n",
137 |     "tokens = [token.text for token in doc if not token.is_stop]\n",
138 |     "\n",
139 |     "print('Original Article: %s' % (article))\n",
140 |     "print()\n",
141 |     "print(tokens)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Add customize stop words"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 6,
154 |    "metadata": {},
155 |    "outputs": [
156 |     {
157 |      "name": "stdout",
158 |      "output_type": "stream",
159 |      "text": [
160 |       "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.\n",
161 |       "\n",
162 |       "['In', ',', 'stop', 'words', 'words', 'processing', 'natural', 'language', 'data', '(', 'text).[1', ']', 'Though', '\"', 'stop', 'words', '\"', 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'natural', 'language', 'processing', 'tools', ',', 'tools', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n"
163 |      ]
164 |     }
165 |    ],
166 |    "source": [
167 |     "customize_stop_words = [\n",
168 |     "    'computing', 'filtered'\n",
169 |     "]\n",
170 |     "\n",
171 |     "for w in customize_stop_words:\n",
172 |     "    spacy_nlp.vocab[w].is_stop = True\n",
173 |     "\n",
174 |     "\n",
175 |     "doc = spacy_nlp(article)\n",
176 |     "tokens = [token.text for token in doc if not token.is_stop]\n",
177 |     "\n",
178 |     "print('Original Article: %s' % (article))\n",
179 |     "print()\n",
180 |     "print(tokens)"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "### NLTK"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 7,
193 |    "metadata": {},
194 |    "outputs": [
195 |     {
196 |      "name": "stdout",
197 |      "output_type": "stream",
198 |      "text": [
199 |       "NLTK Version: 3.2.5\n"
200 |      ]
201 |     }
202 |    ],
203 |    "source": [
204 |     "import nltk \n",
205 |     "print('NLTK Version: %s' % (nltk.__version__))\n",
206 |     "\n",
207 |     "nltk.download('stopwords')"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 8,
213 |    "metadata": {},
214 |    "outputs": [
215 |     {
216 |      "name": "stdout",
217 |      "output_type": "stream",
218 |      "text": [
219 |       "Number of stop words: 179\n",
220 |       "First ten stop words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\"]\n"
221 |      ]
222 |     }
223 |    ],
224 |    "source": [
225 |     "nltk_stopwords = nltk.corpus.stopwords.words('english')\n",
226 |     "\n",
227 |     "print('Number of stop words: %d' % len(nltk_stopwords))\n",
228 |     "print('First ten stop words: %s' % list(nltk_stopwords)[:10])"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "General words such as \"are\", \"the\" are removed as well. For example, \"indeed\" is removed in NLTK but not spaCy. On the other hand, \"used\" are removed in spaCy but not NLTK"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 9,
241 |    "metadata": {},
242 |    "outputs": [
243 |     {
244 |      "name": "stdout",
245 |      "output_type": "stream",
246 |      "text": [
247 |       "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.\n",
248 |       "\n",
249 |       "['In', 'computing', ',', 'stop', 'words', 'words', 'filtered', 'processing', 'natural', 'language', 'data', '(', 'text', ')', '.', '[', '1', ']', 'Though', '``', 'stop', 'words', \"''\", 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'used', 'natural', 'language', 'processing', 'tools', ',', 'indeed', 'tools', 'even', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n"
250 |      ]
251 |     }
252 |    ],
253 |    "source": [
254 |     "tokens = nltk.tokenize.word_tokenize(article)\n",
255 |     "tokens = [token for token in tokens if not token in nltk_stopwords]\n",
256 |     "\n",
257 |     "print('Original Article: %s' % (article))\n",
258 |     "print()\n",
259 |     "print(tokens)"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {},
265 |    "source": [
266 |     "### jieba\n",
267 |     "For Chinese word, we use the similar ideas to filter out words if it is stop words."
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 10,
273 |    "metadata": {},
274 |    "outputs": [
275 |     {
276 |      "name": "stdout",
277 |      "output_type": "stream",
278 |      "text": [
279 |       "jieba Version: 0.39\n"
280 |      ]
281 |     }
282 |    ],
283 |    "source": [
284 |     "import jieba\n",
285 |     "print('jieba Version: %s' % jieba.__version__)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 11,
291 |    "metadata": {
292 |     "collapsed": true
293 |    },
294 |    "outputs": [],
295 |    "source": [
296 |     "# Capture from https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt\n",
297 |     "\n",
298 |     "jieba_stop_words = [\n",
299 |     "    '的', '了', '和', '是', '就', '都', '而', '及', '與', \n",
300 |     "    '著', '或', '一個', '沒有', '我們', '你們', '妳們', \n",
301 |     "    '他們', '她們', '是否'\n",
302 |     "]"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "metadata": {},
308 |    "source": [
309 |     "Different from English, word will not be removed if stop words belongs to part of word. For example, \"是\" is defined as stop words but \"但是\" still exist as \"但是\" is a kind of \"single word\". Therefore, word tokenization is very important for stop word removal."
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": 12,
315 |    "metadata": {},
316 |    "outputs": [
317 |     {
318 |      "name": "stderr",
319 |      "output_type": "stream",
320 |      "text": [
321 |       "Building prefix dict from the default dictionary ...\n",
322 |       "Loading model from cache /tmp/jieba.cache\n"
323 |      ]
324 |     },
325 |     {
326 |      "name": "stdout",
327 |      "output_type": "stream",
328 |      "text": [
329 |       "Original Article: 在信息檢索中，為節省存儲空間和提高搜索效率，在處理自然語言數據（或文本）之前或之後會自動過濾掉某些字或詞，這些字或詞即被稱為Stop Words(停用詞)。不要把停用詞與安全口令混淆。 這些停用詞都是人工輸入、非自動化生成的，生成後的停用詞會形成一個停用詞表。但是，並沒有一個明確的停用詞表能夠適用於所有的工具。甚至有一些工具是明確地避免使用停用詞來支持短語搜索的。\n",
330 |       "\n"
331 |      ]
332 |     },
333 |     {
334 |      "name": "stderr",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "Loading model cost 1.118 seconds.\n",
338 |       "Prefix dict has been built succesfully.\n"
339 |      ]
340 |     },
341 |     {
342 |      "name": "stdout",
343 |      "output_type": "stream",
344 |      "text": [
345 |       "['在', '信息', '檢索', '中', '，', '為節', '省存', '儲空間', '提高', '搜索', '效率', '，', '在', '處理', '自然', '語言數', '據', '（', '文本', '）', '之前', '之後會', '自動', '過濾', '掉', '某些', '字', '詞', '，', '這些', '字', '詞', '即', '被', '稱', '為', 'Stop', ' ', 'Words', '(', '停用', '詞', ')', '。', '不要', '把', '停用', '詞', '安全', '口令', '混淆', '。', ' ', '這些', '停用', '詞', '人工', '輸入', '、', '非自動', '化生成', '，', '生成', '後', '停用', '詞會', '形成', '停用', '詞表', '。', '但是', '，', '並沒有', '明確', '停用', '詞表能夠', '適用', '於', '所有', '工具', '。', '甚至', '有', '一些', '工具', '明確', '地', '避免', '使用', '停用', '詞來', '支持', '短語', '搜索', '。']\n"
346 |      ]
347 |     }
348 |    ],
349 |    "source": [
350 |     "print('Original Article: %s' % (article2))\n",
351 |     "print()\n",
352 |     "words = jieba.cut(article2, cut_all=False)\n",
353 |     "words = [str(word) for word in words if not str(word) in jieba_stop_words]\n",
354 |     "print(words)"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "markdown",
359 |    "metadata": {
360 |     "collapsed": true
361 |    },
362 |    "source": [
363 |     "# Conclusion"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "markdown",
368 |    "metadata": {},
369 |    "source": [
370 |     "The procedure of removing stop words is similar across libraries so the most importance is defining your own stop words. In initial phase, pre-defined stop words can be adopted but more and more words should be added into stop word list later on. \n",
371 |     "\n",
372 |     "So besides, using spaCy or NLTK pre-defined stop words, we can use other words which are defined by other party such as Stanford NLP and Rank NL. You may check out the stop list from \n",
373 |     "\n",
374 |     "Stanford NLP: https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt\n",
375 |     "\n",
376 |     "Rank NL: https://www.ranks.nl/stopwords\n",
377 |     "\n",
378 |     "jieba: https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt"
379 |    ]
380 |   }
381 |  ],
382 |  "metadata": {
383 |   "kernelspec": {
384 |    "display_name": "Python 3",
385 |    "language": "python",
386 |    "name": "python3"
387 |   },
388 |   "language_info": {
389 |    "codemirror_mode": {
390 |     "name": "ipython",
391 |     "version": 3
392 |    },
393 |    "file_extension": ".py",
394 |    "mimetype": "text/x-python",
395 |    "name": "python",
396 |    "nbconvert_exporter": "python",
397 |    "pygments_lexer": "ipython3",
398 |    "version": "3.5.2"
399 |   }
400 |  },
401 |  "nbformat": 4,
402 |  "nbformat_minor": 2
403 | }
404 | 


--------------------------------------------------------------------------------
/sample/nlp-word_mover_distance.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Word Mover's Distance\n",
  8 |     "\n",
  9 |     "![](https://cdn.pixabay.com/photo/2017/10/23/23/41/hong-kong-2883036_960_720.jpg)\n",
 10 |     "Photo: https://pixabay.com/en/hong-kong-harbor-boats-water-night-2883036/\n",
 11 |     "\n",
 12 |     "Word Mover's Distance (WMD) is proposed fro distance measurement between 2 documents (or sentences). It leverages Word Embeddings power to overcome those basic distance measurement limitations. \n",
 13 |     "\n",
 14 |     "WMD[1] was introduced by Kusner et al. in 2015. Instead of using Euclidean Distance and other bag-of-words based distance measurement, they proposed to use word embeddings to calculate the similarities. To be precise, it uses normalized [Bag-of-Words](https://towardsdatascience.com/3-basic-approaches-in-bag-of-words-which-are-better-than-word-embeddings-c2cbc7398016) and [Word Embeddings](https://medium.com/towards-data-science/3-silver-bullets-of-word-embedding-in-nlp-10fa8f50cc5a) to calculate the distance between documents.\n",
 15 |     "\n",
 16 |     "After reading this article, you will understand:\n",
 17 |     "- Earth Mover Distance (EMD)\n",
 18 |     "- Word Mover's Distance (WMD)\n",
 19 |     "- Relaxed Word Moving Distance (RWMD)\n",
 20 |     "- WMD Implementation\n",
 21 |     "- Take Away"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "# Earth Mover Distance (EMD)\n",
 29 |     "Before introducing WMD, I have to share the idea of Earth Mover Distance (EMD) first because the core part of WMD is EMD.\n",
 30 |     "\n",
 31 |     "EMD [2] solves transportation problem. For instance, we have m and n while m and n denote a set of suppliers and warehouses. The target is going to minimize transportation cost such that shipping all goods from m to n. Given that there are constraints:\n",
 32 |     "\n",
 33 |     "\n",
 34 |     "\n",
 35 |     "\n",
 36 |     "\n",
 37 |     "- Only allowing transport from m to n. Not allowing transport from n to m\n",
 38 |     "- Total number of sending cargoes cannot exceed total capacity of m\n",
 39 |     "- Total number of receiving cargoes cannot exceed total capacity of n\n",
 40 |     "- Maximum number of transportation is the minimum between total cargoes in m and total cargoes in n\n",
 41 |     "\n",
 42 |     "The denotations are:\n",
 43 |     "- p: Set of origin\n",
 44 |     "- q: Set of destination\n",
 45 |     "- f(i,j): flow from i to j\n",
 46 |     "- m: Number of origin\n",
 47 |     "- n: Number of destination\n",
 48 |     "- w(i, j): Number of cargo transport from i to j\n",
 49 |     "\n",
 50 |     "To optimal flow F, the linear formula is\n",
 51 |     "\n",
 52 |     "\n",
 53 |     "\n",
 54 |     "\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "\n",
 58 |     "\n",
 59 |     "# Word Mover's Distance (WMD)\n",
 60 |     "In the previous blog, I shared how we can use simple way to find the \"similarity\" between two documents (or sentences). At that time, Euclidean Distance, Cosine Distance and Jaccard Similarity are introduced but it has some limitations.  WMD is designed to __overcome synonym problem__.\n",
 61 |     "\n",
 62 |     "The typical example is \n",
 63 |     "- Sentence 1: Obama speaks to the media in Illinois\n",
 64 |     "- Sentence 2: The president greets the press in Chicago\n",
 65 |     "\n",
 66 |     "Except the stop words, there is no common words among two sentences but both of them are taking about same topic (at that time).\n",
 67 |     "\n",
 68 |     "\n",
 69 |     "\n",
 70 |     "\n",
 71 |     "\n",
 72 |     "\n",
 73 |     "\n",
 74 |     "\n",
 75 |     "\n",
 76 |     "WMD use word embeddings to calculate the distance so that it can calculate even though there is no common word. The assumption is that similar words should have similar vectors.\n",
 77 |     "\n",
 78 |     "First of all, lower case and removing stopwords is an essential step to reduce complexity and preventing misleading. \n",
 79 |     "- Sentence 1: obama speaks media illinois\n",
 80 |     "- Sentence 2: president greets press chicago\n",
 81 |     "\n",
 82 |     "Retrieve vectors from any pre-trained word embeddings models. It can be GloVe, word2vec, fasttext or custom vectors. After that it using normalized bag-of-words (nBOW) to represent the weight or importance. It assumes that higher frequency implies that it is more important.\n",
 83 |     "\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "\n",
 88 |     "\n",
 89 |     "It allows transfer every word from sentence 1 to sentence 2 because algorithm does not know \"obama\" should transfer to \"president\". At the end it will choose the minimum transportation cost to transport every word from sentence 1 to sentence 2.\n",
 90 |     "\n",
 91 |     "# Relaxed Word Moving Distance (RWMD)\n",
 92 |     "The best average time of solving WMD is about O(p³ log p) while p is number of unique word. It is a little bit slow so there are two approaches to improve the reduce computation time. First one is __Word Centroid Distance (WCD)__ which is summarizing the lower bound distance between. Second approach is __Relaxed Word Moving Distance (RWMD)__ which is using the closet distance without  considering there are multiple words transforming to single words.\n",
 93 |     "\n",
 94 |     "\n",
 95 |     "\n",
 96 |     "\n",
 97 |     "Taking the previous sentence as an example. Assuming that shortest word in sentence of all word in sentence 1 is \"president\", it will use summarize these score instead of pairing one by one. So that the time complexity reduce to O(p²).\n",
 98 |     "\n",
 99 |     "\n",
100 |     "\n",
101 |     "\n",
102 |     "\n"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "# WMD Implementation\n",
110 |     "By using gensim, we only need to provide two list of tokens then it will take the rest of calculation"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 60,
116 |    "metadata": {
117 |     "collapsed": true
118 |    },
119 |    "outputs": [],
120 |    "source": [
121 |     "\"\"\"\n",
122 |     "    News headline get from \n",
123 |     "    \n",
124 |     "    https://www.reuters.com/article/us-musk-tunnel/elon-musks-boring-co-to-build-high-speed-airport-link-in-chicago-idUSKBN1JA224\n",
125 |     "    http://money.cnn.com/2018/06/14/technology/elon-musk-boring-company-chicago/index.html\n",
126 |     "    https://www.theverge.com/2018/6/13/17462496/elon-musk-boring-company-approved-tunnel-chicago\n",
127 |     "\n",
128 |     "\"\"\"\n",
129 |     "\n",
130 |     "news_headline1 = \"Elon Musk's Boring Co to build high-speed airport link in Chicago\"\n",
131 |     "news_headline2 = \"Elon Musk's Boring Company to build high-speed Chicago airport link\"\n",
132 |     "news_headline3 = \"Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\"\n",
133 |     "news_headline4 = \"Both apple and orange are fruit\"\n",
134 |     "\n",
135 |     "news_headlines = [news_headline1, news_headline2, news_headline3, news_headline4]"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 65,
141 |    "metadata": {},
142 |    "outputs": [
143 |     {
144 |      "name": "stdout",
145 |      "output_type": "stream",
146 |      "text": [
147 |       "gensim version: 3.4.0\n"
148 |      ]
149 |     }
150 |    ],
151 |    "source": [
152 |     "# Load Word Embedding Model\n",
153 |     "import gensim\n",
154 |     "print('gensim version: %s' % gensim.__version__)\n",
155 |     "glove_model = gensim.models.KeyedVectors.load_word2vec_format('../model/text/stanford/glove/glove.6B.50d.vec')"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 66,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "name": "stdout",
165 |      "output_type": "stream",
166 |      "text": [
167 |       "[['elon', 'musk', \"'s\", 'boring', 'co', 'build', 'high', '-', 'speed', 'airport', 'link', 'chicago'], ['elon', 'musk', \"'s\", 'boring', 'company', 'build', 'high', '-', 'speed', 'chicago', 'airport', 'link'], ['elon', 'musk', '’s', 'boring', 'company', 'approved', 'build', 'high', '-', 'speed', 'transit', 'downtown', 'chicago', 'o’hare', 'airport'], ['both', 'apple', 'orange', 'fruit']]\n"
168 |      ]
169 |     }
170 |    ],
171 |    "source": [
172 |     "# Remove stopwords\n",
173 |     "import spacy\n",
174 |     "spacy_nlp = spacy.load('en')\n",
175 |     "\n",
176 |     "headline_tokens = []\n",
177 |     "for news_headline in news_headlines:\n",
178 |     "    headline_tokens.append([token.text.lower() for token in spacy_nlp(news_headline) if not token.is_stop])\n",
179 |     "\n",
180 |     "print(headline_tokens)"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 67,
186 |    "metadata": {},
187 |    "outputs": [
188 |     {
189 |      "name": "stdout",
190 |      "output_type": "stream",
191 |      "text": [
192 |       "Headline:  Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
193 |       "==================================================\n",
194 |       "\n",
195 |       "--------------------------------------------------\n",
196 |       "Comparing to: Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
197 |       "distance = 0.0000\n",
198 |       "--------------------------------------------------\n",
199 |       "Comparing to: Elon Musk's Boring Company to build high-speed Chicago airport link\n",
200 |       "distance = 0.3589\n",
201 |       "--------------------------------------------------\n",
202 |       "Comparing to: Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\n",
203 |       "distance = 1.9456\n",
204 |       "--------------------------------------------------\n",
205 |       "Comparing to: Both apple and orange are fruit\n",
206 |       "distance = 5.4350\n"
207 |      ]
208 |     }
209 |    ],
210 |    "source": [
211 |     "subject_headline = news_headlines[0]\n",
212 |     "subject_token = headline_tokens[0]\n",
213 |     "\n",
214 |     "print('Headline: ', subject_headline)\n",
215 |     "print('=' * 50)\n",
216 |     "print()\n",
217 |     "\n",
218 |     "for token, headline in zip(headline_tokens, news_headlines):\n",
219 |     "    print('-' * 50)\n",
220 |     "    print('Comparing to:', headline)\n",
221 |     "    distance = glove_model.wmdistance(subject_token, token)\n",
222 |     "    print('distance = %.4f' % distance)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "In gensim implementation, OOV will be removed so that it will not throw an exception or using random vector."
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "markdown",
234 |    "metadata": {},
235 |    "source": [
236 |     "# Take Away\n",
237 |     "For source code, you may check out from my github repo.\n",
238 |     "- The advantage of WMD are __hyper-parameter free and overcoming synonym problem__.\n",
239 |     "- Same as those simple approaches, WMD __does not consider ordering__.\n",
240 |     "- The __time complexity is an issue__. The original version is O(p³ log p) while the enhanced version is still O(p²).\n",
241 |     "- __Pre-train vectors may not apply to all scenario__.\n",
242 |     "\n",
243 |     "# Reference\n",
244 |     "[1] Kusner Matt J., Sun Yu, Kolkin Nicholas I., Weinberger Kilian Q. From Word Embeedings To Document Distance. 2015. http://proceedings.mlr.press/v37/kusnerb15.pdf\n",
245 |     "[2] EMD Theory: https://en.wikipedia.org/wiki/Earth_mover%27s_distance"
246 |    ]
247 |   }
248 |  ],
249 |  "metadata": {
250 |   "kernelspec": {
251 |    "display_name": "Python 3",
252 |    "language": "python",
253 |    "name": "python3"
254 |   },
255 |   "language_info": {
256 |    "codemirror_mode": {
257 |     "name": "ipython",
258 |     "version": 3
259 |    },
260 |    "file_extension": ".py",
261 |    "mimetype": "text/x-python",
262 |    "name": "python",
263 |    "nbconvert_exporter": "python",
264 |    "pygments_lexer": "ipython3",
265 |    "version": "3.5.2"
266 |   }
267 |  },
268 |  "nbformat": 4,
269 |  "nbformat_minor": 2
270 | }
271 | 


--------------------------------------------------------------------------------
/sample/nlp-word_tokenization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "![Word Tokenization](http://youthvoices.net/sites/default/files/image/69585/sep/persuasive-landing-pages-words-have-power.jpg)\n",
  8 |     "\n",
  9 |     "Source: http://youthvoices.net/discussion/will-you-1-powerful-words"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Word Tokenization\n",
 17 |     "To tackle text related problem in Machine Learning area, tokenization is one of the common pre-processing. In this article, we will go through how we can handle work toeknization and sentence tokenization by using three libraries which are spaCy, NLTK and jieba (for Chinese word)."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 3,
 23 |    "metadata": {
 24 |     "collapsed": true
 25 |    },
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "# Capture from https://en.wikipedia.org/wiki/Lexical_analysis\n",
 29 |     "\n",
 30 |     "article = 'In computer science, lexical analysis, lexing or tokenization is the process of \\\n",
 31 |     "converting a sequence of characters (such as in a computer program or web page) into a \\\n",
 32 |     "sequence of tokens (strings with an assigned and thus identified meaning). A program that \\\n",
 33 |     "performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner \\\n",
 34 |     "is also a term for the first stage of a lexer. A lexer is generally combined with a parser, \\\n",
 35 |     "which together analyze the syntax of programming languages, web pages, and so forth.'"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 4,
 41 |    "metadata": {
 42 |     "collapsed": true
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "article2 = 'ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456'"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 5,
 52 |    "metadata": {
 53 |     "collapsed": true
 54 |    },
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "article3 = '你的姿態 你的青睞 我存在在你的存在 你以為愛 就是被愛'"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 6,
 63 |    "metadata": {
 64 |     "collapsed": true
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "# Capture from https://zh.wikipedia.org/wiki/%E8%AF%8D%E6%B3%95%E5%88%86%E6%9E%90\n",
 69 |     "\n",
 70 |     "article4 = '词法分析是计算机科学中将字符序列转换为标记序列的过程。进行词法分析的程序或者函数叫作词法分析器，也叫扫描器。词法分析器一般以函数的形式存在，供语法分析器调用。'"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "# spaCy"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 7,
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "name": "stdout",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       "spaCy Version: 2.0.11\n"
 90 |      ]
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "import spacy\n",
 95 |     "print('spaCy Version: %s' % spacy.__version__)"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 8,
101 |    "metadata": {
102 |     "collapsed": true
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "spacy_nlp = spacy.load('en_core_web_sm')"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 9,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "name": "stdout",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
119 |       "\n",
120 |       "['In', 'computer', 'science', ',', 'lexical', 'analysis', ',', 'lexing', 'or', 'tokenization', 'is', 'the', 'process', 'of', 'converting', 'a', 'sequence', 'of', 'characters', '(', 'such', 'as', 'in', 'a', 'computer', 'program', 'or', 'web', 'page', ')', 'into', 'a', 'sequence', 'of', 'tokens', '(', 'strings', 'with', 'an', 'assigned', 'and', 'thus', 'identified', 'meaning', ')', '.', 'A', 'program', 'that', 'performs', 'lexical', 'analysis', 'may', 'be', 'termed', 'a', 'lexer', ',', 'tokenizer,[1', ']', 'or', 'scanner', ',', 'though', 'scanner', 'is', 'also', 'a', 'term', 'for', 'the', 'first', 'stage', 'of', 'a', 'lexer', '.', 'A', 'lexer', 'is', 'generally', 'combined', 'with', 'a', 'parser', ',', 'which', 'together', 'analyze', 'the', 'syntax', 'of', 'programming', 'languages', ',', 'web', 'pages', ',', 'and', 'so', 'forth', '.']\n"
121 |      ]
122 |     }
123 |    ],
124 |    "source": [
125 |     "print('Original Article: %s' % (article))\n",
126 |     "print()\n",
127 |     "doc = spacy_nlp(article)\n",
128 |     "tokens = [token.text for token in doc]\n",
129 |     "print(tokens)"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "markdown",
134 |    "metadata": {},
135 |    "source": [
136 |     "Not all special character will be seperated."
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 8,
142 |    "metadata": {},
143 |    "outputs": [
144 |     {
145 |      "name": "stdout",
146 |      "output_type": "stream",
147 |      "text": [
148 |       "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
149 |       "\n",
150 |       "['ConcateStringAnd123', 'ConcateSepcialCharacter_!@', '#', '!', '@#$%^&*()_+', '0123456']\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "print('Original Article: %s' % (article2))\n",
156 |     "print()\n",
157 |     "doc = spacy_nlp(article2)\n",
158 |     "tokens = [token.text for token in doc]\n",
159 |     "print(tokens)"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "markdown",
164 |    "metadata": {},
165 |    "source": [
166 |     "First step of spaCy separates word by space and then applying some guidelines such as exception rule, prefix, suffix etc."
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "# NLTK"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 9,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "name": "stdout",
183 |      "output_type": "stream",
184 |      "text": [
185 |       "NTLK Version: 3.2.5\n"
186 |      ]
187 |     }
188 |    ],
189 |    "source": [
190 |     "import nltk\n",
191 |     "print('NTLK Version: %s' % nltk.__version__)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 10,
197 |    "metadata": {},
198 |    "outputs": [
199 |     {
200 |      "name": "stdout",
201 |      "output_type": "stream",
202 |      "text": [
203 |       "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
204 |       "\n",
205 |       "['In', 'computer', 'science', ',', 'lexical', 'analysis', ',', 'lexing', 'or', 'tokenization', 'is', 'the', 'process', 'of', 'converting', 'a', 'sequence', 'of', 'characters', '(', 'such', 'as', 'in', 'a', 'computer', 'program', 'or', 'web', 'page', ')', 'into', 'a', 'sequence', 'of', 'tokens', '(', 'strings', 'with', 'an', 'assigned', 'and', 'thus', 'identified', 'meaning', ')', '.', 'A', 'program', 'that', 'performs', 'lexical', 'analysis', 'may', 'be', 'termed', 'a', 'lexer', ',', 'tokenizer', ',', '[', '1', ']', 'or', 'scanner', ',', 'though', 'scanner', 'is', 'also', 'a', 'term', 'for', 'the', 'first', 'stage', 'of', 'a', 'lexer', '.', 'A', 'lexer', 'is', 'generally', 'combined', 'with', 'a', 'parser', ',', 'which', 'together', 'analyze', 'the', 'syntax', 'of', 'programming', 'languages', ',', 'web', 'pages', ',', 'and', 'so', 'forth', '.']\n"
206 |      ]
207 |     }
208 |    ],
209 |    "source": [
210 |     "print('Original Article: %s' % (article))\n",
211 |     "print()\n",
212 |     "print(nltk.word_tokenize(article))"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "Some special character (e.g. _) will not be seperated"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 11,
225 |    "metadata": {},
226 |    "outputs": [
227 |     {
228 |      "name": "stdout",
229 |      "output_type": "stream",
230 |      "text": [
231 |       "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
232 |       "\n",
233 |       "['ConcateStringAnd123', 'ConcateSepcialCharacter_', '!', '@', '#', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_+', '0123456']\n"
234 |      ]
235 |     }
236 |    ],
237 |    "source": [
238 |     "print('Original Article: %s' % (article2))\n",
239 |     "print()\n",
240 |     "print(nltk.word_tokenize(article2))"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "markdown",
245 |    "metadata": {},
246 |    "source": [
247 |     "The behavior is a little difference from spaCy. NLTK treats most of special character as a \"word\" except \"_\". Of course, number will be tokenized as well."
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "markdown",
252 |    "metadata": {},
253 |    "source": [
254 |     "# jieba"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 12,
260 |    "metadata": {},
261 |    "outputs": [
262 |     {
263 |      "name": "stdout",
264 |      "output_type": "stream",
265 |      "text": [
266 |       "jieba Version: 0.39\n"
267 |      ]
268 |     }
269 |    ],
270 |    "source": [
271 |     "import jieba\n",
272 |     "print('jieba Version: %s' % jieba.__version__)"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 13,
278 |    "metadata": {},
279 |    "outputs": [
280 |     {
281 |      "name": "stderr",
282 |      "output_type": "stream",
283 |      "text": [
284 |       "Building prefix dict from the default dictionary ...\n",
285 |       "Loading model from cache /tmp/jieba.cache\n"
286 |      ]
287 |     },
288 |     {
289 |      "name": "stdout",
290 |      "output_type": "stream",
291 |      "text": [
292 |       "Original Article: 你的姿態 你的青睞 我存在在你的存在 你以為愛 就是被愛\n",
293 |       "\n"
294 |      ]
295 |     },
296 |     {
297 |      "name": "stderr",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "Loading model cost 1.086 seconds.\n",
301 |       "Prefix dict has been built succesfully.\n"
302 |      ]
303 |     },
304 |     {
305 |      "name": "stdout",
306 |      "output_type": "stream",
307 |      "text": [
308 |       "['你', '的', '姿態', ' ', '你', '的', '青睞', ' ', '我', '存在', '在', '你', '的', '存在', ' ', '你', '以', '為', '愛', ' ', '就是', '被', '愛']\n"
309 |      ]
310 |     }
311 |    ],
312 |    "source": [
313 |     "print('Original Article: %s' % (article3))\n",
314 |     "print()\n",
315 |     "\n",
316 |     "words = jieba.cut(article3, cut_all=False)\n",
317 |     "words = [str(word) for word in words]\n",
318 |     "print(words)"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 14,
324 |    "metadata": {},
325 |    "outputs": [
326 |     {
327 |      "name": "stdout",
328 |      "output_type": "stream",
329 |      "text": [
330 |       "Original Article: 词法分析是计算机科学中将字符序列转换为标记序列的过程。进行词法分析的程序或者函数叫作词法分析器，也叫扫描器。词法分析器一般以函数的形式存在，供语法分析器调用。\n",
331 |       "\n",
332 |       "['词法', '分析', '是', '计算机科学', '中将', '字符', '序列', '转换', '为', '标记', '序列', '的', '过程', '。', '进行', '词法', '分析', '的', '程序', '或者', '函数', '叫作', '词法', '分析器', '，', '也', '叫', '扫描器', '。', '词法', '分析器', '一般', '以', '函数', '的', '形式', '存在', '，', '供', '语法分析', '器', '调用', '。']\n"
333 |      ]
334 |     }
335 |    ],
336 |    "source": [
337 |     "print('Original Article: %s' % (article4))\n",
338 |     "print()\n",
339 |     "\n",
340 |     "words = jieba.cut(article4, cut_all=False)\n",
341 |     "words = [str(word) for word in words]\n",
342 |     "print(words)"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "metadata": {
348 |     "collapsed": true
349 |    },
350 |    "source": [
351 |     "jieba does a great job on tokenizes Chinese word (both simplified chinese to traditional chinese)."
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "markdown",
356 |    "metadata": {},
357 |    "source": [
358 |     "# Conculsion"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "metadata": {},
364 |    "source": [
365 |     "spaCy seems like having a intelligence on tokenize and the performance is better than NLTK. If you need to tokenize, jieba is a good choice for you. Also, studied spaCy (version 2.x) Chinese language implementation. They wrapped jieba library. From lang/zh/__init__.py"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": null,
371 |    "metadata": {
372 |     "collapsed": true
373 |    },
374 |    "outputs": [],
375 |    "source": [
376 |     "# copy from spaCy/lang/zh/__init__.py\n",
377 |     "class Chinese(Language):\n",
378 |     "    lang = 'zh'\n",
379 |     "    Defaults = ChineseDefaults  # override defaults\n",
380 |     "\n",
381 |     "    def make_doc(self, text):\n",
382 |     "        try:\n",
383 |     "            import jieba\n",
384 |     "        except ImportError:\n",
385 |     "            raise ImportError(\"The Chinese tokenizer requires the Jieba library: \"\n",
386 |     "                              \"https://github.com/fxsjy/jieba\")\n",
387 |     "        words = list(jieba.cut(text, cut_all=False))\n",
388 |     "        words = [x for x in words if x]\n",
389 |     "        return Doc(self.vocab, words=words, spaces=[False]*len(words))"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "markdown",
394 |    "metadata": {},
395 |    "source": [
396 |     "On the other hand, Stanford NLP also released a word tokenize library for multiple language including English and Chinese. You may visit the official website if you are interested.\n",
397 |     "\n",
398 |     "URL: https://nlp.stanford.edu/software/tokenizer.html"
399 |    ]
400 |   }
401 |  ],
402 |  "metadata": {
403 |   "kernelspec": {
404 |    "display_name": "Python 3",
405 |    "language": "python",
406 |    "name": "python3"
407 |   },
408 |   "language_info": {
409 |    "codemirror_mode": {
410 |     "name": "ipython",
411 |     "version": 3
412 |    },
413 |    "file_extension": ".py",
414 |    "mimetype": "text/x-python",
415 |    "name": "python",
416 |    "nbconvert_exporter": "python",
417 |    "pygments_lexer": "ipython3",
418 |    "version": "3.5.2"
419 |   }
420 |  },
421 |  "nbformat": 4,
422 |  "nbformat_minor": 2
423 | }
424 | 


--------------------------------------------------------------------------------
/sample/nlp_lemmatization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "![Lemmatization](https://www.tell-a-tale.com/wp-content/uploads/2018/03/time-for-a-change-2015164_960_720_compressed-810x539.jpg)\n",
  8 |     "\n",
  9 |     "Source: https://www.tell-a-tale.com/unbox-idea-social-open-mic-tell-a-story-to-change-world/"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# Lemmatization\n",
 17 |     "\n",
 18 |     "In English words (Other language as well), same word may have different form such as \"affected\", \"affects\" and \"affect\". \n",
 19 |     "To have a smaller size vocabulary and better representation on NLP problem, we want to have a single word to represent \"\", \"\" in some scenarios. In this article, we will go through some libraries to work on lemmatization."
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 1,
 25 |    "metadata": {
 26 |     "collapsed": true
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "# Capture from https://en.wikipedia.org/wiki/Lemmatisation\n",
 31 |     "\n",
 32 |     "article = \"Lemmatisation (or lemmatization) in linguistics is the process of grouping together \\\n",
 33 |     "the inflected forms of a word so they can be analysed as a single item, identified by the word's \\\n",
 34 |     "lemma, or dictionary form.\""
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "### spaCy"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 2,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stdout",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "spaCy Version: 2.0.11\n"
 54 |      ]
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "import spacy\n",
 59 |     "print('spaCy Version: %s' % (spacy.__version__))\n",
 60 |     "spacy_nlp = spacy.load('en_core_web_sm')"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 3,
 66 |    "metadata": {},
 67 |    "outputs": [
 68 |     {
 69 |      "name": "stdout",
 70 |      "output_type": "stream",
 71 |      "text": [
 72 |       "Original Article: Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.\n",
 73 |       "\n",
 74 |       "Original : Lemmatisation, New: lemmatisation\n",
 75 |       "Original : linguistics, New: linguistic\n",
 76 |       "Original : is, New: be\n",
 77 |       "Original : grouping, New: group\n",
 78 |       "Original : inflected, New: inflect\n",
 79 |       "Original : forms, New: form\n",
 80 |       "Original : they, New: -PRON-\n",
 81 |       "Original : analysed, New: analyse\n",
 82 |       "Original : identified, New: identify\n"
 83 |      ]
 84 |     }
 85 |    ],
 86 |    "source": [
 87 |     "doc = spacy_nlp(article)\n",
 88 |     "tokens = [token.text for token in doc]\n",
 89 |     "\n",
 90 |     "print('Original Article: %s' % (article))\n",
 91 |     "print()\n",
 92 |     "\n",
 93 |     "for token in doc:\n",
 94 |     "    if token.text != token.lemma_:\n",
 95 |     "        print('Original : %s, New: %s' % (token.text, token.lemma_))"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "spaCy will convert word to lower case and changing past tense, gerund form (other tenses as well) to present tense. Also, \"they\" normalize to \"-PRON-\" which is pronoun."
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "### NLTK"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 4,
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "name": "stdout",
119 |      "output_type": "stream",
120 |      "text": [
121 |       "NLTK Version: 3.2.5\n"
122 |      ]
123 |     }
124 |    ],
125 |    "source": [
126 |     "import nltk \n",
127 |     "print('NLTK Version: %s' % (nltk.__version__))\n",
128 |     "\n",
129 |     "nltk.download('wordnet')\n",
130 |     "\n",
131 |     "wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 5,
137 |    "metadata": {},
138 |    "outputs": [
139 |     {
140 |      "name": "stdout",
141 |      "output_type": "stream",
142 |      "text": [
143 |       "Original Article: Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.\n",
144 |       "\n",
145 |       "Original : forms, New: form\n",
146 |       "Original : as, New: a\n"
147 |      ]
148 |     }
149 |    ],
150 |    "source": [
151 |     "tokens = nltk.word_tokenize(article)\n",
152 |     "\n",
153 |     "print('Original Article: %s' % (article))\n",
154 |     "print()\n",
155 |     "\n",
156 |     "for token in tokens:\n",
157 |     "    lemmatized_token = wordnet_lemmatizer.lemmatize(token)\n",
158 |     "    \n",
159 |     "    if token != lemmatized_token:\n",
160 |     "        print('Original : %s, New: %s' % (token, lemmatized_token))"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {},
166 |    "source": [
167 |     "The result is totally difference from spaCy. Only two words are lemmaizated and one of them \"as\" is strange. It seems that \"s\" will removed if it is the last character. Therefore, \"as\" is converted to \"a\""
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {},
173 |    "source": [
174 |     "# Conclusion"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "The result of spaCy is better and expected. Taking \"as\" an example, it seems that spaCy\" has a kind of \"intelligent\" that it will convert \"as\" as \"a\". Therefore, I further studying on source code, it seems like there are well defined word and rule to support lemmatization."
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 6,
187 |    "metadata": {
188 |     "collapsed": true
189 |    },
190 |    "outputs": [],
191 |    "source": [
192 |     "# Copy from spacy/lang/en/lemmatizer/_lemma_rules.py\n",
193 |     "ADJECTIVE_RULES = [\n",
194 |     "    [\"er\", \"\"],\n",
195 |     "    [\"est\", \"\"],\n",
196 |     "    [\"er\", \"e\"],\n",
197 |     "    [\"est\", \"e\"]\n",
198 |     "]\n",
199 |     "# Copy from spacy/lang/en/lemmatizer/_nouns_irreg.py\n",
200 |     "NOUNS_IRREG = {\n",
201 |     "    \"aardwolves\": (\"aardwolf\",),\n",
202 |     "    \"abaci\": (\"abacus\",),\n",
203 |     "    \"aboideaux\": (\"aboideau\",),\n",
204 |     "    \"aboiteaux\": (\"aboiteau\",),\n",
205 |     "    \"abscissae\": (\"abscissa\",),\n",
206 |     "    \"acanthi\": (\"acanthus\",),\n",
207 |     "    \"acari\": (\"acarus\",),\n",
208 |     "#     ...\n",
209 |     "}"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 7,
215 |    "metadata": {
216 |     "collapsed": true
217 |    },
218 |    "outputs": [],
219 |    "source": [
220 |     "# Copy from spacy/lang/fr/lemmatizer.py\n",
221 |     "LOOKUP = {\n",
222 |     "    \"Ap.\": \"après\",\n",
223 |     "    \"Apr.\": \"après\",\n",
224 |     "    \"Auxerroises\": \"Auxerrois\",\n",
225 |     "    \"Av.\": \"avenue\",\n",
226 |     "    \"Ave.\": \"avenue\",\n",
227 |     "    \"Avr.\": \"avril\",\n",
228 |     "    \"Bd.\": \"boulevard\",\n",
229 |     "    \"Boliviennes\": \"Bolivien\",\n",
230 |     "    \"Canadiennes\": \"Canadien\",\n",
231 |     "    \"Cannoises\": \"Cannois\",\n",
232 |     "#     ...\n",
233 |     "}"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "markdown",
238 |    "metadata": {},
239 |    "source": [
240 |     "TL;DR\n",
241 |     "\n",
242 |     "How does spaCy work on lemmatizion in Enlgish. From source code, it will go through POS (Part of Speech) first. Lemmatization will be performed if the word is noun, verb, adjective or adverb. Later on, it will check whether existing in irregular list. Lemmatized word will be returned if existing in irregular list. Otherwise, it will go the pre-defined suffix rule."
243 |    ]
244 |   }
245 |  ],
246 |  "metadata": {
247 |   "kernelspec": {
248 |    "display_name": "Python 3",
249 |    "language": "python",
250 |    "name": "python3"
251 |   },
252 |   "language_info": {
253 |    "codemirror_mode": {
254 |     "name": "ipython",
255 |     "version": 3
256 |    },
257 |    "file_extension": ".py",
258 |    "mimetype": "text/x-python",
259 |    "name": "python",
260 |    "nbconvert_exporter": "python",
261 |    "pygments_lexer": "ipython3",
262 |    "version": "3.5.2"
263 |   }
264 |  },
265 |  "nbformat": 4,
266 |  "nbformat_minor": 2
267 | }
268 | 


--------------------------------------------------------------------------------
/sample/preprocessing/nlp-preprocessing-string_matching-fuzzywuzzy.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### Data Preparation"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "%reload_ext  autoreload\n",
 17 |     "%autoreload 2\n",
 18 |     "\n",
 19 |     "import sys, os\n",
 20 |     "def add_aion(curr_path=None):\n",
 21 |     "    if curr_path is None:\n",
 22 |     "        dir_path = os.getcwd()\n",
 23 |     "        target_path = os.path.dirname(os.path.dirname(dir_path))\n",
 24 |     "        if target_path not in sys.path:\n",
 25 |     "            sys.path.insert(0, target_path)\n",
 26 |     "            \n",
 27 |     "add_aion()"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 2,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "name": "stdout",
 37 |      "output_type": "stream",
 38 |      "text": [
 39 |       "2018-12-27 19:39:50.455930. [DOWNLOAD] From https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.csv to ../../data/location/country.csv\n",
 40 |       "0 : Afghanistan\n",
 41 |       "1 : Åland Islands\n",
 42 |       "2 : Albania\n",
 43 |       "3 : Algeria\n",
 44 |       "4 : American Samoa\n"
 45 |      ]
 46 |     }
 47 |    ],
 48 |    "source": [
 49 |     "import pandas as pd\n",
 50 |     "from aion.helper.file_helper import FileHelper\n",
 51 |     "\n",
 52 |     "file_helper = FileHelper()\n",
 53 |     "countries_file_path = file_helper.download(\n",
 54 |     "    src='https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.csv', \n",
 55 |     "    dest_dir='../../data/location/', dest_file='country.csv', force_download=True)\n",
 56 |     "\n",
 57 |     "country_df = pd.read_csv(countries_file_path)\n",
 58 |     "countries = country_df['value'].tolist()\n",
 59 |     "\n",
 60 |     "for i, country in enumerate(countries[:5]):\n",
 61 |     "    print(i, \":\", country)"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "### Fuzzywuzzy"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 4,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "from fuzzywuzzy import fuzz\n",
 78 |     "from fuzzywuzzy import process"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 5,
 84 |    "metadata": {},
 85 |    "outputs": [
 86 |     {
 87 |      "name": "stdout",
 88 |      "output_type": "stream",
 89 |      "text": [
 90 |       "[('Hong Kong SAR China', 90), ('Congo - Kinshasa', 57)]\n",
 91 |       "[('Japan', 60), ('Yemen', 60)]\n",
 92 |       "[('United States', 96), ('United Arab Emirates', 86)]\n"
 93 |      ]
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "# Default scorer is Weighed Ratio\n",
 98 |     "for location in ['Hong Kong', 'jepen', 'United tates']:\n",
 99 |     "    result = process.extract(location, countries, limit=2)\n",
100 |     "    print(result)"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 6,
106 |    "metadata": {},
107 |    "outputs": [
108 |     {
109 |      "data": {
110 |       "text/plain": [
111 |        "[('Edwards', 92), ('Edwards2', 86), ('drawdE', 50)]"
112 |       ]
113 |      },
114 |      "execution_count": 6,
115 |      "metadata": {},
116 |      "output_type": "execute_result"
117 |     }
118 |    ],
119 |    "source": [
120 |     "# Ratio\n",
121 |     "process.extract('Edward', ['Edwards', 'Edwards2', 'drawdE'], scorer=fuzz.ratio)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 7,
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "data": {
131 |       "text/plain": [
132 |        "[('Hong Kong SAR China', 64), ('Congo - Kinshasa', 48), ('Mongolia', 47)]"
133 |       ]
134 |      },
135 |      "execution_count": 7,
136 |      "metadata": {},
137 |      "output_type": "execute_result"
138 |     }
139 |    ],
140 |    "source": [
141 |     "# Partial Ratio\n",
142 |     "process.extract('Hong Kong', countries, scorer=fuzz.QRatio, limit=3)"
143 |    ]
144 |   }
145 |  ],
146 |  "metadata": {
147 |   "kernelspec": {
148 |    "display_name": "Python 3",
149 |    "language": "python",
150 |    "name": "python3"
151 |   },
152 |   "language_info": {
153 |    "codemirror_mode": {
154 |     "name": "ipython",
155 |     "version": 3
156 |    },
157 |    "file_extension": ".py",
158 |    "mimetype": "text/x-python",
159 |    "name": "python",
160 |    "nbconvert_exporter": "python",
161 |    "pygments_lexer": "ipython3",
162 |    "version": "3.5.5"
163 |   }
164 |  },
165 |  "nbformat": 4,
166 |  "nbformat_minor": 2
167 | }
168 | 


--------------------------------------------------------------------------------
/sample/resources/LSI and LDA.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/makcedward/nlp/2f12277b952dca39d8a392fb14e5f086a562d269/sample/resources/LSI and LDA.pptx


--------------------------------------------------------------------------------
/sample/util/nlp-util-spell_corrector.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Ingestion"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from collections import Counter\n",
 19 |     "from sklearn.datasets import fetch_20newsgroups\n",
 20 |     "import re\n",
 21 |     "\n",
 22 |     "\n",
 23 |     "corpus = []\n",
 24 |     "for line in fetch_20newsgroups().data:\n",
 25 |     "    line = line.replace('\\n', ' ').replace('\\t', ' ').lower()\n",
 26 |     "    line = re.sub('[^a-z ]', ' ', line)\n",
 27 |     "    tokens = line.split(' ')\n",
 28 |     "    tokens = [token for token in tokens if len(token) > 0]\n",
 29 |     "    corpus.extend(tokens)\n",
 30 |     "\n",
 31 |     "corpus = Counter(corpus)"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 2,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "%reload_ext  autoreload\n",
 41 |     "%autoreload 2\n",
 42 |     "\n",
 43 |     "import sys, os\n",
 44 |     "def add_aion(curr_path=None):\n",
 45 |     "    if curr_path is None:\n",
 46 |     "        dir_path = os.getcwd()\n",
 47 |     "        target_path = os.path.dirname(os.path.dirname(dir_path))\n",
 48 |     "        if target_path not in sys.path:\n",
 49 |     "#             print('Added %s into sys.path.' % (target_path))\n",
 50 |     "            sys.path.insert(0, target_path)\n",
 51 |     "            \n",
 52 |     "add_aion()"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "# SpellCorrector"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 3,
 65 |    "metadata": {},
 66 |    "outputs": [
 67 |     {
 68 |      "name": "stdout",
 69 |      "output_type": "stream",
 70 |      "text": [
 71 |       "Known Result:  set()\n",
 72 |       "Edit1 Result:  {'edward', 'edwards'}\n",
 73 |       "Edit2 Result:  {'gedwards', 'edward', 'eduard', 'edvard', 'tedward', 'edgardo', 'edwards', 'tedwards'}\n"
 74 |      ]
 75 |     },
 76 |     {
 77 |      "data": {
 78 |       "text/plain": [
 79 |        "'edward'"
 80 |       ]
 81 |      },
 82 |      "execution_count": 3,
 83 |      "metadata": {},
 84 |      "output_type": "execute_result"
 85 |     }
 86 |    ],
 87 |    "source": [
 88 |     "from aion.util.spell_corrector import SpellCorrector\n",
 89 |     "\n",
 90 |     "spell_corrector = SpellCorrector(dictionary=corpus, verbose=1)\n",
 91 |     "spell_corrector.correction('edwardd')"
 92 |    ]
 93 |   }
 94 |  ],
 95 |  "metadata": {
 96 |   "kernelspec": {
 97 |    "display_name": "Python 3",
 98 |    "language": "python",
 99 |    "name": "python3"
100 |   },
101 |   "language_info": {
102 |    "codemirror_mode": {
103 |     "name": "ipython",
104 |     "version": 3
105 |    },
106 |    "file_extension": ".py",
107 |    "mimetype": "text/x-python",
108 |    "name": "python",
109 |    "nbconvert_exporter": "python",
110 |    "pygments_lexer": "ipython3",
111 |    "version": "3.5.2"
112 |   }
113 |  },
114 |  "nbformat": 4,
115 |  "nbformat_minor": 2
116 | }
117 | 


--------------------------------------------------------------------------------
/sample/util/nlp-util-symspell.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Ingestion"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from collections import Counter\n",
 19 |     "from sklearn.datasets import fetch_20newsgroups\n",
 20 |     "import re\n",
 21 |     "\n",
 22 |     "\n",
 23 |     "corpus = []\n",
 24 |     "for line in fetch_20newsgroups().data:\n",
 25 |     "    line = line.replace('\\n', ' ').replace('\\t', ' ').lower()\n",
 26 |     "    line = re.sub('[^a-z ]', ' ', line)\n",
 27 |     "    tokens = line.split(' ')\n",
 28 |     "    tokens = [token for token in tokens if len(token) > 0]\n",
 29 |     "    corpus.extend(tokens)\n",
 30 |     "\n",
 31 |     "corpus = Counter(corpus)"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 2,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "%reload_ext  autoreload\n",
 41 |     "%autoreload 2\n",
 42 |     "\n",
 43 |     "import sys, os\n",
 44 |     "def add_aion(curr_path=None):\n",
 45 |     "    if curr_path is None:\n",
 46 |     "        dir_path = os.getcwd()\n",
 47 |     "        target_path = os.path.dirname(os.path.dirname(dir_path))\n",
 48 |     "        if target_path not in sys.path:\n",
 49 |     "#             print('Added %s into sys.path.' % (target_path))\n",
 50 |     "            sys.path.insert(0, target_path)\n",
 51 |     "            \n",
 52 |     "add_aion()"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {
 58 |     "collapsed": true
 59 |    },
 60 |    "source": [
 61 |     "# Symspell"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 3,
 67 |    "metadata": {
 68 |     "collapsed": true
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "from aion.util.spell_check import SymSpell"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 4,
 78 |    "metadata": {},
 79 |    "outputs": [
 80 |     {
 81 |      "name": "stdout",
 82 |      "output_type": "stream",
 83 |      "text": [
 84 |       "Size of dictionary: 89038\n"
 85 |      ]
 86 |     }
 87 |    ],
 88 |    "source": [
 89 |     "corpus_dir = '../../data/'\n",
 90 |     "corpus_file_name = 'spell_check_dictionary.txt'\n",
 91 |     "\n",
 92 |     "symspell = SymSpell(verbose=10)\n",
 93 |     "symspell.build_vocab(\n",
 94 |     "    dictionary=corpus, \n",
 95 |     "    file_dir=corpus_dir, file_name=corpus_file_name)\n",
 96 |     "\n",
 97 |     "symspell.load_vocab(corpus_file_path=corpus_dir+corpus_file_name)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "Correct single word"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 5,
110 |    "metadata": {},
111 |    "outputs": [
112 |     {
113 |      "name": "stdout",
114 |      "output_type": "stream",
115 |      "text": [
116 |       "[{'word': 'edward', 'distance': 1, 'count': 154}, {'word': 'edwards', 'distance': 1, 'count': 50}]\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "results = symspell.correction(word='edwarda')\n",
122 |     "print(results)"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "Correct sentence"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 6,
135 |    "metadata": {},
136 |    "outputs": [
137 |     {
138 |      "name": "stdout",
139 |      "output_type": "stream",
140 |      "text": [
141 |       "[{'word': 'hello i am ed area', 'distance': 3}]\n"
142 |      ]
143 |     }
144 |    ],
145 |    "source": [
146 |     "results = symspell.corrections(sentence='Hello I am Edarda')\n",
147 |     "print(results)"
148 |    ]
149 |   }
150 |  ],
151 |  "metadata": {
152 |   "kernelspec": {
153 |    "display_name": "Python 3",
154 |    "language": "python",
155 |    "name": "python3"
156 |   },
157 |   "language_info": {
158 |    "codemirror_mode": {
159 |     "name": "ipython",
160 |     "version": 3
161 |    },
162 |    "file_extension": ".py",
163 |    "mimetype": "text/x-python",
164 |    "name": "python",
165 |    "nbconvert_exporter": "python",
166 |    "pygments_lexer": "ipython3",
167 |    "version": "3.5.2"
168 |   }
169 |  },
170 |  "nbformat": 4,
171 |  "nbformat_minor": 2
172 | }
173 | 


--------------------------------------------------------------------------------