├── .gitignore
├── aion
│   ├── helper
│   │   ├── __init__.py
│   │   └── file_helper.py
│   ├── util
│   │   ├── __init__.py
│   │   └── spell_check.py
│   └── embeddings
│       ├── infersent_lib
│       │   ├── encoder
│       │   │   ├── models.py
│       │   │   └── extract_features.py
│       │   ├── .gitignore
│       │   ├── dataset
│       │   │   ├── get_data.bash
│       │   │   └── tokenizer.sed
│       │   ├── mutils.py
│       │   ├── data.py
│       │   ├── README.md
│       │   └── train_nli.py
│       ├── document_embeddings.py
│       ├── sentence_embeddings.py
│       ├── word_embeddings.py
│       ├── cove.py
│       ├── infersent.py
│       ├── glove.py
│       ├── doc2vec.py
│       ├── embeddings.py
│       └── skip_thoughts.py
└── sample
    ├── resources
    │   └── LSI and LDA.pptx
    ├── nlp-distance-edit_distance.ipynb
    ├── util
    │   ├── nlp-util-spell_corrector.ipynb
    │   └── nlp-util-symspell.ipynb
    ├── preprocessing
    │   └── nlp-preprocessing-string_matching-fuzzywuzzy.ipynb
    ├── embeddings
    │   ├── nlp-embeddings-document-doc2vec.ipynb
    │   ├── nlp-embeddings-word-cove.ipynb
    │   └── nlp-embeddings-sentence-infersent.ipynb
    ├── nlp-stemming.ipynb
    ├── nlp_lemmatization.ipynb
    ├── nlp-part_of_speech.ipynb
    ├── nlp-lsa_lda.ipynb
    ├── nlp-word_mover_distance.ipynb
    ├── nlp-sentence_tokenization.ipynb
    ├── nlp-named_entity_recognition.ipynb
    ├── nlp-word_tokenization.ipynb
    ├── nlp-stop_words.ipynb
    └── nlp-3_basic_distance_measurement_in_text_mining.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
--------------------------------------------------------------------------------
/aion/helper/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/aion/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/encoder/models.py:
--------------------------------------------------------------------------------
1 | ../models.py
--------------------------------------------------------------------------------
/sample/resources/LSI and LDA.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/makcedward/nlp/HEAD/sample/resources/LSI and LDA.pptx
--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/.gitignore:
--------------------------------------------------------------------------------
1 | dataset/GloVe
2 | dataset/MultiNLI
3 | dataset/SNLI
4 | encoder/infersent.allnli.pickle
5 |
6 | *.swp
7 |
--------------------------------------------------------------------------------
/aion/embeddings/document_embeddings.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Edward Ma. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import os, datetime
17 |
18 | from .embeddings import Embeddings
19 |
20 |
21 | class DocumentEmbeddings(Embeddings):
22 | def __init__(self, verbose=0):
23 | super().__init__(verbose=verbose)
--------------------------------------------------------------------------------
/aion/embeddings/sentence_embeddings.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Edward Ma. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import os, datetime
17 |
18 | from .embeddings import Embeddings
19 |
20 |
21 | class SentenceEmbeddings(Embeddings):
22 | def __init__(self, verbose=0):
23 | super().__init__(verbose=verbose)
--------------------------------------------------------------------------------
/sample/nlp-distance-edit_distance.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Edit Distance"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "Edit Distance for \"edward\" and \"edwin\" is 3\n",
20 | "Edit Distance for \"Edward\" and \"edwin\" is 4\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "import editdistance\n",
26 | "\n",
27 | "data = ['edward', 'Edward']\n",
28 | "\n",
29 | "for record in data:\n",
30 | " dist = editdistance.eval(record, 'edwin')\n",
31 | " print('Edit Distance for \"%s\" and \"%s\" is %d' % (record, 'edwin', dist))"
32 | ]
33 | }
34 | ],
35 | "metadata": {
36 | "kernelspec": {
37 | "display_name": "Python 3",
38 | "language": "python",
39 | "name": "python3"
40 | },
41 | "language_info": {
42 | "codemirror_mode": {
43 | "name": "ipython",
44 | "version": 3
45 | },
46 | "file_extension": ".py",
47 | "mimetype": "text/x-python",
48 | "name": "python",
49 | "nbconvert_exporter": "python",
50 | "pygments_lexer": "ipython3",
51 | "version": "3.5.2"
52 | }
53 | },
54 | "nbformat": 4,
55 | "nbformat_minor": 2
56 | }
57 |
--------------------------------------------------------------------------------
/aion/helper/file_helper.py:
--------------------------------------------------------------------------------
1 | import datetime, os, urllib.request, zipfile
2 |
3 |
4 | class FileHelper:
5 | def __init__(self, verbose=0):
6 | self.verbose = verbose
7 |
8 | def _log_time(self, status, msg, verbose):
9 | if self.verbose >= 0 or verbose >= 0:
10 | print('%s. [%s] %s' % (datetime.datetime.now(), status, msg))
11 |
12 | def is_file_exist(self, file_path):
13 | return os.path.exists(file_path)
14 |
15 | def download(self, src, dest_dir, dest_file, uncompress=False, housekeep=False, force_download=False, verbose=0):
16 | if not os.path.exists(dest_dir):
17 | os.makedirs(dest_dir)
18 |
19 | # print('dest_dir:', dest_dir)
20 |
21 | if dest_file is None:
22 | dest_file = os.path.basename(src)
23 |
24 | # print('dest_file:', dest_file)
25 |
26 | if not self.is_file_exist(dest_dir + dest_file) or force_download:
27 | self._log_time(status='DOWNLOAD', msg='From '+src+' to '+dest_dir+dest_file, verbose=verbose)
28 | file = urllib.request.urlopen(src)
29 | with open(dest_dir + dest_file,'wb') as output:
30 | output.write(file.read())
31 | else:
32 | self._log_time(status='FOUND', msg=dest_file+' in '+dest_dir, verbose=verbose)
33 |
34 | # if uncompress:
35 | # self.uncompress(dest_dir + dest_file)
36 |
37 | # if uncompress and housekeep:
38 | # self.housekeep(dest_dir + dest_file)
39 |
40 | return dest_dir + dest_file
--------------------------------------------------------------------------------
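
Usage note (not part of the repository): a minimal sketch of how FileHelper.download is typically called; the URL matches the one used in the fuzzywuzzy sample notebook, while the destination directory is an illustrative assumption. dest_dir must end with a path separator because the helper concatenates dest_dir + dest_file.

    from aion.helper.file_helper import FileHelper

    file_helper = FileHelper()
    csv_path = file_helper.download(
        src='https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.csv',
        dest_dir='../../data/location/', dest_file='country.csv')
    print(csv_path)  # '../../data/location/country.csv'; the file is only fetched if missing
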
/aion/embeddings/word_embeddings.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import numpy as np
3 |
4 | from .embeddings import Embeddings
5 |
6 |
7 | class WordEmbeddings(Embeddings):
8 |
9 | def __init__(self,
10 | handle_oov=True, oov_vector=None, oov_vector_type='zero',
11 | padding=True, pad_vector=None, pad_vector_type='zero',
12 | max_sequence_length=10, dimension=300,
13 | verbose=0):
14 | super().__init__(verbose=verbose)
15 | self.handle_oov = handle_oov
16 | self.oov_vector_type = oov_vector_type
17 | if handle_oov and oov_vector is None:
18 | if oov_vector_type == 'zero':
19 | self.oov_vector = np.zeros(dimension)
20 | elif oov_vector_type == 'random':
21 | self.oov_vector = np.random.rand(dimension)
22 | else:
23 | self.oov_vector = oov_vector
24 |
25 | self.padding = padding
26 | self.pad_vector_type = pad_vector_type
27 | if padding and pad_vector is None:
28 | if pad_vector_type == 'zero':
29 | self.pad_vector = np.zeros(dimension)
30 | elif pad_vector_type == 'random':
31 | self.pad_vector = np.random.rand(dimension)
32 | else:
33 | self.pad_vector = pad_vector
34 |
35 | self.max_sequence_length = max_sequence_length
36 | self.dimension = dimension
37 |
38 | def get_oov_vector(self):
39 | return self.oov_vector
40 |
41 | def set_oov_vector(self, oov_vector):
42 | self.oov_vector = oov_vector
43 |
44 | def get_pad_vector(self):
45 | return self.pad_vector
46 |
47 | def set_pad_vector(self, pad_vector):
48 | self.pad_vector = pad_vector
49 |
50 | def is_vector_exist(self, word):
51 | return word in self.model
--------------------------------------------------------------------------------
/aion/embeddings/cove.py:
--------------------------------------------------------------------------------
1 | import keras
2 |
3 | from .word_embeddings import WordEmbeddings
4 | from .glove import GloVeEmbeddings
5 |
6 | '''
7 | Source: https://github.com/rgsachin/CoVe
8 | '''
9 |
10 |
11 | class CoVeEmbeddings(WordEmbeddings):
12 | COVE_MODEL_KERAS_URL = 'https://github.com/rgsachin/CoVe/raw/master/Keras_CoVe.h5'
13 |
14 | def __init__(self,
15 | word_embeddings_dir,
16 | handle_oov=True, oov_vector_type='random',
17 | padding=True, pad_vector_type='random',
18 | max_sequence_length=50, tokenizer=None,
19 | verbose=0):
20 | super().__init__(verbose=verbose)
21 |
22 | # fall back to whitespace tokenization when no tokenizer is provided
23 | self.tokenizer = self._tokenizer_space if tokenizer is None else tokenizer
24 |
25 | self.word_embeddings_dir = word_embeddings_dir
26 | self.handle_oov = handle_oov
27 | self.oov_vector_type = oov_vector_type
28 | self.padding = padding
29 | self.pad_vector_type = pad_vector_type
30 | self.max_sequence_length = max_sequence_length
31 |
32 | def load_model(self, dest_dir, src=None, trainable=True, verbose=0):
33 | if src is None:
34 | src = self.COVE_MODEL_KERAS_URL
35 |
36 | file_path = self.download(
37 | src=src, dest_dir=dest_dir, dest_file=None, uncompress=False)
38 |
39 | self.model = keras.models.load_model(file_path)
40 |
41 | self.word_embs_model = GloVeEmbeddings(
42 | handle_oov=self.handle_oov, oov_vector_type=self.oov_vector_type,
43 | padding=self.padding, pad_vector_type=self.pad_vector_type,
44 | max_sequence_length=self.max_sequence_length)
45 | self.word_embs_model.load_model(dest_dir=self.word_embeddings_dir, process=False, verbose=verbose)
46 |
47 | def encode(self, x, tokenize=True):
48 | if tokenize:
49 | tokens = [self.tokenizer(sentence) for sentence in x]
50 | else:
51 | tokens = x
52 |
53 | x_embs = self.word_embs_model.encode(tokens)
54 |
55 | return self.model.predict(x_embs)
56 |
57 |
--------------------------------------------------------------------------------
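
Usage note (not part of the repository): a minimal sketch of CoVeEmbeddings, assuming the GloVe and CoVe files can be downloaded into the illustrative directories below; both the Keras_CoVe.h5 model and the GloVe archive are large downloads on first use.

    from aion.embeddings.cove import CoVeEmbeddings

    cove = CoVeEmbeddings(word_embeddings_dir='../model/text/stanford/glove/',
                          max_sequence_length=10, verbose=20)
    cove.load_model(dest_dir='../model/text/salesforce/cove/')

    # encode() looks up GloVe vectors per token and feeds them through the CoVe Keras model
    vectors = cove.encode(['hello how are you', 'nice to meet you'])
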
/aion/embeddings/infersent_lib/dataset/get_data.bash:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | preprocess_exec="sed -f tokenizer.sed"
9 |
10 | SNLI='https://nlp.stanford.edu/projects/snli/snli_1.0.zip'
11 | MultiNLI='https://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip'
12 |
13 |
14 | ZIPTOOL="unzip"
15 |
16 | #if [ "$OSTYPE" == "darwin"* ]; then
17 | # # unzip can't handle large files on some MacOS versions
18 | # ZIPTOOL="7za x"
19 | #fi
20 |
21 |
22 | ### download SNLI
23 | mkdir SNLI
24 | curl -Lo SNLI/snli_1.0.zip $SNLI
25 | $ZIPTOOL SNLI/snli_1.0.zip -d SNLI
26 | rm SNLI/snli_1.0.zip
27 | rm -r SNLI/__MACOSX
28 |
29 | for split in train dev test
30 | do
31 | fpath=SNLI/$split.snli.txt
32 | awk '{ if ( $1 != "-" ) { print $0; } }' SNLI/snli_1.0/snli_1.0_$split.txt | cut -f 1,6,7 | sed '1d' > $fpath
33 | cut -f1 $fpath > SNLI/labels.$split
34 | cut -f2 $fpath | $preprocess_exec > SNLI/s1.$split
35 | cut -f3 $fpath | $preprocess_exec > SNLI/s2.$split
36 | rm $fpath
37 | done
38 | rm -r SNLI/snli_1.0
39 |
40 |
41 | # MultiNLI
42 | # Test set not available yet : we define dev set as the "matched" set and the test set as the "mismatched"
43 | mkdir MultiNLI
44 | curl -Lo MultiNLI/multinli_0.9.zip $MultiNLI
45 | $ZIPTOOL MultiNLI/multinli_0.9.zip -d MultiNLI
46 | rm MultiNLI/multinli_0.9.zip
47 | rm -r MultiNLI/__MACOSX
48 |
49 |
50 | mv MultiNLI/multinli_0.9/multinli_0.9_train.txt MultiNLI/train.multinli.txt
51 | mv MultiNLI/multinli_0.9/multinli_0.9_dev_matched.txt MultiNLI/dev.matched.multinli.txt
52 | mv MultiNLI/multinli_0.9/multinli_0.9_dev_mismatched.txt MultiNLI/dev.mismatched.multinli.txt
53 |
54 | rm -r MultiNLI/multinli_0.9
55 |
56 | for split in train dev.matched dev.mismatched
57 | do
58 | fpath=MultiNLI/$split.multinli.txt
59 | awk '{ if ( $1 != "-" ) { print $0; } }' $fpath | cut -f 1,6,7 | sed '1d' > $fpath.tok
60 | cut -f1 $fpath.tok > MultiNLI/labels.$split
61 | cut -f2 $fpath.tok | $preprocess_exec > MultiNLI/s1.$split
62 | cut -f3 $fpath.tok | $preprocess_exec > MultiNLI/s2.$split
63 | rm $fpath $fpath.tok
64 | done
65 |
66 |
--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/dataset/tokenizer.sed:
--------------------------------------------------------------------------------
1 | # Sed script to produce Penn Treebank tokenization on arbitrary raw text.
2 | # Yeah, sure.
3 |
4 | # expected input: raw text with ONE SENTENCE TOKEN PER LINE
5 |
6 | # by Robert MacIntyre, University of Pennsylvania, late 1995.
7 |
8 | # If this wasn't such a trivial program, I'd include all that stuff about
9 | # no warrantee, free use, etc. from the GNU General Public License. If you
10 | # want to be picky, assume that all of its terms apply. Okay?
11 |
12 | # attempt to get correct directional quotes
13 | s=^"=`` =g
14 | s=\([ ([{<]\)"=\1 `` =g
15 | # close quotes handled at end
16 |
17 | s=\.\.\.= ... =g
18 | s=[,;:@#$%&]= & =g
19 |
20 | # Assume sentence tokenization has been done first, so split FINAL periods
21 | # only.
22 | s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
23 | # however, we may as well split ALL question marks and exclamation points,
24 | # since they shouldn't have the abbrev.-marker ambiguity problem
25 | s=[?!]= & =g
26 |
27 | # parentheses, brackets, etc.
28 | s=[][(){}<>]= & =g
29 | # Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
30 | # version of these symbols.
31 | # UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
32 | # s/(/-LRB-/g
33 | # s/)/-RRB-/g
34 | # s/\[/-LSB-/g
35 | # s/\]/-RSB-/g
36 | # s/{/-LCB-/g
37 | # s/}/-RCB-/g
38 |
39 | s=--= -- =g
40 |
41 | # NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since
42 | # you might someday want to know how the words originally fit together --
43 | # but it's too late to make a better system now, given the millions of
44 | # words we've already done "wrong".
45 |
46 | # First off, add a space to the beginning and end of each line, to reduce
47 | # necessary number of regexps.
48 | s=$= =
49 | s=^= =
50 |
51 | s="= '' =g
52 | # possessive or close-single-quote
53 | s=\([^']\)' =\1 ' =g
54 | # as in it's, I'm, we'd
55 | s='\([sSmMdD]\) = '\1 =g
56 | s='ll = 'll =g
57 | s='re = 're =g
58 | s='ve = 've =g
59 | s=n't = n't =g
60 | s='LL = 'LL =g
61 | s='RE = 'RE =g
62 | s='VE = 'VE =g
63 | s=N'T = N'T =g
64 |
65 | s= \([Cc]\)annot = \1an not =g
66 | s= \([Dd]\)'ye = \1' ye =g
67 | s= \([Gg]\)imme = \1im me =g
68 | s= \([Gg]\)onna = \1on na =g
69 | s= \([Gg]\)otta = \1ot ta =g
70 | s= \([Ll]\)emme = \1em me =g
71 | s= \([Mm]\)ore'n = \1ore 'n =g
72 | s= '\([Tt]\)is = '\1 is =g
73 | s= '\([Tt]\)was = '\1 was =g
74 | s= \([Ww]\)anna = \1an na =g
75 | # s= \([Ww]\)haddya = \1ha dd ya =g
76 | # s= \([Ww]\)hatcha = \1ha t cha =g
77 |
78 | # clean out extra spaces
79 | s= *= =g
80 | s=^ *==g
81 |
--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/mutils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | import re
9 | import inspect
10 | from torch import optim
11 |
12 |
13 | def get_optimizer(s):
14 | """
15 | Parse optimizer parameters.
16 | Input should be of the form:
17 | - "sgd,lr=0.01"
18 | - "adagrad,lr=0.1,lr_decay=0.05"
19 | """
20 | if "," in s:
21 | method = s[:s.find(',')]
22 | optim_params = {}
23 | for x in s[s.find(',') + 1:].split(','):
24 | split = x.split('=')
25 | assert len(split) == 2
26 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
27 | optim_params[split[0]] = float(split[1])
28 | else:
29 | method = s
30 | optim_params = {}
31 |
32 | if method == 'adadelta':
33 | optim_fn = optim.Adadelta
34 | elif method == 'adagrad':
35 | optim_fn = optim.Adagrad
36 | elif method == 'adam':
37 | optim_fn = optim.Adam
38 | elif method == 'adamax':
39 | optim_fn = optim.Adamax
40 | elif method == 'asgd':
41 | optim_fn = optim.ASGD
42 | elif method == 'rmsprop':
43 | optim_fn = optim.RMSprop
44 | elif method == 'rprop':
45 | optim_fn = optim.Rprop
46 | elif method == 'sgd':
47 | optim_fn = optim.SGD
48 | assert 'lr' in optim_params
49 | else:
50 | raise Exception('Unknown optimization method: "%s"' % method)
51 |
52 | # check that we give good parameters to the optimizer
53 | expected_args = inspect.getargspec(optim_fn.__init__)[0]
54 | assert expected_args[:2] == ['self', 'params']
55 | if not all(k in expected_args[2:] for k in optim_params.keys()):
56 | raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
57 | str(expected_args[2:]), str(optim_params.keys())))
58 |
59 | return optim_fn, optim_params
60 |
61 |
62 | """
63 | Importing batcher and prepare for SentEval
64 | """
65 |
66 |
67 | def batcher(batch, params):
68 | # batch contains list of words
69 | batch = [['<s>'] + s + ['</s>'] for s in batch]
70 | sentences = [' '.join(s) for s in batch]
71 | embeddings = params.infersent.encode(sentences, bsize=params.batch_size,
72 | tokenize=False)
73 |
74 | return embeddings
75 |
76 |
77 | def prepare(params, samples):
78 | params.infersent.build_vocab([' '.join(s) for s in samples],
79 | params.glove_path, tokenize=False)
80 |
81 |
82 | class dotdict(dict):
83 | """ dot.notation access to dictionary attributes """
84 | __getattr__ = dict.get
85 | __setattr__ = dict.__setitem__
86 | __delattr__ = dict.__delitem__
87 |
--------------------------------------------------------------------------------
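
Usage note (not part of the repository): get_optimizer parses an optimizer spec string such as "sgd,lr=0.01" into a torch.optim class plus keyword arguments; a minimal sketch with a toy model standing in for the NLI network. The import path assumes the infersent_lib directory is on sys.path.

    import torch
    from mutils import get_optimizer

    model = torch.nn.Linear(300, 3)                       # toy model in place of the NLI classifier
    optim_fn, optim_params = get_optimizer('sgd,lr=0.01')
    optimizer = optim_fn(model.parameters(), **optim_params)
    print(optim_fn, optim_params)                         # SGD, {'lr': 0.01}
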
/aion/embeddings/infersent_lib/encoder/extract_features.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import torch
4 | import argparse
5 |
6 | import numpy as np
7 | from models import InferSent
8 |
9 | if __name__ == '__main__':
10 |
11 | parser = argparse.ArgumentParser(
12 | prog='extract-features',
13 | description='Extract features from pretrained InferSent model')
14 |
15 | parser.add_argument('-g', '--w2v_path', type=str, required=True,
16 | help='Path to word vector file')
17 | parser.add_argument('-v', '--version', type=int, required=True,
18 | help='Version of InferSent (GloVe-V1 or fastText-V2)')
19 | parser.add_argument('-f', '--model_path', type=str, required=True,
20 | help='Path to pretrained .pkl model file')
21 | parser.add_argument('-t', '--tokenize', action='store_true',
22 | help='Passes tokenize=True to build_vocab()')
23 | parser.add_argument('-o', '--out-dir', type=str, required=True,
24 | help='Output folder to save feature files')
25 | parser.add_argument('-c', '--cpu', action='store_true',
26 | help='Use CPU instead of GPU.')
27 | parser.add_argument('-b', '--batch-size', type=int, default=64,
28 | help='Batch size (default: 64)')
29 | parser.add_argument('files', nargs='+',
30 | help='List of files to extract sentence embeddings')
31 |
32 | args = parser.parse_args()
33 |
34 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
35 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': args.version}
36 | model = InferSent(params_model)
37 | model.load_state_dict(torch.load(args.model_path))
38 |
39 | if not args.cpu:
40 | model = model.cuda()
41 |
42 | model.set_w2v_path(args.w2v_path)
43 |
44 | # Ensure directory
45 | if not os.path.exists(args.out_dir):
46 | os.makedirs(args.out_dir)
47 |
48 | # Read files and extract features
49 | for fpath in args.files:
50 | print('Reading file {}'.format(fpath))
51 | sents = []
52 | with open(fpath) as f:
53 | for line in f:
54 | line = line.strip()
55 | assert line, 'Empty line in {}'.format(fpath)
56 | sents.append(line)
57 |
58 | # Set output file name
59 | out_name = os.path.join(
60 | args.out_dir, "{}.embs.npy".format(os.path.basename(fpath)))
61 |
62 | # Build vocab
63 | print('Building vocabulary')
64 | model.build_vocab(sents, args.tokenize)
65 |
66 | # Get embeddings
67 | embs = model.encode(sents, tokenize=args.tokenize,
68 | verbose=True, bsize=args.batch_size)
69 |
70 | print('Saving to {}'.format(out_name))
71 | np.save(out_name, embs)
72 |
--------------------------------------------------------------------------------
/aion/embeddings/infersent.py:
--------------------------------------------------------------------------------
1 | import datetime, os, zipfile
2 | import numpy as np
3 | import torch
4 | import subprocess
5 |
6 | from .glove import GloVeEmbeddings
7 | from .sentence_embeddings import SentenceEmbeddings
8 |
9 | # InferSent (as of Sep 2018) is not distributed as a library (https://github.com/facebookresearch/InferSent/issues/76), so the code is cloned from https://github.com/facebookresearch/InferSent
10 | from .infersent_lib.models import InferSent
11 |
12 |
13 | class InferSentEmbeddings(SentenceEmbeddings):
14 | INFERSENT_GLOVE_MODEL_URL = 'https://s3.amazonaws.com/senteval/infersent/infersent1.pkl'
15 | INFERSENT_FASTTEXT_MODEL_URL = 'https://s3.amazonaws.com/senteval/infersent/infersent2.pkl'
16 |
17 | def __init__(self,
18 | word_embeddings_dir,
19 | batch_size=64, word_dimension=300, encoder_lstm_dimension=2048,
20 | pooling_type='max', model_version=1, dropout=0.0,
21 | verbose=0):
22 | super().__init__(verbose=verbose)
23 |
24 | self.word_embeddings_dir = word_embeddings_dir
25 | self.batch_size = batch_size
26 | self.word_dimension = word_dimension
27 | self.encoder_lstm_dimension = encoder_lstm_dimension
28 | self.pooling_type = pooling_type
29 | self.dropout = dropout
30 | self.model_version = model_version
31 |
32 | def get_params(self):
33 | return {
34 | 'bsize': self.batch_size,
35 | 'word_emb_dim': self.word_dimension,
36 | 'enc_lstm_dim': self.encoder_lstm_dimension,
37 | 'pool_type': self.pooling_type,
38 | 'dpout_model': self.dropout,
39 | 'version': self.model_version
40 | }
41 |
42 | def load_model(self, dest_dir, src=None, trainable=True, verbose=0):
43 | # TODO: Support V2 model
44 | if src is None:
45 | src = InferSentEmbeddings.INFERSENT_GLOVE_MODEL_URL
46 |
47 | dest_file = os.path.basename(src)
48 | file_path = self.download(
49 | src=src, dest_dir=dest_dir, dest_file=dest_file,
50 | uncompress=False, housekeep=False, verbose=verbose)
51 |
52 | self.model = InferSent(self.get_params())
53 | self.model.load_state_dict(torch.load(dest_dir + dest_file))
54 |
55 | # TODO: support different glove model and fasttext model
56 | word_embs = GloVeEmbeddings()
57 | word_embs.load_model(dest_dir=self.word_embeddings_dir, process=False, verbose=verbose)
58 |
59 | self.model.set_w2v_path(word_embs.model_path)
60 |
61 | def build_vocab(self, sentences, tokenize=True):
62 | return self.model.build_vocab(sentences, tokenize=tokenize)
63 |
64 | def encode(self, sentences, tokenize=True):
65 | return self.model.encode(sentences, tokenize=tokenize)
66 |
67 | def visualize(self, sentence, tokenize=True):
68 | self.model.visualize(sentence, tokenize=tokenize)
--------------------------------------------------------------------------------
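
Usage note (not part of the repository): a minimal sketch of the InferSentEmbeddings wrapper defined above; the model directories are illustrative assumptions, and the pretrained infersent1.pkl plus the GloVe vectors are downloaded on first use.

    from aion.embeddings.infersent import InferSentEmbeddings

    infersent = InferSentEmbeddings(word_embeddings_dir='../model/text/stanford/glove/', verbose=20)
    infersent.load_model(dest_dir='../model/text/facebook/infersent/', verbose=20)

    sentences = ['I like this movie.', 'The weather is good today.']
    infersent.build_vocab(sentences, tokenize=True)
    vectors = infersent.encode(sentences, tokenize=True)  # one 4096-dimensional vector per sentence
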
/aion/embeddings/glove.py:
--------------------------------------------------------------------------------
1 | import datetime, os, zipfile
2 | import numpy as np
3 |
4 | from .word_embeddings import WordEmbeddings
5 |
6 |
7 | class GloVeEmbeddings(WordEmbeddings):
8 | GLOVE_COMMON_CRAWL_MODEL_URL = 'http://nlp.stanford.edu/data/glove.42B.300d.zip'
9 |
10 | def __init__(self,
11 | handle_oov=True, oov_vector=None, oov_vector_type='zero',
12 | padding=True, pad_vector=None, pad_vector_type='zero',
13 | max_sequence_length=10, dimension=300,
14 | verbose=0):
15 | super().__init__(
16 | handle_oov=handle_oov, oov_vector=oov_vector, oov_vector_type=oov_vector_type,
17 | padding=padding, pad_vector=pad_vector, pad_vector_type=pad_vector_type,
18 | max_sequence_length=max_sequence_length, dimension=dimension,
19 | verbose=verbose)
20 |
21 | def load_model(self, dest_dir, src=None, trainable=True, process=True, verbose=0):
22 | if src is None:
23 | src = self.GLOVE_COMMON_CRAWL_MODEL_URL
24 |
25 | dest_file = os.path.basename(src)
26 |
27 | file_path = self.download(
28 | src=src, dest_dir=dest_dir, dest_file=None,
29 | uncompress=True, housekeep=False, verbose=verbose)
30 |
31 | dest_file = dest_file.replace('.zip', '.txt')
32 | # point model_path at the extracted .txt file, which downstream consumers read
33 | self.model_path = dest_dir + dest_file
34 | 
35 | if process and self.is_file_exist(dest_dir + dest_file):
36 | with open(dest_dir + dest_file, encoding="utf8" ) as f:
37 | lines = f.readlines()
38 |
39 | for line in lines:
40 | line_contents = line.split()
41 | word = line_contents[0]
42 | self.model[word] = np.array([float(val) for val in line_contents[1:]])
43 |
44 | return self.model
45 |
46 | def uncompress(self, file_path):
47 | self.unzip(file_path)
48 |
49 | def encode(self, sentences):
50 | preds = np.empty([len(sentences), self.max_sequence_length, self.dimension])
51 |
52 | for i, words in enumerate(sentences):
53 | pred = np.empty([self.max_sequence_length, self.dimension])
54 | cnt = 0
55 |
56 | for word in words:
57 | if self.is_vector_exist(word):
58 | pred[cnt] = self.model[word]
59 | cnt += 1
60 | elif self.handle_oov:
61 | pred[cnt] = self.oov_vector
62 | cnt += 1
63 | 
64 | if cnt >= self.max_sequence_length:
65 | break
66 | 
67 | if self.padding and (cnt < self.max_sequence_length):
68 | for _ in range(0, self.max_sequence_length - cnt):  # do not shadow the outer sentence index i
69 | pred[cnt] = self.pad_vector
70 | cnt += 1
71 | 
72 | preds[i] = pred
73 |
74 |
75 | return preds
--------------------------------------------------------------------------------
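
Usage note (not part of the repository): a minimal sketch of GloVeEmbeddings; the destination directory is an illustrative assumption, and the default glove.42B.300d archive is several gigabytes, so the first call is slow.

    from aion.embeddings.glove import GloVeEmbeddings

    glove = GloVeEmbeddings(max_sequence_length=10, dimension=300)
    glove.load_model(dest_dir='../model/text/stanford/glove/', process=True)

    # encode() takes pre-tokenized sentences and returns an array of shape
    # (number of sentences, max_sequence_length, dimension)
    vectors = glove.encode([['hello', 'world'], ['nice', 'to', 'meet', 'you']])
    print(vectors.shape)  # (2, 10, 300)
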
/aion/embeddings/doc2vec.py:
--------------------------------------------------------------------------------
1 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
2 |
3 | from .document_embeddings import DocumentEmbeddings
4 |
5 |
6 | class Doc2VecEmbeddings(DocumentEmbeddings):
7 | def __init__(self,
8 | merge_mode="concat", algorithms="dm",
9 | word_dimension=300, min_word_count=1,
10 | word_window=10, n_job=4,
11 | train_epoch=10, infer_epoch=5,
12 | infer_alpha=0.1, infer_min_alpha=0.0001,
13 | verbose=0):
14 | super().__init__(verbose=verbose)
15 |
16 | self.merge_mode = merge_mode
17 | if merge_mode == 'concat':
18 | self.dm_concat = 1
19 | self.dm_mean = None
20 | elif merge_mode == 'mean':
21 | self.dm_concat = None
22 | self.dm_mean = 1
23 | else:
24 | raise Exception('merge_mode only allows either concat or mean')
25 |
26 | self.algorithms = algorithms
27 | if algorithms == 'dm':
28 | self.dm = 1
29 | elif algorithms == 'dbow':
30 | self.dm = 0
31 |
32 | self.word_dimension = word_dimension
33 | self.min_word_count = min_word_count
34 | self.word_window = word_window
35 | self.n_job = n_job
36 | self.train_epoch = train_epoch
37 | self.infer_epoch = infer_epoch
38 | self.infer_alpha = infer_alpha
39 | self.infer_min_alpha = infer_min_alpha
40 |
41 | self.vocab_size = 0
42 | self.word2idx = {}
43 |
44 | def build_vocab(self, documents, training=True, tokenize=True):
45 | if tokenize:
46 | docs = [self._tokenizer_space(document) for document in documents]
47 | else:
48 | docs = documents
49 |
50 | vocab = {}
51 | for words in docs:
52 | for word in words:
53 | if word not in vocab:
54 | vocab[word] = 1
55 |
56 | if training:
57 | self.vocab_size = len(vocab)
58 |
59 |
60 | docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
61 | return docs
62 |
63 | def train(self, documents):
64 | self.model = Doc2Vec(
65 | documents, dm_concat=self.dm_concat, dm_mean=self.dm_mean,
66 | dm=self.dm, vector_size=self.word_dimension,
67 | window=self.word_window, min_count=self.min_word_count,
68 | workers=self.n_job)
69 |
70 | self.model.train(
71 | documents, total_words=self.vocab_size,
72 | epochs=self.train_epoch)
73 |
74 | def encode(self, documents, tokenize=True):
75 | if tokenize:
76 | docs = [self._tokenizer_space(document) for document in documents]
77 | else:
78 | docs = documents
79 |
80 | docs = [
81 | self.model.infer_vector(
82 | document, alpha=self.infer_alpha,
83 | min_alpha=self.infer_min_alpha,
84 | steps=self.infer_epoch)
85 | for document in docs
86 | ]
87 |
88 | return docs
89 |
90 |
91 |
92 |
--------------------------------------------------------------------------------
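
Usage note (not part of the repository): a minimal sketch of Doc2VecEmbeddings with toy documents; real corpora should be far larger than this for the inferred vectors to be meaningful.

    from aion.embeddings.doc2vec import Doc2VecEmbeddings

    docs = ['i like this movie very much', 'this film is not bad', 'the weather is good today']

    doc2vec = Doc2VecEmbeddings(algorithms='dm', word_dimension=50, train_epoch=20)
    tagged_docs = doc2vec.build_vocab(docs, tokenize=True)  # returns gensim TaggedDocument objects
    doc2vec.train(tagged_docs)
    vectors = doc2vec.encode(docs, tokenize=True)           # one 50-dimensional vector per document
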
/sample/util/nlp-util-spell_corrector.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Ingestion"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "from collections import Counter\n",
19 | "from sklearn.datasets import fetch_20newsgroups\n",
20 | "import re\n",
21 | "\n",
22 | "\n",
23 | "corpus = []\n",
24 | "for line in fetch_20newsgroups().data:\n",
25 | " line = line.replace('\\n', ' ').replace('\\t', ' ').lower()\n",
26 | " line = re.sub('[^a-z ]', ' ', line)\n",
27 | " tokens = line.split(' ')\n",
28 | " tokens = [token for token in tokens if len(token) > 0]\n",
29 | " corpus.extend(tokens)\n",
30 | "\n",
31 | "corpus = Counter(corpus)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "%reload_ext autoreload\n",
41 | "%autoreload 2\n",
42 | "\n",
43 | "import sys, os\n",
44 | "def add_aion(curr_path=None):\n",
45 | " if curr_path is None:\n",
46 | " dir_path = os.getcwd()\n",
47 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n",
48 | " if target_path not in sys.path:\n",
49 | "# print('Added %s into sys.path.' % (target_path))\n",
50 | " sys.path.insert(0, target_path)\n",
51 | " \n",
52 | "add_aion()"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "# SpellCorrector"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 3,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "Known Result: set()\n",
72 | "Edit1 Result: {'edward', 'edwards'}\n",
73 | "Edit2 Result: {'gedwards', 'edward', 'eduard', 'edvard', 'tedward', 'edgardo', 'edwards', 'tedwards'}\n"
74 | ]
75 | },
76 | {
77 | "data": {
78 | "text/plain": [
79 | "'edward'"
80 | ]
81 | },
82 | "execution_count": 3,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "from aion.util.spell_corrector import SpellCorrector\n",
89 | "\n",
90 | "spell_corrector = SpellCorrector(dictionary=corpus, verbose=1)\n",
91 | "spell_corrector.correction('edwardd')"
92 | ]
93 | }
94 | ],
95 | "metadata": {
96 | "kernelspec": {
97 | "display_name": "Python 3",
98 | "language": "python",
99 | "name": "python3"
100 | },
101 | "language_info": {
102 | "codemirror_mode": {
103 | "name": "ipython",
104 | "version": 3
105 | },
106 | "file_extension": ".py",
107 | "mimetype": "text/x-python",
108 | "name": "python",
109 | "nbconvert_exporter": "python",
110 | "pygments_lexer": "ipython3",
111 | "version": "3.5.2"
112 | }
113 | },
114 | "nbformat": 4,
115 | "nbformat_minor": 2
116 | }
117 |
--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | import os
9 | import numpy as np
10 | import torch
11 |
12 |
13 | def get_batch(batch, word_vec, emb_dim=300):
14 | # sent in batch in decreasing order of lengths (bsize, max_len, word_dim)
15 | lengths = np.array([len(x) for x in batch])
16 | max_len = np.max(lengths)
17 | embed = np.zeros((max_len, len(batch), emb_dim))
18 |
19 | for i in range(len(batch)):
20 | for j in range(len(batch[i])):
21 | embed[j, i, :] = word_vec[batch[i][j]]
22 |
23 | return torch.from_numpy(embed).float(), lengths
24 |
25 |
26 | def get_word_dict(sentences):
27 | # create vocab of words
28 | word_dict = {}
29 | for sent in sentences:
30 | for word in sent.split():
31 | if word not in word_dict:
32 | word_dict[word] = ''
33 | word_dict['<s>'] = ''
34 | word_dict['</s>'] = ''
35 | word_dict['<p>'] = ''
36 | return word_dict
37 |
38 |
39 | def get_glove(word_dict, glove_path):
40 | # create word_vec with glove vectors
41 | word_vec = {}
42 | with open(glove_path) as f:
43 | for line in f:
44 | word, vec = line.split(' ', 1)
45 | if word in word_dict:
46 | word_vec[word] = np.array(list(map(float, vec.split())))
47 | print('Found {0}(/{1}) words with glove vectors'.format(
48 | len(word_vec), len(word_dict)))
49 | return word_vec
50 |
51 |
52 | def build_vocab(sentences, glove_path):
53 | word_dict = get_word_dict(sentences)
54 | word_vec = get_glove(word_dict, glove_path)
55 | print('Vocab size : {0}'.format(len(word_vec)))
56 | return word_vec
57 |
58 |
59 | def get_nli(data_path):
60 | s1 = {}
61 | s2 = {}
62 | target = {}
63 |
64 | dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
65 |
66 | for data_type in ['train', 'dev', 'test']:
67 | s1[data_type], s2[data_type], target[data_type] = {}, {}, {}
68 | s1[data_type]['path'] = os.path.join(data_path, 's1.' + data_type)
69 | s2[data_type]['path'] = os.path.join(data_path, 's2.' + data_type)
70 | target[data_type]['path'] = os.path.join(data_path,
71 | 'labels.' + data_type)
72 |
73 | s1[data_type]['sent'] = [line.rstrip() for line in
74 | open(s1[data_type]['path'], 'r')]
75 | s2[data_type]['sent'] = [line.rstrip() for line in
76 | open(s2[data_type]['path'], 'r')]
77 | target[data_type]['data'] = np.array([dico_label[line.rstrip('\n')]
78 | for line in open(target[data_type]['path'], 'r')])
79 |
80 | assert len(s1[data_type]['sent']) == len(s2[data_type]['sent']) == \
81 | len(target[data_type]['data'])
82 |
83 | print('** {0} DATA : Found {1} pairs of {2} sentences.'.format(
84 | data_type.upper(), len(s1[data_type]['sent']), data_type))
85 |
86 | train = {'s1': s1['train']['sent'], 's2': s2['train']['sent'],
87 | 'label': target['train']['data']}
88 | dev = {'s1': s1['dev']['sent'], 's2': s2['dev']['sent'],
89 | 'label': target['dev']['data']}
90 | test = {'s1': s1['test']['sent'], 's2': s2['test']['sent'],
91 | 'label': target['test']['data']}
92 | return train, dev, test
93 |
--------------------------------------------------------------------------------
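
Usage note (not part of the repository): a minimal sketch of how build_vocab and get_batch from data.py fit together; the GloVe path is an illustrative assumption, and get_batch assumes every token in every sentence has a GloVe vector. The import path assumes the infersent_lib directory is on sys.path.

    from data import build_vocab, get_batch

    sentences = ['a man is playing a guitar', 'a woman is cooking']
    word_vec = build_vocab(sentences, glove_path='dataset/GloVe/glove.840B.300d.txt')

    batch = [s.split() for s in sentences]           # pre-tokenized sentences
    embed, lengths = get_batch(batch, word_vec, emb_dim=300)
    print(embed.shape)                               # torch.Size([max_len, batch_size, 300])
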
/aion/embeddings/embeddings.py:
--------------------------------------------------------------------------------
1 | import datetime, os, urllib, zipfile
2 |
3 |
4 | class Embeddings:
5 | def __init__(self, verbose=0):
6 | self.verbose = verbose
7 | self.model = {}
8 | self.model_path = ''
9 |
10 | def _log_time(self, status, msg, verbose):
11 | if self.verbose >= 10 or verbose >= 10:
12 | print('%s. [%s] %s' % (datetime.datetime.now(), status, msg))
13 |
14 | def download(self, src, dest_dir, dest_file, uncompress, housekeep=False, verbose=0):
15 | if not os.path.exists(dest_dir):
16 | os.makedirs(dest_dir)
17 |
18 | if dest_file is None:
19 | dest_file = os.path.basename(src)
20 |
21 | if not self.is_file_exist(dest_dir + dest_file):
22 | self._log_time(status='DOWNLOAD', msg='From '+src+' to '+dest_dir+dest_file, verbose=verbose)
23 | file = urllib.request.urlopen(src)
24 | with open(dest_dir + dest_file,'wb') as output:
25 | output.write(file.read())
26 | else:
27 | self._log_time(status='FOUND', msg=dest_file+' in '+dest_dir, verbose=verbose)
28 |
29 | if uncompress:
30 | self.uncompress(dest_dir + dest_file)
31 |
32 | if uncompress and housekeep:
33 | self.housekeep(dest_dir + dest_file)
34 |
35 |
36 | return dest_dir + dest_file
37 |
38 | """
39 | File related
40 | """
41 |
42 | def uncompress(self, file_path):
43 | raise NotImplementedError()
44 |
45 | def unzip(self, file_path):
46 | dest_dir = os.path.dirname(file_path)
47 | with zipfile.ZipFile(file_path, "r") as zip_ref:
48 | zip_ref.extractall(dest_dir)
49 |
50 | def housekeep(self, file_path):
51 | os.remove(file_path)
52 |
53 | def is_file_exist(self, file_path):
54 | return os.path.exists(file_path)
55 |
56 | def save(self):
57 | raise NotImplementedError()
58 | 
59 | def load(self):
60 | raise NotImplementedError()
61 |
62 | """
63 | Model related
64 | """
65 |
66 | def get_model(self):
67 | return self.model
68 |
69 | def set_model(self, model):
70 | self.model = model
71 |
72 | def load(self, src=None, dest_dir=None, trainable=True, verbose=0):
73 | raise NotImplementedError()
74 | 
75 | """
76 | Vocabulary related
77 | """
78 | 
79 | def load_vocab(self, **kwargs):
80 | raise NotImplementedError()
81 | 
82 | def build_vocab(self):
83 | raise NotImplementedError()
84 | 
85 | def get_vocab(self):
86 | raise NotImplementedError()
87 |
88 | def _tokenizer_space(self, sentence):
89 | return sentence.split(' ')
90 |
91 | """
92 | Vector related
93 | """
94 |
95 | def train(self):
96 | raise NotImplementedError()
97 | 
98 | def encode(self, sentences):
99 | raise NotImplementedError()
100 | 
101 | def visualize(self):
102 | raise NotImplementedError()
103 | 
104 | """
105 | Network related
106 | """
107 | 
108 | def to_numpy_layer(self):
109 | raise NotImplementedError()
110 | 
111 | def to_keras_layer(self):
112 | raise NotImplementedError()
113 | 
114 | def to_tensorflow_layer(self):
115 | raise NotImplementedError()
116 | 
117 | def to_pytorch_layer(self):
118 | raise NotImplementedError()
--------------------------------------------------------------------------------
/sample/util/nlp-util-symspell.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Ingestion"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "from collections import Counter\n",
19 | "from sklearn.datasets import fetch_20newsgroups\n",
20 | "import re\n",
21 | "\n",
22 | "\n",
23 | "corpus = []\n",
24 | "for line in fetch_20newsgroups().data:\n",
25 | " line = line.replace('\\n', ' ').replace('\\t', ' ').lower()\n",
26 | " line = re.sub('[^a-z ]', ' ', line)\n",
27 | " tokens = line.split(' ')\n",
28 | " tokens = [token for token in tokens if len(token) > 0]\n",
29 | " corpus.extend(tokens)\n",
30 | "\n",
31 | "corpus = Counter(corpus)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "%reload_ext autoreload\n",
41 | "%autoreload 2\n",
42 | "\n",
43 | "import sys, os\n",
44 | "def add_aion(curr_path=None):\n",
45 | " if curr_path is None:\n",
46 | " dir_path = os.getcwd()\n",
47 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n",
48 | " if target_path not in sys.path:\n",
49 | "# print('Added %s into sys.path.' % (target_path))\n",
50 | " sys.path.insert(0, target_path)\n",
51 | " \n",
52 | "add_aion()"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "collapsed": true
59 | },
60 | "source": [
61 | "# Symspell"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "from aion.util.spell_check import SymSpell"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "Size of dictionary: 89038\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "corpus_dir = '../../data/'\n",
90 | "corpus_file_name = 'spell_check_dictionary.txt'\n",
91 | "\n",
92 | "symspell = SymSpell(verbose=10)\n",
93 | "symspell.build_vocab(\n",
94 | " dictionary=corpus, \n",
95 | " file_dir=corpus_dir, file_name=corpus_file_name)\n",
96 | "\n",
97 | "symspell.load_vocab(corpus_file_path=corpus_dir+corpus_file_name)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "Correct single word"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 5,
110 | "metadata": {},
111 | "outputs": [
112 | {
113 | "name": "stdout",
114 | "output_type": "stream",
115 | "text": [
116 | "[{'word': 'edward', 'distance': 1, 'count': 154}, {'word': 'edwards', 'distance': 1, 'count': 50}]\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "results = symspell.correction(word='edwarda')\n",
122 | "print(results)"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "Correct sentence"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 6,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "name": "stdout",
139 | "output_type": "stream",
140 | "text": [
141 | "[{'word': 'hello i am ed area', 'distance': 3}]\n"
142 | ]
143 | }
144 | ],
145 | "source": [
146 | "results = symspell.corrections(sentence='Hello I am Edarda')\n",
147 | "print(results)"
148 | ]
149 | }
150 | ],
151 | "metadata": {
152 | "kernelspec": {
153 | "display_name": "Python 3",
154 | "language": "python",
155 | "name": "python3"
156 | },
157 | "language_info": {
158 | "codemirror_mode": {
159 | "name": "ipython",
160 | "version": 3
161 | },
162 | "file_extension": ".py",
163 | "mimetype": "text/x-python",
164 | "name": "python",
165 | "nbconvert_exporter": "python",
166 | "pygments_lexer": "ipython3",
167 | "version": "3.5.2"
168 | }
169 | },
170 | "nbformat": 4,
171 | "nbformat_minor": 2
172 | }
173 |
--------------------------------------------------------------------------------
/sample/preprocessing/nlp-preprocessing-string_matching-fuzzywuzzy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "%reload_ext autoreload\n",
17 | "%autoreload 2\n",
18 | "\n",
19 | "import sys, os\n",
20 | "def add_aion(curr_path=None):\n",
21 | " if curr_path is None:\n",
22 | " dir_path = os.getcwd()\n",
23 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n",
24 | " if target_path not in sys.path:\n",
25 | " sys.path.insert(0, target_path)\n",
26 | " \n",
27 | "add_aion()"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "2018-12-27 19:39:50.455930. [DOWNLOAD] From https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.csv to ../../data/location/country.csv\n",
40 | "0 : Afghanistan\n",
41 | "1 : Åland Islands\n",
42 | "2 : Albania\n",
43 | "3 : Algeria\n",
44 | "4 : American Samoa\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "import pandas as pd\n",
50 | "from aion.helper.file_helper import FileHelper\n",
51 | "\n",
52 | "file_helper = FileHelper()\n",
53 | "countries_file_path = file_helper.download(\n",
54 | " src='https://raw.githubusercontent.com/umpirsky/country-list/master/data/en/country.csv', \n",
55 | " dest_dir='../../data/location/', dest_file='country.csv', force_download=True)\n",
56 | "\n",
57 | "country_df = pd.read_csv(countries_file_path)\n",
58 | "countries = country_df['value'].tolist()\n",
59 | "\n",
60 | "for i, country in enumerate(countries[:5]):\n",
61 | " print(i, \":\", country)"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Fuzzywuzzy"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from fuzzywuzzy import fuzz\n",
78 | "from fuzzywuzzy import process"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 5,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "[('Hong Kong SAR China', 90), ('Congo - Kinshasa', 57)]\n",
91 | "[('Japan', 60), ('Yemen', 60)]\n",
92 | "[('United States', 96), ('United Arab Emirates', 86)]\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# Default scorer is Weighed Ratio\n",
98 | "for location in ['Hong Kong', 'jepen', 'United tates']:\n",
99 | " result = process.extract(location, countries, limit=2)\n",
100 | " print(result)"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 6,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "data": {
110 | "text/plain": [
111 | "[('Edwards', 92), ('Edwards2', 86), ('drawdE', 50)]"
112 | ]
113 | },
114 | "execution_count": 6,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "# Ratio\n",
121 | "process.extract('Edward', ['Edwards', 'Edwards2', 'drawdE'], scorer=fuzz.ratio)"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 7,
127 | "metadata": {},
128 | "outputs": [
129 | {
130 | "data": {
131 | "text/plain": [
132 | "[('Hong Kong SAR China', 64), ('Congo - Kinshasa', 48), ('Mongolia', 47)]"
133 | ]
134 | },
135 | "execution_count": 7,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "# Partial Ratio\n",
142 | "process.extract('Hong Kong', countries, scorer=fuzz.QRatio, limit=3)"
143 | ]
144 | }
145 | ],
146 | "metadata": {
147 | "kernelspec": {
148 | "display_name": "Python 3",
149 | "language": "python",
150 | "name": "python3"
151 | },
152 | "language_info": {
153 | "codemirror_mode": {
154 | "name": "ipython",
155 | "version": 3
156 | },
157 | "file_extension": ".py",
158 | "mimetype": "text/x-python",
159 | "name": "python",
160 | "nbconvert_exporter": "python",
161 | "pygments_lexer": "ipython3",
162 | "version": "3.5.5"
163 | }
164 | },
165 | "nbformat": 4,
166 | "nbformat_minor": 2
167 | }
168 |
--------------------------------------------------------------------------------
/aion/embeddings/elmo.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import tensorflow as tf
4 | import tensorflow_hub as tf_hub
5 |
6 | from .word_embeddings import WordEmbeddings
7 |
8 |
9 | class ELMoEmbeddings(WordEmbeddings):
10 | ELMO_MODEL_V2_URL = "https://tfhub.dev/google/elmo/2"
11 |
12 | def __init__(self, layer, verbose=0):
13 | super().__init__(verbose=verbose)
14 | self.layer = layer
15 |
16 | def _set_tf_log_level(self, verbose):
17 | if verbose >= 30:
18 | tf.logging.set_verbosity(tf.logging.INFO)
19 | elif verbose >= 20:
20 | tf.logging.set_verbosity(tf.logging.WARN)
21 | elif verbose >= 10:
22 | tf.logging.set_verbosity(tf.logging.DEBUG)
23 | else:
24 | tf.logging.set_verbosity(tf.logging.ERROR)
25 |
26 | def load(self, src=None, dest_dir=None, trainable=True, verbose=0):
27 | self._log_time(status='LOADING', msg='file', verbose=verbose)
28 | self._set_tf_log_level(verbose)
29 |
30 | if src is None:
31 | src = self.ELMO_MODEL_V2_URL
32 |
33 | if dest_dir is not None:
34 | os.environ["TFHUB_CACHE_DIR"] = dest_dir
35 |
36 | self.model = tf_hub.Module(src, trainable=trainable)
37 |
38 | self._log_time(status='LOADED', msg='', verbose=verbose)
39 |
40 | return self.model
41 |
42 | def to_keras_layer(self, x):
43 | # Source: https://github.com/strongio/keras-elmo/blob/master/Elmo%20Keras.ipynb
44 | '''
45 | For signature and layer parameters, you can visit https://alpha.tfhub.dev/google/elmo/2
46 | '''
47 | return self.model(
48 | tf.squeeze(tf.cast(x, tf.string)),
49 | signature="default", as_dict=True)[self.layer]
50 |
51 |
52 | # import operator
53 | # import datetime
54 | # import re
55 |
56 | # from bilm.data import Vocabulary
57 |
58 | # class ELMoEmbeddings:
59 | # def __init__(self, tokenizer=None, verbose=0):
60 | # self.verbose = verbose
61 |
62 | # self.tokenizer = self.get_tokenizer(tokenizer)
63 |
64 | # def _space_tokenizer(self, sentence):
65 | # # There is some unicode from source data
66 | # # return [t.encode('ascii', 'ignore').decode('ascii') for t in sentence.encode('ascii', 'ignore').decode('ascii').split(' ') if t != '']
67 | # # return [t.encode('ascii', 'ignore').decode('ascii') for t in sentence.split(' ') if t != '']
68 | # return [t for t in sentence.split(' ') if t != '']
69 |
70 | # def _spacy_tokenizer(self, sentence, model=None):
71 | # if model is None:
72 | # import spacy
73 | # model = spacy.load('en')
74 |
75 | # return [t.text.encode('ascii', 'ignore') for t in model(str(sentence)) if t.text != '']
76 |
77 | # def get_tokenizer(self, tokenizer):
78 | # if tokenizer is None or tokenizer == 'space':
79 | # tokenizer = self._space_tokenizer
80 | # elif tokenizer == 'spacy':
81 | # tokenizer = self._spacy_tokenizer
82 |
83 | # return tokenizer
84 |
85 | # def preprocess(self, sentence):
86 | # normalized_space = sentence.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
87 | # normalized_unicode = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', ' ', normalized_space)
88 |
89 | # normalized_text = re.sub(' +',' ', normalized_unicode)
90 |
91 | # return normalized_text
92 |
93 | # def get_basic_elements(self, mode):
94 | # if mode == 'build':
95 | # return ['', '', '', '']
98 | # return []
99 |
100 | # def build_vocab(self, sentences, mode, vocab_file_path):
101 | # word_dict = {}
102 |
103 | # basic_elements = self.get_basic_elements(mode)
104 |
105 | # for sentence in sentences:
106 | # sentence = self.preprocess(sentence)
107 | # for w in self.tokenizer(sentence):
108 |
109 | # if w not in word_dict:
110 | # word_dict[w] = 0
111 | # word_dict[w] += 1
112 |
113 | # word_dict = sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True)
114 | # print('Total Word: %d' % (len(word_dict)))
115 |
116 | # with open(vocab_file_path, 'w') as f:
117 | # for item in basic_elements:
118 | # f.write("%s\n" % item)
119 |
120 | # for word, count in word_dict:
121 | # # Ximenez, characters <-- finding these word to check unicode issue
122 | # # print([word])
123 | # if word != '':
124 | # f.write("%s\n" % word)
125 |
126 | # def build_data(self, sentences, data_file_path):
127 | # with open(data_file_path, 'w') as f:
128 | # for sentence in sentences:
129 | # sentence = self.preprocess(sentence)
130 | # tokens = self.tokenizer(sentence)
131 | # if len(tokens) > 0:
132 | # f.write("%s\n" % ' '.join(str(tokens)))
--------------------------------------------------------------------------------
/aion/embeddings/skip_thoughts.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Edward Ma. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | import os, datetime
17 | from .sentence_embeddings import SentenceEmbeddings
18 | class SkipThoughtsEmbeddingsTorch(SentenceEmbeddings):
19 | DICTIONARY_URL = "http://www.cs.toronto.edu/~rkiros/models/dictionary.txt"
20 | UNISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/utable.npy"
21 | BISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/btable.npy"
22 | UNISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz"
23 | BISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz"
24 | UNISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl"
25 | BISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl"
26 |
27 | def __init__(self, model_dir, algorithm='uniskip', tokenizer=None, verbose=0):
28 | super().__init__(verbose=verbose)
29 |
30 | from torch import LongTensor
31 | from torch.autograd import Variable
32 | from skipthoughts import UniSkip, BiSkip
33 | self._LongTensor, self._Variable, self._UniSkip, self._BiSkip = LongTensor, Variable, UniSkip, BiSkip  # keep the lazy imports for other methods
34 | self.model_dir = model_dir
35 | self.algorithm = algorithm
36 | self.vocab = {}
37 | self.vocabs = []
38 | if tokenizer is None:
39 | self.tokenizer = self._tokenizer_space
40 | else:
41 | self.tokenizer = tokenizer
42 | self.max_sentence_len = -1
43 |
44 | def downloads(self, dest_dir, sources=None):
45 | if sources is None:
46 | sources = [self.DICTIONARY_URL, self.UNISKIP_URL, self.BISKIP_URL,
47 | self.UNISKIPS_URL, self.BISKIPS_URL, self.UNISKIPS_PKL_URL,
48 | self.BISKIPS_PKL_URL]
49 |
50 | for src in sources:
51 | self.download(src=src, dest_dir=dest_dir, dest_file=None, uncompress=False)
52 |
53 | def build_vocab(self, sentences, clear_vocab=True, max_sentence_len=-1):
54 | if clear_vocab:
55 | self.vocab = {}
56 |
57 | self.max_sentence_len = max_sentence_len
58 |
59 | for sentence in sentences:
60 | words = self.tokenizer(sentence)
61 | if max_sentence_len == -1:
62 | self.max_sentence_len = max(self.max_sentence_len, len(words))
63 |
64 | for word in words:
65 | if word not in self.vocab:
66 | self.vocabs.append(word)
67 | # Reserve the first one for padding
68 | self.vocab[word] = len(self.vocab) + 1
69 |
70 | def process(self, sentences):
71 | word_id_sentences = []
72 | for sentence in sentences:
73 | word_ids = [self.vocab[w] for w in self.tokenizer(sentence) if w in self.vocab]
74 |
75 | if self.max_sentence_len > len(word_ids):
76 | for i in range(0, self.max_sentence_len-len(word_ids)):
77 | word_ids.append(0)
78 | elif self.max_sentence_len < len(word_ids):
79 | word_ids = word_ids[:self.max_sentence_len]
80 |
81 | word_id_sentences.append(word_ids)
82 |
83 | return word_id_sentences
84 |
85 | def get_algorithm(self, words, model_dir=None):
86 | if model_dir is None:
87 | model_dir = self.model_dir
88 |
89 | if self.algorithm == 'uniskip':
90 | return self._UniSkip(model_dir, words)
91 | else:
92 | return self._BiSkip(model_dir, words)
93 |
94 | def to_numpy_layer(self, layer):
95 | return layer.detach().numpy()
96 |
97 | def encode(self, sentences, output_format='torch'):
98 | transformed_sentences = self.process(sentences)
99 |
100 | algo = self.get_algorithm(self.vocabs)
101 | inputs = self._Variable(self._LongTensor(transformed_sentences))
102 | outputs = algo(inputs, lengths=[len(words) for words in transformed_sentences])
103 | 
104 | if output_format == 'np':
105 | return self.to_numpy_layer(outputs)
106 | elif output_format == 'torch':
107 | return outputs
108 |
109 | def predict_batch(self, sentences, output_format='torch', batch_size=1000):
110 | batches = [sentences[i * batch_size:(i + 1) * batch_size] for i in range((len(sentences) + batch_size-1) // batch_size)]
111 | import numpy as np, torch  # lazy imports, matching the deferred imports in __init__
112 | results = []
113 | for batch in batches:
114 | results.append(self.encode(sentences=batch, output_format=output_format))
115 |
116 | if output_format == 'np':
117 | return np.concatenate(results, axis=0)
118 | elif output_format == 'torch':
119 | return torch.cat(results, 0)
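120 | 
121 | # Minimal usage sketch (comments only, not executed). The class name below is an
122 | # assumption; substitute whatever name this module actually exposes, and point
123 | # `model_dir` at the pre-trained files fetched by `downloads()`:
124 | #
125 | #   emb = SkipThoughtsEmbeddings(model_dir='/path/to/skip_thoughts/', algorithm='uniskip')
126 | #   emb.build_vocab(sentences)
127 | #   vectors = emb.encode(sentences, output_format='np')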
--------------------------------------------------------------------------------
/aion/util/spell_check.py:
--------------------------------------------------------------------------------
1 | import re, os
2 | from collections import Counter
3 | from symspellpy.symspellpy import SymSpell as SymSpellPy, Verbosity
4 |
5 | class SpellCheck:
6 | def __init__(self, dictionary=None, verbose=0):
7 | self.verbose = verbose
8 | self.dictionary = dictionary
9 |
10 | def correction(self, text):
11 | return ''
12 |
13 |
14 | '''
15 | Source: https://norvig.com/spell-correct.html
16 | '''
17 | class SpellCorrector(SpellCheck):
18 | def __init__(self, dictionary, verbose=0):
19 | super().__init__(dictionary=dictionary, verbose=verbose)
20 |
21 | def words(self, text):
22 | return re.findall(r'\w+', text.lower())
23 |
24 | def P(self, word):
25 | "Probability of `word`."
26 | N = sum(self.dictionary.values())
27 | return self.dictionary.get(word, 0) / N  # .get() keeps unseen words at zero probability
28 |
29 | def correction(self, word):
30 | "Most probable spelling correction for word."
31 | return max(self.candidates(word), key=self.P)
32 |
33 | def candidates(self, word, verbose=0):
34 | "Generate possible spelling corrections for word."
35 |
36 | known_result = self.known([word])
37 | edit1_result = self.known(self.edits1(word))
38 | edit2_result = self.known(self.edits2(word))
39 |
40 | if self.verbose > 0 or verbose > 0:
41 | print('Known Result: ', known_result)
42 | print('Edit1 Result: ', edit1_result)
43 | print('Edit2 Result: ', edit2_result)
44 |
45 | return (known_result or edit1_result or edit2_result or [word])
46 |
47 | def known(self, words):
48 | "The subset of `words` that appear in the dictionary of WORDS."
49 | return set(w for w in words if w in self.dictionary)
50 |
51 | def edits1(self, word):
52 | "All edits that are one edit away from `word`."
53 | letters = 'abcdefghijklmnopqrstuvwxyz'
54 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
55 | deletes = [L + R[1:] for L, R in splits if R]
56 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
57 | replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
58 | inserts = [L + c + R for L, R in splits for c in letters]
59 | return set(deletes + transposes + replaces + inserts)
60 |
61 | def edits2(self, word):
62 | "All edits that are two edits away from `word`."
63 | return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))
64 |
65 |
66 | class SymSpell(SpellCheck):
67 | def __init__(self, dictionary_file_path='', dictionary=None, verbose=0):
68 | super().__init__(dictionary=dictionary, verbose=verbose)
69 |
70 | self.dictionary_file_path = dictionary_file_path
71 | self.model = None
72 |
73 | def load_vocab(self, corpus_file_path, max_edit_distance_dictionary=2, prefix_length=5):
74 | #initial_capacity = len(corpus)
75 |
76 | #sym_spell = SymSpellPy(
77 | # initial_capacity, max_edit_distance_dictionary,
78 | # prefix_length)
79 | self.model = SymSpellPy(
80 | max_dictionary_edit_distance=max_edit_distance_dictionary,
81 | prefix_length=prefix_length)
82 |
83 | term_index = 0 # column of the term in the dictionary text file
84 | count_index = 1 # column of the term frequency in the dictionary text file
85 | if not self.model.load_dictionary(corpus_file_path, term_index, count_index):
86 | print("Dictionary file not found")
87 |
88 | def build_vocab(self, dictionary, file_dir, file_name, verbose=0):
89 | if not os.path.exists(file_dir):
90 | os.makedirs(file_dir)
91 |
92 | """
93 | Data format:
94 | token, frequency
95 | Example:
96 | edward 154
97 | edwards 50
98 | ...
99 | """
100 | if self.verbose > 3 or verbose > 3:
101 | print('Size of dictionary: %d' % len(dictionary))
102 |
103 | with open(file_dir + file_name, "w") as text_file:
104 | for token, count in dictionary.items():
105 | text_file.write(token + ' ' + str(count))
106 | text_file.write('\n')
107 |
108 | def correction(self, word, max_edit_distance_lookup=2, mode='closest'):
109 | if mode == 'closest':
110 | suggestion_verbosity = Verbosity.CLOSEST
111 | elif mode == 'top':
112 | suggestion_verbosity = Verbosity.TOP
113 | elif mode == 'all':
114 | suggestion_verbosity = Verbosity.ALL
115 |
116 | results = self.model.lookup(
117 | word, suggestion_verbosity, max_edit_distance_lookup)
118 |
119 | results = [{'word': suggestion.term, 'count': suggestion.count, 'distance': suggestion.distance} for suggestion in results]
120 | return results
121 |
122 | def corrections(self, sentence, max_edit_distance_lookup=2):
123 | normalized_sentence = (sentence.lower())
124 | results = self.model.lookup_compound(
125 | normalized_sentence, max_edit_distance_lookup)
126 |
127 | results = [{'word': suggestion.term, 'distance': suggestion.distance} for suggestion in results]
128 | return results
129 |
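130 | # Minimal usage sketch (comments only, not executed; 'big.txt' and the file
131 | # paths below are illustrative placeholders, not files shipped with this repo):
132 | #
133 | #   from collections import Counter
134 | #   import re
135 | #   tokens = re.findall(r'\w+', open('big.txt').read().lower())
136 | #   corrector = SpellCorrector(dictionary=Counter(tokens))
137 | #   corrector.correction('correctud')  # -> most probable known spelling
138 | #
139 | #   sym = SymSpell()
140 | #   sym.build_vocab(dictionary=Counter(tokens), file_dir='./', file_name='vocab.txt')
141 | #   sym.load_vocab(corpus_file_path='./vocab.txt')
142 | #   sym.correction(word='correctud', mode='closest')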
--------------------------------------------------------------------------------
/sample/embeddings/nlp-embeddings-document-doc2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Ingestion"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "Train: 2031\n",
20 | "Val: 226\n",
21 | "Test: 1502\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import numpy as np\n",
27 | "from sklearn.datasets import fetch_20newsgroups\n",
28 | "from sklearn.model_selection import train_test_split\n",
29 | "\n",
30 | "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
31 | "\n",
32 | "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n",
33 | "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n",
34 | "\n",
35 | "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n",
36 | "x_test = np.array(test_raw_df.data)\n",
37 | "y_test = test_raw_df.target\n",
38 | "\n",
39 | "# x_train = [x_train[:200] for x in x_train]\n",
40 | "\n",
41 | "print('Train:', len(x_train))\n",
42 | "print('Val:', len(x_val))\n",
43 | "print('Test:', len(x_test))"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "/data/jupyter/common\n",
56 | "Added /data/jupyter/common into sys.path.\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "%reload_ext autoreload\n",
62 | "%autoreload 2\n",
63 | "\n",
64 | "import sys, os\n",
65 | "def add_aion(curr_path=None):\n",
66 | " if curr_path is None:\n",
67 | " dir_path = os.getcwd()\n",
68 | " target_path = os.path.dirname(os.path.dirname(dir_path))\n",
69 | " print(target_path)\n",
70 | " if target_path not in sys.path:\n",
71 | " print('Added %s into sys.path.' % (target_path))\n",
72 | " sys.path.insert(0, target_path)\n",
73 | " \n",
74 | "add_aion()"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "# Model"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 3,
87 | "metadata": {
88 | "collapsed": true
89 | },
90 | "outputs": [],
91 | "source": [
92 | "from aion.embeddings.doc2vec import Doc2VecEmbeddings"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 5,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "2018-10-08 22:52:10.269082 start\n",
105 | "2018-10-08 22:53:30.387969 end\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "doc2vec_embs = Doc2VecEmbeddings()\n",
111 | "x_train_tokens = doc2vec_embs.build_vocab(documents=x_train)\n",
112 | "doc2vec_embs.train(x_train_tokens)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 8,
118 | "metadata": {
119 | "collapsed": true
120 | },
121 | "outputs": [],
122 | "source": [
123 | "x_train_t = doc2vec_embs.encode(documents=x_train)\n",
124 | "x_test_t = doc2vec_embs.encode(documents=x_test)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 9,
130 | "metadata": {
131 | "collapsed": true
132 | },
133 | "outputs": [],
134 | "source": [
135 | "from sklearn.linear_model import LogisticRegression\n",
136 | "\n",
137 | "model = LogisticRegression(solver='newton-cg', max_iter=1000)\n",
138 | "model.fit(x_train_t, y_train)\n",
139 | "\n",
140 | "y_pred = model.predict(x_test_t)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 10,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "Accuracy:52.80%\n",
153 | "Classification Report:\n",
154 | " precision recall f1-score support\n",
155 | "\n",
156 | " 0 0.56 0.17 0.26 319\n",
157 | " 1 0.82 0.63 0.72 389\n",
158 | " 2 0.85 0.31 0.45 396\n",
159 | " 3 0.38 0.93 0.54 398\n",
160 | "\n",
161 | "avg / total 0.66 0.53 0.50 1502\n",
162 | "\n"
163 | ]
164 | }
165 | ],
166 | "source": [
167 | "from sklearn.metrics import accuracy_score\n",
168 | "from sklearn.metrics import classification_report\n",
169 | "\n",
170 | "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n",
171 | "print('Classification Report:')\n",
172 | "print(classification_report(y_test, y_pred))"
173 | ]
174 | }
175 | ],
176 | "metadata": {
177 | "kernelspec": {
178 | "display_name": "Python 3",
179 | "language": "python",
180 | "name": "python3"
181 | },
182 | "language_info": {
183 | "codemirror_mode": {
184 | "name": "ipython",
185 | "version": 3
186 | },
187 | "file_extension": ".py",
188 | "mimetype": "text/x-python",
189 | "name": "python",
190 | "nbconvert_exporter": "python",
191 | "pygments_lexer": "ipython3",
192 | "version": "3.5.2"
193 | }
194 | },
195 | "nbformat": 4,
196 | "nbformat_minor": 2
197 | }
198 |
--------------------------------------------------------------------------------
/sample/nlp-embeddings-word-cove.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Ingestion"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "Train: 2031\n",
20 | "Val: 226\n",
21 | "Test: 1502\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import numpy as np\n",
27 | "from sklearn.datasets import fetch_20newsgroups\n",
28 | "from sklearn.model_selection import train_test_split\n",
29 | "\n",
30 | "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
31 | "\n",
32 | "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n",
33 | "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n",
34 | "\n",
35 | "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n",
36 | "x_test = np.array(test_raw_df.data)\n",
37 | "y_test = test_raw_df.target\n",
38 | "\n",
39 | "# x_train = [x_train[:200] for x in x_train]\n",
40 | "\n",
41 | "print('Train:', len(x_train))\n",
42 | "print('Val:', len(x_val))\n",
43 | "print('Test:', len(x_test))"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "Added /data/jupyter/common into sys.path.\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "%reload_ext autoreload\n",
61 | "%autoreload 2\n",
62 | "\n",
63 | "import sys, os\n",
64 | "def add_aion(curr_path=None):\n",
65 | " if curr_path is None:\n",
66 | " dir_path = os.getcwd()\n",
67 | " target_path = os.path.dirname(dir_path)\n",
68 | " if target_path not in sys.path:\n",
69 | " print('Added %s into sys.path.' % (target_path))\n",
70 | " sys.path.insert(0, target_path)\n",
71 | " \n",
72 | "add_aion()"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "# Model (Keras)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "metadata": {
86 | "collapsed": true
87 | },
88 | "outputs": [],
89 | "source": [
90 | "max_sequence_length = 200"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "name": "stderr",
100 | "output_type": "stream",
101 | "text": [
102 | "Using TensorFlow backend.\n"
103 | ]
104 | },
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "2018-10-06 16:04:42.665310. [FOUND] Keras_CoVe.h5 in ../model/text/salesforce/cove/\n"
110 | ]
111 | },
112 | {
113 | "name": "stderr",
114 | "output_type": "stream",
115 | "text": [
116 | "/anaconda/envs/py35/lib/python3.5/site-packages/keras/engine/saving.py:269: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually.\n",
117 | " warnings.warn('No training configuration found in save file: '\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "from aion.embeddings.cove import CoVeEmbeddings\n",
123 | "\n",
124 | "cove_embs = CoVeEmbeddings(\n",
125 | " word_embeddings_dir='../model/text/stanford/glove/', \n",
126 | " max_sequence_length=max_sequence_length, verbose=20)\n",
127 | "tmp = cove_embs.load_model(dest_dir='../model/text/salesforce/cove/')"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 5,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "x_train_t = cove_embs.encode(x_train)\n",
137 | "x_test_t = cove_embs.encode(x_test)"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 6,
143 | "metadata": {
144 | "collapsed": true
145 | },
146 | "outputs": [],
147 | "source": [
148 | "x_train_t2 = x_train_t.reshape(len(x_train_t), max_sequence_length*600)\n",
149 | "x_test_t2 = x_test_t.reshape(len(x_test_t), max_sequence_length*600)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 7,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stderr",
159 | "output_type": "stream",
160 | "text": [
161 | "/anaconda/envs/py35/lib/python3.5/site-packages/scipy/optimize/linesearch.py:313: LineSearchWarning: The line search algorithm did not converge\n",
162 | " warn('The line search algorithm did not converge', LineSearchWarning)\n",
163 | "/anaconda/envs/py35/lib/python3.5/site-packages/sklearn/utils/optimize.py:195: UserWarning: Line Search failed\n",
164 | " warnings.warn('Line Search failed')\n"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "from sklearn.linear_model import LogisticRegression\n",
170 | "\n",
171 | "model = LogisticRegression(solver='newton-cg')\n",
172 | "model.fit(x_train_t2, y_train)\n",
173 | "\n",
174 | "y_pred = model.predict(x_test_t2)"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "For sake of easier demonstration, I did not do any data preprocessing. It leads lots of OOV and causing the result bad."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 8,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "Accuracy:32.56%\n",
194 | "Classification Report:\n",
195 | " precision recall f1-score support\n",
196 | "\n",
197 | " 0 0.26 0.03 0.05 319\n",
198 | " 1 0.34 0.61 0.43 389\n",
199 | " 2 0.33 0.05 0.08 396\n",
200 | " 3 0.32 0.56 0.41 398\n",
201 | "\n",
202 | "avg / total 0.31 0.33 0.25 1502\n",
203 | "\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "from sklearn.metrics import accuracy_score\n",
209 | "from sklearn.metrics import classification_report\n",
210 | "\n",
211 | "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n",
212 | "print('Classification Report:')\n",
213 | "print(classification_report(y_test, y_pred))"
214 | ]
215 | }
216 | ],
217 | "metadata": {
218 | "kernelspec": {
219 | "display_name": "Python 3",
220 | "language": "python",
221 | "name": "python3"
222 | },
223 | "language_info": {
224 | "codemirror_mode": {
225 | "name": "ipython",
226 | "version": 3
227 | },
228 | "file_extension": ".py",
229 | "mimetype": "text/x-python",
230 | "name": "python",
231 | "nbconvert_exporter": "python",
232 | "pygments_lexer": "ipython3",
233 | "version": "3.5.2"
234 | }
235 | },
236 | "nbformat": 4,
237 | "nbformat_minor": 2
238 | }
239 |
--------------------------------------------------------------------------------
/sample/nlp-embeddings-sentence-infersent.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Ingestion"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "Train: 2031\n",
20 | "Val: 226\n",
21 | "Test: 1502\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import numpy as np\n",
27 | "from sklearn.datasets import fetch_20newsgroups\n",
28 | "from sklearn.model_selection import train_test_split\n",
29 | "\n",
30 | "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
31 | "\n",
32 | "train_raw_df = fetch_20newsgroups(subset='train', categories=categories)\n",
33 | "test_raw_df = fetch_20newsgroups(subset='test', categories=categories)\n",
34 | "\n",
35 | "x_train, x_val, y_train, y_val = train_test_split(np.array(train_raw_df.data), train_raw_df.target, test_size=0.1)\n",
36 | "x_test = np.array(test_raw_df.data)\n",
37 | "y_test = test_raw_df.target\n",
38 | "\n",
39 | "# x_train = [x_train[:200] for x in x_train]\n",
40 | "\n",
41 | "print('Train:', len(x_train))\n",
42 | "print('Val:', len(x_val))\n",
43 | "print('Test:', len(x_test))"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "Added /data/jupyter/common into sys.path.\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "%reload_ext autoreload\n",
61 | "%autoreload 2\n",
62 | "\n",
63 | "import sys, os\n",
64 | "def add_aion(curr_path=None):\n",
65 | " if curr_path is None:\n",
66 | " dir_path = os.getcwd()\n",
67 | " target_path = os.path.dirname(dir_path)\n",
68 | " if target_path not in sys.path:\n",
69 | " print('Added %s into sys.path.' % (target_path))\n",
70 | " sys.path.insert(0, target_path)\n",
71 | " \n",
72 | "add_aion()"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "# Model"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "[nltk_data] Downloading package punkt to /home/dscoe/nltk_data...\n",
92 | "[nltk_data] Package punkt is already up-to-date!\n"
93 | ]
94 | },
95 | {
96 | "data": {
97 | "text/plain": [
98 | "True"
99 | ]
100 | },
101 | "execution_count": 3,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "import nltk\n",
108 | "nltk.download('punkt')"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 4,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "from aion.embeddings.infersent import InferSentEmbeddings\n",
118 | "\n",
119 | "infer_sent_embs = InferSentEmbeddings(word_embeddings_dir='../model/text/stanford/glove/', verbose=20)\n",
120 | "infer_sent_embs.load_model(dest_dir='../model/text/facebook/infersent/')"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 5,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "Found 22119(/46170) words with w2v vectors\n",
133 | "Vocab size : 22119\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "infer_sent_embs.build_vocab(x_train, tokenize=True)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 6,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "name": "stderr",
148 | "output_type": "stream",
149 | "text": [
150 | "/data/jupyter/common/aion/embeddings/infersent_lib/models.py:222: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
151 | " sentences[stidx:stidx + bsize]), volatile=True)\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "x_train_t = infer_sent_embs.encode(x_train, tokenize=True)\n",
157 | "x_test_t = infer_sent_embs.encode(x_test, tokenize=True)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 7,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "name": "stderr",
167 | "output_type": "stream",
168 | "text": [
169 | "/anaconda/envs/py35/lib/python3.5/site-packages/scipy/optimize/linesearch.py:313: LineSearchWarning: The line search algorithm did not converge\n",
170 | " warn('The line search algorithm did not converge', LineSearchWarning)\n",
171 | "/anaconda/envs/py35/lib/python3.5/site-packages/sklearn/utils/optimize.py:195: UserWarning: Line Search failed\n",
172 | " warnings.warn('Line Search failed')\n"
173 | ]
174 | }
175 | ],
176 | "source": [
177 | "from sklearn.linear_model import LogisticRegression\n",
178 | "\n",
179 | "model = LogisticRegression(solver='newton-cg', max_iter=1000)\n",
180 | "model.fit(x_train_t, y_train)\n",
181 | "\n",
182 | "y_pred = model.predict(x_test_t)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 8,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "Accuracy:86.55%\n",
195 | "Classification Report:\n",
196 | " precision recall f1-score support\n",
197 | "\n",
198 | " 0 0.85 0.76 0.80 319\n",
199 | " 1 0.86 0.95 0.91 389\n",
200 | " 2 0.95 0.79 0.86 396\n",
201 | " 3 0.82 0.94 0.87 398\n",
202 | "\n",
203 | "avg / total 0.87 0.87 0.86 1502\n",
204 | "\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "from sklearn.metrics import accuracy_score\n",
210 | "from sklearn.metrics import classification_report\n",
211 | "\n",
212 | "print('Accuracy:%.2f%%' % (accuracy_score(y_test, y_pred)*100))\n",
213 | "print('Classification Report:')\n",
214 | "print(classification_report(y_test, y_pred))"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {
221 | "collapsed": true
222 | },
223 | "outputs": [],
224 | "source": []
225 | }
226 | ],
227 | "metadata": {
228 | "kernelspec": {
229 | "display_name": "Python 3",
230 | "language": "python",
231 | "name": "python3"
232 | },
233 | "language_info": {
234 | "codemirror_mode": {
235 | "name": "ipython",
236 | "version": 3
237 | },
238 | "file_extension": ".py",
239 | "mimetype": "text/x-python",
240 | "name": "python",
241 | "nbconvert_exporter": "python",
242 | "pygments_lexer": "ipython3",
243 | "version": "3.5.2"
244 | }
245 | },
246 | "nbformat": 4,
247 | "nbformat_minor": 2
248 | }
249 |
--------------------------------------------------------------------------------
/sample/nlp-stemming.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "\n",
9 | "Source: https://www.thisdaylive.com/index.php/2017/05/31/death-of-the-diary/"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# Stemming\n",
17 | "\n",
18 | "After tokenized word, we may want a root form rather than the original input form for post processing or modelling such as topic classification. The root word does not necessarily a word itself. For example, \"reduc\" is a root word of \"reduce\", \"suffici\" is a root word of \"sufficient\".\n",
19 | "\n",
20 | "There are lots of stemming algorithm in NLTK. Porter Stemmer and Snowball Stemmer (aka Porter2) will be selected for demonstration because they are the most popular."
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {
27 | "collapsed": true
28 | },
29 | "outputs": [],
30 | "source": [
31 | "# Copy from https://en.wikipedia.org/wiki/Stemming\n",
32 | "\n",
33 | "article = 'In linguistic morphology and information retrieval, stemming is the process of \\\n",
34 | "reducing inflected (or sometimes derived) words to their word stem, base or root \\\n",
35 | "form—generally a written word form. The stem need not be identical to the morphological \\\n",
36 | "root of the word; it is usually sufficient that related words map to the same stem, even \\\n",
37 | "if this stem is not in itself a valid root.'"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Porter Stemmer"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 2,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "NLTK Version: 3.2.5\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "import nltk \n",
62 | "print('NLTK Version: %s' % (nltk.__version__))\n",
63 | "\n",
64 | "porter_stemmer = nltk.stem.PorterStemmer()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "Original Article: In linguistic morphology and information retrieval, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root.\n",
77 | "\n",
78 | "Original : linguistic, New: linguist\n",
79 | "Original : morphology, New: morpholog\n",
80 | "Original : information, New: inform\n",
81 | "Original : retrieval, New: retriev\n",
82 | "Original : stemming, New: stem\n",
83 | "Original : reducing, New: reduc\n",
84 | "Original : inflected, New: inflect\n",
85 | "Original : sometimes, New: sometim\n",
86 | "Original : derived, New: deriv\n",
87 | "Original : words, New: word\n",
88 | "Original : form—generally, New: form—gener\n",
89 | "Original : The, New: the\n",
90 | "Original : identical, New: ident\n",
91 | "Original : morphological, New: morpholog\n",
92 | "Original : usually, New: usual\n",
93 | "Original : sufficient, New: suffici\n",
94 | "Original : related, New: relat\n",
95 | "Original : words, New: word\n",
96 | "Original : this, New: thi\n"
97 | ]
98 | }
99 | ],
100 | "source": [
101 | "tokens = nltk.word_tokenize(article)\n",
102 | "\n",
103 | "print('Original Article: %s' % (article))\n",
104 | "print()\n",
105 | "\n",
106 | "for token in tokens:\n",
107 | " stemmed_token = porter_stemmer.stem(token)\n",
108 | " \n",
109 | " if token != stemmed_token:\n",
110 | " print('Original : %s, New: %s' % (token, stemmed_token))"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "### Snowball Stemmer"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 5,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "NLTK Version: 3.2.5\n"
130 | ]
131 | }
132 | ],
133 | "source": [
134 | "import nltk \n",
135 | "print('NLTK Version: %s' % (nltk.__version__))\n",
136 | "\n",
137 | "snowball_stemmer = nltk.stem.SnowballStemmer('english')"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 6,
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "name": "stdout",
147 | "output_type": "stream",
148 | "text": [
149 | "Original Article: In linguistic morphology and information retrieval, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root.\n",
150 | "\n",
151 | "Original : In, New: in\n",
152 | "Original : linguistic, New: linguist\n",
153 | "Original : morphology, New: morpholog\n",
154 | "Original : information, New: inform\n",
155 | "Original : retrieval, New: retriev\n",
156 | "Original : stemming, New: stem\n",
157 | "Original : reducing, New: reduc\n",
158 | "Original : inflected, New: inflect\n",
159 | "Original : sometimes, New: sometim\n",
160 | "Original : derived, New: deriv\n",
161 | "Original : words, New: word\n",
162 | "Original : form—generally, New: form—gener\n",
163 | "Original : The, New: the\n",
164 | "Original : identical, New: ident\n",
165 | "Original : morphological, New: morpholog\n",
166 | "Original : usually, New: usual\n",
167 | "Original : sufficient, New: suffici\n",
168 | "Original : related, New: relat\n",
169 | "Original : words, New: word\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "tokens = nltk.word_tokenize(article)\n",
175 | "\n",
176 | "print('Original Article: %s' % (article))\n",
177 | "print()\n",
178 | "\n",
179 | "for token in tokens:\n",
180 | " stemmed_token = snowball_stemmer.stem(token)\n",
181 | " \n",
182 | " if token != stemmed_token:\n",
183 | " print('Original : %s, New: %s' % (token, stemmed_token))"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "Except \"In\", the result of Snowball Stemmer are same as Porter Stemmer."
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "# Conclusion\n",
198 | "\n",
199 | "\n",
200 | "Snowball Stemmer not only support English, but also Germanic and other languages as well. For detail, you may check on the Snowball website. \n",
201 | "\n",
202 | "Snowball Stemmer: http://snowballstem.org/algorithms/\n",
203 | "\n",
204 | "Besides Porter Stemmer and Snowball Stemmer, reader may also have on look on other stemmer algorithm such as Hunspell\n",
205 | "\n",
206 | "Hunspell Stemmer: https://github.com/hunspell/hunspell"
207 | ]
208 | }
209 | ],
210 | "metadata": {
211 | "kernelspec": {
212 | "display_name": "Python 3",
213 | "language": "python",
214 | "name": "python3"
215 | },
216 | "language_info": {
217 | "codemirror_mode": {
218 | "name": "ipython",
219 | "version": 3
220 | },
221 | "file_extension": ".py",
222 | "mimetype": "text/x-python",
223 | "name": "python",
224 | "nbconvert_exporter": "python",
225 | "pygments_lexer": "ipython3",
226 | "version": "3.5.2"
227 | }
228 | },
229 | "nbformat": 4,
230 | "nbformat_minor": 2
231 | }
232 |
--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/README.md:
--------------------------------------------------------------------------------
1 | # InferSent
2 |
3 | *InferSent* is a *sentence embeddings* method that provides semantic representations for English sentences. It is trained on natural language inference data and generalizes well to many different tasks.
4 |
5 | We provide our pre-trained English sentence encoder from [our paper](https://arxiv.org/abs/1705.02364) and our [SentEval](https://github.com/facebookresearch/SentEval) evaluation toolkit.
6 |
7 | **Recent changes**: Added infersent2 model trained on fastText vectors and added max-pool option.
8 |
9 | ## Dependencies
10 |
11 | This code is written in python. Dependencies include:
12 |
13 | * Python 2/3
14 | * [Pytorch](http://pytorch.org/) (recent version)
15 | * NLTK >= 3
16 |
17 | ## Download datasets
18 | To get SNLI and MultiNLI, run (in dataset/):
19 | ```bash
20 | ./get_data.bash
21 | ```
22 | This will download and preprocess SNLI/MultiNLI datasets. For MacOS, you may have to use *p7zip* instead of *unzip*.
23 |
24 |
25 | Download [GloVe](https://nlp.stanford.edu/projects/glove/) (V1) or [fastText](https://fasttext.cc/docs/en/english-vectors.html) (V2) vectors:
26 | ```bash
27 | mkdir dataset/GloVe
28 | curl -Lo dataset/GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
29 | unzip dataset/GloVe/glove.840B.300d.zip -d dataset/GloVe/
30 | mkdir dataset/fastText
31 | curl -Lo dataset/fastText/crawl-300d-2M.vec.zip https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip
32 | unzip dataset/fastText/crawl-300d-2M.vec.zip -d dataset/fastText/
33 | ```
34 |
35 | ## Use our sentence encoder
36 | We provide a simple interface to encode English sentences. **See [**encoder/demo.ipynb**](https://github.com/facebookresearch/InferSent/blob/master/encoder/demo.ipynb)
37 | for a practical example.** Get started with the following steps:
38 |
39 | *0.0) Download our InferSent models (V1 trained with GloVe, V2 trained with fastText)[147MB]:*
40 | ```bash
41 | curl -Lo encoder/infersent1.pkl https://s3.amazonaws.com/senteval/infersent/infersent1.pkl
42 | curl -Lo encoder/infersent2.pkl https://s3.amazonaws.com/senteval/infersent/infersent2.pkl
43 | ```
44 | Note that infersent1 is trained with GloVe (which have been trained on text preprocessed with the PTB tokenizer) and infersent2 is trained with fastText (which have been trained on text preprocessed with the MOSES tokenizer). The latter also removes the padding of zeros with max-pooling which was inconvenient when embedding sentences outside of their batches.
45 |
46 | *0.1) Make sure you have the NLTK tokenizer by running the following once:*
47 | ```python
48 | import nltk
49 | nltk.download('punkt')
50 | ```
51 |
52 | *1) [Load our pre-trained model](https://github.com/facebookresearch/InferSent/blob/master/encoder/demo.ipynb) (in encoder/):*
53 | ```python
54 | from models import InferSent
55 | V = 2
56 | MODEL_PATH = 'encoder/infersent%s.pkl' % V
57 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
58 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
59 | infersent = InferSent(params_model)
60 | infersent.load_state_dict(torch.load(MODEL_PATH))
61 | ```
62 |
63 | *2) Set word vector path for the model:*
64 | ```python
65 | W2V_PATH = 'fastText/crawl-300d-2M.vec'
66 | infersent.set_w2v_path(W2V_PATH)
67 | ```
68 |
69 | *3) Build the vocabulary of word vectors (i.e keep only those needed):*
70 | ```python
71 | infersent.build_vocab(sentences, tokenize=True)
72 | ```
73 | where *sentences* is your list of **n** sentences. You can update your vocabulary using *infersent.update_vocab(sentences)*, or directly load the **K** most common English words with *infersent.build_vocab_k_words(K=100000)*.
74 | If **tokenize** is True (by default), sentences will be tokenized using NLTK.
75 |
76 | *4) Encode your sentences (list of *n* sentences):*
77 | ```python
78 | embeddings = infersent.encode(sentences, tokenize=True)
79 | ```
80 | This outputs a numpy array with *n* vectors of dimension **4096**. Speed is around *1000 sentences per second* with batch size 128 on a single GPU.
81 |
82 | *5) Visualize the importance that our model attributes to each word:*
83 |
84 | We provide a function to visualize the importance of each word in the encoding of a sentence:
85 | ```python
86 | infersent.visualize('A man plays an instrument.', tokenize=True)
87 | ```
88 | 
89 |
90 |
91 | ## Train model on Natural Language Inference (SNLI)
92 | To reproduce our results on [SNLI](https://nlp.stanford.edu/projects/snli/), run:
93 | ```bash
94 | python train_nli.py --word_emb_path
95 | ```
--------------------------------------------------------------------------------
/aion/embeddings/infersent_lib/train_nli.py:
--------------------------------------------------------------------------------
85 | eval(data_type)[split] = np.array([['<s>'] +
86 | [word for word in sent.split() if word in word_vec] +
87 | ['</s>'] for sent in eval(data_type)[split]])
88 |
89 |
90 | """
91 | MODEL
92 | """
93 | # model config
94 | config_nli_model = {
95 | 'n_words' : len(word_vec) ,
96 | 'word_emb_dim' : params.word_emb_dim ,
97 | 'enc_lstm_dim' : params.enc_lstm_dim ,
98 | 'n_enc_layers' : params.n_enc_layers ,
99 | 'dpout_model' : params.dpout_model ,
100 | 'dpout_fc' : params.dpout_fc ,
101 | 'fc_dim' : params.fc_dim ,
102 | 'bsize' : params.batch_size ,
103 | 'n_classes' : params.n_classes ,
104 | 'pool_type' : params.pool_type ,
105 | 'nonlinear_fc' : params.nonlinear_fc ,
106 | 'encoder_type' : params.encoder_type ,
107 | 'use_cuda' : True ,
108 |
109 | }
110 |
111 | # model
112 | encoder_types = ['InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
113 | 'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
114 | 'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder']
115 | assert params.encoder_type in encoder_types, "encoder_type must be in " + \
116 | str(encoder_types)
117 | nli_net = NLINet(config_nli_model)
118 | print(nli_net)
119 |
120 | # loss
121 | weight = torch.FloatTensor(params.n_classes).fill_(1)
122 | loss_fn = nn.CrossEntropyLoss(weight=weight)
123 | loss_fn.size_average = False
124 |
125 | # optimizer
126 | optim_fn, optim_params = get_optimizer(params.optimizer)
127 | optimizer = optim_fn(nli_net.parameters(), **optim_params)
128 |
129 | # cuda by default
130 | nli_net.cuda()
131 | loss_fn.cuda()
132 |
133 |
134 | """
135 | TRAIN
136 | """
137 | val_acc_best = -1e10
138 | adam_stop = False
139 | stop_training = False
140 | lr = optim_params['lr'] if 'sgd' in params.optimizer else None
141 |
142 |
143 | def trainepoch(epoch):
144 | print('\nTRAINING : Epoch ' + str(epoch))
145 | nli_net.train()
146 | all_costs = []
147 | logs = []
148 | words_count = 0
149 |
150 | last_time = time.time()
151 | correct = 0.
152 | # shuffle the data
153 | permutation = np.random.permutation(len(train['s1']))
154 |
155 | s1 = train['s1'][permutation]
156 | s2 = train['s2'][permutation]
157 | target = train['label'][permutation]
158 |
159 |
160 | optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * params.decay if epoch>1\
161 | and 'sgd' in params.optimizer else optimizer.param_groups[0]['lr']
162 | print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))
163 |
164 | for stidx in range(0, len(s1), params.batch_size):
165 | # prepare batch
166 | s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size],
167 | word_vec, params.word_emb_dim)
168 | s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size],
169 | word_vec, params.word_emb_dim)
170 | s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
171 | tgt_batch = Variable(torch.LongTensor(target[stidx:stidx + params.batch_size])).cuda()
172 | k = s1_batch.size(1) # actual batch size
173 |
174 | # model forward
175 | output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
176 |
177 | pred = output.data.max(1)[1]
178 | correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()
179 | assert len(pred) == len(s1[stidx:stidx + params.batch_size])
180 |
181 | # loss
182 | loss = loss_fn(output, tgt_batch)
183 | all_costs.append(loss.data[0])
184 | words_count += (s1_batch.nelement() + s2_batch.nelement()) / params.word_emb_dim
185 |
186 | # backward
187 | optimizer.zero_grad()
188 | loss.backward()
189 |
190 | # gradient clipping (off by default)
191 | shrink_factor = 1
192 | total_norm = 0
193 |
194 | for p in nli_net.parameters():
195 | if p.requires_grad:
196 | p.grad.data.div_(k) # divide by the actual batch size
197 | total_norm += p.grad.data.norm() ** 2
198 | total_norm = np.sqrt(total_norm)
199 |
200 | if total_norm > params.max_norm:
201 | shrink_factor = params.max_norm / total_norm
202 | current_lr = optimizer.param_groups[0]['lr'] # current lr (no external "lr", for adam)
203 | optimizer.param_groups[0]['lr'] = current_lr * shrink_factor # just for update
204 |
205 | # optimizer step
206 | optimizer.step()
207 | optimizer.param_groups[0]['lr'] = current_lr
208 |
209 | if len(all_costs) == 100:
210 | logs.append('{0} ; loss {1} ; sentence/s {2} ; words/s {3} ; accuracy train : {4}'.format(
211 | stidx, round(np.mean(all_costs), 2),
212 | int(len(all_costs) * params.batch_size / (time.time() - last_time)),
213 | int(words_count * 1.0 / (time.time() - last_time)),
214 | round(100.*correct/(stidx+k), 2)))
215 | print(logs[-1])
216 | last_time = time.time()
217 | words_count = 0
218 | all_costs = []
219 | train_acc = round(100 * correct/len(s1), 2)
220 | print('results : epoch {0} ; mean accuracy train : {1}'
221 | .format(epoch, train_acc))
222 | return train_acc
223 |
224 |
225 | def evaluate(epoch, eval_type='valid', final_eval=False):
226 | nli_net.eval()
227 | correct = 0.
228 | global val_acc_best, lr, stop_training, adam_stop
229 |
230 | if eval_type == 'valid':
231 | print('\nVALIDATION : Epoch {0}'.format(epoch))
232 |
233 | s1 = valid['s1'] if eval_type == 'valid' else test['s1']
234 | s2 = valid['s2'] if eval_type == 'valid' else test['s2']
235 | target = valid['label'] if eval_type == 'valid' else test['label']
236 |
237 | for i in range(0, len(s1), params.batch_size):
238 | # prepare batch
239 | s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec, params.word_emb_dim)
240 | s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec, params.word_emb_dim)
241 | s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
242 | tgt_batch = Variable(torch.LongTensor(target[i:i + params.batch_size])).cuda()
243 |
244 | # model forward
245 | output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
246 |
247 | pred = output.data.max(1)[1]
248 | correct += pred.long().eq(tgt_batch.data.long()).cpu().sum()
249 |
250 | # save model
251 | eval_acc = round(100 * correct / len(s1), 2)
252 | if final_eval:
253 | print('finalgrep : accuracy {0} : {1}'.format(eval_type, eval_acc))
254 | else:
255 | print('togrep : results : epoch {0} ; mean accuracy {1} :\
256 | {2}'.format(epoch, eval_type, eval_acc))
257 |
258 | if eval_type == 'valid' and epoch <= params.n_epochs:
259 | if eval_acc > val_acc_best:
260 | print('saving model at epoch {0}'.format(epoch))
261 | if not os.path.exists(params.outputdir):
262 | os.makedirs(params.outputdir)
263 | torch.save(nli_net.state_dict(), os.path.join(params.outputdir,
264 | params.outputmodelname))
265 | val_acc_best = eval_acc
266 | else:
267 | if 'sgd' in params.optimizer:
268 | optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / params.lrshrink
269 | print('Shrinking lr by : {0}. New lr = {1}'
270 | .format(params.lrshrink,
271 | optimizer.param_groups[0]['lr']))
272 | if optimizer.param_groups[0]['lr'] < params.minlr:
273 | stop_training = True
274 | if 'adam' in params.optimizer:
275 | # early stopping (at 2nd decrease in accuracy)
276 | stop_training = adam_stop
277 | adam_stop = True
278 | return eval_acc
279 |
280 |
281 | """
282 | Train model on Natural Language Inference task
283 | """
284 | epoch = 1
285 |
286 | while not stop_training and epoch <= params.n_epochs:
287 | train_acc = trainepoch(epoch)
288 | eval_acc = evaluate(epoch, 'valid')
289 | epoch += 1
290 |
291 | # Run best model on test set.
292 | nli_net.load_state_dict(torch.load(os.path.join(params.outputdir, params.outputmodelname), map_location={'cuda:1' : 'cuda:0', 'cuda:2' : 'cuda:0'}))
293 |
294 | print('\nTEST : Epoch {0}'.format(epoch))
295 | evaluate(1e6, 'valid', True)
296 | evaluate(0, 'test', True)
297 |
298 | # Save encoder instead of full model
299 | torch.save(nli_net.encoder.state_dict(), os.path.join(params.outputdir, params.outputmodelname + '.encoder.pkl'))
300 |
--------------------------------------------------------------------------------
/sample/nlp-sentence_tokenization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "\n",
10 | "\n",
11 | "Source: http://www.digitalmeetsculture.net/article/article-about-preforma-published-in-archival-science/"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "source": [
20 | "# Sentence Tokenization\n",
21 | "\n",
22 | "In previous article, word tokenization is introduced. What if we want to tokenize sentence? In general, we can easily split sentence by some punctuation such ., ? and !. However, there are lots of exception if we splitting article by those punctuation only.\n",
23 | "In this article, you will go through why we need to use sentence tokenization and how can we use it."
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {
29 | "collapsed": true
30 | },
31 | "source": [
32 | "# Why?\n",
33 | "According to researchers, about 86% of article include the importance sentence in first one or two sentences. Believe that it is one of the reason why textsum model use first 2 sentences for training\n",
34 | "When I am in school, teacher teaches how we should write an article. The importance sentence will be placed in the first sentence most of the time. It may exists in last sentence sometimes."
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "# How?\n",
42 | "So how can we tokenize sentence? You can use the following simple python script to do that or using library such as nltk and spacy"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 1,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "# Capture from https://en.wikipedia.org/wiki/Lexical_analysis\n",
54 | "\n",
55 | "article = 'In computer science, lexical analysis, lexing or tokenization is the process of \\\n",
56 | "converting a sequence of characters (such as in a computer program or web page) into a \\\n",
57 | "sequence of tokens (strings with an assigned and thus identified meaning). A program that \\\n",
58 | "performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner \\\n",
59 | "is also a term for the first stage of a lexer. A lexer is generally combined with a parser, \\\n",
60 | "which together analyze the syntax of programming languages, web pages, and so forth.'\n",
61 | "\n",
62 | "article2 = 'ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456'\n",
63 | "\n",
64 | "article3 = 'It is a great moment from 10 a.m. to 1 p.m. every weekend.'"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### Self build"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 2,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stdout",
81 | "output_type": "stream",
82 | "text": [
83 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
84 | "\n",
85 | "-->Sentence 0: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning)\n",
86 | "-->Sentence 1: .\n",
87 | "-->Sentence 2: A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer\n",
88 | "-->Sentence 3: .\n",
89 | "-->Sentence 4: A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth\n",
90 | "-->Sentence 5: .\n",
91 | "-->Sentence 6: \n",
92 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
93 | "\n",
94 | "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_\n",
95 | "-->Sentence 1: !\n",
96 | "-->Sentence 2: @# \n",
97 | "-->Sentence 3: !\n",
98 | "-->Sentence 4: @#$%^&*()_+ 0123456\n",
99 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
100 | "\n",
101 | "-->Sentence 0: It is a great moment from 10 a\n",
102 | "-->Sentence 1: .\n",
103 | "-->Sentence 2: m\n",
104 | "-->Sentence 3: .\n",
105 | "-->Sentence 4: to 1 p\n",
106 | "-->Sentence 5: .\n",
107 | "-->Sentence 6: m\n",
108 | "-->Sentence 7: .\n",
109 | "-->Sentence 8: every weekend\n",
110 | "-->Sentence 9: .\n",
111 | "-->Sentence 10: \n"
112 | ]
113 | }
114 | ],
115 | "source": [
116 | "import re\n",
117 | "\n",
118 | "for doc in [article, article2, article3]:\n",
119 | " print('Original Article: %s' % (doc))\n",
120 | " print()\n",
121 | "\n",
122 | " sentences = re.split('(\\.|!|\\?)', doc)\n",
123 | " \n",
124 | " for i, s in enumerate(sentences):\n",
125 | " print('-->Sentence %d: %s' % (i, s))"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "You can see that, \"a.m.\" should treat as a \"word\". Of course, we can enhance the above regular expression to do it. But I will go for library rather than build the wheel again"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "### spaCy"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 3,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "spaCy Version: 2.0.11\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "import spacy\n",
157 | "print('spaCy Version: %s' % spacy.__version__)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 4,
163 | "metadata": {
164 | "collapsed": true
165 | },
166 | "outputs": [],
167 | "source": [
168 | "spacy_nlp = spacy.load('en_core_web_sm')"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 5,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
181 | "\n",
182 | "-->Sentence 0: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning).\n",
183 | "-->Sentence 1: A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.\n",
184 | "-->Sentence 2: A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
185 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
186 | "\n",
187 | "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_!@# !\n",
188 | "-->Sentence 1: @#$%^&*()_+ 0123456\n",
189 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
190 | "\n",
191 | "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n"
192 | ]
193 | }
194 | ],
195 | "source": [
196 | "for article in [article, article2, article3]:\n",
197 | " print('Original Article: %s' % (article))\n",
198 | " print()\n",
199 | " doc = spacy_nlp(article)\n",
200 | " for i, token in enumerate(doc.sents):\n",
201 | " print('-->Sentence %d: %s' % (i, token.text))"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "Can see that spacy handled \"a.m.\" somehow."
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "### NLTK"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 6,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "NTLK Version: 3.2.5\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "import nltk\n",
233 | "from nltk.tokenize import sent_tokenize\n",
234 | "print('NTLK Version: %s' % nltk.__version__)"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 8,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "# nltk.download('punkt')"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 9,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
256 | "\n",
257 | "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
258 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
259 | "\n",
260 | "-->Sentence 0: ConcateStringAnd123 ConcateSepcialCharacter_!\n",
261 | "-->Sentence 1: @# !\n",
262 | "-->Sentence 2: @#$%^&*()_+ 0123456\n",
263 | "Original Article: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n",
264 | "\n",
265 | "-->Sentence 0: It is a great moment from 10 a.m. to 1 p.m. every weekend.\n"
266 | ]
267 | }
268 | ],
269 | "source": [
270 | "for article in [article, article2, article3]:\n",
271 | " print('Original Article: %s' % (article))\n",
272 | " print()\n",
273 | "\n",
274 | " doc = sent_tokenize(article)\n",
275 | " for i, token in enumerate(doc):\n",
276 | " print('-->Sentence %d: %s' % (i, token))"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "# Conclusion\n",
284 | "So far both NLTK and spacy provides similar behavior so it depends on which library do you use in performing other preprocessing. \n",
285 | "Recently, I works on text mining related project which is classifying news category. Of course, I can build a ML model to classify it but I go for a simple approach. Only focus on the first sentence for every news and performing simple key word searching to build a baseline model. The result is not bad but it is a very quick way to deliver an initial version."
286 | ]
287 | }
288 | ],
289 | "metadata": {
290 | "kernelspec": {
291 | "display_name": "Python 3",
292 | "language": "python",
293 | "name": "python3"
294 | },
295 | "language_info": {
296 | "codemirror_mode": {
297 | "name": "ipython",
298 | "version": 3
299 | },
300 | "file_extension": ".py",
301 | "mimetype": "text/x-python",
302 | "name": "python",
303 | "nbconvert_exporter": "python",
304 | "pygments_lexer": "ipython3",
305 | "version": "3.5.2"
306 | }
307 | },
308 | "nbformat": 4,
309 | "nbformat_minor": 2
310 | }
311 |
--------------------------------------------------------------------------------
/sample/nlp-named_entity_recognition.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "Source: https://hackernoon.com/named-entity-recognition-applications-and-use-cases-c2ef0904e9fe"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {
15 | "collapsed": true
16 | },
17 | "outputs": [],
18 | "source": [
19 | "ner_dir = '/stanford/ner/'"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {
26 | "collapsed": true
27 | },
28 | "outputs": [],
29 | "source": [
30 | "# Copy from https://en.wikipedia.org/wiki/Stanford_University\n",
31 | "\n",
32 | "article = \"The university was founded in 1885 by Leland and Jane Stanford in memory of \\\n",
33 | "their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous \\\n",
34 | "year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. \\\n",
35 | "The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\""
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": [
46 | "article2 = 'New York, New York , NY N.Y. new york'"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "# Stanford NER"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 4,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "NTLK Version: 3.2.5\n"
66 | ]
67 | },
68 | {
69 | "name": "stderr",
70 | "output_type": "stream",
71 | "text": [
72 | "/anaconda/envs/py35/lib/python3.5/site-packages/nltk/tag/stanford.py:183: DeprecationWarning: \n",
73 | "The StanfordTokenizer will be deprecated in version 3.2.5.\n",
74 | "Please use \u001b[91mnltk.tag.corenlp.CoreNLPPOSTagger\u001b[0m or \u001b[91mnltk.tag.corenlp.CoreNLPNERTagger\u001b[0m instead.\n",
75 | " super(StanfordNERTagger, self).__init__(*args, **kwargs)\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "import nltk\n",
81 | "print('NTLK Version: %s' % nltk.__version__)\n",
82 | "\n",
83 | "from nltk.tag import StanfordNERTagger\n",
84 | "\n",
85 | "stanford_ner_tagger = StanfordNERTagger(\n",
86 | " ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz',\n",
87 | " ner_dir + 'stanford-ner-3.9.1.jar'\n",
88 | ")"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 11,
94 | "metadata": {
95 | "collapsed": true
96 | },
97 | "outputs": [],
98 | "source": [
99 | "results = stanford_ner_tagger.tag(article.split())"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 22,
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n",
112 | "\n",
113 | "Type: LOCATION, Value: New\n",
114 | "Type: LOCATION, Value: York\n",
115 | "Type: LOCATION, Value: NY\n",
116 | "Type: LOCATION, Value: N.Y.\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "print('Original Sentence: %s' % (article))\n",
122 | "print()\n",
123 | "for result in results:\n",
124 | " tag_value = result[0]\n",
125 | " tag_type = result[1]\n",
126 | " if tag_type != 'O':\n",
127 | " print('Type: %s, Value: %s' % (tag_type, tag_value))"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 14,
133 | "metadata": {
134 | "collapsed": true
135 | },
136 | "outputs": [],
137 | "source": [
138 | "results = stanford_ner_tagger.tag(article2.split())"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 21,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | "Original Sentence: New York, New York , NY N.Y. new york\n",
151 | "\n",
152 | "Type: LOCATION, Value: New\n",
153 | "Type: LOCATION, Value: York\n",
154 | "Type: LOCATION, Value: NY\n",
155 | "Type: LOCATION, Value: N.Y.\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "print('Original Sentence: %s' % (article2))\n",
161 | "print()\n",
162 | "for result in results:\n",
163 | " tag_value = result[0]\n",
164 | " tag_type = result[1]\n",
165 | " if tag_type != 'O':\n",
166 | " print('Type: %s, Value: %s' % (tag_type, tag_value))"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "# NLTK NE"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 25,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "NTLK version: 3.2.5\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "import nltk\n",
191 | "\n",
192 | "print('NTLK version: %s' % (nltk.__version__))\n",
193 | "\n",
194 | "from nltk import word_tokenize, pos_tag, ne_chunk\n",
195 | "\n",
196 | "nltk.download('words')\n",
197 | "nltk.download('averaged_perceptron_tagger')\n",
198 | "nltk.download('punkt')\n",
199 | "nltk.download('maxent_ne_chunker')"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 43,
205 | "metadata": {
206 | "collapsed": true
207 | },
208 | "outputs": [],
209 | "source": [
210 | "results = ne_chunk(pos_tag(word_tokenize(article)))"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 44,
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "name": "stdout",
220 | "output_type": "stream",
221 | "text": [
222 | "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n",
223 | "\n",
224 | " (GPE Leland/NNP)\n",
225 | " (PERSON Jane/NNP Stanford/NNP)\n",
226 | " (GPE Leland/NNP)\n",
227 | " Stanford/NNP\n",
228 | " Jr./NNP\n",
229 | " (PERSON Stanford/NNP)\n",
230 | " Governor/NNP\n",
231 | " (GPE California/NNP)\n",
232 | " (GPE U.S/NNP)\n",
233 | " Senator/NNP\n",
234 | " October/NNP\n",
235 | " ]/NNP\n"
236 | ]
237 | }
238 | ],
239 | "source": [
240 | "print('Original Sentence: %s' % (article))\n",
241 | "print()\n",
242 | "for x in str(results).split('\\n'):\n",
243 | " if '/NNP' in x:\n",
244 | " print(x)"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 45,
250 | "metadata": {
251 | "collapsed": true
252 | },
253 | "outputs": [],
254 | "source": [
255 | "results = ne_chunk(pos_tag(word_tokenize(article2)))"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 46,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "Original Sentence: New York, New York , NY N.Y. new york\n",
268 | "\n",
269 | " (GPE New/NNP York/NNP)\n",
270 | " (GPE New/NNP York/NNP)\n",
271 | " (ORGANIZATION NY/NNP)\n",
272 | " N.Y./NNP\n"
273 | ]
274 | }
275 | ],
276 | "source": [
277 | "print('Original Sentence: %s' % (article2))\n",
278 | "print()\n",
279 | "for x in str(results).split('\\n'):\n",
280 | " if '/NNP' in x:\n",
281 | " print(x)"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "# Spacy"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 7,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "name": "stdout",
298 | "output_type": "stream",
299 | "text": [
300 | "spaCy: 2.0.11\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "import spacy\n",
306 | "\n",
307 | "print('spaCy: %s' % (spacy.__version__))"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 8,
313 | "metadata": {
314 | "collapsed": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "spacy_nlp = spacy.load('en')"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 20,
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "Original Sentence: The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.\n",
331 | "\n",
332 | "Type: DATE, Value: 1885\n",
333 | "Type: GPE, Value: Leland\n",
334 | "Type: PERSON, Value: Jane Stanford\n",
335 | "Type: PERSON, Value: Leland Stanford Jr.\n",
336 | "Type: DATE, Value: age 15 the previous year\n",
337 | "Type: ORG, Value: Stanford\n",
338 | "Type: GPE, Value: California\n",
339 | "Type: GPE, Value: U.S.\n",
340 | "Type: ORDINAL, Value: first\n",
341 | "Type: DATE, Value: October 1, 1891,[2][3\n"
342 | ]
343 | }
344 | ],
345 | "source": [
346 | "document = spacy_nlp(article)\n",
347 | "\n",
348 | "print('Original Sentence: %s' % (article))\n",
349 | "print()\n",
350 | "for element in document.ents:\n",
351 | " print('Type: %s, Value: %s' % (element.label_, element))"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 24,
357 | "metadata": {},
358 | "outputs": [
359 | {
360 | "name": "stdout",
361 | "output_type": "stream",
362 | "text": [
363 | "Original Sentence: New York, New York , NY N.Y. new york\n",
364 | "\n",
365 | "Type: GPE, Value: New York\n",
366 | "Type: GPE, Value: New York\n",
367 | "Type: GPE, Value: NY N.Y.\n"
368 | ]
369 | }
370 | ],
371 | "source": [
372 | "document = spacy_nlp(article2)\n",
373 | "\n",
374 | "print('Original Sentence: %s' % (article2))\n",
375 | "print()\n",
376 | "for element in document.ents:\n",
377 | " print('Type: %s, Value: %s' % (element.label_, element))"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 7,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": []
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {
393 | "collapsed": true
394 | },
395 | "outputs": [],
396 | "source": []
397 | }
398 | ],
399 | "metadata": {
400 | "kernelspec": {
401 | "display_name": "Python 3",
402 | "language": "python",
403 | "name": "python3"
404 | },
405 | "language_info": {
406 | "codemirror_mode": {
407 | "name": "ipython",
408 | "version": 3
409 | },
410 | "file_extension": ".py",
411 | "mimetype": "text/x-python",
412 | "name": "python",
413 | "nbconvert_exporter": "python",
414 | "pygments_lexer": "ipython3",
415 | "version": "3.5.2"
416 | }
417 | },
418 | "nbformat": 4,
419 | "nbformat_minor": 2
420 | }
421 |
--------------------------------------------------------------------------------
/sample/nlp-word_tokenization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "\n",
9 | "Source: http://youthvoices.net/discussion/will-you-1-powerful-words"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# Word Tokenization\n",
17 | "To tackle text related problem in Machine Learning area, tokenization is one of the common pre-processing. In this article, we will go through how we can handle work toeknization and sentence tokenization by using three libraries which are spaCy, NLTK and jieba (for Chinese word)."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 3,
23 | "metadata": {
24 | "collapsed": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "# Capture from https://en.wikipedia.org/wiki/Lexical_analysis\n",
29 | "\n",
30 | "article = 'In computer science, lexical analysis, lexing or tokenization is the process of \\\n",
31 | "converting a sequence of characters (such as in a computer program or web page) into a \\\n",
32 | "sequence of tokens (strings with an assigned and thus identified meaning). A program that \\\n",
33 | "performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner \\\n",
34 | "is also a term for the first stage of a lexer. A lexer is generally combined with a parser, \\\n",
35 | "which together analyze the syntax of programming languages, web pages, and so forth.'"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 4,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": [
46 | "article2 = 'ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456'"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 5,
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "outputs": [],
56 | "source": [
57 | "article3 = '你的姿態 你的青睞 我存在在你的存在 你以為愛 就是被愛'"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 6,
63 | "metadata": {
64 | "collapsed": true
65 | },
66 | "outputs": [],
67 | "source": [
68 | "# Capture from https://zh.wikipedia.org/wiki/%E8%AF%8D%E6%B3%95%E5%88%86%E6%9E%90\n",
69 | "\n",
70 | "article4 = '词法分析是计算机科学中将字符序列转换为标记序列的过程。进行词法分析的程序或者函数叫作词法分析器,也叫扫描器。词法分析器一般以函数的形式存在,供语法分析器调用。'"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "# spaCy"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 7,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "spaCy Version: 2.0.11\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "import spacy\n",
95 | "print('spaCy Version: %s' % spacy.__version__)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 8,
101 | "metadata": {
102 | "collapsed": true
103 | },
104 | "outputs": [],
105 | "source": [
106 | "spacy_nlp = spacy.load('en_core_web_sm')"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 9,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
119 | "\n",
120 | "['In', 'computer', 'science', ',', 'lexical', 'analysis', ',', 'lexing', 'or', 'tokenization', 'is', 'the', 'process', 'of', 'converting', 'a', 'sequence', 'of', 'characters', '(', 'such', 'as', 'in', 'a', 'computer', 'program', 'or', 'web', 'page', ')', 'into', 'a', 'sequence', 'of', 'tokens', '(', 'strings', 'with', 'an', 'assigned', 'and', 'thus', 'identified', 'meaning', ')', '.', 'A', 'program', 'that', 'performs', 'lexical', 'analysis', 'may', 'be', 'termed', 'a', 'lexer', ',', 'tokenizer,[1', ']', 'or', 'scanner', ',', 'though', 'scanner', 'is', 'also', 'a', 'term', 'for', 'the', 'first', 'stage', 'of', 'a', 'lexer', '.', 'A', 'lexer', 'is', 'generally', 'combined', 'with', 'a', 'parser', ',', 'which', 'together', 'analyze', 'the', 'syntax', 'of', 'programming', 'languages', ',', 'web', 'pages', ',', 'and', 'so', 'forth', '.']\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "print('Original Article: %s' % (article))\n",
126 | "print()\n",
127 | "doc = spacy_nlp(article)\n",
128 | "tokens = [token.text for token in doc]\n",
129 | "print(tokens)"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "Not all special character will be seperated."
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 8,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
149 | "\n",
150 | "['ConcateStringAnd123', 'ConcateSepcialCharacter_!@', '#', '!', '@#$%^&*()_+', '0123456']\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "print('Original Article: %s' % (article2))\n",
156 | "print()\n",
157 | "doc = spacy_nlp(article2)\n",
158 | "tokens = [token.text for token in doc]\n",
159 | "print(tokens)"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "First step of spaCy separates word by space and then applying some guidelines such as exception rule, prefix, suffix etc."
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "# NLTK"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 9,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "NTLK Version: 3.2.5\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "import nltk\n",
191 | "print('NTLK Version: %s' % nltk.__version__)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 10,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "name": "stdout",
201 | "output_type": "stream",
202 | "text": [
203 | "Original Article: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n",
204 | "\n",
205 | "['In', 'computer', 'science', ',', 'lexical', 'analysis', ',', 'lexing', 'or', 'tokenization', 'is', 'the', 'process', 'of', 'converting', 'a', 'sequence', 'of', 'characters', '(', 'such', 'as', 'in', 'a', 'computer', 'program', 'or', 'web', 'page', ')', 'into', 'a', 'sequence', 'of', 'tokens', '(', 'strings', 'with', 'an', 'assigned', 'and', 'thus', 'identified', 'meaning', ')', '.', 'A', 'program', 'that', 'performs', 'lexical', 'analysis', 'may', 'be', 'termed', 'a', 'lexer', ',', 'tokenizer', ',', '[', '1', ']', 'or', 'scanner', ',', 'though', 'scanner', 'is', 'also', 'a', 'term', 'for', 'the', 'first', 'stage', 'of', 'a', 'lexer', '.', 'A', 'lexer', 'is', 'generally', 'combined', 'with', 'a', 'parser', ',', 'which', 'together', 'analyze', 'the', 'syntax', 'of', 'programming', 'languages', ',', 'web', 'pages', ',', 'and', 'so', 'forth', '.']\n"
206 | ]
207 | }
208 | ],
209 | "source": [
210 | "print('Original Article: %s' % (article))\n",
211 | "print()\n",
212 | "print(nltk.word_tokenize(article))"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "Some special character (e.g. _) will not be seperated"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 11,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stdout",
229 | "output_type": "stream",
230 | "text": [
231 | "Original Article: ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\n",
232 | "\n",
233 | "['ConcateStringAnd123', 'ConcateSepcialCharacter_', '!', '@', '#', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_+', '0123456']\n"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "print('Original Article: %s' % (article2))\n",
239 | "print()\n",
240 | "print(nltk.word_tokenize(article2))"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "The behavior is a little difference from spaCy. NLTK treats most of special character as a \"word\" except \"_\". Of course, number will be tokenized as well."
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "# jieba"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 12,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "name": "stdout",
264 | "output_type": "stream",
265 | "text": [
266 | "jieba Version: 0.39\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "import jieba\n",
272 | "print('jieba Version: %s' % jieba.__version__)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 13,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "name": "stderr",
282 | "output_type": "stream",
283 | "text": [
284 | "Building prefix dict from the default dictionary ...\n",
285 | "Loading model from cache /tmp/jieba.cache\n"
286 | ]
287 | },
288 | {
289 | "name": "stdout",
290 | "output_type": "stream",
291 | "text": [
292 | "Original Article: 你的姿態 你的青睞 我存在在你的存在 你以為愛 就是被愛\n",
293 | "\n"
294 | ]
295 | },
296 | {
297 | "name": "stderr",
298 | "output_type": "stream",
299 | "text": [
300 | "Loading model cost 1.086 seconds.\n",
301 | "Prefix dict has been built succesfully.\n"
302 | ]
303 | },
304 | {
305 | "name": "stdout",
306 | "output_type": "stream",
307 | "text": [
308 | "['你', '的', '姿態', ' ', '你', '的', '青睞', ' ', '我', '存在', '在', '你', '的', '存在', ' ', '你', '以', '為', '愛', ' ', '就是', '被', '愛']\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "print('Original Article: %s' % (article3))\n",
314 | "print()\n",
315 | "\n",
316 | "words = jieba.cut(article3, cut_all=False)\n",
317 | "words = [str(word) for word in words]\n",
318 | "print(words)"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 14,
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "Original Article: 词法分析是计算机科学中将字符序列转换为标记序列的过程。进行词法分析的程序或者函数叫作词法分析器,也叫扫描器。词法分析器一般以函数的形式存在,供语法分析器调用。\n",
331 | "\n",
332 | "['词法', '分析', '是', '计算机科学', '中将', '字符', '序列', '转换', '为', '标记', '序列', '的', '过程', '。', '进行', '词法', '分析', '的', '程序', '或者', '函数', '叫作', '词法', '分析器', ',', '也', '叫', '扫描器', '。', '词法', '分析器', '一般', '以', '函数', '的', '形式', '存在', ',', '供', '语法分析', '器', '调用', '。']\n"
333 | ]
334 | }
335 | ],
336 | "source": [
337 | "print('Original Article: %s' % (article4))\n",
338 | "print()\n",
339 | "\n",
340 | "words = jieba.cut(article4, cut_all=False)\n",
341 | "words = [str(word) for word in words]\n",
342 | "print(words)"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {
348 | "collapsed": true
349 | },
350 | "source": [
351 | "jieba does a great job on tokenizes Chinese word (both simplified chinese to traditional chinese)."
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "metadata": {},
357 | "source": [
358 | "# Conculsion"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {},
364 | "source": [
365 | "spaCy seems like having a intelligence on tokenize and the performance is better than NLTK. If you need to tokenize, jieba is a good choice for you. Also, studied spaCy (version 2.x) Chinese language implementation. They wrapped jieba library. From lang/zh/__init__.py"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {
372 | "collapsed": true
373 | },
374 | "outputs": [],
375 | "source": [
376 | "# copy from spaCy/lang/zh/__init__.py\n",
377 | "class Chinese(Language):\n",
378 | " lang = 'zh'\n",
379 | " Defaults = ChineseDefaults # override defaults\n",
380 | "\n",
381 | " def make_doc(self, text):\n",
382 | " try:\n",
383 | " import jieba\n",
384 | " except ImportError:\n",
385 | " raise ImportError(\"The Chinese tokenizer requires the Jieba library: \"\n",
386 | " \"https://github.com/fxsjy/jieba\")\n",
387 | " words = list(jieba.cut(text, cut_all=False))\n",
388 | " words = [x for x in words if x]\n",
389 | " return Doc(self.vocab, words=words, spaces=[False]*len(words))"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "On the other hand, Stanford NLP also released a word tokenize library for multiple language including English and Chinese. You may visit the official website if you are interested.\n",
397 | "\n",
398 | "URL: https://nlp.stanford.edu/software/tokenizer.html"
399 | ]
400 | }
401 | ],
402 | "metadata": {
403 | "kernelspec": {
404 | "display_name": "Python 3",
405 | "language": "python",
406 | "name": "python3"
407 | },
408 | "language_info": {
409 | "codemirror_mode": {
410 | "name": "ipython",
411 | "version": 3
412 | },
413 | "file_extension": ".py",
414 | "mimetype": "text/x-python",
415 | "name": "python",
416 | "nbconvert_exporter": "python",
417 | "pygments_lexer": "ipython3",
418 | "version": "3.5.2"
419 | }
420 | },
421 | "nbformat": 4,
422 | "nbformat_minor": 2
423 | }
424 |
--------------------------------------------------------------------------------
/sample/nlp-stop_words.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "\n",
9 | "Source: https://www.channelone.com/blog_post/web-tools-for-studying-vocabulary-words/"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# Stop Words\n",
17 | "\n",
18 | "When we deal with text problem in Natural Language Processing, stop words removal process is a one of the important step to have a better input for any models. Stop words means that it is a very common words in a language (e.g. a, an, the in English. 的, 了 in Chinese. え, も in Japanese). It does not help on most of NLP problem such as semantic analysis, classification etc.\n",
19 | "\n",
20 | "In this article, we will look into using multi libraries pre-defined stop words, third party pre-defined stop words as well as domain specific stop words. Definition of stop words (capture from wiki) will be used to demonstrate the result after removing stop words.\n",
21 | "\n",
22 | "Word tokenization and lemmatization arethe essential part for removing stop words. You may refer to this article to understand word tokenization and lemmatization.\n"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 1,
28 | "metadata": {
29 | "collapsed": true
30 | },
31 | "outputs": [],
32 | "source": [
33 | "# Capture from https://en.wikipedia.org/wiki/Stop_words\n",
34 | "\n",
35 | "article = 'In computing, stop words are words which are filtered out before or \\\n",
36 | "after processing of natural language data (text).[1] Though \"stop words\" usually \\\n",
37 | "refers to the most common words in a language, there is no single universal list of \\\n",
38 | "stop words used by all natural language processing tools, and indeed not all tools \\\n",
39 | "even use such a list. Some tools specifically avoid removing these stop words to \\\n",
40 | "support phrase search.'"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {
47 | "collapsed": true
48 | },
49 | "outputs": [],
50 | "source": [
51 | "# Catpure from https://zh.wikipedia.org/wiki/%E5%81%9C%E7%94%A8%E8%AF%8D\n",
52 | "\n",
53 | "article2 = '在信息檢索中,為節省存儲空間和提高搜索效率,在處理自然語言數據(或文本)之前或之後會自動過濾掉某些字或詞,\\\n",
54 | "這些字或詞即被稱為Stop Words(停用詞)。不要把停用詞與安全口令混淆。 這些停用詞都是人工輸入、非自動化生成的,\\\n",
55 | "生成後的停用詞會形成一個停用詞表。但是,並沒有一個明確的停用詞表能夠適用於所有的工具。\\\n",
56 | "甚至有一些工具是明確地避免使用停用詞來支持短語搜索的。'"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### spaCy"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "spaCy Version: 2.0.11\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "import spacy\n",
81 | "print('spaCy Version: %s' % (spacy.__version__))\n",
82 | "spacy_nlp = spacy.load('en_core_web_sm')"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "Check pre-defined English stop words"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 4,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "Number of stop words: 305\n",
102 | "First ten stop words: ['from', 'i', 'cannot', 'seeming', 'seemed', 'him', 'them', 'hundred', 'whoever', 'few']\n"
103 | ]
104 | }
105 | ],
106 | "source": [
107 | "spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS\n",
108 | "\n",
109 | "print('Number of stop words: %d' % len(spacy_stopwords))\n",
110 | "print('First ten stop words: %s' % list(spacy_stopwords)[:10])"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "Remove stop words"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 5,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.\n",
130 | "\n",
131 | "['In', 'computing', ',', 'stop', 'words', 'words', 'filtered', 'processing', 'natural', 'language', 'data', '(', 'text).[1', ']', 'Though', '\"', 'stop', 'words', '\"', 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'natural', 'language', 'processing', 'tools', ',', 'tools', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n"
132 | ]
133 | }
134 | ],
135 | "source": [
136 | "doc = spacy_nlp(article)\n",
137 | "tokens = [token.text for token in doc if not token.is_stop]\n",
138 | "\n",
139 | "print('Original Article: %s' % (article))\n",
140 | "print()\n",
141 | "print(tokens)"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "Add customize stop words"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 6,
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "name": "stdout",
158 | "output_type": "stream",
159 | "text": [
160 | "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.\n",
161 | "\n",
162 | "['In', ',', 'stop', 'words', 'words', 'processing', 'natural', 'language', 'data', '(', 'text).[1', ']', 'Though', '\"', 'stop', 'words', '\"', 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'natural', 'language', 'processing', 'tools', ',', 'tools', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n"
163 | ]
164 | }
165 | ],
166 | "source": [
167 | "customize_stop_words = [\n",
168 | " 'computing', 'filtered'\n",
169 | "]\n",
170 | "\n",
171 | "for w in customize_stop_words:\n",
172 | " spacy_nlp.vocab[w].is_stop = True\n",
173 | "\n",
174 | "\n",
175 | "doc = spacy_nlp(article)\n",
176 | "tokens = [token.text for token in doc if not token.is_stop]\n",
177 | "\n",
178 | "print('Original Article: %s' % (article))\n",
179 | "print()\n",
180 | "print(tokens)"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "### NLTK"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 7,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "NLTK Version: 3.2.5\n"
200 | ]
201 | }
202 | ],
203 | "source": [
204 | "import nltk \n",
205 | "print('NLTK Version: %s' % (nltk.__version__))\n",
206 | "\n",
207 | "nltk.download('stopwords')"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 8,
213 | "metadata": {},
214 | "outputs": [
215 | {
216 | "name": "stdout",
217 | "output_type": "stream",
218 | "text": [
219 | "Number of stop words: 179\n",
220 | "First ten stop words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\"]\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "nltk_stopwords = nltk.corpus.stopwords.words('english')\n",
226 | "\n",
227 | "print('Number of stop words: %d' % len(nltk_stopwords))\n",
228 | "print('First ten stop words: %s' % list(nltk_stopwords)[:10])"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "General words such as \"are\", \"the\" are removed as well. For example, \"indeed\" is removed in NLTK but not spaCy. On the other hand, \"used\" are removed in spaCy but not NLTK"
236 | ]
237 | },
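238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "A quick check of the claim above (an illustrative sketch, reusing `spacy_stopwords` and `nltk_stopwords` from the cells above) is to compare the two pre-defined lists as sets:"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "# Words that appear in only one of the two pre-defined lists\n",
252 | "spacy_only = set(spacy_stopwords) - set(nltk_stopwords)\n",
253 | "nltk_only = set(nltk_stopwords) - set(spacy_stopwords)\n",
254 | "\n",
255 | "print('Only in spaCy list (sample): %s' % sorted(spacy_only)[:10])\n",
256 | "print('Only in NLTK list (sample): %s' % sorted(nltk_only)[:10])\n",
257 | "print('indeed -> spaCy: %s, NLTK: %s' % ('indeed' in spacy_stopwords, 'indeed' in nltk_stopwords))\n",
258 | "print('used -> spaCy: %s, NLTK: %s' % ('used' in spacy_stopwords, 'used' in nltk_stopwords))"
259 | ]
260 | },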
238 | {
239 | "cell_type": "code",
240 | "execution_count": 9,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "Original Article: In computing, stop words are words which are filtered out before or after processing of natural language data (text).[1] Though \"stop words\" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.\n",
248 | "\n",
249 | "['In', 'computing', ',', 'stop', 'words', 'words', 'filtered', 'processing', 'natural', 'language', 'data', '(', 'text', ')', '.', '[', '1', ']', 'Though', '``', 'stop', 'words', \"''\", 'usually', 'refers', 'common', 'words', 'language', ',', 'single', 'universal', 'list', 'stop', 'words', 'used', 'natural', 'language', 'processing', 'tools', ',', 'indeed', 'tools', 'even', 'use', 'list', '.', 'Some', 'tools', 'specifically', 'avoid', 'removing', 'stop', 'words', 'support', 'phrase', 'search', '.']\n"
250 | ]
251 | }
252 | ],
253 | "source": [
254 | "tokens = nltk.tokenize.word_tokenize(article)\n",
255 | "tokens = [token for token in tokens if not token in nltk_stopwords]\n",
256 | "\n",
257 | "print('Original Article: %s' % (article))\n",
258 | "print()\n",
259 | "print(tokens)"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "### jieba\n",
267 | "For Chinese word, we use the similar ideas to filter out words if it is stop words."
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 10,
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "name": "stdout",
277 | "output_type": "stream",
278 | "text": [
279 | "jieba Version: 0.39\n"
280 | ]
281 | }
282 | ],
283 | "source": [
284 | "import jieba\n",
285 | "print('jieba Version: %s' % jieba.__version__)"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 11,
291 | "metadata": {
292 | "collapsed": true
293 | },
294 | "outputs": [],
295 | "source": [
296 | "# Capture from https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt\n",
297 | "\n",
298 | "jieba_stop_words = [\n",
299 | " '的', '了', '和', '是', '就', '都', '而', '及', '與', \n",
300 | " '著', '或', '一個', '沒有', '我們', '你們', '妳們', \n",
301 | " '他們', '她們', '是否'\n",
302 | "]"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "Different from English, word will not be removed if stop words belongs to part of word. For example, \"是\" is defined as stop words but \"但是\" still exist as \"但是\" is a kind of \"single word\". Therefore, word tokenization is very important for stop word removal."
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 12,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "name": "stderr",
319 | "output_type": "stream",
320 | "text": [
321 | "Building prefix dict from the default dictionary ...\n",
322 | "Loading model from cache /tmp/jieba.cache\n"
323 | ]
324 | },
325 | {
326 | "name": "stdout",
327 | "output_type": "stream",
328 | "text": [
329 | "Original Article: 在信息檢索中,為節省存儲空間和提高搜索效率,在處理自然語言數據(或文本)之前或之後會自動過濾掉某些字或詞,這些字或詞即被稱為Stop Words(停用詞)。不要把停用詞與安全口令混淆。 這些停用詞都是人工輸入、非自動化生成的,生成後的停用詞會形成一個停用詞表。但是,並沒有一個明確的停用詞表能夠適用於所有的工具。甚至有一些工具是明確地避免使用停用詞來支持短語搜索的。\n",
330 | "\n"
331 | ]
332 | },
333 | {
334 | "name": "stderr",
335 | "output_type": "stream",
336 | "text": [
337 | "Loading model cost 1.118 seconds.\n",
338 | "Prefix dict has been built succesfully.\n"
339 | ]
340 | },
341 | {
342 | "name": "stdout",
343 | "output_type": "stream",
344 | "text": [
345 | "['在', '信息', '檢索', '中', ',', '為節', '省存', '儲空間', '提高', '搜索', '效率', ',', '在', '處理', '自然', '語言數', '據', '(', '文本', ')', '之前', '之後會', '自動', '過濾', '掉', '某些', '字', '詞', ',', '這些', '字', '詞', '即', '被', '稱', '為', 'Stop', ' ', 'Words', '(', '停用', '詞', ')', '。', '不要', '把', '停用', '詞', '安全', '口令', '混淆', '。', ' ', '這些', '停用', '詞', '人工', '輸入', '、', '非自動', '化生成', ',', '生成', '後', '停用', '詞會', '形成', '停用', '詞表', '。', '但是', ',', '並沒有', '明確', '停用', '詞表能夠', '適用', '於', '所有', '工具', '。', '甚至', '有', '一些', '工具', '明確', '地', '避免', '使用', '停用', '詞來', '支持', '短語', '搜索', '。']\n"
346 | ]
347 | }
348 | ],
349 | "source": [
350 | "print('Original Article: %s' % (article2))\n",
351 | "print()\n",
352 | "words = jieba.cut(article2, cut_all=False)\n",
353 | "words = [str(word) for word in words if not str(word) in jieba_stop_words]\n",
354 | "print(words)"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {
360 | "collapsed": true
361 | },
362 | "source": [
363 | "# Conclusion"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "The procedure of removing stop words is similar across libraries so the most importance is defining your own stop words. In initial phase, pre-defined stop words can be adopted but more and more words should be added into stop word list later on. \n",
371 | "\n",
372 | "So besides, using spaCy or NLTK pre-defined stop words, we can use other words which are defined by other party such as Stanford NLP and Rank NL. You may check out the stop list from \n",
373 | "\n",
374 | "Stanford NLP: https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt\n",
375 | "\n",
376 | "Rank NL: https://www.ranks.nl/stopwords\n",
377 | "\n",
378 | "jieba: https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt"
379 | ]
380 | }
381 | ],
382 | "metadata": {
383 | "kernelspec": {
384 | "display_name": "Python 3",
385 | "language": "python",
386 | "name": "python3"
387 | },
388 | "language_info": {
389 | "codemirror_mode": {
390 | "name": "ipython",
391 | "version": 3
392 | },
393 | "file_extension": ".py",
394 | "mimetype": "text/x-python",
395 | "name": "python",
396 | "nbconvert_exporter": "python",
397 | "pygments_lexer": "ipython3",
398 | "version": "3.5.2"
399 | }
400 | },
401 | "nbformat": 4,
402 | "nbformat_minor": 2
403 | }
404 |
--------------------------------------------------------------------------------
/sample/nlp-3_basic_distance_measurement_in_text_mining.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "Photo Credit: https://pixabay.com/en/hong-kong-night-light-rail-city-2288999/"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "# 3 basic Distance Measurement in Text Mining\n",
16 | "In NLP, we also want to find the similarity among sentence or document. Text is not like number and coordination that we cannot compare the different between \"Apple\" and \"Orange\" but similarity score can be calculated."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "# Why?\n",
24 | "Since we cannot simply subtract between \"Apple is fruit\" and \"Orange is fruit\" so that we have to find a way to convert text to numeric in order to calculate it. Having the score, we can understand how similar among two objects."
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "# When?\n",
32 | "In my data science work, I tried:\n",
33 | "- Compare whether 2 article are describing same news\n",
34 | "- Identifying similar documents\n",
35 | "- Classifying the category by giving product description"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "# How?\n",
43 | "In this article, we will go through 4 basic distance measurements:\n",
44 | "Euclidean Distance\n",
45 | "Cosine Distance\n",
46 | "Jaccard Similarity\n",
47 | "\n",
48 | "Before any distance measurement, text have to be tokenzied. If you do not familiar with word tokenization, you can visit this [article](https://medium.com/@makcedward/nlp-pipeline-word-tokenization-part-1-4b2b547e6a3)."
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": [
59 | "import pandas as pd\n",
60 | "import numpy as np\n",
61 | "import nltk\n",
62 | "import sklearn"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 38,
68 | "metadata": {
69 | "collapsed": true
70 | },
71 | "outputs": [],
72 | "source": [
73 | "\"\"\"\n",
74 | " News headline get from \n",
75 | " \n",
76 | " https://www.reuters.com/article/us-musk-tunnel/elon-musks-boring-co-to-build-high-speed-airport-link-in-chicago-idUSKBN1JA224\n",
77 | " http://money.cnn.com/2018/06/14/technology/elon-musk-boring-company-chicago/index.html\n",
78 | " https://www.theverge.com/2018/6/13/17462496/elon-musk-boring-company-approved-tunnel-chicago\n",
79 | "\n",
80 | "\"\"\"\n",
81 | "\n",
82 | "news_headline1 = \"Elon Musk's Boring Co to build high-speed airport link in Chicago\"\n",
83 | "news_headline2 = \"Elon Musk's Boring Company to build high-speed Chicago airport link\"\n",
84 | "news_headline3 = \"Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\"\n",
85 | "news_headline4 = \"Both apple and orange are fruit\"\n",
86 | "\n",
87 | "news_headlines = [news_headline1, news_headline2, news_headline3, news_headline4]"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "# Preprocessing"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "Tokenize headline to list of words"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 5,
107 | "metadata": {},
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "First 7 tokens from news headlines: ['Elon', 'Musk', \"'s\", 'Boring', 'Co', 'to', 'build']\n",
114 | "First 7 tokens from news headlines: ['Elon', 'Musk', \"'s\", 'Boring', 'Company', 'to', 'build']\n",
115 | "First 7 tokens from news headlines: ['Elon', 'Musk', '’', 's', 'Boring', 'Company', 'approved']\n",
116 | "First 7 tokens from news headlines: ['Both', 'apple', 'and', 'orange', 'are', 'fruit']\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "news_headline1_tokens = nltk.word_tokenize(news_headline1)\n",
122 | "news_headline2_tokens = nltk.word_tokenize(news_headline2)\n",
123 | "news_headline3_tokens = nltk.word_tokenize(news_headline3)\n",
124 | "news_headline4_tokens = nltk.word_tokenize(news_headline4)\n",
125 | "\n",
126 | "for words in [news_headline1_tokens, news_headline2_tokens, news_headline3_tokens, news_headline4_tokens]:\n",
127 | " print('First 7 tokens from news headlines: ', words[:7])"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 7,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "name": "stdout",
137 | "output_type": "stream",
138 | "text": [
139 | "\n",
140 | "All Tokens:\n",
141 | "['Elon', 'Musk', \"'s\", 'Boring', 'Co', 'to', 'build', 'high-speed', 'airport', 'link', 'in', 'Chicago', 'Elon', 'Musk', \"'s\", 'Boring', 'Company', 'to', 'build', 'high-speed', 'Chicago', 'airport', 'link', 'Elon', 'Musk', '’', 's', 'Boring', 'Company', 'approved', 'to', 'build', 'high-speed', 'transit', 'between', 'downtown', 'Chicago', 'and', 'O', '’', 'Hare', 'Airport', 'Both', 'apple', 'and', 'orange', 'are', 'fruit']\n",
142 | "\n",
143 | "Original Input: ['Elon', 'Musk', \"'s\", 'Boring', 'Co', 'to', 'build', 'high-speed', 'airport', 'link', 'in', 'Chicago']\n",
144 | "Encoded by Label Encoder: [ 7 9 0 2 5 25 17 20 11 22 21 4]\n",
145 | "Encoded by OneHot Encoder:\n",
146 | " (0, 7)\t1.0\n",
147 | " (1, 9)\t1.0\n",
148 | " (2, 0)\t1.0\n",
149 | " (3, 2)\t1.0\n",
150 | " (4, 5)\t1.0\n",
151 | " (5, 25)\t1.0\n",
152 | " (6, 17)\t1.0\n",
153 | " (7, 20)\t1.0\n",
154 | " (8, 11)\t1.0\n",
155 | " (9, 22)\t1.0\n",
156 | " (10, 21)\t1.0\n",
157 | " (11, 4)\t1.0\n",
158 | "\n",
159 | "Original Input: ['Elon', 'Musk', \"'s\", 'Boring', 'Company', 'to', 'build', 'high-speed', 'Chicago', 'airport', 'link']\n",
160 | "Encoded by Label Encoder: [ 7 9 0 2 6 25 17 20 4 11 22]\n",
161 | "Encoded by OneHot Encoder:\n",
162 | " (0, 7)\t1.0\n",
163 | " (1, 9)\t1.0\n",
164 | " (2, 0)\t1.0\n",
165 | " (3, 2)\t1.0\n",
166 | " (4, 6)\t1.0\n",
167 | " (5, 25)\t1.0\n",
168 | " (6, 17)\t1.0\n",
169 | " (7, 20)\t1.0\n",
170 | " (8, 4)\t1.0\n",
171 | " (9, 11)\t1.0\n",
172 | " (10, 22)\t1.0\n",
173 | "\n",
174 | "Original Input: ['Elon', 'Musk', '’', 's', 'Boring', 'Company', 'approved', 'to', 'build', 'high-speed', 'transit', 'between', 'downtown', 'Chicago', 'and', 'O', '’', 'Hare', 'Airport']\n",
175 | "Encoded by Label Encoder: [ 7 9 27 24 2 6 14 25 17 20 26 16 18 4 12 10 27 8 1]\n",
176 | "Encoded by OneHot Encoder:\n",
177 | " (0, 7)\t1.0\n",
178 | " (1, 9)\t1.0\n",
179 | " (2, 27)\t1.0\n",
180 | " (3, 24)\t1.0\n",
181 | " (4, 2)\t1.0\n",
182 | " (5, 6)\t1.0\n",
183 | " (6, 14)\t1.0\n",
184 | " (7, 25)\t1.0\n",
185 | " (8, 17)\t1.0\n",
186 | " (9, 20)\t1.0\n",
187 | " (10, 26)\t1.0\n",
188 | " (11, 16)\t1.0\n",
189 | " (12, 18)\t1.0\n",
190 | " (13, 4)\t1.0\n",
191 | " (14, 12)\t1.0\n",
192 | " (15, 10)\t1.0\n",
193 | " (16, 27)\t1.0\n",
194 | " (17, 8)\t1.0\n",
195 | " (18, 1)\t1.0\n",
196 | "\n",
197 | "Original Input: ['Both', 'apple', 'and', 'orange', 'are', 'fruit']\n",
198 | "Encoded by Label Encoder: [ 3 13 12 23 15 19]\n",
199 | "Encoded by OneHot Encoder:\n",
200 | " (0, 3)\t1.0\n",
201 | " (1, 13)\t1.0\n",
202 | " (2, 12)\t1.0\n",
203 | " (3, 23)\t1.0\n",
204 | " (4, 15)\t1.0\n",
205 | " (5, 19)\t1.0\n"
206 | ]
207 | }
208 | ],
209 | "source": [
210 | "from numpy import argmax\n",
211 | "\n",
212 | "def transform(headlines):\n",
213 | " tokens = [w for s in headlines for w in s ]\n",
214 | " print()\n",
215 | " print('All Tokens:')\n",
216 | " print(tokens)\n",
217 | "\n",
218 | " results = []\n",
219 | " label_enc = sklearn.preprocessing.LabelEncoder()\n",
220 | " onehot_enc = sklearn.preprocessing.OneHotEncoder()\n",
221 | " \n",
222 | " encoded_all_tokens = label_enc.fit_transform(list(set(tokens)))\n",
223 | " encoded_all_tokens = encoded_all_tokens.reshape(len(encoded_all_tokens), 1)\n",
224 | " \n",
225 | " onehot_enc.fit(encoded_all_tokens)\n",
226 | " \n",
227 | " for headline_tokens in headlines:\n",
228 | " print()\n",
229 | " print('Original Input:', headline_tokens)\n",
230 | " \n",
231 | " encoded_words = label_enc.transform(headline_tokens)\n",
232 | " print('Encoded by Label Encoder:', encoded_words)\n",
233 | " \n",
234 | " encoded_words = onehot_enc.transform(encoded_words.reshape(len(encoded_words), 1))\n",
235 | " print('Encoded by OneHot Encoder:')\n",
236 | " print(encoded_words)\n",
237 | "\n",
238 | " results.append(np.sum(encoded_words.toarray(), axis=0))\n",
239 | " \n",
240 | " return results\n",
241 | "\n",
242 | "transformed_results = transform([\n",
243 | " news_headline1_tokens, news_headline2_tokens, news_headline3_tokens, news_headline4_tokens])"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "### Euclidean Distance\n",
251 | "\n",
252 | "\n",
253 | "\n",
254 | "Photo Credit: http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "Comparing the shortest distance among two objects. It uses Pythagorean Theorem which learnt from secondary school.\n",
262 | "\n",
263 | "Score means the distance between two objects. If it is 0, it means that both objects are identical. The following example shows score when comparing the first sentence."
264 | ]
265 | },
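266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "For reference, the Euclidean distance between two vectors $p$ and $q$ is $d(p, q) = \\sqrt{\\sum_{i=1}^{n} (p_i - q_i)^2}$."
271 | ]
272 | },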
266 | {
267 | "cell_type": "code",
268 | "execution_count": 49,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "Master Sentence: Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
276 | "-----\n",
277 | "Score: 0.00, Comparing Sentence: Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
278 | "-----\n",
279 | "Score: 1.73, Comparing Sentence: Elon Musk's Boring Company to build high-speed Chicago airport link\n",
280 | "-----\n",
281 | "Score: 4.36, Comparing Sentence: Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\n",
282 | "-----\n",
283 | "Score: 4.24, Comparing Sentence: Both apple and orange are fruit\n"
284 | ]
285 | }
286 | ],
287 | "source": [
288 | "print('Master Sentence: %s' % news_headlines[0])\n",
289 | "for i, news_headline in enumerate(news_headlines):\n",
290 | " score = sklearn.metrics.pairwise.euclidean_distances([transformed_results[i]], [transformed_results[0]])[0][0]\n",
291 | " print('-----')\n",
292 | " print('Score: %.2f, Comparing Sentence: %s' % (score, news_headline))"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "### Cosine Similarity\n",
300 | "\n",
301 | "\n",
302 | "\n",
303 | "Photo Credit: http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "Determine the angle between two objects is the calculation method to the find similarity. The range of score is 0 to 1. If score is 1, it means that they are same in orientation (not magnitude). The following example shows score when comparing the first sentence."
311 | ]
312 | },
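313 | {
314 | "cell_type": "markdown",
315 | "metadata": {},
316 | "source": [
317 | "For reference, the cosine similarity between two vectors $p$ and $q$ is $\\cos(\\theta) = \\frac{p \\cdot q}{\\|p\\| \\, \\|q\\|}$."
318 | ]
319 | },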
313 | {
314 | "cell_type": "code",
315 | "execution_count": 52,
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "name": "stdout",
320 | "output_type": "stream",
321 | "text": [
322 | "Master Sentence: Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
323 | "-----\n",
324 | "Score: 1.00, Comparing Sentence: Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
325 | "-----\n",
326 | "Score: 0.87, Comparing Sentence: Elon Musk's Boring Company to build high-speed Chicago airport link\n",
327 | "-----\n",
328 | "Score: 0.44, Comparing Sentence: Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\n",
329 | "-----\n",
330 | "Score: 0.00, Comparing Sentence: Both apple and orange are fruit\n"
331 | ]
332 | }
333 | ],
334 | "source": [
335 | "print('Master Sentence: %s' % news_headlines[0])\n",
336 | "for i, news_headline in enumerate(news_headlines):\n",
337 | " score = sklearn.metrics.pairwise.cosine_similarity([transfaormed_results[i]], [transformed_results[0]])[0][0]\n",
338 | " print('-----')\n",
339 | " print('Score: %.2f, Comparing Sentence: %s' % (score, news_headline))"
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {},
345 | "source": [
346 | "### Jaccard Similarity\n",
347 | "\n",
348 | "Photo Credit: http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/\n",
349 | "\n",
350 | "The measurement is refer to number of common words over all words. More commons mean both objects should be similarity.\n",
351 | "Jaccard Similarity = (Intersection of A and B) / (Union of A and B)\n",
352 | "The range is 0 to 1. If score is 1, it means that they are identical. There is no any common word between the first sentence and the last sentence so the score is 0. The following example shows score when comparing the first sentence."
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 55,
358 | "metadata": {},
359 | "outputs": [
360 | {
361 | "name": "stdout",
362 | "output_type": "stream",
363 | "text": [
364 | "Master Sentence: Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
365 | "-----\n",
366 | "Score: 1.00, Comparing Sentence: Elon Musk's Boring Co to build high-speed airport link in Chicago\n",
367 | "-----\n",
368 | "Score: 0.67, Comparing Sentence: Elon Musk's Boring Company to build high-speed Chicago airport link\n",
369 | "-----\n",
370 | "Score: 0.17, Comparing Sentence: Elon Musk’s Boring Company approved to build high-speed transit between downtown Chicago and O’Hare Airport\n",
371 | "-----\n",
372 | "Score: 0.00, Comparing Sentence: Both apple and orange are fruit\n"
373 | ]
374 | }
375 | ],
376 | "source": [
377 | "\"\"\"\n",
378 | " Finding the posistion (from lookup table) of word instead of using 1 or 0\n",
379 | " to prevent misleading of the meaning of \"common\" word\n",
380 | "\"\"\"\n",
381 | "\n",
382 | "def calculate_position(values):\n",
383 | " x = []\n",
384 | " for pos, matrix in enumerate(values):\n",
385 | " if matrix > 0:\n",
386 | " x.append(pos)\n",
387 | " return x\n",
388 | "\n",
389 | "\"\"\"\n",
390 | " Since scikit-learn can only compare same number of dimension of input. \n",
391 | " Add padding to the shortest sentence.\n",
392 | "\"\"\"\n",
393 | "def padding(sentence1, sentence2):\n",
394 | " x1 = sentence1.copy()\n",
395 | " x2 = sentence2.copy()\n",
396 | " \n",
397 | " diff = len(x1) - len(x2)\n",
398 | " \n",
399 | " if diff > 0:\n",
400 | " for i in range(0, diff):\n",
401 | " x2.append(-1)\n",
402 | " elif diff < 0:\n",
403 | " for i in range(0, abs(diff)):\n",
404 | " x1.append(-1)\n",
405 | " \n",
406 | " return x1, x2 \n",
407 | "\n",
408 | "y_actual = calculate_position(transformed_results[0])\n",
409 | "\n",
410 | "print('Master Sentence: %s' % news_headlines[0])\n",
411 | "for i, news_headline in enumerate(news_headlines):\n",
412 | " y_compare = calculate_position(transformed_results[i])\n",
413 | " x1, x2 = padding(y_actual, y_compare)\n",
414 | " score = sklearn.metrics.jaccard_similarity_score(x1, x2)\n",
415 | " print('-----')\n",
416 | " print('Score: %.2f, Comparing Sentence: %s' % (score, news_headline))\n",
417 | " "
418 | ]
419 | },
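420 | {
421 | "cell_type": "markdown",
422 | "metadata": {},
423 | "source": [
424 | "As an alternative to the padding trick above, here is a minimal sketch (not from the original article) that computes Jaccard similarity directly on token sets, following the intersection-over-union definition. It reuses `news_headlines`, `news_headline1_tokens` and `nltk` from the cells above."
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "def jaccard_similarity(tokens1, tokens2):\n",
434 | "    # Intersection over union of the two token sets\n",
435 | "    set1, set2 = set(tokens1), set(tokens2)\n",
436 | "    return len(set1 & set2) / len(set1 | set2)\n",
437 | "\n",
438 | "print('Master Sentence: %s' % news_headlines[0])\n",
439 | "for news_headline in news_headlines:\n",
440 | "    score = jaccard_similarity(news_headline1_tokens, nltk.word_tokenize(news_headline))\n",
441 | "    print('-----')\n",
442 | "    print('Score: %.2f, Comparing Sentence: %s' % (score, news_headline))"
443 | ]
444 | },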
420 | {
421 | "cell_type": "markdown",
422 | "metadata": {
423 | "collapsed": true
424 | },
425 | "source": [
426 | "# Conclusion\n",
427 | "Three methods also have same assumption which is the document (or sentence) are \n",
428 | "__similar if having common words\n",
429 | "__. This idea is very straight forward and simple. It fits some basic cases such as comparing first 2 sentence. However, the score is relative low by comparing first sentence and third sentence although both of them describe same news. \n",
430 | "\n",
431 | "Another limitation is that above methods __does not handle synonym scenario__. For example buy and purchase, it should have same meaning (in some cases) but above methods will treat both words are difference. \n",
432 | "\n",
433 | "So what is the cue? You may consider to use Word Embedding which introduced by Tomas Mikolov in 2013."
434 | ]
435 | }
436 | ],
437 | "metadata": {
438 | "kernelspec": {
439 | "display_name": "Python 3",
440 | "language": "python",
441 | "name": "python3"
442 | },
443 | "language_info": {
444 | "codemirror_mode": {
445 | "name": "ipython",
446 | "version": 3
447 | },
448 | "file_extension": ".py",
449 | "mimetype": "text/x-python",
450 | "name": "python",
451 | "nbconvert_exporter": "python",
452 | "pygments_lexer": "ipython3",
453 | "version": "3.5.2"
454 | }
455 | },
456 | "nbformat": 4,
457 | "nbformat_minor": 2
458 | }
459 |
--------------------------------------------------------------------------------