├── examples
│   ├── README.md
│   ├── keras classifier usage examples.ipynb
│   └── word2vec usage examples.ipynb
├── train
│   ├── build_w2v_dict.py
│   ├── learn.py
│   └── clean.py
├── .gitignore
├── split_to_chunks
│   ├── subsample.py
│   ├── csv_to_pd_chunks.py
│   ├── txt_to_sentences.py
│   ├── pd_to_sentences.py
│   └── wiki_to_sentences.py
├── predict
│   ├── save_best.py
│   ├── build_joined_vect_dict.py
│   └── predict.py
├── README.md
└── utils.py
/examples/README.md:
--------------------------------------------------------------------------------
1 | ### Usage examples for the trained models
2 | [word2vec usage examples](word2vec usage examples.ipynb) - using the `word2vec` model for simple vector arithmetic
3 | [keras classifier usage examples](keras classifier usage examples.ipynb) - using the trained neural network for tone classification
--------------------------------------------------------------------------------
/train/build_w2v_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | builds a vocabulary for word2vec and saves it
5 | """
6 | import sys
7 | sys.path.append('../')
8 | 
9 | import gensim, logging
10 | 
11 | from utils import *
12 | from colorama import Fore, Back, Style
13 | 
14 | log('starting')
15 | 
16 | # create log directory
17 | log_root_folder = create_log_folder(__file__)
18 | 
19 | # setup logging to write to current log folder
20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
21 |                     filename=log_root_folder + 'word2vec.log')
22 | 
23 | # create empty model
24 | model = gensim.models.Word2Vec(size=200)
25 | 
26 | log('building vocabulary')
27 | model.build_vocab(get_all_sentences())  # sentence iterator defined in utils.py
28 | 
29 | log('saving model')
30 | model.save(trained_folder + 'word2vec_200_dmt.model')
31 | 
32 | log('done', Fore.GREEN)
33 | 
--------------------------------------------------------------------------------
/train/learn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | loads the model with the vocabulary saved by build_w2v_dict.py,
5 | trains it on the sentences and saves it
6 | """
7 | import sys
8 | sys.path.append('../')
9 | 
10 | import gensim, logging
11 | from utils import *
12 | from colorama import Fore, Back, Style
13 | 
14 | log('starting')
15 | 
16 | # create log directory
17 | log_root_folder = create_log_folder(__file__)
18 | 
19 | # setup logging to write to current log folder
20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
21 |                     filename=log_root_folder + 'word2vec.log')
22 | 
23 | # load the model with the vocabulary from file
24 | model = gensim.models.Word2Vec.load(trained_folder + 'big_word2vec_200.model')
25 | 
26 | log('training')
27 | model.train(get_all_sentences())  # sentence iterator defined in utils.py
28 | 
29 | log('saving model')
30 | model.save(trained_folder + 'big_word2vec_trained_200.model')
31 | 
32 | log('done', Fore.GREEN)
33 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # IDE folders
10 | .idea/
11 | 
12 | # Test files
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | 
*.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | 60 | # Sphinx documentation 61 | docs/_build/ 62 | 63 | # PyBuilder 64 | target/ 65 | 66 | #Ipython Notebook 67 | .ipynb_checkpoints 68 | 69 | snapshots 70 | *.swp 71 | -------------------------------------------------------------------------------- /split_to_chunks/subsample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: cp1251 -*- 2 | 3 | import sys 4 | sys.path.append('../') 5 | from utils import * 6 | from colorama import Fore, Back, Style 7 | 8 | chunk_size = 5000 9 | 10 | files = [ 11 | '../raw src data/wiki_sent_tok.txt', 12 | # '../raw src data/up.csv', 13 | # '../raw src data/zaxid.csv', 14 | # '../raw src data/dt.all.txt', 15 | # '../raw src data/um.txt', 16 | # '../raw src data/vz.txt', 17 | # '../raw src data/korr.csv', 18 | # '../raw src data/unian.csv', 19 | ] 20 | 21 | log('starting') 22 | 23 | for src_file in files: 24 | log('reading src file ' + src_file) 25 | 26 | content = '' 27 | lines_read = 0 28 | with open(src_file, 'r') as f: 29 | for line in f: 30 | content += line 31 | lines_read += 1 32 | 33 | if lines_read == chunk_size: 34 | break 35 | 36 | log('writing sample to file') 37 | with open(subsets_folder + os.path.basename(src_file), 'w') as f: 38 | f.write(content) 39 | 40 | log('writing done', Fore.GREEN) 41 | 42 | log('done', Fore.GREEN) 43 | -------------------------------------------------------------------------------- /predict/save_best.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append('..') 4 | 5 | # -*- coding: utf-8 -*- 6 | import numpy as np 7 | import pandas as pd 8 | import codecs 9 | from sklearn.utils import shuffle 10 | import msgpack 11 | from utils import * 12 | 13 | log('loading index2word file') 14 | with open(result_folder + 'index2word', 'rb') as f: 15 | index2word = np.array(msgpack.unpack(f, encoding='utf-8')) 16 | 17 | log('read original tsv file') 18 | tsv = pd.read_csv(data_folder + 'orig-tone-dict-v2.tsv', sep='\t', header=None, names=['word', 'tone', '_', '_1']) 19 | tones = dict(zip(tsv.word, tsv.tone)) 20 | 21 | log('loading predictions') 22 | preds = np.load(result_folder + 'predict/preds-all.npy') 23 | dic = [] 24 | for i in range(0, len(preds)): 25 | w = index2word[i] 26 | if w not in tones: 27 | dic.append([index2word[i], preds[i][0]]) 28 | 29 | log('sorting') 30 | dic = np.array(sorted(dic, key=lambda l: l[1], reverse=True)) 31 | 32 | # top positives and negatives 33 | pos = dic[:2000, :] 34 | neg = dic[-2000:, :][::-1] 35 | 36 | with codecs.open(result_folder + 'predict/top_pos_N.txt', "w", "utf-8") as stream: 37 | for i in range(0, len(pos)): 38 | stream.write(pos[i][0] + '\t' + str(pos[i][1]) + u"\n") 39 | 40 | with codecs.open(result_folder + 'predict/top_neg_N.txt', "w", "utf-8") as stream: 41 | for i in range(0, len(neg)): 42 | stream.write(neg[i][0] + '\t' + str(neg[i][1]) + u"\n") 43 | 44 | print('done') 45 | 
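# Illustrative follow-up (not part of the original script): the two files written above are plain
# tab-separated "word<TAB>score" lists, so the candidate words can be reviewed manually, e.g. with pandas:
#
#     import pandas as pd
#     top_pos = pd.read_csv(result_folder + 'predict/top_pos_N.txt', sep='\t',
#                           header=None, names=['word', 'score'])
#     print(top_pos.head(20))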
-------------------------------------------------------------------------------- /split_to_chunks/csv_to_pd_chunks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: cp1251 -*- 2 | """ 3 | split big files to smaller chunks 4 | each chunk is csv 5 | """ 6 | import sys 7 | sys.path.append('../') 8 | import pandas as pd 9 | import math 10 | from utils import * 11 | from colorama import Fore, Back, Style 12 | 13 | chunk_size = int(1e5) # 100 000 14 | chunks_saved = 0 15 | files = [{ 16 | 'url': raw_data_folder + 'ukrlib.csv', 17 | 'cols': [1, 0] 18 | }, { 19 | 'url': raw_data_folder + 'up.csv', 20 | 'cols': [6, 4] 21 | }, { 22 | 'url': raw_data_folder + 'zaxid.csv', 23 | 'cols': [5, 2] 24 | }, { 25 | 'url': raw_data_folder + 'korr.csv', 26 | 'cols': [1, 2] 27 | }, { 28 | 'url': raw_data_folder + 'unian.csv', 29 | 'cols': [5, 4] 30 | }] 31 | 32 | log('starting') 33 | 34 | for src_file in files: 35 | log('reading src file ' + src_file['url']) 36 | data = pd.read_csv(src_file['url']) 37 | chunks_number = int(math.ceil(data.shape[0] / float(chunk_size))) 38 | 39 | log('chunks to save ' + str(chunks_number)) 40 | 41 | for i in range(0, chunks_number): 42 | start_time = time.time() 43 | chunk = data.iloc[i * chunk_size: (i + 1) * chunk_size, src_file['cols']] 44 | 45 | log('saving chunk ' + str(i)) 46 | 47 | chunk.to_csv(data_folder + 'chunks/chunk_{}.csv'.format(chunks_saved)) 48 | total_time = time.time() - start_time 49 | 50 | log('saving completed {}, running time {}'.format(chunks_saved, total_time), Fore.GREEN) 51 | 52 | chunks_saved += 1 53 | 54 | log('done', Fore.GREEN) 55 | -------------------------------------------------------------------------------- /split_to_chunks/txt_to_sentences.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | load entire txt file, tokenize it and save lists of sentences in chunks 5 | """ 6 | import sys 7 | sys.path.append('../') 8 | 9 | import math 10 | 11 | from utils import * 12 | import tokenize_uk 13 | from colorama import Fore, Back, Style 14 | 15 | items_processed = 0 16 | chunks_folder = data_folder + 'chunks/' 17 | sents_folder = data_folder + 'sents/' 18 | sents_per_chunk = int(7e5) # 700 000 19 | log_interval = 10000 20 | 21 | files = [raw_data_folder + 'ukr_lit.txt', raw_data_folder + 'td.txt'] 22 | 23 | log('starting') 24 | 25 | for src_file in files: 26 | with open(src_file, 'rb') as f: 27 | data = f.read() 28 | 29 | log('processing file ' + src_file) 30 | 31 | text = data.decode('utf-8') 32 | tokens_text = tokenize_uk.tokenize_sents(text) 33 | 34 | log('tokenization finished') 35 | 36 | sents_number = int(math.ceil(len(tokens_text) / float(sents_per_chunk))) 37 | 38 | for i in range(0, sents_number): 39 | sentences = [] 40 | chunk = tokens_text[i * sents_per_chunk: (i + 1) * sents_per_chunk] 41 | 42 | for sentence in chunk: 43 | sentences.append(tokenize_uk.tokenize_words(sentence)) 44 | 45 | if items_processed % log_interval == 0: 46 | log('items processed {}'.format(items_processed)) 47 | 48 | items_processed += 1 49 | 50 | result_file = os.path.basename(src_file) + str(i) + '.msg' 51 | with open(sents_folder + result_file, 'wb') as f: 52 | msgpack.pack(sentences, f) 53 | 54 | log('file {} saved'.format(result_file)) 55 | 56 | log('done', Fore.GREEN) 57 | -------------------------------------------------------------------------------- /split_to_chunks/pd_to_sentences.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | load chunks one by one and save to chunks with array of sentences each sentences is an array of words 5 | in simple words, the goal this script is a tokanization of text in chunks 6 | """ 7 | import sys 8 | sys.path.append('../') 9 | 10 | import pandas as pd 11 | from utils import * 12 | import tokenize_uk 13 | from colorama import Fore, Back, Style 14 | 15 | items_processed = 0 16 | chunks_folder = data_folder + 'chunks/' 17 | sents_folder = data_folder + 'sents/' 18 | log_interval = 10000 19 | 20 | log('starting') 21 | 22 | start_time = time.time() 23 | 24 | for chunk_file in os.listdir(chunks_folder): 25 | data = pd.read_csv(chunks_folder + chunk_file) 26 | sentences = [] 27 | 28 | for row in data.itertuples(): 29 | title = row[3].decode('utf-8') if isinstance(row[3], basestring) else '' 30 | text = row[2].decode('utf-8') if isinstance(row[2], basestring) else '' 31 | 32 | tokens_title = tokenize_uk.tokenize_sents(title) 33 | tokens_text = tokenize_uk.tokenize_sents(text) 34 | 35 | for sentence in tokens_title + tokens_text: 36 | sentences.append(tokenize_uk.tokenize_words(sentence)) 37 | 38 | total_time = time.time() - start_time 39 | 40 | if items_processed % log_interval == 0: 41 | log('items processed {}, running time {}'.format(items_processed, total_time)) 42 | 43 | items_processed += 1 44 | 45 | log('saving chunk ') 46 | 47 | result_file = os.path.splitext(chunk_file)[0] + '.msg' 48 | with open(sents_folder + result_file, 'wb') as f: 49 | msgpack.pack(sentences, f) 50 | 51 | log('done', Fore.GREEN) 52 | -------------------------------------------------------------------------------- /split_to_chunks/wiki_to_sentences.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | in wiki file each line is a sentence, so we can read and tokanize line by line 5 | saves lists of sentences in chunks 6 | """ 7 | import sys 8 | sys.path.append('../') 9 | from itertools import islice 10 | 11 | from utils import * 12 | import tokenize_uk 13 | from colorama import Fore, Back, Style 14 | 15 | items_processed = 0 16 | files_saved = 0 17 | lines_per_chunk = int(7e5) # 700 000 18 | log_interval = 10000 19 | 20 | files = [raw_data_folder + 'wiki_sent_tok.txt'] 21 | 22 | log('starting') 23 | 24 | for src_file in files: 25 | log('processing file ' + src_file) 26 | 27 | with open(src_file, 'rb') as f: 28 | while True: 29 | next_n_lines = list(islice(f, lines_per_chunk)) 30 | if not next_n_lines: 31 | break 32 | 33 | text = ''.join(next_n_lines).decode('utf-8') 34 | 35 | tokens_text = tokenize_uk.tokenize_sents(text) 36 | 37 | log('tokenization finished') 38 | 39 | sentences = [] 40 | 41 | for sentence in tokens_text: 42 | sentences.append(tokenize_uk.tokenize_words(sentence)) 43 | 44 | if items_processed % log_interval == 0: 45 | log('items processed {}'.format(items_processed)) 46 | 47 | items_processed += 1 48 | 49 | result_file = os.path.basename(src_file) + str(files_saved) + '.msg' 50 | with open(sents_folder + result_file, 'wb') as s_f: 51 | msgpack.pack(sentences, s_f) 52 | 53 | log('file {} saved'.format(result_file)) 54 | 55 | files_saved += 1 56 | 57 | log('done', Fore.GREEN) 58 | -------------------------------------------------------------------------------- /train/clean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 
clean data by removing special non alphabetic characters and converting everything to lowercase 5 | """ 6 | import sys 7 | sys.path.append('../') 8 | 9 | import re 10 | 11 | from utils import * 12 | from colorama import Fore, Back, Style 13 | 14 | 15 | items_processed = 0 16 | clean_folder = data_folder + 'clean/' 17 | log_interval = 100000 18 | 19 | log('starting') 20 | 21 | start_time = time.time() 22 | 23 | non_char_re = re.compile('\W', re.UNICODE) 24 | 25 | 26 | def convert_number(num): 27 | try: 28 | inum = int(num) 29 | 30 | if inum < 10: 31 | return str(inum) 32 | else: 33 | return "0" * min(len(num), 5) 34 | 35 | except ValueError: 36 | return num 37 | 38 | for file_name in os.listdir(sents_folder): 39 | with open(sents_folder + file_name, 'rb') as f: 40 | sentences = msgpack.unpack(f) 41 | 42 | for i, sentence in enumerate(sentences): 43 | for j, word in enumerate(sentence): 44 | word = word.decode('utf-8') 45 | sentence[j] = None if non_char_re.search(word) else convert_number(word.lower()) 46 | 47 | sentences[i] = get_not_none(sentence) 48 | 49 | if len(sentences[i]) == 0: 50 | sentences[i] = None 51 | 52 | total_time = time.time() - start_time 53 | 54 | if items_processed % log_interval == 0: 55 | log('items processed {} and it took {}'.format(items_processed, total_time)) 56 | 57 | items_processed += 1 58 | 59 | sentences = get_not_none(sentences) 60 | 61 | log('saving chunk ' + file_name) 62 | 63 | with open(clean_folder + file_name, 'wb') as f: 64 | msgpack.pack(sentences, f) 65 | 66 | log('chunk ' + file_name + ' saved', Fore.GREEN) 67 | 68 | log('done', Fore.GREEN) 69 | -------------------------------------------------------------------------------- /predict/build_joined_vect_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | requires trained lexvec with word2vec models placed to ../data/result/ folder 5 | it will join them and save syn0, index2word and joined_dict files 6 | you can find download links for models here: https://github.com/lang-uk/tonal-model/#used-word-embeddings-models 7 | """ 8 | 9 | import sys 10 | import os 11 | 12 | import numpy as np 13 | from gensim.models import Word2Vec 14 | 15 | currentDirectory = os.path.dirname(__file__) 16 | sys.path.append(os.path.join(currentDirectory, '../')) # This will get you to source 17 | 18 | from utils import * 19 | 20 | log('loading lexVecModel') 21 | lexVecModel = Word2Vec.load_word2vec_format(result_folder + 'lexvec', binary=False) 22 | lexVecModel.init_sims(replace=True) 23 | log('done') 24 | 25 | log('loading word2vec') 26 | word2vecModel = Word2Vec.load_word2vec_format(result_folder + 'word2vec', binary=False) 27 | word2vecModel.init_sims(replace=True) 28 | log('done') 29 | 30 | # maps word to vector 31 | joined_dict = {} 32 | 33 | # 2d array of vectors 34 | syn0 = [] 35 | 36 | # 1d array of words 37 | index2word = [] 38 | 39 | log_interval = int(10e3) 40 | processed_words = 0 41 | 42 | for word in lexVecModel.vocab: 43 | if word not in word2vecModel.vocab: 44 | continue 45 | 46 | joined_vec = np.concatenate((lexVecModel[word], word2vecModel[word])).tolist() 47 | joined_dict[word] = joined_vec 48 | index2word.append(word) 49 | syn0.append(joined_vec) 50 | 51 | processed_words += 1 52 | 53 | if not processed_words % log_interval: 54 | log('words processed {:5d} out of {:5d}'.format(processed_words, len(lexVecModel.vocab))) 55 | 56 | print('saving') 57 | 58 | with open(result_folder + 'joined_dict', 'wb') as f: 59 | 
    msgpack.pack(joined_dict, f)
60 | 
61 | with open(result_folder + 'index2word', 'wb') as f:
62 |     msgpack.pack(index2word, f)
63 | 
64 | with open(result_folder + 'syn0', 'wb') as f:
65 |     msgpack.pack(syn0, f)
66 | 
67 | print('done')
68 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Goal
2 | The main goal of the project is to extend the [Ukrainian tonal dictionary](https://github.com/lang-uk/tone-dict-uk). At first we tried to achieve this by looking at words similar to those with already known tonality, using **word2vec** and **LexVec** models to find the similar words. We then built a neural-network classifier and trained it on the word embeddings and the existing tonal dictionary.
3 | 
4 | ### General
5 | `split_to_chunks/subsample.py` - takes a small sample of each source file so it can be opened in a plain text editor
6 | `utils.py` - holds shared helper functions and folder paths
7 | 
8 | ### Split to sentences:
9 | We have different sources of text: csv, txt and a wiki dump. A separate script preprocesses each of them.
10 | 
11 | ##### for csv: (each item is a news item or an article)
12 | 
13 | 1. Split the raw csv data into chunks and save them: `split_to_chunks/csv_to_pd_chunks.py`
14 | Resulting items are saved in `data\chunks`
15 | 2. Read the chunk items, tokenize the text and save lists of sentences: `split_to_chunks/pd_to_sentences.py`
16 | Resulting items are saved in `data\sents`
17 | 
18 | ##### for txt:
19 | run `split_to_chunks/txt_to_sentences.py` to tokenize the text and save chunks with lists of sentences
20 | Results are saved in `data\sents`
21 | 
22 | ##### for wiki
23 | run `split_to_chunks/wiki_to_sentences.py` to tokenize the text and save chunks with lists of sentences
24 | 
25 | ### Cleanup words:
26 | `train/clean.py` - reads all existing sentence files and cleans the words (lowercasing, dropping tokens with non-alphabetic characters, normalizing numbers)
27 | 
28 | ### Building word2vec model
29 | `train/build_w2v_dict.py` - builds the **word2vec** vocabulary
30 | `train/learn.py` - trains the **word2vec** model (see the sketch below)
31 | 
32 | ### Building LexVec model
33 | [LexVec](https://github.com/alexandres/lexvec) was used on the same data with settings identical to Word2Vec to calculate the embeddings.
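The two word2vec steps above boil down to the following minimal sketch. It is illustrative only: it assumes the older gensim API used throughout this repo (where `train()` is called with just the sentence iterator) and the folder layout from `utils.py`; the model file names here are placeholders.

```python
import gensim
from utils import get_all_sentences, trained_folder

sentences = get_all_sentences()           # iterator over the tokenized sentence chunks

# build_w2v_dict.py: create an empty 200-dimensional model and collect the vocabulary
model = gensim.models.Word2Vec(size=200)
model.build_vocab(sentences)
model.save(trained_folder + 'word2vec_200.model')   # placeholder file name

# learn.py: reload the model with its vocabulary, train on the same sentences and save
model = gensim.models.Word2Vec.load(trained_folder + 'word2vec_200.model')
model.train(sentences)                    # old gensim API: no epochs/total_examples arguments
model.save(trained_folder + 'word2vec_200_trained.model')
```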
34 | 
35 | ### Used word embeddings models
36 | If you don't want to calculate the word vectors yourself, you can obtain them from the http://lang.org.ua/models website or download them from Google Drive ([LexVec](https://drive.google.com/file/d/0B9adEr6qDus4TjVVUW9CcEkzSjQ/view), [Word2Vec](https://drive.google.com/open?id=0B9adEr6qDus4dkRpaDZ4bWZCc2M))
37 | 
38 | ### Using the LexVec and word2vec models to predict the tone of a word
39 | `predict/build_joined_vect_dict.py` - concatenates the two models, LexVec and word2vec, into one joined dictionary
40 | `predict/predict.py` - trains the classifier, predicts tone for the whole vocabulary, saves the full set and a subsample
41 | `predict/save_best.py` - takes the best negative and positive candidates
42 | 
43 | ### Credits
44 | **Oleksandr Marykovskyi**, **Vyacheslav Tykhonov** provided the seed dictionary
45 | **Serhiy Shehovtsov** wrote the code and ran numerous experiments
46 | **Oles Petriv** created and trained the neural network model
47 | **Vsevolod Dyomkin** proof-read the result and prepared it for publishing
48 | **Dmitry Chaplinsky** led the project :)
49 | 
--------------------------------------------------------------------------------
/predict/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | loads the syn0, index2word and joined_dict files,
5 | loads the tonal dictionary downloaded from https://github.com/lang-uk/tone-dict-uk/,
6 | trains the NN model and predicts the tone for the whole vocabulary
7 | """
8 | 
9 | import sys
10 | import os
11 | currentDirectory = os.path.dirname(__file__)
12 | sys.path.append(os.path.join(currentDirectory, '../'))  # This will get you to source
13 | 
14 | from utils import *
15 | from keras.layers import Dense, Dropout
16 | from keras.models import Sequential
17 | import numpy as np
18 | import pandas as pd
19 | from keras.callbacks import ModelCheckpoint
20 | import codecs
21 | from sklearn.utils import shuffle
22 | import msgpack
23 | 
24 | log('loading files')
25 | 
26 | with open(result_folder + 'joined_dict', 'rb') as f:
27 |     joined_dict = msgpack.unpack(f, encoding='utf-8')
28 | 
29 | log('files loaded')
30 | 
31 | X = []
32 | y = []
33 | 
34 | log('read original tsv file')
35 | tsv = pd.read_csv(data_folder + 'tone-dict-uk-manual.tsv', sep='\t', header=None,
36 |                   names=['word', 'tone', '_', '_1'], encoding='utf-8')
37 | 
38 | # log('read and concat appendix tsv file')
39 | # appendix = pd.read_csv(result_folder + 'appendix1.tsv', sep='\t', header=None,
40 | #                        names=['word', 'tone'], encoding='utf-8')
41 | 
42 | # tsv = pd.concat([tsv, appendix], axis=0, ignore_index=True)
43 | 
44 | tsv = tsv.drop_duplicates(subset='word', keep='last')
45 | 
46 | for index, row in tsv.iterrows():
47 |     w = row['word']
48 |     t = row['tone']
49 |     if w in joined_dict:
50 |         X.append(joined_dict[w])
51 |         y.append(t)
52 | 
53 | del joined_dict
54 | 
55 | X = np.array(X)
56 | y = np.array(y, dtype=np.float)
57 | 
58 | X, y = shuffle(X, y, random_state=0)
59 | 
60 | np.save(result_folder + 'predict/tonedataX.npy', X)
61 | np.save(result_folder + 'predict/tonedataY.npy', y)
62 | 
63 | log('training data is ready')
64 | 
65 | X = np.load(result_folder + 'predict/tonedataX.npy')
66 | y = np.load(result_folder + 'predict/tonedataY.npy').clip(-2, 2)
67 | y /= 4.0  # y /= 3.0
68 | y += 0.5
69 | # y = y.clip(0, 1)
70 | 
71 | model = Sequential()
72 | model.add(Dense(800, activation='relu', input_shape=(600,)))
73 | model.add(Dropout(0.5))
74 | model.add(Dense(300, activation='tanh'))
75 | model.add(Dropout(0.5))
76 | model.add(Dense(1, activation='sigmoid'))
77 | 
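# The 600-dimensional input is the joined vector built by build_joined_vect_dict.py
# (the LexVec and word2vec vectors of a word concatenated). The single sigmoid output
# predicts the tone rescaled to [0, 1]; the targets above were mapped from the
# dictionary's [-2, 2] scale via y / 4 + 0.5, so a prediction converts back to a tone
# as (pred - 0.5) * 4 (this is what the keras classifier usage example notebook does).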
model.compile(optimizer='adam', loss='mse') 78 | 79 | #model.load_weights(result_folder + 'predict/tonePredictorUkr.h5') 80 | log('starting learning') 81 | 82 | model.fit(X, y, 83 | batch_size=1, 84 | nb_epoch=100, 85 | verbose=1, 86 | validation_split=0.03, 87 | callbacks=[ModelCheckpoint(result_folder + 'predict/tonePredictorUkr.h5', 88 | save_best_only=True, monitor='val_loss')]) 89 | 90 | model.save_weights(result_folder + 'predict/model.h5') 91 | 92 | with open(result_folder + 'syn0', 'rb') as f: 93 | syn0 = np.array(msgpack.unpack(f)) 94 | 95 | log('predicting') 96 | preds = model.predict(syn0, verbose=1) 97 | 98 | log('saving results') 99 | np.save(result_folder + 'predict/preds-all.npy', preds) 100 | np.save(result_folder + 'predict/preds100000.npy', preds[:100000]) 101 | 102 | with open(result_folder + 'index2word', 'rb') as f: 103 | index2word = np.array(msgpack.unpack(f, encoding='utf-8')) 104 | 105 | preds = np.load(result_folder + 'predict/preds100000.npy') 106 | words = index2word[:100000] 107 | 108 | dic = [] 109 | for i in range(0, len(preds)): 110 | dic.append([words[i], preds[i][0]]) 111 | 112 | dic = sorted(dic, key=lambda l: l[1], reverse=True) 113 | 114 | with codecs.open(result_folder + 'predict/ToneResults100000.txt', "w", "utf-8") as stream: 115 | for i in range(0, len(dic)): 116 | stream.write(dic[i][0] + '\t' + str(dic[i][1]) + u"\n") 117 | 118 | print('done') 119 | -------------------------------------------------------------------------------- /examples/keras classifier usage examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "Using Theano backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import warnings\n", 20 | "warnings.filterwarnings('ignore')\n", 21 | "\n", 22 | "import sys\n", 23 | "import os\n", 24 | "import numpy as np\n", 25 | "import msgpack\n", 26 | "from keras.models import Sequential\n", 27 | "from keras.layers import Dense, Dropout\n", 28 | "\n", 29 | "data_folder = os.path.join(os.getcwd(), '../../data/')\n", 30 | "result_folder = data_folder + 'result/'" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Wall time: 4.67 s\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "%%time\n", 50 | "model = Sequential()\n", 51 | "model.add(Dense(800, activation='relu', input_shape=(600,)))\n", 52 | "model.add(Dropout(0.5))\n", 53 | "model.add(Dense(300, activation='tanh'))\n", 54 | "model.add(Dropout(0.5))\n", 55 | "model.add(Dense(1, activation='sigmoid'))\n", 56 | "model.compile(optimizer='adam', loss='mse')\n", 57 | "\n", 58 | "model.load_weights(result_folder + 'predict/model.h5')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Wall time: 40.3 s\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "%%time\n", 78 | "with open(result_folder + 'joined_dict', 'rb') as f:\n", 79 | " joined_dict = msgpack.unpack(f, encoding='utf-8')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": { 86 | "collapsed": true 87 
| }, 88 | "outputs": [], 89 | "source": [ 90 | "def print_tone(word):\n", 91 | " X = np.array( [ joined_dict[word] ] )\n", 92 | " pred = model.predict(X, verbose=0)\n", 93 | " # convert from [0.0, 1.0] to [-2, 2]\n", 94 | " tone = (pred[0][0] - 0.5) * 4\n", 95 | " print(u'tone of word \"{}\" is {:0.2f}'.format(word, tone))" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "tone of word \"приголомшливі\" is 1.29\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "print_tone(u'приголомшливі')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "tone of word \"калічать\" is -1.94\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "print_tone(u'калічать')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 2", 149 | "language": "python", 150 | "name": "python2" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 2 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython2", 162 | "version": "2.7.13" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from shutil import copyfile 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import msgpack 7 | import time 8 | from sklearn.learning_curve import learning_curve 9 | from colorama import Fore, Back, Style, init as colorama_init 10 | 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | 14 | colorama_init() 15 | 16 | code_folder = os.path.dirname(os.path.realpath(__file__)) 17 | data_folder = os.path.join(code_folder, '../data/') 18 | chunks_folder = data_folder + 'chunks/' 19 | sents_folder = data_folder + 'sents/' 20 | clean_folder = data_folder + 'clean/' 21 | trained_folder = data_folder + 'trained/' 22 | result_folder = data_folder + 'result/' 23 | raw_data_folder = os.path.join(code_folder, '../raw src data/') 24 | subsets_folder = os.path.join(code_folder, '../raw subsets/') 25 | 26 | log_root_folder = '' 27 | 28 | 29 | class AllSentences(object): 30 | def __init__(self, dirname): 31 | self.dirname = dirname 32 | 33 | def __iter__(self): 34 | for fname in os.listdir(self.dirname): 35 | log('reading file ' + fname) 36 | with open(os.path.join(self.dirname, fname), 'rb') as f: 37 | sentences = msgpack.unpack(f) 38 | for sentence in sentences: 39 | yield sentence 40 | 41 | 42 | def get_all_sentences(): 43 | return AllSentences(os.path.join(code_folder, '../raw src data/toks')) 44 | 45 | 46 | def log(message, color=None): 47 | txt = "at " + time.strftime('%H:%M:%S') + " " + str(message) 48 | if not color: 49 | # %Y-%m-%d 50 | print txt 51 | else: 52 | print color + txt + Style.RESET_ALL 53 | 54 | with open(log_root_folder + 'text.log', 'a') as myfile: 55 | 
myfile.write(txt + '\n') 56 | 57 | 58 | def get_not_none(items): 59 | return [item for item in items if item is not None] 60 | 61 | 62 | def create_log_folder(script_file): 63 | global log_root_folder 64 | 65 | # create log directory 66 | time_str = time.strftime('%Y_%m_%d--%H_%M_%S') 67 | log_root_folder = './log/' + time_str + '/' 68 | os.makedirs(os.path.dirname(log_root_folder)) 69 | 70 | # copy current file to log folder in order to see what code was running 71 | copyfile(script_file, log_root_folder + os.path.basename(script_file)) 72 | 73 | return log_root_folder 74 | 75 | 76 | def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, 77 | train_sizes = np.linspace(.1, 1.0, 5)): 78 | 79 | """ 80 | Generate a simple plot of the test and traning learning curve. 81 | 82 | Parameters 83 | ---------- 84 | estimator : object type that implements the "fit" and "predict" methods 85 | An object of that type which is cloned for each validation. 86 | 87 | title : string 88 | Title for the chart. 89 | 90 | X : array-like, shape (n_samples, n_features) 91 | Training vector, where n_samples is the number of samples and 92 | n_features is the number of features. 93 | 94 | y : array-like, shape (n_samples) or (n_samples, n_features), optional 95 | Target relative to X for classification or regression; 96 | None for unsupervised learning. 97 | 98 | ylim : tuple, shape (ymin, ymax), optional 99 | Defines minimum and maximum yvalues plotted. 100 | 101 | cv : integer, cross-validation generator, optional 102 | If an integer is passed, it is the number of folds (defaults to 3). 103 | Specific cross-validation objects can be passed, see 104 | sklearn.cross_validation module for the list of possible objects 105 | 106 | n_jobs : integer, optional 107 | Number of jobs to run in parallel (default 1). 
108 | """ 109 | plt.figure() 110 | plt.title(title) 111 | if ylim is not None: 112 | plt.ylim(*ylim) 113 | plt.xlabel("Training examples") 114 | plt.ylabel("Score") 115 | train_sizes, train_scores, test_scores = learning_curve( 116 | estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring='f1') 117 | train_scores_mean = np.mean(train_scores, axis=1) 118 | train_scores_std = np.std(train_scores, axis=1) 119 | test_scores_mean = np.mean(test_scores, axis=1) 120 | test_scores_std = np.std(test_scores, axis=1) 121 | plt.grid() 122 | 123 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 124 | train_scores_mean + train_scores_std, alpha=0.1, 125 | color="r") 126 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 127 | test_scores_mean + test_scores_std, alpha=0.1, color="g") 128 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 129 | label="Training score") 130 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", 131 | label="Cross-validation score") 132 | 133 | plt.legend(loc="best") 134 | return plt 135 | -------------------------------------------------------------------------------- /examples/word2vec usage examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings('ignore')\n", 13 | "\n", 14 | "import sys\n", 15 | "import os\n", 16 | "import numpy as np\n", 17 | "from gensim.models import Word2Vec\n", 18 | "\n", 19 | "data_folder = os.path.join(os.getcwd(), '../../data/')\n", 20 | "result_folder = data_folder + 'result/'" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "collapsed": false 27 | }, 28 | "source": [ 29 | "#### Завантаження моделі. \n", 30 | "Займає 2-3 хвилини." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Wall time: 2min 30s\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "%%time\n", 50 | "model = Word2Vec.load_word2vec_format(result_folder + 'word2vec', binary=False)\n", 51 | "model.init_sims(replace=True)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### Класика: \"Король\" - \"Чоловік\" + \"Жінка\" = ?" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "[('королева', 0.6586748361587524)]" 72 | ] 73 | }, 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "model.most_similar(positive=['король', 'жінка'], negative=['чоловік'], topn=1)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### \"Париж\" - \"Франція\" + \"Україна\" = ?\n", 88 | "Якщо з Парижу забрати Францію і додати Україну. 
Або іншими словами, знаходимо вектор переходу від держави до столиці і додаємо його до потрібної держави" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "[('київ', 0.555548369884491)]" 102 | ] 103 | }, 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "model.most_similar(positive=['париж', 'україна'], negative=['франція'], topn=1)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "[('сполучених', 0.6039266586303711),\n", 124 | " ('буш', 0.5893256068229675),\n", 125 | " ('обама', 0.5754402279853821),\n", 126 | " ('штатів', 0.5719932913780212),\n", 127 | " ('барак', 0.5545384883880615)]" 128 | ] 129 | }, 130 | "execution_count": 6, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "model.most_similar(positive=['ющенко', 'сша'], negative=['україна'], topn=5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "### Пошук найближчих за значенням слів" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "[('хлопець', 0.7193020582199097),\n", 157 | " ('молодик', 0.6740586161613464),\n", 158 | " ('юнак', 0.6462587118148804),\n", 159 | " ('хлопчина', 0.6355247497558594),\n", 160 | " ('односелець', 0.6325840353965759)]" 161 | ] 162 | }, 163 | "execution_count": 7, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "model.most_similar(positive=['чоловік'], topn=5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "[('дівчина', 0.8387635946273804),\n", 183 | " ('дівчинка', 0.7695997953414917),\n", 184 | " ('людина', 0.7262020111083984),\n", 185 | " ('пенсіонерка', 0.723316490650177),\n", 186 | " ('дитина', 0.7158530950546265)]" 187 | ] 188 | }, 189 | "execution_count": 8, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "model.most_similar(positive=['жінка'], topn=5)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "source": [ 204 | "### Ще кілька прикладів\n", 205 | "\"Стрункий\" відноситься до \"стрункіший\" як \"бідний\" до \"?\"" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 19, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "[('бідніший', 0.5530775785446167),\n", 219 | " ('зліший', 0.5398036241531372),\n", 220 | " ('робучий', 0.5387618541717529)]" 221 | ] 222 | }, 223 | "execution_count": 19, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "model.most_similar(positive=['щасливіший', 'бідний'], negative=['щасливий'], topn=3)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "Відношення теперішнього часу до минулого" 237 | ] 238 | }, 239 | { 
240 | "cell_type": "code", 241 | "execution_count": 20, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "[('співав', 0.7110381126403809),\n", 250 | " ('танцював', 0.6071131229400635),\n", 251 | " ('наспівував', 0.5819215178489685)]" 252 | ] 253 | }, 254 | "execution_count": 20, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "model.most_similar(positive=['бігав', 'співати'], negative=['бігти'], topn=3)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "Відношення країни до національної валюти" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 23, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "[('валюта', 0.572724461555481),\n", 281 | " ('євровалюта', 0.5442174673080444),\n", 282 | " ('долар', 0.5256495475769043)]" 283 | ] 284 | }, 285 | "execution_count": 23, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "model.most_similar(positive=['гривня', 'європа'], negative=['україна'], topn=3)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "Однина до множини" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 32, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "[('птахи', 0.4623814821243286),\n", 312 | " ('ширяв', 0.427895724773407),\n", 313 | " ('фазан', 0.42269349098205566)]" 314 | ] 315 | }, 316 | "execution_count": 32, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "model.most_similar(positive=['слова', 'птах'], negative=['слово'], topn=3)" 323 | ] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 2", 329 | "language": "python", 330 | "name": "python2" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 2 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython2", 342 | "version": "2.7.13" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 0 347 | } 348 | --------------------------------------------------------------------------------