├── examples
│   ├── README.md
│   ├── keras classifier usage examples.ipynb
│   └── word2vec usage examples.ipynb
├── train
│   ├── build_w2v_dict.py
│   ├── learn.py
│   └── clean.py
├── .gitignore
├── split_to_chunks
│   ├── subsample.py
│   ├── csv_to_pd_chunks.py
│   ├── txt_to_sentences.py
│   ├── pd_to_sentences.py
│   └── wiki_to_sentences.py
├── predict
│   ├── save_best.py
│   ├── build_joined_vect_dict.py
│   └── predict.py
├── README.md
└── utils.py
/examples/README.md:
--------------------------------------------------------------------------------
1 | ### Usage examples for the trained models
2 | [word2vec usage examples](word2vec usage examples.ipynb) - using the `word2vec` model for simple vector arithmetic
3 | [keras classifier usage examples](keras classifier usage examples.ipynb) - using the trained neural network for tone classification
--------------------------------------------------------------------------------
/train/build_w2v_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | builds a vocabulary for word2vec and saves it
5 | """
6 | import sys
7 | sys.path.append('../')
8 | 
9 | import gensim, logging
10 | 
11 | from utils import *
12 | from colorama import Fore, Back, Style
13 | 
14 | log('starting')
15 | 
16 | # create log directory
17 | log_root_folder = create_log_folder(__file__)
18 | 
19 | # setup logging to write to current log folder
20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
21 |                     filename=log_root_folder + 'word2vec.log')
22 | 
23 | # create empty model
24 | model = gensim.models.Word2Vec(size=200)
25 | 
26 | log('building vocabulary')
27 | model.build_vocab(get_all_sentences())  # sentence iterator defined in utils.py
28 | 
29 | log('saving model')
30 | model.save(trained_folder + 'word2vec_200_dmt.model')
31 | 
32 | log('done', Fore.GREEN)
33 | 
--------------------------------------------------------------------------------
/train/learn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | loads the model with the vocabulary saved by build_w2v_dict.py,
5 | trains it on the sentences and saves it
6 | """
7 | import sys
8 | sys.path.append('../')
9 | 
10 | import gensim, logging
11 | from utils import *
12 | from colorama import Fore, Back, Style
13 | 
14 | log('starting')
15 | 
16 | # create log directory
17 | log_root_folder = create_log_folder(__file__)
18 | 
19 | # setup logging to write to current log folder
20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
21 |                     filename=log_root_folder + 'word2vec.log')
22 | 
23 | # load the model with the vocabulary from file
24 | model = gensim.models.Word2Vec.load(trained_folder + 'big_word2vec_200.model')
25 | 
26 | log('training')
27 | model.train(get_all_sentences())  # sentence iterator defined in utils.py
28 | 
29 | log('saving model')
30 | model.save(trained_folder + 'big_word2vec_trained_200.model')
31 | 
32 | log('done', Fore.GREEN)
33 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # IDE folders
10 | .idea/
11 | 
12 | # Test files
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | 
*.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | 60 | # Sphinx documentation 61 | docs/_build/ 62 | 63 | # PyBuilder 64 | target/ 65 | 66 | #Ipython Notebook 67 | .ipynb_checkpoints 68 | 69 | snapshots 70 | *.swp 71 | -------------------------------------------------------------------------------- /split_to_chunks/subsample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: cp1251 -*- 2 | 3 | import sys 4 | sys.path.append('../') 5 | from utils import * 6 | from colorama import Fore, Back, Style 7 | 8 | chunk_size = 5000 9 | 10 | files = [ 11 | '../raw src data/wiki_sent_tok.txt', 12 | # '../raw src data/up.csv', 13 | # '../raw src data/zaxid.csv', 14 | # '../raw src data/dt.all.txt', 15 | # '../raw src data/um.txt', 16 | # '../raw src data/vz.txt', 17 | # '../raw src data/korr.csv', 18 | # '../raw src data/unian.csv', 19 | ] 20 | 21 | log('starting') 22 | 23 | for src_file in files: 24 | log('reading src file ' + src_file) 25 | 26 | content = '' 27 | lines_read = 0 28 | with open(src_file, 'r') as f: 29 | for line in f: 30 | content += line 31 | lines_read += 1 32 | 33 | if lines_read == chunk_size: 34 | break 35 | 36 | log('writing sample to file') 37 | with open(subsets_folder + os.path.basename(src_file), 'w') as f: 38 | f.write(content) 39 | 40 | log('writing done', Fore.GREEN) 41 | 42 | log('done', Fore.GREEN) 43 | -------------------------------------------------------------------------------- /predict/save_best.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append('..') 4 | 5 | # -*- coding: utf-8 -*- 6 | import numpy as np 7 | import pandas as pd 8 | import codecs 9 | from sklearn.utils import shuffle 10 | import msgpack 11 | from utils import * 12 | 13 | log('loading index2word file') 14 | with open(result_folder + 'index2word', 'rb') as f: 15 | index2word = np.array(msgpack.unpack(f, encoding='utf-8')) 16 | 17 | log('read original tsv file') 18 | tsv = pd.read_csv(data_folder + 'orig-tone-dict-v2.tsv', sep='\t', header=None, names=['word', 'tone', '_', '_1']) 19 | tones = dict(zip(tsv.word, tsv.tone)) 20 | 21 | log('loading predictions') 22 | preds = np.load(result_folder + 'predict/preds-all.npy') 23 | dic = [] 24 | for i in range(0, len(preds)): 25 | w = index2word[i] 26 | if w not in tones: 27 | dic.append([index2word[i], preds[i][0]]) 28 | 29 | log('sorting') 30 | dic = np.array(sorted(dic, key=lambda l: l[1], reverse=True)) 31 | 32 | # top positives and negatives 33 | pos = dic[:2000, :] 34 | neg = dic[-2000:, :][::-1] 35 | 36 | with codecs.open(result_folder + 'predict/top_pos_N.txt', "w", "utf-8") as stream: 37 | for i in range(0, len(pos)): 38 | stream.write(pos[i][0] + '\t' + str(pos[i][1]) + u"\n") 39 | 40 | with codecs.open(result_folder + 'predict/top_neg_N.txt', "w", "utf-8") as stream: 41 | for i in range(0, len(neg)): 42 | stream.write(neg[i][0] + '\t' + str(neg[i][1]) + u"\n") 43 | 44 | print('done') 45 | 
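# Illustrative follow-up (not part of the original script): the two files written above are plain
# tab-separated "word<TAB>score" lists, so the candidate words can be reviewed manually, e.g. with pandas:
#
#     import pandas as pd
#     top_pos = pd.read_csv(result_folder + 'predict/top_pos_N.txt', sep='\t',
#                           header=None, names=['word', 'score'])
#     print(top_pos.head(20))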
-------------------------------------------------------------------------------- /split_to_chunks/csv_to_pd_chunks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: cp1251 -*- 2 | """ 3 | split big files to smaller chunks 4 | each chunk is csv 5 | """ 6 | import sys 7 | sys.path.append('../') 8 | import pandas as pd 9 | import math 10 | from utils import * 11 | from colorama import Fore, Back, Style 12 | 13 | chunk_size = int(1e5) # 100 000 14 | chunks_saved = 0 15 | files = [{ 16 | 'url': raw_data_folder + 'ukrlib.csv', 17 | 'cols': [1, 0] 18 | }, { 19 | 'url': raw_data_folder + 'up.csv', 20 | 'cols': [6, 4] 21 | }, { 22 | 'url': raw_data_folder + 'zaxid.csv', 23 | 'cols': [5, 2] 24 | }, { 25 | 'url': raw_data_folder + 'korr.csv', 26 | 'cols': [1, 2] 27 | }, { 28 | 'url': raw_data_folder + 'unian.csv', 29 | 'cols': [5, 4] 30 | }] 31 | 32 | log('starting') 33 | 34 | for src_file in files: 35 | log('reading src file ' + src_file['url']) 36 | data = pd.read_csv(src_file['url']) 37 | chunks_number = int(math.ceil(data.shape[0] / float(chunk_size))) 38 | 39 | log('chunks to save ' + str(chunks_number)) 40 | 41 | for i in range(0, chunks_number): 42 | start_time = time.time() 43 | chunk = data.iloc[i * chunk_size: (i + 1) * chunk_size, src_file['cols']] 44 | 45 | log('saving chunk ' + str(i)) 46 | 47 | chunk.to_csv(data_folder + 'chunks/chunk_{}.csv'.format(chunks_saved)) 48 | total_time = time.time() - start_time 49 | 50 | log('saving completed {}, running time {}'.format(chunks_saved, total_time), Fore.GREEN) 51 | 52 | chunks_saved += 1 53 | 54 | log('done', Fore.GREEN) 55 | -------------------------------------------------------------------------------- /split_to_chunks/txt_to_sentences.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | load entire txt file, tokenize it and save lists of sentences in chunks 5 | """ 6 | import sys 7 | sys.path.append('../') 8 | 9 | import math 10 | 11 | from utils import * 12 | import tokenize_uk 13 | from colorama import Fore, Back, Style 14 | 15 | items_processed = 0 16 | chunks_folder = data_folder + 'chunks/' 17 | sents_folder = data_folder + 'sents/' 18 | sents_per_chunk = int(7e5) # 700 000 19 | log_interval = 10000 20 | 21 | files = [raw_data_folder + 'ukr_lit.txt', raw_data_folder + 'td.txt'] 22 | 23 | log('starting') 24 | 25 | for src_file in files: 26 | with open(src_file, 'rb') as f: 27 | data = f.read() 28 | 29 | log('processing file ' + src_file) 30 | 31 | text = data.decode('utf-8') 32 | tokens_text = tokenize_uk.tokenize_sents(text) 33 | 34 | log('tokenization finished') 35 | 36 | sents_number = int(math.ceil(len(tokens_text) / float(sents_per_chunk))) 37 | 38 | for i in range(0, sents_number): 39 | sentences = [] 40 | chunk = tokens_text[i * sents_per_chunk: (i + 1) * sents_per_chunk] 41 | 42 | for sentence in chunk: 43 | sentences.append(tokenize_uk.tokenize_words(sentence)) 44 | 45 | if items_processed % log_interval == 0: 46 | log('items processed {}'.format(items_processed)) 47 | 48 | items_processed += 1 49 | 50 | result_file = os.path.basename(src_file) + str(i) + '.msg' 51 | with open(sents_folder + result_file, 'wb') as f: 52 | msgpack.pack(sentences, f) 53 | 54 | log('file {} saved'.format(result_file)) 55 | 56 | log('done', Fore.GREEN) 57 | -------------------------------------------------------------------------------- /split_to_chunks/pd_to_sentences.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | load chunks one by one and save to chunks with array of sentences each sentences is an array of words 5 | in simple words, the goal this script is a tokanization of text in chunks 6 | """ 7 | import sys 8 | sys.path.append('../') 9 | 10 | import pandas as pd 11 | from utils import * 12 | import tokenize_uk 13 | from colorama import Fore, Back, Style 14 | 15 | items_processed = 0 16 | chunks_folder = data_folder + 'chunks/' 17 | sents_folder = data_folder + 'sents/' 18 | log_interval = 10000 19 | 20 | log('starting') 21 | 22 | start_time = time.time() 23 | 24 | for chunk_file in os.listdir(chunks_folder): 25 | data = pd.read_csv(chunks_folder + chunk_file) 26 | sentences = [] 27 | 28 | for row in data.itertuples(): 29 | title = row[3].decode('utf-8') if isinstance(row[3], basestring) else '' 30 | text = row[2].decode('utf-8') if isinstance(row[2], basestring) else '' 31 | 32 | tokens_title = tokenize_uk.tokenize_sents(title) 33 | tokens_text = tokenize_uk.tokenize_sents(text) 34 | 35 | for sentence in tokens_title + tokens_text: 36 | sentences.append(tokenize_uk.tokenize_words(sentence)) 37 | 38 | total_time = time.time() - start_time 39 | 40 | if items_processed % log_interval == 0: 41 | log('items processed {}, running time {}'.format(items_processed, total_time)) 42 | 43 | items_processed += 1 44 | 45 | log('saving chunk ') 46 | 47 | result_file = os.path.splitext(chunk_file)[0] + '.msg' 48 | with open(sents_folder + result_file, 'wb') as f: 49 | msgpack.pack(sentences, f) 50 | 51 | log('done', Fore.GREEN) 52 | -------------------------------------------------------------------------------- /split_to_chunks/wiki_to_sentences.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | in wiki file each line is a sentence, so we can read and tokanize line by line 5 | saves lists of sentences in chunks 6 | """ 7 | import sys 8 | sys.path.append('../') 9 | from itertools import islice 10 | 11 | from utils import * 12 | import tokenize_uk 13 | from colorama import Fore, Back, Style 14 | 15 | items_processed = 0 16 | files_saved = 0 17 | lines_per_chunk = int(7e5) # 700 000 18 | log_interval = 10000 19 | 20 | files = [raw_data_folder + 'wiki_sent_tok.txt'] 21 | 22 | log('starting') 23 | 24 | for src_file in files: 25 | log('processing file ' + src_file) 26 | 27 | with open(src_file, 'rb') as f: 28 | while True: 29 | next_n_lines = list(islice(f, lines_per_chunk)) 30 | if not next_n_lines: 31 | break 32 | 33 | text = ''.join(next_n_lines).decode('utf-8') 34 | 35 | tokens_text = tokenize_uk.tokenize_sents(text) 36 | 37 | log('tokenization finished') 38 | 39 | sentences = [] 40 | 41 | for sentence in tokens_text: 42 | sentences.append(tokenize_uk.tokenize_words(sentence)) 43 | 44 | if items_processed % log_interval == 0: 45 | log('items processed {}'.format(items_processed)) 46 | 47 | items_processed += 1 48 | 49 | result_file = os.path.basename(src_file) + str(files_saved) + '.msg' 50 | with open(sents_folder + result_file, 'wb') as s_f: 51 | msgpack.pack(sentences, s_f) 52 | 53 | log('file {} saved'.format(result_file)) 54 | 55 | files_saved += 1 56 | 57 | log('done', Fore.GREEN) 58 | -------------------------------------------------------------------------------- /train/clean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 
clean data by removing special non alphabetic characters and converting everything to lowercase 5 | """ 6 | import sys 7 | sys.path.append('../') 8 | 9 | import re 10 | 11 | from utils import * 12 | from colorama import Fore, Back, Style 13 | 14 | 15 | items_processed = 0 16 | clean_folder = data_folder + 'clean/' 17 | log_interval = 100000 18 | 19 | log('starting') 20 | 21 | start_time = time.time() 22 | 23 | non_char_re = re.compile('\W', re.UNICODE) 24 | 25 | 26 | def convert_number(num): 27 | try: 28 | inum = int(num) 29 | 30 | if inum < 10: 31 | return str(inum) 32 | else: 33 | return "0" * min(len(num), 5) 34 | 35 | except ValueError: 36 | return num 37 | 38 | for file_name in os.listdir(sents_folder): 39 | with open(sents_folder + file_name, 'rb') as f: 40 | sentences = msgpack.unpack(f) 41 | 42 | for i, sentence in enumerate(sentences): 43 | for j, word in enumerate(sentence): 44 | word = word.decode('utf-8') 45 | sentence[j] = None if non_char_re.search(word) else convert_number(word.lower()) 46 | 47 | sentences[i] = get_not_none(sentence) 48 | 49 | if len(sentences[i]) == 0: 50 | sentences[i] = None 51 | 52 | total_time = time.time() - start_time 53 | 54 | if items_processed % log_interval == 0: 55 | log('items processed {} and it took {}'.format(items_processed, total_time)) 56 | 57 | items_processed += 1 58 | 59 | sentences = get_not_none(sentences) 60 | 61 | log('saving chunk ' + file_name) 62 | 63 | with open(clean_folder + file_name, 'wb') as f: 64 | msgpack.pack(sentences, f) 65 | 66 | log('chunk ' + file_name + ' saved', Fore.GREEN) 67 | 68 | log('done', Fore.GREEN) 69 | -------------------------------------------------------------------------------- /predict/build_joined_vect_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | requires trained lexvec with word2vec models placed to ../data/result/ folder 5 | it will join them and save syn0, index2word and joined_dict files 6 | you can find download links for models here: https://github.com/lang-uk/tonal-model/#used-word-embeddings-models 7 | """ 8 | 9 | import sys 10 | import os 11 | 12 | import numpy as np 13 | from gensim.models import Word2Vec 14 | 15 | currentDirectory = os.path.dirname(__file__) 16 | sys.path.append(os.path.join(currentDirectory, '../')) # This will get you to source 17 | 18 | from utils import * 19 | 20 | log('loading lexVecModel') 21 | lexVecModel = Word2Vec.load_word2vec_format(result_folder + 'lexvec', binary=False) 22 | lexVecModel.init_sims(replace=True) 23 | log('done') 24 | 25 | log('loading word2vec') 26 | word2vecModel = Word2Vec.load_word2vec_format(result_folder + 'word2vec', binary=False) 27 | word2vecModel.init_sims(replace=True) 28 | log('done') 29 | 30 | # maps word to vector 31 | joined_dict = {} 32 | 33 | # 2d array of vectors 34 | syn0 = [] 35 | 36 | # 1d array of words 37 | index2word = [] 38 | 39 | log_interval = int(10e3) 40 | processed_words = 0 41 | 42 | for word in lexVecModel.vocab: 43 | if word not in word2vecModel.vocab: 44 | continue 45 | 46 | joined_vec = np.concatenate((lexVecModel[word], word2vecModel[word])).tolist() 47 | joined_dict[word] = joined_vec 48 | index2word.append(word) 49 | syn0.append(joined_vec) 50 | 51 | processed_words += 1 52 | 53 | if not processed_words % log_interval: 54 | log('words processed {:5d} out of {:5d}'.format(processed_words, len(lexVecModel.vocab))) 55 | 56 | print('saving') 57 | 58 | with open(result_folder + 'joined_dict', 'wb') as f: 59 | 
    msgpack.pack(joined_dict, f)
60 | 
61 | with open(result_folder + 'index2word', 'wb') as f:
62 |     msgpack.pack(index2word, f)
63 | 
64 | with open(result_folder + 'syn0', 'wb') as f:
65 |     msgpack.pack(syn0, f)
66 | 
67 | print('done')
68 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Goal
2 | The main goal of the project is to extend the [Ukrainian tonal dictionary](https://github.com/lang-uk/tone-dict-uk). At first we tried to achieve this by looking at words similar to those with already known tonality, using **word2vec** and **LexVec** models to find the similar words. We then built a neural-network classifier and trained it on the word embeddings and the existing tonal dictionary.
3 | 
4 | ### General
5 | `split_to_chunks/subsample.py` - takes a small sample of each source file so it can be opened in a plain text editor
6 | `utils.py` - holds shared helper functions and folder paths
7 | 
8 | ### Split to sentences:
9 | We have different sources of text: csv, txt and a wiki dump. A separate script preprocesses each of them.
10 | 
11 | ##### for csv: (each item is a news item or an article)
12 | 
13 | 1. Split the raw csv data into chunks and save them: `split_to_chunks/csv_to_pd_chunks.py`
14 | Resulting items are saved in `data\chunks`
15 | 2. Read the chunk items, tokenize the text and save lists of sentences: `split_to_chunks/pd_to_sentences.py`
16 | Resulting items are saved in `data\sents`
17 | 
18 | ##### for txt:
19 | run `split_to_chunks/txt_to_sentences.py` to tokenize the text and save chunks with lists of sentences
20 | Results are saved in `data\sents`
21 | 
22 | ##### for wiki
23 | run `split_to_chunks/wiki_to_sentences.py` to tokenize the text and save chunks with lists of sentences
24 | 
25 | ### Cleanup words:
26 | `train/clean.py` - reads all existing sentence files and cleans the words (lowercasing, dropping tokens with non-alphabetic characters, normalizing numbers)
27 | 
28 | ### Building word2vec model
29 | `train/build_w2v_dict.py` - builds the **word2vec** vocabulary
30 | `train/learn.py` - trains the **word2vec** model (see the sketch below)
31 | 
32 | ### Building LexVec model
33 | [LexVec](https://github.com/alexandres/lexvec) was used on the same data with settings identical to Word2Vec to calculate the embeddings.
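The two word2vec steps above boil down to the following minimal sketch. It is illustrative only: it assumes the older gensim API used throughout this repo (where `train()` is called with just the sentence iterator) and the folder layout from `utils.py`; the model file names here are placeholders.

```python
import gensim
from utils import get_all_sentences, trained_folder

sentences = get_all_sentences()           # iterator over the tokenized sentence chunks

# build_w2v_dict.py: create an empty 200-dimensional model and collect the vocabulary
model = gensim.models.Word2Vec(size=200)
model.build_vocab(sentences)
model.save(trained_folder + 'word2vec_200.model')   # placeholder file name

# learn.py: reload the model with its vocabulary, train on the same sentences and save
model = gensim.models.Word2Vec.load(trained_folder + 'word2vec_200.model')
model.train(sentences)                    # old gensim API: no epochs/total_examples arguments
model.save(trained_folder + 'word2vec_200_trained.model')
```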
34 | 
35 | ### Used word embeddings models
36 | If you don't want to calculate the word vectors yourself, you can obtain them from the http://lang.org.ua/models website or download them from Google Drive ([LexVec](https://drive.google.com/file/d/0B9adEr6qDus4TjVVUW9CcEkzSjQ/view), [Word2Vec](https://drive.google.com/open?id=0B9adEr6qDus4dkRpaDZ4bWZCc2M))
37 | 
38 | ### Using the LexVec and word2vec models to predict the tone of a word
39 | `predict/build_joined_vect_dict.py` - concatenates the two models, LexVec and word2vec, into one joined dictionary
40 | `predict/predict.py` - trains the classifier, predicts tone for the whole vocabulary, saves the full set and a subsample
41 | `predict/save_best.py` - takes the best negative and positive candidates
42 | 
43 | ### Credits
44 | **Oleksandr Marykovskyi**, **Vyacheslav Tykhonov** provided the seed dictionary
45 | **Serhiy Shehovtsov** wrote the code and ran numerous experiments
46 | **Oles Petriv** created and trained the neural network model
47 | **Vsevolod Dyomkin** proof-read the result and prepared it for publishing
48 | **Dmitry Chaplinsky** led the project :)
49 | 
--------------------------------------------------------------------------------
/predict/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | loads the syn0, index2word and joined_dict files,
5 | loads the tonal dictionary downloaded from https://github.com/lang-uk/tone-dict-uk/,
6 | trains the NN model and predicts the tone for the whole vocabulary
7 | """
8 | 
9 | import sys
10 | import os
11 | currentDirectory = os.path.dirname(__file__)
12 | sys.path.append(os.path.join(currentDirectory, '../'))  # This will get you to source
13 | 
14 | from utils import *
15 | from keras.layers import Dense, Dropout
16 | from keras.models import Sequential
17 | import numpy as np
18 | import pandas as pd
19 | from keras.callbacks import ModelCheckpoint
20 | import codecs
21 | from sklearn.utils import shuffle
22 | import msgpack
23 | 
24 | log('loading files')
25 | 
26 | with open(result_folder + 'joined_dict', 'rb') as f:
27 |     joined_dict = msgpack.unpack(f, encoding='utf-8')
28 | 
29 | log('files loaded')
30 | 
31 | X = []
32 | y = []
33 | 
34 | log('read original tsv file')
35 | tsv = pd.read_csv(data_folder + 'tone-dict-uk-manual.tsv', sep='\t', header=None,
36 |                   names=['word', 'tone', '_', '_1'], encoding='utf-8')
37 | 
38 | # log('read and concat appendix tsv file')
39 | # appendix = pd.read_csv(result_folder + 'appendix1.tsv', sep='\t', header=None,
40 | #                        names=['word', 'tone'], encoding='utf-8')
41 | 
42 | # tsv = pd.concat([tsv, appendix], axis=0, ignore_index=True)
43 | 
44 | tsv = tsv.drop_duplicates(subset='word', keep='last')
45 | 
46 | for index, row in tsv.iterrows():
47 |     w = row['word']
48 |     t = row['tone']
49 |     if w in joined_dict:
50 |         X.append(joined_dict[w])
51 |         y.append(t)
52 | 
53 | del joined_dict
54 | 
55 | X = np.array(X)
56 | y = np.array(y, dtype=np.float)
57 | 
58 | X, y = shuffle(X, y, random_state=0)
59 | 
60 | np.save(result_folder + 'predict/tonedataX.npy', X)
61 | np.save(result_folder + 'predict/tonedataY.npy', y)
62 | 
63 | log('training data is ready')
64 | 
65 | X = np.load(result_folder + 'predict/tonedataX.npy')
66 | y = np.load(result_folder + 'predict/tonedataY.npy').clip(-2, 2)
67 | y /= 4.0  # y /= 3.0
68 | y += 0.5
69 | # y = y.clip(0, 1)
70 | 
71 | model = Sequential()
72 | model.add(Dense(800, activation='relu', input_shape=(600,)))
73 | model.add(Dropout(0.5))
74 | model.add(Dense(300, activation='tanh'))
75 | model.add(Dropout(0.5))
76 | model.add(Dense(1, activation='sigmoid'))
77 | 
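# The 600-dimensional input is the joined vector built by build_joined_vect_dict.py
# (the LexVec and word2vec vectors of a word concatenated). The single sigmoid output
# predicts the tone rescaled to [0, 1]; the targets above were mapped from the
# dictionary's [-2, 2] scale via y / 4 + 0.5, so a prediction converts back to a tone
# as (pred - 0.5) * 4 (this is what the keras classifier usage example notebook does).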
model.compile(optimizer='adam', loss='mse') 78 | 79 | #model.load_weights(result_folder + 'predict/tonePredictorUkr.h5') 80 | log('starting learning') 81 | 82 | model.fit(X, y, 83 | batch_size=1, 84 | nb_epoch=100, 85 | verbose=1, 86 | validation_split=0.03, 87 | callbacks=[ModelCheckpoint(result_folder + 'predict/tonePredictorUkr.h5', 88 | save_best_only=True, monitor='val_loss')]) 89 | 90 | model.save_weights(result_folder + 'predict/model.h5') 91 | 92 | with open(result_folder + 'syn0', 'rb') as f: 93 | syn0 = np.array(msgpack.unpack(f)) 94 | 95 | log('predicting') 96 | preds = model.predict(syn0, verbose=1) 97 | 98 | log('saving results') 99 | np.save(result_folder + 'predict/preds-all.npy', preds) 100 | np.save(result_folder + 'predict/preds100000.npy', preds[:100000]) 101 | 102 | with open(result_folder + 'index2word', 'rb') as f: 103 | index2word = np.array(msgpack.unpack(f, encoding='utf-8')) 104 | 105 | preds = np.load(result_folder + 'predict/preds100000.npy') 106 | words = index2word[:100000] 107 | 108 | dic = [] 109 | for i in range(0, len(preds)): 110 | dic.append([words[i], preds[i][0]]) 111 | 112 | dic = sorted(dic, key=lambda l: l[1], reverse=True) 113 | 114 | with codecs.open(result_folder + 'predict/ToneResults100000.txt', "w", "utf-8") as stream: 115 | for i in range(0, len(dic)): 116 | stream.write(dic[i][0] + '\t' + str(dic[i][1]) + u"\n") 117 | 118 | print('done') 119 | -------------------------------------------------------------------------------- /examples/keras classifier usage examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "Using Theano backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import warnings\n", 20 | "warnings.filterwarnings('ignore')\n", 21 | "\n", 22 | "import sys\n", 23 | "import os\n", 24 | "import numpy as np\n", 25 | "import msgpack\n", 26 | "from keras.models import Sequential\n", 27 | "from keras.layers import Dense, Dropout\n", 28 | "\n", 29 | "data_folder = os.path.join(os.getcwd(), '../../data/')\n", 30 | "result_folder = data_folder + 'result/'" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Wall time: 4.67 s\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "%%time\n", 50 | "model = Sequential()\n", 51 | "model.add(Dense(800, activation='relu', input_shape=(600,)))\n", 52 | "model.add(Dropout(0.5))\n", 53 | "model.add(Dense(300, activation='tanh'))\n", 54 | "model.add(Dropout(0.5))\n", 55 | "model.add(Dense(1, activation='sigmoid'))\n", 56 | "model.compile(optimizer='adam', loss='mse')\n", 57 | "\n", 58 | "model.load_weights(result_folder + 'predict/model.h5')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Wall time: 40.3 s\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "%%time\n", 78 | "with open(result_folder + 'joined_dict', 'rb') as f:\n", 79 | " joined_dict = msgpack.unpack(f, encoding='utf-8')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": { 86 | "collapsed": true 87 
| }, 88 | "outputs": [], 89 | "source": [ 90 | "def print_tone(word):\n", 91 | " X = np.array( [ joined_dict[word] ] )\n", 92 | " pred = model.predict(X, verbose=0)\n", 93 | " # convert from [0.0, 1.0] to [-2, 2]\n", 94 | " tone = (pred[0][0] - 0.5) * 4\n", 95 | " print(u'tone of word \"{}\" is {:0.2f}'.format(word, tone))" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "tone of word \"приголомшливі\" is 1.29\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "print_tone(u'приголомшливі')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "tone of word \"калічать\" is -1.94\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "print_tone(u'калічать')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 2", 149 | "language": "python", 150 | "name": "python2" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 2 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython2", 162 | "version": "2.7.13" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from shutil import copyfile 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import msgpack 7 | import time 8 | from sklearn.learning_curve import learning_curve 9 | from colorama import Fore, Back, Style, init as colorama_init 10 | 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | 14 | colorama_init() 15 | 16 | code_folder = os.path.dirname(os.path.realpath(__file__)) 17 | data_folder = os.path.join(code_folder, '../data/') 18 | chunks_folder = data_folder + 'chunks/' 19 | sents_folder = data_folder + 'sents/' 20 | clean_folder = data_folder + 'clean/' 21 | trained_folder = data_folder + 'trained/' 22 | result_folder = data_folder + 'result/' 23 | raw_data_folder = os.path.join(code_folder, '../raw src data/') 24 | subsets_folder = os.path.join(code_folder, '../raw subsets/') 25 | 26 | log_root_folder = '' 27 | 28 | 29 | class AllSentences(object): 30 | def __init__(self, dirname): 31 | self.dirname = dirname 32 | 33 | def __iter__(self): 34 | for fname in os.listdir(self.dirname): 35 | log('reading file ' + fname) 36 | with open(os.path.join(self.dirname, fname), 'rb') as f: 37 | sentences = msgpack.unpack(f) 38 | for sentence in sentences: 39 | yield sentence 40 | 41 | 42 | def get_all_sentences(): 43 | return AllSentences(os.path.join(code_folder, '../raw src data/toks')) 44 | 45 | 46 | def log(message, color=None): 47 | txt = "at " + time.strftime('%H:%M:%S') + " " + str(message) 48 | if not color: 49 | # %Y-%m-%d 50 | print txt 51 | else: 52 | print color + txt + Style.RESET_ALL 53 | 54 | with open(log_root_folder + 'text.log', 'a') as myfile: 55 | 
myfile.write(txt + '\n') 56 | 57 | 58 | def get_not_none(items): 59 | return [item for item in items if item is not None] 60 | 61 | 62 | def create_log_folder(script_file): 63 | global log_root_folder 64 | 65 | # create log directory 66 | time_str = time.strftime('%Y_%m_%d--%H_%M_%S') 67 | log_root_folder = './log/' + time_str + '/' 68 | os.makedirs(os.path.dirname(log_root_folder)) 69 | 70 | # copy current file to log folder in order to see what code was running 71 | copyfile(script_file, log_root_folder + os.path.basename(script_file)) 72 | 73 | return log_root_folder 74 | 75 | 76 | def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, 77 | train_sizes = np.linspace(.1, 1.0, 5)): 78 | 79 | """ 80 | Generate a simple plot of the test and traning learning curve. 81 | 82 | Parameters 83 | ---------- 84 | estimator : object type that implements the "fit" and "predict" methods 85 | An object of that type which is cloned for each validation. 86 | 87 | title : string 88 | Title for the chart. 89 | 90 | X : array-like, shape (n_samples, n_features) 91 | Training vector, where n_samples is the number of samples and 92 | n_features is the number of features. 93 | 94 | y : array-like, shape (n_samples) or (n_samples, n_features), optional 95 | Target relative to X for classification or regression; 96 | None for unsupervised learning. 97 | 98 | ylim : tuple, shape (ymin, ymax), optional 99 | Defines minimum and maximum yvalues plotted. 100 | 101 | cv : integer, cross-validation generator, optional 102 | If an integer is passed, it is the number of folds (defaults to 3). 103 | Specific cross-validation objects can be passed, see 104 | sklearn.cross_validation module for the list of possible objects 105 | 106 | n_jobs : integer, optional 107 | Number of jobs to run in parallel (default 1). 
108 | """ 109 | plt.figure() 110 | plt.title(title) 111 | if ylim is not None: 112 | plt.ylim(*ylim) 113 | plt.xlabel("Training examples") 114 | plt.ylabel("Score") 115 | train_sizes, train_scores, test_scores = learning_curve( 116 | estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring='f1') 117 | train_scores_mean = np.mean(train_scores, axis=1) 118 | train_scores_std = np.std(train_scores, axis=1) 119 | test_scores_mean = np.mean(test_scores, axis=1) 120 | test_scores_std = np.std(test_scores, axis=1) 121 | plt.grid() 122 | 123 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 124 | train_scores_mean + train_scores_std, alpha=0.1, 125 | color="r") 126 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 127 | test_scores_mean + test_scores_std, alpha=0.1, color="g") 128 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 129 | label="Training score") 130 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", 131 | label="Cross-validation score") 132 | 133 | plt.legend(loc="best") 134 | return plt 135 | -------------------------------------------------------------------------------- /examples/word2vec usage examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings('ignore')\n", 13 | "\n", 14 | "import sys\n", 15 | "import os\n", 16 | "import numpy as np\n", 17 | "from gensim.models import Word2Vec\n", 18 | "\n", 19 | "data_folder = os.path.join(os.getcwd(), '../../data/')\n", 20 | "result_folder = data_folder + 'result/'" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "collapsed": false 27 | }, 28 | "source": [ 29 | "#### Завантаження моделі. \n", 30 | "Займає 2-3 хвилини." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Wall time: 2min 30s\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "%%time\n", 50 | "model = Word2Vec.load_word2vec_format(result_folder + 'word2vec', binary=False)\n", 51 | "model.init_sims(replace=True)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### Класика: \"Король\" - \"Чоловік\" + \"Жінка\" = ?" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "[('королева', 0.6586748361587524)]" 72 | ] 73 | }, 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "model.most_similar(positive=['король', 'жінка'], negative=['чоловік'], topn=1)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### \"Париж\" - \"Франція\" + \"Україна\" = ?\n", 88 | "Якщо з Парижу забрати Францію і додати Україну. 
Або іншими словами, знаходимо вектор переходу від держави до столиці і додаємо його до потрібної держави" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "[('київ', 0.555548369884491)]" 102 | ] 103 | }, 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "model.most_similar(positive=['париж', 'україна'], negative=['франція'], topn=1)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "[('сполучених', 0.6039266586303711),\n", 124 | " ('буш', 0.5893256068229675),\n", 125 | " ('обама', 0.5754402279853821),\n", 126 | " ('штатів', 0.5719932913780212),\n", 127 | " ('барак', 0.5545384883880615)]" 128 | ] 129 | }, 130 | "execution_count": 6, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "model.most_similar(positive=['ющенко', 'сша'], negative=['україна'], topn=5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "### Пошук найближчих за значенням слів" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "[('хлопець', 0.7193020582199097),\n", 157 | " ('молодик', 0.6740586161613464),\n", 158 | " ('юнак', 0.6462587118148804),\n", 159 | " ('хлопчина', 0.6355247497558594),\n", 160 | " ('односелець', 0.6325840353965759)]" 161 | ] 162 | }, 163 | "execution_count": 7, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "model.most_similar(positive=['чоловік'], topn=5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "[('дівчина', 0.8387635946273804),\n", 183 | " ('дівчинка', 0.7695997953414917),\n", 184 | " ('людина', 0.7262020111083984),\n", 185 | " ('пенсіонерка', 0.723316490650177),\n", 186 | " ('дитина', 0.7158530950546265)]" 187 | ] 188 | }, 189 | "execution_count": 8, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "model.most_similar(positive=['жінка'], topn=5)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "source": [ 204 | "### Ще кілька прикладів\n", 205 | "\"Стрункий\" відноситься до \"стрункіший\" як \"бідний\" до \"?\"" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 19, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "[('бідніший', 0.5530775785446167),\n", 219 | " ('зліший', 0.5398036241531372),\n", 220 | " ('робучий', 0.5387618541717529)]" 221 | ] 222 | }, 223 | "execution_count": 19, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "model.most_similar(positive=['щасливіший', 'бідний'], negative=['щасливий'], topn=3)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "Відношення теперішнього часу до минулого" 237 | ] 238 | }, 239 | { 
240 | "cell_type": "code", 241 | "execution_count": 20, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "[('співав', 0.7110381126403809),\n", 250 | " ('танцював', 0.6071131229400635),\n", 251 | " ('наспівував', 0.5819215178489685)]" 252 | ] 253 | }, 254 | "execution_count": 20, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "model.most_similar(positive=['бігав', 'співати'], negative=['бігти'], topn=3)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "Відношення країни до національної валюти" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 23, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "[('валюта', 0.572724461555481),\n", 281 | " ('євровалюта', 0.5442174673080444),\n", 282 | " ('долар', 0.5256495475769043)]" 283 | ] 284 | }, 285 | "execution_count": 23, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "model.most_similar(positive=['гривня', 'європа'], negative=['україна'], topn=3)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "Однина до множини" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 32, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "[('птахи', 0.4623814821243286),\n", 312 | " ('ширяв', 0.427895724773407),\n", 313 | " ('фазан', 0.42269349098205566)]" 314 | ] 315 | }, 316 | "execution_count": 32, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "model.most_similar(positive=['слова', 'птах'], negative=['слово'], topn=3)" 323 | ] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 2", 329 | "language": "python", 330 | "name": "python2" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 2 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython2", 342 | "version": "2.7.13" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 0 347 | } 348 | --------------------------------------------------------------------------------