├── README.md ├── examples ├── bert_crf_ner_predictor.py ├── bert_crf_ner_trainer.py ├── bert_en_word_embeddings.py ├── bert_ko_word_embeddings.py ├── bert_ner_predictor.py ├── bert_ner_trainer.py ├── bert_sentiment_predictor.py ├── bert_sentiment_trainer.py ├── doc2vec_clustering.py ├── doc2vec_tester.py ├── doc2vec_trainer.py ├── fasttext_tester.py ├── fasttext_trainer.py ├── glove_tester.py ├── koreanKeywordTest.py ├── koreanLemmatizationTest.py ├── koreanNounExtractionTest.py ├── koreanSegmentationTest.py ├── koreanSpecialTokenizerTest.py ├── koreanTokenizerTest.py ├── naver_newscomments_processor.py ├── node2vec_tester.py ├── node2vec_traianer.py ├── scibert_ner_train.py ├── scibert_test.py ├── test222.py ├── test3.py ├── test4.py ├── testBertLSTM.py ├── testCooccurrence.py ├── testDocTermMatrix.py ├── testEXCo.py ├── testFirst.py ├── testMallet.py ├── testPMI.py ├── testSVM.py ├── test_document_clustering.py ├── test_korean_lemmatizer.py ├── test_pyTextMinerTopicModel.py ├── test_word2veclite.py ├── testt.py ├── word2vec_tester.py ├── word2vec_trainer.py └── zipfsManager.py ├── glove-win_devc_x64 ├── cooccur.exe ├── cooccurrence.bin ├── cooccurrence.shuf.bin ├── demo.bat ├── demo.sh ├── donald.txt ├── eval │ └── python │ │ ├── distance.py │ │ ├── evaluate.py │ │ └── word_analogy.py ├── glove.exe ├── pthreadVC2.dll ├── shuffle.exe └── vocab_count.exe ├── pyTextMiner ├── __init__.py ├── chunker │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── cooccurrence │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── cooccurrence.cpython-37.pyc │ └── cooccurrence.py ├── counter │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── graphml │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── helper │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── keyword │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── textrank.cpython-37.pyc │ └── textrank.py ├── lemmatizer │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── ngram │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── noun_extractor │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── pmi │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── segmentation │ ├── WordSemgmentationModelBuilder.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── lstmWordSegmentationModel.cpython-37.pyc │ │ └── wordSegmentationModelUtil.cpython-37.pyc │ ├── lstmWordSegmentationModel.py │ ├── lstmWordSegmentationModelBuilder.py │ ├── model │ │ ├── checkpoint │ │ ├── dic.pickle │ │ ├── segm.ckpt.data-00000-of-00001 │ │ ├── segm.ckpt.index │ │ └── segm.ckpt.meta │ └── wordSegmentationModelUtil.py ├── splitter │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── stemmer │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── tagger │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── tokenizer │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── utility │ └── __init__.py └── version.py ├── py_bert ├── __init__.py ├── bert_classification_model.py ├── bert_dataset.py ├── bert_predictor.py ├── bert_trainer.py ├── bert_util.py ├── tokenization_kobert.py └── tokenization_korbert.py ├── py_doc2vec ├── . ... 
├── __init__.py └── doc2vecModel.py ├── py_document_classification ├── __init__.py ├── lasso_term_extraction.py ├── ml_textclassification.py └── test_ml_text_classfier.py ├── py_document_clustering ├── __init__.py └── documentclustering.py ├── py_ner ├── bert_bilstm_crf_ner_train.py ├── bert_crf_ner_prediction.py ├── bert_crf_ner_train.py ├── bert_crf_ner_visualization.py ├── bert_gru_crf_ner_train.py ├── bert_ner_prediction.py ├── bert_ner_train.py ├── bertviz │ ├── attention.py │ ├── head_view.js │ ├── head_view.py │ ├── model_view.js │ ├── model_view.py │ ├── neuron_view.js │ ├── neuron_view.py │ └── pytorch_transformers_attn │ │ ├── ... │ │ ├── file_utils.py │ │ ├── modeling_bert.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_openai.py │ │ ├── modeling_roberta.py │ │ ├── modeling_transfo_xl.py │ │ ├── modeling_transfo_xl_utilities.py │ │ ├── modeling_utils.py │ │ ├── modeling_xlm.py │ │ ├── modeling_xlnet.py │ │ ├── tokenization_bert.py │ │ ├── tokenization_gpt2.py │ │ ├── tokenization_openai.py │ │ ├── tokenization_roberta.py │ │ ├── tokenization_transfo_xl.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_xlm.py │ │ └── tokenization_xlnet.py ├── config │ ├── ... │ ├── config.json │ └── ner_to_index.json ├── data │ ├── conlleval │ ├── dataset_info.txt │ ├── eng.testa │ ├── eng.testb │ ├── eng.train │ ├── eng.train54019 │ ├── expo_kor.test │ ├── expo_kor.train │ ├── test.txt │ └── train.txt ├── data_utils │ ├── ... │ ├── ner_dataset.py │ ├── pad_sequence.py │ ├── utils.py │ └── vocab_tokenizer.py ├── find_learning_rate.py ├── kobert │ ├── ... │ ├── mxnet_kobert.py │ ├── pytorch_kobert.py │ └── utils.py ├── lstm_cnn_crf_evaluator.py ├── lstm_cnn_crf_model.py ├── lstm_cnn_crf_trainer.py ├── lstm_cnn_crf_utils.py ├── model │ ├── ... │ ├── net.py │ ├── optimizers.py │ └── utils.py ├── ner_crf.py ├── ner_crf_ko.py ├── ner_data_loader.py └── scibert_ner_train.py ├── py_node2vec └── node2vecModel.py ├── py_topic_model ├── MalletWrapper.py ├── __init__.py ├── gdmr_plot.py ├── lda.py ├── ldaInference.py ├── ldaSeqModel.py ├── ldaVisualizer.py ├── pyTextMinerTopicModel.py └── tfidf.py ├── py_word2vec ├── __init__.py ├── avgDocumentByW2V.py ├── gloveWikiKoreanTrainer.py ├── utils.py ├── visualizeW2V.py ├── visualizeW2VPlot.py ├── word2vecFilteringFunction.py ├── word2veclite.py └── word_embeddings.py └── stopwords ├── stopword_company.txt ├── stopwordsEng.txt └── stopwordsKor.txt /README.md: -------------------------------------------------------------------------------- 1 | # pyTextMiner 2 | A text mining tool for Korean and English 3 | 4 | pyTextMiner was originally designed as a teaching aid for my Text Mining class at Yonsei University and is written in Python. 5 | Prior to developing pyTextMiner, I built yTextMiner, a Java-based text mining tool for teaching, and used it in my MOOC and K-MOOC courses as well as the courses I taught at Yonsei University. 6 | 7 | The current version of pyTextMiner handles both English and Korean text, although the majority of its components target Korean. 8 | 9 | pyTextMiner follows a pipeline architecture in which each pipe takes care of one step of processing and representing the incoming text, which keeps text processing simple and modular. A minimal example is shown below. 10 | 11 | In the future, I plan to include preprocessing techniques for other languages such as Chinese, Japanese, and French.
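A minimal usage sketch of the pipeline idea, adapted from the scripts under examples/ in this repository (the Komoran pipeline, stopword file, and sample corpus path are taken from those examples; adjust the paths to your environment):

```python
import pyTextMiner as ptm

# Each pipe handles one step: sentence splitting, POS tagging,
# keeping nouns (NN*), dropping the POS tags, and removing stopwords.
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

corpus = ptm.CorpusFromFile('./data/sampleKor.txt')  # one document per line
result = pipeline.processCorpus(corpus)              # documents -> sentences -> tokens
print(result)
```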
12 | -------------------------------------------------------------------------------- /examples/bert_crf_ner_predictor.py: -------------------------------------------------------------------------------- 1 | from py_ner.bert_crf_ner_prediction import BertCRFNERPredictor 2 | 3 | model_dir = '../py_ner/experiments/base_model_with_crf_val' 4 | predictor = BertCRFNERPredictor(model_dir) 5 | 6 | 7 | tokenizer_path = "./ptr_lm_model/tokenizer_78b3253a26.model" 8 | 9 | #model name needs to be changed to the one you trained 10 | model_name = 'best-epoch-9-step-750-acc-0.980.bin' 11 | 12 | algorithm = 'bert_lstm_crf' 13 | if algorithm == 'bert_crf': 14 | checkpoint_file = './experiments/base_model_with_crf/' + model_name 15 | 16 | elif algorithm == 'bert_lstm_crf': 17 | checkpoint_file = './experiments/base_model_with_lstm_crf/' + model_name 18 | 19 | elif algorithm == 'bert_gru_crf': 20 | checkpoint_file = './experiments/base_model_with_gru_crf/' + model_name 21 | 22 | 23 | predictor.load_model(model_name=model_name, tokenizer_path=tokenizer_path, checkpoint_file=checkpoint_file) 24 | 25 | text = '오늘은 비도 오고 학생들이 졸려 보여서 나도 졸리운데 송강호의 괴물 영화나 볼까?' 26 | ne_text = predictor.predict(text) 27 | print(ne_text) -------------------------------------------------------------------------------- /examples/bert_crf_ner_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from py_ner.bert_bilstm_crf_ner_train import BertBiLstmCrfTrainer 4 | from py_ner.bert_crf_ner_train import BertCrfTrainer 5 | from shutil import copyfile 6 | 7 | data = '../py_ner/data' 8 | 9 | def make_dir(directory, parent_dir): 10 | # Path 11 | path = os.path.join(parent_dir, directory) 12 | 13 | # Create the directory 14 | try: 15 | os.makedirs(path, exist_ok=True) 16 | print("Directory '%s' created successfully" % directory) 17 | except OSError as error: 18 | print("Directory '%s' can not be created") 19 | 20 | algorithm = 'bert_gru_crf' #bert_crf, bert_lstm_crf, bert_gru_crf 21 | # we need two mandatory files in this new directory: config.json and ner_to_index.json 22 | if algorithm == 'bert_crf': 23 | model_dir = 'experiments/base_model_with_crf' 24 | make_dir(model_dir, "./") 25 | copyfile('../py_ner/config/config.json', model_dir+"/config.json") 26 | copyfile('../py_ner/config/ner_to_index.json', model_dir + "/ner_to_index.json") 27 | 28 | trainer = BertCrfTrainer(data_dir=data, model_dir=model_dir) 29 | elif algorithm == 'bert_lstm_crf': 30 | model_dir = 'experiments/base_model_with_lstm_crf' 31 | make_dir(model_dir, "./") 32 | copyfile('../py_ner/config/config.json', model_dir + "/config.json") 33 | copyfile('../py_ner/config/ner_to_index.json', model_dir + "/ner_to_index.json") 34 | 35 | trainer = BertBiLstmCrfTrainer(data_dir=data, model_dir=model_dir) 36 | elif algorithm == 'bert_gru_crf': 37 | model_dir = 'experiments/base_model_with_gru_crf' 38 | make_dir(model_dir, "./") 39 | copyfile('../py_ner/config/config.json', model_dir + "/config.json") 40 | copyfile('../py_ner/config/ner_to_index.json', model_dir + "/ner_to_index.json") 41 | 42 | trainer = BertBiLstmCrfTrainer(data_dir=data, model_dir=model_dir) 43 | 44 | trainer.data_loading() 45 | trainer.train() -------------------------------------------------------------------------------- /examples/bert_en_word_embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM 3 | from 
sklearn.metrics.pairwise import cosine_similarity 4 | 5 | #pip install pytorch-pretrained-bert 6 | # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows 7 | import logging 8 | logging.basicConfig(level=logging.INFO) 9 | 10 | import matplotlib.pyplot as plt 11 | 12 | # Load pre-trained model tokenizer (vocabulary) 13 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 14 | 15 | #1 Sentence Input: 16 | #text = "Here is the sentence I want embeddings for." 17 | text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank." 18 | marked_text = "[CLS] " + text + " [SEP]" 19 | print (marked_text) 20 | 21 | #We’ve imported a BERT-specific tokenizer, let’s take a look at the output: 22 | tokenized_text = tokenizer.tokenize(marked_text) 23 | print (tokenized_text) 24 | 25 | list(tokenizer.vocab.keys())[5000:5020] 26 | 27 | indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) 28 | 29 | for tup in zip(tokenized_text, indexed_tokens): 30 | print (tup) 31 | 32 | segments_ids = [1] * len(tokenized_text) 33 | print (segments_ids) 34 | 35 | # Convert inputs to PyTorch tensors 36 | tokens_tensor = torch.tensor([indexed_tokens]) 37 | segments_tensors = torch.tensor([segments_ids]) 38 | 39 | # Load pre-trained model (weights) 40 | model = BertModel.from_pretrained('bert-base-uncased') 41 | 42 | # Put the model in "evaluation" mode, meaning feed-forward operation. 43 | model.eval() 44 | 45 | # Predict hidden states features for each layer 46 | with torch.no_grad(): 47 | encoded_layers, _ = model(tokens_tensor, segments_tensors) 48 | 49 | print ("Number of layers:", len(encoded_layers)) 50 | layer_i = 0 51 | 52 | print ("Number of batches:", len(encoded_layers[layer_i])) 53 | batch_i = 0 54 | 55 | print ("Number of tokens:", len(encoded_layers[layer_i][batch_i])) 56 | token_i = 0 57 | 58 | print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i])) 59 | 60 | 61 | # For the 5th token in our sentence, select its feature values from layer 5. 62 | token_i = 5 63 | layer_i = 5 64 | vec = encoded_layers[layer_i][batch_i][token_i] 65 | 66 | # Plot the values as a histogram to show their distribution. 67 | plt.figure(figsize=(10,10)) 68 | plt.hist(vec, bins=200) 69 | plt.show() 70 | 71 | # Convert the hidden state embeddings into single token vectors 72 | 73 | # Holds the list of 12 layer embeddings for each token 74 | # Will have the shape: [# tokens, # layers, # features] 75 | token_embeddings = [] 76 | 77 | # For each token in the sentence... 78 | for token_i in range(len(tokenized_text)): 79 | 80 | # Holds 12 layers of hidden states for each token 81 | hidden_layers = [] 82 | 83 | # For each of the 12 layers... 
84 | for layer_i in range(len(encoded_layers)): 85 | # Lookup the vector for `token_i` in `layer_i` 86 | vec = encoded_layers[layer_i][batch_i][token_i] 87 | 88 | hidden_layers.append(vec) 89 | 90 | token_embeddings.append(hidden_layers) 91 | 92 | print('------------------------------------------------------------') 93 | 94 | # Sanity check the dimensions: 95 | print("Number of tokens in sequence:", len(token_embeddings)) 96 | print("Number of layers per token:", len(token_embeddings[0])) 97 | 98 | concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # [number_of_tokens, 3072] 99 | 100 | summed_last_4_layers = [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings] # [number_of_tokens, 768] 101 | 102 | sentence_embedding = torch.mean(encoded_layers[11], 1) 103 | 104 | print("Our final sentence embedding vector of shape:", sentence_embedding[0].shape[0]) 105 | 106 | print (text) 107 | for i,x in enumerate(tokenized_text): 108 | print (i,x) 109 | 110 | print ("First fifteen values of 'bank' as in 'bank robber':") 111 | print (summed_last_4_layers[10][:15]) 112 | 113 | print ("First fifteen values of 'bank' as in 'bank vault':") 114 | print(summed_last_4_layers[6][:15]) 115 | 116 | print ("First fifteen values of 'bank' as in 'river bank':") 117 | print(summed_last_4_layers[19][:15]) 118 | 119 | # Compare "bank" as in "bank robber" to "bank" as in "river bank" 120 | different_bank = cosine_similarity(summed_last_4_layers[10].reshape(1,-1), summed_last_4_layers[19].reshape(1,-1))[0][0] 121 | 122 | # Compare "bank" as in "bank robber" to "bank" as in "bank vault" 123 | same_bank = cosine_similarity(summed_last_4_layers[10].reshape(1,-1), summed_last_4_layers[6].reshape(1,-1))[0][0] 124 | 125 | print ("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'bank vault':", same_bank) 126 | 127 | print ("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'river bank':", different_bank) 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /examples/bert_ner_predictor.py: -------------------------------------------------------------------------------- 1 | from pytorch_pretrained_bert import BertTokenizer 2 | 3 | import py_ner.bert_ner_prediction as prediction 4 | import torch 5 | import py_ner.lstm_cnn_crf_utils as utils 6 | 7 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 8 | json_file = './models/tag_info_kr.json' #tag_info_kr.json or tag_info_en.json 9 | tag2idx = utils.load_from_json(json_file) 10 | MAX_LEN = 160 11 | 12 | #text = 'this is a good John Smith as my friend' 13 | text = '이승만 대통령은 대한민국 박명환 대통령입니다.'
14 | 15 | #bert-base-multilingual-cased, bert-base-cased 16 | tokenizer_name = 'bert-base-multilingual-cased' 17 | tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True) 18 | 19 | predictor = prediction.BERTNERPredictor() 20 | model_name = "bert_ner_kr.model" #bert_ner_kr.model or bert_ner_en.model 21 | predictor.load_model(model_name) 22 | predictions = predictor.predict_each(device, text, tokenizer, MAX_LEN, tag2idx) 23 | #pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions] 24 | print(str(predictions)) -------------------------------------------------------------------------------- /examples/bert_ner_trainer.py: -------------------------------------------------------------------------------- 1 | from pytorch_pretrained_bert import BertTokenizer 2 | 3 | import py_ner.bert_ner_train as train 4 | import torch 5 | import py_ner.lstm_cnn_crf_utils as utils 6 | 7 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 8 | trainer = train.BERTNERTrainer() 9 | 10 | mode = 'txt' 11 | if mode == 'csv': 12 | data = "../py_ner/data/ner.csv" 13 | else: 14 | #for Korean text for now 15 | data = '../py_ner/data/train.txt' 16 | 17 | sentences, tag2idx, labels = trainer.data_processing(data) 18 | 19 | #bert-base-multilingual-cased, bert-base-cased 20 | tokenizer_name = 'bert-base-multilingual-cased' 21 | tokenizer = trainer.tokenizer(tokenizer_name) 22 | trainer.data_loading(tokenizer,sentences,tag2idx,labels) 23 | classifier_model_name='bert-base-multilingual-cased' 24 | trainer.load_token_classifier(classifier_model_name,tag2idx) 25 | 26 | trainer.set_optimizer() 27 | 28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | trainer.train_epoch(device) 30 | 31 | trainer.eval(device, labels) 32 | 33 | language = "kr" 34 | trainer.save_model(language) 35 | -------------------------------------------------------------------------------- /examples/bert_sentiment_predictor.py: -------------------------------------------------------------------------------- 1 | from textwrap import wrap 2 | 3 | from sklearn.metrics import classification_report, confusion_matrix 4 | from transformers import BertForSequenceClassification 5 | 6 | from py_bert.bert_dataset import PYBERTDataset 7 | from py_bert.bert_classification_model import PYBERTClassifier 8 | from py_bert.bert_predictor import bert_predictor 9 | from py_bert.bert_trainer import PYBERTTrainer 10 | from py_bert.bert_util import create_data_loader, add_sentiment_label, convert_to_df, get_korean_tokenizer, show_confusion_matrix 11 | from transformers import BertModel, BertTokenizer 12 | from sklearn.model_selection import train_test_split 13 | 14 | from py_bert.tokenization_kobert import KoBertTokenizer 15 | import matplotlib.pyplot as plt 16 | import seaborn as sns 17 | 18 | import pyTextMiner as ptm 19 | import torch 20 | import pandas as pd 21 | 22 | #mode is either en or kr 23 | mode = 'kr' 24 | df = None 25 | 26 | if mode == 'en': 27 | df = pd.read_csv("../data/reviews.csv") 28 | df, class_names = add_sentiment_label(df) 29 | elif mode == 'kr': 30 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 31 | stopwords = '../stopwords/stopwordsKor.txt' 32 | input_file = '../data/ratings_test.txt' 33 | 34 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 35 | ptm.tokenizer.MeCab(mecab_path), 36 | ptm.lemmatizer.SejongPOSLemmatizer(), 37 | ptm.helper.SelectWordOnly(), 38 | ptm.helper.StopwordFilter(file=stopwords)) 39 | 40 | corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file, 
delimiter='\t', doc_index=1, class_index=2) 41 | 42 | documents = [] 43 | labels = [] 44 | result = pipeline.processCorpus(corpus) 45 | i = 1 46 | #below is just for a sample test 47 | for doc in result[1:500]: 48 | document = '' 49 | for sent in doc: 50 | for word in sent: 51 | document += word + ' ' 52 | documents.append(document.strip()) 53 | labels.append(corpus.pair_map[i]) 54 | i += 1 55 | 56 | df, class_names = convert_to_df(documents,labels) 57 | 58 | print(df.head()) 59 | print(df.info()) 60 | 61 | # Report the number of sentences. 62 | print('Number of test sentences: {:,}\n'.format(df.shape[0])) 63 | 64 | tokenizer = None 65 | # bert-base-multilingual-cased, bert-base-cased, monologg/kobert, monologg/distilkobert, bert_models/vocab_etri.list 66 | # bert_model_name='../bert_models/vocab_mecab.list' 67 | bert_model_name = 'monologg/kobert' 68 | tokenizer = get_korean_tokenizer(bert_model_name) 69 | 70 | #we need a better way of setting MAX_LEN 71 | MAX_LEN = 160 72 | 73 | predictor = bert_predictor() 74 | predictor.load_data(df, tokenizer, MAX_LEN) 75 | 76 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 77 | 78 | #algorithm and saved_training_model goes hand-in-hand 79 | algorithm='no_transformers' 80 | #saved_training_model = './model_save/best_model_state.bin' 81 | if algorithm=='transformers': 82 | saved_training_model = './model_save/best_model_state.bin' 83 | else: 84 | saved_training_model = './model_save/best_model_states.bin' 85 | 86 | predictor.load_model(saved_training_model) 87 | 88 | y_texts, y_pred, y_pred_probs, y_test = predictor.predict(device, algorithm=algorithm) 89 | print(y_pred) 90 | print(y_test) 91 | 92 | print(classification_report(y_test, y_pred, target_names=class_names)) 93 | cm = confusion_matrix(y_test, y_pred) 94 | df_cm = pd.DataFrame(cm, index=class_names, columns=class_names) 95 | show_confusion_matrix(df_cm) 96 | 97 | ''' 98 | for i, (a, b) in enumerate(zip(y_test, y_pred)): 99 | print(classification_report(a, b, target_names=class_names)) 100 | cm = confusion_matrix(y_test[i], y_pred[i]) 101 | df_cm = pd.DataFrame(cm, index=class_names, columns=class_names) 102 | show_confusion_matrix(df_cm) 103 | ''' 104 | 105 | #let’s have a look at an example from our test data: 106 | idx = 2 107 | text = y_texts[idx] 108 | true_sentiment = y_test[idx] 109 | pred_df = pd.DataFrame({ 110 | 'class_names': class_names, 111 | 'values': y_pred_probs[idx] 112 | }) 113 | print("\n".join(wrap(text))) 114 | print() 115 | print(f'True sentiment: {class_names[true_sentiment]}') 116 | print('\n') 117 | 118 | #we can look at the confidence of each sentiment of our model: 119 | sns.barplot(x='values', y='class_names', data=pred_df, orient='h') 120 | plt.ylabel('sentiment') 121 | plt.xlabel('probability') 122 | plt.xlim([0, 1]); 123 | plt.show() 124 | 125 | text = '정말 형편없네 ㅠㅠ 눈을 버렸어' 126 | prediction = predictor.predict_each(device,text,tokenizer,MAX_LEN, algorithm=algorithm) 127 | print(f'Review text: {text}') 128 | print(f'Sentiment : {class_names[prediction]}') 129 | 130 | #predictor.predict(device) 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /examples/bert_sentiment_trainer.py: -------------------------------------------------------------------------------- 1 | from transformers import BertForSequenceClassification 2 | 3 | from py_bert.bert_dataset import PYBERTDataset 4 | from py_bert.bert_classification_model import PYBERTClassifier, PYBERTClassifierGenAtten, 
PYBertForSequenceClassification 5 | from py_bert.bert_trainer import PYBERTTrainer 6 | from py_bert.bert_util import create_data_loader, add_sentiment_label, convert_to_df, get_korean_tokenizer 7 | from transformers import BertModel, BertTokenizer 8 | from sklearn.model_selection import train_test_split 9 | 10 | from py_bert.tokenization_kobert import KoBertTokenizer 11 | 12 | import pyTextMiner as ptm 13 | import torch 14 | import numpy as np 15 | import pandas as pd 16 | 17 | #mode is either en or kr 18 | mode = 'kr' 19 | df = None 20 | 21 | if mode == 'en': 22 | df = pd.read_csv("../data/reviews.csv") 23 | df, class_names = add_sentiment_label(df) 24 | elif mode == 'kr': 25 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 26 | stopwords = '../stopwords/stopwordsKor.txt' 27 | input_file = '../data/ratings_train.txt' 28 | 29 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 30 | ptm.tokenizer.MeCab(mecab_path), 31 | ptm.lemmatizer.SejongPOSLemmatizer(), 32 | ptm.helper.SelectWordOnly(), 33 | ptm.helper.StopwordFilter(file=stopwords)) 34 | 35 | corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file, delimiter='\t', doc_index=1, class_index=2) 36 | 37 | documents = [] 38 | labels = [] 39 | result = pipeline.processCorpus(corpus) 40 | i = 1 41 | 42 | #below is just for a sample test 43 | for doc in result[1:2000]: 44 | document = '' 45 | for sent in doc: 46 | for word in sent: 47 | document += word + ' ' 48 | documents.append(document.strip()) 49 | labels.append(corpus.pair_map[i]) 50 | i += 1 51 | 52 | df, class_names = convert_to_df(documents,labels) 53 | 54 | print(df.head()) 55 | print(df.info()) 56 | 57 | RANDOM_SEED = 42 58 | np.random.seed(RANDOM_SEED) 59 | torch.manual_seed(RANDOM_SEED) 60 | 61 | #we need a better way of setting MAX_LEN 62 | MAX_LEN = 160 63 | 64 | #split 65 | df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED) 66 | df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED) 67 | 68 | print(df_train.shape, df_val.shape, df_test.shape) 69 | 70 | tokenizer = None 71 | #bert-base-multilingual-cased, bert-base-cased, monologg/kobert, monologg/distilkobert, bert_models/vocab_etri.list 72 | #bert_model_name='../bert_models/vocab_mecab.list' 73 | bert_model_name='monologg/kobert' 74 | tokenizer =get_korean_tokenizer(bert_model_name) 75 | 76 | BATCH_SIZE = 16 77 | train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE) 78 | val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE) 79 | test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE) 80 | 81 | # print(str(train_data_loader.dataset.__getitem__(0))) 82 | data = next(iter(train_data_loader)) 83 | data.keys() 84 | 85 | print(data['input_ids'].shape) 86 | print(data['attention_mask'].shape) 87 | print(data['token_type_ids'].shape) 88 | print(data['targets'].shape) 89 | 90 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 91 | 92 | classifier = 'transformers' 93 | if classifier == 'basic': 94 | model = PYBERTClassifier(len(class_names), bert_model_name) 95 | elif classifier == 'attention': 96 | dr_rate = 0.3 97 | model = PYBERTClassifierGenAtten(len(class_names), bert_model_name, dr_rate=dr_rate) 98 | elif classifier == 'transformers': 99 | model = PYBertForSequenceClassification(len(class_names), bert_model_name).__call__() 100 | 101 | model = model.to(device) 102 | 103 | algorithm='transformers' #transformers or non_transformers 104 | if algorithm =='transformers': 
105 | torch_model_name='best_model_state.bin' 106 | else: 107 | torch_model_name = 'best_model_states.bin' 108 | 109 | #BERT authors suggests epoch from 2 to 4 110 | num_epochs = 2 111 | trainer = PYBERTTrainer() 112 | trainer.train(model, device, train_data_loader, val_data_loader, 113 | df_val, df_train, tokenizer, num_epochs=num_epochs, algorithm=algorithm, torch_model_name=torch_model_name) 114 | 115 | trainer.summanry_training_stats() 116 | 117 | trainer.visualize_performance() 118 | -------------------------------------------------------------------------------- /examples/doc2vec_clustering.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from sklearn.cluster import KMeans 4 | 5 | from py_doc2vec.doc2vecModel import Doc2VecTrainer, Doc2VecSimilarity 6 | import logging 7 | import pyTextMiner as ptm 8 | import csv 9 | import sys 10 | from py_document_clustering.documentclustering import DocumentClustering 11 | import matplotlib.pyplot as plt 12 | from sklearn.cluster import KMeans 13 | from sklearn.decomposition import PCA 14 | 15 | model_file = './tmp/1595123417030_pv_dma_dim=100_window=5_epochs=20/doc2vec.model' 16 | doc2vec = Doc2VecSimilarity() 17 | doc2vec.load_model(model_file) 18 | model = doc2vec.get_model() 19 | # name either k-means, agglo, spectral_cocluster 20 | name = 'spectral_cocluster' 21 | clustering = DocumentClustering(k=3) 22 | # n_components means the number of words to be used as features 23 | clustering.make_matrix(n_components=-1, doc2vec_matrix=model.docvecs.vectors_docs) 24 | clustering.cluster(name) 25 | 26 | clustering.visualize() 27 | -------------------------------------------------------------------------------- /examples/doc2vec_tester.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | from doc2vec.doc2vecModel import Doc2VecSimilarity 3 | import logging 4 | import pyTextMiner as ptm 5 | 6 | model_file = '../doc2vec/tmp/1594484106304_pv_dma_dim=100_window=5_epochs=20/doc2vec.model' 7 | doc2vec = Doc2VecSimilarity() 8 | doc2vec.load_model(model_file) 9 | 10 | test_sample = '한국 경제가 위기에 처하다' 11 | # Convert the sample document into a list and use the infer_vector method to get a vector representation for it 12 | new_doc_words = test_sample.split() 13 | similars = doc2vec.most_similar(test_sample) 14 | for sim in similars: 15 | print(str(sim)) 16 | 17 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 18 | # stopwords file path 19 | stopwords = '../stopwords/stopwordsKor.txt' 20 | 21 | test_sample1 = '중국 시장은 위축되었다' 22 | 23 | pipeline = ptm.Pipeline(ptm.tokenizer.MeCab(mecab_path), 24 | ptm.lemmatizer.SejongPOSLemmatizer(), 25 | ptm.helper.SelectWordOnly(), 26 | ptm.helper.StopwordFilter(file=stopwords)) 27 | 28 | doc_vec1 = pipeline.processCorpus([test_sample]) 29 | doc_vec2 = pipeline.processCorpus([test_sample1]) 30 | 31 | print(doc_vec1[0]) 32 | print(doc_vec2[0]) 33 | 34 | # use the most_similar utility to find the most similar documents. 
35 | similarity = doc2vec.compute_similarity_vec(first_vec=doc_vec1[0], second_vec=doc_vec2[0]) 36 | print('similarity between two document: ') 37 | print(str(similarity)) 38 | 39 | 40 | -------------------------------------------------------------------------------- /examples/doc2vec_trainer.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | from doc2vec.doc2vecModel import Doc2VecTrainer 3 | import logging 4 | import pyTextMiner as ptm 5 | from gensim.models.doc2vec import TaggedDocument 6 | #pv_dmc, pv_dma, pv_dbow 7 | algorithm = 'pv_dma' 8 | # ignores all words with total frequency lower than this 9 | vocab_min_count = 10 10 | # word and document vector siz 11 | dim = 100 12 | # window size 13 | window = 5 14 | #number of training epochs 15 | epochs = 20 16 | # initial learning rate 17 | alpha = 0.025 18 | # learning rate will linearly drop to min_alpha as training progresses 19 | min_alpha = 0.001 20 | # number of cores to train on 21 | cores = 2 22 | # number of cores to train on 23 | train = True 24 | 25 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 26 | 27 | # stopwords file path 28 | stopwords = '../stopwords/stopwordsKor.txt' 29 | # train documents input path 30 | input_path = '../data/donald.txt' 31 | # output base directory 32 | output_base_dir = './tmp' 33 | 34 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 35 | ptm.tokenizer.MeCab(mecab_path), 36 | ptm.lemmatizer.SejongPOSLemmatizer(), 37 | ptm.helper.SelectWordOnly(), 38 | ptm.helper.StopwordFilter(file=stopwords)) 39 | 40 | corpus = ptm.CorpusFromFile(input_path) 41 | documents = [] 42 | result = pipeline.processCorpus(corpus) 43 | i = 0 44 | for doc in result: 45 | document = [] 46 | for sent in doc: 47 | for word in sent: 48 | document.append(word) 49 | documents.append(TaggedDocument(document, [i])) 50 | i += 1 51 | 52 | #--epochs 40 --vocab-min-count 10 data/stopwords_german.txt dewiki-preprocessed.txt /tmp/models/doc2vec-dewiki 53 | 54 | doc2vec = Doc2VecTrainer() 55 | logging.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s', level=logging.INFO) 56 | doc2vec.run(documents, output_base_dir=output_base_dir, vocab_min_count=vocab_min_count, 57 | num_epochs=epochs, algorithm=algorithm, vector_size=dim, alpha=alpha, 58 | min_alpha=min_alpha, train=train, window=window, cores=cores) -------------------------------------------------------------------------------- /examples/fasttext_tester.py: -------------------------------------------------------------------------------- 1 | from word2vec.word_embeddings import FastText 2 | 3 | fasttext = FastText() 4 | binary=True 5 | model_file = 'fasttext.bin' 6 | fasttext.load_model(model_file) 7 | mode = 'jamo_split' 8 | print(fasttext.most_similar(mode, positives=['이재명', '경제'], negatives=['정치인'], topn=10)) 9 | #print(fasttext.most_similar(mode, positives=['이재명'], negatives=[], topn=10)) 10 | 11 | print('-----------------------------------') 12 | 13 | print(fasttext.similar_by_word(mode, '이재명')) -------------------------------------------------------------------------------- /examples/fasttext_trainer.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word_embeddings import FastText 3 | 4 | fasttext = FastText() 5 | mode = 'jamo_split_filtered' 6 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 7 | stopword_file = '../stopwords/stopwordsKor.txt' 8 | files = [] 9 | files.append('../data/donald.txt') 10 | is_directory=False 11 | doc_index=2 12 | max=-1 
13 | fasttext.preprocessing(mode,mecab_path,stopword_file,files,is_directory,doc_index,max) 14 | 15 | min_count=1 16 | window=5 17 | size=50 18 | negative=5 19 | fasttext.train(min_count, window, size, negative) 20 | 21 | model_file = 'fasttext.bin' 22 | binary=True; 23 | fasttext.save_model(model_file) 24 | 25 | 26 | -------------------------------------------------------------------------------- /examples/glove_tester.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from word2vec.word_embeddings import GloVe 4 | 5 | glove = GloVe() 6 | binary=True 7 | model_file = '../glove-win_devc_x64/vectors.txt' 8 | glove.load_model(model_file) 9 | print(glove.most_similars(positives=['이재명', '경제'], negatives=['정치인'], topn=10)) 10 | 11 | print('-----------------------------------') 12 | 13 | print(glove.most_similar('이재명')) -------------------------------------------------------------------------------- /examples/koreanKeywordTest.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | min_count = 5 # 단어의 최소 출현 빈도수 (그래프 생성 시) 4 | max_length = 20 # 단어의 최대 길이 5 | beta = 0.95 6 | max_iter = 20 7 | verbose = True 8 | num_words=30 9 | keyword_extractor=ptm.keyword.KeywordExtractionKorean(min_count,max_length,beta,max_iter,verbose,num_words) 10 | 11 | sents = ['최순실 씨가 외국인투자촉진법 개정안 통과와 예산안 반영까지 꼼꼼이 챙긴 건데, 이른바 외촉법, 어떤 법이길래 최 씨가 열심히 챙긴 걸까요. 자신의 이해관계와 맞아 떨어지는 부분이 없었는지 취재기자와 한걸음 더 들여다보겠습니다. 이서준 기자, 우선 외국인투자촉진법 개정안, 어떤 내용입니까?', 12 | '한마디로 대기업이 외국 투자를 받아 계열사를 설립할 때 규제를 완화시켜 주는 법안입니다. 대기업 지주사의 손자 회사가 이른바 증손회사를 만들 때 지분 100%를 출자해야 합니다. 대기업의 문어발식 계열사 확장을 막기 위한 조치인데요. 외촉법 개정안은 손자회사가 외국 투자를 받아서 증손회사를 만들 땐 예외적으로 50% 지분만 투자해도 되게끔 해주는 내용입니다.', 13 | '그만큼 쉽게 완화해주는 거잖아요. 그때 기억을 더듬어보면 야당의 반발이 매우 심했습니다. 그 이유가 뭐였죠? ', 14 | '대기업 특혜 법안이라는 취지였는데요. (당연히 그랬겠죠.) 당시 박영선 의원의 국회 발언을 들어보시겠습니다. [박영선 의원/더불어민주당 (2013년 12월 31일) : 경제의 근간을 흔드는 법을 무원칙적으로 이렇게 특정 재벌 회사에게 특혜를 주기 위해 간청하는 민원법을 우리가 새해부터 왜 통과시켜야 합니까.]', 15 | '최순실 씨 사건을 쫓아가다 보면 본의 아니게 이번 정부의 과거로 올라가면서 복기하는 듯한 느낌이 드는데 이것도 바로 그중 하나입니다. 생생하게 기억합니다. 이 때 장면들은. 특정 재벌 회사를 위한 특혜라고 말하는데, 어떤 기업을 말하는 건가요?', 16 | 'SK와 GS 입니다. 개정안이 통과되는 걸 전제로 두 회사는 외국 투자를 받아 증손회사 설립을 진행중이었기 때문인데요. 당시 개정안이 통과되지 않으면 두 기업이 수조원의 손실이 생길 수 있는 것으로 알려져 있었습니다. 허창수 GS 회장과 김창근 SK회장은 2013년 8월 박 대통령과 청와대에서 대기업 회장단 오찬자리에서 외촉법 통과를 요청한 바도 있습니다. ', 17 | '물론 두 기업과 최순실 씨와 연결고리가 나온 건 아니지만, 정 전 비서관 녹취파일 속 최 씨는 외촉법에 상당히 집착을 하는 걸로 보이긴 합니다.', 18 | '네 그렇습니다. 통화 내용을 다시 짚어보면요. 최 씨는 외촉법 관련 예산이 12월 2일, 반드시 되어야 한다, 작년 예산으로 돼서는 안 된다고 얘기하고 있는데요. 다시 말해서 외촉법 관련 예산안이 내년에 반영되어야 한다고 압박을 하고 있는 겁니다. 그러면서 "국민을 볼모로 잡고 있다"며 "국회와 정치권에 책임을 묻겠다"고 으름장까지 놓고 있는데요. 매우 집착하는 모습인데요. 이에 대해서 정 전 비서관이 "예산이 그렇게 빨리 통과된 적 없습니다"고 말하자 말을 끊으면서 매우 흥분한 듯, "그렇더라도, 그렇더라도" 하면서 "야당이 공약 지키라고 하면서 협조는 안 한다", "대통령으로 할 수 있는 일이 없다", "불공정 사태와 난맥상이 나온다"며 굉장한 압박까지 하고 있습니다.', 19 | '이 얘기들만 들여다봐도 마치 본인이 대통령처럼 얘기하고 있습니다. 내용들 보면 그렇지 않습니까? 혹시 최 씨가 이 외촉법 통과로 이득을 본 경우도 있습니까. ', 20 | '최 씨가 입김을 넣어 차은택 씨가 주도를 한 걸로 알려진 K컬처밸리 사업이 그렇다는 얘기가 나오고 있습니다. 외촉법을 편법으로 활용해 1% 금리를 적용받았다는 지적이 나오고 있습니다. 본격 사업이 추진되기 전 최순실 국정개입 사건이 터지기는 했지만, 이외에도 다른 혜택을 받았는지는 조사가 필요해 보입니다. ', 21 | '그런데 녹취파일을 보면 "남자1"이 등장합니다. 이 사람은 누구입니까?', 22 | '정 전 비서관을 "정 과장님"으로 부르며 반말을 하는 남자인데요. 최순실 씨처럼 정 전 비서관을 하대하고 있습니다. 또 청와대 내부 정보를 알고 있는 듯하고 또 인사에까지 개입하려고 하고 있습니다. 그렇기 때문에 정윤회 씨로 추정은 됩니다만 확인은 되지 않습니다.' 
23 | ] 24 | 25 | keyword=keyword_extractor(sents) 26 | for word, r in sorted(keyword.items(), key=lambda x: x[1], reverse=True)[:30]: 27 | print('%8s:\t%.4f' % (word, r)) 28 | 29 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2) 30 | 31 | # import nltk 32 | # nltk.download() 33 | # 단어 단위로 분리했으니 이제 stopwords를 제거하는게 가능합니다. ptm.helper.StopwordFilter를 사용하여 불필요한 단어들을 지워보도록 하겠습니다. 34 | # 그리고 파이프라인 뒤에 ptm.stemmer.Porter()를 추가하여 어근 추출을 해보겠습니다. 35 | # 한번 코드를 고쳐서 ptm.stemmer.Lancaster()도 사용해보세요. Lancaster stemmer가 Porter stemmer와 어떻게 다른지 비교하면 재미있을 겁니다. 36 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 37 | ptm.helper.POSFilter('NN*'), 38 | ptm.helper.SelectWordOnly(), 39 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 40 | result = pipeline.processCorpus(corpus) 41 | print(result) 42 | print() 43 | 44 | documents=[] 45 | for doc in result: 46 | document='' 47 | for sent in doc: 48 | document = " ".join(sent) 49 | documents.append(document) 50 | 51 | 52 | keyword_extractor1=ptm.keyword.KeywordExtractionKorean(min_count,max_length,beta,max_iter,verbose,num_words) 53 | keyword1=keyword_extractor1(documents) 54 | for word, r in sorted(keyword1.items(), key=lambda x: x[1], reverse=True)[:30]: 55 | print('%8s:\t%.4f' % (word, r)) -------------------------------------------------------------------------------- /examples/koreanLemmatizationTest.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | 4 | korean_lemmatizer=ptm.lemmatizer.KoreanLemmatizer() 5 | 6 | test = [ 7 | ('모', '았다'), 8 | ('하', '다'), 9 | ('서툰', ''), 10 | ('와서', ''), 11 | ('내려논', ''), 12 | ] 13 | 14 | for l, r in test: 15 | print('({}, {}) -> {}'.format(l, r, korean_lemmatizer(l + r))) 16 | # print(_lemma_candidate(l, r), end='\n\n') 17 | -------------------------------------------------------------------------------- /examples/koreanNounExtractionTest.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | 4 | #corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2) 5 | corpus=ptm.CorpusFromFile('./data/134963_norm.txt') 6 | # import nltk 7 | # nltk.download() 8 | # 단어 단위로 분리했으니 이제 stopwords를 제거하는게 가능합니다. ptm.helper.StopwordFilter를 사용하여 불필요한 단어들을 지워보도록 하겠습니다. 9 | # 그리고 파이프라인 뒤에 ptm.stemmer.Porter()를 추가하여 어근 추출을 해보겠습니다. 10 | # 한번 코드를 고쳐서 ptm.stemmer.Lancaster()도 사용해보세요. Lancaster stemmer가 Porter stemmer와 어떻게 다른지 비교하면 재미있을 겁니다. 
11 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 12 | ptm.helper.POSFilter('NN*'), 13 | ptm.helper.SelectWordOnly(), 14 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 15 | result = pipeline.processCorpus(corpus) 16 | print(result) 17 | print() 18 | 19 | documents=[] 20 | for doc in result: 21 | document='' 22 | for sent in doc: 23 | document = " ".join(sent) 24 | documents.append(document) 25 | 26 | #2016-10-20.txt 27 | corpus1=ptm.CorpusFromFile('./data/2016-10-20.txt') 28 | noun_extractor=ptm.noun_extractor.NounExtractionKorean(corpus1) 29 | sent='두바이월드센터시카고옵션거래소' 30 | result=noun_extractor.__call__(sent) 31 | print(result) -------------------------------------------------------------------------------- /examples/koreanSegmentationTest.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | test='이건진짜좋은영화라라랜드진짜좋은영화' 4 | 5 | model_path='./model/korean_segmentation_model.crfsuite' 6 | segmentation=ptm.segmentation.SegmentationKorean(model_path) 7 | correct=segmentation(test) 8 | print(correct) 9 | 10 | chatspace_segmentation=ptm.segmentation.ChatSpaceSegmentationKorean() 11 | chatspace_correct=chatspace_segmentation(test) 12 | print(chatspace_correct) 13 | 14 | lstm_model_path='./pyTextMiner/segmentation/model' 15 | lstm_segmentation=ptm.segmentation.LSTMSegmentationKorean(lstm_model_path) 16 | lstm_correct=lstm_segmentation(test) 17 | print(lstm_correct) 18 | 19 | lstm_segmentation.close() -------------------------------------------------------------------------------- /examples/koreanSpecialTokenizerTest.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | #tokenize by subwords 4 | scores = {'파스': 0.3, '파스타': 0.7, '좋아요': 0.2, '좋아':0.5} 5 | tokenizer = ptm.tokenizer.MaxScoreTokenizerKorean(scores=scores) 6 | tokens = tokenizer.inst.tokenize('파스타가좋아요') 7 | print(str(tokens)) 8 | 9 | #띄어쓰기가 잘 되어 있는 한국어 문서의 경우에는 MaxScoreTokenizer를 이용할 필요가 없다. 10 | # 한국어는 L+[R] 구조이기 때문이다 11 | # 이 때에는 한 어절의 왼쪽에서부터 글자 점수가 가장 높은 부분을 기준으로 토크나이징을 한다 12 | scores = {'데이':0.5, '데이터':0.5, '데이터마이닝':0.5, '공부':0.5, '공부중':0.45} 13 | tokenizer = ptm.tokenizer.LTokenizerKorean(scores=scores) 14 | print('\nflatten=True\nsent = 데이터마이닝을 공부한다') 15 | print(tokenizer.inst.tokenize('데이터마이닝을 공부한다')) 16 | 17 | print('\nflatten=False\nsent = 데이터마이닝을 공부한다') 18 | print(tokenizer.inst.tokenize('데이터마이닝을 공부한다', flatten=False)) 19 | 20 | print('\nflatten=False\nsent = 데이터분석을 위해서 데이터마이닝을 공부한다') 21 | print(tokenizer.inst.tokenize('데이터분석을 위해서 데이터마이닝을 공부한다', flatten=False)) 22 | 23 | print('\nflatten=True\nsent = 데이터분석을 위해서 데이터마이닝을 공부한다') 24 | print(tokenizer.inst.tokenize('데이터분석을 위해서 데이터마이닝을 공부한다')) 25 | 26 | #Tolerance는 한 어절에서 subword 들의 점수의 차이가 그 어절의 점수 최대값과 tolerance 이하로 난다면, 길이가 가장 긴 어절을 선택한다. 27 | # CohesionProbability에서는 합성명사들은 각각의 요소들보다 낮기 때문에 tolerance를 이용할 수 있다. 28 | # 29 | print('tolerance=0.0\nsent = 데이터마이닝을 공부중이다') 30 | print(tokenizer.inst.tokenize('데이터마이닝을 공부중이다')) 31 | 32 | print('\ntolerance=0.1\nsent = 데이터마이닝을 공부중이다') 33 | print(tokenizer.inst.tokenize('데이터마이닝을 공부중이다', tolerance=0.1)) 34 | 35 | #RegexTokenizer는 regular extression을 이용하여 언어가 달라지는 순간들을 띄어쓴다. 36 | # 영어의 경우에는 움라우트가 들어가는 경우들이 있어서 알파벳 뿐 아니라 라틴까지 포함하였다. 
37 | tokenizer = ptm.tokenizer.RegexTokenizerKorean() 38 | 39 | sents = [ 40 | '이렇게연속된문장은잘리지않습니다만', 41 | '숫자123이영어abc에섞여있으면ㅋㅋ잘리겠죠', 42 | '띄어쓰기가 포함되어있으면 이정보는10점!꼭띄워야죠' 43 | ] 44 | 45 | for sent in sents: 46 | print(' %s\n->%s\n' % (sent, tokenizer.inst.tokenize(sent))) 47 | -------------------------------------------------------------------------------- /examples/koreanTokenizerTest.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | import time 4 | from collections import Counter 5 | 6 | mecab_path='C:\\mecab\\mecab-ko-dic' 7 | komoran = ptm.tokenizer.Komoran() 8 | kkma = ptm.tokenizer.KokomaKorean() 9 | twitter = ptm.tokenizer.TwitterKorean() 10 | mecab = ptm.tokenizer.MeCab(mecab_path) 11 | 12 | sent = '물론 두 기업과 최순실 씨와 연결고리가 나온 건 아니지만, 정 전 비서관 녹취파일 속 최 씨는 외촉법에 상당히 집착을 하는 걸로 보이긴 합니다.' 13 | taggers = [komoran, kkma, twitter, mecab] 14 | names = 'komoran kkma twitter mecab'.split() 15 | for tagger in taggers: 16 | pos = tagger.inst.pos(sent) 17 | print(str(pos)) 18 | 19 | #performance (speed) measure 20 | sents = ['최순실 씨가 외국인투자촉진법 개정안 통과와 예산안 반영까지 꼼꼼이 챙긴 건데, 이른바 외촉법, 어떤 법이길래 최 씨가 열심히 챙긴 걸까요. 자신의 이해관계와 맞아 떨어지는 부분이 없었는지 취재기자와 한걸음 더 들여다보겠습니다. 이서준 기자, 우선 외국인투자촉진법 개정안, 어떤 내용입니까?', 21 | '한마디로 대기업이 외국 투자를 받아 계열사를 설립할 때 규제를 완화시켜 주는 법안입니다. 대기업 지주사의 손자 회사가 이른바 증손회사를 만들 때 지분 100%를 출자해야 합니다. 대기업의 문어발식 계열사 확장을 막기 위한 조치인데요. 외촉법 개정안은 손자회사가 외국 투자를 받아서 증손회사를 만들 땐 예외적으로 50% 지분만 투자해도 되게끔 해주는 내용입니다.', 22 | '그만큼 쉽게 완화해주는 거잖아요. 그때 기억을 더듬어보면 야당의 반발이 매우 심했습니다. 그 이유가 뭐였죠? ', 23 | '대기업 특혜 법안이라는 취지였는데요. (당연히 그랬겠죠.) 당시 박영선 의원의 국회 발언을 들어보시겠습니다. [박영선 의원/더불어민주당 (2013년 12월 31일) : 경제의 근간을 흔드는 법을 무원칙적으로 이렇게 특정 재벌 회사에게 특혜를 주기 위해 간청하는 민원법을 우리가 새해부터 왜 통과시켜야 합니까.]', 24 | '최순실 씨 사건을 쫓아가다 보면 본의 아니게 이번 정부의 과거로 올라가면서 복기하는 듯한 느낌이 드는데 이것도 바로 그중 하나입니다. 생생하게 기억합니다. 이 때 장면들은. 특정 재벌 회사를 위한 특혜라고 말하는데, 어떤 기업을 말하는 건가요?', 25 | 'SK와 GS 입니다. 개정안이 통과되는 걸 전제로 두 회사는 외국 투자를 받아 증손회사 설립을 진행중이었기 때문인데요. 당시 개정안이 통과되지 않으면 두 기업이 수조원의 손실이 생길 수 있는 것으로 알려져 있었습니다. 허창수 GS 회장과 김창근 SK회장은 2013년 8월 박 대통령과 청와대에서 대기업 회장단 오찬자리에서 외촉법 통과를 요청한 바도 있습니다. ', 26 | '물론 두 기업과 최순실 씨와 연결고리가 나온 건 아니지만, 정 전 비서관 녹취파일 속 최 씨는 외촉법에 상당히 집착을 하는 걸로 보이긴 합니다.', 27 | '네 그렇습니다. 통화 내용을 다시 짚어보면요. 최 씨는 외촉법 관련 예산이 12월 2일, 반드시 되어야 한다, 작년 예산으로 돼서는 안 된다고 얘기하고 있는데요. 다시 말해서 외촉법 관련 예산안이 내년에 반영되어야 한다고 압박을 하고 있는 겁니다. 그러면서 "국민을 볼모로 잡고 있다"며 "국회와 정치권에 책임을 묻겠다"고 으름장까지 놓고 있는데요. 매우 집착하는 모습인데요. 이에 대해서 정 전 비서관이 "예산이 그렇게 빨리 통과된 적 없습니다"고 말하자 말을 끊으면서 매우 흥분한 듯, "그렇더라도, 그렇더라도" 하면서 "야당이 공약 지키라고 하면서 협조는 안 한다", "대통령으로 할 수 있는 일이 없다", "불공정 사태와 난맥상이 나온다"며 굉장한 압박까지 하고 있습니다.', 28 | '이 얘기들만 들여다봐도 마치 본인이 대통령처럼 얘기하고 있습니다. 내용들 보면 그렇지 않습니까? 혹시 최 씨가 이 외촉법 통과로 이득을 본 경우도 있습니까. ', 29 | '최 씨가 입김을 넣어 차은택 씨가 주도를 한 걸로 알려진 K컬처밸리 사업이 그렇다는 얘기가 나오고 있습니다. 외촉법을 편법으로 활용해 1% 금리를 적용받았다는 지적이 나오고 있습니다. 본격 사업이 추진되기 전 최순실 국정개입 사건이 터지기는 했지만, 이외에도 다른 혜택을 받았는지는 조사가 필요해 보입니다. ', 30 | '그런데 녹취파일을 보면 "남자1"이 등장합니다. 이 사람은 누구입니까?', 31 | '정 전 비서관을 "정 과장님"으로 부르며 반말을 하는 남자인데요. 최순실 씨처럼 정 전 비서관을 하대하고 있습니다. 또 청와대 내부 정보를 알고 있는 듯하고 또 인사에까지 개입하려고 하고 있습니다. 그렇기 때문에 정윤회 씨로 추정은 됩니다만 확인은 되지 않습니다.' 
32 | ] 33 | 34 | tokens = [] 35 | 36 | for name, tagger in zip(names, taggers): 37 | t = time.time() 38 | tokens.append( 39 | [pos for sent in sents for pos in tagger.inst.pos(sent)] 40 | ) 41 | t = time.time() - t 42 | 43 | print('{:8}: {:.3f} secs'.format(name, t)) 44 | 45 | #print first 15 words in the first sentence --> example of out of vocabulary problem 46 | print(tokens[0][:15]) 47 | print('\n\n') 48 | 49 | #word frequency calculation 50 | counter = Counter(tokens[0]) 51 | counter = { 52 | word:freq for word, freq in counter.items() 53 | if (freq >= 4) and (word[1][:2] == 'NN') 54 | } 55 | print(sorted(counter.items(), key=lambda x:-x[1])) 56 | 57 | print('\n\n') 58 | 59 | #using all three POS tokenizer 60 | for name, tokens_ in zip(names, tokens): 61 | 62 | print('\n\nTagger name = {}'.format(name)) 63 | 64 | counter = Counter(tokens_) 65 | counter = {word:freq for word, freq in counter.items() 66 | if (freq >= 4) and (word[1][:1] == 'N')} 67 | 68 | print(sorted(counter.items(), key=lambda x:x[1], reverse=True)) 69 | 70 | -------------------------------------------------------------------------------- /examples/naver_newscomments_processor.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from time import time 3 | 4 | import gensim 5 | import pyTextMiner as ptm 6 | from gensim.models import Word2Vec 7 | 8 | cores = multiprocessing.cpu_count() # Count the number of cores in a computer 9 | 10 | print('Start reading the dataset 1....') 11 | path = '/usr/local/lib/mecab/dic/mecab-ko-dic' 12 | 13 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 14 | ptm.tokenizer.MeCab(path), 15 | ptm.lemmatizer.SejongPOSLemmatizer(), 16 | ptm.helper.SelectWordOnly(), 17 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 18 | 19 | corpus = ptm.CorpusFromFieldDelimitedEmojiFile('/Data/ko_sns_comments/xab',1) 20 | result1 = pipeline.processCorpus(corpus) 21 | 22 | print ('Finish processing... 
') 23 | 24 | i = 0 25 | file = open("naver_comments15_16_filtered.txt", "a+") 26 | for doc in result1: 27 | if i % 10000 == 0: 28 | print('processing ' + str(i)) 29 | i += 1 30 | document = '' 31 | for sent in doc: 32 | for word in sent: 33 | document += word + ' ' 34 | file.write(document.strip() + '\n') 35 | 36 | file.close() 37 | print('Document size for the total dataset: ' + str(i)) 38 | 39 | -------------------------------------------------------------------------------- /examples/node2vec_tester.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | from py_node2vec.node2vecModel import Node2VecModel 3 | 4 | embedding_filename='./node2vec.emb' 5 | n2vec = Node2VecModel() 6 | n2vec.load_model(embedding_filename) 7 | results= n2vec.most_similars('정치') 8 | print(results) 9 | 10 | pair_similarity = n2vec.compute_similarity('문재인', '정치') 11 | for pair in pair_similarity: 12 | print(str(pair[0]) + " -- " + str(pair[1])) -------------------------------------------------------------------------------- /examples/node2vec_traianer.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | 3 | import pyTextMiner as ptm 4 | import re 5 | from py_node2vec.node2vecModel import Node2VecModel 6 | 7 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 8 | stopword_file = '../stopwords/stopwordsKor.txt' 9 | 10 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 11 | ptm.tokenizer.MeCab(mecab_path), 12 | ptm.lemmatizer.SejongPOSLemmatizer(), 13 | ptm.helper.SelectWordOnly(), 14 | ptm.helper.StopwordFilter(file=stopword_file)) 15 | 16 | corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt',2) 17 | result = pipeline.processCorpus(corpus) 18 | #print(result) 19 | #print() 20 | 21 | documents = [] 22 | for doc in result: 23 | document = '' 24 | for sent in doc: 25 | n_sent = " ".join(sent) 26 | #for English text to remove special chars 27 | document += re.sub('[^A-Za-zㄱ-ㅣ가-힣 ]+', '', n_sent) 28 | documents.append(document) 29 | 30 | co = ptm.cooccurrence.CooccurrenceWorker() 31 | co_result, vocab = co.__call__(documents) 32 | 33 | cv = CountVectorizer() 34 | cv_fit = cv.fit_transform(documents) 35 | word_list = cv.get_feature_names(); 36 | count_list = cv_fit.toarray().sum(axis=0) 37 | word_hist = dict(zip(word_list, count_list)) 38 | 39 | threshold = 2.0 40 | dimensions=300 41 | walk_length=30 42 | num_walks=200 43 | 44 | n2vec = Node2VecModel() 45 | 46 | n2vec.create_graph(co_result, word_hist, threshold) 47 | n2vec.train(dimensions, walk_length, num_walks) 48 | 49 | embedding_filename='node2vec.emb' 50 | embedding_model_file='node2vec.model' 51 | n2vec.save_model(embedding_filename,embedding_model_file) -------------------------------------------------------------------------------- /examples/scibert_ner_train.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | from torch.utils.data import DataLoader 6 | 7 | from transformers import (get_linear_schedule_with_warmup, 8 | BertForTokenClassification, 9 | AutoTokenizer) 10 | 11 | from py_ner.data_utils.ner_dataset import read_data_from_file, get_labels, NerDataset 12 | from py_ner.model.optimizers import get_optimizer_with_weight_decay 13 | 14 | # https://github.com/cambridgeltl/MTL-Bioinformatics-2016/tree/master/data 15 | from py_ner.scibert_ner_train import SciBertTrainer 
16 | 17 | #dataset for NER 18 | DATA_TR_PATH = '../py_ner/data/JNLPBA/Genia4ERtask1.iob2' 19 | DATA_TS_PATH = '../py_ner/data/JNLPBA/Genia4EReval1.iob2' 20 | SEED = 42 21 | 22 | # MODEL 23 | #MODEL_NAME = 'allenai/scibert_scivocab_uncased' 24 | #MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT' 25 | #MODEL_NAME = 'adamlin/ClinicalBert_all_notes' 26 | #MODEL_NAME = 'monologg/biobert_v1.0_pubmed_pmc' 27 | MODEL_NAME = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' 28 | MAX_LEN_SEQ = 128 29 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 30 | 31 | # Optimization parameters 32 | N_EPOCHS = 6 33 | BATCH_SIZE = 8 34 | BATCH_SIZE_VAL = 28 35 | WEIGHT_DECAY = 0 36 | LEARNING_RATE = 5e-5 # 2e-4 37 | RATIO_WARMUP_STEPS = .1 38 | DROPOUT = .3 39 | ACUMULATE_GRAD_EVERY = 4 40 | OPTIMIZER = Adam 41 | 42 | # Seeds 43 | random.seed(SEED) 44 | np.random.seed(SEED) 45 | torch.manual_seed(SEED) 46 | torch.cuda.manual_seed_all(SEED) 47 | 48 | # get data 49 | training_set = read_data_from_file(DATA_TR_PATH) 50 | test_set = read_data_from_file(DATA_TS_PATH) 51 | 52 | # Automatically extract labels and their indexes from data. 53 | labels2ind, labels_count = get_labels(training_set + test_set) 54 | 55 | # Load tokenizer 56 | tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 57 | 58 | # Create loaders for datasets 59 | training_set = NerDataset(dataset=training_set, 60 | tokenizer=tokenizer, 61 | labels2ind=labels2ind, 62 | max_len_seq=MAX_LEN_SEQ) 63 | 64 | test_set = NerDataset(dataset=test_set, 65 | tokenizer=tokenizer, 66 | labels2ind=labels2ind, 67 | max_len_seq=MAX_LEN_SEQ) 68 | 69 | dataloader_tr = DataLoader(dataset=training_set, 70 | batch_size=BATCH_SIZE, 71 | shuffle=True) 72 | 73 | dataloader_ts = DataLoader(dataset=test_set, 74 | batch_size=BATCH_SIZE_VAL, 75 | shuffle=False) 76 | 77 | # Load model 78 | nerbert = BertForTokenClassification.from_pretrained(MODEL_NAME, 79 | hidden_dropout_prob=DROPOUT, 80 | attention_probs_dropout_prob=DROPOUT, 81 | label2id=labels2ind, 82 | num_labels=len(labels2ind), 83 | id2label={str(v): k for k, v in labels2ind.items()}) 84 | 85 | # Prepare optimizer and schedule (linear warmup and decay) 86 | optimizer = get_optimizer_with_weight_decay(model=nerbert, 87 | optimizer=OPTIMIZER, 88 | learning_rate=LEARNING_RATE, 89 | weight_decay=WEIGHT_DECAY) 90 | 91 | training_steps = (len(dataloader_tr)//ACUMULATE_GRAD_EVERY) * N_EPOCHS 92 | scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, 93 | num_warmup_steps=training_steps * RATIO_WARMUP_STEPS, 94 | num_training_steps=training_steps) 95 | 96 | # Trainer 97 | trainer = SciBertTrainer(model=nerbert, 98 | tokenizer=tokenizer, 99 | optimizer=optimizer, 100 | scheduler=scheduler, 101 | dataloader_train=dataloader_tr, 102 | dataloader_test=dataloader_ts, 103 | labels2ind=labels2ind, 104 | device=DEVICE, 105 | n_epochs=N_EPOCHS, 106 | accumulate_grad_every=ACUMULATE_GRAD_EVERY, 107 | output_dir='./models') 108 | 109 | tr_losses, val_losses = trainer.train() 110 | 111 | -------------------------------------------------------------------------------- /examples/scibert_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForTokenClassification 3 | 4 | # Example 5 | text = "Mouse thymus was used as a source of glucocorticoid receptor from normal CS lymphocytes." 
6 | 7 | # Load model 8 | tokenizer = AutoTokenizer.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba") 9 | model = AutoModelForTokenClassification.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba") 10 | 11 | #tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 12 | #model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 13 | 14 | # Get input for BERT 15 | input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0) 16 | 17 | # Predict 18 | with torch.no_grad(): 19 | outputs = model(input_ids) 20 | 21 | # From the output let's take the first element of the tuple. 22 | # Then, let's get rid of [CLS] and [SEP] tokens (first and last) 23 | predictions = outputs[0].argmax(axis=-1)[0][1:-1] 24 | 25 | # Map label class indexes to string labels. 26 | for token, pred in zip(tokenizer.tokenize(text), predictions): 27 | print(token, '->', model.config.id2label[pred.numpy().item()]) -------------------------------------------------------------------------------- /examples/test222.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | _stopwords = [] 4 | with open("./stopwords/stopwordsKor.txt", encoding='utf-8') as file: 5 | for line in file: 6 | line = line.strip() #or some other preprocessing 7 | _stopwords.append(line) #storing everything in memory! 8 | 9 | path='C:\\mecab\\mecab-ko-dic' 10 | #pos_tagger_name - either komoran, okt, nltk 11 | #lang = ko or en 12 | pipeline = ptm.Pipeline(ptm.keyword.TextRankExtractor(pos_tagger_name='mecab', 13 | mecab_path=path, 14 | max=5, 15 | lang='ko', 16 | stopwords=_stopwords, 17 | combined_keywords=True)) 18 | 19 | corpus = ptm.CorpusFromFile('./data/sampleKor.txt') 20 | result = pipeline.processCorpus(corpus) 21 | print('== Splitting Sentence ==') 22 | print(result) 23 | print() 24 | 25 | from sklearn.datasets import fetch_20newsgroups 26 | ng20 = fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes')) 27 | 28 | print("XXXX " + str(ng20.data[0])) -------------------------------------------------------------------------------- /examples/test3.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | dictionary_path='./dict/user_dic.txt' 4 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 5 | ptm.tokenizer.Komoran(userdic=dictionary_path), 6 | ptm.helper.POSFilter('NN*'), 7 | ptm.helper.SelectWordOnly(), 8 | #ptm.tokenizer.MaxScoreTokenizerKorean(), 9 | #ptm.tokenizer.Word(), 10 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 11 | #ptm.ngram.NGramTokenizer(2,3), 12 | #ptm.counter.WordCounter()) 13 | 14 | corpus = ptm.CorpusFromEojiFile('./data/filtered_content.txt') 15 | #result = pipeline.processCorpus(corpus) 16 | 17 | #print(result) 18 | print() 19 | 20 | import numpy as np 21 | print(np.__version__) 22 | 23 | s = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습." 24 | 25 | 26 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter()) 27 | corpus = [s] 28 | result = pipeline.processCorpus(corpus) 29 | print(result) -------------------------------------------------------------------------------- /examples/test4.py: -------------------------------------------------------------------------------- 1 | import tomotopy as tp 2 | print(tp.isa) # 'avx2'나 'avx', 'sse2', 'none'를 출력합니다. 
3 | 4 | import pyTextMiner as ptm 5 | import io 6 | import nltk 7 | 8 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 9 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 10 | ptm.tokenizer.MeCab(mecab_path), 11 | #ptm.tokenizer.Komoran(), 12 | ptm.lemmatizer.SejongPOSLemmatizer(), 13 | ptm.helper.SelectWordOnly(), 14 | #ptm.ngram.NGramTokenizer(1, 2, concat=' ')) 15 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 16 | 17 | documents = ['오늘은 비가와서 그런지 매우 우울한 날이다', 18 | '시험이 끝나야 놀지 스트레스 받아ㅠㅠ', 19 | '행복한 하루의 끝이라 좋네!'] 20 | 21 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 22 | #result = pipeline.processCorpus(corpus) 23 | 24 | result = pipeline.processCorpus(documents) 25 | print(result) 26 | 27 | 28 | from soylemma import Lemmatizer 29 | lemmatizer = Lemmatizer(dictionary_name='default') 30 | re = lemmatizer.lemmatize('밝은') 31 | print('result ' + str(re)) 32 | 33 | test_list = ['http://www.google.com', "why", "ftpfjdjkwjkjw", "no no!"] 34 | PROTOCOLS = ('http', 'https', 'ftp', 'git') 35 | for s in test_list: 36 | if s.startswith(tuple(p for p in PROTOCOLS)): 37 | print("true " + s) 38 | else: 39 | print("false " + s) 40 | 41 | -------------------------------------------------------------------------------- /examples/testCooccurrence.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | import networkx as nx 3 | from matplotlib import pyplot as plt 4 | import numpy as np 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | import matplotlib as mpl 7 | 8 | if __name__ == '__main__': 9 | #pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.chunker.KoreanChunker()) 10 | 11 | # 다음은 분석에 사용할 corpus를 불러오는 일입니다. sampleEng.txt 파일을 준비해두었으니, 이를 읽어와봅시다. 12 | # ptm의 CorpusFromFile이라는 클래스를 통해 문헌집합을 가져올 수 있습니다. 이 경우 파일 내의 한 라인이 문헌 하나가 됩니다. 13 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 14 | 15 | #import nltk 16 | #nltk.download() 17 | # 단어 단위로 분리했으니 이제 stopwords를 제거하는게 가능합니다. ptm.helper.StopwordFilter를 사용하여 불필요한 단어들을 지워보도록 하겠습니다. 18 | # 그리고 파이프라인 뒤에 ptm.stemmer.Porter()를 추가하여 어근 추출을 해보겠습니다. 19 | # 한번 코드를 고쳐서 ptm.stemmer.Lancaster()도 사용해보세요. Lancaster stemmer가 Porter stemmer와 어떻게 다른지 비교하면 재미있을 겁니다. 
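    # The comments above suggest comparing Porter and Lancaster stemming. A minimal
    # side-by-side check, assuming NLTK's stem module is installed, would be:
    #
    #   from nltk.stem import PorterStemmer, LancasterStemmer
    #   for w in ['running', 'relational', 'maximum']:
    #       print(w, PorterStemmer().stem(w), LancasterStemmer().stem(w))
    #
    # Lancaster is the more aggressive of the two, so it generally yields shorter stems.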
20 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 21 | ptm.tokenizer.Komoran(), 22 | ptm.helper.POSFilter('NN*'), 23 | ptm.helper.SelectWordOnly(), 24 | ptm.ngram.NGramTokenizer(1,2), 25 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 26 | ) 27 | result = pipeline.processCorpus(corpus) 28 | print('== 형태소 분석 + 명사만 추출 + 단어만 보여주기 + 빈도 분석 ==') 29 | print(result) 30 | print() 31 | 32 | print('== ==') 33 | 34 | import re 35 | documents = [] 36 | for doc in result: 37 | document = '' 38 | for sent in doc: 39 | document = " ".join(sent) 40 | #for English text to remove special chars 41 | document = re.sub('[^A-Za-z0-9]+', '', document) 42 | 43 | documents.append(document) 44 | co = ptm.cooccurrence.CooccurrenceWorker() 45 | co_result, vocab = co.__call__(documents) 46 | 47 | graph_builder = ptm.graphml.GraphMLCreator() 48 | 49 | #mode is either with_threshold or without_threshod 50 | mode='with_threshold' 51 | 52 | if mode is 'without_threshold': 53 | print(str(co_result)) 54 | print(str(vocab)) 55 | graph_builder.createGraphML(co_result, vocab, "test1.graphml") 56 | 57 | elif mode is 'with_threshold': 58 | cv = CountVectorizer() 59 | cv_fit = cv.fit_transform(documents) 60 | word_list = cv.get_feature_names(); 61 | count_list = cv_fit.toarray().sum(axis=0) 62 | word_hist = dict(zip(word_list, count_list)) 63 | 64 | print(str(co_result)) 65 | print(str(word_hist)) 66 | 67 | graph_builder.createGraphMLWithThreshold(co_result, word_hist, vocab, "test.graphml",threshold=35.0) 68 | display_limit=50 69 | graph_builder.summarize_centrality(limit=display_limit) 70 | title = '동시출현 기반 그래프' 71 | file_name='test.png' 72 | graph_builder.plot_graph(title,file=file_name) 73 | -------------------------------------------------------------------------------- /examples/testDocTermMatrix.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 3 | import pyTextMiner as ptm 4 | 5 | def vectorizeCaseOne(): 6 | documents = [ 7 | 'This is the first document.', 8 | 'This document is the second document.', 9 | 'And this is the third one.', 10 | 'Is this the first document?', 11 | ] 12 | 13 | vectorizer = CountVectorizer() 14 | X = vectorizer.fit_transform(documents) 15 | print(vectorizer.get_feature_names()) 16 | print(X.toarray()) 17 | 18 | vectorizer = TfidfVectorizer() 19 | X = vectorizer.fit_transform(documents) 20 | print(vectorizer.get_feature_names()) 21 | print(X.toarray()) 22 | 23 | def vectorizeCaseTwo(): 24 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 25 | 26 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 27 | ptm.tokenizer.Komoran(), 28 | ptm.helper.POSFilter('NN*'), 29 | ptm.helper.SelectWordOnly(), 30 | ptm.ngram.NGramTokenizer(2, 2), 31 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 32 | ) 33 | result = pipeline.processCorpus(corpus) 34 | print('== 형태소 분석 + 명사만 추출 + 단어만 보여주기 + 빈도 분석 ==') 35 | print(result) 36 | print() 37 | 38 | print('== ==') 39 | 40 | documents = [] 41 | for doc in result: 42 | document = '' 43 | for sent in doc: 44 | document += " ".join(sent) 45 | documents.append(document) 46 | 47 | vectorizer = CountVectorizer() 48 | X = vectorizer.fit_transform(documents) 49 | print(vectorizer.get_feature_names()) 50 | print(X.shape) 51 | 52 | print(X.toarray()) 53 | 54 | vectorizer = TfidfVectorizer() 55 | X = vectorizer.fit_transform(documents) 56 | print(vectorizer.get_feature_names()) 57 | 
print(len(vectorizer.get_feature_names())) 58 | print(X.toarray()) 59 | 60 | 61 | #vectorizeCaseOne() 62 | 63 | vectorizeCaseTwo() 64 | 65 | -------------------------------------------------------------------------------- /examples/testEXCo.py: -------------------------------------------------------------------------------- 1 | import os, subprocess 2 | 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | 5 | import pyTextMiner as ptm 6 | 7 | mecab_path='C:\\mecab\\mecab-ko-dic' 8 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 9 | ptm.tokenizer.MeCab(mecab_path), 10 | ptm.helper.POSFilter('NN*'), 11 | ptm.helper.SelectWordOnly(), 12 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 13 | 14 | corpus = ptm.CorpusFromFile('./data/134963_norm.txt') 15 | result = pipeline.processCorpus(corpus) 16 | 17 | with open('processed_134963.txt', 'w', encoding='utf-8') as f_out: 18 | for doc in result: 19 | for sent in doc: 20 | new_sent = '' 21 | for word in sent: 22 | new_sent += word + ' ' 23 | new_sent = new_sent.strip() 24 | f_out.write(new_sent + "\n") 25 | f_out.close() 26 | 27 | file_path='D:\\python_workspace\\pyTextMiner\\processed_134963.txt' 28 | co='D:\\python_workspace\\pyTextMiner\\external_programs\\ccount.exe ' + "--input " + file_path + " --threshold " + str(2) + " --output " + "co_result.txt" 29 | 30 | subprocess.run(co, shell=True) 31 | co_results={} 32 | vocabulary = {} 33 | with open("co_result.txt", 'r', encoding='utf-8') as f_in: 34 | for line in f_in: 35 | fields = line.split() 36 | token1 = fields[0] 37 | token2 = fields[1] 38 | token3 = fields[2] 39 | 40 | tup=(str(token1),str(token2)) 41 | co_results[tup]=float(token3) 42 | 43 | vocabulary[token1] = vocabulary.get(token1, 0) + 1 44 | vocabulary[token2] = vocabulary.get(token2, 0) + 1 45 | 46 | word_hist = dict(zip(vocabulary.keys(), vocabulary.values())) 47 | 48 | graph_builder = ptm.graphml.GraphMLCreator() 49 | 50 | #mode is either with_threshold or without_threshod 51 | mode='with_threshold' 52 | 53 | if mode is 'without_threshold': 54 | graph_builder.createGraphML(co_results, vocabulary.keys(), "test1.graphml") 55 | 56 | elif mode is 'with_threshold': 57 | graph_builder.createGraphMLWithThresholdInDictionary(co_results, word_hist, "test.graphml",threshold=35.0) 58 | display_limit=50 59 | graph_builder.summarize_centrality(limit=display_limit) 60 | title = '동시출현 기반 그래프' 61 | file_name='test.png' 62 | graph_builder.plot_graph(title,file=file_name) 63 | -------------------------------------------------------------------------------- /examples/testFirst.py: -------------------------------------------------------------------------------- 1 | # yTextMiner의 파이썬 버전, PyTextMiner를 ptm이라는 이름으로 사용하겠다고 선언합니다 2 | # ptm 역시 파이프라인 구조로 텍스트를 처리합니다. 3 | # 만약 pyTextMiner에 빨간줄이 계속 뜬다면 왼쪽의 Project 트리뷰에서 pyTextMiner가 포함된 폴더를 우클릭하여 4 | # 'Mark Directory as'에서 'Sources Root'를 눌러주도록 합시다. 5 | # 이 패키지가 동작하기 위해서는 konlpy와 nltk라는 라이브러리가 필요합니다. konlpy는 저번에 설치했으므로, 6 | # 이번에는 nltk를 설치해봅시다. pip install nltk로 간단하게 설치하시면 됩니다. 7 | import pyTextMiner as ptm 8 | import io 9 | 10 | # 다음은 분석에 사용할 corpus를 불러오는 일입니다. sampleEng.txt 파일을 준비해두었으니, 이를 읽어와봅시다. 11 | # ptm의 CorpusFromFile이라는 클래스를 통해 문헌집합을 가져올 수 있습니다. 이 경우 파일 내의 한 라인이 문헌 하나가 됩니다. 12 | #corpus = ptm.CorpusFromFile('donald.txt') 13 | corpus = ptm.CorpusFromDirectory('./tmp', True) 14 | 15 | #corpus, pair_map = ptm.CorpusFromFieldDelimitedFileWithYear('./data/donald.txt') 16 | 17 | # 이번에는 PyTextMiner로 한국어 처리를 해보도록 하겠습니다. 
한국어의 교착어적인 특성 및 복잡한 띄어쓰기 규칙 때문에 18 | # 공백 기준으로 단어를 분리하는 것에는 한계가 있어서 형태소 분석기를 사용합니다. 19 | # ptm.tokenizer.Komoran나 ptm.tokenizer.TwitterKorean을 사용해 형태소 분석이 가능합니다. 20 | # 형태소 분석 이후 품사가 NN으로 시작하는 명사들만 추출하고, 단어만 골라내 출력하도록 해봅시다. 21 | 22 | #import nltk 23 | #nltk.download('punkt') 24 | 25 | #pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 26 | # ptm.helper.POSFilter('NN*'), 27 | # ptm.helper.SelectWordOnly(), 28 | # ptm.ngram.NGramTokenizer(3), 29 | # ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 30 | # ) 31 | 32 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 33 | ptm.segmentation.SegmentationKorean('./model/korean_segmentation_model.crfsuite'), 34 | ptm.ngram.NGramTokenizer(3), 35 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 36 | ) 37 | 38 | result = pipeline.processCorpus(corpus) 39 | 40 | with io.open("demofile.csv",'w',encoding='utf8') as f: 41 | for doc in result: 42 | for sent in doc: 43 | f.write('\t'.join(sent) + "\n") 44 | 45 | print('== 문장 분리 + 형태소 분석 + 명사만 추출 + 단어만 보여주기 + 구 추출 ==') 46 | print(result) 47 | print() 48 | -------------------------------------------------------------------------------- /examples/testMallet.py: -------------------------------------------------------------------------------- 1 | from topic_model.MalletWrapper import MalletTopicModel 2 | 3 | model = MalletTopicModel('D:\python_workspace\pyTextMiner\mallet') 4 | #model.import_file(input=r'C:\mallet\topic_input\dblp_sample.txt') 5 | model.import_file(input=r'D:\python_workspace\pyTextMiner\mallet\topic_input\sample_dmr_input.txt') 6 | model.train_topics() 7 | 8 | #print(model.topic_keys) # see output_topic_keys parameter in Train Topics documentation 9 | # print(model.doc_topics) # see output_doc_topics parameter in Train Topics documentation 10 | #print(model.word_weights) # see topic_word_weights_file parameter in Train Topics documentationn -------------------------------------------------------------------------------- /examples/testPMI.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | corpus=ptm.CorpusFromFile('./data/2016-10-20.txt') 4 | pmi=ptm.pmi.PMICalculator(corpus) 5 | sent='아이오아이' 6 | result=pmi.__call__(sent) 7 | print(result) -------------------------------------------------------------------------------- /examples/test_document_clustering.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | from py_document_clustering.documentclustering import DocumentClustering 4 | 5 | if __name__ == '__main__': 6 | corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2) 7 | 8 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 9 | ptm.tokenizer.Komoran(), 10 | ptm.helper.POSFilter('NN*'), 11 | ptm.helper.SelectWordOnly(), 12 | #ptm.ngram.NGramTokenizer(2, 2), 13 | ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt') 14 | ) 15 | result = pipeline.processCorpus(corpus) 16 | print('== ==') 17 | 18 | documents = [] 19 | for doc in result: 20 | document = '' 21 | for sent in doc: 22 | document += " ".join(sent) 23 | documents.append(document) 24 | 25 | print(len(documents)) 26 | #name either k-means, agglo, spectral_cocluster 27 | name = 'agglo' 28 | clustering=DocumentClustering(k=5) 29 | #n_components means the number of words to be used as features 30 | clustering.make_matrix(documents,n_components=-1,doc2vec_matrix=None) 31 | clustering.cluster(name) 32 | clustering.print_results() 33 | 34 | 
clustering.visualize() 35 | -------------------------------------------------------------------------------- /examples/test_korean_lemmatizer.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | pipeline = None 4 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 5 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 6 | mode = 'korean_lemmatizer' 7 | if mode is not 'korean_lemmatizer': 8 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 9 | ptm.tokenizer.MeCab(mecab_path), 10 | #ptm.tokenizer.Komoran(), 11 | ptm.helper.SelectWordOnly(), 12 | ptm.ngram.NGramTokenizer(1,2,concat=' '), 13 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 14 | else : 15 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 16 | ptm.tokenizer.MeCab(mecab_path), 17 | #ptm.tokenizer.Komoran(), 18 | ptm.lemmatizer.SejongPOSLemmatizer(), 19 | ptm.helper.SelectWordOnly(), 20 | # ptm.ngram.NGramTokenizer(1, 2, concat=' ')) 21 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 22 | 23 | 24 | documents = ['오늘은 비가와서 그런지 매우 우울하다', 25 | '시험이 끝나야 놀지 스트레스 받아ㅠㅠ', 26 | '행복한 하루의 끝이라 아름답고 좋네!', 27 | '더운날에는 아이스 커피가 최고지~~!'] 28 | 29 | #result = pipeline.processCorpus(corpus) 30 | result = pipeline.processCorpus(documents) 31 | print(result) -------------------------------------------------------------------------------- /examples/test_pyTextMinerTopicModel.py: -------------------------------------------------------------------------------- 1 | from topic_model.pyTextMinerTopicModel import pyTextMinerTopicModel 2 | import pyTextMiner as ptm 3 | 4 | if __name__ == '__main__': 5 | 6 | mecab_path='C:\\mecab\\mecab-ko-dic' 7 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 8 | ptm.tokenizer.MeCab(mecab_path), 9 | ptm.helper.POSFilter('NN*'), 10 | ptm.helper.SelectWordOnly(), 11 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 12 | ) 13 | 14 | corpus = ptm.CorpusFromFieldDelimitedFileWithYear('./mallet/topic_input/sample_dmr_input.txt',doc_index=2,year_index=1) 15 | pair_map = corpus.pair_map 16 | 17 | result = pipeline.processCorpus(corpus.docs) 18 | text_data = [] 19 | for doc in result: 20 | new_doc = [] 21 | for sent in doc: 22 | for _str in sent: 23 | if len(_str) > 0: 24 | new_doc.append(_str) 25 | text_data.append(new_doc) 26 | 27 | topic_model = pyTextMinerTopicModel() 28 | topic_number=10 29 | mdl=None 30 | #mode is either lda, dmr, hdp, infer, etc 31 | mode='infer' 32 | label='' 33 | if mode is 'lda': 34 | print('Running LDA') 35 | label='LDA' 36 | lda_model_name = './test.lda.bin' 37 | mdl=topic_model.lda_model(text_data, lda_model_name, topic_number) 38 | 39 | print('perplexity score ' + str(mdl.perplexity)) 40 | 41 | elif mode is 'dmr': 42 | print('Running DMR') 43 | label='DMR' 44 | dmr_model_name='./test.dmr.bin' 45 | mdl=topic_model.dmr_model(text_data, pair_map, dmr_model_name, topic_number) 46 | 47 | print('perplexity score ' + str(mdl.perplexity)) 48 | 49 | elif mode is 'hdp': 50 | print('Running HDP') 51 | label='HDP' 52 | hdp_model_name='./test.hdp.bin' 53 | mdl, topic_num=topic_model.hdp_model(text_data, hdp_model_name) 54 | topic_number=topic_num 55 | elif mode is 'hlda': 56 | print('Running HLDA') 57 | label='HLDA' 58 | hlda_model_name = './test.hlda.bin' 59 | mdl=topic_model.hlda_model(text_data, hlda_model_name) 60 | elif mode is 'infer': 61 | lda_model_name = './test.lda.bin' 62 | unseen_text='아사이 베리 블루베리 비슷하다' 63 | topic_model.inferLDATopicModel(lda_model_name, unseen_text) 64 | 65 | if (mode is not 
'infer'): 66 | # The below code extracts this dominant topic for each sentence 67 | # and shows the weight of the topic and the keywords in a nicely formatted output. 68 | df_topic_sents_keywords, matrix = topic_model.format_topics_sentences(topic_number=topic_number, mdl=mdl) 69 | 70 | # Format 71 | df_dominant_topic = df_topic_sents_keywords.reset_index() 72 | df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'] 73 | df_dominant_topic.head(10) 74 | 75 | # Sometimes we want to get samples of sentences that most represent a given topic. 76 | # This code gets the most exemplar sentence for each topic. 77 | topic_model.distribution_document_word_count(df_topic_sents_keywords, df_dominant_topic) 78 | 79 | #When working with a large number of documents, 80 | # we want to know how big the documents are as a whole and by topic. 81 | #Let’s plot the document word counts distribution. 82 | topic_model.distribution_word_count_by_dominant_topic(df_dominant_topic) 83 | 84 | # Though we’ve already seen what are the topic keywords in each topic, 85 | # a word cloud with the size of the words proportional to the weight is a pleasant sight. 86 | # The coloring of the topics I’ve taken here is followed in the subsequent plots as well. 87 | ##topic_model.word_cloud_by_topic(mdl) 88 | 89 | # Let’s plot the word counts and the weights of each keyword in the same chart. 90 | topic_model.word_count_by_keywords(mdl,matrix) 91 | 92 | # Each word in the document is representative of one of the N topics. 93 | # Let’s color each word in the given documents by the topic id it is attributed to. 94 | # The color of the enclosing rectangle is the topic assigned to the document. 95 | topic_model.sentences_chart(mdl,start=0, end=5, topic_number=topic_number) 96 | 97 | #visualize documents by tSNE 98 | topic_model.tSNE(mdl,matrix,label,topic_number=10) 99 | 100 | topic_model.make_pyLDAVis(mdl,matrix,text_data) 101 | -------------------------------------------------------------------------------- /examples/test_word2veclite.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word2veclite import Word2Vec 3 | 4 | corpus = "I like playing football with my friends" 5 | cbow = Word2Vec(method="cbow", corpus=corpus, 6 | window_size=1, n_hidden=2, 7 | n_epochs=10, learning_rate=0.8) 8 | W1, W2, loss_vs_epoch = cbow.run() 9 | 10 | print(W1) 11 | #[[ 0.99870389 0.20697257] 12 | # [-1.01911559 2.26364436] 13 | # [-0.69737232 0.14131477] 14 | # [ 3.28315183 1.13801973] 15 | # [-1.42944927 -0.62142097] 16 | # [ 0.65359329 -2.21415048] 17 | # [-0.22343751 -1.17927987]] 18 | 19 | print(W2) 20 | #[[-0.97080793 1.21120331 2.15603796 -1.79083151 3.38445043 -1.65295511 21 | # 1.36685097] 22 | # [2.77323464 0.78710269 2.74152617 0.08953005 0.04400675 -1.34149651 23 | # -2.19375528]] 24 | 25 | print(loss_vs_epoch) 26 | #[14.328868654443703, 12.290456644464603, 10.366644621637064, 27 | # 9.1759777684446622, 8.4233626997233895, 7.3952948684910256, 28 | # 6.1727393307549736, 5.1639476117698191, 4.6333377088153043, 29 | # 4.2944697259465485] 30 | 31 | #smax=cbow.predict('I like playing',W1,W2) 32 | #print(smax) -------------------------------------------------------------------------------- /examples/testt.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | import io 4 | from nltk.corpus import sentiwordnet as swn 5 | import nltk 6 | 7 | class 
EnglishDictionarySentimentAnalyzer: 8 | def __init__(self): 9 | name = 'EnglishDictionarySentimentAnalyzer' 10 | 11 | def createDictionary(self): 12 | nltk.download('sentiwordnet') 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | corpus = ptm.CorpusFromFile('./data/sampleEng.txt') 18 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 19 | ptm.tokenizer.Word(), 20 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsEng.txt'), 21 | ptm.tagger.NLTK(), 22 | ptm.lemmatizer.WordNet()) 23 | 24 | result = pipeline.processCorpus(corpus) 25 | 26 | EnglishDictionarySentimentAnalyzer().createDictionary() 27 | 28 | for doc in result: 29 | for sent in doc: 30 | for _str in sent: 31 | _str[0] 32 | _str[1] 33 | pos = '' 34 | if (str(_str[1]).startswith("N")): 35 | pos = 'n' 36 | elif (str(_str[1]).startswith("A")): 37 | pos = 'a' 38 | elif (str(_str[1]).startswith("V")): 39 | pos = 'v' 40 | try: 41 | if (len(pos) > 0): 42 | breakdown = swn.senti_synset(str(_str[0]) + '.'+ pos + '.01') 43 | print(str(breakdown) + " " + str(breakdown.pos_score()) 44 | + " " + str(breakdown.neg_score()) + " " + str(breakdown.obj_score())) 45 | except: 46 | print('not found') 47 | 48 | -------------------------------------------------------------------------------- /examples/word2vec_tester.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word_embeddings import Word2Vec 3 | 4 | word2vec = Word2Vec() 5 | binary=True 6 | model_file = 'word2vec.bin' 7 | word2vec.load_model(model_file, binary) 8 | 9 | print(word2vec.most_similar(positives=['이재명', '경제'], negatives=['정치인'], topn=10)) 10 | print('-----------------------------------') 11 | 12 | print(word2vec.similar_by_word('이재명')) 13 | -------------------------------------------------------------------------------- /examples/word2vec_trainer.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word_embeddings import Word2Vec 3 | 4 | word2vec = Word2Vec() 5 | mode = 'simple' 6 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 7 | stopword_file = '../stopwords/stopwordsKor.txt' 8 | files = [] 9 | files.append('../data/donald.txt') 10 | is_directory=False 11 | doc_index=2 12 | max=-1 13 | word2vec.preprocessing(mode,mecab_path,stopword_file,files,is_directory,doc_index,max) 14 | 15 | min_count=1 16 | window=5 17 | size=50 18 | negative=5 19 | word2vec.train(min_count, window, size, negative) 20 | 21 | model_file = 'word2vec.bin' 22 | binary=True; 23 | word2vec.save_model(model_file, binary) 24 | 25 | 26 | -------------------------------------------------------------------------------- /examples/zipfsManager.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import pyTextMiner as ptm 3 | 4 | dictionary_path='./dict/user_dic.txt' 5 | 6 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 7 | ptm.tokenizer.Komoran(), 8 | #ptm.tokenizer.WordPos(), 9 | ptm.helper.POSFilter('NN*'), 10 | ptm.helper.SelectWordOnly(), 11 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'), 12 | ptm.counter.WordCounter()) 13 | 14 | corpus = ptm.CorpusFromFile('./data/sampleEng.txt') 15 | 16 | #corpus = ptm.CorpusFromFile('Gulliver_Travels.txt') 17 | #pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 18 | # ptm.tokenizer.Word(), 19 | # ptm.counter.WordCounter()) 20 | result = pipeline.processCorpus(corpus) 21 | 22 | print(result) 23 | print() 24 | 25 | doc_collection = '' 26 | term_counts = {} 27 | for doc in result: 28 | for sent in doc: 29 
| for _str in sent: 30 | term_counts[_str[0]] = term_counts.get(_str[0], 0) + int(_str[1]) 31 | freq = range(int(_str[1])) 32 | co = '' 33 | for n in freq: 34 | co += ' ' + _str[0] 35 | 36 | doc_collection += ' ' + co 37 | word_freq = [] 38 | for key, value in term_counts.items(): 39 | word_freq.append((value,key)) 40 | 41 | word_freq.sort(reverse=True) 42 | print(word_freq) 43 | 44 | f = open("demo_result.txt", "w", encoding='utf8') 45 | for pair in word_freq: 46 | f.write(pair[1] + '\t' + str(pair[0]) + '\n') 47 | f.close() 48 | 49 | from wordcloud import WordCloud 50 | 51 | # Read the whole text. 52 | 53 | # Generate a word cloud image 54 | wordcloud = WordCloud().generate(doc_collection) 55 | 56 | # Display the generated image: 57 | # the matplotlib way: 58 | import matplotlib.pyplot as plt 59 | 60 | # Window의 경우 폰트 경로 61 | # font_path = 'C:/Windows/Fonts/malgun.ttf' 62 | 63 | #for Mac 64 | #font_path='/Library/Fonts/AppleGothic.ttf' 65 | 66 | # lower max_font_size 67 | wordcloud = WordCloud(max_font_size=40, 68 | background_color='white', 69 | collocations=False) 70 | 71 | wordcloud.generate(doc_collection) 72 | 73 | plt.figure() 74 | plt.imshow(wordcloud, interpolation="bilinear") 75 | plt.axis("off") 76 | plt.show() 77 | -------------------------------------------------------------------------------- /glove-win_devc_x64/cooccur.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/cooccur.exe -------------------------------------------------------------------------------- /glove-win_devc_x64/cooccurrence.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/cooccurrence.bin -------------------------------------------------------------------------------- /glove-win_devc_x64/cooccurrence.shuf.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/cooccurrence.shuf.bin -------------------------------------------------------------------------------- /glove-win_devc_x64/demo.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | :: This batch file reveals OS, hardware, and networking configuration. 3 | TITLE My System Info 4 | ECHO Please wait... Checking system information. 5 | :: Section 1: OS information. 6 | ECHO ============================ 7 | ECHO OS INFO 8 | ECHO ============================ 9 | systeminfo | findstr /c:"OS Name" 10 | systeminfo | findstr /c:"OS Version" 11 | systeminfo | findstr /c:"System Type" 12 | :: Section 2: Hardware information. 
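:: The GloVe commands further down run on %CORPUS% in a fixed order:
::   vocab_count -> %VOCAB_FILE%              (word frequencies, min count %VOCAB_MIN_COUNT%)
::   cooccur     -> %COOCCURRENCE_FILE%       (word-word co-occurrence counts)
::   shuffle     -> %COOCCURRENCE_SHUF_FILE%  (shuffled records for stochastic training)
::   glove       -> %SAVE_FILE%               (%VECTOR_SIZE%-dimensional word vectors)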
13 | ECHO ============================ 14 | ECHO HARDWARE INFO 15 | ECHO ============================ 16 | 17 | SET CORPUS=donald.txt 18 | SET VOCAB_FILE=vocab.txt 19 | SET COOCCURRENCE_FILE=cooccurrence.bin 20 | SET COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 21 | SET SAVE_FILE=vectors 22 | SET VERBOSE=2 23 | SET MEMORY=4.0 24 | SET VOCAB_MIN_COUNT=2 25 | SET VECTOR_SIZE=50 26 | SET MAX_ITER=15 27 | SET WINDOW_SIZE=15 28 | SET BINARY=2 29 | SET NUM_THREADS=8 30 | SET X_MAX=10 31 | 32 | SET PYTHON=C:\Users\minsong\AppData\Local\Programs\Python\Python37\python.exe 33 | 34 | ECHO vocab_count -min-count %VOCAB_MIN_COUNT% -verbose %VERBOSE% < %CORPUS% > %VOCAB_FILE% 35 | vocab_count -min-count %VOCAB_MIN_COUNT% -verbose %VERBOSE% < %CORPUS% > %VOCAB_FILE% 36 | 37 | ECHO cooccur -memory %MEMORY% -vocab-file %VOCAB_FILE% -verbose %VERBOSE% -window-size %WINDOW_SIZE% < %CORPUS% > %COOCCURRENCE_FILE% 38 | cooccur -memory %MEMORY% -vocab-file %VOCAB_FILE% -verbose %VERBOSE% -window-size %WINDOW_SIZE% < %CORPUS% > %COOCCURRENCE_FILE% 39 | 40 | ECHO shuffle -memory %MEMORY% -verbose %VERBOSE% < %COOCCURRENCE_FILE% > %COOCCURRENCE_SHUF_FILE% 41 | shuffle -memory %MEMORY% -verbose %VERBOSE% < %COOCCURRENCE_FILE% > %COOCCURRENCE_SHUF_FILE% 42 | 43 | ECHO glove -save-file %SAVE_FILE% -threads %NUM_THREADS% -input-file %COOCCURRENCE_SHUF_FILE% -x-max %X_MAX% -iter %MAX_ITER% -vector-size %VECTOR_SIZE% -binary %BINARY%0 -vocab-file %VOCAB_FILE% -verbose %VERBOSE% 44 | glove -save-file %SAVE_FILE% -threads %NUM_THREADS% -input-file %COOCCURRENCE_SHUF_FILE% -x-max %X_MAX% -iter %MAX_ITER% -vector-size %VECTOR_SIZE% -binary %BINARY%0 -vocab-file %VOCAB_FILE% -verbose %VERBOSE% 45 | -------------------------------------------------------------------------------- /glove-win_devc_x64/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 5 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 6 | 7 | make 8 | if [ ! 
-e text8 ]; then 9 | if hash wget 2>/dev/null; then 10 | wget http://mattmahoney.net/dc/text8.zip 11 | else 12 | curl -O http://mattmahoney.net/dc/text8.zip 13 | fi 14 | unzip text8.zip 15 | rm text8.zip 16 | fi 17 | 18 | CORPUS=text8 19 | VOCAB_FILE=vocab.txt 20 | COOCCURRENCE_FILE=cooccurrence.bin 21 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 22 | BUILDDIR=build 23 | SAVE_FILE=vectors 24 | VERBOSE=2 25 | MEMORY=4.0 26 | VOCAB_MIN_COUNT=5 27 | VECTOR_SIZE=50 28 | MAX_ITER=15 29 | WINDOW_SIZE=15 30 | BINARY=2 31 | NUM_THREADS=8 32 | X_MAX=10 33 | if hash python 2>/dev/null; then 34 | PYTHON=python 35 | else 36 | PYTHON=python3 37 | fi 38 | 39 | echo 40 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE" 41 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 42 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE" 43 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 44 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE" 45 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 46 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE" 47 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 48 | if [ "$CORPUS" = 'text8' ]; then 49 | if [ "$1" = 'matlab' ]; then 50 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 51 | elif [ "$1" = 'octave' ]; then 52 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2 53 | else 54 | echo "$ $PYTHON eval/python/evaluate.py" 55 | $PYTHON eval/python/evaluate.py 56 | fi 57 | fi -------------------------------------------------------------------------------- /glove-win_devc_x64/eval/python/distance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | 5 | def generate(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 9 | args = parser.parse_args() 10 | 11 | with open(args.vocab_file, 'r') as f: 12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 13 | with open(args.vectors_file, 'r') as f: 14 | vectors = {} 15 | for line in f: 16 | vals = line.rstrip().split(' ') 17 | vectors[vals[0]] = [float(x) for x in vals[1:]] 18 | 19 | vocab_size = len(words) 20 | vocab = {w: idx for idx, w in enumerate(words)} 21 | ivocab = {idx: w for idx, w in enumerate(words)} 22 | 23 | vector_dim = len(vectors[ivocab[0]]) 24 | W = np.zeros((vocab_size, vector_dim)) 25 | for word, v in vectors.items(): 26 | if word == '': 27 | continue 28 | W[vocab[word], :] = v 29 | 30 | # normalize each word vector to unit variance 31 | W_norm = np.zeros(W.shape) 32 | d = (np.sum(W ** 2, 1) ** (0.5)) 33 | W_norm = (W.T / d).T 34 | return (W_norm, vocab, ivocab) 35 | 36 | 37 | def distance(W, vocab, ivocab, input_term): 38 | for idx, 
term in enumerate(input_term.split(' ')): 39 | if term in vocab: 40 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term])) 41 | if idx == 0: 42 | vec_result = np.copy(W[vocab[term], :]) 43 | else: 44 | vec_result += W[vocab[term], :] 45 | else: 46 | print('Word: %s Out of dictionary!\n' % term) 47 | return 48 | 49 | vec_norm = np.zeros(vec_result.shape) 50 | d = (np.sum(vec_result ** 2,) ** (0.5)) 51 | vec_norm = (vec_result.T / d).T 52 | 53 | dist = np.dot(W, vec_norm.T) 54 | 55 | for term in input_term.split(' '): 56 | index = vocab[term] 57 | dist[index] = -np.Inf 58 | 59 | a = np.argsort(-dist)[:N] 60 | 61 | print("\n Word Cosine distance\n") 62 | print("---------------------------------------------------------\n") 63 | for x in a: 64 | print("%35s\t\t%f\n" % (ivocab[x], dist[x])) 65 | 66 | 67 | if __name__ == "__main__": 68 | N = 100 # number of closest words that will be shown 69 | W, vocab, ivocab = generate() 70 | while True: 71 | input_term = input("\nEnter word or sentence (EXIT to break): ") 72 | if input_term == 'EXIT': 73 | break 74 | else: 75 | distance(W, vocab, ivocab, input_term) 76 | -------------------------------------------------------------------------------- /glove-win_devc_x64/eval/python/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 7 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 8 | args = parser.parse_args() 9 | 10 | with open(args.vocab_file, 'r') as f: 11 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 12 | with open(args.vectors_file, 'r') as f: 13 | vectors = {} 14 | for line in f: 15 | vals = line.rstrip().split(' ') 16 | vectors[vals[0]] = [float(x) for x in vals[1:]] 17 | 18 | vocab_size = len(words) 19 | vocab = {w: idx for idx, w in enumerate(words)} 20 | ivocab = {idx: w for idx, w in enumerate(words)} 21 | 22 | vector_dim = len(vectors[ivocab[0]]) 23 | W = np.zeros((vocab_size, vector_dim)) 24 | for word, v in vectors.items(): 25 | if word == '': 26 | continue 27 | W[vocab[word], :] = v 28 | 29 | # normalize each word vector to unit length 30 | W_norm = np.zeros(W.shape) 31 | d = (np.sum(W ** 2, 1) ** (0.5)) 32 | W_norm = (W.T / d).T 33 | evaluate_vectors(W_norm, vocab) 34 | 35 | def evaluate_vectors(W, vocab): 36 | """Evaluate the trained word vectors on a variety of tasks""" 37 | 38 | filenames = [ 39 | 'capital-common-countries.txt', 'capital-world.txt', 'currency.txt', 40 | 'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt', 41 | 'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt', 42 | 'gram5-present-participle.txt', 'gram6-nationality-adjective.txt', 43 | 'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt', 44 | ] 45 | prefix = './eval/question-data/' 46 | 47 | # to avoid memory overflow, could be increased/decreased 48 | # depending on system and vocab size 49 | split_size = 100 50 | 51 | correct_sem = 0; # count correct semantic questions 52 | correct_syn = 0; # count correct syntactic questions 53 | correct_tot = 0 # count correct questions 54 | count_sem = 0; # count all semantic questions 55 | count_syn = 0; # count all syntactic questions 56 | count_tot = 0 # count all questions 57 | full_count = 0 # count all questions, including those with unknown words 58 | 59 | for i in range(len(filenames)): 60 | with open('%s/%s' % 
(prefix, filenames[i]), 'r') as f: 61 | full_data = [line.rstrip().split(' ') for line in f] 62 | full_count += len(full_data) 63 | data = [x for x in full_data if all(word in vocab for word in x)] 64 | 65 | if len(data) == 0: 66 | print("ERROR: no lines of vocab kept for %s !" % filenames[i]) 67 | print("Example missing line:", full_data[0]) 68 | continue 69 | 70 | indices = np.array([[vocab[word] for word in row] for row in data]) 71 | ind1, ind2, ind3, ind4 = indices.T 72 | 73 | predictions = np.zeros((len(indices),)) 74 | num_iter = int(np.ceil(len(indices) / float(split_size))) 75 | for j in range(num_iter): 76 | subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1))) 77 | 78 | pred_vec = (W[ind2[subset], :] - W[ind1[subset], :] 79 | + W[ind3[subset], :]) 80 | #cosine similarity if input W has been normalized 81 | dist = np.dot(W, pred_vec.T) 82 | 83 | for k in range(len(subset)): 84 | dist[ind1[subset[k]], k] = -np.Inf 85 | dist[ind2[subset[k]], k] = -np.Inf 86 | dist[ind3[subset[k]], k] = -np.Inf 87 | 88 | # predicted word index 89 | predictions[subset] = np.argmax(dist, 0).flatten() 90 | 91 | val = (ind4 == predictions) # correct predictions 92 | count_tot = count_tot + len(ind1) 93 | correct_tot = correct_tot + sum(val) 94 | if i < 5: 95 | count_sem = count_sem + len(ind1) 96 | correct_sem = correct_sem + sum(val) 97 | else: 98 | count_syn = count_syn + len(ind1) 99 | correct_syn = correct_syn + sum(val) 100 | 101 | print("%s:" % filenames[i]) 102 | print('ACCURACY TOP1: %.2f%% (%d/%d)' % 103 | (np.mean(val) * 100, np.sum(val), len(val))) 104 | 105 | print('Questions seen/total: %.2f%% (%d/%d)' % 106 | (100 * count_tot / float(full_count), count_tot, full_count)) 107 | print('Semantic accuracy: %.2f%% (%i/%i)' % 108 | (100 * correct_sem / float(count_sem), correct_sem, count_sem)) 109 | print('Syntactic accuracy: %.2f%% (%i/%i)' % 110 | (100 * correct_syn / float(count_syn), correct_syn, count_syn)) 111 | print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot)) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /glove-win_devc_x64/eval/python/word_analogy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | 5 | def generate(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 9 | args = parser.parse_args() 10 | 11 | with open(args.vocab_file, 'r') as f: 12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 13 | with open(args.vectors_file, 'r') as f: 14 | vectors = {} 15 | for line in f: 16 | vals = line.rstrip().split(' ') 17 | vectors[vals[0]] = [float(x) for x in vals[1:]] 18 | 19 | vocab_size = len(words) 20 | vocab = {w: idx for idx, w in enumerate(words)} 21 | ivocab = {idx: w for idx, w in enumerate(words)} 22 | 23 | vector_dim = len(vectors[ivocab[0]]) 24 | W = np.zeros((vocab_size, vector_dim)) 25 | for word, v in vectors.items(): 26 | if word == '': 27 | continue 28 | W[vocab[word], :] = v 29 | 30 | # normalize each word vector to unit variance 31 | W_norm = np.zeros(W.shape) 32 | d = (np.sum(W ** 2, 1) ** (0.5)) 33 | W_norm = (W.T / d).T 34 | return (W_norm, vocab, ivocab) 35 | 36 | 37 | def distance(W, vocab, ivocab, input_term): 38 | vecs = {} 39 | if len(input_term.split(' ')) < 3: 
40 | print("Only %i words were entered.. three words are needed at the input to perform the calculation\n" % len(input_term.split(' '))) 41 | return 42 | else: 43 | for idx, term in enumerate(input_term.split(' ')): 44 | if term in vocab: 45 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term])) 46 | vecs[idx] = W[vocab[term], :] 47 | else: 48 | print('Word: %s Out of dictionary!\n' % term) 49 | return 50 | 51 | vec_result = vecs[1] - vecs[0] + vecs[2] 52 | 53 | vec_norm = np.zeros(vec_result.shape) 54 | d = (np.sum(vec_result ** 2,) ** (0.5)) 55 | vec_norm = (vec_result.T / d).T 56 | 57 | dist = np.dot(W, vec_norm.T) 58 | 59 | for term in input_term.split(' '): 60 | index = vocab[term] 61 | dist[index] = -np.Inf 62 | 63 | a = np.argsort(-dist)[:N] 64 | 65 | print("\n Word Cosine distance\n") 66 | print("---------------------------------------------------------\n") 67 | for x in a: 68 | print("%35s\t\t%f\n" % (ivocab[x], dist[x])) 69 | 70 | 71 | if __name__ == "__main__": 72 | N = 100; # number of closest words that will be shown 73 | W, vocab, ivocab = generate() 74 | while True: 75 | input_term = raw_input("\nEnter three words (EXIT to break): ") 76 | if input_term == 'EXIT': 77 | break 78 | else: 79 | distance(W, vocab, ivocab, input_term) 80 | 81 | -------------------------------------------------------------------------------- /glove-win_devc_x64/glove.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/glove.exe -------------------------------------------------------------------------------- /glove-win_devc_x64/pthreadVC2.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/pthreadVC2.dll -------------------------------------------------------------------------------- /glove-win_devc_x64/shuffle.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/shuffle.exe -------------------------------------------------------------------------------- /glove-win_devc_x64/vocab_count.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/vocab_count.exe -------------------------------------------------------------------------------- /pyTextMiner/chunker/__init__.py: -------------------------------------------------------------------------------- 1 | class BaseChunker: 2 | IN_TYPE = [str] 3 | OUT_TYPE = [list, str] 4 | 5 | class KoreanChunker(BaseChunker): 6 | def __init__(self): 7 | 8 | import nltk 9 | grammar = """ 10 | NP: {*?} # Noun phrase 11 | VP: {*} # Verb phrase 12 | AP: {*} # Adjective phrase 13 | """ 14 | 15 | self.inst=nltk.RegexpParser(grammar) 16 | 17 | 18 | def __call__(self, *args, **kwargs): 19 | import konlpy 20 | words = konlpy.tag.Komoran().pos(*args) 21 | 22 | chunks = self.inst.parse(words) 23 | 24 | return chunks -------------------------------------------------------------------------------- /pyTextMiner/chunker/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/chunker/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/__init__.py: -------------------------------------------------------------------------------- 1 | import string 2 | from collections import Counter 3 | import os 4 | from nltk import bigrams 5 | from collections import defaultdict 6 | import operator 7 | import numpy as np 8 | 9 | class BaseCooccurrence: 10 | INPUT=[list,str] 11 | OUTPUT=[list,tuple] 12 | 13 | class CooccurrenceWorker(BaseCooccurrence): 14 | def __init__(self): 15 | name = 'cooccurrence' 16 | 17 | from sklearn.feature_extraction.text import CountVectorizer 18 | import pyTextMiner.cooccurrence.cooccurrence as co 19 | self.inst = co.Cooccurrence(ngram_range=(2, 2), stop_words='english') 20 | 21 | def __call__(self, *args, **kwargs): 22 | 23 | # bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary={'awesome unicorns': 0, 'batman forever': 1}) 24 | co_occurrences = self.inst.fit_transform(args[0]) 25 | # print('Printing sparse matrix:', co_occurrences) 26 | # print(co_occurrences.todense()) 27 | sum_occ = np.sum(co_occurrences.todense(), axis=0) 28 | # print('Sum of word-word occurrences:', sum_occ) 29 | 30 | # Converting itertor to set 31 | result = zip(self.inst.get_feature_names(), np.array(sum_occ)[0].tolist()) 32 | result_set = list(result) 33 | return result_set, self.inst.vocab() 34 | 35 | class CooccurrenceManager: 36 | def __init__(self): 37 | self.d = {} # 단어->단어ID로 변환할때 사용 38 | self.w = [] # 단어ID->단어로 변환할 때 사용 39 | 40 | def getIdOrAdd(self, word): 41 | # 단어가 이미 사전에 등록된 것이면 해당하는 ID를 돌려주고 42 | if word in self.d: return self.d[word] 43 | # 그렇지 않으면 새로 사전에 등록하고 ID를 부여함 44 | self.d[word] = len(self.d) 45 | self.w.append(word) 46 | return len(self.d) - 1 47 | 48 | def getWord(self, id): 49 | return self.w[id] 50 | 51 | def calculateCooccurrence(self, list): 52 | count = {} # 동시출현 빈도가 저장될 dict 53 | words = list(set(list)) # 단어별로 분리한 것을 set에 넣어 중복 제거하고, 다시 list로 변경 54 | wids = [self.getIdOrAdd(w) for w in words] 55 | for i, a in enumerate(wids): 56 | for b in wids[i + 1:]: 57 | if a == b: continue # 같은 단어의 경우는 세지 않음 58 | if a > b: a, b = b, a # A, B와 B, A가 다르게 세어지는것을 막기 위해 항상 a < b로 순서 고정 59 | count[a, b] = count.get((a, b), 0) + 1 # 실제로 센다 60 | 61 | sorted = [] 62 | for tup in count: 63 | freq = count[tup] 64 | left_word = self.getWord(count[0]) 65 | right_word = self.getWord(count[1]) 66 | sorted.append(((left_word, right_word), freq)) 67 | return sorted, words 68 | 69 | def computeCooccurence(self, list, target=''): 70 | com = defaultdict(lambda: defaultdict(int)) 71 | count_all = Counter() 72 | count_all1 = Counter() 73 | 74 | uniqueList = [] 75 | for _array in list: 76 | for line in _array: 77 | for word in line: 78 | if len(target) < 1: 79 | if word not in uniqueList: 80 | uniqueList.append(word) 81 | 82 | terms_bigram = bigrams(line) 83 | # Update the counter 84 | count_all.update(line) 85 | count_all1.update(terms_bigram) 86 | 87 | # Build co-occurrence matrix 88 | for i in range(len(line) - 1): 89 | for j in range(i + 1, len(line)): 90 | w1, w2 = sorted([line[i], line[j]]) 91 | if w1 != w2: 92 | com[w1][w2] += 1 93 | 94 | 95 | 96 | com_max = [] 97 | # For each term, look for the most common co-occurrent terms 98 | for t1 in com: 99 | t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5] 100 | for t2, t2_count in 
t1_max_terms: 101 | if len(target)>0 and (target is t1 or target is t2): 102 | if t1 not in uniqueList: 103 | uniqueList.append(t1) 104 | if t2 not in uniqueList: 105 | uniqueList.append(t2) 106 | com_max.append(((t1, t2), t2_count)) 107 | # Get the most frequent co-occurrences 108 | terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True) 109 | 110 | return terms_max, uniqueList 111 | 112 | -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/cooccurrence/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/__pycache__/cooccurrence.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/cooccurrence/__pycache__/cooccurrence.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/cooccurrence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | 5 | class Cooccurrence(CountVectorizer): 6 | """Co-ocurrence matrix 7 | Convert collection of raw documents to word-word co-ocurrence matrix 8 | 9 | Parameters 10 | ---------- 11 | encoding : string, 'utf-8' by default. 12 | If bytes or files are given to analyze, this encoding is used to 13 | decode. 14 | 15 | ngram_range : tuple (min_n, max_n) 16 | The lower and upper boundary of the range of n-values for different 17 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 18 | will be used. 
19 | 20 | max_df: float in range [0, 1] or int, default=1.0 21 | 22 | min_df: float in range [0, 1] or int, default=1 23 | 24 | Example 25 | ------- 26 | 27 | >> import Cooccurrence 28 | >> docs = ['this book is good', 29 | 'this cat is good', 30 | 'cat is good shit'] 31 | >> model = Cooccurrence() 32 | >> Xc = model.fit_transform(docs) 33 | 34 | Check vocabulary by printing 35 | >> model.vocabulary_ 36 | 37 | """ 38 | 39 | def __init__(self, encoding='utf-8', ngram_range=(1, 1), 40 | max_df=1.0, min_df=1, max_features=None, 41 | stop_words=None, normalize=True, vocabulary=None): 42 | 43 | super(Cooccurrence, self).__init__( 44 | ngram_range=ngram_range, 45 | max_df=max_df, 46 | min_df=min_df, 47 | max_features=max_features, 48 | stop_words=stop_words, 49 | vocabulary=vocabulary 50 | ) 51 | 52 | self.X = None 53 | 54 | self.normalize = normalize 55 | 56 | def fit_transform(self, raw_documents, y=None): 57 | """Fit cooccurrence matrix 58 | 59 | Parameters 60 | ---------- 61 | raw_documents : iterable 62 | an iterable which yields either str, unicode or file objects 63 | 64 | Returns 65 | ------- 66 | Xc : Cooccurrence matrix 67 | 68 | """ 69 | X = super(Cooccurrence, self).fit_transform(raw_documents) 70 | self.X = X 71 | 72 | n_samples, n_features = X.shape 73 | 74 | Xc = (X.T * X) 75 | if self.normalize: 76 | g = sp.diags(1./Xc.diagonal()) 77 | Xc = g * Xc 78 | else: 79 | Xc.setdiag(0) 80 | 81 | return Xc 82 | 83 | def vocab(self): 84 | tuples = super(Cooccurrence, self).get_feature_names() 85 | vocabulary=[] 86 | for e_tuple in tuples: 87 | tokens = e_tuple.split() 88 | for t in tokens: 89 | if t not in vocabulary: 90 | vocabulary.append(t) 91 | 92 | return vocabulary 93 | 94 | def word_histgram(self): 95 | word_list = super(Cooccurrence, self).get_feature_names() 96 | count_list = self.X.toarray().sum(axis=0) 97 | return dict(zip(word_list,count_list)) -------------------------------------------------------------------------------- /pyTextMiner/counter/__init__.py: -------------------------------------------------------------------------------- 1 | class WordCounter: 2 | IN_TYPE = [list, str] 3 | OUT_TYPE = [list, tuple] 4 | 5 | def __call__(self, *args, **kwargs): 6 | from collections import Counter 7 | return list(Counter(args[0]).most_common()) 8 | 9 | -------------------------------------------------------------------------------- /pyTextMiner/counter/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/counter/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/graphml/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/graphml/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/helper/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | class POSFilter: 3 | IN_TYPE = [list, tuple] 4 | OUT_TYPE = [list, tuple] 5 | 6 | def __init__(self, *posWanted): 7 | import re 8 | self.wanted = set(p for p in posWanted if not p.endswith('*')) 9 | self.re = re.compile('(' + '|'.join(p[:-1] for p in posWanted if p.endswith('*')) + ').*') 10 | 11 | def 
test(self, pos): 12 | if pos in self.wanted: return True 13 | if self.re.match(pos): return True 14 | return False 15 | 16 | def __call__(self, *args, **kwargs): 17 | return [i for i in args[0] if self.test(i[1])] 18 | 19 | class StopwordFilter: 20 | IN_TYPE = [list, str] 21 | OUT_TYPE = [list, str] 22 | 23 | def __init__(self, stopwords = [], file = None): 24 | if file: 25 | stopwords = stopwords + [line.strip() for line in open(file, encoding='utf-8')] 26 | self.stopwords = set(stopwords) 27 | self.stopwordsPrefix = ('http', 'https', 'ftp', 'git', 'thatt') 28 | 29 | def __call__(self, *args, **kwargs): 30 | #any(e for e in test_list if e.startswith('three') or e.endswith('four')) 31 | return [i for i in args[0] if i.lower() not in self.stopwords and (i.lower().startswith(tuple(p for p in self.stopwordsPrefix)) == False)] 32 | 33 | class SelectWordOnly: 34 | IN_TYPE = [tuple] 35 | OUT_TYPE = [str] 36 | 37 | def __call__(self, *args, **kwargs): 38 | return args[0][0] 39 | 40 | class ToLowerCase: 41 | IN_TYPE = [str] 42 | OUT_TYPE = [str] 43 | 44 | def __call__(self, *args, **kwargs): 45 | return args[0].lower() -------------------------------------------------------------------------------- /pyTextMiner/helper/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/helper/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/keyword/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from krwordrank.word import KRWordRank 3 | 4 | class BaseKeywordExtraction: 5 | IN_TYPE = [str] 6 | OUT_TYPE = [str] 7 | 8 | class TextRankExtractor(BaseKeywordExtraction): 9 | def __init__(self, pos_tagger_name=None, mecab_path=None, 10 | lang='ko', max=10, 11 | stopwords=[], combined_keywords=False): 12 | import pyTextMiner.keyword.textrank as tr 13 | self.inst = tr.TextRank(pos_tagger_name=pos_tagger_name,mecab_path=mecab_path,lang=lang,stopwords=stopwords) 14 | self.max=max 15 | self.combined_keywords = combined_keywords 16 | def __call__(self, *args, **kwargs): 17 | import nltk.tokenize 18 | sents = nltk.tokenize.sent_tokenize(*args) 19 | for sent in sents: 20 | self.inst.build_keywords(sent) 21 | return self.inst.get_keywords(self.max,self.combined_keywords) 22 | 23 | class TextRankSummarizer(BaseKeywordExtraction): 24 | def __init__(self,pos_tagger_name=None,mecab_path=None,max=3): 25 | import pyTextMiner.keyword.textrank as tr 26 | self.inst=tr.TextRank(pos_tagger_name=pos_tagger_name,mecab_path=mecab_path) 27 | self.max=max 28 | 29 | def __call__(self, *args, **kwargs): 30 | return self.inst.summarize(args[0],self.max) 31 | 32 | class KeywordExtractionKorean(BaseKeywordExtraction): 33 | def __init__(self, min_count=2, max_length=10, 34 | beta=0.85, max_iter=10, verbose=True, num_words=20): 35 | self.min_count=min_count 36 | self.max_length=max_length 37 | self.beta=beta 38 | self.max_iter=max_iter 39 | self.verbose=verbose 40 | self.num_words=num_words 41 | 42 | self.inst=KRWordRank(min_count, max_length,self.verbose) 43 | 44 | def __call__(self, *args, **kwargs): 45 | _num_keywords=10 46 | #print(str(args[0]) + "\n") 47 | keywords, rank, graph = self.inst.extract(args[0], self.beta, self.max_iter, self.num_words) 48 | 49 | return keywords 
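# Usage sketch for KeywordExtractionKorean (KRWordRank-based), assuming the caller
# passes a list of normalised Korean documents; __call__ returns the keyword->score
# dict produced by KRWordRank.extract():
#
#   extractor = KeywordExtractionKorean(min_count=2, max_length=10, num_words=20)
#   keywords = extractor(['첫 번째 문서의 본문 ...', '두 번째 문서의 본문 ...'])
#   for word, score in sorted(keywords.items(), key=lambda x: -x[1]):
#       print(word, score)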
-------------------------------------------------------------------------------- /pyTextMiner/keyword/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/keyword/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/keyword/__pycache__/textrank.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/keyword/__pycache__/textrank.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/lemmatizer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/lemmatizer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/ngram/__init__.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | class BaseNgram: 4 | IN_TYPE = [list, str] 5 | OUT_TYPE = [list, str] 6 | 7 | class NGramTokenizer(BaseNgram): 8 | def __init__(self, min=1, ngramCount=3, concat='_'): 9 | self.ngramCount = ngramCount 10 | self.min = min 11 | self.converted = [] 12 | self.concat = concat 13 | 14 | def __call__(self, *args, **kwargs): 15 | converted = [] 16 | from nltk.util import ngrams 17 | for i in range(self.min, self.ngramCount+1): 18 | output = list(ngrams((args[0]), i)) 19 | for x in output: 20 | if (len(x) > 0): 21 | converted.append(self.concat.join(x)) 22 | 23 | #print("NGRAM " + str(converted)) 24 | self.converted = converted 25 | 26 | return converted -------------------------------------------------------------------------------- /pyTextMiner/ngram/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/ngram/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/noun_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from soynlp.noun import LRNounExtractor_v2 2 | 3 | class BaseNounExtraction: 4 | INPUT=[str] 5 | OUTPUT=[list,str] 6 | 7 | class NounExtractionKorean(BaseNounExtraction): 8 | def __init__(self,sents): 9 | self.inst = LRNounExtractor_v2(verbose=False, extract_compound=True) 10 | self.inst.train(sents) 11 | self.inst.extract() 12 | 13 | def __call__(self, *args, **kwargs): 14 | return self.inst.decompose_compound(args[0]) -------------------------------------------------------------------------------- /pyTextMiner/noun_extractor/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/noun_extractor/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/pmi/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | from soynlp import DoublespaceLineCorpus 3 | from soynlp.word import WordExtractor 4 | from soynlp.tokenizer import LTokenizer 5 | from soynlp.vectorizer import sent_to_word_contexts_matrix 6 | from soynlp.word import pmi as pmi_func 7 | 8 | class BasePMICalculator: 9 | INPUT=[str] 10 | OUTPUT=[list,tuple] 11 | 12 | class PMICalculator(BasePMICalculator): 13 | def __init__(self, corpus=None): 14 | word_extractor = WordExtractor() 15 | word_extractor.train(corpus) 16 | cohesions = word_extractor.all_cohesion_scores() 17 | l_cohesions = {word: score[0] for word, score in cohesions.items()} 18 | tokenizer = LTokenizer(l_cohesions) 19 | x, self.idx2vocab = sent_to_word_contexts_matrix( 20 | corpus, 21 | windows=3, 22 | min_tf=10, 23 | tokenizer=tokenizer, # (default) lambda x:x.split(), 24 | dynamic_weight=False, 25 | verbose=True) 26 | 27 | self.vocab2idx = {vocab: idx for idx, vocab in enumerate(self.idx2vocab)} 28 | 29 | self.pmi, px, py = pmi_func( 30 | x, 31 | min_pmi=0, 32 | alpha=0.0, 33 | beta=0.75 34 | ) 35 | def __call__(self, *args, **kwargs): 36 | query = self.vocab2idx[args[0]] 37 | submatrix = self.pmi[query, :].tocsr() # get the row of query 38 | contexts = submatrix.nonzero()[1] # nonzero() return (rows, columns) 39 | pmi_i = submatrix.data 40 | 41 | most_relateds = [(idx, pmi_ij) for idx, pmi_ij in zip(contexts, pmi_i)] 42 | most_relateds = sorted(most_relateds, key=lambda x: -x[1])[:10] 43 | most_relateds = [(self.idx2vocab[idx], pmi_ij) for idx, pmi_ij in most_relateds] 44 | 45 | return most_relateds -------------------------------------------------------------------------------- /pyTextMiner/pmi/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/pmi/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/WordSemgmentationModelBuilder.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from pycrfsuite_spacing import TemplateGenerator 3 | from pycrfsuite_spacing import CharacterFeatureTransformer 4 | from pycrfsuite_spacing import sent_to_chartags 5 | from pycrfsuite_spacing import sent_to_xy 6 | from pycrfsuite_spacing import PyCRFSuiteSpacing 7 | 8 | with open('../../data/134963_norm.txt', encoding='utf-8') as f: 9 | docs = [doc.strip() for doc in f] 10 | 11 | print('n docs = %d' % len(docs)) 12 | pprint(docs[:5]) 13 | 14 | to_feature = CharacterFeatureTransformer( 15 | TemplateGenerator(begin=-2, 16 | end=2, 17 | min_range_length=3, 18 | max_range_length=3) 19 | ) 20 | 21 | x, y = sent_to_xy('이것도 너프해 보시지', to_feature) 22 | pprint(x) 23 | print(y) 24 | 25 | correct = PyCRFSuiteSpacing( 26 | to_feature = to_feature, 27 | feature_minfreq=3, # default = 0 28 | max_iterations=100, 29 | l1_cost=1.0, 30 | l2_cost=1.0 31 | ) 32 | correct.train(docs, '../../model/korean_segmentation_model.crfsuite') 33 | 34 | -------------------------------------------------------------------------------- /pyTextMiner/segmentation/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/__pycache__/lstmWordSegmentationModel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/__pycache__/lstmWordSegmentationModel.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/__pycache__/wordSegmentationModelUtil.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/__pycache__/wordSegmentationModelUtil.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/lstmWordSegmentationModel.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import tensorflow as tf 5 | 6 | def weight_variable(shape): 7 | initial = tf.truncated_normal(shape, stddev=0.1) 8 | return tf.Variable(initial) 9 | 10 | def bias_variable(shape): 11 | initial = tf.constant(0.1, shape=shape) 12 | return tf.Variable(initial) 13 | 14 | def RNN(_X, _istate, _weights, _biases, n_hidden, n_steps, n_input, early_stop): 15 | # input _X shape: Tensor("Placeholder:0", shape=(?, n_steps, n_input), dtype=float32) 16 | # switch n_steps and batch_size, Tensor("transpose:0", shape=(n_steps, ?, n_input), dtype=float32) 17 | _X = tf.transpose(_X, [1, 0, 2]) 18 | # Reshape to prepare input to hidden activation 19 | # (n_steps*batch_size, n_input) => (?, n_input), Tensor("Reshape:0", shape=(?, n_input), dtype=float32) 20 | _X = tf.reshape(_X, [-1, n_input]) 21 | # Linear activation 22 | _X = tf.matmul(_X, _weights['hidden']) + _biases['hidden'] # (?, n_hidden)+scalar(n_hidden,)=(?,n_hidden) 23 | 24 | # Define a lstm cell with tensorflow 25 | lstm_cell = tf.contrib.rnn.LSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=False) 26 | # Split data because rnn cell needs a list of inputs for the RNN inner loop 27 | # n_steps splits each of which contains (?, n_hidden) 28 | # ex) [, ... 
, ] 29 | _X = tf.split(_X, n_steps, 0) 30 | # Get lstm cell output 31 | outputs, states = tf.contrib.rnn.static_rnn(cell=lstm_cell, inputs=_X, initial_state=_istate, sequence_length=early_stop) 32 | final_outputs = [] 33 | for output in outputs : 34 | # Linear activation 35 | final_output = tf.matmul(output, _weights['out']) + _biases['out'] # (?, n_classes) 36 | final_outputs.append(final_output) 37 | # [] 38 | return final_outputs -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "segm.ckpt" 2 | all_model_checkpoint_paths: "segm.ckpt" 3 | -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/dic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/dic.pickle -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/segm.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/segm.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/segm.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/segm.ckpt.index -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/segm.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/segm.ckpt.meta -------------------------------------------------------------------------------- /pyTextMiner/splitter/__init__.py: -------------------------------------------------------------------------------- 1 | ''' str => list(str) ''' 2 | 3 | class BaseSplitter: 4 | IN_TYPE = [str] 5 | OUT_TYPE = [list, str] 6 | 7 | class SpecialCharRemover(BaseSplitter): 8 | IN_TYPE = [str] 9 | OUT_TYPE = [str] 10 | 11 | def __init__(self): 12 | import re 13 | self.hangul = re.compile('[^ ㄱ-ㅣ가-힣\\.\\?\\,]+') 14 | 15 | def __call__(self, *args, **kwargs): 16 | return self.hangul.sub('', *args) 17 | 18 | class NLTK(BaseSplitter): 19 | def __init__(self): 20 | import nltk.tokenize 21 | self.func = nltk.tokenize.sent_tokenize 22 | 23 | def __call__(self, *args, **kwargs): 24 | return self.func(*args) 25 | 26 | class KoSentSplitter(BaseSplitter): 27 | def __init__(self): 28 | import kss 29 | self.func = kss.split_sentences 30 | 31 | def __call__(self, *args, **kwargs): 32 | return self.func(*args) 33 | -------------------------------------------------------------------------------- /pyTextMiner/splitter/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/splitter/__pycache__/__init__.cpython-37.pyc 
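A minimal usage sketch for the splitter classes defined in pyTextMiner/splitter/__init__.py above; the sample text is illustrative, and it assumes the kss package and nltk's punkt sentence model are installed:

from pyTextMiner.splitter import SpecialCharRemover, NLTK, KoSentSplitter

text = '안녕하세요. pyTextMiner 테스트 문장입니다. This English part will be stripped by the remover.'

cleaner = SpecialCharRemover()   # keeps only Hangul, spaces, and . ? ,
print(cleaner(text))

sent_splitter = NLTK()           # wraps nltk.tokenize.sent_tokenize (requires nltk.download('punkt'))
print(sent_splitter(text))       # -> list of sentences

ko_splitter = KoSentSplitter()   # wraps kss.split_sentences for Korean text
print(ko_splitter(text))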
-------------------------------------------------------------------------------- /pyTextMiner/stemmer/__init__.py: -------------------------------------------------------------------------------- 1 | ''' str => str ''' 2 | 3 | class BaseStemmer: 4 | IN_TYPE = [str] 5 | OUT_TYPE = [str] 6 | 7 | class Porter(BaseStemmer): 8 | def __init__(self): 9 | import nltk 10 | self.inst = nltk.stem.PorterStemmer() 11 | 12 | def __call__(self, *args, **kwargs): 13 | return self.inst.stem(args[0]) 14 | 15 | class Lancaster(BaseStemmer): 16 | def __init__(self): 17 | import nltk 18 | self.inst = nltk.stem.LancasterStemmer() 19 | 20 | def __call__(self, *args, **kwargs): 21 | return self.inst.stem(args[0]) -------------------------------------------------------------------------------- /pyTextMiner/stemmer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/stemmer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/tagger/__init__.py: -------------------------------------------------------------------------------- 1 | ''' list(str) => list(tuple) ''' 2 | 3 | class BaseTagger: 4 | IN_TYPE = [list, str] 5 | OUT_TYPE = [list, tuple] 6 | 7 | class NLTK(BaseTagger): 8 | def __init__(self): 9 | import nltk 10 | nltk.download('averaged_perceptron_tagger') 11 | 12 | from nltk.tag.perceptron import PerceptronTagger 13 | 14 | self.inst = PerceptronTagger() 15 | 16 | def __call__(self, *args, **kwargs): 17 | return self.inst.tag(args[0]) -------------------------------------------------------------------------------- /pyTextMiner/tagger/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/tagger/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | ''' str => list(str) ''' 2 | 3 | class BaseTokenizer: 4 | IN_TYPE = [str] 5 | OUT_TYPE = [list, str] 6 | 7 | # [English] 8 | class Tweet(BaseTokenizer): 9 | def __init__(self): 10 | import nltk.tokenize 11 | self.inst = nltk.tokenize.TweetTokenizer() 12 | 13 | def __call__(self, *args, **kwargs): 14 | return self.inst.tokenize(*args) 15 | 16 | class Whitespace(BaseTokenizer): 17 | def __init__(self): 18 | import nltk.tokenize 19 | self.inst = nltk.tokenize.WhitespaceTokenizer() 20 | 21 | def __call__(self, *args, **kwargs): 22 | return self.inst.tokenize(*args) 23 | 24 | class Word(BaseTokenizer): 25 | def __init__(self): 26 | import nltk.tokenize 27 | self.inst = nltk.tokenize.word_tokenize 28 | 29 | def __call__(self, *args, **kwargs): 30 | print(str(self.inst(*args))) 31 | return self.inst(*args) 32 | 33 | class WordPos(BaseTokenizer): 34 | def __init__(self): 35 | import nltk 36 | self.inst = nltk 37 | self.OUT_TYPE = [list, tuple] 38 | 39 | def __call__(self, *args, **kwargs): 40 | tokens = self.inst.word_tokenize(*args) 41 | 42 | return self.inst.pos_tag(tokens) 43 | 44 | # [Korean] 45 | class Komoran(BaseTokenizer): 46 | def __init__(self,userdic=None): 47 | from konlpy.tag import Komoran 48 | import os 49 | if userdic is not None: 50 | print("user dict " + 
str(os.path.abspath(userdic))) 51 | self.inst = Komoran(userdic=os.path.abspath(userdic)) 52 | else: 53 | self.inst = Komoran() 54 | self.OUT_TYPE = [list, tuple] 55 | 56 | def __call__(self, *args, **kwargs): 57 | return self.inst.pos(args[0]) 58 | 59 | class TwitterKorean(BaseTokenizer): 60 | def __init__(self): 61 | from konlpy.tag import Twitter 62 | self.inst = Twitter() 63 | 64 | self.OUT_TYPE = [list, tuple] 65 | 66 | def __call__(self, *args, **kwargs): 67 | return self.inst.pos(args[0]) 68 | 69 | class KokomaKorean(BaseTokenizer): 70 | def __init__(self): 71 | from konlpy.tag import Kkma 72 | self.inst = Kkma() 73 | 74 | self.OUT_TYPE = [list, tuple] 75 | 76 | def __call__(self, *args, **kwargs): 77 | return self.inst.pos(args[0]) 78 | 79 | class MeCab(BaseTokenizer): 80 | def __init__(self, path=None): 81 | #import MeCab 82 | #self.inst = MeCab.Tagger() 83 | 84 | from konlpy.tag import Mecab 85 | self.inst = Mecab(path) 86 | 87 | self.OUT_TYPE = [list, tuple] 88 | 89 | def __call__(self, *args, **kwargs): 90 | try: 91 | return self.inst.pos(args[0]) 92 | except: 93 | return [] 94 | 95 | class SpecialTokenizer: 96 | IN_TYPE = [str] 97 | OUT_TYPE = [str] 98 | 99 | class MaxScoreTokenizerKorean(SpecialTokenizer): 100 | def __init__(self, scores=None): 101 | from soynlp.tokenizer import MaxScoreTokenizer 102 | self.inst=MaxScoreTokenizer(scores=scores) 103 | self.OUT_TYPE = [list, str] 104 | 105 | def __call__(self, *args, **kwargs): 106 | tokens = self.inst.tokenize(args[0]) 107 | return tokens 108 | 109 | class LTokenizerKorean(SpecialTokenizer): 110 | def __init__(self, scores=None): 111 | from soynlp.tokenizer import LTokenizer 112 | self.inst=LTokenizer(scores=scores) 113 | 114 | self.OUT_TYPE = [list, str] 115 | 116 | def __call__(self, *args, **kwargs): 117 | tokens = self.inst.tokenize(args[0]) 118 | return tokens 119 | 120 | class RegexTokenizerKorean(SpecialTokenizer): 121 | def __init__(self): 122 | from soynlp.tokenizer import RegexTokenizer 123 | self.inst=RegexTokenizer() 124 | self.OUT_TYPE = [list, str] 125 | 126 | def __call__(self, *args, **kwargs): 127 | tokens=self.inst.tokenize(args[0]) 128 | return tokens -------------------------------------------------------------------------------- /pyTextMiner/tokenizer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/tokenizer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/utility/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from gensim.models import fasttext 3 | from soynlp.hangle import decompose, compose 4 | import re 5 | 6 | class Utility: 7 | def __init__(self): 8 | name = 'Utility Class' 9 | self.doublespace_pattern = re.compile('\s+') 10 | 11 | def jamo_sentence(self, sent): 12 | def transform(char): 13 | if char == ' ': 14 | return char 15 | cjj = decompose(char) 16 | if cjj != None: 17 | if len(cjj) == 1: 18 | return cjj 19 | cjj_ = ''.join(c if c != ' ' else '-' for c in cjj) 20 | 21 | if cjj == None: 22 | return '' 23 | 24 | return cjj_ 25 | 26 | sent_ = ''.join(transform(char) for char in sent) 27 | sent_ = self.doublespace_pattern.sub(' ', sent_) 28 | return sent_ 29 | 30 | def decode(self, s): 31 | def process(t): 32 | assert len(t) % 3 == 0 33 | t_ = t.replace('-', ' ') 34 | chars = [tuple(t_[3 * 
i:3 * (i + 1)]) for i in range(len(t_) // 3)] 35 | recovered = [] 36 | for char in chars: 37 | try: 38 | composed = compose(*char) 39 | recovered.append(composed) 40 | except: 41 | pass 42 | #recovered = [compose(*char) for char in chars] 43 | recovered = ''.join(recovered) 44 | return recovered 45 | 46 | return ' '.join(process(t) for t in s.split()) 47 | 48 | def decode_sentence(self, sent): 49 | return ' '.join(self.decode(token) for token in sent.split()) 50 | 51 | def cosine_similarity(self, word1, word2, model): 52 | cjj1 = self.jamo_sentence(word1) 53 | cjj2 = self.jamo_sentence(word2) 54 | cos_sim = model.cosine_similarity(cjj1, cjj2) 55 | return cos_sim 56 | 57 | def most_similar(self, word, model): 58 | jamo_result = [] 59 | cjj = self.jamo_sentence(word) 60 | result = model.most_similar(cjj) 61 | for token in result: 62 | word = token[0] 63 | encoded_word = self.decode(word) 64 | sim = token[1] 65 | jamo_result.append((encoded_word,sim)) 66 | 67 | return jamo_result 68 | 69 | def most_similars(self, model, positives, negatives, topn=10): 70 | jamo_result = [] 71 | result = model.most_similar(positive=positives,negative=negatives,topn=topn) 72 | for token in result: 73 | word = token[0] 74 | if len(word) > 3: 75 | encoded_word = self.decode(word) 76 | sim = token[1] 77 | jamo_result.append((encoded_word,sim)) 78 | 79 | return jamo_result 80 | 81 | def similar_by_word(self, model, word, topn=10): 82 | jamo_result = [] 83 | result = model.similar_by_word(word, topn=topn) 84 | for token in result: 85 | word = token[0] 86 | if len(word) > 3: 87 | encoded_word = self.decode(word) 88 | sim = token[1] 89 | jamo_result.append((encoded_word, sim)) 90 | 91 | return jamo_result -------------------------------------------------------------------------------- /pyTextMiner/version.py: -------------------------------------------------------------------------------- 1 | # Store the version here so: 2 | # 1) we don't load dependencies by storing it in __init__.py 3 | # 2) we can import it in setup.py for the same reason 4 | # 3) we can import it into your module module 5 | __version__ = '1.1.116b7' -------------------------------------------------------------------------------- /py_bert/__init__.py: -------------------------------------------------------------------------------- 1 | from py_bert import * -------------------------------------------------------------------------------- /py_bert/bert_classification_model.py: -------------------------------------------------------------------------------- 1 | from transformers import BertModel, BertForSequenceClassification 2 | from torch import nn, optim 3 | import torch 4 | import os 5 | from kobert_transformers import get_kobert_model 6 | 7 | class PYBERTClassifier(nn.Module): 8 | ''' 9 | Customized BERT Sequence Model 10 | ''' 11 | def __init__(self, n_classes, model_name): 12 | #PRE_TRAINED_MODEL_NAME = 'bert-base-cased' 13 | super(PYBERTClassifier, self).__init__() 14 | if 'etri' in model_name or 'mecab' in model_name: 15 | self.bert = BertModel.from_pretrained(os.path.abspath('pytorch_model.bin'), 16 | output_hidden_states = False) 17 | else: 18 | self.bert = BertModel.from_pretrained(model_name) 19 | 20 | #print(self.bert.config.hidden_size) 21 | 22 | self.drop = nn.Dropout(p=0.3) 23 | self.out = nn.Linear(self.bert.config.hidden_size, n_classes) 24 | 25 | def forward(self, input_ids, attention_mask): 26 | _, pooled_output = self.bert( 27 | input_ids=input_ids, 28 | attention_mask=attention_mask 29 | ) 30 | #print(pooled_output.shape) 31 | 32 
| output = self.drop(pooled_output) 33 | return self.out(output) 34 | 35 | def name(self): 36 | return 'PYBERTClassifier' 37 | 38 | class PYBERTClassifierGenAtten(nn.Module): 39 | def __init__(self, 40 | n_classes, 41 | model_name, 42 | dr_rate=None, 43 | params=None): 44 | 45 | ''' 46 | bert, 47 | hidden_size=768, 48 | num_classes=2, 49 | dr_rate=None, 50 | params=None 51 | ''' 52 | 53 | super(PYBERTClassifierGenAtten, self).__init__() 54 | if 'etri' in model_name or 'mecab' in model_name: 55 | self.bert = BertModel.from_pretrained(os.path.abspath('pytorch_model.bin'), 56 | output_hidden_states=False) 57 | else: 58 | self.bert = BertModel.from_pretrained(model_name) 59 | self.out = nn.Linear(self.bert.config.hidden_size, n_classes) 60 | self.dr_rate = dr_rate 61 | self.attention_mask=None 62 | 63 | if self.dr_rate != None: 64 | print('dropout ' + str(self.dr_rate)) 65 | self.dropout = nn.Dropout(p=dr_rate) 66 | 67 | def gen_attention_mask(self, token_ids, targets): 68 | attention_mask = torch.zeros_like(token_ids) 69 | for i, v in enumerate(targets): 70 | attention_mask[i][:v] = 1 71 | return attention_mask.float() 72 | 73 | def get_attention_mask(self, atten_mask): 74 | self.attention_mask = atten_mask 75 | 76 | def forward(self, token_ids, targets, segment_ids, attention_mask): 77 | if attention_mask is None: 78 | self.attention_mask = self.gen_attention_mask(token_ids, targets) 79 | else: 80 | self.attention_mask = attention_mask 81 | 82 | _, pooler = self.bert(input_ids=token_ids, 83 | token_type_ids=segment_ids.long(), 84 | attention_mask=self.attention_mask.float().to(token_ids.device)) 85 | 86 | if self.dr_rate: 87 | output = self.dropout(pooler) 88 | 89 | return self.out(output) 90 | 91 | def name(self): 92 | return 'PYBERTClassifierGenAtten' 93 | 94 | class PYBertForSequenceClassification: 95 | ''' 96 | Use pytorch's BERTForSeqeunceClassification 97 | Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 98 | labels (torch.LongTensor of shape (batch_size,), optional, defaults to None) 99 | – Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss), 100 | If config.num_labels > 1 a classification loss is computed (Cross-Entropy). 101 | ''' 102 | def __init__(self, n_classes, model_name): 103 | self.model = BertForSequenceClassification.from_pretrained( 104 | model_name, # Use the 12-layer BERT model, with an uncased vocab. 105 | num_labels=n_classes, # The number of output labels--2 for binary classification. 106 | # You can increase this for multi-class tasks. 107 | output_attentions=False, # Whether the model returns attentions weights. 108 | output_hidden_states=False, # Whether the model returns all hidden-states. 
109 | ) 110 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 111 | self.model.to(device) 112 | 113 | def __call__(self, *args, **kwargs): 114 | return self.model 115 | 116 | def name(self): 117 | return 'PYBertForSequenceClassification' -------------------------------------------------------------------------------- /py_bert/bert_dataset.py: -------------------------------------------------------------------------------- 1 | from torch import nn, optim 2 | from torch.utils.data import Dataset, DataLoader, TensorDataset 3 | import torch 4 | 5 | class PYBERTDataset(Dataset): 6 | def __init__(self, contents, targets, tokenizer, max_len): 7 | super(PYBERTDataset, self).__init__() 8 | self.contents = contents 9 | self.targets = targets 10 | self.tokenizer = tokenizer 11 | self.max_len = max_len 12 | 13 | def __len__(self): 14 | return len(self.contents) 15 | 16 | def __getitem__(self, item): 17 | content = str(self.contents[item]) 18 | target = self.targets[item] 19 | 20 | encoding = self.tokenizer.encode_plus( 21 | content, 22 | add_special_tokens=True, 23 | max_length=self.max_len, 24 | return_token_type_ids=True, 25 | pad_to_max_length=True, 26 | return_attention_mask=True, 27 | return_tensors='pt', 28 | ) 29 | 30 | return { 31 | 'document_text': content, 32 | 'input_ids': encoding['input_ids'].flatten(), 33 | 'token_type_ids': encoding['token_type_ids'].flatten(), 34 | 'attention_mask': encoding['attention_mask'].flatten(), 35 | 'targets': torch.tensor(target, dtype=torch.long) 36 | } -------------------------------------------------------------------------------- /py_bert/bert_util.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.data import Dataset, DataLoader 3 | from py_bert.bert_dataset import PYBERTDataset 4 | import pandas as pd 5 | from transformers import BertModel, BertTokenizer 6 | from py_bert.tokenization_kobert import KoBertTokenizer 7 | from py_bert.tokenization_korbert import KorBertTokenizer 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | import os 11 | 12 | def get_korean_tokenizer(bert_model_name): 13 | tokenizer = None 14 | if bert_model_name.startswith('monologg'): 15 | tokenizer = KoBertTokenizer.from_pretrained(bert_model_name) 16 | elif 'etri' or 'mecab' in bert_model_name: 17 | tokenizer = KorBertTokenizer.from_pretrained(os.path.abspath(bert_model_name)) 18 | else: 19 | tokenizer = BertTokenizer.from_pretrained(bert_model_name) 20 | 21 | return tokenizer 22 | 23 | def to_sentiment(rating): 24 | ''' 25 | assuming the class rating scale is from 0 to 5 26 | ''' 27 | rating = int(rating) 28 | if rating <= 2: 29 | return 0 30 | elif rating == 3: 31 | return 1 32 | else: 33 | return 2 34 | 35 | def add_sentiment_label(df): 36 | df['sentiment'] = df.score.apply(to_sentiment) 37 | if len(df['sentiment'].unique()) == 2: 38 | class_names = ['positive', 'negative'] 39 | elif len(df['sentiment'].unique()) == 3: 40 | class_names = ['positive', 'neutral', 'negative'] 41 | 42 | return df, class_names 43 | 44 | def create_data_loader(df, tokenizer, max_len, batch_size): 45 | ds = PYBERTDataset( 46 | contents=df.content.to_numpy(), 47 | targets=df.sentiment.to_numpy(), 48 | tokenizer=tokenizer, 49 | max_len=max_len) 50 | 51 | return DataLoader( 52 | ds, 53 | batch_size=batch_size, 54 | num_workers=0 55 | ) 56 | 57 | def convert_to_df(documents, labels): 58 | pd.set_option('display.max_columns', None) 59 | document_df = pd.DataFrame() 60 | combined = zip(documents,labels) 61 | 
for i, (text, label) in enumerate(combined): 62 | document_df = document_df.append(pd.Series([text, int(label)]), ignore_index=True) 63 | 64 | document_df.columns = ['content', 'sentiment'] 65 | class_names = [] 66 | if len(document_df['sentiment'].unique()) == 2: 67 | class_names = ['positive', 'negative'] 68 | elif len(document_df['sentiment'].unique()) == 3: 69 | class_names = ['positive', 'neutral', 'negative'] 70 | 71 | return document_df, class_names 72 | 73 | 74 | def convert_to_df_for_classification(documents, labels): 75 | pd.set_option('display.max_columns', None) 76 | document_df = pd.DataFrame() 77 | combined = zip(documents,labels) 78 | for i, (text, label) in enumerate(combined): 79 | document_df = document_df.append(pd.Series([text, int(label)]), ignore_index=True) 80 | 81 | document_df.columns = ['content', 'label'] 82 | class_names = document_df['label'].unique() 83 | 84 | return document_df, class_names 85 | 86 | def show_confusion_matrix(confusion_matrix): 87 | hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues") 88 | hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right') 89 | hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right') 90 | 91 | plt.ylabel('True sentiment') 92 | plt.xlabel('Predicted sentiment') 93 | plt.show() 94 | 95 | def token_count_distribution(df, tokenizer): 96 | token_lens = [] 97 | for txt in df.content: 98 | tokens = tokenizer.encode(txt, max_length=512) 99 | token_lens.append(len(tokens)) 100 | 101 | sns.distplot(token_lens) 102 | plt.xlim([0, 256]) 103 | plt.xlabel('Token count') 104 | 105 | plt.show() -------------------------------------------------------------------------------- /py_doc2vec/. ...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_doc2vec/__init__.py: -------------------------------------------------------------------------------- 1 | from py_doc2vec import * -------------------------------------------------------------------------------- /py_document_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from py_document_classification import * -------------------------------------------------------------------------------- /py_document_classification/test_ml_text_classfier.py: -------------------------------------------------------------------------------- 1 | from document_classification.ml_textclassification import documentClassifier 2 | import pyTextMiner as ptm 3 | 4 | if __name__ == '__main__': 5 | document_classifier = documentClassifier() 6 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 7 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 8 | ptm.tokenizer.MeCab(mecab_path), 9 | ptm.helper.POSFilter('NN*'), 10 | ptm.helper.SelectWordOnly(), 11 | ptm.ngram.NGramTokenizer(2, 2), 12 | #ptm.tokenizer.LTokenizerKorean(), 13 | ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt') 14 | ) 15 | 16 | #mode is either train or predict 17 | mode = 'train' 18 | if mode is 'train': 19 | input_file ='./data/3_class_naver_news.csv' 20 | # 1. 
text processing and representation 21 | corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file, 22 | delimiter=',', 23 | doc_index=4, 24 | class_index=1, 25 | title_index=3) 26 | corpus.docs 27 | tups = corpus.pair_map 28 | class_list = [] 29 | for id in tups: 30 | #print(tups[id]) 31 | class_list.append(tups[id]) 32 | 33 | result = pipeline.processCorpus(corpus) 34 | print('== ==') 35 | 36 | documents = [] 37 | for doc in result: 38 | document = '' 39 | for sent in doc: 40 | document += " ".join(sent) 41 | documents.append(document) 42 | 43 | document_classifier.preprocess(documents,class_list) 44 | 45 | #model_name = 0 -- RandomForestClassifier 46 | #model_name = 1 -- LinearSVC 47 | #model_name = 2 -- MultinomialNB 48 | #model_name = 3 -- LogisticRegression 49 | #model_name = 4 -- K-NN 50 | #model_name = 5 -- SGDClassifier 51 | X_train, X_test, y_train, y_test, y_pred, indices_test, model = document_classifier.train(model_index=1) 52 | 53 | print('training is finished') 54 | 55 | document_classifier.evaluate(y_test,y_pred,indices_test,model) 56 | document_classifier.save(model, model_name='./model/svm_classifier.model') 57 | document_classifier.saveVectorizer(model_name='./model/vectorizer.model') 58 | 59 | elif mode is 'predict': 60 | model=document_classifier.load('./model/svm_classifier.model') 61 | vectorizer_model=document_classifier.loadVectorizer(model_name='./model/vectorizer.model') 62 | document_classifier.predict(model,vectorizer_model) 63 | 64 | #7. prediction 65 | input = "../data/navernews.txt" 66 | corpus = ptm.CorpusFromFieldDelimitedFile(input,3) 67 | 68 | result = pipeline.processCorpus(corpus) 69 | print('== ==') 70 | 71 | documents = [] 72 | for doc in result: 73 | document = '' 74 | for sent in doc: 75 | document += " ".join(sent) 76 | documents.append(document) 77 | 78 | document_classifier.predict_realtime(model,vectorizer_model, documents) 79 | -------------------------------------------------------------------------------- /py_document_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from py_document_clustering import * -------------------------------------------------------------------------------- /py_ner/bert_crf_ner_visualization.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import torch 4 | from gluonnlp.data import SentencepieceTokenizer 5 | 6 | from py_ner.bert_crf_ner_prediction import DecoderFromNamedEntitySequence 7 | from py_ner.model.net import KobertCRFViz 8 | from py_ner.data_utils.utils import Config 9 | from py_ner.data_utils.vocab_tokenizer import Tokenizer 10 | from py_ner.data_utils.pad_sequence import keras_pad_fn 11 | from pathlib import Path 12 | 13 | from py_ner.bertviz.head_view import show 14 | 15 | class BertCrfNerVisualization: 16 | def __init__(self, model_dir=''): 17 | #'./experiments/base_model_with_crf' 18 | self.model_dir = model_dir 19 | self.model_config = Config(json_path=self.model_dir + '/config.json') 20 | self.tokenizer = None 21 | self.model = None 22 | self.decoder_from_res = None 23 | 24 | def load_model(self, tokenizer_model_name, ner_model_name): 25 | # load vocab & tokenizer 26 | #tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model" 27 | tok_path = tokenizer_model_name 28 | ptr_tokenizer = SentencepieceTokenizer(tok_path) 29 | 30 | with open(self.model_dir + "/vocab.pkl", 'rb') as f: 31 | vocab = pickle.load(f) 32 | self.tokenizer = Tokenizer(vocab=vocab, 
split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=self.model_config.maxlen) 33 | 34 | # load ner_to_index.json 35 | with open(self.model_dir + "/ner_to_index.json", 'rb') as f: 36 | ner_to_index = json.load(f) 37 | index_to_ner = {v: k for k, v in ner_to_index.items()} 38 | 39 | # model 40 | self.model = KobertCRFViz(config=self.model_config, num_classes=len(ner_to_index), vocab=vocab) 41 | 42 | #ner_model_name = "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin" 43 | # load 44 | model_dict = self.model.state_dict() 45 | checkpoint = torch.load(ner_model_name, map_location=torch.device('cpu')) 46 | convert_keys = {} 47 | for k, v in checkpoint['model_state_dict'].items(): 48 | new_key_name = k.replace("module.", '') 49 | if new_key_name not in model_dict: 50 | print("{} is not int model_dict".format(new_key_name)) 51 | continue 52 | convert_keys[new_key_name] = v 53 | 54 | self.model.load_state_dict(convert_keys) 55 | self.model.eval() 56 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 57 | self.model.to(device) 58 | self.decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=self.tokenizer, index_to_ner=index_to_ner) 59 | 60 | def visualize(self): 61 | input_text = '김대중 대통령은 노벨평화상을 받으러 스웨덴으로 출국해서 5박6일 동안 스웨덴에 머물며 대한민국의 위상을 높였다.' 62 | list_of_input_ids = self.tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text]) 63 | x_input = torch.tensor(list_of_input_ids).long() 64 | list_of_pred_ids, _ = self.model(x_input) 65 | 66 | list_of_ner_word, decoding_ner_sentence = self.decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids) 67 | print("output>", decoding_ner_sentence) 68 | model_type = 'bert' 69 | show(self.model, model_type, self.tokenizer, decoding_ner_sentence, input_text) 70 | print("") 71 | 72 | if __name__ == '__main__': 73 | model_dir = '../examples/exper/base_model_with_crf' 74 | visualizer = BertCrfNerVisualization(model_dir) 75 | 76 | tokenizer_model_name = "./ptr_lm_model/tokenizer_78b3253a26.model" 77 | ner_model_name = "../examples/exper/base_model_with_crf/best-epoch-6-step-500-acc-0.943.bin" 78 | 79 | visualizer.load_model(tokenizer_model_name, ner_model_name) 80 | 81 | visualizer.visualize() -------------------------------------------------------------------------------- /py_ner/bert_ner_prediction.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from keras_preprocessing.sequence import pad_sequences 4 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 5 | import numpy as np 6 | from torch import nn 7 | 8 | import py_ner.lstm_cnn_crf_utils as utils 9 | import pickle 10 | 11 | class BERTNERPredictor: 12 | def __init__(self): 13 | print('BertNERPredictor') 14 | self.model = None 15 | 16 | def load_model(self, model_name): 17 | # open a file, where you stored the pickled data 18 | file = open(model_name, 'rb') 19 | # dump information to that file 20 | self.model = pickle.load(file) 21 | # close the file 22 | file.close() 23 | 24 | def getKeyByValue(self, dictOfElements, value): 25 | key = '' 26 | listOfItems = dictOfElements.items() 27 | for item in listOfItems: 28 | if item[1] == value: 29 | key = item[0] 30 | return key 31 | 32 | def align_predictions(self, items, predictions: np.ndarray, label_ids: np.ndarray): 33 | """Formats the predictions.""" 34 | preds = np.argmax(predictions, axis=2) 35 | batch_size, seq_len = preds.shape 36 | out_label_list = [[] for _ in 
range(batch_size)] 37 | preds_list = [[] for _ in range(batch_size)] 38 | for i in range(batch_size): 39 | for j in range(seq_len): 40 | if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index: 41 | out_label_list[i].append(self.getKeyByValue(items, [label_ids[i][j]])) 42 | preds_list[i].append(self.getKeyByValue(items, [preds[i][j]])) 43 | return preds_list, out_label_list 44 | 45 | def predict_each(self, device, text, tokenizer, MAX_LEN, items): 46 | 47 | tokenized_texts = tokenizer.tokenize(text) 48 | print(tokenized_texts) 49 | 50 | input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], 51 | maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") 52 | 53 | pred_masks = [[float(i > 0) for i in ii] for ii in input_ids] 54 | pred_tags = [[int(i > 0) for i in ii] for ii in input_ids] 55 | 56 | #print(pred_tags) 57 | pred_ids = torch.tensor(input_ids) 58 | pred_tags = torch.tensor(pred_tags) 59 | pred_masks = torch.tensor(pred_masks) 60 | 61 | real_ids = np.argmax(pred_masks, axis=1).tolist() 62 | print(str(len(real_ids))) 63 | 64 | pred_data = TensorDataset(pred_ids, pred_masks, pred_tags) 65 | pred_sampler = RandomSampler(pred_data) 66 | pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=1) 67 | i = 0 68 | predictions = [] 69 | for batch in pred_dataloader: 70 | if i > 0: 71 | break 72 | i += 1 73 | batch = tuple(t.to(device) for t in batch) 74 | b_input_ids, b_input_mask, b_labels = batch 75 | b_input_ids = torch.tensor(b_input_ids).to(device).long() 76 | b_labels = torch.tensor(b_labels).to(device).long() 77 | 78 | with torch.no_grad(): 79 | logits = self.model(b_input_ids, token_type_ids=None, 80 | attention_mask=b_input_mask) 81 | 82 | logits = logits.detach().cpu().numpy() 83 | 84 | pred_flat = np.argmax(logits, axis=2).flatten() 85 | 86 | ''' 87 | for m, a in enumerate(pred_flat): 88 | if m >= len(real_ids): 89 | break 90 | predictions.append(a) 91 | ''' 92 | #print(predictions) 93 | #print(str(len(predictions))) 94 | 95 | #preds_list, out_label_list = self.align_predictions(items, logits, pred_tags) 96 | #print(preds_list) 97 | #print(out_label_list) 98 | 99 | return pred_flat 100 | -------------------------------------------------------------------------------- /py_ner/bertviz/head_view.py: -------------------------------------------------------------------------------- 1 | """Module for postprocessing and displaying transformer attentions. 2 | 3 | """ 4 | 5 | import json 6 | from py_ner.bertviz.attention import get_attention 7 | 8 | import os 9 | 10 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None): 11 | 12 | if sentence_b: 13 | vis_html = """ 14 | 15 | Layer: 16 | Attention: 23 | 24 |
25 | """ 26 | else: 27 | vis_html = """ 28 | 29 | Layer: 30 | 31 |
32 | """ 33 | 34 | __location__ = os.path.realpath( 35 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 36 | vis_js = open(os.path.join(__location__, 'head_view.js')).read() 37 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b) 38 | params = { 39 | 'attention': attn_data, 40 | 'default_filter': "all" 41 | } 42 | 43 | with open('bert_visualization.html', 'w') as f: 44 | _head = "" + "" + "" 45 | f.write(_head) 46 | f.write(vis_html + '\n') 47 | 48 | f.write('window.params = %s' % json.dumps(params) + '\n') 49 | f.write(vis_js + '\n') -------------------------------------------------------------------------------- /py_ner/bertviz/model_view.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Change log 17 | # 12/12/18 Jesse Vig Adapted to BERT model 18 | # 12/19/18 Jesse Vig Assorted cleanup. Changed orientation of attention matrices. Updated comments. 19 | 20 | 21 | """Module for postprocessing and displaying transformer attentions. 22 | 23 | This module is designed to be called from an ipython notebook. 24 | """ 25 | 26 | import json 27 | from py_ner.bertviz.attention import get_attention 28 | 29 | import os 30 | 31 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None): 32 | 33 | if sentence_b: 34 | vis_html = """ 35 | 36 | Attention: 43 | 44 |
45 | """ 46 | else: 47 | vis_html = """ 48 |
49 | """ 50 | 51 | __location__ = os.path.realpath( 52 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 53 | vis_js = open(os.path.join(__location__, 'model_view.js')).read() 54 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b) 55 | params = { 56 | 'attention': attn_data, 57 | 'default_filter': "all" 58 | } 59 | 60 | with open('bert_visualization.html', 'w') as f: 61 | f.write(vis_html + '\n') 62 | f.write("require.config({paths: {d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'," 63 | "jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',}});") 64 | 65 | f.write('window.params = %s' % json.dumps(params) + '\n') 66 | f.write(vis_js + '\n') 67 | 68 | -------------------------------------------------------------------------------- /py_ner/bertviz/neuron_view.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Change log 17 | # 12/12/18 Jesse Vig Adapted to BERT model 18 | # 12/19/18 Jesse Vig Assorted cleanup. Changed orientation of attention matrices. Updated comments. 19 | 20 | 21 | """Module for postprocessing and displaying transformer attentions. 22 | 23 | This module is designed to be called from an ipython notebook. 24 | """ 25 | 26 | import json 27 | from py_ner.bertviz.attention import get_attention 28 | 29 | import os 30 | 31 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None): 32 | if sentence_b: 33 | vis_html = """ 34 | 35 | Layer: 36 | Head: 37 | Attention: 44 | 45 |
46 | """ 47 | else: 48 | vis_html = """ 49 | 50 | Layer: 51 | Head: 52 | 53 |
54 | """ 55 | 56 | __location__ = os.path.realpath( 57 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 58 | vis_js = open(os.path.join(__location__, 'neuron_view.js')).read() 59 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b, include_queries_and_keys=True) 60 | if model_type == 'gpt2': 61 | bidirectional = False 62 | else: 63 | bidirectional = True 64 | params = { 65 | 'attention': attn_data, 66 | 'default_filter': "all", 67 | 'bidirectional': bidirectional 68 | } 69 | 70 | with open('bert_visualization.html', 'w') as f: 71 | f.write(vis_html + '\n') 72 | f.write("require.config({paths: {d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'," 73 | "jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',}});") 74 | 75 | f.write('window.params = %s' % json.dumps(params) + '\n') 76 | f.write(vis_js + '\n') -------------------------------------------------------------------------------- /py_ner/bertviz/pytorch_transformers_attn/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/config/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "hidden_size": 768, 3 | "maxlen" : 30, 4 | "epochs": 10, 5 | "batch_size": 256, 6 | "dropout": 0.1, 7 | "learning_rate": 5e-5, 8 | "warmup_proportion": 0.1, 9 | "gradient_accumulation_steps": 1, 10 | "summary_step": 250, 11 | "adam_epsilon": 1e-8, 12 | "warmup_steps": 0, 13 | "max_grad_norm": 1, 14 | "logging_steps": 50, 15 | "evaluate_during_training": true, 16 | "save_steps": 250, 17 | "output_dir": "./experiments/base_model_with_crf/checkpoints" 18 | } -------------------------------------------------------------------------------- /py_ner/config/ner_to_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "[CLS]": 0, 3 | "[SEP]": 1, 4 | "[PAD]": 2, 5 | "[MASK]": 3, 6 | "O": 4, 7 | "B-POH": 5, 8 | "I-POH": 6, 9 | "B-NOH": 7, 10 | "I-NOH": 8, 11 | "B-PNT": 9, 12 | "I-PNT": 10, 13 | "B-DAT": 11, 14 | "I-DAT": 12, 15 | "B-PER": 13, 16 | "I-PER": 14, 17 | "B-TIM": 15, 18 | "I-TIM": 16, 19 | "B-LOC": 17, 20 | "I-LOC": 18, 21 | "B-ORG": 19, 22 | "I-ORG": 20, 23 | "B-MNY": 21, 24 | "I-MNY": 22, 25 | "B-DUR": 23, 26 | "I-DUR": 24 27 | } -------------------------------------------------------------------------------- /py_ner/data/dataset_info.txt: -------------------------------------------------------------------------------- 1 | download all datasets for NER from http://informatics.yonsei.ac.kr/tsmm/download/ner_data.zip 2 | -------------------------------------------------------------------------------- /py_ner/data_utils/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/data_utils/pad_sequence.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import six 7 | 8 | def keras_pad_fn(token_ids_batch, maxlen, pad_id=0, padding='post', truncating='post'): 9 | padded_token_ids_batch = 
pad_sequences(token_ids_batch, 10 | value=pad_id, # vocab.transform_token2idx(PAD), 11 | padding=padding, 12 | truncating=truncating, 13 | maxlen=maxlen) 14 | return padded_token_ids_batch 15 | 16 | # pad_sequences_fn in keras.preprocessing.sequence.pad_sequences 17 | def pad_sequences(sequences, maxlen=None, dtype='int32', 18 | padding='pre', truncating='pre', value=0.): 19 | """Pads sequences to the same length. 20 | 21 | This function transforms a list of 22 | `num_samples` sequences (lists of integers) 23 | into a 2D Numpy array of shape `(num_samples, num_timesteps)`. 24 | `num_timesteps` is either the `maxlen` argument if provided, 25 | or the length of the longest sequence otherwise. 26 | 27 | Sequences that are shorter than `num_timesteps` 28 | are padded with `value` at the end. 29 | 30 | Sequences longer than `num_timesteps` are truncated 31 | so that they fit the desired length. 32 | The position where padding or truncation happens is determined by 33 | the arguments `padding` and `truncating`, respectively. 34 | 35 | Pre-padding is the default. 36 | 37 | # Arguments 38 | sequences: List of lists, where each element is a sequence. 39 | maxlen: Int, maximum length of all sequences. 40 | dtype: Type of the output sequences. 41 | To pad sequences with variable length strings, you can use `object`. 42 | padding: String, 'pre' or 'post': 43 | pad either before or after each sequence. 44 | truncating: String, 'pre' or 'post': 45 | remove values from sequences larger than 46 | `maxlen`, either at the beginning or at the end of the sequences. 47 | value: Float or String, padding value. 48 | 49 | # Returns 50 | x: Numpy array with shape `(len(sequences), maxlen)` 51 | 52 | # Raises 53 | ValueError: In case of invalid values for `truncating` or `padding`, 54 | or in case of invalid shape for a `sequences` entry. 55 | """ 56 | if not hasattr(sequences, '__len__'): 57 | raise ValueError('`sequences` must be iterable.') 58 | num_samples = len(sequences) 59 | 60 | lengths = [] 61 | for x in sequences: 62 | try: 63 | lengths.append(len(x)) 64 | except TypeError: 65 | raise ValueError('`sequences` must be a list of iterables. ' 66 | 'Found non-iterable: ' + str(x)) 67 | 68 | if maxlen is None: 69 | maxlen = np.max(lengths) 70 | 71 | # take the sample shape from the first non empty sequence 72 | # checking for consistency in the main loop below. 73 | sample_shape = tuple() 74 | for s in sequences: 75 | if len(s) > 0: 76 | sample_shape = np.asarray(s).shape[1:] 77 | break 78 | 79 | is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(dtype, np.unicode_) 80 | if isinstance(value, six.string_types) and dtype != object and not is_dtype_str: 81 | raise ValueError("`dtype` {} is not compatible with `value`'s type: {}\n" 82 | "You should set `dtype=object` for variable length strings." 
83 | .format(dtype, type(value))) 84 | 85 | x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype) 86 | for idx, s in enumerate(sequences): 87 | if not len(s): 88 | continue # empty list/array was found 89 | if truncating == 'pre': 90 | trunc = s[-maxlen:] 91 | elif truncating == 'post': 92 | trunc = s[:maxlen] 93 | else: 94 | raise ValueError('Truncating type "%s" ' 95 | 'not understood' % truncating) 96 | 97 | # check `trunc` has expected shape 98 | trunc = np.asarray(trunc, dtype=dtype) 99 | if trunc.shape[1:] != sample_shape: 100 | raise ValueError('Shape of sample %s of sequence at position %s ' 101 | 'is different from expected shape %s' % 102 | (trunc.shape[1:], idx, sample_shape)) 103 | 104 | if padding == 'post': 105 | x[idx, :len(trunc)] = trunc 106 | elif padding == 'pre': 107 | x[idx, -len(trunc):] = trunc 108 | else: 109 | raise ValueError('Padding type "%s" not understood' % padding) 110 | return x 111 | 112 | 113 | if __name__ == '__main__': 114 | sequences = [[2, 4, 62], [2,35,12,24,2]] 115 | pad_res = pad_sequences(sequences, maxlen=10, dtype='int32', padding='pre', truncating='post', value=0.) 116 | keras_pad_res = keras_pad_fn(sequences, maxlen=10, pad_id=0, padding='post', truncating='post') 117 | print(pad_res) 118 | print(keras_pad_res) -------------------------------------------------------------------------------- /py_ner/data_utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | ref: https://github.com/aisolab/nlp_implementation/blob/master/Character-level_Convolutional_Networks_for_Text_Classification/utils.py 3 | """ 4 | import json 5 | import torch 6 | from pathlib import Path 7 | 8 | 9 | class Config: 10 | def __init__(self, json_path): 11 | with open(json_path, mode='r') as io: 12 | params = json.loads(io.read()) 13 | self.__dict__.update(params) 14 | 15 | def save(self, json_path): 16 | with open(json_path, mode='w') as io: 17 | json.dump(self.__dict__, io, indent=4) 18 | 19 | def update(self, json_path): 20 | with open(json_path, mode='r') as io: 21 | params = json.loads(io.read()) 22 | self.__dict__.update(params) 23 | 24 | @property 25 | def dict(self): 26 | return self.__dict__ 27 | 28 | 29 | class CheckpointManager: 30 | def __init__(self, model_dir): 31 | if not isinstance(model_dir, Path): 32 | model_dir = Path(model_dir) 33 | self._model_dir = model_dir 34 | 35 | def save_checkpoint(self, state, filename): 36 | torch.save(state, self._model_dir / filename) 37 | 38 | def load_checkpoint(self, filename): 39 | state = torch.load(self._model_dir / filename, map_location=torch.device('cpu')) 40 | return state 41 | 42 | 43 | class SummaryManager: 44 | def __init__(self, model_dir): 45 | if not isinstance(model_dir, Path): 46 | model_dir = Path(model_dir) 47 | self._model_dir = model_dir 48 | self._summary = {} 49 | 50 | def save(self, filename): 51 | with open(self._model_dir / filename, mode='w') as io: 52 | json.dump(self._summary, io, indent=4) 53 | 54 | def load(self, filename): 55 | with open(self._model_dir / filename, mode='r') as io: 56 | metric = json.loads(io.read()) 57 | self.update(metric) 58 | 59 | def update(self, summary): 60 | self._summary.update(summary) 61 | 62 | def reset(self): 63 | self._summary = {} 64 | 65 | @property 66 | def summary(self): 67 | return self._summary -------------------------------------------------------------------------------- /py_ner/find_learning_rate.py: -------------------------------------------------------------------------------- 1 | 
import random 2 | 3 | from torch import nn 4 | from torch.optim import Adam 5 | from torch.utils.data import DataLoader 6 | from torch_lr_finder import LRFinder 7 | from transformers import AutoTokenizer 8 | import torch 9 | import numpy as np 10 | 11 | from py_ner.data_utils.data_utils import * 12 | 13 | from py_ner.data_utils.ner_dataset import read_data_from_file, get_labels, NerDataset 14 | from py_ner.model.net import BertForTokenClassificationCustom 15 | from py_ner.model.optimizers import get_optimizer_with_weight_decay 16 | 17 | DATA_TR_PATH = './data/JNLPBA/Genia4ERtask1.iob2' 18 | SEED = 42 19 | 20 | # MODEL 21 | MODEL_NAME = 'allenai/scibert_scivocab_cased' 22 | MAX_LEN_SEQ = 128 23 | 24 | # Optimization parameters 25 | BATCH_SIZE_TR = 32 26 | LEARNING_RATE = 1e-6 27 | CLIPPING = None 28 | OPTIMIZER = Adam 29 | 30 | # get data 31 | training_set = read_data_from_file(DATA_TR_PATH) 32 | 33 | # Automatically extract labels and their indexes from data. 34 | labels2ind, labels_count = get_labels(training_set) 35 | 36 | # Load data 37 | tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 38 | training_set = NerDataset(dataset=training_set, 39 | tokenizer=tokenizer, 40 | labels2ind=labels2ind, 41 | max_len_seq=MAX_LEN_SEQ, 42 | bert_hugging=False) 43 | 44 | 45 | dataloader_tr = DataLoader(dataset=training_set, 46 | batch_size=BATCH_SIZE_TR, 47 | shuffle=True) 48 | 49 | # Seeds 50 | random.seed(SEED) 51 | np.random.seed(SEED) 52 | torch.manual_seed(SEED) 53 | torch.cuda.manual_seed_all(SEED) 54 | 55 | legend = [] 56 | fig = None 57 | 58 | for wd in [0, .1, 1e-2, 1e-3, 1e-4]: 59 | for dp in [.1, 0.2, .3]: 60 | nerbert = BertForTokenClassificationCustom.from_pretrained(pretrained_model_name_or_path=MODEL_NAME, 61 | num_labels=len(labels2ind), 62 | hidden_dropout_prob=dp, 63 | attention_probs_dropout_prob=dp) 64 | 65 | # Prepare optimizer and schedule (linear warmup and decay) 66 | optimizer = get_optimizer_with_weight_decay(model=nerbert, 67 | optimizer=OPTIMIZER, 68 | learning_rate=LEARNING_RATE, 69 | weight_decay=wd) 70 | 71 | lr_finder = LRFinder(nerbert, optimizer, nn.CrossEntropyLoss(), device='cuda') 72 | lr_finder.range_test(train_loader=dataloader_tr, end_lr=1, num_iter=100) 73 | fig = lr_finder.plot(ax=fig) 74 | legend.append(f"wd: {wd}") 75 | 76 | fig.figure.legend(legend, loc='best') 77 | fig.figure.tight_layout() 78 | fig.figure.show() 79 | fig.figure.savefig('lr_finder.png') 80 | -------------------------------------------------------------------------------- /py_ner/kobert/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/kobert/mxnet_kobert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 SK T-Brain Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import sys 18 | import requests 19 | import hashlib 20 | 21 | import mxnet as mx 22 | import gluonnlp as nlp 23 | from gluonnlp.model import BERTModel, BERTEncoder 24 | 25 | from .utils import download as _download 26 | 27 | 28 | kobert_models = { 29 | 'mxnet_kobert': { 30 | 'url': 31 | 'https://kobert.blob.core.windows.net/models/kobert/mxnet/mxnet_kobert_45b6957552.params', 32 | 'fname': 'mxnet_kobert_45b6957552.params', 33 | 'chksum': '45b6957552' 34 | }, 35 | 'vocab': { 36 | 'url': 37 | 'https://kobert.blob.core.windows.net/models/kobert/vocab/kobertvocab_f38b8a4d6d.json', 38 | 'fname': 'kobertvocab_f38b8a4d6d.json', 39 | 'chksum': 'f38b8a4d6d' 40 | } 41 | } 42 | 43 | 44 | def get_mxnet_kobert_model(use_pooler=True, 45 | use_decoder=True, 46 | use_classifier=True, 47 | ctx=mx.cpu(0), 48 | cachedir='./ptr_lm_model'): 49 | # download model 50 | model_info = kobert_models['mxnet_kobert'] 51 | model_path = _download(model_info['url'], 52 | model_info['fname'], 53 | model_info['chksum'], 54 | cachedir=cachedir) 55 | # download vocab 56 | vocab_info = kobert_models['vocab'] 57 | vocab_path = _download(vocab_info['url'], 58 | vocab_info['fname'], 59 | vocab_info['chksum'], 60 | cachedir=cachedir) 61 | return get_kobert_model(model_path, vocab_path, use_pooler, use_decoder, 62 | use_classifier, ctx) 63 | 64 | 65 | def get_kobert_model(model_file, 66 | vocab_file, 67 | use_pooler=True, 68 | use_decoder=True, 69 | use_classifier=True, 70 | ctx=mx.cpu(0)): 71 | vocab_b_obj = nlp.vocab.BERTVocab.from_json(open(vocab_file, 'rt').read()) 72 | 73 | predefined_args = { 74 | 'attention_cell': 'multi_head', 75 | 'num_layers': 12, 76 | 'units': 768, 77 | 'hidden_size': 3072, 78 | 'max_length': 512, 79 | 'num_heads': 12, 80 | 'scaled': True, 81 | 'dropout': 0.1, 82 | 'use_residual': True, 83 | 'embed_size': 768, 84 | 'embed_dropout': 0.1, 85 | 'token_type_vocab_size': 2, 86 | 'word_embed': None, 87 | } 88 | 89 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 90 | num_layers=predefined_args['num_layers'], 91 | units=predefined_args['units'], 92 | hidden_size=predefined_args['hidden_size'], 93 | max_length=predefined_args['max_length'], 94 | num_heads=predefined_args['num_heads'], 95 | scaled=predefined_args['scaled'], 96 | dropout=predefined_args['dropout'], 97 | output_attention=False, 98 | output_all_encodings=False, 99 | use_residual=predefined_args['use_residual']) 100 | 101 | # BERT 102 | net = BERTModel( 103 | encoder, 104 | len(vocab_b_obj.idx_to_token), 105 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 106 | units=predefined_args['units'], 107 | embed_size=predefined_args['embed_size'], 108 | embed_dropout=predefined_args['embed_dropout'], 109 | word_embed=predefined_args['word_embed'], 110 | use_pooler=use_pooler, 111 | use_decoder=use_decoder, 112 | use_classifier=use_classifier) 113 | net.initialize(ctx=ctx) 114 | net.load_parameters(model_file, ctx, ignore_extra=True) 115 | return (net, vocab_b_obj) 116 | -------------------------------------------------------------------------------- /py_ner/kobert/pytorch_kobert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 SK T-Brain Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
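# Usage sketch (not part of the original sources): loading the MXNet KoBERT weights with the
# helper defined above. The import path follows this repository's layout (py_ner.kobert);
# the decoder and classifier heads are dropped because only encoder features are needed.
import mxnet as mx
from py_ner.kobert.mxnet_kobert import get_mxnet_kobert_model

bert, vocab = get_mxnet_kobert_model(use_decoder=False,
                                     use_classifier=False,
                                     ctx=mx.cpu(0))
print(len(vocab.idx_to_token))    # vocabulary size of the pretrained KoBERT model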
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import sys 18 | import requests 19 | import hashlib 20 | 21 | import torch 22 | 23 | from transformers import BertModel, BertConfig 24 | # from pytorch_pretrained_bert import BertModel, BertConfig 25 | import gluonnlp as nlp 26 | 27 | from .utils import download as _download 28 | 29 | kobert_models = { 30 | 'pytorch_kobert': { 31 | 'url': 32 | 'https://kobert.blob.core.windows.net/models/kobert/pytorch/pytorch_kobert_2439f391a6.params', 33 | 'fname': 'pytorch_kobert_2439f391a6.params', 34 | 'chksum': '2439f391a6' 35 | }, 36 | 'vocab': { 37 | 'url': 38 | 'https://kobert.blob.core.windows.net/models/kobert/vocab/kobertvocab_f38b8a4d6d.json', 39 | 'fname': 'kobertvocab_f38b8a4d6d.json', 40 | 'chksum': 'f38b8a4d6d' 41 | } 42 | } 43 | 44 | 45 | bert_config = {'attention_probs_dropout_prob': 0.1, 46 | 'hidden_act': 'gelu', 47 | 'hidden_dropout_prob': 0.1, 48 | 'hidden_size': 768, 49 | 'initializer_range': 0.02, 50 | 'intermediate_size': 3072, 51 | 'max_position_embeddings': 512, 52 | 'num_attention_heads': 12, 53 | 'num_hidden_layers': 12, 54 | 'type_vocab_size': 2, 55 | 'vocab_size': 8002} 56 | 57 | 58 | 59 | def get_pytorch_kobert_model(ctx='cpu', 60 | cachedir='./ptr_lm_model'): 61 | # download model 62 | model_info = kobert_models['pytorch_kobert'] 63 | model_path = _download(model_info['url'], 64 | model_info['fname'], 65 | model_info['chksum'], 66 | cachedir=cachedir) 67 | # download vocab 68 | vocab_info = kobert_models['vocab'] 69 | vocab_path = _download(vocab_info['url'], 70 | vocab_info['fname'], 71 | vocab_info['chksum'], 72 | cachedir=cachedir) 73 | return get_kobert_model(model_path, vocab_path, ctx) 74 | 75 | 76 | 77 | def get_kobert_model(model_file, vocab_file, ctx="cpu"): 78 | bertmodel = BertModel(config=BertConfig.from_dict(bert_config)) 79 | bertmodel.load_state_dict(torch.load(model_file)) 80 | device = torch.device(ctx) 81 | bertmodel.to(device) 82 | bertmodel.eval() 83 | vocab_b_obj = nlp.vocab.BERTVocab.from_json( 84 | open(vocab_file, 'rt').read()) 85 | return bertmodel, vocab_b_obj 86 | -------------------------------------------------------------------------------- /py_ner/kobert/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 SK T-Brain Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
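# Usage sketch (not part of the original sources): running toy token ids through the KoBERT
# weights loaded by get_pytorch_kobert_model. The ids are dummies, and the tuple unpacking
# assumes the older transformers convention in which BertModel returns
# (sequence_output, pooled_output).
import torch
from py_ner.kobert.pytorch_kobert import get_pytorch_kobert_model

model, vocab = get_pytorch_kobert_model(ctx='cpu')
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])    # dummy ids, 0 used as padding
attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
sequence_output, pooled_output = model(input_ids, attention_mask)
print(sequence_output.shape)    # (2, 3, 768): one 768-d vector per input token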
15 | 16 | import os 17 | import sys 18 | import requests 19 | import hashlib 20 | 21 | kobert_models = { 22 | 'onnx_kobert': { 23 | 'url': 24 | 'https://kobert.blob.core.windows.net/models/kobert/onnx/onnx_kobert_44529811f0.onnx', 25 | 'fname': 'onnx_kobert_44529811f0.onnx', 26 | 'chksum': '44529811f0' 27 | }, 28 | 'tokenizer': { 29 | 'url': 30 | 'https://kobert.blob.core.windows.net/models/kobert/tokenizer/tokenizer_78b3253a26.model', 31 | 'fname': 'tokenizer_78b3253a26.model', 32 | 'chksum': '78b3253a26' 33 | } 34 | } 35 | 36 | 37 | def download(url, filename, chksum, cachedir='./ptr_lm_model'): 38 | f_cachedir = os.path.expanduser(cachedir) 39 | os.makedirs(f_cachedir, exist_ok=True) 40 | file_path = os.path.join(f_cachedir, filename) 41 | if os.path.isfile(file_path): 42 | if hashlib.md5(open(file_path, 43 | 'rb').read()).hexdigest()[:10] == chksum: 44 | print('using cached model') 45 | return file_path 46 | with open(file_path, 'wb') as f: 47 | response = requests.get(url, stream=True) 48 | total = response.headers.get('content-length') 49 | 50 | if total is None: 51 | f.write(response.content) 52 | else: 53 | downloaded = 0 54 | total = int(total) 55 | for data in response.iter_content( 56 | chunk_size=max(int(total / 1000), 1024 * 1024)): 57 | downloaded += len(data) 58 | f.write(data) 59 | done = int(50 * downloaded / total) 60 | sys.stdout.write('\r[{}{}]'.format('█' * done, 61 | '.' * (50 - done))) 62 | sys.stdout.flush() 63 | sys.stdout.write('\n') 64 | assert chksum == hashlib.md5(open( 65 | file_path, 'rb').read()).hexdigest()[:10], 'corrupted file!' 66 | return file_path 67 | 68 | 69 | def get_onnx(cachedir='./ptr_lm_model'): 70 | """Get KoBERT ONNX file path after downloading 71 | """ 72 | model_info = kobert_models['onnx_kobert'] 73 | return download(model_info['url'], 74 | model_info['fname'], 75 | model_info['chksum'], 76 | cachedir=cachedir) 77 | 78 | def get_tokenizer(cachedir='./ptr_lm_model'): 79 | """Get KoBERT Tokenizer file path after downloading 80 | """ 81 | model_info = kobert_models['tokenizer'] 82 | return download(model_info['url'], 83 | model_info['fname'], 84 | model_info['chksum'], 85 | cachedir=cachedir) 86 | -------------------------------------------------------------------------------- /py_ner/model/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/model/optimizers.py: -------------------------------------------------------------------------------- 1 | from transformers import PreTrainedModel 2 | import torch 3 | from typing import Union 4 | 5 | 6 | def get_optimizer_with_weight_decay(model: PreTrainedModel, 7 | optimizer: torch.optim.Optimizer, 8 | learning_rate: Union[float, int], 9 | weight_decay: Union[float, int]) -> torch.optim.Optimizer: 10 | """ 11 | Apply weight decay to all the network parameters but those called `bias` or `LayerNorm.weight`. 12 | Args: 13 | model (`PreTrainedModel`): model to apply weight decay. 14 | optimizer (`torch.optim.Optimizer`): The optimizer to use during training. 15 | learning_rate (`float` or `int`): value of the learning rate to use during training. 16 | weight_decay (`float` or `int`): value of the weight decay to apply. 17 | 18 | Returns: 19 | optimizer (`torch.optim.Optimizer`): the optimizer instantiated with the selected 20 | learning rate and the parameters with and without weight decay. 
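# Usage sketch (not part of the original sources): get_tokenizer above only downloads the
# SentencePiece model file; wrapping it in gluonnlp's SentencepieceTokenizer gives an actual
# tokenizer. The Korean sentence below is an illustrative example.
from gluonnlp.data import SentencepieceTokenizer
from py_ner.kobert.utils import get_tokenizer

tok_path = get_tokenizer()              # cached under ./ptr_lm_model by default
sp = SentencepieceTokenizer(tok_path)
print(sp('한국어 모델을 공유합니다.'))  # list of subword pieces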
21 | 22 | """ 23 | no_decay = ["bias", "LayerNorm.weight"] 24 | params = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)] 25 | params_nd = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)] 26 | optimizer_grouped_parameters = [{"params": params, "weight_decay": weight_decay}, 27 | {"params": params_nd, "weight_decay": 0.0}] 28 | 29 | return optimizer(optimizer_grouped_parameters, lr=learning_rate) -------------------------------------------------------------------------------- /py_ner/model/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Union, Dict 2 | import json 3 | 4 | class Config: 5 | def __init__(self, json_path): 6 | with open(json_path, mode='r') as io: 7 | params = json.loads(io.read()) 8 | self.__dict__.update(params) 9 | 10 | def save(self, json_path): 11 | with open(json_path, mode='w') as io: 12 | json.dump(self.__dict__, io, indent=4) 13 | 14 | def update(self, json_path): 15 | with open(json_path, mode='r') as io: 16 | params = json.loads(io.read()) 17 | self.__dict__.update(params) 18 | 19 | @property 20 | def dict(self): 21 | return self.__dict__ 22 | 23 | 24 | 25 | class PadSequence: 26 | """PadSequence class""" 27 | 28 | def __init__(self, length: int, pad_val: int = 0, clip: bool = True) -> None: 29 | """Instantiating PadSequence class 30 | Args: 31 | length (int): the maximum length to pad/clip the sequence 32 | pad_val (int): the pad value 33 | clip (bool): whether to clip the length, if sample length is longer than maximum length 34 | """ 35 | self._length = length 36 | self._pad_val = pad_val 37 | self._clip = clip 38 | 39 | def __call__(self, sample): 40 | sample_length = len(sample) 41 | if sample_length >= self._length: 42 | if self._clip and sample_length > self._length: 43 | return sample[: self._length] 44 | else: 45 | return sample 46 | else: 47 | return sample + [self._pad_val for _ in range(self._length - sample_length)] -------------------------------------------------------------------------------- /py_ner/ner_crf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sklearn.model_selection import train_test_split 4 | from sklearn_crfsuite import CRF 5 | from sklearn_crfsuite.metrics import flat_f1_score 6 | from sklearn_crfsuite.metrics import flat_classification_report 7 | 8 | #Reading the csv file 9 | df = pd.read_csv('data/ner_dataset.csv', encoding = "ISO-8859-1") 10 | 11 | #Display first 10 rows 12 | print(str(df.head(10))) 13 | 14 | print(str(df.describe())) 15 | 16 | #Displaying the unique Tags 17 | print(str(df['Tag'].unique())) 18 | 19 | 20 | #Checking null values, if any. 21 | df.isnull().sum() 22 | 23 | df = df.fillna(method = 'ffill') 24 | 25 | # This is a class te get sentence. The each sentence will be list of tuples with its tag and pos. 
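# Usage sketch (not part of the original sources): PadSequence from py_ner/model/utils.py
# pads or clips a token-id list to a fixed length; length 8 and pad value 0 are illustrative.
from py_ner.model.utils import PadSequence

pad = PadSequence(length=8, pad_val=0, clip=True)
print(pad([3, 14, 15, 92]))     # [3, 14, 15, 92, 0, 0, 0, 0]
print(pad(list(range(12))))     # clipped to the first 8 ids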
26 | class sentence(object): 27 | def __init__(self, df): 28 | self.n_sent = 1 29 | self.df = df 30 | self.empty = False 31 | agg = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 32 | s['POS'].values.tolist(), 33 | s['Tag'].values.tolist())] 34 | self.grouped = self.df.groupby("Sentence #").apply(agg) 35 | self.sentences = [s for s in self.grouped] 36 | 37 | def get_text(self): 38 | try: 39 | s = self.grouped['Sentence: {}'.format(self.n_sent)] 40 | self.n_sent += 1 41 | return s 42 | except: 43 | return None 44 | 45 | #Displaying one full sentence 46 | getter = sentence(df) 47 | sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences] 48 | sentences[0] 49 | 50 | #sentence with its pos and tag. 51 | sent = getter.get_text() 52 | print(sent) 53 | 54 | sentences = getter.sentences 55 | 56 | def word2features(sent, i): 57 | word = sent[i][0] 58 | postag = sent[i][1] 59 | 60 | features = { 61 | 'bias': 1.0, 62 | 'word.lower()': word.lower(), 63 | 'word[-3:]': word[-3:], 64 | 'word[-2:]': word[-2:], 65 | 'word.isupper()': word.isupper(), 66 | 'word.istitle()': word.istitle(), 67 | 'word.isdigit()': word.isdigit(), 68 | 'postag': postag, 69 | 'postag[:2]': postag[:2], 70 | } 71 | if i > 0: 72 | word1 = sent[i-1][0] 73 | postag1 = sent[i-1][1] 74 | features.update({ 75 | '-1:word.lower()': word1.lower(), 76 | '-1:word.istitle()': word1.istitle(), 77 | '-1:word.isupper()': word1.isupper(), 78 | '-1:postag': postag1, 79 | '-1:postag[:2]': postag1[:2], 80 | }) 81 | else: 82 | features['BOS'] = True 83 | if i < len(sent)-1: 84 | word1 = sent[i+1][0] 85 | postag1 = sent[i+1][1] 86 | features.update({ 87 | '+1:word.lower()': word1.lower(), 88 | '+1:word.istitle()': word1.istitle(), 89 | '+1:word.isupper()': word1.isupper(), 90 | '+1:postag': postag1, 91 | '+1:postag[:2]': postag1[:2], 92 | }) 93 | else: 94 | features['EOS'] = True 95 | 96 | return features 97 | 98 | 99 | def sent2features(sent): 100 | return [word2features(sent, i) for i in range(len(sent))] 101 | 102 | def sent2labels(sent): 103 | return [label for token, postag, label in sent] 104 | 105 | def sent2tokens(sent): 106 | return [token for token, postag, label in sent] 107 | 108 | X = [sent2features(s) for s in sentences] 109 | y = [sent2labels(s) for s in sentences] 110 | 111 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) 112 | 113 | crf = CRF(algorithm = 'lbfgs', 114 | c1 = 0.1, 115 | c2 = 0.1, 116 | max_iterations = 100, 117 | all_possible_transitions = False) 118 | crf.fit(X_train, y_train) 119 | 120 | #Predicting on the test set. 
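# Usage sketch (not part of the original script): after crf.fit it can be informative to look
# at the learned label-transition weights before predicting; sklearn-crfsuite exposes them as
# crf.transition_features_, a dict keyed by (label_from, label_to).
from collections import Counter

for (label_from, label_to), weight in Counter(crf.transition_features_).most_common(10):
    print('%-10s -> %-10s %.3f' % (label_from, label_to, weight))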
121 | y_pred = crf.predict(X_test) 122 | 123 | f1_score = flat_f1_score(y_test, y_pred, average = 'weighted') 124 | print(f1_score) 125 | 126 | report = flat_classification_report(y_test, y_pred) 127 | print(report) 128 | 129 | -------------------------------------------------------------------------------- /py_node2vec/node2vecModel.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import networkx as nx 3 | from node2vec import Node2Vec 4 | 5 | # Embed edges using Hadamard method 6 | from node2vec.edges import HadamardEmbedder 7 | import multiprocessing 8 | 9 | 10 | class Node2VecModel: 11 | def __init__(self): 12 | self.model = None 13 | self.G = nx.Graph() 14 | 15 | def create_random_graph(self): 16 | # Create a graph 17 | self.G = nx.fast_gnp_random_graph(n=100, p=0.5) 18 | 19 | def create_graph(self, co_occurrence, word_hist, threshold): 20 | filtered_word_list = [] 21 | for pair in co_occurrence: 22 | node1 = '' 23 | node2 = '' 24 | for inner_pair in pair: 25 | if type(inner_pair) is tuple: 26 | node1 = inner_pair[0] 27 | node2 = inner_pair[1] 28 | elif type(inner_pair) is str: 29 | inner_pair = inner_pair.split() 30 | if len(inner_pair) == 2: 31 | node1 = inner_pair[0] 32 | node2 = inner_pair[1] 33 | elif type(inner_pair) is int: 34 | if float(inner_pair) >= threshold: 35 | # print ("X " + node1 + " == " + node2 + " == " + str(inner_pair) + " : " + str(tuple[node1])) 36 | self.G.add_edge(node1, node2, weight=float(inner_pair)) 37 | if node1 not in filtered_word_list: 38 | filtered_word_list.append(node1) 39 | if node2 not in filtered_word_list: 40 | filtered_word_list.append(node2) 41 | elif type(inner_pair) is float: 42 | if float(inner_pair) >= threshold: 43 | # print ("X " + node1 + " == " + node2 + " == " + str(inner_pair) + " : ") 44 | self.G.add_edge(node1, node2, weight=float(inner_pair)) 45 | if node1 not in filtered_word_list: 46 | filtered_word_list.append(node1) 47 | if node2 not in filtered_word_list: 48 | filtered_word_list.append(node2) 49 | 50 | for word in word_hist: 51 | if str(word) in filtered_word_list: 52 | self.G.add_node(word, count=word_hist[word]) 53 | 54 | print(self.G.number_of_nodes()) 55 | 56 | def train(self, dimensions, walk_length, num_walks): 57 | cores = multiprocessing.cpu_count() # Count the number of cores in a computer 58 | # Precompute probabilities and generate walks 59 | node2vec = Node2Vec(self.G, 60 | dimensions=dimensions, 61 | walk_length=walk_length, 62 | num_walks=num_walks, 63 | workers=cores - 1) 64 | 65 | ## if d_graph is big enough to fit in the memory, pass temp_folder which has enough disk space 66 | # Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs 67 | # node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder="/mnt/tmp_data") 68 | 69 | # Embed 70 | self.model = node2vec.fit(window=10, min_count=1, 71 | batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) 72 | 73 | def save_model(self, embedding_filename, embedding_model_file): 74 | # Save embeddings for later use 75 | self.model.wv.save_word2vec_format(embedding_filename) 76 | 77 | # Save model for later use 78 | self.model.save(embedding_model_file) 79 | 80 | def load_model(self, embedding_filename): 81 | self.model = gensim.models.KeyedVectors.load_word2vec_format(embedding_filename) 82 | 83 | def most_similars(self, word): 84 | # Look for 
most similar nodes 85 | return self.model.wv.most_similar(word) # Output node names are always strings 86 | 87 | def compute_similarity(self, first_node, second_node): 88 | edges_embs = HadamardEmbedder(keyed_vectors=self.model.wv) 89 | 90 | # Look for embeddings on the fly - here we pass normal tuples 91 | edges_embs[(first_node, second_node)] 92 | ''' OUTPUT 93 | array([ 5.75068220e-03, -1.10937878e-02, 3.76693785e-01, 2.69105062e-02, 94 | ... ... .... 95 | ..................................................................], 96 | dtype=float32) 97 | ''' 98 | 99 | # Get all edges in a separate KeyedVectors instance - use with caution could be huge for big networks 100 | edges_kv = edges_embs.as_keyed_vectors() 101 | 102 | # Look for most similar edges - this time tuples must be sorted and as str 103 | results = edges_kv.most_similar(str((first_node, second_node))) 104 | 105 | # Save embeddings for later use 106 | # edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME) 107 | 108 | return results -------------------------------------------------------------------------------- /py_topic_model/__init__.py: -------------------------------------------------------------------------------- 1 | from py_topic_model import * -------------------------------------------------------------------------------- /py_topic_model/gdmr_plot.py: -------------------------------------------------------------------------------- 1 | 2 | import tomotopy as tp 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import matplotlib.colors as clr 6 | 7 | class ExpNormalize(clr.Normalize): 8 | def __init__(self, scale): 9 | super().__init__() 10 | self.scale = scale 11 | 12 | def __call__(self, value, clip=None): 13 | if clip is None: 14 | clip = self.clip 15 | 16 | result, is_scalar = self.process_value(value) 17 | 18 | self.autoscale_None(result) 19 | (vmin,), _ = self.process_value(self.vmin) 20 | (vmax,), _ = self.process_value(self.vmax) 21 | if vmin == vmax: 22 | result.fill(0) 23 | elif vmin > vmax: 24 | raise ValueError("minvalue must be less than or equal to maxvalue") 25 | else: 26 | if clip: 27 | mask = np.ma.getmask(result) 28 | result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax), 29 | mask=mask) 30 | resdat = result.data 31 | resdat = 1 - np.exp(-2 * resdat / self.scale) 32 | result = np.ma.array(resdat, mask=result.mask, copy=False) 33 | if is_scalar: 34 | result = result[0] 35 | return result 36 | 37 | heat = clr.LinearSegmentedColormap.from_list('heat', 38 | [(0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 1, 0), (1, 1, 0), (1, 0, 0), (1, 1, 1)], 39 | N=1024 40 | ) 41 | 42 | corpus = tp.utils.Corpus() 43 | for line in open('./topic_model/dataset2.txt', encoding='utf-8'): 44 | fd = line.strip().split() 45 | corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2]))) 46 | 47 | # We set a range of the first metadata as [2000, 2017] 48 | # and one of the second metadata as [0, 1]. 
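# Usage sketch (not part of the original module): driving the Node2VecModel class above on a
# random graph. The dimensions/walk settings and the queried node are illustrative; node
# names are looked up as strings, as noted in most_similars.
from py_node2vec.node2vecModel import Node2VecModel

n2v = Node2VecModel()
n2v.create_random_graph()                                   # 100-node random graph
n2v.train(dimensions=64, walk_length=30, num_walks=200)
n2v.save_model('node_embeddings.emb', 'node2vec.model')     # hypothetical output paths
print(n2v.most_similars('2'))                               # nearest nodes to node "2"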
49 | mdl = tp.GDMRModel(tw=tp.TermWeight.PMI, k=10, degrees=[4, 3], 50 | alpha=1e-2, sigma=0.25, sigma0=3.0, 51 | metadata_range=[(2000, 2017), (0, 1)], corpus=corpus 52 | ) 53 | mdl.optim_interval = 20 54 | mdl.burn_in = 200 55 | 56 | mdl.train(0) 57 | 58 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 59 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 60 | )) 61 | 62 | # Let's train the model 63 | for i in range(0, 1000, 20): 64 | print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word)) 65 | mdl.train(20) 66 | print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word)) 67 | 68 | # Let's visualize the result 69 | topic_counts = mdl.get_count_by_topics() 70 | lambdas = mdl.lambdas 71 | 72 | md_range = mdl.metadata_range 73 | # Our topic distribution map has 74 | # 400 pixels for the first axis and 75 | # 200 pixels for the second axis. 76 | r = mdl.tdf_linspace( 77 | [md_range[0][0], md_range[1][0]], 78 | [md_range[0][1], md_range[1][1]], 79 | [400, 200] 80 | ) 81 | 82 | for k in (-topic_counts).argsort(): 83 | print('Topic #{} ({})'.format(k, topic_counts[k])) 84 | print(*(w for w, _ in mdl.get_topic_words(k))) 85 | print('Lambda:', lambdas[k]) 86 | 87 | imgplot = plt.imshow(r[:, :, k].transpose(), clim=(0.0, r[:, :, k].max()), 88 | origin='lower', cmap=heat, norm=ExpNormalize(scale=0.04), 89 | extent=[*md_range[0], *md_range[1]], 90 | aspect='auto' 91 | ) 92 | plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5)))) 93 | plt.colorbar() 94 | plt.show() -------------------------------------------------------------------------------- /py_topic_model/ldaInference.py: -------------------------------------------------------------------------------- 1 | import pyLDAvis.gensim 2 | import pickle 3 | import gensim 4 | 5 | class ldaInference: 6 | def __init__(self, dictionary_model='dictionary.gensim', corpus_model='corpus.pkl', lda_model='model5.gensim'): 7 | self.dictionary = gensim.corpora.Dictionary.load(dictionary_model) 8 | self.corpus = pickle.load(open(corpus_model, 'rb')) 9 | self.lda = gensim.models.ldamodel.LdaModel.load(lda_model) 10 | 11 | def infer(self, document): 12 | test_doc = [self.dictionary.doc2bow(document.split(" "))] 13 | inferred_matrix = self.lda.inference(test_doc) 14 | 15 | return inferred_matrix 16 | 17 | if __name__ == '__main__': 18 | a_document = '한국 시장경제가 위기입니다.' 19 | inferred_topics = ldaInference().infer(a_document) 20 | 21 | print(str(inferred_topics)) 22 | -------------------------------------------------------------------------------- /py_topic_model/ldaSeqModel.py: -------------------------------------------------------------------------------- 1 | import numpy # for arrays, array broadcasting etc. 
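# Usage sketch (not part of the original module): LdaModel.inference, used by ldaInference
# above, returns a (gamma, sstats) pair; normalising gamma row-wise gives per-document topic
# proportions. This assumes the pretrained artifacts named in the ldaInference defaults exist.
import numpy as np
from py_topic_model.ldaInference import ldaInference

gamma, _ = ldaInference().infer('한국 시장경제가 위기입니다.')
topic_proportions = gamma / gamma.sum(axis=1, keepdims=True)
print(topic_proportions.round(3))    # one row per document, one column per topic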
2 | from gensim.models import ldaseqmodel, ldamodel 3 | from gensim.corpora import Dictionary 4 | import os.path 5 | import logging 6 | 7 | module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder 8 | 9 | class ldaSeqModel(): 10 | 11 | def __init__(self): 12 | name = 'ldaSeqModel' 13 | 14 | def run(self, document_collection, topic_count=2, time_group=[10,10,11]): 15 | """document_collection should be sorted in order of time_slice.""" 16 | dictionary = Dictionary(document_collection) 17 | corpus = [dictionary.doc2bow(text) for text in document_collection] 18 | ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=topic_count, time_slice=time_group) 19 | 20 | topics = ldaseq.print_topics(1) 21 | for topic in topics: 22 | print("TOPIC " + str(topic)) 23 | 24 | return ldaseq 25 | 26 | def parseDocuments(self, document_file, year_index, document_index): 27 | """make document along with time information""" 28 | dict = {} 29 | with open(document_file, encoding='utf-8') as ins: 30 | for line in ins: 31 | #print("LINE " + line) 32 | fields = line.split('\t') 33 | _year = fields[year_index] 34 | _document = fields[document_index] 35 | 36 | if _year not in dict: 37 | d = [] 38 | d.append(_document) 39 | dict[_year] = d 40 | 41 | else: 42 | print("DOC " + _year) 43 | _docu_ = dict.get(_year) 44 | _docu_.append(_document) 45 | return dict 46 | 47 | def parseProcessedText(self, processed_documents, pair_map): 48 | """make document along with time information""" 49 | dict = {} 50 | for doc in processed_documents: 51 | for line in doc: 52 | #print("LINE " + line) 53 | fields = line.split('\t') 54 | _year = fields[year_index] 55 | _document = fields[document_index] 56 | 57 | if _year not in dict: 58 | d = [] 59 | d.append(_document) 60 | dict[_year] = d 61 | 62 | else: 63 | print("DOC " + _year) 64 | _docu_ = dict.get(_year) 65 | _docu_.append(_document) 66 | return dict 67 | 68 | if __name__ == '__main__': 69 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 70 | document_file = "../time_test.txt" 71 | year_index = 0 72 | document_index = 1 73 | _dict = ldaSeqModel().parseDocuments(document_file, year_index, document_index) 74 | 75 | #import pyTextMiner as ptm 76 | #corpus = ptm.CorpusFromFieldDelimitedFileWithYear('time_test.txt', 1, 0) 77 | #pair_map = corpus.pair_map 78 | 79 | time_slice = [] 80 | key_size = len(_dict) 81 | doc_coll = _dict.values() 82 | for k, v in _dict.items(): 83 | time_slice.append(len(v)) 84 | ldaSeqModel().run(doc_coll,5,time_slice) 85 | -------------------------------------------------------------------------------- /py_topic_model/ldaVisualizer.py: -------------------------------------------------------------------------------- 1 | import pyLDAvis.gensim 2 | import pickle 3 | import gensim 4 | 5 | dictionary = gensim.corpora.Dictionary.load('dictionary.gensim') 6 | corpus = pickle.load(open('corpus.pkl', 'rb')) 7 | lda = gensim.models.ldamodel.LdaModel.load('model5.gensim') 8 | 9 | lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False) 10 | #pyLDAvis.display(lda_display) 11 | 12 | pyLDAvis.save_html(lda_display, 'vis.html') 13 | 14 | from gensim.test.utils import common_corpus 15 | 16 | print(str(common_corpus)) -------------------------------------------------------------------------------- /py_topic_model/tfidf.py: -------------------------------------------------------------------------------- 1 | from gensim import corpora, 
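# Usage sketch (not part of the original module): once ldaSeqModel().run(...) has returned a
# trained gensim LdaSeqModel, print_topic_times shows how a single topic drifts across the
# time slices. The four toy documents and two time slices below are illustrative only.
from py_topic_model.ldaSeqModel import ldaSeqModel

docs = [['economy', 'market', 'crisis'], ['market', 'growth'],
        ['election', 'policy'], ['policy', 'reform', 'vote']]
ldaseq = ldaSeqModel().run(docs, topic_count=2, time_group=[2, 2])
for time_step, words in enumerate(ldaseq.print_topic_times(topic=0)):
    print('time slice', time_step, ':', words)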
models, similarities 2 | import pickle 3 | import gensim 4 | 5 | class tfidf: 6 | def __init__(self): 7 | name = 'tfidf' 8 | 9 | def createDictionary(self, text_data): 10 | dictionary = corpora.Dictionary(text_data) 11 | corpus = [dictionary.doc2bow(text) for text in text_data] 12 | 13 | pickle.dump(corpus, open('corpus.pkl', 'wb')) 14 | dictionary.save('dictionary.gensim') 15 | 16 | return corpus, dictionary 17 | 18 | def run(self, text_data): 19 | _corpus, dictionary = self.createDictionary(text_data) 20 | tf_idf = models.TfidfModel(_corpus) # step 1 -- initialize a model 21 | corpus_tfidf = tf_idf[_corpus] 22 | for doc in corpus_tfidf: 23 | print(doc) 24 | 25 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5) # initialize an LSI transformation 26 | result = lsi.print_topics(5,20) 27 | for a_topic in result: 28 | print("LSI results " + str(a_topic)) 29 | 30 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 31 | #for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 32 | #print(doc) 33 | 34 | if __name__ == '__main__': 35 | import pyTextMiner as ptm 36 | import io 37 | import nltk 38 | 39 | corpus = ptm.CorpusFromFile('../donald.txt') 40 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 41 | ptm.helper.POSFilter('NN*'), 42 | ptm.helper.SelectWordOnly(), 43 | ptm.helper.StopwordFilter(file='../stopwordsKor.txt'), 44 | ptm.ngram.NGramTokenizer(3)) 45 | 46 | result = pipeline.processCorpus(corpus) 47 | 48 | id = 0 49 | text_data = [] 50 | for doc in result: 51 | new_doc = [] 52 | for sent in doc: 53 | for _str in sent: 54 | if len(_str) > 0: 55 | new_doc.append(_str) 56 | text_data.append(new_doc) 57 | id += 1 58 | 59 | tfidf().run(text_data) -------------------------------------------------------------------------------- /py_word2vec/__init__.py: -------------------------------------------------------------------------------- 1 | from py_word2vec import * 2 | -------------------------------------------------------------------------------- /py_word2vec/avgDocumentByW2V.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | from gensim import utils 3 | import numpy as np 4 | import sys 5 | from sklearn.datasets import fetch_20newsgroups 6 | from nltk import word_tokenize 7 | from nltk import download 8 | from nltk.corpus import stopwords 9 | import matplotlib.pyplot as plt 10 | from sklearn.decomposition import PCA 11 | 12 | from matplotlib import pyplot 13 | import pyTextMiner as ptm 14 | 15 | #model Google News, run once to download pre-trained vectors 16 | #!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz 17 | model = gensim.models.KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True) 18 | 19 | # Fetch ng20 dataset 20 | ng20 = fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes')) 21 | # text and ground truth labels 22 | texts, y = ng20.data, ng20.target 23 | 24 | #corpus = [preprocess(text) for text in texts] 25 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 26 | ptm.tokenizer.Word(), 27 | ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'), 28 | ptm.stemmer.Porter()) 29 | result = pipeline.processCorpus(texts) 30 | corpus = [] 31 | for doc in result: 32 | document = [] 33 | for sent in doc: 34 | for word in sent: 35 | document.append(word) 36 | 
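# Usage sketch (not part of the original module): the similarities module imported in
# tfidf.py can rank documents against a query once the bow->tfidf->LSI chain is built.
# The three toy documents and the query are illustrative assumptions.
from gensim import corpora, models, similarities

texts = [['korea', 'economy', 'market'], ['market', 'crisis'], ['election', 'policy']]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]
tfidf_model = models.TfidfModel(bow)
lsi = models.LsiModel(tfidf_model[bow], id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[tfidf_model[bow]])
query = lsi[tfidf_model[dictionary.doc2bow(['economy', 'crisis'])]]
print(sorted(enumerate(index[query]), key=lambda x: -x[1]))   # documents ranked by similarity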
corpus.append(document) 37 | 38 | # ### Remove empty docs 39 | def filter_docs(corpus, texts, labels, condition_on_doc): 40 | """ 41 | Filter corpus, texts and labels given the function condition_on_doc which takes 42 | a doc. 43 | The document doc is kept if condition_on_doc(doc) is true. 44 | """ 45 | number_of_docs = len(corpus) 46 | 47 | if texts is not None: 48 | texts = [text for (text, doc) in zip(texts, corpus) 49 | if condition_on_doc(doc)] 50 | 51 | labels = [i for (i, doc) in zip(labels, corpus) if condition_on_doc(doc)] 52 | corpus = [doc for doc in corpus if condition_on_doc(doc)] 53 | 54 | print("{} docs removed".format(number_of_docs - len(corpus))) 55 | 56 | return (corpus, texts, labels) 57 | 58 | corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: (len(doc) != 0)) 59 | 60 | # ### Remove OOV words and documents with no words in model dictionary 61 | def document_vector(word2vec_model, doc): 62 | # remove out-of-vocabulary words 63 | doc = [word for word in doc if word in word2vec_model.vocab] 64 | return np.mean(word2vec_model[doc], axis=0) 65 | 66 | def has_vector_representation(word2vec_model, doc): 67 | """check if at least one word of the document is in the 68 | word2vec dictionary""" 69 | return not all(word not in word2vec_model.vocab for word in doc) 70 | 71 | corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: has_vector_representation(model, doc)) 72 | 73 | x =[] 74 | for doc in corpus: #look up each doc in model 75 | x.append(document_vector(model, doc)) 76 | 77 | X = np.array(x) #list to array 78 | 79 | 80 | np.savetxt('documents_vectors.txt', X) 81 | np.savetxt('labels.txt', y) 82 | 83 | print(str(X.shape) + " " + str(len(y))) 84 | 85 | # ### Sanity check 86 | print(texts[4664]) 87 | 88 | print(str(y[4664]) + " " + str(ng20.target_names[11])) 89 | 90 | # ### Plot 2 PCA components 91 | pca = PCA(n_components=2) 92 | x_pca = pca.fit_transform(X) 93 | 94 | plt.figure(1, figsize=(30, 20),) 95 | plt.scatter(x_pca[:, 0], x_pca[:, 1],s=100, c=y, alpha=0.2) 96 | plt.savefig('doc_vector_PCA.png', dpi=100) 97 | 98 | # ### Plot t-SNE 99 | from sklearn.manifold import TSNE 100 | X_tsne = TSNE(n_components=2, verbose=2).fit_transform(X) 101 | 102 | 103 | plt.figure(1, figsize=(30, 20),) 104 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1],s=100, c=y, alpha=0.2) 105 | plt.show() 106 | -------------------------------------------------------------------------------- /py_word2vec/gloveWikiKoreanTrainer.py: -------------------------------------------------------------------------------- 1 | 2 | #see in glove-win_devc_x64, demo.sh or demo.bat 3 | ''' 4 | MacOS and Linux 5 | Go to: https://github.com/stanfordnlp/GloVe 6 | 7 | $ git clone http://github.com/stanfordnlp/glove 8 | $ cd glove && make 9 | $ ./demo.sh 10 | 11 | 12 | Windows 10 13 | https://github.com/anoidgit/GloVe-win 14 | 15 | ''' 16 | -------------------------------------------------------------------------------- /py_word2vec/utils.py: -------------------------------------------------------------------------------- 1 | from keras.utils import np_utils 2 | from keras.preprocessing.text import Tokenizer 3 | import numpy as np 4 | 5 | 6 | def tokenize(corpus): 7 | """ 8 | Tokenize the corpus of text. 
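# Usage sketch (not part of the original script): the averaged document vectors X and labels y
# built above are a natural input to a linear classifier; the split ratio and
# LogisticRegression settings are illustrative assumptions, not taken from this repository.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))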
9 | :param corpus: list containing a string of text (example: ["I like playing football with my friends"]) 10 | :return corpus_tokenized: indexed list of words in the corpus, in the same order as the original corpus (the example above would return [[1, 2, 3, 4]]) 11 | :return V: size of vocabulary 12 | """ 13 | tokenizer = Tokenizer() 14 | tokenizer.fit_on_texts(corpus) 15 | corpus_tokenized = tokenizer.texts_to_sequences(corpus) 16 | V = len(tokenizer.word_index) 17 | return corpus_tokenized, V 18 | 19 | 20 | def initialize(V, N): 21 | """ 22 | Initialize the weights of the neural network. 23 | :param V: size of the vocabulary 24 | :param N: size of the hidden layer 25 | :return: weights W1, W2 26 | """ 27 | np.random.seed(100) 28 | W1 = np.random.rand(V, N) 29 | W2 = np.random.rand(N, V) 30 | 31 | return W1, W2 32 | 33 | 34 | def corpus2io(corpus_tokenized, V, window_size): 35 | """Converts corpus text into context and center words 36 | # Arguments 37 | corpus_tokenized: corpus text 38 | window_size: size of context window 39 | # Returns 40 | context and center words (arrays) 41 | """ 42 | for words in corpus_tokenized: 43 | L = len(words) 44 | for index, word in enumerate(words): 45 | contexts = [] 46 | center = [] 47 | s = index - window_size 48 | e = index + window_size + 1 49 | contexts = contexts + [words[i]-1 for i in range(s, e) if 0 <= i < L and i != index] 50 | center.append(word-1) 51 | # x has shape c x V where c is size of contexts 52 | x = np_utils.to_categorical(contexts, V) 53 | # y has shape k x V where k is number of center words 54 | y = np_utils.to_categorical(center, V) 55 | yield (x, y) 56 | 57 | 58 | 59 | def softmax(x): 60 | """Calculate softmax based probability for given input vector 61 | # Arguments 62 | x: numpy array/list 63 | # Returns 64 | softmax of input array 65 | """ 66 | e_x = np.exp(x - np.max(x)) 67 | return e_x / e_x.sum(axis=0) -------------------------------------------------------------------------------- /py_word2vec/visualizeW2VPlot.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from builtins import zip 4 | 5 | import numpy as np 6 | from sklearn.decomposition import PCA 7 | from matplotlib import pyplot 8 | 9 | import matplotlib.pyplot as plt 10 | from sklearn.manifold import TSNE 11 | import gensim 12 | 13 | class visualizeW2VPlot: 14 | def __init__(self): 15 | name = 'visualizeW2VPlot' 16 | 17 | def load(self, modelFile): 18 | model = gensim.models.KeyedVectors.load_word2vec_format(modelFile, binary=True, unicode_errors='ignore') 19 | return model 20 | 21 | def visualizePCA(self, model): 22 | pyplot.rc('font', family='New Gulim') 23 | 24 | words = ['이재명', '문재인', '승인', '당', '핵', '평화', '정치인', '대표'] 25 | 26 | word_vectors = np.vstack([model[w] for w in words]) 27 | twodim = PCA().fit_transform(word_vectors)[:, :2] 28 | twodim.shape 29 | plt.figure(figsize=(5, 5)) 30 | plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r') 31 | for word, (x, y) in zip(words, twodim): 32 | plt.text(x, y, word) 33 | plt.axis('off'); 34 | 35 | fig1 = plt.gcf() 36 | plt.show() 37 | plt.draw() 38 | fig1.savefig('testPCA.png', dpi=100) 39 | 40 | 41 | def visualizeTSNE(self, model, word, vector_size): 42 | pyplot.rc('font', family='New Gulim') 43 | 44 | arr = np.empty((0, vector_size), dtype='f') 45 | word_labels = [word] 46 | 47 | # get close words 48 | close_words = model.similar_by_word(word,topn=20) 49 | 50 | # add the vector for each of the closest words to the array 
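# Usage sketch (not part of the original module): one CBOW forward step built from the helpers
# in py_word2vec/utils.py. The toy corpus, hidden size N=10 and window size 2 are illustrative.
import numpy as np
from py_word2vec.utils import tokenize, initialize, corpus2io, softmax

corpus_tokenized, V = tokenize(["I like playing football with my friends"])
W1, W2 = initialize(V, 10)

for x, y in corpus2io(corpus_tokenized, V, window_size=2):
    h = np.mean(np.dot(x, W1), axis=0)     # average of the one-hot contexts projected by W1
    y_pred = softmax(np.dot(h, W2))        # predicted distribution over the V vocabulary words
    print('center word index:', y.argmax(), 'predicted index:', y_pred.argmax())
    break                                  # one step is enough for the illustration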
51 | arr = np.append(arr, np.array([model[word]]), axis=0) 52 | for wrd_score in close_words: 53 | wrd_vector = model[wrd_score[0]] 54 | word_labels.append(wrd_score[0]) 55 | arr = np.append(arr, np.array([wrd_vector]), axis=0) 56 | 57 | # find tsne coords for 2 dimensions 58 | tsne = TSNE(n_components=2, random_state=0) 59 | np.set_printoptions(suppress=True) 60 | Y = tsne.fit_transform(arr) 61 | 62 | x_coords = Y[:, 0] 63 | y_coords = Y[:, 1] 64 | # display scatter plot 65 | plt.scatter(x_coords, y_coords) 66 | 67 | for label, x, y in zip(word_labels, x_coords, y_coords): 68 | plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points') 69 | plt.xlim(x_coords.min() + 0.00005, x_coords.max() + 0.00005) 70 | plt.ylim(y_coords.min() + 0.00005, y_coords.max() + 0.00005) 71 | plt.show() 72 | 73 | if __name__ == '__main__': 74 | model_file = '../embeddings/word2vec/korean_wiki_w2v.bin' 75 | model = visualizeW2VPlot().load(model_file) 76 | mode = 't-sne' # t-sne or pca 77 | if mode == 'pca': 78 | visualizeW2VPlot().visualizePCA(model) 79 | elif mode == 't-sne': 80 | vector_size = 300 81 | visualizeW2VPlot().visualizeTSNE(model, '이재명', vector_size) -------------------------------------------------------------------------------- /stopwords/stopwordsEng.txt: -------------------------------------------------------------------------------- 1 | photograph 2 | a 3 | about 4 | above 5 | after 6 | again 7 | against 8 | all 9 | am 10 | an 11 | and 12 | any 13 | are 14 | aren't 15 | as 16 | at 17 | be 18 | because 19 | been 20 | before 21 | being 22 | below 23 | between 24 | both 25 | but 26 | by 27 | can't 28 | cannot 29 | could 30 | couldn't 31 | did 32 | didn't 33 | do 34 | does 35 | doesn't 36 | doing 37 | don't 38 | down 39 | during 40 | each 41 | few 42 | for 43 | from 44 | further 45 | had 46 | hadn't 47 | has 48 | hasn't 49 | have 50 | haven't 51 | having 52 | he 53 | he'd 54 | he'll 55 | he's 56 | her 57 | here 58 | here's 59 | hers 60 | herself 61 | him 62 | himself 63 | his 64 | how 65 | how's 66 | i 67 | i'd 68 | i'll 69 | i'm 70 | i've 71 | if 72 | in 73 | into 74 | is 75 | isn't 76 | it 77 | it's 78 | its 79 | itself 80 | let's 81 | me 82 | more 83 | most 84 | mustn't 85 | my 86 | myself 87 | no 88 | nor 89 | not 90 | of 91 | off 92 | on 93 | once 94 | only 95 | or 96 | other 97 | ought 98 | our 99 | ours ourselves 100 | out 101 | over 102 | own 103 | same 104 | shan't 105 | she 106 | she'd 107 | she'll 108 | she's 109 | should 110 | shouldn't 111 | so 112 | some 113 | such 114 | than 115 | that 116 | that's 117 | the 118 | their 119 | theirs 120 | them 121 | themselves 122 | then 123 | there 124 | there's 125 | these 126 | they 127 | they'd 128 | they'll 129 | they're 130 | they've 131 | this 132 | those 133 | through 134 | to 135 | too 136 | under 137 | until 138 | up 139 | very 140 | was 141 | wasn't 142 | we 143 | we'd 144 | we'll 145 | we're 146 | we've 147 | were 148 | weren't 149 | what 150 | what's 151 | when 152 | when's 153 | where 154 | where's 155 | which 156 | while 157 | who 158 | who's 159 | whom 160 | why 161 | why's 162 | with 163 | won't 164 | would 165 | wouldn't 166 | you 167 | you'd 168 | you'll 169 | you're 170 | you've 171 | your 172 | yours 173 | yourself 174 | yourselves -------------------------------------------------------------------------------- /stopwords/stopwordsKor.txt: -------------------------------------------------------------------------------- 1 | 유전 2 | 단독_보조금 3 | 청원 4 | 모식도 5 | 이 6 | 라 7 | 의 8 | 네 9 | 은 10 | 야 11 | 아 12 | 있 13 | 하 14 | 것 
15 | 들 16 | 그 17 | 되 18 | 수 19 | 이 20 | 보 21 | 않 22 | 없 23 | 나 24 | 사람 25 | 주 26 | 아니 27 | 등 28 | 같 29 | 우리 30 | 때 31 | 년 32 | 가 33 | 한 34 | 지 35 | 대하 36 | 오 37 | 말 38 | 일 39 | 그렇 40 | 위하 41 | 때문 42 | 그것 43 | 두 44 | 말하 45 | 알 46 | 그러나 47 | 받 48 | 못하 49 | 일 50 | 그런 51 | 또 52 | 문제 53 | 더 54 | 사회 55 | 많 56 | 그리고 57 | 좋 58 | 크 59 | 따르 60 | 중 61 | 나오 62 | 가지 63 | 씨 64 | 시키 65 | 만들 66 | 지금 67 | 생각하 68 | 그러 69 | 속 70 | 하나 71 | 집 72 | 살 73 | 모르 74 | 적 75 | 월 76 | 데 77 | 자신 78 | 안 79 | 어떤 80 | 내 81 | 내 82 | 경우 83 | 명 84 | 생각 85 | 시간 86 | 그녀 87 | 다시 88 | 이런 89 | 앞 90 | 보이 91 | 번 92 | 나 93 | 다른 94 | 어떻 95 | 여자 96 | 개 97 | 전 98 | 들 99 | 사실 100 | 이렇 101 | 점 102 | 싶 103 | 말 104 | 정도 105 | 좀 106 | 원 107 | 잘 108 | 통하 109 | 소리 110 | 놓 111 | 동안 112 | 을 113 | 다음 114 | 연도 115 | 이상 116 | 위 117 | 아래 118 | 간 119 | 대 120 | 각종 121 | 후 122 | 반면 123 | 대부분 124 | 회 125 | 년대 126 | 조 127 | 포함 128 | 차 129 | 산하 130 | 바 131 | 이내 132 | 뿐 133 | 급 134 | 별지 135 | 량 136 | 초기 137 | 미만 138 | 관련 139 | 형 140 | 위주 141 | 외 142 | 한편 143 | 최대한 144 | 그동안 145 | 호 146 | 이후 147 | 과 148 | 당초 149 | 아래 150 | 실 151 | 앞으로 152 | 본래 153 | 이전 154 | 형 155 | 양 156 | 항 157 | 타 158 | 편 159 | 내외 160 | 카 161 | 마 162 | 개월 163 | 동 164 | 단 165 | 그중 166 | 예 167 | 자 168 | 곳 169 | 관련 170 | 르 171 | 다 172 | 척 173 | 분 174 | 선 175 | 칙 176 | 단 177 | 장 178 | 밖 179 | 유 180 | 이외 181 | 국 182 | 경 183 | 미 184 | 만 185 | 건 186 | 일부 187 | 업 188 | 직 189 | 그간 190 | 기 191 | 이하 192 | 이래 193 | 력 194 | 증 195 | 붙임 196 | 개월 197 | 기타 198 | 약간 199 | 향후 200 | 만큼 201 | 화 202 | 기존 203 | 가칭 204 | 보임 205 | 정 206 | 포괄 207 | 나머지 208 | 여명 209 | 실 210 | 올해 211 | 전년 212 | 생수 213 | 여타 214 | 최근 215 | 다수 216 | 추후 217 | 에 218 | 대한 219 | 통해 220 | 등에 221 | 개소 222 | 소 223 | 배 224 | 현 225 | 으 226 | 로 227 | 종 228 | 각각 229 | --------------------------------------------------------------------------------
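# Usage sketch (not part of the original files): the stopword lists above are plain
# one-word-per-line text files, so besides ptm.helper.StopwordFilter they can also be applied
# directly. The token list below is an illustrative example.
def load_stopwords(path):
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}

stopwords_kor = load_stopwords('stopwords/stopwordsKor.txt')
tokens = ['우리', '경제', '위기', '정부', '정책']
print([t for t in tokens if t not in stopwords_kor])    # ['경제', '위기', '정부', '정책']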