├── README.md ├── examples ├── bert_crf_ner_predictor.py ├── bert_crf_ner_trainer.py ├── bert_en_word_embeddings.py ├── bert_ko_word_embeddings.py ├── bert_ner_predictor.py ├── bert_ner_trainer.py ├── bert_sentiment_predictor.py ├── bert_sentiment_trainer.py ├── doc2vec_clustering.py ├── doc2vec_tester.py ├── doc2vec_trainer.py ├── fasttext_tester.py ├── fasttext_trainer.py ├── glove_tester.py ├── koreanKeywordTest.py ├── koreanLemmatizationTest.py ├── koreanNounExtractionTest.py ├── koreanSegmentationTest.py ├── koreanSpecialTokenizerTest.py ├── koreanTokenizerTest.py ├── naver_newscomments_processor.py ├── node2vec_tester.py ├── node2vec_traianer.py ├── scibert_ner_train.py ├── scibert_test.py ├── test222.py ├── test3.py ├── test4.py ├── testBertLSTM.py ├── testCooccurrence.py ├── testDocTermMatrix.py ├── testEXCo.py ├── testFirst.py ├── testMallet.py ├── testPMI.py ├── testSVM.py ├── test_document_clustering.py ├── test_korean_lemmatizer.py ├── test_pyTextMinerTopicModel.py ├── test_word2veclite.py ├── testt.py ├── word2vec_tester.py ├── word2vec_trainer.py └── zipfsManager.py ├── glove-win_devc_x64 ├── cooccur.exe ├── cooccurrence.bin ├── cooccurrence.shuf.bin ├── demo.bat ├── demo.sh ├── donald.txt ├── eval │ └── python │ │ ├── distance.py │ │ ├── evaluate.py │ │ └── word_analogy.py ├── glove.exe ├── pthreadVC2.dll ├── shuffle.exe └── vocab_count.exe ├── pyTextMiner ├── __init__.py ├── chunker │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── cooccurrence │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── cooccurrence.cpython-37.pyc │ └── cooccurrence.py ├── counter │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── graphml │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── helper │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── keyword │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── textrank.cpython-37.pyc │ └── textrank.py ├── lemmatizer │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── ngram │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── noun_extractor │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── pmi │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── segmentation │ ├── WordSemgmentationModelBuilder.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── lstmWordSegmentationModel.cpython-37.pyc │ │ └── wordSegmentationModelUtil.cpython-37.pyc │ ├── lstmWordSegmentationModel.py │ ├── lstmWordSegmentationModelBuilder.py │ ├── model │ │ ├── checkpoint │ │ ├── dic.pickle │ │ ├── segm.ckpt.data-00000-of-00001 │ │ ├── segm.ckpt.index │ │ └── segm.ckpt.meta │ └── wordSegmentationModelUtil.py ├── splitter │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── stemmer │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── tagger │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── tokenizer │ ├── __init__.py │ └── __pycache__ │ │ └── __init__.cpython-37.pyc ├── utility │ └── __init__.py └── version.py ├── py_bert ├── __init__.py ├── bert_classification_model.py ├── bert_dataset.py ├── bert_predictor.py ├── bert_trainer.py ├── bert_util.py ├── tokenization_kobert.py └── tokenization_korbert.py ├── py_doc2vec ├── . ... 
├── __init__.py └── doc2vecModel.py ├── py_document_classification ├── __init__.py ├── lasso_term_extraction.py ├── ml_textclassification.py └── test_ml_text_classfier.py ├── py_document_clustering ├── __init__.py └── documentclustering.py ├── py_ner ├── bert_bilstm_crf_ner_train.py ├── bert_crf_ner_prediction.py ├── bert_crf_ner_train.py ├── bert_crf_ner_visualization.py ├── bert_gru_crf_ner_train.py ├── bert_ner_prediction.py ├── bert_ner_train.py ├── bertviz │ ├── attention.py │ ├── head_view.js │ ├── head_view.py │ ├── model_view.js │ ├── model_view.py │ ├── neuron_view.js │ ├── neuron_view.py │ └── pytorch_transformers_attn │ │ ├── ... │ │ ├── file_utils.py │ │ ├── modeling_bert.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_openai.py │ │ ├── modeling_roberta.py │ │ ├── modeling_transfo_xl.py │ │ ├── modeling_transfo_xl_utilities.py │ │ ├── modeling_utils.py │ │ ├── modeling_xlm.py │ │ ├── modeling_xlnet.py │ │ ├── tokenization_bert.py │ │ ├── tokenization_gpt2.py │ │ ├── tokenization_openai.py │ │ ├── tokenization_roberta.py │ │ ├── tokenization_transfo_xl.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_xlm.py │ │ └── tokenization_xlnet.py ├── config │ ├── ... │ ├── config.json │ └── ner_to_index.json ├── data │ ├── conlleval │ ├── dataset_info.txt │ ├── eng.testa │ ├── eng.testb │ ├── eng.train │ ├── eng.train54019 │ ├── expo_kor.test │ ├── expo_kor.train │ ├── test.txt │ └── train.txt ├── data_utils │ ├── ... │ ├── ner_dataset.py │ ├── pad_sequence.py │ ├── utils.py │ └── vocab_tokenizer.py ├── find_learning_rate.py ├── kobert │ ├── ... │ ├── mxnet_kobert.py │ ├── pytorch_kobert.py │ └── utils.py ├── lstm_cnn_crf_evaluator.py ├── lstm_cnn_crf_model.py ├── lstm_cnn_crf_trainer.py ├── lstm_cnn_crf_utils.py ├── model │ ├── ... │ ├── net.py │ ├── optimizers.py │ └── utils.py ├── ner_crf.py ├── ner_crf_ko.py ├── ner_data_loader.py └── scibert_ner_train.py ├── py_node2vec └── node2vecModel.py ├── py_topic_model ├── MalletWrapper.py ├── __init__.py ├── gdmr_plot.py ├── lda.py ├── ldaInference.py ├── ldaSeqModel.py ├── ldaVisualizer.py ├── pyTextMinerTopicModel.py └── tfidf.py ├── py_word2vec ├── __init__.py ├── avgDocumentByW2V.py ├── gloveWikiKoreanTrainer.py ├── utils.py ├── visualizeW2V.py ├── visualizeW2VPlot.py ├── word2vecFilteringFunction.py ├── word2veclite.py └── word_embeddings.py └── stopwords ├── stopword_company.txt ├── stopwordsEng.txt └── stopwordsKor.txt /README.md: -------------------------------------------------------------------------------- 1 | # pyTextMiner 2 | A text mining tool for Korean and English 3 | 4 | pyTextMiner was originally designed as a teaching aid for my Text Mining class at Yonsei University and is written in Python. 5 | Prior to developing pyTextMiner, I built yTextMiner, a Java-based text mining tool for teaching, and used it in my MOOC and K-MOOC courses as well as the courses I taught at Yonsei University. 6 | 7 | The current version of pyTextMiner handles both English and Korean text, although the majority of its components target Korean. 8 | 9 | pyTextMiner follows a pipeline architecture in which each pipe takes care of one step of processing and representing the incoming text, which keeps text processing simple and modular. A minimal example is shown below. 10 | 11 | In the future, I plan to include preprocessing techniques for other languages such as Chinese, Japanese, and French.
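A minimal usage sketch of the pipeline idea, adapted from the scripts under examples/ in this repository (the Komoran pipeline, stopword file, and sample corpus path are taken from those examples; adjust the paths to your environment):

```python
import pyTextMiner as ptm

# Each pipe handles one step: sentence splitting, POS tagging,
# keeping nouns (NN*), dropping the POS tags, and removing stopwords.
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

corpus = ptm.CorpusFromFile('./data/sampleKor.txt')  # one document per line
result = pipeline.processCorpus(corpus)              # documents -> sentences -> tokens
print(result)
```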
12 | -------------------------------------------------------------------------------- /examples/bert_crf_ner_predictor.py: -------------------------------------------------------------------------------- 1 | from py_ner.bert_crf_ner_prediction import BertCRFNERPredictor 2 | 3 | model_dir = '../py_ner/experiments/base_model_with_crf_val' 4 | predictor = BertCRFNERPredictor(model_dir) 5 | 6 | 7 | tokenizer_path = "./ptr_lm_model/tokenizer_78b3253a26.model" 8 | 9 | #model name needs to be changed to the one you trained 10 | model_name = 'best-epoch-9-step-750-acc-0.980.bin' 11 | 12 | algorithm = 'bert_lstm_crf' 13 | if algorithm == 'bert_crf': 14 | checkpoint_file = './experiments/base_model_with_crf/' + model_name 15 | 16 | elif algorithm == 'bert_lstm_crf': 17 | checkpoint_file = './experiments/base_model_with_lstm_crf/' + model_name 18 | 19 | elif algorithm == 'bert_gru_crf': 20 | checkpoint_file = './experiments/base_model_with_gru_crf/' + model_name 21 | 22 | 23 | predictor.load_model(model_name=model_name, tokenizer_path=tokenizer_path, checkpoint_file=checkpoint_file) 24 | 25 | text = '오늘은 비도 오고 학생들이 졸려 보여서 나도 졸리운데 송강호의 괴물 영화나 볼까?' 26 | ne_text = predictor.predict(text) 27 | print(ne_text) -------------------------------------------------------------------------------- /examples/bert_crf_ner_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from py_ner.bert_bilstm_crf_ner_train import BertBiLstmCrfTrainer 4 | from py_ner.bert_crf_ner_train import BertCrfTrainer 5 | from shutil import copyfile 6 | 7 | data = '../py_ner/data' 8 | 9 | def make_dir(directory, parent_dir): 10 | # Path 11 | path = os.path.join(parent_dir, directory) 12 | 13 | # Create the directory 14 | try: 15 | os.makedirs(path, exist_ok=True) 16 | print("Directory '%s' created successfully" % directory) 17 | except OSError as error: 18 | print("Directory '%s' can not be created") 19 | 20 | algorithm = 'bert_gru_crf' #bert_crf, bert_lstm_crf, bert_gru_crf 21 | # we need two mandatory files in this new directory: config.json and ner_to_index.json 22 | if algorithm == 'bert_crf': 23 | model_dir = 'experiments/base_model_with_crf' 24 | make_dir(model_dir, "./") 25 | copyfile('../py_ner/config/config.json', model_dir+"/config.json") 26 | copyfile('../py_ner/config/ner_to_index.json', model_dir + "/ner_to_index.json") 27 | 28 | trainer = BertCrfTrainer(data_dir=data, model_dir=model_dir) 29 | elif algorithm == 'bert_lstm_crf': 30 | model_dir = 'experiments/base_model_with_lstm_crf' 31 | make_dir(model_dir, "./") 32 | copyfile('../py_ner/config/config.json', model_dir + "/config.json") 33 | copyfile('../py_ner/config/ner_to_index.json', model_dir + "/ner_to_index.json") 34 | 35 | trainer = BertBiLstmCrfTrainer(data_dir=data, model_dir=model_dir) 36 | elif algorithm == 'bert_gru_crf': 37 | model_dir = 'experiments/base_model_with_gru_crf' 38 | make_dir(model_dir, "./") 39 | copyfile('../py_ner/config/config.json', model_dir + "/config.json") 40 | copyfile('../py_ner/config/ner_to_index.json', model_dir + "/ner_to_index.json") 41 | 42 | trainer = BertBiLstmCrfTrainer(data_dir=data, model_dir=model_dir) 43 | 44 | trainer.data_loading() 45 | trainer.train() -------------------------------------------------------------------------------- /examples/bert_en_word_embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM 3 | from 
sklearn.metrics.pairwise import cosine_similarity 4 | 5 | #pip install pytorch-pretrained-bert 6 | # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows 7 | import logging 8 | logging.basicConfig(level=logging.INFO) 9 | 10 | import matplotlib.pyplot as plt 11 | 12 | # Load pre-trained model tokenizer (vocabulary) 13 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 14 | 15 | #1 Sentence Input: 16 | #text = "Here is the sentence I want embeddings for." 17 | text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank." 18 | marked_text = "[CLS] " + text + " [SEP]" 19 | print (marked_text) 20 | 21 | #We’ve imported a BERT-specific tokenizer, let’s take a look at the output: 22 | tokenized_text = tokenizer.tokenize(marked_text) 23 | print (tokenized_text) 24 | 25 | list(tokenizer.vocab.keys())[5000:5020] 26 | 27 | indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) 28 | 29 | for tup in zip(tokenized_text, indexed_tokens): 30 | print (tup) 31 | 32 | segments_ids = [1] * len(tokenized_text) 33 | print (segments_ids) 34 | 35 | # Convert inputs to PyTorch tensors 36 | tokens_tensor = torch.tensor([indexed_tokens]) 37 | segments_tensors = torch.tensor([segments_ids]) 38 | 39 | # Load pre-trained model (weights) 40 | model = BertModel.from_pretrained('bert-base-uncased') 41 | 42 | # Put the model in "evaluation" mode, meaning feed-forward operation. 43 | model.eval() 44 | 45 | # Predict hidden states features for each layer 46 | with torch.no_grad(): 47 | encoded_layers, _ = model(tokens_tensor, segments_tensors) 48 | 49 | print ("Number of layers:", len(encoded_layers)) 50 | layer_i = 0 51 | 52 | print ("Number of batches:", len(encoded_layers[layer_i])) 53 | batch_i = 0 54 | 55 | print ("Number of tokens:", len(encoded_layers[layer_i][batch_i])) 56 | token_i = 0 57 | 58 | print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i])) 59 | 60 | 61 | # For the 5th token in our sentence, select its feature values from layer 5. 62 | token_i = 5 63 | layer_i = 5 64 | vec = encoded_layers[layer_i][batch_i][token_i] 65 | 66 | # Plot the values as a histogram to show their distribution. 67 | plt.figure(figsize=(10,10)) 68 | plt.hist(vec, bins=200) 69 | plt.show() 70 | 71 | # Convert the hidden state embeddings into single token vectors 72 | 73 | # Holds the list of 12 layer embeddings for each token 74 | # Will have the shape: [# tokens, # layers, # features] 75 | token_embeddings = [] 76 | 77 | # For each token in the sentence... 78 | for token_i in range(len(tokenized_text)): 79 | 80 | # Holds 12 layers of hidden states for each token 81 | hidden_layers = [] 82 | 83 | # For each of the 12 layers... 
84 | for layer_i in range(len(encoded_layers)): 85 | # Lookup the vector for `token_i` in `layer_i` 86 | vec = encoded_layers[layer_i][batch_i][token_i] 87 | 88 | hidden_layers.append(vec) 89 | 90 | token_embeddings.append(hidden_layers) 91 | 92 | print('------------------------------------------------------------') 93 | 94 | # Sanity check the dimensions: 95 | print("Number of tokens in sequence:", len(token_embeddings)) 96 | print("Number of layers per token:", len(token_embeddings[0])) 97 | 98 | concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # [number_of_tokens, 3072] 99 | 100 | summed_last_4_layers = [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings] # [number_of_tokens, 768] 101 | 102 | sentence_embedding = torch.mean(encoded_layers[11], 1) 103 | 104 | print("Our final sentence embedding vector of shape:", sentence_embedding[0].shape[0]) 105 | 106 | print (text) 107 | for i,x in enumerate(tokenized_text): 108 | print (i,x) 109 | 110 | print ("First fifteen values of 'bank' as in 'bank robber':") 111 | print (summed_last_4_layers[10][:15]) 112 | 113 | print ("First fifteen values of 'bank' as in 'bank vault':") 114 | print(summed_last_4_layers[6][:15]) 115 | 116 | print ("First fifteen values of 'bank' as in 'river bank':") 117 | print(summed_last_4_layers[19][:15]) 118 | 119 | # Compare "bank" as in "bank robber" to "bank" as in "river bank" 120 | different_bank = cosine_similarity(summed_last_4_layers[10].reshape(1,-1), summed_last_4_layers[19].reshape(1,-1))[0][0] 121 | 122 | # Compare "bank" as in "bank robber" to "bank" as in "bank vault" 123 | same_bank = cosine_similarity(summed_last_4_layers[10].reshape(1,-1), summed_last_4_layers[6].reshape(1,-1))[0][0] 124 | 125 | print ("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'bank vault':", same_bank) 126 | 127 | print ("Similarity of 'bank' as in 'bank robber' to 'bank' as in 'river bank':", different_bank) 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /examples/bert_ner_predictor.py: -------------------------------------------------------------------------------- 1 | from pytorch_pretrained_bert import BertTokenizer 2 | 3 | import py_ner.bert_ner_prediction as prediction 4 | import torch 5 | import py_ner.lstm_cnn_crf_utils as utils 6 | 7 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 8 | json_file = './models/tag_info_kr.json' #tag_info_kr.json or tag_info_en.json 9 | tag2idx = utils.load_from_json(json_file) 10 | MAX_LEN = 160 11 | 12 | #text = 'this is a good John Smith as my friend' 13 | text = '이승만 대통령은 대한민국 박명환 대통령입니다.'
14 | 15 | #bert-base-multilingual-cased, bert-base-cased 16 | tokenizer_name = 'bert-base-multilingual-cased' 17 | tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=True) 18 | 19 | predictor = prediction.BERTNERPredictor() 20 | model_name = "bert_ner_kr.model" #bert_ner_kr.model or bert_ner_en.model 21 | predictor.load_model(model_name) 22 | predictions = predictor.predict_each(device, text, tokenizer, MAX_LEN, tag2idx) 23 | #pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions] 24 | print(str(predictions)) -------------------------------------------------------------------------------- /examples/bert_ner_trainer.py: -------------------------------------------------------------------------------- 1 | from pytorch_pretrained_bert import BertTokenizer 2 | 3 | import py_ner.bert_ner_train as train 4 | import torch 5 | import py_ner.lstm_cnn_crf_utils as utils 6 | 7 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 8 | trainer = train.BERTNERTrainer() 9 | 10 | mode = 'txt' 11 | if mode == 'csv': 12 | data = "../py_ner/data/ner.csv" 13 | else: 14 | #for Korean text for now 15 | data = '../py_ner/data/train.txt' 16 | 17 | sentences, tag2idx, labels = trainer.data_processing(data) 18 | 19 | #bert-base-multilingual-cased, bert-base-cased 20 | tokenizer_name = 'bert-base-multilingual-cased' 21 | tokenizer = trainer.tokenizer(tokenizer_name) 22 | trainer.data_loading(tokenizer,sentences,tag2idx,labels) 23 | classifier_model_name='bert-base-multilingual-cased' 24 | trainer.load_token_classifier(classifier_model_name,tag2idx) 25 | 26 | trainer.set_optimizer() 27 | 28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | trainer.train_epoch(device) 30 | 31 | trainer.eval(device, labels) 32 | 33 | language = "kr" 34 | trainer.save_model(language) 35 | -------------------------------------------------------------------------------- /examples/bert_sentiment_predictor.py: -------------------------------------------------------------------------------- 1 | from textwrap import wrap 2 | 3 | from sklearn.metrics import classification_report, confusion_matrix 4 | from transformers import BertForSequenceClassification 5 | 6 | from py_bert.bert_dataset import PYBERTDataset 7 | from py_bert.bert_classification_model import PYBERTClassifier 8 | from py_bert.bert_predictor import bert_predictor 9 | from py_bert.bert_trainer import PYBERTTrainer 10 | from py_bert.bert_util import create_data_loader, add_sentiment_label, convert_to_df, get_korean_tokenizer, show_confusion_matrix 11 | from transformers import BertModel, BertTokenizer 12 | from sklearn.model_selection import train_test_split 13 | 14 | from py_bert.tokenization_kobert import KoBertTokenizer 15 | import matplotlib.pyplot as plt 16 | import seaborn as sns 17 | 18 | import pyTextMiner as ptm 19 | import torch 20 | import pandas as pd 21 | 22 | #mode is either en or kr 23 | mode = 'kr' 24 | df = None 25 | 26 | if mode == 'en': 27 | df = pd.read_csv("../data/reviews.csv") 28 | df, class_names = add_sentiment_label(df) 29 | elif mode == 'kr': 30 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 31 | stopwords = '../stopwords/stopwordsKor.txt' 32 | input_file = '../data/ratings_test.txt' 33 | 34 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 35 | ptm.tokenizer.MeCab(mecab_path), 36 | ptm.lemmatizer.SejongPOSLemmatizer(), 37 | ptm.helper.SelectWordOnly(), 38 | ptm.helper.StopwordFilter(file=stopwords)) 39 | 40 | corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file, 
delimiter='\t', doc_index=1, class_index=2) 41 | 42 | documents = [] 43 | labels = [] 44 | result = pipeline.processCorpus(corpus) 45 | i = 1 46 | #below is just for a sample test 47 | for doc in result[1:500]: 48 | document = '' 49 | for sent in doc: 50 | for word in sent: 51 | document += word + ' ' 52 | documents.append(document.strip()) 53 | labels.append(corpus.pair_map[i]) 54 | i += 1 55 | 56 | df, class_names = convert_to_df(documents,labels) 57 | 58 | print(df.head()) 59 | print(df.info()) 60 | 61 | # Report the number of sentences. 62 | print('Number of test sentences: {:,}\n'.format(df.shape[0])) 63 | 64 | tokenizer = None 65 | # bert-base-multilingual-cased, bert-base-cased, monologg/kobert, monologg/distilkobert, bert_models/vocab_etri.list 66 | # bert_model_name='../bert_models/vocab_mecab.list' 67 | bert_model_name = 'monologg/kobert' 68 | tokenizer = get_korean_tokenizer(bert_model_name) 69 | 70 | #we need a better way of setting MAX_LEN 71 | MAX_LEN = 160 72 | 73 | predictor = bert_predictor() 74 | predictor.load_data(df, tokenizer, MAX_LEN) 75 | 76 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 77 | 78 | #algorithm and saved_training_model goes hand-in-hand 79 | algorithm='no_transformers' 80 | #saved_training_model = './model_save/best_model_state.bin' 81 | if algorithm=='transformers': 82 | saved_training_model = './model_save/best_model_state.bin' 83 | else: 84 | saved_training_model = './model_save/best_model_states.bin' 85 | 86 | predictor.load_model(saved_training_model) 87 | 88 | y_texts, y_pred, y_pred_probs, y_test = predictor.predict(device, algorithm=algorithm) 89 | print(y_pred) 90 | print(y_test) 91 | 92 | print(classification_report(y_test, y_pred, target_names=class_names)) 93 | cm = confusion_matrix(y_test, y_pred) 94 | df_cm = pd.DataFrame(cm, index=class_names, columns=class_names) 95 | show_confusion_matrix(df_cm) 96 | 97 | ''' 98 | for i, (a, b) in enumerate(zip(y_test, y_pred)): 99 | print(classification_report(a, b, target_names=class_names)) 100 | cm = confusion_matrix(y_test[i], y_pred[i]) 101 | df_cm = pd.DataFrame(cm, index=class_names, columns=class_names) 102 | show_confusion_matrix(df_cm) 103 | ''' 104 | 105 | #let’s have a look at an example from our test data: 106 | idx = 2 107 | text = y_texts[idx] 108 | true_sentiment = y_test[idx] 109 | pred_df = pd.DataFrame({ 110 | 'class_names': class_names, 111 | 'values': y_pred_probs[idx] 112 | }) 113 | print("\n".join(wrap(text))) 114 | print() 115 | print(f'True sentiment: {class_names[true_sentiment]}') 116 | print('\n') 117 | 118 | #we can look at the confidence of each sentiment of our model: 119 | sns.barplot(x='values', y='class_names', data=pred_df, orient='h') 120 | plt.ylabel('sentiment') 121 | plt.xlabel('probability') 122 | plt.xlim([0, 1]); 123 | plt.show() 124 | 125 | text = '정말 형편없네 ㅠㅠ 눈을 버렸어' 126 | prediction = predictor.predict_each(device,text,tokenizer,MAX_LEN, algorithm=algorithm) 127 | print(f'Review text: {text}') 128 | print(f'Sentiment : {class_names[prediction]}') 129 | 130 | #predictor.predict(device) 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /examples/bert_sentiment_trainer.py: -------------------------------------------------------------------------------- 1 | from transformers import BertForSequenceClassification 2 | 3 | from py_bert.bert_dataset import PYBERTDataset 4 | from py_bert.bert_classification_model import PYBERTClassifier, PYBERTClassifierGenAtten, 
PYBertForSequenceClassification 5 | from py_bert.bert_trainer import PYBERTTrainer 6 | from py_bert.bert_util import create_data_loader, add_sentiment_label, convert_to_df, get_korean_tokenizer 7 | from transformers import BertModel, BertTokenizer 8 | from sklearn.model_selection import train_test_split 9 | 10 | from py_bert.tokenization_kobert import KoBertTokenizer 11 | 12 | import pyTextMiner as ptm 13 | import torch 14 | import numpy as np 15 | import pandas as pd 16 | 17 | #mode is either en or kr 18 | mode = 'kr' 19 | df = None 20 | 21 | if mode == 'en': 22 | df = pd.read_csv("../data/reviews.csv") 23 | df, class_names = add_sentiment_label(df) 24 | elif mode == 'kr': 25 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 26 | stopwords = '../stopwords/stopwordsKor.txt' 27 | input_file = '../data/ratings_train.txt' 28 | 29 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 30 | ptm.tokenizer.MeCab(mecab_path), 31 | ptm.lemmatizer.SejongPOSLemmatizer(), 32 | ptm.helper.SelectWordOnly(), 33 | ptm.helper.StopwordFilter(file=stopwords)) 34 | 35 | corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file, delimiter='\t', doc_index=1, class_index=2) 36 | 37 | documents = [] 38 | labels = [] 39 | result = pipeline.processCorpus(corpus) 40 | i = 1 41 | 42 | #below is just for a sample test 43 | for doc in result[1:2000]: 44 | document = '' 45 | for sent in doc: 46 | for word in sent: 47 | document += word + ' ' 48 | documents.append(document.strip()) 49 | labels.append(corpus.pair_map[i]) 50 | i += 1 51 | 52 | df, class_names = convert_to_df(documents,labels) 53 | 54 | print(df.head()) 55 | print(df.info()) 56 | 57 | RANDOM_SEED = 42 58 | np.random.seed(RANDOM_SEED) 59 | torch.manual_seed(RANDOM_SEED) 60 | 61 | #we need a better way of setting MAX_LEN 62 | MAX_LEN = 160 63 | 64 | #split 65 | df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED) 66 | df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED) 67 | 68 | print(df_train.shape, df_val.shape, df_test.shape) 69 | 70 | tokenizer = None 71 | #bert-base-multilingual-cased, bert-base-cased, monologg/kobert, monologg/distilkobert, bert_models/vocab_etri.list 72 | #bert_model_name='../bert_models/vocab_mecab.list' 73 | bert_model_name='monologg/kobert' 74 | tokenizer =get_korean_tokenizer(bert_model_name) 75 | 76 | BATCH_SIZE = 16 77 | train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE) 78 | val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE) 79 | test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE) 80 | 81 | # print(str(train_data_loader.dataset.__getitem__(0))) 82 | data = next(iter(train_data_loader)) 83 | data.keys() 84 | 85 | print(data['input_ids'].shape) 86 | print(data['attention_mask'].shape) 87 | print(data['token_type_ids'].shape) 88 | print(data['targets'].shape) 89 | 90 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 91 | 92 | classifier = 'transformers' 93 | if classifier == 'basic': 94 | model = PYBERTClassifier(len(class_names), bert_model_name) 95 | elif classifier == 'attention': 96 | dr_rate = 0.3 97 | model = PYBERTClassifierGenAtten(len(class_names), bert_model_name, dr_rate=dr_rate) 98 | elif classifier == 'transformers': 99 | model = PYBertForSequenceClassification(len(class_names), bert_model_name).__call__() 100 | 101 | model = model.to(device) 102 | 103 | algorithm='transformers' #transformers or non_transformers 104 | if algorithm =='transformers': 
105 | torch_model_name='best_model_state.bin' 106 | else: 107 | torch_model_name = 'best_model_states.bin' 108 | 109 | #BERT authors suggests epoch from 2 to 4 110 | num_epochs = 2 111 | trainer = PYBERTTrainer() 112 | trainer.train(model, device, train_data_loader, val_data_loader, 113 | df_val, df_train, tokenizer, num_epochs=num_epochs, algorithm=algorithm, torch_model_name=torch_model_name) 114 | 115 | trainer.summanry_training_stats() 116 | 117 | trainer.visualize_performance() 118 | -------------------------------------------------------------------------------- /examples/doc2vec_clustering.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from sklearn.cluster import KMeans 4 | 5 | from py_doc2vec.doc2vecModel import Doc2VecTrainer, Doc2VecSimilarity 6 | import logging 7 | import pyTextMiner as ptm 8 | import csv 9 | import sys 10 | from py_document_clustering.documentclustering import DocumentClustering 11 | import matplotlib.pyplot as plt 12 | from sklearn.cluster import KMeans 13 | from sklearn.decomposition import PCA 14 | 15 | model_file = './tmp/1595123417030_pv_dma_dim=100_window=5_epochs=20/doc2vec.model' 16 | doc2vec = Doc2VecSimilarity() 17 | doc2vec.load_model(model_file) 18 | model = doc2vec.get_model() 19 | # name either k-means, agglo, spectral_cocluster 20 | name = 'spectral_cocluster' 21 | clustering = DocumentClustering(k=3) 22 | # n_components means the number of words to be used as features 23 | clustering.make_matrix(n_components=-1, doc2vec_matrix=model.docvecs.vectors_docs) 24 | clustering.cluster(name) 25 | 26 | clustering.visualize() 27 | -------------------------------------------------------------------------------- /examples/doc2vec_tester.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | from doc2vec.doc2vecModel import Doc2VecSimilarity 3 | import logging 4 | import pyTextMiner as ptm 5 | 6 | model_file = '../doc2vec/tmp/1594484106304_pv_dma_dim=100_window=5_epochs=20/doc2vec.model' 7 | doc2vec = Doc2VecSimilarity() 8 | doc2vec.load_model(model_file) 9 | 10 | test_sample = '한국 경제가 위기에 처하다' 11 | # Convert the sample document into a list and use the infer_vector method to get a vector representation for it 12 | new_doc_words = test_sample.split() 13 | similars = doc2vec.most_similar(test_sample) 14 | for sim in similars: 15 | print(str(sim)) 16 | 17 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 18 | # stopwords file path 19 | stopwords = '../stopwords/stopwordsKor.txt' 20 | 21 | test_sample1 = '중국 시장은 위축되었다' 22 | 23 | pipeline = ptm.Pipeline(ptm.tokenizer.MeCab(mecab_path), 24 | ptm.lemmatizer.SejongPOSLemmatizer(), 25 | ptm.helper.SelectWordOnly(), 26 | ptm.helper.StopwordFilter(file=stopwords)) 27 | 28 | doc_vec1 = pipeline.processCorpus([test_sample]) 29 | doc_vec2 = pipeline.processCorpus([test_sample1]) 30 | 31 | print(doc_vec1[0]) 32 | print(doc_vec2[0]) 33 | 34 | # use the most_similar utility to find the most similar documents. 
35 | similarity = doc2vec.compute_similarity_vec(first_vec=doc_vec1[0], second_vec=doc_vec2[0]) 36 | print('similarity between two document: ') 37 | print(str(similarity)) 38 | 39 | 40 | -------------------------------------------------------------------------------- /examples/doc2vec_trainer.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | from doc2vec.doc2vecModel import Doc2VecTrainer 3 | import logging 4 | import pyTextMiner as ptm 5 | from gensim.models.doc2vec import TaggedDocument 6 | #pv_dmc, pv_dma, pv_dbow 7 | algorithm = 'pv_dma' 8 | # ignores all words with total frequency lower than this 9 | vocab_min_count = 10 10 | # word and document vector siz 11 | dim = 100 12 | # window size 13 | window = 5 14 | #number of training epochs 15 | epochs = 20 16 | # initial learning rate 17 | alpha = 0.025 18 | # learning rate will linearly drop to min_alpha as training progresses 19 | min_alpha = 0.001 20 | # number of cores to train on 21 | cores = 2 22 | # number of cores to train on 23 | train = True 24 | 25 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 26 | 27 | # stopwords file path 28 | stopwords = '../stopwords/stopwordsKor.txt' 29 | # train documents input path 30 | input_path = '../data/donald.txt' 31 | # output base directory 32 | output_base_dir = './tmp' 33 | 34 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 35 | ptm.tokenizer.MeCab(mecab_path), 36 | ptm.lemmatizer.SejongPOSLemmatizer(), 37 | ptm.helper.SelectWordOnly(), 38 | ptm.helper.StopwordFilter(file=stopwords)) 39 | 40 | corpus = ptm.CorpusFromFile(input_path) 41 | documents = [] 42 | result = pipeline.processCorpus(corpus) 43 | i = 0 44 | for doc in result: 45 | document = [] 46 | for sent in doc: 47 | for word in sent: 48 | document.append(word) 49 | documents.append(TaggedDocument(document, [i])) 50 | i += 1 51 | 52 | #--epochs 40 --vocab-min-count 10 data/stopwords_german.txt dewiki-preprocessed.txt /tmp/models/doc2vec-dewiki 53 | 54 | doc2vec = Doc2VecTrainer() 55 | logging.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s', level=logging.INFO) 56 | doc2vec.run(documents, output_base_dir=output_base_dir, vocab_min_count=vocab_min_count, 57 | num_epochs=epochs, algorithm=algorithm, vector_size=dim, alpha=alpha, 58 | min_alpha=min_alpha, train=train, window=window, cores=cores) -------------------------------------------------------------------------------- /examples/fasttext_tester.py: -------------------------------------------------------------------------------- 1 | from word2vec.word_embeddings import FastText 2 | 3 | fasttext = FastText() 4 | binary=True 5 | model_file = 'fasttext.bin' 6 | fasttext.load_model(model_file) 7 | mode = 'jamo_split' 8 | print(fasttext.most_similar(mode, positives=['이재명', '경제'], negatives=['정치인'], topn=10)) 9 | #print(fasttext.most_similar(mode, positives=['이재명'], negatives=[], topn=10)) 10 | 11 | print('-----------------------------------') 12 | 13 | print(fasttext.similar_by_word(mode, '이재명')) -------------------------------------------------------------------------------- /examples/fasttext_trainer.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word_embeddings import FastText 3 | 4 | fasttext = FastText() 5 | mode = 'jamo_split_filtered' 6 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 7 | stopword_file = '../stopwords/stopwordsKor.txt' 8 | files = [] 9 | files.append('../data/donald.txt') 10 | is_directory=False 11 | doc_index=2 12 | max=-1 
13 | fasttext.preprocessing(mode,mecab_path,stopword_file,files,is_directory,doc_index,max) 14 | 15 | min_count=1 16 | window=5 17 | size=50 18 | negative=5 19 | fasttext.train(min_count, window, size, negative) 20 | 21 | model_file = 'fasttext.bin' 22 | binary=True; 23 | fasttext.save_model(model_file) 24 | 25 | 26 | -------------------------------------------------------------------------------- /examples/glove_tester.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from word2vec.word_embeddings import GloVe 4 | 5 | glove = GloVe() 6 | binary=True 7 | model_file = '../glove-win_devc_x64/vectors.txt' 8 | glove.load_model(model_file) 9 | print(glove.most_similars(positives=['이재명', '경제'], negatives=['정치인'], topn=10)) 10 | 11 | print('-----------------------------------') 12 | 13 | print(glove.most_similar('이재명')) -------------------------------------------------------------------------------- /examples/koreanKeywordTest.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | min_count = 5 # 단어의 최소 출현 빈도수 (그래프 생성 시) 4 | max_length = 20 # 단어의 최대 길이 5 | beta = 0.95 6 | max_iter = 20 7 | verbose = True 8 | num_words=30 9 | keyword_extractor=ptm.keyword.KeywordExtractionKorean(min_count,max_length,beta,max_iter,verbose,num_words) 10 | 11 | sents = ['최순실 씨가 외국인투자촉진법 개정안 통과와 예산안 반영까지 꼼꼼이 챙긴 건데, 이른바 외촉법, 어떤 법이길래 최 씨가 열심히 챙긴 걸까요. 자신의 이해관계와 맞아 떨어지는 부분이 없었는지 취재기자와 한걸음 더 들여다보겠습니다. 이서준 기자, 우선 외국인투자촉진법 개정안, 어떤 내용입니까?', 12 | '한마디로 대기업이 외국 투자를 받아 계열사를 설립할 때 규제를 완화시켜 주는 법안입니다. 대기업 지주사의 손자 회사가 이른바 증손회사를 만들 때 지분 100%를 출자해야 합니다. 대기업의 문어발식 계열사 확장을 막기 위한 조치인데요. 외촉법 개정안은 손자회사가 외국 투자를 받아서 증손회사를 만들 땐 예외적으로 50% 지분만 투자해도 되게끔 해주는 내용입니다.', 13 | '그만큼 쉽게 완화해주는 거잖아요. 그때 기억을 더듬어보면 야당의 반발이 매우 심했습니다. 그 이유가 뭐였죠? ', 14 | '대기업 특혜 법안이라는 취지였는데요. (당연히 그랬겠죠.) 당시 박영선 의원의 국회 발언을 들어보시겠습니다. [박영선 의원/더불어민주당 (2013년 12월 31일) : 경제의 근간을 흔드는 법을 무원칙적으로 이렇게 특정 재벌 회사에게 특혜를 주기 위해 간청하는 민원법을 우리가 새해부터 왜 통과시켜야 합니까.]', 15 | '최순실 씨 사건을 쫓아가다 보면 본의 아니게 이번 정부의 과거로 올라가면서 복기하는 듯한 느낌이 드는데 이것도 바로 그중 하나입니다. 생생하게 기억합니다. 이 때 장면들은. 특정 재벌 회사를 위한 특혜라고 말하는데, 어떤 기업을 말하는 건가요?', 16 | 'SK와 GS 입니다. 개정안이 통과되는 걸 전제로 두 회사는 외국 투자를 받아 증손회사 설립을 진행중이었기 때문인데요. 당시 개정안이 통과되지 않으면 두 기업이 수조원의 손실이 생길 수 있는 것으로 알려져 있었습니다. 허창수 GS 회장과 김창근 SK회장은 2013년 8월 박 대통령과 청와대에서 대기업 회장단 오찬자리에서 외촉법 통과를 요청한 바도 있습니다. ', 17 | '물론 두 기업과 최순실 씨와 연결고리가 나온 건 아니지만, 정 전 비서관 녹취파일 속 최 씨는 외촉법에 상당히 집착을 하는 걸로 보이긴 합니다.', 18 | '네 그렇습니다. 통화 내용을 다시 짚어보면요. 최 씨는 외촉법 관련 예산이 12월 2일, 반드시 되어야 한다, 작년 예산으로 돼서는 안 된다고 얘기하고 있는데요. 다시 말해서 외촉법 관련 예산안이 내년에 반영되어야 한다고 압박을 하고 있는 겁니다. 그러면서 "국민을 볼모로 잡고 있다"며 "국회와 정치권에 책임을 묻겠다"고 으름장까지 놓고 있는데요. 매우 집착하는 모습인데요. 이에 대해서 정 전 비서관이 "예산이 그렇게 빨리 통과된 적 없습니다"고 말하자 말을 끊으면서 매우 흥분한 듯, "그렇더라도, 그렇더라도" 하면서 "야당이 공약 지키라고 하면서 협조는 안 한다", "대통령으로 할 수 있는 일이 없다", "불공정 사태와 난맥상이 나온다"며 굉장한 압박까지 하고 있습니다.', 19 | '이 얘기들만 들여다봐도 마치 본인이 대통령처럼 얘기하고 있습니다. 내용들 보면 그렇지 않습니까? 혹시 최 씨가 이 외촉법 통과로 이득을 본 경우도 있습니까. ', 20 | '최 씨가 입김을 넣어 차은택 씨가 주도를 한 걸로 알려진 K컬처밸리 사업이 그렇다는 얘기가 나오고 있습니다. 외촉법을 편법으로 활용해 1% 금리를 적용받았다는 지적이 나오고 있습니다. 본격 사업이 추진되기 전 최순실 국정개입 사건이 터지기는 했지만, 이외에도 다른 혜택을 받았는지는 조사가 필요해 보입니다. ', 21 | '그런데 녹취파일을 보면 "남자1"이 등장합니다. 이 사람은 누구입니까?', 22 | '정 전 비서관을 "정 과장님"으로 부르며 반말을 하는 남자인데요. 최순실 씨처럼 정 전 비서관을 하대하고 있습니다. 또 청와대 내부 정보를 알고 있는 듯하고 또 인사에까지 개입하려고 하고 있습니다. 그렇기 때문에 정윤회 씨로 추정은 됩니다만 확인은 되지 않습니다.' 
23 | ] 24 | 25 | keyword=keyword_extractor(sents) 26 | for word, r in sorted(keyword.items(), key=lambda x: x[1], reverse=True)[:30]: 27 | print('%8s:\t%.4f' % (word, r)) 28 | 29 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2) 30 | 31 | # import nltk 32 | # nltk.download() 33 | # 단어 단위로 분리했으니 이제 stopwords를 제거하는게 가능합니다. ptm.helper.StopwordFilter를 사용하여 불필요한 단어들을 지워보도록 하겠습니다. 34 | # 그리고 파이프라인 뒤에 ptm.stemmer.Porter()를 추가하여 어근 추출을 해보겠습니다. 35 | # 한번 코드를 고쳐서 ptm.stemmer.Lancaster()도 사용해보세요. Lancaster stemmer가 Porter stemmer와 어떻게 다른지 비교하면 재미있을 겁니다. 36 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 37 | ptm.helper.POSFilter('NN*'), 38 | ptm.helper.SelectWordOnly(), 39 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 40 | result = pipeline.processCorpus(corpus) 41 | print(result) 42 | print() 43 | 44 | documents=[] 45 | for doc in result: 46 | document='' 47 | for sent in doc: 48 | document = " ".join(sent) 49 | documents.append(document) 50 | 51 | 52 | keyword_extractor1=ptm.keyword.KeywordExtractionKorean(min_count,max_length,beta,max_iter,verbose,num_words) 53 | keyword1=keyword_extractor1(documents) 54 | for word, r in sorted(keyword1.items(), key=lambda x: x[1], reverse=True)[:30]: 55 | print('%8s:\t%.4f' % (word, r)) -------------------------------------------------------------------------------- /examples/koreanLemmatizationTest.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | 4 | korean_lemmatizer=ptm.lemmatizer.KoreanLemmatizer() 5 | 6 | test = [ 7 | ('모', '았다'), 8 | ('하', '다'), 9 | ('서툰', ''), 10 | ('와서', ''), 11 | ('내려논', ''), 12 | ] 13 | 14 | for l, r in test: 15 | print('({}, {}) -> {}'.format(l, r, korean_lemmatizer(l + r))) 16 | # print(_lemma_candidate(l, r), end='\n\n') 17 | -------------------------------------------------------------------------------- /examples/koreanNounExtractionTest.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | 4 | #corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2) 5 | corpus=ptm.CorpusFromFile('./data/134963_norm.txt') 6 | # import nltk 7 | # nltk.download() 8 | # 단어 단위로 분리했으니 이제 stopwords를 제거하는게 가능합니다. ptm.helper.StopwordFilter를 사용하여 불필요한 단어들을 지워보도록 하겠습니다. 9 | # 그리고 파이프라인 뒤에 ptm.stemmer.Porter()를 추가하여 어근 추출을 해보겠습니다. 10 | # 한번 코드를 고쳐서 ptm.stemmer.Lancaster()도 사용해보세요. Lancaster stemmer가 Porter stemmer와 어떻게 다른지 비교하면 재미있을 겁니다. 
11 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 12 | ptm.helper.POSFilter('NN*'), 13 | ptm.helper.SelectWordOnly(), 14 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 15 | result = pipeline.processCorpus(corpus) 16 | print(result) 17 | print() 18 | 19 | documents=[] 20 | for doc in result: 21 | document='' 22 | for sent in doc: 23 | document = " ".join(sent) 24 | documents.append(document) 25 | 26 | #2016-10-20.txt 27 | corpus1=ptm.CorpusFromFile('./data/2016-10-20.txt') 28 | noun_extractor=ptm.noun_extractor.NounExtractionKorean(corpus1) 29 | sent='두바이월드센터시카고옵션거래소' 30 | result=noun_extractor.__call__(sent) 31 | print(result) -------------------------------------------------------------------------------- /examples/koreanSegmentationTest.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | test='이건진짜좋은영화라라랜드진짜좋은영화' 4 | 5 | model_path='./model/korean_segmentation_model.crfsuite' 6 | segmentation=ptm.segmentation.SegmentationKorean(model_path) 7 | correct=segmentation(test) 8 | print(correct) 9 | 10 | chatspace_segmentation=ptm.segmentation.ChatSpaceSegmentationKorean() 11 | chatspace_correct=chatspace_segmentation(test) 12 | print(chatspace_correct) 13 | 14 | lstm_model_path='./pyTextMiner/segmentation/model' 15 | lstm_segmentation=ptm.segmentation.LSTMSegmentationKorean(lstm_model_path) 16 | lstm_correct=lstm_segmentation(test) 17 | print(lstm_correct) 18 | 19 | lstm_segmentation.close() -------------------------------------------------------------------------------- /examples/koreanSpecialTokenizerTest.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | #tokenize by subwords 4 | scores = {'파스': 0.3, '파스타': 0.7, '좋아요': 0.2, '좋아':0.5} 5 | tokenizer = ptm.tokenizer.MaxScoreTokenizerKorean(scores=scores) 6 | tokens = tokenizer.inst.tokenize('파스타가좋아요') 7 | print(str(tokens)) 8 | 9 | #띄어쓰기가 잘 되어 있는 한국어 문서의 경우에는 MaxScoreTokenizer를 이용할 필요가 없다. 10 | # 한국어는 L+[R] 구조이기 때문이다 11 | # 이 때에는 한 어절의 왼쪽에서부터 글자 점수가 가장 높은 부분을 기준으로 토크나이징을 한다 12 | scores = {'데이':0.5, '데이터':0.5, '데이터마이닝':0.5, '공부':0.5, '공부중':0.45} 13 | tokenizer = ptm.tokenizer.LTokenizerKorean(scores=scores) 14 | print('\nflatten=True\nsent = 데이터마이닝을 공부한다') 15 | print(tokenizer.inst.tokenize('데이터마이닝을 공부한다')) 16 | 17 | print('\nflatten=False\nsent = 데이터마이닝을 공부한다') 18 | print(tokenizer.inst.tokenize('데이터마이닝을 공부한다', flatten=False)) 19 | 20 | print('\nflatten=False\nsent = 데이터분석을 위해서 데이터마이닝을 공부한다') 21 | print(tokenizer.inst.tokenize('데이터분석을 위해서 데이터마이닝을 공부한다', flatten=False)) 22 | 23 | print('\nflatten=True\nsent = 데이터분석을 위해서 데이터마이닝을 공부한다') 24 | print(tokenizer.inst.tokenize('데이터분석을 위해서 데이터마이닝을 공부한다')) 25 | 26 | #Tolerance는 한 어절에서 subword 들의 점수의 차이가 그 어절의 점수 최대값과 tolerance 이하로 난다면, 길이가 가장 긴 어절을 선택한다. 27 | # CohesionProbability에서는 합성명사들은 각각의 요소들보다 낮기 때문에 tolerance를 이용할 수 있다. 28 | # 29 | print('tolerance=0.0\nsent = 데이터마이닝을 공부중이다') 30 | print(tokenizer.inst.tokenize('데이터마이닝을 공부중이다')) 31 | 32 | print('\ntolerance=0.1\nsent = 데이터마이닝을 공부중이다') 33 | print(tokenizer.inst.tokenize('데이터마이닝을 공부중이다', tolerance=0.1)) 34 | 35 | #RegexTokenizer는 regular extression을 이용하여 언어가 달라지는 순간들을 띄어쓴다. 36 | # 영어의 경우에는 움라우트가 들어가는 경우들이 있어서 알파벳 뿐 아니라 라틴까지 포함하였다. 
37 | tokenizer = ptm.tokenizer.RegexTokenizerKorean() 38 | 39 | sents = [ 40 | '이렇게연속된문장은잘리지않습니다만', 41 | '숫자123이영어abc에섞여있으면ㅋㅋ잘리겠죠', 42 | '띄어쓰기가 포함되어있으면 이정보는10점!꼭띄워야죠' 43 | ] 44 | 45 | for sent in sents: 46 | print(' %s\n->%s\n' % (sent, tokenizer.inst.tokenize(sent))) 47 | -------------------------------------------------------------------------------- /examples/koreanTokenizerTest.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | import time 4 | from collections import Counter 5 | 6 | mecab_path='C:\\mecab\\mecab-ko-dic' 7 | komoran = ptm.tokenizer.Komoran() 8 | kkma = ptm.tokenizer.KokomaKorean() 9 | twitter = ptm.tokenizer.TwitterKorean() 10 | mecab = ptm.tokenizer.MeCab(mecab_path) 11 | 12 | sent = '물론 두 기업과 최순실 씨와 연결고리가 나온 건 아니지만, 정 전 비서관 녹취파일 속 최 씨는 외촉법에 상당히 집착을 하는 걸로 보이긴 합니다.' 13 | taggers = [komoran, kkma, twitter, mecab] 14 | names = 'komoran kkma twitter mecab'.split() 15 | for tagger in taggers: 16 | pos = tagger.inst.pos(sent) 17 | print(str(pos)) 18 | 19 | #performance (speed) measure 20 | sents = ['최순실 씨가 외국인투자촉진법 개정안 통과와 예산안 반영까지 꼼꼼이 챙긴 건데, 이른바 외촉법, 어떤 법이길래 최 씨가 열심히 챙긴 걸까요. 자신의 이해관계와 맞아 떨어지는 부분이 없었는지 취재기자와 한걸음 더 들여다보겠습니다. 이서준 기자, 우선 외국인투자촉진법 개정안, 어떤 내용입니까?', 21 | '한마디로 대기업이 외국 투자를 받아 계열사를 설립할 때 규제를 완화시켜 주는 법안입니다. 대기업 지주사의 손자 회사가 이른바 증손회사를 만들 때 지분 100%를 출자해야 합니다. 대기업의 문어발식 계열사 확장을 막기 위한 조치인데요. 외촉법 개정안은 손자회사가 외국 투자를 받아서 증손회사를 만들 땐 예외적으로 50% 지분만 투자해도 되게끔 해주는 내용입니다.', 22 | '그만큼 쉽게 완화해주는 거잖아요. 그때 기억을 더듬어보면 야당의 반발이 매우 심했습니다. 그 이유가 뭐였죠? ', 23 | '대기업 특혜 법안이라는 취지였는데요. (당연히 그랬겠죠.) 당시 박영선 의원의 국회 발언을 들어보시겠습니다. [박영선 의원/더불어민주당 (2013년 12월 31일) : 경제의 근간을 흔드는 법을 무원칙적으로 이렇게 특정 재벌 회사에게 특혜를 주기 위해 간청하는 민원법을 우리가 새해부터 왜 통과시켜야 합니까.]', 24 | '최순실 씨 사건을 쫓아가다 보면 본의 아니게 이번 정부의 과거로 올라가면서 복기하는 듯한 느낌이 드는데 이것도 바로 그중 하나입니다. 생생하게 기억합니다. 이 때 장면들은. 특정 재벌 회사를 위한 특혜라고 말하는데, 어떤 기업을 말하는 건가요?', 25 | 'SK와 GS 입니다. 개정안이 통과되는 걸 전제로 두 회사는 외국 투자를 받아 증손회사 설립을 진행중이었기 때문인데요. 당시 개정안이 통과되지 않으면 두 기업이 수조원의 손실이 생길 수 있는 것으로 알려져 있었습니다. 허창수 GS 회장과 김창근 SK회장은 2013년 8월 박 대통령과 청와대에서 대기업 회장단 오찬자리에서 외촉법 통과를 요청한 바도 있습니다. ', 26 | '물론 두 기업과 최순실 씨와 연결고리가 나온 건 아니지만, 정 전 비서관 녹취파일 속 최 씨는 외촉법에 상당히 집착을 하는 걸로 보이긴 합니다.', 27 | '네 그렇습니다. 통화 내용을 다시 짚어보면요. 최 씨는 외촉법 관련 예산이 12월 2일, 반드시 되어야 한다, 작년 예산으로 돼서는 안 된다고 얘기하고 있는데요. 다시 말해서 외촉법 관련 예산안이 내년에 반영되어야 한다고 압박을 하고 있는 겁니다. 그러면서 "국민을 볼모로 잡고 있다"며 "국회와 정치권에 책임을 묻겠다"고 으름장까지 놓고 있는데요. 매우 집착하는 모습인데요. 이에 대해서 정 전 비서관이 "예산이 그렇게 빨리 통과된 적 없습니다"고 말하자 말을 끊으면서 매우 흥분한 듯, "그렇더라도, 그렇더라도" 하면서 "야당이 공약 지키라고 하면서 협조는 안 한다", "대통령으로 할 수 있는 일이 없다", "불공정 사태와 난맥상이 나온다"며 굉장한 압박까지 하고 있습니다.', 28 | '이 얘기들만 들여다봐도 마치 본인이 대통령처럼 얘기하고 있습니다. 내용들 보면 그렇지 않습니까? 혹시 최 씨가 이 외촉법 통과로 이득을 본 경우도 있습니까. ', 29 | '최 씨가 입김을 넣어 차은택 씨가 주도를 한 걸로 알려진 K컬처밸리 사업이 그렇다는 얘기가 나오고 있습니다. 외촉법을 편법으로 활용해 1% 금리를 적용받았다는 지적이 나오고 있습니다. 본격 사업이 추진되기 전 최순실 국정개입 사건이 터지기는 했지만, 이외에도 다른 혜택을 받았는지는 조사가 필요해 보입니다. ', 30 | '그런데 녹취파일을 보면 "남자1"이 등장합니다. 이 사람은 누구입니까?', 31 | '정 전 비서관을 "정 과장님"으로 부르며 반말을 하는 남자인데요. 최순실 씨처럼 정 전 비서관을 하대하고 있습니다. 또 청와대 내부 정보를 알고 있는 듯하고 또 인사에까지 개입하려고 하고 있습니다. 그렇기 때문에 정윤회 씨로 추정은 됩니다만 확인은 되지 않습니다.' 
32 | ] 33 | 34 | tokens = [] 35 | 36 | for name, tagger in zip(names, taggers): 37 | t = time.time() 38 | tokens.append( 39 | [pos for sent in sents for pos in tagger.inst.pos(sent)] 40 | ) 41 | t = time.time() - t 42 | 43 | print('{:8}: {:.3f} secs'.format(name, t)) 44 | 45 | #print first 15 words in the first sentence --> example of out of vocabulary problem 46 | print(tokens[0][:15]) 47 | print('\n\n') 48 | 49 | #word frequency calculation 50 | counter = Counter(tokens[0]) 51 | counter = { 52 | word:freq for word, freq in counter.items() 53 | if (freq >= 4) and (word[1][:2] == 'NN') 54 | } 55 | print(sorted(counter.items(), key=lambda x:-x[1])) 56 | 57 | print('\n\n') 58 | 59 | #using all three POS tokenizer 60 | for name, tokens_ in zip(names, tokens): 61 | 62 | print('\n\nTagger name = {}'.format(name)) 63 | 64 | counter = Counter(tokens_) 65 | counter = {word:freq for word, freq in counter.items() 66 | if (freq >= 4) and (word[1][:1] == 'N')} 67 | 68 | print(sorted(counter.items(), key=lambda x:x[1], reverse=True)) 69 | 70 | -------------------------------------------------------------------------------- /examples/naver_newscomments_processor.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from time import time 3 | 4 | import gensim 5 | import pyTextMiner as ptm 6 | from gensim.models import Word2Vec 7 | 8 | cores = multiprocessing.cpu_count() # Count the number of cores in a computer 9 | 10 | print('Start reading the dataset 1....') 11 | path = '/usr/local/lib/mecab/dic/mecab-ko-dic' 12 | 13 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 14 | ptm.tokenizer.MeCab(path), 15 | ptm.lemmatizer.SejongPOSLemmatizer(), 16 | ptm.helper.SelectWordOnly(), 17 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 18 | 19 | corpus = ptm.CorpusFromFieldDelimitedEmojiFile('/Data/ko_sns_comments/xab',1) 20 | result1 = pipeline.processCorpus(corpus) 21 | 22 | print ('Finish processing... 
') 23 | 24 | i = 0 25 | file = open("naver_comments15_16_filtered.txt", "a+") 26 | for doc in result1: 27 | if i % 10000 == 0: 28 | print('processing ' + str(i)) 29 | i += 1 30 | document = '' 31 | for sent in doc: 32 | for word in sent: 33 | document += word + ' ' 34 | file.write(document.strip() + '\n') 35 | 36 | file.close() 37 | print('Document size for the total dataset: ' + str(i)) 38 | 39 | -------------------------------------------------------------------------------- /examples/node2vec_tester.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | from py_node2vec.node2vecModel import Node2VecModel 3 | 4 | embedding_filename='./node2vec.emb' 5 | n2vec = Node2VecModel() 6 | n2vec.load_model(embedding_filename) 7 | results= n2vec.most_similars('정치') 8 | print(results) 9 | 10 | pair_similarity = n2vec.compute_similarity('문재인', '정치') 11 | for pair in pair_similarity: 12 | print(str(pair[0]) + " -- " + str(pair[1])) -------------------------------------------------------------------------------- /examples/node2vec_traianer.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | 3 | import pyTextMiner as ptm 4 | import re 5 | from py_node2vec.node2vecModel import Node2VecModel 6 | 7 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 8 | stopword_file = '../stopwords/stopwordsKor.txt' 9 | 10 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(), 11 | ptm.tokenizer.MeCab(mecab_path), 12 | ptm.lemmatizer.SejongPOSLemmatizer(), 13 | ptm.helper.SelectWordOnly(), 14 | ptm.helper.StopwordFilter(file=stopword_file)) 15 | 16 | corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt',2) 17 | result = pipeline.processCorpus(corpus) 18 | #print(result) 19 | #print() 20 | 21 | documents = [] 22 | for doc in result: 23 | document = '' 24 | for sent in doc: 25 | n_sent = " ".join(sent) 26 | #for English text to remove special chars 27 | document += re.sub('[^A-Za-zㄱ-ㅣ가-힣 ]+', '', n_sent) 28 | documents.append(document) 29 | 30 | co = ptm.cooccurrence.CooccurrenceWorker() 31 | co_result, vocab = co.__call__(documents) 32 | 33 | cv = CountVectorizer() 34 | cv_fit = cv.fit_transform(documents) 35 | word_list = cv.get_feature_names(); 36 | count_list = cv_fit.toarray().sum(axis=0) 37 | word_hist = dict(zip(word_list, count_list)) 38 | 39 | threshold = 2.0 40 | dimensions=300 41 | walk_length=30 42 | num_walks=200 43 | 44 | n2vec = Node2VecModel() 45 | 46 | n2vec.create_graph(co_result, word_hist, threshold) 47 | n2vec.train(dimensions, walk_length, num_walks) 48 | 49 | embedding_filename='node2vec.emb' 50 | embedding_model_file='node2vec.model' 51 | n2vec.save_model(embedding_filename,embedding_model_file) -------------------------------------------------------------------------------- /examples/scibert_ner_train.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | from torch.utils.data import DataLoader 6 | 7 | from transformers import (get_linear_schedule_with_warmup, 8 | BertForTokenClassification, 9 | AutoTokenizer) 10 | 11 | from py_ner.data_utils.ner_dataset import read_data_from_file, get_labels, NerDataset 12 | from py_ner.model.optimizers import get_optimizer_with_weight_decay 13 | 14 | # https://github.com/cambridgeltl/MTL-Bioinformatics-2016/tree/master/data 15 | from py_ner.scibert_ner_train import SciBertTrainer 
16 | 17 | #dataset for NER 18 | DATA_TR_PATH = '../py_ner/data/JNLPBA/Genia4ERtask1.iob2' 19 | DATA_TS_PATH = '../py_ner/data/JNLPBA/Genia4EReval1.iob2' 20 | SEED = 42 21 | 22 | # MODEL 23 | #MODEL_NAME = 'allenai/scibert_scivocab_uncased' 24 | #MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT' 25 | #MODEL_NAME = 'adamlin/ClinicalBert_all_notes' 26 | #MODEL_NAME = 'monologg/biobert_v1.0_pubmed_pmc' 27 | MODEL_NAME = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' 28 | MAX_LEN_SEQ = 128 29 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 30 | 31 | # Optimization parameters 32 | N_EPOCHS = 6 33 | BATCH_SIZE = 8 34 | BATCH_SIZE_VAL = 28 35 | WEIGHT_DECAY = 0 36 | LEARNING_RATE = 5e-5 # 2e-4 37 | RATIO_WARMUP_STEPS = .1 38 | DROPOUT = .3 39 | ACUMULATE_GRAD_EVERY = 4 40 | OPTIMIZER = Adam 41 | 42 | # Seeds 43 | random.seed(SEED) 44 | np.random.seed(SEED) 45 | torch.manual_seed(SEED) 46 | torch.cuda.manual_seed_all(SEED) 47 | 48 | # get data 49 | training_set = read_data_from_file(DATA_TR_PATH) 50 | test_set = read_data_from_file(DATA_TS_PATH) 51 | 52 | # Automatically extract labels and their indexes from data. 53 | labels2ind, labels_count = get_labels(training_set + test_set) 54 | 55 | # Load tokenizer 56 | tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 57 | 58 | # Create loaders for datasets 59 | training_set = NerDataset(dataset=training_set, 60 | tokenizer=tokenizer, 61 | labels2ind=labels2ind, 62 | max_len_seq=MAX_LEN_SEQ) 63 | 64 | test_set = NerDataset(dataset=test_set, 65 | tokenizer=tokenizer, 66 | labels2ind=labels2ind, 67 | max_len_seq=MAX_LEN_SEQ) 68 | 69 | dataloader_tr = DataLoader(dataset=training_set, 70 | batch_size=BATCH_SIZE, 71 | shuffle=True) 72 | 73 | dataloader_ts = DataLoader(dataset=test_set, 74 | batch_size=BATCH_SIZE_VAL, 75 | shuffle=False) 76 | 77 | # Load model 78 | nerbert = BertForTokenClassification.from_pretrained(MODEL_NAME, 79 | hidden_dropout_prob=DROPOUT, 80 | attention_probs_dropout_prob=DROPOUT, 81 | label2id=labels2ind, 82 | num_labels=len(labels2ind), 83 | id2label={str(v): k for k, v in labels2ind.items()}) 84 | 85 | # Prepare optimizer and schedule (linear warmup and decay) 86 | optimizer = get_optimizer_with_weight_decay(model=nerbert, 87 | optimizer=OPTIMIZER, 88 | learning_rate=LEARNING_RATE, 89 | weight_decay=WEIGHT_DECAY) 90 | 91 | training_steps = (len(dataloader_tr)//ACUMULATE_GRAD_EVERY) * N_EPOCHS 92 | scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, 93 | num_warmup_steps=training_steps * RATIO_WARMUP_STEPS, 94 | num_training_steps=training_steps) 95 | 96 | # Trainer 97 | trainer = SciBertTrainer(model=nerbert, 98 | tokenizer=tokenizer, 99 | optimizer=optimizer, 100 | scheduler=scheduler, 101 | dataloader_train=dataloader_tr, 102 | dataloader_test=dataloader_ts, 103 | labels2ind=labels2ind, 104 | device=DEVICE, 105 | n_epochs=N_EPOCHS, 106 | accumulate_grad_every=ACUMULATE_GRAD_EVERY, 107 | output_dir='./models') 108 | 109 | tr_losses, val_losses = trainer.train() 110 | 111 | -------------------------------------------------------------------------------- /examples/scibert_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForTokenClassification 3 | 4 | # Example 5 | text = "Mouse thymus was used as a source of glucocorticoid receptor from normal CS lymphocytes." 
6 | 7 | # Load model 8 | tokenizer = AutoTokenizer.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba") 9 | model = AutoModelForTokenClassification.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba") 10 | 11 | #tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 12 | #model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 13 | 14 | # Get input for BERT 15 | input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0) 16 | 17 | # Predict 18 | with torch.no_grad(): 19 | outputs = model(input_ids) 20 | 21 | # From the output let's take the first element of the tuple. 22 | # Then, let's get rid of [CLS] and [SEP] tokens (first and last) 23 | predictions = outputs[0].argmax(axis=-1)[0][1:-1] 24 | 25 | # Map label class indexes to string labels. 26 | for token, pred in zip(tokenizer.tokenize(text), predictions): 27 | print(token, '->', model.config.id2label[pred.numpy().item()]) -------------------------------------------------------------------------------- /examples/test222.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | _stopwords = [] 4 | with open("./stopwords/stopwordsKor.txt", encoding='utf-8') as file: 5 | for line in file: 6 | line = line.strip() #or some other preprocessing 7 | _stopwords.append(line) #storing everything in memory! 8 | 9 | path='C:\\mecab\\mecab-ko-dic' 10 | #pos_tagger_name - either komoran, okt, nltk 11 | #lang = ko or en 12 | pipeline = ptm.Pipeline(ptm.keyword.TextRankExtractor(pos_tagger_name='mecab', 13 | mecab_path=path, 14 | max=5, 15 | lang='ko', 16 | stopwords=_stopwords, 17 | combined_keywords=True)) 18 | 19 | corpus = ptm.CorpusFromFile('./data/sampleKor.txt') 20 | result = pipeline.processCorpus(corpus) 21 | print('== Splitting Sentence ==') 22 | print(result) 23 | print() 24 | 25 | from sklearn.datasets import fetch_20newsgroups 26 | ng20 = fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes')) 27 | 28 | print("XXXX " + str(ng20.data[0])) -------------------------------------------------------------------------------- /examples/test3.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | dictionary_path='./dict/user_dic.txt' 4 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 5 | ptm.tokenizer.Komoran(userdic=dictionary_path), 6 | ptm.helper.POSFilter('NN*'), 7 | ptm.helper.SelectWordOnly(), 8 | #ptm.tokenizer.MaxScoreTokenizerKorean(), 9 | #ptm.tokenizer.Word(), 10 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 11 | #ptm.ngram.NGramTokenizer(2,3), 12 | #ptm.counter.WordCounter()) 13 | 14 | corpus = ptm.CorpusFromEojiFile('./data/filtered_content.txt') 15 | #result = pipeline.processCorpus(corpus) 16 | 17 | #print(result) 18 | print() 19 | 20 | import numpy as np 21 | print(np.__version__) 22 | 23 | s = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습." 24 | 25 | 26 | pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter()) 27 | corpus = [s] 28 | result = pipeline.processCorpus(corpus) 29 | print(result) -------------------------------------------------------------------------------- /examples/test4.py: -------------------------------------------------------------------------------- 1 | import tomotopy as tp 2 | print(tp.isa) # 'avx2'나 'avx', 'sse2', 'none'를 출력합니다. 
3 | 4 | import pyTextMiner as ptm 5 | import io 6 | import nltk 7 | 8 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 9 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 10 | ptm.tokenizer.MeCab(mecab_path), 11 | #ptm.tokenizer.Komoran(), 12 | ptm.lemmatizer.SejongPOSLemmatizer(), 13 | ptm.helper.SelectWordOnly(), 14 | #ptm.ngram.NGramTokenizer(1, 2, concat=' ')) 15 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 16 | 17 | documents = ['오늘은 비가와서 그런지 매우 우울한 날이다', 18 | '시험이 끝나야 놀지 스트레스 받아ㅠㅠ', 19 | '행복한 하루의 끝이라 좋네!'] 20 | 21 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 22 | #result = pipeline.processCorpus(corpus) 23 | 24 | result = pipeline.processCorpus(documents) 25 | print(result) 26 | 27 | 28 | from soylemma import Lemmatizer 29 | lemmatizer = Lemmatizer(dictionary_name='default') 30 | re = lemmatizer.lemmatize('밝은') 31 | print('result ' + str(re)) 32 | 33 | test_list = ['http://www.google.com', "why", "ftpfjdjkwjkjw", "no no!"] 34 | PROTOCOLS = ('http', 'https', 'ftp', 'git') 35 | for s in test_list: 36 | if s.startswith(tuple(p for p in PROTOCOLS)): 37 | print("true " + s) 38 | else: 39 | print("false " + s) 40 | 41 | -------------------------------------------------------------------------------- /examples/testCooccurrence.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | import networkx as nx 3 | from matplotlib import pyplot as plt 4 | import numpy as np 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | import matplotlib as mpl 7 | 8 | if __name__ == '__main__': 9 | #pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.chunker.KoreanChunker()) 10 | 11 | # 다음은 분석에 사용할 corpus를 불러오는 일입니다. sampleEng.txt 파일을 준비해두었으니, 이를 읽어와봅시다. 12 | # ptm의 CorpusFromFile이라는 클래스를 통해 문헌집합을 가져올 수 있습니다. 이 경우 파일 내의 한 라인이 문헌 하나가 됩니다. 13 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 14 | 15 | #import nltk 16 | #nltk.download() 17 | # 단어 단위로 분리했으니 이제 stopwords를 제거하는게 가능합니다. ptm.helper.StopwordFilter를 사용하여 불필요한 단어들을 지워보도록 하겠습니다. 18 | # 그리고 파이프라인 뒤에 ptm.stemmer.Porter()를 추가하여 어근 추출을 해보겠습니다. 19 | # 한번 코드를 고쳐서 ptm.stemmer.Lancaster()도 사용해보세요. Lancaster stemmer가 Porter stemmer와 어떻게 다른지 비교하면 재미있을 겁니다. 
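    # The comments above suggest comparing Porter and Lancaster stemming. A minimal
    # side-by-side check, assuming NLTK's stem module is installed, would be:
    #
    #   from nltk.stem import PorterStemmer, LancasterStemmer
    #   for w in ['running', 'relational', 'maximum']:
    #       print(w, PorterStemmer().stem(w), LancasterStemmer().stem(w))
    #
    # Lancaster is the more aggressive of the two, so it generally yields shorter stems.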
20 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 21 | ptm.tokenizer.Komoran(), 22 | ptm.helper.POSFilter('NN*'), 23 | ptm.helper.SelectWordOnly(), 24 | ptm.ngram.NGramTokenizer(1,2), 25 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 26 | ) 27 | result = pipeline.processCorpus(corpus) 28 | print('== 형태소 분석 + 명사만 추출 + 단어만 보여주기 + 빈도 분석 ==') 29 | print(result) 30 | print() 31 | 32 | print('== ==') 33 | 34 | import re 35 | documents = [] 36 | for doc in result: 37 | document = '' 38 | for sent in doc: 39 | document = " ".join(sent) 40 | #for English text to remove special chars 41 | document = re.sub('[^A-Za-z0-9]+', '', document) 42 | 43 | documents.append(document) 44 | co = ptm.cooccurrence.CooccurrenceWorker() 45 | co_result, vocab = co.__call__(documents) 46 | 47 | graph_builder = ptm.graphml.GraphMLCreator() 48 | 49 | #mode is either with_threshold or without_threshod 50 | mode='with_threshold' 51 | 52 | if mode is 'without_threshold': 53 | print(str(co_result)) 54 | print(str(vocab)) 55 | graph_builder.createGraphML(co_result, vocab, "test1.graphml") 56 | 57 | elif mode is 'with_threshold': 58 | cv = CountVectorizer() 59 | cv_fit = cv.fit_transform(documents) 60 | word_list = cv.get_feature_names(); 61 | count_list = cv_fit.toarray().sum(axis=0) 62 | word_hist = dict(zip(word_list, count_list)) 63 | 64 | print(str(co_result)) 65 | print(str(word_hist)) 66 | 67 | graph_builder.createGraphMLWithThreshold(co_result, word_hist, vocab, "test.graphml",threshold=35.0) 68 | display_limit=50 69 | graph_builder.summarize_centrality(limit=display_limit) 70 | title = '동시출현 기반 그래프' 71 | file_name='test.png' 72 | graph_builder.plot_graph(title,file=file_name) 73 | -------------------------------------------------------------------------------- /examples/testDocTermMatrix.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 3 | import pyTextMiner as ptm 4 | 5 | def vectorizeCaseOne(): 6 | documents = [ 7 | 'This is the first document.', 8 | 'This document is the second document.', 9 | 'And this is the third one.', 10 | 'Is this the first document?', 11 | ] 12 | 13 | vectorizer = CountVectorizer() 14 | X = vectorizer.fit_transform(documents) 15 | print(vectorizer.get_feature_names()) 16 | print(X.toarray()) 17 | 18 | vectorizer = TfidfVectorizer() 19 | X = vectorizer.fit_transform(documents) 20 | print(vectorizer.get_feature_names()) 21 | print(X.toarray()) 22 | 23 | def vectorizeCaseTwo(): 24 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 25 | 26 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 27 | ptm.tokenizer.Komoran(), 28 | ptm.helper.POSFilter('NN*'), 29 | ptm.helper.SelectWordOnly(), 30 | ptm.ngram.NGramTokenizer(2, 2), 31 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 32 | ) 33 | result = pipeline.processCorpus(corpus) 34 | print('== 형태소 분석 + 명사만 추출 + 단어만 보여주기 + 빈도 분석 ==') 35 | print(result) 36 | print() 37 | 38 | print('== ==') 39 | 40 | documents = [] 41 | for doc in result: 42 | document = '' 43 | for sent in doc: 44 | document += " ".join(sent) 45 | documents.append(document) 46 | 47 | vectorizer = CountVectorizer() 48 | X = vectorizer.fit_transform(documents) 49 | print(vectorizer.get_feature_names()) 50 | print(X.shape) 51 | 52 | print(X.toarray()) 53 | 54 | vectorizer = TfidfVectorizer() 55 | X = vectorizer.fit_transform(documents) 56 | print(vectorizer.get_feature_names()) 57 | 
print(len(vectorizer.get_feature_names())) 58 | print(X.toarray()) 59 | 60 | 61 | #vectorizeCaseOne() 62 | 63 | vectorizeCaseTwo() 64 | 65 | -------------------------------------------------------------------------------- /examples/testEXCo.py: -------------------------------------------------------------------------------- 1 | import os, subprocess 2 | 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | 5 | import pyTextMiner as ptm 6 | 7 | mecab_path='C:\\mecab\\mecab-ko-dic' 8 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 9 | ptm.tokenizer.MeCab(mecab_path), 10 | ptm.helper.POSFilter('NN*'), 11 | ptm.helper.SelectWordOnly(), 12 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 13 | 14 | corpus = ptm.CorpusFromFile('./data/134963_norm.txt') 15 | result = pipeline.processCorpus(corpus) 16 | 17 | with open('processed_134963.txt', 'w', encoding='utf-8') as f_out: 18 | for doc in result: 19 | for sent in doc: 20 | new_sent = '' 21 | for word in sent: 22 | new_sent += word + ' ' 23 | new_sent = new_sent.strip() 24 | f_out.write(new_sent + "\n") 25 | f_out.close() 26 | 27 | file_path='D:\\python_workspace\\pyTextMiner\\processed_134963.txt' 28 | co='D:\\python_workspace\\pyTextMiner\\external_programs\\ccount.exe ' + "--input " + file_path + " --threshold " + str(2) + " --output " + "co_result.txt" 29 | 30 | subprocess.run(co, shell=True) 31 | co_results={} 32 | vocabulary = {} 33 | with open("co_result.txt", 'r', encoding='utf-8') as f_in: 34 | for line in f_in: 35 | fields = line.split() 36 | token1 = fields[0] 37 | token2 = fields[1] 38 | token3 = fields[2] 39 | 40 | tup=(str(token1),str(token2)) 41 | co_results[tup]=float(token3) 42 | 43 | vocabulary[token1] = vocabulary.get(token1, 0) + 1 44 | vocabulary[token2] = vocabulary.get(token2, 0) + 1 45 | 46 | word_hist = dict(zip(vocabulary.keys(), vocabulary.values())) 47 | 48 | graph_builder = ptm.graphml.GraphMLCreator() 49 | 50 | #mode is either with_threshold or without_threshod 51 | mode='with_threshold' 52 | 53 | if mode is 'without_threshold': 54 | graph_builder.createGraphML(co_results, vocabulary.keys(), "test1.graphml") 55 | 56 | elif mode is 'with_threshold': 57 | graph_builder.createGraphMLWithThresholdInDictionary(co_results, word_hist, "test.graphml",threshold=35.0) 58 | display_limit=50 59 | graph_builder.summarize_centrality(limit=display_limit) 60 | title = '동시출현 기반 그래프' 61 | file_name='test.png' 62 | graph_builder.plot_graph(title,file=file_name) 63 | -------------------------------------------------------------------------------- /examples/testFirst.py: -------------------------------------------------------------------------------- 1 | # yTextMiner의 파이썬 버전, PyTextMiner를 ptm이라는 이름으로 사용하겠다고 선언합니다 2 | # ptm 역시 파이프라인 구조로 텍스트를 처리합니다. 3 | # 만약 pyTextMiner에 빨간줄이 계속 뜬다면 왼쪽의 Project 트리뷰에서 pyTextMiner가 포함된 폴더를 우클릭하여 4 | # 'Mark Directory as'에서 'Sources Root'를 눌러주도록 합시다. 5 | # 이 패키지가 동작하기 위해서는 konlpy와 nltk라는 라이브러리가 필요합니다. konlpy는 저번에 설치했으므로, 6 | # 이번에는 nltk를 설치해봅시다. pip install nltk로 간단하게 설치하시면 됩니다. 7 | import pyTextMiner as ptm 8 | import io 9 | 10 | # 다음은 분석에 사용할 corpus를 불러오는 일입니다. sampleEng.txt 파일을 준비해두었으니, 이를 읽어와봅시다. 11 | # ptm의 CorpusFromFile이라는 클래스를 통해 문헌집합을 가져올 수 있습니다. 이 경우 파일 내의 한 라인이 문헌 하나가 됩니다. 12 | #corpus = ptm.CorpusFromFile('donald.txt') 13 | corpus = ptm.CorpusFromDirectory('./tmp', True) 14 | 15 | #corpus, pair_map = ptm.CorpusFromFieldDelimitedFileWithYear('./data/donald.txt') 16 | 17 | # 이번에는 PyTextMiner로 한국어 처리를 해보도록 하겠습니다. 
한국어의 교착어적인 특성 및 복잡한 띄어쓰기 규칙 때문에 18 | # 공백 기준으로 단어를 분리하는 것에는 한계가 있어서 형태소 분석기를 사용합니다. 19 | # ptm.tokenizer.Komoran나 ptm.tokenizer.TwitterKorean을 사용해 형태소 분석이 가능합니다. 20 | # 형태소 분석 이후 품사가 NN으로 시작하는 명사들만 추출하고, 단어만 골라내 출력하도록 해봅시다. 21 | 22 | #import nltk 23 | #nltk.download('punkt') 24 | 25 | #pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 26 | # ptm.helper.POSFilter('NN*'), 27 | # ptm.helper.SelectWordOnly(), 28 | # ptm.ngram.NGramTokenizer(3), 29 | # ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 30 | # ) 31 | 32 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 33 | ptm.segmentation.SegmentationKorean('./model/korean_segmentation_model.crfsuite'), 34 | ptm.ngram.NGramTokenizer(3), 35 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 36 | ) 37 | 38 | result = pipeline.processCorpus(corpus) 39 | 40 | with io.open("demofile.csv",'w',encoding='utf8') as f: 41 | for doc in result: 42 | for sent in doc: 43 | f.write('\t'.join(sent) + "\n") 44 | 45 | print('== 문장 분리 + 형태소 분석 + 명사만 추출 + 단어만 보여주기 + 구 추출 ==') 46 | print(result) 47 | print() 48 | -------------------------------------------------------------------------------- /examples/testMallet.py: -------------------------------------------------------------------------------- 1 | from topic_model.MalletWrapper import MalletTopicModel 2 | 3 | model = MalletTopicModel('D:\python_workspace\pyTextMiner\mallet') 4 | #model.import_file(input=r'C:\mallet\topic_input\dblp_sample.txt') 5 | model.import_file(input=r'D:\python_workspace\pyTextMiner\mallet\topic_input\sample_dmr_input.txt') 6 | model.train_topics() 7 | 8 | #print(model.topic_keys) # see output_topic_keys parameter in Train Topics documentation 9 | # print(model.doc_topics) # see output_doc_topics parameter in Train Topics documentation 10 | #print(model.word_weights) # see topic_word_weights_file parameter in Train Topics documentationn -------------------------------------------------------------------------------- /examples/testPMI.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | corpus=ptm.CorpusFromFile('./data/2016-10-20.txt') 4 | pmi=ptm.pmi.PMICalculator(corpus) 5 | sent='아이오아이' 6 | result=pmi.__call__(sent) 7 | print(result) -------------------------------------------------------------------------------- /examples/test_document_clustering.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | from py_document_clustering.documentclustering import DocumentClustering 4 | 5 | if __name__ == '__main__': 6 | corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2) 7 | 8 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 9 | ptm.tokenizer.Komoran(), 10 | ptm.helper.POSFilter('NN*'), 11 | ptm.helper.SelectWordOnly(), 12 | #ptm.ngram.NGramTokenizer(2, 2), 13 | ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt') 14 | ) 15 | result = pipeline.processCorpus(corpus) 16 | print('== ==') 17 | 18 | documents = [] 19 | for doc in result: 20 | document = '' 21 | for sent in doc: 22 | document += " ".join(sent) 23 | documents.append(document) 24 | 25 | print(len(documents)) 26 | #name either k-means, agglo, spectral_cocluster 27 | name = 'agglo' 28 | clustering=DocumentClustering(k=5) 29 | #n_components means the number of words to be used as features 30 | clustering.make_matrix(documents,n_components=-1,doc2vec_matrix=None) 31 | clustering.cluster(name) 32 | clustering.print_results() 33 | 34 | 
clustering.visualize() 35 | -------------------------------------------------------------------------------- /examples/test_korean_lemmatizer.py: -------------------------------------------------------------------------------- 1 | import pyTextMiner as ptm 2 | 3 | pipeline = None 4 | corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt',2) 5 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 6 | mode = 'korean_lemmatizer' 7 | if mode is not 'korean_lemmatizer': 8 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 9 | ptm.tokenizer.MeCab(mecab_path), 10 | #ptm.tokenizer.Komoran(), 11 | ptm.helper.SelectWordOnly(), 12 | ptm.ngram.NGramTokenizer(1,2,concat=' '), 13 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 14 | else : 15 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 16 | ptm.tokenizer.MeCab(mecab_path), 17 | #ptm.tokenizer.Komoran(), 18 | ptm.lemmatizer.SejongPOSLemmatizer(), 19 | ptm.helper.SelectWordOnly(), 20 | # ptm.ngram.NGramTokenizer(1, 2, concat=' ')) 21 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')) 22 | 23 | 24 | documents = ['오늘은 비가와서 그런지 매우 우울하다', 25 | '시험이 끝나야 놀지 스트레스 받아ㅠㅠ', 26 | '행복한 하루의 끝이라 아름답고 좋네!', 27 | '더운날에는 아이스 커피가 최고지~~!'] 28 | 29 | #result = pipeline.processCorpus(corpus) 30 | result = pipeline.processCorpus(documents) 31 | print(result) -------------------------------------------------------------------------------- /examples/test_pyTextMinerTopicModel.py: -------------------------------------------------------------------------------- 1 | from topic_model.pyTextMinerTopicModel import pyTextMinerTopicModel 2 | import pyTextMiner as ptm 3 | 4 | if __name__ == '__main__': 5 | 6 | mecab_path='C:\\mecab\\mecab-ko-dic' 7 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 8 | ptm.tokenizer.MeCab(mecab_path), 9 | ptm.helper.POSFilter('NN*'), 10 | ptm.helper.SelectWordOnly(), 11 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt') 12 | ) 13 | 14 | corpus = ptm.CorpusFromFieldDelimitedFileWithYear('./mallet/topic_input/sample_dmr_input.txt',doc_index=2,year_index=1) 15 | pair_map = corpus.pair_map 16 | 17 | result = pipeline.processCorpus(corpus.docs) 18 | text_data = [] 19 | for doc in result: 20 | new_doc = [] 21 | for sent in doc: 22 | for _str in sent: 23 | if len(_str) > 0: 24 | new_doc.append(_str) 25 | text_data.append(new_doc) 26 | 27 | topic_model = pyTextMinerTopicModel() 28 | topic_number=10 29 | mdl=None 30 | #mode is either lda, dmr, hdp, infer, etc 31 | mode='infer' 32 | label='' 33 | if mode is 'lda': 34 | print('Running LDA') 35 | label='LDA' 36 | lda_model_name = './test.lda.bin' 37 | mdl=topic_model.lda_model(text_data, lda_model_name, topic_number) 38 | 39 | print('perplexity score ' + str(mdl.perplexity)) 40 | 41 | elif mode is 'dmr': 42 | print('Running DMR') 43 | label='DMR' 44 | dmr_model_name='./test.dmr.bin' 45 | mdl=topic_model.dmr_model(text_data, pair_map, dmr_model_name, topic_number) 46 | 47 | print('perplexity score ' + str(mdl.perplexity)) 48 | 49 | elif mode is 'hdp': 50 | print('Running HDP') 51 | label='HDP' 52 | hdp_model_name='./test.hdp.bin' 53 | mdl, topic_num=topic_model.hdp_model(text_data, hdp_model_name) 54 | topic_number=topic_num 55 | elif mode is 'hlda': 56 | print('Running HLDA') 57 | label='HLDA' 58 | hlda_model_name = './test.hlda.bin' 59 | mdl=topic_model.hlda_model(text_data, hlda_model_name) 60 | elif mode is 'infer': 61 | lda_model_name = './test.lda.bin' 62 | unseen_text='아사이 베리 블루베리 비슷하다' 63 | topic_model.inferLDATopicModel(lda_model_name, unseen_text) 64 | 65 | if (mode is not 
'infer'): 66 | # The below code extracts this dominant topic for each sentence 67 | # and shows the weight of the topic and the keywords in a nicely formatted output. 68 | df_topic_sents_keywords, matrix = topic_model.format_topics_sentences(topic_number=topic_number, mdl=mdl) 69 | 70 | # Format 71 | df_dominant_topic = df_topic_sents_keywords.reset_index() 72 | df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'] 73 | df_dominant_topic.head(10) 74 | 75 | # Sometimes we want to get samples of sentences that most represent a given topic. 76 | # This code gets the most exemplar sentence for each topic. 77 | topic_model.distribution_document_word_count(df_topic_sents_keywords, df_dominant_topic) 78 | 79 | #When working with a large number of documents, 80 | # we want to know how big the documents are as a whole and by topic. 81 | #Let’s plot the document word counts distribution. 82 | topic_model.distribution_word_count_by_dominant_topic(df_dominant_topic) 83 | 84 | # Though we’ve already seen what are the topic keywords in each topic, 85 | # a word cloud with the size of the words proportional to the weight is a pleasant sight. 86 | # The coloring of the topics I’ve taken here is followed in the subsequent plots as well. 87 | ##topic_model.word_cloud_by_topic(mdl) 88 | 89 | # Let’s plot the word counts and the weights of each keyword in the same chart. 90 | topic_model.word_count_by_keywords(mdl,matrix) 91 | 92 | # Each word in the document is representative of one of the N topics. 93 | # Let’s color each word in the given documents by the topic id it is attributed to. 94 | # The color of the enclosing rectangle is the topic assigned to the document. 95 | topic_model.sentences_chart(mdl,start=0, end=5, topic_number=topic_number) 96 | 97 | #visualize documents by tSNE 98 | topic_model.tSNE(mdl,matrix,label,topic_number=10) 99 | 100 | topic_model.make_pyLDAVis(mdl,matrix,text_data) 101 | -------------------------------------------------------------------------------- /examples/test_word2veclite.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word2veclite import Word2Vec 3 | 4 | corpus = "I like playing football with my friends" 5 | cbow = Word2Vec(method="cbow", corpus=corpus, 6 | window_size=1, n_hidden=2, 7 | n_epochs=10, learning_rate=0.8) 8 | W1, W2, loss_vs_epoch = cbow.run() 9 | 10 | print(W1) 11 | #[[ 0.99870389 0.20697257] 12 | # [-1.01911559 2.26364436] 13 | # [-0.69737232 0.14131477] 14 | # [ 3.28315183 1.13801973] 15 | # [-1.42944927 -0.62142097] 16 | # [ 0.65359329 -2.21415048] 17 | # [-0.22343751 -1.17927987]] 18 | 19 | print(W2) 20 | #[[-0.97080793 1.21120331 2.15603796 -1.79083151 3.38445043 -1.65295511 21 | # 1.36685097] 22 | # [2.77323464 0.78710269 2.74152617 0.08953005 0.04400675 -1.34149651 23 | # -2.19375528]] 24 | 25 | print(loss_vs_epoch) 26 | #[14.328868654443703, 12.290456644464603, 10.366644621637064, 27 | # 9.1759777684446622, 8.4233626997233895, 7.3952948684910256, 28 | # 6.1727393307549736, 5.1639476117698191, 4.6333377088153043, 29 | # 4.2944697259465485] 30 | 31 | #smax=cbow.predict('I like playing',W1,W2) 32 | #print(smax) -------------------------------------------------------------------------------- /examples/testt.py: -------------------------------------------------------------------------------- 1 | 2 | import pyTextMiner as ptm 3 | import io 4 | from nltk.corpus import sentiwordnet as swn 5 | import nltk 6 | 7 | class 
EnglishDictionarySentimentAnalyzer: 8 | def __init__(self): 9 | name = 'EnglishDictionarySentimentAnalyzer' 10 | 11 | def createDictionary(self): 12 | nltk.download('sentiwordnet') 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | corpus = ptm.CorpusFromFile('./data/sampleEng.txt') 18 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 19 | ptm.tokenizer.Word(), 20 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsEng.txt'), 21 | ptm.tagger.NLTK(), 22 | ptm.lemmatizer.WordNet()) 23 | 24 | result = pipeline.processCorpus(corpus) 25 | 26 | EnglishDictionarySentimentAnalyzer().createDictionary() 27 | 28 | for doc in result: 29 | for sent in doc: 30 | for _str in sent: 31 | _str[0] 32 | _str[1] 33 | pos = '' 34 | if (str(_str[1]).startswith("N")): 35 | pos = 'n' 36 | elif (str(_str[1]).startswith("A")): 37 | pos = 'a' 38 | elif (str(_str[1]).startswith("V")): 39 | pos = 'v' 40 | try: 41 | if (len(pos) > 0): 42 | breakdown = swn.senti_synset(str(_str[0]) + '.'+ pos + '.01') 43 | print(str(breakdown) + " " + str(breakdown.pos_score()) 44 | + " " + str(breakdown.neg_score()) + " " + str(breakdown.obj_score())) 45 | except: 46 | print('not found') 47 | 48 | -------------------------------------------------------------------------------- /examples/word2vec_tester.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word_embeddings import Word2Vec 3 | 4 | word2vec = Word2Vec() 5 | binary=True 6 | model_file = 'word2vec.bin' 7 | word2vec.load_model(model_file, binary) 8 | 9 | print(word2vec.most_similar(positives=['이재명', '경제'], negatives=['정치인'], topn=10)) 10 | print('-----------------------------------') 11 | 12 | print(word2vec.similar_by_word('이재명')) 13 | -------------------------------------------------------------------------------- /examples/word2vec_trainer.py: -------------------------------------------------------------------------------- 1 | 2 | from word2vec.word_embeddings import Word2Vec 3 | 4 | word2vec = Word2Vec() 5 | mode = 'simple' 6 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 7 | stopword_file = '../stopwords/stopwordsKor.txt' 8 | files = [] 9 | files.append('../data/donald.txt') 10 | is_directory=False 11 | doc_index=2 12 | max=-1 13 | word2vec.preprocessing(mode,mecab_path,stopword_file,files,is_directory,doc_index,max) 14 | 15 | min_count=1 16 | window=5 17 | size=50 18 | negative=5 19 | word2vec.train(min_count, window, size, negative) 20 | 21 | model_file = 'word2vec.bin' 22 | binary=True; 23 | word2vec.save_model(model_file, binary) 24 | 25 | 26 | -------------------------------------------------------------------------------- /examples/zipfsManager.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf8 -*- 2 | import pyTextMiner as ptm 3 | 4 | dictionary_path='./dict/user_dic.txt' 5 | 6 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 7 | ptm.tokenizer.Komoran(), 8 | #ptm.tokenizer.WordPos(), 9 | ptm.helper.POSFilter('NN*'), 10 | ptm.helper.SelectWordOnly(), 11 | ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'), 12 | ptm.counter.WordCounter()) 13 | 14 | corpus = ptm.CorpusFromFile('./data/sampleEng.txt') 15 | 16 | #corpus = ptm.CorpusFromFile('Gulliver_Travels.txt') 17 | #pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 18 | # ptm.tokenizer.Word(), 19 | # ptm.counter.WordCounter()) 20 | result = pipeline.processCorpus(corpus) 21 | 22 | print(result) 23 | print() 24 | 25 | doc_collection = '' 26 | term_counts = {} 27 | for doc in result: 28 | for sent in doc: 29 
| for _str in sent: 30 | term_counts[_str[0]] = term_counts.get(_str[0], 0) + int(_str[1]) 31 | freq = range(int(_str[1])) 32 | co = '' 33 | for n in freq: 34 | co += ' ' + _str[0] 35 | 36 | doc_collection += ' ' + co 37 | word_freq = [] 38 | for key, value in term_counts.items(): 39 | word_freq.append((value,key)) 40 | 41 | word_freq.sort(reverse=True) 42 | print(word_freq) 43 | 44 | f = open("demo_result.txt", "w", encoding='utf8') 45 | for pair in word_freq: 46 | f.write(pair[1] + '\t' + str(pair[0]) + '\n') 47 | f.close() 48 | 49 | from wordcloud import WordCloud 50 | 51 | # Read the whole text. 52 | 53 | # Generate a word cloud image 54 | wordcloud = WordCloud().generate(doc_collection) 55 | 56 | # Display the generated image: 57 | # the matplotlib way: 58 | import matplotlib.pyplot as plt 59 | 60 | # Window의 경우 폰트 경로 61 | # font_path = 'C:/Windows/Fonts/malgun.ttf' 62 | 63 | #for Mac 64 | #font_path='/Library/Fonts/AppleGothic.ttf' 65 | 66 | # lower max_font_size 67 | wordcloud = WordCloud(max_font_size=40, 68 | background_color='white', 69 | collocations=False) 70 | 71 | wordcloud.generate(doc_collection) 72 | 73 | plt.figure() 74 | plt.imshow(wordcloud, interpolation="bilinear") 75 | plt.axis("off") 76 | plt.show() 77 | -------------------------------------------------------------------------------- /glove-win_devc_x64/cooccur.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/cooccur.exe -------------------------------------------------------------------------------- /glove-win_devc_x64/cooccurrence.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/cooccurrence.bin -------------------------------------------------------------------------------- /glove-win_devc_x64/cooccurrence.shuf.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/cooccurrence.shuf.bin -------------------------------------------------------------------------------- /glove-win_devc_x64/demo.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | :: This batch file reveals OS, hardware, and networking configuration. 3 | TITLE My System Info 4 | ECHO Please wait... Checking system information. 5 | :: Section 1: OS information. 6 | ECHO ============================ 7 | ECHO OS INFO 8 | ECHO ============================ 9 | systeminfo | findstr /c:"OS Name" 10 | systeminfo | findstr /c:"OS Version" 11 | systeminfo | findstr /c:"System Type" 12 | :: Section 2: Hardware information. 
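:: The GloVe commands further down run on %CORPUS% in a fixed order:
::   vocab_count -> %VOCAB_FILE%              (word frequencies, min count %VOCAB_MIN_COUNT%)
::   cooccur     -> %COOCCURRENCE_FILE%       (word-word co-occurrence counts)
::   shuffle     -> %COOCCURRENCE_SHUF_FILE%  (shuffled records for stochastic training)
::   glove       -> %SAVE_FILE%               (%VECTOR_SIZE%-dimensional word vectors)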
13 | ECHO ============================ 14 | ECHO HARDWARE INFO 15 | ECHO ============================ 16 | 17 | SET CORPUS=donald.txt 18 | SET VOCAB_FILE=vocab.txt 19 | SET COOCCURRENCE_FILE=cooccurrence.bin 20 | SET COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 21 | SET SAVE_FILE=vectors 22 | SET VERBOSE=2 23 | SET MEMORY=4.0 24 | SET VOCAB_MIN_COUNT=2 25 | SET VECTOR_SIZE=50 26 | SET MAX_ITER=15 27 | SET WINDOW_SIZE=15 28 | SET BINARY=2 29 | SET NUM_THREADS=8 30 | SET X_MAX=10 31 | 32 | SET PYTHON=C:\Users\minsong\AppData\Local\Programs\Python\Python37\python.exe 33 | 34 | ECHO vocab_count -min-count %VOCAB_MIN_COUNT% -verbose %VERBOSE% < %CORPUS% > %VOCAB_FILE% 35 | vocab_count -min-count %VOCAB_MIN_COUNT% -verbose %VERBOSE% < %CORPUS% > %VOCAB_FILE% 36 | 37 | ECHO cooccur -memory %MEMORY% -vocab-file %VOCAB_FILE% -verbose %VERBOSE% -window-size %WINDOW_SIZE% < %CORPUS% > %COOCCURRENCE_FILE% 38 | cooccur -memory %MEMORY% -vocab-file %VOCAB_FILE% -verbose %VERBOSE% -window-size %WINDOW_SIZE% < %CORPUS% > %COOCCURRENCE_FILE% 39 | 40 | ECHO shuffle -memory %MEMORY% -verbose %VERBOSE% < %COOCCURRENCE_FILE% > %COOCCURRENCE_SHUF_FILE% 41 | shuffle -memory %MEMORY% -verbose %VERBOSE% < %COOCCURRENCE_FILE% > %COOCCURRENCE_SHUF_FILE% 42 | 43 | ECHO glove -save-file %SAVE_FILE% -threads %NUM_THREADS% -input-file %COOCCURRENCE_SHUF_FILE% -x-max %X_MAX% -iter %MAX_ITER% -vector-size %VECTOR_SIZE% -binary %BINARY%0 -vocab-file %VOCAB_FILE% -verbose %VERBOSE% 44 | glove -save-file %SAVE_FILE% -threads %NUM_THREADS% -input-file %COOCCURRENCE_SHUF_FILE% -x-max %X_MAX% -iter %MAX_ITER% -vector-size %VECTOR_SIZE% -binary %BINARY%0 -vocab-file %VOCAB_FILE% -verbose %VERBOSE% 45 | -------------------------------------------------------------------------------- /glove-win_devc_x64/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 5 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 6 | 7 | make 8 | if [ ! 
-e text8 ]; then 9 | if hash wget 2>/dev/null; then 10 | wget http://mattmahoney.net/dc/text8.zip 11 | else 12 | curl -O http://mattmahoney.net/dc/text8.zip 13 | fi 14 | unzip text8.zip 15 | rm text8.zip 16 | fi 17 | 18 | CORPUS=text8 19 | VOCAB_FILE=vocab.txt 20 | COOCCURRENCE_FILE=cooccurrence.bin 21 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 22 | BUILDDIR=build 23 | SAVE_FILE=vectors 24 | VERBOSE=2 25 | MEMORY=4.0 26 | VOCAB_MIN_COUNT=5 27 | VECTOR_SIZE=50 28 | MAX_ITER=15 29 | WINDOW_SIZE=15 30 | BINARY=2 31 | NUM_THREADS=8 32 | X_MAX=10 33 | if hash python 2>/dev/null; then 34 | PYTHON=python 35 | else 36 | PYTHON=python3 37 | fi 38 | 39 | echo 40 | echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE" 41 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 42 | echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE" 43 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 44 | echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE" 45 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 46 | echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE" 47 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 48 | if [ "$CORPUS" = 'text8' ]; then 49 | if [ "$1" = 'matlab' ]; then 50 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 51 | elif [ "$1" = 'octave' ]; then 52 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2 53 | else 54 | echo "$ $PYTHON eval/python/evaluate.py" 55 | $PYTHON eval/python/evaluate.py 56 | fi 57 | fi -------------------------------------------------------------------------------- /glove-win_devc_x64/eval/python/distance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | 5 | def generate(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 9 | args = parser.parse_args() 10 | 11 | with open(args.vocab_file, 'r') as f: 12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 13 | with open(args.vectors_file, 'r') as f: 14 | vectors = {} 15 | for line in f: 16 | vals = line.rstrip().split(' ') 17 | vectors[vals[0]] = [float(x) for x in vals[1:]] 18 | 19 | vocab_size = len(words) 20 | vocab = {w: idx for idx, w in enumerate(words)} 21 | ivocab = {idx: w for idx, w in enumerate(words)} 22 | 23 | vector_dim = len(vectors[ivocab[0]]) 24 | W = np.zeros((vocab_size, vector_dim)) 25 | for word, v in vectors.items(): 26 | if word == '': 27 | continue 28 | W[vocab[word], :] = v 29 | 30 | # normalize each word vector to unit variance 31 | W_norm = np.zeros(W.shape) 32 | d = (np.sum(W ** 2, 1) ** (0.5)) 33 | W_norm = (W.T / d).T 34 | return (W_norm, vocab, ivocab) 35 | 36 | 37 | def distance(W, vocab, ivocab, input_term): 38 | for idx, 
term in enumerate(input_term.split(' ')): 39 | if term in vocab: 40 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term])) 41 | if idx == 0: 42 | vec_result = np.copy(W[vocab[term], :]) 43 | else: 44 | vec_result += W[vocab[term], :] 45 | else: 46 | print('Word: %s Out of dictionary!\n' % term) 47 | return 48 | 49 | vec_norm = np.zeros(vec_result.shape) 50 | d = (np.sum(vec_result ** 2,) ** (0.5)) 51 | vec_norm = (vec_result.T / d).T 52 | 53 | dist = np.dot(W, vec_norm.T) 54 | 55 | for term in input_term.split(' '): 56 | index = vocab[term] 57 | dist[index] = -np.Inf 58 | 59 | a = np.argsort(-dist)[:N] 60 | 61 | print("\n Word Cosine distance\n") 62 | print("---------------------------------------------------------\n") 63 | for x in a: 64 | print("%35s\t\t%f\n" % (ivocab[x], dist[x])) 65 | 66 | 67 | if __name__ == "__main__": 68 | N = 100 # number of closest words that will be shown 69 | W, vocab, ivocab = generate() 70 | while True: 71 | input_term = input("\nEnter word or sentence (EXIT to break): ") 72 | if input_term == 'EXIT': 73 | break 74 | else: 75 | distance(W, vocab, ivocab, input_term) 76 | -------------------------------------------------------------------------------- /glove-win_devc_x64/eval/python/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 7 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 8 | args = parser.parse_args() 9 | 10 | with open(args.vocab_file, 'r') as f: 11 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 12 | with open(args.vectors_file, 'r') as f: 13 | vectors = {} 14 | for line in f: 15 | vals = line.rstrip().split(' ') 16 | vectors[vals[0]] = [float(x) for x in vals[1:]] 17 | 18 | vocab_size = len(words) 19 | vocab = {w: idx for idx, w in enumerate(words)} 20 | ivocab = {idx: w for idx, w in enumerate(words)} 21 | 22 | vector_dim = len(vectors[ivocab[0]]) 23 | W = np.zeros((vocab_size, vector_dim)) 24 | for word, v in vectors.items(): 25 | if word == '': 26 | continue 27 | W[vocab[word], :] = v 28 | 29 | # normalize each word vector to unit length 30 | W_norm = np.zeros(W.shape) 31 | d = (np.sum(W ** 2, 1) ** (0.5)) 32 | W_norm = (W.T / d).T 33 | evaluate_vectors(W_norm, vocab) 34 | 35 | def evaluate_vectors(W, vocab): 36 | """Evaluate the trained word vectors on a variety of tasks""" 37 | 38 | filenames = [ 39 | 'capital-common-countries.txt', 'capital-world.txt', 'currency.txt', 40 | 'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt', 41 | 'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt', 42 | 'gram5-present-participle.txt', 'gram6-nationality-adjective.txt', 43 | 'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt', 44 | ] 45 | prefix = './eval/question-data/' 46 | 47 | # to avoid memory overflow, could be increased/decreased 48 | # depending on system and vocab size 49 | split_size = 100 50 | 51 | correct_sem = 0; # count correct semantic questions 52 | correct_syn = 0; # count correct syntactic questions 53 | correct_tot = 0 # count correct questions 54 | count_sem = 0; # count all semantic questions 55 | count_syn = 0; # count all syntactic questions 56 | count_tot = 0 # count all questions 57 | full_count = 0 # count all questions, including those with unknown words 58 | 59 | for i in range(len(filenames)): 60 | with open('%s/%s' % 
(prefix, filenames[i]), 'r') as f: 61 | full_data = [line.rstrip().split(' ') for line in f] 62 | full_count += len(full_data) 63 | data = [x for x in full_data if all(word in vocab for word in x)] 64 | 65 | if len(data) == 0: 66 | print("ERROR: no lines of vocab kept for %s !" % filenames[i]) 67 | print("Example missing line:", full_data[0]) 68 | continue 69 | 70 | indices = np.array([[vocab[word] for word in row] for row in data]) 71 | ind1, ind2, ind3, ind4 = indices.T 72 | 73 | predictions = np.zeros((len(indices),)) 74 | num_iter = int(np.ceil(len(indices) / float(split_size))) 75 | for j in range(num_iter): 76 | subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1))) 77 | 78 | pred_vec = (W[ind2[subset], :] - W[ind1[subset], :] 79 | + W[ind3[subset], :]) 80 | #cosine similarity if input W has been normalized 81 | dist = np.dot(W, pred_vec.T) 82 | 83 | for k in range(len(subset)): 84 | dist[ind1[subset[k]], k] = -np.Inf 85 | dist[ind2[subset[k]], k] = -np.Inf 86 | dist[ind3[subset[k]], k] = -np.Inf 87 | 88 | # predicted word index 89 | predictions[subset] = np.argmax(dist, 0).flatten() 90 | 91 | val = (ind4 == predictions) # correct predictions 92 | count_tot = count_tot + len(ind1) 93 | correct_tot = correct_tot + sum(val) 94 | if i < 5: 95 | count_sem = count_sem + len(ind1) 96 | correct_sem = correct_sem + sum(val) 97 | else: 98 | count_syn = count_syn + len(ind1) 99 | correct_syn = correct_syn + sum(val) 100 | 101 | print("%s:" % filenames[i]) 102 | print('ACCURACY TOP1: %.2f%% (%d/%d)' % 103 | (np.mean(val) * 100, np.sum(val), len(val))) 104 | 105 | print('Questions seen/total: %.2f%% (%d/%d)' % 106 | (100 * count_tot / float(full_count), count_tot, full_count)) 107 | print('Semantic accuracy: %.2f%% (%i/%i)' % 108 | (100 * correct_sem / float(count_sem), correct_sem, count_sem)) 109 | print('Syntactic accuracy: %.2f%% (%i/%i)' % 110 | (100 * correct_syn / float(count_syn), correct_syn, count_syn)) 111 | print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot)) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /glove-win_devc_x64/eval/python/word_analogy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | 5 | def generate(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 9 | args = parser.parse_args() 10 | 11 | with open(args.vocab_file, 'r') as f: 12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 13 | with open(args.vectors_file, 'r') as f: 14 | vectors = {} 15 | for line in f: 16 | vals = line.rstrip().split(' ') 17 | vectors[vals[0]] = [float(x) for x in vals[1:]] 18 | 19 | vocab_size = len(words) 20 | vocab = {w: idx for idx, w in enumerate(words)} 21 | ivocab = {idx: w for idx, w in enumerate(words)} 22 | 23 | vector_dim = len(vectors[ivocab[0]]) 24 | W = np.zeros((vocab_size, vector_dim)) 25 | for word, v in vectors.items(): 26 | if word == '': 27 | continue 28 | W[vocab[word], :] = v 29 | 30 | # normalize each word vector to unit variance 31 | W_norm = np.zeros(W.shape) 32 | d = (np.sum(W ** 2, 1) ** (0.5)) 33 | W_norm = (W.T / d).T 34 | return (W_norm, vocab, ivocab) 35 | 36 | 37 | def distance(W, vocab, ivocab, input_term): 38 | vecs = {} 39 | if len(input_term.split(' ')) < 3: 
40 | print("Only %i words were entered.. three words are needed at the input to perform the calculation\n" % len(input_term.split(' '))) 41 | return 42 | else: 43 | for idx, term in enumerate(input_term.split(' ')): 44 | if term in vocab: 45 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term])) 46 | vecs[idx] = W[vocab[term], :] 47 | else: 48 | print('Word: %s Out of dictionary!\n' % term) 49 | return 50 | 51 | vec_result = vecs[1] - vecs[0] + vecs[2] 52 | 53 | vec_norm = np.zeros(vec_result.shape) 54 | d = (np.sum(vec_result ** 2,) ** (0.5)) 55 | vec_norm = (vec_result.T / d).T 56 | 57 | dist = np.dot(W, vec_norm.T) 58 | 59 | for term in input_term.split(' '): 60 | index = vocab[term] 61 | dist[index] = -np.Inf 62 | 63 | a = np.argsort(-dist)[:N] 64 | 65 | print("\n Word Cosine distance\n") 66 | print("---------------------------------------------------------\n") 67 | for x in a: 68 | print("%35s\t\t%f\n" % (ivocab[x], dist[x])) 69 | 70 | 71 | if __name__ == "__main__": 72 | N = 100; # number of closest words that will be shown 73 | W, vocab, ivocab = generate() 74 | while True: 75 | input_term = raw_input("\nEnter three words (EXIT to break): ") 76 | if input_term == 'EXIT': 77 | break 78 | else: 79 | distance(W, vocab, ivocab, input_term) 80 | 81 | -------------------------------------------------------------------------------- /glove-win_devc_x64/glove.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/glove.exe -------------------------------------------------------------------------------- /glove-win_devc_x64/pthreadVC2.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/pthreadVC2.dll -------------------------------------------------------------------------------- /glove-win_devc_x64/shuffle.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/shuffle.exe -------------------------------------------------------------------------------- /glove-win_devc_x64/vocab_count.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/glove-win_devc_x64/vocab_count.exe -------------------------------------------------------------------------------- /pyTextMiner/chunker/__init__.py: -------------------------------------------------------------------------------- 1 | class BaseChunker: 2 | IN_TYPE = [str] 3 | OUT_TYPE = [list, str] 4 | 5 | class KoreanChunker(BaseChunker): 6 | def __init__(self): 7 | 8 | import nltk 9 | grammar = """ 10 | NP: {*?} # Noun phrase 11 | VP: {*} # Verb phrase 12 | AP: {*} # Adjective phrase 13 | """ 14 | 15 | self.inst=nltk.RegexpParser(grammar) 16 | 17 | 18 | def __call__(self, *args, **kwargs): 19 | import konlpy 20 | words = konlpy.tag.Komoran().pos(*args) 21 | 22 | chunks = self.inst.parse(words) 23 | 24 | return chunks -------------------------------------------------------------------------------- /pyTextMiner/chunker/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/chunker/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/__init__.py: -------------------------------------------------------------------------------- 1 | import string 2 | from collections import Counter 3 | import os 4 | from nltk import bigrams 5 | from collections import defaultdict 6 | import operator 7 | import numpy as np 8 | 9 | class BaseCooccurrence: 10 | INPUT=[list,str] 11 | OUTPUT=[list,tuple] 12 | 13 | class CooccurrenceWorker(BaseCooccurrence): 14 | def __init__(self): 15 | name = 'cooccurrence' 16 | 17 | from sklearn.feature_extraction.text import CountVectorizer 18 | import pyTextMiner.cooccurrence.cooccurrence as co 19 | self.inst = co.Cooccurrence(ngram_range=(2, 2), stop_words='english') 20 | 21 | def __call__(self, *args, **kwargs): 22 | 23 | # bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary={'awesome unicorns': 0, 'batman forever': 1}) 24 | co_occurrences = self.inst.fit_transform(args[0]) 25 | # print('Printing sparse matrix:', co_occurrences) 26 | # print(co_occurrences.todense()) 27 | sum_occ = np.sum(co_occurrences.todense(), axis=0) 28 | # print('Sum of word-word occurrences:', sum_occ) 29 | 30 | # Converting itertor to set 31 | result = zip(self.inst.get_feature_names(), np.array(sum_occ)[0].tolist()) 32 | result_set = list(result) 33 | return result_set, self.inst.vocab() 34 | 35 | class CooccurrenceManager: 36 | def __init__(self): 37 | self.d = {} # 단어->단어ID로 변환할때 사용 38 | self.w = [] # 단어ID->단어로 변환할 때 사용 39 | 40 | def getIdOrAdd(self, word): 41 | # 단어가 이미 사전에 등록된 것이면 해당하는 ID를 돌려주고 42 | if word in self.d: return self.d[word] 43 | # 그렇지 않으면 새로 사전에 등록하고 ID를 부여함 44 | self.d[word] = len(self.d) 45 | self.w.append(word) 46 | return len(self.d) - 1 47 | 48 | def getWord(self, id): 49 | return self.w[id] 50 | 51 | def calculateCooccurrence(self, list): 52 | count = {} # 동시출현 빈도가 저장될 dict 53 | words = list(set(list)) # 단어별로 분리한 것을 set에 넣어 중복 제거하고, 다시 list로 변경 54 | wids = [self.getIdOrAdd(w) for w in words] 55 | for i, a in enumerate(wids): 56 | for b in wids[i + 1:]: 57 | if a == b: continue # 같은 단어의 경우는 세지 않음 58 | if a > b: a, b = b, a # A, B와 B, A가 다르게 세어지는것을 막기 위해 항상 a < b로 순서 고정 59 | count[a, b] = count.get((a, b), 0) + 1 # 실제로 센다 60 | 61 | sorted = [] 62 | for tup in count: 63 | freq = count[tup] 64 | left_word = self.getWord(count[0]) 65 | right_word = self.getWord(count[1]) 66 | sorted.append(((left_word, right_word), freq)) 67 | return sorted, words 68 | 69 | def computeCooccurence(self, list, target=''): 70 | com = defaultdict(lambda: defaultdict(int)) 71 | count_all = Counter() 72 | count_all1 = Counter() 73 | 74 | uniqueList = [] 75 | for _array in list: 76 | for line in _array: 77 | for word in line: 78 | if len(target) < 1: 79 | if word not in uniqueList: 80 | uniqueList.append(word) 81 | 82 | terms_bigram = bigrams(line) 83 | # Update the counter 84 | count_all.update(line) 85 | count_all1.update(terms_bigram) 86 | 87 | # Build co-occurrence matrix 88 | for i in range(len(line) - 1): 89 | for j in range(i + 1, len(line)): 90 | w1, w2 = sorted([line[i], line[j]]) 91 | if w1 != w2: 92 | com[w1][w2] += 1 93 | 94 | 95 | 96 | com_max = [] 97 | # For each term, look for the most common co-occurrent terms 98 | for t1 in com: 99 | t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5] 100 | for t2, t2_count in 
t1_max_terms: 101 | if len(target)>0 and (target is t1 or target is t2): 102 | if t1 not in uniqueList: 103 | uniqueList.append(t1) 104 | if t2 not in uniqueList: 105 | uniqueList.append(t2) 106 | com_max.append(((t1, t2), t2_count)) 107 | # Get the most frequent co-occurrences 108 | terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True) 109 | 110 | return terms_max, uniqueList 111 | 112 | -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/cooccurrence/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/__pycache__/cooccurrence.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/cooccurrence/__pycache__/cooccurrence.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/cooccurrence/cooccurrence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | 5 | class Cooccurrence(CountVectorizer): 6 | """Co-ocurrence matrix 7 | Convert collection of raw documents to word-word co-ocurrence matrix 8 | 9 | Parameters 10 | ---------- 11 | encoding : string, 'utf-8' by default. 12 | If bytes or files are given to analyze, this encoding is used to 13 | decode. 14 | 15 | ngram_range : tuple (min_n, max_n) 16 | The lower and upper boundary of the range of n-values for different 17 | n-grams to be extracted. All values of n such that min_n <= n <= max_n 18 | will be used. 
19 | 20 | max_df: float in range [0, 1] or int, default=1.0 21 | 22 | min_df: float in range [0, 1] or int, default=1 23 | 24 | Example 25 | ------- 26 | 27 | >> import Cooccurrence 28 | >> docs = ['this book is good', 29 | 'this cat is good', 30 | 'cat is good shit'] 31 | >> model = Cooccurrence() 32 | >> Xc = model.fit_transform(docs) 33 | 34 | Check vocabulary by printing 35 | >> model.vocabulary_ 36 | 37 | """ 38 | 39 | def __init__(self, encoding='utf-8', ngram_range=(1, 1), 40 | max_df=1.0, min_df=1, max_features=None, 41 | stop_words=None, normalize=True, vocabulary=None): 42 | 43 | super(Cooccurrence, self).__init__( 44 | ngram_range=ngram_range, 45 | max_df=max_df, 46 | min_df=min_df, 47 | max_features=max_features, 48 | stop_words=stop_words, 49 | vocabulary=vocabulary 50 | ) 51 | 52 | self.X = None 53 | 54 | self.normalize = normalize 55 | 56 | def fit_transform(self, raw_documents, y=None): 57 | """Fit cooccurrence matrix 58 | 59 | Parameters 60 | ---------- 61 | raw_documents : iterable 62 | an iterable which yields either str, unicode or file objects 63 | 64 | Returns 65 | ------- 66 | Xc : Cooccurrence matrix 67 | 68 | """ 69 | X = super(Cooccurrence, self).fit_transform(raw_documents) 70 | self.X = X 71 | 72 | n_samples, n_features = X.shape 73 | 74 | Xc = (X.T * X) 75 | if self.normalize: 76 | g = sp.diags(1./Xc.diagonal()) 77 | Xc = g * Xc 78 | else: 79 | Xc.setdiag(0) 80 | 81 | return Xc 82 | 83 | def vocab(self): 84 | tuples = super(Cooccurrence, self).get_feature_names() 85 | vocabulary=[] 86 | for e_tuple in tuples: 87 | tokens = e_tuple.split() 88 | for t in tokens: 89 | if t not in vocabulary: 90 | vocabulary.append(t) 91 | 92 | return vocabulary 93 | 94 | def word_histgram(self): 95 | word_list = super(Cooccurrence, self).get_feature_names() 96 | count_list = self.X.toarray().sum(axis=0) 97 | return dict(zip(word_list,count_list)) -------------------------------------------------------------------------------- /pyTextMiner/counter/__init__.py: -------------------------------------------------------------------------------- 1 | class WordCounter: 2 | IN_TYPE = [list, str] 3 | OUT_TYPE = [list, tuple] 4 | 5 | def __call__(self, *args, **kwargs): 6 | from collections import Counter 7 | return list(Counter(args[0]).most_common()) 8 | 9 | -------------------------------------------------------------------------------- /pyTextMiner/counter/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/counter/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/graphml/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/graphml/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/helper/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | class POSFilter: 3 | IN_TYPE = [list, tuple] 4 | OUT_TYPE = [list, tuple] 5 | 6 | def __init__(self, *posWanted): 7 | import re 8 | self.wanted = set(p for p in posWanted if not p.endswith('*')) 9 | self.re = re.compile('(' + '|'.join(p[:-1] for p in posWanted if p.endswith('*')) + ').*') 10 | 11 | def 
test(self, pos): 12 | if pos in self.wanted: return True 13 | if self.re.match(pos): return True 14 | return False 15 | 16 | def __call__(self, *args, **kwargs): 17 | return [i for i in args[0] if self.test(i[1])] 18 | 19 | class StopwordFilter: 20 | IN_TYPE = [list, str] 21 | OUT_TYPE = [list, str] 22 | 23 | def __init__(self, stopwords = [], file = None): 24 | if file: 25 | stopwords = stopwords + [line.strip() for line in open(file, encoding='utf-8')] 26 | self.stopwords = set(stopwords) 27 | self.stopwordsPrefix = ('http', 'https', 'ftp', 'git', 'thatt') 28 | 29 | def __call__(self, *args, **kwargs): 30 | #any(e for e in test_list if e.startswith('three') or e.endswith('four')) 31 | return [i for i in args[0] if i.lower() not in self.stopwords and (i.lower().startswith(tuple(p for p in self.stopwordsPrefix)) == False)] 32 | 33 | class SelectWordOnly: 34 | IN_TYPE = [tuple] 35 | OUT_TYPE = [str] 36 | 37 | def __call__(self, *args, **kwargs): 38 | return args[0][0] 39 | 40 | class ToLowerCase: 41 | IN_TYPE = [str] 42 | OUT_TYPE = [str] 43 | 44 | def __call__(self, *args, **kwargs): 45 | return args[0].lower() -------------------------------------------------------------------------------- /pyTextMiner/helper/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/helper/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/keyword/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from krwordrank.word import KRWordRank 3 | 4 | class BaseKeywordExtraction: 5 | IN_TYPE = [str] 6 | OUT_TYPE = [str] 7 | 8 | class TextRankExtractor(BaseKeywordExtraction): 9 | def __init__(self, pos_tagger_name=None, mecab_path=None, 10 | lang='ko', max=10, 11 | stopwords=[], combined_keywords=False): 12 | import pyTextMiner.keyword.textrank as tr 13 | self.inst = tr.TextRank(pos_tagger_name=pos_tagger_name,mecab_path=mecab_path,lang=lang,stopwords=stopwords) 14 | self.max=max 15 | self.combined_keywords = combined_keywords 16 | def __call__(self, *args, **kwargs): 17 | import nltk.tokenize 18 | sents = nltk.tokenize.sent_tokenize(*args) 19 | for sent in sents: 20 | self.inst.build_keywords(sent) 21 | return self.inst.get_keywords(self.max,self.combined_keywords) 22 | 23 | class TextRankSummarizer(BaseKeywordExtraction): 24 | def __init__(self,pos_tagger_name=None,mecab_path=None,max=3): 25 | import pyTextMiner.keyword.textrank as tr 26 | self.inst=tr.TextRank(pos_tagger_name=pos_tagger_name,mecab_path=mecab_path) 27 | self.max=max 28 | 29 | def __call__(self, *args, **kwargs): 30 | return self.inst.summarize(args[0],self.max) 31 | 32 | class KeywordExtractionKorean(BaseKeywordExtraction): 33 | def __init__(self, min_count=2, max_length=10, 34 | beta=0.85, max_iter=10, verbose=True, num_words=20): 35 | self.min_count=min_count 36 | self.max_length=max_length 37 | self.beta=beta 38 | self.max_iter=max_iter 39 | self.verbose=verbose 40 | self.num_words=num_words 41 | 42 | self.inst=KRWordRank(min_count, max_length,self.verbose) 43 | 44 | def __call__(self, *args, **kwargs): 45 | _num_keywords=10 46 | #print(str(args[0]) + "\n") 47 | keywords, rank, graph = self.inst.extract(args[0], self.beta, self.max_iter, self.num_words) 48 | 49 | return keywords 
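# Usage sketch for KeywordExtractionKorean (KRWordRank-based), assuming the caller
# passes a list of normalised Korean documents; __call__ returns the keyword->score
# dict produced by KRWordRank.extract():
#
#   extractor = KeywordExtractionKorean(min_count=2, max_length=10, num_words=20)
#   keywords = extractor(['첫 번째 문서의 본문 ...', '두 번째 문서의 본문 ...'])
#   for word, score in sorted(keywords.items(), key=lambda x: -x[1]):
#       print(word, score)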
-------------------------------------------------------------------------------- /pyTextMiner/keyword/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/keyword/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/keyword/__pycache__/textrank.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/keyword/__pycache__/textrank.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/lemmatizer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/lemmatizer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/ngram/__init__.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | class BaseNgram: 4 | IN_TYPE = [list, str] 5 | OUT_TYPE = [list, str] 6 | 7 | class NGramTokenizer(BaseNgram): 8 | def __init__(self, min=1, ngramCount=3, concat='_'): 9 | self.ngramCount = ngramCount 10 | self.min = min 11 | self.converted = [] 12 | self.concat = concat 13 | 14 | def __call__(self, *args, **kwargs): 15 | converted = [] 16 | from nltk.util import ngrams 17 | for i in range(self.min, self.ngramCount+1): 18 | output = list(ngrams((args[0]), i)) 19 | for x in output: 20 | if (len(x) > 0): 21 | converted.append(self.concat.join(x)) 22 | 23 | #print("NGRAM " + str(converted)) 24 | self.converted = converted 25 | 26 | return converted -------------------------------------------------------------------------------- /pyTextMiner/ngram/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/ngram/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/noun_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from soynlp.noun import LRNounExtractor_v2 2 | 3 | class BaseNounExtraction: 4 | INPUT=[str] 5 | OUTPUT=[list,str] 6 | 7 | class NounExtractionKorean(BaseNounExtraction): 8 | def __init__(self,sents): 9 | self.inst = LRNounExtractor_v2(verbose=False, extract_compound=True) 10 | self.inst.train(sents) 11 | self.inst.extract() 12 | 13 | def __call__(self, *args, **kwargs): 14 | return self.inst.decompose_compound(args[0]) -------------------------------------------------------------------------------- /pyTextMiner/noun_extractor/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/noun_extractor/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/pmi/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | from soynlp import DoublespaceLineCorpus 3 | from soynlp.word import WordExtractor 4 | from soynlp.tokenizer import LTokenizer 5 | from soynlp.vectorizer import sent_to_word_contexts_matrix 6 | from soynlp.word import pmi as pmi_func 7 | 8 | class BasePMICalculator: 9 | INPUT=[str] 10 | OUTPUT=[list,tuple] 11 | 12 | class PMICalculator(BasePMICalculator): 13 | def __init__(self, corpus=None): 14 | word_extractor = WordExtractor() 15 | word_extractor.train(corpus) 16 | cohesions = word_extractor.all_cohesion_scores() 17 | l_cohesions = {word: score[0] for word, score in cohesions.items()} 18 | tokenizer = LTokenizer(l_cohesions) 19 | x, self.idx2vocab = sent_to_word_contexts_matrix( 20 | corpus, 21 | windows=3, 22 | min_tf=10, 23 | tokenizer=tokenizer, # (default) lambda x:x.split(), 24 | dynamic_weight=False, 25 | verbose=True) 26 | 27 | self.vocab2idx = {vocab: idx for idx, vocab in enumerate(self.idx2vocab)} 28 | 29 | self.pmi, px, py = pmi_func( 30 | x, 31 | min_pmi=0, 32 | alpha=0.0, 33 | beta=0.75 34 | ) 35 | def __call__(self, *args, **kwargs): 36 | query = self.vocab2idx[args[0]] 37 | submatrix = self.pmi[query, :].tocsr() # get the row of query 38 | contexts = submatrix.nonzero()[1] # nonzero() return (rows, columns) 39 | pmi_i = submatrix.data 40 | 41 | most_relateds = [(idx, pmi_ij) for idx, pmi_ij in zip(contexts, pmi_i)] 42 | most_relateds = sorted(most_relateds, key=lambda x: -x[1])[:10] 43 | most_relateds = [(self.idx2vocab[idx], pmi_ij) for idx, pmi_ij in most_relateds] 44 | 45 | return most_relateds -------------------------------------------------------------------------------- /pyTextMiner/pmi/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/pmi/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/WordSemgmentationModelBuilder.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from pycrfsuite_spacing import TemplateGenerator 3 | from pycrfsuite_spacing import CharacterFeatureTransformer 4 | from pycrfsuite_spacing import sent_to_chartags 5 | from pycrfsuite_spacing import sent_to_xy 6 | from pycrfsuite_spacing import PyCRFSuiteSpacing 7 | 8 | with open('../../data/134963_norm.txt', encoding='utf-8') as f: 9 | docs = [doc.strip() for doc in f] 10 | 11 | print('n docs = %d' % len(docs)) 12 | pprint(docs[:5]) 13 | 14 | to_feature = CharacterFeatureTransformer( 15 | TemplateGenerator(begin=-2, 16 | end=2, 17 | min_range_length=3, 18 | max_range_length=3) 19 | ) 20 | 21 | x, y = sent_to_xy('이것도 너프해 보시지', to_feature) 22 | pprint(x) 23 | print(y) 24 | 25 | correct = PyCRFSuiteSpacing( 26 | to_feature = to_feature, 27 | feature_minfreq=3, # default = 0 28 | max_iterations=100, 29 | l1_cost=1.0, 30 | l2_cost=1.0 31 | ) 32 | correct.train(docs, '../../model/korean_segmentation_model.crfsuite') 33 | 34 | -------------------------------------------------------------------------------- /pyTextMiner/segmentation/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/__pycache__/lstmWordSegmentationModel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/__pycache__/lstmWordSegmentationModel.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/__pycache__/wordSegmentationModelUtil.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/__pycache__/wordSegmentationModelUtil.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/segmentation/lstmWordSegmentationModel.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- coding: utf8 -*- 3 | 4 | import tensorflow as tf 5 | 6 | def weight_variable(shape): 7 | initial = tf.truncated_normal(shape, stddev=0.1) 8 | return tf.Variable(initial) 9 | 10 | def bias_variable(shape): 11 | initial = tf.constant(0.1, shape=shape) 12 | return tf.Variable(initial) 13 | 14 | def RNN(_X, _istate, _weights, _biases, n_hidden, n_steps, n_input, early_stop): 15 | # input _X shape: Tensor("Placeholder:0", shape=(?, n_steps, n_input), dtype=float32) 16 | # switch n_steps and batch_size, Tensor("transpose:0", shape=(n_steps, ?, n_input), dtype=float32) 17 | _X = tf.transpose(_X, [1, 0, 2]) 18 | # Reshape to prepare input to hidden activation 19 | # (n_steps*batch_size, n_input) => (?, n_input), Tensor("Reshape:0", shape=(?, n_input), dtype=float32) 20 | _X = tf.reshape(_X, [-1, n_input]) 21 | # Linear activation 22 | _X = tf.matmul(_X, _weights['hidden']) + _biases['hidden'] # (?, n_hidden)+scalar(n_hidden,)=(?,n_hidden) 23 | 24 | # Define a lstm cell with tensorflow 25 | lstm_cell = tf.contrib.rnn.LSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=False) 26 | # Split data because rnn cell needs a list of inputs for the RNN inner loop 27 | # n_steps splits each of which contains (?, n_hidden) 28 | # ex) [, ... 
, ] 29 | _X = tf.split(_X, n_steps, 0) 30 | # Get lstm cell output 31 | outputs, states = tf.contrib.rnn.static_rnn(cell=lstm_cell, inputs=_X, initial_state=_istate, sequence_length=early_stop) 32 | final_outputs = [] 33 | for output in outputs : 34 | # Linear activation 35 | final_output = tf.matmul(output, _weights['out']) + _biases['out'] # (?, n_classes) 36 | final_outputs.append(final_output) 37 | # [] 38 | return final_outputs -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "segm.ckpt" 2 | all_model_checkpoint_paths: "segm.ckpt" 3 | -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/dic.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/dic.pickle -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/segm.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/segm.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/segm.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/segm.ckpt.index -------------------------------------------------------------------------------- /pyTextMiner/segmentation/model/segm.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/segmentation/model/segm.ckpt.meta -------------------------------------------------------------------------------- /pyTextMiner/splitter/__init__.py: -------------------------------------------------------------------------------- 1 | ''' str => list(str) ''' 2 | 3 | class BaseSplitter: 4 | IN_TYPE = [str] 5 | OUT_TYPE = [list, str] 6 | 7 | class SpecialCharRemover(BaseSplitter): 8 | IN_TYPE = [str] 9 | OUT_TYPE = [str] 10 | 11 | def __init__(self): 12 | import re 13 | self.hangul = re.compile('[^ ㄱ-ㅣ가-힣\\.\\?\\,]+') 14 | 15 | def __call__(self, *args, **kwargs): 16 | return self.hangul.sub('', *args) 17 | 18 | class NLTK(BaseSplitter): 19 | def __init__(self): 20 | import nltk.tokenize 21 | self.func = nltk.tokenize.sent_tokenize 22 | 23 | def __call__(self, *args, **kwargs): 24 | return self.func(*args) 25 | 26 | class KoSentSplitter(BaseSplitter): 27 | def __init__(self): 28 | import kss 29 | self.func = kss.split_sentences 30 | 31 | def __call__(self, *args, **kwargs): 32 | return self.func(*args) 33 | -------------------------------------------------------------------------------- /pyTextMiner/splitter/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/splitter/__pycache__/__init__.cpython-37.pyc 
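A minimal usage sketch for the splitter classes defined in pyTextMiner/splitter/__init__.py above; the sample text is illustrative, and it assumes the kss package and nltk's punkt sentence model are installed:

from pyTextMiner.splitter import SpecialCharRemover, NLTK, KoSentSplitter

text = '안녕하세요. pyTextMiner 테스트 문장입니다. This English part will be stripped by the remover.'

cleaner = SpecialCharRemover()   # keeps only Hangul, spaces, and . ? ,
print(cleaner(text))

sent_splitter = NLTK()           # wraps nltk.tokenize.sent_tokenize (requires nltk.download('punkt'))
print(sent_splitter(text))       # -> list of sentences

ko_splitter = KoSentSplitter()   # wraps kss.split_sentences for Korean text
print(ko_splitter(text))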
-------------------------------------------------------------------------------- /pyTextMiner/stemmer/__init__.py: -------------------------------------------------------------------------------- 1 | ''' str => str ''' 2 | 3 | class BaseStemmer: 4 | IN_TYPE = [str] 5 | OUT_TYPE = [str] 6 | 7 | class Porter(BaseStemmer): 8 | def __init__(self): 9 | import nltk 10 | self.inst = nltk.stem.PorterStemmer() 11 | 12 | def __call__(self, *args, **kwargs): 13 | return self.inst.stem(args[0]) 14 | 15 | class Lancaster(BaseStemmer): 16 | def __init__(self): 17 | import nltk 18 | self.inst = nltk.stem.LancasterStemmer() 19 | 20 | def __call__(self, *args, **kwargs): 21 | return self.inst.stem(args[0]) -------------------------------------------------------------------------------- /pyTextMiner/stemmer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/stemmer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/tagger/__init__.py: -------------------------------------------------------------------------------- 1 | ''' list(str) => list(tuple) ''' 2 | 3 | class BaseTagger: 4 | IN_TYPE = [list, str] 5 | OUT_TYPE = [list, tuple] 6 | 7 | class NLTK(BaseTagger): 8 | def __init__(self): 9 | import nltk 10 | nltk.download('averaged_perceptron_tagger') 11 | 12 | from nltk.tag.perceptron import PerceptronTagger 13 | 14 | self.inst = PerceptronTagger() 15 | 16 | def __call__(self, *args, **kwargs): 17 | return self.inst.tag(args[0]) -------------------------------------------------------------------------------- /pyTextMiner/tagger/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/tagger/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | ''' str => list(str) ''' 2 | 3 | class BaseTokenizer: 4 | IN_TYPE = [str] 5 | OUT_TYPE = [list, str] 6 | 7 | # [English] 8 | class Tweet(BaseTokenizer): 9 | def __init__(self): 10 | import nltk.tokenize 11 | self.inst = nltk.tokenize.TweetTokenizer() 12 | 13 | def __call__(self, *args, **kwargs): 14 | return self.inst.tokenize(*args) 15 | 16 | class Whitespace(BaseTokenizer): 17 | def __init__(self): 18 | import nltk.tokenize 19 | self.inst = nltk.tokenize.WhitespaceTokenizer() 20 | 21 | def __call__(self, *args, **kwargs): 22 | return self.inst.tokenize(*args) 23 | 24 | class Word(BaseTokenizer): 25 | def __init__(self): 26 | import nltk.tokenize 27 | self.inst = nltk.tokenize.word_tokenize 28 | 29 | def __call__(self, *args, **kwargs): 30 | print(str(self.inst(*args))) 31 | return self.inst(*args) 32 | 33 | class WordPos(BaseTokenizer): 34 | def __init__(self): 35 | import nltk 36 | self.inst = nltk 37 | self.OUT_TYPE = [list, tuple] 38 | 39 | def __call__(self, *args, **kwargs): 40 | tokens = self.inst.word_tokenize(*args) 41 | 42 | return self.inst.pos_tag(tokens) 43 | 44 | # [Korean] 45 | class Komoran(BaseTokenizer): 46 | def __init__(self,userdic=None): 47 | from konlpy.tag import Komoran 48 | import os 49 | if userdic is not None: 50 | print("user dict " + 
str(os.path.abspath(userdic))) 51 | self.inst = Komoran(userdic=os.path.abspath(userdic)) 52 | else: 53 | self.inst = Komoran() 54 | self.OUT_TYPE = [list, tuple] 55 | 56 | def __call__(self, *args, **kwargs): 57 | return self.inst.pos(args[0]) 58 | 59 | class TwitterKorean(BaseTokenizer): 60 | def __init__(self): 61 | from konlpy.tag import Twitter 62 | self.inst = Twitter() 63 | 64 | self.OUT_TYPE = [list, tuple] 65 | 66 | def __call__(self, *args, **kwargs): 67 | return self.inst.pos(args[0]) 68 | 69 | class KokomaKorean(BaseTokenizer): 70 | def __init__(self): 71 | from konlpy.tag import Kkma 72 | self.inst = Kkma() 73 | 74 | self.OUT_TYPE = [list, tuple] 75 | 76 | def __call__(self, *args, **kwargs): 77 | return self.inst.pos(args[0]) 78 | 79 | class MeCab(BaseTokenizer): 80 | def __init__(self, path=None): 81 | #import MeCab 82 | #self.inst = MeCab.Tagger() 83 | 84 | from konlpy.tag import Mecab 85 | self.inst = Mecab(path) 86 | 87 | self.OUT_TYPE = [list, tuple] 88 | 89 | def __call__(self, *args, **kwargs): 90 | try: 91 | return self.inst.pos(args[0]) 92 | except: 93 | return [] 94 | 95 | class SpecialTokenizer: 96 | IN_TYPE = [str] 97 | OUT_TYPE = [str] 98 | 99 | class MaxScoreTokenizerKorean(SpecialTokenizer): 100 | def __init__(self, scores=None): 101 | from soynlp.tokenizer import MaxScoreTokenizer 102 | self.inst=MaxScoreTokenizer(scores=scores) 103 | self.OUT_TYPE = [list, str] 104 | 105 | def __call__(self, *args, **kwargs): 106 | tokens = self.inst.tokenize(args[0]) 107 | return tokens 108 | 109 | class LTokenizerKorean(SpecialTokenizer): 110 | def __init__(self, scores=None): 111 | from soynlp.tokenizer import LTokenizer 112 | self.inst=LTokenizer(scores=scores) 113 | 114 | self.OUT_TYPE = [list, str] 115 | 116 | def __call__(self, *args, **kwargs): 117 | tokens = self.inst.tokenize(args[0]) 118 | return tokens 119 | 120 | class RegexTokenizerKorean(SpecialTokenizer): 121 | def __init__(self): 122 | from soynlp.tokenizer import RegexTokenizer 123 | self.inst=RegexTokenizer() 124 | self.OUT_TYPE = [list, str] 125 | 126 | def __call__(self, *args, **kwargs): 127 | tokens=self.inst.tokenize(args[0]) 128 | return tokens -------------------------------------------------------------------------------- /pyTextMiner/tokenizer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinSong2/pyTextMiner/acad0e5749044abb43226db43d434e18e586aafc/pyTextMiner/tokenizer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pyTextMiner/utility/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from gensim.models import fasttext 3 | from soynlp.hangle import decompose, compose 4 | import re 5 | 6 | class Utility: 7 | def __init__(self): 8 | name = 'Utility Class' 9 | self.doublespace_pattern = re.compile('\s+') 10 | 11 | def jamo_sentence(self, sent): 12 | def transform(char): 13 | if char == ' ': 14 | return char 15 | cjj = decompose(char) 16 | if cjj != None: 17 | if len(cjj) == 1: 18 | return cjj 19 | cjj_ = ''.join(c if c != ' ' else '-' for c in cjj) 20 | 21 | if cjj == None: 22 | return '' 23 | 24 | return cjj_ 25 | 26 | sent_ = ''.join(transform(char) for char in sent) 27 | sent_ = self.doublespace_pattern.sub(' ', sent_) 28 | return sent_ 29 | 30 | def decode(self, s): 31 | def process(t): 32 | assert len(t) % 3 == 0 33 | t_ = t.replace('-', ' ') 34 | chars = [tuple(t_[3 * 
i:3 * (i + 1)]) for i in range(len(t_) // 3)] 35 | recovered = [] 36 | for char in chars: 37 | try: 38 | composed = compose(*char) 39 | recovered.append(composed) 40 | except: 41 | pass 42 | #recovered = [compose(*char) for char in chars] 43 | recovered = ''.join(recovered) 44 | return recovered 45 | 46 | return ' '.join(process(t) for t in s.split()) 47 | 48 | def decode_sentence(self, sent): 49 | return ' '.join(self.decode(token) for token in sent.split()) 50 | 51 | def cosine_similarity(self, word1, word2, model): 52 | cjj1 = self.jamo_sentence(word1) 53 | cjj2 = self.jamo_sentence(word2) 54 | cos_sim = model.cosine_similarity(cjj1, cjj2) 55 | return cos_sim 56 | 57 | def most_similar(self, word, model): 58 | jamo_result = [] 59 | cjj = self.jamo_sentence(word) 60 | result = model.most_similar(cjj) 61 | for token in result: 62 | word = token[0] 63 | encoded_word = self.decode(word) 64 | sim = token[1] 65 | jamo_result.append((encoded_word,sim)) 66 | 67 | return jamo_result 68 | 69 | def most_similars(self, model, positives, negatives, topn=10): 70 | jamo_result = [] 71 | result = model.most_similar(positive=positives,negative=negatives,topn=topn) 72 | for token in result: 73 | word = token[0] 74 | if len(word) > 3: 75 | encoded_word = self.decode(word) 76 | sim = token[1] 77 | jamo_result.append((encoded_word,sim)) 78 | 79 | return jamo_result 80 | 81 | def similar_by_word(self, model, word, topn=10): 82 | jamo_result = [] 83 | result = model.similar_by_word(word, topn=topn) 84 | for token in result: 85 | word = token[0] 86 | if len(word) > 3: 87 | encoded_word = self.decode(word) 88 | sim = token[1] 89 | jamo_result.append((encoded_word, sim)) 90 | 91 | return jamo_result -------------------------------------------------------------------------------- /pyTextMiner/version.py: -------------------------------------------------------------------------------- 1 | # Store the version here so: 2 | # 1) we don't load dependencies by storing it in __init__.py 3 | # 2) we can import it in setup.py for the same reason 4 | # 3) we can import it into your module module 5 | __version__ = '1.1.116b7' -------------------------------------------------------------------------------- /py_bert/__init__.py: -------------------------------------------------------------------------------- 1 | from py_bert import * -------------------------------------------------------------------------------- /py_bert/bert_classification_model.py: -------------------------------------------------------------------------------- 1 | from transformers import BertModel, BertForSequenceClassification 2 | from torch import nn, optim 3 | import torch 4 | import os 5 | from kobert_transformers import get_kobert_model 6 | 7 | class PYBERTClassifier(nn.Module): 8 | ''' 9 | Customized BERT Sequence Model 10 | ''' 11 | def __init__(self, n_classes, model_name): 12 | #PRE_TRAINED_MODEL_NAME = 'bert-base-cased' 13 | super(PYBERTClassifier, self).__init__() 14 | if 'etri' in model_name or 'mecab' in model_name: 15 | self.bert = BertModel.from_pretrained(os.path.abspath('pytorch_model.bin'), 16 | output_hidden_states = False) 17 | else: 18 | self.bert = BertModel.from_pretrained(model_name) 19 | 20 | #print(self.bert.config.hidden_size) 21 | 22 | self.drop = nn.Dropout(p=0.3) 23 | self.out = nn.Linear(self.bert.config.hidden_size, n_classes) 24 | 25 | def forward(self, input_ids, attention_mask): 26 | _, pooled_output = self.bert( 27 | input_ids=input_ids, 28 | attention_mask=attention_mask 29 | ) 30 | #print(pooled_output.shape) 31 | 32 
| output = self.drop(pooled_output) 33 | return self.out(output) 34 | 35 | def name(self): 36 | return 'PYBERTClassifier' 37 | 38 | class PYBERTClassifierGenAtten(nn.Module): 39 | def __init__(self, 40 | n_classes, 41 | model_name, 42 | dr_rate=None, 43 | params=None): 44 | 45 | ''' 46 | bert, 47 | hidden_size=768, 48 | num_classes=2, 49 | dr_rate=None, 50 | params=None 51 | ''' 52 | 53 | super(PYBERTClassifierGenAtten, self).__init__() 54 | if 'etri' in model_name or 'mecab' in model_name: 55 | self.bert = BertModel.from_pretrained(os.path.abspath('pytorch_model.bin'), 56 | output_hidden_states=False) 57 | else: 58 | self.bert = BertModel.from_pretrained(model_name) 59 | self.out = nn.Linear(self.bert.config.hidden_size, n_classes) 60 | self.dr_rate = dr_rate 61 | self.attention_mask=None 62 | 63 | if self.dr_rate != None: 64 | print('dropout ' + str(self.dr_rate)) 65 | self.dropout = nn.Dropout(p=dr_rate) 66 | 67 | def gen_attention_mask(self, token_ids, targets): 68 | attention_mask = torch.zeros_like(token_ids) 69 | for i, v in enumerate(targets): 70 | attention_mask[i][:v] = 1 71 | return attention_mask.float() 72 | 73 | def get_attention_mask(self, atten_mask): 74 | self.attention_mask = atten_mask 75 | 76 | def forward(self, token_ids, targets, segment_ids, attention_mask): 77 | if attention_mask is None: 78 | self.attention_mask = self.gen_attention_mask(token_ids, targets) 79 | else: 80 | self.attention_mask = attention_mask 81 | 82 | _, pooler = self.bert(input_ids=token_ids, 83 | token_type_ids=segment_ids.long(), 84 | attention_mask=self.attention_mask.float().to(token_ids.device)) 85 | 86 | if self.dr_rate: 87 | output = self.dropout(pooler) 88 | 89 | return self.out(output) 90 | 91 | def name(self): 92 | return 'PYBERTClassifierGenAtten' 93 | 94 | class PYBertForSequenceClassification: 95 | ''' 96 | Use pytorch's BERTForSeqeunceClassification 97 | Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 98 | labels (torch.LongTensor of shape (batch_size,), optional, defaults to None) 99 | – Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss), 100 | If config.num_labels > 1 a classification loss is computed (Cross-Entropy). 101 | ''' 102 | def __init__(self, n_classes, model_name): 103 | self.model = BertForSequenceClassification.from_pretrained( 104 | model_name, # Use the 12-layer BERT model, with an uncased vocab. 105 | num_labels=n_classes, # The number of output labels--2 for binary classification. 106 | # You can increase this for multi-class tasks. 107 | output_attentions=False, # Whether the model returns attentions weights. 108 | output_hidden_states=False, # Whether the model returns all hidden-states. 
109 | ) 110 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 111 | self.model.to(device) 112 | 113 | def __call__(self, *args, **kwargs): 114 | return self.model 115 | 116 | def name(self): 117 | return 'PYBertForSequenceClassification' -------------------------------------------------------------------------------- /py_bert/bert_dataset.py: -------------------------------------------------------------------------------- 1 | from torch import nn, optim 2 | from torch.utils.data import Dataset, DataLoader, TensorDataset 3 | import torch 4 | 5 | class PYBERTDataset(Dataset): 6 | def __init__(self, contents, targets, tokenizer, max_len): 7 | super(PYBERTDataset, self).__init__() 8 | self.contents = contents 9 | self.targets = targets 10 | self.tokenizer = tokenizer 11 | self.max_len = max_len 12 | 13 | def __len__(self): 14 | return len(self.contents) 15 | 16 | def __getitem__(self, item): 17 | content = str(self.contents[item]) 18 | target = self.targets[item] 19 | 20 | encoding = self.tokenizer.encode_plus( 21 | content, 22 | add_special_tokens=True, 23 | max_length=self.max_len, 24 | return_token_type_ids=True, 25 | pad_to_max_length=True, 26 | return_attention_mask=True, 27 | return_tensors='pt', 28 | ) 29 | 30 | return { 31 | 'document_text': content, 32 | 'input_ids': encoding['input_ids'].flatten(), 33 | 'token_type_ids': encoding['token_type_ids'].flatten(), 34 | 'attention_mask': encoding['attention_mask'].flatten(), 35 | 'targets': torch.tensor(target, dtype=torch.long) 36 | } -------------------------------------------------------------------------------- /py_bert/bert_util.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.data import Dataset, DataLoader 3 | from py_bert.bert_dataset import PYBERTDataset 4 | import pandas as pd 5 | from transformers import BertModel, BertTokenizer 6 | from py_bert.tokenization_kobert import KoBertTokenizer 7 | from py_bert.tokenization_korbert import KorBertTokenizer 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | import os 11 | 12 | def get_korean_tokenizer(bert_model_name): 13 | tokenizer = None 14 | if bert_model_name.startswith('monologg'): 15 | tokenizer = KoBertTokenizer.from_pretrained(bert_model_name) 16 | elif 'etri' or 'mecab' in bert_model_name: 17 | tokenizer = KorBertTokenizer.from_pretrained(os.path.abspath(bert_model_name)) 18 | else: 19 | tokenizer = BertTokenizer.from_pretrained(bert_model_name) 20 | 21 | return tokenizer 22 | 23 | def to_sentiment(rating): 24 | ''' 25 | assuming the class rating scale is from 0 to 5 26 | ''' 27 | rating = int(rating) 28 | if rating <= 2: 29 | return 0 30 | elif rating == 3: 31 | return 1 32 | else: 33 | return 2 34 | 35 | def add_sentiment_label(df): 36 | df['sentiment'] = df.score.apply(to_sentiment) 37 | if len(df['sentiment'].unique()) == 2: 38 | class_names = ['positive', 'negative'] 39 | elif len(df['sentiment'].unique()) == 3: 40 | class_names = ['positive', 'neutral', 'negative'] 41 | 42 | return df, class_names 43 | 44 | def create_data_loader(df, tokenizer, max_len, batch_size): 45 | ds = PYBERTDataset( 46 | contents=df.content.to_numpy(), 47 | targets=df.sentiment.to_numpy(), 48 | tokenizer=tokenizer, 49 | max_len=max_len) 50 | 51 | return DataLoader( 52 | ds, 53 | batch_size=batch_size, 54 | num_workers=0 55 | ) 56 | 57 | def convert_to_df(documents, labels): 58 | pd.set_option('display.max_columns', None) 59 | document_df = pd.DataFrame() 60 | combined = zip(documents,labels) 61 | 
for i, (text, label) in enumerate(combined): 62 | document_df = document_df.append(pd.Series([text, int(label)]), ignore_index=True) 63 | 64 | document_df.columns = ['content', 'sentiment'] 65 | class_names = [] 66 | if len(document_df['sentiment'].unique()) == 2: 67 | class_names = ['positive', 'negative'] 68 | elif len(document_df['sentiment'].unique()) == 3: 69 | class_names = ['positive', 'neutral', 'negative'] 70 | 71 | return document_df, class_names 72 | 73 | 74 | def convert_to_df_for_classification(documents, labels): 75 | pd.set_option('display.max_columns', None) 76 | document_df = pd.DataFrame() 77 | combined = zip(documents,labels) 78 | for i, (text, label) in enumerate(combined): 79 | document_df = document_df.append(pd.Series([text, int(label)]), ignore_index=True) 80 | 81 | document_df.columns = ['content', 'label'] 82 | class_names = document_df['label'].unique() 83 | 84 | return document_df, class_names 85 | 86 | def show_confusion_matrix(confusion_matrix): 87 | hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues") 88 | hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right') 89 | hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right') 90 | 91 | plt.ylabel('True sentiment') 92 | plt.xlabel('Predicted sentiment') 93 | plt.show() 94 | 95 | def token_count_distribution(df, tokenizer): 96 | token_lens = [] 97 | for txt in df.content: 98 | tokens = tokenizer.encode(txt, max_length=512) 99 | token_lens.append(len(tokens)) 100 | 101 | sns.distplot(token_lens) 102 | plt.xlim([0, 256]) 103 | plt.xlabel('Token count') 104 | 105 | plt.show() -------------------------------------------------------------------------------- /py_doc2vec/. ...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_doc2vec/__init__.py: -------------------------------------------------------------------------------- 1 | from py_doc2vec import * -------------------------------------------------------------------------------- /py_document_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from py_document_classification import * -------------------------------------------------------------------------------- /py_document_classification/test_ml_text_classfier.py: -------------------------------------------------------------------------------- 1 | from document_classification.ml_textclassification import documentClassifier 2 | import pyTextMiner as ptm 3 | 4 | if __name__ == '__main__': 5 | document_classifier = documentClassifier() 6 | mecab_path = 'C:\\mecab\\mecab-ko-dic' 7 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 8 | ptm.tokenizer.MeCab(mecab_path), 9 | ptm.helper.POSFilter('NN*'), 10 | ptm.helper.SelectWordOnly(), 11 | ptm.ngram.NGramTokenizer(2, 2), 12 | #ptm.tokenizer.LTokenizerKorean(), 13 | ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt') 14 | ) 15 | 16 | #mode is either train or predict 17 | mode = 'train' 18 | if mode is 'train': 19 | input_file ='./data/3_class_naver_news.csv' 20 | # 1. 
text processing and representation 21 | corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file, 22 | delimiter=',', 23 | doc_index=4, 24 | class_index=1, 25 | title_index=3) 26 | corpus.docs 27 | tups = corpus.pair_map 28 | class_list = [] 29 | for id in tups: 30 | #print(tups[id]) 31 | class_list.append(tups[id]) 32 | 33 | result = pipeline.processCorpus(corpus) 34 | print('== ==') 35 | 36 | documents = [] 37 | for doc in result: 38 | document = '' 39 | for sent in doc: 40 | document += " ".join(sent) 41 | documents.append(document) 42 | 43 | document_classifier.preprocess(documents,class_list) 44 | 45 | #model_name = 0 -- RandomForestClassifier 46 | #model_name = 1 -- LinearSVC 47 | #model_name = 2 -- MultinomialNB 48 | #model_name = 3 -- LogisticRegression 49 | #model_name = 4 -- K-NN 50 | #model_name = 5 -- SGDClassifier 51 | X_train, X_test, y_train, y_test, y_pred, indices_test, model = document_classifier.train(model_index=1) 52 | 53 | print('training is finished') 54 | 55 | document_classifier.evaluate(y_test,y_pred,indices_test,model) 56 | document_classifier.save(model, model_name='./model/svm_classifier.model') 57 | document_classifier.saveVectorizer(model_name='./model/vectorizer.model') 58 | 59 | elif mode is 'predict': 60 | model=document_classifier.load('./model/svm_classifier.model') 61 | vectorizer_model=document_classifier.loadVectorizer(model_name='./model/vectorizer.model') 62 | document_classifier.predict(model,vectorizer_model) 63 | 64 | #7. prediction 65 | input = "../data/navernews.txt" 66 | corpus = ptm.CorpusFromFieldDelimitedFile(input,3) 67 | 68 | result = pipeline.processCorpus(corpus) 69 | print('== ==') 70 | 71 | documents = [] 72 | for doc in result: 73 | document = '' 74 | for sent in doc: 75 | document += " ".join(sent) 76 | documents.append(document) 77 | 78 | document_classifier.predict_realtime(model,vectorizer_model, documents) 79 | -------------------------------------------------------------------------------- /py_document_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from py_document_clustering import * -------------------------------------------------------------------------------- /py_ner/bert_crf_ner_visualization.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import torch 4 | from gluonnlp.data import SentencepieceTokenizer 5 | 6 | from py_ner.bert_crf_ner_prediction import DecoderFromNamedEntitySequence 7 | from py_ner.model.net import KobertCRFViz 8 | from py_ner.data_utils.utils import Config 9 | from py_ner.data_utils.vocab_tokenizer import Tokenizer 10 | from py_ner.data_utils.pad_sequence import keras_pad_fn 11 | from pathlib import Path 12 | 13 | from py_ner.bertviz.head_view import show 14 | 15 | class BertCrfNerVisualization: 16 | def __init__(self, model_dir=''): 17 | #'./experiments/base_model_with_crf' 18 | self.model_dir = model_dir 19 | self.model_config = Config(json_path=self.model_dir + '/config.json') 20 | self.tokenizer = None 21 | self.model = None 22 | self.decoder_from_res = None 23 | 24 | def load_model(self, tokenizer_model_name, ner_model_name): 25 | # load vocab & tokenizer 26 | #tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model" 27 | tok_path = tokenizer_model_name 28 | ptr_tokenizer = SentencepieceTokenizer(tok_path) 29 | 30 | with open(self.model_dir + "/vocab.pkl", 'rb') as f: 31 | vocab = pickle.load(f) 32 | self.tokenizer = Tokenizer(vocab=vocab, 
split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=self.model_config.maxlen) 33 | 34 | # load ner_to_index.json 35 | with open(self.model_dir + "/ner_to_index.json", 'rb') as f: 36 | ner_to_index = json.load(f) 37 | index_to_ner = {v: k for k, v in ner_to_index.items()} 38 | 39 | # model 40 | self.model = KobertCRFViz(config=self.model_config, num_classes=len(ner_to_index), vocab=vocab) 41 | 42 | #ner_model_name = "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin" 43 | # load 44 | model_dict = self.model.state_dict() 45 | checkpoint = torch.load(ner_model_name, map_location=torch.device('cpu')) 46 | convert_keys = {} 47 | for k, v in checkpoint['model_state_dict'].items(): 48 | new_key_name = k.replace("module.", '') 49 | if new_key_name not in model_dict: 50 | print("{} is not int model_dict".format(new_key_name)) 51 | continue 52 | convert_keys[new_key_name] = v 53 | 54 | self.model.load_state_dict(convert_keys) 55 | self.model.eval() 56 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 57 | self.model.to(device) 58 | self.decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=self.tokenizer, index_to_ner=index_to_ner) 59 | 60 | def visualize(self): 61 | input_text = '김대중 대통령은 노벨평화상을 받으러 스웨덴으로 출국해서 5박6일 동안 스웨덴에 머물며 대한민국의 위상을 높였다.' 62 | list_of_input_ids = self.tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text]) 63 | x_input = torch.tensor(list_of_input_ids).long() 64 | list_of_pred_ids, _ = self.model(x_input) 65 | 66 | list_of_ner_word, decoding_ner_sentence = self.decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids) 67 | print("output>", decoding_ner_sentence) 68 | model_type = 'bert' 69 | show(self.model, model_type, self.tokenizer, decoding_ner_sentence, input_text) 70 | print("") 71 | 72 | if __name__ == '__main__': 73 | model_dir = '../examples/exper/base_model_with_crf' 74 | visualizer = BertCrfNerVisualization(model_dir) 75 | 76 | tokenizer_model_name = "./ptr_lm_model/tokenizer_78b3253a26.model" 77 | ner_model_name = "../examples/exper/base_model_with_crf/best-epoch-6-step-500-acc-0.943.bin" 78 | 79 | visualizer.load_model(tokenizer_model_name, ner_model_name) 80 | 81 | visualizer.visualize() -------------------------------------------------------------------------------- /py_ner/bert_ner_prediction.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from keras_preprocessing.sequence import pad_sequences 4 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 5 | import numpy as np 6 | from torch import nn 7 | 8 | import py_ner.lstm_cnn_crf_utils as utils 9 | import pickle 10 | 11 | class BERTNERPredictor: 12 | def __init__(self): 13 | print('BertNERPredictor') 14 | self.model = None 15 | 16 | def load_model(self, model_name): 17 | # open a file, where you stored the pickled data 18 | file = open(model_name, 'rb') 19 | # dump information to that file 20 | self.model = pickle.load(file) 21 | # close the file 22 | file.close() 23 | 24 | def getKeyByValue(self, dictOfElements, value): 25 | key = '' 26 | listOfItems = dictOfElements.items() 27 | for item in listOfItems: 28 | if item[1] == value: 29 | key = item[0] 30 | return key 31 | 32 | def align_predictions(self, items, predictions: np.ndarray, label_ids: np.ndarray): 33 | """Formats the predictions.""" 34 | preds = np.argmax(predictions, axis=2) 35 | batch_size, seq_len = preds.shape 36 | out_label_list = [[] for _ in 
range(batch_size)] 37 | preds_list = [[] for _ in range(batch_size)] 38 | for i in range(batch_size): 39 | for j in range(seq_len): 40 | if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index: 41 | out_label_list[i].append(self.getKeyByValue(items, [label_ids[i][j]])) 42 | preds_list[i].append(self.getKeyByValue(items, [preds[i][j]])) 43 | return preds_list, out_label_list 44 | 45 | def predict_each(self, device, text, tokenizer, MAX_LEN, items): 46 | 47 | tokenized_texts = tokenizer.tokenize(text) 48 | print(tokenized_texts) 49 | 50 | input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], 51 | maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") 52 | 53 | pred_masks = [[float(i > 0) for i in ii] for ii in input_ids] 54 | pred_tags = [[int(i > 0) for i in ii] for ii in input_ids] 55 | 56 | #print(pred_tags) 57 | pred_ids = torch.tensor(input_ids) 58 | pred_tags = torch.tensor(pred_tags) 59 | pred_masks = torch.tensor(pred_masks) 60 | 61 | real_ids = np.argmax(pred_masks, axis=1).tolist() 62 | print(str(len(real_ids))) 63 | 64 | pred_data = TensorDataset(pred_ids, pred_masks, pred_tags) 65 | pred_sampler = RandomSampler(pred_data) 66 | pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=1) 67 | i = 0 68 | predictions = [] 69 | for batch in pred_dataloader: 70 | if i > 0: 71 | break 72 | i += 1 73 | batch = tuple(t.to(device) for t in batch) 74 | b_input_ids, b_input_mask, b_labels = batch 75 | b_input_ids = torch.tensor(b_input_ids).to(device).long() 76 | b_labels = torch.tensor(b_labels).to(device).long() 77 | 78 | with torch.no_grad(): 79 | logits = self.model(b_input_ids, token_type_ids=None, 80 | attention_mask=b_input_mask) 81 | 82 | logits = logits.detach().cpu().numpy() 83 | 84 | pred_flat = np.argmax(logits, axis=2).flatten() 85 | 86 | ''' 87 | for m, a in enumerate(pred_flat): 88 | if m >= len(real_ids): 89 | break 90 | predictions.append(a) 91 | ''' 92 | #print(predictions) 93 | #print(str(len(predictions))) 94 | 95 | #preds_list, out_label_list = self.align_predictions(items, logits, pred_tags) 96 | #print(preds_list) 97 | #print(out_label_list) 98 | 99 | return pred_flat 100 | -------------------------------------------------------------------------------- /py_ner/bertviz/head_view.py: -------------------------------------------------------------------------------- 1 | """Module for postprocessing and displaying transformer attentions. 2 | 3 | """ 4 | 5 | import json 6 | from py_ner.bertviz.attention import get_attention 7 | 8 | import os 9 | 10 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None): 11 | 12 | if sentence_b: 13 | vis_html = """ 14 | 15 | Layer: 16 | Attention: 23 | 24 |
25 | """ 26 | else: 27 | vis_html = """ 28 | 29 | Layer: 30 | 31 |
32 | """ 33 | 34 | __location__ = os.path.realpath( 35 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 36 | vis_js = open(os.path.join(__location__, 'head_view.js')).read() 37 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b) 38 | params = { 39 | 'attention': attn_data, 40 | 'default_filter': "all" 41 | } 42 | 43 | with open('bert_visualization.html', 'w') as f: 44 | _head = "" + "" + "" 45 | f.write(_head) 46 | f.write(vis_html + '\n') 47 | 48 | f.write('window.params = %s' % json.dumps(params) + '\n') 49 | f.write(vis_js + '\n') -------------------------------------------------------------------------------- /py_ner/bertviz/model_view.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Change log 17 | # 12/12/18 Jesse Vig Adapted to BERT model 18 | # 12/19/18 Jesse Vig Assorted cleanup. Changed orientation of attention matrices. Updated comments. 19 | 20 | 21 | """Module for postprocessing and displaying transformer attentions. 22 | 23 | This module is designed to be called from an ipython notebook. 24 | """ 25 | 26 | import json 27 | from py_ner.bertviz.attention import get_attention 28 | 29 | import os 30 | 31 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None): 32 | 33 | if sentence_b: 34 | vis_html = """ 35 | 36 | Attention: 43 | 44 |
45 | """ 46 | else: 47 | vis_html = """ 48 |
49 | """ 50 | 51 | __location__ = os.path.realpath( 52 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 53 | vis_js = open(os.path.join(__location__, 'model_view.js')).read() 54 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b) 55 | params = { 56 | 'attention': attn_data, 57 | 'default_filter': "all" 58 | } 59 | 60 | with open('bert_visualization.html', 'w') as f: 61 | f.write(vis_html + '\n') 62 | f.write("require.config({paths: {d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'," 63 | "jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',}});") 64 | 65 | f.write('window.params = %s' % json.dumps(params) + '\n') 66 | f.write(vis_js + '\n') 67 | 68 | -------------------------------------------------------------------------------- /py_ner/bertviz/neuron_view.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Change log 17 | # 12/12/18 Jesse Vig Adapted to BERT model 18 | # 12/19/18 Jesse Vig Assorted cleanup. Changed orientation of attention matrices. Updated comments. 19 | 20 | 21 | """Module for postprocessing and displaying transformer attentions. 22 | 23 | This module is designed to be called from an ipython notebook. 24 | """ 25 | 26 | import json 27 | from py_ner.bertviz.attention import get_attention 28 | 29 | import os 30 | 31 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None): 32 | if sentence_b: 33 | vis_html = """ 34 | 35 | Layer: 36 | Head: 37 | Attention: 44 | 45 |
46 | """ 47 | else: 48 | vis_html = """ 49 | 50 | Layer: 51 | Head: 52 | 53 |
54 | """ 55 | 56 | __location__ = os.path.realpath( 57 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 58 | vis_js = open(os.path.join(__location__, 'neuron_view.js')).read() 59 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b, include_queries_and_keys=True) 60 | if model_type == 'gpt2': 61 | bidirectional = False 62 | else: 63 | bidirectional = True 64 | params = { 65 | 'attention': attn_data, 66 | 'default_filter': "all", 67 | 'bidirectional': bidirectional 68 | } 69 | 70 | with open('bert_visualization.html', 'w') as f: 71 | f.write(vis_html + '\n') 72 | f.write("require.config({paths: {d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'," 73 | "jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',}});") 74 | 75 | f.write('window.params = %s' % json.dumps(params) + '\n') 76 | f.write(vis_js + '\n') -------------------------------------------------------------------------------- /py_ner/bertviz/pytorch_transformers_attn/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/config/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "hidden_size": 768, 3 | "maxlen" : 30, 4 | "epochs": 10, 5 | "batch_size": 256, 6 | "dropout": 0.1, 7 | "learning_rate": 5e-5, 8 | "warmup_proportion": 0.1, 9 | "gradient_accumulation_steps": 1, 10 | "summary_step": 250, 11 | "adam_epsilon": 1e-8, 12 | "warmup_steps": 0, 13 | "max_grad_norm": 1, 14 | "logging_steps": 50, 15 | "evaluate_during_training": true, 16 | "save_steps": 250, 17 | "output_dir": "./experiments/base_model_with_crf/checkpoints" 18 | } -------------------------------------------------------------------------------- /py_ner/config/ner_to_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "[CLS]": 0, 3 | "[SEP]": 1, 4 | "[PAD]": 2, 5 | "[MASK]": 3, 6 | "O": 4, 7 | "B-POH": 5, 8 | "I-POH": 6, 9 | "B-NOH": 7, 10 | "I-NOH": 8, 11 | "B-PNT": 9, 12 | "I-PNT": 10, 13 | "B-DAT": 11, 14 | "I-DAT": 12, 15 | "B-PER": 13, 16 | "I-PER": 14, 17 | "B-TIM": 15, 18 | "I-TIM": 16, 19 | "B-LOC": 17, 20 | "I-LOC": 18, 21 | "B-ORG": 19, 22 | "I-ORG": 20, 23 | "B-MNY": 21, 24 | "I-MNY": 22, 25 | "B-DUR": 23, 26 | "I-DUR": 24 27 | } -------------------------------------------------------------------------------- /py_ner/data/dataset_info.txt: -------------------------------------------------------------------------------- 1 | download all datasets for NER from http://informatics.yonsei.ac.kr/tsmm/download/ner_data.zip 2 | -------------------------------------------------------------------------------- /py_ner/data_utils/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/data_utils/pad_sequence.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import six 7 | 8 | def keras_pad_fn(token_ids_batch, maxlen, pad_id=0, padding='post', truncating='post'): 9 | padded_token_ids_batch = 
pad_sequences(token_ids_batch, 10 | value=pad_id, # vocab.transform_token2idx(PAD), 11 | padding=padding, 12 | truncating=truncating, 13 | maxlen=maxlen) 14 | return padded_token_ids_batch 15 | 16 | # pad_sequences_fn in keras.preprocessing.sequence.pad_sequences 17 | def pad_sequences(sequences, maxlen=None, dtype='int32', 18 | padding='pre', truncating='pre', value=0.): 19 | """Pads sequences to the same length. 20 | 21 | This function transforms a list of 22 | `num_samples` sequences (lists of integers) 23 | into a 2D Numpy array of shape `(num_samples, num_timesteps)`. 24 | `num_timesteps` is either the `maxlen` argument if provided, 25 | or the length of the longest sequence otherwise. 26 | 27 | Sequences that are shorter than `num_timesteps` 28 | are padded with `value` at the end. 29 | 30 | Sequences longer than `num_timesteps` are truncated 31 | so that they fit the desired length. 32 | The position where padding or truncation happens is determined by 33 | the arguments `padding` and `truncating`, respectively. 34 | 35 | Pre-padding is the default. 36 | 37 | # Arguments 38 | sequences: List of lists, where each element is a sequence. 39 | maxlen: Int, maximum length of all sequences. 40 | dtype: Type of the output sequences. 41 | To pad sequences with variable length strings, you can use `object`. 42 | padding: String, 'pre' or 'post': 43 | pad either before or after each sequence. 44 | truncating: String, 'pre' or 'post': 45 | remove values from sequences larger than 46 | `maxlen`, either at the beginning or at the end of the sequences. 47 | value: Float or String, padding value. 48 | 49 | # Returns 50 | x: Numpy array with shape `(len(sequences), maxlen)` 51 | 52 | # Raises 53 | ValueError: In case of invalid values for `truncating` or `padding`, 54 | or in case of invalid shape for a `sequences` entry. 55 | """ 56 | if not hasattr(sequences, '__len__'): 57 | raise ValueError('`sequences` must be iterable.') 58 | num_samples = len(sequences) 59 | 60 | lengths = [] 61 | for x in sequences: 62 | try: 63 | lengths.append(len(x)) 64 | except TypeError: 65 | raise ValueError('`sequences` must be a list of iterables. ' 66 | 'Found non-iterable: ' + str(x)) 67 | 68 | if maxlen is None: 69 | maxlen = np.max(lengths) 70 | 71 | # take the sample shape from the first non empty sequence 72 | # checking for consistency in the main loop below. 73 | sample_shape = tuple() 74 | for s in sequences: 75 | if len(s) > 0: 76 | sample_shape = np.asarray(s).shape[1:] 77 | break 78 | 79 | is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(dtype, np.unicode_) 80 | if isinstance(value, six.string_types) and dtype != object and not is_dtype_str: 81 | raise ValueError("`dtype` {} is not compatible with `value`'s type: {}\n" 82 | "You should set `dtype=object` for variable length strings." 
83 | .format(dtype, type(value))) 84 | 85 | x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype) 86 | for idx, s in enumerate(sequences): 87 | if not len(s): 88 | continue # empty list/array was found 89 | if truncating == 'pre': 90 | trunc = s[-maxlen:] 91 | elif truncating == 'post': 92 | trunc = s[:maxlen] 93 | else: 94 | raise ValueError('Truncating type "%s" ' 95 | 'not understood' % truncating) 96 | 97 | # check `trunc` has expected shape 98 | trunc = np.asarray(trunc, dtype=dtype) 99 | if trunc.shape[1:] != sample_shape: 100 | raise ValueError('Shape of sample %s of sequence at position %s ' 101 | 'is different from expected shape %s' % 102 | (trunc.shape[1:], idx, sample_shape)) 103 | 104 | if padding == 'post': 105 | x[idx, :len(trunc)] = trunc 106 | elif padding == 'pre': 107 | x[idx, -len(trunc):] = trunc 108 | else: 109 | raise ValueError('Padding type "%s" not understood' % padding) 110 | return x 111 | 112 | 113 | if __name__ == '__main__': 114 | sequences = [[2, 4, 62], [2,35,12,24,2]] 115 | pad_res = pad_sequences(sequences, maxlen=10, dtype='int32', padding='pre', truncating='post', value=0.) 116 | keras_pad_res = keras_pad_fn(sequences, maxlen=10, pad_id=0, padding='post', truncating='post') 117 | print(pad_res) 118 | print(keras_pad_res) -------------------------------------------------------------------------------- /py_ner/data_utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | ref: https://github.com/aisolab/nlp_implementation/blob/master/Character-level_Convolutional_Networks_for_Text_Classification/utils.py 3 | """ 4 | import json 5 | import torch 6 | from pathlib import Path 7 | 8 | 9 | class Config: 10 | def __init__(self, json_path): 11 | with open(json_path, mode='r') as io: 12 | params = json.loads(io.read()) 13 | self.__dict__.update(params) 14 | 15 | def save(self, json_path): 16 | with open(json_path, mode='w') as io: 17 | json.dump(self.__dict__, io, indent=4) 18 | 19 | def update(self, json_path): 20 | with open(json_path, mode='r') as io: 21 | params = json.loads(io.read()) 22 | self.__dict__.update(params) 23 | 24 | @property 25 | def dict(self): 26 | return self.__dict__ 27 | 28 | 29 | class CheckpointManager: 30 | def __init__(self, model_dir): 31 | if not isinstance(model_dir, Path): 32 | model_dir = Path(model_dir) 33 | self._model_dir = model_dir 34 | 35 | def save_checkpoint(self, state, filename): 36 | torch.save(state, self._model_dir / filename) 37 | 38 | def load_checkpoint(self, filename): 39 | state = torch.load(self._model_dir / filename, map_location=torch.device('cpu')) 40 | return state 41 | 42 | 43 | class SummaryManager: 44 | def __init__(self, model_dir): 45 | if not isinstance(model_dir, Path): 46 | model_dir = Path(model_dir) 47 | self._model_dir = model_dir 48 | self._summary = {} 49 | 50 | def save(self, filename): 51 | with open(self._model_dir / filename, mode='w') as io: 52 | json.dump(self._summary, io, indent=4) 53 | 54 | def load(self, filename): 55 | with open(self._model_dir / filename, mode='r') as io: 56 | metric = json.loads(io.read()) 57 | self.update(metric) 58 | 59 | def update(self, summary): 60 | self._summary.update(summary) 61 | 62 | def reset(self): 63 | self._summary = {} 64 | 65 | @property 66 | def summary(self): 67 | return self._summary -------------------------------------------------------------------------------- /py_ner/find_learning_rate.py: -------------------------------------------------------------------------------- 1 | 
import random 2 | 3 | from torch import nn 4 | from torch.optim import Adam 5 | from torch.utils.data import DataLoader 6 | from torch_lr_finder import LRFinder 7 | from transformers import AutoTokenizer 8 | import torch 9 | import numpy as np 10 | 11 | from py_ner.data_utils.data_utils import * 12 | 13 | from py_ner.data_utils.ner_dataset import read_data_from_file, get_labels, NerDataset 14 | from py_ner.model.net import BertForTokenClassificationCustom 15 | from py_ner.model.optimizers import get_optimizer_with_weight_decay 16 | 17 | DATA_TR_PATH = './data/JNLPBA/Genia4ERtask1.iob2' 18 | SEED = 42 19 | 20 | # MODEL 21 | MODEL_NAME = 'allenai/scibert_scivocab_cased' 22 | MAX_LEN_SEQ = 128 23 | 24 | # Optimization parameters 25 | BATCH_SIZE_TR = 32 26 | LEARNING_RATE = 1e-6 27 | CLIPPING = None 28 | OPTIMIZER = Adam 29 | 30 | # get data 31 | training_set = read_data_from_file(DATA_TR_PATH) 32 | 33 | # Automatically extract labels and their indexes from data. 34 | labels2ind, labels_count = get_labels(training_set) 35 | 36 | # Load data 37 | tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 38 | training_set = NerDataset(dataset=training_set, 39 | tokenizer=tokenizer, 40 | labels2ind=labels2ind, 41 | max_len_seq=MAX_LEN_SEQ, 42 | bert_hugging=False) 43 | 44 | 45 | dataloader_tr = DataLoader(dataset=training_set, 46 | batch_size=BATCH_SIZE_TR, 47 | shuffle=True) 48 | 49 | # Seeds 50 | random.seed(SEED) 51 | np.random.seed(SEED) 52 | torch.manual_seed(SEED) 53 | torch.cuda.manual_seed_all(SEED) 54 | 55 | legend = [] 56 | fig = None 57 | 58 | for wd in [0, .1, 1e-2, 1e-3, 1e-4]: 59 | for dp in [.1, 0.2, .3]: 60 | nerbert = BertForTokenClassificationCustom.from_pretrained(pretrained_model_name_or_path=MODEL_NAME, 61 | num_labels=len(labels2ind), 62 | hidden_dropout_prob=dp, 63 | attention_probs_dropout_prob=dp) 64 | 65 | # Prepare optimizer and schedule (linear warmup and decay) 66 | optimizer = get_optimizer_with_weight_decay(model=nerbert, 67 | optimizer=OPTIMIZER, 68 | learning_rate=LEARNING_RATE, 69 | weight_decay=wd) 70 | 71 | lr_finder = LRFinder(nerbert, optimizer, nn.CrossEntropyLoss(), device='cuda') 72 | lr_finder.range_test(train_loader=dataloader_tr, end_lr=1, num_iter=100) 73 | fig = lr_finder.plot(ax=fig) 74 | legend.append(f"wd: {wd}") 75 | 76 | fig.figure.legend(legend, loc='best') 77 | fig.figure.tight_layout() 78 | fig.figure.show() 79 | fig.figure.savefig('lr_finder.png') 80 | -------------------------------------------------------------------------------- /py_ner/kobert/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/kobert/mxnet_kobert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 SK T-Brain Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import sys 18 | import requests 19 | import hashlib 20 | 21 | import mxnet as mx 22 | import gluonnlp as nlp 23 | from gluonnlp.model import BERTModel, BERTEncoder 24 | 25 | from .utils import download as _download 26 | 27 | 28 | kobert_models = { 29 | 'mxnet_kobert': { 30 | 'url': 31 | 'https://kobert.blob.core.windows.net/models/kobert/mxnet/mxnet_kobert_45b6957552.params', 32 | 'fname': 'mxnet_kobert_45b6957552.params', 33 | 'chksum': '45b6957552' 34 | }, 35 | 'vocab': { 36 | 'url': 37 | 'https://kobert.blob.core.windows.net/models/kobert/vocab/kobertvocab_f38b8a4d6d.json', 38 | 'fname': 'kobertvocab_f38b8a4d6d.json', 39 | 'chksum': 'f38b8a4d6d' 40 | } 41 | } 42 | 43 | 44 | def get_mxnet_kobert_model(use_pooler=True, 45 | use_decoder=True, 46 | use_classifier=True, 47 | ctx=mx.cpu(0), 48 | cachedir='./ptr_lm_model'): 49 | # download model 50 | model_info = kobert_models['mxnet_kobert'] 51 | model_path = _download(model_info['url'], 52 | model_info['fname'], 53 | model_info['chksum'], 54 | cachedir=cachedir) 55 | # download vocab 56 | vocab_info = kobert_models['vocab'] 57 | vocab_path = _download(vocab_info['url'], 58 | vocab_info['fname'], 59 | vocab_info['chksum'], 60 | cachedir=cachedir) 61 | return get_kobert_model(model_path, vocab_path, use_pooler, use_decoder, 62 | use_classifier, ctx) 63 | 64 | 65 | def get_kobert_model(model_file, 66 | vocab_file, 67 | use_pooler=True, 68 | use_decoder=True, 69 | use_classifier=True, 70 | ctx=mx.cpu(0)): 71 | vocab_b_obj = nlp.vocab.BERTVocab.from_json(open(vocab_file, 'rt').read()) 72 | 73 | predefined_args = { 74 | 'attention_cell': 'multi_head', 75 | 'num_layers': 12, 76 | 'units': 768, 77 | 'hidden_size': 3072, 78 | 'max_length': 512, 79 | 'num_heads': 12, 80 | 'scaled': True, 81 | 'dropout': 0.1, 82 | 'use_residual': True, 83 | 'embed_size': 768, 84 | 'embed_dropout': 0.1, 85 | 'token_type_vocab_size': 2, 86 | 'word_embed': None, 87 | } 88 | 89 | encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], 90 | num_layers=predefined_args['num_layers'], 91 | units=predefined_args['units'], 92 | hidden_size=predefined_args['hidden_size'], 93 | max_length=predefined_args['max_length'], 94 | num_heads=predefined_args['num_heads'], 95 | scaled=predefined_args['scaled'], 96 | dropout=predefined_args['dropout'], 97 | output_attention=False, 98 | output_all_encodings=False, 99 | use_residual=predefined_args['use_residual']) 100 | 101 | # BERT 102 | net = BERTModel( 103 | encoder, 104 | len(vocab_b_obj.idx_to_token), 105 | token_type_vocab_size=predefined_args['token_type_vocab_size'], 106 | units=predefined_args['units'], 107 | embed_size=predefined_args['embed_size'], 108 | embed_dropout=predefined_args['embed_dropout'], 109 | word_embed=predefined_args['word_embed'], 110 | use_pooler=use_pooler, 111 | use_decoder=use_decoder, 112 | use_classifier=use_classifier) 113 | net.initialize(ctx=ctx) 114 | net.load_parameters(model_file, ctx, ignore_extra=True) 115 | return (net, vocab_b_obj) 116 | -------------------------------------------------------------------------------- /py_ner/kobert/pytorch_kobert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 SK T-Brain Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
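# Usage sketch (not part of the original sources): loading the MXNet KoBERT weights with the
# helper defined above. The import path follows this repository's layout (py_ner.kobert);
# the decoder and classifier heads are dropped because only encoder features are needed.
import mxnet as mx
from py_ner.kobert.mxnet_kobert import get_mxnet_kobert_model

bert, vocab = get_mxnet_kobert_model(use_decoder=False,
                                     use_classifier=False,
                                     ctx=mx.cpu(0))
print(len(vocab.idx_to_token))    # vocabulary size of the pretrained KoBERT model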
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import sys 18 | import requests 19 | import hashlib 20 | 21 | import torch 22 | 23 | from transformers import BertModel, BertConfig 24 | # from pytorch_pretrained_bert import BertModel, BertConfig 25 | import gluonnlp as nlp 26 | 27 | from .utils import download as _download 28 | 29 | kobert_models = { 30 | 'pytorch_kobert': { 31 | 'url': 32 | 'https://kobert.blob.core.windows.net/models/kobert/pytorch/pytorch_kobert_2439f391a6.params', 33 | 'fname': 'pytorch_kobert_2439f391a6.params', 34 | 'chksum': '2439f391a6' 35 | }, 36 | 'vocab': { 37 | 'url': 38 | 'https://kobert.blob.core.windows.net/models/kobert/vocab/kobertvocab_f38b8a4d6d.json', 39 | 'fname': 'kobertvocab_f38b8a4d6d.json', 40 | 'chksum': 'f38b8a4d6d' 41 | } 42 | } 43 | 44 | 45 | bert_config = {'attention_probs_dropout_prob': 0.1, 46 | 'hidden_act': 'gelu', 47 | 'hidden_dropout_prob': 0.1, 48 | 'hidden_size': 768, 49 | 'initializer_range': 0.02, 50 | 'intermediate_size': 3072, 51 | 'max_position_embeddings': 512, 52 | 'num_attention_heads': 12, 53 | 'num_hidden_layers': 12, 54 | 'type_vocab_size': 2, 55 | 'vocab_size': 8002} 56 | 57 | 58 | 59 | def get_pytorch_kobert_model(ctx='cpu', 60 | cachedir='./ptr_lm_model'): 61 | # download model 62 | model_info = kobert_models['pytorch_kobert'] 63 | model_path = _download(model_info['url'], 64 | model_info['fname'], 65 | model_info['chksum'], 66 | cachedir=cachedir) 67 | # download vocab 68 | vocab_info = kobert_models['vocab'] 69 | vocab_path = _download(vocab_info['url'], 70 | vocab_info['fname'], 71 | vocab_info['chksum'], 72 | cachedir=cachedir) 73 | return get_kobert_model(model_path, vocab_path, ctx) 74 | 75 | 76 | 77 | def get_kobert_model(model_file, vocab_file, ctx="cpu"): 78 | bertmodel = BertModel(config=BertConfig.from_dict(bert_config)) 79 | bertmodel.load_state_dict(torch.load(model_file)) 80 | device = torch.device(ctx) 81 | bertmodel.to(device) 82 | bertmodel.eval() 83 | vocab_b_obj = nlp.vocab.BERTVocab.from_json( 84 | open(vocab_file, 'rt').read()) 85 | return bertmodel, vocab_b_obj 86 | -------------------------------------------------------------------------------- /py_ner/kobert/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 SK T-Brain Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
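# Usage sketch (not part of the original sources): running toy token ids through the KoBERT
# weights loaded by get_pytorch_kobert_model. The ids are dummies, and the tuple unpacking
# assumes the older transformers convention in which BertModel returns
# (sequence_output, pooled_output).
import torch
from py_ner.kobert.pytorch_kobert import get_pytorch_kobert_model

model, vocab = get_pytorch_kobert_model(ctx='cpu')
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])    # dummy ids, 0 used as padding
attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
sequence_output, pooled_output = model(input_ids, attention_mask)
print(sequence_output.shape)    # (2, 3, 768): one 768-d vector per input token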
15 | 16 | import os 17 | import sys 18 | import requests 19 | import hashlib 20 | 21 | kobert_models = { 22 | 'onnx_kobert': { 23 | 'url': 24 | 'https://kobert.blob.core.windows.net/models/kobert/onnx/onnx_kobert_44529811f0.onnx', 25 | 'fname': 'onnx_kobert_44529811f0.onnx', 26 | 'chksum': '44529811f0' 27 | }, 28 | 'tokenizer': { 29 | 'url': 30 | 'https://kobert.blob.core.windows.net/models/kobert/tokenizer/tokenizer_78b3253a26.model', 31 | 'fname': 'tokenizer_78b3253a26.model', 32 | 'chksum': '78b3253a26' 33 | } 34 | } 35 | 36 | 37 | def download(url, filename, chksum, cachedir='./ptr_lm_model'): 38 | f_cachedir = os.path.expanduser(cachedir) 39 | os.makedirs(f_cachedir, exist_ok=True) 40 | file_path = os.path.join(f_cachedir, filename) 41 | if os.path.isfile(file_path): 42 | if hashlib.md5(open(file_path, 43 | 'rb').read()).hexdigest()[:10] == chksum: 44 | print('using cached model') 45 | return file_path 46 | with open(file_path, 'wb') as f: 47 | response = requests.get(url, stream=True) 48 | total = response.headers.get('content-length') 49 | 50 | if total is None: 51 | f.write(response.content) 52 | else: 53 | downloaded = 0 54 | total = int(total) 55 | for data in response.iter_content( 56 | chunk_size=max(int(total / 1000), 1024 * 1024)): 57 | downloaded += len(data) 58 | f.write(data) 59 | done = int(50 * downloaded / total) 60 | sys.stdout.write('\r[{}{}]'.format('█' * done, 61 | '.' * (50 - done))) 62 | sys.stdout.flush() 63 | sys.stdout.write('\n') 64 | assert chksum == hashlib.md5(open( 65 | file_path, 'rb').read()).hexdigest()[:10], 'corrupted file!' 66 | return file_path 67 | 68 | 69 | def get_onnx(cachedir='./ptr_lm_model'): 70 | """Get KoBERT ONNX file path after downloading 71 | """ 72 | model_info = kobert_models['onnx_kobert'] 73 | return download(model_info['url'], 74 | model_info['fname'], 75 | model_info['chksum'], 76 | cachedir=cachedir) 77 | 78 | def get_tokenizer(cachedir='./ptr_lm_model'): 79 | """Get KoBERT Tokenizer file path after downloading 80 | """ 81 | model_info = kobert_models['tokenizer'] 82 | return download(model_info['url'], 83 | model_info['fname'], 84 | model_info['chksum'], 85 | cachedir=cachedir) 86 | -------------------------------------------------------------------------------- /py_ner/model/...: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_ner/model/optimizers.py: -------------------------------------------------------------------------------- 1 | from transformers import PreTrainedModel 2 | import torch 3 | from typing import Union 4 | 5 | 6 | def get_optimizer_with_weight_decay(model: PreTrainedModel, 7 | optimizer: torch.optim.Optimizer, 8 | learning_rate: Union[float, int], 9 | weight_decay: Union[float, int]) -> torch.optim.Optimizer: 10 | """ 11 | Apply weight decay to all the network parameters but those called `bias` or `LayerNorm.weight`. 12 | Args: 13 | model (`PreTrainedModel`): model to apply weight decay. 14 | optimizer (`torch.optim.Optimizer`): The optimizer to use during training. 15 | learning_rate (`float` or `int`): value of the learning rate to use during training. 16 | weight_decay (`float` or `int`): value of the weight decay to apply. 17 | 18 | Returns: 19 | optimizer (`torch.optim.Optimizer`): the optimizer instantiated with the selected 20 | learning rate and the parameters with and without weight decay. 
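# Usage sketch (not part of the original sources): get_tokenizer above only downloads the
# SentencePiece model file; wrapping it in gluonnlp's SentencepieceTokenizer gives an actual
# tokenizer. The Korean sentence below is an illustrative example.
from gluonnlp.data import SentencepieceTokenizer
from py_ner.kobert.utils import get_tokenizer

tok_path = get_tokenizer()              # cached under ./ptr_lm_model by default
sp = SentencepieceTokenizer(tok_path)
print(sp('한국어 모델을 공유합니다.'))  # list of subword pieces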
21 | 22 | """ 23 | no_decay = ["bias", "LayerNorm.weight"] 24 | params = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)] 25 | params_nd = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)] 26 | optimizer_grouped_parameters = [{"params": params, "weight_decay": weight_decay}, 27 | {"params": params_nd, "weight_decay": 0.0}] 28 | 29 | return optimizer(optimizer_grouped_parameters, lr=learning_rate) -------------------------------------------------------------------------------- /py_ner/model/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Union, Dict 2 | import json 3 | 4 | class Config: 5 | def __init__(self, json_path): 6 | with open(json_path, mode='r') as io: 7 | params = json.loads(io.read()) 8 | self.__dict__.update(params) 9 | 10 | def save(self, json_path): 11 | with open(json_path, mode='w') as io: 12 | json.dump(self.__dict__, io, indent=4) 13 | 14 | def update(self, json_path): 15 | with open(json_path, mode='r') as io: 16 | params = json.loads(io.read()) 17 | self.__dict__.update(params) 18 | 19 | @property 20 | def dict(self): 21 | return self.__dict__ 22 | 23 | 24 | 25 | class PadSequence: 26 | """PadSequence class""" 27 | 28 | def __init__(self, length: int, pad_val: int = 0, clip: bool = True) -> None: 29 | """Instantiating PadSequence class 30 | Args: 31 | length (int): the maximum length to pad/clip the sequence 32 | pad_val (int): the pad value 33 | clip (bool): whether to clip the length, if sample length is longer than maximum length 34 | """ 35 | self._length = length 36 | self._pad_val = pad_val 37 | self._clip = clip 38 | 39 | def __call__(self, sample): 40 | sample_length = len(sample) 41 | if sample_length >= self._length: 42 | if self._clip and sample_length > self._length: 43 | return sample[: self._length] 44 | else: 45 | return sample 46 | else: 47 | return sample + [self._pad_val for _ in range(self._length - sample_length)] -------------------------------------------------------------------------------- /py_ner/ner_crf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sklearn.model_selection import train_test_split 4 | from sklearn_crfsuite import CRF 5 | from sklearn_crfsuite.metrics import flat_f1_score 6 | from sklearn_crfsuite.metrics import flat_classification_report 7 | 8 | #Reading the csv file 9 | df = pd.read_csv('data/ner_dataset.csv', encoding = "ISO-8859-1") 10 | 11 | #Display first 10 rows 12 | print(str(df.head(10))) 13 | 14 | print(str(df.describe())) 15 | 16 | #Displaying the unique Tags 17 | print(str(df['Tag'].unique())) 18 | 19 | 20 | #Checking null values, if any. 21 | df.isnull().sum() 22 | 23 | df = df.fillna(method = 'ffill') 24 | 25 | # This is a class te get sentence. The each sentence will be list of tuples with its tag and pos. 
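# Usage sketch (not part of the original sources): PadSequence from py_ner/model/utils.py
# pads or clips a token-id list to a fixed length; length 8 and pad value 0 are illustrative.
from py_ner.model.utils import PadSequence

pad = PadSequence(length=8, pad_val=0, clip=True)
print(pad([3, 14, 15, 92]))     # [3, 14, 15, 92, 0, 0, 0, 0]
print(pad(list(range(12))))     # clipped to the first 8 ids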
26 | class sentence(object): 27 | def __init__(self, df): 28 | self.n_sent = 1 29 | self.df = df 30 | self.empty = False 31 | agg = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 32 | s['POS'].values.tolist(), 33 | s['Tag'].values.tolist())] 34 | self.grouped = self.df.groupby("Sentence #").apply(agg) 35 | self.sentences = [s for s in self.grouped] 36 | 37 | def get_text(self): 38 | try: 39 | s = self.grouped['Sentence: {}'.format(self.n_sent)] 40 | self.n_sent += 1 41 | return s 42 | except: 43 | return None 44 | 45 | #Displaying one full sentence 46 | getter = sentence(df) 47 | sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences] 48 | sentences[0] 49 | 50 | #sentence with its pos and tag. 51 | sent = getter.get_text() 52 | print(sent) 53 | 54 | sentences = getter.sentences 55 | 56 | def word2features(sent, i): 57 | word = sent[i][0] 58 | postag = sent[i][1] 59 | 60 | features = { 61 | 'bias': 1.0, 62 | 'word.lower()': word.lower(), 63 | 'word[-3:]': word[-3:], 64 | 'word[-2:]': word[-2:], 65 | 'word.isupper()': word.isupper(), 66 | 'word.istitle()': word.istitle(), 67 | 'word.isdigit()': word.isdigit(), 68 | 'postag': postag, 69 | 'postag[:2]': postag[:2], 70 | } 71 | if i > 0: 72 | word1 = sent[i-1][0] 73 | postag1 = sent[i-1][1] 74 | features.update({ 75 | '-1:word.lower()': word1.lower(), 76 | '-1:word.istitle()': word1.istitle(), 77 | '-1:word.isupper()': word1.isupper(), 78 | '-1:postag': postag1, 79 | '-1:postag[:2]': postag1[:2], 80 | }) 81 | else: 82 | features['BOS'] = True 83 | if i < len(sent)-1: 84 | word1 = sent[i+1][0] 85 | postag1 = sent[i+1][1] 86 | features.update({ 87 | '+1:word.lower()': word1.lower(), 88 | '+1:word.istitle()': word1.istitle(), 89 | '+1:word.isupper()': word1.isupper(), 90 | '+1:postag': postag1, 91 | '+1:postag[:2]': postag1[:2], 92 | }) 93 | else: 94 | features['EOS'] = True 95 | 96 | return features 97 | 98 | 99 | def sent2features(sent): 100 | return [word2features(sent, i) for i in range(len(sent))] 101 | 102 | def sent2labels(sent): 103 | return [label for token, postag, label in sent] 104 | 105 | def sent2tokens(sent): 106 | return [token for token, postag, label in sent] 107 | 108 | X = [sent2features(s) for s in sentences] 109 | y = [sent2labels(s) for s in sentences] 110 | 111 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) 112 | 113 | crf = CRF(algorithm = 'lbfgs', 114 | c1 = 0.1, 115 | c2 = 0.1, 116 | max_iterations = 100, 117 | all_possible_transitions = False) 118 | crf.fit(X_train, y_train) 119 | 120 | #Predicting on the test set. 
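# Usage sketch (not part of the original script): after crf.fit it can be informative to look
# at the learned label-transition weights before predicting; sklearn-crfsuite exposes them as
# crf.transition_features_, a dict keyed by (label_from, label_to).
from collections import Counter

for (label_from, label_to), weight in Counter(crf.transition_features_).most_common(10):
    print('%-10s -> %-10s %.3f' % (label_from, label_to, weight))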
121 | y_pred = crf.predict(X_test) 122 | 123 | f1_score = flat_f1_score(y_test, y_pred, average = 'weighted') 124 | print(f1_score) 125 | 126 | report = flat_classification_report(y_test, y_pred) 127 | print(report) 128 | 129 | -------------------------------------------------------------------------------- /py_node2vec/node2vecModel.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import networkx as nx 3 | from node2vec import Node2Vec 4 | 5 | # Embed edges using Hadamard method 6 | from node2vec.edges import HadamardEmbedder 7 | import multiprocessing 8 | 9 | 10 | class Node2VecModel: 11 | def __init__(self): 12 | self.model = None 13 | self.G = nx.Graph() 14 | 15 | def create_random_graph(self): 16 | # Create a graph 17 | self.G = nx.fast_gnp_random_graph(n=100, p=0.5) 18 | 19 | def create_graph(self, co_occurrence, word_hist, threshold): 20 | filtered_word_list = [] 21 | for pair in co_occurrence: 22 | node1 = '' 23 | node2 = '' 24 | for inner_pair in pair: 25 | if type(inner_pair) is tuple: 26 | node1 = inner_pair[0] 27 | node2 = inner_pair[1] 28 | elif type(inner_pair) is str: 29 | inner_pair = inner_pair.split() 30 | if len(inner_pair) == 2: 31 | node1 = inner_pair[0] 32 | node2 = inner_pair[1] 33 | elif type(inner_pair) is int: 34 | if float(inner_pair) >= threshold: 35 | # print ("X " + node1 + " == " + node2 + " == " + str(inner_pair) + " : " + str(tuple[node1])) 36 | self.G.add_edge(node1, node2, weight=float(inner_pair)) 37 | if node1 not in filtered_word_list: 38 | filtered_word_list.append(node1) 39 | if node2 not in filtered_word_list: 40 | filtered_word_list.append(node2) 41 | elif type(inner_pair) is float: 42 | if float(inner_pair) >= threshold: 43 | # print ("X " + node1 + " == " + node2 + " == " + str(inner_pair) + " : ") 44 | self.G.add_edge(node1, node2, weight=float(inner_pair)) 45 | if node1 not in filtered_word_list: 46 | filtered_word_list.append(node1) 47 | if node2 not in filtered_word_list: 48 | filtered_word_list.append(node2) 49 | 50 | for word in word_hist: 51 | if str(word) in filtered_word_list: 52 | self.G.add_node(word, count=word_hist[word]) 53 | 54 | print(self.G.number_of_nodes()) 55 | 56 | def train(self, dimensions, walk_length, num_walks): 57 | cores = multiprocessing.cpu_count() # Count the number of cores in a computer 58 | # Precompute probabilities and generate walks 59 | node2vec = Node2Vec(self.G, 60 | dimensions=dimensions, 61 | walk_length=walk_length, 62 | num_walks=num_walks, 63 | workers=cores - 1) 64 | 65 | ## if d_graph is big enough to fit in the memory, pass temp_folder which has enough disk space 66 | # Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs 67 | # node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder="/mnt/tmp_data") 68 | 69 | # Embed 70 | self.model = node2vec.fit(window=10, min_count=1, 71 | batch_words=4) # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor) 72 | 73 | def save_model(self, embedding_filename, embedding_model_file): 74 | # Save embeddings for later use 75 | self.model.wv.save_word2vec_format(embedding_filename) 76 | 77 | # Save model for later use 78 | self.model.save(embedding_model_file) 79 | 80 | def load_model(self, embedding_filename): 81 | self.model = gensim.models.KeyedVectors.load_word2vec_format(embedding_filename) 82 | 83 | def most_similars(self, word): 84 | # Look for 
most similar nodes 85 | return self.model.wv.most_similar(word) # Output node names are always strings 86 | 87 | def compute_similarity(self, first_node, second_node): 88 | edges_embs = HadamardEmbedder(keyed_vectors=self.model.wv) 89 | 90 | # Look for embeddings on the fly - here we pass normal tuples 91 | edges_embs[(first_node, second_node)] 92 | ''' OUTPUT 93 | array([ 5.75068220e-03, -1.10937878e-02, 3.76693785e-01, 2.69105062e-02, 94 | ... ... .... 95 | ..................................................................], 96 | dtype=float32) 97 | ''' 98 | 99 | # Get all edges in a separate KeyedVectors instance - use with caution could be huge for big networks 100 | edges_kv = edges_embs.as_keyed_vectors() 101 | 102 | # Look for most similar edges - this time tuples must be sorted and as str 103 | results = edges_kv.most_similar(str((first_node, second_node))) 104 | 105 | # Save embeddings for later use 106 | # edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME) 107 | 108 | return results -------------------------------------------------------------------------------- /py_topic_model/__init__.py: -------------------------------------------------------------------------------- 1 | from py_topic_model import * -------------------------------------------------------------------------------- /py_topic_model/gdmr_plot.py: -------------------------------------------------------------------------------- 1 | 2 | import tomotopy as tp 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import matplotlib.colors as clr 6 | 7 | class ExpNormalize(clr.Normalize): 8 | def __init__(self, scale): 9 | super().__init__() 10 | self.scale = scale 11 | 12 | def __call__(self, value, clip=None): 13 | if clip is None: 14 | clip = self.clip 15 | 16 | result, is_scalar = self.process_value(value) 17 | 18 | self.autoscale_None(result) 19 | (vmin,), _ = self.process_value(self.vmin) 20 | (vmax,), _ = self.process_value(self.vmax) 21 | if vmin == vmax: 22 | result.fill(0) 23 | elif vmin > vmax: 24 | raise ValueError("minvalue must be less than or equal to maxvalue") 25 | else: 26 | if clip: 27 | mask = np.ma.getmask(result) 28 | result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax), 29 | mask=mask) 30 | resdat = result.data 31 | resdat = 1 - np.exp(-2 * resdat / self.scale) 32 | result = np.ma.array(resdat, mask=result.mask, copy=False) 33 | if is_scalar: 34 | result = result[0] 35 | return result 36 | 37 | heat = clr.LinearSegmentedColormap.from_list('heat', 38 | [(0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 1, 0), (1, 1, 0), (1, 0, 0), (1, 1, 1)], 39 | N=1024 40 | ) 41 | 42 | corpus = tp.utils.Corpus() 43 | for line in open('./topic_model/dataset2.txt', encoding='utf-8'): 44 | fd = line.strip().split() 45 | corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2]))) 46 | 47 | # We set a range of the first metadata as [2000, 2017] 48 | # and one of the second metadata as [0, 1]. 
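# Usage sketch (not part of the original module): driving the Node2VecModel class above on a
# random graph. The dimensions/walk settings and the queried node are illustrative; node
# names are looked up as strings, as noted in most_similars.
from py_node2vec.node2vecModel import Node2VecModel

n2v = Node2VecModel()
n2v.create_random_graph()                                   # 100-node random graph
n2v.train(dimensions=64, walk_length=30, num_walks=200)
n2v.save_model('node_embeddings.emb', 'node2vec.model')     # hypothetical output paths
print(n2v.most_similars('2'))                               # nearest nodes to node "2"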
49 | mdl = tp.GDMRModel(tw=tp.TermWeight.PMI, k=10, degrees=[4, 3], 50 | alpha=1e-2, sigma=0.25, sigma0=3.0, 51 | metadata_range=[(2000, 2017), (0, 1)], corpus=corpus 52 | ) 53 | mdl.optim_interval = 20 54 | mdl.burn_in = 200 55 | 56 | mdl.train(0) 57 | 58 | print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( 59 | len(mdl.docs), len(mdl.used_vocabs), mdl.num_words 60 | )) 61 | 62 | # Let's train the model 63 | for i in range(0, 1000, 20): 64 | print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word)) 65 | mdl.train(20) 66 | print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word)) 67 | 68 | # Let's visualize the result 69 | topic_counts = mdl.get_count_by_topics() 70 | lambdas = mdl.lambdas 71 | 72 | md_range = mdl.metadata_range 73 | # Our topic distribution map has 74 | # 400 pixels for the first axis and 75 | # 200 pixels for the second axis. 76 | r = mdl.tdf_linspace( 77 | [md_range[0][0], md_range[1][0]], 78 | [md_range[0][1], md_range[1][1]], 79 | [400, 200] 80 | ) 81 | 82 | for k in (-topic_counts).argsort(): 83 | print('Topic #{} ({})'.format(k, topic_counts[k])) 84 | print(*(w for w, _ in mdl.get_topic_words(k))) 85 | print('Lambda:', lambdas[k]) 86 | 87 | imgplot = plt.imshow(r[:, :, k].transpose(), clim=(0.0, r[:, :, k].max()), 88 | origin='lower', cmap=heat, norm=ExpNormalize(scale=0.04), 89 | extent=[*md_range[0], *md_range[1]], 90 | aspect='auto' 91 | ) 92 | plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5)))) 93 | plt.colorbar() 94 | plt.show() -------------------------------------------------------------------------------- /py_topic_model/ldaInference.py: -------------------------------------------------------------------------------- 1 | import pyLDAvis.gensim 2 | import pickle 3 | import gensim 4 | 5 | class ldaInference: 6 | def __init__(self, dictionary_model='dictionary.gensim', corpus_model='corpus.pkl', lda_model='model5.gensim'): 7 | self.dictionary = gensim.corpora.Dictionary.load(dictionary_model) 8 | self.corpus = pickle.load(open(corpus_model, 'rb')) 9 | self.lda = gensim.models.ldamodel.LdaModel.load(lda_model) 10 | 11 | def infer(self, document): 12 | test_doc = [self.dictionary.doc2bow(document.split(" "))] 13 | inferred_matrix = self.lda.inference(test_doc) 14 | 15 | return inferred_matrix 16 | 17 | if __name__ == '__main__': 18 | a_document = '한국 시장경제가 위기입니다.' 19 | inferred_topics = ldaInference().infer(a_document) 20 | 21 | print(str(inferred_topics)) 22 | -------------------------------------------------------------------------------- /py_topic_model/ldaSeqModel.py: -------------------------------------------------------------------------------- 1 | import numpy # for arrays, array broadcasting etc. 
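# Usage sketch (not part of the original module): LdaModel.inference, used by ldaInference
# above, returns a (gamma, sstats) pair; normalising gamma row-wise gives per-document topic
# proportions. This assumes the pretrained artifacts named in the ldaInference defaults exist.
import numpy as np
from py_topic_model.ldaInference import ldaInference

gamma, _ = ldaInference().infer('한국 시장경제가 위기입니다.')
topic_proportions = gamma / gamma.sum(axis=1, keepdims=True)
print(topic_proportions.round(3))    # one row per document, one column per topic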
2 | from gensim.models import ldaseqmodel, ldamodel 3 | from gensim.corpora import Dictionary 4 | import os.path 5 | import logging 6 | 7 | module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder 8 | 9 | class ldaSeqModel(): 10 | 11 | def __init__(self): 12 | name = 'ldaSeqModel' 13 | 14 | def run(self, document_collection, topic_count=2, time_group=[10,10,11]): 15 | """document_collection should be sorted in order of time_slice.""" 16 | dictionary = Dictionary(document_collection) 17 | corpus = [dictionary.doc2bow(text) for text in document_collection] 18 | ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=topic_count, time_slice=time_group) 19 | 20 | topics = ldaseq.print_topics(1) 21 | for topic in topics: 22 | print("TOPIC " + str(topic)) 23 | 24 | return ldaseq 25 | 26 | def parseDocuments(self, document_file, year_index, document_index): 27 | """make document along with time information""" 28 | dict = {} 29 | with open(document_file, encoding='utf-8') as ins: 30 | for line in ins: 31 | #print("LINE " + line) 32 | fields = line.split('\t') 33 | _year = fields[year_index] 34 | _document = fields[document_index] 35 | 36 | if _year not in dict: 37 | d = [] 38 | d.append(_document) 39 | dict[_year] = d 40 | 41 | else: 42 | print("DOC " + _year) 43 | _docu_ = dict.get(_year) 44 | _docu_.append(_document) 45 | return dict 46 | 47 | def parseProcessedText(self, processed_documents, pair_map): 48 | """make document along with time information""" 49 | dict = {} 50 | for doc in processed_documents: 51 | for line in doc: 52 | #print("LINE " + line) 53 | fields = line.split('\t') 54 | _year = fields[year_index] 55 | _document = fields[document_index] 56 | 57 | if _year not in dict: 58 | d = [] 59 | d.append(_document) 60 | dict[_year] = d 61 | 62 | else: 63 | print("DOC " + _year) 64 | _docu_ = dict.get(_year) 65 | _docu_.append(_document) 66 | return dict 67 | 68 | if __name__ == '__main__': 69 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 70 | document_file = "../time_test.txt" 71 | year_index = 0 72 | document_index = 1 73 | _dict = ldaSeqModel().parseDocuments(document_file, year_index, document_index) 74 | 75 | #import pyTextMiner as ptm 76 | #corpus = ptm.CorpusFromFieldDelimitedFileWithYear('time_test.txt', 1, 0) 77 | #pair_map = corpus.pair_map 78 | 79 | time_slice = [] 80 | key_size = len(_dict) 81 | doc_coll = _dict.values() 82 | for k, v in _dict.items(): 83 | time_slice.append(len(v)) 84 | ldaSeqModel().run(doc_coll,5,time_slice) 85 | -------------------------------------------------------------------------------- /py_topic_model/ldaVisualizer.py: -------------------------------------------------------------------------------- 1 | import pyLDAvis.gensim 2 | import pickle 3 | import gensim 4 | 5 | dictionary = gensim.corpora.Dictionary.load('dictionary.gensim') 6 | corpus = pickle.load(open('corpus.pkl', 'rb')) 7 | lda = gensim.models.ldamodel.LdaModel.load('model5.gensim') 8 | 9 | lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False) 10 | #pyLDAvis.display(lda_display) 11 | 12 | pyLDAvis.save_html(lda_display, 'vis.html') 13 | 14 | from gensim.test.utils import common_corpus 15 | 16 | print(str(common_corpus)) -------------------------------------------------------------------------------- /py_topic_model/tfidf.py: -------------------------------------------------------------------------------- 1 | from gensim import corpora, 
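# Usage sketch (not part of the original module): once ldaSeqModel().run(...) has returned a
# trained gensim LdaSeqModel, print_topic_times shows how a single topic drifts across the
# time slices. The four toy documents and two time slices below are illustrative only.
from py_topic_model.ldaSeqModel import ldaSeqModel

docs = [['economy', 'market', 'crisis'], ['market', 'growth'],
        ['election', 'policy'], ['policy', 'reform', 'vote']]
ldaseq = ldaSeqModel().run(docs, topic_count=2, time_group=[2, 2])
for time_step, words in enumerate(ldaseq.print_topic_times(topic=0)):
    print('time slice', time_step, ':', words)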
models, similarities 2 | import pickle 3 | import gensim 4 | 5 | class tfidf: 6 | def __init__(self): 7 | name = 'tfidf' 8 | 9 | def createDictionary(self, text_data): 10 | dictionary = corpora.Dictionary(text_data) 11 | corpus = [dictionary.doc2bow(text) for text in text_data] 12 | 13 | pickle.dump(corpus, open('corpus.pkl', 'wb')) 14 | dictionary.save('dictionary.gensim') 15 | 16 | return corpus, dictionary 17 | 18 | def run(self, text_data): 19 | _corpus, dictionary = self.createDictionary(text_data) 20 | tf_idf = models.TfidfModel(_corpus) # step 1 -- initialize a model 21 | corpus_tfidf = tf_idf[_corpus] 22 | for doc in corpus_tfidf: 23 | print(doc) 24 | 25 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5) # initialize an LSI transformation 26 | result = lsi.print_topics(5,20) 27 | for a_topic in result: 28 | print("LSI results " + str(a_topic)) 29 | 30 | corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 31 | #for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly 32 | #print(doc) 33 | 34 | if __name__ == '__main__': 35 | import pyTextMiner as ptm 36 | import io 37 | import nltk 38 | 39 | corpus = ptm.CorpusFromFile('../donald.txt') 40 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), 41 | ptm.helper.POSFilter('NN*'), 42 | ptm.helper.SelectWordOnly(), 43 | ptm.helper.StopwordFilter(file='../stopwordsKor.txt'), 44 | ptm.ngram.NGramTokenizer(3)) 45 | 46 | result = pipeline.processCorpus(corpus) 47 | 48 | id = 0 49 | text_data = [] 50 | for doc in result: 51 | new_doc = [] 52 | for sent in doc: 53 | for _str in sent: 54 | if len(_str) > 0: 55 | new_doc.append(_str) 56 | text_data.append(new_doc) 57 | id += 1 58 | 59 | tfidf().run(text_data) -------------------------------------------------------------------------------- /py_word2vec/__init__.py: -------------------------------------------------------------------------------- 1 | from py_word2vec import * 2 | -------------------------------------------------------------------------------- /py_word2vec/avgDocumentByW2V.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | from gensim import utils 3 | import numpy as np 4 | import sys 5 | from sklearn.datasets import fetch_20newsgroups 6 | from nltk import word_tokenize 7 | from nltk import download 8 | from nltk.corpus import stopwords 9 | import matplotlib.pyplot as plt 10 | from sklearn.decomposition import PCA 11 | 12 | from matplotlib import pyplot 13 | import pyTextMiner as ptm 14 | 15 | #model Google News, run once to download pre-trained vectors 16 | #!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz 17 | model = gensim.models.KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True) 18 | 19 | # Fetch ng20 dataset 20 | ng20 = fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes')) 21 | # text and ground truth labels 22 | texts, y = ng20.data, ng20.target 23 | 24 | #corpus = [preprocess(text) for text in texts] 25 | pipeline = ptm.Pipeline(ptm.splitter.NLTK(), 26 | ptm.tokenizer.Word(), 27 | ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'), 28 | ptm.stemmer.Porter()) 29 | result = pipeline.processCorpus(texts) 30 | corpus = [] 31 | for doc in result: 32 | document = [] 33 | for sent in doc: 34 | for word in sent: 35 | document.append(word) 36 | 
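# Usage sketch (not part of the original module): the similarities module imported in
# tfidf.py can rank documents against a query once the bow->tfidf->LSI chain is built.
# The three toy documents and the query are illustrative assumptions.
from gensim import corpora, models, similarities

texts = [['korea', 'economy', 'market'], ['market', 'crisis'], ['election', 'policy']]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]
tfidf_model = models.TfidfModel(bow)
lsi = models.LsiModel(tfidf_model[bow], id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[tfidf_model[bow]])
query = lsi[tfidf_model[dictionary.doc2bow(['economy', 'crisis'])]]
print(sorted(enumerate(index[query]), key=lambda x: -x[1]))   # documents ranked by similarity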
corpus.append(document) 37 | 38 | # ### Remove empty docs 39 | def filter_docs(corpus, texts, labels, condition_on_doc): 40 | """ 41 | Filter corpus, texts and labels given the function condition_on_doc which takes 42 | a doc. 43 | The document doc is kept if condition_on_doc(doc) is true. 44 | """ 45 | number_of_docs = len(corpus) 46 | 47 | if texts is not None: 48 | texts = [text for (text, doc) in zip(texts, corpus) 49 | if condition_on_doc(doc)] 50 | 51 | labels = [i for (i, doc) in zip(labels, corpus) if condition_on_doc(doc)] 52 | corpus = [doc for doc in corpus if condition_on_doc(doc)] 53 | 54 | print("{} docs removed".format(number_of_docs - len(corpus))) 55 | 56 | return (corpus, texts, labels) 57 | 58 | corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: (len(doc) != 0)) 59 | 60 | # ### Remove OOV words and documents with no words in model dictionary 61 | def document_vector(word2vec_model, doc): 62 | # remove out-of-vocabulary words 63 | doc = [word for word in doc if word in word2vec_model.vocab] 64 | return np.mean(word2vec_model[doc], axis=0) 65 | 66 | def has_vector_representation(word2vec_model, doc): 67 | """check if at least one word of the document is in the 68 | word2vec dictionary""" 69 | return not all(word not in word2vec_model.vocab for word in doc) 70 | 71 | corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: has_vector_representation(model, doc)) 72 | 73 | x =[] 74 | for doc in corpus: #look up each doc in model 75 | x.append(document_vector(model, doc)) 76 | 77 | X = np.array(x) #list to array 78 | 79 | 80 | np.savetxt('documents_vectors.txt', X) 81 | np.savetxt('labels.txt', y) 82 | 83 | print(str(X.shape) + " " + str(len(y))) 84 | 85 | # ### Sanity check 86 | print(texts[4664]) 87 | 88 | print(str(y[4664]) + " " + str(ng20.target_names[11])) 89 | 90 | # ### Plot 2 PCA components 91 | pca = PCA(n_components=2) 92 | x_pca = pca.fit_transform(X) 93 | 94 | plt.figure(1, figsize=(30, 20),) 95 | plt.scatter(x_pca[:, 0], x_pca[:, 1],s=100, c=y, alpha=0.2) 96 | plt.savefig('doc_vector_PCA.png', dpi=100) 97 | 98 | # ### Plot t-SNE 99 | from sklearn.manifold import TSNE 100 | X_tsne = TSNE(n_components=2, verbose=2).fit_transform(X) 101 | 102 | 103 | plt.figure(1, figsize=(30, 20),) 104 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1],s=100, c=y, alpha=0.2) 105 | plt.show() 106 | -------------------------------------------------------------------------------- /py_word2vec/gloveWikiKoreanTrainer.py: -------------------------------------------------------------------------------- 1 | 2 | #see in glove-win_devc_x64, demo.sh or demo.bat 3 | ''' 4 | MacOS and Linux 5 | Go to: https://github.com/stanfordnlp/GloVe 6 | 7 | $ git clone http://github.com/stanfordnlp/glove 8 | $ cd glove && make 9 | $ ./demo.sh 10 | 11 | 12 | Windows 10 13 | https://github.com/anoidgit/GloVe-win 14 | 15 | ''' 16 | -------------------------------------------------------------------------------- /py_word2vec/utils.py: -------------------------------------------------------------------------------- 1 | from keras.utils import np_utils 2 | from keras.preprocessing.text import Tokenizer 3 | import numpy as np 4 | 5 | 6 | def tokenize(corpus): 7 | """ 8 | Tokenize the corpus of text. 
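# Usage sketch (not part of the original script): the averaged document vectors X and labels y
# built above are a natural input to a linear classifier; the split ratio and
# LogisticRegression settings are illustrative assumptions, not taken from this repository.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))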
9 | :param corpus: list containing a string of text (example: ["I like playing football with my friends"]) 10 | :return corpus_tokenized: indexed list of words in the corpus, in the same order as the original corpus (the example above would return [[1, 2, 3, 4]]) 11 | :return V: size of vocabulary 12 | """ 13 | tokenizer = Tokenizer() 14 | tokenizer.fit_on_texts(corpus) 15 | corpus_tokenized = tokenizer.texts_to_sequences(corpus) 16 | V = len(tokenizer.word_index) 17 | return corpus_tokenized, V 18 | 19 | 20 | def initialize(V, N): 21 | """ 22 | Initialize the weights of the neural network. 23 | :param V: size of the vocabulary 24 | :param N: size of the hidden layer 25 | :return: weights W1, W2 26 | """ 27 | np.random.seed(100) 28 | W1 = np.random.rand(V, N) 29 | W2 = np.random.rand(N, V) 30 | 31 | return W1, W2 32 | 33 | 34 | def corpus2io(corpus_tokenized, V, window_size): 35 | """Converts corpus text into context and center words 36 | # Arguments 37 | corpus_tokenized: corpus text 38 | window_size: size of context window 39 | # Returns 40 | context and center words (arrays) 41 | """ 42 | for words in corpus_tokenized: 43 | L = len(words) 44 | for index, word in enumerate(words): 45 | contexts = [] 46 | center = [] 47 | s = index - window_size 48 | e = index + window_size + 1 49 | contexts = contexts + [words[i]-1 for i in range(s, e) if 0 <= i < L and i != index] 50 | center.append(word-1) 51 | # x has shape c x V where c is size of contexts 52 | x = np_utils.to_categorical(contexts, V) 53 | # y has shape k x V where k is number of center words 54 | y = np_utils.to_categorical(center, V) 55 | yield (x, y) 56 | 57 | 58 | 59 | def softmax(x): 60 | """Calculate softmax based probability for given input vector 61 | # Arguments 62 | x: numpy array/list 63 | # Returns 64 | softmax of input array 65 | """ 66 | e_x = np.exp(x - np.max(x)) 67 | return e_x / e_x.sum(axis=0) -------------------------------------------------------------------------------- /py_word2vec/visualizeW2VPlot.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from builtins import zip 4 | 5 | import numpy as np 6 | from sklearn.decomposition import PCA 7 | from matplotlib import pyplot 8 | 9 | import matplotlib.pyplot as plt 10 | from sklearn.manifold import TSNE 11 | import gensim 12 | 13 | class visualizeW2VPlot: 14 | def __init__(self): 15 | name = 'visualizeW2VPlot' 16 | 17 | def load(self, modelFile): 18 | model = gensim.models.KeyedVectors.load_word2vec_format(modelFile, binary=True, unicode_errors='ignore') 19 | return model 20 | 21 | def visualizePCA(self, model): 22 | pyplot.rc('font', family='New Gulim') 23 | 24 | words = ['이재명', '문재인', '승인', '당', '핵', '평화', '정치인', '대표'] 25 | 26 | word_vectors = np.vstack([model[w] for w in words]) 27 | twodim = PCA().fit_transform(word_vectors)[:, :2] 28 | twodim.shape 29 | plt.figure(figsize=(5, 5)) 30 | plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r') 31 | for word, (x, y) in zip(words, twodim): 32 | plt.text(x, y, word) 33 | plt.axis('off'); 34 | 35 | fig1 = plt.gcf() 36 | plt.show() 37 | plt.draw() 38 | fig1.savefig('testPCA.png', dpi=100) 39 | 40 | 41 | def visualizeTSNE(self, model, word, vector_size): 42 | pyplot.rc('font', family='New Gulim') 43 | 44 | arr = np.empty((0, vector_size), dtype='f') 45 | word_labels = [word] 46 | 47 | # get close words 48 | close_words = model.similar_by_word(word,topn=20) 49 | 50 | # add the vector for each of the closest words to the array 
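# Usage sketch (not part of the original module): one CBOW forward step built from the helpers
# in py_word2vec/utils.py. The toy corpus, hidden size N=10 and window size 2 are illustrative.
import numpy as np
from py_word2vec.utils import tokenize, initialize, corpus2io, softmax

corpus_tokenized, V = tokenize(["I like playing football with my friends"])
W1, W2 = initialize(V, 10)

for x, y in corpus2io(corpus_tokenized, V, window_size=2):
    h = np.mean(np.dot(x, W1), axis=0)     # average of the one-hot contexts projected by W1
    y_pred = softmax(np.dot(h, W2))        # predicted distribution over the V vocabulary words
    print('center word index:', y.argmax(), 'predicted index:', y_pred.argmax())
    break                                  # one step is enough for the illustration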
51 | arr = np.append(arr, np.array([model[word]]), axis=0) 52 | for wrd_score in close_words: 53 | wrd_vector = model[wrd_score[0]] 54 | word_labels.append(wrd_score[0]) 55 | arr = np.append(arr, np.array([wrd_vector]), axis=0) 56 | 57 | # find tsne coords for 2 dimensions 58 | tsne = TSNE(n_components=2, random_state=0) 59 | np.set_printoptions(suppress=True) 60 | Y = tsne.fit_transform(arr) 61 | 62 | x_coords = Y[:, 0] 63 | y_coords = Y[:, 1] 64 | # display scatter plot 65 | plt.scatter(x_coords, y_coords) 66 | 67 | for label, x, y in zip(word_labels, x_coords, y_coords): 68 | plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points') 69 | plt.xlim(x_coords.min() + 0.00005, x_coords.max() + 0.00005) 70 | plt.ylim(y_coords.min() + 0.00005, y_coords.max() + 0.00005) 71 | plt.show() 72 | 73 | if __name__ == '__main__': 74 | model_file = '../embeddings/word2vec/korean_wiki_w2v.bin' 75 | model = visualizeW2VPlot().load(model_file) 76 | mode = 't-sne' # t-sne or pca 77 | if mode == 'pca': 78 | visualizeW2VPlot().visualizePCA(model) 79 | elif mode == 't-sne': 80 | vector_size = 300 81 | visualizeW2VPlot().visualizeTSNE(model, '이재명', vector_size) -------------------------------------------------------------------------------- /stopwords/stopwordsEng.txt: -------------------------------------------------------------------------------- 1 | photograph 2 | a 3 | about 4 | above 5 | after 6 | again 7 | against 8 | all 9 | am 10 | an 11 | and 12 | any 13 | are 14 | aren't 15 | as 16 | at 17 | be 18 | because 19 | been 20 | before 21 | being 22 | below 23 | between 24 | both 25 | but 26 | by 27 | can't 28 | cannot 29 | could 30 | couldn't 31 | did 32 | didn't 33 | do 34 | does 35 | doesn't 36 | doing 37 | don't 38 | down 39 | during 40 | each 41 | few 42 | for 43 | from 44 | further 45 | had 46 | hadn't 47 | has 48 | hasn't 49 | have 50 | haven't 51 | having 52 | he 53 | he'd 54 | he'll 55 | he's 56 | her 57 | here 58 | here's 59 | hers 60 | herself 61 | him 62 | himself 63 | his 64 | how 65 | how's 66 | i 67 | i'd 68 | i'll 69 | i'm 70 | i've 71 | if 72 | in 73 | into 74 | is 75 | isn't 76 | it 77 | it's 78 | its 79 | itself 80 | let's 81 | me 82 | more 83 | most 84 | mustn't 85 | my 86 | myself 87 | no 88 | nor 89 | not 90 | of 91 | off 92 | on 93 | once 94 | only 95 | or 96 | other 97 | ought 98 | our 99 | ours ourselves 100 | out 101 | over 102 | own 103 | same 104 | shan't 105 | she 106 | she'd 107 | she'll 108 | she's 109 | should 110 | shouldn't 111 | so 112 | some 113 | such 114 | than 115 | that 116 | that's 117 | the 118 | their 119 | theirs 120 | them 121 | themselves 122 | then 123 | there 124 | there's 125 | these 126 | they 127 | they'd 128 | they'll 129 | they're 130 | they've 131 | this 132 | those 133 | through 134 | to 135 | too 136 | under 137 | until 138 | up 139 | very 140 | was 141 | wasn't 142 | we 143 | we'd 144 | we'll 145 | we're 146 | we've 147 | were 148 | weren't 149 | what 150 | what's 151 | when 152 | when's 153 | where 154 | where's 155 | which 156 | while 157 | who 158 | who's 159 | whom 160 | why 161 | why's 162 | with 163 | won't 164 | would 165 | wouldn't 166 | you 167 | you'd 168 | you'll 169 | you're 170 | you've 171 | your 172 | yours 173 | yourself 174 | yourselves -------------------------------------------------------------------------------- /stopwords/stopwordsKor.txt: -------------------------------------------------------------------------------- 1 | 유전 2 | 단독_보조금 3 | 청원 4 | 모식도 5 | 이 6 | 라 7 | 의 8 | 네 9 | 은 10 | 야 11 | 아 12 | 있 13 | 하 14 | 것 
15 | 들 16 | 그 17 | 되 18 | 수 19 | 이 20 | 보 21 | 않 22 | 없 23 | 나 24 | 사람 25 | 주 26 | 아니 27 | 등 28 | 같 29 | 우리 30 | 때 31 | 년 32 | 가 33 | 한 34 | 지 35 | 대하 36 | 오 37 | 말 38 | 일 39 | 그렇 40 | 위하 41 | 때문 42 | 그것 43 | 두 44 | 말하 45 | 알 46 | 그러나 47 | 받 48 | 못하 49 | 일 50 | 그런 51 | 또 52 | 문제 53 | 더 54 | 사회 55 | 많 56 | 그리고 57 | 좋 58 | 크 59 | 따르 60 | 중 61 | 나오 62 | 가지 63 | 씨 64 | 시키 65 | 만들 66 | 지금 67 | 생각하 68 | 그러 69 | 속 70 | 하나 71 | 집 72 | 살 73 | 모르 74 | 적 75 | 월 76 | 데 77 | 자신 78 | 안 79 | 어떤 80 | 내 81 | 내 82 | 경우 83 | 명 84 | 생각 85 | 시간 86 | 그녀 87 | 다시 88 | 이런 89 | 앞 90 | 보이 91 | 번 92 | 나 93 | 다른 94 | 어떻 95 | 여자 96 | 개 97 | 전 98 | 들 99 | 사실 100 | 이렇 101 | 점 102 | 싶 103 | 말 104 | 정도 105 | 좀 106 | 원 107 | 잘 108 | 통하 109 | 소리 110 | 놓 111 | 동안 112 | 을 113 | 다음 114 | 연도 115 | 이상 116 | 위 117 | 아래 118 | 간 119 | 대 120 | 각종 121 | 후 122 | 반면 123 | 대부분 124 | 회 125 | 년대 126 | 조 127 | 포함 128 | 차 129 | 산하 130 | 바 131 | 이내 132 | 뿐 133 | 급 134 | 별지 135 | 량 136 | 초기 137 | 미만 138 | 관련 139 | 형 140 | 위주 141 | 외 142 | 한편 143 | 최대한 144 | 그동안 145 | 호 146 | 이후 147 | 과 148 | 당초 149 | 아래 150 | 실 151 | 앞으로 152 | 본래 153 | 이전 154 | 형 155 | 양 156 | 항 157 | 타 158 | 편 159 | 내외 160 | 카 161 | 마 162 | 개월 163 | 동 164 | 단 165 | 그중 166 | 예 167 | 자 168 | 곳 169 | 관련 170 | 르 171 | 다 172 | 척 173 | 분 174 | 선 175 | 칙 176 | 단 177 | 장 178 | 밖 179 | 유 180 | 이외 181 | 국 182 | 경 183 | 미 184 | 만 185 | 건 186 | 일부 187 | 업 188 | 직 189 | 그간 190 | 기 191 | 이하 192 | 이래 193 | 력 194 | 증 195 | 붙임 196 | 개월 197 | 기타 198 | 약간 199 | 향후 200 | 만큼 201 | 화 202 | 기존 203 | 가칭 204 | 보임 205 | 정 206 | 포괄 207 | 나머지 208 | 여명 209 | 실 210 | 올해 211 | 전년 212 | 생수 213 | 여타 214 | 최근 215 | 다수 216 | 추후 217 | 에 218 | 대한 219 | 통해 220 | 등에 221 | 개소 222 | 소 223 | 배 224 | 현 225 | 으 226 | 로 227 | 종 228 | 각각 229 | --------------------------------------------------------------------------------
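# Usage sketch (not part of the original files): the stopword lists above are plain
# one-word-per-line text files, so besides ptm.helper.StopwordFilter they can also be applied
# directly. The token list below is an illustrative example.
def load_stopwords(path):
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}

stopwords_kor = load_stopwords('stopwords/stopwordsKor.txt')
tokens = ['우리', '경제', '위기', '정부', '정책']
print([t for t in tokens if t not in stopwords_kor])    # ['경제', '위기', '정부', '정책']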