├── parse.sh ├── word2vec_tool ├── demo-word.sh ├── demo-classes.sh ├── demo-word-accuracy.sh ├── demo-analogy.sh ├── Makefile ├── demo-phrases.sh ├── demo-phrase-accuracy.sh ├── README.txt ├── distance.c ├── word-analogy.c ├── demo-train-big-model-v1.sh ├── compute-accuracy.c ├── word2phrase.c └── LICENSE ├── data ├── test.txt ├── DBLP.label └── stopwords.txt ├── src ├── preprocessing │ ├── from_raw_to_binary_text.cpp │ ├── from_raw_to_binary.cpp │ └── compute_idf.py ├── online_query │ ├── compute_offset.py │ ├── test_parser.cpp │ ├── segphrase_parser.h │ └── segphrase_parser.cpp ├── postprocessing │ ├── filter_by_support.py │ ├── clean_list_with_wordnet.py │ ├── combine_phrases.cpp │ ├── build_model.cpp │ ├── prune_and_combine.cpp │ ├── kd_tree.h │ ├── qualify_unigrams.cpp │ └── generateNN.cpp ├── frequent_phrase_mining │ ├── frequent_pattern_mining.py │ └── main.py ├── classification │ ├── aho_corasick.h │ ├── predict_quality.cpp │ ├── auto_label_generation.py │ ├── random_forest.h │ └── feature_extraction.cpp ├── utils │ └── helper.h └── model_training │ ├── recompute_features.cpp │ └── adjust_probability.cpp ├── Makefile ├── README.md ├── train_toy.sh ├── train_dblp.sh └── LICENSE /parse.sh: -------------------------------------------------------------------------------- 1 | ./bin/segphrase_parser results/segmentation.model results/salient.csv 0.6 ./data/test.txt ./results/parsed.txt 0 2 | # An alternative output format in terms of phrase offset 3 | # python ./src/online_query/compute_offset.py ./results/parsed.txt ./results/offset.txt 4 | -------------------------------------------------------------------------------- /word2vec_tool/demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /word2vec_tool/demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /data/test.txt: -------------------------------------------------------------------------------- 1 | I love data mining and database. 2 | A database is an organized collection of data. The data is typically organized to model aspects of reality in a way that supports processes requiring information. For example, modelling the availability of rooms in hotels in a way that supports finding a hotel with vacancies. 3 | This is an sentence used to test the consecutive numbers and ending characters in the end of sentences. The version is 1.10. 4 | -------------------------------------------------------------------------------- /word2vec_tool/demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /word2vec_tool/demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /word2vec_tool/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) word2vec.c -o word2vec $(CFLAGS) 9 | word2phrase : word2phrase.c 10 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 11 | distance : distance.c 12 | $(CC) distance.c -o distance $(CFLAGS) 13 | word-analogy : word-analogy.c 14 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 15 | compute-accuracy : compute-accuracy.c 16 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 17 | chmod +x *.sh 18 | 19 | clean: 20 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy -------------------------------------------------------------------------------- /word2vec_tool/demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /word2vec_tool/demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /src/preprocessing/from_raw_to_binary_text.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | const string ENDINGS = ".!?,;:'[]"; 4 | 5 | int main(int argc, char* argv[]) 6 | { 7 | if (argc != 3) { 8 | cerr << "[Usage] " << endl; 9 | return -1; 10 | } 11 | 12 | FILE* in = tryOpen(argv[1], "r"); 13 | vector sentences; 14 | for (;getLine(in);) { 15 | string sentence = ""; 16 | for (int i = 0; line[i]; ++ i) { 17 | char ch = line[i]; 18 | if (ENDINGS.find(ch) != -1) { 19 | if (sentence.size() > 0) { 20 | sentences.push_back(sentence); 21 | } 22 | sentence = ""; 23 | } else { 24 | sentence += ch; 25 | } 26 | } 27 | if (sentence.size() > 0) { 28 | sentences.push_back(sentence); 29 | } 30 | } 31 | fclose(in); 32 | 33 | cerr << "# Sentences = " << sentences.size() << endl; 34 | 35 | FILE* out = tryOpen(argv[2], "wb"); 36 | 37 | Binary::write(out, sentences.size()); 38 | FOR (sentence, sentences) { 39 | Binary::write(out, *sentence); 40 | } 41 | 42 | return 0; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/online_query/compute_offset.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | def main(argv): 5 | if (len(argv) != 2): 6 | print "[Usage] " 7 | return 8 | input_file = argv[0] 9 | output_file = argv[1] 10 | 11 | with open(output_file, 'w') as 
output: 12 | with open(input_file, 'r') as input: 13 | for line in input: 14 | output.write(re.sub('[\[\]]', '', line)) 15 | output.write('Offsets: ') 16 | offset = 0 17 | left = 0 18 | right = 0 19 | bias = 0 20 | for char in line: 21 | if char == '[': 22 | left = offset + 1 23 | if char == ']': 24 | right = offset 25 | bias += 1 26 | output.write('[' + str(left - bias * 2 + 1) + ', ' + str(right - bias * 2 + 1) + ']') 27 | output.write(' (' + line[left:right] + '); ') 28 | offset += 1 29 | output.write('\n') 30 | 31 | if __name__ == "__main__": 32 | main(sys.argv[1 : ]) 33 | -------------------------------------------------------------------------------- /word2vec_tool/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------- 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /src/online_query/test_parser.cpp: -------------------------------------------------------------------------------- 1 | #include "segphrase_parser.h" 2 | 3 | template <class T> 4 | void printVector(vector<T> a) { 5 | for (size_t i = 0; i < a.size(); ++ i) { 6 | cerr << a[i]; 7 | if (i + 1 == a.size()) { 8 | cerr << endl; 9 | } else { 10 | cerr << ", "; 11 | } 12 | } 13 | } 14 | 15 | int main(int argc, char* argv[]) 16 | { 17 | if (argc != 2) { 18 | cerr << "[usage] <model_path>" << endl; 19 | return -1; 20 | } 21 | 22 | string model_path = (string)argv[1]; 23 | SegPhraseParser* parser = new SegPhraseParser(model_path, 0); 24 | cerr << "parser built."
<< endl; 25 | 26 | vector segments = parser->segment("data mining is an area"); 27 | printVector(segments); 28 | 29 | cerr << "Please type in a sentence in a single line (or exit()):" << endl; 30 | while (getLine(stdin)) { 31 | if (strcmp(line, "exit()") == 0) { 32 | break; 33 | } 34 | segments = parser->segment(line); 35 | cerr << "[Segmentation Result]" << endl; 36 | printVector(segments); 37 | cerr << "\nPlease type in a sentence in a single line (or exit()):" << endl; 38 | } 39 | 40 | cerr << "[done]" << endl; 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /src/postprocessing/filter_by_support.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def main(argv): 4 | if len(argv) != 4: 5 | print '[usage] ' 6 | exit(-1) 7 | ranking_list_filename = argv[0] 8 | segmented_corpus_filename = argv[1] 9 | sigma = int(argv[2]) 10 | filtered_output_filename = argv[3] 11 | 12 | support = {} 13 | for line in open(ranking_list_filename): 14 | lexicon = line.split(',')[0] 15 | key ='_'.join(lexicon.lower().split(' ')) 16 | support[key] = 0 17 | for line in open(segmented_corpus_filename): 18 | tokens = line.split() 19 | for token in tokens: 20 | if token in support: 21 | support[token] += 1 22 | out = open(filtered_output_filename, 'w') 23 | filtered_cnt = 0 24 | keep_cnt = 0 25 | for line in open(ranking_list_filename): 26 | lexicon = line.split(',')[0] 27 | key ='_'.join(lexicon.lower().split(' ')) 28 | if support[key] >= sigma: 29 | keep_cnt += 1 30 | out.write(line) 31 | else: 32 | filtered_cnt += 1 33 | #print 'filtered: ', lexicon, support[key] 34 | print 'done. filtered_cnt =', filtered_cnt, 'keep_cnt =', keep_cnt 35 | 36 | if __name__ == '__main__': 37 | main(sys.argv[1:]) 38 | -------------------------------------------------------------------------------- /src/postprocessing/clean_list_with_wordnet.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import wordnet as wn 2 | from nltk.corpus.reader import NOUN 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("-input", help="input path for concepts file") 7 | parser.add_argument("-output", help="output path for noise file") 8 | args = parser.parse_args() 9 | 10 | poor_results = set() 11 | results = list() 12 | with open(args.input, 'r') as input: 13 | for line in input: 14 | results.append(line) 15 | concept = line.split(',')[0] 16 | words = concept.split('_') 17 | word = '' 18 | if len(words) > 1: 19 | synsets = wn.synsets(concept) 20 | if len(synsets) != 0: 21 | noun_synsets = wn.synsets(concept, NOUN) 22 | if len(noun_synsets) == 0: 23 | poor_results.add(concept + ',0.0000000000\n') 24 | continue 25 | else: 26 | continue 27 | word = words[-1] 28 | else: 29 | word = concept 30 | synsets = wn.synsets(word) 31 | if len(synsets) == 0: 32 | pass 33 | else: 34 | noun_synsets = wn.synsets(word, NOUN) 35 | if len(noun_synsets) == 0: 36 | poor_results.add(concept + ',0.0000000000\n') 37 | 38 | with open(args.output, 'w') as output: 39 | for line in results: 40 | if line not in poor_results: 41 | output.write(line) 42 | for line in poor_results: 43 | output.write(line) 44 | -------------------------------------------------------------------------------- /src/frequent_phrase_mining/frequent_pattern_mining.py: -------------------------------------------------------------------------------- 1 | from sets import Set 2 | 3 | def 
frequentPatternMining(tokens, patternOutputFilename, threshold): 4 | dict = {} 5 | 6 | tokensNumber = len(tokens) 7 | for i in xrange(tokensNumber): 8 | token = tokens[i] 9 | if token == '$': 10 | continue 11 | if token in dict: 12 | dict[token].append(i) 13 | else: 14 | dict[token] = [i] 15 | print "# of distinct tokens = ", len(dict) 16 | 17 | patternOutput = open(patternOutputFilename, 'w') 18 | 19 | frequentPatterns = [] 20 | patternLength = 1 21 | while (len(dict) > 0): 22 | if patternLength > 6: 23 | break 24 | #print "working on length = ", patternLength 25 | patternLength += 1 26 | newDict = {} 27 | for pattern, positions in dict.items(): 28 | occurrence = len(positions) 29 | if occurrence >= threshold: 30 | frequentPatterns.append(pattern) 31 | 32 | patternOutput.write(pattern + "," + str(occurrence) + "\n") 33 | for i in positions: 34 | if i + 1 < tokensNumber: 35 | if tokens[i + 1] == '$': 36 | continue 37 | newPattern = pattern + " " + tokens[i + 1] 38 | if newPattern in newDict: 39 | newDict[newPattern].append(i + 1) 40 | else: 41 | newDict[newPattern] = [i + 1] 42 | dict.clear() 43 | dict = newDict 44 | patternOutput.close() 45 | return frequentPatterns 46 | -------------------------------------------------------------------------------- /src/postprocessing/combine_phrases.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | unordered_map phrases; 4 | 5 | void loadPatterns(string folder) 6 | { 7 | const int maxLen = 6; 8 | for (int length = 1; length <= maxLen; ++ length) { 9 | ostringstream filename; 10 | filename << "length" << length << ".csv"; 11 | 12 | FILE* in = tryOpen(folder + "/" + filename.str(), "r"); 13 | if (in == NULL) { 14 | continue; 15 | } 16 | while (getLine(in)) { 17 | vector tokens = splitBy(line, ','); 18 | string phrase = tokens[0]; 19 | double prob; 20 | fromString(tokens[3], prob); 21 | 22 | if (length == 1) { 23 | 24 | } else { 25 | for (size_t i = 0; i < phrase.size(); ++ i) { 26 | if (phrase[i] == ' ') { 27 | phrase[i] = '_'; 28 | } 29 | } 30 | phrases[phrase] = prob; 31 | } 32 | } 33 | fclose(in); 34 | } 35 | } 36 | 37 | int main(int argc, char *argv[]) 38 | { 39 | if (argc != 3) { 40 | printf("[usage] \n"); 41 | return 0; 42 | } 43 | loadPatterns(argv[1]); 44 | 45 | vector> order; 46 | FOR (w, phrases) { 47 | order.push_back(make_pair(w->second, w->first)); 48 | } 49 | sort(order.rbegin(), order.rend()); 50 | 51 | FILE* out = tryOpen(argv[2], "w"); 52 | FOR (word, order) { 53 | fprintf(out, "%s,%.10f\n", word->second.c_str(), word->first); 54 | } 55 | fclose(out); 56 | 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /src/frequent_phrase_mining/main.py: -------------------------------------------------------------------------------- 1 | from frequent_pattern_mining import * 2 | import re 3 | import sys 4 | 5 | def main(argv): 6 | ENDINGS = ".!?,;:\"[]" 7 | 8 | threshold = 1000 9 | rawTextInput = 'rawText.txt' 10 | patternOutputFilename = 'patterns.csv' 11 | argc = len(argv) 12 | for i in xrange(argc): 13 | if argv[i] == "-raw" and i + 1 < argc: 14 | rawTextInput = argv[i + 1] 15 | elif argv[i] == "-thres" and i + 1 < argc: 16 | threshold = int(argv[i + 1]) 17 | elif argv[i] == "-o" and i + 1 < argc: 18 | patternOutputFilename = argv[i + 1] 19 | 20 | raw = open(rawTextInput, 'r'); 21 | tokens = [] 22 | for line in raw: 23 | inside = 0 24 | chars = [] 25 | for ch in line: 26 | if ch == '(': 27 | inside += 1 28 | elif 
ch == ')': 29 | inside -= 1 30 | elif inside == 0: 31 | if ch.isalpha(): 32 | chars.append(ch.lower()) 33 | elif ch == '\'': 34 | chars.append(ch) 35 | else: 36 | if len(chars) > 0: 37 | tokens.append(''.join(chars)) 38 | chars = [] 39 | if ch in ENDINGS: 40 | tokens.append('$') 41 | if len(chars) > 0: 42 | tokens.append(''.join(chars)) 43 | chars = [] 44 | 45 | print "# tokens = ", len(tokens) 46 | 47 | frequentPatterns = frequentPatternMining(tokens, patternOutputFilename, threshold) 48 | 49 | print "# of frequent pattern = ", len(frequentPatterns) 50 | 51 | if __name__ == "__main__": 52 | main(sys.argv[1 : ]) 53 | -------------------------------------------------------------------------------- /src/preprocessing/from_raw_to_binary.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | #include 3 | 4 | const string ENDINGS = ".!?,;:()\"[]"; 5 | 6 | int main(int argc, char* argv[]) 7 | { 8 | if (argc != 3) { 9 | cerr << "[Usage] " << endl; 10 | return -1; 11 | } 12 | 13 | FILE* in = tryOpen(argv[1], "r"); 14 | vector sentences; 15 | for (;getLine(in);) { 16 | string sentence = ""; 17 | for (int i = 0; line[i]; ++ i) { 18 | char ch = tolower(line[i]); 19 | if (ENDINGS.find(ch) != -1) { 20 | if (sentence.size() > 0) { 21 | sentences.push_back(sentence); 22 | } 23 | sentence = ""; 24 | } else { 25 | if (!isalpha(ch)) { 26 | if (ch == '\'') { 27 | sentence += ch; 28 | } else if (sentence.size() > 0 && sentence[sentence.size() - 1] != ' ') { 29 | sentence += ' '; 30 | } 31 | } else { 32 | sentence += ch; 33 | } 34 | } 35 | } 36 | if (sentence.size() > 0) { 37 | sentences.push_back(sentence); 38 | } 39 | } 40 | fclose(in); 41 | 42 | cerr << "# Sentences = " << sentences.size() << endl; 43 | 44 | unordered_set tokens; 45 | FOR (sentence, sentences) { 46 | vector temp = splitBy(*sentence, ' '); 47 | FOR(iter, temp) { 48 | tokens.insert(*iter); 49 | } 50 | assert(tokens.size() != 0); 51 | } 52 | vector unigrams(tokens.begin(), tokens.end()); 53 | 54 | cerr << "# Unigrams = " << unigrams.size() << endl; 55 | 56 | FILE* out = tryOpen(argv[2], "wb"); 57 | 58 | Binary::write(out, sentences.size()); 59 | FOR (sentence, sentences) { 60 | Binary::write(out, *sentence); 61 | } 62 | 63 | Binary::write(out, unigrams.size()); 64 | FOR (unigram, unigrams) { 65 | Binary::write(out, *unigram); 66 | } 67 | fclose(out); 68 | 69 | return 0; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | export PYTHON = pypy 2 | export CC = gcc 3 | export CXX = g++ 4 | export CFLAGS = -std=c++11 -Wall -O3 -msse2 -fopenmp -I.. 
5 | 6 | BIN = ./bin/from_raw_to_binary ./bin/from_raw_to_binary_text ./bin/feature_extraction ./bin/predict_quality ./bin/adjust_probability ./bin/recompute_features ./bin/prune_and_combine ./bin/build_model ./bin/qualify_unigrams ./bin/segphrase_parser ./bin/generateNN ./bin/combine_phrases 7 | .PHONY: clean all 8 | 9 | all: ./bin $(BIN) 10 | 11 | ./bin/from_raw_to_binary: ./src/preprocessing/from_raw_to_binary.cpp ./src/utils/helper.h 12 | ./bin/from_raw_to_binary_text: ./src/preprocessing/from_raw_to_binary_text.cpp ./src/utils/helper.h 13 | ./bin/feature_extraction: ./src/classification/feature_extraction.cpp ./src/utils/helper.h ./src/classification/aho_corasick.h 14 | ./bin/predict_quality: ./src/classification/predict_quality.cpp ./src/utils/helper.h ./src/classification/random_forest.h 15 | ./bin/adjust_probability: ./src/model_training/adjust_probability.cpp ./src/utils/helper.h 16 | ./bin/recompute_features: ./src/model_training/recompute_features.cpp ./src/utils/helper.h 17 | ./bin/prune_and_combine: ./src/postprocessing/prune_and_combine.cpp ./src/utils/helper.h 18 | ./bin/build_model: ./src/postprocessing/build_model.cpp ./src/utils/helper.h 19 | ./bin/qualify_unigrams: ./src/postprocessing/qualify_unigrams.cpp ./src/utils/helper.h 20 | ./bin/segphrase_parser: ./src/online_query/segphrase_parser.cpp ./src/utils/helper.h ./src/online_query/segphrase_parser.h 21 | ./bin/generateNN: ./src/postprocessing/generateNN.cpp ./src/utils/helper.h ./src/postprocessing/kd_tree.h 22 | ./bin/combine_phrases: ./src/postprocessing/combine_phrases.cpp ./src/utils/helper.h 23 | 24 | ./bin: 25 | mkdir bin 26 | 27 | export LDFLAGS= -pthread -lm -Wno-unused-result -Wno-sign-compare -Wno-unused-variable -Wno-parentheses -Wno-format 28 | $(BIN) : 29 | $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) 30 | $(OBJ) : 31 | $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) 32 | 33 | clean : 34 | rm -rf bin 35 | -------------------------------------------------------------------------------- /src/postprocessing/build_model.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | void dump(const unordered_map &prob, FILE* out) 4 | { 5 | vector phrases; 6 | vector probability; 7 | size_t size; 8 | size = prob.size(); 9 | FOR (pairs, prob) { 10 | phrases.push_back(pairs->first); 11 | probability.push_back(pairs->second); 12 | } 13 | 14 | fwrite(&size, sizeof(size), 1, out); 15 | for (size_t i = 0; i < size; ++ i) { 16 | Binary::write(out, phrases[i]); 17 | } 18 | if (size > 0) { 19 | fwrite(&probability[0], sizeof(probability[0]), size, out); 20 | } 21 | } 22 | 23 | int main(int argc, char* argv[]) 24 | { 25 | int maxLen; 26 | if (argc != 5 || sscanf(argv[2], "%d", &maxLen) != 1) { 27 | cerr << "[usage] " << endl; 28 | return -1; 29 | } 30 | string folder = argv[1]; 31 | 32 | FILE* in = tryOpen(argv[3], "r"); 33 | double penalty; 34 | fscanf(in, "%lf", &penalty); 35 | fclose(in); 36 | 37 | string modelFilename = argv[4]; 38 | 39 | FILE* out = tryOpen(modelFilename, "wb"); 40 | fwrite(&penalty, sizeof(penalty), 1, out); 41 | 42 | unordered_map unigrams, phrases; 43 | for (int length = 1; length <= maxLen; ++ length) { 44 | ostringstream filename; 45 | filename << "length" << length << ".csv"; 46 | 47 | FILE* in = tryOpen(folder + "/" + filename.str(), "r"); 48 | if (in == NULL) { 49 | continue; 50 | } 51 | while (getLine(in)) { 52 | vector tokens = splitBy(line, ','); 53 | string phrase = tokens[0]; 54 | double 
prob; 55 | fromString(tokens[2], prob); 56 | 57 | if (length == 1) { 58 | unigrams[phrase] = prob; 59 | } else { 60 | phrases[phrase] = prob; 61 | } 62 | } 63 | fclose(in); 64 | } 65 | cerr << "penalty = " << penalty << endl; 66 | cerr << "# unigrams = " << unigrams.size() << endl; 67 | cerr << "# phrases = " << phrases.size() << endl; 68 | 69 | dump(unigrams, out); 70 | dump(phrases, out); 71 | 72 | cerr << "segmetation model saved." << endl; 73 | 74 | fclose(out); 75 | 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /src/preprocessing/compute_idf.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from math import * 4 | 5 | def main(argv): 6 | rawTextInput = 'rawText.txt' 7 | stopwordsOutput = 'stopwordsFromText.txt' 8 | argc = len(argv) 9 | for i in xrange(argc): 10 | if argv[i] == "-raw" and i + 1 < argc: 11 | rawTextInput = argv[i + 1] 12 | elif argv[i] == "-o" and i + 1 < argc: 13 | stopwordsOutput = argv[i + 1] 14 | 15 | inDocs = {} 16 | occurrence = {} 17 | docsN = 0 18 | tokensN = 0 19 | for line in open(rawTextInput, 'r'): 20 | docsN += 1 21 | 22 | inside = 0 23 | chars = [] 24 | tokens = {} 25 | for ch in line: 26 | if ch == '(': 27 | inside += 1 28 | elif ch == ')': 29 | inside -= 1 30 | elif inside == 0: 31 | if ch.isalpha(): 32 | chars.append(ch.lower()) 33 | elif ch == '\'': 34 | chars.append(ch) 35 | else: 36 | if len(chars) > 0: 37 | token = ''.join(chars) 38 | tokensN += 1 39 | if token in occurrence: 40 | occurrence[token] += 1 41 | else: 42 | occurrence[token] = 1 43 | tokens[token] = True 44 | chars = [] 45 | if len(chars) > 0: 46 | token = ''.join(chars) 47 | tokensN += 1 48 | if token in occurrence: 49 | occurrence[token] += 1 50 | else: 51 | occurrence[token] = 1 52 | tokens[token] = True 53 | chars = [] 54 | for token in tokens: 55 | if token in inDocs: 56 | inDocs[token] += 1 57 | else: 58 | inDocs[token] = 1 59 | 60 | #print 'tokens = ', tokensN 61 | #print 'docs = ', docsN 62 | 63 | rank = [] 64 | for token, occur in occurrence.items(): 65 | tf = occur / float(tokensN) 66 | idf = max(log(docsN / float(inDocs[token])), 1e-10) 67 | rank.append((token, tf * idf)) 68 | sorted_x = sorted(rank, key=lambda x: -x[1]) 69 | 70 | out = open(stopwordsOutput, 'w') 71 | for token, key in sorted_x: 72 | out.write(str(token) + ',' + str(key) + '\n'); 73 | out.close() 74 | 75 | if __name__ == "__main__": 76 | main(sys.argv[1 : ]) 77 | -------------------------------------------------------------------------------- /src/classification/aho_corasick.h: -------------------------------------------------------------------------------- 1 | #ifndef __AHO_CORASICK_H__ 2 | #define __AHO_CORASICK_H__ 3 | 4 | #include "../utils/helper.h" 5 | #include 6 | using namespace std; 7 | 8 | class AhoCorasick 9 | { 10 | vector< unordered_map > next; 11 | vector failed, depth; 12 | vector isEnd; 13 | int nodes; 14 | 15 | int addNewNode() { 16 | next.push_back(unordered_map()); 17 | isEnd.push_back(false); 18 | depth.push_back(0); 19 | return nodes ++; 20 | } 21 | 22 | public: 23 | AhoCorasick() { 24 | nodes = 0; 25 | addNewNode(); 26 | } 27 | 28 | void add(const string &s) { 29 | int u = 0; 30 | for (size_t i = 0; i < s.size(); ++ i) { 31 | char ch = s[i]; 32 | int v; 33 | if (!next[u].count(ch)) { 34 | v = addNewNode(); 35 | depth[v] = depth[u] + 1; 36 | next[u][ch] = v; 37 | } else { 38 | v = next[u][ch]; 39 | } 40 | u = v; 41 | } 42 | isEnd[u] = true; 43 | } 44 | 45 | void 
make() { 46 | queue q; 47 | failed.resize(nodes, -1); 48 | q.push(0); 49 | while (q.size()) { 50 | int u = q.front(); 51 | q.pop(); 52 | FOR (edge, next[u]) { 53 | char ch = edge->first; 54 | int v = edge->second; 55 | if (u == 0) { 56 | failed[v] = 0; 57 | } else { 58 | failed[v] = 0; 59 | for (int p = failed[u];p != -1;p = failed[p]) { 60 | if (next[p].count(ch)) { 61 | failed[v] = next[p][ch]; 62 | break; 63 | } 64 | } 65 | } 66 | q.push(v); 67 | } 68 | } 69 | } 70 | 71 | void search(const string &text, vector< pair > &ret) { 72 | for (int i = 0, p = 0; i < (int)text.size(); ++ i) { 73 | char ch = text[i]; 74 | while (!next[p].count(ch) && p != 0) { 75 | p = failed[p]; 76 | } 77 | if (next[p].count(ch)) { 78 | p = next[p][ch]; 79 | } 80 | int temp = p; 81 | while (temp != 0 && isEnd[temp]) { 82 | ret.push_back(make_pair(i - depth[temp] + 1, i + 1)); 83 | temp = failed[temp]; 84 | } 85 | } 86 | } 87 | }; 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /src/postprocessing/prune_and_combine.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | int load(string filename, int window, double threshold, vector< pair > &ret, bool det, int n) 4 | { 5 | vector< pair > order; 6 | FILE* in = tryOpen(filename.c_str(), "r"); 7 | if (in != NULL) { 8 | for (;getLine(in);) { 9 | vector tokens = splitBy(line, ','); 10 | myAssert(tokens.size() == 4, "wrong number of columns"); 11 | string phrase = tokens[0]; 12 | double quality; 13 | fromString(tokens[3], quality); 14 | order.push_back(make_pair(phrase, quality)); 15 | } 16 | } 17 | vector sum(order.size() + 1, 0); 18 | for (size_t i = 0; i < order.size(); ++ i) { 19 | sum[i + 1] = sum[i] + order[i].second; 20 | } 21 | for (size_t i = 0; i < order.size(); ++ i) { 22 | int l = i - window; 23 | int r = i + window; 24 | if (l < 0) { 25 | l = 0; 26 | } 27 | if (r >= order.size()) { 28 | r = (int)order.size() - 1; 29 | } 30 | if (det && (sum[r + 1] - sum[l]) / (r - l + 1) < threshold || !det && i >= n) { 31 | fprintf(stderr, "%d/%d phrases in %s\n", i, (int)order.size(), filename.c_str()); 32 | return i; 33 | } 34 | ret.push_back(make_pair(order[i].second, order[i].first)); 35 | } 36 | fprintf(stderr, "%d/%d phrases in %s\n", (int)order.size(), (int)order.size(), filename.c_str()); 37 | return (int)order.size(); 38 | } 39 | 40 | int main(int argc, char* argv[]) 41 | { 42 | int window; 43 | double threshold; 44 | if (argc != 7 || sscanf(argv[2], "%d", &window) != 1 || sscanf(argv[3], "%lf", &threshold) != 1) { 45 | fprintf(stderr, "[usage] "); 46 | return -1; 47 | } 48 | 49 | int numbers[10]; 50 | if (strcmp(argv[5], "DET") != 0) { 51 | FILE* in = tryOpen(argv[5], "r"); 52 | for (int l = 2; l <= 6; ++ l) { 53 | fscanf(in, "%d", &numbers[l]); 54 | } 55 | fclose(in); 56 | } 57 | 58 | vector< pair > phrases; 59 | FILE* fn = tryOpen(argv[6], "w"); 60 | for (int length = 2; length <= 6; ++ length) { 61 | char filename[256]; 62 | sprintf(filename, "%s%d.csv", argv[1], length); 63 | int n = load(filename, window, threshold, phrases, strcmp(argv[5], "DET") == 0, numbers[length]); 64 | fprintf(fn, "%d\n", n); 65 | } 66 | fclose(fn); 67 | 68 | sort(phrases.rbegin(), phrases.rend()); 69 | FILE* out = tryOpen(argv[4], "w"); 70 | FOR (phrase, phrases) { 71 | fprintf(out, "%s,%.10f\n", phrase->second.c_str(), phrase->first); 72 | } 73 | fclose(out); 74 | 75 | return 0; 76 | } 77 | 78 | 
-------------------------------------------------------------------------------- /src/utils/helper.h: -------------------------------------------------------------------------------- 1 | #ifndef __MY_HELPER__ 2 | #define __MY_HELPER__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | using namespace std; 18 | 19 | #define FOR(i,a) for (__typeof((a).begin()) i = (a).begin(); i != (a).end(); ++ i) 20 | 21 | const double EPS = 1e-8; 22 | 23 | /*! \brief return a real numer uniform in (0,1) */ 24 | inline double next_double2(){ 25 | return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); 26 | } 27 | 28 | /*! \brief return x~N(0,1) */ 29 | inline double sample_normal(){ 30 | double x,y,s; 31 | do{ 32 | x = 2 * next_double2() - 1.0; 33 | y = 2 * next_double2() - 1.0; 34 | s = x*x + y*y; 35 | }while( s >= 1.0 || s == 0.0 ); 36 | 37 | return x * sqrt( -2.0 * log(s) / s ) ; 38 | } 39 | 40 | bool myAssert(bool flg, string msg) 41 | { 42 | if (!flg) { 43 | cerr << msg << endl; 44 | exit(-1); 45 | } 46 | return flg; 47 | } 48 | 49 | int sign(double x) 50 | { 51 | return x < -EPS ? -1 : x > EPS; 52 | } 53 | 54 | string replaceAll(const string &s, const string &from, const string &to) 55 | { 56 | string ret = ""; 57 | for (size_t i = 0; i < s.size(); ++ i) { 58 | bool found = true; 59 | for (size_t offset = 0; offset < from.size() && found; ++ offset) { 60 | found &= i + offset < s.size() && s[i + offset] == from[offset]; 61 | } 62 | if (found) { 63 | ret += to; 64 | i += from.size() - 1; 65 | } else { 66 | ret += s[i]; 67 | } 68 | } 69 | return ret; 70 | } 71 | 72 | double sqr(double x) 73 | { 74 | return x * x; 75 | } 76 | 77 | template 78 | void fromString(const string &s, T &x) 79 | { 80 | stringstream in(s); 81 | in >> x; 82 | } 83 | 84 | string tolower(const string &a) 85 | { 86 | string ret = a; 87 | for (size_t i = 0; i < ret.size(); ++ i) { 88 | ret[i] = tolower(ret[i]); 89 | } 90 | return ret; 91 | } 92 | 93 | const int MAX_LENGTH = 100000000; 94 | 95 | char line[MAX_LENGTH + 1]; 96 | 97 | bool getLine(FILE* in) 98 | { 99 | bool hasNext = fgets(line, MAX_LENGTH, in); 100 | int length = strlen(line); 101 | while (length > 0 && (line[length - 1] == '\n' || line[length - 1] == '\r')) { 102 | -- length; 103 | } 104 | line[length] = 0; 105 | return hasNext; 106 | } 107 | 108 | FILE* tryOpen(const string &filename, const string ¶m) 109 | { 110 | FILE* ret = fopen(filename.c_str(), param.c_str()); 111 | if (ret == NULL) { 112 | cerr << "[Warning] failed to open " << filename << " under parameters = " << param << endl; 113 | } 114 | return ret; 115 | } 116 | 117 | vector splitBy(const string &line, char sep) 118 | { 119 | vector tokens; 120 | string token = ""; 121 | for (size_t i = 0; i < line.size(); ++ i) { 122 | if (line[i] == sep) { 123 | if (token != "") { 124 | tokens.push_back(token); 125 | } 126 | token = ""; 127 | } else { 128 | token += line[i]; 129 | } 130 | } 131 | if (token != "") { 132 | tokens.push_back(token); 133 | } 134 | return tokens; 135 | } 136 | 137 | namespace Binary 138 | { 139 | void write(FILE* out, const size_t &size) { 140 | fwrite(&size, sizeof(size), 1, out); 141 | } 142 | 143 | void write(FILE* out, const string &s) { 144 | write(out, s.size()); 145 | if (s.size() > 0) { 146 | fwrite(&s[0], sizeof(char), s.size(), out); 147 | } 148 | } 149 | 150 | void read(FILE* in, size_t &size) { 151 | fread(&size, sizeof(size), 1, 
in); 152 | } 153 | 154 | void read(FILE* in, string &s) { 155 | size_t size; 156 | read(in, size); 157 | s.resize(size); 158 | if (size > 0) { 159 | fread(&s[0], sizeof(char), size, in); 160 | } 161 | } 162 | } 163 | 164 | #endif 165 | 166 | -------------------------------------------------------------------------------- /src/classification/predict_quality.cpp: -------------------------------------------------------------------------------- 1 | #include "random_forest.h" 2 | #include "../utils/helper.h" 3 | 4 | using namespace RandomForestRelated; 5 | 6 | vector< vector > train, all; 7 | vector candidates; 8 | vector trainY; 9 | 10 | map labels; 11 | 12 | void loadLabels(string filename) 13 | { 14 | FILE* in = tryOpen(filename.c_str(), "r"); 15 | for (;getLine(in);) { 16 | vector tokens = splitBy(line, '\t'); 17 | if (tokens.size() < 2) { 18 | continue; 19 | } 20 | string phrase = tolower(tokens[0]); 21 | int label; 22 | fromString(tokens[1], label); 23 | labels[phrase] = label; 24 | } 25 | fclose(in); 26 | fprintf(stderr, "%d labels loaded\n", labels.size()); 27 | } 28 | 29 | vector featureNames; 30 | 31 | int loadFeatureMatrix(string filename, string forbid) 32 | { 33 | vector forbidFeat = splitBy(forbid, ','); 34 | unordered_set forbidFeatSet(forbidFeat.begin(), forbidFeat.end()); 35 | 36 | FILE* in = tryOpen(filename.c_str(), "r"); 37 | getLine(in); // header 38 | vector attributes = splitBy(line, ','); 39 | int dimension = 0; 40 | FOR (feat, attributes) { 41 | if (*feat == "pattern") { 42 | continue; 43 | } 44 | dimension += !forbidFeatSet.count(*feat); 45 | if (!forbidFeatSet.count(*feat)) { 46 | featureNames.push_back(*feat); 47 | } 48 | } 49 | fprintf(stderr, "feature dimension = %d\n", dimension); 50 | for (;getLine(in);) { 51 | vector tokens = splitBy(line, ','); 52 | string phrase = tokens[0]; 53 | vector features(dimension, 0); 54 | int ptr = 0; 55 | for (size_t i = 1; i < tokens.size(); ++ i) { 56 | if (forbidFeatSet.count(attributes[i])) { 57 | continue; 58 | } 59 | fromString(tokens[i], features[ptr ++]); 60 | } 61 | myAssert(ptr == dimension, "ptr exceeds the dimension"); 62 | if (labels.count(phrase)) { 63 | train.push_back(features); 64 | trainY.push_back(labels[phrase]); 65 | } 66 | candidates.push_back(phrase); 67 | all.push_back(features); 68 | } 69 | fclose(in); 70 | fprintf(stderr, "%d candidates loaded\n", candidates.size()); 71 | fprintf(stderr, "%d are in labels\n", train.size()); 72 | return dimension; 73 | } 74 | 75 | int main(int argc, char* argv[]) 76 | { 77 | double threshold; 78 | if (argc != 8 || sscanf(argv[5], "%lf", &threshold) != 1 || threshold < 0 || threshold > 1) { 79 | fprintf(stderr, "[usage]