├── parse.sh ├── word2vec_tool ├── demo-word.sh ├── demo-classes.sh ├── demo-word-accuracy.sh ├── demo-analogy.sh ├── Makefile ├── demo-phrases.sh ├── demo-phrase-accuracy.sh ├── README.txt ├── distance.c ├── word-analogy.c ├── demo-train-big-model-v1.sh ├── compute-accuracy.c ├── word2phrase.c └── LICENSE ├── data ├── test.txt ├── DBLP.label └── stopwords.txt ├── src ├── preprocessing │ ├── from_raw_to_binary_text.cpp │ ├── from_raw_to_binary.cpp │ └── compute_idf.py ├── online_query │ ├── compute_offset.py │ ├── test_parser.cpp │ ├── segphrase_parser.h │ └── segphrase_parser.cpp ├── postprocessing │ ├── filter_by_support.py │ ├── clean_list_with_wordnet.py │ ├── combine_phrases.cpp │ ├── build_model.cpp │ ├── prune_and_combine.cpp │ ├── kd_tree.h │ ├── qualify_unigrams.cpp │ └── generateNN.cpp ├── frequent_phrase_mining │ ├── frequent_pattern_mining.py │ └── main.py ├── classification │ ├── aho_corasick.h │ ├── predict_quality.cpp │ ├── auto_label_generation.py │ ├── random_forest.h │ └── feature_extraction.cpp ├── utils │ └── helper.h └── model_training │ ├── recompute_features.cpp │ └── adjust_probability.cpp ├── Makefile ├── README.md ├── train_toy.sh ├── train_dblp.sh └── LICENSE /parse.sh: -------------------------------------------------------------------------------- 1 | ./bin/segphrase_parser results/segmentation.model results/salient.csv 0.6 ./data/test.txt ./results/parsed.txt 0 2 | # An alternative output format in terms of phrase offset 3 | # python ./src/online_query/compute_offset.py ./results/parsed.txt ./results/offset.txt 4 | -------------------------------------------------------------------------------- /word2vec_tool/demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /word2vec_tool/demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /data/test.txt: -------------------------------------------------------------------------------- 1 | I love data mining and database. 2 | A database is an organized collection of data. The data is typically organized to model aspects of reality in a way that supports processes requiring information. For example, modelling the availability of rooms in hotels in a way that supports finding a hotel with vacancies. 3 | This is an sentence used to test the consecutive numbers and ending characters in the end of sentences. The version is 1.10. 4 | -------------------------------------------------------------------------------- /word2vec_tool/demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /word2vec_tool/demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /word2vec_tool/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) word2vec.c -o word2vec $(CFLAGS) 9 | word2phrase : word2phrase.c 10 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 11 | distance : distance.c 12 | $(CC) distance.c -o distance $(CFLAGS) 13 | word-analogy : word-analogy.c 14 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 15 | compute-accuracy : compute-accuracy.c 16 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 17 | chmod +x *.sh 18 | 19 | clean: 20 | rm -rf word2vec word2phrase distance word-analogy compute-accuracy -------------------------------------------------------------------------------- /word2vec_tool/demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /word2vec_tool/demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /src/preprocessing/from_raw_to_binary_text.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | const string ENDINGS = ".!?,;:'[]"; 4 | 5 | int main(int argc, char* argv[]) 6 | { 7 | if (argc != 3) { 8 | cerr << "[Usage] " << endl; 9 | return -1; 10 | } 11 | 12 | FILE* in = tryOpen(argv[1], "r"); 13 | vector sentences; 14 | for (;getLine(in);) { 15 | string sentence = ""; 16 | for (int i = 0; line[i]; ++ i) { 17 | char ch = line[i]; 18 | if (ENDINGS.find(ch) != -1) { 19 | if (sentence.size() > 0) { 20 | sentences.push_back(sentence); 21 | } 22 | sentence = ""; 23 | } else { 24 | sentence += ch; 25 | } 26 | } 27 | if (sentence.size() > 0) { 28 | sentences.push_back(sentence); 29 | } 30 | } 31 | fclose(in); 32 | 33 | cerr << "# Sentences = " << sentences.size() << endl; 34 | 35 | FILE* out = tryOpen(argv[2], "wb"); 36 | 37 | Binary::write(out, sentences.size()); 38 | FOR (sentence, sentences) { 39 | Binary::write(out, *sentence); 40 | } 41 | 42 | return 0; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/online_query/compute_offset.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | def main(argv): 5 | if (len(argv) != 2): 6 | print "[Usage] " 7 | return 8 | input_file = argv[0] 9 | output_file = argv[1] 10 | 11 | with open(output_file, 'w') as 
output: 12 | with open(input_file, 'r') as input: 13 | for line in input: 14 | output.write(re.sub('[\[\]]', '', line)) 15 | output.write('Offsets: ') 16 | offset = 0 17 | left = 0 18 | right = 0 19 | bias = 0 20 | for char in line: 21 | if char == '[': 22 | left = offset + 1 23 | if char == ']': 24 | right = offset 25 | bias += 1 26 | output.write('[' + str(left - bias * 2 + 1) + ', ' + str(right - bias * 2 + 1) + ']') 27 | output.write(' (' + line[left:right] + '); ') 28 | offset += 1 29 | output.write('\n') 30 | 31 | if __name__ == "__main__": 32 | main(sys.argv[1 : ]) 33 | -------------------------------------------------------------------------------- /word2vec_tool/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------- 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /src/online_query/test_parser.cpp: -------------------------------------------------------------------------------- 1 | #include "segphrase_parser.h" 2 | 3 | template <class T> 4 | void printVector(vector<T> a) { 5 | for (size_t i = 0; i < a.size(); ++ i) { 6 | cerr << a[i]; 7 | if (i + 1 == a.size()) { 8 | cerr << endl; 9 | } else { 10 | cerr << ", "; 11 | } 12 | } 13 | } 14 | 15 | int main(int argc, char* argv[]) 16 | { 17 | if (argc != 2) { 18 | cerr << "[usage] <model_path>" << endl; 19 | return -1; 20 | } 21 | 22 | string model_path = (string)argv[1]; 23 | SegPhraseParser* parser = new SegPhraseParser(model_path, 0); 24 | cerr << "parser built."
<< endl; 25 | 26 | vector segments = parser->segment("data mining is an area"); 27 | printVector(segments); 28 | 29 | cerr << "Please type in a sentence in a single line (or exit()):" << endl; 30 | while (getLine(stdin)) { 31 | if (strcmp(line, "exit()") == 0) { 32 | break; 33 | } 34 | segments = parser->segment(line); 35 | cerr << "[Segmentation Result]" << endl; 36 | printVector(segments); 37 | cerr << "\nPlease type in a sentence in a single line (or exit()):" << endl; 38 | } 39 | 40 | cerr << "[done]" << endl; 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /src/postprocessing/filter_by_support.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def main(argv): 4 | if len(argv) != 4: 5 | print '[usage] ' 6 | exit(-1) 7 | ranking_list_filename = argv[0] 8 | segmented_corpus_filename = argv[1] 9 | sigma = int(argv[2]) 10 | filtered_output_filename = argv[3] 11 | 12 | support = {} 13 | for line in open(ranking_list_filename): 14 | lexicon = line.split(',')[0] 15 | key ='_'.join(lexicon.lower().split(' ')) 16 | support[key] = 0 17 | for line in open(segmented_corpus_filename): 18 | tokens = line.split() 19 | for token in tokens: 20 | if token in support: 21 | support[token] += 1 22 | out = open(filtered_output_filename, 'w') 23 | filtered_cnt = 0 24 | keep_cnt = 0 25 | for line in open(ranking_list_filename): 26 | lexicon = line.split(',')[0] 27 | key ='_'.join(lexicon.lower().split(' ')) 28 | if support[key] >= sigma: 29 | keep_cnt += 1 30 | out.write(line) 31 | else: 32 | filtered_cnt += 1 33 | #print 'filtered: ', lexicon, support[key] 34 | print 'done. filtered_cnt =', filtered_cnt, 'keep_cnt =', keep_cnt 35 | 36 | if __name__ == '__main__': 37 | main(sys.argv[1:]) 38 | -------------------------------------------------------------------------------- /src/postprocessing/clean_list_with_wordnet.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import wordnet as wn 2 | from nltk.corpus.reader import NOUN 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("-input", help="input path for concepts file") 7 | parser.add_argument("-output", help="output path for noise file") 8 | args = parser.parse_args() 9 | 10 | poor_results = set() 11 | results = list() 12 | with open(args.input, 'r') as input: 13 | for line in input: 14 | results.append(line) 15 | concept = line.split(',')[0] 16 | words = concept.split('_') 17 | word = '' 18 | if len(words) > 1: 19 | synsets = wn.synsets(concept) 20 | if len(synsets) != 0: 21 | noun_synsets = wn.synsets(concept, NOUN) 22 | if len(noun_synsets) == 0: 23 | poor_results.add(concept + ',0.0000000000\n') 24 | continue 25 | else: 26 | continue 27 | word = words[-1] 28 | else: 29 | word = concept 30 | synsets = wn.synsets(word) 31 | if len(synsets) == 0: 32 | pass 33 | else: 34 | noun_synsets = wn.synsets(word, NOUN) 35 | if len(noun_synsets) == 0: 36 | poor_results.add(concept + ',0.0000000000\n') 37 | 38 | with open(args.output, 'w') as output: 39 | for line in results: 40 | if line not in poor_results: 41 | output.write(line) 42 | for line in poor_results: 43 | output.write(line) 44 | -------------------------------------------------------------------------------- /src/frequent_phrase_mining/frequent_pattern_mining.py: -------------------------------------------------------------------------------- 1 | from sets import Set 2 | 3 | def 
frequentPatternMining(tokens, patternOutputFilename, threshold): 4 | dict = {} 5 | 6 | tokensNumber = len(tokens) 7 | for i in xrange(tokensNumber): 8 | token = tokens[i] 9 | if token == '$': 10 | continue 11 | if token in dict: 12 | dict[token].append(i) 13 | else: 14 | dict[token] = [i] 15 | print "# of distinct tokens = ", len(dict) 16 | 17 | patternOutput = open(patternOutputFilename, 'w') 18 | 19 | frequentPatterns = [] 20 | patternLength = 1 21 | while (len(dict) > 0): 22 | if patternLength > 6: 23 | break 24 | #print "working on length = ", patternLength 25 | patternLength += 1 26 | newDict = {} 27 | for pattern, positions in dict.items(): 28 | occurrence = len(positions) 29 | if occurrence >= threshold: 30 | frequentPatterns.append(pattern) 31 | 32 | patternOutput.write(pattern + "," + str(occurrence) + "\n") 33 | for i in positions: 34 | if i + 1 < tokensNumber: 35 | if tokens[i + 1] == '$': 36 | continue 37 | newPattern = pattern + " " + tokens[i + 1] 38 | if newPattern in newDict: 39 | newDict[newPattern].append(i + 1) 40 | else: 41 | newDict[newPattern] = [i + 1] 42 | dict.clear() 43 | dict = newDict 44 | patternOutput.close() 45 | return frequentPatterns 46 | -------------------------------------------------------------------------------- /src/postprocessing/combine_phrases.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | unordered_map phrases; 4 | 5 | void loadPatterns(string folder) 6 | { 7 | const int maxLen = 6; 8 | for (int length = 1; length <= maxLen; ++ length) { 9 | ostringstream filename; 10 | filename << "length" << length << ".csv"; 11 | 12 | FILE* in = tryOpen(folder + "/" + filename.str(), "r"); 13 | if (in == NULL) { 14 | continue; 15 | } 16 | while (getLine(in)) { 17 | vector tokens = splitBy(line, ','); 18 | string phrase = tokens[0]; 19 | double prob; 20 | fromString(tokens[3], prob); 21 | 22 | if (length == 1) { 23 | 24 | } else { 25 | for (size_t i = 0; i < phrase.size(); ++ i) { 26 | if (phrase[i] == ' ') { 27 | phrase[i] = '_'; 28 | } 29 | } 30 | phrases[phrase] = prob; 31 | } 32 | } 33 | fclose(in); 34 | } 35 | } 36 | 37 | int main(int argc, char *argv[]) 38 | { 39 | if (argc != 3) { 40 | printf("[usage] \n"); 41 | return 0; 42 | } 43 | loadPatterns(argv[1]); 44 | 45 | vector> order; 46 | FOR (w, phrases) { 47 | order.push_back(make_pair(w->second, w->first)); 48 | } 49 | sort(order.rbegin(), order.rend()); 50 | 51 | FILE* out = tryOpen(argv[2], "w"); 52 | FOR (word, order) { 53 | fprintf(out, "%s,%.10f\n", word->second.c_str(), word->first); 54 | } 55 | fclose(out); 56 | 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /src/frequent_phrase_mining/main.py: -------------------------------------------------------------------------------- 1 | from frequent_pattern_mining import * 2 | import re 3 | import sys 4 | 5 | def main(argv): 6 | ENDINGS = ".!?,;:\"[]" 7 | 8 | threshold = 1000 9 | rawTextInput = 'rawText.txt' 10 | patternOutputFilename = 'patterns.csv' 11 | argc = len(argv) 12 | for i in xrange(argc): 13 | if argv[i] == "-raw" and i + 1 < argc: 14 | rawTextInput = argv[i + 1] 15 | elif argv[i] == "-thres" and i + 1 < argc: 16 | threshold = int(argv[i + 1]) 17 | elif argv[i] == "-o" and i + 1 < argc: 18 | patternOutputFilename = argv[i + 1] 19 | 20 | raw = open(rawTextInput, 'r'); 21 | tokens = [] 22 | for line in raw: 23 | inside = 0 24 | chars = [] 25 | for ch in line: 26 | if ch == '(': 27 | inside += 1 28 | elif 
ch == ')': 29 | inside -= 1 30 | elif inside == 0: 31 | if ch.isalpha(): 32 | chars.append(ch.lower()) 33 | elif ch == '\'': 34 | chars.append(ch) 35 | else: 36 | if len(chars) > 0: 37 | tokens.append(''.join(chars)) 38 | chars = [] 39 | if ch in ENDINGS: 40 | tokens.append('$') 41 | if len(chars) > 0: 42 | tokens.append(''.join(chars)) 43 | chars = [] 44 | 45 | print "# tokens = ", len(tokens) 46 | 47 | frequentPatterns = frequentPatternMining(tokens, patternOutputFilename, threshold) 48 | 49 | print "# of frequent pattern = ", len(frequentPatterns) 50 | 51 | if __name__ == "__main__": 52 | main(sys.argv[1 : ]) 53 | -------------------------------------------------------------------------------- /src/preprocessing/from_raw_to_binary.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | #include 3 | 4 | const string ENDINGS = ".!?,;:()\"[]"; 5 | 6 | int main(int argc, char* argv[]) 7 | { 8 | if (argc != 3) { 9 | cerr << "[Usage] " << endl; 10 | return -1; 11 | } 12 | 13 | FILE* in = tryOpen(argv[1], "r"); 14 | vector sentences; 15 | for (;getLine(in);) { 16 | string sentence = ""; 17 | for (int i = 0; line[i]; ++ i) { 18 | char ch = tolower(line[i]); 19 | if (ENDINGS.find(ch) != -1) { 20 | if (sentence.size() > 0) { 21 | sentences.push_back(sentence); 22 | } 23 | sentence = ""; 24 | } else { 25 | if (!isalpha(ch)) { 26 | if (ch == '\'') { 27 | sentence += ch; 28 | } else if (sentence.size() > 0 && sentence[sentence.size() - 1] != ' ') { 29 | sentence += ' '; 30 | } 31 | } else { 32 | sentence += ch; 33 | } 34 | } 35 | } 36 | if (sentence.size() > 0) { 37 | sentences.push_back(sentence); 38 | } 39 | } 40 | fclose(in); 41 | 42 | cerr << "# Sentences = " << sentences.size() << endl; 43 | 44 | unordered_set tokens; 45 | FOR (sentence, sentences) { 46 | vector temp = splitBy(*sentence, ' '); 47 | FOR(iter, temp) { 48 | tokens.insert(*iter); 49 | } 50 | assert(tokens.size() != 0); 51 | } 52 | vector unigrams(tokens.begin(), tokens.end()); 53 | 54 | cerr << "# Unigrams = " << unigrams.size() << endl; 55 | 56 | FILE* out = tryOpen(argv[2], "wb"); 57 | 58 | Binary::write(out, sentences.size()); 59 | FOR (sentence, sentences) { 60 | Binary::write(out, *sentence); 61 | } 62 | 63 | Binary::write(out, unigrams.size()); 64 | FOR (unigram, unigrams) { 65 | Binary::write(out, *unigram); 66 | } 67 | fclose(out); 68 | 69 | return 0; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | export PYTHON = pypy 2 | export CC = gcc 3 | export CXX = g++ 4 | export CFLAGS = -std=c++11 -Wall -O3 -msse2 -fopenmp -I.. 
5 | 6 | BIN = ./bin/from_raw_to_binary ./bin/from_raw_to_binary_text ./bin/feature_extraction ./bin/predict_quality ./bin/adjust_probability ./bin/recompute_features ./bin/prune_and_combine ./bin/build_model ./bin/qualify_unigrams ./bin/segphrase_parser ./bin/generateNN ./bin/combine_phrases 7 | .PHONY: clean all 8 | 9 | all: ./bin $(BIN) 10 | 11 | ./bin/from_raw_to_binary: ./src/preprocessing/from_raw_to_binary.cpp ./src/utils/helper.h 12 | ./bin/from_raw_to_binary_text: ./src/preprocessing/from_raw_to_binary_text.cpp ./src/utils/helper.h 13 | ./bin/feature_extraction: ./src/classification/feature_extraction.cpp ./src/utils/helper.h ./src/classification/aho_corasick.h 14 | ./bin/predict_quality: ./src/classification/predict_quality.cpp ./src/utils/helper.h ./src/classification/random_forest.h 15 | ./bin/adjust_probability: ./src/model_training/adjust_probability.cpp ./src/utils/helper.h 16 | ./bin/recompute_features: ./src/model_training/recompute_features.cpp ./src/utils/helper.h 17 | ./bin/prune_and_combine: ./src/postprocessing/prune_and_combine.cpp ./src/utils/helper.h 18 | ./bin/build_model: ./src/postprocessing/build_model.cpp ./src/utils/helper.h 19 | ./bin/qualify_unigrams: ./src/postprocessing/qualify_unigrams.cpp ./src/utils/helper.h 20 | ./bin/segphrase_parser: ./src/online_query/segphrase_parser.cpp ./src/utils/helper.h ./src/online_query/segphrase_parser.h 21 | ./bin/generateNN: ./src/postprocessing/generateNN.cpp ./src/utils/helper.h ./src/postprocessing/kd_tree.h 22 | ./bin/combine_phrases: ./src/postprocessing/combine_phrases.cpp ./src/utils/helper.h 23 | 24 | ./bin: 25 | mkdir bin 26 | 27 | export LDFLAGS= -pthread -lm -Wno-unused-result -Wno-sign-compare -Wno-unused-variable -Wno-parentheses -Wno-format 28 | $(BIN) : 29 | $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) 30 | $(OBJ) : 31 | $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) 32 | 33 | clean : 34 | rm -rf bin 35 | -------------------------------------------------------------------------------- /src/postprocessing/build_model.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | void dump(const unordered_map &prob, FILE* out) 4 | { 5 | vector phrases; 6 | vector probability; 7 | size_t size; 8 | size = prob.size(); 9 | FOR (pairs, prob) { 10 | phrases.push_back(pairs->first); 11 | probability.push_back(pairs->second); 12 | } 13 | 14 | fwrite(&size, sizeof(size), 1, out); 15 | for (size_t i = 0; i < size; ++ i) { 16 | Binary::write(out, phrases[i]); 17 | } 18 | if (size > 0) { 19 | fwrite(&probability[0], sizeof(probability[0]), size, out); 20 | } 21 | } 22 | 23 | int main(int argc, char* argv[]) 24 | { 25 | int maxLen; 26 | if (argc != 5 || sscanf(argv[2], "%d", &maxLen) != 1) { 27 | cerr << "[usage] " << endl; 28 | return -1; 29 | } 30 | string folder = argv[1]; 31 | 32 | FILE* in = tryOpen(argv[3], "r"); 33 | double penalty; 34 | fscanf(in, "%lf", &penalty); 35 | fclose(in); 36 | 37 | string modelFilename = argv[4]; 38 | 39 | FILE* out = tryOpen(modelFilename, "wb"); 40 | fwrite(&penalty, sizeof(penalty), 1, out); 41 | 42 | unordered_map unigrams, phrases; 43 | for (int length = 1; length <= maxLen; ++ length) { 44 | ostringstream filename; 45 | filename << "length" << length << ".csv"; 46 | 47 | FILE* in = tryOpen(folder + "/" + filename.str(), "r"); 48 | if (in == NULL) { 49 | continue; 50 | } 51 | while (getLine(in)) { 52 | vector tokens = splitBy(line, ','); 53 | string phrase = tokens[0]; 54 | double 
prob; 55 | fromString(tokens[2], prob); 56 | 57 | if (length == 1) { 58 | unigrams[phrase] = prob; 59 | } else { 60 | phrases[phrase] = prob; 61 | } 62 | } 63 | fclose(in); 64 | } 65 | cerr << "penalty = " << penalty << endl; 66 | cerr << "# unigrams = " << unigrams.size() << endl; 67 | cerr << "# phrases = " << phrases.size() << endl; 68 | 69 | dump(unigrams, out); 70 | dump(phrases, out); 71 | 72 | cerr << "segmetation model saved." << endl; 73 | 74 | fclose(out); 75 | 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /src/preprocessing/compute_idf.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from math import * 4 | 5 | def main(argv): 6 | rawTextInput = 'rawText.txt' 7 | stopwordsOutput = 'stopwordsFromText.txt' 8 | argc = len(argv) 9 | for i in xrange(argc): 10 | if argv[i] == "-raw" and i + 1 < argc: 11 | rawTextInput = argv[i + 1] 12 | elif argv[i] == "-o" and i + 1 < argc: 13 | stopwordsOutput = argv[i + 1] 14 | 15 | inDocs = {} 16 | occurrence = {} 17 | docsN = 0 18 | tokensN = 0 19 | for line in open(rawTextInput, 'r'): 20 | docsN += 1 21 | 22 | inside = 0 23 | chars = [] 24 | tokens = {} 25 | for ch in line: 26 | if ch == '(': 27 | inside += 1 28 | elif ch == ')': 29 | inside -= 1 30 | elif inside == 0: 31 | if ch.isalpha(): 32 | chars.append(ch.lower()) 33 | elif ch == '\'': 34 | chars.append(ch) 35 | else: 36 | if len(chars) > 0: 37 | token = ''.join(chars) 38 | tokensN += 1 39 | if token in occurrence: 40 | occurrence[token] += 1 41 | else: 42 | occurrence[token] = 1 43 | tokens[token] = True 44 | chars = [] 45 | if len(chars) > 0: 46 | token = ''.join(chars) 47 | tokensN += 1 48 | if token in occurrence: 49 | occurrence[token] += 1 50 | else: 51 | occurrence[token] = 1 52 | tokens[token] = True 53 | chars = [] 54 | for token in tokens: 55 | if token in inDocs: 56 | inDocs[token] += 1 57 | else: 58 | inDocs[token] = 1 59 | 60 | #print 'tokens = ', tokensN 61 | #print 'docs = ', docsN 62 | 63 | rank = [] 64 | for token, occur in occurrence.items(): 65 | tf = occur / float(tokensN) 66 | idf = max(log(docsN / float(inDocs[token])), 1e-10) 67 | rank.append((token, tf * idf)) 68 | sorted_x = sorted(rank, key=lambda x: -x[1]) 69 | 70 | out = open(stopwordsOutput, 'w') 71 | for token, key in sorted_x: 72 | out.write(str(token) + ',' + str(key) + '\n'); 73 | out.close() 74 | 75 | if __name__ == "__main__": 76 | main(sys.argv[1 : ]) 77 | -------------------------------------------------------------------------------- /src/classification/aho_corasick.h: -------------------------------------------------------------------------------- 1 | #ifndef __AHO_CORASICK_H__ 2 | #define __AHO_CORASICK_H__ 3 | 4 | #include "../utils/helper.h" 5 | #include 6 | using namespace std; 7 | 8 | class AhoCorasick 9 | { 10 | vector< unordered_map > next; 11 | vector failed, depth; 12 | vector isEnd; 13 | int nodes; 14 | 15 | int addNewNode() { 16 | next.push_back(unordered_map()); 17 | isEnd.push_back(false); 18 | depth.push_back(0); 19 | return nodes ++; 20 | } 21 | 22 | public: 23 | AhoCorasick() { 24 | nodes = 0; 25 | addNewNode(); 26 | } 27 | 28 | void add(const string &s) { 29 | int u = 0; 30 | for (size_t i = 0; i < s.size(); ++ i) { 31 | char ch = s[i]; 32 | int v; 33 | if (!next[u].count(ch)) { 34 | v = addNewNode(); 35 | depth[v] = depth[u] + 1; 36 | next[u][ch] = v; 37 | } else { 38 | v = next[u][ch]; 39 | } 40 | u = v; 41 | } 42 | isEnd[u] = true; 43 | } 44 | 45 | void 
make() { 46 | queue q; 47 | failed.resize(nodes, -1); 48 | q.push(0); 49 | while (q.size()) { 50 | int u = q.front(); 51 | q.pop(); 52 | FOR (edge, next[u]) { 53 | char ch = edge->first; 54 | int v = edge->second; 55 | if (u == 0) { 56 | failed[v] = 0; 57 | } else { 58 | failed[v] = 0; 59 | for (int p = failed[u];p != -1;p = failed[p]) { 60 | if (next[p].count(ch)) { 61 | failed[v] = next[p][ch]; 62 | break; 63 | } 64 | } 65 | } 66 | q.push(v); 67 | } 68 | } 69 | } 70 | 71 | void search(const string &text, vector< pair > &ret) { 72 | for (int i = 0, p = 0; i < (int)text.size(); ++ i) { 73 | char ch = text[i]; 74 | while (!next[p].count(ch) && p != 0) { 75 | p = failed[p]; 76 | } 77 | if (next[p].count(ch)) { 78 | p = next[p][ch]; 79 | } 80 | int temp = p; 81 | while (temp != 0 && isEnd[temp]) { 82 | ret.push_back(make_pair(i - depth[temp] + 1, i + 1)); 83 | temp = failed[temp]; 84 | } 85 | } 86 | } 87 | }; 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /src/postprocessing/prune_and_combine.cpp: -------------------------------------------------------------------------------- 1 | #include "../utils/helper.h" 2 | 3 | int load(string filename, int window, double threshold, vector< pair > &ret, bool det, int n) 4 | { 5 | vector< pair > order; 6 | FILE* in = tryOpen(filename.c_str(), "r"); 7 | if (in != NULL) { 8 | for (;getLine(in);) { 9 | vector tokens = splitBy(line, ','); 10 | myAssert(tokens.size() == 4, "wrong number of columns"); 11 | string phrase = tokens[0]; 12 | double quality; 13 | fromString(tokens[3], quality); 14 | order.push_back(make_pair(phrase, quality)); 15 | } 16 | } 17 | vector sum(order.size() + 1, 0); 18 | for (size_t i = 0; i < order.size(); ++ i) { 19 | sum[i + 1] = sum[i] + order[i].second; 20 | } 21 | for (size_t i = 0; i < order.size(); ++ i) { 22 | int l = i - window; 23 | int r = i + window; 24 | if (l < 0) { 25 | l = 0; 26 | } 27 | if (r >= order.size()) { 28 | r = (int)order.size() - 1; 29 | } 30 | if (det && (sum[r + 1] - sum[l]) / (r - l + 1) < threshold || !det && i >= n) { 31 | fprintf(stderr, "%d/%d phrases in %s\n", i, (int)order.size(), filename.c_str()); 32 | return i; 33 | } 34 | ret.push_back(make_pair(order[i].second, order[i].first)); 35 | } 36 | fprintf(stderr, "%d/%d phrases in %s\n", (int)order.size(), (int)order.size(), filename.c_str()); 37 | return (int)order.size(); 38 | } 39 | 40 | int main(int argc, char* argv[]) 41 | { 42 | int window; 43 | double threshold; 44 | if (argc != 7 || sscanf(argv[2], "%d", &window) != 1 || sscanf(argv[3], "%lf", &threshold) != 1) { 45 | fprintf(stderr, "[usage] "); 46 | return -1; 47 | } 48 | 49 | int numbers[10]; 50 | if (strcmp(argv[5], "DET") != 0) { 51 | FILE* in = tryOpen(argv[5], "r"); 52 | for (int l = 2; l <= 6; ++ l) { 53 | fscanf(in, "%d", &numbers[l]); 54 | } 55 | fclose(in); 56 | } 57 | 58 | vector< pair > phrases; 59 | FILE* fn = tryOpen(argv[6], "w"); 60 | for (int length = 2; length <= 6; ++ length) { 61 | char filename[256]; 62 | sprintf(filename, "%s%d.csv", argv[1], length); 63 | int n = load(filename, window, threshold, phrases, strcmp(argv[5], "DET") == 0, numbers[length]); 64 | fprintf(fn, "%d\n", n); 65 | } 66 | fclose(fn); 67 | 68 | sort(phrases.rbegin(), phrases.rend()); 69 | FILE* out = tryOpen(argv[4], "w"); 70 | FOR (phrase, phrases) { 71 | fprintf(out, "%s,%.10f\n", phrase->second.c_str(), phrase->first); 72 | } 73 | fclose(out); 74 | 75 | return 0; 76 | } 77 | 78 | 
-------------------------------------------------------------------------------- /src/utils/helper.h: -------------------------------------------------------------------------------- 1 | #ifndef __MY_HELPER__ 2 | #define __MY_HELPER__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | using namespace std; 18 | 19 | #define FOR(i,a) for (__typeof((a).begin()) i = (a).begin(); i != (a).end(); ++ i) 20 | 21 | const double EPS = 1e-8; 22 | 23 | /*! \brief return a real numer uniform in (0,1) */ 24 | inline double next_double2(){ 25 | return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); 26 | } 27 | 28 | /*! \brief return x~N(0,1) */ 29 | inline double sample_normal(){ 30 | double x,y,s; 31 | do{ 32 | x = 2 * next_double2() - 1.0; 33 | y = 2 * next_double2() - 1.0; 34 | s = x*x + y*y; 35 | }while( s >= 1.0 || s == 0.0 ); 36 | 37 | return x * sqrt( -2.0 * log(s) / s ) ; 38 | } 39 | 40 | bool myAssert(bool flg, string msg) 41 | { 42 | if (!flg) { 43 | cerr << msg << endl; 44 | exit(-1); 45 | } 46 | return flg; 47 | } 48 | 49 | int sign(double x) 50 | { 51 | return x < -EPS ? -1 : x > EPS; 52 | } 53 | 54 | string replaceAll(const string &s, const string &from, const string &to) 55 | { 56 | string ret = ""; 57 | for (size_t i = 0; i < s.size(); ++ i) { 58 | bool found = true; 59 | for (size_t offset = 0; offset < from.size() && found; ++ offset) { 60 | found &= i + offset < s.size() && s[i + offset] == from[offset]; 61 | } 62 | if (found) { 63 | ret += to; 64 | i += from.size() - 1; 65 | } else { 66 | ret += s[i]; 67 | } 68 | } 69 | return ret; 70 | } 71 | 72 | double sqr(double x) 73 | { 74 | return x * x; 75 | } 76 | 77 | template 78 | void fromString(const string &s, T &x) 79 | { 80 | stringstream in(s); 81 | in >> x; 82 | } 83 | 84 | string tolower(const string &a) 85 | { 86 | string ret = a; 87 | for (size_t i = 0; i < ret.size(); ++ i) { 88 | ret[i] = tolower(ret[i]); 89 | } 90 | return ret; 91 | } 92 | 93 | const int MAX_LENGTH = 100000000; 94 | 95 | char line[MAX_LENGTH + 1]; 96 | 97 | bool getLine(FILE* in) 98 | { 99 | bool hasNext = fgets(line, MAX_LENGTH, in); 100 | int length = strlen(line); 101 | while (length > 0 && (line[length - 1] == '\n' || line[length - 1] == '\r')) { 102 | -- length; 103 | } 104 | line[length] = 0; 105 | return hasNext; 106 | } 107 | 108 | FILE* tryOpen(const string &filename, const string ¶m) 109 | { 110 | FILE* ret = fopen(filename.c_str(), param.c_str()); 111 | if (ret == NULL) { 112 | cerr << "[Warning] failed to open " << filename << " under parameters = " << param << endl; 113 | } 114 | return ret; 115 | } 116 | 117 | vector splitBy(const string &line, char sep) 118 | { 119 | vector tokens; 120 | string token = ""; 121 | for (size_t i = 0; i < line.size(); ++ i) { 122 | if (line[i] == sep) { 123 | if (token != "") { 124 | tokens.push_back(token); 125 | } 126 | token = ""; 127 | } else { 128 | token += line[i]; 129 | } 130 | } 131 | if (token != "") { 132 | tokens.push_back(token); 133 | } 134 | return tokens; 135 | } 136 | 137 | namespace Binary 138 | { 139 | void write(FILE* out, const size_t &size) { 140 | fwrite(&size, sizeof(size), 1, out); 141 | } 142 | 143 | void write(FILE* out, const string &s) { 144 | write(out, s.size()); 145 | if (s.size() > 0) { 146 | fwrite(&s[0], sizeof(char), s.size(), out); 147 | } 148 | } 149 | 150 | void read(FILE* in, size_t &size) { 151 | fread(&size, sizeof(size), 1, 
in); 152 | } 153 | 154 | void read(FILE* in, string &s) { 155 | size_t size; 156 | read(in, size); 157 | s.resize(size); 158 | if (size > 0) { 159 | fread(&s[0], sizeof(char), size, in); 160 | } 161 | } 162 | } 163 | 164 | #endif 165 | 166 | -------------------------------------------------------------------------------- /src/classification/predict_quality.cpp: -------------------------------------------------------------------------------- 1 | #include "random_forest.h" 2 | #include "../utils/helper.h" 3 | 4 | using namespace RandomForestRelated; 5 | 6 | vector< vector > train, all; 7 | vector candidates; 8 | vector trainY; 9 | 10 | map labels; 11 | 12 | void loadLabels(string filename) 13 | { 14 | FILE* in = tryOpen(filename.c_str(), "r"); 15 | for (;getLine(in);) { 16 | vector tokens = splitBy(line, '\t'); 17 | if (tokens.size() < 2) { 18 | continue; 19 | } 20 | string phrase = tolower(tokens[0]); 21 | int label; 22 | fromString(tokens[1], label); 23 | labels[phrase] = label; 24 | } 25 | fclose(in); 26 | fprintf(stderr, "%d labels loaded\n", labels.size()); 27 | } 28 | 29 | vector featureNames; 30 | 31 | int loadFeatureMatrix(string filename, string forbid) 32 | { 33 | vector forbidFeat = splitBy(forbid, ','); 34 | unordered_set forbidFeatSet(forbidFeat.begin(), forbidFeat.end()); 35 | 36 | FILE* in = tryOpen(filename.c_str(), "r"); 37 | getLine(in); // header 38 | vector attributes = splitBy(line, ','); 39 | int dimension = 0; 40 | FOR (feat, attributes) { 41 | if (*feat == "pattern") { 42 | continue; 43 | } 44 | dimension += !forbidFeatSet.count(*feat); 45 | if (!forbidFeatSet.count(*feat)) { 46 | featureNames.push_back(*feat); 47 | } 48 | } 49 | fprintf(stderr, "feature dimension = %d\n", dimension); 50 | for (;getLine(in);) { 51 | vector tokens = splitBy(line, ','); 52 | string phrase = tokens[0]; 53 | vector features(dimension, 0); 54 | int ptr = 0; 55 | for (size_t i = 1; i < tokens.size(); ++ i) { 56 | if (forbidFeatSet.count(attributes[i])) { 57 | continue; 58 | } 59 | fromString(tokens[i], features[ptr ++]); 60 | } 61 | myAssert(ptr == dimension, "ptr exceeds the dimension"); 62 | if (labels.count(phrase)) { 63 | train.push_back(features); 64 | trainY.push_back(labels[phrase]); 65 | } 66 | candidates.push_back(phrase); 67 | all.push_back(features); 68 | } 69 | fclose(in); 70 | fprintf(stderr, "%d candidates loaded\n", candidates.size()); 71 | fprintf(stderr, "%d are in labels\n", train.size()); 72 | return dimension; 73 | } 74 | 75 | int main(int argc, char* argv[]) 76 | { 77 | double threshold; 78 | if (argc != 8 || sscanf(argv[5], "%lf", &threshold) != 1 || threshold < 0 || threshold > 1) { 79 | fprintf(stderr, "[usage]