├── Makefile ├── README.md ├── data ├── stopwords.txt ├── test.txt ├── wiki.cleaned.txt ├── wiki_labels_all.txt └── wiki_labels_quality.txt ├── parse.sh ├── src ├── classification │ ├── aho_corasick.h │ ├── auto_label_generation.py │ ├── feature_extraction.cpp │ ├── predict_quality.cpp │ └── random_forest.h ├── frequent_phrase_mining │ ├── frequent_pattern_mining.py │ └── main.py ├── model_training │ ├── adjust_probability.cpp │ └── recompute_features.cpp ├── online_query │ ├── compute_offset.py │ ├── decoding.py │ ├── encoding.py │ ├── segphrase_parser.cpp │ ├── segphrase_parser.h │ ├── test_parser.cpp │ └── tokenization.py ├── postprocessing │ ├── build_model.cpp │ ├── filter_by_support.py │ ├── generateNN.cpp │ ├── kd_tree.h │ ├── prune_and_combine.cpp │ └── qualify_unigrams.cpp ├── preprocessing │ ├── compute_idf.py │ ├── from_raw_to_binary.cpp │ ├── from_raw_to_binary_text.cpp │ ├── punctuation.py │ └── tokenization.py └── utils │ ├── decoding.py │ ├── encoding.py │ └── helper.h ├── train.sh └── word2vec_tool ├── LICENSE ├── Makefile ├── README.txt ├── compute-accuracy.c ├── demo-analogy.sh ├── demo-classes.sh ├── demo-phrase-accuracy.sh ├── demo-phrases.sh ├── demo-train-big-model-v1.sh ├── demo-word-accuracy.sh ├── demo-word.sh ├── distance.c ├── questions-phrases.txt ├── questions-words.txt ├── word-analogy.c ├── word2phrase.c └── word2vec.c /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/README.md -------------------------------------------------------------------------------- /data/stopwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/data/stopwords.txt -------------------------------------------------------------------------------- /data/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/data/test.txt -------------------------------------------------------------------------------- /data/wiki.cleaned.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/data/wiki.cleaned.txt -------------------------------------------------------------------------------- /data/wiki_labels_all.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/data/wiki_labels_all.txt -------------------------------------------------------------------------------- /data/wiki_labels_quality.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/data/wiki_labels_quality.txt -------------------------------------------------------------------------------- /parse.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/parse.sh -------------------------------------------------------------------------------- /src/classification/aho_corasick.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/classification/aho_corasick.h -------------------------------------------------------------------------------- /src/classification/auto_label_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/classification/auto_label_generation.py -------------------------------------------------------------------------------- /src/classification/feature_extraction.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/classification/feature_extraction.cpp -------------------------------------------------------------------------------- /src/classification/predict_quality.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/classification/predict_quality.cpp -------------------------------------------------------------------------------- /src/classification/random_forest.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/classification/random_forest.h -------------------------------------------------------------------------------- /src/frequent_phrase_mining/frequent_pattern_mining.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/frequent_phrase_mining/frequent_pattern_mining.py -------------------------------------------------------------------------------- /src/frequent_phrase_mining/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/frequent_phrase_mining/main.py -------------------------------------------------------------------------------- /src/model_training/adjust_probability.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/model_training/adjust_probability.cpp -------------------------------------------------------------------------------- /src/model_training/recompute_features.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/model_training/recompute_features.cpp -------------------------------------------------------------------------------- /src/online_query/compute_offset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/online_query/compute_offset.py -------------------------------------------------------------------------------- /src/online_query/decoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/online_query/decoding.py -------------------------------------------------------------------------------- /src/online_query/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/online_query/encoding.py -------------------------------------------------------------------------------- /src/online_query/segphrase_parser.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/online_query/segphrase_parser.cpp -------------------------------------------------------------------------------- /src/online_query/segphrase_parser.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/online_query/segphrase_parser.h -------------------------------------------------------------------------------- /src/online_query/test_parser.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/online_query/test_parser.cpp -------------------------------------------------------------------------------- /src/online_query/tokenization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/online_query/tokenization.py -------------------------------------------------------------------------------- /src/postprocessing/build_model.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/postprocessing/build_model.cpp -------------------------------------------------------------------------------- /src/postprocessing/filter_by_support.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/postprocessing/filter_by_support.py -------------------------------------------------------------------------------- /src/postprocessing/generateNN.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/postprocessing/generateNN.cpp -------------------------------------------------------------------------------- /src/postprocessing/kd_tree.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/postprocessing/kd_tree.h -------------------------------------------------------------------------------- /src/postprocessing/prune_and_combine.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/postprocessing/prune_and_combine.cpp -------------------------------------------------------------------------------- /src/postprocessing/qualify_unigrams.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/postprocessing/qualify_unigrams.cpp -------------------------------------------------------------------------------- /src/preprocessing/compute_idf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/preprocessing/compute_idf.py -------------------------------------------------------------------------------- /src/preprocessing/from_raw_to_binary.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/preprocessing/from_raw_to_binary.cpp -------------------------------------------------------------------------------- /src/preprocessing/from_raw_to_binary_text.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/preprocessing/from_raw_to_binary_text.cpp -------------------------------------------------------------------------------- /src/preprocessing/punctuation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/preprocessing/punctuation.py -------------------------------------------------------------------------------- /src/preprocessing/tokenization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/preprocessing/tokenization.py -------------------------------------------------------------------------------- /src/utils/decoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/utils/decoding.py -------------------------------------------------------------------------------- /src/utils/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/utils/encoding.py -------------------------------------------------------------------------------- /src/utils/helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/src/utils/helper.h -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/train.sh -------------------------------------------------------------------------------- /word2vec_tool/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/LICENSE -------------------------------------------------------------------------------- /word2vec_tool/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/Makefile -------------------------------------------------------------------------------- /word2vec_tool/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/README.txt -------------------------------------------------------------------------------- /word2vec_tool/compute-accuracy.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/compute-accuracy.c -------------------------------------------------------------------------------- /word2vec_tool/demo-analogy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/demo-analogy.sh -------------------------------------------------------------------------------- /word2vec_tool/demo-classes.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/demo-classes.sh -------------------------------------------------------------------------------- /word2vec_tool/demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/demo-phrase-accuracy.sh -------------------------------------------------------------------------------- /word2vec_tool/demo-phrases.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/demo-phrases.sh -------------------------------------------------------------------------------- /word2vec_tool/demo-train-big-model-v1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/demo-train-big-model-v1.sh -------------------------------------------------------------------------------- /word2vec_tool/demo-word-accuracy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/demo-word-accuracy.sh -------------------------------------------------------------------------------- /word2vec_tool/demo-word.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/demo-word.sh -------------------------------------------------------------------------------- /word2vec_tool/distance.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/distance.c -------------------------------------------------------------------------------- /word2vec_tool/questions-phrases.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/questions-phrases.txt -------------------------------------------------------------------------------- /word2vec_tool/questions-words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/questions-words.txt -------------------------------------------------------------------------------- /word2vec_tool/word-analogy.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/word-analogy.c -------------------------------------------------------------------------------- /word2vec_tool/word2phrase.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/word2phrase.c -------------------------------------------------------------------------------- /word2vec_tool/word2vec.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remenberl/SegPhrase-MultiLingual/HEAD/word2vec_tool/word2vec.c --------------------------------------------------------------------------------