├── LICENSE
├── src
│   ├── init_model.cpp
│   ├── main.cpp
│   └── utils.hpp
└── README.md

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Leafee98

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/src/init_model.cpp:
--------------------------------------------------------------------------------
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <string>
#include <vector>

#include "utils.hpp"


std::vector<std::string> split_words(std::string line) {
    static std::regex journal_number(R"(^\d{8}-\d{2}-\d{3}-\d{3}/m\s+)");
    static std::regex words(R"(([^\s]+)/[a-zA-Z]+?)");

    // remove the journal_number in front of each line
    line = std::regex_replace(line, journal_number, "");

    std::vector<std::string> word_list;
    std::smatch word_match;
    while (std::regex_search(line, word_match, words)) {
        // std::cout << "found word: " << word_match[1] << std::endl;

        word_list.emplace_back(word_match[1]);
        line = word_match.suffix();
    }

    return word_list;
}


int main(int argc, char ** argv) {
    if (argc != 2) {
        std::cout << "usage: " << argv[0] << " <datafile>" << std::endl;
        return 1;
    }

    std::map<std::string, unsigned> next_count; // unused; counting is done inside bigram_tool

    std::ifstream datafile(argv[1], std::ios::in);

    std::vector<std::vector<std::string>> full_data;
    for (std::string line; std::getline(datafile, line); ) {
        std::vector<std::string> word_list = split_words(line);
        full_data.emplace_back(word_list);
    }

    bigram_tool b_tool(full_data);

    std::cout << b_tool.export_str() << std::endl;

    return 0;
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BiGram-EXP

An experiment for a Natural Language Processing course.

It uses a bigram model to find the most likely word segmentation of a sentence.

The code should build with C++11 or any later standard (e.g. C++17).

## Usage

### Compile

```
mkdir out
g++ src/init_model.cpp -o out/init
g++ src/main.cpp -o out/main
```

### Build The Model

The *model* is just a text file storing an array (the word list) and a matrix (the bigram counts). For lack of a better name, it is simply called the *model* here.
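
For reference, the file is plain text in the format produced by `export_str()` in `src/utils.hpp`: the first line is the vocabulary size (including the three placeholders `#`, `#BEGIN`, `#END`), followed by one word per line, followed by one `row col count` triple per stored matrix cell (column `0` of a row holds the total count of that row's word; indices 1 and 2 are the `#BEGIN` and `#END` markers). A schematic, not real output:

```
<vocabulary size>
#
#BEGIN
#END
<word 3>
<word 4>
...
<row> <col> <count>
<row> <col> <count>
...
```

Build it from the corpus with: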

```
./out/init data/199801-UTF8.txt > data/model
```

### Start Using

```
./out/main data/model
```

### Debug

```
export BIGRAM_DEBUG=1
./out/main data/model
```

## Sample

```
$ ./out/main data/model
loading model...
succeed!
> 他是研究生物化学的
1.0667e-42 他 是 研 究 生 物 化 学 的
4.31417e-39 他 是 研 究 生 物 化学 的
6.02929e-38 他 是 研 究 生 物化 学 的
6.03014e-38 他 是 研 究 生物 化 学 的
2.97204e-37 他 是 研究 生 物 化 学 的
4.87769e-34 他 是 研 究 生物 化学 的
1.20202e-33 他 是 研究 生 物 化学 的
3.40606e-33 他 是 研究生 物 化 学 的
1.67989e-32 他 是 研究 生 物化 学 的
1.68012e-32 他 是 研究 生物 化 学 的
1.37756e-29 他 是 研究生 物 化学 的
1.35903e-28 他 是 研究 生物 化学 的
1.92521e-28 他 是 研究生 物化 学 的
>
```

## Principle

An n-gram model assumes that the probability of a word appearing in a sentence depends only on the n-1 words before it. A bigram model is simply the 2-gram case, with n replaced by 2.

Imagine a sentence ABC. In a 2-gram model the probability of the word C appearing is $P(C|B)$, and in a 3-gram model it is $P(C|AB)$.
These prior probabilities are easy to obtain from statistics, treating relative frequencies as probabilities.

$$
P(C|B) = \frac{P(BC)}{P(B)}
$$

$$
P(C|AB) = \frac{P(ABC)}{P(AB)}
$$

For example, in the five characters of "ABABC" (B appears twice, BC once, AB twice and ABC once):

$$
P(C|B) = \frac{\frac{1}{5}}{\frac{2}{5}} = \frac{1}{2}
$$

$$
P(C|AB) = \frac{\frac{1}{5}}{\frac{2}{5}} = \frac{1}{2}
$$
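
To make the arithmetic concrete, here is a small standalone C++ sketch (not part of this project's sources) that recovers $P(C|B)$ from the raw counts in "ABABC":

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
    std::string s = "ABABC";

    std::map<char, unsigned> unigram;        // how often each character appears
    std::map<std::string, unsigned> bigram;  // how often each adjacent pair appears

    for (size_t i = 0; i < s.size(); i++) {
        unigram[s[i]] += 1;
        if (i + 1 < s.size())
            bigram[s.substr(i, 2)] += 1;
    }

    // P(C|B) = P(BC) / P(B) = (1/5) / (2/5) = count(BC) / count(B)
    double p = double(bigram["BC"]) / unigram['B'];
    std::cout << "P(C|B) = " << p << std::endl;  // prints 0.5
    return 0;
}
```

`calc_possibility` in `src/utils.hpp` does the same kind of count division for every adjacent pair of words in a candidate segmentation (plus the #BEGIN and #END markers), with add-one smoothing on top.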

For more information, visit [n-gram](https://en.wikipedia.org/wiki/N-gram).

--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "utils.hpp"

bool DEBUG = false;

struct participle_item {
    std::string str;
    double possibility;

    bool operator<(const participle_item & p) const {
        return this->possibility < p.possibility;
    }
};

int main(int argc, char ** argv) {
    if (argc != 2) {
        std::cout << "usage: " << argv[0] << " <model>" << std::endl;
        return 1;
    }

    DEBUG = std::getenv("BIGRAM_DEBUG") != NULL;

    std::cout << "loading model..." << std::endl;

    std::ifstream infile(argv[1], std::ios::in);
    std::stringstream ss;
    ss << infile.rdbuf();
    bigram_tool bt = bigram_tool::import_str(ss.str());

    std::cout << "succeed!" << std::endl;

    std::string s;
    std::cout << "> ";
    std::cout.flush();
    while (std::getline(std::cin, s)) {
        std::vector<std::vector<unsigned>> sequences = bt.get_possible_sequences(s);
        std::vector<participle_item> participle_result;
        participle_result.reserve(sequences.size());

        for (const std::vector<unsigned> & seq : sequences) {
            std::string formated_str = bt.sequence_to_string(seq);

            if (DEBUG) {
                std::cout << "================================" << std::endl;
                std::cout << "DEBUG: participle is " << formated_str << std::endl;
                std::cout << "DEBUG: sequence is ";
                for (size_t i = 0; i < seq.size(); i++) {
                    if (i != 0)
                        std::cout << ' ';
                    std::cout << seq[i];
                }
                std::cout << std::endl;
            }

            double possibility = bt.calc_possibility(seq, true, DEBUG);

            participle_result.emplace_back(participle_item{formated_str, possibility});
        }

        std::sort(participle_result.begin(), participle_result.end());

        for (size_t i = 0; i < participle_result.size(); i++) {
            std::cout << participle_result[i].possibility << '\t' << participle_result[i].str << std::endl;
        }

        std::cout << "> ";
        std::cout.flush();
    }
    return 0;
}
--------------------------------------------------------------------------------
/src/utils.hpp:
--------------------------------------------------------------------------------
#ifndef UTILS_HPP
#define UTILS_HPP

#include <cassert>
#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

class bigram_tool {
private:
    // 0 is placeholder, 1 is BEGIN, 2 is END
    std::map<std::string, unsigned> word_to_num;
    std::vector<std::string> num_to_word;

    // bi_matrix[A][B] is the number of times B occurs after A.
    // bi_matrix[A][0] is the number of times A occurs.
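    //
    // For example, if word_data contains the single line {"A", "B", "B"} (so the
    // vocabulary is only "A" = 3 and "B" = 4), the stored counts are:
    //   bi_matrix[1][3] = 1   (#BEGIN followed by "A")
    //   bi_matrix[3][4] = 1   ("A" followed by "B")
    //   bi_matrix[4][4] = 1   ("B" followed by "B")
    //   bi_matrix[4][2] = 1   ("B" followed by #END)
    //   bi_matrix[1][0] = 1, bi_matrix[3][0] = 1, bi_matrix[4][0] = 2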
    // std::vector<std::vector<unsigned>> bi_matrix;
    std::map<unsigned, std::map<unsigned, unsigned>> bi_matrix;

    static std::map<std::string, unsigned> init_word_to_num(const std::vector<std::string> & ntw) {
        std::map<std::string, unsigned> wtn;
        for (size_t i = 3; i < ntw.size(); i++)
            wtn[ntw[i]] = i;
        return wtn;
    }

    static std::map<unsigned, std::map<unsigned, unsigned>> init_bi_matrix(
            const std::vector<std::vector<std::string>> & word_data,
            std::map<std::string, unsigned> wtn)
    {
        std::map<unsigned, std::map<unsigned, unsigned>> bi_matrix;
        std::function<void(unsigned, unsigned)> bi_matrix_counter = [&bi_matrix](unsigned a, unsigned b) {
            if (bi_matrix.count(a)) {
                if (bi_matrix[a].count(b)) {
                    bi_matrix[a][b] += 1;
                } else {
                    bi_matrix[a][b] = 1;
                }
            } else {
                std::map<unsigned, unsigned> tmp;
                tmp[b] = 1;
                bi_matrix[a] = tmp;
            }
        };

        for (const std::vector<std::string> & line_words : word_data) {
            if (line_words.size() == 0)
                continue;

            unsigned pre = wtn[line_words[0]];
            bi_matrix_counter(1, pre);

            for (unsigned i = 1; i < line_words.size(); i++) {
                unsigned num = wtn[line_words[i]];
                bi_matrix_counter(pre, num);

                pre = num;
            }

            bi_matrix_counter(pre, 2);
        }

        // init bi_matrix[A][0], which holds the number of times A appears
        for (std::map<unsigned, std::map<unsigned, unsigned>>::iterator i = bi_matrix.begin();
                i != bi_matrix.end();
                i++)
        {
            unsigned count_sum = 0;
            for (std::map<unsigned, unsigned>::iterator j = i->second.begin();
                    j != i->second.end();
                    j++)
            {
                count_sum += j->second;
            }

            // assert no element bi_matrix[A][0] exists yet, the place to store the number of times A appears
            assert(i->second.count(0) == 0);
            i->second[0] = count_sum;
        }
        return bi_matrix;
    }

    void get_possible_sequences(
            std::vector<std::vector<unsigned>> & result,
            std::vector<unsigned> tmp,
            const std::string & s)
    {
        for (size_t i = s.size(); i > 0; i--) {
            const std::string & sub = s.substr(0, i);
            if (this->word_to_num.count(sub)) {
                tmp.emplace_back(this->word_to_num[sub]);

                if (i < s.size()) {
                    get_possible_sequences(result, tmp, s.substr(i));
                } else {
                    result.emplace_back(tmp);
                }

                tmp.erase(tmp.end() - 1);
            }
        }
    }

    bigram_tool(std::vector<std::string> num_to_word, std::map<unsigned, std::map<unsigned, unsigned>> matrix) {
        this->num_to_word = num_to_word;
        this->word_to_num = bigram_tool::init_word_to_num(this->num_to_word);

        this->bi_matrix = matrix;
    }


public:
    /*********
     * the word_data is many lines of words
     */
    bigram_tool(const std::vector<std::vector<std::string>> & word_data) {
        // build the map between word and num
        this->num_to_word = std::vector<std::string>{"#", "#BEGIN", "#END"};

        std::set<std::string> s;
        for (const std::vector<std::string> & line_words : word_data) {
            for (const std::string & wd : line_words) {
                s.emplace(wd);
            }
        }

        this->num_to_word.insert(this->num_to_word.end(), s.begin(), s.end());
        this->word_to_num = bigram_tool::init_word_to_num(this->num_to_word);
        this->bi_matrix = bigram_tool::init_bi_matrix(word_data, this->word_to_num);
    }

    std::vector<std::vector<unsigned>> get_possible_sequences(const std::string & s) {
        std::vector<std::vector<unsigned>> result;
        std::vector<unsigned> tmp;
        this->get_possible_sequences(result, tmp, s);
        return result;
    }
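
    /*********
     * calc_possibility multiplies the conditional probabilities of a whole
     * sequence, from #BEGIN through every word to #END.  With smooth = true it
     * applies add-one (Laplace) smoothing, so each factor becomes
     *   P(x | pre) = (bi_matrix[pre][x] + 1) / (bi_matrix[pre][0] + V)
     * where V is the vocabulary size excluding the #, #BEGIN and #END entries.
     */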
    double calc_possibility(const std::vector<unsigned> & v, bool smooth = false, bool debug = false) {
        double result = 1.0;
        unsigned pre_x, pre_sum;

        std::function<void(unsigned, unsigned)> load_value =
            [&pre_x, &pre_sum, this, smooth](unsigned pre, unsigned x)
        {
            pre_x = this->bi_matrix[pre][x];
            pre_sum = this->bi_matrix[pre][0];
            if (smooth) {
                pre_x += 1;
                pre_sum += this->num_to_word.size() - 3; // except the #, #BEGIN and #END
            }
        };

        unsigned pre = 1; // the #BEGIN
        for (unsigned x : v) {
            load_value(pre, x);
            result *= double(pre_x) / pre_sum;

            if (debug) {
                std::cout << "DEBUG: " << "pre-x: " << pre << "-" << x
                        << "(" << this->num_to_word[pre] << ' ' << this->num_to_word[x] << ")"
                        << "; P(x|pre): " << pre_x << "/" << pre_sum << std::endl;
            }
            pre = x;
        }

        unsigned x = 2; // the #END
        load_value(pre, x);
        result *= double(pre_x) / pre_sum;
        if (debug) {
            std::cout << "DEBUG: " << "pre-x: " << pre << "-" << x
                    << "(" << this->num_to_word[pre] << ' ' << this->num_to_word[x] << ")"
                    << "; P(x|pre): " << pre_x << "/" << pre_sum << std::endl;
        }

        return result;
    }

    std::string sequence_to_string(const std::vector<unsigned> & v) {
        if (v.size() == 0)
            return "";

        std::stringstream ss;
        ss << this->num_to_word[v[0]];
        for (size_t i = 1; i < v.size(); i++) {
            ss << ' ' << this->num_to_word[v[i]];
        }

        return ss.str();
    }

    /*********
     * export num_to_word and matrix as a big string
     */
    std::string export_str() {
        std::stringstream ss;
        ss << this->num_to_word.size() << std::endl;
        for (const std::string & s : this->num_to_word) {
            ss << s << std::endl;
        }

        for (std::map<unsigned, std::map<unsigned, unsigned>>::iterator i = this->bi_matrix.begin();
                i != this->bi_matrix.end();
                i++)
        {
            for (std::map<unsigned, unsigned>::iterator j = i->second.begin(); j != i->second.end(); j++) {
                ss << i->first << ' ' << j->first << ' ' << j->second << std::endl;
            }
        }

        return ss.str();
    }

    /*********
     * import data from a string, to build a bigram_tool object.
     * the string should come from export_str()
     */
    static bigram_tool import_str(const std::string & s) {
        std::stringstream ss(s);
        unsigned len;
        ss >> len;

        std::vector<std::string> num_to_word(len);
        std::map<unsigned, std::map<unsigned, unsigned>> matrix;

        for (unsigned i = 0; i < len; i++) {
            ss >> num_to_word[i];
        }

        unsigned a, b, count;
        while (ss >> a >> b >> count) {
            if (matrix.count(a)) {
                matrix[a][b] = count;
            } else {
                matrix[a] = std::map<unsigned, unsigned>();
                matrix[a][b] = count;
            }
        }

        return bigram_tool(num_to_word, matrix);
    }

};

#endif
--------------------------------------------------------------------------------