├── LICENSE ├── README.md ├── add_label.py ├── data ├── cora.embeddings.walks.0 ├── cora_features.mat ├── cora_train_data.npy ├── ind.cora.allx ├── ind.cora.ally ├── ind.cora.graph ├── ind.cora.test.index ├── ind.cora.tx ├── ind.cora.ty ├── ind.cora.x └── ind.cora.y ├── p_feature.py ├── run_emf.m ├── test.py ├── utils.py ├── w2vsbd.m └── word2vec ├── LICENSE ├── README.txt ├── compute-accuracy.c ├── demo-analogy.sh ├── demo-classes.sh ├── demo-phrase-accuracy.sh ├── demo-phrases.sh ├── demo-train-big-model-v1.sh ├── demo-word-accuracy.sh ├── demo-word.sh ├── distance.c ├── makefile ├── savedC.txt ├── savedW.txt ├── savednsc.txt ├── vectors.bin ├── word-analogy.c ├── word2phrase ├── word2phrase.c ├── word2vec └── word2vec.c /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Junliang Guo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## APNE 2 | This project contains code related to the paper "Enhancing Network Embedding with Auxiliary Information: An Explicit Matrix Factorization Perspective". 3 | APNE (Auxiliary information Preserved Network Embedding) is a graph/network embedding method that can incorporate structure information, 4 | content information and label information simultaneously in an unsupervised manner, i.e., 5 | without leveraging downstream classifiers. 6 | APNE outperforms unsupervised baselines by 10.1% to 34.4% on the node classification task. 7 | Details can be accessed [here](https://arxiv.org/abs/1711.04094). 8 | 9 | ### Requirements 10 | 11 | * MATLAB (>R2014a) 12 | * gcc (>4.4.5) 13 | * networkx 14 | 15 | ### Run the demo 16 | 17 | Run the main script in MATLAB as: 18 | ``` 19 | run_emf 20 | ``` 21 | 22 | ### Data 23 | 24 | We test our model on four datasets in the paper; here, the `./data/` folder provides the 25 | files of the `cora` dataset.
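For orientation, here is a minimal Python sketch (not part of the pipeline; the file layouts are inferred from `add_label.py` and `p_feature.py`, and the paths assume the repository root as the working directory) of how the two preprocessed `cora` files described below can be inspected:
```
import numpy as np
from scipy import io as sio

# (n_train, 2) array: each row is [node index, label], as consumed by add_label.py
train_data = np.load('data/cora_train_data.npy')

# sparse node-feature matrix stored under the key 'features', as loaded by p_feature.py
features = sio.loadmat('data/cora_features.mat')['features']

print(train_data.shape)
print(features.shape)
```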
26 | We use the dataset splits provided by [Planetoid](https://github.com/kimiyoung/planetoid), 27 | where data files are formatted as `ind.{dataset}.{suffix}`, 28 | as well as several files processed from the original files: 29 | * {dataset}.embeddings.walks.0: random walk sequences obtained by directly running [DeepWalk](https://github.com/phanein/deepwalk) 30 | * {dataset}_features.mat: node features saved as a `.mat` file 31 | * {dataset}_train_data.npy: training nodes and their corresponding labels saved as a `.npy` file 32 | 33 | You can specify a dataset by editing `run_emf.m`, where details about other hyper-parameters 34 | can also be found. 35 | 36 | ### Output 37 | 38 | We save and test a checkpoint every `verbose` iterations, and you 39 | can change its value to fit your storage budget. 40 | 41 | The final output as well as the checkpoints are `.mat` files which 42 | contain the matrices **W** and **S** described in our paper. 43 | Matrix **W** is the embedding matrix of the input graph with size 44 | `(num_nodes * emb_dim)`, and you can refer to `test.py` 45 | to evaluate its performance. 46 | 47 | ### Acknowledgements 48 | 49 | The original version of this code base was forked from [emf](https://github.com/etali/emf), 50 | and we also referred to [GCN](https://github.com/tkipf/gcn) 51 | while preprocessing the datasets. Many thanks to the authors for making their code available. 52 | 53 | ### Citing 54 | 55 | Please cite our paper if you find APNE useful in your research: 56 | ``` 57 | @inproceedings{guo2018enhancing, 58 | title={Enhancing Network Embedding with Auxiliary Information: An Explicit Matrix Factorization Perspective}, 59 | author={Guo, Junliang and Xu, Linli and Huang, Xunpeng and Chen, Enhong}, 60 | booktitle={International Conference on Database Systems for Advanced Applications}, 61 | pages={3--19}, 62 | year={2018}, 63 | organization={Springer} 64 | } 65 | ``` 66 | -------------------------------------------------------------------------------- /add_label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf-8 3 | ############################################### 4 | # File Name: add_label.py 5 | # Author: Junliang Guo@USTC 6 | # Email: guojunll@mail.ustc.edu.cn 7 | ############################################### 8 | 9 | # Usage: python add_label.py deep_dictc.txt deep_matrix.txt ./data/citeseer/train_data.npy sample_size out_deep_matrix.txt 10 | import sys 11 | import numpy as np 12 | from io import open 13 | from collections import defaultdict as dd 14 | import random 15 | import os 16 | 17 | dict_path = sys.argv[1] 18 | co_path = sys.argv[2] 19 | train_data_path = sys.argv[3] 20 | label_sample_size = int(sys.argv[4]) 21 | matrix_out = sys.argv[5] 22 | 23 | REMOVE = False 24 | if co_path == matrix_out: 25 | REMOVE = True 26 | 27 | emb_dic = {} 28 | with open(dict_path, 'r') as f: 29 | k = 0 30 | for line in f: 31 | if k == 0: 32 | k += 1 33 | continue 34 | else: 35 | word = line.strip().split()[0] 36 | word = int(word) 37 | emb_dic[word] = k 38 | k += 1 39 | 40 | train_data = np.load(train_data_path) 41 | train_index = train_data[:, 0] 42 | train_label = train_data[:, 1] 43 | occ_m = np.zeros([len(emb_dic), len(emb_dic)]) 44 | 45 | with open(co_path, 'r') as f: 46 | for line in f: 47 | words = line.strip().split() 48 | words = [int(i) for i in words] 49 | if len(words) != 3: 50 | continue 51 | occ_m[words[0] - 1][words[1] - 1] += words[2] 52 | 53 | if REMOVE: #output is same as input.
Remove input 54 | if os.path.exists(co_path): 55 | print 'remove previous co-occurrence matrix' 56 | os.remove(co_path) 57 | 58 | pairs, label2inst = [], dd(list) 59 | for i in xrange(len(train_index)): 60 | label2inst[train_label[i]].append(i) 61 | 62 | for _ in range(label_sample_size): 63 | x1 = random.randint(0, len(train_index) - 1) 64 | label = train_label[x1] 65 | x2 = random.choice(label2inst[label]) 66 | pairs.append([train_index[x1], train_index[x2]]) 67 | 68 | for pair in pairs: 69 | occ_m[emb_dic[pair[0]] - 1][emb_dic[pair[1]] - 1] += 1 70 | 71 | with open(matrix_out, 'w', encoding = 'utf-8') as f: 72 | for i in xrange(occ_m.shape[0]): 73 | for j in xrange(occ_m.shape[1]): 74 | occ = occ_m[i][j] 75 | if occ != 0: 76 | occ = int(occ) 77 | out = [str(i + 1), str(j + 1), str(occ)] 78 | out = ' '.join(out) 79 | f.write(unicode(out) + '\n') 80 | -------------------------------------------------------------------------------- /data/cora_features.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/cora_features.mat -------------------------------------------------------------------------------- /data/cora_train_data.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/cora_train_data.npy -------------------------------------------------------------------------------- /data/ind.cora.allx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/ind.cora.allx -------------------------------------------------------------------------------- /data/ind.cora.ally: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/ind.cora.ally -------------------------------------------------------------------------------- /data/ind.cora.graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/ind.cora.graph -------------------------------------------------------------------------------- /data/ind.cora.test.index: -------------------------------------------------------------------------------- 1 | 2692 2 | 2532 3 | 2050 4 | 1715 5 | 2362 6 | 2609 7 | 2622 8 | 1975 9 | 2081 10 | 1767 11 | 2263 12 | 1725 13 | 2588 14 | 2259 15 | 2357 16 | 1998 17 | 2574 18 | 2179 19 | 2291 20 | 2382 21 | 1812 22 | 1751 23 | 2422 24 | 1937 25 | 2631 26 | 2510 27 | 2378 28 | 2589 29 | 2345 30 | 1943 31 | 1850 32 | 2298 33 | 1825 34 | 2035 35 | 2507 36 | 2313 37 | 1906 38 | 1797 39 | 2023 40 | 2159 41 | 2495 42 | 1886 43 | 2122 44 | 2369 45 | 2461 46 | 1925 47 | 2565 48 | 1858 49 | 2234 50 | 2000 51 | 1846 52 | 2318 53 | 1723 54 | 2559 55 | 2258 56 | 1763 57 | 1991 58 | 1922 59 | 2003 60 | 2662 61 | 2250 62 | 2064 63 | 2529 64 | 1888 65 | 2499 66 | 2454 67 | 2320 68 | 2287 69 | 2203 70 | 2018 71 | 2002 72 | 2632 73 | 2554 74 | 2314 75 | 2537 76 | 1760 77 | 2088 78 | 2086 79 | 2218 80 | 2605 81 | 1953 82 | 2403 83 | 1920 84 | 2015 85 | 2335 86 | 2535 87 | 1837 88 | 2009 89 | 1905 90 | 2636 91 | 1942 92 | 2193 93 | 2576 94 | 2373 95 | 1873 96 | 2463 97 | 2509 98 | 1954 99 | 2656 100 | 2455 101 | 2494 102 | 
2295 103 | 2114 104 | 2561 105 | 2176 106 | 2275 107 | 2635 108 | 2442 109 | 2704 110 | 2127 111 | 2085 112 | 2214 113 | 2487 114 | 1739 115 | 2543 116 | 1783 117 | 2485 118 | 2262 119 | 2472 120 | 2326 121 | 1738 122 | 2170 123 | 2100 124 | 2384 125 | 2152 126 | 2647 127 | 2693 128 | 2376 129 | 1775 130 | 1726 131 | 2476 132 | 2195 133 | 1773 134 | 1793 135 | 2194 136 | 2581 137 | 1854 138 | 2524 139 | 1945 140 | 1781 141 | 1987 142 | 2599 143 | 1744 144 | 2225 145 | 2300 146 | 1928 147 | 2042 148 | 2202 149 | 1958 150 | 1816 151 | 1916 152 | 2679 153 | 2190 154 | 1733 155 | 2034 156 | 2643 157 | 2177 158 | 1883 159 | 1917 160 | 1996 161 | 2491 162 | 2268 163 | 2231 164 | 2471 165 | 1919 166 | 1909 167 | 2012 168 | 2522 169 | 1865 170 | 2466 171 | 2469 172 | 2087 173 | 2584 174 | 2563 175 | 1924 176 | 2143 177 | 1736 178 | 1966 179 | 2533 180 | 2490 181 | 2630 182 | 1973 183 | 2568 184 | 1978 185 | 2664 186 | 2633 187 | 2312 188 | 2178 189 | 1754 190 | 2307 191 | 2480 192 | 1960 193 | 1742 194 | 1962 195 | 2160 196 | 2070 197 | 2553 198 | 2433 199 | 1768 200 | 2659 201 | 2379 202 | 2271 203 | 1776 204 | 2153 205 | 1877 206 | 2027 207 | 2028 208 | 2155 209 | 2196 210 | 2483 211 | 2026 212 | 2158 213 | 2407 214 | 1821 215 | 2131 216 | 2676 217 | 2277 218 | 2489 219 | 2424 220 | 1963 221 | 1808 222 | 1859 223 | 2597 224 | 2548 225 | 2368 226 | 1817 227 | 2405 228 | 2413 229 | 2603 230 | 2350 231 | 2118 232 | 2329 233 | 1969 234 | 2577 235 | 2475 236 | 2467 237 | 2425 238 | 1769 239 | 2092 240 | 2044 241 | 2586 242 | 2608 243 | 1983 244 | 2109 245 | 2649 246 | 1964 247 | 2144 248 | 1902 249 | 2411 250 | 2508 251 | 2360 252 | 1721 253 | 2005 254 | 2014 255 | 2308 256 | 2646 257 | 1949 258 | 1830 259 | 2212 260 | 2596 261 | 1832 262 | 1735 263 | 1866 264 | 2695 265 | 1941 266 | 2546 267 | 2498 268 | 2686 269 | 2665 270 | 1784 271 | 2613 272 | 1970 273 | 2021 274 | 2211 275 | 2516 276 | 2185 277 | 2479 278 | 2699 279 | 2150 280 | 1990 281 | 2063 282 | 2075 283 | 1979 284 | 2094 285 | 1787 286 | 2571 287 | 2690 288 | 1926 289 | 2341 290 | 2566 291 | 1957 292 | 1709 293 | 1955 294 | 2570 295 | 2387 296 | 1811 297 | 2025 298 | 2447 299 | 2696 300 | 2052 301 | 2366 302 | 1857 303 | 2273 304 | 2245 305 | 2672 306 | 2133 307 | 2421 308 | 1929 309 | 2125 310 | 2319 311 | 2641 312 | 2167 313 | 2418 314 | 1765 315 | 1761 316 | 1828 317 | 2188 318 | 1972 319 | 1997 320 | 2419 321 | 2289 322 | 2296 323 | 2587 324 | 2051 325 | 2440 326 | 2053 327 | 2191 328 | 1923 329 | 2164 330 | 1861 331 | 2339 332 | 2333 333 | 2523 334 | 2670 335 | 2121 336 | 1921 337 | 1724 338 | 2253 339 | 2374 340 | 1940 341 | 2545 342 | 2301 343 | 2244 344 | 2156 345 | 1849 346 | 2551 347 | 2011 348 | 2279 349 | 2572 350 | 1757 351 | 2400 352 | 2569 353 | 2072 354 | 2526 355 | 2173 356 | 2069 357 | 2036 358 | 1819 359 | 1734 360 | 1880 361 | 2137 362 | 2408 363 | 2226 364 | 2604 365 | 1771 366 | 2698 367 | 2187 368 | 2060 369 | 1756 370 | 2201 371 | 2066 372 | 2439 373 | 1844 374 | 1772 375 | 2383 376 | 2398 377 | 1708 378 | 1992 379 | 1959 380 | 1794 381 | 2426 382 | 2702 383 | 2444 384 | 1944 385 | 1829 386 | 2660 387 | 2497 388 | 2607 389 | 2343 390 | 1730 391 | 2624 392 | 1790 393 | 1935 394 | 1967 395 | 2401 396 | 2255 397 | 2355 398 | 2348 399 | 1931 400 | 2183 401 | 2161 402 | 2701 403 | 1948 404 | 2501 405 | 2192 406 | 2404 407 | 2209 408 | 2331 409 | 1810 410 | 2363 411 | 2334 412 | 1887 413 | 2393 414 | 2557 415 | 1719 416 | 1732 417 | 1986 418 | 2037 419 | 2056 420 | 1867 421 | 2126 422 | 1932 423 | 2117 424 | 1807 425 | 
1801 426 | 1743 427 | 2041 428 | 1843 429 | 2388 430 | 2221 431 | 1833 432 | 2677 433 | 1778 434 | 2661 435 | 2306 436 | 2394 437 | 2106 438 | 2430 439 | 2371 440 | 2606 441 | 2353 442 | 2269 443 | 2317 444 | 2645 445 | 2372 446 | 2550 447 | 2043 448 | 1968 449 | 2165 450 | 2310 451 | 1985 452 | 2446 453 | 1982 454 | 2377 455 | 2207 456 | 1818 457 | 1913 458 | 1766 459 | 1722 460 | 1894 461 | 2020 462 | 1881 463 | 2621 464 | 2409 465 | 2261 466 | 2458 467 | 2096 468 | 1712 469 | 2594 470 | 2293 471 | 2048 472 | 2359 473 | 1839 474 | 2392 475 | 2254 476 | 1911 477 | 2101 478 | 2367 479 | 1889 480 | 1753 481 | 2555 482 | 2246 483 | 2264 484 | 2010 485 | 2336 486 | 2651 487 | 2017 488 | 2140 489 | 1842 490 | 2019 491 | 1890 492 | 2525 493 | 2134 494 | 2492 495 | 2652 496 | 2040 497 | 2145 498 | 2575 499 | 2166 500 | 1999 501 | 2434 502 | 1711 503 | 2276 504 | 2450 505 | 2389 506 | 2669 507 | 2595 508 | 1814 509 | 2039 510 | 2502 511 | 1896 512 | 2168 513 | 2344 514 | 2637 515 | 2031 516 | 1977 517 | 2380 518 | 1936 519 | 2047 520 | 2460 521 | 2102 522 | 1745 523 | 2650 524 | 2046 525 | 2514 526 | 1980 527 | 2352 528 | 2113 529 | 1713 530 | 2058 531 | 2558 532 | 1718 533 | 1864 534 | 1876 535 | 2338 536 | 1879 537 | 1891 538 | 2186 539 | 2451 540 | 2181 541 | 2638 542 | 2644 543 | 2103 544 | 2591 545 | 2266 546 | 2468 547 | 1869 548 | 2582 549 | 2674 550 | 2361 551 | 2462 552 | 1748 553 | 2215 554 | 2615 555 | 2236 556 | 2248 557 | 2493 558 | 2342 559 | 2449 560 | 2274 561 | 1824 562 | 1852 563 | 1870 564 | 2441 565 | 2356 566 | 1835 567 | 2694 568 | 2602 569 | 2685 570 | 1893 571 | 2544 572 | 2536 573 | 1994 574 | 1853 575 | 1838 576 | 1786 577 | 1930 578 | 2539 579 | 1892 580 | 2265 581 | 2618 582 | 2486 583 | 2583 584 | 2061 585 | 1796 586 | 1806 587 | 2084 588 | 1933 589 | 2095 590 | 2136 591 | 2078 592 | 1884 593 | 2438 594 | 2286 595 | 2138 596 | 1750 597 | 2184 598 | 1799 599 | 2278 600 | 2410 601 | 2642 602 | 2435 603 | 1956 604 | 2399 605 | 1774 606 | 2129 607 | 1898 608 | 1823 609 | 1938 610 | 2299 611 | 1862 612 | 2420 613 | 2673 614 | 1984 615 | 2204 616 | 1717 617 | 2074 618 | 2213 619 | 2436 620 | 2297 621 | 2592 622 | 2667 623 | 2703 624 | 2511 625 | 1779 626 | 1782 627 | 2625 628 | 2365 629 | 2315 630 | 2381 631 | 1788 632 | 1714 633 | 2302 634 | 1927 635 | 2325 636 | 2506 637 | 2169 638 | 2328 639 | 2629 640 | 2128 641 | 2655 642 | 2282 643 | 2073 644 | 2395 645 | 2247 646 | 2521 647 | 2260 648 | 1868 649 | 1988 650 | 2324 651 | 2705 652 | 2541 653 | 1731 654 | 2681 655 | 2707 656 | 2465 657 | 1785 658 | 2149 659 | 2045 660 | 2505 661 | 2611 662 | 2217 663 | 2180 664 | 1904 665 | 2453 666 | 2484 667 | 1871 668 | 2309 669 | 2349 670 | 2482 671 | 2004 672 | 1965 673 | 2406 674 | 2162 675 | 1805 676 | 2654 677 | 2007 678 | 1947 679 | 1981 680 | 2112 681 | 2141 682 | 1720 683 | 1758 684 | 2080 685 | 2330 686 | 2030 687 | 2432 688 | 2089 689 | 2547 690 | 1820 691 | 1815 692 | 2675 693 | 1840 694 | 2658 695 | 2370 696 | 2251 697 | 1908 698 | 2029 699 | 2068 700 | 2513 701 | 2549 702 | 2267 703 | 2580 704 | 2327 705 | 2351 706 | 2111 707 | 2022 708 | 2321 709 | 2614 710 | 2252 711 | 2104 712 | 1822 713 | 2552 714 | 2243 715 | 1798 716 | 2396 717 | 2663 718 | 2564 719 | 2148 720 | 2562 721 | 2684 722 | 2001 723 | 2151 724 | 2706 725 | 2240 726 | 2474 727 | 2303 728 | 2634 729 | 2680 730 | 2055 731 | 2090 732 | 2503 733 | 2347 734 | 2402 735 | 2238 736 | 1950 737 | 2054 738 | 2016 739 | 1872 740 | 2233 741 | 1710 742 | 2032 743 | 2540 744 | 2628 745 | 1795 746 | 2616 747 | 1903 748 | 
2531 749 | 2567 750 | 1946 751 | 1897 752 | 2222 753 | 2227 754 | 2627 755 | 1856 756 | 2464 757 | 2241 758 | 2481 759 | 2130 760 | 2311 761 | 2083 762 | 2223 763 | 2284 764 | 2235 765 | 2097 766 | 1752 767 | 2515 768 | 2527 769 | 2385 770 | 2189 771 | 2283 772 | 2182 773 | 2079 774 | 2375 775 | 2174 776 | 2437 777 | 1993 778 | 2517 779 | 2443 780 | 2224 781 | 2648 782 | 2171 783 | 2290 784 | 2542 785 | 2038 786 | 1855 787 | 1831 788 | 1759 789 | 1848 790 | 2445 791 | 1827 792 | 2429 793 | 2205 794 | 2598 795 | 2657 796 | 1728 797 | 2065 798 | 1918 799 | 2427 800 | 2573 801 | 2620 802 | 2292 803 | 1777 804 | 2008 805 | 1875 806 | 2288 807 | 2256 808 | 2033 809 | 2470 810 | 2585 811 | 2610 812 | 2082 813 | 2230 814 | 1915 815 | 1847 816 | 2337 817 | 2512 818 | 2386 819 | 2006 820 | 2653 821 | 2346 822 | 1951 823 | 2110 824 | 2639 825 | 2520 826 | 1939 827 | 2683 828 | 2139 829 | 2220 830 | 1910 831 | 2237 832 | 1900 833 | 1836 834 | 2197 835 | 1716 836 | 1860 837 | 2077 838 | 2519 839 | 2538 840 | 2323 841 | 1914 842 | 1971 843 | 1845 844 | 2132 845 | 1802 846 | 1907 847 | 2640 848 | 2496 849 | 2281 850 | 2198 851 | 2416 852 | 2285 853 | 1755 854 | 2431 855 | 2071 856 | 2249 857 | 2123 858 | 1727 859 | 2459 860 | 2304 861 | 2199 862 | 1791 863 | 1809 864 | 1780 865 | 2210 866 | 2417 867 | 1874 868 | 1878 869 | 2116 870 | 1961 871 | 1863 872 | 2579 873 | 2477 874 | 2228 875 | 2332 876 | 2578 877 | 2457 878 | 2024 879 | 1934 880 | 2316 881 | 1841 882 | 1764 883 | 1737 884 | 2322 885 | 2239 886 | 2294 887 | 1729 888 | 2488 889 | 1974 890 | 2473 891 | 2098 892 | 2612 893 | 1834 894 | 2340 895 | 2423 896 | 2175 897 | 2280 898 | 2617 899 | 2208 900 | 2560 901 | 1741 902 | 2600 903 | 2059 904 | 1747 905 | 2242 906 | 2700 907 | 2232 908 | 2057 909 | 2147 910 | 2682 911 | 1792 912 | 1826 913 | 2120 914 | 1895 915 | 2364 916 | 2163 917 | 1851 918 | 2391 919 | 2414 920 | 2452 921 | 1803 922 | 1989 923 | 2623 924 | 2200 925 | 2528 926 | 2415 927 | 1804 928 | 2146 929 | 2619 930 | 2687 931 | 1762 932 | 2172 933 | 2270 934 | 2678 935 | 2593 936 | 2448 937 | 1882 938 | 2257 939 | 2500 940 | 1899 941 | 2478 942 | 2412 943 | 2107 944 | 1746 945 | 2428 946 | 2115 947 | 1800 948 | 1901 949 | 2397 950 | 2530 951 | 1912 952 | 2108 953 | 2206 954 | 2091 955 | 1740 956 | 2219 957 | 1976 958 | 2099 959 | 2142 960 | 2671 961 | 2668 962 | 2216 963 | 2272 964 | 2229 965 | 2666 966 | 2456 967 | 2534 968 | 2697 969 | 2688 970 | 2062 971 | 2691 972 | 2689 973 | 2154 974 | 2590 975 | 2626 976 | 2390 977 | 1813 978 | 2067 979 | 1952 980 | 2518 981 | 2358 982 | 1789 983 | 2076 984 | 2049 985 | 2119 986 | 2013 987 | 2124 988 | 2556 989 | 2105 990 | 2093 991 | 1885 992 | 2305 993 | 2354 994 | 2135 995 | 2601 996 | 1770 997 | 1995 998 | 2504 999 | 1749 1000 | 2157 1001 | -------------------------------------------------------------------------------- /data/ind.cora.tx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/ind.cora.tx -------------------------------------------------------------------------------- /data/ind.cora.ty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/ind.cora.ty -------------------------------------------------------------------------------- /data/ind.cora.x: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/ind.cora.x -------------------------------------------------------------------------------- /data/ind.cora.y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/data/ind.cora.y -------------------------------------------------------------------------------- /p_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf-8 3 | ############################################### 4 | # File Name: p_feature.py 5 | # Author: Junliang Guo@USTC 6 | # Email: guojunll@mail.ustc.edu.cn 7 | ############################################### 8 | 9 | import sys 10 | from scipy import sparse as sp 11 | from scipy import io as sio 12 | import numpy as np 13 | from io import open 14 | #python p_feature.py in_file out_file dict_file 15 | 16 | feature_in = sys.argv[1] 17 | out_p = sys.argv[2] 18 | dic_p = sys.argv[3] 19 | 20 | dic = {} 21 | k = 0 22 | with open(dic_p, 'r') as f: 23 | for line in f: 24 | #print line 25 | if k == 0: 26 | k += 1 27 | else: 28 | node = line.strip().split()[0] 29 | dic[k] = node 30 | k += 1 31 | #print len(dic) 32 | features = sio.loadmat(feature_in)['features'] 33 | #print features[int(dic[11])] 34 | features = features.todense() 35 | #print features.shape 36 | temp_m = np.zeros([len(dic), features.shape[1]]) 37 | #print temp_m.shape 38 | for i in xrange(temp_m.shape[0]): 39 | temp_m[i] = features[int(dic[i + 1])] 40 | temp_m = sp.csr_matrix(temp_m, dtype = 'double') 41 | #print temp_m[10] 42 | sio.savemat(out_p, {'features': temp_m}) 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /run_emf.m: -------------------------------------------------------------------------------- 1 | % run_emf.m 2 | % Author: Junliang Guo@USTC 3 | % Email: guojunll@mail.ustc.edu.cn 4 | 5 | % Generalized Neural Graph Embedding with Matrix Factorization 6 | clear; 7 | 8 | % Options 9 | GET_COOCCURRENCE = 0; 10 | EXECUTE_EMF = 1; 11 | USE_LABEL = 1; 12 | CHECKPOINT = 0; 13 | dataset = 'cora'; 14 | num_class = 7; 15 | % Configuration of co-occurrence matrix 16 | checkpoint_file = './data/save_temp/cora_100.mat'; % checkpoint file if CHECKPOINT==1 17 | feature_file = 'cora_features.mat'; % features of input networks 18 | data_filename = 'cora.embeddings.walks.0'; % preprocessed random walk sequences 19 | train_save = 'cora_train_data.npy'; % training data saved path 20 | vocab_filename = 'cora_dictc.txt'; % vocabulary filename 21 | co_occurrence_matrix_filename = 'cora_matrix.txt'; % co-occurrence matrix filename 22 | co_occurrence_mat_outfilename = 'cora_w2v.mat'; % co-occurrence matrix filename (matlab format) 23 | label_sample = 200; % number of sampled label context 24 | window_size = 10; % window size of word2vec(toolbox) that will influence the construction of co-occurrence matrix 25 | window_size = floor(window_size/2); 26 | min_count = 0; % min-count of word2vec(toolbox) that filters out words of low frequency 27 | 28 | % Configuration of learning algorithm 29 | maxiter = 200; % maximum number of iteration of main loop 30 | inner_maxiter = 50; % maximum number of iteration of inner loop 31 | stepsize = 1e-7; % step-size of descending/ascending 32 | negative = 2; % negative sampling parameter that is represented by k in our paper 33 | embedding_vector_dim = 200; 
% embedding dimentionality 34 | save_embedding_vector_filename = 'cora_vector.mat'; % filename for saving embedding vector 35 | verbose = 5; % set verbose_acc to 0, there will be no verbose description 36 | 37 | % Run skip-gram negative sampling(SGNS) in word2vec and get the co-occurrence matrix 38 | % where the element in i-th column and j-th row represent the co-occurrence count of i-th word and j-th word 39 | if(GET_COOCCURRENCE) 40 | display('start extraction of co-occurrence matrix from SGNS'); 41 | cd word2vec 42 | system('make'); % we only compile the word2vec.c 43 | system(['chmod u+x ', 'word2vec']); 44 | cmd_line = sprintf('time ./word2vec -train %s -save-vocab %s -matrix %s -output vectors.bin -saveW savedW.txt -saveC savedC.txt -nsc savednsc.txt -cbow 0 -size %d -window %d -negative %d -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 -min-count %d', ..., 45 | ['../data/', data_filename], ['../data/', vocab_filename], ['../data/', co_occurrence_matrix_filename], embedding_vector_dim, window_size, negative, min_count); 46 | display(cmd_line); 47 | system(cmd_line); 48 | cd .. 49 | if(USE_LABEL) 50 | display('add label into co-occurrence matrix'); 51 | cmd_line = sprintf('python add_label.py %s %s %s %d %s', ['./data/', vocab_filename], ['./data/', co_occurrence_matrix_filename], ['./data/', train_save], label_sample, ['./data/', co_occurrence_matrix_filename]); 52 | display(cmd_line); 53 | system(cmd_line); 54 | end 55 | temp = load(['./data/', co_occurrence_matrix_filename]); 56 | w2vmatrix = spconvert(temp); 57 | save(['./data/', co_occurrence_mat_outfilename], 'w2vmatrix'); 58 | display('end get co-occurrence'); 59 | end 60 | 61 | % Run Explicit Matrix Factorization 62 | if(EXECUTE_EMF) 63 | clc; 64 | % run EMF 65 | display('start EMF'); 66 | w2vsbd(['./data/', co_occurrence_mat_outfilename], ..., 67 | maxiter, inner_maxiter, stepsize, negative, embedding_vector_dim, verbose, ['./data/', save_embedding_vector_filename], ..., 68 | ['./data/', feature_file], ['./data/', vocab_filename], ..., 69 | CHECKPOINT, checkpoint_file, dataset, num_class); 70 | display('end EMF'); 71 | end 72 | 73 | 74 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ############################################### 4 | # File Name: test.py 5 | # Author: Junliang Guo@USTC 6 | # Email: guojunll@mail.ustc.edu.cn 7 | ############################################### 8 | 9 | import numpy as np 10 | from scipy import sparse 11 | from io import open 12 | import json 13 | import random 14 | import sys 15 | from utils import load_pdata 16 | from scipy.io import loadmat 17 | from sklearn.linear_model import LogisticRegression 18 | from sklearn.metrics import accuracy_score, f1_score 19 | from sklearn.multiclass import OneVsRestClassifier 20 | from itertools import izip 21 | 22 | #python embedding_file dict dataset classes 23 | 24 | mat_variables = loadmat(sys.argv[1]) 25 | W_matrix = mat_variables['W_t'] 26 | 27 | dic_p = sys.argv[2] 28 | emb_dic = {} 29 | with open(dic_p, 'r') as f: 30 | k = 0 31 | for line in f: 32 | if k == 0: 33 | k += 1 34 | continue 35 | else: 36 | word = line.strip().split()[0] 37 | word = int(word) 38 | emb_dic[word] = k 39 | k += 1 40 | classes = int(sys.argv[4]) 41 | _, _, train_data, test_data = load_pdata(sys.argv[3]) 42 | index = test_data[:, 0] 43 | test_l = test_data[:, 1] 44 | test_label = [] 45 | for i in 
xrange(test_data.shape[0]): 46 | temp = [0] * classes 47 | temp[test_data[i][1] - 1] += 1 48 | test_label.append(temp) 49 | test_label = np.array(test_label) #1000 * 6 50 | 51 | train_index = train_data[:, 0] 52 | train_l = train_data[:, 1] 53 | train_label = [] 54 | for i in xrange(train_data.shape[0]): 55 | temp = [0] * classes 56 | temp[train_data[i][1] - 1] += 1 57 | train_label.append(temp) 58 | train_label = np.array(train_label) #120 * 6 59 | test_in = [] 60 | train_in = [] 61 | 62 | W = np.transpose(W_matrix) 63 | for i in index: 64 | zeros = [0] * W.shape[1] 65 | if i in emb_dic: 66 | emb_id = emb_dic[i] 67 | if emb_id <= W.shape[0]: 68 | emb_v = W[emb_id - 1, :] 69 | test_in.append(emb_v) 70 | else: 71 | test_in.append(zeros) 72 | else: 73 | test_in.append(zeros) 74 | 75 | for i in train_index: 76 | zeros = [0] * W.shape[1] 77 | if i in emb_dic: 78 | emb_id = emb_dic[i] 79 | if emb_id <= W.shape[0]: 80 | emb_v = W[emb_id - 1, :] 81 | train_in.append(emb_v) 82 | else: 83 | train_in.append(zeros) 84 | else: 85 | train_in.append(zeros) 86 | 87 | 88 | test_in = np.asarray(test_in) 89 | train_in = np.asarray(train_in) 90 | 91 | class TopKRanker(OneVsRestClassifier): 92 | def predict(self, X, top_k_list): 93 | assert X.shape[0] == len(top_k_list) 94 | probs = np.asarray(super(TopKRanker, self).predict_proba(X)) 95 | all_labels = [] 96 | for i, k in enumerate(top_k_list): 97 | probs_ = probs[i, :] 98 | labels = self.classes_[probs_.argsort()[-k:]].tolist() 99 | all_labels.append(labels) 100 | return all_labels 101 | y_train_ = sparse.coo_matrix(train_label) 102 | y_train = [[] for x in xrange(y_train_.shape[0])] 103 | cy = y_train_.tocoo() 104 | for i, j in izip(cy.row, cy.col): 105 | y_train[i].append(j) 106 | 107 | assert sum(len(l) for l in y_train) == y_train_.nnz 108 | 109 | y_test_ = sparse.coo_matrix(test_label) 110 | 111 | y_test = [[] for x in xrange(y_test_.shape[0])] 112 | cy = y_test_.tocoo() 113 | for i, j in izip(cy.row, cy.col): 114 | y_test[i].append(j) 115 | y_train = np.array(y_train) 116 | clf = TopKRanker(LogisticRegression()) 117 | clf.fit(train_in, y_train) 118 | 119 | top_k_list = [len(l) for l in y_test] 120 | preds = clf.predict(test_in, top_k_list) 121 | acc = accuracy_score(y_test, preds) 122 | print 'acc: %.3f' % acc -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ############################################### 4 | # File Name: utils.py 5 | # Author: Junliang Guo@USTC 6 | # Email: guojunll@mail.ustc.edu.cn 7 | ############################################### 8 | 9 | import argparse 10 | import random 11 | from scipy import sparse as sp 12 | from scipy import io as sio 13 | import networkx as nx 14 | import numpy as np 15 | import pickle as pkl 16 | 17 | def sparse_to_tuple(sparse_mx): 18 | if not sp.isspmatrix_coo(sparse_mx): 19 | sparse_mx = sparse_mx.tocoo() 20 | coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() 21 | values = sparse_mx.data 22 | shape = sparse_mx.shape 23 | return coords, values, shape 24 | 25 | def parse_index_file(filename): 26 | index = [] 27 | for line in open(filename): 28 | index.append(int(line.strip())) 29 | return index 30 | 31 | def sample_mask(idx, l): 32 | """Create mask.""" 33 | mask = np.zeros(l) 34 | mask[idx] = 1 35 | return np.array(mask, dtype=np.bool) 36 | 37 | def process_feature(f_before, dic_p): 38 | 39 | dic = {} 40 | k = 0 41 
| with open(dic_p, 'r') as f: 42 | for line in f: 43 | if k == 0: 44 | k += 1 45 | else: 46 | node = line.strip().split()[0] 47 | dic[k] = node 48 | k += 1 49 | features = f_before.todense() 50 | temp_m = np.zeros([features.shape[0], features.shape[1]]) 51 | for i in xrange(temp_m.shape[0]): 52 | temp_m[i] = features[int(dic[i + 1])] 53 | f_after = sp.csr_matrix(temp_m) 54 | return f_after 55 | 56 | def load_pdata(dataset_str): 57 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 58 | objects = [] 59 | for i in xrange(len(names)): 60 | objects.append(pkl.load(open("./data/ind.{}.{}".format(dataset_str, names[i])))) 61 | x, y, tx, ty, allx, ally, graph = tuple(objects) 62 | test_idx_reorder = parse_index_file("./data/ind.{}.test.index".format(dataset_str)) 63 | test_idx_range = np.sort(test_idx_reorder) 64 | if dataset_str == 'citeseer': 65 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) 66 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 67 | tx_extended[test_idx_range-min(test_idx_range), :] = tx 68 | tx = tx_extended 69 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 70 | ty_extended[test_idx_range-min(test_idx_range), :] = ty 71 | ty = ty_extended 72 | features = sp.vstack((allx, tx)).tolil() 73 | features[test_idx_reorder, :] = features[test_idx_range, :] 74 | 75 | labels = np.vstack((ally, ty)) 76 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 77 | 78 | idx_test = test_idx_range.tolist() 79 | idx_train = range(len(y)) 80 | 81 | train_mask = sample_mask(idx_train, labels.shape[0]) 82 | test_mask = sample_mask(idx_test, labels.shape[0]) 83 | 84 | y_train = np.zeros(labels.shape) 85 | y_test = np.zeros(labels.shape) 86 | y_train[train_mask, :] = labels[train_mask, :] 87 | y_test[test_mask, :] = labels[test_mask, :] 88 | 89 | train_out = [] 90 | for i in idx_train: 91 | ll = y_train[i].tolist() 92 | ll = ll.index(1) + 1 93 | train_out.append([i, ll]) 94 | train_out = np.array(train_out) 95 | np.random.shuffle(train_out) 96 | 97 | test_out = [] 98 | for i in idx_test: 99 | ll = y_test[i].tolist() 100 | ll = ll.index(1) + 1 101 | test_out.append([i, ll]) 102 | test_out = np.array(test_out) 103 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) 104 | adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) 105 | adj.eliminate_zeros() 106 | # Check that diag is zero: 107 | assert np.diag(adj.todense()).sum() == 0 108 | 109 | adj_triu = sp.triu(adj) 110 | adj_tuple = sparse_to_tuple(adj_triu) 111 | edges = adj_tuple[0] 112 | edges_all = sparse_to_tuple(adj)[0] 113 | num_mask = int(np.floor(edges.shape[0] / 10.)) 114 | 115 | return graph, features, train_out, test_out 116 | -------------------------------------------------------------------------------- /w2vsbd.m: -------------------------------------------------------------------------------- 1 | % w2vsbd.m 2 | % EMF block minimization/maximization 3 | % Author: Junliang Guo@USTC 4 | % Email: guojunll@mail.ustc.edu.cn 5 | 6 | % Objective: min MF(D, C*W) 7 | % Algorithm: 8 | % While 1 9 | % while 10 | % C = C - MFgrad(C); 11 | % while 12 | % W = W - MFgrad(D); 13 | % Until Converge 14 | 15 | function w2vsbd(co_mat_filename, maxiter, ..., 16 | inner_maxiter, stepsize, k, dim, verbose_acc, save_embedding_vector_filename, ..., 17 | feature_file, dict_file, ..., 18 | CHECKPOINT, checkpoint_file, dataset, class_num) 19 | load(co_mat_filename); 20 | rand('state',2014); 21 | 22 | H = full(w2vmatrix); 23 | disp(['size: ', 
num2str(size(H))]); 24 | 25 | display('process the feature matrix'); 26 | t_file = 'data/temp.mat'; 27 | cmd_line = sprintf('python p_feature.py %s %s %s', feature_file, t_file, dict_file); 28 | %display(cmd_line); 29 | system(cmd_line); 30 | display('process end') 31 | load(t_file); 32 | 33 | F = full(features); 34 | F = F(1:size(w2vmatrix, 1), :); 35 | d_f = size(F, 2); 36 | 37 | randlist = []; 38 | [sample_num, context_num] = size(H); 39 | D = H'; 40 | eff = 1/k; 41 | 42 | % construct Q 43 | Qw = sum(H, 2); 44 | Qc = sum(H, 1); 45 | Qnum = sum(sum(H)); 46 | Qtemp = Qw*Qc./(eff*Qnum); 47 | Q = Qtemp + H; 48 | 49 | if (CHECKPOINT) 50 | load(checkpoint_file); 51 | W = W_t 52 | S = S_t 53 | else 54 | % random initialize 55 | W = (rand(dim, sample_num) - 0.5)/dim/200; 56 | S = (rand(d_f, dim) - 0.5) / dim; 57 | end 58 | % Use GPU acceleration 59 | S = gpuArray(S); 60 | D = gpuArray(D); 61 | Q = gpuArray(Q); 62 | W = gpuArray(W); 63 | F = gpuArray(F); 64 | 65 | accuracy_list = []; 66 | err_list = []; 67 | 68 | for iter = 1:maxiter 69 | W_last = W; 70 | S_last = S; 71 | 72 | if mod(iter,2) 73 | % descent W 74 | for inner_iter = 1:inner_maxiter 75 | ED = Q'.*(1./(1 + exp(-F*S*W))); 76 | recons = D - ED; 77 | W_grad = S'*F'*recons; 78 | W = W + stepsize*W_grad; 79 | end 80 | else 81 | % descent S 82 | for inner_iter = 1:inner_maxiter 83 | ED = Q'.*(1./(1 + exp(-F*S*W))); 84 | recons = D - ED; 85 | S_grad = F'*recons*W'; 86 | S = S + stepsize*S_grad; 87 | end 88 | end 89 | 90 | if 0 == mod(iter, verbose_acc) 91 | err = norm(recons, 'fro'); 92 | err_list = [err_list err]; 93 | W_reg_fro = norm(W, 'fro')/sample_num; 94 | S_reg_fro = norm(S, 'fro')/sample_num; 95 | disp(['epoch:', num2str(iter),',err:', num2str(err), ',W:', num2str(W_reg_fro), ..., 96 | ',S:', num2str(S_reg_fro), ',stepsize:', num2str(stepsize)]); 97 | % Save checkpoint and test 98 | save_temp = ['data/save_temp/', dataset, num2str(iter), '.mat']; 99 | W_t = gather(W); 100 | S_t = gather(S); 101 | save(save_temp, 'W_t', 'S_t'); 102 | cmd_line = sprintf('python test.py %s %s %s %d', save_temp, dict_file, dataset, class_num); 103 | system(cmd_line); 104 | end 105 | end 106 | W = gather(W); 107 | S = gather(S); 108 | save(save_embedding_vector_filename, 'W', 'S'); 109 | end 110 | -------------------------------------------------------------------------------- /word2vec/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /word2vec/README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representations of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram model (SG), as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-Gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-Gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and / or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | -------------------------------------------------------------------------------- /word2vec/compute-accuracy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
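//
// Added note (documentation only, not in the upstream word2vec distribution):
// this appears to be the stock compute-accuracy tool with one local change,
// marked "altered by Yitan Li" below. While the binary vector file passed as
// argv[1] is loaded, every vector is additionally written out, one word per
// line and before length normalization, to the text file train_vector.txt.
// The rest of the program behaves as in upstream word2vec: it reads analogy
// questions "A B C D" from stdin, finds the vocabulary word closest to
// B - A + C (skipping questions containing out-of-vocabulary words), and
// reports top-1 accuracy; an optional argv[2] threshold restricts the search
// to the first (most frequent) <threshold> vocabulary words.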
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | char *outputmatrix = "train_vector.txt"; 26 | 27 | int main(int argc, char **argv) 28 | { 29 | FILE *f,*fom; 30 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; 31 | float dist, len, bestd[N], vec[max_size]; 32 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 33 | float *M; 34 | char *vocab; 35 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; 36 | if (argc < 2) { 37 | printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); 38 | return 0; 39 | } 40 | strcpy(file_name, argv[1]); 41 | if (argc > 2) threshold = atoi(argv[2]); 42 | f = fopen(file_name, "rb"); 43 | if (f == NULL) { 44 | printf("Input file not found\n"); 45 | return -1; 46 | } 47 | // altered by Yitan Li 48 | fom = fopen(outputmatrix, "w"); 49 | if (fom == NULL) { 50 | printf("outputmatrix not open\n"); 51 | return -1; 52 | } 53 | 54 | fscanf(f, "%lld", &words); 55 | if (threshold) if (words > threshold) words = threshold; 56 | fscanf(f, "%lld", &size); 57 | vocab = (char *)malloc(words * max_w * sizeof(char)); 58 | M = (float *)malloc(words * size * sizeof(float)); 59 | if (M == NULL) { 60 | printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); 61 | return -1; 62 | } 63 | for (b = 0; b < words; b++) { 64 | a = 0; 65 | while (1) { 66 | vocab[b * max_w + a] = fgetc(f); 67 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 68 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 69 | } 70 | vocab[b * max_w + a] = 0; 71 | for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 72 | for (a = 0; a < size; a++) 73 | { 74 | fread(&M[a + b * size], sizeof(float), 1, f); 75 | fprintf(fom, "%f ", M[a + b * size]); 76 | } 77 | fprintf(fom, "\n"); 78 | len = 0; 79 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 80 | len = sqrt(len); 81 | for (a = 0; a < size; a++) M[a + b * size] /= len; 82 | } 83 | fclose(f); 84 | fclose(fom); 85 | TCN = 0; 86 | while (1) { 87 | for (a = 0; a < N; a++) bestd[a] = 0; 88 | for (a = 0; a < N; a++) bestw[a][0] = 0; 89 | scanf("%s", st1); 90 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 91 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 92 | if (TCN == 0) TCN = 1; 93 | if (QID != 0) { 94 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); 95 | printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); 96 | } 97 | QID++; 98 | scanf("%s", st1); 99 | if (feof(stdin)) break; 100 | printf("%s:\n", st1); 101 | TCN = 0; 102 | CCN = 0; 103 | continue; 104 | } 105 | if (!strcmp(st1, "EXIT")) break; 106 | scanf("%s", st2); 107 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 108 | scanf("%s", st3); 109 | for (a = 0; a bestd[a]) { 136 | for (d = N - 1; d > a; d--) { 137 | bestd[d] = bestd[d - 1]; 138 | strcpy(bestw[d], bestw[d - 1]); 139 | } 140 | bestd[a] = dist; 141 | 
strcpy(bestw[a], &vocab[c * max_w]); 142 | break; 143 | } 144 | } 145 | } 146 | if (!strcmp(st4, bestw[0])) { 147 | CCN++; 148 | CACN++; 149 | if (QID <= 5) SEAC++; else SYAC++; 150 | } 151 | if (QID <= 5) SECN++; else SYCN++; 152 | TCN++; 153 | TACN++; 154 | } 155 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100); 156 | return 0; 157 | } 158 | -------------------------------------------------------------------------------- /word2vec/demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /word2vec/demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /word2vec/demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /word2vec/demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /word2vec/demo-train-big-model-v1.sh: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Script for training good word and phrase vector model using public corpora, version 1.0. 4 | # The training time will be from several hours to about a day. 5 | # 6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains 7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks. 8 | # 9 | ############################################################################################### 10 | 11 | # This function will convert text to lowercase and remove special characters 12 | normalize_text() { 13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ 14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/
/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ 15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ 16 | -e 's/«/ /g' | tr 0-9 " " 17 | } 18 | 19 | mkdir word2vec 20 | cd word2vec 21 | 22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz 24 | gzip -d news.2012.en.shuffled.gz 25 | gzip -d news.2013.en.shuffled.gz 26 | normalize_text < news.2012.en.shuffled > data.txt 27 | normalize_text < news.2013.en.shuffled >> data.txt 28 | 29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz 30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz 31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do 32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt 33 | done 34 | 35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus 36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt 37 | for i in `ls webbase_all`; do 38 | normalize_text < webbase_all/$i >> data.txt 39 | done 40 | 41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e ' 43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase 44 | # letters (a-z, converted from A-Z), and spaces (never consecutive)... 45 | # All other characters are converted to spaces. Only text which normally appears. 46 | # in the web browser is displayed. Tables are removed. Image captions are. 47 | # preserved. Links are converted to normal text. Digits are spelled out. 48 | # *** Modified to not spell digits or throw away non-ASCII characters *** 49 | 50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain. 51 | 52 | $/=">"; # input record separator 53 | while (<>) { 54 | if (/ ... 55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT 56 | if ($text) { 57 | 58 | # Remove any text not normally visible 59 | if (/<\/text>/) {$text=0;} 60 | s/<.*>//; # remove xml tags 61 | s/&/&/g; # decode URL encoded chars 62 | s/<//g; 64 | s///g; # remove references ... 
65 | s/<[^>]*>//g; # remove xhtml tags 66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text 67 | s/\|thumb//ig; # remove images links, preserve caption 68 | s/\|left//ig; 69 | s/\|right//ig; 70 | s/\|\d+px//ig; 71 | s/\[\[image:[^\[\]]*\|//ig; 72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup 73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages 74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text 75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables} 76 | s/{[^}]*}//g; 77 | s/\[//g; # remove [ and ] 78 | s/\]//g; 79 | s/&[^;]*;/ /g; # remove URL encoded chars 80 | 81 | $_=" $_ "; 82 | chop; 83 | print $_; 84 | } 85 | } 86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt 87 | 88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c 89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c 90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c 91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt 92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt 93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops 94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops 95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops 96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2 97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2 98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10 99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions 100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage 101 | -------------------------------------------------------------------------------- /word2vec/demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | #if [ ! -e text8 ]; then 3 | # wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | # gzip -d text8.gz -f 5 | #fi 6 | # time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /word2vec/demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | #if [ ! -e text8 ]; then 3 | # wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | # gzip -d text8.gz -f 5 | #fi 6 | time ./word2vec -train result9 -save-vocab dictc.txt -matrix matrix.txt -output vectors.bin -saveW savedW.txt -saveC savedC.txt -nsc savednsc.txt -cbow 0 -size 200 -window 2 -negative 4 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 -min-count 1000 7 | #./distance vectors.bin 8 | -------------------------------------------------------------------------------- /word2vec/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./distance \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 51 | return -1; 52 | } 53 | for (b = 0; b < words; b++) { 54 | a = 0; 55 | while (1) { 56 | vocab[b * max_w + a] = fgetc(f); 57 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 58 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 59 | } 60 | vocab[b * max_w + a] = 0; 61 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 62 | len = 0; 63 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 64 | len = sqrt(len); 65 | for (a = 0; a < size; a++) M[a + b * size] /= len; 66 | } 67 | fclose(f); 68 | while (1) { 69 | for (a = 0; a < N; a++) bestd[a] = 0; 70 | for (a = 0; a < N; a++) bestw[a][0] = 0; 71 | printf("Enter word or sentence (EXIT to break): "); 72 | a = 0; 73 | while (1) { 74 | st1[a] = fgetc(stdin); 75 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 76 | st1[a] = 0; 77 | break; 78 | } 79 | a++; 80 | } 81 | if (!strcmp(st1, "EXIT")) break; 82 | cn = 0; 83 | b = 0; 84 | c = 0; 85 | while (1) { 86 | st[cn][b] = st1[c]; 87 | b++; 88 | c++; 89 | st[cn][b] = 0; 90 | if (st1[c] == 0) break; 91 | if (st1[c] == ' ') { 92 | cn++; 93 | b = 0; 94 | c++; 95 | } 96 | } 97 | cn++; 98 | for (a = 0; a < cn; a++) { 99 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 100 | if (b == words) b = -1; 101 | bi[a] = b; 102 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 103 | if (b == -1) { 104 | printf("Out of dictionary word!\n"); 105 | break; 106 | } 107 | } 108 | if (b == -1) continue; 109 | printf("\n Word Cosine distance\n------------------------------------------------------------------------\n"); 110 | for (a = 0; a < size; a++) 
vec[a] = 0; 111 | for (b = 0; b < cn; b++) { 112 | if (bi[b] == -1) continue; 113 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 114 | } 115 | len = 0; 116 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 117 | len = sqrt(len); 118 | for (a = 0; a < size; a++) vec[a] /= len; 119 | for (a = 0; a < N; a++) bestd[a] = -1; 120 | for (a = 0; a < N; a++) bestw[a][0] = 0; 121 | for (c = 0; c < words; c++) { 122 | a = 0; 123 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 124 | if (a == 1) continue; 125 | dist = 0; 126 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 127 | for (a = 0; a < N; a++) { 128 | if (dist > bestd[a]) { 129 | for (d = N - 1; d > a; d--) { 130 | bestd[d] = bestd[d - 1]; 131 | strcpy(bestw[d], bestw[d - 1]); 132 | } 133 | bestd[a] = dist; 134 | strcpy(bestw[a], &vocab[c * max_w]); 135 | break; 136 | } 137 | } 138 | } 139 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 140 | } 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /word2vec/makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -O2 instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -pthread -O3 -Wall -funroll-loops 4 | 5 | all: word2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) word2vec.c -o word2vec $(CFLAGS) 9 | clean: 10 | rm -rf word2vec 11 | -------------------------------------------------------------------------------- /word2vec/savednsc.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 337238 3 | 159345 4 | 154827 5 | 143372 6 | 94488 7 | 88265 8 | 84534 9 | 78215 10 | 69514 11 | 67394 12 | 67226 13 | 65188 14 | 63840 15 | 60845 16 | 60529 17 | 56409 18 | 54536 19 | 53958 20 | 47308 21 | 46183 22 | 45404 23 | 42464 24 | 42094 25 | 42174 26 | 41036 27 | 39039 28 | 38281 29 | 37920 30 | 37113 31 | 36373 32 | 36085 33 | 35807 34 | 35556 35 | 35170 36 | 34854 37 | 34996 38 | 34517 39 | 34328 40 | 34571 41 | 33886 42 | 33967 43 | 33459 44 | 33594 45 | 33736 46 | 33606 47 | 33171 48 | 32915 49 | 32323 50 | 32533 51 | 31878 52 | 31675 53 | 31588 54 | 30910 55 | 30312 56 | 29934 57 | 29721 58 | 29323 59 | 28619 60 | 28107 61 | 27535 62 | 27304 63 | 27176 64 | 26921 65 | 26650 66 | 26334 67 | 26291 68 | 26122 69 | 26104 70 | 26091 71 | 25908 72 | 26032 73 | 25806 74 | 25800 75 | 25272 76 | 25070 77 | 24455 78 | 24336 79 | 24225 80 | 24242 81 | 23931 82 | 24124 83 | 24153 84 | 24169 85 | 23834 86 | 23818 87 | 23806 88 | 23675 89 | 23717 90 | 23231 91 | 23836 92 | 23330 93 | 22787 94 | 23016 95 | 22602 96 | 22697 97 | 22774 98 | 22377 99 | 22546 100 | 22441 101 | 22300 102 | 21986 103 | 21773 104 | 21860 105 | 21616 106 | 21847 107 | 21456 108 | 21854 109 | 21760 110 | 21342 111 | 21346 112 | 21131 113 | 21104 114 | 21021 115 | 20846 116 | 20869 117 | 20880 118 | 20827 119 | 20379 120 | 20636 121 | 20507 122 | 20004 123 | 20386 124 | 20204 125 | 20336 126 | 20208 127 | 19763 128 | 20109 129 | 19799 130 | 19718 131 | 19832 132 | 19441 133 | 19423 134 | 19341 135 | 19324 136 | 19284 137 | 19149 138 | 19062 139 | 19050 140 | 18825 141 | 18704 142 | 18716 143 | 18625 144 | 18460 145 | 18424 146 | 18687 147 | 18473 148 | 18358 149 | 18259 150 | 18335 151 | 18083 152 | 18136 153 | 18250 154 | 18012 155 | 17874 156 | 17795 157 | 17575 158 | 17629 159 | 17829 160 | 17774 161 | 17670 162 | 17219 163 | 17785 
164 | 17397 165 | 17392 166 | 17408 167 | 17307 168 | 17287 169 | 17258 170 | 17090 171 | 17294 172 | 16967 173 | 16982 174 | 16940 175 | 16757 176 | 16925 177 | 16986 178 | 17072 179 | 16956 180 | 16916 181 | 16800 182 | 16742 183 | 16476 184 | 16659 185 | 16824 186 | 16640 187 | 16590 188 | 16589 189 | 16554 190 | 16377 191 | 16767 192 | 16665 193 | 16734 194 | 16300 195 | 16558 196 | 16606 197 | 16258 198 | 16452 199 | 16449 200 | 16236 201 | 16483 202 | 16301 203 | 16221 204 | 16397 205 | 16296 206 | 16088 207 | 15927 208 | 16259 209 | 15828 210 | 16093 211 | 15900 212 | 15995 213 | 15869 214 | 15881 215 | 15881 216 | 15749 217 | 15924 218 | 15846 219 | 15936 220 | 15671 221 | 15658 222 | 15905 223 | 15735 224 | 15711 225 | 15609 226 | 15470 227 | 15488 228 | 15529 229 | 15416 230 | 15373 231 | 15314 232 | 15323 233 | 15342 234 | 15243 235 | 15468 236 | 15202 237 | 15284 238 | 15327 239 | 15181 240 | 15336 241 | 15116 242 | 15064 243 | 15356 244 | 15006 245 | 15321 246 | 15134 247 | 15332 248 | 15173 249 | 14796 250 | 14839 251 | 15317 252 | 15087 253 | 14992 254 | 14943 255 | 14934 256 | 14923 257 | 14711 258 | 14640 259 | 14746 260 | 14738 261 | 14713 262 | 14702 263 | 14878 264 | 14781 265 | 14673 266 | 14883 267 | 14711 268 | 14817 269 | 14732 270 | 14984 271 | 14691 272 | 14883 273 | 14764 274 | 14498 275 | 14601 276 | 14868 277 | 14560 278 | 14507 279 | 14807 280 | 14493 281 | 14433 282 | 14395 283 | 14489 284 | 14317 285 | 14702 286 | 14440 287 | 14416 288 | 14565 289 | 14426 290 | 14425 291 | 14570 292 | 14266 293 | 14505 294 | 14336 295 | 14072 296 | 14067 297 | 14176 298 | 14037 299 | 13914 300 | 14082 301 | 14056 302 | 13852 303 | 14011 304 | 14049 305 | 13874 306 | 14045 307 | 13921 308 | 13930 309 | 13948 310 | 13803 311 | 13794 312 | 14069 313 | 14094 314 | 13908 315 | 13802 316 | 13709 317 | 13695 318 | 13843 319 | 13534 320 | 13798 321 | 13564 322 | 13783 323 | 13420 324 | 13828 325 | 13558 326 | 13343 327 | 13524 328 | 13711 329 | 13514 330 | 13497 331 | 13360 332 | 13397 333 | 13567 334 | 13257 335 | 13398 336 | 13485 337 | 13262 338 | 13487 339 | 13490 340 | 13366 341 | 13336 342 | 13321 343 | 13512 344 | 13449 345 | 13442 346 | 13183 347 | 13312 348 | 13217 349 | 13133 350 | 13190 351 | 13122 352 | 13079 353 | 13116 354 | 13214 355 | 13185 356 | 13118 357 | 13185 358 | 13196 359 | 13200 360 | 13171 361 | 12817 362 | 12742 363 | 13137 364 | 12867 365 | 13116 366 | 12883 367 | 12926 368 | 13026 369 | 12966 370 | 12877 371 | 12870 372 | 12938 373 | 12697 374 | 12937 375 | 12825 376 | 12996 377 | 12850 378 | 12849 379 | 12849 380 | 13043 381 | 12853 382 | 12799 383 | 12882 384 | 12549 385 | 12709 386 | 12854 387 | 12548 388 | 12745 389 | 12651 390 | 12759 391 | 12638 392 | 12629 393 | 12561 394 | 12605 395 | 12643 396 | 12711 397 | 12480 398 | 12689 399 | 12541 400 | 12515 401 | 12540 402 | 12536 403 | 12714 404 | 12494 405 | 12620 406 | 12528 407 | 12497 408 | 12511 409 | 12403 410 | 12288 411 | 12206 412 | 12496 413 | 12466 414 | 12433 415 | 12397 416 | 12112 417 | 12444 418 | 12099 419 | 12194 420 | 12360 421 | 12084 422 | 12001 423 | 12200 424 | 12230 425 | 12271 426 | 12294 427 | 12071 428 | 12370 429 | 12244 430 | 12072 431 | 12121 432 | 12199 433 | 12293 434 | 12141 435 | 12217 436 | 12067 437 | 12056 438 | 12252 439 | 12168 440 | 12171 441 | 12004 442 | 12116 443 | 12161 444 | 12050 445 | 11938 446 | 12184 447 | 12001 448 | 12103 449 | 11945 450 | 12041 451 | 11956 452 | 12151 453 | 11948 454 | 12152 455 | 12084 456 | 11950 457 | 11843 458 | 11810 459 | 11611 
460 | 11840 461 | 11807 462 | 12008 463 | 11955 464 | 11853 465 | 11767 466 | 11746 467 | 11845 468 | 12003 469 | 11653 470 | 11687 471 | 11763 472 | 11610 473 | 11878 474 | 11777 475 | 11752 476 | 11528 477 | 11670 478 | 11535 479 | 11511 480 | 11745 481 | 11399 482 | 11590 483 | 11557 484 | 11452 485 | 11735 486 | 11445 487 | 11503 488 | 11734 489 | 11523 490 | 11773 491 | 11760 492 | 11398 493 | 11737 494 | 11643 495 | 11533 496 | 11462 497 | 11621 498 | 11394 499 | 11391 500 | 11547 501 | 11339 502 | 11528 503 | 11474 504 | 11328 505 | 11363 506 | 11295 507 | 11506 508 | 11432 509 | 11166 510 | 11306 511 | 11572 512 | 11428 513 | 11240 514 | 11414 515 | 11427 516 | 11261 517 | 11207 518 | 11315 519 | 11042 520 | 11341 521 | 11166 522 | 11222 523 | 11208 524 | 11279 525 | 11316 526 | 11201 527 | 11200 528 | 11130 529 | 11405 530 | 11373 531 | 11211 532 | 11238 533 | 11163 534 | 11140 535 | 11028 536 | 11256 537 | 10916 538 | 11172 539 | 11146 540 | 11311 541 | 11113 542 | 11132 543 | 11091 544 | 11222 545 | 10934 546 | 10891 547 | 10972 548 | 10960 549 | 11018 550 | 11135 551 | 10934 552 | 11092 553 | 10901 554 | 11006 555 | 11323 556 | 11195 557 | 10928 558 | 10911 559 | 10979 560 | 10842 561 | 11063 562 | 10667 563 | 10840 564 | 10984 565 | 10907 566 | 10891 567 | 10880 568 | 10786 569 | 11229 570 | 10750 571 | 10865 572 | 10870 573 | 11030 574 | 10944 575 | 10898 576 | 10938 577 | 11007 578 | 10867 579 | 10875 580 | 10945 581 | 11087 582 | 10669 583 | 10963 584 | 10679 585 | 10746 586 | 10822 587 | 10848 588 | 10775 589 | 10790 590 | 10834 591 | 10871 592 | 10882 593 | 10869 594 | 10934 595 | 10615 596 | 10549 597 | 10862 598 | 10874 599 | 10586 600 | 10616 601 | 10894 602 | 10772 603 | 10815 604 | 10740 605 | 10584 606 | 10699 607 | 10772 608 | 10428 609 | 10760 610 | 10812 611 | 10661 612 | 10604 613 | 10525 614 | 10453 615 | 10713 616 | 10685 617 | 10438 618 | 10636 619 | 10476 620 | 10601 621 | 10672 622 | 10531 623 | 10669 624 | 10514 625 | 10695 626 | 10445 627 | 10390 628 | 10372 629 | 10707 630 | 10396 631 | 10523 632 | 10427 633 | 10399 634 | 10410 635 | 10443 636 | 10275 637 | 10532 638 | 10448 639 | 10278 640 | 10627 641 | 10299 642 | 10455 643 | 10427 644 | 10472 645 | 10551 646 | 10363 647 | 10486 648 | 10466 649 | 10465 650 | 10327 651 | 10300 652 | 10248 653 | 10572 654 | 10263 655 | 10392 656 | 10382 657 | 10321 658 | 10362 659 | 10345 660 | 10474 661 | 10352 662 | 10251 663 | 10338 664 | 10152 665 | 10072 666 | 10285 667 | 10068 668 | 10132 669 | 10235 670 | 10244 671 | 9885 672 | 10145 673 | 9947 674 | 10411 675 | 10206 676 | 10256 677 | 10080 678 | 10100 679 | 10075 680 | 10273 681 | 10184 682 | 10136 683 | 10112 684 | 10154 685 | 9967 686 | 9988 687 | 10036 688 | 9987 689 | 10157 690 | 10210 691 | 10036 692 | 10006 693 | 10069 694 | 10030 695 | 9963 696 | 9964 697 | 9959 698 | 9977 699 | 9792 700 | 9775 701 | 9935 702 | 9981 703 | 9952 704 | 9808 705 | 9905 706 | 9846 707 | 10009 708 | 10058 709 | 10048 710 | 9870 711 | 10068 712 | 9939 713 | 9882 714 | 9830 715 | 9875 716 | 9995 717 | 9873 718 | 9831 719 | 9879 720 | 10029 721 | 9929 722 | 9893 723 | 9923 724 | 9922 725 | 9716 726 | 9971 727 | 9809 728 | 9684 729 | 9889 730 | 9963 731 | 9715 732 | 9746 733 | 9798 734 | 10022 735 | 9863 736 | 9738 737 | 9808 738 | 9862 739 | 9777 740 | 9873 741 | 9962 742 | 9918 743 | 9709 744 | 9762 745 | 9612 746 | 9850 747 | 9932 748 | 9798 749 | 9616 750 | 9622 751 | 9896 752 | 9887 753 | 9693 754 | 9801 755 | 9684 756 | 9844 757 | 9761 758 | 9565 759 | 9787 760 | 9621 761 | 
9655 762 | 9621 763 | 9797 764 | 9788 765 | 9699 766 | 9457 767 | 9736 768 | 9697 769 | 9664 770 | 9716 771 | 9649 772 | 9662 773 | 9833 774 | 9470 775 | 9570 776 | 9651 777 | 9527 778 | 9644 779 | 9569 780 | 9369 781 | 9691 782 | 9586 783 | 9785 784 | 9723 785 | 9534 786 | 9906 787 | 9536 788 | 9683 789 | 9583 790 | 9621 791 | 9600 792 | 9476 793 | 9574 794 | 9394 795 | 9694 796 | 9683 797 | 9360 798 | 9661 799 | 9487 800 | 9473 801 | 9520 802 | 9399 803 | 9341 804 | 9447 805 | 9325 806 | 9407 807 | 9419 808 | 9368 809 | 9424 810 | 9441 811 | 9510 812 | 9394 813 | 9252 814 | 9232 815 | 9420 816 | 9163 817 | 9215 818 | 9500 819 | 9127 820 | 9329 821 | 9274 822 | 9254 823 | 9366 824 | 9077 825 | 9298 826 | 9356 827 | 9096 828 | 9293 829 | 9392 830 | 9053 831 | 9231 832 | 9166 833 | 9206 834 | 9330 835 | 9153 836 | 9420 837 | 9368 838 | 9178 839 | 9251 840 | 9132 841 | 9194 842 | 9186 843 | 9321 844 | 9316 845 | 9214 846 | 9248 847 | 9187 848 | 9093 849 | 9056 850 | 9104 851 | 9250 852 | 9179 853 | 9152 854 | 9311 855 | 9216 856 | 9093 857 | 9078 858 | 8988 859 | 9130 860 | 9280 861 | 9256 862 | 9060 863 | 8894 864 | 9044 865 | 9010 866 | 9164 867 | 9164 868 | 9147 869 | 9242 870 | 9328 871 | 9084 872 | 9027 873 | 9008 874 | 8922 875 | 9021 876 | 9127 877 | 8955 878 | 9041 879 | 9185 880 | 9155 881 | 9013 882 | 8930 883 | 9153 884 | 9093 885 | 9091 886 | 9080 887 | 9053 888 | 9235 889 | 9162 890 | 9058 891 | 9194 892 | 9044 893 | 9014 894 | 9160 895 | 9056 896 | 9162 897 | 9276 898 | 9184 899 | 8953 900 | 8923 901 | 9143 902 | 9175 903 | 9044 904 | 9013 905 | 8964 906 | 9039 907 | 9115 908 | 9193 909 | 9097 910 | 9033 911 | 9109 912 | 9216 913 | 9109 914 | 9094 915 | 8984 916 | 9084 917 | 9034 918 | 9093 919 | 8888 920 | 8980 921 | 8883 922 | 9003 923 | 9119 924 | 8965 925 | 8897 926 | 9243 927 | 9062 928 | 9186 929 | 9134 930 | 9104 931 | 9027 932 | 9114 933 | 9054 934 | 9150 935 | 9215 936 | 9229 937 | 9056 938 | 8965 939 | 9086 940 | 9195 941 | 9077 942 | 9017 943 | 9209 944 | 9208 945 | 9023 946 | 9116 947 | 9017 948 | 8965 949 | 8950 950 | 8991 951 | 9063 952 | 9162 953 | 9180 954 | 9033 955 | 9089 956 | 9180 957 | 9076 958 | 9099 959 | 9042 960 | 9009 961 | 9027 962 | 9173 963 | 9159 964 | 8958 965 | 8965 966 | 9030 967 | 9169 968 | 9142 969 | 9050 970 | 9102 971 | 9044 972 | 9105 973 | 8955 974 | 9047 975 | 9249 976 | 9054 977 | 9000 978 | 9259 979 | 9101 980 | 8943 981 | 9116 982 | 9175 983 | 9010 984 | 9078 985 | 8920 986 | 9104 987 | 9053 988 | 9117 989 | 9087 990 | 9194 991 | 9130 992 | 9065 993 | 9058 994 | 9222 995 | 9024 996 | 9085 997 | 9004 998 | 9019 999 | 9168 1000 | 8837 1001 | 8953 1002 | 9095 1003 | 9073 1004 | 8947 1005 | 9027 1006 | 9122 1007 | 8773 1008 | 8882 1009 | 8908 1010 | 9109 1011 | 8936 1012 | 8930 1013 | 8938 1014 | 8806 1015 | 9156 1016 | 8987 1017 | 8881 1018 | 8895 1019 | 8976 1020 | 8924 1021 | 8959 1022 | 8848 1023 | 8903 1024 | 8766 1025 | 8861 1026 | 8863 1027 | 8999 1028 | 8825 1029 | 8940 1030 | 9133 1031 | 8842 1032 | 8945 1033 | 9044 1034 | 8750 1035 | 8949 1036 | 9019 1037 | 8978 1038 | 8918 1039 | 8932 1040 | 8847 1041 | 8808 1042 | 8880 1043 | 8739 1044 | 8949 1045 | 8879 1046 | 8689 1047 | 8718 1048 | 8891 1049 | 8855 1050 | 8779 1051 | 8727 1052 | 8795 1053 | 8702 1054 | 8937 1055 | 8785 1056 | 8690 1057 | 8933 1058 | 8802 1059 | 8810 1060 | 8868 1061 | 8900 1062 | 8898 1063 | 8817 1064 | 8911 1065 | 8624 1066 | 8901 1067 | 8789 1068 | 8656 1069 | 8683 1070 | 8745 1071 | 8740 1072 | 8858 1073 | 8653 1074 | 8720 1075 | 8666 1076 | 8648 1077 | 
8660 1078 | 8739 1079 | 8760 1080 | 8657 1081 | 8707 1082 | 8821 1083 | 9064 1084 | 8645 1085 | 8870 1086 | 8778 1087 | 8736 1088 | 8518 1089 | 8648 1090 | 8767 1091 | 8601 1092 | 8667 1093 | 8672 1094 | 8687 1095 | 8673 1096 | 8744 1097 | 8780 1098 | 8723 1099 | 8710 1100 | 8664 1101 | 8659 1102 | 8678 1103 | 8450 1104 | 8550 1105 | 8575 1106 | 8619 1107 | 8591 1108 | 8735 1109 | 8763 1110 | 8610 1111 | 8590 1112 | 8587 1113 | 8593 1114 | 8855 1115 | 8498 1116 | 8591 1117 | 8654 1118 | 8512 1119 | 8565 1120 | 8512 1121 | 8663 1122 | 8608 1123 | 8394 1124 | 8817 1125 | 8602 1126 | 8728 1127 | 8816 1128 | 8576 1129 | 8634 1130 | 8483 1131 | 8423 1132 | 8502 1133 | 8448 1134 | 8498 1135 | 8505 1136 | 8521 1137 | 8609 1138 | 8559 1139 | 8568 1140 | 8621 1141 | 8648 1142 | 8440 1143 | 8556 1144 | 8535 1145 | 8543 1146 | 8505 1147 | 8501 1148 | 8452 1149 | 8149 1150 | 8585 1151 | 8325 1152 | 8552 1153 | 8627 1154 | 8447 1155 | 8369 1156 | 8513 1157 | 8390 1158 | 8372 1159 | 8504 1160 | 8692 1161 | 8434 1162 | 8598 1163 | 8400 1164 | 8329 1165 | 8506 1166 | 8386 1167 | 8197 1168 | 8229 1169 | 8329 1170 | 8278 1171 | 8278 1172 | 8512 1173 | 8255 1174 | 8518 1175 | 8339 1176 | 8341 1177 | 8535 1178 | 8515 1179 | 8377 1180 | 8250 1181 | 8310 1182 | 8275 1183 | 8418 1184 | 8263 1185 | 8360 1186 | 8442 1187 | 8222 1188 | 8390 1189 | 8224 1190 | 8403 1191 | 8276 1192 | 8340 1193 | 8296 1194 | 8348 1195 | 8287 1196 | 8266 1197 | 8291 1198 | 8294 1199 | 8424 1200 | 8209 1201 | 8196 1202 | 8096 1203 | 8217 1204 | 8291 1205 | 8241 1206 | 8231 1207 | 8332 1208 | 8240 1209 | 8192 1210 | 8155 1211 | 8358 1212 | 8264 1213 | 8324 1214 | 8070 1215 | 8174 1216 | 8109 1217 | 8199 1218 | 8043 1219 | 8010 1220 | 8262 1221 | 8157 1222 | 7986 1223 | 8195 1224 | 8100 1225 | 8097 1226 | 8329 1227 | 8000 1228 | 8030 1229 | 8196 1230 | 8188 1231 | 8203 1232 | 8169 1233 | 8014 1234 | 8119 1235 | 8259 1236 | 8157 1237 | 8023 1238 | 8063 1239 | 7944 1240 | 8024 1241 | 8122 1242 | 7975 1243 | 8019 1244 | 7885 1245 | 8036 1246 | 7880 1247 | 8166 1248 | 7908 1249 | 7858 1250 | 7984 1251 | 7923 1252 | 8021 1253 | 7948 1254 | 8076 1255 | 7900 1256 | 7984 1257 | 7805 1258 | 7753 1259 | 7998 1260 | 8035 1261 | 7769 1262 | 7942 1263 | 7832 1264 | 8065 1265 | 7811 1266 | 7967 1267 | 7946 1268 | 7950 1269 | 7766 1270 | 7747 1271 | 7788 1272 | 7763 1273 | 7893 1274 | 7700 1275 | 7806 1276 | 7808 1277 | 7747 1278 | 7899 1279 | 7787 1280 | 7705 1281 | 7765 1282 | 7863 1283 | 7883 1284 | 7894 1285 | 7794 1286 | 7860 1287 | 7725 1288 | 7685 1289 | 7735 1290 | 7561 1291 | 7913 1292 | 7693 1293 | 7709 1294 | 7702 1295 | 7906 1296 | 7676 1297 | 7538 1298 | 7688 1299 | 7807 1300 | 7783 1301 | 7690 1302 | 7505 1303 | 7539 1304 | 7708 1305 | 7689 1306 | 7679 1307 | 7563 1308 | 7490 1309 | 7605 1310 | 7502 1311 | 7610 1312 | 7688 1313 | 7691 1314 | 7617 1315 | 7485 1316 | 7686 1317 | 7454 1318 | 7601 1319 | 7567 1320 | 7532 1321 | 7733 1322 | 7632 1323 | 7599 1324 | 7578 1325 | 7445 1326 | 7546 1327 | 7459 1328 | 7596 1329 | 7526 1330 | 7433 1331 | 7525 1332 | 7374 1333 | 7559 1334 | 7653 1335 | 7660 1336 | 7435 1337 | 7493 1338 | 7346 1339 | 7462 1340 | 7346 1341 | 7471 1342 | 7474 1343 | 7575 1344 | 7331 1345 | 7563 1346 | 7558 1347 | 7500 1348 | 7447 1349 | 7469 1350 | 7380 1351 | 7331 1352 | 7273 1353 | 7504 1354 | 7388 1355 | 7312 1356 | 7596 1357 | 7275 1358 | 7385 1359 | 7359 1360 | 7362 1361 | 7399 1362 | 7484 1363 | 7380 1364 | 7288 1365 | 7449 1366 | 7371 1367 | 7423 1368 | 7374 1369 | 7304 1370 | 7352 1371 | 7347 1372 | 7381 1373 | 
7356 1374 | 7473 1375 | 7432 1376 | 7416 1377 | 7231 1378 | 7288 1379 | 7240 1380 | 7434 1381 | 7376 1382 | 7323 1383 | 7348 1384 | 7250 1385 | 7366 1386 | 7135 1387 | 7149 1388 | 7401 1389 | 7405 1390 | 7090 1391 | 7149 1392 | 7306 1393 | 7253 1394 | 7355 1395 | 7298 1396 | 7158 1397 | 7250 1398 | 7199 1399 | 7272 1400 | 7189 1401 | 7277 1402 | 7348 1403 | 7142 1404 | 7092 1405 | 7152 1406 | 7253 1407 | 7313 1408 | 7209 1409 | 7143 1410 | 7172 1411 | 7051 1412 | 7174 1413 | 7219 1414 | 7271 1415 | 7177 1416 | 7266 1417 | 7146 1418 | 7150 1419 | 7189 1420 | 7274 1421 | 7136 1422 | 7198 1423 | 7034 1424 | 7040 1425 | 7031 1426 | 7112 1427 | 7124 1428 | 7009 1429 | 7126 1430 | 7123 1431 | 7106 1432 | 7160 1433 | 7083 1434 | 7059 1435 | 7265 1436 | 7200 1437 | 7023 1438 | 6994 1439 | 7010 1440 | 6955 1441 | 7107 1442 | 7020 1443 | 7080 1444 | 7022 1445 | 7087 1446 | 7119 1447 | 6994 1448 | 7024 1449 | 7024 1450 | 7137 1451 | 6983 1452 | 6975 1453 | 7095 1454 | 6945 1455 | 6984 1456 | 7027 1457 | 7128 1458 | 7080 1459 | 6976 1460 | 6921 1461 | 6900 1462 | 7177 1463 | 6942 1464 | 7059 1465 | 7040 1466 | 7156 1467 | 6864 1468 | 6958 1469 | 7089 1470 | 6997 1471 | 6924 1472 | 6987 1473 | 6910 1474 | 7001 1475 | 6877 1476 | 7018 1477 | 6865 1478 | 6997 1479 | 6750 1480 | 7043 1481 | 6896 1482 | 6873 1483 | 7040 1484 | 6946 1485 | 6959 1486 | 6861 1487 | 6905 1488 | 6931 1489 | 6779 1490 | 6965 1491 | 6957 1492 | 6937 1493 | 6885 1494 | 6993 1495 | 6985 1496 | 7038 1497 | 6908 1498 | 6760 1499 | 6788 1500 | 6978 1501 | 6908 1502 | 6684 1503 | 6689 1504 | 6974 1505 | 6911 1506 | 6761 1507 | 6912 1508 | 6927 1509 | 6864 1510 | 6911 1511 | 6818 1512 | 6642 1513 | 7045 1514 | 6797 1515 | 6847 1516 | 6877 1517 | 6842 1518 | 6921 1519 | 6683 1520 | 6724 1521 | 6773 1522 | 6776 1523 | 6762 1524 | 7028 1525 | 6915 1526 | 6805 1527 | 6829 1528 | 6844 1529 | 6847 1530 | 6609 1531 | 6768 1532 | 6715 1533 | 6769 1534 | 6760 1535 | 6914 1536 | 6903 1537 | 6846 1538 | 6909 1539 | 6813 1540 | 6709 1541 | 6825 1542 | 6832 1543 | 6719 1544 | 6787 1545 | 6759 1546 | 6691 1547 | 6873 1548 | 6730 1549 | 6738 1550 | 6819 1551 | 6748 1552 | 6627 1553 | 6796 1554 | 6800 1555 | 6861 1556 | 6652 1557 | 6691 1558 | 6718 1559 | 6710 1560 | 6705 1561 | 6692 1562 | 6722 1563 | 6645 1564 | 6597 1565 | 6702 1566 | 6605 1567 | 6613 1568 | 6766 1569 | 6821 1570 | 6660 1571 | 6676 1572 | 6886 1573 | 6840 1574 | 6730 1575 | 6761 1576 | 6593 1577 | 6568 1578 | 6748 1579 | 6706 1580 | 6720 1581 | 6726 1582 | 6600 1583 | 6773 1584 | 6501 1585 | 6591 1586 | 6715 1587 | 6686 1588 | 6657 1589 | 6714 1590 | 6680 1591 | 6600 1592 | 6726 1593 | 6494 1594 | 6463 1595 | 6664 1596 | 6475 1597 | 6675 1598 | 6392 1599 | 6665 1600 | 6636 1601 | 6629 1602 | 6538 1603 | 6579 1604 | 6511 1605 | 6646 1606 | 6633 1607 | 6573 1608 | 6541 1609 | 6616 1610 | 6620 1611 | 6718 1612 | 6548 1613 | 6559 1614 | 6655 1615 | 6556 1616 | 6678 1617 | 6559 1618 | 6529 1619 | 6586 1620 | 6672 1621 | 6526 1622 | 6603 1623 | 6655 1624 | 6666 1625 | 6514 1626 | 6522 1627 | 6628 1628 | 6491 1629 | 6542 1630 | 6537 1631 | 6518 1632 | 6564 1633 | 6651 1634 | 6608 1635 | 6411 1636 | 6484 1637 | 6548 1638 | 6523 1639 | 6539 1640 | 6542 1641 | 6514 1642 | 6617 1643 | 6511 1644 | 6551 1645 | 6591 1646 | 6473 1647 | 6528 1648 | 6583 1649 | 6599 1650 | 6437 1651 | 6526 1652 | 6647 1653 | 6376 1654 | 6408 1655 | 6598 1656 | 6373 1657 | 6583 1658 | 6591 1659 | 6531 1660 | 6579 1661 | 6273 1662 | 6480 1663 | 6557 1664 | 6412 1665 | 6569 1666 | 6492 1667 | 6439 1668 | 6480 1669 | 
6676 1670 | 6501 1671 | 6428 1672 | 6473 1673 | 6472 1674 | 6481 1675 | 6474 1676 | 6537 1677 | 6497 1678 | 6480 1679 | 6371 1680 | 6468 1681 | 6468 1682 | 6431 1683 | 6623 1684 | 6383 1685 | 6367 1686 | 6490 1687 | 6218 1688 | 6304 1689 | 6247 1690 | 6401 1691 | 6425 1692 | 6337 1693 | 6373 1694 | 6453 1695 | 6485 1696 | 6390 1697 | 6412 1698 | 6345 1699 | 6489 1700 | 6262 1701 | 6328 1702 | 6277 1703 | 6378 1704 | 6266 1705 | 6398 1706 | 6248 1707 | 6436 1708 | 6198 1709 | 6280 1710 | 6266 1711 | 6362 1712 | 6349 1713 | 6258 1714 | 6212 1715 | 6405 1716 | 6347 1717 | 6273 1718 | 6405 1719 | 6254 1720 | 6296 1721 | 6372 1722 | 6329 1723 | 6269 1724 | 6299 1725 | 6195 1726 | 6301 1727 | 6212 1728 | 6295 1729 | 6270 1730 | 6312 1731 | 6455 1732 | 6381 1733 | 6260 1734 | 6208 1735 | 6193 1736 | 6264 1737 | 6242 1738 | 6239 1739 | 6263 1740 | 6363 1741 | 6428 1742 | 6163 1743 | 6207 1744 | 6277 1745 | 6312 1746 | 6269 1747 | 6301 1748 | 6424 1749 | 6295 1750 | 6253 1751 | 6214 1752 | 6072 1753 | 6189 1754 | 6132 1755 | 6300 1756 | 6381 1757 | 6199 1758 | 6176 1759 | 6224 1760 | 6162 1761 | 6317 1762 | 6151 1763 | 6166 1764 | 6133 1765 | 6351 1766 | 6293 1767 | 6232 1768 | 6186 1769 | 6063 1770 | 6227 1771 | 6225 1772 | 6140 1773 | 6073 1774 | 6176 1775 | 6175 1776 | 6134 1777 | 6224 1778 | 6199 1779 | 6198 1780 | 5998 1781 | 6176 1782 | 6073 1783 | 6079 1784 | 6105 1785 | 6006 1786 | 6070 1787 | 6068 1788 | 6164 1789 | 5960 1790 | 6109 1791 | 6030 1792 | 6110 1793 | 6069 1794 | 6095 1795 | 6053 1796 | 6035 1797 | 6044 1798 | 6084 1799 | 5864 1800 | 6000 1801 | 6029 1802 | 6107 1803 | 5964 1804 | 5983 1805 | 6070 1806 | 6011 1807 | 6124 1808 | 6031 1809 | 5877 1810 | 5955 1811 | 6017 1812 | 5846 1813 | 6033 1814 | 5878 1815 | 5931 1816 | 5812 1817 | 5988 1818 | 5952 1819 | 5903 1820 | 5829 1821 | 5892 1822 | 5953 1823 | 5928 1824 | 5778 1825 | 5868 1826 | 5763 1827 | 5892 1828 | 5858 1829 | 5688 1830 | 5917 1831 | 5941 1832 | 5742 1833 | 5770 1834 | 5706 1835 | 5816 1836 | 5827 1837 | 5680 1838 | 5935 1839 | 5669 1840 | 5641 1841 | 5727 1842 | 5568 1843 | 5760 1844 | 5696 1845 | 5702 1846 | 5640 1847 | 5615 1848 | 5515 1849 | 5519 1850 | 5617 1851 | 5643 1852 | 5583 1853 | 5534 1854 | 5364 1855 | 5606 1856 | 5707 1857 | 5474 1858 | 5603 1859 | 5450 1860 | 5553 1861 | 5429 1862 | 5516 1863 | 5519 1864 | 5595 1865 | 5359 1866 | 5437 1867 | 5413 1868 | 5425 1869 | 5395 1870 | 5424 1871 | 5520 1872 | 5350 1873 | 5471 1874 | 5409 1875 | 5454 1876 | 5408 1877 | 5372 1878 | 5475 1879 | 5425 1880 | 5430 1881 | 5202 1882 | 5448 1883 | 5325 1884 | 5377 1885 | 5368 1886 | 5347 1887 | 5338 1888 | 5504 1889 | 5331 1890 | 5229 1891 | 5393 1892 | 5417 1893 | 5420 1894 | 5286 1895 | 5170 1896 | 5410 1897 | 5298 1898 | 5371 1899 | 5222 1900 | 5345 1901 | 5267 1902 | 5307 1903 | 5277 1904 | 5198 1905 | 5293 1906 | 5256 1907 | 5093 1908 | 5377 1909 | 5157 1910 | 5247 1911 | 5336 1912 | 5204 1913 | 5200 1914 | 5180 1915 | 5327 1916 | 5164 1917 | 5272 1918 | 5173 1919 | 5259 1920 | 5115 1921 | 5256 1922 | 5102 1923 | 5149 1924 | 5139 1925 | 5194 1926 | 5144 1927 | 5158 1928 | 5249 1929 | 5206 1930 | 5163 1931 | 5273 1932 | 5254 1933 | 5118 1934 | 5113 1935 | 5159 1936 | 5207 1937 | 5168 1938 | 5200 1939 | 5096 1940 | 5086 1941 | 5162 1942 | 5001 1943 | 5208 1944 | 4981 1945 | 5060 1946 | 5200 1947 | 5026 1948 | 5182 1949 | 5066 1950 | 5063 1951 | 5100 1952 | 5058 1953 | 5145 1954 | 4935 1955 | 5132 1956 | 5194 1957 | 4978 1958 | 5101 1959 | 4960 1960 | 5112 1961 | 5038 1962 | 5094 1963 | 4955 1964 | 5040 1965 | 
5005 1966 | 5061 1967 | 4970 1968 | 4933 1969 | 4973 1970 | 4958 1971 | 4964 1972 | 5111 1973 | 4877 1974 | 5061 1975 | 4875 1976 | 4890 1977 | 5047 1978 | 4995 1979 | 4972 1980 | 5032 1981 | 4932 1982 | 4738 1983 | 4770 1984 | 4988 1985 | 4946 1986 | 4901 1987 | 4929 1988 | 5030 1989 | 5122 1990 | 4901 1991 | 4833 1992 | 5004 1993 | 4867 1994 | 4858 1995 | 4916 1996 | 4959 1997 | 4983 1998 | 4964 1999 | 4862 2000 | 5069 2001 | 4983 2002 | 4923 2003 | 4813 2004 | 4891 2005 | 4862 2006 | 4860 2007 | 4923 2008 | 4980 2009 | 4982 2010 | 4938 2011 | 4942 2012 | 4997 2013 | 4830 2014 | 4856 2015 | 5004 2016 | 4892 2017 | 4798 2018 | 4847 2019 | 4933 2020 | 4823 2021 | 4831 2022 | 4826 2023 | 4969 2024 | 4949 2025 | 4879 2026 | 4881 2027 | 4847 2028 | 4713 2029 | 4865 2030 | 4975 2031 | 4789 2032 | 4912 2033 | 4796 2034 | 4814 2035 | 4875 2036 | 4752 2037 | 4867 2038 | 4759 2039 | 4902 2040 | 4691 2041 | 4750 2042 | 4874 2043 | 4801 2044 | 4729 2045 | 4835 2046 | 4877 2047 | 4798 2048 | 4784 2049 | 4759 2050 | 4755 2051 | 4946 2052 | 4841 2053 | 4695 2054 | 4875 2055 | 4790 2056 | 4839 2057 | 4877 2058 | 4931 2059 | 4824 2060 | 4851 2061 | 4817 2062 | 4704 2063 | 4789 2064 | 4771 2065 | 4911 2066 | 4811 2067 | 4704 2068 | 4813 2069 | 4812 2070 | 4828 2071 | 4780 2072 | 4838 2073 | 4781 2074 | 4790 2075 | 4702 2076 | 4802 2077 | 4851 2078 | 4732 2079 | 4674 2080 | 4698 2081 | 4825 2082 | 4836 2083 | 4605 2084 | 4744 2085 | 4680 2086 | 4736 2087 | 4664 2088 | 4732 2089 | 4785 2090 | 4760 2091 | 4659 2092 | 4656 2093 | 4690 2094 | 4747 2095 | 4588 2096 | 4652 2097 | 4790 2098 | 4644 2099 | 4762 2100 | 4761 2101 | 4747 2102 | 4759 2103 | 4727 2104 | 4798 2105 | 4757 2106 | 4688 2107 | 4684 2108 | 4802 2109 | 4754 2110 | 4884 2111 | 4816 2112 | 4674 2113 | 4707 2114 | 4668 2115 | 4579 2116 | 4714 2117 | 4679 2118 | 4705 2119 | 4579 2120 | 4653 2121 | 4637 2122 | 4593 2123 | 4632 2124 | 4667 2125 | 4738 2126 | 4646 2127 | 4545 2128 | 4688 2129 | 4660 2130 | 4735 2131 | 4696 2132 | 4638 2133 | 4693 2134 | 4632 2135 | 4632 2136 | 4624 2137 | 4557 2138 | 4581 2139 | 4717 2140 | 4744 2141 | 4578 2142 | 4635 2143 | 4655 2144 | 4682 2145 | 4678 2146 | 4529 2147 | 4621 2148 | 4699 2149 | 4568 2150 | 4543 2151 | 4605 2152 | 4581 2153 | 4696 2154 | 4596 2155 | 4578 2156 | 4558 2157 | 4512 2158 | 4537 2159 | 4699 2160 | 4551 2161 | 4536 2162 | 4483 2163 | 4489 2164 | 4581 2165 | 4680 2166 | 4687 2167 | 4571 2168 | 4629 2169 | 4580 2170 | 4583 2171 | 4526 2172 | 4457 2173 | 4684 2174 | 4572 2175 | 4544 2176 | 4495 2177 | 4403 2178 | 4591 2179 | 4486 2180 | 4528 2181 | 4533 2182 | 4575 2183 | 4450 2184 | 4615 2185 | 4451 2186 | 4454 2187 | 4594 2188 | 4443 2189 | 4561 2190 | 4519 2191 | 4533 2192 | 4362 2193 | 4481 2194 | 4606 2195 | 4423 2196 | 4496 2197 | 4423 2198 | 4472 2199 | 4480 2200 | 4485 2201 | 4390 2202 | 4430 2203 | 4504 2204 | 4399 2205 | 4423 2206 | 4524 2207 | 4537 2208 | 4504 2209 | 4464 2210 | 4473 2211 | 4512 2212 | 4412 2213 | 4329 2214 | 4533 2215 | 4471 2216 | 4410 2217 | 4490 2218 | 4386 2219 | 4416 2220 | 4368 2221 | 4345 2222 | 4513 2223 | 4401 2224 | 4422 2225 | 4482 2226 | 4360 2227 | 4460 2228 | 4403 2229 | 4418 2230 | 4369 2231 | 4480 2232 | 4231 2233 | 4281 2234 | 4429 2235 | 4417 2236 | 4337 2237 | 4392 2238 | 4368 2239 | 4379 2240 | 4453 2241 | 4443 2242 | 4338 2243 | 4326 2244 | 4416 2245 | 4198 2246 | 4323 2247 | 4213 2248 | 4407 2249 | 4411 2250 | 4307 2251 | 4367 2252 | 4442 2253 | 4387 2254 | 4333 2255 | 4351 2256 | 4272 2257 | 4284 2258 | 4377 2259 | 4314 2260 | 4289 2261 | 
4239 2262 | 4258 2263 | 4251 2264 | 4318 2265 | 4504 2266 | 4295 2267 | 4293 2268 | 4207 2269 | 4262 2270 | 4228 2271 | 4331 2272 | 4339 2273 | 4186 2274 | 4162 2275 | 4227 2276 | 4221 2277 | 4309 2278 | 4298 2279 | 4162 2280 | 4180 2281 | 4268 2282 | 4329 2283 | 4155 2284 | 4098 2285 | 4322 2286 | 4224 2287 | 4269 2288 | 4243 2289 | 4340 2290 | 4300 2291 | 4276 2292 | 4248 2293 | 4208 2294 | 4242 2295 | 4272 2296 | 4285 2297 | 4200 2298 | 4307 2299 | 4223 2300 | 4260 2301 | 4158 2302 | 4217 2303 | 4237 2304 | 4122 2305 | 4170 2306 | 4290 2307 | 4141 2308 | 4218 2309 | 4146 2310 | 4287 2311 | 4207 2312 | 4075 2313 | 4120 2314 | 4241 2315 | 4176 2316 | 4143 2317 | 4079 2318 | 4174 2319 | 4132 2320 | 4185 2321 | 4197 2322 | 4049 2323 | 4056 2324 | 4128 2325 | 4140 2326 | 4100 2327 | 4125 2328 | 4040 2329 | 4105 2330 | 4048 2331 | 4085 2332 | 4045 2333 | 4183 2334 | 4089 2335 | 4016 2336 | 4022 2337 | 4037 2338 | 4111 2339 | 4167 2340 | 3996 2341 | 4022 2342 | 3968 2343 | 4042 2344 | 4002 2345 | 3961 2346 | 3846 2347 | 3977 2348 | 3982 2349 | 3874 2350 | 3872 2351 | 3992 2352 | 3921 2353 | 3842 2354 | 3798 2355 | 3967 2356 | 3930 2357 | 4012 2358 | 3894 2359 | 3867 2360 | 3772 2361 | 3951 2362 | 3836 2363 | 3878 2364 | 3727 2365 | 3595 2366 | 3745 2367 | 3626 2368 | 3649 2369 | 3760 2370 | 3655 2371 | 3701 2372 | 3510 2373 | 3523 2374 | 3665 2375 | 3488 2376 | 3532 2377 | 3484 2378 | 3417 2379 | 3442 2380 | 3476 2381 | 3429 2382 | 3530 2383 | 3401 2384 | 3456 2385 | 3374 2386 | 3326 2387 | 3278 2388 | 3363 2389 | 3329 2390 | 3480 2391 | 3165 2392 | 3316 2393 | 3278 2394 | 3256 2395 | 3275 2396 | 3305 2397 | 3105 2398 | 3197 2399 | 3228 2400 | 3173 2401 | 3099 2402 | 3159 2403 | 3162 2404 | 3171 2405 | 3148 2406 | 3118 2407 | 3208 2408 | 3128 2409 | 3111 2410 | 3142 2411 | 3215 2412 | 3104 2413 | 3122 2414 | 3068 2415 | 3159 2416 | 3093 2417 | 3064 2418 | 3103 2419 | 3087 2420 | 3052 2421 | 3023 2422 | 3113 2423 | 2980 2424 | 3097 2425 | 3107 2426 | 2925 2427 | 2986 2428 | 2875 2429 | 2916 2430 | 3038 2431 | 3074 2432 | 2942 2433 | 2931 2434 | 2891 2435 | 2984 2436 | 2914 2437 | 2913 2438 | 2923 2439 | 2949 2440 | 2944 2441 | 2846 2442 | 2933 2443 | 2954 2444 | 2959 2445 | 2954 2446 | 2892 2447 | 2901 2448 | 2927 2449 | 2868 2450 | 2835 2451 | 2948 2452 | 2811 2453 | 2769 2454 | 2941 2455 | 2794 2456 | 2877 2457 | 2894 2458 | 2823 2459 | 2825 2460 | 2829 2461 | 2979 2462 | 2839 2463 | 2785 2464 | 2849 2465 | 2798 2466 | 2773 2467 | 2860 2468 | 2955 2469 | 2781 2470 | 2809 2471 | 2816 2472 | 2945 2473 | 2828 2474 | 2858 2475 | 2731 2476 | 2889 2477 | 2884 2478 | 2807 2479 | 2752 2480 | 2764 2481 | 2680 2482 | 2774 2483 | 2756 2484 | 2856 2485 | 2774 2486 | 2826 2487 | 2864 2488 | 2842 2489 | 2692 2490 | 2698 2491 | 2820 2492 | 2742 2493 | 2738 2494 | 2663 2495 | 2834 2496 | 2753 2497 | 2813 2498 | 2725 2499 | 2708 2500 | 2758 2501 | 2722 2502 | 2779 2503 | 2708 2504 | 2785 2505 | 2661 2506 | 2654 2507 | 2742 2508 | 2720 2509 | 2753 2510 | 2667 2511 | 2775 2512 | 2729 2513 | 2802 2514 | 2770 2515 | 2798 2516 | 2783 2517 | 2664 2518 | 2669 2519 | 2624 2520 | 2692 2521 | 2711 2522 | 2667 2523 | 2577 2524 | 2817 2525 | 2722 2526 | 2697 2527 | 2718 2528 | 2699 2529 | 2764 2530 | 2663 2531 | 2731 2532 | 2574 2533 | 2705 2534 | 2747 2535 | 2670 2536 | 2670 2537 | 2645 2538 | 2709 2539 | 2724 2540 | 2747 2541 | 2684 2542 | 2563 2543 | 2659 2544 | 2561 2545 | 2514 2546 | 2625 2547 | 2637 2548 | 2507 2549 | 2679 2550 | 2583 2551 | 2645 2552 | 2621 2553 | 2642 2554 | 2714 2555 | 2507 2556 | 2626 2557 | 
2603 2558 | 2611 2559 | 2661 2560 | 2653 2561 | 2571 2562 | 2716 2563 | 2590 2564 | 2540 2565 | 2610 2566 | 2602 2567 | 2535 2568 | 2531 2569 | 2499 2570 | 2495 2571 | 2565 2572 | 2548 2573 | 2639 2574 | 2586 2575 | 2542 2576 | 2572 2577 | 2618 2578 | 2586 2579 | 2561 2580 | 2588 2581 | 2564 2582 | 2620 2583 | 2602 2584 | 2575 2585 | 2615 2586 | 2549 2587 | 2532 2588 | 2469 2589 | 2519 2590 | 2535 2591 | 2624 2592 | 2493 2593 | 2422 2594 | 2511 2595 | 2527 2596 | 2497 2597 | 2423 2598 | 2596 2599 | 2524 2600 | 2525 2601 | 2493 2602 | 2560 2603 | 2558 2604 | 2464 2605 | 2467 2606 | 2536 2607 | 2478 2608 | 2528 2609 | 2392 2610 | 2515 2611 | 2386 2612 | 2461 2613 | 2431 2614 | 2393 2615 | 2348 2616 | 2530 2617 | 2436 2618 | 2381 2619 | 2491 2620 | 2461 2621 | 2433 2622 | 2422 2623 | 2542 2624 | 2478 2625 | 2352 2626 | 2469 2627 | 2373 2628 | 2463 2629 | 2572 2630 | 2437 2631 | 2340 2632 | 2441 2633 | 2458 2634 | 2409 2635 | 2316 2636 | 2417 2637 | 2439 2638 | 2421 2639 | 2343 2640 | 2416 2641 | 2318 2642 | 2409 2643 | 2305 2644 | 2340 2645 | 2356 2646 | 2409 2647 | 2372 2648 | 2441 2649 | 2426 2650 | 2396 2651 | 2377 2652 | 2394 2653 | 2321 2654 | 2376 2655 | 2279 2656 | 2355 2657 | 2368 2658 | 2349 2659 | 2262 2660 | 2384 2661 | 2317 2662 | 2318 2663 | 2374 2664 | 2326 2665 | 2383 2666 | 2332 2667 | 2280 2668 | 2279 2669 | 2283 2670 | 2335 2671 | 2242 2672 | 2283 2673 | 2309 2674 | 2214 2675 | 2301 2676 | 2212 2677 | 2207 2678 | 2300 2679 | 2224 2680 | 2311 2681 | 2267 2682 | 2190 2683 | 2227 2684 | 2166 2685 | 2194 2686 | 2169 2687 | 2118 2688 | 2178 2689 | 2084 2690 | 2163 2691 | 2189 2692 | 2171 2693 | 2095 2694 | 2116 2695 | 2159 2696 | 2212 2697 | 2110 2698 | 2095 2699 | 2047 2700 | 2083 2701 | 2062 2702 | 1963 2703 | 2102 2704 | 2094 2705 | 1948 2706 | 1924 2707 | 1902 2708 | 1861 2709 | 1914 2710 | -------------------------------------------------------------------------------- /word2vec/vectors.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/word2vec/vectors.bin -------------------------------------------------------------------------------- /word2vec/word-analogy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | const long long max_size = 2000; // max length of strings 21 | const long long N = 40; // number of closest words that will be shown 22 | const long long max_w = 50; // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char bestw[N][max_size]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | char ch; 32 | float *M; 33 | char *vocab; 34 | if (argc < 2) { 35 | printf("Usage: ./word-analogy \nwhere FILE contains word projections in the BINARY FORMAT\n"); 36 | return 0; 37 | } 38 | strcpy(file_name, argv[1]); 39 | f = fopen(file_name, "rb"); 40 | if (f == NULL) { 41 | printf("Input file not found\n"); 42 | return -1; 43 | } 44 | fscanf(f, "%lld", &words); 45 | fscanf(f, "%lld", &size); 46 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 47 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 48 | if (M == NULL) { 49 | printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); 50 | return -1; 51 | } 52 | for (b = 0; b < words; b++) { 53 | a = 0; 54 | while (1) { 55 | vocab[b * max_w + a] = fgetc(f); 56 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 57 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 58 | } 59 | vocab[b * max_w + a] = 0; 60 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 61 | len = 0; 62 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 63 | len = sqrt(len); 64 | for (a = 0; a < size; a++) M[a + b * size] /= len; 65 | } 66 | fclose(f); 67 | while (1) { 68 | for (a = 0; a < N; a++) bestd[a] = 0; 69 | for (a = 0; a < N; a++) bestw[a][0] = 0; 70 | printf("Enter three words (EXIT to break): "); 71 | a = 0; 72 | while (1) { 73 | st1[a] = fgetc(stdin); 74 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 75 | st1[a] = 0; 76 | break; 77 | } 78 | a++; 79 | } 80 | if (!strcmp(st1, "EXIT")) break; 81 | cn = 0; 82 | b = 0; 83 | c = 0; 84 | while (1) { 85 | st[cn][b] = st1[c]; 86 | b++; 87 | c++; 88 | st[cn][b] = 0; 89 | if (st1[c] == 0) break; 90 | if (st1[c] == ' ') { 91 | cn++; 92 | b = 0; 93 | c++; 94 | } 95 | } 96 | cn++; 97 | if (cn < 3) { 98 | printf("Only %lld words were entered.. 
three words are needed at the input to perform the calculation\n", cn); 99 | continue; 100 | } 101 | for (a = 0; a < cn; a++) { 102 | for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; 103 | if (b == words) b = 0; 104 | bi[a] = b; 105 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 106 | if (b == 0) { 107 | printf("Out of dictionary word!\n"); 108 | break; 109 | } 110 | } 111 | if (b == 0) continue; 112 | printf("\n Word Distance\n------------------------------------------------------------------------\n"); 113 | for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 114 | len = 0; 115 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 116 | len = sqrt(len); 117 | for (a = 0; a < size; a++) vec[a] /= len; 118 | for (a = 0; a < N; a++) bestd[a] = 0; 119 | for (a = 0; a < N; a++) bestw[a][0] = 0; 120 | for (c = 0; c < words; c++) { 121 | if (c == bi[0]) continue; 122 | if (c == bi[1]) continue; 123 | if (c == bi[2]) continue; 124 | a = 0; 125 | for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; 126 | if (a == 1) continue; 127 | dist = 0; 128 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 129 | for (a = 0; a < N; a++) { 130 | if (dist > bestd[a]) { 131 | for (d = N - 1; d > a; d--) { 132 | bestd[d] = bestd[d - 1]; 133 | strcpy(bestw[d], bestw[d - 1]); 134 | } 135 | bestd[a] = dist; 136 | strcpy(bestw[a], &vocab[c * max_w]); 137 | break; 138 | } 139 | } 140 | } 141 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 142 | } 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /word2vec/word2phrase: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/word2vec/word2phrase -------------------------------------------------------------------------------- /word2vec/word2phrase.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
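//
// word2phrase makes two passes over the training text: LearnVocabFromTrainFile()
// counts every unigram and every adjacent-word bigram, and TrainModel() then
// re-emits the corpus, joining a pair "A B" into the single token "A_B"
// whenever its co-occurrence score (computed in TrainModel from the unigram
// and bigram counts) exceeds -threshold. The demo scripts run the tool twice
// with decreasing thresholds so that longer phrases can build up from
// already-joined pairs.
//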
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define MAX_STRING 60 22 | 23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary 24 | 25 | typedef float real; // Precision of float numbers 26 | 27 | struct vocab_word { 28 | long long cn; 29 | char *word; 30 | }; 31 | 32 | char train_file[MAX_STRING], output_file[MAX_STRING]; 33 | struct vocab_word *vocab; 34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1; 35 | long long vocab_max_size = 10000, vocab_size = 0; 36 | long long train_words = 0; 37 | real threshold = 100; 38 | 39 | unsigned long long next_random = 1; 40 | 41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 42 | void ReadWord(char *word, FILE *fin) { 43 | int a = 0, ch; 44 | while (!feof(fin)) { 45 | ch = fgetc(fin); 46 | if (ch == 13) continue; 47 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 48 | if (a > 0) { 49 | if (ch == '\n') ungetc(ch, fin); 50 | break; 51 | } 52 | if (ch == '\n') { 53 | strcpy(word, (char *)""); 54 | return; 55 | } else continue; 56 | } 57 | word[a] = ch; 58 | a++; 59 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 60 | } 61 | word[a] = 0; 62 | } 63 | 64 | // Returns hash value of a word 65 | int GetWordHash(char *word) { 66 | unsigned long long a, hash = 1; 67 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 68 | hash = hash % vocab_hash_size; 69 | return hash; 70 | } 71 | 72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 73 | int SearchVocab(char *word) { 74 | unsigned int hash = GetWordHash(word); 75 | while (1) { 76 | if (vocab_hash[hash] == -1) return -1; 77 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 78 | hash = (hash + 1) % vocab_hash_size; 79 | } 80 | return -1; 81 | } 82 | 83 | // Reads a word and returns its index in the vocabulary 84 | int ReadWordIndex(FILE *fin) { 85 | char word[MAX_STRING]; 86 | ReadWord(word, fin); 87 | if (feof(fin)) return -1; 88 | return SearchVocab(word); 89 | } 90 | 91 | // Adds a word to the vocabulary 92 | int AddWordToVocab(char *word) { 93 | unsigned int hash, length = strlen(word) + 1; 94 | if (length > MAX_STRING) length = MAX_STRING; 95 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 96 | strcpy(vocab[vocab_size].word, word); 97 | vocab[vocab_size].cn = 0; 98 | vocab_size++; 99 | // Reallocate memory if needed 100 | if (vocab_size + 2 >= vocab_max_size) { 101 | vocab_max_size += 10000; 102 | vocab=(struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 103 | } 104 | hash = GetWordHash(word); 105 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 106 | vocab_hash[hash]=vocab_size - 1; 107 | return vocab_size - 1; 108 | } 109 | 110 | // Used later for sorting by word counts 111 | int VocabCompare(const void *a, const void *b) { 112 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 113 | } 114 | 115 | // Sorts the vocabulary by frequency using word counts 116 | void SortVocab() { 117 | int a; 118 | unsigned int hash; 119 | // Sort the vocabulary and keep at the first position 120 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 121 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 122 | for (a = 0; a < vocab_size; a++) { 123 | // Words occuring less than min_count times will be discarded from the vocab 124 | if (vocab[a].cn < min_count) { 125 | vocab_size--; 126 | 
free(vocab[vocab_size].word); 127 | } else { 128 | // Hash will be re-computed, as after the sorting it is not actual 129 | hash = GetWordHash(vocab[a].word); 130 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 131 | vocab_hash[hash] = a; 132 | } 133 | } 134 | vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word)); 135 | } 136 | 137 | // Reduces the vocabulary by removing infrequent tokens 138 | void ReduceVocab() { 139 | int a, b = 0; 140 | unsigned int hash; 141 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 142 | vocab[b].cn = vocab[a].cn; 143 | vocab[b].word = vocab[a].word; 144 | b++; 145 | } else free(vocab[a].word); 146 | vocab_size = b; 147 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 148 | for (a = 0; a < vocab_size; a++) { 149 | // Hash will be re-computed, as it is not actual 150 | hash = GetWordHash(vocab[a].word); 151 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 152 | vocab_hash[hash] = a; 153 | } 154 | fflush(stdout); 155 | min_reduce++; 156 | } 157 | 158 | void LearnVocabFromTrainFile() { 159 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 160 | FILE *fin; 161 | long long a, i, start = 1; 162 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 163 | fin = fopen(train_file, "rb"); 164 | if (fin == NULL) { 165 | printf("ERROR: training data file not found!\n"); 166 | exit(1); 167 | } 168 | vocab_size = 0; 169 | AddWordToVocab((char *)""); 170 | while (1) { 171 | ReadWord(word, fin); 172 | if (feof(fin)) break; 173 | if (!strcmp(word, "")) { 174 | start = 1; 175 | continue; 176 | } else start = 0; 177 | train_words++; 178 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 179 | printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13); 180 | fflush(stdout); 181 | } 182 | i = SearchVocab(word); 183 | if (i == -1) { 184 | a = AddWordToVocab(word); 185 | vocab[a].cn = 1; 186 | } else vocab[i].cn++; 187 | if (start) continue; 188 | sprintf(bigram_word, "%s_%s", last_word, word); 189 | bigram_word[MAX_STRING - 1] = 0; 190 | strcpy(last_word, word); 191 | i = SearchVocab(bigram_word); 192 | if (i == -1) { 193 | a = AddWordToVocab(bigram_word); 194 | vocab[a].cn = 1; 195 | } else vocab[i].cn++; 196 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 197 | } 198 | SortVocab(); 199 | if (debug_mode > 0) { 200 | printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); 201 | printf("Words in train file: %lld\n", train_words); 202 | } 203 | fclose(fin); 204 | } 205 | 206 | void TrainModel() { 207 | long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0; 208 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 209 | real score; 210 | FILE *fo, *fin; 211 | printf("Starting training using file %s\n", train_file); 212 | LearnVocabFromTrainFile(); 213 | fin = fopen(train_file, "rb"); 214 | fo = fopen(output_file, "wb"); 215 | word[0] = 0; 216 | while (1) { 217 | strcpy(last_word, word); 218 | ReadWord(word, fin); 219 | if (feof(fin)) break; 220 | if (!strcmp(word, "")) { 221 | fprintf(fo, "\n"); 222 | continue; 223 | } 224 | cn++; 225 | if ((debug_mode > 1) && (cn % 100000 == 0)) { 226 | printf("Words written: %lldK%c", cn / 1000, 13); 227 | fflush(stdout); 228 | } 229 | oov = 0; 230 | i = SearchVocab(word); 231 | if (i == -1) oov = 1; else pb = vocab[i].cn; 232 | if (li == -1) oov = 1; 233 | li = i; 234 | sprintf(bigram_word, "%s_%s", last_word, word); 235 | 
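// The bigram assembled above is looked up just below; with pa and pb the
// unigram counts of the previous and current word and pab the bigram count,
// the score is (pab - min_count) / (pa * pb) * train_words. Pairs scoring
// above -threshold are written out joined by '_' instead of a space, while
// anything involving an out-of-vocabulary or too-rare word gets score 0 and
// stays unjoined.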
bigram_word[MAX_STRING - 1] = 0; 236 | i = SearchVocab(bigram_word); 237 | if (i == -1) oov = 1; else pab = vocab[i].cn; 238 | if (pa < min_count) oov = 1; 239 | if (pb < min_count) oov = 1; 240 | if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words; 241 | if (score > threshold) { 242 | fprintf(fo, "_%s", word); 243 | pb = 0; 244 | } else fprintf(fo, " %s", word); 245 | pa = pb; 246 | } 247 | fclose(fo); 248 | fclose(fin); 249 | } 250 | 251 | int ArgPos(char *str, int argc, char **argv) { 252 | int a; 253 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 254 | if (a == argc - 1) { 255 | printf("Argument missing for %s\n", str); 256 | exit(1); 257 | } 258 | return a; 259 | } 260 | return -1; 261 | } 262 | 263 | int main(int argc, char **argv) { 264 | int i; 265 | if (argc == 1) { 266 | printf("WORD2PHRASE tool v0.1a\n\n"); 267 | printf("Options:\n"); 268 | printf("Parameters for training:\n"); 269 | printf("\t-train \n"); 270 | printf("\t\tUse text data from to train the model\n"); 271 | printf("\t-output \n"); 272 | printf("\t\tUse to save the resulting word vectors / word clusters / phrases\n"); 273 | printf("\t-min-count \n"); 274 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 275 | printf("\t-threshold \n"); 276 | printf("\t\t The value represents threshold for forming the phrases (higher means less phrases); default 100\n"); 277 | printf("\t-debug \n"); 278 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 279 | printf("\nExamples:\n"); 280 | printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n"); 281 | return 0; 282 | } 283 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 284 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 285 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 286 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 287 | if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]); 288 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 289 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 290 | TrainModel(); 291 | return 0; 292 | } 293 | -------------------------------------------------------------------------------- /word2vec/word2vec: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/APNE/fca2027738de40127490d85b8ade066f79f25cfc/word2vec/word2vec -------------------------------------------------------------------------------- /word2vec/word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
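// Summary of the changes in the copy of word2vec.c below, relative to the stock
// Google implementation (all are tagged "altered by Yitan Li" in the code):
//   - InitUnigramTable() uses power = 1 instead of 0.75, so negative samples are
//     drawn in proportion to raw word frequency.
//   - The skip-gram branch of TrainModelThread() additionally accumulates a
//     vocab_size x vocab_size co-occurrence count matrix over the training
//     windows (matrix[word * vocab_size + last_word]++), and nscounts[target]
//     tallies how often each word is drawn as a negative sample.
//   - Besides the usual -output vectors, the input vectors W (syn0), the output
//     vectors C (syn1neg), the negative-sample counts and the co-occurrence
//     matrix are written to the files passed via -saveW, -saveC, -nsc and -matrix.
// An illustrative invocation (the file names and hyper-parameters here are only
// placeholders, not the exact settings used by the rest of the pipeline):
//   ./word2vec -train walks.txt -output vectors.bin -size 128 -window 10 \
//     -negative 5 -cbow 0 -matrix matrix.txt -saveW savedW.txt \
//     -saveC savedC.txt -nsc savednsc.txt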
14 | 15 | 16 | // We altered several snippets of the original word2vec code 17 | // Yitan Li - etali@mail.ustc.edu.cn 18 | 19 | #include <stdio.h> 20 | #include <string.h> 21 | #include <math.h> 22 | #include <stdlib.h> 23 | #include <pthread.h> 24 | 25 | #define MAX_STRING 100 26 | #define EXP_TABLE_SIZE 1000 27 | #define MAX_EXP 6 28 | #define MAX_SENTENCE_LENGTH 1000 29 | #define MAX_CODE_LENGTH 40 30 | 31 | const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 32 | 33 | typedef float real; // Precision of float numbers 34 | 35 | struct vocab_word { 36 | long long cn; 37 | int *point; 38 | char *word, *code, codelen; 39 | }; 40 | 41 | char train_file[MAX_STRING], output_file[MAX_STRING]; 42 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING], matrix_file[MAX_STRING], save_W_file[MAX_STRING], save_C_file[MAX_STRING], save_nsc_file[MAX_STRING]; 43 | struct vocab_word *vocab; 44 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1; 45 | int *vocab_hash; 46 | long long *matrix, *nscounts; 47 | long long matrixcount = 0; 48 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 49 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0; 50 | real alpha = 0.025, starting_alpha, sample = 1e-3; 51 | real *syn0, *syn1, *syn1neg, *expTable; 52 | clock_t start; 53 | 54 | int hs = 0, negative = 5; 55 | const int table_size = 1e8; 56 | int *table; 57 | 58 | void InitUnigramTable() { 59 | int a, i; 60 | long long train_words_pow = 0; 61 | // real d1, power = 0.75; 62 | // altered by Yitan Li 63 | real d1, power = 1; 64 | table = (int *)malloc(table_size * sizeof(int)); 65 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 66 | i = 0; 67 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 68 | for (a = 0; a < table_size; a++) { 69 | table[a] = i; 70 | if (a / (real)table_size > d1) { 71 | i++; 72 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 73 | } 74 | if (i >= vocab_size) i = vocab_size - 1; 75 | } 76 | } 77 | 78 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 79 | void ReadWord(char *word, FILE *fin) { 80 | int a = 0, ch; 81 | while (!feof(fin)) { 82 | ch = fgetc(fin); 83 | if (ch == 13) continue; 84 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 85 | if (a > 0) { 86 | if (ch == '\n') ungetc(ch, fin); 87 | break; 88 | } 89 | if (ch == '\n') { 90 | strcpy(word, (char *)"</s>"); 91 | return; 92 | } else continue; 93 | } 94 | word[a] = ch; 95 | a++; 96 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 97 | } 98 | word[a] = 0; 99 | } 100 | 101 | // Returns hash value of a word 102 | int GetWordHash(char *word) { 103 | unsigned long long a, hash = 0; 104 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 105 | hash = hash % vocab_hash_size; 106 | return hash; 107 | } 108 | 109 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 110 | int SearchVocab(char *word) { 111 | unsigned int hash = GetWordHash(word); 112 | while (1) { 113 | if (vocab_hash[hash] == -1) return -1; 114 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 115 | hash = (hash + 1) % vocab_hash_size; 116 | } 117 | return -1; 118 | } 119 | 120 | // Reads a word and returns its index in the vocabulary 121 | int ReadWordIndex(FILE *fin) { 122 | char word[MAX_STRING]; 123 | ReadWord(word, fin); 124 | if (feof(fin)) return -1; 125 | return SearchVocab(word); 126 | } 127 | 128 | // Adds a
word to the vocabulary 129 | int AddWordToVocab(char *word) { 130 | unsigned int hash, length = strlen(word) + 1; 131 | if (length > MAX_STRING) length = MAX_STRING; 132 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 133 | strcpy(vocab[vocab_size].word, word); 134 | vocab[vocab_size].cn = 0; 135 | vocab_size++; 136 | // Reallocate memory if needed 137 | if (vocab_size + 2 >= vocab_max_size) { 138 | vocab_max_size += 1000; 139 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 140 | } 141 | hash = GetWordHash(word); 142 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 143 | vocab_hash[hash] = vocab_size - 1; 144 | return vocab_size - 1; 145 | } 146 | 147 | // Used later for sorting by word counts 148 | int VocabCompare(const void *a, const void *b) { 149 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 150 | } 151 | 152 | // Sorts the vocabulary by frequency using word counts 153 | void SortVocab() { 154 | int a, size; 155 | unsigned int hash; 156 | // Sort the vocabulary and keep at the first position 157 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 158 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 159 | size = vocab_size; 160 | train_words = 0; 161 | for (a = 0; a < size; a++) { 162 | // Words occuring less than min_count times will be discarded from the vocab 163 | if ((vocab[a].cn < min_count) && (a != 0)) { 164 | vocab_size--; 165 | free(vocab[a].word); 166 | } else { 167 | // Hash will be re-computed, as after the sorting it is not actual 168 | hash=GetWordHash(vocab[a].word); 169 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 170 | vocab_hash[hash] = a; 171 | train_words += vocab[a].cn; 172 | } 173 | } 174 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 175 | // Allocate memory for the binary tree construction 176 | for (a = 0; a < vocab_size; a++) { 177 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 178 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 179 | } 180 | } 181 | 182 | // Reduces the vocabulary by removing infrequent tokens 183 | void ReduceVocab() { 184 | int a, b = 0; 185 | unsigned int hash; 186 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 187 | vocab[b].cn = vocab[a].cn; 188 | vocab[b].word = vocab[a].word; 189 | b++; 190 | } else free(vocab[a].word); 191 | vocab_size = b; 192 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 193 | for (a = 0; a < vocab_size; a++) { 194 | // Hash will be re-computed, as it is not actual 195 | hash = GetWordHash(vocab[a].word); 196 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 197 | vocab_hash[hash] = a; 198 | } 199 | fflush(stdout); 200 | min_reduce++; 201 | } 202 | 203 | // Create binary Huffman tree using the word counts 204 | // Frequent words will have short uniqe binary codes 205 | void CreateBinaryTree() { 206 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 207 | char code[MAX_CODE_LENGTH]; 208 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 209 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 210 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 211 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 212 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 213 | pos1 = vocab_size - 1; 214 
| pos2 = vocab_size; 215 | // Following algorithm constructs the Huffman tree by adding one node at a time 216 | for (a = 0; a < vocab_size - 1; a++) { 217 | // First, find two smallest nodes 'min1, min2' 218 | if (pos1 >= 0) { 219 | if (count[pos1] < count[pos2]) { 220 | min1i = pos1; 221 | pos1--; 222 | } else { 223 | min1i = pos2; 224 | pos2++; 225 | } 226 | } else { 227 | min1i = pos2; 228 | pos2++; 229 | } 230 | if (pos1 >= 0) { 231 | if (count[pos1] < count[pos2]) { 232 | min2i = pos1; 233 | pos1--; 234 | } else { 235 | min2i = pos2; 236 | pos2++; 237 | } 238 | } else { 239 | min2i = pos2; 240 | pos2++; 241 | } 242 | count[vocab_size + a] = count[min1i] + count[min2i]; 243 | parent_node[min1i] = vocab_size + a; 244 | parent_node[min2i] = vocab_size + a; 245 | binary[min2i] = 1; 246 | } 247 | // Now assign binary code to each vocabulary word 248 | for (a = 0; a < vocab_size; a++) { 249 | b = a; 250 | i = 0; 251 | while (1) { 252 | code[i] = binary[b]; 253 | point[i] = b; 254 | i++; 255 | b = parent_node[b]; 256 | if (b == vocab_size * 2 - 2) break; 257 | } 258 | vocab[a].codelen = i; 259 | vocab[a].point[0] = vocab_size - 2; 260 | for (b = 0; b < i; b++) { 261 | vocab[a].code[i - b - 1] = code[b]; 262 | vocab[a].point[i - b] = point[b] - vocab_size; 263 | } 264 | } 265 | free(count); 266 | free(binary); 267 | free(parent_node); 268 | } 269 | 270 | void LearnVocabFromTrainFile() { 271 | char word[MAX_STRING]; 272 | FILE *fin; 273 | long long a, i; 274 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 275 | fin = fopen(train_file, "rb"); 276 | if (fin == NULL) { 277 | printf("ERROR: training data file not found!\n"); 278 | exit(1); 279 | } 280 | vocab_size = 0; 281 | AddWordToVocab((char *)"</s>"); 282 | while (1) { 283 | ReadWord(word, fin); 284 | if (feof(fin)) break; 285 | train_words++; 286 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 287 | printf("%lldK%c", train_words / 1000, 13); 288 | fflush(stdout); 289 | } 290 | i = SearchVocab(word); 291 | if (i == -1) { 292 | a = AddWordToVocab(word); 293 | vocab[a].cn = 1; 294 | } else vocab[i].cn++; 295 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 296 | } 297 | SortVocab(); 298 | if (debug_mode > 0) { 299 | printf("Vocab size: %lld\n", vocab_size); 300 | printf("Words in train file: %lld\n", train_words); 301 | } 302 | file_size = ftell(fin); 303 | fclose(fin); 304 | } 305 | 306 | void SaveVocab() { 307 | long long i; 308 | FILE *fo = fopen(save_vocab_file, "wb"); 309 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 310 | fclose(fo); 311 | } 312 | 313 | void ReadVocab() { 314 | long long a, i = 0; 315 | char c; 316 | char word[MAX_STRING]; 317 | FILE *fin = fopen(read_vocab_file, "rb"); 318 | if (fin == NULL) { 319 | printf("Vocabulary file not found\n"); 320 | exit(1); 321 | } 322 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 323 | vocab_size = 0; 324 | while (1) { 325 | ReadWord(word, fin); 326 | if (feof(fin)) break; 327 | a = AddWordToVocab(word); 328 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 329 | i++; 330 | } 331 | SortVocab(); 332 | if (debug_mode > 0) { 333 | printf("Vocab size: %lld\n", vocab_size); 334 | printf("Words in train file: %lld\n", train_words); 335 | } 336 | fin = fopen(train_file, "rb"); 337 | if (fin == NULL) { 338 | printf("ERROR: training data file not found!\n"); 339 | exit(1); 340 | } 341 | fseek(fin, 0, SEEK_END); 342 | file_size = ftell(fin); 343 | fclose(fin); 344 | } 345 | 346 | void InitNet() { 347 | long long a, b; 348 |
unsigned long long next_random = 1; 349 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 350 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 351 | if (hs) { 352 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 353 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 354 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 355 | syn1[a * layer1_size + b] = 0; 356 | } 357 | if (negative>0) { 358 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 359 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 360 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) 361 | syn1neg[a * layer1_size + b] = 0; 362 | } 363 | for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) { 364 | next_random = next_random * (unsigned long long)25214903917 + 11; 365 | syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 366 | } 367 | 368 | CreateBinaryTree(); 369 | } 370 | 371 | // Main Traing Function(#Yitan) 372 | void *TrainModelThread(void *id) { 373 | long long a, b, d, cw, word, last_word, mpos, sentence_length = 0, sentence_position = 0; 374 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 375 | long long l1, l2, c, target, label, local_iter = iter; 376 | unsigned long long next_random = (long long)id; 377 | real f, g; 378 | clock_t now; 379 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 380 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 381 | FILE *fi = fopen(train_file, "rb"); 382 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 383 | while (1) { 384 | if (word_count - last_word_count > 10000) { 385 | word_count_actual += word_count - last_word_count; 386 | last_word_count = word_count; 387 | if ((debug_mode > 1)) { 388 | now=clock(); 389 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 390 | word_count_actual / (real)(iter * train_words + 1) * 100, 391 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 392 | fflush(stdout); 393 | } 394 | alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); 395 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 396 | } 397 | if (sentence_length == 0) { 398 | while (1) { 399 | word = ReadWordIndex(fi); 400 | if (feof(fi)) break; 401 | if (word == -1) continue; 402 | word_count++; 403 | if (word == 0) break; 404 | // The subsampling randomly discards frequent words while keeping the ranking same 405 | if (sample > 0) { 406 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 407 | next_random = next_random * (unsigned long long)25214903917 + 11; 408 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 409 | } 410 | sen[sentence_length] = word; 411 | sentence_length++; 412 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 413 | } 414 | sentence_position = 0; 415 | } 416 | if (feof(fi) || (word_count > train_words / num_threads)) { 417 | word_count_actual += word_count - last_word_count; 418 | local_iter--; 419 | if (local_iter == 0) break; 420 | word_count = 0; 421 | last_word_count = 0; 422 | sentence_length = 0; 423 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 424 | continue; 425 | } 426 | word = sen[sentence_position]; 427 | if 
(word == -1) continue; 428 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 429 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 430 | next_random = next_random * (unsigned long long)25214903917 + 11; 431 | b = next_random % window; 432 | if (cbow) { //train the cbow architecture 433 | // in -> hidden 434 | cw = 0; 435 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 436 | c = sentence_position - window + a; 437 | if (c < 0) continue; 438 | if (c >= sentence_length) continue; 439 | last_word = sen[c]; 440 | if (last_word == -1) continue; 441 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 442 | cw++; 443 | } 444 | if (cw) { 445 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 446 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 447 | f = 0; 448 | l2 = vocab[word].point[d] * layer1_size; 449 | // Propagate hidden -> output 450 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 451 | if (f <= -MAX_EXP) continue; 452 | else if (f >= MAX_EXP) continue; 453 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 454 | // 'g' is the gradient multiplied by the learning rate 455 | g = (1 - vocab[word].code[d] - f) * alpha; 456 | // Propagate errors output -> hidden 457 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 458 | // Learn weights hidden -> output 459 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 460 | } 461 | // NEGATIVE SAMPLING 462 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 463 | if (d == 0) { 464 | target = word; 465 | label = 1; 466 | } else { 467 | next_random = next_random * (unsigned long long)25214903917 + 11; 468 | target = table[(next_random >> 16) % table_size]; 469 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 470 | if (target == word) continue; 471 | label = 0; 472 | } 473 | l2 = target * layer1_size; 474 | f = 0; 475 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 476 | if (f > MAX_EXP) g = (label - 1) * alpha; 477 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 478 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 479 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 480 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 481 | } 482 | // hidden -> in 483 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 484 | c = sentence_position - window + a; 485 | if (c < 0) continue; 486 | if (c >= sentence_length) continue; 487 | last_word = sen[c]; 488 | if (last_word == -1) continue; 489 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 490 | } 491 | } 492 | } else { //train skip-gram 493 | // printf("start skip-gram!\n"); 494 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 495 | c = sentence_position - window + a; 496 | if (c < 0) continue; 497 | if (c >= sentence_length) continue; 498 | last_word = sen[c]; 499 | if (last_word == -1) continue; 500 | 501 | //altered by Yitan Li 502 | mpos = word*vocab_size + last_word; 503 | matrix[mpos] = matrix[mpos] + 1; 504 | matrixcount++; 505 | 506 | l1 = last_word * layer1_size; 507 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 508 | // HIERARCHICAL SOFTMAX 509 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 510 | f = 0; 511 | l2 = vocab[word].point[d] * layer1_size; 512 | // Propagate hidden -> output 513 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 514 | if (f <= -MAX_EXP) continue; 515 | else 
if (f >= MAX_EXP) continue; 516 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 517 | // 'g' is the gradient multiplied by the learning rate 518 | g = (1 - vocab[word].code[d] - f) * alpha; 519 | // Propagate errors output -> hidden 520 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 521 | // Learn weights hidden -> output 522 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 523 | } 524 | // NEGATIVE SAMPLING 525 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 526 | if (d == 0) { 527 | target = word; 528 | label = 1; 529 | } else { 530 | next_random = next_random * (unsigned long long)25214903917 + 11; 531 | target = table[(next_random >> 16) % table_size]; 532 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 533 | if (target == word) continue; 534 | nscounts[target] = nscounts[target] + 1; 535 | label = 0; 536 | } 537 | l2 = target * layer1_size; 538 | f = 0; 539 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 540 | if (f > MAX_EXP) g = (label - 1) * alpha; 541 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 542 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 543 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 544 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; 545 | } 546 | // Learn weights input -> hidden 547 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; 548 | } 549 | } 550 | sentence_position++; 551 | if (sentence_position >= sentence_length) { 552 | sentence_length = 0; 553 | continue; 554 | } 555 | } 556 | fclose(fi); 557 | free(neu1); 558 | free(neu1e); 559 | pthread_exit(NULL); 560 | } 561 | 562 | void TrainModel() { 563 | FILE *fo, *foW, *foC, *fonsc; 564 | long a, b, c, d; 565 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 566 | printf("Starting training using file %s\n", train_file); 567 | starting_alpha = alpha; 568 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 569 | if (save_vocab_file[0] != 0) SaveVocab(); 570 | 571 | //altered by Yitan Li 572 | printf("malloc matrix:%lld\n", vocab_size*vocab_size); 573 | matrix = (long long *)malloc((vocab_size*vocab_size) * sizeof(long long)); 574 | nscounts = (long long *)malloc((vocab_size) * sizeof(long long)); 575 | if(NULL == matrix){ 576 | printf("NULL wrong!\n"); 577 | } 578 | memset(matrix, (long long)0, sizeof(long long)*vocab_size*vocab_size); 579 | memset(nscounts, (long long)0, sizeof(long long)*vocab_size); 580 | 581 | if (output_file[0] == 0) return; 582 | InitNet(); 583 | if (negative > 0) InitUnigramTable(); 584 | start = clock(); 585 | printf("start multi-thread!\n"); 586 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 587 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 588 | printf("end multi-thread!\n"); 589 | fo = fopen(output_file, "wb"); 590 | if (classes == 0) { 591 | // Save the word vectors 592 | printf("store start"); 593 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 594 | for (a = 0; a < vocab_size; a++) { 595 | fprintf(fo, "%s ", vocab[a].word); 596 | for (b = 0; b < layer1_size; b++) 597 | fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 598 | fprintf(fo, "\n"); 599 | } 600 | foW = fopen(save_W_file, "w"); 601 | for (a = 0; a < vocab_size; a++) { 602 | for (b = 0; b < layer1_size; b++) 603 | fprintf(foW, "%f ", syn0[a * layer1_size + b]); 604 | fprintf(foW, 
"\n"); 605 | } 606 | fclose(foW); 607 | 608 | foC = fopen(save_C_file, "w"); 609 | for (a = 0; a < vocab_size; a++) { 610 | for (b = 0; b < layer1_size; b++) 611 | fprintf(foC, "%f ", syn1neg[a * layer1_size + b]); 612 | fprintf(foC, "\n"); 613 | } 614 | fclose(foC); 615 | 616 | fonsc = fopen(save_nsc_file, "w"); 617 | for (a = 0; a < vocab_size; a++){ 618 | fprintf(fonsc, "%lld\n", nscounts[a]); 619 | } 620 | fclose(fonsc); 621 | printf("store end"); 622 | 623 | } else { 624 | // Run K-means on the word vectors 625 | int clcn = classes, iter = 10, closeid; 626 | int *centcn = (int *)malloc(classes * sizeof(int)); 627 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 628 | real closev, x; 629 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 630 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 631 | for (a = 0; a < iter; a++) { 632 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 633 | for (b = 0; b < clcn; b++) centcn[b] = 1; 634 | for (c = 0; c < vocab_size; c++) { 635 | for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 636 | centcn[cl[c]]++; 637 | } 638 | for (b = 0; b < clcn; b++) { 639 | closev = 0; 640 | for (c = 0; c < layer1_size; c++) { 641 | cent[layer1_size * b + c] /= centcn[b]; 642 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 643 | } 644 | closev = sqrt(closev); 645 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 646 | } 647 | for (c = 0; c < vocab_size; c++) { 648 | closev = -10; 649 | closeid = 0; 650 | for (d = 0; d < clcn; d++) { 651 | x = 0; 652 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 653 | if (x > closev) { 654 | closev = x; 655 | closeid = d; 656 | } 657 | } 658 | cl[c] = closeid; 659 | } 660 | } 661 | // Save the K-means classes 662 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 663 | free(centcn); 664 | free(cent); 665 | free(cl); 666 | } 667 | fclose(fo); 668 | } 669 | 670 | int ArgPos(char *str, int argc, char **argv) { 671 | int a; 672 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 673 | if (a == argc - 1) { 674 | printf("Argument missing for %s\n", str); 675 | exit(1); 676 | } 677 | return a; 678 | } 679 | return -1; 680 | } 681 | 682 | int main(int argc, char **argv) { 683 | long i, a, b, allcount = 0; 684 | long long mpv = 0, matrixsum = 0; 685 | FILE *matrixf = NULL; 686 | if (argc == 1) { 687 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 688 | printf("Options:\n"); 689 | printf("Parameters for training:\n"); 690 | printf("\t-train \n"); 691 | printf("\t\tUse text data from to train the model\n"); 692 | printf("\t-output \n"); 693 | printf("\t\tUse to save the resulting word vectors / word clusters\n"); 694 | printf("\t-size \n"); 695 | printf("\t\tSet size of word vectors; default is 100\n"); 696 | printf("\t-window \n"); 697 | printf("\t\tSet max skip length between words; default is 5\n"); 698 | printf("\t-sample \n"); 699 | printf("\t\tSet threshold for occurrence of words. 
Those that appear with higher frequency in the training data\n"); 700 | printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n"); 701 | printf("\t-hs \n"); 702 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 703 | printf("\t-negative \n"); 704 | printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n"); 705 | printf("\t-threads \n"); 706 | printf("\t\tUse threads (default 12)\n"); 707 | printf("\t-iter \n"); 708 | printf("\t\tRun more training iterations (default 5)\n"); 709 | printf("\t-min-count \n"); 710 | printf("\t\tThis will discard words that appear less than times; default is 5\n"); 711 | printf("\t-alpha \n"); 712 | printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n"); 713 | printf("\t-classes \n"); 714 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 715 | printf("\t-debug \n"); 716 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 717 | printf("\t-binary \n"); 718 | printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 719 | printf("\t-save-vocab \n"); 720 | printf("\t\tThe vocabulary will be saved to \n"); 721 | printf("\t-read-vocab \n"); 722 | printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); 723 | printf("\t-cbow \n"); 724 | printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n"); 725 | printf("\nExamples:\n"); 726 | printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 727 | return 0; 728 | } 729 | output_file[0] = 0; 730 | save_vocab_file[0] = 0; 731 | read_vocab_file[0] = 0; 732 | matrix_file[0] = 0; 733 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 734 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 735 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 736 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 737 | if ((i = ArgPos((char *)"-matrix", argc, argv)) > 0) strcpy(matrix_file, argv[i + 1]); 738 | if ((i = ArgPos((char *)"-saveW", argc, argv)) > 0) strcpy(save_W_file, argv[i + 1]); 739 | if ((i = ArgPos((char *)"-saveC", argc, argv)) > 0) strcpy(save_C_file, argv[i + 1]); 740 | if ((i = ArgPos((char *)"-nsc", argc, argv)) > 0) strcpy(save_nsc_file, argv[i + 1]); 741 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 742 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 743 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 744 | if (cbow) alpha = 0.05; 745 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 746 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 747 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 748 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 749 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 750 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 751 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 752 | if ((i = ArgPos((char 
*)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 753 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 754 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 755 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 756 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 757 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 758 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 759 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table 760 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 761 | } 762 | 763 | // create matrix 764 | TrainModel(); 765 | //altered by Yitan Li 766 | printf("save matrix!\n"); 767 | matrixf = fopen(matrix_file, "w"); 768 | printf("vocab_size:%lld\n", vocab_size); 769 | for(a=0;a