├── .gitignore ├── data ├── karate.edgelist ├── karate.txt └── karate_emb.txt ├── docker-snap └── Dockerfile ├── helper.py ├── make_random_walks.py ├── random_walk.py ├── start.py ├── test └── test_random_walk.py ├── train.py └── word2vec.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .DS_Store 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | # lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | #Ipython Notebook 66 | .ipynb_checkpoints 67 | -------------------------------------------------------------------------------- /data/karate.edgelist: -------------------------------------------------------------------------------- 1 | 1 32 2 | 1 22 3 | 1 20 4 | 1 18 5 | 1 14 6 | 1 13 7 | 1 12 8 | 1 11 9 | 1 9 10 | 1 8 11 | 1 7 12 | 1 6 13 | 1 5 14 | 1 4 15 | 1 3 16 | 1 2 17 | 2 31 18 | 2 22 19 | 2 20 20 | 2 18 21 | 2 14 22 | 2 8 23 | 2 4 24 | 2 3 25 | 3 14 26 | 3 9 27 | 3 10 28 | 3 33 29 | 3 29 30 | 3 28 31 | 3 8 32 | 3 4 33 | 4 14 34 | 4 13 35 | 4 8 36 | 5 11 37 | 5 7 38 | 6 17 39 | 6 11 40 | 6 7 41 | 7 17 42 | 9 34 43 | 9 33 44 | 9 33 45 | 10 34 46 | 14 34 47 | 15 34 48 | 15 33 49 | 16 34 50 | 16 33 51 | 19 34 52 | 19 33 53 | 20 34 54 | 21 34 55 | 21 33 56 | 23 34 57 | 23 33 58 | 24 30 59 | 24 34 60 | 24 33 61 | 24 28 62 | 24 26 63 | 25 32 64 | 25 28 65 | 25 26 66 | 26 32 67 | 27 34 68 | 27 30 69 | 28 34 70 | 29 34 71 | 29 32 72 | 30 34 73 | 30 33 74 | 31 34 75 | 31 33 76 | 32 34 77 | 32 33 78 | 33 34 -------------------------------------------------------------------------------- /data/karate.txt: -------------------------------------------------------------------------------- 1 | 0 31 2 | 0 21 3 | 0 19 4 | 0 17 5 | 0 13 6 | 0 12 7 | 0 11 8 | 0 10 9 | 0 8 10 | 0 7 11 | 0 6 12 | 0 5 13 | 0 4 14 | 0 3 15 | 0 2 16 | 0 1 17 | 1 30 18 | 1 21 19 | 1 19 20 | 1 17 21 | 1 13 22 | 1 7 23 | 1 3 24 | 1 2 25 | 2 13 26 | 2 8 27 | 2 9 28 | 2 32 29 | 2 28 30 | 2 27 31 | 2 7 32 | 2 3 33 | 3 13 34 | 3 12 35 | 3 7 36 | 4 10 37 | 4 6 38 | 5 16 39 | 5 10 40 | 5 6 41 | 6 16 42 | 8 33 43 | 8 32 44 | 8 32 45 | 9 33 46 | 13 33 47 | 14 33 48 | 14 32 49 | 15 33 50 | 15 32 51 | 18 33 52 | 18 32 53 | 19 33 54 | 20 33 55 | 20 32 56 | 22 33 57 | 22 32 58 | 23 29 59 | 23 33 60 | 23 32 61 | 23 27 62 | 23 25 63 | 24 31 64 | 24 27 65 | 24 25 66 | 25 31 67 | 26 33 68 | 26 29 69 | 27 33 70 | 28 33 71 | 28 31 72 | 29 33 73 | 29 32 74 | 30 33 75 | 30 32 76 | 31 33 77 | 31 32 78 | 32 33 79 | -------------------------------------------------------------------------------- /data/karate_emb.txt: -------------------------------------------------------------------------------- 1 | 34 64 2 | 0 -0.001478 -0.004382 0.000245 -0.007569 -0.006209 0.004529 -0.002949 -0.002226 -0.001829 0.000264 0.006443 0.000529 0.001409 -0.000279 0.006813 -0.003427 0.005404 -0.001235 -0.004344 -0.001933 -0.003851 -0.004703 -0.000893 0.002229 -0.007048 -0.001171 0.002043 -0.005793 -0.006461 -0.006901 0.004066 0.005445 0.000827 -0.003599 0.006363 -0.003682 0.006125 0.003071 0.004895 -0.005375 -0.006488 -0.004437 -0.004532 0.004399 -0.000756 0.006681 -0.001376 -0.004283 -0.005774 0.002837 -0.004321 -0.001670 -0.004461 -0.007005 -0.002478 -0.001732 -0.006088 0.007087 -0.007893 0.003285 0.001064 -0.006536 0.000587 0.005204 3 | 32 -0.003657 -0.005944 -0.007604 0.000410 0.000661 0.000847 0.004861 -0.001534 -0.003656 0.001617 -0.002034 0.004074 -0.002837 0.002107 0.000825 0.004529 0.002271 -0.002520 -0.007324 0.000722 -0.000148 0.005559 -0.006204 -0.003831 -0.006241 -0.002291 -0.003565 0.007235 0.002845 -0.004305 -0.002138 -0.000759 0.007492 0.004767 -0.004599 0.000513 0.005936 0.006955 -0.001547 -0.004967 -0.005543 0.003061 -0.001009 -0.001337 -0.006485 -0.002333 0.005751 0.003223 -0.001167 -0.006711 -0.000841 0.002657 -0.002595 0.003681 -0.006089 -0.005016 -0.003062 -0.000502 -0.000198 -0.001900 0.002024 0.000651 -0.006070 0.007024 4 | 2 0.006017 -0.001165 -0.001458 -0.002130 -0.002159 0.000450 0.003152 0.004615 -0.006925 -0.000920 -0.002416 0.004210 0.004141 -0.005008 -0.001286 -0.005864 -0.004034 0.004123 -0.001204 -0.000521 0.001612 0.001740 0.005698 -0.000641 -0.005863 -0.001552 -0.003533 0.000916 0.005056 0.002884 0.005601 -0.002968 -0.006425 0.006478 -0.006638 0.004512 0.001145 -0.005716 0.003122 -0.002256 0.003374 0.005281 0.003564 -0.001277 -0.004848 -0.001492 -0.004132 -0.002355 0.002976 -0.004020 0.001506 -0.006409 0.003419 0.007448 -0.007771 -0.000213 0.004669 0.006682 -0.004503 0.007114 -0.004456 0.003997 -0.001157 -0.005594 5 | 1 -0.007735 -0.007822 0.007283 -0.005410 0.003365 -0.007586 -0.001277 0.005482 0.006237 0.000481 -0.004625 -0.001312 0.004407 -0.003295 -0.003809 -0.004791 0.004370 -0.003413 0.007288 -0.003932 0.003145 -0.000696 0.001475 0.002896 0.001561 0.002091 0.004725 0.007160 0.003035 -0.004113 -0.007478 0.004661 -0.000607 0.001778 -0.003543 -0.006159 -0.007466 -0.003456 0.007058 -0.007892 -0.005415 -0.005678 0.004998 0.004518 -0.002682 -0.005150 -0.004709 0.000741 0.006950 0.006086 -0.006634 0.006820 -0.002718 0.007583 -0.006855 0.001649 -0.000786 0.007078 -0.005385 0.003488 0.003799 -0.007662 -0.005736 0.000111 6 | 6 0.002071 -0.002099 0.001781 0.007119 0.000813 0.002474 0.003869 0.004858 -0.000926 0.000847 0.002634 -0.001479 -0.002357 -0.003705 0.004330 0.000085 -0.002140 -0.005418 0.003582 -0.005130 -0.002541 -0.004866 -0.002067 -0.006413 0.005680 -0.004303 0.002477 -0.007220 0.000801 0.006628 0.001066 -0.005865 0.004189 -0.001384 0.002266 0.000217 0.001576 0.006568 -0.007146 -0.007913 -0.003720 0.004227 0.007282 -0.004811 -0.007268 0.002579 0.004855 -0.007121 0.007182 0.006042 -0.000536 0.004388 -0.006938 0.000462 0.005665 -0.004099 -0.000529 -0.003629 0.005075 -0.006887 0.007704 0.004535 0.000332 0.000442 7 | 8 0.001113 -0.007071 0.004114 0.001666 -0.005125 0.002709 0.003933 0.001555 -0.000424 0.005822 -0.003446 0.000121 -0.004891 0.000063 0.006135 -0.005038 0.004235 0.000309 0.007290 0.004839 -0.002278 -0.000278 -0.001175 -0.007169 0.002211 0.007766 -0.000481 0.005490 -0.003369 0.004677 -0.001910 0.000977 0.001846 0.004753 -0.002555 -0.002465 0.006714 -0.001808 0.006582 0.007182 -0.002274 0.000395 -0.005471 0.005556 0.001940 0.005634 -0.005102 -0.001160 -0.003164 0.004103 -0.004195 -0.004533 0.007801 0.004731 -0.005982 0.000148 -0.007150 -0.002494 -0.003871 -0.004531 -0.004450 -0.006505 0.001524 0.002476 8 | 23 0.001430 0.005739 -0.007410 0.002426 -0.001234 0.005407 -0.001935 0.006405 0.003486 -0.003135 -0.003118 0.000782 0.001462 -0.005832 -0.005631 0.004408 -0.000939 0.005753 0.003496 0.007430 -0.002458 -0.003012 -0.006573 0.005313 0.001853 -0.001423 -0.006705 -0.006118 -0.007027 0.004659 -0.001166 0.002166 -0.001486 -0.000403 -0.006771 -0.004855 0.003213 -0.004877 -0.005671 0.001032 -0.001348 0.000716 0.003004 0.003389 0.004533 -0.002912 -0.003074 0.006255 -0.004586 -0.001478 0.002238 -0.002100 0.005504 -0.006395 -0.000446 -0.006834 -0.004224 -0.000931 -0.000469 -0.007070 0.000871 -0.003361 0.005865 0.001890 9 | 33 -0.000377 -0.004888 0.001485 0.006722 0.006108 -0.006825 -0.001620 0.007424 0.004130 -0.000649 0.004091 -0.000117 0.000802 -0.005852 0.006346 0.003865 -0.002170 0.005373 -0.006260 -0.000678 0.006193 -0.000315 0.005486 -0.005679 -0.005026 -0.000454 -0.004648 0.000252 0.004001 0.003496 -0.001208 0.006583 -0.000721 0.007137 -0.007096 -0.005646 -0.000892 -0.000318 -0.007124 0.006548 -0.002791 0.003379 0.003907 -0.004599 -0.003129 0.003871 -0.002133 -0.005706 -0.005916 -0.000953 -0.002328 0.006722 0.004490 -0.003709 -0.007829 0.002839 -0.004276 0.002389 0.005992 0.004791 0.007629 -0.007821 -0.004844 0.003788 10 | 3 -0.006910 0.001454 0.007069 0.001260 0.002194 -0.000567 0.004195 -0.000033 -0.007365 -0.002822 0.003380 -0.005689 -0.007555 -0.007575 -0.003391 -0.005470 0.007297 -0.005694 -0.001334 -0.004292 -0.006244 0.002926 -0.001782 0.007436 0.007390 -0.003232 -0.000177 0.005446 0.007856 0.007049 -0.004055 -0.004613 -0.006809 0.007076 -0.002120 -0.006105 0.002368 0.004785 0.002508 0.004154 0.001352 -0.007575 0.001943 0.006594 -0.003707 -0.001359 0.005266 -0.005294 -0.007810 0.005391 0.006693 -0.002815 -0.007402 0.002052 -0.004985 -0.004733 -0.003855 0.002151 -0.003596 0.007245 -0.007038 0.003493 -0.002161 0.007278 11 | 5 0.001171 -0.006912 -0.007826 -0.000199 0.005793 -0.002444 0.006854 -0.006710 0.007335 -0.005260 0.005976 0.005880 -0.005591 0.001782 0.004279 -0.006786 -0.003966 -0.006255 -0.002413 0.003960 0.004333 0.006273 -0.007056 0.007840 0.002735 -0.005554 -0.001285 -0.006524 -0.002268 -0.001680 -0.004360 -0.002464 -0.004405 0.003037 0.004523 -0.005928 0.001824 0.003172 -0.001889 0.000307 0.006703 -0.001200 -0.004178 -0.000602 -0.005000 0.000860 0.001047 -0.001739 -0.000171 -0.003720 -0.001653 0.006451 0.006410 -0.007036 0.004511 -0.004536 0.004409 -0.002470 0.005241 -0.007363 -0.002899 -0.005019 0.004687 0.005653 12 | 7 0.007231 0.000589 -0.006923 -0.001839 -0.006190 0.005150 0.001578 0.002015 0.007265 -0.001585 -0.003493 0.001320 0.001745 -0.007867 0.005591 0.001267 0.001693 -0.000748 -0.005579 -0.001408 0.002045 -0.003726 0.000926 0.005929 0.006384 -0.005539 -0.007278 -0.004035 -0.004750 -0.002844 0.005250 0.006505 -0.004947 -0.006748 -0.005143 -0.000841 -0.004432 -0.007172 -0.002663 -0.004578 -0.007541 0.002460 0.002718 0.004669 0.000034 0.004263 0.002695 -0.000146 0.005984 0.001428 -0.004891 0.005900 -0.000120 0.006773 -0.005419 0.004383 0.000932 -0.004897 0.005705 0.007283 0.002825 0.000434 0.001681 0.004078 13 | 4 -0.000232 0.007378 0.000096 0.003263 0.004755 -0.005174 -0.006897 0.007763 0.001109 -0.004385 0.006914 0.001942 0.005391 -0.002091 -0.002006 -0.001819 0.001604 -0.004826 0.002247 0.004714 0.002772 -0.001598 -0.004733 -0.002889 0.003730 0.001742 -0.005103 0.004086 0.006560 -0.003180 -0.003884 -0.005129 -0.001031 -0.000871 -0.003394 -0.002266 0.004967 0.004547 0.002830 0.003808 0.002437 -0.006375 0.006507 0.006863 0.003254 -0.006326 -0.001855 -0.006315 -0.005198 -0.004945 0.005684 -0.006095 0.001697 0.002838 -0.004059 0.002844 0.003780 0.001066 0.003287 -0.003222 0.007590 -0.004193 0.002991 0.000752 14 | 31 0.003514 0.002066 0.005884 0.005016 -0.001736 -0.006607 -0.001344 -0.007168 -0.006901 0.001854 0.006280 0.007873 0.006976 -0.006880 0.000108 -0.007043 -0.007627 0.003139 0.003113 -0.000320 0.001787 -0.003196 -0.005356 0.005961 0.002523 0.002061 0.004562 0.007210 0.000112 0.000430 -0.004946 0.007452 -0.007364 0.005790 0.004247 0.005746 -0.001480 -0.001193 0.006842 -0.007772 0.001052 0.004965 0.000481 -0.003540 -0.003215 -0.000064 -0.005402 -0.006123 -0.002420 0.003820 -0.006778 0.006048 0.000596 -0.003445 0.001753 -0.004031 0.002776 -0.003947 -0.007542 -0.006957 0.006451 -0.002450 -0.006599 -0.006253 15 | 13 0.005411 0.000665 0.002676 -0.005478 -0.002327 -0.001417 0.002329 -0.005114 0.001869 0.005424 -0.004285 -0.005046 0.003452 -0.006676 0.002306 -0.000779 -0.006054 -0.002712 -0.002609 -0.005817 0.007452 -0.000299 -0.007202 0.002033 -0.007359 -0.000140 0.007189 0.005149 0.001878 0.000208 -0.004799 0.004233 -0.002177 -0.004377 0.000346 -0.004761 0.007291 0.007539 0.001153 -0.001658 0.004021 0.000613 0.000189 0.003590 0.000029 0.006677 0.002315 0.001592 -0.003900 0.004965 -0.005223 0.005467 0.004245 0.005132 -0.005538 -0.005027 -0.006400 -0.005152 0.004826 -0.005223 0.004942 0.000970 -0.004546 -0.005648 16 | 29 -0.001836 -0.000331 -0.004461 -0.003108 0.003442 -0.004398 -0.000264 -0.006355 -0.000082 0.002621 0.003350 0.007045 -0.003864 -0.002814 0.000094 -0.004518 -0.007020 -0.007148 0.004332 -0.000920 0.004075 -0.003405 -0.006946 -0.001861 -0.005781 -0.003557 -0.000635 -0.003676 0.005952 0.004636 0.005546 -0.002196 0.004783 -0.000822 -0.005664 0.001787 0.007129 0.002960 0.003841 -0.007421 0.007068 0.004536 -0.007550 -0.007322 0.004361 0.007728 -0.003407 -0.001411 -0.004872 0.000643 -0.002352 -0.001558 -0.004509 0.000116 -0.007886 0.007788 0.007771 0.005209 0.003091 0.003208 0.002568 -0.006639 -0.004899 0.000346 17 | 27 0.004253 -0.007686 -0.005208 -0.006410 0.001269 0.002954 0.003298 0.006821 -0.005480 0.001740 -0.003885 -0.003446 -0.002392 0.006550 -0.000895 -0.004734 0.007091 -0.004609 0.000155 -0.007536 0.005924 0.003369 0.002816 0.007132 0.000656 -0.006510 -0.007185 0.006433 -0.003854 0.002853 0.001171 -0.006139 -0.003153 -0.005183 0.004265 -0.004802 -0.003721 -0.000825 -0.002768 -0.005715 -0.005885 -0.000063 -0.001670 0.001616 0.002434 0.005681 0.007582 -0.001049 0.000484 0.004634 0.005088 0.004600 -0.005022 0.006790 -0.001586 -0.006090 -0.001421 0.002430 0.006565 0.003390 -0.005016 0.007520 -0.004110 -0.000526 18 | 25 -0.005581 -0.002810 -0.004630 -0.002728 -0.006948 -0.002009 -0.004136 -0.005631 0.007267 -0.001051 -0.002902 -0.004502 0.000164 0.005955 -0.001649 -0.006622 0.004238 -0.004020 0.004513 0.006909 0.005885 -0.002178 0.005499 0.005240 0.002147 -0.006738 0.002165 -0.002275 -0.007401 0.004111 0.003919 0.000700 0.001487 -0.006628 -0.007106 0.002994 -0.006293 0.006144 -0.004574 -0.003884 -0.006380 0.004323 -0.004175 -0.006212 0.004373 0.007120 0.003226 0.006648 0.005462 0.000285 -0.007480 0.005458 0.004027 0.002952 -0.006685 -0.002955 0.005228 -0.007515 -0.001835 0.006380 -0.003472 0.004781 0.004088 -0.002935 19 | 24 0.004444 0.003087 0.005817 0.007223 0.002179 -0.001630 -0.000666 0.001618 0.002428 0.002930 0.006816 -0.000518 0.005646 -0.006353 -0.000545 -0.003615 -0.002722 -0.001467 0.001939 -0.001937 0.001625 0.002175 0.000391 0.006793 0.002519 0.006053 -0.002028 -0.004842 -0.001211 0.004818 -0.006737 0.001183 -0.007280 -0.002544 0.002372 -0.002112 0.007832 -0.004237 -0.000611 -0.004822 0.002052 0.007104 -0.000308 0.003687 -0.001318 0.003304 -0.005254 0.006320 0.002151 -0.004941 0.007265 0.002137 -0.006612 0.007026 0.006200 0.006560 0.005978 0.000997 -0.001938 -0.003119 -0.000363 -0.001179 0.002442 0.000805 20 | 10 0.001612 -0.000837 -0.004048 0.001702 0.003354 0.007451 0.004475 -0.000370 0.006533 -0.005567 0.004289 0.004831 -0.004897 0.005923 -0.002988 0.006128 0.002602 0.005140 -0.004156 0.001806 0.003915 -0.001107 0.001307 0.007427 -0.003204 -0.001795 0.002951 -0.005347 -0.002125 -0.004854 -0.006120 -0.006516 0.003066 0.005880 -0.006217 -0.001374 0.005423 -0.006841 -0.005782 -0.006627 -0.004465 -0.002668 0.006590 0.006572 0.006302 0.007702 0.007637 -0.001759 0.007791 -0.006139 0.000717 0.002300 0.006215 -0.003264 -0.002519 0.004708 0.003342 -0.007016 -0.006899 0.003403 0.002796 -0.000357 -0.007401 0.002012 21 | 19 0.006815 -0.004017 0.006779 -0.003960 -0.002012 0.002004 -0.001893 -0.004692 0.004098 0.000712 -0.006357 0.004917 -0.002201 0.001681 -0.002585 0.002046 -0.006168 -0.006583 0.005068 0.007071 0.002336 -0.006520 0.000495 0.003675 0.000069 0.006424 -0.002191 0.000879 -0.002524 -0.002729 0.000213 0.002582 0.002279 -0.007601 -0.002207 -0.000327 -0.007596 0.005052 0.005279 0.002122 0.000538 -0.001298 -0.000141 -0.002736 0.006170 0.003322 0.005079 -0.005399 -0.001773 0.002904 -0.003036 -0.001341 -0.006719 -0.003569 0.006891 0.004725 -0.005981 0.003851 0.007479 -0.002480 -0.001127 -0.004701 0.006117 0.005035 22 | 21 -0.006290 0.007171 0.000491 -0.007269 -0.007097 -0.001490 -0.002065 -0.005225 0.001018 0.007446 0.000815 0.000657 0.001385 -0.006845 -0.000713 0.000943 0.000335 -0.000005 0.004185 0.001455 0.004749 -0.000861 -0.002475 0.003904 0.000941 -0.002479 0.003027 0.000973 0.000660 -0.000273 0.005684 -0.002064 0.000313 0.003428 -0.006299 -0.004861 -0.001677 0.007344 0.005456 0.002533 -0.005884 0.007273 -0.005516 -0.004695 -0.006341 0.002516 0.001422 0.001941 0.005820 -0.001464 0.006458 0.004056 0.005451 -0.004961 -0.002360 0.003329 0.000654 -0.006689 -0.007392 0.006189 -0.004380 0.000207 0.001407 0.005851 23 | 17 -0.000805 -0.001858 0.001563 0.005045 -0.007587 -0.005245 0.004436 0.003987 0.007311 0.006428 0.002965 0.007797 0.003695 0.004525 -0.004043 0.005866 0.000159 -0.007505 -0.003967 0.004580 -0.000314 -0.007063 0.005446 -0.005035 0.006034 0.004694 -0.002317 -0.000289 0.003683 0.006034 0.001954 0.000489 0.004240 -0.001633 -0.003606 -0.006311 0.004941 0.003475 -0.001698 0.004137 -0.001954 -0.006978 -0.004432 0.005188 -0.003580 -0.003553 0.002928 0.004854 -0.001225 0.003305 -0.004078 -0.004074 0.001069 -0.007673 0.004450 -0.002850 -0.007208 0.007420 0.005413 -0.007028 0.001998 -0.002477 0.003945 -0.003327 24 | 12 -0.007487 -0.006461 0.000734 -0.005455 -0.004133 -0.002887 -0.005039 -0.007241 -0.004787 0.005245 0.007472 0.007930 -0.001857 0.003738 0.000486 0.005294 -0.007075 0.002303 0.004846 -0.007232 0.002988 0.006151 0.002956 -0.003983 -0.007267 -0.003172 0.004247 0.002267 0.000977 -0.001017 0.003448 0.003090 -0.007724 0.002328 0.002331 0.005420 -0.002137 0.004986 0.000124 0.003881 0.000034 -0.000366 0.003974 -0.002142 -0.002151 0.007461 0.003480 -0.001789 -0.006460 -0.002806 -0.003331 0.003651 -0.001464 -0.007562 -0.006450 -0.006570 0.000931 0.001038 -0.000241 0.003160 0.005229 0.002989 -0.005202 0.004523 25 | 28 -0.006453 -0.005362 0.007761 0.000532 -0.000072 0.006119 0.000923 -0.002028 -0.005412 -0.000623 0.003060 0.005510 -0.000719 0.005182 -0.004488 -0.000974 -0.001562 -0.004173 0.004524 0.003636 -0.006974 -0.000429 -0.000741 -0.001900 -0.003781 0.005327 -0.006756 0.002702 0.006833 -0.000804 -0.001444 0.006330 0.003936 0.006656 0.000815 -0.006766 0.004657 0.002890 0.002997 0.001775 -0.002791 -0.005930 0.004435 0.005496 -0.006527 -0.004849 -0.007433 -0.001649 -0.006909 -0.004759 -0.007589 0.001196 0.004266 -0.005255 0.006729 -0.000715 -0.005777 -0.005872 -0.007337 0.000569 -0.000324 -0.004426 0.005280 0.004539 26 | 30 0.006450 -0.006602 -0.006470 0.007508 -0.007411 0.002677 0.005532 -0.003438 -0.000869 0.003603 -0.007661 -0.000196 -0.002458 -0.003536 -0.003527 -0.005002 0.002278 -0.000825 -0.007525 0.007139 0.001920 -0.004079 0.007442 -0.007487 0.002900 -0.004323 0.005595 -0.000416 -0.006550 0.003826 -0.001124 -0.004998 0.001964 -0.003559 -0.004629 0.005967 0.000512 0.005494 0.001969 -0.006774 0.000364 0.002805 0.002378 0.006680 0.000603 0.001334 0.000273 -0.006066 -0.001336 0.003065 -0.000963 0.004818 0.003999 -0.007662 0.004457 0.002415 -0.007687 0.002452 0.004542 0.000971 0.001889 -0.006201 0.006273 0.001721 27 | 9 -0.002332 0.001004 -0.006596 -0.001853 0.000143 0.000039 -0.001011 0.003810 0.007449 -0.001148 0.001599 0.006214 0.000881 -0.007223 -0.002180 -0.001882 -0.003600 0.005312 -0.003804 0.002594 -0.007113 0.006565 0.007584 -0.007217 -0.002997 0.004866 -0.003121 -0.007493 -0.001807 0.003822 0.002552 0.006745 -0.004186 -0.004359 0.007222 -0.004334 0.006153 -0.000185 -0.007109 0.005654 -0.006752 -0.006942 0.002905 0.006239 -0.005617 -0.004112 0.000401 -0.004360 -0.003682 -0.007461 -0.004999 0.001778 -0.004822 0.001616 0.006666 -0.002349 -0.003330 -0.000475 -0.001609 -0.004454 0.000416 -0.000662 -0.005729 0.003361 28 | 22 0.004995 -0.003821 -0.001618 0.000185 -0.001132 -0.005285 -0.002092 -0.002258 -0.004816 -0.002293 -0.002733 0.002318 0.004246 -0.004071 0.004531 -0.002661 -0.005298 -0.002285 0.007371 0.006533 0.005129 -0.000605 0.006707 0.000895 -0.006039 0.006814 -0.002077 -0.007724 0.003852 0.005249 -0.000144 -0.005182 -0.005828 0.000175 -0.002400 -0.005530 -0.001763 -0.002895 -0.004554 -0.005997 0.006999 -0.007293 -0.006060 0.005814 0.004329 0.007681 -0.006722 0.004726 0.005530 -0.003529 -0.004812 -0.004149 -0.006724 -0.002902 0.006829 0.001219 0.005341 0.006278 0.004154 -0.006066 0.003256 0.003163 0.003648 0.006602 29 | 14 -0.003291 -0.004648 0.007503 -0.006162 0.004069 0.002658 -0.004347 -0.002671 -0.004240 0.004084 -0.007562 0.003386 -0.002824 -0.000623 0.000260 0.002647 0.004328 -0.000131 0.001713 -0.004575 0.005180 0.007650 -0.000343 0.004455 -0.002543 0.003778 0.005761 0.007713 -0.007065 0.005940 0.003898 0.002296 -0.002676 0.005868 0.004923 -0.002743 -0.002844 -0.000026 -0.002494 -0.000474 -0.006666 0.000534 0.003495 0.004724 -0.000608 0.002061 0.002213 0.004607 0.000036 0.007329 -0.002940 -0.000174 -0.002226 0.004337 0.000292 -0.002932 -0.006332 0.007393 -0.005270 -0.004015 -0.000654 -0.005887 0.004734 -0.006574 30 | 26 0.006286 0.004336 0.001610 -0.002614 0.003376 -0.004401 0.003670 -0.003138 -0.007676 -0.003378 -0.002698 -0.006483 0.006783 -0.005747 -0.002301 0.005131 0.003069 -0.000270 -0.002681 0.002874 0.006999 0.001702 0.006485 0.003935 -0.003856 -0.003501 -0.007458 -0.004704 -0.003845 0.006157 0.004218 0.001565 0.002460 -0.006961 -0.007218 -0.006250 0.003610 -0.002640 -0.002971 -0.002425 -0.000785 -0.006453 -0.005981 -0.002273 -0.003678 0.007911 -0.006691 -0.000017 -0.003211 0.005904 0.006497 0.004369 -0.003293 -0.006103 0.000188 -0.001420 0.003048 0.002123 -0.006433 0.001115 0.003648 -0.003427 0.001132 0.007059 31 | 11 -0.006362 -0.005192 0.002886 0.002464 0.004037 -0.003131 -0.005596 0.006595 0.005251 -0.006325 -0.001304 0.001757 -0.004191 0.004569 0.007368 -0.006643 -0.002877 -0.001545 0.000943 -0.006233 0.005166 -0.002105 0.004753 0.001206 -0.002284 -0.004423 -0.003676 0.000691 -0.005196 -0.005590 0.004176 -0.001031 0.004539 -0.007266 0.003801 0.006835 0.007163 0.004975 0.000213 0.006967 0.004835 0.005569 0.007318 -0.000623 -0.002794 0.004574 0.005941 0.005425 -0.002884 0.003488 0.003192 0.001828 -0.000019 -0.000082 -0.000456 -0.005479 0.001681 0.005151 -0.007530 0.002259 -0.000437 -0.004724 0.007445 0.005527 32 | 15 -0.007144 -0.004589 0.005440 -0.006179 0.001889 0.007453 0.003841 -0.007120 0.000180 -0.002820 0.003108 -0.000693 0.000487 0.003039 0.003258 -0.002843 -0.005756 -0.004031 -0.005798 -0.006886 0.006487 0.003788 0.000654 0.002199 -0.006042 0.000228 -0.004834 -0.000321 0.000822 -0.005565 -0.004407 0.004124 0.006213 -0.001271 0.000833 0.000097 -0.003526 -0.000163 0.007416 0.002853 -0.005397 0.000036 -0.001563 0.000526 0.005333 0.007091 -0.007608 -0.001338 -0.001696 -0.003241 -0.000798 0.000433 -0.001098 0.003578 -0.005458 -0.007469 -0.007808 -0.007265 0.002017 0.005736 0.006579 0.006445 0.001277 -0.006609 33 | 18 0.002695 0.000593 -0.006548 -0.002196 0.003440 -0.002041 -0.006942 0.003790 0.005236 0.007774 -0.001520 0.005521 0.006164 -0.000715 0.002006 -0.006704 -0.002786 0.003254 -0.005913 0.007430 0.007654 -0.001001 0.007752 0.001180 0.006517 0.006747 0.001247 -0.000204 0.003568 0.002497 -0.000086 0.004835 0.002776 0.004873 0.003643 -0.004004 -0.002602 -0.000722 0.002219 0.006294 -0.002464 0.007576 -0.006393 -0.002848 -0.001854 0.005657 -0.007019 0.000711 0.003275 -0.003880 -0.000733 -0.006183 0.006241 0.006848 -0.001189 0.007024 0.001326 -0.007188 -0.003712 -0.002433 0.003663 -0.007240 0.000197 -0.001163 34 | 20 -0.001053 -0.003011 0.000017 0.001690 -0.005653 -0.007223 0.006901 0.003630 -0.005688 -0.003080 -0.004516 -0.005313 0.007834 -0.001070 -0.002811 -0.001685 -0.005889 0.002655 -0.002669 -0.006123 0.004334 -0.003477 0.005335 0.000674 0.003000 0.005695 0.007625 -0.001945 0.002452 -0.006314 -0.005893 -0.002448 0.002081 -0.001832 0.005210 0.003506 0.007299 -0.005700 -0.005260 -0.007870 -0.007060 -0.004193 -0.004984 0.000298 0.003631 -0.000112 0.006460 0.000586 0.002208 -0.005538 -0.007343 0.000543 0.004959 -0.007016 -0.001529 0.001269 0.000650 -0.007453 -0.006350 0.004478 0.007833 0.003153 0.002670 0.002527 35 | 16 -0.000864 0.002679 0.001650 0.000220 -0.002124 -0.000500 0.004957 -0.001929 0.000288 0.001392 -0.004610 0.003240 -0.003466 0.006099 0.006136 -0.004495 -0.007130 -0.007880 -0.007591 -0.006826 -0.001197 0.002529 0.004923 -0.002593 0.003040 0.000781 0.002438 -0.001011 -0.002680 -0.007021 -0.005411 0.003896 -0.002359 -0.007297 0.003286 0.003342 0.007266 -0.005902 -0.004568 -0.007530 -0.004669 0.006573 0.004710 0.000568 0.003750 -0.002687 0.004342 0.005515 -0.005616 0.007644 0.005839 0.001733 0.001503 0.003434 0.003169 0.000472 -0.000513 0.001180 0.001144 -0.006810 0.004367 -0.007632 -0.002515 -0.007255 36 | -------------------------------------------------------------------------------- /docker-snap/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM asgard/python-ml:latest 2 | 3 | # DL snap repo 4 | RUN git clone https://github.com/snap-stanford/snap 5 | 6 | # install snap 7 | RUN cd snap && make all 8 | 9 | 10 | -------------------------------------------------------------------------------- /helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: ThomasO 3 | import random 4 | import numpy as np 5 | import scipy.sparse as sparse 6 | 7 | 8 | def read_csr_matrix(path, comments="#", delimiter=None, weight=False, 9 | make_sym=False): 10 | """ """ 11 | dtype = [('f0', int), ('f1', int)] 12 | usecols = (0, 1) 13 | if weight: 14 | type.append(('weight', float)) 15 | usecols = (0, 1, 2) 16 | arr_list = np.loadtxt(path, comments=comments, delimiter=delimiter, 17 | dtype=dtype, usecols=usecols, unpack=True) 18 | if make_sym: 19 | tmp = arr_list[0] 20 | arr_list[0] = np.concatenate((arr_list[0], arr_list[1]), axis=0) 21 | arr_list[1] = np.concatenate((arr_list[1], tmp), axis=0) 22 | del tmp 23 | # 24 | n = max(arr_list[0].max() + 1, arr_list[1].max() + 1) 25 | if len(usecols) == 2: 26 | arr_list.append(np.ones([arr_list[0].shape[0]])) 27 | # 28 | csr = sparse.csr_matrix((arr_list[2], (arr_list[0], arr_list[1])), 29 | shape=(n, n), dtype=float) 30 | return csr 31 | 32 | 33 | def itershuffle(iterable, bufsize=1000): 34 | """ 35 | Shuffle an iterator. This works by holding `bufsize` items back 36 | and yielding them sometime later. This is NOT 100% random, 37 | proved or anything. 38 | """ 39 | iterable = iter(iterable) 40 | buf = [] 41 | try: 42 | while True: 43 | for i in xrange(random.randint(1, bufsize - len(buf))): 44 | buf.append(iterable.next()) 45 | random.shuffle(buf) 46 | for i in xrange(random.randint(1, bufsize)): 47 | if buf: 48 | yield buf.pop() 49 | else: 50 | break 51 | except StopIteration: 52 | random.shuffle(buf) 53 | while buf: 54 | yield buf.pop() 55 | raise StopIteration 56 | -------------------------------------------------------------------------------- /make_random_walks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: ThomasO 3 | from random_walk import RandomWalksGeneratorCSR 4 | from helper import read_csr_matrix 5 | import argparse 6 | 7 | 8 | parser = argparse.ArgumentParser(description="Compute Random Walks") 9 | 10 | 11 | parser.add_argument( 12 | '--input', type=str, 13 | help=u"path to your edgelist representing the graph") 14 | 15 | parser.add_argument( 16 | '--output', type=str, 17 | help=u"path a folder to store computed random walk files ") 18 | 19 | parser.add_argument( 20 | '--num-walks', type=int, default=10, 21 | help=u"Number of walks per node") 22 | 23 | parser.add_argument( 24 | '--walk-length', type=int, default=15, 25 | help=u"Number of walks per node") 26 | 27 | parser.add_argument( 28 | '--p', type=float, default=1.0, 29 | help=u"Node2vec parameter p. Default to 1.0") 30 | 31 | parser.add_argument( 32 | '--q', type=float, default=1.0, 33 | help=u"Node2vec parameter q. Default to 1.0") 34 | 35 | parser.add_argument( 36 | '--make-sym', type=bool, default=False, 37 | help=u"Flag to duplicate edge symetrically. Make the the undirected") 38 | 39 | parser.add_argument( 40 | '--worker', type=int, default=2, 41 | help=u"Number of worker to use.") 42 | 43 | 44 | def main(input, output, make_sym, walk_length, num_walks, 45 | p, q, worker=4): 46 | 47 | # load karate graph in csr matrix 48 | csr = read_csr_matrix(input, make_sym=make_sym) 49 | 50 | # init RW generator 51 | RWG = RandomWalksGeneratorCSR(P=csr, walk_length=walk_length, 52 | num_walks=num_walks, p=p, q=q, 53 | preprocess=True) 54 | 55 | # preprocess transition 56 | # RWG.preprocess_transition() 57 | RWG.write_walks(output, worker, writer=1) 58 | 59 | 60 | if __name__ == '__main__': 61 | args = parser.parse_args() 62 | main(**args.__dict__) 63 | -------------------------------------------------------------------------------- /random_walk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: ThomasO 3 | from functools import partial 4 | from smart_open import smart_open 5 | import numpy as np 6 | import random 7 | from helper import itershuffle 8 | from scipy.sparse import csr_matrix 9 | from sklearn.preprocessing import normalize 10 | import multiprocessing as mp 11 | import os 12 | import sys 13 | 14 | 15 | class RandomWalksGeneratorNX(object): 16 | """ """ 17 | 18 | def __init__(self, graph, walk_length, num_walks, 19 | p=1, q=1): 20 | """ """ 21 | # handle nx graph 22 | self.G = graph 23 | self.walk_length = walk_length 24 | self.num_walks = num_walks 25 | self.p = p 26 | self.q = q 27 | if p == 1 and q == 1: 28 | self._make_RW = make_random_walk 29 | else: 30 | self._make_RW = partial(make_random_walk_biased, 31 | p=self.p, q=self.q) 32 | 33 | def __iter__(self, shuffle=True, buffsize=1000): 34 | """ """ 35 | # G.nodes_iter 36 | for n in range(self.num_walks): 37 | # nodes_iter 38 | nodes_iter = self.G.nodes_iter() 39 | if shuffle: 40 | it = itershuffle(nodes_iter, buffsize) 41 | else: 42 | it = nodes_iter 43 | # for node in node_iter 44 | for node in it: 45 | walk = self._make_RW(self.G, node, self.walk_length) 46 | yield [str(n) for n in walk] 47 | 48 | def write_walks(self, path): 49 | with smart_open(path, 'wb') as fout: 50 | for walk in self: 51 | fout.write(" ".join(walk)) 52 | 53 | 54 | class RandomWalksGeneratorCSR(object): 55 | """ """ 56 | 57 | def __init__(self, P=None, path=None, walk_length=15, num_walks=10, 58 | p=1, q=1, preprocess=False): 59 | """ """ 60 | if path is not None: 61 | self.path = path 62 | if P is not None: 63 | self.P = P 64 | self.walk_length = walk_length 65 | self.num_walks = num_walks 66 | self.p = p 67 | self.q = q 68 | self.n = P.shape[0] 69 | # preprocess transition for node2vec if needed 70 | if preprocess: 71 | self.P = preprocess_transition(P, p=p, q=q) 72 | if P is None and path is None: 73 | raise ValueError( 74 | "Need a csr matrix P or a path to file that containes RWs") 75 | # 76 | self._make_RW = make_random_walk_csr 77 | 78 | def preprocess_transition(self): 79 | """ """ 80 | self.P = preprocess_transition(self.P, p=self.p, q=self.q) 81 | 82 | def __iter__(self, buffsize=1000, shuffle=True): 83 | """ """ 84 | # if walk already computed and stored locally 85 | if hasattr(self, "path"): 86 | for f in filter(lambda x: not x.startswith("."), 87 | os.listdir(self.path)): 88 | fpath = os.path.join(self.path, f) 89 | with open(fpath) as f: 90 | for line in f: 91 | walk = line.split() 92 | yield walk 93 | # otherwise compute it on the fly 94 | # slower but ok for small graph 95 | else: 96 | # G.nodes_iter 97 | for n in range(self.num_walks): 98 | # nodes_iter 99 | nodes_iter = xrange(self.P.shape[0]) 100 | if shuffle: 101 | it = itershuffle(nodes_iter, buffsize) 102 | else: 103 | it = nodes_iter 104 | # for node in node_iter 105 | for node in it: 106 | walk = self._make_RW(self.P, node, self.walk_length) 107 | yield [str(n) for n in walk] 108 | 109 | def write_walks(self, path, chunck_size=10000, worker=4, writer=1): 110 | """ """ 111 | # check if path exist 112 | if not os.path.exists(path): 113 | os.mkdir(path) 114 | 115 | # queue object 116 | input_q = mp.JoinableQueue() 117 | output_q = mp.JoinableQueue() 118 | # unpack 119 | P = self.P 120 | walk_length = self.walk_length 121 | # instantiate workers 122 | rwg_workers = [RWGWorker(input_q, output_q, walk_length, 123 | P.data, P.indices, P.indptr, P.shape) 124 | for i in range(worker)] 125 | # instantiate writers 126 | writers = [Writer(task_queue=output_q, path_root=path) 127 | for i in range(writer)] 128 | workers = rwg_workers + writers 129 | 130 | # start workers 131 | for w in workers: 132 | w.start() 133 | 134 | # give work to do 135 | n = self.n 136 | n_chunck = n // chunck_size 137 | for k in xrange(self.num_walks): 138 | s = 0 139 | for k in xrange(n_chunck): 140 | input_q.put(range(s, s + chunck_size)) 141 | s = s + chunck_size 142 | input_q.put(range(s, n)) 143 | 144 | # Add a poison pill for each consumer 145 | for i in xrange(len(workers)): 146 | input_q.put(None) 147 | 148 | # Wait for all of the tasks to finish 149 | # output_q.join() 150 | # input_q.join() 151 | for w in workers: 152 | w.join() 153 | # set the rootpath as attribute 154 | self.path = path 155 | 156 | 157 | def make_random_walk_nx(G, start_node, length): 158 | """ """ 159 | # first node of the RW 160 | walk = [start_node] 161 | # 162 | while len(walk) < length: 163 | current_node = walk[-1] 164 | neighbors = G.neighbors(current_node) 165 | d = G.degree(current_node, weight="weight") 166 | p = [1.0 * G[current_node][neighbor]["weight"] / d 167 | for neighbor in neighbors] 168 | walk.append(int(np.random.choice(neighbors, 1, p=p)[0])) 169 | return walk 170 | 171 | 172 | def make_random_walk_csr(P, start_node, length): 173 | """ 174 | Assusme that P is the probability transition matrix 175 | """ 176 | # first node of the RW 177 | walk = [start_node] 178 | while len(walk) < length: 179 | current_node = walk[-1] 180 | nbrs = P.getrow(current_node) 181 | nbrs_idx = nbrs.indices 182 | if len(nbrs_idx) == 0: 183 | break 184 | p = nbrs.data 185 | walk.append(int(np.random.choice(nbrs_idx, p=p))) 186 | return walk 187 | 188 | 189 | def alpha(p, q, t, x, P): 190 | if t == x: 191 | return 1.0 / p 192 | elif P[t, x] > 0: 193 | return 1.0 194 | else: 195 | return 1.0 / q 196 | 197 | 198 | def preprocess_transition(M, p=1, q=1): 199 | """ """ 200 | # deep walk case 201 | if p == 1 and q == 1: 202 | P = normalize(M, norm='l1', axis=1) 203 | # node2vec case 204 | else: 205 | P = M.copy() 206 | for src in xrange(M.shape[0]): 207 | src_row = M.getrow(src) 208 | src_nbr = src_row.indices 209 | for dst in src_nbr: 210 | dst_row = M.getrow(dst) 211 | dst_nbr = dst_row.indices 212 | dst_prob = dst_row.data 213 | dst_ptr = dst_row.indptr 214 | # compute coef alpha of transition 215 | alphas = np.array([alpha(p, q, src, x, P) for x in dst_nbr]) 216 | dst_prob = alphas * dst_prob 217 | # update transition matrix P 218 | P[dst] = csr_matrix((dst_prob, dst_nbr, dst_ptr), 219 | shape=(1, P.shape[1])) 220 | # normalize probability 221 | P = normalize(P, norm='l1', axis=1) 222 | return P 223 | 224 | 225 | def make_random_walk_biased(G, start_node, length, p, q): 226 | """ """ 227 | # first node of the RW 228 | walk = [start_node] 229 | 230 | while len(walk) < length: 231 | current_node = walk[-1] 232 | neighbors = G.neighbors(current_node) 233 | if len(walk) == 1: 234 | d = G.degree(current_node, weight="weight") 235 | prob = [1.0 * G[current_node][neighbor]["weight"] / d 236 | for neighbor in neighbors] 237 | else: 238 | prev = walk[-2] 239 | prob = [] 240 | for nbr in neighbors: 241 | if nbr == prev: 242 | pr = 1.0 * G[current_node][nbr]["weight"] / p 243 | elif G.has_edge(prev, nbr): 244 | pr = 1.0 * G[current_node][nbr]["weight"] 245 | else: 246 | pr = 1.0 * G[current_node][nbr]["weight"] / q 247 | prob.append(pr) 248 | # normalize 249 | prob = np.array(prob) / sum(prob) 250 | 251 | # sample a next node according to prob 252 | walk.append(int(np.random.choice(neighbors, 1, p=prob)[0])) 253 | return walk 254 | 255 | 256 | class RWGWorker(mp.Process): 257 | """ Worker that generate RW """ 258 | def __init__(self, input_q, output_q, walk_length, data, indices, 259 | indptr, shape, shuffle=False): 260 | """ """ 261 | super(RWGWorker, self).__init__() 262 | 263 | # instantiate the CSR matrix 264 | self.P = csr_matrix((data, indices, indptr), shape=shape, copy=False) 265 | self.input_q = input_q 266 | self.output_q = output_q 267 | self.walk_length = walk_length 268 | self.shuffle = shuffle 269 | 270 | def run(self): 271 | """ """ 272 | # nodes_iter 273 | # node = self.input_q.get() 274 | # if node is not None: 275 | try: 276 | for list_of_nodes in iter(self.input_q.get, None): 277 | # it = itershuffle(xrange(self.P.shape[0]), bufsize=10000) 278 | if list_of_nodes is not None: 279 | random.shuffle(list_of_nodes) 280 | for node in list_of_nodes: 281 | walk = make_random_walk_csr(self.P, node, self.walk_length) 282 | if len(walk) > 2: 283 | walk = [str(node) for node in walk] 284 | # send result in output queue 285 | self.output_q.put(walk) 286 | self.input_q.task_done() 287 | # 288 | self.output_q.put(None) 289 | except KeyboardInterrupt: 290 | sys.exit(0) 291 | 292 | 293 | class Writer(mp.Process): 294 | """ """ 295 | c = 0 296 | 297 | def __init__(self, task_queue, path_root): 298 | """ """ 299 | super(Writer, self).__init__() 300 | self.task_queue = task_queue 301 | self.path_root = path_root 302 | self.id = self._count() 303 | self.path = os.path.join(path_root, "part-%s" % self.c) 304 | 305 | def run(self): 306 | """ """ 307 | try: 308 | proc_name = self.name 309 | with open(self.path, "w") as fout: 310 | for walk in iter(self.task_queue.get, None): 311 | fout.write(" ".join(walk) + "\n") 312 | self.task_queue.task_done() 313 | return 314 | except KeyboardInterrupt: 315 | sys.exit(0) 316 | except Exception, e: 317 | print e.message, e.args 318 | 319 | @classmethod 320 | def _count(cls): 321 | Writer.c += 1 322 | return Writer.c 323 | -------------------------------------------------------------------------------- /start.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: ThomasO 3 | 4 | import networkx as nx 5 | import numpy as np 6 | from gensim.models import Word2Vec 7 | import random 8 | import logging 9 | import sys 10 | from sklearn import datasets 11 | from sklearn.manifold import TSNE 12 | from collections import defaultdict 13 | from six import iteritems, itervalues 14 | from math import sqrt 15 | from concurrent.futures import ProcessPoolExecutor 16 | from functools import partial 17 | from smart_open import smart_open 18 | 19 | 20 | # start with karate graph 21 | path = "./karate.edgelist" 22 | 23 | # read graph edge list 24 | 25 | G = nx.read_edgelist(path, nodetype=int, create_using=nx.DiGraph()) 26 | 27 | # no weight so default weight of 1 28 | for edge in G.edges(): 29 | G[edge[0]][edge[1]]['weight'] = 1 30 | 31 | # not directed graph 32 | G = G.to_undirected() 33 | 34 | 35 | # access nodes, and edges 36 | G.nodes() # return list of nodes 37 | G.edges() # return list of edges tuple 38 | G.is_directed() # shoulb be false :) 39 | # adjacency matrix 40 | M = nx.adjacency_matrix(G) 41 | 42 | 43 | make_random_walk(G, 2, 5) 44 | 45 | 46 | def make_random_walk(G, start_node, length, alpha=None): 47 | """ """ 48 | # first node of the RW 49 | walk = [start_node] 50 | # 51 | while len(walk) < length: 52 | current_node = walk[-1] 53 | neighbors = G.neighbors(current_node) 54 | d = G.degree(current_node, weight="weight") 55 | p = [1.0 * G[current_node][neighbor]["weight"] / d 56 | for neighbor in neighbors] 57 | walk.append(int(np.random.choice(neighbors, 1, p=p)[0])) 58 | return walk 59 | 60 | 61 | def make_random_walk_biased(G, start_node, length, p, q): 62 | """ """ 63 | # first node of the RW 64 | walk = [start_node] 65 | 66 | while len(walk) < length: 67 | current_node = walk[-1] 68 | neighbors = G.neighbors(current_node) 69 | if len(walk) == 1: 70 | d = G.degree(current_node, weight="weight") 71 | prob = [1.0 * G[current_node][neighbor]["weight"] / d 72 | for neighbor in neighbors] 73 | else: 74 | prev = walk[-2] 75 | prob = [] 76 | for nbr in neighbors: 77 | if nbr == prev: 78 | pr = 1.0 * G[current_node][nbr]["weight"] / p 79 | elif G.has_edge(prev, nbr): 80 | pr = 1.0 * G[current_node][nbr]["weight"] 81 | else: 82 | pr = 1.0 * G[current_node][nbr]["weight"] / q 83 | prob.append(pr) 84 | # normalize 85 | prob = np.array(prob) / sum(prob) 86 | 87 | # sample a next node according to prob 88 | walk.append(int(np.random.choice(neighbors, 1, p=prob)[0])) 89 | return walk 90 | 91 | 92 | class RandomWalksGenerator(object): 93 | """ """ 94 | def __init__(self, graph, walk_length, num_walks, p=1, q=1): 95 | """ """ 96 | self.G = graph 97 | self.walk_length = walk_length 98 | self.num_walks = num_walks 99 | self.p = p 100 | self.q = q 101 | if p == 1 and q == 1: 102 | self._make_RW = make_random_walk 103 | else: 104 | self._make_RW = partial(make_random_walk_biased, 105 | p=self.p, q=self.q) 106 | 107 | def __iter__(self, buffsize=1000): 108 | """ """ 109 | # G.nodes_iter 110 | for n in range(self.num_walks): 111 | # nodes_iter 112 | nodes_iter = G.nodes_iter() 113 | # for node in node_iter 114 | for node in itershuffle(nodes_iter, buffsize): 115 | walk = self._make_RW(self.G, node, self.walk_length) 116 | yield [str(n) for n in walk] 117 | 118 | def write_walks(self, path): 119 | with smart_open(path, 'wb') as fout: 120 | for walk in self: 121 | fout.write(" ".join(walk)) 122 | 123 | 124 | def itershuffle(iterable, bufsize=1000): 125 | """ 126 | Shuffle an iterator. This works by holding `bufsize` items back 127 | and yielding them sometime later. This is NOT 100% random, 128 | proved or anything. 129 | """ 130 | iterable = iter(iterable) 131 | buf = [] 132 | try: 133 | while True: 134 | for i in xrange(random.randint(1, bufsize - len(buf))): 135 | buf.append(iterable.next()) 136 | random.shuffle(buf) 137 | for i in xrange(random.randint(1, bufsize)): 138 | if buf: 139 | yield buf.pop() 140 | else: 141 | break 142 | except StopIteration: 143 | random.shuffle(buf) 144 | while buf: 145 | yield buf.pop() 146 | raise StopIteration 147 | 148 | 149 | def deep_walk(G, walk_length=10, num_walks=20, dim=2, iter=50): 150 | RWG = RandomWalksGenerator(G, walk_length=walk_length, num_walks=num_walks) 151 | skipgram = Word2Vec(sg=1, iter=iter, min_count=0, size=dim, batch_words=100) 152 | skipgram.build_vocab(RWG) 153 | skipgram.train(RWG) 154 | return skipgram 155 | 156 | class MyWord2Vec(Word2Vec): 157 | """ 158 | """ 159 | def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, 160 | progress_per=10000, update=False): 161 | """ 162 | Build vocabulary from a sequence of sentences 163 | (can be a once-only generator stream). 164 | Each sentence must be a list of unicode strings. 165 | 166 | """ 167 | self.scan_vocab(sentences, progress_per=progress_per, 168 | trim_rule=trim_rule, update=update) 169 | self.scale_vocab(keep_raw_vocab=keep_raw_vocab, 170 | trim_rule=trim_rule, update=update) 171 | self.finalize_vocab(update=update) 172 | 173 | def scan_vocab(self, sentences, progress_per=10000, trim_rule=None, 174 | update=False): 175 | """Do an initial scan of all words appearing in sentences.""" 176 | logger.info("collecting all nodes and their counts") 177 | sentence_no = -1 178 | total_words = 0 179 | vocab = defaultdict(int) 180 | 181 | for sentence_no, sentence in enumerate(sentences): 182 | for word in sentence: 183 | vocab[word] += 1 184 | total_words += sum(itervalues(vocab)) 185 | logger.info("collected %i word types from a corpus of %i raw words and %i sentences", 186 | len(vocab), total_words, sentence_no + 1) 187 | self.corpus_count = sentence_no + 1 188 | self.raw_vocab = vocab 189 | self.total_words = total_words 190 | 191 | def scale_vocab(self, sample=None, dry_run=False, 192 | keep_raw_vocab=False, trim_rule=None, update=False): 193 | """ 194 | Apply vocabulary settings for `min_count` (discarding less-frequent words) 195 | and `sample` (controlling the downsampling of more-frequent words). 196 | 197 | Calling with `dry_run=True` will only simulate the provided settings and 198 | report the size of the retained vocabulary, effective corpus length, and 199 | estimated memory requirements. Results are both printed via logging and 200 | returned as a dict. 201 | 202 | Delete the raw vocabulary after the scaling is done to free up RAM, 203 | unless `keep_raw_vocab` is set. 204 | 205 | """ 206 | sample = sample or self.sample 207 | 208 | logger.info("Loading a fresh vocabulary") 209 | 210 | # Discard words less-frequent than min_count 211 | if not dry_run: 212 | self.index2word = [] 213 | # make stored settings match these applied settings 214 | self.sample = sample 215 | self.vocab = {} 216 | 217 | for word, v in iteritems(self.raw_vocab): 218 | if not dry_run: 219 | self.vocab[word] = Vocab(count=v, index=len(self.index2word)) 220 | self.index2word.append(word) 221 | 222 | retain_total = self.total_words 223 | 224 | # Precalculate each vocabulary item's threshold for sampling 225 | if not sample: 226 | # no words downsampled 227 | threshold_count = retain_total 228 | elif sample < 1.0: 229 | # traditional meaning: set parameter as proportion of total 230 | threshold_count = sample * retain_total 231 | else: 232 | # new shorthand: sample >= 1 means downsample all words with 233 | # higher count than sample 234 | threshold_count = int(sample * (3 + sqrt(5)) / 2) 235 | 236 | downsample_total, downsample_unique = 0, 0 237 | for w in self.raw_vocab.iterkeys(): 238 | v = self.raw_vocab[w] 239 | word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) 240 | if word_probability < 1.0: 241 | downsample_unique += 1 242 | downsample_total += word_probability * v 243 | else: 244 | word_probability = 1.0 245 | downsample_total += v 246 | if not dry_run: 247 | self.vocab[w].sample_int = int(round(word_probability * 2**32)) 248 | 249 | if not dry_run and not keep_raw_vocab: 250 | logger.info("deleting the raw counts dictionary of %i items", 251 | len(self.raw_vocab)) 252 | self.raw_vocab = defaultdict(int) 253 | 254 | logger.info("sample=%g downsamples %i most-common words", 255 | sample, downsample_unique) 256 | logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", 257 | downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) 258 | 259 | # print extra memory estimates 260 | memory = self.estimate_memory(vocab_size=len(self.vocab)) 261 | 262 | return memory 263 | 264 | def finalize_vocab(self, update=False): 265 | """ 266 | Build tables and model weights based on final vocabulary settings. 267 | """ 268 | if not self.index2word: 269 | self.scale_vocab() 270 | if self.sorted_vocab and not update: 271 | self.sort_vocab() 272 | if self.hs: 273 | # add info about each word's Huffman encoding 274 | self.create_binary_tree() 275 | if self.negative: 276 | # build the table for drawing random words (for negative sampling) 277 | self.make_cum_table() 278 | # set initial input/projection and hidden weights 279 | if not update: 280 | self.reset_weights() 281 | else: 282 | self.update_weights() 283 | 284 | 285 | logger = logging.getLogger("gensim") 286 | logger.handlers[0].stream = sys.stdout 287 | 288 | 289 | RWG = RandomWalksGenerator(G, walk_length=10, num_walks=20) 290 | skipgram = Word2Vec(sg=1, iter=50, min_count=0, size=4, batch_words=100) 291 | skipgram.build_vocab(RWG) 292 | skipgram.train(RWG) 293 | skipgram.init_sims() 294 | 295 | tsne = TSNE(n_components=2, random_state=0) 296 | 297 | 298 | Y1 = tsne.fit_transform(skipgram.syn0) 299 | Y2 = skipgram.syn0 300 | 301 | 302 | from bokeh.plotting import figure, output_file, show 303 | 304 | # output to static HTML file 305 | output_file("line.html") 306 | 307 | p = figure(plot_width=400, plot_height=400) 308 | p.circle(Y2[:, 0], Y2[:, 1], color="navy", alpha=0.5) 309 | # show the results 310 | show(p) 311 | 312 | 313 | # Two moons 314 | 315 | n_samples = 100 316 | X, Y = datasets.make_moons(n_samples=n_samples, noise=.05) 317 | 318 | # colors 319 | colors = ["#B3DE69" if y else "#CAB2D6" for y in Y] 320 | 321 | # output to static HTML file 322 | output_file("two_moons.html") 323 | 324 | p = figure(plot_width=400, plot_height=400) 325 | p.circle(X[:, 0], X[:, 1], color=colors, alpha=0.5) 326 | # show the results 327 | show(p) 328 | 329 | 330 | from scipy.spatial.distance import pdist, euclidean 331 | from sklearn.metric.pairwise import pairwise_distances 332 | from sklearn.neighbors import kneighbors_graph 333 | 334 | 335 | # build graph 336 | def RBF(x1, x2, sigma=1): 337 | """ """ 338 | return np.exp(euclidean(x1, x2) / sigma) 339 | 340 | 341 | sim_mat = pairwise_distances(X, metric=RBF) 342 | 343 | 344 | n_neighbors = 10 345 | mode = "distance" # connectivity or distance 346 | kneighbors_graph(X, n_neighbors=n_neighbors, metric=RBF, mode=mode) 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | -------------------------------------------------------------------------------- /test/test_random_walk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: ThomasO 3 | from random_walk import RandomWalksGeneratorCSR 4 | from helper import read_csr_matrix 5 | 6 | 7 | # load karate graph in csr matrix 8 | karate_csr = read_csr_matrix("karate.txt", make_sym=True) 9 | 10 | # 11 | p = 2 12 | q = 0.3 13 | walk_length = 15 14 | num_walks = 10 15 | 16 | # init RW generator 17 | RWG = RandomWalksGeneratorCSR(P=karate_csr, walk_length=walk_length, 18 | num_walks=num_walks, p=2, q=0.5, 19 | preprocess=False) 20 | 21 | # preprocess transition 22 | RWG.preprocess_transition() 23 | 24 | # compute and write Random walks 25 | path = "./RW" 26 | RWG.write_walks(path, worker=4, writer=1, chunck_size=10) 27 | 28 | # load random walk generator 29 | RWG = RandomWalksGeneratorCSR(path='./RW') 30 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: ThomasO 3 | from random_walk import RandomWalksGeneratorCSR 4 | from word2vec import Word2Vec 5 | import argparse 6 | import logging 7 | 8 | 9 | # logging config 10 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 11 | level=logging.INFO) 12 | 13 | # command line arg parser 14 | parser = argparse.ArgumentParser( 15 | description="Learn Node Embedding using SkipGram") 16 | 17 | 18 | parser.add_argument( 19 | '--input', type=str, 20 | help=u"path to your edgelist representing the graph") 21 | 22 | parser.add_argument( 23 | '--output', type=str, 24 | help=u"path a folder to store computed random walk files ") 25 | 26 | parser.add_argument( 27 | '--iter', type=int, default=5, 28 | help=u"Number iteration over the random walk corpus") 29 | 30 | parser.add_argument( 31 | '--size', type=int, default=128, 32 | help=u"Size of embedding vectors") 33 | 34 | parser.add_argument( 35 | '--worker', type=int, default=4, 36 | help=u"Number of worker for the skipgram algorithm") 37 | 38 | parser.add_argument( 39 | '--batch-nodes', type=int, default=10000, 40 | help=u"Number of node per batch") 41 | 42 | parser.add_argument( 43 | '--negative', type=int, default=5, 44 | help=u"Value for the negative parameter of word2vec. \ 45 | Number of negative samples") 46 | 47 | parser.add_argument( 48 | '--sample', type=float, default=1e-5, 49 | help=u"Value for the sample parameter of word2vec. \ 50 | Treshold for downsampling reccurent nodes") 51 | 52 | parser.add_argument( 53 | '--output-format', type=str, default="gensim", 54 | help=u"Format of the ouput: \ 55 | gensim (gensim.word2vec.Word2Vec picklize model) or \ 56 | txt (standard format in txt file for word2vec)") 57 | 58 | 59 | def main(input, output, iter=5, size=128, worker=4, batch_nodes=10000, 60 | negative=5, sample=1e-4, output_format="gensim"): 61 | 62 | # load karate graph in csr matrix 63 | RWG = RandomWalksGeneratorCSR(path=input) 64 | # init model 65 | skipgram = Word2Vec(sg=1, iter=iter, min_count=0, size=size, 66 | workers=worker, batch_words=batch_nodes, 67 | sample=sample, negative=negative) 68 | # build vocab 69 | skipgram.build_vocab(RWG) 70 | # learn embbeding 71 | skipgram.train(RWG) 72 | if output_format == "gensim": 73 | skipgram.save(output) 74 | elif output_format == "txt": 75 | skipgram.save_word2vec_format(output) 76 | 77 | 78 | if __name__ == '__main__': 79 | args = parser.parse_args() 80 | main(**args.__dict__) 81 | -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: ThomasO 3 | from gensim.models.word2vec import Word2Vec, Vocab 4 | from collections import defaultdict 5 | from six import iteritems, itervalues 6 | import logging 7 | from math import sqrt 8 | 9 | 10 | logger = logging.getLogger("gensim") 11 | 12 | 13 | class Word2Vec(Word2Vec): 14 | """ 15 | """ 16 | def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, 17 | progress_per=10000, update=False): 18 | """ 19 | Build vocabulary from a sequence of sentences 20 | (can be a once-only generator stream). 21 | Each sentence must be a list of unicode strings. 22 | 23 | """ 24 | self.scan_vocab(sentences, progress_per=progress_per, 25 | trim_rule=trim_rule, update=update) 26 | self.scale_vocab(keep_raw_vocab=keep_raw_vocab, 27 | trim_rule=trim_rule, update=update) 28 | self.finalize_vocab(update=update) 29 | 30 | def scan_vocab(self, sentences, progress_per=10000, trim_rule=None, 31 | update=False): 32 | """Do an initial scan of all words appearing in sentences.""" 33 | logger.info("collecting all nodes and their counts") 34 | sentence_no = -1 35 | total_words = 0 36 | vocab = defaultdict(int) 37 | 38 | for sentence_no, sentence in enumerate(sentences): 39 | for word in sentence: 40 | vocab[word] += 1 41 | total_words += sum(itervalues(vocab)) 42 | logger.info("collected %i word types from a corpus of %i raw words and %i sentences", 43 | len(vocab), total_words, sentence_no + 1) 44 | self.corpus_count = sentence_no + 1 45 | self.raw_vocab = vocab 46 | self.total_words = total_words 47 | 48 | def scale_vocab(self, sample=None, dry_run=False, 49 | keep_raw_vocab=False, trim_rule=None, update=False): 50 | """ 51 | Apply vocabulary settings for `min_count` (discarding less-frequent words) 52 | and `sample` (controlling the downsampling of more-frequent words). 53 | 54 | Calling with `dry_run=True` will only simulate the provided settings and 55 | report the size of the retained vocabulary, effective corpus length, and 56 | estimated memory requirements. Results are both printed via logging and 57 | returned as a dict. 58 | 59 | Delete the raw vocabulary after the scaling is done to free up RAM, 60 | unless `keep_raw_vocab` is set. 61 | 62 | """ 63 | sample = sample or self.sample 64 | 65 | logger.info("Loading a fresh vocabulary") 66 | 67 | # Discard words less-frequent than min_count 68 | if not dry_run: 69 | self.index2word = [] 70 | # make stored settings match these applied settings 71 | self.sample = sample 72 | self.vocab = {} 73 | 74 | for word, v in iteritems(self.raw_vocab): 75 | if not dry_run: 76 | self.vocab[word] = Vocab(count=v, index=len(self.index2word)) 77 | self.index2word.append(word) 78 | 79 | retain_total = self.total_words 80 | 81 | # Precalculate each vocabulary item's threshold for sampling 82 | if not sample: 83 | # no words downsampled 84 | threshold_count = retain_total 85 | elif sample < 1.0: 86 | # traditional meaning: set parameter as proportion of total 87 | threshold_count = sample * retain_total 88 | else: 89 | # new shorthand: sample >= 1 means downsample all words with 90 | # higher count than sample 91 | threshold_count = int(sample * (3 + sqrt(5)) / 2) 92 | 93 | downsample_total, downsample_unique = 0, 0 94 | for w in self.raw_vocab.iterkeys(): 95 | v = self.raw_vocab[w] 96 | word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) 97 | if word_probability < 1.0: 98 | downsample_unique += 1 99 | downsample_total += word_probability * v 100 | else: 101 | word_probability = 1.0 102 | downsample_total += v 103 | if not dry_run: 104 | self.vocab[w].sample_int = int(round(word_probability * 2**32)) 105 | 106 | if not dry_run and not keep_raw_vocab: 107 | logger.info("deleting the raw counts dictionary of %i items", 108 | len(self.raw_vocab)) 109 | self.raw_vocab = defaultdict(int) 110 | 111 | logger.info("sample=%g downsamples %i most-common words", 112 | sample, downsample_unique) 113 | logger.info("downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", 114 | downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total) 115 | 116 | # print extra memory estimates 117 | memory = self.estimate_memory(vocab_size=len(self.vocab)) 118 | 119 | return memory 120 | 121 | def finalize_vocab(self, update=False): 122 | """ 123 | Build tables and model weights based on final vocabulary settings. 124 | """ 125 | if not self.index2word: 126 | self.scale_vocab() 127 | if self.sorted_vocab and not update: 128 | self.sort_vocab() 129 | if self.hs: 130 | # add info about each word's Huffman encoding 131 | self.create_binary_tree() 132 | if self.negative: 133 | # build the table for drawing random words (for negative sampling) 134 | self.make_cum_table() 135 | # set initial input/projection and hidden weights 136 | if not update: 137 | self.reset_weights() 138 | else: 139 | self.update_weights() 140 | 141 | --------------------------------------------------------------------------------