├── README.md
├── preprocess
    └── clean_assii.py
├── LICENSE
├── v1
    ├── train_word2vec_model.py
    └── process_wiki.py
├── .gitignore
└── v2
    └── train_word2vec_with_gensim.py


/README.md:
--------------------------------------------------------------------------------
1 | # Word2vec 4 Wikipedia
2 | Train a Word2vec model on Wikipedia with Python and Gensim.
3 | 


--------------------------------------------------------------------------------
/preprocess/clean_assii.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # Author: Pan Yang (panyangnlp@gmail.com)
 4 | # Copyright 2017
 5 | 
 6 | import string
 7 | from sys import stdin
 8 | 
 9 | printable = set(string.printable)
10 | 
11 | for line in stdin:
12 |     filter_line = filter(lambda x: x not in printable, line).strip()
13 |     if filter_line != "":
14 |         print filter_line
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Pan Yang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/v1/train_word2vec_model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # Author: Pan Yang (panyangnlp@gmail.com)
 4 | # Copyright 2017
 5 | 
 6 | from __future__ import print_function
 7 | 
 8 | import logging
 9 | import os
10 | import sys
11 | import multiprocessing
12 | 
13 | from gensim.models import Word2Vec
14 | from gensim.models.word2vec import LineSentence
15 | 
if __name__ == '__main__':
    # Log under the script's own file name.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    # Pass the argument lazily so logging does the formatting itself.
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments: input corpus, path for the saved
    # gensim model, and path for the text-format word vectors
    if len(sys.argv) < 4:
        print("Using: python train_word2vec_model.py input_text "
              "output_gensim_model output_word_vector")
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # Train from the input file via LineSentence, using one worker per
    # CPU core reported by multiprocessing.
    model = Word2Vec(LineSentence(inp), size=200, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    # Save the full model, then the vectors in non-binary word2vec format.
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
36 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/v1/process_wiki.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # Author: Pan Yang (panyangnlp@gmail.com)
 4 | # Copyright 2017
 5 | 
 6 | from __future__ import print_function
 7 | 
 8 | import logging
 9 | import os.path
10 | import six
11 | import sys
12 | 
13 | from gensim.corpora import WikiCorpus
14 | 
if __name__ == '__main__':
    # Log under the script's own file name.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) != 3:
        print("Using: python process_wiki.py enwiki.xxx.xml.bz2 wiki.en.text")
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0  # stays 0 if the corpus yields no articles

    # Build the corpus reader first so that a bad input path does not leave
    # an empty output file behind.
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})

    # On Python 3 pin the output encoding to UTF-8 instead of relying on the
    # locale default; on Python 2 keep the plain byte-oriented file.
    if six.PY3:
        output = open(outp, 'w', encoding='utf-8')
    else:
        output = open(outp, 'w')

    # The context manager closes the file even if iteration raises.
    with output:
        # One article per line, tokens separated by a single space.
        for i, text in enumerate(wiki.get_texts(), 1):
            output.write(space.join(text) + "\n")
            if i % 10000 == 0:
                logger.info("Saved %d articles", i)

    logger.info("Finished Saved %d articles", i)
47 | 


--------------------------------------------------------------------------------
/v2/train_word2vec_with_gensim.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # Author: Pan Yang (panyangnlp@gmail.com)
 4 | # Copyright 2017 @ Yu Zhen
 5 | 
 6 | import gensim
 7 | import logging
 8 | import multiprocessing
 9 | import os
10 | import re
11 | import sys
12 | 
13 | from pattern.en import tokenize
14 | from time import time
15 | 
16 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
17 |                     level=logging.INFO)
18 | 
19 | 
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span replaced by one space.

    The match is non-greedy, so each tag is replaced separately. A ``<``
    with no later ``>`` on the same line is left untouched.
    """
    return re.sub('<.*?>', ' ', raw_html)
24 | 
25 | 
class MySentences(object):
    """Stream tokenized sentences from every file under a directory tree.

    Each non-blank line of each file is yielded as a list of lower-cased,
    purely alphabetic tokens. Iteration starts a fresh directory walk each
    time, so the object can be iterated more than once.
    """

    def __init__(self, dirname):
        """Remember the root directory whose files will be read."""
        self.dirname = dirname

    def __iter__(self):
        """Yield one token list per non-blank line found under ``dirname``."""
        for root, _dirs, files in os.walk(self.dirname):
            for filename in files:
                file_path = os.path.join(root, filename)
                # Close each file deterministically instead of leaving the
                # handle to the garbage collector.
                with open(file_path) as handle:
                    for line in handle:
                        sline = line.strip()
                        if sline == "":
                            continue
                        rline = cleanhtml(sline)
                        # tokenize() output is joined with spaces, then
                        # re-split on whitespace into individual tokens.
                        tokenized_line = ' '.join(tokenize(rline))
                        # Keep only tokens made entirely of letters.
                        is_alpha_word_line = [
                            word
                            for word in tokenized_line.lower().split()
                            if word.isalpha()
                        ]
                        yield is_alpha_word_line
44 | 
45 | 
if __name__ == '__main__':
    if len(sys.argv) != 2:
        # Single-argument print(...) is valid on both Python 2 and 3.
        print("Please use python train_word2vec_with_gensim.py data_path")
        # A usage error exits with a non-zero status.
        sys.exit(1)
    data_path = sys.argv[1]

    # Make sure the hard-coded output directory exists before training, so
    # a missing directory does not discard the model after training ends.
    model_dir = "data/model"
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    begin = time()

    # Stream sentences from disk and train with one worker per CPU core.
    sentences = MySentences(data_path)
    model = gensim.models.Word2Vec(sentences,
                                   size=200,
                                   window=10,
                                   min_count=10,
                                   workers=multiprocessing.cpu_count())
    # Save the full model, then the vectors in non-binary word2vec format
    # together with a separate vocabulary file.
    model.save("data/model/word2vec_gensim")
    model.wv.save_word2vec_format("data/model/word2vec_org",
                                  "data/model/vocabulary",
                                  binary=False)

    end = time()
    print("Total processing time: %d seconds" % (end - begin))
66 | 


--------------------------------------------------------------------------------