├── README.md ├── preprocess └── clean_assii.py ├── LICENSE ├── v1 ├── train_word2vec_model.py └── process_wiki.py ├── .gitignore └── v2 └── train_word2vec_with_gensim.py /README.md: -------------------------------------------------------------------------------- 1 | # Word2vec 4 Wikipedia 2 | Train Word2vec Model based on Wikipedia by Python Gensim 3 | -------------------------------------------------------------------------------- /preprocess/clean_assii.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Pan Yang (panyangnlp@gmail.com) 4 | # Copyright 2017 5 | 6 | import string 7 | from sys import stdin 8 | 9 | printable = set(string.printable) 10 | 11 | for line in stdin: 12 | filter_line = filter(lambda x: x not in printable, line).strip() 13 | if filter_line != "": 14 | print filter_line 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Pan Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /v1/train_word2vec_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Pan Yang (panyangnlp@gmail.com) 4 | # Copyright 2017 5 | 6 | from __future__ import print_function 7 | 8 | import logging 9 | import os 10 | import sys 11 | import multiprocessing 12 | 13 | from gensim.models import Word2Vec 14 | from gensim.models.word2vec import LineSentence 15 | 16 | if __name__ == '__main__': 17 | program = os.path.basename(sys.argv[0]) 18 | logger = logging.getLogger(program) 19 | 20 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 21 | logging.root.setLevel(level=logging.INFO) 22 | logger.info("running %s" % ' '.join(sys.argv)) 23 | 24 | # check and process input arguments 25 | if len(sys.argv) < 4: 26 | print("Useing: python train_word2vec_model.py input_text " 27 | "output_gensim_model output_word_vector") 28 | sys.exit(1) 29 | inp, outp1, outp2 = sys.argv[1:4] 30 | 31 | model = Word2Vec(LineSentence(inp), size=200, window=5, min_count=5, 32 | workers=multiprocessing.cpu_count()) 33 | 34 | model.save(outp1) 35 | model.wv.save_word2vec_format(outp2, binary=False) 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /v1/process_wiki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Pan Yang (panyangnlp@gmail.com) 4 | # Copyrigh 2017 5 | 6 | from __future__ import print_function 7 | 8 | import logging 9 | import os.path 10 | import six 11 | import sys 12 | 13 | from gensim.corpora import WikiCorpus 14 | 15 | if __name__ == '__main__': 16 | program = os.path.basename(sys.argv[0]) 17 | logger = logging.getLogger(program) 18 | 19 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 20 | logging.root.setLevel(level=logging.INFO) 21 | logger.info("running %s" % ' '.join(sys.argv)) 22 | 23 | # check and process input arguments 24 | if len(sys.argv) != 3: 25 | print("Using: python process_wiki.py enwiki.xxx.xml.bz2 wiki.en.text") 26 | sys.exit(1) 27 | inp, outp = sys.argv[1:3] 28 | space = " " 29 | i = 0 30 | 31 | output = open(outp, 'w') 32 | wiki = WikiCorpus(inp, lemmatize=False, dictionary={}) 33 | for text in wiki.get_texts(): 34 | if six.PY3: 35 | output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n') 36 | # ###another method### 37 | # output.write( 38 | # space.join(map(lambda x:x.decode("utf-8"), text)) + '\n') 39 | else: 40 | output.write(space.join(text) + "\n") 41 | i = i + 1 42 | if (i % 10000 == 0): 43 | logger.info("Saved " + str(i) + " articles") 44 | 45 | output.close() 46 | logger.info("Finished Saved " + str(i) + " articles") 47 | -------------------------------------------------------------------------------- /v2/train_word2vec_with_gensim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Pan Yang (panyangnlp@gmail.com) 4 | # Copyright 2017 @ Yu Zhen 5 | 6 | import gensim 7 | import logging 8 | import multiprocessing 9 | import os 10 | import re 11 | import sys 12 | 13 | from pattern.en import tokenize 14 | from time import time 15 | 16 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 17 | level=logging.INFO) 18 | 19 | 20 | def cleanhtml(raw_html): 21 | cleanr = re.compile('<.*?>') 22 | cleantext = re.sub(cleanr, ' ', raw_html) 23 | return cleantext 24 | 25 | 26 | class MySentences(object): 27 | def __init__(self, dirname): 28 | self.dirname = dirname 29 | 30 | def __iter__(self): 31 | for root, dirs, files in os.walk(self.dirname): 32 | for filename in files: 33 | file_path = root + '/' + filename 34 | for line in open(file_path): 35 | sline = line.strip() 36 | if sline == "": 37 | continue 38 | rline = cleanhtml(sline) 39 | tokenized_line = ' '.join(tokenize(rline)) 40 | is_alpha_word_line = [word for word in 41 | tokenized_line.lower().split() 42 | if word.isalpha()] 43 | yield is_alpha_word_line 44 | 45 | 46 | if __name__ == '__main__': 47 | if len(sys.argv) != 2: 48 | print "Please use python train_with_gensim.py data_path" 49 | exit() 50 | data_path = sys.argv[1] 51 | begin = time() 52 | 53 | sentences = MySentences(data_path) 54 | model = gensim.models.Word2Vec(sentences, 55 | size=200, 56 | window=10, 57 | min_count=10, 58 | workers=multiprocessing.cpu_count()) 59 | model.save("data/model/word2vec_gensim") 60 | model.wv.save_word2vec_format("data/model/word2vec_org", 61 | "data/model/vocabulary", 62 | binary=False) 63 | 64 | end = time() 65 | print "Total procesing time: %d seconds" % (end - begin) 66 | --------------------------------------------------------------------------------