├── COPYING
├── COPYING.LESSER
├── MANIFEST.in
├── __init__.py
├── __init__.pyc
├── ez_setup.py
├── gensim
├── __init__.py
├── __init__.pyc
├── corpora
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── bleicorpus.py
│ ├── bleicorpus.pyc
│ ├── csvcorpus.py
│ ├── dictionary.py
│ ├── dictionary.pyc
│ ├── hashdictionary.py
│ ├── hashdictionary.pyc
│ ├── indexedcorpus.py
│ ├── indexedcorpus.pyc
│ ├── lowcorpus.py
│ ├── lowcorpus.pyc
│ ├── malletcorpus.py
│ ├── malletcorpus.pyc
│ ├── mmcorpus.py
│ ├── mmcorpus.pyc
│ ├── svmlightcorpus.py
│ ├── svmlightcorpus.pyc
│ ├── textcorpus.py
│ ├── textcorpus.pyc
│ ├── ucicorpus.py
│ ├── ucicorpus.pyc
│ ├── wikicorpus.py
│ └── wikicorpus.pyc
├── examples
│ └── dmlcz
│ │ ├── __init__.py
│ │ ├── dmlcorpus.py
│ │ ├── gensim_build.py
│ │ ├── gensim_genmodel.py
│ │ ├── gensim_xml.py
│ │ ├── runall.sh
│ │ └── sources.py
├── interfaces.py
├── interfaces.pyc
├── matutils.py
├── matutils.pyc
├── models
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── hdpmodel.py
│ ├── hdpmodel.pyc
│ ├── lda_dispatcher.py
│ ├── lda_worker.py
│ ├── ldamallet.py
│ ├── ldamallet.pyc
│ ├── ldamodel.py
│ ├── ldamodel.pyc
│ ├── logentropy_model.py
│ ├── logentropy_model.pyc
│ ├── lsi_dispatcher.py
│ ├── lsi_worker.py
│ ├── lsimodel.py
│ ├── lsimodel.pyc
│ ├── rpmodel.py
│ ├── rpmodel.pyc
│ ├── tfidfmodel.py
│ ├── tfidfmodel.pyc
│ ├── voidptr.h
│ ├── word2vec.py
│ ├── word2vec.pyc
│ └── word2vec_inner.pyx
├── nosy.py
├── parsing
│ ├── __init__.py
│ ├── porter.py
│ └── preprocessing.py
├── scripts
│ ├── __init__.py
│ ├── make_wiki.py
│ ├── make_wiki_lemma.py
│ ├── make_wiki_online.py
│ ├── make_wiki_online_lemma.py
│ ├── make_wiki_online_nodebug.py
│ └── make_wikicorpus.py
├── similarities
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── docsim.py
│ └── docsim.pyc
├── utils.py
└── utils.pyc
├── gensim_addons
├── __init__.py
├── __init__.pyc
└── models
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── word2vec_inner.c
│ └── word2vec_inner.pyx
├── readme.md
├── setup.cfg
├── setup.py
├── standard.py
├── svm_test.py
├── test.cc
├── test.py
├── test_it.sh
├── test_nn.py
└── test_word2vec.py
/ : -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/ -------------------------------------------------------------------------------- /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 
24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include docs * 2 | recursive-include gensim/test/test_data * 3 | recursive-include . *.sh 4 | prune docs/src* 5 | include README.rst 6 | include CHANGELOG.txt 7 | include COPYING 8 | include COPYING.LESSER 9 | include ez_setup.py 10 | include gensim/models/voidptr.h 11 | include gensim/models/word2vec_inner.pyx 12 | include gensim_addons/models/word2vec_inner.pyx 13 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # script by dedan: helps him to symlink gensim 2 | import os 3 | dirname = __path__[0] # Package's main folder 4 | __path__.insert(0, os.path.join(dirname, "gensim")) 5 | -------------------------------------------------------------------------------- /__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/__init__.pyc -------------------------------------------------------------------------------- /gensim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains interfaces and functionality to compute pair-wise document 3 | similarities within a corpus of documents. 4 | """ 5 | 6 | from gensim import utils, matutils, interfaces, corpora, models, similarities 7 | import logging 8 | 9 | try: 10 | __version__ = __import__('pkg_resources').get_distribution('gensim').version 11 | except: 12 | __version__ = '?' 13 | 14 | 15 | class NullHandler(logging.Handler): 16 | """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" 17 | def emit(self, record): 18 | pass 19 | 20 | logger = logging.getLogger('gensim') 21 | if len(logger.handlers) == 0: # To ensure reload() doesn't add another one 22 | logger.addHandler(NullHandler()) 23 | -------------------------------------------------------------------------------- /gensim/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/__init__.pyc -------------------------------------------------------------------------------- /gensim/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains implementations of various streaming corpus I/O format. 
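A minimal usage sketch (the output path below is hypothetical); any of the corpus classes in this
package can serialize and stream back a plain list of sparse vectors:

>>> from gensim.corpora import MmCorpus
>>> toy_corpus = [[(0, 1.0), (1, 2.0)], [(2, 1.0)]]  # two documents as (word_id, weight) vectors
>>> MmCorpus.serialize('/tmp/toy_corpus.mm', toy_corpus)
>>> print(list(MmCorpus('/tmp/toy_corpus.mm')))
[[(0, 1.0), (1, 2.0)], [(2, 1.0)]]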
3 | """ 4 | 5 | # bring corpus classes directly into package namespace, to save some typing 6 | from .indexedcorpus import IndexedCorpus # must appear before the other classes 7 | 8 | from .mmcorpus import MmCorpus 9 | from .bleicorpus import BleiCorpus 10 | from .svmlightcorpus import SvmLightCorpus 11 | from .lowcorpus import LowCorpus 12 | from .dictionary import Dictionary 13 | from .hashdictionary import HashDictionary 14 | from .wikicorpus import WikiCorpus 15 | from .textcorpus import TextCorpus 16 | from .ucicorpus import UciCorpus 17 | from .malletcorpus import MalletCorpus 18 | -------------------------------------------------------------------------------- /gensim/corpora/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/__init__.pyc -------------------------------------------------------------------------------- /gensim/corpora/bleicorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Blei's LDA-C format. 10 | """ 11 | 12 | from __future__ import with_statement 13 | 14 | from os import path 15 | import logging 16 | 17 | from gensim import interfaces, utils 18 | from gensim.corpora import IndexedCorpus 19 | from six.moves import xrange 20 | 21 | 22 | logger = logging.getLogger('gensim.corpora.bleicorpus') 23 | 24 | 25 | class BleiCorpus(IndexedCorpus): 26 | """ 27 | Corpus in Blei's LDA-C format. 28 | 29 | The corpus is represented as two files: one describing the documents, and another 30 | describing the mapping between words and their ids. 31 | 32 | Each document is one line:: 33 | 34 | N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN 35 | 36 | The vocabulary is a file with words, one word per line; word at line K has an 37 | implicit ``id=K``. 38 | """ 39 | 40 | def __init__(self, fname, fname_vocab=None): 41 | """ 42 | Initialize the corpus from a file. 43 | 44 | `fname_vocab` is the file with vocabulary; if not specified, it defaults to 45 | `fname.vocab`. 46 | """ 47 | IndexedCorpus.__init__(self, fname) 48 | logger.info("loading corpus from %s" % fname) 49 | 50 | if fname_vocab is None: 51 | fname_base, _ = path.splitext(fname) 52 | fname_dir = path.dirname(fname) 53 | for fname_vocab in [ 54 | fname + '.vocab', 55 | fname + '/vocab.txt', 56 | fname_base + '.vocab', 57 | fname_dir + '/vocab.txt', 58 | ]: 59 | if path.exists(fname_vocab): 60 | break 61 | else: 62 | raise IOError('BleiCorpus: could not find vocabulary file') 63 | 64 | self.fname = fname 65 | with utils.smart_open(fname_vocab) as fin: 66 | words = [utils.to_unicode(word).rstrip() for word in fin] 67 | self.id2word = dict(enumerate(words)) 68 | self.length = 0 69 | 70 | def __iter__(self): 71 | """ 72 | Iterate over the corpus, returning one sparse vector at a time. 
73 | """ 74 | lineno = -1 75 | with utils.smart_open(self.fname) as fin: 76 | for lineno, line in enumerate(fin): 77 | yield self.line2doc(line) 78 | self.length = lineno + 1 79 | 80 | def line2doc(self, line): 81 | parts = utils.to_unicode(line).split() 82 | if int(parts[0]) != len(parts) - 1: 83 | raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) 84 | doc = [part.rsplit(':', 1) for part in parts[1:]] 85 | doc = [(int(p1), float(p2)) for p1, p2 in doc] 86 | return doc 87 | 88 | @staticmethod 89 | def save_corpus(fname, corpus, id2word=None, metadata=False): 90 | """ 91 | Save a corpus in the LDA-C format. 92 | 93 | There are actually two files saved: `fname` and `fname.vocab`, where 94 | `fname.vocab` is the vocabulary file. 95 | 96 | This function is automatically called by `BleiCorpus.serialize`; don't 97 | call it directly, call `serialize` instead. 98 | """ 99 | if id2word is None: 100 | logger.info("no word id mapping provided; initializing from corpus") 101 | id2word = utils.dict_from_corpus(corpus) 102 | num_terms = len(id2word) 103 | else: 104 | num_terms = 1 + max([-1] + id2word.keys()) 105 | 106 | logger.info("storing corpus in Blei's LDA-C format into %s" % fname) 107 | with utils.smart_open(fname, 'wb') as fout: 108 | offsets = [] 109 | for doc in corpus: 110 | doc = list(doc) 111 | offsets.append(fout.tell()) 112 | parts = ["%i:%s" % p for p in doc if abs(p[1]) > 1e-7] 113 | fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts)))) 114 | 115 | # write out vocabulary, in a format compatible with Blei's topics.py script 116 | fname_vocab = fname + '.vocab' 117 | logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab)) 118 | with utils.smart_open(fname_vocab, 'wb') as fout: 119 | for featureid in xrange(num_terms): 120 | fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) 121 | 122 | return offsets 123 | 124 | def docbyoffset(self, offset): 125 | """ 126 | Return the document stored at file position `offset`. 127 | """ 128 | with utils.smart_open(self.fname) as f: 129 | f.seek(offset) 130 | return self.line2doc(f.readline()) 131 | 132 | # endclass BleiCorpus 133 | -------------------------------------------------------------------------------- /gensim/corpora/bleicorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/bleicorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/csvcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2013 Zygmunt Zając 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Corpus in CSV format. 9 | 10 | """ 11 | 12 | 13 | from __future__ import with_statement 14 | 15 | import logging 16 | import csv 17 | import itertools 18 | 19 | from gensim import interfaces 20 | 21 | logger = logging.getLogger('gensim.corpora.csvcorpus') 22 | 23 | 24 | class CsvCorpus(interfaces.CorpusABC): 25 | """ 26 | Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically 27 | based on the file content. 28 | 29 | All row values are expected to be ints/floats. 30 | 31 | """ 32 | 33 | def __init__(self, fname, labels): 34 | """ 35 | Initialize the corpus from a file. 36 | `labels` = are class labels present in the input file? 
=> skip the first column 37 | 38 | """ 39 | logger.info("loading corpus from %s" % fname) 40 | self.fname = fname 41 | self.length = None 42 | self.labels = labels 43 | 44 | # load the first few lines, to guess the CSV dialect 45 | head = ''.join(itertools.islice(open(self.fname), 5)) 46 | self.headers = csv.Sniffer().has_header(head) 47 | self.dialect = csv.Sniffer().sniff(head) 48 | logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers)) 49 | 50 | def __iter__(self): 51 | """ 52 | Iterate over the corpus, returning one sparse vector at a time. 53 | 54 | """ 55 | reader = csv.reader(open(self.fname), self.dialect) 56 | if self.headers: 57 | next(reader) # skip the headers 58 | 59 | line_no = -1 60 | for line_no, line in enumerate(reader): 61 | if self.labels: 62 | line.pop(0) # ignore the first column = class label 63 | yield list(enumerate(map(float, line))) 64 | 65 | self.length = line_no + 1 # store the total number of CSV rows = documents 66 | 67 | # endclass CsvCorpus 68 | -------------------------------------------------------------------------------- /gensim/corpora/dictionary.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/dictionary.pyc -------------------------------------------------------------------------------- /gensim/corpora/hashdictionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2012 Homer Strong, Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | This module implements the `"hashing trick" `_ -- 10 | a mapping between words and their integer ids using a fixed, static mapping. The 11 | static mapping has a constant memory footprint, regardless of the number of word-types (features) 12 | in your corpus, so it's suitable for processing extremely large corpora. 13 | 14 | The ids are computed as `hash(word) % id_range`, where `hash` is a user-configurable 15 | function (adler32 by default). Using HashDictionary, new words can be represented immediately, 16 | without an extra pass through the corpus to collect all the ids first. This is another 17 | advantage: HashDictionary can be used with non-repeatable (once-only) streams of documents. 18 | 19 | A disadvantage of HashDictionary is that, unline plain :class:`Dictionary`, several words may map 20 | to the same id, causing hash collisions. The word<->id mapping is no longer a bijection. 21 | 22 | """ 23 | 24 | from __future__ import with_statement 25 | 26 | import logging 27 | import itertools 28 | import zlib 29 | 30 | from gensim import utils 31 | from six import iteritems, iterkeys 32 | 33 | 34 | logger = logging.getLogger('gensim.corpora.hashdictionary') 35 | 36 | 37 | 38 | class HashDictionary(utils.SaveLoad, dict): 39 | """ 40 | HashDictionary encapsulates the mapping between normalized words and their 41 | integer ids. 42 | 43 | Unlike `Dictionary`, building a `HashDictionary` before using it is not a necessary 44 | step. The documents can be computed immediately, from an uninitialized `HashDictionary`, 45 | without seeing the rest of the corpus first. 46 | 47 | The main function is `doc2bow`, which converts a collection of words to its 48 | bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. 
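A minimal sketch of the hashing trick in action (the tiny `id_range` is only for illustration):

>>> dct = HashDictionary(id_range=32)
>>> dct.doc2bow("human computer interface human".split())  # list of (hashed id, count) 2-tuples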
49 | 50 | """ 51 | def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True): 52 | """ 53 | By default, keep track of debug statistics and mappings. If you find yourself 54 | running out of memory (or are sure you don't need the debug info), set 55 | `debug=False`. 56 | """ 57 | self.myhash = myhash # hash fnc: string->integer 58 | self.id_range = id_range # hash range: id = myhash(key) % id_range 59 | self.debug = debug 60 | 61 | # the following (potentially massive!) dictionaries are only formed if `debug` is True 62 | self.token2id = {} 63 | self.id2token = {} # reverse mapping int->set(words) 64 | self.dfs = {} # token_id -> how many documents this token_id appeared in 65 | self.dfs_debug = {} # token_string->how many documents this word appeared in 66 | 67 | self.num_docs = 0 # number of documents processed 68 | self.num_pos = 0 # total number of corpus positions 69 | self.num_nnz = 0 # total number of non-zeroes in the BOW matrix 70 | self.allow_update = True 71 | 72 | if documents is not None: 73 | self.add_documents(documents) 74 | 75 | 76 | def __getitem__(self, tokenid): 77 | """ 78 | Return all words that have mapped to the given id so far, as a set. 79 | 80 | Only works if `self.debug` was enabled. 81 | """ 82 | return self.id2token.get(tokenid, set()) 83 | 84 | 85 | def restricted_hash(self, token): 86 | """ 87 | Calculate id of the given token. Also keep track of what words were mapped 88 | to what ids, for debugging reasons. 89 | """ 90 | h = self.myhash(utils.to_utf8(token)) % self.id_range 91 | if self.debug: 92 | self.token2id[token] = h 93 | self.id2token.setdefault(h, set()).add(token) 94 | return h 95 | 96 | 97 | def __len__(self): 98 | """ 99 | Return the number of distinct ids = the entire dictionary size. 100 | """ 101 | return self.id_range 102 | 103 | 104 | def keys(self): 105 | """Return a list of all token ids.""" 106 | return range(len(self)) 107 | 108 | 109 | def __str__(self): 110 | return ("HashDictionary(%i id range)" % len(self)) 111 | 112 | 113 | @staticmethod 114 | def from_documents(*args, **kwargs): 115 | return HashDictionary(*args, **kwargs) 116 | 117 | 118 | def add_documents(self, documents): 119 | """ 120 | Build dictionary from a collection of documents. Each document is a list 121 | of tokens = **tokenized and normalized** utf-8 encoded strings. 122 | 123 | This is only a convenience wrapper for calling `doc2bow` on each document 124 | with `allow_update=True`. 125 | """ 126 | for docno, document in enumerate(documents): 127 | if docno % 10000 == 0: 128 | logger.info("adding document #%i to %s" % (docno, self)) 129 | _ = self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids 130 | logger.info("built %s from %i documents (total %i corpus positions)" % 131 | (self, self.num_docs, self.num_pos)) 132 | 133 | 134 | def doc2bow(self, document, allow_update=False, return_missing=False): 135 | """ 136 | Convert `document` (a list of words) into the bag-of-words format = list 137 | of `(token_id, token_count)` 2-tuples. Each word is assumed to be a 138 | **tokenized and normalized** utf-8 encoded string. No further preprocessing 139 | is done on the words in `document`; apply tokenization, stemming etc. before 140 | calling this method. 141 | 142 | If `allow_update` or `self.allow_update` is set, then also update dictionary 143 | in the process: update overall corpus statistics and document frequencies. 
144 | For each id appearing in this document, increase its document frequency 145 | (`self.dfs`) by one. 146 | 147 | """ 148 | result = {} 149 | missing = {} 150 | document = sorted(document) # convert the input to plain list (needed below) 151 | for word_norm, group in itertools.groupby(document): 152 | frequency = len(list(group)) # how many times does this word appear in the input document 153 | tokenid = self.restricted_hash(word_norm) 154 | result[tokenid] = result.get(tokenid, 0) + frequency 155 | if self.debug: 156 | # increment document count for each unique token that appeared in the document 157 | self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1 158 | 159 | if allow_update or self.allow_update: 160 | self.num_docs += 1 161 | self.num_pos += len(document) 162 | self.num_nnz += len(result) 163 | if self.debug: 164 | # increment document count for each unique tokenid that appeared in the document 165 | # done here, because several words may map to the same tokenid 166 | for tokenid in iterkeys(result): 167 | self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1 168 | 169 | # return tokenids, in ascending id order 170 | result = sorted(iteritems(result)) 171 | if return_missing: 172 | return result, missing 173 | else: 174 | return result 175 | 176 | 177 | def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): 178 | """ 179 | Remove document frequency statistics for tokens that appear in 180 | 181 | 1. less than `no_below` documents (absolute number) or 182 | 2. more than `no_above` documents (fraction of total corpus size, *not* 183 | absolute number). 184 | 3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or 185 | keep all if `None`). 186 | 187 | **Note:** since HashDictionary's id range is fixed and doesn't depend on 188 | the number of tokens seen, this doesn't really "remove" anything. It only 189 | clears some supplementary statistics, for easier debugging and a smaller RAM 190 | footprint. 191 | """ 192 | no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold 193 | ok = [item for item in iteritems(self.dfs_debug) 194 | if no_below <= item[1] <= no_above_abs] 195 | ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n]) 196 | 197 | self.dfs_debug = dict((word, freq) 198 | for word, freq in iteritems(self.dfs_debug) 199 | if word in ok) 200 | self.token2id = dict((token, tokenid) 201 | for token, tokenid in iteritems(self.token2id) 202 | if token in self.dfs_debug) 203 | self.id2token = dict((tokenid, set(token for token in tokens 204 | if token in self.dfs_debug)) 205 | for tokenid, tokens in iteritems(self.id2token)) 206 | self.dfs = dict((tokenid, freq) 207 | for tokenid, freq in iteritems(self.dfs) 208 | if self.id2token.get(tokenid, set())) 209 | 210 | # for word->document frequency 211 | logger.info("kept statistics for which were in no less than %i and no more than %i (=%.1f%%) documents" % 212 | (no_below, no_above_abs, 100.0 * no_above)) 213 | 214 | 215 | def save_as_text(self, fname): 216 | """ 217 | Save this HashDictionary to a text file, for easier debugging. 218 | 219 | The format is: 220 | `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`. 221 | 222 | Note: use `save`/`load` to store in binary format instead (pickle). 
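A minimal usage sketch (the output path is hypothetical):

>>> dct = HashDictionary(["human computer interface".split()], id_range=1000)
>>> dct.save_as_text('/tmp/hash_dictionary.txt')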
223 | """ 224 | logger.info("saving HashDictionary mapping to %s" % fname) 225 | with utils.smart_open(fname, 'wb') as fout: 226 | for tokenid in self.keys(): 227 | words = sorted(self[tokenid]) 228 | if words: 229 | words_df = [(word, self.dfs_debug.get(word, 0)) for word in words] 230 | words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])] 231 | fout.write(utils.to_utf8("%i\t%i\t%s\n" % 232 | (tokenid, self.dfs.get(tokenid, 0), '\t'.join(words_df)))) 233 | #endclass HashDictionary 234 | -------------------------------------------------------------------------------- /gensim/corpora/hashdictionary.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/hashdictionary.pyc -------------------------------------------------------------------------------- /gensim/corpora/indexedcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Indexed corpus is a mechanism for random-accessing corpora. 10 | 11 | While the standard corpus interface in gensim allows iterating over corpus with 12 | `for doc in corpus: pass`, indexed corpus allows accessing the documents with 13 | `corpus[docno]` (in O(1) look-up time). 14 | 15 | This functionality is achieved by storing an extra file (by default named the same 16 | as the corpus file plus '.index' suffix) that stores the byte offset of the beginning 17 | of each document. 18 | """ 19 | 20 | import logging 21 | import shelve 22 | 23 | from gensim import interfaces, utils 24 | 25 | logger = logging.getLogger('gensim.corpora.indexedcorpus') 26 | 27 | 28 | class IndexedCorpus(interfaces.CorpusABC): 29 | def __init__(self, fname, index_fname=None): 30 | """ 31 | Initialize this abstract base class, by loading a previously saved index 32 | from `index_fname` (or `fname.index` if `index_fname` is not set). 33 | This index will allow subclasses to support the `corpus[docno]` syntax 34 | (random access to document #`docno` in O(1)). 35 | 36 | >>> # save corpus in SvmLightCorpus format with an index 37 | >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] 38 | >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) 39 | >>> # load back as a document stream (*not* plain Python list) 40 | >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') 41 | >>> print(corpus_with_random_access[1]) 42 | [(0, 1.0), (1, 2.0)] 43 | 44 | """ 45 | try: 46 | if index_fname is None: 47 | index_fname = fname + '.index' 48 | self.index = utils.unpickle(index_fname) 49 | logger.info("loaded corpus index from %s" % index_fname) 50 | except: 51 | self.index = None 52 | self.length = None 53 | 54 | @classmethod 55 | def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): 56 | """ 57 | Iterate through the document stream `corpus`, saving the documents to `fname` 58 | and recording byte offset of each document. Save the resulting index 59 | structure to file `index_fname` (or `fname`.index is not set). 
60 | 61 | This relies on the underlying corpus class `serializer` providing (in 62 | addition to standard iteration): 63 | 64 | * `save_corpus` method that returns a sequence of byte offsets, one for 65 | each saved document, 66 | * the `docbyoffset(offset)` method, which returns a document 67 | positioned at `offset` bytes within the persistent storage (file). 68 | 69 | Example: 70 | 71 | >>> MmCorpus.serialize('test.mm', corpus) 72 | >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access 73 | >>> print(mm[42]) # retrieve document no. 42, etc. 74 | """ 75 | if getattr(corpus, 'fname', None) == fname: 76 | raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname) 77 | 78 | if index_fname is None: 79 | index_fname = fname + '.index' 80 | 81 | if progress_cnt is not None: 82 | if labels is not None: 83 | offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) 84 | else: 85 | offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) 86 | else: 87 | if labels is not None: 88 | offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) 89 | else: 90 | offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata) 91 | 92 | if offsets is None: 93 | raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % 94 | serializer.__name__) 95 | 96 | # store offsets persistently, using pickle 97 | logger.info("saving %s index to %s" % (serializer.__name__, index_fname)) 98 | utils.pickle(offsets, index_fname) 99 | 100 | def __len__(self): 101 | """ 102 | Return the index length if the corpus is indexed. Otherwise, make a pass 103 | over self to calculate the corpus length and cache this number. 104 | """ 105 | if self.index is not None: 106 | return len(self.index) 107 | if self.length is None: 108 | logger.info("caching corpus length") 109 | self.length = sum(1 for doc in self) 110 | return self.length 111 | 112 | def __getitem__(self, docno): 113 | if self.index is None: 114 | raise RuntimeError("cannot call corpus[docid] without an index") 115 | return self.docbyoffset(self.index[docno]) 116 | 117 | # endclass IndexedCorpus 118 | -------------------------------------------------------------------------------- /gensim/corpora/indexedcorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/indexedcorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/lowcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Corpus in GibbsLda++ format of List-Of-Words. 
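A minimal sketch of the expected input, assuming a hypothetical file ``corpus.low`` containing::

    2
    human interface computer
    graph trees graph

>>> corpus = LowCorpus('corpus.low')
>>> len(corpus)  # the first line declares the number of documents
2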
10 | """ 11 | 12 | from __future__ import with_statement 13 | 14 | import logging 15 | 16 | from gensim import utils 17 | from gensim.corpora import IndexedCorpus 18 | from six import iteritems, iterkeys 19 | from six.moves import xrange, zip as izip 20 | 21 | 22 | logger = logging.getLogger('gensim.corpora.lowcorpus') 23 | 24 | 25 | def split_on_space(s): 26 | return [word for word in utils.to_unicode(s).strip().split(' ') if word] 27 | 28 | 29 | class LowCorpus(IndexedCorpus): 30 | """ 31 | List_Of_Words corpus handles input in GibbsLda++ format. 32 | 33 | Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format:: 34 | 35 | Both data for training/estimating the model and new data (i.e., previously 36 | unseen data) have the same format as follows: 37 | 38 | [M] 39 | [document1] 40 | [document2] 41 | ... 42 | [documentM] 43 | 44 | in which the first line is the total number for documents [M]. Each line 45 | after that is one document. [documenti] is the ith document of the dataset 46 | that consists of a list of Ni words/terms. 47 | 48 | [documenti] = [wordi1] [wordi2] ... [wordiNi] 49 | 50 | in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated 51 | by the blank character. 52 | """ 53 | def __init__(self, fname, id2word=None, line2words=split_on_space): 54 | """ 55 | Initialize the corpus from a file. 56 | 57 | `id2word` and `line2words` are optional parameters. 58 | If provided, `id2word` is a dictionary mapping between word_ids (integers) 59 | and words (strings). If not provided, the mapping is constructed from 60 | the documents. 61 | 62 | `line2words` is a function which converts lines into tokens. Defaults to 63 | simple splitting on spaces. 64 | """ 65 | IndexedCorpus.__init__(self, fname) 66 | logger.info("loading corpus from %s" % fname) 67 | 68 | self.fname = fname # input file, see class doc for format 69 | self.line2words = line2words # how to translate lines into words (simply split on space by default) 70 | self.num_docs = self._calculate_num_docs() 71 | 72 | if not id2word: 73 | # build a list of all word types in the corpus (distinct words) 74 | logger.info("extracting vocabulary from the corpus") 75 | all_terms = set() 76 | self.use_wordids = False # return documents as (word, wordCount) 2-tuples 77 | for doc in self: 78 | all_terms.update(word for word, wordCnt in doc) 79 | all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id 80 | self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) 81 | else: 82 | logger.info("using provided word mapping (%i ids)" % len(id2word)) 83 | self.id2word = id2word 84 | self.word2id = dict((v, k) for k, v in iteritems(self.id2word)) 85 | self.num_terms = len(self.word2id) 86 | self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples 87 | 88 | logger.info("loaded corpus with %i documents and %i terms from %s" % 89 | (self.num_docs, self.num_terms, fname)) 90 | 91 | def _calculate_num_docs(self): 92 | # the first line in input data is the number of documents (integer). throws exception on bad input. 
93 | with utils.smart_open(self.fname) as fin: 94 | try: 95 | result = int(next(fin)) 96 | except StopIteration: 97 | result = 0 98 | 99 | return result 100 | 101 | def __len__(self): 102 | return self.num_docs 103 | 104 | def line2doc(self, line): 105 | words = self.line2words(line) 106 | 107 | if self.use_wordids: 108 | # get all distinct terms in this document, ignore unknown words 109 | uniq_words = set(words).intersection(iterkeys(self.word2id)) 110 | 111 | # the following creates a unique list of words *in the same order* 112 | # as they were in the input. when iterating over the documents, 113 | # the (word, count) pairs will appear in the same order as they 114 | # were in the input (bar duplicates), which looks better. 115 | # if this was not needed, we might as well have used useWords = set(words) 116 | use_words, marker = [], set() 117 | for word in words: 118 | if (word in uniq_words) and (word not in marker): 119 | use_words.append(word) 120 | marker.add(word) 121 | # construct a list of (wordIndex, wordFrequency) 2-tuples 122 | doc = list(zip(map(self.word2id.get, use_words), 123 | map(words.count, use_words))) 124 | else: 125 | uniq_words = set(words) 126 | # construct a list of (word, wordFrequency) 2-tuples 127 | doc = list(zip(uniq_words, map(words.count, uniq_words))) 128 | 129 | # return the document, then forget it and move on to the next one 130 | # note that this way, only one doc is stored in memory at a time, not the whole corpus 131 | return doc 132 | 133 | def __iter__(self): 134 | """ 135 | Iterate over the corpus, returning one bag-of-words vector at a time. 136 | """ 137 | with utils.smart_open(self.fname) as fin: 138 | for lineno, line in enumerate(fin): 139 | if lineno > 0: # ignore the first line = number of documents 140 | yield self.line2doc(line) 141 | 142 | @staticmethod 143 | def save_corpus(fname, corpus, id2word=None, metadata=False): 144 | """ 145 | Save a corpus in the List-of-words format. 146 | 147 | This function is automatically called by `LowCorpus.serialize`; don't 148 | call it directly, call `serialize` instead. 149 | """ 150 | if id2word is None: 151 | logger.info("no word id mapping provided; initializing from corpus") 152 | id2word = utils.dict_from_corpus(corpus) 153 | 154 | logger.info("storing corpus in List-Of-Words format into %s" % fname) 155 | truncated = 0 156 | offsets = [] 157 | with utils.smart_open(fname, 'wb') as fout: 158 | fout.write(utils.to_utf8('%i\n' % len(corpus))) 159 | for doc in corpus: 160 | words = [] 161 | for wordid, value in doc: 162 | if abs(int(value) - value) > 1e-6: 163 | truncated += 1 164 | words.extend([utils.to_unicode(id2word[wordid])] * int(value)) 165 | offsets.append(fout.tell()) 166 | fout.write(utils.to_utf8('%s\n' % ' '.join(words))) 167 | 168 | if truncated: 169 | logger.warning("List-of-words format can only save vectors with " 170 | "integer elements; %i float entries were truncated to integer value" % 171 | truncated) 172 | return offsets 173 | 174 | def docbyoffset(self, offset): 175 | """ 176 | Return the document stored at file position `offset`. 
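A hedged sketch (the path is hypothetical; a matching '/tmp/corpus.low.index' file, as written
by `serialize`, is assumed to exist):

>>> corpus = LowCorpus('/tmp/corpus.low')
>>> corpus.docbyoffset(corpus.index[0]) == corpus[0]
True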
177 | """ 178 | with utils.smart_open(self.fname) as f: 179 | f.seek(offset) 180 | return self.line2doc(f.readline()) 181 | 182 | # endclass LowCorpus 183 | -------------------------------------------------------------------------------- /gensim/corpora/lowcorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/lowcorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/malletcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | Corpus in Mallet format of List-Of-Words. 8 | """ 9 | 10 | from __future__ import with_statement 11 | 12 | import logging 13 | 14 | from gensim import utils 15 | from gensim.corpora import LowCorpus 16 | 17 | 18 | logger = logging.getLogger('gensim.corpora.malletcorpus') 19 | 20 | 21 | class MalletCorpus(LowCorpus): 22 | """ 23 | Quoting http://mallet.cs.umass.edu/import.php: 24 | 25 | One file, one instance per line 26 | Assume the data is in the following format: 27 | 28 | [URL] [language] [text of the page...] 29 | 30 | Or, more generally, 31 | [document #1 id] [label] [text of the document...] 32 | [document #2 id] [label] [text of the document...] 33 | ... 34 | [document #N id] [label] [text of the document...] 35 | 36 | Note that language/label is *not* considered in Gensim. 37 | 38 | """ 39 | def __init__(self, fname, id2word=None, metadata=False): 40 | self.metadata = metadata 41 | LowCorpus.__init__(self, fname, id2word) 42 | 43 | def _calculate_num_docs(self): 44 | with utils.smart_open(self.fname) as fin: 45 | result = sum([1 for x in fin]) 46 | return result 47 | 48 | def __iter__(self): 49 | """ 50 | Iterate over the corpus at the given filename. 51 | 52 | Yields a bag-of-words, a.k.a list of tuples of (word id, word count), based on the given id2word dictionary. 53 | """ 54 | with utils.smart_open(self.fname) as f: 55 | for line in f: 56 | yield self.line2doc(line) 57 | 58 | def line2doc(self, line): 59 | l = [word for word in utils.to_unicode(line).strip().split(' ') if word] 60 | docid, doclang, words = l[0], l[1], l[2:] 61 | 62 | doc = super(MalletCorpus, self).line2doc(' '.join(words)) 63 | 64 | if self.metadata: 65 | return doc, (docid, doclang) 66 | else: 67 | return doc 68 | 69 | @staticmethod 70 | def save_corpus(fname, corpus, id2word=None, metadata=False): 71 | """ 72 | Save a corpus in the Mallet format. 73 | 74 | The document id will be generated by enumerating the corpus. 75 | That is, it will range between 0 and number of documents in the corpus. 76 | 77 | Since Mallet has a language field in the format, this defaults to the string '__unknown__'. 78 | If the language needs to be saved, post-processing will be required. 79 | 80 | This function is automatically called by `MalletCorpus.serialize`; don't 81 | call it directly, call `serialize` instead. 
82 | 83 | """ 84 | if id2word is None: 85 | logger.info("no word id mapping provided; initializing from corpus") 86 | id2word = utils.dict_from_corpus(corpus) 87 | 88 | logger.info("storing corpus in Mallet format into %s" % fname) 89 | 90 | truncated = 0 91 | offsets = [] 92 | with utils.smart_open(fname, 'wb') as fout: 93 | for doc_id, doc in enumerate(corpus): 94 | if metadata: 95 | doc_id, doc_lang = doc[1] 96 | doc = doc[0] 97 | else: 98 | doc_lang = '__unknown__' 99 | 100 | words = [] 101 | for wordid, value in doc: 102 | if abs(int(value) - value) > 1e-6: 103 | truncated += 1 104 | words.extend([utils.to_unicode(id2word[wordid])] * int(value)) 105 | offsets.append(fout.tell()) 106 | fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words)))) 107 | 108 | if truncated: 109 | logger.warning("Mallet format can only save vectors with " 110 | "integer elements; %i float entries were truncated to integer value" % 111 | truncated) 112 | 113 | return offsets 114 | 115 | def docbyoffset(self, offset): 116 | """ 117 | Return the document stored at file position `offset`. 118 | """ 119 | with utils.smart_open(self.fname) as f: 120 | f.seek(offset) 121 | return self.line2doc(f.readline()) 122 | 123 | # endclass MalletCorpus 124 | -------------------------------------------------------------------------------- /gensim/corpora/malletcorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/malletcorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/mmcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Corpus in the Matrix Market format. 10 | """ 11 | 12 | 13 | import logging 14 | 15 | from gensim import interfaces, matutils 16 | from gensim.corpora import IndexedCorpus 17 | 18 | 19 | logger = logging.getLogger('gensim.corpora.mmcorpus') 20 | 21 | 22 | class MmCorpus(matutils.MmReader, IndexedCorpus): 23 | """ 24 | Corpus in the Matrix Market format. 25 | """ 26 | def __init__(self, fname): 27 | # avoid calling super(), too confusing 28 | IndexedCorpus.__init__(self, fname) 29 | matutils.MmReader.__init__(self, fname) 30 | 31 | def __iter__(self): 32 | """ 33 | Interpret a matrix in Matrix Market format as a streamed gensim corpus 34 | (yielding one document at a time). 35 | """ 36 | for doc_id, doc in super(MmCorpus, self).__iter__(): 37 | yield doc # get rid of doc id, return the sparse vector only 38 | 39 | @staticmethod 40 | def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): 41 | """ 42 | Save a corpus in the Matrix Market format to disk. 43 | 44 | This function is automatically called by `MmCorpus.serialize`; don't 45 | call it directly, call `serialize` instead. 
46 | """ 47 | logger.info("storing corpus in Matrix Market format to %s" % fname) 48 | num_terms = len(id2word) if id2word is not None else None 49 | return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata) 50 | 51 | # endclass MmCorpus 52 | -------------------------------------------------------------------------------- /gensim/corpora/mmcorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/mmcorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/svmlightcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Corpus in SVMlight format. 10 | """ 11 | 12 | 13 | from __future__ import with_statement 14 | 15 | import logging 16 | 17 | from gensim import utils 18 | from gensim.corpora import IndexedCorpus 19 | 20 | 21 | logger = logging.getLogger('gensim.corpora.svmlightcorpus') 22 | 23 | 24 | class SvmLightCorpus(IndexedCorpus): 25 | """ 26 | Corpus in SVMlight format. 27 | 28 | Quoting http://svmlight.joachims.org/: 29 | The input file contains the training examples. The first lines 30 | may contain comments and are ignored if they start with #. Each of the following 31 | lines represents one training example and is of the following format:: 32 | 33 | .=. : : ... : # 34 | .=. +1 | -1 | 0 | 35 | .=. | "qid" 36 | .=. 37 | .=. 38 | 39 | The "qid" feature (used for SVMlight ranking), if present, is ignored. 40 | 41 | Although not mentioned in the specification above, SVMlight also expect its 42 | feature ids to be 1-based (counting starts at 1). We convert features to 0-base 43 | internally by decrementing all ids when loading a SVMlight input file, and 44 | increment them again when saving as SVMlight. 45 | 46 | """ 47 | 48 | def __init__(self, fname, store_labels=True): 49 | """ 50 | Initialize the corpus from a file. 51 | 52 | Although vector labels (~SVM target class) are not used in gensim in any way, 53 | they are parsed and stored in `self.labels` for convenience. Set `store_labels=False` 54 | to skip storing these labels (e.g. if there are too many vectors to store 55 | the self.labels array in memory). 56 | 57 | """ 58 | IndexedCorpus.__init__(self, fname) 59 | logger.info("loading corpus from %s" % fname) 60 | 61 | self.fname = fname # input file, see class doc for format 62 | self.length = None 63 | self.store_labels = store_labels 64 | self.labels = [] 65 | 66 | def __iter__(self): 67 | """ 68 | Iterate over the corpus, returning one sparse vector at a time. 69 | """ 70 | lineno = -1 71 | self.labels = [] 72 | with utils.smart_open(self.fname) as fin: 73 | for lineno, line in enumerate(fin): 74 | doc = self.line2doc(line) 75 | if doc is not None: 76 | if self.store_labels: 77 | self.labels.append(doc[1]) 78 | yield doc[0] 79 | self.length = lineno + 1 80 | 81 | @staticmethod 82 | def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): 83 | """ 84 | Save a corpus in the SVMlight format. 85 | 86 | The SVMlight `` class tag is taken from the `labels` array, or set 87 | to 0 for all documents if `labels` is not supplied. 
88 | 89 | This function is automatically called by `SvmLightCorpus.serialize`; don't 90 | call it directly, call `serialize` instead. 91 | """ 92 | logger.info("converting corpus to SVMlight format: %s" % fname) 93 | 94 | offsets = [] 95 | with utils.smart_open(fname, 'wb') as fout: 96 | for docno, doc in enumerate(corpus): 97 | label = labels[docno] if labels else 0 # target class is 0 by default 98 | offsets.append(fout.tell()) 99 | fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label))) 100 | return offsets 101 | 102 | def docbyoffset(self, offset): 103 | """ 104 | Return the document stored at file position `offset`. 105 | """ 106 | with utils.smart_open(self.fname) as f: 107 | f.seek(offset) 108 | return self.line2doc(f.readline())[0] 109 | 110 | def line2doc(self, line): 111 | """ 112 | Create a document from a single line (string) in SVMlight format 113 | """ 114 | line = utils.to_unicode(line) 115 | line = line[: line.find('#')].strip() 116 | if not line: 117 | return None # ignore comments and empty lines 118 | parts = line.split() 119 | if not parts: 120 | raise ValueError('invalid line format in %s' % self.fname) 121 | target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]] 122 | doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based 123 | return doc, target 124 | 125 | @staticmethod 126 | def doc2line(doc, label=0): 127 | """ 128 | Output the document in SVMlight format, as a string. Inverse function to `line2doc`. 129 | """ 130 | pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base 131 | return "%s %s\n" % (label, pairs) 132 | 133 | # endclass SvmLightCorpus 134 | -------------------------------------------------------------------------------- /gensim/corpora/svmlightcorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/svmlightcorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/textcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | Text corpora usually reside on disk, as text files in one format or another 8 | In a common scenario, we need to build a dictionary (a `word->integer id` 9 | mapping), which is then used to construct sparse bag-of-word vectors 10 | (= sequences of `(word_id, word_weight)` 2-tuples). 11 | 12 | This module provides some code scaffolding to simplify this pipeline. For 13 | example, given a corpus where each document is a separate line in file on disk, 14 | you would override the `TextCorpus.get_texts` method to read one line=document 15 | at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence 16 | of words. 17 | 18 | Overriding `get_texts` is enough; you can then initialize the corpus with e.g. 19 | `MyTextCorpus(bz2.BZ2File('mycorpus.txt.bz2'))` and it will behave correctly like a 20 | corpus of sparse vectors. The `__iter__` methods is automatically set up, and 21 | dictionary is automatically populated with all `word->id` mappings. 
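A minimal sketch of such a subclass (the class and file names are hypothetical)::

    from gensim import utils
    from gensim.corpora.textcorpus import TextCorpus

    class MyTextCorpus(TextCorpus):
        def get_texts(self):
            with self.getstream() as lines:
                for line in lines:
                    yield utils.tokenize(line, lowercase=True)

    corpus = MyTextCorpus('mycorpus.txt')  # now a streamed corpus of sparse bag-of-words vectors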
22 | 23 | The resulting object can be used as input to all gensim models (TFIDF, LSI, ...), 24 | serialized with any format (Matrix Market, SvmLight, Blei's LDA-C format etc). 25 | 26 | See the `gensim.test.test_miislita.CorpusMiislita` class for a simple example. 27 | """ 28 | 29 | 30 | from __future__ import with_statement 31 | 32 | import logging 33 | 34 | from gensim import interfaces, utils 35 | from six import string_types 36 | from gensim.corpora.dictionary import Dictionary 37 | 38 | logger = logging.getLogger('gensim.corpora.textcorpus') 39 | 40 | 41 | class TextCorpus(interfaces.CorpusABC): 42 | """ 43 | Helper class to simplify the pipeline of getting bag-of-words vectors (= a 44 | gensim corpus) from plain text. 45 | 46 | This is an abstract base class: override the `get_texts()` method to match 47 | your particular input. 48 | 49 | Given a filename (or a file-like object) in constructor, the corpus object 50 | will be automatically initialized with a dictionary in `self.dictionary` and 51 | will support the `iter` corpus method. You must only provide a correct `get_texts` 52 | implementation. 53 | 54 | """ 55 | def __init__(self, input=None): 56 | super(TextCorpus, self).__init__() 57 | self.input = input 58 | self.dictionary = Dictionary() 59 | self.metadata = False 60 | if input is not None: 61 | self.dictionary.add_documents(self.get_texts()) 62 | else: 63 | logger.warning("No input document stream provided; assuming " 64 | "dictionary will be initialized some other way.") 65 | 66 | def __iter__(self): 67 | """ 68 | The function that defines a corpus. 69 | 70 | Iterating over the corpus must yield sparse vectors, one for each document. 71 | """ 72 | for text in self.get_texts(): 73 | if self.metadata: 74 | yield self.dictionary.doc2bow(text[0], allow_update=False), text[1] 75 | else: 76 | yield self.dictionary.doc2bow(text, allow_update=False) 77 | 78 | def getstream(self): 79 | return utils.file_or_filename(self.input) 80 | 81 | def get_texts(self): 82 | """ 83 | Iterate over the collection, yielding one document at a time. A document 84 | is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`. 85 | 86 | Override this function to match your input (parse input files, do any 87 | text preprocessing, lowercasing, tokenizing etc.). There will be no further 88 | preprocessing of the words coming out of this function. 89 | """ 90 | # Instead of raising NotImplementedError, let's provide a sample implementation: 91 | # assume documents are lines in a single file (one document per line). 92 | # Yield each document as a list of lowercase tokens, via `utils.tokenize`. 
93 | lineno = -1 94 | with self.getstream() as lines: 95 | for lineno, line in enumerate(lines): 96 | if self.metadata: 97 | yield utils.tokenize(line, lowercase=True), (lineno,) 98 | else: 99 | yield utils.tokenize(line, lowercase=True) 100 | self.length = lineno + 1 # will be 0 if loop never executes 101 | 102 | def __len__(self): 103 | return self.length # will throw if corpus not initialized 104 | 105 | # endclass TextCorpus 106 | -------------------------------------------------------------------------------- /gensim/corpora/textcorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/textcorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/ucicorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2012 Jonathan Esterhazy 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | University of California, Irvine (UCI) Bag-of-Words format. 10 | 11 | http://archive.ics.uci.edu/ml/datasets/Bag+of+Words 12 | """ 13 | 14 | from __future__ import with_statement 15 | 16 | import logging 17 | from collections import defaultdict 18 | 19 | from gensim import utils 20 | from gensim.corpora import Dictionary 21 | from gensim.corpora import IndexedCorpus 22 | from gensim.matutils import MmReader 23 | from gensim.matutils import MmWriter 24 | from six import iteritems, string_types 25 | from six.moves import xrange 26 | 27 | 28 | logger = logging.getLogger('gensim.corpora.ucicorpus') 29 | 30 | 31 | class UciReader(MmReader): 32 | def __init__(self, input): 33 | """ 34 | Initialize the reader. 35 | 36 | The `input` parameter refers to a file on the local filesystem, 37 | which is expected to be in the UCI Bag-of-Words format. 38 | """ 39 | 40 | logger.info('Initializing corpus reader from %s' % input) 41 | 42 | self.input = input 43 | 44 | with utils.smart_open(self.input) as fin: 45 | self.num_docs = self.num_terms = self.num_nnz = 0 46 | try: 47 | self.num_docs = int(next(fin).strip()) 48 | self.num_terms = int(next(fin).strip()) 49 | self.num_nnz = int(next(fin).strip()) 50 | except StopIteration: 51 | pass 52 | 53 | logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' % 54 | (self.num_docs, self.num_terms, self.num_nnz)) 55 | 56 | def skip_headers(self, input_file): 57 | for lineno, _ in enumerate(input_file): 58 | if lineno == 2: 59 | break 60 | 61 | # endclass UciReader 62 | 63 | 64 | class UciWriter(MmWriter): 65 | """ 66 | Store a corpus in UCI Bag-of-Words format. 67 | 68 | This corpus format is identical to MM format, except for 69 | different file headers. There is no format line, and the first 70 | three lines of the file contain number_docs, num_terms, and num_nnz, 71 | one value per line. 72 | 73 | This implementation is based on matutils.MmWriter, and works the same way. 74 | 75 | """ 76 | MAX_HEADER_LENGTH = 20 # reserve 20 bytes per header value 77 | FAKE_HEADER = utils.to_utf8(' ' * MAX_HEADER_LENGTH + '\n') 78 | 79 | def write_headers(self): 80 | """ 81 | Write blank header lines. Will be updated later, once corpus stats are known. 
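For illustration, a (toy) finished file could look like this: three header values, one per
line, followed by `docID wordID count` triplets (in the real file each header value sits at
the start of a fixed 20-character, space-padded field)::

    3
    5
    4
    1 1 2
    1 3 1
    2 2 4
    3 5 1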
82 | """ 83 | for _ in range(3): 84 | self.fout.write(self.FAKE_HEADER) 85 | 86 | self.last_docno = -1 87 | self.headers_written = True 88 | 89 | def update_headers(self, num_docs, num_terms, num_nnz): 90 | """ 91 | Update headers with actual values. 92 | """ 93 | offset = 0 94 | values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]] 95 | 96 | for value in values: 97 | if len(value) > len(self.FAKE_HEADER): 98 | raise ValueError('Invalid header: value too large!') 99 | self.fout.seek(offset) 100 | self.fout.write(value) 101 | offset += len(self.FAKE_HEADER) 102 | 103 | @staticmethod 104 | def write_corpus(fname, corpus, progress_cnt=1000, index=False): 105 | writer = UciWriter(fname) 106 | writer.write_headers() 107 | 108 | num_terms, num_nnz = 0, 0 109 | docno, poslast = -1, -1 110 | offsets = [] 111 | for docno, bow in enumerate(corpus): 112 | if docno % progress_cnt == 0: 113 | logger.info("PROGRESS: saving document #%i" % docno) 114 | if index: 115 | posnow = writer.fout.tell() 116 | if posnow == poslast: 117 | offsets[-1] = -1 118 | offsets.append(posnow) 119 | poslast = posnow 120 | 121 | vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights 122 | max_id, veclen = writer.write_vector(docno, vector) 123 | num_terms = max(num_terms, 1 + max_id) 124 | num_nnz += veclen 125 | num_docs = docno + 1 126 | 127 | if num_docs * num_terms != 0: 128 | logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % 129 | (num_docs, num_terms, 130 | 100.0 * num_nnz / (num_docs * num_terms), 131 | num_nnz, 132 | num_docs * num_terms)) 133 | 134 | # now write proper headers, by seeking and overwriting the spaces written earlier 135 | writer.update_headers(num_docs, num_terms, num_nnz) 136 | 137 | writer.close() 138 | if index: 139 | return offsets 140 | 141 | # endclass UciWriter 142 | 143 | 144 | class UciCorpus(UciReader, IndexedCorpus): 145 | """ 146 | Corpus in the UCI bag-of-words format. 147 | """ 148 | def __init__(self, fname, fname_vocab=None): 149 | IndexedCorpus.__init__(self, fname) 150 | UciReader.__init__(self, fname) 151 | 152 | if fname_vocab is None: 153 | fname_vocab = fname + '.vocab' 154 | 155 | self.fname = fname 156 | with utils.smart_open(fname_vocab) as fin: 157 | words = [word.strip() for word in fin] 158 | self.id2word = dict(enumerate(words)) 159 | 160 | self.transposed = True 161 | 162 | def __iter__(self): 163 | """ 164 | Interpret a matrix in UCI bag-of-words format as a streamed gensim corpus 165 | (yielding one document at a time). 166 | """ 167 | for docId, doc in super(UciCorpus, self).__iter__(): 168 | yield doc # get rid of docId, return the sparse vector only 169 | 170 | def create_dictionary(self): 171 | """ 172 | Utility method to generate gensim-style Dictionary directly from 173 | the corpus and vocabulary data. 
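Example (the file names follow the UCI "Bag of Words" distribution; adjust them to your
own data):

>>> corpus = UciCorpus('docword.enron.txt', 'vocab.enron.txt')
>>> dictionary = corpus.create_dictionary()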
174 | """ 175 | dictionary = Dictionary() 176 | 177 | # replace dfs with defaultdict to avoid downstream KeyErrors 178 | # uci vocabularies may contain terms that are not used in the document data 179 | dictionary.dfs = defaultdict(int) 180 | 181 | dictionary.id2token = self.id2word 182 | dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word)) 183 | 184 | dictionary.num_docs = self.num_docs 185 | dictionary.num_nnz = self.num_nnz 186 | 187 | for docno, doc in enumerate(self): 188 | if docno % 10000 == 0: 189 | logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs)) 190 | 191 | for word, count in doc: 192 | dictionary.dfs[word] += 1 193 | dictionary.num_pos += count 194 | 195 | return dictionary 196 | 197 | @staticmethod 198 | def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): 199 | """ 200 | Save a corpus in the UCI Bag-of-Words format. 201 | 202 | There are actually two files saved: `fname` and `fname.vocab`, where 203 | `fname.vocab` is the vocabulary file. 204 | 205 | This function is automatically called by `UciCorpus.serialize`; don't 206 | call it directly, call `serialize` instead. 207 | """ 208 | if id2word is None: 209 | logger.info("no word id mapping provided; initializing from corpus") 210 | id2word = utils.dict_from_corpus(corpus) 211 | num_terms = len(id2word) 212 | else: 213 | num_terms = 1 + max([-1] + id2word.keys()) 214 | 215 | # write out vocabulary 216 | fname_vocab = fname + '.vocab' 217 | logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab)) 218 | with utils.smart_open(fname_vocab, 'wb') as fout: 219 | for featureid in xrange(num_terms): 220 | fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) 221 | 222 | logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname) 223 | 224 | return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt) 225 | 226 | # endclass UciCorpus 227 | -------------------------------------------------------------------------------- /gensim/corpora/ucicorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/ucicorpus.pyc -------------------------------------------------------------------------------- /gensim/corpora/wikicorpus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/wikicorpus.pyc -------------------------------------------------------------------------------- /gensim/examples/dmlcz/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/dmlcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Corpus for the DML-CZ project. 
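Typical usage is driven by a `DmlConfig` plus one or more article sources (a sketch with
made-up paths; see `gensim_build.py` for the full pipeline):

>>> config = DmlConfig('dmlcz_any', resultDir='/tmp/results', acceptLangs=['any'])
>>> config.addSource(sources.DmlSource('numdam', '/data/numdam'))
>>> corpus = DmlCorpus()
>>> corpus.processConfig(config)
>>> corpus.buildDictionary()
>>> corpus.saveAsText()  # word id mappings + bag-of-words matrix, as text files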
10 | """ 11 | 12 | 13 | import logging 14 | import itertools 15 | import os.path 16 | 17 | from gensim import interfaces, matutils 18 | import dictionary # for constructing word->id mappings 19 | 20 | 21 | logger = logging.getLogger('gensim.corpora.dmlcorpus') 22 | 23 | 24 | class DmlConfig(object): 25 | """ 26 | DmlConfig contains parameters necessary for the abstraction of a 'corpus of 27 | articles' (see the `DmlCorpus` class). 28 | 29 | Articles may come from different sources (=different locations on disk/network, 30 | different file formats etc.), so the main purpose of DmlConfig is to keep all 31 | sources in one place. 32 | 33 | Apart from glueing sources together, DmlConfig also decides where to store 34 | output files and which articles to accept for the corpus (= an additional filter 35 | over the sources). 36 | """ 37 | def __init__(self, configId, resultDir, acceptLangs = None): 38 | self.resultDir = resultDir # output files will be stored in this directory 39 | self.configId = configId # configId is a string that is used as filename prefix for all files, so keep it simple 40 | self.sources = {} # all article sources; see sources.DmlSource class for an example of source 41 | 42 | if acceptLangs is None: # which languages to accept 43 | acceptLangs = set(['any']) # if not specified, accept all languages (including unknown/unspecified) 44 | self.acceptLangs = set(acceptLangs) 45 | logger.info('initialized %s' % self) 46 | 47 | 48 | def resultFile(self, fname): 49 | return os.path.join(self.resultDir, self.configId + '_' + fname) 50 | 51 | 52 | def acceptArticle(self, metadata): 53 | lang = metadata.get('language', 'unk') # if there was no language field in the article metadata, set language to 'unk' = unknown 54 | if 'any' not in self.acceptLangs and lang not in self.acceptLangs: 55 | return False 56 | return True 57 | 58 | 59 | def addSource(self, source): 60 | sourceId = str(source) 61 | assert sourceId not in self.sources, "source %s already present in the config!" % sourceId 62 | self.sources[sourceId] = source 63 | 64 | 65 | def __str__(self): 66 | return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" % 67 | (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs))) 68 | #endclass DmlConfig 69 | 70 | 71 | 72 | class DmlCorpus(interfaces.CorpusABC): 73 | """ 74 | DmlCorpus implements a collection of articles. It is initialized via a DmlConfig 75 | object, which holds information about where to look for the articles and how 76 | to process them. 77 | 78 | Apart from being a regular corpus (bag-of-words iterable with a `len()` method), 79 | DmlCorpus has methods for building a dictionary (mapping between words and 80 | their ids). 81 | """ 82 | def __init__(self): 83 | self.documents = [] 84 | self.config = None 85 | self.dictionary = dictionary.Dictionary() 86 | 87 | 88 | def __len__(self): 89 | return len(self.documents) 90 | 91 | 92 | def __iter__(self): 93 | """ 94 | The function that defines a corpus -- iterating over the corpus yields 95 | bag-of-words vectors, one for each document. 96 | 97 | A bag-of-words vector is simply a list of ``(tokenId, tokenCount)`` 2-tuples. 
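For example, a short article might come out as ``[(0, 1), (5, 2), (421, 1)]``, meaning
token id 0 appears once, token id 5 twice and token id 421 once (the ids are made up
for illustration).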
98 | """ 99 | for docNo, (sourceId, docUri) in enumerate(self.documents): 100 | source = self.config.sources[sourceId] 101 | 102 | contents = source.getContent(docUri) 103 | words = [source.normalizeWord(word) for word in source.tokenize(contents)] 104 | yield self.dictionary.doc2bow(words, allowUpdate = False) 105 | 106 | 107 | def buildDictionary(self): 108 | """ 109 | Populate dictionary mapping and statistics. 110 | 111 | This is done by sequentially retrieving the article fulltexts, splitting 112 | them into tokens and converting tokens to their ids (creating new ids as 113 | necessary). 114 | """ 115 | logger.info("creating dictionary from %i articles" % len(self.documents)) 116 | self.dictionary = dictionary.Dictionary() 117 | numPositions = 0 118 | for docNo, (sourceId, docUri) in enumerate(self.documents): 119 | if docNo % 1000 == 0: 120 | logger.info("PROGRESS: at document #%i/%i (%s, %s)" % 121 | (docNo, len(self.documents), sourceId, docUri)) 122 | source = self.config.sources[sourceId] 123 | contents = source.getContent(docUri) 124 | words = [source.normalizeWord(word) for word in source.tokenize(contents)] 125 | numPositions += len(words) 126 | 127 | # convert to bag-of-words, but ignore the result -- here we only care about updating token ids 128 | _ = self.dictionary.doc2bow(words, allowUpdate = True) 129 | logger.info("built %s from %i documents (total %i corpus positions)" % 130 | (self.dictionary, len(self.documents), numPositions)) 131 | 132 | 133 | def processConfig(self, config, shuffle = False): 134 | """ 135 | Parse the directories specified in the config, looking for suitable articles. 136 | 137 | This updates the self.documents var, which keeps a list of (source id, 138 | article uri) 2-tuples. Each tuple is a unique identifier of one article. 139 | 140 | Note that some articles are ignored based on config settings (for example 141 | if the article's language doesn't match any language specified in the 142 | config etc.). 
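After a successful call, ``self.documents`` might look like
``[('dmlcz', (1234, 'some/path')), ('numdam', (1235, 'another/path')), ...]``
(made-up values; the exact form of each article URI depends on its source).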
143 | """ 144 | self.config = config 145 | self.documents = [] 146 | logger.info("processing config %s" % config) 147 | for sourceId, source in config.sources.iteritems(): 148 | logger.info("processing source '%s'" % sourceId) 149 | accepted = [] 150 | for articleUri in source.findArticles(): 151 | meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value) 152 | if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata 153 | accepted.append((sourceId, articleUri)) 154 | logger.info("accepted %i articles for source '%s'" % 155 | (len(accepted), sourceId)) 156 | self.documents.extend(accepted) 157 | 158 | if not self.documents: 159 | logger.warning('no articles at all found from the config; something went wrong!') 160 | 161 | if shuffle: 162 | logger.info("shuffling %i documents for random order" % len(self.documents)) 163 | import random 164 | random.shuffle(self.documents) 165 | 166 | logger.info("accepted total of %i articles for %s" % 167 | (len(self.documents), str(config))) 168 | 169 | 170 | def saveDictionary(self, fname): 171 | logger.info("saving dictionary mapping to %s" % fname) 172 | fout = open(fname, 'w') 173 | for tokenId, token in self.dictionary.id2token.iteritems(): 174 | fout.write("%i\t%s\n" % (tokenId, token)) 175 | fout.close() 176 | 177 | @staticmethod 178 | def loadDictionary(fname): 179 | result = {} 180 | for lineNo, line in enumerate(open(fname)): 181 | pair = line[:-1].split('\t') 182 | if len(pair) != 2: 183 | continue 184 | wordId, word = pair 185 | result[int(wordId)] = word 186 | return result 187 | 188 | def saveDocuments(self, fname): 189 | logger.info("saving documents mapping to %s" % fname) 190 | fout = open(fname, 'w') 191 | for docNo, docId in enumerate(self.documents): 192 | sourceId, docUri = docId 193 | intId, pathId = docUri 194 | fout.write("%i\t%s\n" % (docNo, repr(docId))) 195 | fout.close() 196 | 197 | 198 | def saveAsText(self): 199 | """ 200 | Store the corpus to disk, in a human-readable text format. 201 | 202 | This actually saves multiple files: 203 | 204 | 1. Pure document-term co-occurence frequency counts, as a Matrix Market file. 205 | 2. Token to integer mapping, as a text file. 206 | 3. Document to document URI mapping, as a text file. 207 | 208 | The exact filesystem paths and filenames are determined from the config. 209 | """ 210 | self.saveDictionary(self.config.resultFile('wordids.txt')) 211 | self.saveDocuments(self.config.resultFile('docids.txt')) 212 | matutils.MmWriter.writeCorpus(self.config.resultFile('bow.mm'), self) 213 | 214 | 215 | def articleDir(self, docNo): 216 | """ 217 | Return absolute normalized path on filesystem to article no. `docNo`. 218 | """ 219 | sourceId, (_, outPath) = self.documents[docNo] 220 | source = self.config.sources[sourceId] 221 | return os.path.join(source.baseDir, outPath) 222 | 223 | 224 | def getMeta(self, docNo): 225 | """ 226 | Return metadata for article no. `docNo`. 
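The metadata is a plain dictionary of `key -> value` pairs, e.g.
``{'language': 'eng', 'author': '...', 'title': '...'}`` (which keys are present
depends on the article source).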
227 | """ 228 | sourceId, uri = self.documents[docNo] 229 | source = self.config.sources[sourceId] 230 | return source.getMeta(uri) 231 | #endclass DmlCorpus 232 | 233 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/gensim_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (C) 2010 Radim Rehurek 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | USAGE: %(program)s LANGUAGE 8 | Process the repository, accepting articles in LANGUAGE (or 'any'). 9 | Store the word co-occurence matrix and id mappings, which are needed for subsequent processing. 10 | 11 | Example: ./gensim_build.py eng 12 | """ 13 | 14 | 15 | import logging 16 | import sys 17 | import os.path 18 | import re 19 | 20 | 21 | from gensim.corpora import sources, dmlcorpus 22 | 23 | 24 | PREFIX = 'dmlcz' 25 | 26 | AT_HOME = False 27 | 28 | if AT_HOME: 29 | SOURCE_LIST = [ 30 | sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'), 31 | sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'), 32 | sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'), 33 | ] 34 | 35 | # SOURCE_LIST = [ 36 | # sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'), 37 | # ] 38 | 39 | RESULT_DIR = '/Users/kofola/workspace/dml/data/results' 40 | 41 | else: 42 | 43 | SOURCE_LIST = [ 44 | sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), 45 | sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), 46 | sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), 47 | ] 48 | 49 | RESULT_DIR = '/data/dmlcz/xrehurek/results' 50 | 51 | 52 | def buildDmlCorpus(config): 53 | dml = dmlcorpus.DmlCorpus() 54 | dml.processConfig(config, shuffle = True) 55 | dml.buildDictionary() 56 | dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words 57 | 58 | dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) 59 | dml.saveAsText() # save id mappings and documents as text data (matrix market format) 60 | return dml 61 | 62 | 63 | if __name__ == '__main__': 64 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 65 | logging.root.setLevel(level=logging.INFO) 66 | logging.info("running %s" % ' '.join(sys.argv)) 67 | 68 | program = os.path.basename(sys.argv[0]) 69 | 70 | # check and process input arguments 71 | if len(sys.argv) < 2: 72 | print(globals()['__doc__'] % locals()) 73 | sys.exit(1) 74 | language = sys.argv[1] 75 | 76 | # construct the config, which holds information about sources, data file filenames etc. 77 | config = dmlcorpus.DmlConfig('%s_%s' % (PREFIX, language), resultDir=RESULT_DIR, acceptLangs=[language]) 78 | for source in SOURCE_LIST: 79 | config.addSource(source) 80 | buildDmlCorpus(config) 81 | 82 | logging.info("finished running %s" % program) 83 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/gensim_genmodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (C) 2010 Radim Rehurek 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | USAGE: %(program)s LANGUAGE METHOD 8 | Generate topic models for the specified subcorpus. METHOD is currently one \ 9 | of 'tfidf', 'lsi', 'lda', 'rp'. 
10 | 11 | Example: ./gensim_genmodel.py any lsi 12 | """ 13 | 14 | 15 | import logging 16 | import sys 17 | import os.path 18 | import re 19 | 20 | 21 | from gensim.corpora import sources, dmlcorpus, MmCorpus 22 | from gensim.models import lsimodel, ldamodel, tfidfmodel, rpmodel 23 | 24 | import gensim_build 25 | 26 | 27 | # internal method parameters 28 | DIM_RP = 300 # dimensionality for random projections 29 | DIM_LSI = 200 # for lantent semantic indexing 30 | DIM_LDA = 100 # for latent dirichlet allocation 31 | 32 | 33 | 34 | if __name__ == '__main__': 35 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 36 | logging.root.setLevel(level = logging.INFO) 37 | logging.info("running %s" % ' '.join(sys.argv)) 38 | 39 | program = os.path.basename(sys.argv[0]) 40 | 41 | # check and process input arguments 42 | if len(sys.argv) < 3: 43 | print(globals()['__doc__'] % locals()) 44 | sys.exit(1) 45 | language = sys.argv[1] 46 | method = sys.argv[2].strip().lower() 47 | 48 | logging.info("loading corpus mappings") 49 | config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), 50 | resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) 51 | 52 | logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) 53 | id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) 54 | logging.info("loaded %i word ids" % len(id2word)) 55 | 56 | corpus = MmCorpus(config.resultFile('bow.mm')) 57 | 58 | if method == 'tfidf': 59 | model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) 60 | model.save(config.resultFile('model_tfidf.pkl')) 61 | elif method == 'lda': 62 | model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA) 63 | model.save(config.resultFile('model_lda.pkl')) 64 | elif method == 'lsi': 65 | # first, transform word counts to tf-idf weights 66 | tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) 67 | # then find the transformation from tf-idf to latent space 68 | model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI) 69 | model.save(config.resultFile('model_lsi.pkl')) 70 | elif method == 'rp': 71 | # first, transform word counts to tf-idf weights 72 | tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) 73 | # then find the transformation from tf-idf to latent space 74 | model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP) 75 | model.save(config.resultFile('model_rp.pkl')) 76 | else: 77 | raise ValueError('unknown topic extraction method: %s' % repr(method)) 78 | 79 | MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus]) 80 | 81 | logging.info("finished running %s" % program) 82 | 83 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/gensim_xml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (C) 2010 Radim Rehurek 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | USAGE: %(program)s LANGUAGE METHOD 8 | Generate similar.xml files, using a previously built model for METHOD. 
9 | 10 | Example: ./gensim_xml.py eng lsi 11 | """ 12 | 13 | 14 | import logging 15 | import sys 16 | import os.path 17 | import re 18 | 19 | 20 | from gensim.corpora import sources, dmlcorpus, MmCorpus 21 | from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity 22 | 23 | import gensim_build 24 | 25 | 26 | # set to True to do everything EXCEPT actually writing out similar.xml files to disk. 27 | # similar.xml files are NOT written if DRY_RUN is true. 28 | DRY_RUN = False 29 | 30 | # how many 'most similar' documents to store in each similar.xml? 31 | MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored) 32 | MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit). 33 | 34 | # if there are no similar articles (after the pruning), do we still want to generate similar.xml? 35 | SAVE_EMPTY = True 36 | 37 | # xml template for similar articles 38 | ARTICLE = """ 39 |
40 | 41 | %(author)s 42 | 43 | %(title)s 44 | %(suffix)s 45 | 46 | 47 | 48 |
""" 49 | 50 | # template for the whole similar.xml file (will be filled with multiple ARTICLE instances) 51 | SIMILAR = """\ 52 | 53 | %s 54 | 55 | """ 56 | 57 | 58 | 59 | def generateSimilar(corpus, index, method): 60 | for docNo, topSims in enumerate(index): # for each document 61 | # store similarities to the following file 62 | outfile = os.path.join(corpus.articleDir(docNo), 'similar_%s.xml' % method) 63 | 64 | articles = [] # collect similars in this list 65 | for docNo2, score in topSims: # for each most similar article 66 | if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) 67 | source, (intId, pathId) = corpus.documents[docNo2] 68 | meta = corpus.getMeta(docNo2) 69 | suffix, author, title = '', meta.get('author', ''), meta.get('title', '') 70 | articles.append(ARTICLE % locals()) # add the similar article to output 71 | if len(articles) >= MAX_SIMILAR: 72 | break 73 | 74 | # now `articles` holds multiple strings in similar_*.xml format 75 | if SAVE_EMPTY or articles: 76 | output = ''.join(articles) # concat all similars to one string 77 | if not DRY_RUN: # only open output files for writing if DRY_RUN is false 78 | logging.info("generating %s (%i similars)" % (outfile, len(articles))) 79 | outfile = open(outfile, 'w') 80 | outfile.write(SIMILAR % output) # add xml headers and print to file 81 | outfile.close() 82 | else: 83 | logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output)) 84 | else: 85 | logging.debug("skipping %s (no similar found)" % outfile) 86 | 87 | 88 | 89 | if __name__ == '__main__': 90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 91 | logging.root.setLevel(level=logging.INFO) 92 | logging.info("running %s" % ' '.join(sys.argv)) 93 | 94 | program = os.path.basename(sys.argv[0]) 95 | 96 | # check and process input arguments 97 | if len(sys.argv) < 3: 98 | print(globals()['__doc__'] % locals()) 99 | sys.exit(1) 100 | language = sys.argv[1] 101 | method = sys.argv[2].strip().lower() 102 | 103 | logging.info("loading corpus mappings") 104 | config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), 105 | resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) 106 | 107 | logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) 108 | id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) 109 | logging.info("loaded %i word ids" % len(id2word)) 110 | 111 | corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) 112 | input = MmCorpus(config.resultFile('_%s.mm' % method)) 113 | assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) 114 | 115 | # initialize structure for similarity queries 116 | if method == 'lsi' or method == 'rp': # for these methods, use dense vectors 117 | index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms) 118 | else: 119 | index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1) 120 | 121 | index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) 122 | generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format 123 | 124 | logging.info("finished running %s" % program) 125 | 126 | -------------------------------------------------------------------------------- 
/gensim/examples/dmlcz/runall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # full path to gensim executables 4 | BIN_PATH=~/xrehurek/gensim/dmlcz 5 | 6 | # intermediate data will be stored to this dir 7 | RESULT_PATH=~/xrehurek/results 8 | 9 | # set python path, so that python can find and import gensim modules 10 | export PYTHONPATH=~/xrehurek:$PYTHONPATH 11 | 12 | # Language is set to 'any', meaning all articles are processed for similarity in 13 | # one go, regardless of their language. 14 | # Set language to 'eng', 'fre', 'rus' etc. to only process a specific subset of 15 | # articles (an article's language is determined from its metadata). 16 | language=any 17 | 18 | 19 | # ========== parse all article sources, build article co-occurence matrix ====== 20 | ${BIN_PATH}/gensim_build.py $language 2>&1 | tee ${RESULT_PATH}/gensim_build.log 21 | 22 | 23 | # ========== build transformation models ======================================= 24 | for method in tfidf rp; 25 | do 26 | ( ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) & 27 | done 28 | wait 29 | 30 | method=lsi 31 | ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log 32 | 33 | 34 | # =========== generate output xml files ======================================== 35 | # generate xml files for all methods at once, in parallel, to save time. 36 | # NOTE if out of memory, move tfidf out of the loop (tfidf uses a lot of memory here) 37 | for method in tfidf lsi rp; 38 | do 39 | ( ${BIN_PATH}/gensim_xml.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) & 40 | done 41 | wait 42 | -------------------------------------------------------------------------------- /gensim/interfaces.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | This module contains basic interfaces used throughout the whole gensim package. 9 | 10 | The interfaces are realized as abstract base classes (ie., some optional functionality 11 | is provided in the interface itself, so that the interfaces can be subclassed). 12 | """ 13 | 14 | from __future__ import with_statement 15 | 16 | import logging 17 | import itertools 18 | 19 | from gensim import utils, matutils 20 | from six.moves import xrange 21 | 22 | 23 | logger = logging.getLogger('gensim.interfaces') 24 | 25 | 26 | class CorpusABC(utils.SaveLoad): 27 | """ 28 | Interface (abstract base class) for corpora. A *corpus* is simply an iterable, 29 | where each iteration step yields one document: 30 | 31 | >>> for doc in corpus: 32 | >>> # do something with the doc... 33 | 34 | A document is a sequence of `(fieldId, fieldValue)` 2-tuples: 35 | 36 | >>> for attr_id, attr_value in doc: 37 | >>> # do something with the attribute 38 | 39 | Note that although a default :func:`len` method is provided, it is very inefficient 40 | (performs a linear scan through the corpus to determine its length). Wherever 41 | the corpus size is needed and known in advance (or at least doesn't change so 42 | that it can be cached), the :func:`len` method should be overridden. 43 | 44 | See the :mod:`gensim.corpora.svmlightcorpus` module for an example of a corpus. 
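The simplest possible corpus is a plain Python list of such documents (toy values):

>>> corpus = [[(0, 1.0), (1, 2.0)], [(1, 1.0), (2, 3.0)]]  # two documents
>>> for doc in corpus:
>>>     print(doc)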
45 | 46 | Saving the corpus with the `save` method (inherited from `utils.SaveLoad`) will 47 | only store the *in-memory* (binary, pickled) object representation=the stream 48 | state, and **not** the documents themselves. See the `save_corpus` static method 49 | for serializing the actual stream content. 50 | """ 51 | def __iter__(self): 52 | """ 53 | Iterate over the corpus, yielding one document at a time. 54 | """ 55 | raise NotImplementedError('cannot instantiate abstract base class') 56 | 57 | 58 | def save(self, *args, **kwargs): 59 | import warnings 60 | warnings.warn("corpus.save() stores only the (tiny) iteration object; " 61 | "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)") 62 | super(CorpusABC, self).save(*args, **kwargs) 63 | 64 | def __len__(self): 65 | """ 66 | Return the number of documents in the corpus. 67 | 68 | This method is just the least common denominator and should really be 69 | overridden when possible. 70 | """ 71 | raise NotImplementedError("must override __len__() before calling len(corpus)") 72 | # logger.warning("performing full corpus scan to determine its length; was this intended?") 73 | # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus 74 | 75 | @staticmethod 76 | def save_corpus(fname, corpus, id2word=None, metadata=False): 77 | """ 78 | Save an existing `corpus` to disk. 79 | 80 | Some formats also support saving the dictionary (`feature_id->word` mapping), 81 | which can in this case be provided by the optional `id2word` parameter. 82 | 83 | >>> MmCorpus.save_corpus('file.mm', corpus) 84 | 85 | Some corpora also support an index of where each document begins, so 86 | that the documents on disk can be accessed in O(1) time (see the 87 | `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically 88 | called internally by `serialize`, which does `save_corpus` plus saves the index 89 | at the same time, so you want to store the corpus with:: 90 | 91 | >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents 92 | 93 | Calling `serialize()` is preferred to calling `save_corpus()`. 94 | 95 | """ 96 | raise NotImplementedError('cannot instantiate abstract base class') 97 | 98 | # example code: 99 | logger.info("converting corpus to ??? format: %s" % fname) 100 | with utils.smart_open(fname, 'wb') as fout: 101 | for doc in corpus: # iterate over the document stream 102 | fmt = str(doc) # format the document appropriately... 103 | fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk 104 | #endclass CorpusABC 105 | 106 | 107 | class TransformedCorpus(CorpusABC): 108 | def __init__(self, obj, corpus, chunksize=None): 109 | self.obj, self.corpus, self.chunksize = obj, corpus, chunksize 110 | self.metadata = False 111 | 112 | def __len__(self): 113 | return len(self.corpus) 114 | 115 | def __iter__(self): 116 | if self.chunksize: 117 | for chunk in utils.grouper(self.corpus, self.chunksize): 118 | for transformed in self.obj.__getitem__(chunk, chunksize=None): 119 | yield transformed 120 | else: 121 | for doc in self.corpus: 122 | yield self.obj[doc] 123 | #endclass TransformedCorpus 124 | 125 | 126 | class TransformationABC(utils.SaveLoad): 127 | """ 128 | Interface for transformations. 
A 'transformation' is any object which accepts 129 | a sparse document via the dictionary notation `[]` and returns another sparse 130 | document in its stead:: 131 | 132 | >>> transformed_doc = transformation[doc] 133 | 134 | or also:: 135 | 136 | >>> transformed_corpus = transformation[corpus] 137 | 138 | See the :mod:`gensim.models.tfidfmodel` module for an example of a transformation. 139 | 140 | """ 141 | 142 | def __getitem__(self, vec): 143 | """ 144 | Transform vector from one vector space into another 145 | 146 | **or** 147 | 148 | Transform a whole corpus into another. 149 | """ 150 | raise NotImplementedError('cannot instantiate abstract base class') 151 | 152 | 153 | def _apply(self, corpus, chunksize=None): 154 | """ 155 | Apply the transformation to a whole corpus (as opposed to a single document) 156 | and return the result as another corpus. 157 | """ 158 | return TransformedCorpus(self, corpus, chunksize) 159 | #endclass TransformationABC 160 | 161 | 162 | class SimilarityABC(utils.SaveLoad): 163 | """ 164 | Abstract interface for similarity searches over a corpus. 165 | 166 | In all instances, there is a corpus against which we want to perform the 167 | similarity search. 168 | 169 | For each similarity search, the input is a document and the output are its 170 | similarities to individual corpus documents. 171 | 172 | Similarity queries are realized by calling ``self[query_document]``. 173 | 174 | There is also a convenience wrapper, where iterating over `self` yields 175 | similarities of each document in the corpus against the whole corpus (ie., 176 | the query is each corpus document in turn). 177 | """ 178 | def __init__(self, corpus): 179 | raise NotImplementedError("cannot instantiate Abstract Base Class") 180 | 181 | 182 | def get_similarities(self, doc): 183 | # (Sparse)MatrixSimilarity override this method so that they both use the 184 | # same __getitem__ method, defined below 185 | raise NotImplementedError("cannot instantiate Abstract Base Class") 186 | 187 | 188 | def __getitem__(self, query): 189 | """Get similarities of document `query` to all documents in the corpus. 190 | 191 | **or** 192 | 193 | If `query` is a corpus (iterable of documents), return a matrix of similarities 194 | of all query documents vs. all corpus document. Using this type of batch 195 | query is more efficient than computing the similarities one document after 196 | another. 197 | """ 198 | is_corpus, query = utils.is_corpus(query) 199 | if self.normalize: 200 | # self.normalize only works if the input is a plain gensim vector/corpus (as 201 | # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix 202 | # as well, but in that case assume tricks are happening and don't normalize 203 | # anything (self.normalize has no effect). 
204 | if matutils.ismatrix(query): 205 | import warnings 206 | # warnings.warn("non-gensim input must already come normalized") 207 | else: 208 | if is_corpus: 209 | query = [matutils.unitvec(v) for v in query] 210 | else: 211 | query = matutils.unitvec(query) 212 | result = self.get_similarities(query) 213 | 214 | if self.num_best is None: 215 | return result 216 | 217 | # if the input query was a corpus (=more documents), compute the top-n 218 | # most similar for each document in turn 219 | if matutils.ismatrix(result): 220 | return [matutils.full2sparse_clipped(v, self.num_best) for v in result] 221 | else: 222 | # otherwise, return top-n of the single input document 223 | return matutils.full2sparse_clipped(result, self.num_best) 224 | 225 | 226 | def __iter__(self): 227 | """ 228 | For each index document, compute cosine similarity against all other 229 | documents in the index and yield the result. 230 | """ 231 | # turn off query normalization (vectors in the index are assumed to be already normalized) 232 | norm = self.normalize 233 | self.normalize = False 234 | 235 | # Try to compute similarities in bigger chunks of documents (not 236 | # one query = a single document after another). The point is, a 237 | # bigger query of N documents is faster than N small queries of one 238 | # document. 239 | # 240 | # After computing similarities of the bigger query in `self[chunk]`, 241 | # yield the resulting similarities one after another, so that it looks 242 | # exactly the same as if they had been computed with many small queries. 243 | try: 244 | chunking = self.chunksize > 1 245 | except AttributeError: 246 | # chunking not supported; fall back to the (slower) mode of 1 query=1 document 247 | chunking = False 248 | if chunking: 249 | # assumes `self.corpus` holds the index as a 2-d numpy array. 250 | # this is true for MatrixSimilarity and SparseMatrixSimilarity, but 251 | # may not be true for other (future) classes..? 252 | for chunk_start in xrange(0, self.index.shape[0], self.chunksize): 253 | # scipy.sparse doesn't allow slicing beyond real size of the matrix 254 | # (unlike numpy). so, clip the end of the chunk explicitly to make 255 | # scipy.sparse happy 256 | chunk_end = min(self.index.shape[0], chunk_start + self.chunksize) 257 | chunk = self.index[chunk_start : chunk_end] 258 | if chunk.shape[0] > 1: 259 | for sim in self[chunk]: 260 | yield sim 261 | else: 262 | yield self[chunk] 263 | else: 264 | for doc in self.index: 265 | yield self[doc] 266 | 267 | # restore old normalization value 268 | self.normalize = norm 269 | #endclass SimilarityABC 270 | -------------------------------------------------------------------------------- /gensim/interfaces.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/interfaces.pyc -------------------------------------------------------------------------------- /gensim/matutils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/matutils.pyc -------------------------------------------------------------------------------- /gensim/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains algorithms for extracting document representations from their raw 3 | bag-of-word counts. 
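For example (a sketch; `corpus` stands for any gensim bag-of-words corpus):

>>> tfidf = TfidfModel(corpus)            # fit a transformation on the corpus
>>> vec = tfidf[[(0, 1), (3, 2)]]         # transform a single (toy) document
>>> transformed_corpus = tfidf[corpus]    # or wrap the whole corpus, lazily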
4 | """ 5 | 6 | # bring model classes directly into package namespace, to save some typing 7 | from .hdpmodel import HdpModel 8 | from .ldamodel import LdaModel 9 | from .ldamallet import LdaMallet 10 | from .lsimodel import LsiModel 11 | from .tfidfmodel import TfidfModel 12 | from .rpmodel import RpModel 13 | from .logentropy_model import LogEntropyModel 14 | from .word2vec import Word2Vec 15 | 16 | from gensim import interfaces, utils 17 | 18 | 19 | class VocabTransform(interfaces.TransformationABC): 20 | """ 21 | Remap feature ids to new values. 22 | 23 | Given a mapping between old ids and new ids (some old ids may be missing = these 24 | features are to be discarded), this will wrap a corpus so that iterating over 25 | `VocabTransform[corpus]` returns the same vectors but with the new ids. 26 | 27 | Old features that have no counterpart in the new ids are discarded. This 28 | can be used to filter vocabulary of a corpus "online":: 29 | 30 | >>> old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep)) 31 | >>> vt = VocabTransform(old2new) 32 | >>> for vec_with_new_ids in vt[corpus_with_old_ids]: 33 | >>> ... 34 | 35 | """ 36 | def __init__(self, old2new, id2token=None): 37 | # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems()) 38 | self.old2new = old2new 39 | self.id2token = id2token 40 | 41 | 42 | def __getitem__(self, bow): 43 | """ 44 | Return representation with the ids transformed. 45 | """ 46 | # if the input vector is in fact a corpus, return a transformed corpus as a result 47 | is_corpus, bow = utils.is_corpus(bow) 48 | if is_corpus: 49 | return self._apply(bow) 50 | 51 | return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new) 52 | #endclass VocabTransform 53 | -------------------------------------------------------------------------------- /gensim/models/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/__init__.pyc -------------------------------------------------------------------------------- /gensim/models/hdpmodel.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/hdpmodel.pyc -------------------------------------------------------------------------------- /gensim/models/lda_dispatcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s SIZE_OF_JOBS_QUEUE 9 | 10 | Dispatcher process which orchestrates distributed LDA computations. Run this \ 11 | script only once, on any node in your cluster. 12 | 13 | Example: python -m gensim.models.lda_dispatcher 14 | """ 15 | 16 | 17 | from __future__ import with_statement 18 | import os, sys, logging, threading, time 19 | from Queue import Queue 20 | 21 | from gensim import utils 22 | 23 | 24 | logger = logging.getLogger("gensim.models.lda_dispatcher") 25 | 26 | 27 | # How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue? 
28 | # A small number is usually enough, unless iteration over the corpus is very very 29 | # slow (slower than the actual computation of LDA), in which case you can override 30 | # this value from command line. ie. run "python ./lda_dispatcher.py 100" 31 | MAX_JOBS_QUEUE = 10 32 | 33 | # timeout for the Queue object put/get blocking methods. 34 | # it should theoretically be infinity, but then keyboard interrupts don't work. 35 | # so this is really just a hack, see http://bugs.python.org/issue1360 36 | HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year 37 | 38 | 39 | 40 | class Dispatcher(object): 41 | """ 42 | Dispatcher object that communicates and coordinates individual workers. 43 | 44 | There should never be more than one dispatcher running at any one time. 45 | """ 46 | 47 | def __init__(self, maxsize=MAX_JOBS_QUEUE): 48 | """ 49 | Note that the constructor does not fully initialize the dispatcher; 50 | use the `initialize()` function to populate it with workers etc. 51 | """ 52 | self.maxsize = maxsize 53 | self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) 54 | 55 | 56 | def initialize(self, **model_params): 57 | """ 58 | `model_params` are parameters used to initialize individual workers (gets 59 | handed all the way down to `worker.initialize()`). 60 | """ 61 | self.jobs = Queue(maxsize=self.maxsize) 62 | self.lock_update = threading.Lock() 63 | self._jobsdone = 0 64 | self._jobsreceived = 0 65 | 66 | # locate all available workers and store their proxies, for subsequent RMI calls 67 | self.workers = {} 68 | import Pyro4 69 | with utils.getNS() as ns: 70 | self.callback = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher') # = self 71 | self.callback._pyroOneway.add("jobdone") # make sure workers transfer control back to dispatcher asynchronously 72 | for name, uri in ns.list(prefix='gensim.lda_worker').iteritems(): 73 | try: 74 | worker = Pyro4.Proxy(uri) 75 | workerid = len(self.workers) 76 | # make time consuming methods work asynchronously 77 | worker._pyroOneway.add("requestjob") 78 | worker._pyroOneway.add("exit") 79 | logger.info("registering worker #%i at %s" % (workerid, uri)) 80 | worker.initialize(workerid, dispatcher=self.callback, **model_params) 81 | self.workers[workerid] = worker 82 | except Pyro4.errors.PyroError: 83 | logger.warning("unresponsive worker at %s, deleting it from the name server" % uri) 84 | ns.remove(name) 85 | 86 | if not self.workers: 87 | raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!') 88 | 89 | 90 | def getworkers(self): 91 | """ 92 | Return pyro URIs of all registered workers. 93 | """ 94 | return [worker._pyroUri for worker in self.workers.itervalues()] 95 | 96 | 97 | def getjob(self, worker_id): 98 | logger.info("worker #%i requesting a new job" % worker_id) 99 | job = self.jobs.get(block=True, timeout=1) 100 | logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize())) 101 | return job 102 | 103 | 104 | def putjob(self, job): 105 | self._jobsreceived += 1 106 | self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) 107 | logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize()) 108 | 109 | 110 | def getstate(self): 111 | """ 112 | Merge states from across all workers and return the result. 
113 | """ 114 | logger.info("end of input, assigning all remaining jobs") 115 | logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) 116 | while self._jobsdone < self._jobsreceived: 117 | time.sleep(0.5) # check every half a second 118 | 119 | logger.info("merging states from %i workers" % len(self.workers)) 120 | workers = self.workers.values() 121 | result = workers[0].getstate() 122 | for worker in workers[1:]: 123 | result.merge(worker.getstate()) 124 | 125 | logger.info("sending out merged state") 126 | return result 127 | 128 | 129 | def reset(self, state): 130 | """ 131 | Initialize all workers for a new EM iterations. 132 | """ 133 | for workerid, worker in self.workers.iteritems(): 134 | logger.info("resetting worker %s" % workerid) 135 | worker.reset(state) 136 | worker.requestjob() 137 | self._jobsdone = 0 138 | self._jobsreceived = 0 139 | 140 | 141 | @utils.synchronous('lock_update') 142 | def jobdone(self, workerid): 143 | """ 144 | A worker has finished its job. Log this event and then asynchronously 145 | transfer control back to the worker. 146 | 147 | In this way, control flow basically oscillates between `dispatcher.jobdone()` 148 | and `worker.requestjob()`. 149 | """ 150 | self._jobsdone += 1 151 | logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) 152 | self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way) 153 | 154 | 155 | def jobsdone(self): 156 | """Wrap self._jobsdone, needed for remote access through Pyro proxies""" 157 | return self._jobsdone 158 | 159 | 160 | def exit(self): 161 | """ 162 | Terminate all registered workers and then the dispatcher. 163 | """ 164 | for workerid, worker in self.workers.iteritems(): 165 | logger.info("terminating worker %s" % workerid) 166 | worker.exit() 167 | logger.info("terminating dispatcher") 168 | os._exit(0) # exit the whole process (not just this thread ala sys.exit()) 169 | #endclass Dispatcher 170 | 171 | 172 | 173 | def main(): 174 | logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 175 | logger.info("running %s" % " ".join(sys.argv)) 176 | 177 | program = os.path.basename(sys.argv[0]) 178 | # make sure we have enough cmd line parameters 179 | if len(sys.argv) < 1: 180 | print(globals()["__doc__"] % locals()) 181 | sys.exit(1) 182 | 183 | if len(sys.argv) < 2: 184 | maxsize = MAX_JOBS_QUEUE 185 | else: 186 | maxsize = int(sys.argv[1]) 187 | utils.pyro_daemon('gensim.lda_dispatcher', Dispatcher(maxsize=maxsize)) 188 | 189 | logger.info("finished running %s" % program) 190 | 191 | 192 | 193 | if __name__ == '__main__': 194 | main() 195 | -------------------------------------------------------------------------------- /gensim/models/lda_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2011 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s 9 | 10 | Worker ("slave") process used in computing distributed LDA. Run this script \ 11 | on every node in your cluster. If you wish, you may even run it multiple times \ 12 | on a single machine, to make better use of multiple cores (just beware that \ 13 | memory footprint increases accordingly). 
14 | 15 | Example: python -m gensim.models.lda_worker 16 | """ 17 | 18 | 19 | from __future__ import with_statement 20 | import os, sys, logging 21 | import threading 22 | import tempfile 23 | import Queue 24 | 25 | from gensim.models import ldamodel 26 | from gensim import utils 27 | 28 | logger = logging.getLogger('gensim.models.lda_worker') 29 | 30 | 31 | # periodically save intermediate models after every SAVE_DEBUG updates (0 for never) 32 | SAVE_DEBUG = 0 33 | 34 | 35 | 36 | class Worker(object): 37 | def __init__(self): 38 | self.model = None 39 | 40 | 41 | def initialize(self, myid, dispatcher, **model_params): 42 | self.lock_update = threading.Lock() 43 | self.jobsdone = 0 # how many jobs has this worker completed? 44 | self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 45 | self.dispatcher = dispatcher 46 | self.finished = False 47 | logger.info("initializing worker #%s" % myid) 48 | self.model = ldamodel.LdaModel(**model_params) 49 | 50 | 51 | def requestjob(self): 52 | """ 53 | Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. 54 | """ 55 | if self.model is None: 56 | raise RuntimeError("worker must be initialized before receiving jobs") 57 | 58 | job = None 59 | while job is None and not self.finished: 60 | try: 61 | job = self.dispatcher.getjob(self.myid) 62 | except Queue.Empty: 63 | # no new job: try again, unless we're finished with all work 64 | continue 65 | if job is not None: 66 | logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) 67 | self.processjob(job) 68 | self.dispatcher.jobdone(self.myid) 69 | else: 70 | logger.info("worker #%i stopping asking for jobs" % self.myid) 71 | 72 | 73 | @utils.synchronous('lock_update') 74 | def processjob(self, job): 75 | logger.debug("starting to process job #%i" % self.jobsdone) 76 | self.model.do_estep(job) 77 | self.jobsdone += 1 78 | if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0: 79 | fname = os.path.join(tempfile.gettempdir(), 'lda_worker.pkl') 80 | self.model.save(fname) 81 | logger.info("finished processing job #%i" % (self.jobsdone - 1)) 82 | 83 | 84 | @utils.synchronous('lock_update') 85 | def getstate(self): 86 | logger.info("worker #%i returning its state after %s jobs" % 87 | (self.myid, self.jobsdone)) 88 | result = self.model.state 89 | assert isinstance(result, ldamodel.LdaState) 90 | self.model.clear() # free up mem in-between two EM cycles 91 | self.finished = True 92 | return result 93 | 94 | 95 | @utils.synchronous('lock_update') 96 | def reset(self, state): 97 | assert state is not None 98 | logger.info("resetting worker #%i" % self.myid) 99 | self.model.state = state 100 | self.model.sync_state() 101 | self.model.state.reset() 102 | self.finished = False 103 | 104 | 105 | def exit(self): 106 | logger.info("terminating worker #%i" % self.myid) 107 | os._exit(0) 108 | #endclass Worker 109 | 110 | 111 | 112 | def main(): 113 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 114 | logger.info("running %s" % " ".join(sys.argv)) 115 | 116 | program = os.path.basename(sys.argv[0]) 117 | # make sure we have enough cmd line parameters 118 | if len(sys.argv) < 1: 119 | print(globals()["__doc__"] % locals()) 120 | sys.exit(1) 121 | 122 | utils.pyro_daemon('gensim.lda_worker', Worker(), random_suffix=True) 123 | 124 | logger.info("finished running %s" % program) 125 | 126 | 127 | 128 | if __name__ == '__main__': 129 | main() 130 | 
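# How the dispatcher/worker pair is typically used (a sketch; `my_corpus` and
# `my_dictionary` are placeholders, and a Pyro4 name server must already be running,
# e.g. one started with `python -m Pyro4.naming`):
#
#   1. on every node in the cluster:   python -m gensim.models.lda_worker &
#   2. on any single node:             python -m gensim.models.lda_dispatcher &
#   3. then, in your own script:
#
#        from gensim.models import LdaModel
#        lda = LdaModel(corpus=my_corpus, id2word=my_dictionary,
#                       num_topics=100, distributed=True)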
-------------------------------------------------------------------------------- /gensim/models/ldamallet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2014 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Python wrapper for Latent Dirichlet Allocation (LDA) from MALLET, the Java topic modelling 10 | toolkit [1]_. 11 | 12 | This module allows both LDA model estimation from a training corpus and inference of topic 13 | distribution on new, unseen documents, using an (optimized version of) collapsed 14 | gibbs sampling from MALLET. 15 | 16 | MALLET's LDA training requires O(#corpus_words) of memory, keeping the entire corpus in RAM. 17 | If you find yourself running out of memory, either decrease the `workers` constructor 18 | parameter, or use `LdaModel` which needs only O(1) memory. 19 | 20 | The wrapped model can NOT be updated with new documents for online training -- use gensim's `LdaModel` for that. 21 | 22 | Example: 23 | 24 | >>> model = gensim.models.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', corpus=my_corpus, num_topics=20, id2word=dictionary) 25 | >>> print model[my_vector] # print LDA topics of a document 26 | 27 | .. [1] http://mallet.cs.umass.edu/ 28 | 29 | """ 30 | 31 | 32 | import logging 33 | import random 34 | import tempfile 35 | import os 36 | from subprocess import call 37 | 38 | import numpy 39 | 40 | from gensim import utils 41 | 42 | logger = logging.getLogger('gensim.models.ldamallet') 43 | 44 | 45 | def read_doctopics(fname, eps=1e-6): 46 | """ 47 | Yield document topic vectors from MALLET's "doc-topics" format, as sparse gensim vectors. 48 | 49 | """ 50 | with utils.smart_open(fname) as fin: 51 | next(fin) # skip the header line 52 | for lineno, line in enumerate(fin): 53 | parts = line.split()[2:] # skip "doc" and "source" columns 54 | if len(parts) % 2 != 0: 55 | raise RuntimeError("invalid doc topics format at line %i in %s" % (lineno + 1, fname)) 56 | doc = [(int(id), float(weight)) for id, weight in zip(parts[::2], parts[1::2]) if abs(float(weight)) > eps] 57 | # explicitly normalize probs to sum up to 1.0, just to be sure... 58 | weights = float(sum([weight for _, weight in doc])) 59 | yield [] if weights == 0 else sorted((id, 1.0 * weight / weights) for id, weight in doc) 60 | 61 | 62 | 63 | class LdaMallet(utils.SaveLoad): 64 | """ 65 | Class for LDA training using MALLET. Communication between MALLET and Python 66 | takes place by passing around data files on disk and calling Java with subprocess.call(). 67 | 68 | """ 69 | def __init__(self, mallet_path, corpus=None, num_topics=100, id2word=None, workers=4, prefix=None, 70 | optimize_interval=0, iterations=1000): 71 | """ 72 | `mallet_path` is path to the mallet executable, e.g. `/home/kofola/mallet-2.0.7/bin/mallet`. 73 | `corpus` is a gensim corpus, aka a stream of sparse document vectors. 74 | `id2word` is a mapping between tokens ids and token. 75 | `workers` is the number of threads, for parallel training. 76 | `prefix` is the string prefix under which all data files will be stored; default: system temp + random filename prefix. 77 | `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception; 0 to switch off hyperparameter optimization). 78 | `iterations` is the number of sampling iterations. 
79 | 80 | """ 81 | self.mallet_path = mallet_path 82 | self.id2word = id2word 83 | if self.id2word is None: 84 | logger.warning("no word id mapping provided; initializing from corpus, assuming identity") 85 | self.id2word = utils.dict_from_corpus(corpus) 86 | self.num_terms = len(self.id2word) 87 | else: 88 | self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys()) 89 | if self.num_terms == 0: 90 | raise ValueError("cannot compute LDA over an empty collection (no terms)") 91 | self.num_topics = num_topics 92 | if prefix is None: 93 | rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_' 94 | prefix = os.path.join(tempfile.gettempdir(), rand_prefix) 95 | self.prefix = prefix 96 | self.workers = workers 97 | self.optimize_interval = optimize_interval 98 | self.iterations = iterations 99 | 100 | if corpus is not None: 101 | self.train(corpus) 102 | 103 | def finferencer(self): 104 | return self.prefix + 'inferencer.mallet' 105 | 106 | def ftopickeys(self): 107 | return self.prefix + 'topickeys.txt' 108 | 109 | def fstate(self): 110 | return self.prefix + 'state.mallet.gz' 111 | 112 | def fdoctopics(self): 113 | return self.prefix + 'doctopics.txt' 114 | 115 | def fcorpustxt(self): 116 | return self.prefix + 'corpus.txt' 117 | 118 | def fcorpusmallet(self): 119 | return self.prefix + 'corpus.mallet' 120 | 121 | def fwordweights(self): 122 | return self.prefix + 'wordweights.txt' 123 | 124 | def convert_input(self, corpus, infer=False): 125 | """ 126 | Serialize documents (lists of unicode tokens) to a temporary text file, 127 | then convert that text file to MALLET format `outfile`. 128 | 129 | """ 130 | logger.info("serializing temporary corpus to %s" % self.fcorpustxt()) 131 | # write out the corpus in a file format that MALLET understands: one document per line: 132 | # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens 133 | with utils.smart_open(self.fcorpustxt(), 'wb') as fout: 134 | for docno, doc in enumerate(corpus): 135 | if self.id2word: 136 | tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), []) 137 | else: 138 | tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), []) 139 | fout.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens)))) 140 | 141 | # convert the text file above into MALLET's internal format 142 | cmd = self.mallet_path + " import-file --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s" 143 | if infer: 144 | cmd += ' --use-pipe-from ' + self.fcorpusmallet() 145 | cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer') 146 | else: 147 | cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet()) 148 | logger.info("converting temporary corpus to MALLET format with %s" % cmd) 149 | call(cmd, shell=True) 150 | 151 | 152 | def train(self, corpus): 153 | self.convert_input(corpus, infer=False) 154 | cmd = self.mallet_path + " train-topics --input %s --num-topics %s --optimize-interval %s "\ 155 | "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s "\ 156 | "--num-iterations %s --inferencer-filename %s" 157 | cmd = cmd % (self.fcorpusmallet(), self.num_topics, self.optimize_interval, self.workers, 158 | self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer()) 159 | # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory 160 | logger.info("training MALLET LDA with %s" % cmd) 161 | call(cmd, shell=True) 162 | self.word_topics = self.load_word_topics() 163 | 
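    # The pipeline in train() is: convert_input() serializes the corpus to
    # fcorpustxt() and runs `mallet import-file` to produce fcorpusmallet();
    # `mallet train-topics` then writes the state, doc-topics, topic-keys and
    # inferencer files under self.prefix, and load_word_topics() reads the
    # topic-word counts back from the (gzipped) state file.
    # A usage sketch -- the MALLET path and parameter values are illustrative:
    #
    #   >>> mallet = LdaMallet('/opt/mallet-2.0.7/bin/mallet', corpus=corpus,
    #   ...                    num_topics=50, id2word=dictionary, iterations=500)
    #   >>> print(mallet.show_topics(topics=5, topn=10))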
164 | 165 | def __getitem__(self, bow, iterations=100): 166 | is_corpus, corpus = utils.is_corpus(bow) 167 | if not is_corpus: 168 | # query is a single document => make a corpus out of it 169 | bow = [bow] 170 | 171 | self.convert_input(bow, infer=True) 172 | cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s" 173 | cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations) 174 | logger.info("inferring topics with MALLET LDA '%s'" % cmd) 175 | retval = call(cmd, shell=True) 176 | if retval != 0: 177 | raise RuntimeError("MALLET failed with error %s on return" % retval) 178 | result = list(read_doctopics(self.fdoctopics() + '.infer')) 179 | return result if is_corpus else result[0] 180 | 181 | 182 | def load_word_topics(self): 183 | logger.info("loading assigned topics from %s" % self.fstate()) 184 | wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32) 185 | with utils.smart_open(self.fstate()) as fin: 186 | _ = next(fin) # header 187 | self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]]) 188 | assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics" 189 | _ = next(fin) # beta 190 | for lineno, line in enumerate(fin): 191 | line = utils.to_unicode(line) 192 | doc, source, pos, typeindex, token, topic = line.split() 193 | tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token) 194 | wordtopics[int(topic), tokenid] += 1 195 | logger.info("loaded assigned topics for %i tokens" % wordtopics.sum()) 196 | self.wordtopics = wordtopics 197 | self.print_topics(15) 198 | 199 | 200 | def print_topics(self, topics=10, topn=10): 201 | return self.show_topics(topics, topn, log=True) 202 | 203 | 204 | def show_topics(self, topics=10, topn=10, log=False, formatted=True): 205 | """ 206 | Print the `topN` most probable words for `topics` number of topics. 207 | Set `topics=-1` to print all topics. 208 | 209 | Set `formatted=True` to return the topics as a list of strings, or `False` as lists of (weight, word) pairs. 
210 | 211 | """ 212 | if topics < 0 or topics >= self.num_topics: 213 | topics = self.num_topics 214 | chosen_topics = range(topics) 215 | else: 216 | topics = min(topics, self.num_topics) 217 | sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) # add a little random jitter, to randomize results around the same alpha 218 | sorted_topics = list(numpy.argsort(sort_alpha)) 219 | chosen_topics = sorted_topics[ : topics/2] + sorted_topics[-topics/2 : ] 220 | shown = [] 221 | for i in chosen_topics: 222 | if formatted: 223 | topic = self.print_topic(i, topn=topn) 224 | else: 225 | topic = self.show_topic(i, topn=topn) 226 | shown.append(topic) 227 | if log: 228 | logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i], topic)) 229 | return shown 230 | 231 | 232 | def show_topic(self, topicid, topn=10): 233 | topic = self.wordtopics[topicid] 234 | topic = topic / topic.sum() # normalize to probability dist 235 | bestn = numpy.argsort(topic)[::-1][:topn] 236 | beststr = [(topic[id], self.id2word[id]) for id in bestn] 237 | return beststr 238 | 239 | 240 | def print_topic(self, topicid, topn=10): 241 | return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, topn)]) 242 | -------------------------------------------------------------------------------- /gensim/models/ldamallet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/ldamallet.pyc -------------------------------------------------------------------------------- /gensim/models/ldamodel.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/ldamodel.pyc -------------------------------------------------------------------------------- /gensim/models/logentropy_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | import logging 7 | import math 8 | from gensim import interfaces, matutils, utils 9 | 10 | 11 | logger = logging.getLogger('gensim.models.logentropy_model') 12 | 13 | 14 | class LogEntropyModel(interfaces.TransformationABC): 15 | """ 16 | Objects of this class realize the transformation between word-document 17 | co-occurence matrix (integers) into a locally/globally weighted matrix 18 | (positive floats). 19 | 20 | This is done by a log entropy normalization, optionally normalizing the 21 | resulting documents to unit length. The following formulas explain how 22 | to compute the log entropy weight for term `i` in document `j`:: 23 | 24 | local_weight_{i,j} = log(frequency_{i,j} + 1) 25 | 26 | P_{i,j} = frequency_{i,j} / sum_j frequency_{i,j} 27 | 28 | sum_j P_{i,j} * log(P_{i,j}) 29 | global_weight_i = 1 + ---------------------------- 30 | log(number_of_documents + 1) 31 | 32 | final_weight_{i,j} = local_weight_{i,j} * global_weight_i 33 | 34 | The main methods are: 35 | 36 | 1. constructor, which calculates the global weighting for all terms in 37 | a corpus. 38 | 2. the [] method, which transforms a simple count representation into the 39 | log entropy normalized space. 
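    As a worked example of the formulas above (illustrative numbers, natural
    logarithms as used by the implementation): a term occurring 3 times in
    document A and once in document B, in a corpus of 3 documents, has
    P_A = 0.75 and P_B = 0.25, so its
    global_weight = 1 + (0.75*log(0.75) + 0.25*log(0.25)) / log(3 + 1) ~= 0.59;
    its local weight in document A is log(3 + 1) ~= 1.39, giving a final weight
    of roughly 0.82 before the optional normalization to unit length.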
40 | 41 | >>> log_ent = LogEntropyModel(corpus) 42 | >>> print(log_ent[some_doc]) 43 | >>> log_ent.save('/tmp/foo.log_ent_model') 44 | 45 | Model persistency is achieved via its load/save methods. 46 | """ 47 | 48 | def __init__(self, corpus, id2word=None, normalize=True): 49 | """ 50 | `normalize` dictates whether the resulting vectors will be 51 | set to unit length. 52 | """ 53 | self.normalize = normalize 54 | self.n_docs = 0 55 | self.n_words = 0 56 | self.entr = {} 57 | if corpus is not None: 58 | self.initialize(corpus) 59 | 60 | def __str__(self): 61 | return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, 62 | self.n_words) 63 | 64 | def initialize(self, corpus): 65 | """ 66 | Initialize internal statistics based on a training corpus. Called 67 | automatically from the constructor. 68 | """ 69 | logger.info("calculating counts") 70 | glob_freq = {} 71 | glob_num_words, doc_no = 0, -1 72 | for doc_no, bow in enumerate(corpus): 73 | if doc_no % 10000 == 0: 74 | logger.info("PROGRESS: processing document #%i" % doc_no) 75 | glob_num_words += len(bow) 76 | for term_id, term_count in bow: 77 | glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count 78 | 79 | # keep some stats about the training corpus 80 | self.n_docs = doc_no + 1 81 | self.n_words = glob_num_words 82 | 83 | # and finally compute the global weights 84 | logger.info("calculating global log entropy weights for %i " 85 | "documents and %i features (%i matrix non-zeros)" 86 | % (self.n_docs, len(glob_freq), self.n_words)) 87 | logger.debug('iterating over corpus') 88 | for doc_no2, bow in enumerate(corpus): 89 | for key, freq in bow: 90 | p = (float(freq) / glob_freq[key]) * math.log(float(freq) / 91 | glob_freq[key]) 92 | self.entr[key] = self.entr.get(key, 0.0) + p 93 | if doc_no2 != doc_no: 94 | raise ValueError("LogEntropyModel doesn't support generators as training data") 95 | 96 | logger.debug('iterating over keys') 97 | for key in self.entr: 98 | self.entr[key] = 1 + self.entr[key] / math.log(self.n_docs + 1) 99 | 100 | def __getitem__(self, bow): 101 | """ 102 | Return log entropy representation of the input vector and/or corpus. 103 | """ 104 | # if the input vector is in fact a corpus, return a transformed corpus 105 | is_corpus, bow = utils.is_corpus(bow) 106 | if is_corpus: 107 | return self._apply(bow) 108 | 109 | # unknown (new) terms will be given zero weight (NOT infinity/huge) 110 | vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id)) 111 | for term_id, tf in bow if term_id in self.entr] 112 | if self.normalize: 113 | vector = matutils.unitvec(vector) 114 | return vector 115 | -------------------------------------------------------------------------------- /gensim/models/logentropy_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/logentropy_model.pyc -------------------------------------------------------------------------------- /gensim/models/lsi_dispatcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s SIZE_OF_JOBS_QUEUE 9 | 10 | Dispatcher process which orchestrates distributed LSI computations. Run this \ 11 | script only once, on any node in your cluster. 
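The dispatcher discovers workers through a Pyro4 name server: every process registered under the `gensim.lsi_worker` prefix gets contacted and initialized, so start the name server and the lsi_worker processes before kicking off a distributed run. From client code the distributed backend is then selected with something like (a sketch; parameter values are illustrative):

>>> lsi = lsimodel.LsiModel(corpus, id2word=id2word, num_topics=200, distributed=True)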
12 | 13 | Example: python -m gensim.models.lsi_dispatcher 14 | """ 15 | 16 | 17 | from __future__ import with_statement 18 | import os, sys, logging, threading, time 19 | from Queue import Queue 20 | 21 | from gensim import utils 22 | 23 | 24 | logger = logging.getLogger("gensim.models.lsi_dispatcher") 25 | 26 | 27 | # How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue? 28 | # A small number is usually enough, unless iteration over the corpus is very very 29 | # slow (slower than the actual computation of LSI), in which case you can override 30 | # this value from command line. ie. run "python ./lsi_dispatcher.py 100" 31 | MAX_JOBS_QUEUE = 10 32 | 33 | # timeout for the Queue object put/get blocking methods. 34 | # it should really be infinity, but then keyboard interrupts don't work. 35 | # so this is really just a hack, see http://bugs.python.org/issue1360 36 | HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year 37 | 38 | 39 | 40 | class Dispatcher(object): 41 | """ 42 | Dispatcher object that communicates and coordinates individual workers. 43 | 44 | There should never be more than one dispatcher running at any one time. 45 | """ 46 | 47 | def __init__(self, maxsize=0): 48 | """ 49 | Note that the constructor does not fully initialize the dispatcher; 50 | use the `initialize()` function to populate it with workers etc. 51 | """ 52 | self.maxsize = maxsize 53 | self.workers = {} 54 | self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) 55 | 56 | 57 | def initialize(self, **model_params): 58 | """ 59 | `model_params` are parameters used to initialize individual workers (gets 60 | handed all the way down to worker.initialize()). 61 | """ 62 | self.jobs = Queue(maxsize=self.maxsize) 63 | self.lock_update = threading.Lock() 64 | self._jobsdone = 0 65 | self._jobsreceived = 0 66 | 67 | # locate all available workers and store their proxies, for subsequent RMI calls 68 | self.workers = {} 69 | with utils.getNS() as ns: 70 | import Pyro4 71 | self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self 72 | self.callback._pyroOneway.add("jobdone") # make sure workers transfer control back to dispatcher asynchronously 73 | for name, uri in ns.list(prefix='gensim.lsi_worker').iteritems(): 74 | try: 75 | worker = Pyro4.Proxy(uri) 76 | workerid = len(self.workers) 77 | # make time consuming methods work asynchronously 78 | worker._pyroOneway.add("requestjob") 79 | worker._pyroOneway.add("exit") 80 | logger.info("registering worker #%i from %s" % (workerid, uri)) 81 | worker.initialize(workerid, dispatcher=self.callback, **model_params) 82 | self.workers[workerid] = worker 83 | except Pyro4.errors.PyroError: 84 | logger.exception("unresponsive worker at %s, deleting it from the name server" % uri) 85 | ns.remove(name) 86 | 87 | if not self.workers: 88 | raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!') 89 | 90 | 91 | def getworkers(self): 92 | """ 93 | Return pyro URIs of all registered workers. 
94 | """ 95 | return [worker._pyroUri for worker in self.workers.itervalues()] 96 | 97 | 98 | def getjob(self, worker_id): 99 | logger.info("worker #%i requesting a new job" % worker_id) 100 | job = self.jobs.get(block=True, timeout=1) 101 | logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize())) 102 | return job 103 | 104 | 105 | def putjob(self, job): 106 | self._jobsreceived += 1 107 | self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) 108 | logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize()) 109 | 110 | 111 | def getstate(self): 112 | """ 113 | Merge projections from across all workers and return the final projection. 114 | """ 115 | logger.info("end of input, assigning all remaining jobs") 116 | logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived)) 117 | while self._jobsdone < self._jobsreceived: 118 | time.sleep(0.5) # check every half a second 119 | 120 | # TODO: merge in parallel, so that we're done in `log_2(workers)` merges, 121 | # and not `workers - 1` merges! 122 | # but merging only takes place once, after all input data has been processed, 123 | # so the overall effect would be small... compared to the amount of coding :-) 124 | logger.info("merging states from %i workers" % len(self.workers)) 125 | workers = self.workers.items() 126 | result = workers[0][1].getstate() 127 | for workerid, worker in workers[1:]: 128 | logger.info("pulling state from worker %s" % workerid) 129 | result.merge(worker.getstate()) 130 | logger.info("sending out merged projection") 131 | return result 132 | 133 | 134 | def reset(self): 135 | """ 136 | Initialize all workers for a new decomposition. 137 | """ 138 | for workerid, worker in self.workers.iteritems(): 139 | logger.info("resetting worker %s" % workerid) 140 | worker.reset() 141 | worker.requestjob() 142 | self._jobsdone = 0 143 | self._jobsreceived = 0 144 | 145 | 146 | @utils.synchronous('lock_update') 147 | def jobdone(self, workerid): 148 | """ 149 | A worker has finished its job. Log this event and then asynchronously 150 | transfer control back to the worker. 151 | 152 | In this way, control flow basically oscillates between dispatcher.jobdone() 153 | worker.requestjob(). 154 | """ 155 | self._jobsdone += 1 156 | logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone)) 157 | worker = self.workers[workerid] 158 | worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) 159 | 160 | 161 | def jobsdone(self): 162 | """Wrap self._jobsdone, needed for remote access through proxies""" 163 | return self._jobsdone 164 | 165 | 166 | def exit(self): 167 | """ 168 | Terminate all registered workers and then the dispatcher. 
169 | """ 170 | for workerid, worker in self.workers.iteritems(): 171 | logger.info("terminating worker %s" % workerid) 172 | worker.exit() 173 | logger.info("terminating dispatcher") 174 | os._exit(0) # exit the whole process (not just this thread ala sys.exit()) 175 | #endclass Dispatcher 176 | 177 | 178 | 179 | def main(): 180 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 181 | logger.info("running %s" % " ".join(sys.argv)) 182 | 183 | program = os.path.basename(sys.argv[0]) 184 | # make sure we have enough cmd line parameters 185 | if len(sys.argv) < 1: 186 | print(globals()["__doc__"] % locals()) 187 | sys.exit(1) 188 | 189 | if len(sys.argv) < 2: 190 | maxsize = MAX_JOBS_QUEUE 191 | else: 192 | maxsize = int(sys.argv[1]) 193 | utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=maxsize)) 194 | 195 | logger.info("finished running %s" % program) 196 | 197 | 198 | 199 | if __name__ == '__main__': 200 | main() 201 | -------------------------------------------------------------------------------- /gensim/models/lsi_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s 9 | 10 | Worker ("slave") process used in computing distributed LSI. Run this script \ 11 | on every node in your cluster. If you wish, you may even run it multiple times \ 12 | on a single machine, to make better use of multiple cores (just beware that \ 13 | memory footprint increases accordingly). 14 | 15 | Example: python -m gensim.models.lsi_worker 16 | """ 17 | 18 | 19 | from __future__ import with_statement 20 | import os, sys, logging 21 | import threading 22 | import tempfile 23 | import Queue 24 | 25 | from gensim.models import lsimodel 26 | from gensim import utils 27 | 28 | logger = logging.getLogger('gensim.models.lsi_worker') 29 | 30 | 31 | SAVE_DEBUG = 0 # save intermediate models after every SAVE_DEBUG updates (0 for never) 32 | 33 | 34 | 35 | class Worker(object): 36 | def __init__(self): 37 | self.model = None 38 | 39 | 40 | def initialize(self, myid, dispatcher, **model_params): 41 | self.lock_update = threading.Lock() 42 | self.jobsdone = 0 # how many jobs has this worker completed? 43 | self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 44 | self.dispatcher = dispatcher 45 | self.finished = False 46 | logger.info("initializing worker #%s" % myid) 47 | self.model = lsimodel.LsiModel(**model_params) 48 | 49 | 50 | def requestjob(self): 51 | """ 52 | Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. 
53 | """ 54 | if self.model is None: 55 | raise RuntimeError("worker must be initialized before receiving jobs") 56 | 57 | job = None 58 | while job is None and not self.finished: 59 | try: 60 | job = self.dispatcher.getjob(self.myid) 61 | except Queue.Empty: 62 | # no new job: try again, unless we're finished with all work 63 | continue 64 | if job is not None: 65 | logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) 66 | self.processjob(job) 67 | self.dispatcher.jobdone(self.myid) 68 | else: 69 | logger.info("worker #%i stopping asking for jobs" % self.myid) 70 | 71 | 72 | @utils.synchronous('lock_update') 73 | def processjob(self, job): 74 | self.model.add_documents(job) 75 | self.jobsdone += 1 76 | if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0: 77 | fname = os.path.join(tempfile.gettempdir(), 'lsi_worker.pkl') 78 | self.model.save(fname) 79 | 80 | 81 | @utils.synchronous('lock_update') 82 | def getstate(self): 83 | logger.info("worker #%i returning its state after %s jobs" % 84 | (self.myid, self.jobsdone)) 85 | assert isinstance(self.model.projection, lsimodel.Projection) 86 | self.finished = True 87 | return self.model.projection 88 | 89 | 90 | @utils.synchronous('lock_update') 91 | def reset(self): 92 | logger.info("resetting worker #%i" % self.myid) 93 | self.model.projection = self.model.projection.empty_like() 94 | self.finished = False 95 | 96 | 97 | def exit(self): 98 | logger.info("terminating worker #%i" % self.myid) 99 | os._exit(0) 100 | #endclass Worker 101 | 102 | 103 | 104 | def main(): 105 | logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 106 | logger.info("running %s" % " ".join(sys.argv)) 107 | 108 | program = os.path.basename(sys.argv[0]) 109 | # make sure we have enough cmd line parameters 110 | if len(sys.argv) < 1: 111 | print(globals()["__doc__"] % locals()) 112 | sys.exit(1) 113 | 114 | utils.pyro_daemon('gensim.lsi_worker', Worker(), random_suffix=True) 115 | 116 | logger.info("finished running %s" % program) 117 | 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /gensim/models/lsimodel.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/lsimodel.pyc -------------------------------------------------------------------------------- /gensim/models/rpmodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | import logging 9 | import itertools 10 | 11 | import numpy 12 | import scipy 13 | 14 | from gensim import interfaces, matutils, utils 15 | 16 | 17 | logger = logging.getLogger('gensim.models.rpmodel') 18 | 19 | 20 | class RpModel(interfaces.TransformationABC): 21 | """ 22 | Objects of this class allow building and maintaining a model for Random Projections 23 | (also known as Random Indexing). For theoretical background on RP, see: 24 | 25 | Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis." 26 | 27 | The main methods are: 28 | 29 | 1. constructor, which creates the random projection matrix 30 | 2. the [] method, which transforms a simple count representation into the TfIdf 31 | space. 
32 | 33 | >>> rp = RpModel(corpus) 34 | >>> print(rp[some_doc]) 35 | >>> rp.save('/tmp/foo.rp_model') 36 | 37 | Model persistency is achieved via its load/save methods. 38 | """ 39 | def __init__(self, corpus, id2word=None, num_topics=300): 40 | """ 41 | `id2word` is a mapping from word ids (integers) to words (strings). It is 42 | used to determine the vocabulary size, as well as for debugging and topic 43 | printing. If not set, it will be determined from the corpus. 44 | """ 45 | self.id2word = id2word 46 | self.num_topics = num_topics 47 | if corpus is not None: 48 | self.initialize(corpus) 49 | 50 | 51 | def __str__(self): 52 | return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) 53 | 54 | 55 | def initialize(self, corpus): 56 | """ 57 | Initialize the random projection matrix. 58 | """ 59 | if self.id2word is None: 60 | logger.info("no word id mapping provided; initializing from corpus, assuming identity") 61 | self.id2word = utils.dict_from_corpus(corpus) 62 | self.num_terms = len(self.id2word) 63 | else: 64 | self.num_terms = 1 + max([-1] + self.id2word.keys()) 65 | 66 | shape = self.num_topics, self.num_terms 67 | logger.info("constructing %s random matrix" % str(shape)) 68 | # Now construct the projection matrix itself. 69 | # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection", 70 | # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1). 71 | randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape) # convert from 0/1 to +1/-1 72 | self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32) # convert from int32 to floats, for faster multiplications 73 | 74 | 75 | def __getitem__(self, bow): 76 | """ 77 | Return RP representation of the input vector and/or corpus. 78 | """ 79 | # if the input vector is in fact a corpus, return a transformed corpus as result 80 | is_corpus, bow = utils.is_corpus(bow) 81 | if is_corpus: 82 | return self._apply(bow) 83 | 84 | vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics) 85 | vec = numpy.asfortranarray(vec, dtype=numpy.float32) 86 | topic_dist = numpy.dot(self.projection, vec) # (k, d) * (d, 1) = (k, 1) 87 | return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat) 88 | if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)] 89 | 90 | 91 | def __setstate__(self, state): 92 | """ 93 | This is a hack to work around a bug in numpy, where a FORTRAN-order array 94 | unpickled from disk segfaults on using it. 
95 | """ 96 | self.__dict__ = state 97 | if self.projection is not None: 98 | self.projection = self.projection.copy('F') # simply making a fresh copy fixes the broken array 99 | #endclass RpModel 100 | -------------------------------------------------------------------------------- /gensim/models/rpmodel.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/rpmodel.pyc -------------------------------------------------------------------------------- /gensim/models/tfidfmodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2012 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | import logging 9 | import math 10 | 11 | from gensim import interfaces, matutils, utils 12 | from six import iteritems 13 | 14 | 15 | logger = logging.getLogger('gensim.models.tfidfmodel') 16 | 17 | 18 | def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): 19 | """ 20 | Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: 21 | 22 | idf = add + log(totaldocs / doc_freq) 23 | """ 24 | return add + math.log(1.0 * totaldocs / docfreq, log_base) 25 | 26 | 27 | def precompute_idfs(wglobal, dfs, total_docs): 28 | """Precompute the inverse document frequency mapping for all terms.""" 29 | # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. 30 | # this method is here just to speed things up a little. 31 | return dict((termid, wglobal(df, total_docs)) 32 | for termid, df in iteritems(dfs)) 33 | 34 | 35 | class TfidfModel(interfaces.TransformationABC): 36 | """ 37 | Objects of this class realize the transformation between word-document co-occurrence 38 | matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats). 39 | 40 | The main methods are: 41 | 42 | 1. constructor, which calculates inverse document counts for all terms in the training corpus. 43 | 2. the [] method, which transforms a simple count representation into the TfIdf 44 | space. 45 | 46 | >>> tfidf = TfidfModel(corpus) 47 | >>> print(tfidf[some_doc]) 48 | >>> tfidf.save('/tmp/foo.tfidf_model') 49 | 50 | Model persistency is achieved via its load/save methods. 51 | """ 52 | def __init__(self, corpus=None, id2word=None, dictionary=None, 53 | wlocal=utils.identity, wglobal=df2idf, normalize=True): 54 | """ 55 | Compute tf-idf by multiplying a local component (term frequency) with a 56 | global component (inverse document frequency), and normalizing 57 | the resulting documents to unit length. Formula for unnormalized weight 58 | of term `i` in document `j` in a corpus of D documents:: 59 | 60 | weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i}) 61 | 62 | or, more generally:: 63 | 64 | weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D) 65 | 66 | so you can plug in your own custom `wlocal` and `wglobal` functions. 67 | 68 | Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...) 69 | and default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the 70 | formula above. 71 | 72 | `normalize` dictates how the final transformed vectors will be normalized. 73 | `normalize=True` means set to unit length (default); `False` means don't 74 | normalize. 
You can also set `normalize` to your own function that accepts 75 | and returns a sparse vector. 76 | 77 | If `dictionary` is specified, it must be a `corpora.Dictionary` object 78 | and it will be used to directly construct the inverse document frequency 79 | mapping (then `corpus`, if specified, is ignored). 80 | """ 81 | self.normalize = normalize 82 | self.id2word = id2word 83 | self.wlocal, self.wglobal = wlocal, wglobal 84 | self.num_docs, self.num_nnz, self.idfs = None, None, None 85 | if dictionary is not None: 86 | # user supplied a Dictionary object, which already contains all the 87 | # statistics we need to construct the IDF mapping. we can skip the 88 | # step that goes through the corpus (= an optimization). 89 | if corpus is not None: 90 | logger.warning("constructor received both corpus and explicit " 91 | "inverse document frequencies; ignoring the corpus") 92 | self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz 93 | self.dfs = dictionary.dfs.copy() 94 | self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) 95 | elif corpus is not None: 96 | self.initialize(corpus) 97 | else: 98 | # NOTE: everything is left uninitialized; presumably the model will 99 | # be initialized in some other way 100 | pass 101 | 102 | 103 | def __str__(self): 104 | return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) 105 | 106 | 107 | def initialize(self, corpus): 108 | """ 109 | Compute inverse document weights, which will be used to modify term 110 | frequencies for documents. 111 | """ 112 | logger.info("collecting document frequencies") 113 | dfs = {} 114 | numnnz, docno = 0, -1 115 | for docno, bow in enumerate(corpus): 116 | if docno % 10000 == 0: 117 | logger.info("PROGRESS: processing document #%i" % docno) 118 | numnnz += len(bow) 119 | for termid, _ in bow: 120 | dfs[termid] = dfs.get(termid, 0) + 1 121 | 122 | # keep some stats about the training corpus 123 | self.num_docs = docno + 1 124 | self.num_nnz = numnnz 125 | self.dfs = dfs 126 | 127 | # and finally compute the idf weights 128 | n_features = max(dfs) if dfs else 0 129 | logger.info("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" % 130 | (self.num_docs, n_features, self.num_nnz)) 131 | self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) 132 | 133 | 134 | def __getitem__(self, bow, eps=1e-12): 135 | """ 136 | Return tf-idf representation of the input vector and/or corpus. 
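        With the default `wlocal` and `wglobal`, a term appearing in 10 of
        1,000 training documents has idf = log_2(1000 / 10) ~= 6.64, so an
        in-document count `tf` contributes a raw weight of tf * 6.64 before
        any normalization (illustrative numbers).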
137 | """ 138 | # if the input vector is in fact a corpus, return a transformed corpus as a result 139 | is_corpus, bow = utils.is_corpus(bow) 140 | if is_corpus: 141 | return self._apply(bow) 142 | 143 | # unknown (new) terms will be given zero weight (NOT infinity/huge weight, 144 | # as strict application of the IDF formula would dictate) 145 | vector = [(termid, self.wlocal(tf) * self.idfs.get(termid)) 146 | for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0] 147 | 148 | # and finally, normalize the vector either to unit length, or use a 149 | # user-defined normalization function 150 | if self.normalize is True: 151 | vector = matutils.unitvec(vector) 152 | elif self.normalize: 153 | vector = self.normalize(vector) 154 | 155 | # make sure there are no explicit zeroes in the vector (must be sparse) 156 | vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] 157 | return vector 158 | #endclass TfidfModel 159 | -------------------------------------------------------------------------------- /gensim/models/tfidfmodel.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/tfidfmodel.pyc -------------------------------------------------------------------------------- /gensim/models/voidptr.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #if PY_VERSION_HEX >= 0x03020000 4 | 5 | /* 6 | ** compatibility with python >= 3.2, which doesn't have CObject anymore 7 | */ 8 | static void * PyCObject_AsVoidPtr(PyObject *obj) 9 | { 10 | void *ret = PyCapsule_GetPointer(obj, NULL); 11 | if (ret == NULL) { 12 | PyErr_Clear(); 13 | } 14 | return ret; 15 | } 16 | 17 | #endif -------------------------------------------------------------------------------- /gensim/models/word2vec.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/word2vec.pyc -------------------------------------------------------------------------------- /gensim/nosy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A simple testrunner for nose (or anything else). 5 | 6 | Watch for changes in all file types specified in 'EXTENSIONS'. 7 | If changes, run test executable in 'EXECUTABLE', with default 8 | arguments 'DEFAULTARGS'. 9 | 10 | The --with-color option needs the "rudolf" nose plugin. See: 11 | http://pypi.python.org/pypi/rudolf/ 12 | 13 | Originally by Jeff Winkler, http://jeffwinkler.net 14 | Forked from wkral http://github.com/wkral/Nosy 15 | """ 16 | 17 | import os 18 | import stat 19 | import time 20 | import datetime 21 | import sys 22 | import fnmatch 23 | 24 | 25 | EXTENSIONS = ['*.py'] 26 | EXECUTABLE = 'nosetests test/' 27 | DEFAULTARGS = '--with-color -exe'# -w tests' 28 | 29 | 30 | def checkSum(): 31 | """ 32 | Return a long which can be used to know if any .py files have changed. 
33 | """ 34 | val = 0 35 | for root, dirs, files in os.walk(os.getcwd()): 36 | for extension in EXTENSIONS: 37 | for f in fnmatch.filter(files, extension): 38 | stats = os.stat(os.path.join(root, f)) 39 | val += stats[stat.ST_SIZE] + stats[stat.ST_MTIME] 40 | return val 41 | 42 | if __name__ == '__main__': 43 | val = 0 44 | try: 45 | while True: 46 | if checkSum() != val: 47 | val = checkSum() 48 | os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, 49 | ' '.join(sys.argv[1:]))) 50 | print(datetime.datetime.now().__str__()) 51 | print('=' * 77) 52 | time.sleep(1) 53 | except KeyboardInterrupt: 54 | print('Goodbye') 55 | -------------------------------------------------------------------------------- /gensim/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains functions to preprocess raw text 3 | """ 4 | 5 | # bring model classes directly into package namespace, to save some typing 6 | from .porter import PorterStemmer 7 | from .preprocessing import * 8 | -------------------------------------------------------------------------------- /gensim/parsing/porter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Porter Stemming Algorithm 4 | This is the Porter stemming algorithm, ported to Python from the 5 | version coded up in ANSI C by the author. It may be be regarded 6 | as canonical, in that it follows the algorithm presented in 7 | 8 | Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 9 | no. 3, pp 130-137, 10 | 11 | only differing from it at the points maked --DEPARTURE-- below. 12 | 13 | See also http://www.tartarus.org/~martin/PorterStemmer 14 | 15 | The algorithm as described in the paper could be exactly replicated 16 | by adjusting the points of DEPARTURE, but this is barely necessary, 17 | because (a) the points of DEPARTURE are definitely improvements, and 18 | (b) no encoding of the Porter stemmer I have seen is anything like 19 | as exact as this version, even with the points of DEPARTURE! 20 | 21 | Vivake Gupta (v@nano.com) 22 | 23 | Release 1: January 2001 24 | 25 | Further adjustments by Santiago Bruno (bananabruno@gmail.com) 26 | to allow word input not restricted to one word per line, leading 27 | to: 28 | 29 | Release 2: July 2008 30 | 31 | Optimizations and cleanup of the code by Lars Buitinck, July 2012. 32 | """ 33 | 34 | 35 | from six.moves import xrange 36 | 37 | 38 | class PorterStemmer(object): 39 | def __init__(self): 40 | """The main part of the stemming algorithm starts here. 41 | b is a buffer holding a word to be stemmed. The letters are in b[0], 42 | b[1] ... ending at b[k]. k is readjusted downwards as the stemming 43 | progresses. 44 | 45 | Note that only lower case sequences are stemmed. Forcing to lower case 46 | should be done before stem(...) is called. 47 | """ 48 | 49 | self.b = "" # buffer for word to be stemmed 50 | self.k = 0 51 | self.j = 0 # j is a general offset into the string 52 | 53 | def _cons(self, i): 54 | """True <=> b[i] is a consonant.""" 55 | ch = self.b[i] 56 | if ch in "aeiou": 57 | return False 58 | if ch == 'y': 59 | return i == 0 or not self._cons(i - 1) 60 | return True 61 | 62 | def _m(self): 63 | """Returns the number of consonant sequences between 0 and j. 64 | 65 | If c is a consonant sequence and v a vowel sequence, and <..> 66 | indicates arbitrary presence, 67 | 68 | gives 0 69 | vc gives 1 70 | vcvc gives 2 71 | vcvcvc gives 3 72 | .... 
73 | """ 74 | i = 0 75 | while True: 76 | if i > self.j: 77 | return 0 78 | if not self._cons(i): 79 | break 80 | i += 1 81 | i += 1 82 | n = 0 83 | while True: 84 | while True: 85 | if i > self.j: 86 | return n 87 | if self._cons(i): 88 | break 89 | i += 1 90 | i += 1 91 | n += 1 92 | while 1: 93 | if i > self.j: 94 | return n 95 | if not self._cons(i): 96 | break 97 | i += 1 98 | i += 1 99 | 100 | def _vowelinstem(self): 101 | """True <=> 0,...j contains a vowel""" 102 | return not all(self._cons(i) for i in xrange(self.j + 1)) 103 | 104 | def _doublec(self, j): 105 | """True <=> j,(j-1) contain a double consonant.""" 106 | return j > 0 and self.b[j] == self.b[j-1] and self._cons(j) 107 | 108 | def _cvc(self, i): 109 | """True <=> i-2,i-1,i has the form consonant - vowel - consonant 110 | and also if the second c is not w,x or y. This is used when trying to 111 | restore an e at the end of a short word, e.g. 112 | 113 | cav(e), lov(e), hop(e), crim(e), but 114 | snow, box, tray. 115 | """ 116 | if i < 2 or not self._cons(i) or self._cons(i-1) or not self._cons(i-2): 117 | return False 118 | return self.b[i] not in "wxy" 119 | 120 | def _ends(self, s): 121 | """True <=> 0,...k ends with the string s.""" 122 | if s[-1] != self.b[self.k]: # tiny speed-up 123 | return 0 124 | length = len(s) 125 | if length > (self.k + 1): 126 | return 0 127 | if self.b[self.k-length+1:self.k+1] != s: 128 | return 0 129 | self.j = self.k - length 130 | return 1 131 | 132 | def _setto(self, s): 133 | """Set (j+1),...k to the characters in the string s, adjusting k.""" 134 | self.b = self.b[:self.j+1] + s 135 | self.k = len(self.b) - 1 136 | 137 | def _r(self, s): 138 | if self._m() > 0: 139 | self._setto(s) 140 | 141 | def _step1ab(self): 142 | """Get rid of plurals and -ed or -ing. E.g., 143 | 144 | caresses -> caress 145 | ponies -> poni 146 | ties -> ti 147 | caress -> caress 148 | cats -> cat 149 | 150 | feed -> feed 151 | agreed -> agree 152 | disabled -> disable 153 | 154 | matting -> mat 155 | mating -> mate 156 | meeting -> meet 157 | milling -> mill 158 | messing -> mess 159 | 160 | meetings -> meet 161 | """ 162 | if self.b[self.k] == 's': 163 | if self._ends("sses"): 164 | self.k -= 2 165 | elif self._ends("ies"): 166 | self._setto("i") 167 | elif self.b[self.k - 1] != 's': 168 | self.k -= 1 169 | if self._ends("eed"): 170 | if self._m() > 0: 171 | self.k -= 1 172 | elif (self._ends("ed") or self._ends("ing")) and self._vowelinstem(): 173 | self.k = self.j 174 | if self._ends("at"): self._setto("ate") 175 | elif self._ends("bl"): self._setto("ble") 176 | elif self._ends("iz"): self._setto("ize") 177 | elif self._doublec(self.k): 178 | if self.b[self.k - 1] not in "lsz": 179 | self.k -= 1 180 | elif self._m() == 1 and self._cvc(self.k): 181 | self._setto("e") 182 | 183 | def _step1c(self): 184 | """Turn terminal y to i when there is another vowel in the stem.""" 185 | if self._ends("y") and self._vowelinstem(): 186 | self.b = self.b[:self.k] + 'i' 187 | 188 | def _step2(self): 189 | """Map double suffices to single ones. 190 | 191 | So, -ization ( = -ize plus -ation) maps to -ize etc. Note that the 192 | string before the suffix must give _m() > 0. 
193 | """ 194 | ch = self.b[self.k - 1] 195 | if ch == 'a': 196 | if self._ends("ational"): self._r("ate") 197 | elif self._ends("tional"): self._r("tion") 198 | elif ch == 'c': 199 | if self._ends("enci"): self._r("ence") 200 | elif self._ends("anci"): self._r("ance") 201 | elif ch == 'e': 202 | if self._ends("izer"): self._r("ize") 203 | elif ch == 'l': 204 | if self._ends("bli"): self._r("ble") # --DEPARTURE-- 205 | # To match the published algorithm, replace this phrase with 206 | # if self._ends("abli"): self._r("able") 207 | elif self._ends("alli"): self._r("al") 208 | elif self._ends("entli"): self._r("ent") 209 | elif self._ends("eli"): self._r("e") 210 | elif self._ends("ousli"): self._r("ous") 211 | elif ch == 'o': 212 | if self._ends("ization"): self._r("ize") 213 | elif self._ends("ation"): self._r("ate") 214 | elif self._ends("ator"): self._r("ate") 215 | elif ch == 's': 216 | if self._ends("alism"): self._r("al") 217 | elif self._ends("iveness"): self._r("ive") 218 | elif self._ends("fulness"): self._r("ful") 219 | elif self._ends("ousness"): self._r("ous") 220 | elif ch == 't': 221 | if self._ends("aliti"): self._r("al") 222 | elif self._ends("iviti"): self._r("ive") 223 | elif self._ends("biliti"): self._r("ble") 224 | elif ch == 'g': # --DEPARTURE-- 225 | if self._ends("logi"): self._r("log") 226 | # To match the published algorithm, delete this phrase 227 | 228 | def _step3(self): 229 | """Deal with -ic-, -full, -ness etc. Similar strategy to _step2.""" 230 | ch = self.b[self.k] 231 | if ch == 'e': 232 | if self._ends("icate"): self._r("ic") 233 | elif self._ends("ative"): self._r("") 234 | elif self._ends("alize"): self._r("al") 235 | elif ch == 'i': 236 | if self._ends("iciti"): self._r("ic") 237 | elif ch == 'l': 238 | if self._ends("ical"): self._r("ic") 239 | elif self._ends("ful"): self._r("") 240 | elif ch == 's': 241 | if self._ends("ness"): self._r("") 242 | 243 | def _step4(self): 244 | """_step4() takes off -ant, -ence etc., in context vcvc.""" 245 | ch = self.b[self.k - 1] 246 | if ch == 'a': 247 | if not self._ends("al"): return 248 | elif ch == 'c': 249 | if not self._ends("ance") and not self._ends("ence"): return 250 | elif ch == 'e': 251 | if not self._ends("er"): return 252 | elif ch == 'i': 253 | if not self._ends("ic"): return 254 | elif ch == 'l': 255 | if not self._ends("able") and not self._ends("ible"): return 256 | elif ch == 'n': 257 | if self._ends("ant"): pass 258 | elif self._ends("ement"): pass 259 | elif self._ends("ment"): pass 260 | elif self._ends("ent"): pass 261 | else: return 262 | elif ch == 'o': 263 | if self._ends("ion") and self.b[self.j] in "st": pass 264 | elif self._ends("ou"): pass 265 | # takes care of -ous 266 | else: return 267 | elif ch == 's': 268 | if not self._ends("ism"): return 269 | elif ch == 't': 270 | if not self._ends("ate") and not self._ends("iti"): return 271 | elif ch == 'u': 272 | if not self._ends("ous"): return 273 | elif ch == 'v': 274 | if not self._ends("ive"): return 275 | elif ch == 'z': 276 | if not self._ends("ize"): return 277 | else: 278 | return 279 | if self._m() > 1: 280 | self.k = self.j 281 | 282 | def _step5(self): 283 | """Remove a final -e if _m() > 1, and change -ll to -l if m() > 1. 
284 | """ 285 | k = self.j = self.k 286 | if self.b[k] == 'e': 287 | a = self._m() 288 | if a > 1 or (a == 1 and not self._cvc(k - 1)): 289 | self.k -= 1 290 | if self.b[self.k] == 'l' and self._doublec(self.k) and self._m() > 1: 291 | self.k -= 1 292 | 293 | def stem(self, w): 294 | """Stem the word w, return the stemmed form.""" 295 | w = w.lower() 296 | k = len(w) - 1 297 | if k <= 1: 298 | return w # --DEPARTURE-- 299 | 300 | # With this line, strings of length 1 or 2 don't go through the 301 | # stemming process, although no mention is made of this in the 302 | # published algorithm. Remove the line to match the published 303 | # algorithm. 304 | 305 | self.b = w 306 | self.k = k 307 | 308 | self._step1ab() 309 | self._step1c() 310 | self._step2() 311 | self._step3() 312 | self._step4() 313 | self._step5() 314 | return self.b[:self.k+1] 315 | 316 | def stem_sentence(self, txt): 317 | return " ".join(map(self.stem, txt.split())) 318 | 319 | def stem_documents(self, docs): 320 | return map(self.stem_sentence, docs) 321 | 322 | 323 | if __name__ == '__main__': 324 | import sys 325 | 326 | p = PorterStemmer() 327 | 328 | for f in sys.argv[1:]: 329 | with open(f) as infile: 330 | for line in infile: 331 | print(p.stem_sentence(line)) 332 | -------------------------------------------------------------------------------- /gensim/parsing/preprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import glob 4 | 5 | from gensim import utils 6 | from gensim.parsing.porter import PorterStemmer 7 | 8 | 9 | # improved list from Stone, Denis, Kwantes (2010) 10 | STOPWORDS = """ 11 | a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be 12 | became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can 13 | cannot cant co computer con could couldnt cry de describe 14 | detail did do does doesn doing don done down due during 15 | each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen 16 | fify fill find fire first five for former formerly forty found four from front full further get give go 17 | had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie 18 | if in inc indeed interest into is it its itself keep last latter latterly least less ltd 19 | just 20 | kg km 21 | made many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely 22 | neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off 23 | often on once one only onto or other others otherwise our ours ourselves out over own part per 24 | perhaps please put rather re 25 | quite 26 | rather really regarding 27 | same see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten 28 | than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under 29 | until up unless upon us used using 30 | various very very via 31 | was 
we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you 32 | your yours yourself yourselves 33 | """ 34 | STOPWORDS = frozenset(w for w in STOPWORDS.split() if w) 35 | 36 | 37 | def remove_stopwords(s): 38 | s = utils.to_unicode(s) 39 | return " ".join(w for w in s.split() if w not in STOPWORDS) 40 | 41 | 42 | RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE) 43 | def strip_punctuation(s): 44 | s = utils.to_unicode(s) 45 | return RE_PUNCT.sub(" ", s) 46 | 47 | 48 | # unicode.translate cannot delete characters like str can 49 | strip_punctuation2 = strip_punctuation 50 | # def strip_punctuation2(s): 51 | # s = utils.to_unicode(s) 52 | # return s.translate(None, string.punctuation) 53 | 54 | 55 | RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE) 56 | def strip_tags(s): 57 | s = utils.to_unicode(s) 58 | return RE_TAGS.sub("",s) 59 | 60 | 61 | def strip_short(s, minsize=3): 62 | s = utils.to_unicode(s) 63 | return " ".join(e for e in s.split() if len(e) >= minsize) 64 | 65 | 66 | RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE) 67 | def strip_numeric(s): 68 | s = utils.to_unicode(s) 69 | return RE_NUMERIC.sub("", s) 70 | 71 | 72 | RE_NONALPHA = re.compile(r"\W", re.UNICODE) 73 | def strip_non_alphanum(s): 74 | s = utils.to_unicode(s) 75 | return RE_NONALPHA.sub(" ", s) 76 | 77 | 78 | RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) 79 | def strip_multiple_whitespaces(s): 80 | s = utils.to_unicode(s) 81 | return RE_WHITESPACE.sub(" ", s) 82 | 83 | 84 | RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE) 85 | RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE) 86 | def split_alphanum(s): 87 | s = utils.to_unicode(s) 88 | s = RE_AL_NUM.sub(r"\1 \2", s) 89 | return RE_NUM_AL.sub(r"\1 \2", s) 90 | 91 | 92 | def stem_text(text): 93 | """ 94 | Return lowercase and (porter-)stemmed version of string `text`. 
95 | """ 96 | text = utils.to_unicode(text) 97 | p = PorterStemmer() 98 | return ' '.join(p.stem(word) for word in text.split()) 99 | stem = stem_text 100 | 101 | DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, 102 | strip_numeric, remove_stopwords, strip_short, stem_text] 103 | 104 | 105 | def preprocess_string(s, filters=DEFAULT_FILTERS): 106 | s = utils.to_unicode(s) 107 | for f in filters: 108 | s = f(s) 109 | return s.split() 110 | 111 | 112 | def preprocess_documents(docs): 113 | return [preprocess_string(d) for d in docs] 114 | 115 | 116 | def read_file(path): 117 | with utils.smart_open(path) as fin: 118 | return fin.read() 119 | 120 | 121 | def read_files(pattern): 122 | return [read_file(fname) for fname in glob.glob(pattern)] 123 | -------------------------------------------------------------------------------- /gensim/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/scripts/__init__.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | 22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 23 | disk space; gensim's corpus iterators can work with compressed input, too. 24 | 25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 26 | removing tokens that appear in more than 10%% of all documents). Defaults to 27 | 50,000. 28 | 29 | If you have the `pattern` package installed, this script will use a fancy 30 | lemmatization to get a lemma of each token (instead of plain alphabetic 31 | tokenizer). The package is available at https://github.com/clips/pattern . 32 | 33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 34 | """ 35 | 36 | 37 | import logging 38 | import os.path 39 | import sys 40 | 41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 42 | from gensim.models import TfidfModel 43 | 44 | 45 | # Wiki is first scanned for all distinct word types (~7M). The types that 46 | # appear in more than 10% of articles are removed and from the rest, the 47 | # DEFAULT_DICT_SIZE most frequent types are kept. 
48 | DEFAULT_DICT_SIZE = 100000 49 | 50 | 51 | if __name__ == '__main__': 52 | program = os.path.basename(sys.argv[0]) 53 | logger = logging.getLogger(program) 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 56 | logging.root.setLevel(level=logging.INFO) 57 | logger.info("running %s" % ' '.join(sys.argv)) 58 | 59 | # check and process input arguments 60 | if len(sys.argv) < 3: 61 | print(globals()['__doc__'] % locals()) 62 | sys.exit(1) 63 | inp, outp = sys.argv[1:3] 64 | if len(sys.argv) > 3: 65 | keep_words = int(sys.argv[3]) 66 | else: 67 | keep_words = DEFAULT_DICT_SIZE 68 | online = 'online' in program 69 | lemmatize = 'lemma' in program 70 | debug = 'nodebug' not in program 71 | 72 | if online: 73 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 74 | dictionary.allow_update = True # start collecting document frequencies 75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 79 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 80 | wiki.save(outp + '_corpus.pkl.bz2') 81 | dictionary.allow_update = False 82 | else: 83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 84 | # only keep the most frequent words (out of total ~8.2m unique tokens) 85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 86 | # save dictionary and bag-of-words (term-document frequency matrix) 87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 89 | # load back the id->word mapping directly from file 90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above 91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') 92 | del wiki 93 | 94 | # initialize corpus reader and word->id mapping 95 | mm = MmCorpus(outp + '_bow.mm') 96 | 97 | # build tfidf, ~50min 98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 99 | 100 | # save tfidf vectors in matrix market format 101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 103 | 104 | logger.info("finished running %s" % program) 105 | -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_lemma.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 
14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | 22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 23 | disk space; gensim's corpus iterators can work with compressed input, too. 24 | 25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 26 | removing tokens that appear in more than 10%% of all documents). Defaults to 27 | 50,000. 28 | 29 | If you have the `pattern` package installed, this script will use a fancy 30 | lemmatization to get a lemma of each token (instead of plain alphabetic 31 | tokenizer). The package is available at https://github.com/clips/pattern . 32 | 33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 34 | """ 35 | 36 | 37 | import logging 38 | import os.path 39 | import sys 40 | 41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 42 | from gensim.models import TfidfModel 43 | 44 | 45 | # Wiki is first scanned for all distinct word types (~7M). The types that 46 | # appear in more than 10% of articles are removed and from the rest, the 47 | # DEFAULT_DICT_SIZE most frequent types are kept. 48 | DEFAULT_DICT_SIZE = 100000 49 | 50 | 51 | if __name__ == '__main__': 52 | program = os.path.basename(sys.argv[0]) 53 | logger = logging.getLogger(program) 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 56 | logging.root.setLevel(level=logging.INFO) 57 | logger.info("running %s" % ' '.join(sys.argv)) 58 | 59 | # check and process input arguments 60 | if len(sys.argv) < 3: 61 | print(globals()['__doc__'] % locals()) 62 | sys.exit(1) 63 | inp, outp = sys.argv[1:3] 64 | if len(sys.argv) > 3: 65 | keep_words = int(sys.argv[3]) 66 | else: 67 | keep_words = DEFAULT_DICT_SIZE 68 | online = 'online' in program 69 | lemmatize = 'lemma' in program 70 | debug = 'nodebug' not in program 71 | 72 | if online: 73 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 74 | dictionary.allow_update = True # start collecting document frequencies 75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 79 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 80 | wiki.save(outp + '_corpus.pkl.bz2') 81 | dictionary.allow_update = False 82 | else: 83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 84 | # only keep the most frequent words (out of total ~8.2m unique tokens) 85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 86 | # save dictionary and bag-of-words (term-document frequency matrix) 87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 89 | # load back the id->word mapping directly from file 90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above 91 | dictionary = 
Dictionary.load_from_text(outp + '_wordids.txt.bz2') 92 | del wiki 93 | 94 | # initialize corpus reader and word->id mapping 95 | mm = MmCorpus(outp + '_bow.mm') 96 | 97 | # build tfidf, ~50min 98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 99 | 100 | # save tfidf vectors in matrix market format 101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 103 | 104 | logger.info("finished running %s" % program) 105 | -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | 22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 23 | disk space; gensim's corpus iterators can work with compressed input, too. 24 | 25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 26 | removing tokens that appear in more than 10%% of all documents). Defaults to 27 | 50,000. 28 | 29 | If you have the `pattern` package installed, this script will use a fancy 30 | lemmatization to get a lemma of each token (instead of plain alphabetic 31 | tokenizer). The package is available at https://github.com/clips/pattern . 32 | 33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 34 | """ 35 | 36 | 37 | import logging 38 | import os.path 39 | import sys 40 | 41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 42 | from gensim.models import TfidfModel 43 | 44 | 45 | # Wiki is first scanned for all distinct word types (~7M). The types that 46 | # appear in more than 10% of articles are removed and from the rest, the 47 | # DEFAULT_DICT_SIZE most frequent types are kept. 
48 | DEFAULT_DICT_SIZE = 100000 49 | 50 | 51 | if __name__ == '__main__': 52 | program = os.path.basename(sys.argv[0]) 53 | logger = logging.getLogger(program) 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 56 | logging.root.setLevel(level=logging.INFO) 57 | logger.info("running %s" % ' '.join(sys.argv)) 58 | 59 | # check and process input arguments 60 | if len(sys.argv) < 3: 61 | print(globals()['__doc__'] % locals()) 62 | sys.exit(1) 63 | inp, outp = sys.argv[1:3] 64 | if len(sys.argv) > 3: 65 | keep_words = int(sys.argv[3]) 66 | else: 67 | keep_words = DEFAULT_DICT_SIZE 68 | online = 'online' in program 69 | lemmatize = 'lemma' in program 70 | debug = 'nodebug' not in program 71 | 72 | if online: 73 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 74 | dictionary.allow_update = True # start collecting document frequencies 75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 79 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 80 | wiki.save(outp + '_corpus.pkl.bz2') 81 | dictionary.allow_update = False 82 | else: 83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 84 | # only keep the most frequent words (out of total ~8.2m unique tokens) 85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 86 | # save dictionary and bag-of-words (term-document frequency matrix) 87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 89 | # load back the id->word mapping directly from file 90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above 91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') 92 | del wiki 93 | 94 | # initialize corpus reader and word->id mapping 95 | mm = MmCorpus(outp + '_bow.mm') 96 | 97 | # build tfidf, ~50min 98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 99 | 100 | # save tfidf vectors in matrix market format 101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 103 | 104 | logger.info("finished running %s" % program) 105 | -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online_lemma.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 
14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | 22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 23 | disk space; gensim's corpus iterators can work with compressed input, too. 24 | 25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 26 | removing tokens that appear in more than 10%% of all documents). Defaults to 27 | 50,000. 28 | 29 | If you have the `pattern` package installed, this script will use a fancy 30 | lemmatization to get a lemma of each token (instead of plain alphabetic 31 | tokenizer). The package is available at https://github.com/clips/pattern . 32 | 33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 34 | """ 35 | 36 | 37 | import logging 38 | import os.path 39 | import sys 40 | 41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 42 | from gensim.models import TfidfModel 43 | 44 | 45 | # Wiki is first scanned for all distinct word types (~7M). The types that 46 | # appear in more than 10% of articles are removed and from the rest, the 47 | # DEFAULT_DICT_SIZE most frequent types are kept. 48 | DEFAULT_DICT_SIZE = 100000 49 | 50 | 51 | if __name__ == '__main__': 52 | program = os.path.basename(sys.argv[0]) 53 | logger = logging.getLogger(program) 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 56 | logging.root.setLevel(level=logging.INFO) 57 | logger.info("running %s" % ' '.join(sys.argv)) 58 | 59 | # check and process input arguments 60 | if len(sys.argv) < 3: 61 | print(globals()['__doc__'] % locals()) 62 | sys.exit(1) 63 | inp, outp = sys.argv[1:3] 64 | if len(sys.argv) > 3: 65 | keep_words = int(sys.argv[3]) 66 | else: 67 | keep_words = DEFAULT_DICT_SIZE 68 | online = 'online' in program 69 | lemmatize = 'lemma' in program 70 | debug = 'nodebug' not in program 71 | 72 | if online: 73 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 74 | dictionary.allow_update = True # start collecting document frequencies 75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 79 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 80 | wiki.save(outp + '_corpus.pkl.bz2') 81 | dictionary.allow_update = False 82 | else: 83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 84 | # only keep the most frequent words (out of total ~8.2m unique tokens) 85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 86 | # save dictionary and bag-of-words (term-document frequency matrix) 87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 89 | # load back the id->word mapping directly from file 90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above 91 | dictionary = 
Dictionary.load_from_text(outp + '_wordids.txt.bz2') 92 | del wiki 93 | 94 | # initialize corpus reader and word->id mapping 95 | mm = MmCorpus(outp + '_bow.mm') 96 | 97 | # build tfidf, ~50min 98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 99 | 100 | # save tfidf vectors in matrix market format 101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 103 | 104 | logger.info("finished running %s" % program) 105 | -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online_nodebug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | 22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 23 | disk space; gensim's corpus iterators can work with compressed input, too. 24 | 25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 26 | removing tokens that appear in more than 10%% of all documents). Defaults to 27 | 50,000. 28 | 29 | If you have the `pattern` package installed, this script will use a fancy 30 | lemmatization to get a lemma of each token (instead of plain alphabetic 31 | tokenizer). The package is available at https://github.com/clips/pattern . 32 | 33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 34 | """ 35 | 36 | 37 | import logging 38 | import os.path 39 | import sys 40 | 41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 42 | from gensim.models import TfidfModel 43 | 44 | 45 | # Wiki is first scanned for all distinct word types (~7M). The types that 46 | # appear in more than 10% of articles are removed and from the rest, the 47 | # DEFAULT_DICT_SIZE most frequent types are kept. 
48 | DEFAULT_DICT_SIZE = 100000 49 | 50 | 51 | if __name__ == '__main__': 52 | program = os.path.basename(sys.argv[0]) 53 | logger = logging.getLogger(program) 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 56 | logging.root.setLevel(level=logging.INFO) 57 | logger.info("running %s" % ' '.join(sys.argv)) 58 | 59 | # check and process input arguments 60 | if len(sys.argv) < 3: 61 | print(globals()['__doc__'] % locals()) 62 | sys.exit(1) 63 | inp, outp = sys.argv[1:3] 64 | if len(sys.argv) > 3: 65 | keep_words = int(sys.argv[3]) 66 | else: 67 | keep_words = DEFAULT_DICT_SIZE 68 | online = 'online' in program 69 | lemmatize = 'lemma' in program 70 | debug = 'nodebug' not in program 71 | 72 | if online: 73 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 74 | dictionary.allow_update = True # start collecting document frequencies 75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 79 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 80 | wiki.save(outp + '_corpus.pkl.bz2') 81 | dictionary.allow_update = False 82 | else: 83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 84 | # only keep the most frequent words (out of total ~8.2m unique tokens) 85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 86 | # save dictionary and bag-of-words (term-document frequency matrix) 87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 89 | # load back the id->word mapping directly from file 90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above 91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') 92 | del wiki 93 | 94 | # initialize corpus reader and word->id mapping 95 | mm = MmCorpus(outp + '_bow.mm') 96 | 97 | # build tfidf, ~50min 98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 99 | 100 | # save tfidf vectors in matrix market format 101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 103 | 104 | logger.info("finished running %s" % program) 105 | -------------------------------------------------------------------------------- /gensim/scripts/make_wikicorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 
14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | 22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 23 | disk space; gensim's corpus iterators can work with compressed input, too. 24 | 25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 26 | removing tokens that appear in more than 10%% of all documents). Defaults to 27 | 50,000. 28 | 29 | If you have the `pattern` package installed, this script will use a fancy 30 | lemmatization to get a lemma of each token (instead of plain alphabetic 31 | tokenizer). The package is available at https://github.com/clips/pattern . 32 | 33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 34 | """ 35 | 36 | 37 | import logging 38 | import os.path 39 | import sys 40 | 41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 42 | from gensim.models import TfidfModel 43 | 44 | 45 | # Wiki is first scanned for all distinct word types (~7M). The types that 46 | # appear in more than 10% of articles are removed and from the rest, the 47 | # DEFAULT_DICT_SIZE most frequent types are kept. 48 | DEFAULT_DICT_SIZE = 100000 49 | 50 | 51 | if __name__ == '__main__': 52 | program = os.path.basename(sys.argv[0]) 53 | logger = logging.getLogger(program) 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 56 | logging.root.setLevel(level=logging.INFO) 57 | logger.info("running %s" % ' '.join(sys.argv)) 58 | 59 | # check and process input arguments 60 | if len(sys.argv) < 3: 61 | print(globals()['__doc__'] % locals()) 62 | sys.exit(1) 63 | inp, outp = sys.argv[1:3] 64 | if len(sys.argv) > 3: 65 | keep_words = int(sys.argv[3]) 66 | else: 67 | keep_words = DEFAULT_DICT_SIZE 68 | online = 'online' in program 69 | lemmatize = 'lemma' in program 70 | debug = 'nodebug' not in program 71 | 72 | if online: 73 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 74 | dictionary.allow_update = True # start collecting document frequencies 75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 79 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 80 | wiki.save(outp + '_corpus.pkl.bz2') 81 | dictionary.allow_update = False 82 | else: 83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 84 | # only keep the most frequent words (out of total ~8.2m unique tokens) 85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 86 | # save dictionary and bag-of-words (term-document frequency matrix) 87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 89 | # load back the id->word mapping directly from file 90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above 91 | dictionary = 
Dictionary.load_from_text(outp + '_wordids.txt.bz2') 92 | del wiki 93 | 94 | # initialize corpus reader and word->id mapping 95 | mm = MmCorpus(outp + '_bow.mm') 96 | 97 | # build tfidf, ~50min 98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 99 | 100 | # save tfidf vectors in matrix market format 101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 103 | 104 | logger.info("finished running %s" % program) 105 | -------------------------------------------------------------------------------- /gensim/similarities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains implementations of pairwise similarity queries. 3 | """ 4 | 5 | # bring classes directly into package namespace, to save some typing 6 | from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity 7 | -------------------------------------------------------------------------------- /gensim/similarities/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/similarities/__init__.pyc -------------------------------------------------------------------------------- /gensim/similarities/docsim.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/similarities/docsim.pyc -------------------------------------------------------------------------------- /gensim/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/utils.pyc -------------------------------------------------------------------------------- /gensim_addons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/__init__.py -------------------------------------------------------------------------------- /gensim_addons/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/__init__.pyc -------------------------------------------------------------------------------- /gensim_addons/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/models/__init__.py -------------------------------------------------------------------------------- /gensim_addons/models/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/models/__init__.pyc -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ### Paragraph vectors DM Model 2 | 3 | Usage: `python test.py training_text testing_text` 4 | 5 | Output: `para_vectors_train.txt` and `para_vectors_test.txt` 6 | 7 | 8 |
If you use the code, please cite this paper: 9 | 10 | Yang Liu, Zhiyuan Liu, Tat-Seng Chua, Maosong Sun. *Topical Word Embeddings*. The 29th AAAI Conference on Artificial Intelligence (AAAI'15). 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2012 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Run with: 9 | 10 | sudo python ./setup.py install 11 | """ 12 | 13 | import os 14 | import sys 15 | 16 | if sys.version_info[:2] < (2, 5): 17 | raise Exception('This version of gensim needs Python 2.5 or later. ') 18 | 19 | import ez_setup 20 | ez_setup.use_setuptools() 21 | from setuptools import setup, find_packages, Extension 22 | 23 | 24 | # Commonly used information 25 | pkg_name = 'gensim' 26 | pkg_ver = '0.10.1' 27 | pkg_desc = 'Python framework for fast Vector Space Modelling' 28 | 29 | # there is a bug in python2.5, preventing distutils from using any non-ascii characters :( http://bugs.python.org/issue2562 30 | pkg_author = 'Radim Rehurek' # u'Radim Řehůřek', # <- should really be this... 31 | pkg_author_email = 'radimrehurek@seznam.cz' 32 | pkg_url = 'http://radimrehurek.com/gensim' 33 | pkg_download_url = 'http://pypi.python.org/pypi/gensim' 34 | 35 | pkg_keywords = ('Singular Value Decomposition, SVD, Latent Semantic Indexing, ' 36 | 'LSA, LSI, Latent Dirichlet Allocation, LDA, ' 37 | 'Hierarchical Dirichlet Process, HDP, Random Projections, ' 38 | 'TFIDF, word2vec') 39 | 40 | pkg_classifiers = [ # from http://pypi.python.org/pypi?%3Aaction=list_classifiers 41 | 'Development Status :: 5 - Production/Stable', 42 | 'Environment :: Console', 43 | 'Intended Audience :: Science/Research', 44 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', 45 | 'Operating System :: OS Independent', 46 | 'Programming Language :: Python :: 2.6', 47 | 'Programming Language :: Python :: 3.3', 48 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 49 | 'Topic :: Scientific/Engineering :: Information Analysis', 50 | 'Topic :: Text Processing :: Linguistic', 51 | ] 52 | 53 | pkg_license = 'LGPL' 54 | 55 | def read(fname): 56 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 57 | 58 | native_ext = False 59 | 60 | setup( 61 | name = pkg_name, 62 | version = pkg_ver, 63 | description = pkg_desc, 64 | long_description = read('README.rst'), 65 | 66 | packages = find_packages(exclude=[ pkg_name + '_addons', pkg_name + '_addons.*']), 67 | 68 | author = pkg_author, 69 | author_email = pkg_author_email, 70 | 71 | url = pkg_url, 72 | download_url = pkg_download_url, 73 | 74 | keywords = pkg_keywords, 75 | 76 | license = pkg_license, 77 | platforms = 'any', 78 | 79 | zip_safe = False, 80 | 81 | classifiers = pkg_classifiers, 82 | 83 | test_suite = "gensim.test", 84 | 85 | install_requires = [ 86 | 'scipy >= 0.7.0', 87 | 'six >= 1.2.0', 88 | ], 89 | 90 | extras_require = { 91 | 'distributed': ['Pyro4 >= 4.8'], 92 | }, 93 | 94 | include_package_data = True, 95 | 96 | # lines below are commented out to avoid installing system-wide scripts 97 | # scripts can be run by running `python -m
module_name` instead: less 98 | # flexible but more explicit and imo cleaner. 99 | # entry_points = { 100 | # 'console_scripts': [ 101 | # 'lsi_worker = gensim.models.lsi_worker:main', 102 | # 'lsi_dispatcher = gensim.models.lsi_dispatcher:main', 103 | # ], 104 | # }, 105 | 106 | ) 107 | 108 | # Here comes the setup for cythonized native addon-extension. 109 | 110 | # try: 111 | # from Cython.Distutils import build_ext 112 | # import numpy 113 | # models_dir = os.path.join(os.path.dirname(__file__), 'gensim', 'models') 114 | 115 | # ext_modules = [ 116 | # Extension('gensim_addons.models.word2vec_inner', 117 | # ['gensim_addons/models/word2vec_inner.pyx'], 118 | # include_dirs = [models_dir, numpy.get_include()]) 119 | # ] 120 | 121 | # native_ext = True 122 | 123 | # except ImportError: 124 | # sys.stderr.write(''' 125 | # ========================================================= 126 | 127 | # Please install Cython (http://cython.org/), if you 128 | # want to use the highly optimized version of word2vec. 129 | 130 | # Usually you can install it (optional) using: 131 | 132 | # pip install -U cython 133 | 134 | # or 135 | 136 | # easy_install -U cython 137 | 138 | # or 139 | 140 | # the package-management of your distribution. 141 | 142 | # If you install Cython *after* installing gensim, the 143 | # optimized version of word2vec will still be automatically 144 | # generated, on the first use of word2vec. 145 | 146 | # ========================================================= 147 | # ''') 148 | 149 | # if native_ext: 150 | 151 | # setup( 152 | 153 | # name = pkg_name + '_addons', 154 | # version = pkg_ver, 155 | # description = pkg_desc, 156 | # long_description = read('README.rst'), 157 | 158 | # packages = find_packages(exclude=[ pkg_name, pkg_name + '.*']), 159 | 160 | # author = pkg_author, 161 | # author_email = pkg_author_email, 162 | 163 | # url = pkg_url, 164 | # download_url = pkg_download_url, 165 | 166 | # keywords = pkg_keywords, 167 | 168 | # license = pkg_license, 169 | # platforms = 'any', 170 | 171 | # zip_safe = False, 172 | 173 | # classifiers = pkg_classifiers, 174 | 175 | # install_requires = [ 176 | # 'gensim == ' + pkg_ver, 177 | # ], 178 | 179 | # include_package_data = True, 180 | 181 | # cmdclass = { 182 | # 'build_ext': build_ext 183 | # }, 184 | 185 | # ext_modules = ext_modules, 186 | # ) 187 | -------------------------------------------------------------------------------- /standard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | #-*- coding: UTF-8 -*- 3 | #File: 4 | #Date: 5 | #Author: Yang Liu 6 | #Description: 7 | with open("para_vectors_train.txt") as f: 8 | with open("para_train.txt","w") as fo: 9 | for i in range(100000): 10 | f.readline() 11 | for i in range(25000): 12 | f.readline() 13 | fo.write(f.readline()) 14 | 15 | -------------------------------------------------------------------------------- /svm_test.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/python 2 | #-*-coding:utf-8-*- 3 | 4 | if __name__=="__main__": 5 | train_filename = "para_train.txt" 6 | test_filename = "para_test.txt" 7 | train_x = [] 8 | test_x = [] 9 | train_y = [] 10 | test_y = [] 11 | with open("train_y.txt") as f: 12 | for l in f: 13 | train_y.append(float(l.strip())) 14 | with open("test_y.txt") as f : 15 | for l in f: 16 | test_y.append(float(l.strip())) 17 | 18 | with open(train_filename,"r") as f: 19 | while True: 20 | 21 | 22 | 23 | l = f.readline() 24 | 
if not l : 25 | break 26 | l = l.strip().split() 27 | result = [] 28 | for i in range(len(l)): 29 | result.append(float(l[i])) 30 | train_x.append(result) 31 | print len(train_x) 32 | print "FINISH LOADING TRAIN" 33 | with open(test_filename,"r") as f: 34 | while True: 35 | 36 | 37 | 38 | l = f.readline() 39 | if not l : 40 | break 41 | l = l.strip().split() 42 | result = [] 43 | for i in range(len(l)): 44 | result.append(float(l[i])) 45 | test_x.append(result) 46 | print "FINISH LOADING TEST" 47 | from sklearn.svm import SVC 48 | x = SVC() 49 | x.fit(train_x, train_y) 50 | print "TRAINING..." 51 | result = x.predict(test_x) 52 | print "PREDICTING..." 53 | num = 0 54 | for i in range(len(test_y)): 55 | if test_y[i]!=result[i]: 56 | num+=1 57 | print float(num)/float(len(test_y)) 58 | result = x.predict(train_x) 59 | num = 0 60 | for i in range(len(train_y)): 61 | if train_y[i]!=result[i]: 62 | num+=1 63 | print float(num)/float(len(train_y)) 64 | -------------------------------------------------------------------------------- /test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | int main(){ 5 | return 0; 6 | } 7 | -------------------------------------------------------------------------------- /test_it.sh: -------------------------------------------------------------------------------- 1 | python standard.py 2 | python svm_test.py 3 | -------------------------------------------------------------------------------- /test_nn.py: -------------------------------------------------------------------------------- 1 | import ffnet 2 | 3 | if __name__=="__main__": 4 | train_x = [] 5 | train_y = [] 6 | test_x = [] 7 | test_y = [] 8 | for i in range(12500): 9 | test_y.append(1) 10 | train_y.append(1) 11 | for i in range(12500): 12 | test_y.append(0) 13 | train_y.append(0) 14 | with open("para_train.txt") as f: 15 | for l in f: 16 | l = l.strip().split() 17 | result = [] 18 | for i in range(len(l)): 19 | result.append(float(l[i])) 20 | train_x.append(result) 21 | print "FINISH READING TRAIN FILE" 22 | with open("para_test.txt") as f: 23 | for l in f: 24 | l = l.strip().split() 25 | result = [] 26 | for i in range(len(l)): 27 | result.append(float(l[i])) 28 | test_x.append(result) 29 | print "FINISH READING TEST FILE" 30 | #train_x = train_x[:5] 31 | #test_x = test_x[:5] 32 | #train_y = train_y[:5] 33 | #test_y = test_y[:5] 34 | c = ffnet.ffnet(ffnet.mlgraph((len(train_x[0]), 50, 1))) 35 | print "TRAINING....", 36 | c.train_tnc(train_x, train_y, messages = 1, nproc = 'ncpu', maxfun = 1000) 37 | print "OK" 38 | print "TESTING....", 39 | wrong= 0 40 | for i in range(len(test_y)): 41 | result = c.call(test_x[i]).tolist()[0] 42 | if result >=0.5: 43 | result = 1.0 44 | else: 45 | result = 0.0 46 | if result != test_y[i]: 47 | wrong+=1 48 | print "OK" 49 | print float(wrong) / float(len(test_y)) -------------------------------------------------------------------------------- /test_word2vec.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | s = [] 3 | with open("../text8") as f: 4 | for l in f: 5 | s.append(l.strip().split()) 6 | 7 | w = gensim.models.Word2Vec(s,workers=24) 8 | print w.similarity("man","woman") 9 | --------------------------------------------------------------------------------
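All of the make_wiki* scripts above write the same artifacts under OUTPUT_PREFIX: a word-id mapping (`_wordids.txt.bz2`), a bag-of-words corpus (`_bow.mm`) and a TF-IDF corpus (`_tfidf.mm`). The sketch below shows one way those artifacts can be loaded back and queried with the classes bundled in this tree (Dictionary, MmCorpus, TfidfModel and the similarity index from gensim.similarities). It is not a script from this repository; the `wiki_en` prefix, the `wiki_index` scratch path and the query string are placeholder assumptions.

# A sketch (not part of this repository): consume the files produced by make_wiki.py,
# assuming the script was run with OUTPUT_PREFIX 'wiki_en'.
import logging

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel
from gensim.similarities import Similarity

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary = Dictionary.load_from_text('wiki_en_wordids.txt.bz2')  # word <-> id mapping
bow = MmCorpus('wiki_en_bow.mm')                                   # bag-of-words corpus

# Rebuild the TF-IDF model the same way the scripts do.
tfidf = TfidfModel(bow, id2word=dictionary, normalize=True)

# Index the TF-IDF vectors in disk-backed shards under the 'wiki_index' prefix.
index = Similarity('wiki_index', tfidf[bow], num_features=len(dictionary))

# Query with an ad-hoc document and print the ten most similar articles.
query_bow = dictionary.doc2bow("human computer interaction".split())
sims = index[tfidf[query_bow]]
print(sorted(enumerate(sims), key=lambda item: -item[1])[:10])

Using `Similarity` rather than `MatrixSimilarity` here is deliberate: the sharded index stays on disk, which matters at Wikipedia scale, while for small corpora either class works.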