.
├── COPYING
├── COPYING.LESSER
├── MANIFEST.in
├── __init__.py
├── __init__.pyc
├── ez_setup.py
├── gensim
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── corpora
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── bleicorpus.py
│   │   ├── bleicorpus.pyc
│   │   ├── csvcorpus.py
│   │   ├── dictionary.py
│   │   ├── dictionary.pyc
│   │   ├── hashdictionary.py
│   │   ├── hashdictionary.pyc
│   │   ├── indexedcorpus.py
│   │   ├── indexedcorpus.pyc
│   │   ├── lowcorpus.py
│   │   ├── lowcorpus.pyc
│   │   ├── malletcorpus.py
│   │   ├── malletcorpus.pyc
│   │   ├── mmcorpus.py
│   │   ├── mmcorpus.pyc
│   │   ├── svmlightcorpus.py
│   │   ├── svmlightcorpus.pyc
│   │   ├── textcorpus.py
│   │   ├── textcorpus.pyc
│   │   ├── ucicorpus.py
│   │   ├── ucicorpus.pyc
│   │   ├── wikicorpus.py
│   │   └── wikicorpus.pyc
│   ├── examples
│   │   └── dmlcz
│   │       ├── __init__.py
│   │       ├── dmlcorpus.py
│   │       ├── gensim_build.py
│   │       ├── gensim_genmodel.py
│   │       ├── gensim_xml.py
│   │       ├── runall.sh
│   │       └── sources.py
│   ├── interfaces.py
│   ├── interfaces.pyc
│   ├── matutils.py
│   ├── matutils.pyc
│   ├── models
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── hdpmodel.py
│   │   ├── hdpmodel.pyc
│   │   ├── lda_dispatcher.py
│   │   ├── lda_worker.py
│   │   ├── ldamallet.py
│   │   ├── ldamallet.pyc
│   │   ├── ldamodel.py
│   │   ├── ldamodel.pyc
│   │   ├── logentropy_model.py
│   │   ├── logentropy_model.pyc
│   │   ├── lsi_dispatcher.py
│   │   ├── lsi_worker.py
│   │   ├── lsimodel.py
│   │   ├── lsimodel.pyc
│   │   ├── rpmodel.py
│   │   ├── rpmodel.pyc
│   │   ├── tfidfmodel.py
│   │   ├── tfidfmodel.pyc
│   │   ├── voidptr.h
│   │   ├── word2vec.py
│   │   ├── word2vec.pyc
│   │   └── word2vec_inner.pyx
│   ├── nosy.py
│   ├── parsing
│   │   ├── __init__.py
│   │   ├── porter.py
│   │   └── preprocessing.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── make_wiki.py
│   │   ├── make_wiki_lemma.py
│   │   ├── make_wiki_online.py
│   │   ├── make_wiki_online_lemma.py
│   │   ├── make_wiki_online_nodebug.py
│   │   └── make_wikicorpus.py
│   ├── similarities
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── docsim.py
│   │   └── docsim.pyc
│   ├── utils.py
│   └── utils.pyc
├── gensim_addons
│   ├── __init__.py
│   ├── __init__.pyc
│   └── models
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── word2vec_inner.c
│       └── word2vec_inner.pyx
├── readme.md
├── setup.cfg
├── setup.py
├── standard.py
├── svm_test.py
├── test.cc
├── test.py
├── test_it.sh
├── test_nn.py
└── test_word2vec.py
/ :
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/
--------------------------------------------------------------------------------
/COPYING.LESSER:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include docs *
2 | recursive-include gensim/test/test_data *
3 | recursive-include . *.sh
4 | prune docs/src*
5 | include README.rst
6 | include CHANGELOG.txt
7 | include COPYING
8 | include COPYING.LESSER
9 | include ez_setup.py
10 | include gensim/models/voidptr.h
11 | include gensim/models/word2vec_inner.pyx
12 | include gensim_addons/models/word2vec_inner.pyx
13 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # script by dedan: helps him to symlink gensim
2 | import os
3 | dirname = __path__[0] # Package's main folder
4 | __path__.insert(0, os.path.join(dirname, "gensim"))
5 |
--------------------------------------------------------------------------------
/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/__init__.pyc
--------------------------------------------------------------------------------
/gensim/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This package contains interfaces and functionality to compute pair-wise document
3 | similarities within a corpus of documents.
4 | """
5 |
6 | from gensim import utils, matutils, interfaces, corpora, models, similarities
7 | import logging
8 |
9 | try:
10 | __version__ = __import__('pkg_resources').get_distribution('gensim').version
11 | except:
12 | __version__ = '?'
13 |
14 |
15 | class NullHandler(logging.Handler):
16 | """For python versions <= 2.6; same as `logging.NullHandler` in 2.7."""
17 | def emit(self, record):
18 | pass
19 |
20 | logger = logging.getLogger('gensim')
21 | if len(logger.handlers) == 0: # To ensure reload() doesn't add another one
22 | logger.addHandler(NullHandler())
23 |
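
Because the package only installs a NullHandler (above), gensim stays silent unless the application configures logging itself. A minimal, optional sketch of such a user-side setup; the format string is just an illustrative choice:

    # user-side logging setup (optional); without it, gensim's INFO messages are discarded
    import logging

    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO)
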
--------------------------------------------------------------------------------
/gensim/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/__init__.pyc
--------------------------------------------------------------------------------
/gensim/corpora/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This package contains implementations of various streaming corpus I/O formats.
3 | """
4 |
5 | # bring corpus classes directly into package namespace, to save some typing
6 | from .indexedcorpus import IndexedCorpus # must appear before the other classes
7 |
8 | from .mmcorpus import MmCorpus
9 | from .bleicorpus import BleiCorpus
10 | from .svmlightcorpus import SvmLightCorpus
11 | from .lowcorpus import LowCorpus
12 | from .dictionary import Dictionary
13 | from .hashdictionary import HashDictionary
14 | from .wikicorpus import WikiCorpus
15 | from .textcorpus import TextCorpus
16 | from .ucicorpus import UciCorpus
17 | from .malletcorpus import MalletCorpus
18 |
--------------------------------------------------------------------------------
/gensim/corpora/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/__init__.pyc
--------------------------------------------------------------------------------
/gensim/corpora/bleicorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | Blei's LDA-C format.
10 | """
11 |
12 | from __future__ import with_statement
13 |
14 | from os import path
15 | import logging
16 |
17 | from gensim import interfaces, utils
18 | from gensim.corpora import IndexedCorpus
19 | from six.moves import xrange
20 |
21 |
22 | logger = logging.getLogger('gensim.corpora.bleicorpus')
23 |
24 |
25 | class BleiCorpus(IndexedCorpus):
26 | """
27 | Corpus in Blei's LDA-C format.
28 |
29 | The corpus is represented as two files: one describing the documents, and another
30 | describing the mapping between words and their ids.
31 |
32 | Each document is one line::
33 |
34 | N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN
35 |
36 | The vocabulary is a file with words, one word per line; word at line K has an
37 | implicit ``id=K``.
38 | """
39 |
40 | def __init__(self, fname, fname_vocab=None):
41 | """
42 | Initialize the corpus from a file.
43 |
44 | `fname_vocab` is the file with vocabulary; if not specified, it defaults to
45 | `fname.vocab`.
46 | """
47 | IndexedCorpus.__init__(self, fname)
48 | logger.info("loading corpus from %s" % fname)
49 |
50 | if fname_vocab is None:
51 | fname_base, _ = path.splitext(fname)
52 | fname_dir = path.dirname(fname)
53 | for fname_vocab in [
54 | fname + '.vocab',
55 | fname + '/vocab.txt',
56 | fname_base + '.vocab',
57 | fname_dir + '/vocab.txt',
58 | ]:
59 | if path.exists(fname_vocab):
60 | break
61 | else:
62 | raise IOError('BleiCorpus: could not find vocabulary file')
63 |
64 | self.fname = fname
65 | with utils.smart_open(fname_vocab) as fin:
66 | words = [utils.to_unicode(word).rstrip() for word in fin]
67 | self.id2word = dict(enumerate(words))
68 | self.length = 0
69 |
70 | def __iter__(self):
71 | """
72 | Iterate over the corpus, returning one sparse vector at a time.
73 | """
74 | lineno = -1
75 | with utils.smart_open(self.fname) as fin:
76 | for lineno, line in enumerate(fin):
77 | yield self.line2doc(line)
78 | self.length = lineno + 1
79 |
80 | def line2doc(self, line):
81 | parts = utils.to_unicode(line).split()
82 | if int(parts[0]) != len(parts) - 1:
83 | raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
84 | doc = [part.rsplit(':', 1) for part in parts[1:]]
85 | doc = [(int(p1), float(p2)) for p1, p2 in doc]
86 | return doc
87 |
88 | @staticmethod
89 | def save_corpus(fname, corpus, id2word=None, metadata=False):
90 | """
91 | Save a corpus in the LDA-C format.
92 |
93 | There are actually two files saved: `fname` and `fname.vocab`, where
94 | `fname.vocab` is the vocabulary file.
95 |
96 | This function is automatically called by `BleiCorpus.serialize`; don't
97 | call it directly, call `serialize` instead.
98 | """
99 | if id2word is None:
100 | logger.info("no word id mapping provided; initializing from corpus")
101 | id2word = utils.dict_from_corpus(corpus)
102 | num_terms = len(id2word)
103 | else:
104 | num_terms = 1 + max([-1] + id2word.keys())
105 |
106 | logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
107 | with utils.smart_open(fname, 'wb') as fout:
108 | offsets = []
109 | for doc in corpus:
110 | doc = list(doc)
111 | offsets.append(fout.tell())
112 | parts = ["%i:%s" % p for p in doc if abs(p[1]) > 1e-7]
113 | fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))
114 |
115 | # write out vocabulary, in a format compatible with Blei's topics.py script
116 | fname_vocab = fname + '.vocab'
117 | logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
118 | with utils.smart_open(fname_vocab, 'wb') as fout:
119 | for featureid in xrange(num_terms):
120 | fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
121 |
122 | return offsets
123 |
124 | def docbyoffset(self, offset):
125 | """
126 | Return the document stored at file position `offset`.
127 | """
128 | with utils.smart_open(self.fname) as f:
129 | f.seek(offset)
130 | return self.line2doc(f.readline())
131 |
132 | # endclass BleiCorpus
133 |
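
A minimal usage sketch for the class above, assuming the Python 2 environment this codebase targets; the file path and the toy `id2word` mapping are illustrative only. `serialize` is inherited from `IndexedCorpus` and calls `save_corpus` under the hood:

    # round-trip a toy corpus through Blei's LDA-C format (paths are illustrative)
    from gensim.corpora import BleiCorpus

    corpus = [[(0, 1.0), (1, 2.0)], [(2, 1.0)]]        # two documents, sparse BOW
    id2word = {0: 'human', 1: 'computer', 2: 'graph'}

    # also writes /tmp/corpus.lda-c.vocab and /tmp/corpus.lda-c.index
    BleiCorpus.serialize('/tmp/corpus.lda-c', corpus, id2word=id2word)

    loaded = BleiCorpus('/tmp/corpus.lda-c')           # vocabulary found at corpus.lda-c.vocab
    print(list(loaded))                                # [[(0, 1.0), (1, 2.0)], [(2, 1.0)]]
    print(loaded[1])                                   # random access via the saved index
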
--------------------------------------------------------------------------------
/gensim/corpora/bleicorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/bleicorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/csvcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2013 Zygmunt Zając
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 | """
8 | Corpus in CSV format.
9 |
10 | """
11 |
12 |
13 | from __future__ import with_statement
14 |
15 | import logging
16 | import csv
17 | import itertools
18 |
19 | from gensim import interfaces
20 |
21 | logger = logging.getLogger('gensim.corpora.csvcorpus')
22 |
23 |
24 | class CsvCorpus(interfaces.CorpusABC):
25 | """
26 | Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically
27 | based on the file content.
28 |
29 | All row values are expected to be ints/floats.
30 |
31 | """
32 |
33 | def __init__(self, fname, labels):
34 | """
35 | Initialize the corpus from a file.
36 | `labels`: if True, class labels are present in the first column of the input file, and that column is skipped.
37 |
38 | """
39 | logger.info("loading corpus from %s" % fname)
40 | self.fname = fname
41 | self.length = None
42 | self.labels = labels
43 |
44 | # load the first few lines, to guess the CSV dialect
45 | head = ''.join(itertools.islice(open(self.fname), 5))
46 | self.headers = csv.Sniffer().has_header(head)
47 | self.dialect = csv.Sniffer().sniff(head)
48 | logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers))
49 |
50 | def __iter__(self):
51 | """
52 | Iterate over the corpus, returning one sparse vector at a time.
53 |
54 | """
55 | reader = csv.reader(open(self.fname), self.dialect)
56 | if self.headers:
57 | next(reader) # skip the headers
58 |
59 | line_no = -1
60 | for line_no, line in enumerate(reader):
61 | if self.labels:
62 | line.pop(0) # ignore the first column = class label
63 | yield list(enumerate(map(float, line)))
64 |
65 | self.length = line_no + 1 # store the total number of CSV rows = documents
66 |
67 | # endclass CsvCorpus
68 |
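
A small usage sketch, assuming a tiny numeric CSV at an illustrative path. Note that `CsvCorpus` is not re-exported from `gensim.corpora`, so it is imported from its module directly:

    # stream a small numeric CSV as a gensim corpus (file name/contents are illustrative)
    from gensim.corpora.csvcorpus import CsvCorpus

    with open('/tmp/example.csv', 'w') as f:
        f.write('label,f0,f1,f2\n')
        f.write('1,0.5,0.0,2.0\n')
        f.write('0,1.5,3.0,0.0\n')

    # labels=True: the first column is a class label and gets skipped during iteration
    corpus = CsvCorpus('/tmp/example.csv', labels=True)
    for vector in corpus:
        print(vector)            # e.g. [(0, 0.5), (1, 0.0), (2, 2.0)]
    print(corpus.length)         # number of data rows, known after one full pass
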
--------------------------------------------------------------------------------
/gensim/corpora/dictionary.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/dictionary.pyc
--------------------------------------------------------------------------------
/gensim/corpora/hashdictionary.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2012 Homer Strong, Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | This module implements the `"hashing trick" <http://en.wikipedia.org/wiki/Hashing-Trick>`_ --
10 | a mapping between words and their integer ids using a fixed, static mapping. The
11 | static mapping has a constant memory footprint, regardless of the number of word-types (features)
12 | in your corpus, so it's suitable for processing extremely large corpora.
13 |
14 | The ids are computed as `hash(word) % id_range`, where `hash` is a user-configurable
15 | function (adler32 by default). Using HashDictionary, new words can be represented immediately,
16 | without an extra pass through the corpus to collect all the ids first. This is another
17 | advantage: HashDictionary can be used with non-repeatable (once-only) streams of documents.
18 |
19 | A disadvantage of HashDictionary is that, unlike plain :class:`Dictionary`, several words may map
20 | to the same id, causing hash collisions. The word<->id mapping is no longer a bijection.
21 |
22 | """
23 |
24 | from __future__ import with_statement
25 |
26 | import logging
27 | import itertools
28 | import zlib
29 |
30 | from gensim import utils
31 | from six import iteritems, iterkeys
32 |
33 |
34 | logger = logging.getLogger('gensim.corpora.hashdictionary')
35 |
36 |
37 |
38 | class HashDictionary(utils.SaveLoad, dict):
39 | """
40 | HashDictionary encapsulates the mapping between normalized words and their
41 | integer ids.
42 |
43 | Unlike `Dictionary`, building a `HashDictionary` before using it is not a necessary
44 | step. The documents can be computed immediately, from an uninitialized `HashDictionary`,
45 | without seeing the rest of the corpus first.
46 |
47 | The main function is `doc2bow`, which converts a collection of words to its
48 | bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
49 |
50 | """
51 | def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=True):
52 | """
53 | By default, keep track of debug statistics and mappings. If you find yourself
54 | running out of memory (or are sure you don't need the debug info), set
55 | `debug=False`.
56 | """
57 | self.myhash = myhash # hash fnc: string->integer
58 | self.id_range = id_range # hash range: id = myhash(key) % id_range
59 | self.debug = debug
60 |
61 | # the following (potentially massive!) dictionaries are only formed if `debug` is True
62 | self.token2id = {}
63 | self.id2token = {} # reverse mapping int->set(words)
64 | self.dfs = {} # token_id -> how many documents this token_id appeared in
65 | self.dfs_debug = {} # token_string->how many documents this word appeared in
66 |
67 | self.num_docs = 0 # number of documents processed
68 | self.num_pos = 0 # total number of corpus positions
69 | self.num_nnz = 0 # total number of non-zeroes in the BOW matrix
70 | self.allow_update = True
71 |
72 | if documents is not None:
73 | self.add_documents(documents)
74 |
75 |
76 | def __getitem__(self, tokenid):
77 | """
78 | Return all words that have mapped to the given id so far, as a set.
79 |
80 | Only works if `self.debug` was enabled.
81 | """
82 | return self.id2token.get(tokenid, set())
83 |
84 |
85 | def restricted_hash(self, token):
86 | """
87 | Calculate id of the given token. Also keep track of what words were mapped
88 | to what ids, for debugging reasons.
89 | """
90 | h = self.myhash(utils.to_utf8(token)) % self.id_range
91 | if self.debug:
92 | self.token2id[token] = h
93 | self.id2token.setdefault(h, set()).add(token)
94 | return h
95 |
96 |
97 | def __len__(self):
98 | """
99 | Return the number of distinct ids = the entire dictionary size.
100 | """
101 | return self.id_range
102 |
103 |
104 | def keys(self):
105 | """Return a list of all token ids."""
106 | return range(len(self))
107 |
108 |
109 | def __str__(self):
110 | return ("HashDictionary(%i id range)" % len(self))
111 |
112 |
113 | @staticmethod
114 | def from_documents(*args, **kwargs):
115 | return HashDictionary(*args, **kwargs)
116 |
117 |
118 | def add_documents(self, documents):
119 | """
120 | Build dictionary from a collection of documents. Each document is a list
121 | of tokens = **tokenized and normalized** utf-8 encoded strings.
122 |
123 | This is only a convenience wrapper for calling `doc2bow` on each document
124 | with `allow_update=True`.
125 | """
126 | for docno, document in enumerate(documents):
127 | if docno % 10000 == 0:
128 | logger.info("adding document #%i to %s" % (docno, self))
129 | _ = self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
130 | logger.info("built %s from %i documents (total %i corpus positions)" %
131 | (self, self.num_docs, self.num_pos))
132 |
133 |
134 | def doc2bow(self, document, allow_update=False, return_missing=False):
135 | """
136 | Convert `document` (a list of words) into the bag-of-words format = list
137 | of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
138 | **tokenized and normalized** utf-8 encoded string. No further preprocessing
139 | is done on the words in `document`; apply tokenization, stemming etc. before
140 | calling this method.
141 |
142 | If `allow_update` or `self.allow_update` is set, then also update dictionary
143 | in the process: update overall corpus statistics and document frequencies.
144 | For each id appearing in this document, increase its document frequency
145 | (`self.dfs`) by one.
146 |
147 | """
148 | result = {}
149 | missing = {}
150 | document = sorted(document) # convert the input to plain list (needed below)
151 | for word_norm, group in itertools.groupby(document):
152 | frequency = len(list(group)) # how many times does this word appear in the input document
153 | tokenid = self.restricted_hash(word_norm)
154 | result[tokenid] = result.get(tokenid, 0) + frequency
155 | if self.debug:
156 | # increment document count for each unique token that appeared in the document
157 | self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm, 0) + 1
158 |
159 | if allow_update or self.allow_update:
160 | self.num_docs += 1
161 | self.num_pos += len(document)
162 | self.num_nnz += len(result)
163 | if self.debug:
164 | # increment document count for each unique tokenid that appeared in the document
165 | # done here, because several words may map to the same tokenid
166 | for tokenid in iterkeys(result):
167 | self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
168 |
169 | # return tokenids, in ascending id order
170 | result = sorted(iteritems(result))
171 | if return_missing:
172 | return result, missing
173 | else:
174 | return result
175 |
176 |
177 | def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
178 | """
179 | Remove document frequency statistics for tokens that appear in
180 |
181 | 1. less than `no_below` documents (absolute number) or
182 | 2. more than `no_above` documents (fraction of total corpus size, *not*
183 | absolute number).
184 | 3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or
185 | keep all if `None`).
186 |
187 | **Note:** since HashDictionary's id range is fixed and doesn't depend on
188 | the number of tokens seen, this doesn't really "remove" anything. It only
189 | clears some supplementary statistics, for easier debugging and a smaller RAM
190 | footprint.
191 | """
192 | no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
193 | ok = [item for item in iteritems(self.dfs_debug)
194 | if no_below <= item[1] <= no_above_abs]
195 | ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n])
196 |
197 | self.dfs_debug = dict((word, freq)
198 | for word, freq in iteritems(self.dfs_debug)
199 | if word in ok)
200 | self.token2id = dict((token, tokenid)
201 | for token, tokenid in iteritems(self.token2id)
202 | if token in self.dfs_debug)
203 | self.id2token = dict((tokenid, set(token for token in tokens
204 | if token in self.dfs_debug))
205 | for tokenid, tokens in iteritems(self.id2token))
206 | self.dfs = dict((tokenid, freq)
207 | for tokenid, freq in iteritems(self.dfs)
208 | if self.id2token.get(tokenid, set()))
209 |
210 | # for word->document frequency
211 | logger.info("kept statistics for tokens which were in no less than %i and no more than %i (=%.1f%%) documents" %
212 | (no_below, no_above_abs, 100.0 * no_above))
213 |
214 |
215 | def save_as_text(self, fname):
216 | """
217 | Save this HashDictionary to a text file, for easier debugging.
218 |
219 | The format is:
220 | `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.
221 |
222 | Note: use `save`/`load` to store in binary format instead (pickle).
223 | """
224 | logger.info("saving HashDictionary mapping to %s" % fname)
225 | with utils.smart_open(fname, 'wb') as fout:
226 | for tokenid in self.keys():
227 | words = sorted(self[tokenid])
228 | if words:
229 | words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
230 | words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
231 | fout.write(utils.to_utf8("%i\t%i\t%s\n" %
232 | (tokenid, self.dfs.get(tokenid, 0), '\t'.join(words_df))))
233 | #endclass HashDictionary
234 |
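
A minimal usage sketch: the hashing trick lets `doc2bow` assign ids to unseen words without any prior pass over the corpus. The `id_range` and the toy documents are illustrative assumptions:

    # map tokenized documents straight to ids via hashing (toy data, illustrative)
    from gensim.corpora import HashDictionary

    docs = [['human', 'computer', 'interface'],
            ['graph', 'minors', 'survey', 'graph']]

    dct = HashDictionary(docs, id_range=32000)       # id = adler32(token) % id_range
    bow = dct.doc2bow(['graph', 'trees', 'graph'])   # unseen word 'trees' needs no extra pass
    print(bow)                                       # [(id_a, count), (id_b, count)], sorted by id

    # with debug=True (the default), the reverse mapping is kept around:
    print(dct[bow[0][0]])                            # set of words that hashed to that id
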
--------------------------------------------------------------------------------
/gensim/corpora/hashdictionary.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/hashdictionary.pyc
--------------------------------------------------------------------------------
/gensim/corpora/indexedcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | Indexed corpus is a mechanism for random-accessing corpora.
10 |
11 | While the standard corpus interface in gensim allows iterating over corpus with
12 | `for doc in corpus: pass`, indexed corpus allows accessing the documents with
13 | `corpus[docno]` (in O(1) look-up time).
14 |
15 | This functionality is achieved by storing an extra file (by default named the same
16 | as the corpus file plus '.index' suffix) that stores the byte offset of the beginning
17 | of each document.
18 | """
19 |
20 | import logging
21 | import shelve
22 |
23 | from gensim import interfaces, utils
24 |
25 | logger = logging.getLogger('gensim.corpora.indexedcorpus')
26 |
27 |
28 | class IndexedCorpus(interfaces.CorpusABC):
29 | def __init__(self, fname, index_fname=None):
30 | """
31 | Initialize this abstract base class, by loading a previously saved index
32 | from `index_fname` (or `fname.index` if `index_fname` is not set).
33 | This index will allow subclasses to support the `corpus[docno]` syntax
34 | (random access to document #`docno` in O(1)).
35 |
36 | >>> # save corpus in SvmLightCorpus format with an index
37 | >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
38 | >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
39 | >>> # load back as a document stream (*not* plain Python list)
40 | >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
41 | >>> print(corpus_with_random_access[1])
42 | [(0, 1.0), (1, 2.0)]
43 |
44 | """
45 | try:
46 | if index_fname is None:
47 | index_fname = fname + '.index'
48 | self.index = utils.unpickle(index_fname)
49 | logger.info("loaded corpus index from %s" % index_fname)
50 | except:
51 | self.index = None
52 | self.length = None
53 |
54 | @classmethod
55 | def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False):
56 | """
57 | Iterate through the document stream `corpus`, saving the documents to `fname`
58 | and recording byte offset of each document. Save the resulting index
59 | structure to file `index_fname` (or `fname.index`, if `index_fname` is not set).
60 |
61 | This relies on the underlying corpus class `serializer` providing (in
62 | addition to standard iteration):
63 |
64 | * `save_corpus` method that returns a sequence of byte offsets, one for
65 | each saved document,
66 | * the `docbyoffset(offset)` method, which returns a document
67 | positioned at `offset` bytes within the persistent storage (file).
68 |
69 | Example:
70 |
71 | >>> MmCorpus.serialize('test.mm', corpus)
72 | >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
73 | >>> print(mm[42]) # retrieve document no. 42, etc.
74 | """
75 | if getattr(corpus, 'fname', None) == fname:
76 | raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)
77 |
78 | if index_fname is None:
79 | index_fname = fname + '.index'
80 |
81 | if progress_cnt is not None:
82 | if labels is not None:
83 | offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
84 | else:
85 | offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
86 | else:
87 | if labels is not None:
88 | offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
89 | else:
90 | offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)
91 |
92 | if offsets is None:
93 | raise NotImplementedError("called serialize on class %s which doesn't support indexing!" %
94 | serializer.__name__)
95 |
96 | # store offsets persistently, using pickle
97 | logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
98 | utils.pickle(offsets, index_fname)
99 |
100 | def __len__(self):
101 | """
102 | Return the index length if the corpus is indexed. Otherwise, make a pass
103 | over self to calculate the corpus length and cache this number.
104 | """
105 | if self.index is not None:
106 | return len(self.index)
107 | if self.length is None:
108 | logger.info("caching corpus length")
109 | self.length = sum(1 for doc in self)
110 | return self.length
111 |
112 | def __getitem__(self, docno):
113 | if self.index is None:
114 | raise RuntimeError("cannot call corpus[docid] without an index")
115 | return self.docbyoffset(self.index[docno])
116 |
117 | # endclass IndexedCorpus
118 |
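
A sketch of the serializer contract described in `serialize` above: a corpus class only needs a `save_corpus` that returns one byte offset per document, plus `docbyoffset`, to gain `corpus[docno]` access. The class name and the on-disk format below are hypothetical, not part of gensim:

    # hypothetical serializer: one document per line, as space-separated "id:value" pairs
    from gensim import utils
    from gensim.corpora import IndexedCorpus


    class PlainPairsCorpus(IndexedCorpus):
        """One document per line, stored as space-separated `id:value` pairs."""

        def __init__(self, fname, index_fname=None):
            IndexedCorpus.__init__(self, fname, index_fname)   # loads fname.index if present
            self.fname = fname

        def line2doc(self, line):
            parts = utils.to_unicode(line).split()
            return [(int(p.split(':')[0]), float(p.split(':')[1])) for p in parts]

        def __iter__(self):
            with utils.smart_open(self.fname) as fin:
                for line in fin:
                    yield self.line2doc(line)

        def docbyoffset(self, offset):
            with utils.smart_open(self.fname) as fin:
                fin.seek(offset)
                return self.line2doc(fin.readline())

        @staticmethod
        def save_corpus(fname, corpus, id2word=None, metadata=False):
            offsets = []
            with utils.smart_open(fname, 'wb') as fout:
                for doc in corpus:
                    offsets.append(fout.tell())                # remember where this doc starts
                    line = ' '.join('%i:%s' % (fid, val) for fid, val in doc)
                    fout.write(utils.to_utf8(line + '\n'))
            return offsets                                     # serialize() pickles these into fname.index

    # usage (paths illustrative):
    #   PlainPairsCorpus.serialize('/tmp/docs.pairs', [[(1, 0.5)], [(0, 1.0), (1, 2.0)]])
    #   corpus = PlainPairsCorpus('/tmp/docs.pairs')
    #   print(corpus[1])    # -> [(0, 1.0), (1, 2.0)]
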
--------------------------------------------------------------------------------
/gensim/corpora/indexedcorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/indexedcorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/lowcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | Corpus in GibbsLda++ format of List-Of-Words.
10 | """
11 |
12 | from __future__ import with_statement
13 |
14 | import logging
15 |
16 | from gensim import utils
17 | from gensim.corpora import IndexedCorpus
18 | from six import iteritems, iterkeys
19 | from six.moves import xrange, zip as izip
20 |
21 |
22 | logger = logging.getLogger('gensim.corpora.lowcorpus')
23 |
24 |
25 | def split_on_space(s):
26 | return [word for word in utils.to_unicode(s).strip().split(' ') if word]
27 |
28 |
29 | class LowCorpus(IndexedCorpus):
30 | """
31 | List_Of_Words corpus handles input in GibbsLda++ format.
32 |
33 | Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format::
34 |
35 | Both data for training/estimating the model and new data (i.e., previously
36 | unseen data) have the same format as follows:
37 |
38 | [M]
39 | [document1]
40 | [document2]
41 | ...
42 | [documentM]
43 |
44 | in which the first line is the total number for documents [M]. Each line
45 | after that is one document. [documenti] is the ith document of the dataset
46 | that consists of a list of Ni words/terms.
47 |
48 | [documenti] = [wordi1] [wordi2] ... [wordiNi]
49 |
50 | in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated
51 | by the blank character.
52 | """
53 | def __init__(self, fname, id2word=None, line2words=split_on_space):
54 | """
55 | Initialize the corpus from a file.
56 |
57 | `id2word` and `line2words` are optional parameters.
58 | If provided, `id2word` is a dictionary mapping between word_ids (integers)
59 | and words (strings). If not provided, the mapping is constructed from
60 | the documents.
61 |
62 | `line2words` is a function which converts lines into tokens. Defaults to
63 | simple splitting on spaces.
64 | """
65 | IndexedCorpus.__init__(self, fname)
66 | logger.info("loading corpus from %s" % fname)
67 |
68 | self.fname = fname # input file, see class doc for format
69 | self.line2words = line2words # how to translate lines into words (simply split on space by default)
70 | self.num_docs = self._calculate_num_docs()
71 |
72 | if not id2word:
73 | # build a list of all word types in the corpus (distinct words)
74 | logger.info("extracting vocabulary from the corpus")
75 | all_terms = set()
76 | self.use_wordids = False # return documents as (word, wordCount) 2-tuples
77 | for doc in self:
78 | all_terms.update(word for word, wordCnt in doc)
79 | all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
80 | self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
81 | else:
82 | logger.info("using provided word mapping (%i ids)" % len(id2word))
83 | self.id2word = id2word
84 | self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
85 | self.num_terms = len(self.word2id)
86 | self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples
87 |
88 | logger.info("loaded corpus with %i documents and %i terms from %s" %
89 | (self.num_docs, self.num_terms, fname))
90 |
91 | def _calculate_num_docs(self):
92 | # the first line in input data is the number of documents (integer). throws exception on bad input.
93 | with utils.smart_open(self.fname) as fin:
94 | try:
95 | result = int(next(fin))
96 | except StopIteration:
97 | result = 0
98 |
99 | return result
100 |
101 | def __len__(self):
102 | return self.num_docs
103 |
104 | def line2doc(self, line):
105 | words = self.line2words(line)
106 |
107 | if self.use_wordids:
108 | # get all distinct terms in this document, ignore unknown words
109 | uniq_words = set(words).intersection(iterkeys(self.word2id))
110 |
111 | # the following creates a unique list of words *in the same order*
112 | # as they were in the input. when iterating over the documents,
113 | # the (word, count) pairs will appear in the same order as they
114 | # were in the input (bar duplicates), which looks better.
115 | # if this was not needed, we might as well have used useWords = set(words)
116 | use_words, marker = [], set()
117 | for word in words:
118 | if (word in uniq_words) and (word not in marker):
119 | use_words.append(word)
120 | marker.add(word)
121 | # construct a list of (wordIndex, wordFrequency) 2-tuples
122 | doc = list(zip(map(self.word2id.get, use_words),
123 | map(words.count, use_words)))
124 | else:
125 | uniq_words = set(words)
126 | # construct a list of (word, wordFrequency) 2-tuples
127 | doc = list(zip(uniq_words, map(words.count, uniq_words)))
128 |
129 | # return the document, then forget it and move on to the next one
130 | # note that this way, only one doc is stored in memory at a time, not the whole corpus
131 | return doc
132 |
133 | def __iter__(self):
134 | """
135 | Iterate over the corpus, returning one bag-of-words vector at a time.
136 | """
137 | with utils.smart_open(self.fname) as fin:
138 | for lineno, line in enumerate(fin):
139 | if lineno > 0: # ignore the first line = number of documents
140 | yield self.line2doc(line)
141 |
142 | @staticmethod
143 | def save_corpus(fname, corpus, id2word=None, metadata=False):
144 | """
145 | Save a corpus in the List-of-words format.
146 |
147 | This function is automatically called by `LowCorpus.serialize`; don't
148 | call it directly, call `serialize` instead.
149 | """
150 | if id2word is None:
151 | logger.info("no word id mapping provided; initializing from corpus")
152 | id2word = utils.dict_from_corpus(corpus)
153 |
154 | logger.info("storing corpus in List-Of-Words format into %s" % fname)
155 | truncated = 0
156 | offsets = []
157 | with utils.smart_open(fname, 'wb') as fout:
158 | fout.write(utils.to_utf8('%i\n' % len(corpus)))
159 | for doc in corpus:
160 | words = []
161 | for wordid, value in doc:
162 | if abs(int(value) - value) > 1e-6:
163 | truncated += 1
164 | words.extend([utils.to_unicode(id2word[wordid])] * int(value))
165 | offsets.append(fout.tell())
166 | fout.write(utils.to_utf8('%s\n' % ' '.join(words)))
167 |
168 | if truncated:
169 | logger.warning("List-of-words format can only save vectors with "
170 | "integer elements; %i float entries were truncated to integer value" %
171 | truncated)
172 | return offsets
173 |
174 | def docbyoffset(self, offset):
175 | """
176 | Return the document stored at file position `offset`.
177 | """
178 | with utils.smart_open(self.fname) as f:
179 | f.seek(offset)
180 | return self.line2doc(f.readline())
181 |
182 | # endclass LowCorpus
183 |
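
A minimal round-trip sketch for the GibbsLda++ (List-Of-Words) format; the file path and the toy `id2word` mapping are illustrative assumptions:

    # save a toy corpus in List-Of-Words format and stream it back (paths illustrative)
    from gensim.corpora import LowCorpus

    corpus = [[(0, 2.0), (1, 1.0)], [(2, 1.0)]]
    id2word = {0: 'human', 1: 'computer', 2: 'graph'}

    # writes "2\nhuman human computer\ngraph\n" plus a .index file
    LowCorpus.serialize('/tmp/corpus.low', corpus, id2word=id2word)

    loaded = LowCorpus('/tmp/corpus.low')   # no id2word given: vocabulary is rebuilt from the file
    print(len(loaded))                      # 2, read from the first line
    print(list(loaded))                     # (id, count) pairs, ids from the rebuilt mapping
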
--------------------------------------------------------------------------------
/gensim/corpora/lowcorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/lowcorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/malletcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
5 |
6 | """
7 | Corpus in Mallet format of List-Of-Words.
8 | """
9 |
10 | from __future__ import with_statement
11 |
12 | import logging
13 |
14 | from gensim import utils
15 | from gensim.corpora import LowCorpus
16 |
17 |
18 | logger = logging.getLogger('gensim.corpora.malletcorpus')
19 |
20 |
21 | class MalletCorpus(LowCorpus):
22 | """
23 | Quoting http://mallet.cs.umass.edu/import.php:
24 |
25 | One file, one instance per line
26 | Assume the data is in the following format:
27 |
28 | [URL] [language] [text of the page...]
29 |
30 | Or, more generally,
31 | [document #1 id] [label] [text of the document...]
32 | [document #2 id] [label] [text of the document...]
33 | ...
34 | [document #N id] [label] [text of the document...]
35 |
36 | Note that language/label is *not* considered in Gensim.
37 |
38 | """
39 | def __init__(self, fname, id2word=None, metadata=False):
40 | self.metadata = metadata
41 | LowCorpus.__init__(self, fname, id2word)
42 |
43 | def _calculate_num_docs(self):
44 | with utils.smart_open(self.fname) as fin:
45 | result = sum([1 for x in fin])
46 | return result
47 |
48 | def __iter__(self):
49 | """
50 | Iterate over the corpus at the given filename.
51 |
52 | Yields a bag-of-words, i.e. a list of (word id, word count) tuples, based on the given id2word dictionary.
53 | """
54 | with utils.smart_open(self.fname) as f:
55 | for line in f:
56 | yield self.line2doc(line)
57 |
58 | def line2doc(self, line):
59 | l = [word for word in utils.to_unicode(line).strip().split(' ') if word]
60 | docid, doclang, words = l[0], l[1], l[2:]
61 |
62 | doc = super(MalletCorpus, self).line2doc(' '.join(words))
63 |
64 | if self.metadata:
65 | return doc, (docid, doclang)
66 | else:
67 | return doc
68 |
69 | @staticmethod
70 | def save_corpus(fname, corpus, id2word=None, metadata=False):
71 | """
72 | Save a corpus in the Mallet format.
73 |
74 | The document id will be generated by enumerating the corpus.
75 | That is, it will range between 0 and the number of documents in the corpus.
76 |
77 | Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
78 | If the language needs to be saved, post-processing will be required.
79 |
80 | This function is automatically called by `MalletCorpus.serialize`; don't
81 | call it directly, call `serialize` instead.
82 |
83 | """
84 | if id2word is None:
85 | logger.info("no word id mapping provided; initializing from corpus")
86 | id2word = utils.dict_from_corpus(corpus)
87 |
88 | logger.info("storing corpus in Mallet format into %s" % fname)
89 |
90 | truncated = 0
91 | offsets = []
92 | with utils.smart_open(fname, 'wb') as fout:
93 | for doc_id, doc in enumerate(corpus):
94 | if metadata:
95 | doc_id, doc_lang = doc[1]
96 | doc = doc[0]
97 | else:
98 | doc_lang = '__unknown__'
99 |
100 | words = []
101 | for wordid, value in doc:
102 | if abs(int(value) - value) > 1e-6:
103 | truncated += 1
104 | words.extend([utils.to_unicode(id2word[wordid])] * int(value))
105 | offsets.append(fout.tell())
106 | fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))
107 |
108 | if truncated:
109 | logger.warning("Mallet format can only save vectors with "
110 | "integer elements; %i float entries were truncated to integer value" %
111 | truncated)
112 |
113 | return offsets
114 |
115 | def docbyoffset(self, offset):
116 | """
117 | Return the document stored at file position `offset`.
118 | """
119 | with utils.smart_open(self.fname) as f:
120 | f.seek(offset)
121 | return self.line2doc(f.readline())
122 |
123 | # endclass MalletCorpus
124 |
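
A minimal sketch of saving and re-reading a corpus in the Mallet one-document-per-line format, with and without the `(doc id, language)` metadata; the file path and mapping are illustrative. When loading with `metadata=True`, the `id2word` mapping is passed explicitly so no vocabulary scan is needed:

    # Mallet-format round trip (paths and mapping are illustrative)
    from gensim.corpora import MalletCorpus

    corpus = [[(0, 1.0), (1, 1.0)], [(1, 2.0)]]
    id2word = {0: 'human', 1: 'graph'}

    # each line becomes "<docid> __unknown__ <words...>"
    MalletCorpus.serialize('/tmp/corpus.mallet', corpus, id2word=id2word)

    loaded = MalletCorpus('/tmp/corpus.mallet')        # metadata=False: plain BOW vectors
    print(list(loaded))

    with_meta = MalletCorpus('/tmp/corpus.mallet', id2word=id2word, metadata=True)
    for doc, (docid, lang) in with_meta:               # language defaults to '__unknown__' on save
        print("%s %s %s" % (docid, lang, doc))
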
--------------------------------------------------------------------------------
/gensim/corpora/malletcorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/malletcorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/mmcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | Corpus in the Matrix Market format.
10 | """
11 |
12 |
13 | import logging
14 |
15 | from gensim import interfaces, matutils
16 | from gensim.corpora import IndexedCorpus
17 |
18 |
19 | logger = logging.getLogger('gensim.corpora.mmcorpus')
20 |
21 |
22 | class MmCorpus(matutils.MmReader, IndexedCorpus):
23 | """
24 | Corpus in the Matrix Market format.
25 | """
26 | def __init__(self, fname):
27 | # avoid calling super(), too confusing
28 | IndexedCorpus.__init__(self, fname)
29 | matutils.MmReader.__init__(self, fname)
30 |
31 | def __iter__(self):
32 | """
33 | Interpret a matrix in Matrix Market format as a streamed gensim corpus
34 | (yielding one document at a time).
35 | """
36 | for doc_id, doc in super(MmCorpus, self).__iter__():
37 | yield doc # get rid of doc id, return the sparse vector only
38 |
39 | @staticmethod
40 | def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
41 | """
42 | Save a corpus in the Matrix Market format to disk.
43 |
44 | This function is automatically called by `MmCorpus.serialize`; don't
45 | call it directly, call `serialize` instead.
46 | """
47 | logger.info("storing corpus in Matrix Market format to %s" % fname)
48 | num_terms = len(id2word) if id2word is not None else None
49 | return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata)
50 |
51 | # endclass MmCorpus
52 |
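
The usual pattern with the class above: serialize a corpus once, then stream it from disk and access documents by index. The file path is an illustrative assumption:

    # Matrix Market round trip (path illustrative)
    from gensim.corpora import MmCorpus

    corpus = [[(0, 0.5)], [(0, 1.0), (1, 2.0)]]
    MmCorpus.serialize('/tmp/corpus.mm', corpus)   # writes corpus.mm and corpus.mm.index

    mm = MmCorpus('/tmp/corpus.mm')                # documents are streamed from disk, not held in RAM
    print(list(mm))                                # [[(0, 0.5)], [(0, 1.0), (1, 2.0)]]
    print(mm[1])                                   # random access via the index: [(0, 1.0), (1, 2.0)]
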
--------------------------------------------------------------------------------
/gensim/corpora/mmcorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/mmcorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/svmlightcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | Corpus in SVMlight format.
10 | """
11 |
12 |
13 | from __future__ import with_statement
14 |
15 | import logging
16 |
17 | from gensim import utils
18 | from gensim.corpora import IndexedCorpus
19 |
20 |
21 | logger = logging.getLogger('gensim.corpora.svmlightcorpus')
22 |
23 |
24 | class SvmLightCorpus(IndexedCorpus):
25 | """
26 | Corpus in SVMlight format.
27 |
28 | Quoting http://svmlight.joachims.org/:
29 | The input file contains the training examples. The first lines
30 | may contain comments and are ignored if they start with #. Each of the following
31 | lines represents one training example and is of the following format::
32 |
33 | <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info>
34 | <target> .=. +1 | -1 | 0 | <float>
35 | <feature> .=. <integer> | "qid"
36 | <value> .=. <float>
37 | <info> .=. <string>
38 |
39 | The "qid" feature (used for SVMlight ranking), if present, is ignored.
40 |
41 | Although not mentioned in the specification above, SVMlight also expects its
42 | feature ids to be 1-based (counting starts at 1). We convert features to 0-based
43 | internally by decrementing all ids when loading an SVMlight input file, and
44 | incrementing them again when saving as SVMlight.
45 |
46 | """
47 |
48 | def __init__(self, fname, store_labels=True):
49 | """
50 | Initialize the corpus from a file.
51 |
52 | Although vector labels (~SVM target class) are not used in gensim in any way,
53 | they are parsed and stored in `self.labels` for convenience. Set `store_labels=False`
54 | to skip storing these labels (e.g. if there are too many vectors to store
55 | the self.labels array in memory).
56 |
57 | """
58 | IndexedCorpus.__init__(self, fname)
59 | logger.info("loading corpus from %s" % fname)
60 |
61 | self.fname = fname # input file, see class doc for format
62 | self.length = None
63 | self.store_labels = store_labels
64 | self.labels = []
65 |
66 | def __iter__(self):
67 | """
68 | Iterate over the corpus, returning one sparse vector at a time.
69 | """
70 | lineno = -1
71 | self.labels = []
72 | with utils.smart_open(self.fname) as fin:
73 | for lineno, line in enumerate(fin):
74 | doc = self.line2doc(line)
75 | if doc is not None:
76 | if self.store_labels:
77 | self.labels.append(doc[1])
78 | yield doc[0]
79 | self.length = lineno + 1
80 |
81 | @staticmethod
82 | def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
83 | """
84 | Save a corpus in the SVMlight format.
85 |
86 | The SVMlight `` class tag is taken from the `labels` array, or set
87 | to 0 for all documents if `labels` is not supplied.
88 |
89 | This function is automatically called by `SvmLightCorpus.serialize`; don't
90 | call it directly, call `serialize` instead.
91 | """
92 | logger.info("converting corpus to SVMlight format: %s" % fname)
93 |
94 | offsets = []
95 | with utils.smart_open(fname, 'wb') as fout:
96 | for docno, doc in enumerate(corpus):
97 | label = labels[docno] if labels else 0 # target class is 0 by default
98 | offsets.append(fout.tell())
99 | fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
100 | return offsets
101 |
102 | def docbyoffset(self, offset):
103 | """
104 | Return the document stored at file position `offset`.
105 | """
106 | with utils.smart_open(self.fname) as f:
107 | f.seek(offset)
108 | return self.line2doc(f.readline())[0]
109 |
110 | def line2doc(self, line):
111 | """
112 | Create a document from a single line (string) in SVMlight format
113 | """
114 | line = utils.to_unicode(line)
115 | line = line[: line.find('#')].strip()
116 | if not line:
117 | return None # ignore comments and empty lines
118 | parts = line.split()
119 | if not parts:
120 | raise ValueError('invalid line format in %s' % self.fname)
121 | target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
122 | doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
123 | return doc, target
124 |
125 | @staticmethod
126 | def doc2line(doc, label=0):
127 | """
128 | Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
129 | """
130 | pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
131 | return "%s %s\n" % (label, pairs)
132 |
133 | # endclass SvmLightCorpus
134 |
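
A minimal sketch of writing labelled documents in SVMlight format and streaming them back; the file path and labels are illustrative assumptions:

    # SVMlight round trip (path and labels are illustrative)
    from gensim.corpora import SvmLightCorpus

    corpus = [[(0, 0.5), (2, 1.0)], [(1, 2.0)]]
    labels = [1, -1]

    # labels become the leading <target> field; feature ids are written 1-based
    SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus, labels=labels)

    loaded = SvmLightCorpus('/tmp/corpus.svmlight')
    print(list(loaded))      # ids are 0-based again: [[(0, 0.5), (2, 1.0)], [(1, 2.0)]]
    print(loaded.labels)     # ['1', '-1'] -- targets parsed as strings during iteration
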
--------------------------------------------------------------------------------
/gensim/corpora/svmlightcorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/svmlightcorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/textcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
5 |
6 | """
7 | Text corpora usually reside on disk, as text files in one format or another
8 | In a common scenario, we need to build a dictionary (a `word->integer id`
9 | mapping), which is then used to construct sparse bag-of-word vectors
10 | (= sequences of `(word_id, word_weight)` 2-tuples).
11 |
12 | This module provides some code scaffolding to simplify this pipeline. For
13 | example, given a corpus where each document is a separate line in file on disk,
14 | you would override the `TextCorpus.get_texts` method to read one line=document
15 | at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence
16 | of words.
17 |
18 | Overriding `get_texts` is enough; you can then initialize the corpus with e.g.
19 | `MyTextCorpus(bz2.BZ2File('mycorpus.txt.bz2'))` and it will behave correctly like a
20 | corpus of sparse vectors. The `__iter__` method is automatically set up, and the
21 | dictionary is automatically populated with all `word->id` mappings.
22 |
23 | The resulting object can be used as input to all gensim models (TFIDF, LSI, ...),
24 | serialized with any format (Matrix Market, SvmLight, Blei's LDA-C format etc).
25 |
26 | See the `gensim.test.test_miislita.CorpusMiislita` class for a simple example.
27 | """
28 |
29 |
30 | from __future__ import with_statement
31 |
32 | import logging
33 |
34 | from gensim import interfaces, utils
35 | from six import string_types
36 | from gensim.corpora.dictionary import Dictionary
37 |
38 | logger = logging.getLogger('gensim.corpora.textcorpus')
39 |
40 |
41 | class TextCorpus(interfaces.CorpusABC):
42 | """
43 | Helper class to simplify the pipeline of getting bag-of-words vectors (= a
44 | gensim corpus) from plain text.
45 |
46 | This is an abstract base class: override the `get_texts()` method to match
47 | your particular input.
48 |
49 | Given a filename (or a file-like object) in the constructor, the corpus object
50 | will be automatically initialized with a dictionary in `self.dictionary` and
51 | will support the `__iter__` corpus method. You only need to provide a correct
52 | `get_texts` implementation.
53 |
54 | """
55 | def __init__(self, input=None):
56 | super(TextCorpus, self).__init__()
57 | self.input = input
58 | self.dictionary = Dictionary()
59 | self.metadata = False
60 | if input is not None:
61 | self.dictionary.add_documents(self.get_texts())
62 | else:
63 | logger.warning("No input document stream provided; assuming "
64 | "dictionary will be initialized some other way.")
65 |
66 | def __iter__(self):
67 | """
68 | The function that defines a corpus.
69 |
70 | Iterating over the corpus must yield sparse vectors, one for each document.
71 | """
72 | for text in self.get_texts():
73 | if self.metadata:
74 | yield self.dictionary.doc2bow(text[0], allow_update=False), text[1]
75 | else:
76 | yield self.dictionary.doc2bow(text, allow_update=False)
77 |
78 | def getstream(self):
79 | return utils.file_or_filename(self.input)
80 |
81 | def get_texts(self):
82 | """
83 | Iterate over the collection, yielding one document at a time. A document
84 | is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.
85 |
86 | Override this function to match your input (parse input files, do any
87 | text preprocessing, lowercasing, tokenizing etc.). There will be no further
88 | preprocessing of the words coming out of this function.
89 | """
90 | # Instead of raising NotImplementedError, let's provide a sample implementation:
91 | # assume documents are lines in a single file (one document per line).
92 | # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
93 | lineno = -1
94 | with self.getstream() as lines:
95 | for lineno, line in enumerate(lines):
96 | if self.metadata:
97 | yield utils.tokenize(line, lowercase=True), (lineno,)
98 | else:
99 | yield utils.tokenize(line, lowercase=True)
100 | self.length = lineno + 1 # will be 0 if loop never executes
101 |
102 | def __len__(self):
103 | return self.length # will throw if corpus not initialized
104 |
105 | # endclass TextCorpus
106 |
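
A hedged sketch of the subclassing pattern described in the module docstring (the class name `MyTextCorpus` and the file `mycorpus.txt` are assumptions): only `get_texts` is overridden; dictionary construction and the bag-of-words `__iter__` come from `TextCorpus` itself.

    from gensim import utils
    from gensim.corpora.textcorpus import TextCorpus

    class MyTextCorpus(TextCorpus):
        def get_texts(self):
            # one document per line, lowercased and tokenized
            with self.getstream() as lines:
                for line in lines:
                    yield list(utils.tokenize(line, lowercase=True))

    corpus = MyTextCorpus('mycorpus.txt')   # self.dictionary is populated in __init__
    for bow in corpus:                      # sparse (word_id, count) vectors
        print(bow)
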
--------------------------------------------------------------------------------
/gensim/corpora/textcorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/textcorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/ucicorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2012 Jonathan Esterhazy
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | University of California, Irvine (UCI) Bag-of-Words format.
10 |
11 | http://archive.ics.uci.edu/ml/datasets/Bag+of+Words
12 | """
13 |
14 | from __future__ import with_statement
15 |
16 | import logging
17 | from collections import defaultdict
18 |
19 | from gensim import utils
20 | from gensim.corpora import Dictionary
21 | from gensim.corpora import IndexedCorpus
22 | from gensim.matutils import MmReader
23 | from gensim.matutils import MmWriter
24 | from six import iteritems, string_types
25 | from six.moves import xrange
26 |
27 |
28 | logger = logging.getLogger('gensim.corpora.ucicorpus')
29 |
30 |
31 | class UciReader(MmReader):
32 | def __init__(self, input):
33 | """
34 | Initialize the reader.
35 |
36 | The `input` parameter refers to a file on the local filesystem,
37 | which is expected to be in the UCI Bag-of-Words format.
38 | """
39 |
40 | logger.info('Initializing corpus reader from %s' % input)
41 |
42 | self.input = input
43 |
44 | with utils.smart_open(self.input) as fin:
45 | self.num_docs = self.num_terms = self.num_nnz = 0
46 | try:
47 | self.num_docs = int(next(fin).strip())
48 | self.num_terms = int(next(fin).strip())
49 | self.num_nnz = int(next(fin).strip())
50 | except StopIteration:
51 | pass
52 |
53 | logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
54 | (self.num_docs, self.num_terms, self.num_nnz))
55 |
56 | def skip_headers(self, input_file):
57 | for lineno, _ in enumerate(input_file):
58 | if lineno == 2:
59 | break
60 |
61 | # endclass UciReader
62 |
63 |
64 | class UciWriter(MmWriter):
65 | """
66 | Store a corpus in UCI Bag-of-Words format.
67 |
68 | This corpus format is identical to MM format, except for
69 | different file headers. There is no format line, and the first
70 | three lines of the file contain num_docs, num_terms, and num_nnz,
71 | one value per line.
72 |
73 | This implementation is based on matutils.MmWriter, and works the same way.
74 |
75 | """
76 | MAX_HEADER_LENGTH = 20 # reserve 20 bytes per header value
77 | FAKE_HEADER = utils.to_utf8(' ' * MAX_HEADER_LENGTH + '\n')
78 |
79 | def write_headers(self):
80 | """
81 | Write blank header lines. Will be updated later, once corpus stats are known.
82 | """
83 | for _ in range(3):
84 | self.fout.write(self.FAKE_HEADER)
85 |
86 | self.last_docno = -1
87 | self.headers_written = True
88 |
89 | def update_headers(self, num_docs, num_terms, num_nnz):
90 | """
91 | Update headers with actual values.
92 | """
93 | offset = 0
94 | values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]]
95 |
96 | for value in values:
97 | if len(value) > len(self.FAKE_HEADER):
98 | raise ValueError('Invalid header: value too large!')
99 | self.fout.seek(offset)
100 | self.fout.write(value)
101 | offset += len(self.FAKE_HEADER)
102 |
103 | @staticmethod
104 | def write_corpus(fname, corpus, progress_cnt=1000, index=False):
105 | writer = UciWriter(fname)
106 | writer.write_headers()
107 |
108 | num_terms, num_nnz = 0, 0
109 | docno, poslast = -1, -1
110 | offsets = []
111 | for docno, bow in enumerate(corpus):
112 | if docno % progress_cnt == 0:
113 | logger.info("PROGRESS: saving document #%i" % docno)
114 | if index:
115 | posnow = writer.fout.tell()
116 | if posnow == poslast:
117 | offsets[-1] = -1
118 | offsets.append(posnow)
119 | poslast = posnow
120 |
121 | vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights
122 | max_id, veclen = writer.write_vector(docno, vector)
123 | num_terms = max(num_terms, 1 + max_id)
124 | num_nnz += veclen
125 | num_docs = docno + 1
126 |
127 | if num_docs * num_terms != 0:
128 | logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" %
129 | (num_docs, num_terms,
130 | 100.0 * num_nnz / (num_docs * num_terms),
131 | num_nnz,
132 | num_docs * num_terms))
133 |
134 | # now write proper headers, by seeking and overwriting the spaces written earlier
135 | writer.update_headers(num_docs, num_terms, num_nnz)
136 |
137 | writer.close()
138 | if index:
139 | return offsets
140 |
141 | # endclass UciWriter
142 |
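
The seek-and-overwrite trick behind `write_headers`/`update_headers` above, shown in isolation (the file name `counts.txt` and the toy numbers are assumptions): reserve fixed-width blank header lines, stream the body, then patch the real statistics in place without rewriting the file.

    MAX_HEADER_LENGTH = 20
    FAKE_HEADER = b' ' * MAX_HEADER_LENGTH + b'\n'

    with open('counts.txt', 'w+b') as fout:
        for _ in range(3):
            fout.write(FAKE_HEADER)        # three blank placeholder headers
        fout.write(b'1 1 2\n')             # ...stream the (toy) body...
        fout.write(b'2 3 5\n')

        # the statistics are known only now: seek back and overwrite the placeholders
        for i, value in enumerate([b'2', b'3', b'2']):   # num_docs, num_terms, num_nnz
            fout.seek(i * len(FAKE_HEADER))
            fout.write(value)
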
143 |
144 | class UciCorpus(UciReader, IndexedCorpus):
145 | """
146 | Corpus in the UCI bag-of-words format.
147 | """
148 | def __init__(self, fname, fname_vocab=None):
149 | IndexedCorpus.__init__(self, fname)
150 | UciReader.__init__(self, fname)
151 |
152 | if fname_vocab is None:
153 | fname_vocab = fname + '.vocab'
154 |
155 | self.fname = fname
156 | with utils.smart_open(fname_vocab) as fin:
157 | words = [word.strip() for word in fin]
158 | self.id2word = dict(enumerate(words))
159 |
160 | self.transposed = True
161 |
162 | def __iter__(self):
163 | """
164 | Interpret a matrix in UCI bag-of-words format as a streamed gensim corpus
165 | (yielding one document at a time).
166 | """
167 | for docId, doc in super(UciCorpus, self).__iter__():
168 | yield doc # get rid of docId, return the sparse vector only
169 |
170 | def create_dictionary(self):
171 | """
172 | Utility method to generate gensim-style Dictionary directly from
173 | the corpus and vocabulary data.
174 | """
175 | dictionary = Dictionary()
176 |
177 | # replace dfs with defaultdict to avoid downstream KeyErrors
178 | # uci vocabularies may contain terms that are not used in the document data
179 | dictionary.dfs = defaultdict(int)
180 |
181 | dictionary.id2token = self.id2word
182 | dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))
183 |
184 | dictionary.num_docs = self.num_docs
185 | dictionary.num_nnz = self.num_nnz
186 |
187 | for docno, doc in enumerate(self):
188 | if docno % 10000 == 0:
189 | logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))
190 |
191 | for word, count in doc:
192 | dictionary.dfs[word] += 1
193 | dictionary.num_pos += count
194 |
195 | return dictionary
196 |
197 | @staticmethod
198 | def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
199 | """
200 | Save a corpus in the UCI Bag-of-Words format.
201 |
202 | There are actually two files saved: `fname` and `fname.vocab`, where
203 | `fname.vocab` is the vocabulary file.
204 |
205 | This function is automatically called by `UciCorpus.serialize`; don't
206 | call it directly, call `serialize` instead.
207 | """
208 | if id2word is None:
209 | logger.info("no word id mapping provided; initializing from corpus")
210 | id2word = utils.dict_from_corpus(corpus)
211 | num_terms = len(id2word)
212 | else:
213 | num_terms = 1 + max([-1] + list(id2word.keys()))  # list() so this also works under Python 3, where keys() is a view
214 |
215 | # write out vocabulary
216 | fname_vocab = fname + '.vocab'
217 | logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
218 | with utils.smart_open(fname_vocab, 'wb') as fout:
219 | for featureid in xrange(num_terms):
220 | fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
221 |
222 | logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)
223 |
224 | return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
225 |
226 | # endclass UciCorpus
227 |
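
A usage sketch for `UciCorpus`, assuming a UCI Bag-of-Words dataset has already been downloaded locally (the file names `docword.kos.txt` and `vocab.kos.txt` follow the naming convention of that page but are placeholders here):

    from gensim.corpora.ucicorpus import UciCorpus

    corpus = UciCorpus('docword.kos.txt', 'vocab.kos.txt')
    dictionary = corpus.create_dictionary()   # gensim Dictionary built from the UCI vocabulary

    # re-serialize in the same format; serialize() calls save_corpus() and stores a document index
    UciCorpus.serialize('kos_copy.uci', corpus, id2word=dictionary)
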
--------------------------------------------------------------------------------
/gensim/corpora/ucicorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/ucicorpus.pyc
--------------------------------------------------------------------------------
/gensim/corpora/wikicorpus.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/corpora/wikicorpus.pyc
--------------------------------------------------------------------------------
/gensim/examples/dmlcz/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/gensim/examples/dmlcz/dmlcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | Corpus for the DML-CZ project.
10 | """
11 |
12 |
13 | import logging
14 | import itertools
15 | import os.path
16 |
17 | from gensim import interfaces, matutils
18 | import dictionary # for constructing word->id mappings
19 |
20 |
21 | logger = logging.getLogger('gensim.corpora.dmlcorpus')
22 |
23 |
24 | class DmlConfig(object):
25 | """
26 | DmlConfig contains parameters necessary for the abstraction of a 'corpus of
27 | articles' (see the `DmlCorpus` class).
28 |
29 | Articles may come from different sources (=different locations on disk/network,
30 | different file formats etc.), so the main purpose of DmlConfig is to keep all
31 | sources in one place.
32 |
33 | Apart from glueing sources together, DmlConfig also decides where to store
34 | output files and which articles to accept for the corpus (= an additional filter
35 | over the sources).
36 | """
37 | def __init__(self, configId, resultDir, acceptLangs = None):
38 | self.resultDir = resultDir # output files will be stored in this directory
39 | self.configId = configId # configId is a string that is used as filename prefix for all files, so keep it simple
40 | self.sources = {} # all article sources; see sources.DmlSource class for an example of source
41 |
42 | if acceptLangs is None: # which languages to accept
43 | acceptLangs = set(['any']) # if not specified, accept all languages (including unknown/unspecified)
44 | self.acceptLangs = set(acceptLangs)
45 | logger.info('initialized %s' % self)
46 |
47 |
48 | def resultFile(self, fname):
49 | return os.path.join(self.resultDir, self.configId + '_' + fname)
50 |
51 |
52 | def acceptArticle(self, metadata):
53 | lang = metadata.get('language', 'unk') # if there was no language field in the article metadata, set language to 'unk' = unknown
54 | if 'any' not in self.acceptLangs and lang not in self.acceptLangs:
55 | return False
56 | return True
57 |
58 |
59 | def addSource(self, source):
60 | sourceId = str(source)
61 | assert sourceId not in self.sources, "source %s already present in the config!" % sourceId
62 | self.sources[sourceId] = source
63 |
64 |
65 | def __str__(self):
66 | return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" %
67 | (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs)))
68 | #endclass DmlConfig
69 |
70 |
71 |
72 | class DmlCorpus(interfaces.CorpusABC):
73 | """
74 | DmlCorpus implements a collection of articles. It is initialized via a DmlConfig
75 | object, which holds information about where to look for the articles and how
76 | to process them.
77 |
78 | Apart from being a regular corpus (bag-of-words iterable with a `len()` method),
79 | DmlCorpus has methods for building a dictionary (mapping between words and
80 | their ids).
81 | """
82 | def __init__(self):
83 | self.documents = []
84 | self.config = None
85 | self.dictionary = dictionary.Dictionary()
86 |
87 |
88 | def __len__(self):
89 | return len(self.documents)
90 |
91 |
92 | def __iter__(self):
93 | """
94 | The function that defines a corpus -- iterating over the corpus yields
95 | bag-of-words vectors, one for each document.
96 |
97 | A bag-of-words vector is simply a list of ``(tokenId, tokenCount)`` 2-tuples.
98 | """
99 | for docNo, (sourceId, docUri) in enumerate(self.documents):
100 | source = self.config.sources[sourceId]
101 |
102 | contents = source.getContent(docUri)
103 | words = [source.normalizeWord(word) for word in source.tokenize(contents)]
104 | yield self.dictionary.doc2bow(words, allowUpdate = False)
105 |
106 |
107 | def buildDictionary(self):
108 | """
109 | Populate dictionary mapping and statistics.
110 |
111 | This is done by sequentially retrieving the article fulltexts, splitting
112 | them into tokens and converting tokens to their ids (creating new ids as
113 | necessary).
114 | """
115 | logger.info("creating dictionary from %i articles" % len(self.documents))
116 | self.dictionary = dictionary.Dictionary()
117 | numPositions = 0
118 | for docNo, (sourceId, docUri) in enumerate(self.documents):
119 | if docNo % 1000 == 0:
120 | logger.info("PROGRESS: at document #%i/%i (%s, %s)" %
121 | (docNo, len(self.documents), sourceId, docUri))
122 | source = self.config.sources[sourceId]
123 | contents = source.getContent(docUri)
124 | words = [source.normalizeWord(word) for word in source.tokenize(contents)]
125 | numPositions += len(words)
126 |
127 | # convert to bag-of-words, but ignore the result -- here we only care about updating token ids
128 | _ = self.dictionary.doc2bow(words, allowUpdate = True)
129 | logger.info("built %s from %i documents (total %i corpus positions)" %
130 | (self.dictionary, len(self.documents), numPositions))
131 |
132 |
133 | def processConfig(self, config, shuffle = False):
134 | """
135 | Parse the directories specified in the config, looking for suitable articles.
136 |
137 | This updates the self.documents var, which keeps a list of (source id,
138 | article uri) 2-tuples. Each tuple is a unique identifier of one article.
139 |
140 | Note that some articles are ignored based on config settings (for example
141 | if the article's language doesn't match any language specified in the
142 | config etc.).
143 | """
144 | self.config = config
145 | self.documents = []
146 | logger.info("processing config %s" % config)
147 | for sourceId, source in config.sources.iteritems():
148 | logger.info("processing source '%s'" % sourceId)
149 | accepted = []
150 | for articleUri in source.findArticles():
151 | meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value)
152 | if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata
153 | accepted.append((sourceId, articleUri))
154 | logger.info("accepted %i articles for source '%s'" %
155 | (len(accepted), sourceId))
156 | self.documents.extend(accepted)
157 |
158 | if not self.documents:
159 | logger.warning('no articles at all found from the config; something went wrong!')
160 |
161 | if shuffle:
162 | logger.info("shuffling %i documents for random order" % len(self.documents))
163 | import random
164 | random.shuffle(self.documents)
165 |
166 | logger.info("accepted total of %i articles for %s" %
167 | (len(self.documents), str(config)))
168 |
169 |
170 | def saveDictionary(self, fname):
171 | logger.info("saving dictionary mapping to %s" % fname)
172 | fout = open(fname, 'w')
173 | for tokenId, token in self.dictionary.id2token.iteritems():
174 | fout.write("%i\t%s\n" % (tokenId, token))
175 | fout.close()
176 |
177 | @staticmethod
178 | def loadDictionary(fname):
179 | result = {}
180 | for lineNo, line in enumerate(open(fname)):
181 | pair = line[:-1].split('\t')
182 | if len(pair) != 2:
183 | continue
184 | wordId, word = pair
185 | result[int(wordId)] = word
186 | return result
187 |
188 | def saveDocuments(self, fname):
189 | logger.info("saving documents mapping to %s" % fname)
190 | fout = open(fname, 'w')
191 | for docNo, docId in enumerate(self.documents):
192 | sourceId, docUri = docId
193 | intId, pathId = docUri
194 | fout.write("%i\t%s\n" % (docNo, repr(docId)))
195 | fout.close()
196 |
197 |
198 | def saveAsText(self):
199 | """
200 | Store the corpus to disk, in a human-readable text format.
201 |
202 | This actually saves multiple files:
203 |
204 | 1. Pure document-term co-occurrence frequency counts, as a Matrix Market file.
205 | 2. Token to integer mapping, as a text file.
206 | 3. Document to document URI mapping, as a text file.
207 |
208 | The exact filesystem paths and filenames are determined from the config.
209 | """
210 | self.saveDictionary(self.config.resultFile('wordids.txt'))
211 | self.saveDocuments(self.config.resultFile('docids.txt'))
212 | matutils.MmWriter.writeCorpus(self.config.resultFile('bow.mm'), self)
213 |
214 |
215 | def articleDir(self, docNo):
216 | """
217 | Return absolute normalized path on filesystem to article no. `docNo`.
218 | """
219 | sourceId, (_, outPath) = self.documents[docNo]
220 | source = self.config.sources[sourceId]
221 | return os.path.join(source.baseDir, outPath)
222 |
223 |
224 | def getMeta(self, docNo):
225 | """
226 | Return metadata for article no. `docNo`.
227 | """
228 | sourceId, uri = self.documents[docNo]
229 | source = self.config.sources[sourceId]
230 | return source.getMeta(uri)
231 | #endclass DmlCorpus
232 |
233 |
--------------------------------------------------------------------------------
/gensim/examples/dmlcz/gensim_build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (C) 2010 Radim Rehurek
4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
5 |
6 | """
7 | USAGE: %(program)s LANGUAGE
8 | Process the repository, accepting articles in LANGUAGE (or 'any').
9 | Store the word co-occurrence matrix and id mappings, which are needed for subsequent processing.
10 |
11 | Example: ./gensim_build.py eng
12 | """
13 |
14 |
15 | import logging
16 | import sys
17 | import os.path
18 | import re
19 |
20 |
21 | from gensim.corpora import sources, dmlcorpus
22 |
23 |
24 | PREFIX = 'dmlcz'
25 |
26 | AT_HOME = False
27 |
28 | if AT_HOME:
29 | SOURCE_LIST = [
30 | sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'),
31 | sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'),
32 | sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'),
33 | ]
34 |
35 | # SOURCE_LIST = [
36 | # sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'),
37 | # ]
38 |
39 | RESULT_DIR = '/Users/kofola/workspace/dml/data/results'
40 |
41 | else:
42 |
43 | SOURCE_LIST = [
44 | sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'),
45 | sources.DmlSource('numdam', '/data/dmlcz/data/numdam'),
46 | sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'),
47 | ]
48 |
49 | RESULT_DIR = '/data/dmlcz/xrehurek/results'
50 |
51 |
52 | def buildDmlCorpus(config):
53 | dml = dmlcorpus.DmlCorpus()
54 | dml.processConfig(config, shuffle = True)
55 | dml.buildDictionary()
56 | dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words
57 |
58 | dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs)
59 | dml.saveAsText() # save id mappings and documents as text data (matrix market format)
60 | return dml
61 |
62 |
63 | if __name__ == '__main__':
64 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
65 | logging.root.setLevel(level=logging.INFO)
66 | logging.info("running %s" % ' '.join(sys.argv))
67 |
68 | program = os.path.basename(sys.argv[0])
69 |
70 | # check and process input arguments
71 | if len(sys.argv) < 2:
72 | print(globals()['__doc__'] % locals())
73 | sys.exit(1)
74 | language = sys.argv[1]
75 |
76 | # construct the config, which holds information about sources, data file filenames etc.
77 | config = dmlcorpus.DmlConfig('%s_%s' % (PREFIX, language), resultDir=RESULT_DIR, acceptLangs=[language])
78 | for source in SOURCE_LIST:
79 | config.addSource(source)
80 | buildDmlCorpus(config)
81 |
82 | logging.info("finished running %s" % program)
83 |
--------------------------------------------------------------------------------
/gensim/examples/dmlcz/gensim_genmodel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (C) 2010 Radim Rehurek
4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
5 |
6 | """
7 | USAGE: %(program)s LANGUAGE METHOD
8 | Generate topic models for the specified subcorpus. METHOD is currently one \
9 | of 'tfidf', 'lsi', 'lda', 'rp'.
10 |
11 | Example: ./gensim_genmodel.py any lsi
12 | """
13 |
14 |
15 | import logging
16 | import sys
17 | import os.path
18 | import re
19 |
20 |
21 | from gensim.corpora import sources, dmlcorpus, MmCorpus
22 | from gensim.models import lsimodel, ldamodel, tfidfmodel, rpmodel
23 |
24 | import gensim_build
25 |
26 |
27 | # internal method parameters
28 | DIM_RP = 300 # dimensionality for random projections
29 | DIM_LSI = 200 # for latent semantic indexing
30 | DIM_LDA = 100 # for latent Dirichlet allocation
31 |
32 |
33 |
34 | if __name__ == '__main__':
35 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
36 | logging.root.setLevel(level = logging.INFO)
37 | logging.info("running %s" % ' '.join(sys.argv))
38 |
39 | program = os.path.basename(sys.argv[0])
40 |
41 | # check and process input arguments
42 | if len(sys.argv) < 3:
43 | print(globals()['__doc__'] % locals())
44 | sys.exit(1)
45 | language = sys.argv[1]
46 | method = sys.argv[2].strip().lower()
47 |
48 | logging.info("loading corpus mappings")
49 | config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
50 | resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])
51 |
52 | logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
53 | id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
54 | logging.info("loaded %i word ids" % len(id2word))
55 |
56 | corpus = MmCorpus(config.resultFile('bow.mm'))
57 |
58 | if method == 'tfidf':
59 | model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
60 | model.save(config.resultFile('model_tfidf.pkl'))
61 | elif method == 'lda':
62 | model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
63 | model.save(config.resultFile('model_lda.pkl'))
64 | elif method == 'lsi':
65 | # first, transform word counts to tf-idf weights
66 | tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
67 | # then find the transformation from tf-idf to latent space
68 | model = lsimodel.LsiModel(tfidf[corpus], id2word = id2word, numTopics = DIM_LSI)
69 | model.save(config.resultFile('model_lsi.pkl'))
70 | elif method == 'rp':
71 | # first, transform word counts to tf-idf weights
72 | tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
73 | # then find the transformation from tf-idf to latent space
74 | model = rpmodel.RpModel(tfidf[corpus], id2word = id2word, numTopics = DIM_RP)
75 | model.save(config.resultFile('model_rp.pkl'))
76 | else:
77 | raise ValueError('unknown topic extraction method: %s' % repr(method))
78 |
79 | MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus])
80 |
81 | logging.info("finished running %s" % program)
82 |
83 |
--------------------------------------------------------------------------------
/gensim/examples/dmlcz/gensim_xml.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (C) 2010 Radim Rehurek
4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
5 |
6 | """
7 | USAGE: %(program)s LANGUAGE METHOD
8 | Generate similar.xml files, using a previously built model for METHOD.
9 |
10 | Example: ./gensim_xml.py eng lsi
11 | """
12 |
13 |
14 | import logging
15 | import sys
16 | import os.path
17 | import re
18 |
19 |
20 | from gensim.corpora import sources, dmlcorpus, MmCorpus
21 | from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity
22 |
23 | import gensim_build
24 |
25 |
26 | # set to True to do everything EXCEPT actually writing out similar.xml files to disk.
27 | # similar.xml files are NOT written if DRY_RUN is true.
28 | DRY_RUN = False
29 |
30 | # how many 'most similar' documents to store in each similar.xml?
31 | MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored)
32 | MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit).
33 |
34 | # if there are no similar articles (after the pruning), do we still want to generate similar.xml?
35 | SAVE_EMPTY = True
36 |
37 | # xml template for similar articles
38 | ARTICLE = """
39 |
40 |
41 | %(author)s
42 |
43 | %(title)s
44 | %(suffix)s
45 |
46 |
47 |
48 | """
49 |
50 | # template for the whole similar.xml file (will be filled with multiple ARTICLE instances)
51 | SIMILAR = """\
52 |
53 | %s
54 |
55 | """
56 |
57 |
58 |
59 | def generateSimilar(corpus, index, method):
60 | for docNo, topSims in enumerate(index): # for each document
61 | # store similarities to the following file
62 | outfile = os.path.join(corpus.articleDir(docNo), 'similar_%s.xml' % method)
63 |
64 | articles = [] # collect similars in this list
65 | for docNo2, score in topSims: # for each most similar article
66 | if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring)
67 | source, (intId, pathId) = corpus.documents[docNo2]
68 | meta = corpus.getMeta(docNo2)
69 | suffix, author, title = '', meta.get('author', ''), meta.get('title', '')
70 | articles.append(ARTICLE % locals()) # add the similar article to output
71 | if MAX_SIMILAR and len(articles) >= MAX_SIMILAR: # MAX_SIMILAR == 0 means "no limit", per the comment above
72 | break
73 |
74 | # now `articles` holds multiple strings in similar_*.xml format
75 | if SAVE_EMPTY or articles:
76 | output = ''.join(articles) # concat all similars to one string
77 | if not DRY_RUN: # only open output files for writing if DRY_RUN is false
78 | logging.info("generating %s (%i similars)" % (outfile, len(articles)))
79 | outfile = open(outfile, 'w')
80 | outfile.write(SIMILAR % output) # add xml headers and print to file
81 | outfile.close()
82 | else:
83 | logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output))
84 | else:
85 | logging.debug("skipping %s (no similar found)" % outfile)
86 |
87 |
88 |
89 | if __name__ == '__main__':
90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
91 | logging.root.setLevel(level=logging.INFO)
92 | logging.info("running %s" % ' '.join(sys.argv))
93 |
94 | program = os.path.basename(sys.argv[0])
95 |
96 | # check and process input arguments
97 | if len(sys.argv) < 3:
98 | print(globals()['__doc__'] % locals())
99 | sys.exit(1)
100 | language = sys.argv[1]
101 | method = sys.argv[2].strip().lower()
102 |
103 | logging.info("loading corpus mappings")
104 | config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
105 | resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])
106 |
107 | logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
108 | id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
109 | logging.info("loaded %i word ids" % len(id2word))
110 |
111 | corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
112 | input = MmCorpus(config.resultFile('_%s.mm' % method))
113 | assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus))
114 |
115 | # initialize structure for similarity queries
116 | if method == 'lsi' or method == 'rp': # for these methods, use dense vectors
117 | index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms)
118 | else:
119 | index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1)
120 |
121 | index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op)
122 | generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format
123 |
124 | logging.info("finished running %s" % program)
125 |
126 |
--------------------------------------------------------------------------------
/gensim/examples/dmlcz/runall.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # full path to gensim executables
4 | BIN_PATH=~/xrehurek/gensim/dmlcz
5 |
6 | # intermediate data will be stored to this dir
7 | RESULT_PATH=~/xrehurek/results
8 |
9 | # set python path, so that python can find and import gensim modules
10 | export PYTHONPATH=~/xrehurek:$PYTHONPATH
11 |
12 | # Language is set to 'any', meaning all articles are processed for similarity in
13 | # one go, regardless of their language.
14 | # Set language to 'eng', 'fre', 'rus' etc. to only process a specific subset of
15 | # articles (an article's language is determined from its metadata).
16 | language=any
17 |
18 |
19 | # ========== parse all article sources, build article co-occurrence matrix ======
20 | ${BIN_PATH}/gensim_build.py $language 2>&1 | tee ${RESULT_PATH}/gensim_build.log
21 |
22 |
23 | # ========== build transformation models =======================================
24 | for method in tfidf rp;
25 | do
26 | ( ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) &
27 | done
28 | wait
29 |
30 | method=lsi
31 | ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log
32 |
33 |
34 | # =========== generate output xml files ========================================
35 | # generate xml files for all methods at once, in parallel, to save time.
36 | # NOTE if out of memory, move tfidf out of the loop (tfidf uses a lot of memory here)
37 | for method in tfidf lsi rp;
38 | do
39 | ( ${BIN_PATH}/gensim_xml.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) &
40 | done
41 | wait
42 |
--------------------------------------------------------------------------------
/gensim/interfaces.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 | """
8 | This module contains basic interfaces used throughout the whole gensim package.
9 |
10 | The interfaces are realized as abstract base classes (i.e., some optional functionality
11 | is provided in the interface itself, so that the interfaces can be subclassed).
12 | """
13 |
14 | from __future__ import with_statement
15 |
16 | import logging
17 | import itertools
18 |
19 | from gensim import utils, matutils
20 | from six.moves import xrange
21 |
22 |
23 | logger = logging.getLogger('gensim.interfaces')
24 |
25 |
26 | class CorpusABC(utils.SaveLoad):
27 | """
28 | Interface (abstract base class) for corpora. A *corpus* is simply an iterable,
29 | where each iteration step yields one document:
30 |
31 | >>> for doc in corpus:
32 | >>> # do something with the doc...
33 |
34 | A document is a sequence of `(fieldId, fieldValue)` 2-tuples:
35 |
36 | >>> for attr_id, attr_value in doc:
37 | >>> # do something with the attribute
38 |
39 | Note that although a default :func:`len` method is provided, it is very inefficient
40 | (performs a linear scan through the corpus to determine its length). Wherever
41 | the corpus size is needed and known in advance (or at least doesn't change so
42 | that it can be cached), the :func:`len` method should be overridden.
43 |
44 | See the :mod:`gensim.corpora.svmlightcorpus` module for an example of a corpus.
45 |
46 | Saving the corpus with the `save` method (inherited from `utils.SaveLoad`) will
47 | only store the *in-memory* (binary, pickled) object representation (= the stream
48 | state), and **not** the documents themselves. See the `save_corpus` static method
49 | for serializing the actual stream content.
50 | """
51 | def __iter__(self):
52 | """
53 | Iterate over the corpus, yielding one document at a time.
54 | """
55 | raise NotImplementedError('cannot instantiate abstract base class')
56 |
57 |
58 | def save(self, *args, **kwargs):
59 | import warnings
60 | warnings.warn("corpus.save() stores only the (tiny) iteration object; "
61 | "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)")
62 | super(CorpusABC, self).save(*args, **kwargs)
63 |
64 | def __len__(self):
65 | """
66 | Return the number of documents in the corpus.
67 |
68 | This method is just the least common denominator and should really be
69 | overridden when possible.
70 | """
71 | raise NotImplementedError("must override __len__() before calling len(corpus)")
72 | # logger.warning("performing full corpus scan to determine its length; was this intended?")
73 | # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus
74 |
75 | @staticmethod
76 | def save_corpus(fname, corpus, id2word=None, metadata=False):
77 | """
78 | Save an existing `corpus` to disk.
79 |
80 | Some formats also support saving the dictionary (`feature_id->word` mapping),
81 | which can in this case be provided by the optional `id2word` parameter.
82 |
83 | >>> MmCorpus.save_corpus('file.mm', corpus)
84 |
85 | Some corpora also support an index of where each document begins, so
86 | that the documents on disk can be accessed in O(1) time (see the
87 | `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
88 | called internally by `serialize`, which does `save_corpus` plus saves the index
89 | at the same time, so you want to store the corpus with::
90 |
91 | >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents
92 |
93 | Calling `serialize()` is preferred to calling `save_corpus()`.
94 |
95 | """
96 | raise NotImplementedError('cannot instantiate abstract base class')
97 |
98 | # example code:
99 | logger.info("converting corpus to ??? format: %s" % fname)
100 | with utils.smart_open(fname, 'wb') as fout:
101 | for doc in corpus: # iterate over the document stream
102 | fmt = str(doc) # format the document appropriately...
103 | fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk
104 | #endclass CorpusABC
105 |
106 |
107 | class TransformedCorpus(CorpusABC):
108 | def __init__(self, obj, corpus, chunksize=None):
109 | self.obj, self.corpus, self.chunksize = obj, corpus, chunksize
110 | self.metadata = False
111 |
112 | def __len__(self):
113 | return len(self.corpus)
114 |
115 | def __iter__(self):
116 | if self.chunksize:
117 | for chunk in utils.grouper(self.corpus, self.chunksize):
118 | for transformed in self.obj.__getitem__(chunk, chunksize=None):
119 | yield transformed
120 | else:
121 | for doc in self.corpus:
122 | yield self.obj[doc]
123 | #endclass TransformedCorpus
124 |
125 |
126 | class TransformationABC(utils.SaveLoad):
127 | """
128 | Interface for transformations. A 'transformation' is any object which accepts
129 | a sparse document via the dictionary notation `[]` and returns another sparse
130 | document in its stead::
131 |
132 | >>> transformed_doc = transformation[doc]
133 |
134 | or also::
135 |
136 | >>> transformed_corpus = transformation[corpus]
137 |
138 | See the :mod:`gensim.models.tfidfmodel` module for an example of a transformation.
139 |
140 | """
141 |
142 | def __getitem__(self, vec):
143 | """
144 | Transform a single vector from one vector space into another
145 |
146 | **or**
147 |
148 | Transform a whole corpus into another.
149 | """
150 | raise NotImplementedError('cannot instantiate abstract base class')
151 |
152 |
153 | def _apply(self, corpus, chunksize=None):
154 | """
155 | Apply the transformation to a whole corpus (as opposed to a single document)
156 | and return the result as another corpus.
157 | """
158 | return TransformedCorpus(self, corpus, chunksize)
159 | #endclass TransformationABC
160 |
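
A minimal sketch of the transformation contract, mirroring the pattern used throughout gensim (the class name `ScaleTransform` and the scaling factor are purely illustrative): `__getitem__` transforms a single sparse vector directly and defers a whole corpus to `_apply`, which wraps it lazily in a `TransformedCorpus`.

    from gensim import interfaces, utils

    class ScaleTransform(interfaces.TransformationABC):
        def __init__(self, factor):
            self.factor = factor

        def __getitem__(self, bow):
            is_corpus, bow = utils.is_corpus(bow)
            if is_corpus:
                return self._apply(bow)   # lazy, streamed transformation of the whole corpus
            return [(termid, weight * self.factor) for termid, weight in bow]

    halve = ScaleTransform(0.5)
    print(halve[[(0, 2.0), (3, 4.0)]])            # -> [(0, 1.0), (3, 2.0)]
    print(list(halve[[[(0, 2.0)], [(3, 4.0)]]]))  # corpus in -> TransformedCorpus out
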
161 |
162 | class SimilarityABC(utils.SaveLoad):
163 | """
164 | Abstract interface for similarity searches over a corpus.
165 |
166 | In all instances, there is a corpus against which we want to perform the
167 | similarity search.
168 |
169 | For each similarity search, the input is a document and the output are its
170 | similarities to individual corpus documents.
171 |
172 | Similarity queries are realized by calling ``self[query_document]``.
173 |
174 | There is also a convenience wrapper, where iterating over `self` yields
175 | similarities of each document in the corpus against the whole corpus (i.e.,
176 | the query is each corpus document in turn).
177 | """
178 | def __init__(self, corpus):
179 | raise NotImplementedError("cannot instantiate Abstract Base Class")
180 |
181 |
182 | def get_similarities(self, doc):
183 | # (Sparse)MatrixSimilarity override this method so that they both use the
184 | # same __getitem__ method, defined below
185 | raise NotImplementedError("cannot instantiate Abstract Base Class")
186 |
187 |
188 | def __getitem__(self, query):
189 | """Get similarities of document `query` to all documents in the corpus.
190 |
191 | **or**
192 |
193 | If `query` is a corpus (iterable of documents), return a matrix of similarities
194 | of all query documents vs. all corpus documents. Using this type of batch
195 | query is more efficient than computing the similarities one document after
196 | another.
197 | """
198 | is_corpus, query = utils.is_corpus(query)
199 | if self.normalize:
200 | # self.normalize only works if the input is a plain gensim vector/corpus (as
201 | # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
202 | # as well, but in that case assume tricks are happening and don't normalize
203 | # anything (self.normalize has no effect).
204 | if matutils.ismatrix(query):
205 | import warnings
206 | # warnings.warn("non-gensim input must already come normalized")
207 | else:
208 | if is_corpus:
209 | query = [matutils.unitvec(v) for v in query]
210 | else:
211 | query = matutils.unitvec(query)
212 | result = self.get_similarities(query)
213 |
214 | if self.num_best is None:
215 | return result
216 |
217 | # if the input query was a corpus (=more documents), compute the top-n
218 | # most similar for each document in turn
219 | if matutils.ismatrix(result):
220 | return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
221 | else:
222 | # otherwise, return top-n of the single input document
223 | return matutils.full2sparse_clipped(result, self.num_best)
224 |
225 |
226 | def __iter__(self):
227 | """
228 | For each index document, compute cosine similarity against all other
229 | documents in the index and yield the result.
230 | """
231 | # turn off query normalization (vectors in the index are assumed to be already normalized)
232 | norm = self.normalize
233 | self.normalize = False
234 |
235 | # Try to compute similarities in bigger chunks of documents (not
236 | # one query = a single document after another). The point is, a
237 | # bigger query of N documents is faster than N small queries of one
238 | # document.
239 | #
240 | # After computing similarities of the bigger query in `self[chunk]`,
241 | # yield the resulting similarities one after another, so that it looks
242 | # exactly the same as if they had been computed with many small queries.
243 | try:
244 | chunking = self.chunksize > 1
245 | except AttributeError:
246 | # chunking not supported; fall back to the (slower) mode of 1 query=1 document
247 | chunking = False
248 | if chunking:
249 | # assumes `self.corpus` holds the index as a 2-d numpy array.
250 | # this is true for MatrixSimilarity and SparseMatrixSimilarity, but
251 | # may not be true for other (future) classes..?
252 | for chunk_start in xrange(0, self.index.shape[0], self.chunksize):
253 | # scipy.sparse doesn't allow slicing beyond real size of the matrix
254 | # (unlike numpy). so, clip the end of the chunk explicitly to make
255 | # scipy.sparse happy
256 | chunk_end = min(self.index.shape[0], chunk_start + self.chunksize)
257 | chunk = self.index[chunk_start : chunk_end]
258 | if chunk.shape[0] > 1:
259 | for sim in self[chunk]:
260 | yield sim
261 | else:
262 | yield self[chunk]
263 | else:
264 | for doc in self.index:
265 | yield self[doc]
266 |
267 | # restore old normalization value
268 | self.normalize = norm
269 | #endclass SimilarityABC
270 |
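
A hedged sketch of the query interface defined by `SimilarityABC`, using `MatrixSimilarity` from `gensim.similarities` (the toy corpus and the `num_features` value are assumptions): a single sparse query returns one row of cosine similarities, and iterating over the index queries each indexed document in turn, in chunks where possible.

    from gensim.similarities import MatrixSimilarity

    corpus = [[(0, 1.0), (1, 1.0)],
              [(1, 1.0), (2, 1.0)],
              [(0, 1.0), (2, 1.0)]]
    index = MatrixSimilarity(corpus, num_features=3)

    print(index[[(0, 1.0)]])   # similarities of one query document to all three
    for sims in index:         # each indexed document queried against the whole index
        print(sims)
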
--------------------------------------------------------------------------------
/gensim/interfaces.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/interfaces.pyc
--------------------------------------------------------------------------------
/gensim/matutils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/matutils.pyc
--------------------------------------------------------------------------------
/gensim/models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This package contains algorithms for extracting document representations from their raw
3 | bag-of-word counts.
4 | """
5 |
6 | # bring model classes directly into package namespace, to save some typing
7 | from .hdpmodel import HdpModel
8 | from .ldamodel import LdaModel
9 | from .ldamallet import LdaMallet
10 | from .lsimodel import LsiModel
11 | from .tfidfmodel import TfidfModel
12 | from .rpmodel import RpModel
13 | from .logentropy_model import LogEntropyModel
14 | from .word2vec import Word2Vec
15 |
16 | from gensim import interfaces, utils
17 |
18 |
19 | class VocabTransform(interfaces.TransformationABC):
20 | """
21 | Remap feature ids to new values.
22 |
23 | Given a mapping between old ids and new ids (some old ids may be missing = these
24 | features are to be discarded), this will wrap a corpus so that iterating over
25 | `VocabTransform[corpus]` returns the same vectors but with the new ids.
26 |
27 | Old features that have no counterpart in the new ids are discarded. This
28 | can be used to filter vocabulary of a corpus "online"::
29 |
30 | >>> old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep))
31 | >>> vt = VocabTransform(old2new)
32 | >>> for vec_with_new_ids in vt[corpus_with_old_ids]:
33 | >>> ...
34 |
35 | """
36 | def __init__(self, old2new, id2token=None):
37 | # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems())
38 | self.old2new = old2new
39 | self.id2token = id2token
40 |
41 |
42 | def __getitem__(self, bow):
43 | """
44 | Return representation with the ids transformed.
45 | """
46 | # if the input vector is in fact a corpus, return a transformed corpus as a result
47 | is_corpus, bow = utils.is_corpus(bow)
48 | if is_corpus:
49 | return self._apply(bow)
50 |
51 | return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
52 | #endclass VocabTransform
53 |
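
A small runnable sketch of the remapping described in the docstring (the toy corpus and the kept ids are illustrative): old ids 5 and 7 become 0 and 1, while id 0, absent from `old2new`, is silently dropped.

    from gensim.models import VocabTransform

    corpus_with_old_ids = [[(0, 1), (5, 2), (7, 1)], [(5, 3)]]
    ids_you_want_to_keep = [5, 7]

    old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep))
    vt = VocabTransform(old2new)

    print(list(vt[corpus_with_old_ids]))   # -> [[(0, 2), (1, 1)], [(0, 3)]]
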
--------------------------------------------------------------------------------
/gensim/models/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/__init__.pyc
--------------------------------------------------------------------------------
/gensim/models/hdpmodel.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/hdpmodel.pyc
--------------------------------------------------------------------------------
/gensim/models/lda_dispatcher.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 | """
8 | USAGE: %(program)s SIZE_OF_JOBS_QUEUE
9 |
10 | Dispatcher process which orchestrates distributed LDA computations. Run this \
11 | script only once, on any node in your cluster.
12 |
13 | Example: python -m gensim.models.lda_dispatcher
14 | """
15 |
16 |
17 | from __future__ import with_statement
18 | import os, sys, logging, threading, time
19 | from Queue import Queue
20 |
21 | from gensim import utils
22 |
23 |
24 | logger = logging.getLogger("gensim.models.lda_dispatcher")
25 |
26 |
27 | # How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue?
28 | # A small number is usually enough, unless iteration over the corpus is very very
29 | # slow (slower than the actual computation of LDA), in which case you can override
30 | # this value from command line. ie. run "python ./lda_dispatcher.py 100"
31 | MAX_JOBS_QUEUE = 10
32 |
33 | # timeout for the Queue object put/get blocking methods.
34 | # it should theoretically be infinity, but then keyboard interrupts don't work.
35 | # so this is really just a hack, see http://bugs.python.org/issue1360
36 | HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year
37 |
38 |
39 |
40 | class Dispatcher(object):
41 | """
42 | Dispatcher object that communicates and coordinates individual workers.
43 |
44 | There should never be more than one dispatcher running at any one time.
45 | """
46 |
47 | def __init__(self, maxsize=MAX_JOBS_QUEUE):
48 | """
49 | Note that the constructor does not fully initialize the dispatcher;
50 | use the `initialize()` function to populate it with workers etc.
51 | """
52 | self.maxsize = maxsize
53 | self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later)
54 |
55 |
56 | def initialize(self, **model_params):
57 | """
58 | `model_params` are parameters used to initialize individual workers (gets
59 | handed all the way down to `worker.initialize()`).
60 | """
61 | self.jobs = Queue(maxsize=self.maxsize)
62 | self.lock_update = threading.Lock()
63 | self._jobsdone = 0
64 | self._jobsreceived = 0
65 |
66 | # locate all available workers and store their proxies, for subsequent RMI calls
67 | self.workers = {}
68 | import Pyro4
69 | with utils.getNS() as ns:
70 | self.callback = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher') # = self
71 | self.callback._pyroOneway.add("jobdone") # make sure workers transfer control back to dispatcher asynchronously
72 | for name, uri in ns.list(prefix='gensim.lda_worker').iteritems():
73 | try:
74 | worker = Pyro4.Proxy(uri)
75 | workerid = len(self.workers)
76 | # make time consuming methods work asynchronously
77 | worker._pyroOneway.add("requestjob")
78 | worker._pyroOneway.add("exit")
79 | logger.info("registering worker #%i at %s" % (workerid, uri))
80 | worker.initialize(workerid, dispatcher=self.callback, **model_params)
81 | self.workers[workerid] = worker
82 | except Pyro4.errors.PyroError:
83 | logger.warning("unresponsive worker at %s, deleting it from the name server" % uri)
84 | ns.remove(name)
85 |
86 | if not self.workers:
87 | raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!')
88 |
89 |
90 | def getworkers(self):
91 | """
92 | Return pyro URIs of all registered workers.
93 | """
94 | return [worker._pyroUri for worker in self.workers.itervalues()]
95 |
96 |
97 | def getjob(self, worker_id):
98 | logger.info("worker #%i requesting a new job" % worker_id)
99 | job = self.jobs.get(block=True, timeout=1)
100 | logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize()))
101 | return job
102 |
103 |
104 | def putjob(self, job):
105 | self._jobsreceived += 1
106 | self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT)
107 | logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize())
108 |
109 |
110 | def getstate(self):
111 | """
112 | Merge states from across all workers and return the result.
113 | """
114 | logger.info("end of input, assigning all remaining jobs")
115 | logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived))
116 | while self._jobsdone < self._jobsreceived:
117 | time.sleep(0.5) # check every half a second
118 |
119 | logger.info("merging states from %i workers" % len(self.workers))
120 | workers = self.workers.values()
121 | result = workers[0].getstate()
122 | for worker in workers[1:]:
123 | result.merge(worker.getstate())
124 |
125 | logger.info("sending out merged state")
126 | return result
127 |
128 |
129 | def reset(self, state):
130 | """
131 | Initialize all workers for a new EM iteration.
132 | """
133 | for workerid, worker in self.workers.iteritems():
134 | logger.info("resetting worker %s" % workerid)
135 | worker.reset(state)
136 | worker.requestjob()
137 | self._jobsdone = 0
138 | self._jobsreceived = 0
139 |
140 |
141 | @utils.synchronous('lock_update')
142 | def jobdone(self, workerid):
143 | """
144 | A worker has finished its job. Log this event and then asynchronously
145 | transfer control back to the worker.
146 |
147 | In this way, control flow basically oscillates between `dispatcher.jobdone()`
148 | and `worker.requestjob()`.
149 | """
150 | self._jobsdone += 1
151 | logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone))
152 | self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way)
153 |
154 |
155 | def jobsdone(self):
156 | """Wrap self._jobsdone, needed for remote access through Pyro proxies"""
157 | return self._jobsdone
158 |
159 |
160 | def exit(self):
161 | """
162 | Terminate all registered workers and then the dispatcher.
163 | """
164 | for workerid, worker in self.workers.iteritems():
165 | logger.info("terminating worker %s" % workerid)
166 | worker.exit()
167 | logger.info("terminating dispatcher")
168 | os._exit(0) # exit the whole process (not just this thread ala sys.exit())
169 | #endclass Dispatcher
170 |
171 |
172 |
173 | def main():
174 | logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
175 | logger.info("running %s" % " ".join(sys.argv))
176 |
177 | program = os.path.basename(sys.argv[0])
178 | # make sure we have enough cmd line parameters
179 | if len(sys.argv) < 1:
180 | print(globals()["__doc__"] % locals())
181 | sys.exit(1)
182 |
183 | if len(sys.argv) < 2:
184 | maxsize = MAX_JOBS_QUEUE
185 | else:
186 | maxsize = int(sys.argv[1])
187 | utils.pyro_daemon('gensim.lda_dispatcher', Dispatcher(maxsize=maxsize))
188 |
189 | logger.info("finished running %s" % program)
190 |
191 |
192 |
193 | if __name__ == '__main__':
194 | main()
195 |
--------------------------------------------------------------------------------
/gensim/models/lda_worker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2011 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 | """
8 | USAGE: %(program)s
9 |
10 | Worker ("slave") process used in computing distributed LDA. Run this script \
11 | on every node in your cluster. If you wish, you may even run it multiple times \
12 | on a single machine, to make better use of multiple cores (just beware that \
13 | memory footprint increases accordingly).
14 |
15 | Example: python -m gensim.models.lda_worker
16 | """
17 |
18 |
19 | from __future__ import with_statement
20 | import os, sys, logging
21 | import threading
22 | import tempfile
23 | import Queue
24 |
25 | from gensim.models import ldamodel
26 | from gensim import utils
27 |
28 | logger = logging.getLogger('gensim.models.lda_worker')
29 |
30 |
31 | # periodically save intermediate models after every SAVE_DEBUG updates (0 for never)
32 | SAVE_DEBUG = 0
33 |
34 |
35 |
36 | class Worker(object):
37 | def __init__(self):
38 | self.model = None
39 |
40 |
41 | def initialize(self, myid, dispatcher, **model_params):
42 | self.lock_update = threading.Lock()
43 | self.jobsdone = 0 # how many jobs has this worker completed?
44 | self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
45 | self.dispatcher = dispatcher
46 | self.finished = False
47 | logger.info("initializing worker #%s" % myid)
48 | self.model = ldamodel.LdaModel(**model_params)
49 |
50 |
51 | def requestjob(self):
52 | """
53 | Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called.
54 | """
55 | if self.model is None:
56 | raise RuntimeError("worker must be initialized before receiving jobs")
57 |
58 | job = None
59 | while job is None and not self.finished:
60 | try:
61 | job = self.dispatcher.getjob(self.myid)
62 | except Queue.Empty:
63 | # no new job: try again, unless we're finished with all work
64 | continue
65 | if job is not None:
66 | logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone))
67 | self.processjob(job)
68 | self.dispatcher.jobdone(self.myid)
69 | else:
70 | logger.info("worker #%i stopping asking for jobs" % self.myid)
71 |
72 |
73 | @utils.synchronous('lock_update')
74 | def processjob(self, job):
75 | logger.debug("starting to process job #%i" % self.jobsdone)
76 | self.model.do_estep(job)
77 | self.jobsdone += 1
78 | if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0:
79 | fname = os.path.join(tempfile.gettempdir(), 'lda_worker.pkl')
80 | self.model.save(fname)
81 | logger.info("finished processing job #%i" % (self.jobsdone - 1))
82 |
83 |
84 | @utils.synchronous('lock_update')
85 | def getstate(self):
86 | logger.info("worker #%i returning its state after %s jobs" %
87 | (self.myid, self.jobsdone))
88 | result = self.model.state
89 | assert isinstance(result, ldamodel.LdaState)
90 | self.model.clear() # free up mem in-between two EM cycles
91 | self.finished = True
92 | return result
93 |
94 |
95 | @utils.synchronous('lock_update')
96 | def reset(self, state):
97 | assert state is not None
98 | logger.info("resetting worker #%i" % self.myid)
99 | self.model.state = state
100 | self.model.sync_state()
101 | self.model.state.reset()
102 | self.finished = False
103 |
104 |
105 | def exit(self):
106 | logger.info("terminating worker #%i" % self.myid)
107 | os._exit(0)
108 | #endclass Worker
109 |
110 |
111 |
112 | def main():
113 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
114 | logger.info("running %s" % " ".join(sys.argv))
115 |
116 | program = os.path.basename(sys.argv[0])
117 | # make sure we have enough cmd line parameters
118 | if len(sys.argv) < 1:
119 | print(globals()["__doc__"] % locals())
120 | sys.exit(1)
121 |
122 | utils.pyro_daemon('gensim.lda_worker', Worker(), random_suffix=True)
123 |
124 | logger.info("finished running %s" % program)
125 |
126 |
127 |
128 | if __name__ == '__main__':
129 | main()
130 |
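The worker above does nothing on its own; it only becomes useful once a Pyro4 name server, at least one worker and exactly one dispatcher are running, and a client then requests a distributed model. A rough sketch of that flow, assuming the usual Pyro4 tooling and hypothetical corpus paths (commands and option names may vary between versions):

# hypothetical cluster setup, one process per line:
#   python -m Pyro4.naming -n 0.0.0.0 &        # a single Pyro4 name server, reachable from all nodes
#   python -m gensim.models.lda_worker &       # one or more workers per node
#   python -m gensim.models.lda_dispatcher &   # exactly one dispatcher, on any node

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import ldamodel

corpus = MmCorpus('/tmp/corpus.mm')            # hypothetical paths
id2word = Dictionary.load('/tmp/corpus.dict')

# distributed=True makes LdaModel hand E-step jobs to the dispatcher, which
# farms them out to the registered lda_worker processes (Worker.processjob above).
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=100, distributed=True)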
--------------------------------------------------------------------------------
/gensim/models/ldamallet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2014 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | """
9 | Python wrapper for Latent Dirichlet Allocation (LDA) from MALLET, the Java topic modelling
10 | toolkit [1]_.
11 |
12 | This module allows both LDA model estimation from a training corpus and inference of topic
13 | distribution on new, unseen documents, using an (optimized version of) collapsed
14 | Gibbs sampling from MALLET.
15 |
16 | MALLET's LDA training requires O(#corpus_words) of memory, keeping the entire corpus in RAM.
17 | If you find yourself running out of memory, either decrease the `workers` constructor
18 | parameter, or use `LdaModel` which needs only O(1) memory.
19 |
20 | The wrapped model can NOT be updated with new documents for online training -- use gensim's `LdaModel` for that.
21 |
22 | Example:
23 |
24 | >>> model = gensim.models.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', corpus=my_corpus, num_topics=20, id2word=dictionary)
25 | >>> print model[my_vector] # print LDA topics of a document
26 |
27 | .. [1] http://mallet.cs.umass.edu/
28 |
29 | """
30 |
31 |
32 | import logging
33 | import random
34 | import tempfile
35 | import os
36 | from subprocess import call
37 |
38 | import numpy
39 |
40 | from gensim import utils
41 |
42 | logger = logging.getLogger('gensim.models.ldamallet')
43 |
44 |
45 | def read_doctopics(fname, eps=1e-6):
46 | """
47 | Yield document topic vectors from MALLET's "doc-topics" format, as sparse gensim vectors.
48 |
49 | """
50 | with utils.smart_open(fname) as fin:
51 | next(fin) # skip the header line
52 | for lineno, line in enumerate(fin):
53 | parts = line.split()[2:] # skip "doc" and "source" columns
54 | if len(parts) % 2 != 0:
55 | raise RuntimeError("invalid doc topics format at line %i in %s" % (lineno + 1, fname))
56 | doc = [(int(id), float(weight)) for id, weight in zip(parts[::2], parts[1::2]) if abs(float(weight)) > eps]
57 | # explicitly normalize probs to sum up to 1.0, just to be sure...
58 | weights = float(sum([weight for _, weight in doc]))
59 | yield [] if weights == 0 else sorted((id, 1.0 * weight / weights) for id, weight in doc)
60 |
61 |
62 |
63 | class LdaMallet(utils.SaveLoad):
64 | """
65 | Class for LDA training using MALLET. Communication between MALLET and Python
66 | takes place by passing around data files on disk and calling Java with subprocess.call().
67 |
68 | """
69 | def __init__(self, mallet_path, corpus=None, num_topics=100, id2word=None, workers=4, prefix=None,
70 | optimize_interval=0, iterations=1000):
71 | """
72 | `mallet_path` is path to the mallet executable, e.g. `/home/kofola/mallet-2.0.7/bin/mallet`.
73 | `corpus` is a gensim corpus, aka a stream of sparse document vectors.
74 |         `id2word` is a mapping between token ids and tokens.
75 | `workers` is the number of threads, for parallel training.
76 | `prefix` is the string prefix under which all data files will be stored; default: system temp + random filename prefix.
77 | `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception; 0 to switch off hyperparameter optimization).
78 | `iterations` is the number of sampling iterations.
79 |
80 | """
81 | self.mallet_path = mallet_path
82 | self.id2word = id2word
83 | if self.id2word is None:
84 | logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
85 | self.id2word = utils.dict_from_corpus(corpus)
86 | self.num_terms = len(self.id2word)
87 | else:
88 | self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
89 | if self.num_terms == 0:
90 | raise ValueError("cannot compute LDA over an empty collection (no terms)")
91 | self.num_topics = num_topics
92 | if prefix is None:
93 | rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
94 | prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
95 | self.prefix = prefix
96 | self.workers = workers
97 | self.optimize_interval = optimize_interval
98 | self.iterations = iterations
99 |
100 | if corpus is not None:
101 | self.train(corpus)
102 |
103 | def finferencer(self):
104 | return self.prefix + 'inferencer.mallet'
105 |
106 | def ftopickeys(self):
107 | return self.prefix + 'topickeys.txt'
108 |
109 | def fstate(self):
110 | return self.prefix + 'state.mallet.gz'
111 |
112 | def fdoctopics(self):
113 | return self.prefix + 'doctopics.txt'
114 |
115 | def fcorpustxt(self):
116 | return self.prefix + 'corpus.txt'
117 |
118 | def fcorpusmallet(self):
119 | return self.prefix + 'corpus.mallet'
120 |
121 | def fwordweights(self):
122 | return self.prefix + 'wordweights.txt'
123 |
124 | def convert_input(self, corpus, infer=False):
125 | """
126 |         Serialize the corpus (an iterable of bag-of-words vectors) to a temporary plain-text file,
127 |         then convert that text file to MALLET's internal binary format.
128 |
129 | """
130 | logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
131 | # write out the corpus in a file format that MALLET understands: one document per line:
132 | # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
133 | with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
134 | for docno, doc in enumerate(corpus):
135 | if self.id2word:
136 | tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
137 | else:
138 | tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
139 | fout.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))
140 |
141 | # convert the text file above into MALLET's internal format
142 | cmd = self.mallet_path + " import-file --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s"
143 | if infer:
144 | cmd += ' --use-pipe-from ' + self.fcorpusmallet()
145 | cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
146 | else:
147 | cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
148 | logger.info("converting temporary corpus to MALLET format with %s" % cmd)
149 | call(cmd, shell=True)
150 |
151 |
152 | def train(self, corpus):
153 | self.convert_input(corpus, infer=False)
154 | cmd = self.mallet_path + " train-topics --input %s --num-topics %s --optimize-interval %s "\
155 | "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s "\
156 | "--num-iterations %s --inferencer-filename %s"
157 | cmd = cmd % (self.fcorpusmallet(), self.num_topics, self.optimize_interval, self.workers,
158 | self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer())
159 | # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
160 | logger.info("training MALLET LDA with %s" % cmd)
161 | call(cmd, shell=True)
162 | self.word_topics = self.load_word_topics()
163 |
164 |
165 | def __getitem__(self, bow, iterations=100):
166 | is_corpus, corpus = utils.is_corpus(bow)
167 | if not is_corpus:
168 | # query is a single document => make a corpus out of it
169 | bow = [bow]
170 |
171 | self.convert_input(bow, infer=True)
172 | cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
173 | cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations)
174 | logger.info("inferring topics with MALLET LDA '%s'" % cmd)
175 | retval = call(cmd, shell=True)
176 | if retval != 0:
177 | raise RuntimeError("MALLET failed with error %s on return" % retval)
178 | result = list(read_doctopics(self.fdoctopics() + '.infer'))
179 | return result if is_corpus else result[0]
180 |
181 |
182 | def load_word_topics(self):
183 | logger.info("loading assigned topics from %s" % self.fstate())
184 | wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
185 | with utils.smart_open(self.fstate()) as fin:
186 | _ = next(fin) # header
187 | self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
188 | assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
189 | _ = next(fin) # beta
190 | for lineno, line in enumerate(fin):
191 | line = utils.to_unicode(line)
192 | doc, source, pos, typeindex, token, topic = line.split()
193 | tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
194 | wordtopics[int(topic), tokenid] += 1
195 | logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
196 | self.wordtopics = wordtopics
197 | self.print_topics(15)
198 |
199 |
200 | def print_topics(self, topics=10, topn=10):
201 | return self.show_topics(topics, topn, log=True)
202 |
203 |
204 | def show_topics(self, topics=10, topn=10, log=False, formatted=True):
205 | """
206 |         Print the `topn` most probable words for `topics` number of topics.
207 | Set `topics=-1` to print all topics.
208 |
209 |         Set `formatted=True` to return the topics as a list of formatted strings, or `formatted=False` to return them as lists of (weight, word) pairs.
210 |
211 | """
212 | if topics < 0 or topics >= self.num_topics:
213 | topics = self.num_topics
214 | chosen_topics = range(topics)
215 | else:
216 | topics = min(topics, self.num_topics)
217 | sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) # add a little random jitter, to randomize results around the same alpha
218 | sorted_topics = list(numpy.argsort(sort_alpha))
219 | chosen_topics = sorted_topics[ : topics/2] + sorted_topics[-topics/2 : ]
220 | shown = []
221 | for i in chosen_topics:
222 | if formatted:
223 | topic = self.print_topic(i, topn=topn)
224 | else:
225 | topic = self.show_topic(i, topn=topn)
226 | shown.append(topic)
227 | if log:
228 | logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i], topic))
229 | return shown
230 |
231 |
232 | def show_topic(self, topicid, topn=10):
233 | topic = self.wordtopics[topicid]
234 | topic = topic / topic.sum() # normalize to probability dist
235 | bestn = numpy.argsort(topic)[::-1][:topn]
236 | beststr = [(topic[id], self.id2word[id]) for id in bestn]
237 | return beststr
238 |
239 |
240 | def print_topic(self, topicid, topn=10):
241 | return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, topn)])
242 |
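A slightly fuller version of the docstring example above, covering both training and inference on an unseen document; the MALLET path and the toy texts are hypothetical:

from gensim import corpora, models

texts = [['human', 'computer', 'interaction'],
         ['graph', 'trees', 'computer'],
         ['graph', 'minors', 'trees']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

mallet_path = '/home/user/mallet-2.0.7/bin/mallet'  # wherever MALLET is installed

model = models.LdaMallet(mallet_path, corpus=corpus, num_topics=2,
                         id2word=dictionary, iterations=50)
print(model.show_topics(topics=2, topn=5, formatted=True))

new_bow = dictionary.doc2bow(['computer', 'graph'])
print(model[new_bow])  # sparse list of (topic_id, probability) pairs, via read_doctopics()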
--------------------------------------------------------------------------------
/gensim/models/ldamallet.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/ldamallet.pyc
--------------------------------------------------------------------------------
/gensim/models/ldamodel.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/ldamodel.pyc
--------------------------------------------------------------------------------
/gensim/models/logentropy_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
5 |
6 | import logging
7 | import math
8 | from gensim import interfaces, matutils, utils
9 |
10 |
11 | logger = logging.getLogger('gensim.models.logentropy_model')
12 |
13 |
14 | class LogEntropyModel(interfaces.TransformationABC):
15 | """
16 |     Objects of this class realize the transformation of a word-document
17 |     co-occurrence matrix (integers) into a locally/globally weighted matrix
18 | (positive floats).
19 |
20 | This is done by a log entropy normalization, optionally normalizing the
21 | resulting documents to unit length. The following formulas explain how
22 | to compute the log entropy weight for term `i` in document `j`::
23 |
24 | local_weight_{i,j} = log(frequency_{i,j} + 1)
25 |
26 | P_{i,j} = frequency_{i,j} / sum_j frequency_{i,j}
27 |
28 | sum_j P_{i,j} * log(P_{i,j})
29 | global_weight_i = 1 + ----------------------------
30 | log(number_of_documents + 1)
31 |
32 | final_weight_{i,j} = local_weight_{i,j} * global_weight_i
33 |
34 | The main methods are:
35 |
36 | 1. constructor, which calculates the global weighting for all terms in
37 | a corpus.
38 | 2. the [] method, which transforms a simple count representation into the
39 | log entropy normalized space.
40 |
41 | >>> log_ent = LogEntropyModel(corpus)
42 | >>> print(log_ent[some_doc])
43 | >>> log_ent.save('/tmp/foo.log_ent_model')
44 |
45 | Model persistency is achieved via its load/save methods.
46 | """
47 |
48 | def __init__(self, corpus, id2word=None, normalize=True):
49 | """
50 | `normalize` dictates whether the resulting vectors will be
51 | set to unit length.
52 | """
53 | self.normalize = normalize
54 | self.n_docs = 0
55 | self.n_words = 0
56 | self.entr = {}
57 | if corpus is not None:
58 | self.initialize(corpus)
59 |
60 | def __str__(self):
61 | return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs,
62 | self.n_words)
63 |
64 | def initialize(self, corpus):
65 | """
66 | Initialize internal statistics based on a training corpus. Called
67 | automatically from the constructor.
68 | """
69 | logger.info("calculating counts")
70 | glob_freq = {}
71 | glob_num_words, doc_no = 0, -1
72 | for doc_no, bow in enumerate(corpus):
73 | if doc_no % 10000 == 0:
74 | logger.info("PROGRESS: processing document #%i" % doc_no)
75 | glob_num_words += len(bow)
76 | for term_id, term_count in bow:
77 | glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count
78 |
79 | # keep some stats about the training corpus
80 | self.n_docs = doc_no + 1
81 | self.n_words = glob_num_words
82 |
83 | # and finally compute the global weights
84 | logger.info("calculating global log entropy weights for %i "
85 | "documents and %i features (%i matrix non-zeros)"
86 | % (self.n_docs, len(glob_freq), self.n_words))
87 | logger.debug('iterating over corpus')
88 | for doc_no2, bow in enumerate(corpus):
89 | for key, freq in bow:
90 | p = (float(freq) / glob_freq[key]) * math.log(float(freq) /
91 | glob_freq[key])
92 | self.entr[key] = self.entr.get(key, 0.0) + p
93 | if doc_no2 != doc_no:
94 | raise ValueError("LogEntropyModel doesn't support generators as training data")
95 |
96 | logger.debug('iterating over keys')
97 | for key in self.entr:
98 | self.entr[key] = 1 + self.entr[key] / math.log(self.n_docs + 1)
99 |
100 | def __getitem__(self, bow):
101 | """
102 | Return log entropy representation of the input vector and/or corpus.
103 | """
104 | # if the input vector is in fact a corpus, return a transformed corpus
105 | is_corpus, bow = utils.is_corpus(bow)
106 | if is_corpus:
107 | return self._apply(bow)
108 |
109 | # unknown (new) terms will be given zero weight (NOT infinity/huge)
110 | vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
111 | for term_id, tf in bow if term_id in self.entr]
112 | if self.normalize:
113 | vector = matutils.unitvec(vector)
114 | return vector
115 |
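To see the transformation in action on a tiny in-memory corpus (note that `initialize()` iterates over the corpus twice, so a plain list works but a one-shot generator does not). The toy term ids and counts below are hypothetical:

from gensim.models.logentropy_model import LogEntropyModel

# three documents over four term ids, in bag-of-words format
corpus = [[(0, 2), (1, 1)],
          [(0, 1), (2, 3)],
          [(1, 2), (3, 1)]]

log_ent = LogEntropyModel(corpus, normalize=True)
print(log_ent)                       # LogEntropyModel(n_docs=3, n_words=6)
print(log_ent[[(0, 1), (3, 2)]])     # log-entropy weights for a new document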
--------------------------------------------------------------------------------
/gensim/models/logentropy_model.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/logentropy_model.pyc
--------------------------------------------------------------------------------
/gensim/models/lsi_dispatcher.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 | """
8 | USAGE: %(program)s SIZE_OF_JOBS_QUEUE
9 |
10 | Dispatcher process which orchestrates distributed LSI computations. Run this \
11 | script only once, on any node in your cluster.
12 |
13 | Example: python -m gensim.models.lsi_dispatcher
14 | """
15 |
16 |
17 | from __future__ import with_statement
18 | import os, sys, logging, threading, time
19 | from Queue import Queue
20 |
21 | from gensim import utils
22 |
23 |
24 | logger = logging.getLogger("gensim.models.lsi_dispatcher")
25 |
26 |
27 | # How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue?
28 | # A small number is usually enough, unless iteration over the corpus is very very
29 | # slow (slower than the actual computation of LSI), in which case you can override
30 | # this value from the command line, e.g. run "python ./lsi_dispatcher.py 100"
31 | MAX_JOBS_QUEUE = 10
32 |
33 | # timeout for the Queue object put/get blocking methods.
34 | # it should really be infinity, but then keyboard interrupts don't work.
35 | # so this is really just a hack, see http://bugs.python.org/issue1360
36 | HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year
37 |
38 |
39 |
40 | class Dispatcher(object):
41 | """
42 | Dispatcher object that communicates and coordinates individual workers.
43 |
44 | There should never be more than one dispatcher running at any one time.
45 | """
46 |
47 | def __init__(self, maxsize=0):
48 | """
49 | Note that the constructor does not fully initialize the dispatcher;
50 | use the `initialize()` function to populate it with workers etc.
51 | """
52 | self.maxsize = maxsize
53 | self.workers = {}
54 | self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later)
55 |
56 |
57 | def initialize(self, **model_params):
58 | """
59 | `model_params` are parameters used to initialize individual workers (gets
60 | handed all the way down to worker.initialize()).
61 | """
62 | self.jobs = Queue(maxsize=self.maxsize)
63 | self.lock_update = threading.Lock()
64 | self._jobsdone = 0
65 | self._jobsreceived = 0
66 |
67 | # locate all available workers and store their proxies, for subsequent RMI calls
68 | self.workers = {}
69 | with utils.getNS() as ns:
70 | import Pyro4
71 | self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self
72 | self.callback._pyroOneway.add("jobdone") # make sure workers transfer control back to dispatcher asynchronously
73 | for name, uri in ns.list(prefix='gensim.lsi_worker').iteritems():
74 | try:
75 | worker = Pyro4.Proxy(uri)
76 | workerid = len(self.workers)
77 | # make time consuming methods work asynchronously
78 | worker._pyroOneway.add("requestjob")
79 | worker._pyroOneway.add("exit")
80 | logger.info("registering worker #%i from %s" % (workerid, uri))
81 | worker.initialize(workerid, dispatcher=self.callback, **model_params)
82 | self.workers[workerid] = worker
83 | except Pyro4.errors.PyroError:
84 | logger.exception("unresponsive worker at %s, deleting it from the name server" % uri)
85 | ns.remove(name)
86 |
87 | if not self.workers:
88 | raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!')
89 |
90 |
91 | def getworkers(self):
92 | """
93 | Return pyro URIs of all registered workers.
94 | """
95 | return [worker._pyroUri for worker in self.workers.itervalues()]
96 |
97 |
98 | def getjob(self, worker_id):
99 | logger.info("worker #%i requesting a new job" % worker_id)
100 | job = self.jobs.get(block=True, timeout=1)
101 | logger.info("worker #%i got a new job (%i left)" % (worker_id, self.jobs.qsize()))
102 | return job
103 |
104 |
105 | def putjob(self, job):
106 | self._jobsreceived += 1
107 | self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT)
108 | logger.info("added a new job (len(queue)=%i items)" % self.jobs.qsize())
109 |
110 |
111 | def getstate(self):
112 | """
113 | Merge projections from across all workers and return the final projection.
114 | """
115 | logger.info("end of input, assigning all remaining jobs")
116 | logger.debug("jobs done: %s, jobs received: %s" % (self._jobsdone, self._jobsreceived))
117 | while self._jobsdone < self._jobsreceived:
118 | time.sleep(0.5) # check every half a second
119 |
120 | # TODO: merge in parallel, so that we're done in `log_2(workers)` merges,
121 | # and not `workers - 1` merges!
122 | # but merging only takes place once, after all input data has been processed,
123 | # so the overall effect would be small... compared to the amount of coding :-)
124 | logger.info("merging states from %i workers" % len(self.workers))
125 | workers = self.workers.items()
126 | result = workers[0][1].getstate()
127 | for workerid, worker in workers[1:]:
128 | logger.info("pulling state from worker %s" % workerid)
129 | result.merge(worker.getstate())
130 | logger.info("sending out merged projection")
131 | return result
132 |
133 |
134 | def reset(self):
135 | """
136 | Initialize all workers for a new decomposition.
137 | """
138 | for workerid, worker in self.workers.iteritems():
139 | logger.info("resetting worker %s" % workerid)
140 | worker.reset()
141 | worker.requestjob()
142 | self._jobsdone = 0
143 | self._jobsreceived = 0
144 |
145 |
146 | @utils.synchronous('lock_update')
147 | def jobdone(self, workerid):
148 | """
149 | A worker has finished its job. Log this event and then asynchronously
150 | transfer control back to the worker.
151 |
152 | In this way, control flow basically oscillates between dispatcher.jobdone()
153 |         and worker.requestjob().
154 | """
155 | self._jobsdone += 1
156 | logger.info("worker #%s finished job #%i" % (workerid, self._jobsdone))
157 | worker = self.workers[workerid]
158 | worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way)
159 |
160 |
161 | def jobsdone(self):
162 | """Wrap self._jobsdone, needed for remote access through proxies"""
163 | return self._jobsdone
164 |
165 |
166 | def exit(self):
167 | """
168 | Terminate all registered workers and then the dispatcher.
169 | """
170 | for workerid, worker in self.workers.iteritems():
171 | logger.info("terminating worker %s" % workerid)
172 | worker.exit()
173 | logger.info("terminating dispatcher")
174 | os._exit(0) # exit the whole process (not just this thread ala sys.exit())
175 | #endclass Dispatcher
176 |
177 |
178 |
179 | def main():
180 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
181 | logger.info("running %s" % " ".join(sys.argv))
182 |
183 | program = os.path.basename(sys.argv[0])
184 | # make sure we have enough cmd line parameters
185 | if len(sys.argv) < 1:
186 | print(globals()["__doc__"] % locals())
187 | sys.exit(1)
188 |
189 | if len(sys.argv) < 2:
190 | maxsize = MAX_JOBS_QUEUE
191 | else:
192 | maxsize = int(sys.argv[1])
193 | utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=maxsize))
194 |
195 | logger.info("finished running %s" % program)
196 |
197 |
198 |
199 | if __name__ == '__main__':
200 | main()
201 |
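The distributed LSI flow mirrors the LDA one: start a Pyro4 name server, start `lsi_worker` on every node, start exactly one `lsi_dispatcher` (optionally passing a larger job-queue size, as the comment above describes), then build the model with `distributed=True`. A rough sketch with hypothetical paths; exact commands may differ between versions:

# hypothetical cluster setup:
#   python -m Pyro4.naming -n 0.0.0.0 &           # name server
#   python -m gensim.models.lsi_worker &          # on every node, possibly several times
#   python -m gensim.models.lsi_dispatcher 100 &  # one dispatcher; 100 overrides MAX_JOBS_QUEUE

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import lsimodel

corpus = MmCorpus('/tmp/corpus.mm')               # hypothetical paths
id2word = Dictionary.load('/tmp/corpus.dict')

# add_documents() jobs are routed through Dispatcher.putjob()/getjob() above;
# the per-worker projections are merged back in Dispatcher.getstate().
lsi = lsimodel.LsiModel(corpus=corpus, id2word=id2word, num_topics=200, distributed=True)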
--------------------------------------------------------------------------------
/gensim/models/lsi_worker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 | """
8 | USAGE: %(program)s
9 |
10 | Worker ("slave") process used in computing distributed LSI. Run this script \
11 | on every node in your cluster. If you wish, you may even run it multiple times \
12 | on a single machine, to make better use of multiple cores (just beware that \
13 | memory footprint increases accordingly).
14 |
15 | Example: python -m gensim.models.lsi_worker
16 | """
17 |
18 |
19 | from __future__ import with_statement
20 | import os, sys, logging
21 | import threading
22 | import tempfile
23 | import Queue
24 |
25 | from gensim.models import lsimodel
26 | from gensim import utils
27 |
28 | logger = logging.getLogger('gensim.models.lsi_worker')
29 |
30 |
31 | SAVE_DEBUG = 0 # save intermediate models after every SAVE_DEBUG updates (0 for never)
32 |
33 |
34 |
35 | class Worker(object):
36 | def __init__(self):
37 | self.model = None
38 |
39 |
40 | def initialize(self, myid, dispatcher, **model_params):
41 | self.lock_update = threading.Lock()
42 | self.jobsdone = 0 # how many jobs has this worker completed?
43 | self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
44 | self.dispatcher = dispatcher
45 | self.finished = False
46 | logger.info("initializing worker #%s" % myid)
47 | self.model = lsimodel.LsiModel(**model_params)
48 |
49 |
50 | def requestjob(self):
51 | """
52 | Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called.
53 | """
54 | if self.model is None:
55 | raise RuntimeError("worker must be initialized before receiving jobs")
56 |
57 | job = None
58 | while job is None and not self.finished:
59 | try:
60 | job = self.dispatcher.getjob(self.myid)
61 | except Queue.Empty:
62 | # no new job: try again, unless we're finished with all work
63 | continue
64 | if job is not None:
65 | logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone))
66 | self.processjob(job)
67 | self.dispatcher.jobdone(self.myid)
68 | else:
69 | logger.info("worker #%i stopping asking for jobs" % self.myid)
70 |
71 |
72 | @utils.synchronous('lock_update')
73 | def processjob(self, job):
74 | self.model.add_documents(job)
75 | self.jobsdone += 1
76 | if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0:
77 | fname = os.path.join(tempfile.gettempdir(), 'lsi_worker.pkl')
78 | self.model.save(fname)
79 |
80 |
81 | @utils.synchronous('lock_update')
82 | def getstate(self):
83 | logger.info("worker #%i returning its state after %s jobs" %
84 | (self.myid, self.jobsdone))
85 | assert isinstance(self.model.projection, lsimodel.Projection)
86 | self.finished = True
87 | return self.model.projection
88 |
89 |
90 | @utils.synchronous('lock_update')
91 | def reset(self):
92 | logger.info("resetting worker #%i" % self.myid)
93 | self.model.projection = self.model.projection.empty_like()
94 | self.finished = False
95 |
96 |
97 | def exit(self):
98 | logger.info("terminating worker #%i" % self.myid)
99 | os._exit(0)
100 | #endclass Worker
101 |
102 |
103 |
104 | def main():
105 | logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
106 | logger.info("running %s" % " ".join(sys.argv))
107 |
108 | program = os.path.basename(sys.argv[0])
109 | # make sure we have enough cmd line parameters
110 | if len(sys.argv) < 1:
111 | print(globals()["__doc__"] % locals())
112 | sys.exit(1)
113 |
114 | utils.pyro_daemon('gensim.lsi_worker', Worker(), random_suffix=True)
115 |
116 | logger.info("finished running %s" % program)
117 |
118 |
119 |
120 | if __name__ == '__main__':
121 | main()
122 |
--------------------------------------------------------------------------------
/gensim/models/lsimodel.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/lsimodel.pyc
--------------------------------------------------------------------------------
/gensim/models/rpmodel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | import logging
9 | import itertools
10 |
11 | import numpy
12 | import scipy
13 |
14 | from gensim import interfaces, matutils, utils
15 |
16 |
17 | logger = logging.getLogger('gensim.models.rpmodel')
18 |
19 |
20 | class RpModel(interfaces.TransformationABC):
21 | """
22 | Objects of this class allow building and maintaining a model for Random Projections
23 | (also known as Random Indexing). For theoretical background on RP, see:
24 |
25 | Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis."
26 |
27 | The main methods are:
28 |
29 | 1. constructor, which creates the random projection matrix
30 | 2. the [] method, which transforms a simple count representation into the TfIdf
31 | space.
32 |
33 | >>> rp = RpModel(corpus)
34 | >>> print(rp[some_doc])
35 | >>> rp.save('/tmp/foo.rp_model')
36 |
37 | Model persistency is achieved via its load/save methods.
38 | """
39 | def __init__(self, corpus, id2word=None, num_topics=300):
40 | """
41 | `id2word` is a mapping from word ids (integers) to words (strings). It is
42 | used to determine the vocabulary size, as well as for debugging and topic
43 | printing. If not set, it will be determined from the corpus.
44 | """
45 | self.id2word = id2word
46 | self.num_topics = num_topics
47 | if corpus is not None:
48 | self.initialize(corpus)
49 |
50 |
51 | def __str__(self):
52 | return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)
53 |
54 |
55 | def initialize(self, corpus):
56 | """
57 | Initialize the random projection matrix.
58 | """
59 | if self.id2word is None:
60 | logger.info("no word id mapping provided; initializing from corpus, assuming identity")
61 | self.id2word = utils.dict_from_corpus(corpus)
62 | self.num_terms = len(self.id2word)
63 | else:
64 | self.num_terms = 1 + max([-1] + self.id2word.keys())
65 |
66 | shape = self.num_topics, self.num_terms
67 | logger.info("constructing %s random matrix" % str(shape))
68 | # Now construct the projection matrix itself.
69 | # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
70 | # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
71 | randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape) # convert from 0/1 to +1/-1
72 | self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32) # convert from int32 to floats, for faster multiplications
73 |
74 |
75 | def __getitem__(self, bow):
76 | """
77 | Return RP representation of the input vector and/or corpus.
78 | """
79 | # if the input vector is in fact a corpus, return a transformed corpus as result
80 | is_corpus, bow = utils.is_corpus(bow)
81 | if is_corpus:
82 | return self._apply(bow)
83 |
84 | vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
85 | vec = numpy.asfortranarray(vec, dtype=numpy.float32)
86 | topic_dist = numpy.dot(self.projection, vec) # (k, d) * (d, 1) = (k, 1)
87 | return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
88 | if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)]
89 |
90 |
91 | def __setstate__(self, state):
92 | """
93 | This is a hack to work around a bug in numpy, where a FORTRAN-order array
94 | unpickled from disk segfaults on using it.
95 | """
96 | self.__dict__ = state
97 | if self.projection is not None:
98 | self.projection = self.projection.copy('F') # simply making a fresh copy fixes the broken array
99 | #endclass RpModel
100 |
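Stripped of the gensim plumbing, `initialize()` plus `__getitem__()` amount to multiplying a densified document vector, scaled by 1/sqrt(num_topics), with a random +1/-1 matrix. A self-contained numpy sketch of that step (toy sizes and values, not the gensim API):

import numpy

num_terms, num_topics = 6, 3
numpy.random.seed(0)  # only to make the sketch repeatable

# +1/-1 projection matrix, exactly as constructed in RpModel.initialize()
randmat = 1 - 2 * numpy.random.binomial(1, 0.5, (num_topics, num_terms))
projection = numpy.asfortranarray(randmat, dtype=numpy.float32)

# densify a sparse document [(term_id, weight), ...], as RpModel.__getitem__() does
bow = [(0, 1.0), (2, 2.0), (5, 1.0)]
vec = numpy.zeros((num_terms, 1), dtype=numpy.float32)
for termid, weight in bow:
    vec[termid] = weight
vec /= numpy.sqrt(num_topics)

topic_dist = numpy.dot(projection, vec)  # shape (num_topics, 1)
print(list(enumerate(topic_dist.flat)))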
--------------------------------------------------------------------------------
/gensim/models/rpmodel.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/rpmodel.pyc
--------------------------------------------------------------------------------
/gensim/models/tfidfmodel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2012 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 |
8 | import logging
9 | import math
10 |
11 | from gensim import interfaces, matutils, utils
12 | from six import iteritems
13 |
14 |
15 | logger = logging.getLogger('gensim.models.tfidfmodel')
16 |
17 |
18 | def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
19 | """
20 |     Compute the default inverse-document-frequency for a term with document frequency `docfreq`::
21 |
22 |       idf = add + log_{log_base}(totaldocs / docfreq)
23 | """
24 | return add + math.log(1.0 * totaldocs / docfreq, log_base)
25 |
26 |
27 | def precompute_idfs(wglobal, dfs, total_docs):
28 | """Precompute the inverse document frequency mapping for all terms."""
29 | # not strictly necessary and could be computed on the fly in TfidfModel__getitem__.
30 | # this method is here just to speed things up a little.
31 | return dict((termid, wglobal(df, total_docs))
32 | for termid, df in iteritems(dfs))
33 |
34 |
35 | class TfidfModel(interfaces.TransformationABC):
36 | """
37 |     Objects of this class realize the transformation of a word-document co-occurrence
38 |     matrix (integers) into a locally/globally weighted TF-IDF matrix (positive floats).
39 |
40 | The main methods are:
41 |
42 | 1. constructor, which calculates inverse document counts for all terms in the training corpus.
43 | 2. the [] method, which transforms a simple count representation into the TfIdf
44 | space.
45 |
46 | >>> tfidf = TfidfModel(corpus)
47 | >>> print(tfidf[some_doc])
48 | >>> tfidf.save('/tmp/foo.tfidf_model')
49 |
50 | Model persistency is achieved via its load/save methods.
51 | """
52 | def __init__(self, corpus=None, id2word=None, dictionary=None,
53 | wlocal=utils.identity, wglobal=df2idf, normalize=True):
54 | """
55 | Compute tf-idf by multiplying a local component (term frequency) with a
56 | global component (inverse document frequency), and normalizing
57 | the resulting documents to unit length. Formula for unnormalized weight
58 | of term `i` in document `j` in a corpus of D documents::
59 |
60 | weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})
61 |
62 | or, more generally::
63 |
64 | weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D)
65 |
66 | so you can plug in your own custom `wlocal` and `wglobal` functions.
67 |
68 | Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...)
69 | and default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the
70 | formula above.
71 |
72 | `normalize` dictates how the final transformed vectors will be normalized.
73 | `normalize=True` means set to unit length (default); `False` means don't
74 | normalize. You can also set `normalize` to your own function that accepts
75 | and returns a sparse vector.
76 |
77 | If `dictionary` is specified, it must be a `corpora.Dictionary` object
78 | and it will be used to directly construct the inverse document frequency
79 | mapping (then `corpus`, if specified, is ignored).
80 | """
81 | self.normalize = normalize
82 | self.id2word = id2word
83 | self.wlocal, self.wglobal = wlocal, wglobal
84 | self.num_docs, self.num_nnz, self.idfs = None, None, None
85 | if dictionary is not None:
86 | # user supplied a Dictionary object, which already contains all the
87 | # statistics we need to construct the IDF mapping. we can skip the
88 | # step that goes through the corpus (= an optimization).
89 | if corpus is not None:
90 | logger.warning("constructor received both corpus and explicit "
91 | "inverse document frequencies; ignoring the corpus")
92 | self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
93 | self.dfs = dictionary.dfs.copy()
94 | self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
95 | elif corpus is not None:
96 | self.initialize(corpus)
97 | else:
98 | # NOTE: everything is left uninitialized; presumably the model will
99 | # be initialized in some other way
100 | pass
101 |
102 |
103 | def __str__(self):
104 | return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)
105 |
106 |
107 | def initialize(self, corpus):
108 | """
109 | Compute inverse document weights, which will be used to modify term
110 | frequencies for documents.
111 | """
112 | logger.info("collecting document frequencies")
113 | dfs = {}
114 | numnnz, docno = 0, -1
115 | for docno, bow in enumerate(corpus):
116 | if docno % 10000 == 0:
117 | logger.info("PROGRESS: processing document #%i" % docno)
118 | numnnz += len(bow)
119 | for termid, _ in bow:
120 | dfs[termid] = dfs.get(termid, 0) + 1
121 |
122 | # keep some stats about the training corpus
123 | self.num_docs = docno + 1
124 | self.num_nnz = numnnz
125 | self.dfs = dfs
126 |
127 | # and finally compute the idf weights
128 | n_features = max(dfs) if dfs else 0
129 | logger.info("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" %
130 | (self.num_docs, n_features, self.num_nnz))
131 | self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
132 |
133 |
134 | def __getitem__(self, bow, eps=1e-12):
135 | """
136 | Return tf-idf representation of the input vector and/or corpus.
137 | """
138 | # if the input vector is in fact a corpus, return a transformed corpus as a result
139 | is_corpus, bow = utils.is_corpus(bow)
140 | if is_corpus:
141 | return self._apply(bow)
142 |
143 | # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
144 | # as strict application of the IDF formula would dictate)
145 | vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
146 | for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]
147 |
148 | # and finally, normalize the vector either to unit length, or use a
149 | # user-defined normalization function
150 | if self.normalize is True:
151 | vector = matutils.unitvec(vector)
152 | elif self.normalize:
153 | vector = self.normalize(vector)
154 |
155 | # make sure there are no explicit zeroes in the vector (must be sparse)
156 | vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
157 | return vector
158 | #endclass TfidfModel
159 |
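Because `wlocal` and `wglobal` are plain callables, the weighting scheme is easy to customize without touching the class. A small sketch on a hypothetical in-memory corpus, once with the defaults and once with a square-root (sublinear) term frequency:

import math
from gensim.models.tfidfmodel import TfidfModel, df2idf

corpus = [[(0, 3), (1, 1)],
          [(0, 1), (2, 2)],
          [(1, 4), (2, 1), (3, 1)]]

# default scheme: raw term frequency times log_2(D / document_freq), unit-length output
tfidf = TfidfModel(corpus)
print(tfidf[[(0, 2), (3, 1)]])

# same corpus, but dampen the local term frequency with a square root
tfidf_sqrt = TfidfModel(corpus, wlocal=math.sqrt, wglobal=df2idf, normalize=True)
print(tfidf_sqrt[[(0, 2), (3, 1)]])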
--------------------------------------------------------------------------------
/gensim/models/tfidfmodel.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/tfidfmodel.pyc
--------------------------------------------------------------------------------
/gensim/models/voidptr.h:
--------------------------------------------------------------------------------
1 | #include <Python.h>
2 |
3 | #if PY_VERSION_HEX >= 0x03020000
4 |
5 | /*
6 | ** compatibility with python >= 3.2, which doesn't have CObject anymore
7 | */
8 | static void * PyCObject_AsVoidPtr(PyObject *obj)
9 | {
10 | void *ret = PyCapsule_GetPointer(obj, NULL);
11 | if (ret == NULL) {
12 | PyErr_Clear();
13 | }
14 | return ret;
15 | }
16 |
17 | #endif
--------------------------------------------------------------------------------
/gensim/models/word2vec.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/models/word2vec.pyc
--------------------------------------------------------------------------------
/gensim/nosy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | A simple testrunner for nose (or anything else).
5 |
6 | Watch for changes in all file types specified in 'EXTENSIONS'.
7 | If changes, run test executable in 'EXECUTABLE', with default
8 | arguments 'DEFAULTARGS'.
9 |
10 | The --with-color option needs the "rudolf" nose plugin. See:
11 | http://pypi.python.org/pypi/rudolf/
12 |
13 | Originally by Jeff Winkler, http://jeffwinkler.net
14 | Forked from wkral http://github.com/wkral/Nosy
15 | """
16 |
17 | import os
18 | import stat
19 | import time
20 | import datetime
21 | import sys
22 | import fnmatch
23 |
24 |
25 | EXTENSIONS = ['*.py']
26 | EXECUTABLE = 'nosetests test/'
27 | DEFAULTARGS = '--with-color -exe'# -w tests'
28 |
29 |
30 | def checkSum():
31 | """
32 | Return a long which can be used to know if any .py files have changed.
33 | """
34 | val = 0
35 | for root, dirs, files in os.walk(os.getcwd()):
36 | for extension in EXTENSIONS:
37 | for f in fnmatch.filter(files, extension):
38 | stats = os.stat(os.path.join(root, f))
39 | val += stats[stat.ST_SIZE] + stats[stat.ST_MTIME]
40 | return val
41 |
42 | if __name__ == '__main__':
43 | val = 0
44 | try:
45 | while True:
46 | if checkSum() != val:
47 | val = checkSum()
48 | os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS,
49 | ' '.join(sys.argv[1:])))
50 | print(datetime.datetime.now().__str__())
51 | print('=' * 77)
52 | time.sleep(1)
53 | except KeyboardInterrupt:
54 | print('Goodbye')
55 |
--------------------------------------------------------------------------------
/gensim/parsing/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This package contains functions to preprocess raw text
3 | """
4 |
5 | # bring model classes directly into package namespace, to save some typing
6 | from .porter import PorterStemmer
7 | from .preprocessing import *
8 |
--------------------------------------------------------------------------------
/gensim/parsing/porter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Porter Stemming Algorithm
4 | This is the Porter stemming algorithm, ported to Python from the
5 | version coded up in ANSI C by the author. It may be regarded
6 | as canonical, in that it follows the algorithm presented in
7 |
8 | Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
9 | no. 3, pp 130-137,
10 |
11 | only differing from it at the points marked --DEPARTURE-- below.
12 |
13 | See also http://www.tartarus.org/~martin/PorterStemmer
14 |
15 | The algorithm as described in the paper could be exactly replicated
16 | by adjusting the points of DEPARTURE, but this is barely necessary,
17 | because (a) the points of DEPARTURE are definitely improvements, and
18 | (b) no encoding of the Porter stemmer I have seen is anything like
19 | as exact as this version, even with the points of DEPARTURE!
20 |
21 | Vivake Gupta (v@nano.com)
22 |
23 | Release 1: January 2001
24 |
25 | Further adjustments by Santiago Bruno (bananabruno@gmail.com)
26 | to allow word input not restricted to one word per line, leading
27 | to:
28 |
29 | Release 2: July 2008
30 |
31 | Optimizations and cleanup of the code by Lars Buitinck, July 2012.
32 | """
33 |
34 |
35 | from six.moves import xrange
36 |
37 |
38 | class PorterStemmer(object):
39 | def __init__(self):
40 | """The main part of the stemming algorithm starts here.
41 | b is a buffer holding a word to be stemmed. The letters are in b[0],
42 | b[1] ... ending at b[k]. k is readjusted downwards as the stemming
43 | progresses.
44 |
45 | Note that only lower case sequences are stemmed. Forcing to lower case
46 | should be done before stem(...) is called.
47 | """
48 |
49 | self.b = "" # buffer for word to be stemmed
50 | self.k = 0
51 | self.j = 0 # j is a general offset into the string
52 |
53 | def _cons(self, i):
54 | """True <=> b[i] is a consonant."""
55 | ch = self.b[i]
56 | if ch in "aeiou":
57 | return False
58 | if ch == 'y':
59 | return i == 0 or not self._cons(i - 1)
60 | return True
61 |
62 | def _m(self):
63 | """Returns the number of consonant sequences between 0 and j.
64 |
65 | If c is a consonant sequence and v a vowel sequence, and <..>
66 | indicates arbitrary presence,
67 |
68 |            <c><v>       gives 0
69 |            <c>vc<v>     gives 1
70 |            <c>vcvc<v>   gives 2
71 |            <c>vcvcvc<v> gives 3
72 | ....
73 | """
74 | i = 0
75 | while True:
76 | if i > self.j:
77 | return 0
78 | if not self._cons(i):
79 | break
80 | i += 1
81 | i += 1
82 | n = 0
83 | while True:
84 | while True:
85 | if i > self.j:
86 | return n
87 | if self._cons(i):
88 | break
89 | i += 1
90 | i += 1
91 | n += 1
92 | while 1:
93 | if i > self.j:
94 | return n
95 | if not self._cons(i):
96 | break
97 | i += 1
98 | i += 1
99 |
100 | def _vowelinstem(self):
101 | """True <=> 0,...j contains a vowel"""
102 | return not all(self._cons(i) for i in xrange(self.j + 1))
103 |
104 | def _doublec(self, j):
105 | """True <=> j,(j-1) contain a double consonant."""
106 | return j > 0 and self.b[j] == self.b[j-1] and self._cons(j)
107 |
108 | def _cvc(self, i):
109 | """True <=> i-2,i-1,i has the form consonant - vowel - consonant
110 | and also if the second c is not w,x or y. This is used when trying to
111 | restore an e at the end of a short word, e.g.
112 |
113 | cav(e), lov(e), hop(e), crim(e), but
114 | snow, box, tray.
115 | """
116 | if i < 2 or not self._cons(i) or self._cons(i-1) or not self._cons(i-2):
117 | return False
118 | return self.b[i] not in "wxy"
119 |
120 | def _ends(self, s):
121 | """True <=> 0,...k ends with the string s."""
122 | if s[-1] != self.b[self.k]: # tiny speed-up
123 | return 0
124 | length = len(s)
125 | if length > (self.k + 1):
126 | return 0
127 | if self.b[self.k-length+1:self.k+1] != s:
128 | return 0
129 | self.j = self.k - length
130 | return 1
131 |
132 | def _setto(self, s):
133 | """Set (j+1),...k to the characters in the string s, adjusting k."""
134 | self.b = self.b[:self.j+1] + s
135 | self.k = len(self.b) - 1
136 |
137 | def _r(self, s):
138 | if self._m() > 0:
139 | self._setto(s)
140 |
141 | def _step1ab(self):
142 | """Get rid of plurals and -ed or -ing. E.g.,
143 |
144 | caresses -> caress
145 | ponies -> poni
146 | ties -> ti
147 | caress -> caress
148 | cats -> cat
149 |
150 | feed -> feed
151 | agreed -> agree
152 | disabled -> disable
153 |
154 | matting -> mat
155 | mating -> mate
156 | meeting -> meet
157 | milling -> mill
158 | messing -> mess
159 |
160 | meetings -> meet
161 | """
162 | if self.b[self.k] == 's':
163 | if self._ends("sses"):
164 | self.k -= 2
165 | elif self._ends("ies"):
166 | self._setto("i")
167 | elif self.b[self.k - 1] != 's':
168 | self.k -= 1
169 | if self._ends("eed"):
170 | if self._m() > 0:
171 | self.k -= 1
172 | elif (self._ends("ed") or self._ends("ing")) and self._vowelinstem():
173 | self.k = self.j
174 | if self._ends("at"): self._setto("ate")
175 | elif self._ends("bl"): self._setto("ble")
176 | elif self._ends("iz"): self._setto("ize")
177 | elif self._doublec(self.k):
178 | if self.b[self.k - 1] not in "lsz":
179 | self.k -= 1
180 | elif self._m() == 1 and self._cvc(self.k):
181 | self._setto("e")
182 |
183 | def _step1c(self):
184 | """Turn terminal y to i when there is another vowel in the stem."""
185 | if self._ends("y") and self._vowelinstem():
186 | self.b = self.b[:self.k] + 'i'
187 |
188 | def _step2(self):
189 | """Map double suffices to single ones.
190 |
191 | So, -ization ( = -ize plus -ation) maps to -ize etc. Note that the
192 | string before the suffix must give _m() > 0.
193 | """
194 | ch = self.b[self.k - 1]
195 | if ch == 'a':
196 | if self._ends("ational"): self._r("ate")
197 | elif self._ends("tional"): self._r("tion")
198 | elif ch == 'c':
199 | if self._ends("enci"): self._r("ence")
200 | elif self._ends("anci"): self._r("ance")
201 | elif ch == 'e':
202 | if self._ends("izer"): self._r("ize")
203 | elif ch == 'l':
204 | if self._ends("bli"): self._r("ble") # --DEPARTURE--
205 | # To match the published algorithm, replace this phrase with
206 | # if self._ends("abli"): self._r("able")
207 | elif self._ends("alli"): self._r("al")
208 | elif self._ends("entli"): self._r("ent")
209 | elif self._ends("eli"): self._r("e")
210 | elif self._ends("ousli"): self._r("ous")
211 | elif ch == 'o':
212 | if self._ends("ization"): self._r("ize")
213 | elif self._ends("ation"): self._r("ate")
214 | elif self._ends("ator"): self._r("ate")
215 | elif ch == 's':
216 | if self._ends("alism"): self._r("al")
217 | elif self._ends("iveness"): self._r("ive")
218 | elif self._ends("fulness"): self._r("ful")
219 | elif self._ends("ousness"): self._r("ous")
220 | elif ch == 't':
221 | if self._ends("aliti"): self._r("al")
222 | elif self._ends("iviti"): self._r("ive")
223 | elif self._ends("biliti"): self._r("ble")
224 | elif ch == 'g': # --DEPARTURE--
225 | if self._ends("logi"): self._r("log")
226 | # To match the published algorithm, delete this phrase
227 |
228 | def _step3(self):
229 | """Deal with -ic-, -full, -ness etc. Similar strategy to _step2."""
230 | ch = self.b[self.k]
231 | if ch == 'e':
232 | if self._ends("icate"): self._r("ic")
233 | elif self._ends("ative"): self._r("")
234 | elif self._ends("alize"): self._r("al")
235 | elif ch == 'i':
236 | if self._ends("iciti"): self._r("ic")
237 | elif ch == 'l':
238 | if self._ends("ical"): self._r("ic")
239 | elif self._ends("ful"): self._r("")
240 | elif ch == 's':
241 | if self._ends("ness"): self._r("")
242 |
243 | def _step4(self):
244 | """_step4() takes off -ant, -ence etc., in context vcvc."""
245 | ch = self.b[self.k - 1]
246 | if ch == 'a':
247 | if not self._ends("al"): return
248 | elif ch == 'c':
249 | if not self._ends("ance") and not self._ends("ence"): return
250 | elif ch == 'e':
251 | if not self._ends("er"): return
252 | elif ch == 'i':
253 | if not self._ends("ic"): return
254 | elif ch == 'l':
255 | if not self._ends("able") and not self._ends("ible"): return
256 | elif ch == 'n':
257 | if self._ends("ant"): pass
258 | elif self._ends("ement"): pass
259 | elif self._ends("ment"): pass
260 | elif self._ends("ent"): pass
261 | else: return
262 | elif ch == 'o':
263 | if self._ends("ion") and self.b[self.j] in "st": pass
264 | elif self._ends("ou"): pass
265 | # takes care of -ous
266 | else: return
267 | elif ch == 's':
268 | if not self._ends("ism"): return
269 | elif ch == 't':
270 | if not self._ends("ate") and not self._ends("iti"): return
271 | elif ch == 'u':
272 | if not self._ends("ous"): return
273 | elif ch == 'v':
274 | if not self._ends("ive"): return
275 | elif ch == 'z':
276 | if not self._ends("ize"): return
277 | else:
278 | return
279 | if self._m() > 1:
280 | self.k = self.j
281 |
282 | def _step5(self):
283 | """Remove a final -e if _m() > 1, and change -ll to -l if m() > 1.
284 | """
285 | k = self.j = self.k
286 | if self.b[k] == 'e':
287 | a = self._m()
288 | if a > 1 or (a == 1 and not self._cvc(k - 1)):
289 | self.k -= 1
290 | if self.b[self.k] == 'l' and self._doublec(self.k) and self._m() > 1:
291 | self.k -= 1
292 |
293 | def stem(self, w):
294 | """Stem the word w, return the stemmed form."""
295 | w = w.lower()
296 | k = len(w) - 1
297 | if k <= 1:
298 | return w # --DEPARTURE--
299 |
300 | # With this line, strings of length 1 or 2 don't go through the
301 | # stemming process, although no mention is made of this in the
302 | # published algorithm. Remove the line to match the published
303 | # algorithm.
304 |
305 | self.b = w
306 | self.k = k
307 |
308 | self._step1ab()
309 | self._step1c()
310 | self._step2()
311 | self._step3()
312 | self._step4()
313 | self._step5()
314 | return self.b[:self.k+1]
315 |
316 | def stem_sentence(self, txt):
317 | return " ".join(map(self.stem, txt.split()))
318 |
319 | def stem_documents(self, docs):
320 | return map(self.stem_sentence, docs)
321 |
322 |
323 | if __name__ == '__main__':
324 | import sys
325 |
326 | p = PorterStemmer()
327 |
328 | for f in sys.argv[1:]:
329 | with open(f) as infile:
330 | for line in infile:
331 | print(p.stem_sentence(line))
332 |
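A quick interactive check of the stemmer, reusing the example words from the `_step1ab()` docstring above (the sample sentence is hypothetical):

from gensim.parsing.porter import PorterStemmer

p = PorterStemmer()
for word in ['caresses', 'ponies', 'cats', 'agreed', 'meetings']:
    print('%s -> %s' % (word, p.stem(word)))
# caresses -> caress, ponies -> poni, cats -> cat, agreed -> agree, meetings -> meet

print(p.stem_sentence('relational databases are fascinating'))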
--------------------------------------------------------------------------------
/gensim/parsing/preprocessing.py:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 | import glob
4 |
5 | from gensim import utils
6 | from gensim.parsing.porter import PorterStemmer
7 |
8 |
9 | # improved list from Stone, Denis, Kwantes (2010)
10 | STOPWORDS = """
11 | a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be
12 | became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can
13 | cannot cant co computer con could couldnt cry de describe
14 | detail did do does doesn doing don done down due during
15 | each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen
16 | fify fill find fire first five for former formerly forty found four from front full further get give go
17 | had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie
18 | if in inc indeed interest into is it its itself keep last latter latterly least less ltd
19 | just
20 | kg km
21 | made many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely
22 | neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off
23 | often on once one only onto or other others otherwise our ours ourselves out over own part per
24 | perhaps please put rather re
25 | quite
26 | rather really regarding
27 | same see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten
28 | than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under
29 | until up unless upon us used using
30 | various very very via
31 | was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you
32 | your yours yourself yourselves
33 | """
34 | STOPWORDS = frozenset(w for w in STOPWORDS.split() if w)
35 |
36 |
37 | def remove_stopwords(s):
38 | s = utils.to_unicode(s)
39 | return " ".join(w for w in s.split() if w not in STOPWORDS)
40 |
41 |
42 | RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
43 | def strip_punctuation(s):
44 | s = utils.to_unicode(s)
45 | return RE_PUNCT.sub(" ", s)
46 |
47 |
48 | # unicode.translate cannot delete characters like str can
49 | strip_punctuation2 = strip_punctuation
50 | # def strip_punctuation2(s):
51 | # s = utils.to_unicode(s)
52 | # return s.translate(None, string.punctuation)
53 |
54 |
55 | RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE)
56 | def strip_tags(s):
57 | s = utils.to_unicode(s)
58 | return RE_TAGS.sub("",s)
59 |
60 |
61 | def strip_short(s, minsize=3):
62 | s = utils.to_unicode(s)
63 | return " ".join(e for e in s.split() if len(e) >= minsize)
64 |
65 |
66 | RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
67 | def strip_numeric(s):
68 | s = utils.to_unicode(s)
69 | return RE_NUMERIC.sub("", s)
70 |
71 |
72 | RE_NONALPHA = re.compile(r"\W", re.UNICODE)
73 | def strip_non_alphanum(s):
74 | s = utils.to_unicode(s)
75 | return RE_NONALPHA.sub(" ", s)
76 |
77 |
78 | RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)
79 | def strip_multiple_whitespaces(s):
80 | s = utils.to_unicode(s)
81 | return RE_WHITESPACE.sub(" ", s)
82 |
83 |
84 | RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE)
85 | RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE)
86 | def split_alphanum(s):
87 | s = utils.to_unicode(s)
88 | s = RE_AL_NUM.sub(r"\1 \2", s)
89 | return RE_NUM_AL.sub(r"\1 \2", s)
90 |
91 |
92 | def stem_text(text):
93 | """
94 | Return lowercase and (porter-)stemmed version of string `text`.
95 | """
96 | text = utils.to_unicode(text)
97 | p = PorterStemmer()
98 | return ' '.join(p.stem(word) for word in text.split())
99 | stem = stem_text
100 |
101 | DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces,
102 | strip_numeric, remove_stopwords, strip_short, stem_text]
103 |
104 |
105 | def preprocess_string(s, filters=DEFAULT_FILTERS):
106 | s = utils.to_unicode(s)
107 | for f in filters:
108 | s = f(s)
109 | return s.split()
110 |
111 |
112 | def preprocess_documents(docs):
113 | return [preprocess_string(d) for d in docs]
114 |
115 |
116 | def read_file(path):
117 | with utils.smart_open(path) as fin:
118 | return fin.read()
119 |
120 |
121 | def read_files(pattern):
122 | return [read_file(fname) for fname in glob.glob(pattern)]
123 |
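
For reference, a minimal usage sketch of the filters defined above, assuming the module is importable as gensim.parsing.preprocessing (the sample sentences are made up):

# Minimal usage sketch for the preprocessing helpers above (assumption:
# this module is importable as gensim.parsing.preprocessing).
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords

# A single filter in isolation: drop the words listed in STOPWORDS.
print(remove_stopwords("the quick brown fox jumps over the lazy dog"))
# -> 'quick brown fox jumps lazy dog'

# The full DEFAULT_FILTERS pipeline: lowercase, strip tags/punctuation/numbers,
# drop stopwords and short tokens, Porter-stem, and return a token list.
print(preprocess_string("<b>Hello</b> World 42! The quick brown foxes were running."))
# e.g. ['hello', 'world', 'quick', 'brown', 'fox', 'run']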
--------------------------------------------------------------------------------
/gensim/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/scripts/__init__.py
--------------------------------------------------------------------------------
/gensim/scripts/make_wiki.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Copyright (C) 2012 Lars Buitinck
6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7 |
8 |
9 | """
10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
11 |
12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
13 | bz2-compressed dump of Wikipedia articles, in XML format.
14 |
15 | This actually creates three files:
16 |
17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
19 | Matrix Market format
20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
21 |
22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save
23 | disk space; gensim's corpus iterators can work with compressed input, too.
24 |
25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
26 | removing tokens that appear in more than 10%% of all documents). Defaults to
27 | 100,000.
28 |
29 | If you have the `pattern` package installed, this script will use a fancy
30 | lemmatization to get a lemma of each token (instead of plain alphabetic
31 | tokenizer). The package is available at https://github.com/clips/pattern .
32 |
33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
34 | """
35 |
36 |
37 | import logging
38 | import os.path
39 | import sys
40 |
41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
42 | from gensim.models import TfidfModel
43 |
44 |
45 | # Wiki is first scanned for all distinct word types (~7M). The types that
46 | # appear in more than 10% of articles are removed and from the rest, the
47 | # DEFAULT_DICT_SIZE most frequent types are kept.
48 | DEFAULT_DICT_SIZE = 100000
49 |
50 |
51 | if __name__ == '__main__':
52 | program = os.path.basename(sys.argv[0])
53 | logger = logging.getLogger(program)
54 |
55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
56 | logging.root.setLevel(level=logging.INFO)
57 | logger.info("running %s" % ' '.join(sys.argv))
58 |
59 | # check and process input arguments
60 | if len(sys.argv) < 3:
61 | print(globals()['__doc__'] % locals())
62 | sys.exit(1)
63 | inp, outp = sys.argv[1:3]
64 | if len(sys.argv) > 3:
65 | keep_words = int(sys.argv[3])
66 | else:
67 | keep_words = DEFAULT_DICT_SIZE
68 | online = 'online' in program
69 | lemmatize = 'lemma' in program
70 | debug = 'nodebug' not in program
71 |
72 | if online:
73 | dictionary = HashDictionary(id_range=keep_words, debug=debug)
74 | dictionary.allow_update = True # start collecting document frequencies
75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
79 | dictionary.save_as_text(outp + '_wordids.txt.bz2')
80 | wiki.save(outp + '_corpus.pkl.bz2')
81 | dictionary.allow_update = False
82 | else:
83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
84 | # only keep the most frequent words (out of total ~8.2m unique tokens)
85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
86 | # save dictionary and bag-of-words (term-document frequency matrix)
87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
89 | # load back the id->word mapping directly from file
90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above
91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
92 | del wiki
93 |
94 | # initialize corpus reader and word->id mapping
95 | mm = MmCorpus(outp + '_bow.mm')
96 |
97 | # build tfidf, ~50min
98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
99 |
100 | # save tfidf vectors in matrix market format
101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
103 |
104 | logger.info("finished running %s" % program)
105 |
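
Once the script has run, its three outputs can be loaded back with the same gensim classes it uses internally. A minimal sketch, assuming OUTPUT_PREFIX was ~/gensim/results/wiki_en as in the docstring example:

# Loading the artifacts written by make_wiki.py -- a sketch only; the prefix
# below assumes the docstring's example invocation.
import os.path
from gensim.corpora import Dictionary, MmCorpus

prefix = os.path.expanduser('~/gensim/results/wiki_en')
id2word = Dictionary.load_from_text(prefix + '_wordids.txt.bz2')  # token <-> id mapping
bow = MmCorpus(prefix + '_bow.mm')           # bag-of-words vectors
tfidf_vecs = MmCorpus(prefix + '_tfidf.mm')  # TF-IDF vectors

print(len(id2word))   # number of word types kept in the dictionary
print(bow.num_docs)   # number of Wikipedia articles that were converted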
--------------------------------------------------------------------------------
/gensim/scripts/make_wiki_lemma.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Copyright (C) 2012 Lars Buitinck
6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7 |
8 |
9 | """
10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
11 |
12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
13 | bz2-compressed dump of Wikipedia articles, in XML format.
14 |
15 | This actually creates three files:
16 |
17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
19 | Matrix Market format
20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
21 |
22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save
23 | disk space; gensim's corpus iterators can work with compressed input, too.
24 |
25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
26 | removing tokens that appear in more than 10%% of all documents). Defaults to
27 | 100,000.
28 |
29 | If you have the `pattern` package installed, this script will use a fancy
30 | lemmatization to get a lemma of each token (instead of plain alphabetic
31 | tokenizer). The package is available at https://github.com/clips/pattern .
32 |
33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
34 | """
35 |
36 |
37 | import logging
38 | import os.path
39 | import sys
40 |
41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
42 | from gensim.models import TfidfModel
43 |
44 |
45 | # Wiki is first scanned for all distinct word types (~7M). The types that
46 | # appear in more than 10% of articles are removed and from the rest, the
47 | # DEFAULT_DICT_SIZE most frequent types are kept.
48 | DEFAULT_DICT_SIZE = 100000
49 |
50 |
51 | if __name__ == '__main__':
52 | program = os.path.basename(sys.argv[0])
53 | logger = logging.getLogger(program)
54 |
55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
56 | logging.root.setLevel(level=logging.INFO)
57 | logger.info("running %s" % ' '.join(sys.argv))
58 |
59 | # check and process input arguments
60 | if len(sys.argv) < 3:
61 | print(globals()['__doc__'] % locals())
62 | sys.exit(1)
63 | inp, outp = sys.argv[1:3]
64 | if len(sys.argv) > 3:
65 | keep_words = int(sys.argv[3])
66 | else:
67 | keep_words = DEFAULT_DICT_SIZE
68 | online = 'online' in program
69 | lemmatize = 'lemma' in program
70 | debug = 'nodebug' not in program
71 |
72 | if online:
73 | dictionary = HashDictionary(id_range=keep_words, debug=debug)
74 | dictionary.allow_update = True # start collecting document frequencies
75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
79 | dictionary.save_as_text(outp + '_wordids.txt.bz2')
80 | wiki.save(outp + '_corpus.pkl.bz2')
81 | dictionary.allow_update = False
82 | else:
83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
84 | # only keep the most frequent words (out of total ~8.2m unique tokens)
85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
86 | # save dictionary and bag-of-words (term-document frequency matrix)
87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
89 | # load back the id->word mapping directly from file
90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above
91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
92 | del wiki
93 |
94 | # initialize corpus reader and word->id mapping
95 | mm = MmCorpus(outp + '_bow.mm')
96 |
97 | # build tfidf, ~50min
98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
99 |
100 | # save tfidf vectors in matrix market format
101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
103 |
104 | logger.info("finished running %s" % program)
105 |
--------------------------------------------------------------------------------
/gensim/scripts/make_wiki_online.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Copyright (C) 2012 Lars Buitinck
6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7 |
8 |
9 | """
10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
11 |
12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
13 | bz2-compressed dump of Wikipedia articles, in XML format.
14 |
15 | This actually creates three files:
16 |
17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
19 | Matrix Market format
20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
21 |
22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save
23 | disk space; gensim's corpus iterators can work with compressed input, too.
24 |
25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
26 | removing tokens that appear in more than 10%% of all documents). Defaults to
27 | 100,000.
28 |
29 | If you have the `pattern` package installed, this script will use a fancy
30 | lemmatization to get a lemma of each token (instead of plain alphabetic
31 | tokenizer). The package is available at https://github.com/clips/pattern .
32 |
33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
34 | """
35 |
36 |
37 | import logging
38 | import os.path
39 | import sys
40 |
41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
42 | from gensim.models import TfidfModel
43 |
44 |
45 | # Wiki is first scanned for all distinct word types (~7M). The types that
46 | # appear in more than 10% of articles are removed and from the rest, the
47 | # DEFAULT_DICT_SIZE most frequent types are kept.
48 | DEFAULT_DICT_SIZE = 100000
49 |
50 |
51 | if __name__ == '__main__':
52 | program = os.path.basename(sys.argv[0])
53 | logger = logging.getLogger(program)
54 |
55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
56 | logging.root.setLevel(level=logging.INFO)
57 | logger.info("running %s" % ' '.join(sys.argv))
58 |
59 | # check and process input arguments
60 | if len(sys.argv) < 3:
61 | print(globals()['__doc__'] % locals())
62 | sys.exit(1)
63 | inp, outp = sys.argv[1:3]
64 | if len(sys.argv) > 3:
65 | keep_words = int(sys.argv[3])
66 | else:
67 | keep_words = DEFAULT_DICT_SIZE
68 | online = 'online' in program
69 | lemmatize = 'lemma' in program
70 | debug = 'nodebug' not in program
71 |
72 | if online:
73 | dictionary = HashDictionary(id_range=keep_words, debug=debug)
74 | dictionary.allow_update = True # start collecting document frequencies
75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
79 | dictionary.save_as_text(outp + '_wordids.txt.bz2')
80 | wiki.save(outp + '_corpus.pkl.bz2')
81 | dictionary.allow_update = False
82 | else:
83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
84 | # only keep the most frequent words (out of total ~8.2m unique tokens)
85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
86 | # save dictionary and bag-of-words (term-document frequency matrix)
87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
89 | # load back the id->word mapping directly from file
90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above
91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
92 | del wiki
93 |
94 | # initialize corpus reader and word->id mapping
95 | mm = MmCorpus(outp + '_bow.mm')
96 |
97 | # build tfidf, ~50min
98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
99 |
100 | # save tfidf vectors in matrix market format
101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
103 |
104 | logger.info("finished running %s" % program)
105 |
--------------------------------------------------------------------------------
/gensim/scripts/make_wiki_online_lemma.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Copyright (C) 2012 Lars Buitinck
6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7 |
8 |
9 | """
10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
11 |
12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
13 | bz2-compressed dump of Wikipedia articles, in XML format.
14 |
15 | This actually creates three files:
16 |
17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
19 | Matrix Market format
20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
21 |
22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save
23 | disk space; gensim's corpus iterators can work with compressed input, too.
24 |
25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
26 | removing tokens that appear in more than 10%% of all documents). Defaults to
27 | 100,000.
28 |
29 | If you have the `pattern` package installed, this script will use a fancy
30 | lemmatization to get a lemma of each token (instead of plain alphabetic
31 | tokenizer). The package is available at https://github.com/clips/pattern .
32 |
33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
34 | """
35 |
36 |
37 | import logging
38 | import os.path
39 | import sys
40 |
41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
42 | from gensim.models import TfidfModel
43 |
44 |
45 | # Wiki is first scanned for all distinct word types (~7M). The types that
46 | # appear in more than 10% of articles are removed and from the rest, the
47 | # DEFAULT_DICT_SIZE most frequent types are kept.
48 | DEFAULT_DICT_SIZE = 100000
49 |
50 |
51 | if __name__ == '__main__':
52 | program = os.path.basename(sys.argv[0])
53 | logger = logging.getLogger(program)
54 |
55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
56 | logging.root.setLevel(level=logging.INFO)
57 | logger.info("running %s" % ' '.join(sys.argv))
58 |
59 | # check and process input arguments
60 | if len(sys.argv) < 3:
61 | print(globals()['__doc__'] % locals())
62 | sys.exit(1)
63 | inp, outp = sys.argv[1:3]
64 | if len(sys.argv) > 3:
65 | keep_words = int(sys.argv[3])
66 | else:
67 | keep_words = DEFAULT_DICT_SIZE
68 | online = 'online' in program
69 | lemmatize = 'lemma' in program
70 | debug = 'nodebug' not in program
71 |
72 | if online:
73 | dictionary = HashDictionary(id_range=keep_words, debug=debug)
74 | dictionary.allow_update = True # start collecting document frequencies
75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
79 | dictionary.save_as_text(outp + '_wordids.txt.bz2')
80 | wiki.save(outp + '_corpus.pkl.bz2')
81 | dictionary.allow_update = False
82 | else:
83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
84 | # only keep the most frequent words (out of total ~8.2m unique tokens)
85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
86 | # save dictionary and bag-of-words (term-document frequency matrix)
87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
89 | # load back the id->word mapping directly from file
90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above
91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
92 | del wiki
93 |
94 | # initialize corpus reader and word->id mapping
95 | mm = MmCorpus(outp + '_bow.mm')
96 |
97 | # build tfidf, ~50min
98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
99 |
100 | # save tfidf vectors in matrix market format
101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
103 |
104 | logger.info("finished running %s" % program)
105 |
--------------------------------------------------------------------------------
/gensim/scripts/make_wiki_online_nodebug.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Copyright (C) 2012 Lars Buitinck
6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7 |
8 |
9 | """
10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
11 |
12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
13 | bz2-compressed dump of Wikipedia articles, in XML format.
14 |
15 | This actually creates three files:
16 |
17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
19 | Matrix Market format
20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
21 |
22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save
23 | disk space; gensim's corpus iterators can work with compressed input, too.
24 |
25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
26 | removing tokens that appear in more than 10%% of all documents). Defaults to
27 | 100,000.
28 |
29 | If you have the `pattern` package installed, this script will use a fancy
30 | lemmatization to get a lemma of each token (instead of plain alphabetic
31 | tokenizer). The package is available at https://github.com/clips/pattern .
32 |
33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
34 | """
35 |
36 |
37 | import logging
38 | import os.path
39 | import sys
40 |
41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
42 | from gensim.models import TfidfModel
43 |
44 |
45 | # Wiki is first scanned for all distinct word types (~7M). The types that
46 | # appear in more than 10% of articles are removed and from the rest, the
47 | # DEFAULT_DICT_SIZE most frequent types are kept.
48 | DEFAULT_DICT_SIZE = 100000
49 |
50 |
51 | if __name__ == '__main__':
52 | program = os.path.basename(sys.argv[0])
53 | logger = logging.getLogger(program)
54 |
55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
56 | logging.root.setLevel(level=logging.INFO)
57 | logger.info("running %s" % ' '.join(sys.argv))
58 |
59 | # check and process input arguments
60 | if len(sys.argv) < 3:
61 | print(globals()['__doc__'] % locals())
62 | sys.exit(1)
63 | inp, outp = sys.argv[1:3]
64 | if len(sys.argv) > 3:
65 | keep_words = int(sys.argv[3])
66 | else:
67 | keep_words = DEFAULT_DICT_SIZE
68 | online = 'online' in program
69 | lemmatize = 'lemma' in program
70 | debug = 'nodebug' not in program
71 |
72 | if online:
73 | dictionary = HashDictionary(id_range=keep_words, debug=debug)
74 | dictionary.allow_update = True # start collecting document frequencies
75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
79 | dictionary.save_as_text(outp + '_wordids.txt.bz2')
80 | wiki.save(outp + '_corpus.pkl.bz2')
81 | dictionary.allow_update = False
82 | else:
83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
84 | # only keep the most frequent words (out of total ~8.2m unique tokens)
85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
86 | # save dictionary and bag-of-words (term-document frequency matrix)
87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
89 | # load back the id->word mapping directly from file
90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above
91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
92 | del wiki
93 |
94 | # initialize corpus reader and word->id mapping
95 | mm = MmCorpus(outp + '_bow.mm')
96 |
97 | # build tfidf, ~50min
98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
99 |
100 | # save tfidf vectors in matrix market format
101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
103 |
104 | logger.info("finished running %s" % program)
105 |
--------------------------------------------------------------------------------
/gensim/scripts/make_wikicorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2010 Radim Rehurek
5 | # Copyright (C) 2012 Lars Buitinck
6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7 |
8 |
9 | """
10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
11 |
12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
13 | bz2-compressed dump of Wikipedia articles, in XML format.
14 |
15 | This actually creates three files:
16 |
17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
19 | Matrix Market format
20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
21 |
22 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save
23 | disk space; gensim's corpus iterators can work with compressed input, too.
24 |
25 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
26 | removing tokens that appear in more than 10%% of all documents). Defaults to
27 | 100,000.
28 |
29 | If you have the `pattern` package installed, this script will use a fancy
30 | lemmatization to get a lemma of each token (instead of plain alphabetic
31 | tokenizer). The package is available at https://github.com/clips/pattern .
32 |
33 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
34 | """
35 |
36 |
37 | import logging
38 | import os.path
39 | import sys
40 |
41 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
42 | from gensim.models import TfidfModel
43 |
44 |
45 | # Wiki is first scanned for all distinct word types (~7M). The types that
46 | # appear in more than 10% of articles are removed and from the rest, the
47 | # DEFAULT_DICT_SIZE most frequent types are kept.
48 | DEFAULT_DICT_SIZE = 100000
49 |
50 |
51 | if __name__ == '__main__':
52 | program = os.path.basename(sys.argv[0])
53 | logger = logging.getLogger(program)
54 |
55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
56 | logging.root.setLevel(level=logging.INFO)
57 | logger.info("running %s" % ' '.join(sys.argv))
58 |
59 | # check and process input arguments
60 | if len(sys.argv) < 3:
61 | print(globals()['__doc__'] % locals())
62 | sys.exit(1)
63 | inp, outp = sys.argv[1:3]
64 | if len(sys.argv) > 3:
65 | keep_words = int(sys.argv[3])
66 | else:
67 | keep_words = DEFAULT_DICT_SIZE
68 | online = 'online' in program
69 | lemmatize = 'lemma' in program
70 | debug = 'nodebug' not in program
71 |
72 | if online:
73 | dictionary = HashDictionary(id_range=keep_words, debug=debug)
74 | dictionary.allow_update = True # start collecting document frequencies
75 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
76 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
77 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
78 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
79 | dictionary.save_as_text(outp + '_wordids.txt.bz2')
80 | wiki.save(outp + '_corpus.pkl.bz2')
81 | dictionary.allow_update = False
82 | else:
83 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
84 | # only keep the most frequent words (out of total ~8.2m unique tokens)
85 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
86 | # save dictionary and bag-of-words (term-document frequency matrix)
87 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
88 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
89 | # load back the id->word mapping directly from file
90 | # this seems to save more memory, compared to keeping the wiki.dictionary object from above
91 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
92 | del wiki
93 |
94 | # initialize corpus reader and word->id mapping
95 | mm = MmCorpus(outp + '_bow.mm')
96 |
97 | # build tfidf, ~50min
98 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
99 |
100 | # save tfidf vectors in matrix market format
101 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
102 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
103 |
104 | logger.info("finished running %s" % program)
105 |
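
The five make_wiki* scripts above are byte-for-byte copies of this one; their behaviour differs only because the online/lemmatize/debug flags are parsed from the script's own file name. A small standalone illustration of that dispatch:

# How the five identical make_wiki* wrappers diverge from make_wikicorpus.py:
# the flags are derived from the script's file name (same logic as above).
import os.path

for path in ['make_wiki.py', 'make_wiki_lemma.py', 'make_wiki_online.py',
             'make_wiki_online_lemma.py', 'make_wiki_online_nodebug.py',
             'make_wikicorpus.py']:
    program = os.path.basename(path)
    online = 'online' in program       # stream the dump with a HashDictionary
    lemmatize = 'lemma' in program     # use `pattern`-based lemmatization
    debug = 'nodebug' not in program   # keep HashDictionary debug info
    print('%-30s online=%-5s lemmatize=%-5s debug=%s' % (program, online, lemmatize, debug))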
--------------------------------------------------------------------------------
/gensim/similarities/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This package contains implementations of pairwise similarity queries.
3 | """
4 |
5 | # bring classes directly into package namespace, to save some typing
6 | from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity
7 |
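
A minimal similarity-query sketch using the classes exported here; the three-document toy corpus is made up purely for illustration:

# Pairwise similarity query -- toy corpus, for illustration only.
from gensim import corpora, similarities

texts = [['human', 'computer', 'interface'],
         ['graph', 'trees', 'computer'],
         ['graph', 'minors', 'trees']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

index = similarities.MatrixSimilarity(corpus, num_features=len(dictionary))
query = dictionary.doc2bow(['computer', 'graph'])
print(list(enumerate(index[query])))  # cosine similarity of the query against each document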
--------------------------------------------------------------------------------
/gensim/similarities/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/similarities/__init__.pyc
--------------------------------------------------------------------------------
/gensim/similarities/docsim.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/similarities/docsim.pyc
--------------------------------------------------------------------------------
/gensim/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim/utils.pyc
--------------------------------------------------------------------------------
/gensim_addons/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/__init__.py
--------------------------------------------------------------------------------
/gensim_addons/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/__init__.pyc
--------------------------------------------------------------------------------
/gensim_addons/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/models/__init__.py
--------------------------------------------------------------------------------
/gensim_addons/models/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/largelymfs/paragraph2vec/d7fa862a8dc5dfb6a7452340da7e694f650cbc2e/gensim_addons/models/__init__.pyc
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | ### Paragraph vectors DM Model
2 |
3 | Usage: `python test.py training_text testing_text`
4 |
5 | Output: `para_vectors_train.txt` and `para_vectors_test.txt`
6 |
7 |
8 | If you use the code, please cite this paper:
9 |
10 | Yang Liu, Zhiyuan Liu, Tat-Seng Chua, Maosong Sun. *Topical Word Embeddings*. The 29th AAAI Conference on Artificial Intelligence (AAAI'15).
11 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [wheel]
2 | universal = 1
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (C) 2012 Radim Rehurek
5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6 |
7 | """
8 | Run with:
9 |
10 | sudo python ./setup.py install
11 | """
12 |
13 | import os
14 | import sys
15 |
16 | if sys.version_info[:2] < (2, 5):
17 | raise Exception('This version of gensim needs Python 2.5 or later. ')
18 |
19 | import ez_setup
20 | ez_setup.use_setuptools()
21 | from setuptools import setup, find_packages, Extension
22 |
23 |
24 | # Commonly used information
25 | pkg_name = 'gensim'
26 | pkg_ver = '0.10.1'
27 | pkg_desc = 'Python framework for fast Vector Space Modelling'
28 |
29 | # there is a bug in python2.5, preventing distutils from using any non-ascii characters :( http://bugs.python.org/issue2562
30 | pkg_author = 'Radim Rehurek' # u'Radim Řehůřek', # <- should really be this...
31 | pkg_author_email = 'radimrehurek@seznam.cz'
32 | pkg_url = 'http://radimrehurek.com/gensim'
33 | pkg_download_url = 'http://pypi.python.org/pypi/gensim'
34 |
35 | pkg_keywords = ('Singular Value Decomposition, SVD, Latent Semantic Indexing, '
36 | 'LSA, LSI, Latent Dirichlet Allocation, LDA, '
37 | 'Hierarchical Dirichlet Process, HDP, Random Projections, '
38 | 'TFIDF, word2vec')
39 |
40 | pkg_classifiers = [ # from http://pypi.python.org/pypi?%3Aaction=list_classifiers
41 | 'Development Status :: 5 - Production/Stable',
42 | 'Environment :: Console',
43 | 'Intended Audience :: Science/Research',
44 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)',
45 | 'Operating System :: OS Independent',
46 | 'Programming Language :: Python :: 2.6',
47 | 'Programming Language :: Python :: 3.3',
48 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
49 | 'Topic :: Scientific/Engineering :: Information Analysis',
50 | 'Topic :: Text Processing :: Linguistic',
51 | ]
52 |
53 | pkg_license = 'LGPL'
54 |
55 | def read(fname):
56 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
57 |
58 | native_ext = False
59 |
60 | setup(
61 | name = pkg_name,
62 | version = pkg_ver,
63 | description = pkg_desc,
64 | long_description = read('README.rst'),
65 |
66 | packages = find_packages(exclude=[ pkg_name + '_addons', pkg_name + '_addons.*']),
67 |
68 | author = pkg_author,
69 | author_email = pkg_author_email,
70 |
71 | url = pkg_url,
72 | download_url = pkg_download_url,
73 |
74 | keywords = pkg_keywords,
75 |
76 | license = pkg_license,
77 | platforms = 'any',
78 |
79 | zip_safe = False,
80 |
81 | classifiers = pkg_classifiers,
82 |
83 | test_suite = "gensim.test",
84 |
85 | install_requires = [
86 | 'scipy >= 0.7.0',
87 | 'six >= 1.2.0',
88 | ],
89 |
90 | extras_require = {
91 | 'distributed': ['Pyro4 >= 4.8'],
92 | },
93 |
94 | include_package_data = True,
95 |
96 | # lines below are commented out to avoid installing system-wide scripts
97 | # scripts can be run by running `python -m module_name` instead: less
98 | # flexible but more explicit and imo cleaner.
99 | # entry_points = {
100 | # 'console_scripts': [
101 | # 'lsi_worker = gensim.models.lsi_worker:main',
102 | # 'lsi_dispatcher = gensim.models.lsi_dispatcher:main',
103 | # ],
104 | # },
105 |
106 | )
107 |
108 | # Here comes the setup for cythonized native addon-extension.
109 |
110 | # try:
111 | # from Cython.Distutils import build_ext
112 | # import numpy
113 | # models_dir = os.path.join(os.path.dirname(__file__), 'gensim', 'models')
114 |
115 | # ext_modules = [
116 | # Extension('gensim_addons.models.word2vec_inner',
117 | # ['gensim_addons/models/word2vec_inner.pyx'],
118 | # include_dirs = [models_dir, numpy.get_include()])
119 | # ]
120 |
121 | # native_ext = True
122 |
123 | # except ImportError:
124 | # sys.stderr.write('''
125 | # =========================================================
126 |
127 | # Please install Cython (http://cython.org/), if you
128 | # want to use the highly optimized version of word2vec.
129 |
130 | # Usually you can install it (optional) using:
131 |
132 | # pip install -U cython
133 |
134 | # or
135 |
136 | # easy_install -U cython
137 |
138 | # or
139 |
140 | # the package-management of your distribution.
141 |
142 | # If you install Cython *after* installing gensim, the
143 | # optimized version of word2vec will still be automatically
144 | # generated, on the first use of word2vec.
145 |
146 | # =========================================================
147 | # ''')
148 |
149 | # if native_ext:
150 |
151 | # setup(
152 |
153 | # name = pkg_name + '_addons',
154 | # version = pkg_ver,
155 | # description = pkg_desc,
156 | # long_description = read('README.rst'),
157 |
158 | # packages = find_packages(exclude=[ pkg_name, pkg_name + '.*']),
159 |
160 | # author = pkg_author,
161 | # author_email = pkg_author_email,
162 |
163 | # url = pkg_url,
164 | # download_url = pkg_download_url,
165 |
166 | # keywords = pkg_keywords,
167 |
168 | # license = pkg_license,
169 | # platforms = 'any',
170 |
171 | # zip_safe = False,
172 |
173 | # classifiers = pkg_classifiers,
174 |
175 | # install_requires = [
176 | # 'gensim == ' + pkg_ver,
177 | # ],
178 |
179 | # include_package_data = True,
180 |
181 | # cmdclass = {
182 | # 'build_ext': build_ext
183 | # },
184 |
185 | # ext_modules = ext_modules,
186 | # )
187 |
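
The commented-out second setup() above is what would build the Cython-optimized word2vec_inner extension under gensim_addons. Whether the optimized code path actually got picked up can be checked at runtime through the FAST_VERSION flag that gensim's word2vec module exposes:

# Runtime check for the optimized word2vec routines (sketch).
# FAST_VERSION is -1 when gensim falls back to the pure-NumPy training code.
from gensim.models import word2vec

if word2vec.FAST_VERSION > -1:
    print('compiled word2vec_inner extension in use (FAST_VERSION=%d)' % word2vec.FAST_VERSION)
else:
    print('no Cython extension found; using the slower pure-Python/NumPy path')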
--------------------------------------------------------------------------------
/standard.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | #-*- coding: UTF-8 -*-
3 | #File:
4 | #Date:
5 | #Author: Yang Liu
6 | #Description:
7 | with open("para_vectors_train.txt") as f:
8 | with open("para_train.txt","w") as fo:
9 | for i in range(100000):
10 | f.readline()  # skip the first 100,000 lines
11 | for i in range(25000):
12 | f.readline()  # then skip one line...
13 | fo.write(f.readline())  # ...and copy the next (every second line of the following 50,000)
14 |
15 |
--------------------------------------------------------------------------------
/svm_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 |
4 | if __name__=="__main__":
5 | train_filename = "para_train.txt"
6 | test_filename = "para_test.txt"
7 | train_x = []
8 | test_x = []
9 | train_y = []
10 | test_y = []
11 | with open("train_y.txt") as f:
12 | for l in f:
13 | train_y.append(float(l.strip()))
14 | with open("test_y.txt") as f :
15 | for l in f:
16 | test_y.append(float(l.strip()))
17 |
18 | with open(train_filename,"r") as f:
19 | while True:
20 |
21 |
22 |
23 | l = f.readline()
24 | if not l :
25 | break
26 | l = l.strip().split()
27 | result = []
28 | for i in range(len(l)):
29 | result.append(float(l[i]))
30 | train_x.append(result)
31 | print len(train_x)
32 | print "FINISH LOADING TRAIN"
33 | with open(test_filename,"r") as f:
34 | while True:
35 |
36 |
37 |
38 | l = f.readline()
39 | if not l :
40 | break
41 | l = l.strip().split()
42 | result = []
43 | for i in range(len(l)):
44 | result.append(float(l[i]))
45 | test_x.append(result)
46 | print "FINSIH LOADING TEST"
47 | from sklearn.svm import SVC
48 | x = SVC()
49 | x.fit(train_x, train_y)
50 | print "TRAINING..."
51 | result = x.predict(test_x)
52 | print "PREDICTING..."
53 | num = 0
54 | for i in range(len(test_y)):
55 | if test_y[i]!=result[i]:
56 | num+=1
57 | print float(num)/float(len(test_y))
58 | result = x.predict(train_x)
59 | num = 0
60 | for i in range(len(train_y)):
61 | if train_y[i]!=result[i]:
62 | num+=1
63 | print float(num)/float(len(train_y))
64 |
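
The two loops at the end count mismatches by hand to get the test and train error rates; scikit-learn's accuracy_score gives the same number directly. A tiny self-contained sketch with made-up labels:

# Same error-rate computation via scikit-learn, on made-up labels.
# In svm_test.py this would simply be 1.0 - accuracy_score(test_y, result).
from sklearn.metrics import accuracy_score

y_true = [1.0, 1.0, 0.0, 0.0, 1.0]
y_pred = [1.0, 0.0, 0.0, 0.0, 1.0]
print(1.0 - accuracy_score(y_true, y_pred))  # -> 0.2, one of five labels is wrong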
--------------------------------------------------------------------------------
/test.cc:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | using namespace std;
3 |
4 | int main(){
5 | return 0;
6 | }
7 |
--------------------------------------------------------------------------------
/test_it.sh:
--------------------------------------------------------------------------------
1 | python standard.py
2 | python svm_test.py
3 |
--------------------------------------------------------------------------------
/test_nn.py:
--------------------------------------------------------------------------------
1 | import ffnet
2 |
3 | if __name__=="__main__":
4 | train_x = []
5 | train_y = []
6 | test_x = []
7 | test_y = []
8 | for i in range(12500):
9 | test_y.append(1)
10 | train_y.append(1)
11 | for i in range(12500):
12 | test_y.append(0)
13 | train_y.append(0)
14 | with open("para_train.txt") as f:
15 | for l in f:
16 | l = l.strip().split()
17 | result = []
18 | for i in range(len(l)):
19 | result.append(float(l[i]))
20 | train_x.append(result)
21 | print "FINISH READING TRAIN FILE"
22 | with open("para_test.txt") as f:
23 | for l in f:
24 | l = l.strip().split()
25 | result = []
26 | for i in range(len(l)):
27 | result.append(float(l[i]))
28 | test_x.append(result)
29 | print "FINISH READING TEST FILE"
30 | #train_x = train_x[:5]
31 | #test_x = test_x[:5]
32 | #train_y = train_y[:5]
33 | #test_y = test_y[:5]
34 | c = ffnet.ffnet(ffnet.mlgraph((len(train_x[0]), 50, 1)))
35 | print "TRAINING....",
36 | c.train_tnc(train_x, train_y, messages = 1, nproc = 'ncpu', maxfun = 1000)
37 | print "OK"
38 | print "TESTING....",
39 | wrong= 0
40 | for i in range(len(test_y)):
41 | result = c.call(test_x[i]).tolist()[0]
42 | if result >=0.5:
43 | result = 1.0
44 | else:
45 | result = 0.0
46 | if result != test_y[i]:
47 | wrong+=1
48 | print "OK"
49 | print float(wrong) / float(len(test_y))
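
test_nn.py trains a single 50-unit hidden-layer network with the ffnet package. For readers without ffnet, roughly the same experiment can be sketched with scikit-learn's MLPClassifier (available in scikit-learn >= 0.18); the random data below merely stands in for the real para_train.txt vectors:

# Rough MLP equivalent with scikit-learn (a sketch on random stand-in data,
# not the actual paragraph vectors used above).
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.RandomState(0)
X_train = rng.randn(200, 100)              # 200 fake 100-dimensional vectors
y_train = (X_train[:, 0] > 0).astype(int)  # a toy binary label

clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=0)
clf.fit(X_train, y_train)
print(1.0 - clf.score(X_train, y_train))   # training error rate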
--------------------------------------------------------------------------------
/test_word2vec.py:
--------------------------------------------------------------------------------
1 | import gensim
2 | s = []
3 | with open("../text8") as f:
4 | for l in f:
5 | s.append(l.strip().split())
6 |
7 | w = gensim.models.Word2Vec(s,workers=24)
8 | print w.similarity("man","woman")
9 |
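
Besides pairwise similarity, the trained model also answers nearest-neighbour and analogy queries. A standalone sketch on a tiny made-up corpus (with the text8 model above the results are of course far more meaningful):

# Nearest-neighbour and analogy queries -- trained here on a tiny made-up
# corpus so the snippet runs on its own; swap in the text8 model for real results.
import gensim

sentences = [['king', 'queen', 'man', 'woman', 'royal'],
             ['man', 'woman', 'boy', 'girl'],
             ['king', 'man', 'queen', 'woman']] * 50
m = gensim.models.Word2Vec(sentences, min_count=1, workers=1)
print(m.most_similar('woman', topn=3))
print(m.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))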
--------------------------------------------------------------------------------