├── gensim ├── scripts │ ├── __init__.py │ ├── make_wiki.py │ ├── make_wiki_lemma.py │ ├── make_wiki_online.py │ ├── make_wiki_online_lemma.py │ ├── make_wiki_online_nodebug.py │ └── make_wikicorpus.py ├── examples │ └── dmlcz │ │ ├── __init__.py │ │ ├── runall.sh │ │ ├── gensim_build.py │ │ ├── gensim_genmodel.py │ │ └── gensim_xml.py ├── test │ ├── test_data │ │ ├── lee.cor │ │ ├── dtm_test.dict │ │ ├── testcorpus.mm.index │ │ ├── testcorpus.blei.index │ │ ├── testcorpus.low.index │ │ ├── testcorpus.uci.index │ │ ├── head500.noblanks.cor.bz2 │ │ ├── testcorpus.mallet.index │ │ ├── testcorpus.svmlight.index │ │ ├── head500.noblanks.cor_tfidf.model │ │ ├── testcorpus.blei.vocab │ │ ├── testcorpus.uci.vocab │ │ ├── miIslita.cor │ │ ├── test_corpus_small.mm │ │ ├── testcorpus.blei │ │ ├── testcorpus.txt │ │ ├── testcorpus.low │ │ ├── testcorpus.svmlight │ │ ├── test_corpus_ok.mm │ │ ├── mihalcea_tarau.kw.txt │ │ ├── testcorpus.mallet │ │ ├── testcorpus.uci │ │ ├── mihalcea_tarau.kwpos.txt │ │ ├── testcorpus.mm │ │ ├── mihalcea_tarau.summ.txt │ │ ├── ldavowpalwabbit.dict.txt │ │ ├── testsummarization_unrelated.txt │ │ └── mihalcea_tarau.txt │ ├── __init__.py │ ├── test_hdpmodel.py │ ├── test_big.py │ ├── test_dtm.py │ ├── test_utils.py │ ├── test_logentropy_model.py │ ├── test_parsing.py │ ├── test_rpmodel.py │ ├── test_phrases.py │ ├── test_tfidfmodel.py │ ├── test_keywords.py │ ├── test_miislita.py │ └── test_ldamallet_wrapper.py ├── summarization │ ├── __init__.py │ ├── commons.py │ ├── syntactic_unit.py │ ├── pagerank_weighted.py │ ├── bm25.py │ └── textcleaner.py ├── models │ ├── wrappers │ │ └── __init__.py │ ├── voidptr.h │ ├── __init__.py │ ├── word2vec_inner.pxd │ ├── rpmodel.py │ ├── lsi_worker.py │ ├── lda_worker.py │ └── logentropy_model.py ├── parsing │ ├── __init__.py │ └── preprocessing.py ├── similarities │ └── __init__.py ├── corpora │ ├── __init__.py │ ├── mmcorpus.py │ ├── csvcorpus.py │ ├── malletcorpus.py │ ├── textcorpus.py │ ├── bleicorpus.py │ 
└── svmlightcorpus.py ├── __init__.py └── nosy.py ├── docs └── src │ ├── _static │ ├── favicon.ico │ └── images │ │ ├── bg.png │ │ ├── arrows.png │ │ ├── gensim.png │ │ ├── ukazka.png │ │ ├── bullets.png │ │ ├── checker.png │ │ ├── default.png │ │ ├── download.png │ │ ├── favicon.ico │ │ ├── loading.gif │ │ ├── tagline.png │ │ ├── ukazka2.png │ │ ├── gensim_code.png │ │ ├── get-started.png │ │ ├── logo-gensim.png │ │ ├── menubutton.png │ │ ├── twitterbird.png │ │ ├── gensim-footer.png │ │ ├── googlegroups.png │ │ ├── direct-install.png │ │ ├── features │ │ ├── robust.png │ │ ├── support.png │ │ ├── free_lgpl.png │ │ ├── converters.png │ │ ├── memory_independence.png │ │ ├── similarity_queries.png │ │ ├── platform_independence.png │ │ └── efficient_implementations.png │ │ ├── gensim_compact.png │ │ ├── tagline_compact.png │ │ ├── logo-gensim_compact.png │ │ ├── references │ │ ├── logo_dtu.gif │ │ ├── logo_eudml.png │ │ ├── logo_ghent.png │ │ ├── logo_ibcn.png │ │ ├── logo_issuu.jpeg │ │ ├── logo_roistr.png │ │ ├── logo_dynadmic.png │ │ ├── logo_tailwind.png │ │ └── logo_sportsauthority.png │ │ └── forkme_left_white_ffffff.png │ ├── gensim_theme │ ├── page.html │ ├── theme.conf │ ├── search.html │ ├── domainindex.html │ └── genindex.html │ ├── indextoc.rst │ ├── corpora │ ├── corpora.rst │ ├── dictionary.rst │ ├── mmcorpus.rst │ ├── wikicorpus.rst │ ├── bleicorpus.rst │ ├── lowcorpus.rst │ ├── svmlightcorpus.rst │ ├── textcorpus.rst │ ├── hashdictionary.rst │ ├── ucicorpus.rst │ └── indexedcorpus.rst │ ├── matutils.rst │ ├── models │ ├── rpmodel.rst │ ├── models.rst │ ├── lda_worker.rst │ ├── lsi_worker.rst │ ├── tfidfmodel.rst │ ├── lda_dispatcher.rst │ ├── lsi_dispatcher.rst │ ├── lsimodel.rst │ ├── ldamodel.rst │ ├── word2vec.rst │ ├── doc2vec.rst │ ├── logentropy_model.rst │ ├── phrases.rst │ ├── hdpmodel.rst │ ├── wrappers │ │ ├── wrappers.rst │ │ ├── ldamallet.rst │ │ ├── dtmmodel.rst │ │ └── ldavowpalwabbit.rst │ └── ldamulticore.rst │ ├── similarities │ ├── 
simserver.rst │ └── docsim.rst │ ├── utils.rst │ ├── interfaces.rst │ ├── apiref.rst │ ├── support.rst │ ├── Makefile │ ├── distributed.rst │ ├── about.rst │ ├── changes_080.rst │ ├── install.rst │ ├── tutorial.rst │ └── dist_lda.rst ├── setup.cfg ├── CONTRIBUTING.md ├── MANIFEST.in ├── .travis.yml ├── continuous_integration └── appveyor │ ├── requirements.txt │ └── run_with_env.cmd ├── .gitignore └── appveyor.yml /gensim/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /gensim/scripts/make_wiki.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_lemma.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online_lemma.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online_nodebug.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /docs/src/_static/favicon.ico: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/favicon.ico -------------------------------------------------------------------------------- /gensim/test/test_data/lee.cor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/lee.cor -------------------------------------------------------------------------------- /docs/src/_static/images/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/bg.png -------------------------------------------------------------------------------- /gensim/test/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains automated code tests for all other gensim packages. 
3 | """ 4 | -------------------------------------------------------------------------------- /docs/src/_static/images/arrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/arrows.png -------------------------------------------------------------------------------- /docs/src/_static/images/gensim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim.png -------------------------------------------------------------------------------- /docs/src/_static/images/ukazka.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/ukazka.png -------------------------------------------------------------------------------- /docs/src/gensim_theme/page.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | {{ body }} 4 | {% endblock %} 5 | -------------------------------------------------------------------------------- /docs/src/_static/images/bullets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/bullets.png -------------------------------------------------------------------------------- /docs/src/_static/images/checker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/checker.png -------------------------------------------------------------------------------- /docs/src/_static/images/default.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/default.png -------------------------------------------------------------------------------- /docs/src/_static/images/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/download.png -------------------------------------------------------------------------------- /docs/src/_static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/favicon.ico -------------------------------------------------------------------------------- /docs/src/_static/images/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/loading.gif -------------------------------------------------------------------------------- /docs/src/_static/images/tagline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/tagline.png -------------------------------------------------------------------------------- /docs/src/_static/images/ukazka2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/ukazka2.png -------------------------------------------------------------------------------- /gensim/test/test_data/dtm_test.dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/dtm_test.dict -------------------------------------------------------------------------------- /docs/src/_static/images/gensim_code.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim_code.png -------------------------------------------------------------------------------- /docs/src/_static/images/get-started.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/get-started.png -------------------------------------------------------------------------------- /docs/src/_static/images/logo-gensim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/logo-gensim.png -------------------------------------------------------------------------------- /docs/src/_static/images/menubutton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/menubutton.png -------------------------------------------------------------------------------- /docs/src/_static/images/twitterbird.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/twitterbird.png -------------------------------------------------------------------------------- /docs/src/_static/images/gensim-footer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim-footer.png -------------------------------------------------------------------------------- /docs/src/_static/images/googlegroups.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/googlegroups.png -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.mm.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.mm.index -------------------------------------------------------------------------------- /docs/src/_static/images/direct-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/direct-install.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/robust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/robust.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/support.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/support.png -------------------------------------------------------------------------------- /docs/src/_static/images/gensim_compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim_compact.png -------------------------------------------------------------------------------- /docs/src/_static/images/tagline_compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/tagline_compact.png 
-------------------------------------------------------------------------------- /docs/src/gensim_theme/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = none 3 | stylesheet = css/style.css 4 | pygments_style = sphinx 5 | 6 | [options] 7 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.blei.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.blei.index -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.low.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.low.index -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.uci.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.uci.index -------------------------------------------------------------------------------- /docs/src/_static/images/features/free_lgpl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/free_lgpl.png -------------------------------------------------------------------------------- /gensim/test/test_data/head500.noblanks.cor.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/head500.noblanks.cor.bz2 -------------------------------------------------------------------------------- 
/gensim/test/test_data/testcorpus.mallet.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.mallet.index -------------------------------------------------------------------------------- /docs/src/_static/images/features/converters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/converters.png -------------------------------------------------------------------------------- /docs/src/_static/images/logo-gensim_compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/logo-gensim_compact.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_dtu.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_dtu.gif -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_eudml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_eudml.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_ghent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_ghent.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_ibcn.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_ibcn.png -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.svmlight.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.svmlight.index -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_issuu.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_issuu.jpeg -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_roistr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_roistr.png -------------------------------------------------------------------------------- /docs/src/_static/images/forkme_left_white_ffffff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/forkme_left_white_ffffff.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_dynadmic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_dynadmic.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_tailwind.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_tailwind.png -------------------------------------------------------------------------------- /gensim/test/test_data/head500.noblanks.cor_tfidf.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/head500.noblanks.cor_tfidf.model -------------------------------------------------------------------------------- /docs/src/_static/images/features/memory_independence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/memory_independence.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/similarity_queries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/similarity_queries.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/platform_independence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/platform_independence.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_sportsauthority.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_sportsauthority.png -------------------------------------------------------------------------------- 
/docs/src/_static/images/features/efficient_implementations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/efficient_implementations.png -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.blei.vocab: -------------------------------------------------------------------------------- 1 | human 2 | interface 3 | computer 4 | user 5 | system 6 | response 7 | time 8 | eps 9 | survey 10 | trees 11 | graph 12 | minors 13 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.uci.vocab: -------------------------------------------------------------------------------- 1 | human 2 | interface 3 | computer 4 | user 5 | system 6 | response 7 | time 8 | eps 9 | survey 10 | trees 11 | graph 12 | minors 13 | -------------------------------------------------------------------------------- /docs/src/indextoc.rst: -------------------------------------------------------------------------------- 1 | .. 
toctree:: 2 | :hidden: 3 | :maxdepth: 1 4 | 5 | intro 6 | install 7 | tutorial 8 | distributed 9 | support 10 | wiki 11 | apiref 12 | -------------------------------------------------------------------------------- /gensim/summarization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # bring model classes directly into package namespace, to save some typing 3 | from .summarizer import summarize, summarize_corpus 4 | from .keywords import keywords -------------------------------------------------------------------------------- /gensim/test/test_data/miIslita.cor: -------------------------------------------------------------------------------- 1 | LSI tutorials and fast tracks 2 | Books on semantic analysis 3 | Learning latent semantic indexing 4 | Advances in structures and advances in indexing 5 | Analysis of latent structures 6 | -------------------------------------------------------------------------------- /gensim/models/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains wrappers for other topic modeling programs. 3 | """ 4 | 5 | from .ldamallet import LdaMallet 6 | from .dtmmodel import DtmModel 7 | from .ldavowpalwabbit import LdaVowpalWabbit 8 | -------------------------------------------------------------------------------- /docs/src/corpora/corpora.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora` -- Package for corpora I/O 2 | ========================================== 3 | 4 | .. 
automodule:: gensim.corpora 5 | :synopsis: Package for corpora I/O 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains functions to preprocess raw text 3 | """ 4 | 5 | # bring model classes directly into package namespace, to save some typing 6 | from .porter import PorterStemmer 7 | from .preprocessing import * 8 | -------------------------------------------------------------------------------- /gensim/test/test_data/test_corpus_small.mm: -------------------------------------------------------------------------------- 1 | %%matrixmarket matrix coordinate real general 2 | 3 5 9 3 | 1 1 1.000000 4 | 1 2 3.000000 5 | 1 4 5.000000 6 | 2 2 2.000000 7 | 2 3 1.000000 8 | 2 5 4.000000 9 | 3 1 2.000000 10 | 3 2 2.000000 11 | 3 4 1.000000 -------------------------------------------------------------------------------- /docs/src/matutils.rst: -------------------------------------------------------------------------------- 1 | :mod:`matutils` -- Math utils 2 | ============================== 3 | 4 | .. automodule:: gensim.matutils 5 | :synopsis: Math utils 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/rpmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.rpmodel` -- Random Projections 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models.rpmodel 5 | :synopsis: Random Projections 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/similarities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains implementations of pairwise similarity queries. 3 | """ 4 | 5 | # bring classes directly into package namespace, to save some typing 6 | from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity 7 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.blei: -------------------------------------------------------------------------------- 1 | 3 0:1.0 1:1.0 2:1.0 2 | 6 2:1.0 3:1.0 4:1.0 5:1.0 6:1.0 8:1.0 3 | 4 1:1.0 3:1.0 4:1.0 7:1.0 4 | 3 0:1.0 4:2.0 7:1.0 5 | 3 3:1.0 5:1.0 6:1.0 6 | 1 9:1.0 7 | 2 9:1.0 10:1.0 8 | 3 9:1.0 10:1.0 11:1.0 9 | 3 8:1.0 10:1.0 11:1.0 10 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.txt: -------------------------------------------------------------------------------- 1 | computer human interface 2 | computer response survey system time user 3 | interface system user eps 4 | human system system eps 5 | response time user 6 | trees 7 | trees graph 8 | trees graph minors 9 | survey graph minors 10 | -------------------------------------------------------------------------------- /docs/src/models/models.rst: -------------------------------------------------------------------------------- 1 | :mod:`models` -- Package for transformation models 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models 5 | :synopsis: Package for transformation models 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.low: -------------------------------------------------------------------------------- 1 | 9 2 | computer human interface 3 | computer response survey system time user 4 | interface system user eps 5 | human system system eps 6 | response time user 7 | trees 8 | trees graph 9 | trees graph minors 10 | survey graph minors 11 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.svmlight: -------------------------------------------------------------------------------- 1 | 0 1:1.0 2:1.0 3:1.0 2 | 0 1:1.0 4:1.0 5:1.0 6:1.0 7:1.0 8:1.0 3 | 0 3:1.0 6:1.0 8:1.0 9:1.0 4 | 0 2:1.0 6:2.0 9:1.0 5 | 0 4:1.0 7:1.0 8:1.0 6 | 0 10:1.0 7 | 0 10:1.0 11:1.0 8 | 0 10:1.0 11:1.0 12:1.0 9 | 0 5:1.0 11:1.0 12:1.0 10 | -------------------------------------------------------------------------------- /docs/src/similarities/simserver.rst: -------------------------------------------------------------------------------- 1 | :mod:`simserver` -- Document similarity server 2 | ====================================================== 3 | 4 | .. automodule:: simserver.simserver 5 | :synopsis: Document similarity server 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lda_worker.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lda_worker` -- Worker for distributed LDA 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models.lda_worker 5 | :synopsis: Worker for distributed LDA 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lsi_worker.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lsi_worker` -- Worker for distributed LSI 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.lsi_worker 5 | :synopsis: Worker for distributed LSI 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/test/test_data/test_corpus_ok.mm: -------------------------------------------------------------------------------- 1 | %%matrixmarket matrix coordinate real general 2 | 3 5 9 3 | 1 1 1.000000 4 | 1 2 3.000000 5 | 1 4 5.000000 6 | 2 2 2.000000 7 | 2 3 1.000000 8 | 2 5 4.000000 9 | 3 1 2.000000 10 | 3 2 2.000000 11 | 3 3 8.000000 12 | 3 4 1.000000 13 | 3 5 2.000000 -------------------------------------------------------------------------------- /docs/src/utils.rst: -------------------------------------------------------------------------------- 1 | :mod:`utils` -- Various utility functions 2 | ========================================== 3 | 4 | .. automodule:: gensim.utils 5 | :synopsis: Various utility functions 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/interfaces.rst: -------------------------------------------------------------------------------- 1 | :mod:`interfaces` -- Core gensim interfaces 2 | ============================================ 3 | 4 | .. 
automodule:: gensim.interfaces 5 | :synopsis: Core gensim interfaces 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/tfidfmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.tfidfmodel` -- TF-IDF model 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.tfidfmodel 5 | :synopsis: TF-IDF model 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/similarities/docsim.rst: -------------------------------------------------------------------------------- 1 | :mod:`similarities.docsim` -- Document similarity queries 2 | ======================================================================== 3 | 4 | .. automodule:: gensim.similarities.docsim 5 | :synopsis: Document similarity queries 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.kw.txt: -------------------------------------------------------------------------------- 1 | gilbert 2 | hurricane 3 | winds 4 | coast 5 | storm 6 | saturday 7 | flood 8 | flooding 9 | weather 10 | alert 11 | defense alerted 12 | strong 13 | people 14 | pushed 15 | puerto 16 | cabral said 17 | north 18 | associated 19 | south 20 | domingo 21 | residents 22 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.mallet: -------------------------------------------------------------------------------- 1 | 1 en computer human interface 2 | 2 en computer response survey system time user 3 | 3 en interface system user eps 4 | 4 en human system system eps 5 | 5 en response time user 6 | 6 en trees 7 | 7 en trees graph 8 | 8 
en trees graph minors 9 | 9 en survey graph minors 10 | -------------------------------------------------------------------------------- /docs/src/models/lda_dispatcher.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lda_dispatcher` -- Dispatcher for distributed LDA 2 | ================================================================ 3 | 4 | .. automodule:: gensim.models.lda_dispatcher 5 | :synopsis: Dispatcher for distributed LDA 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lsi_dispatcher.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lsi_dispatcher` -- Dispatcher for distributed LSI 2 | =============================================================== 3 | 4 | .. automodule:: gensim.models.lsi_dispatcher 5 | :synopsis: Dispatcher for distributed LSI 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lsimodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lsimodel` -- Latent Semantic Indexing 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.lsimodel 5 | :synopsis: Latent Semantic Indexing 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/ldamodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.ldamodel` -- Latent Dirichlet Allocation 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models.ldamodel 5 | :synopsis: Latent Dirichlet Allocation 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/word2vec.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.word2vec` -- Deep learning with word2vec 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.word2vec 5 | :synopsis: Deep learning with word2vec 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/doc2vec.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.doc2vec` -- Deep learning with paragraph2vec 2 | ========================================================= 3 | 4 | .. automodule:: gensim.models.doc2vec 5 | :synopsis: Deep learning with doc2vec 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/logentropy_model.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.logentropy_model` -- LogEntropy model 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.logentropy_model 5 | :synopsis: LogEntropy model 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/phrases.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.phrases` -- Phrase (collocation) detection 2 | ======================================================= 3 | 4 | .. 
automodule:: gensim.models.phrases 5 | :synopsis: Phrase (collocation) detection 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/hdpmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.hdpmodel` -- Hierarchical Dirichlet Process 2 | ======================================================== 3 | 4 | .. automodule:: gensim.models.hdpmodel 5 | :synopsis: Hierarchical Dirichlet Process 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/dictionary.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.dictionary` -- Construct word<->id mappings 2 | ========================================================== 3 | 4 | .. automodule:: gensim.corpora.dictionary 5 | :synopsis: Construct word<->id mappings 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/mmcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.mmcorpus` -- Corpus in Matrix Market format 2 | ========================================================== 3 | 4 | .. automodule:: gensim.corpora.mmcorpus 5 | :synopsis: Corpus in Matrix Market format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/wikicorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.wikicorpus` -- Corpus from a Wikipedia dump 2 | ========================================================== 3 | 4 | .. 
automodule:: gensim.corpora.wikicorpus 5 | :synopsis: Corpus from a Wikipedia dump 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/bleicorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.bleicorpus` -- Corpus in Blei's LDA-C format 2 | ========================================================== 3 | 4 | .. automodule:: gensim.corpora.bleicorpus 5 | :synopsis: Corpus in Blei's LDA-C format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/lowcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.lowcorpus` -- Corpus in List-of-Words format 2 | =========================================================== 3 | 4 | .. automodule:: gensim.corpora.lowcorpus 5 | :synopsis: Corpus in List-of-Words format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/svmlightcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.svmlightcorpus` -- Corpus in SVMlight format 2 | ================================================================== 3 | 4 | .. 
automodule:: gensim.corpora.svmlightcorpus 5 | :synopsis: Corpus in SVMlight format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/textcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.textcorpus` -- Building corpora with dictionaries 2 | ================================================================= 3 | 4 | .. automodule:: gensim.corpora.textcorpus 5 | :synopsis: Building corpora with dictionaries 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/wrappers/wrappers.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers` -- Package for transformation models via external programs 2 | ================================================================================= 3 | 4 | .. automodule:: gensim.models.wrappers 5 | :synopsis: Package for transformation models via external programs 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/ldamulticore.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.ldamulticore` -- parallelized Latent Dirichlet Allocation 2 | ====================================================================== 3 | 4 | .. 
automodule:: gensim.models.ldamulticore 5 | :synopsis: Latent Dirichlet Allocation 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheelhouse_uploader] 2 | artifact_indexes= 3 | # OSX wheels built by travis (only for specific tags): 4 | # https://github.com/MacPython/scikit-learn-wheels 5 | http://wheels.scipy.org 6 | # Windows wheels buit by: 7 | # https://ci.appveyor.com/project/piskvorky/gensim 8 | http://17a25141cb7f75c18ee4-676a79255544e7711e0dd8bccdcdd1cb.r23.cf2.rackcdn.com 9 | -------------------------------------------------------------------------------- /docs/src/corpora/hashdictionary.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.hashdictionary` -- Construct word<->id mappings 2 | ============================================================= 3 | 4 | .. 
automodule:: gensim.corpora.hashdictionary 5 | :synopsis: Construct word<->id mappings on the fly (the "hashing trick") 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /gensim/models/voidptr.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #if PY_VERSION_HEX >= 0x03020000 4 | 5 | /* 6 | ** compatibility with python >= 3.2, which doesn't have CObject anymore 7 | */ 8 | static void * PyCObject_AsVoidPtr(PyObject *obj) 9 | { 10 | void *ret = PyCapsule_GetPointer(obj, NULL); 11 | if (ret == NULL) { 12 | PyErr_Clear(); 13 | } 14 | return ret; 15 | } 16 | 17 | #endif -------------------------------------------------------------------------------- /docs/src/models/wrappers/ldamallet.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers.ldamallet` -- Latent Dirichlet Allocation via Mallet 2 | ========================================================================== 3 | 4 | .. 
automodule:: gensim.models.wrappers.ldamallet 5 | :synopsis: Latent Dirichlet Allocation via Mallet 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.uci: -------------------------------------------------------------------------------- 1 | 9 2 | 12 3 | 28 4 | 1 1 1 5 | 1 2 1 6 | 1 3 1 7 | 2 1 1 8 | 2 4 1 9 | 2 5 1 10 | 2 6 1 11 | 2 7 1 12 | 2 8 1 13 | 3 3 1 14 | 3 6 1 15 | 3 8 1 16 | 3 9 1 17 | 4 2 1 18 | 4 6 2 19 | 4 9 1 20 | 5 4 1 21 | 5 7 1 22 | 5 8 1 23 | 6 10 1 24 | 7 10 1 25 | 7 11 1 26 | 8 10 1 27 | 8 11 1 28 | 8 12 1 29 | 9 5 1 30 | 9 11 1 31 | 9 12 1 32 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.kwpos.txt: -------------------------------------------------------------------------------- 1 | gilbert 2 | hurricane 3 | coast 4 | storm 5 | saturday 6 | winds heavy 7 | flood 8 | flooding 9 | weather 10 | alert 11 | defense alerted 12 | strong 13 | pushed 14 | people 15 | puerto 16 | cabral said 17 | north 18 | associated 19 | south 20 | domingo 21 | residents 22 | dominican 23 | miles 24 | southeast 25 | san 26 | civil 27 | home 28 | reached 29 | juan 30 | named 31 | -------------------------------------------------------------------------------- /docs/src/models/wrappers/dtmmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers.dtmmodel` -- Dynamic Topic Models (DTM) and Dynamic Influence Models (DIM) 2 | ================================================================================================ 3 | 4 | .. 
automodule:: gensim.models.wrappers.dtmmodel 5 | :synopsis: Dynamic Topic Models 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/ucicorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.ucicorpus` -- Corpus in UCI bag-of-words format 2 | ============================================================================================================== 3 | 4 | .. automodule:: gensim.corpora.ucicorpus 5 | :synopsis: Corpus in University of California, Irvine (UCI) bag-of-words format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Please see [contribution-guide.org](http://www.contribution-guide.org/) for Due Diligence steps we expect from contributors before submitting an issue. 4 | 5 | For open-ended questions the best place is our active [mailing list](https://groups.google.com/forum/#!forum/gensim). 6 | 7 | For Code Style please see our [Developer Page](https://github.com/piskvorky/gensim/wiki/Developer-page#code-style). 8 | 9 | Thanks! 10 | 11 | -------------------------------------------------------------------------------- /docs/src/models/wrappers/ldavowpalwabbit.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers.ldavowpalwabbit` -- Latent Dirichlet Allocation via Vowpal Wabbit 2 | ======================================================================================= 3 | 4 | .. 
automodule:: gensim.models.wrappers.ldavowpalwabbit 5 | :synopsis: Latent Dirichlet Allocation via Vowpal Wabbit 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include docs * 2 | recursive-include gensim/test/test_data * 3 | recursive-include . *.sh 4 | prune docs/src* 5 | include README.rst 6 | include CHANGELOG.txt 7 | include COPYING 8 | include COPYING.LESSER 9 | include ez_setup.py 10 | include gensim/models/voidptr.h 11 | include gensim/models/word2vec_inner.c 12 | include gensim/models/word2vec_inner.pyx 13 | include gensim/models/doc2vec_inner.c 14 | include gensim/models/doc2vec_inner.pyx 15 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.mm: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | 9 12 28 3 | 1 1 1.0 4 | 1 2 1.0 5 | 1 3 1.0 6 | 2 1 1.0 7 | 2 4 1.0 8 | 2 5 1.0 9 | 2 6 1.0 10 | 2 7 1.0 11 | 2 8 1.0 12 | 3 3 1.0 13 | 3 6 1.0 14 | 3 8 1.0 15 | 3 9 1.0 16 | 4 2 1.0 17 | 4 6 2.0 18 | 4 9 1.0 19 | 5 4 1.0 20 | 5 7 1.0 21 | 5 8 1.0 22 | 6 10 1.0 23 | 7 10 1.0 24 | 7 11 1.0 25 | 8 10 1.0 26 | 8 11 1.0 27 | 8 12 1.0 28 | 9 5 1.0 29 | 9 11 1.0 30 | 9 12 1.0 31 | -------------------------------------------------------------------------------- /docs/src/corpora/indexedcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.indexedcorpus` -- Random access to corpus documents 2 | ================================================================= 3 | 4 | .. 
automodule:: gensim.corpora.indexedcorpus 5 | :synopsis: Random access to corpus documents 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | 11 | 12 | .. autoclass:: IndexedCorpus 13 | :members: 14 | :inherited-members: 15 | :undoc-members: 16 | :show-inheritance: -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "2.6" 5 | - "2.7" 6 | - "3.3" 7 | - "3.4" 8 | - "3.5" 9 | before_install: 10 | - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh 11 | - chmod +x miniconda.sh 12 | - ./miniconda.sh -b 13 | - export PATH=/home/travis/miniconda2/bin:$PATH 14 | - conda update --yes conda 15 | install: 16 | - conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy 17 | - source activate gensim-test 18 | - python setup.py install 19 | script: python setup.py test 20 | -------------------------------------------------------------------------------- /gensim/summarization/commons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | from gensim.summarization.graph import Graph 7 | 8 | 9 | def build_graph(sequence): 10 | graph = Graph() 11 | for item in sequence: 12 | if not graph.has_node(item): 13 | graph.add_node(item) 14 | return graph 15 | 16 | 17 | def remove_unreachable_nodes(graph): 18 | for node in graph.nodes(): 19 | if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: 20 | graph.del_node(node) 21 | -------------------------------------------------------------------------------- /gensim/summarization/syntactic_unit.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | 7 | class SyntacticUnit(object): 8 | 9 | def __init__(self, text, token=None, tag=None): 10 | self.text = text 11 | self.token = token 12 | self.tag = tag[:2] if tag else None # Just first two letters of tag 13 | self.index = -1 14 | self.score = -1 15 | 16 | def __str__(self): 17 | return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" 18 | 19 | def __repr__(self): 20 | return str(self) 21 | -------------------------------------------------------------------------------- /gensim/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains implementations of various streaming corpus I/O format. 3 | """ 4 | 5 | # bring corpus classes directly into package namespace, to save some typing 6 | from .indexedcorpus import IndexedCorpus # must appear before the other classes 7 | 8 | from .mmcorpus import MmCorpus 9 | from .bleicorpus import BleiCorpus 10 | from .svmlightcorpus import SvmLightCorpus 11 | from .lowcorpus import LowCorpus 12 | from .dictionary import Dictionary 13 | from .hashdictionary import HashDictionary 14 | from .wikicorpus import WikiCorpus 15 | from .textcorpus import TextCorpus 16 | from .ucicorpus import UciCorpus 17 | from .malletcorpus import MalletCorpus 18 | from .sharded_corpus import ShardedCorpus 19 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.summ.txt: -------------------------------------------------------------------------------- 1 | Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas. 
2 | The National Hurricane Center in Miami reported its position at 2 a.m. Sunday at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, Puerto Rico, and 200 miles southeast of Santo Domingo. 3 | The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving westward at 15 mph with a ``broad area of cloudiness and heavy weather'' rotating around the center of the storm. 4 | Strong winds associated with the Gilbert brought coastal flooding, strong southeast winds and up to 12 feet feet to Puerto Rico's south coast. -------------------------------------------------------------------------------- /gensim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains interfaces and functionality to compute pair-wise document 3 | similarities within a corpus of documents. 4 | """ 5 | 6 | from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization 7 | import logging 8 | 9 | try: 10 | __version__ = __import__('pkg_resources').get_distribution('gensim').version 11 | except: 12 | __version__ = '?' 13 | 14 | 15 | class NullHandler(logging.Handler): 16 | """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" 17 | def emit(self, record): 18 | pass 19 | 20 | logger = logging.getLogger('gensim') 21 | if len(logger.handlers) == 0: # To ensure reload() doesn't add another one 22 | logger.addHandler(NullHandler()) 23 | -------------------------------------------------------------------------------- /continuous_integration/appveyor/requirements.txt: -------------------------------------------------------------------------------- 1 | # Fetch numpy and scipy wheels from the sklearn rackspace wheelhouse. 2 | # Those wheels were collected from http://www.lfd.uci.edu/~gohlke/pythonlibs/ 3 | # This is a temporary solution. As soon as numpy and scipy provide official 4 | # wheel for windows we ca delete this --find-links line. 
5 | --find-links http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/ 6 | 7 | # fix the versions of numpy to force the use of numpy and scipy to use the whl 8 | # of the rackspace folder instead of trying to install from more recent 9 | # source tarball published on PyPI 10 | numpy==1.9.3 11 | scipy==0.16.0 12 | cython 13 | six >= 1.5.0 14 | smart_open >= 1.2.1 15 | nose 16 | wheel 17 | wheelhouse_uploader 18 | 19 | -------------------------------------------------------------------------------- /gensim/test/test_data/ldavowpalwabbit.dict.txt: -------------------------------------------------------------------------------- 1 | 28 alex 1 2 | 23 alice 1 3 | 47 bacon 1 4 | 46 beans 1 5 | 25 bob 1 6 | 10 brakes 1 7 | 36 c 1 8 | 12 car 1 9 | 6 cat 1 10 | 40 cereal 1 11 | 0 cheetah 1 12 | 11 clutch 1 13 | 49 coffee 1 14 | 38 cplusplus 1 15 | 37 csharp 1 16 | 18 cylinder 1 17 | 27 dave 1 18 | 48 eggs 1 19 | 19 engine 1 20 | 30 erlang 1 21 | 17 exhaust 1 22 | 34 go 1 23 | 42 ham 1 24 | 24 harry 1 25 | 35 haskell 1 26 | 1 jaguar 1 27 | 39 java 1 28 | 21 jim 1 29 | 41 juice 1 30 | 2 kitten 1 31 | 4 leopard 1 32 | 9 lion 1 33 | 7 lynx 1 34 | 14 motor 1 35 | 3 mouse 1 36 | 44 mushrooms 1 37 | 5 puppy 1 38 | 32 python 1 39 | 26 rachel 1 40 | 22 robert 1 41 | 31 ruby 1 42 | 43 sausages 1 43 | 33 scala 1 44 | 20 sue 1 45 | 16 suspension 1 46 | 45 tea 1 47 | 8 tiger 1 48 | 29 tim 1 49 | 13 tyre 1 50 | 15 wheel 1 51 | -------------------------------------------------------------------------------- /gensim/test/test_data/testsummarization_unrelated.txt: -------------------------------------------------------------------------------- 1 | River lake island mountain area. 2 | Relay athletics metres freestyle hurdles. 3 | Were court law government police. 4 | Courcelles centimeters mattythewhite wine stamps. 5 | Sysop iran pakistan ali arab. 6 | Copyrighted northamerica rihanna cloudz knowles. 7 | Israel sockpuppet jerusalem palestinian ifk. 
8 | Melbourne rovers australian wanderers dinamo. 9 | Film series episode television. 10 | Wrestling chateau ligue discus estonian. 11 | Edits notability archived clearer speedy. 12 | Admins acid molniya chemical compound. 13 | India tamil singh temple kumar. 14 | Bwebs malta hobart basa columella huon. 15 | Rabbi bgwhite lebanese beirut caligari. 16 | German berlin kategorie cross. 17 | System power energy data. 18 | Indonesia malaysia singapore greek jakarta. 19 | Stakes webs futsal whitish thoroughbred racehorse. 20 | Oblast uploaded nordland selsoviet halogaland. -------------------------------------------------------------------------------- /docs/src/apiref.rst: -------------------------------------------------------------------------------- 1 | .. _apiref: 2 | 3 | API Reference 4 | ============= 5 | 6 | Modules: 7 | 8 | .. toctree:: 9 | :maxdepth: 0 10 | 11 | interfaces 12 | utils 13 | matutils 14 | corpora/bleicorpus 15 | corpora/dictionary 16 | corpora/hashdictionary 17 | corpora/lowcorpus 18 | corpora/mmcorpus 19 | corpora/svmlightcorpus 20 | corpora/wikicorpus 21 | corpora/textcorpus 22 | corpora/ucicorpus 23 | corpora/indexedcorpus 24 | models/ldamodel 25 | models/ldamulticore 26 | models/lsimodel 27 | models/tfidfmodel 28 | models/rpmodel 29 | models/hdpmodel 30 | models/logentropy_model 31 | models/lsi_dispatcher 32 | models/lsi_worker 33 | models/lda_dispatcher 34 | models/lda_worker 35 | models/word2vec 36 | models/doc2vec 37 | models/phrases 38 | models/wrappers/ldamallet 39 | models/wrappers/dtmmodel 40 | models/wrappers/ldavowpalwabbit.rst 41 | similarities/docsim 42 | similarities/simserver 43 | 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | *.pyc 10 | 11 | # Packages # 12 | ############ 13 | # it's better 
to unpack these files and commit the raw source 14 | # git has its own built in compression methods 15 | *.7z 16 | *.dmg 17 | *.gz 18 | *.iso 19 | *.jar 20 | *.rar 21 | *.tar 22 | *.zip 23 | 24 | # Logs and databases # 25 | ###################### 26 | *.log 27 | *.sql 28 | *.sqlite 29 | *.pkl 30 | *.bak 31 | *.npy 32 | *.npz 33 | 34 | # OS generated files # 35 | ###################### 36 | .DS_Store? 37 | ehthumbs.db 38 | Icon? 39 | Thumbs.db 40 | 41 | # Other # 42 | ######### 43 | .project 44 | .pydevproject 45 | .ropeproject 46 | .settings/ 47 | .eggs 48 | cython_debug 49 | docs/src/_build/ 50 | docs/_static 51 | dedan_gensim.tmproj 52 | gensim*.egg-info 53 | *,cover 54 | .idea 55 | *.dict 56 | *.index 57 | .coverage 58 | .*.sw[op] 59 | data 60 | *.bak 61 | /build/ 62 | /dist/ 63 | *.prof 64 | *.lprof 65 | *.bin 66 | *.old 67 | *.model 68 | *~ 69 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/runall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # full path to gensim executables 4 | BIN_PATH=~/xrehurek/gensim/dmlcz 5 | 6 | # intermediate data will be stored to this dir 7 | RESULT_PATH=~/xrehurek/results 8 | 9 | # set python path, so that python can find and import gensim modules 10 | export PYTHONPATH=~/xrehurek:$PYTHONPATH 11 | 12 | # Language is set to 'any', meaning all articles are processed for similarity in 13 | # one go, regardless of their language. 14 | # Set language to 'eng', 'fre', 'rus' etc. to only process a specific subset of 15 | # articles (an article's language is determined from its metadata). 
16 | language=any 17 | 18 | 19 | # ========== parse all article sources, build article co-occurence matrix ====== 20 | ${BIN_PATH}/gensim_build.py $language 2>&1 | tee ${RESULT_PATH}/gensim_build.log 21 | 22 | 23 | # ========== build transformation models ======================================= 24 | for method in tfidf rp; 25 | do 26 | ( ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) & 27 | done 28 | wait 29 | 30 | method=lsi 31 | ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log 32 | 33 | 34 | # =========== generate output xml files ======================================== 35 | # generate xml files for all methods at once, in parallel, to save time. 36 | # NOTE if out of memory, move tfidf out of the loop (tfidf uses a lot of memory here) 37 | for method in tfidf lsi rp; 38 | do 39 | ( ${BIN_PATH}/gensim_xml.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) & 40 | done 41 | wait 42 | -------------------------------------------------------------------------------- /gensim/nosy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A simple testrunner for nose (or anything else). 5 | 6 | Watch for changes in all file types specified in 'EXTENSIONS'. 7 | If changes, run test executable in 'EXECUTABLE', with default 8 | arguments 'DEFAULTARGS'. 9 | 10 | The --with-color option needs the "rudolf" nose plugin. 
See: 11 | http://pypi.python.org/pypi/rudolf/ 12 | 13 | Originally by Jeff Winkler, http://jeffwinkler.net 14 | Forked from wkral http://github.com/wkral/Nosy 15 | """ 16 | 17 | import os 18 | import stat 19 | import time 20 | import datetime 21 | import sys 22 | import fnmatch 23 | 24 | 25 | EXTENSIONS = ['*.py'] 26 | EXECUTABLE = 'nosetests test/' 27 | DEFAULTARGS = '--with-color -exe'# -w tests' 28 | 29 | 30 | def checkSum(): 31 | """ 32 | Return a long which can be used to know if any .py files have changed. 33 | """ 34 | val = 0 35 | for root, dirs, files in os.walk(os.getcwd()): 36 | for extension in EXTENSIONS: 37 | for f in fnmatch.filter(files, extension): 38 | stats = os.stat(os.path.join(root, f)) 39 | val += stats[stat.ST_SIZE] + stats[stat.ST_MTIME] 40 | return val 41 | 42 | if __name__ == '__main__': 43 | val = 0 44 | try: 45 | while True: 46 | if checkSum() != val: 47 | val = checkSum() 48 | os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, 49 | ' '.join(sys.argv[1:]))) 50 | print(datetime.datetime.now().__str__()) 51 | print('=' * 77) 52 | time.sleep(1) 53 | except KeyboardInterrupt: 54 | print('Goodbye') 55 | -------------------------------------------------------------------------------- /gensim/corpora/mmcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Corpus in the Matrix Market format. 10 | """ 11 | 12 | 13 | import logging 14 | 15 | from gensim import interfaces, matutils 16 | from gensim.corpora import IndexedCorpus 17 | 18 | 19 | logger = logging.getLogger('gensim.corpora.mmcorpus') 20 | 21 | 22 | class MmCorpus(matutils.MmReader, IndexedCorpus): 23 | """ 24 | Corpus in the Matrix Market format. 
25 | """ 26 | def __init__(self, fname): 27 | # avoid calling super(), too confusing 28 | IndexedCorpus.__init__(self, fname) 29 | matutils.MmReader.__init__(self, fname) 30 | 31 | def __iter__(self): 32 | """ 33 | Interpret a matrix in Matrix Market format as a streamed gensim corpus 34 | (yielding one document at a time). 35 | """ 36 | for doc_id, doc in super(MmCorpus, self).__iter__(): 37 | yield doc # get rid of doc id, return the sparse vector only 38 | 39 | @staticmethod 40 | def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): 41 | """ 42 | Save a corpus in the Matrix Market format to disk. 43 | 44 | This function is automatically called by `MmCorpus.serialize`; don't 45 | call it directly, call `serialize` instead. 46 | """ 47 | logger.info("storing corpus in Matrix Market format to %s" % fname) 48 | num_terms = len(id2word) if id2word is not None else None 49 | return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata) 50 | 51 | # endclass MmCorpus 52 | -------------------------------------------------------------------------------- /docs/src/support.rst: -------------------------------------------------------------------------------- 1 | .. _support: 2 | 3 | ============= 4 | Support 5 | ============= 6 | 7 | Open source support 8 | -------------------- 9 | 10 | The main communication channel is the `gensim mailing list `_. 11 | This is the preferred way to **ask for help**, **report problems** and **share insights** with the community. Newbie questions are perfectly fine, just make sure you've read the :doc:`tutorials `. 12 | 13 | I discourage sending private emails, because the mailing list serves as a knowledge base for all gensim users, cutting maintenance efforts needed for support. If you feel your problem is too special, data too sensitive, technical scope too demanding, **see the "business" section below**. 
14 | 15 | When posting on the mailing list, try to include all relevant information, such as what it is you are trying to achieve, what went wrong, relevant gensim logs, package versions etc. 16 | 17 | **FAQ** and some useful **snippets of code** are maintained on GitHub: https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ. 18 | 19 | You can also try asking on StackOverflow, using the `gensim tag `_. 20 | 21 | 22 | Business support 23 | ------------------ 24 | 25 | I also run a consulting business for data mining and information retrieval solutions, `rare-technologies.com `_. 26 | 27 | In case you need commercial support, design validation, technical training or custom system development, `get in touch `_ for a quote. 28 | 29 | Developer support 30 | ------------------ 31 | 32 | Developers who `tweak gensim internals `_ are encouraged to report issues at the `GitHub issue tracker `_. 33 | Note that this is not a medium for discussions or asking open-ended questions; please use the mailing list for that. 34 | -------------------------------------------------------------------------------- /docs/src/gensim_theme/search.html: -------------------------------------------------------------------------------- 1 | {# 2 | basic/search.html 3 | ~~~~~~~~~~~~~~~~~ 4 | 5 | Template for the search page. 6 | 7 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | :license: BSD, see LICENSE for details. 9 | #} 10 | {% extends "layout.html" %} 11 | {% set title = _('Search') %} 12 | {% set script_files = script_files + ['_static/searchtools.js'] %} 13 | {% block extrahead %} 14 | 17 | {{ super() }} 18 | {% endblock %} 19 | {% block body %} 20 |

{{ _('Search') }}

21 |
22 | 23 |

24 | {% trans %}Please activate JavaScript to enable the search 25 | functionality.{% endtrans %} 26 |

27 |
28 |

29 | {% trans %}From here you can search these documents. Enter your search 30 | words into the box below and click "search". Note that the search 31 | function will automatically search for all of the words. Pages 32 | containing fewer words won't appear in the result list.{% endtrans %} 33 |

34 |
35 | 36 | 37 | 38 |
39 | {% if search_performed %} 40 |

{{ _('Search Results') }}

41 | {% if not search_results %} 42 |

{{ _('Your search did not match any results.') }}

43 | {% endif %} 44 | {% endif %} 45 |
46 | {% if search_results %} 47 |
    48 | {% for href, caption, context in search_results %} 49 |
  • {{ caption }} 50 |
    {{ context|e }}
    51 |
  • 52 | {% endfor %} 53 |
54 | {% endif %} 55 |
56 | {% endblock %} 57 | -------------------------------------------------------------------------------- /gensim/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains algorithms for extracting document representations from their raw 3 | bag-of-word counts. 4 | """ 5 | 6 | # bring model classes directly into package namespace, to save some typing 7 | from .hdpmodel import HdpModel 8 | from .ldamodel import LdaModel 9 | from .lsimodel import LsiModel 10 | from .tfidfmodel import TfidfModel 11 | from .rpmodel import RpModel 12 | from .logentropy_model import LogEntropyModel 13 | from .word2vec import Word2Vec 14 | from .doc2vec import Doc2Vec 15 | from .ldamulticore import LdaMulticore 16 | from .phrases import Phrases 17 | 18 | from . import wrappers 19 | 20 | from gensim import interfaces, utils 21 | 22 | 23 | class VocabTransform(interfaces.TransformationABC): 24 | """ 25 | Remap feature ids to new values. 26 | 27 | Given a mapping between old ids and new ids (some old ids may be missing = these 28 | features are to be discarded), this will wrap a corpus so that iterating over 29 | `VocabTransform[corpus]` returns the same vectors but with the new ids. 30 | 31 | Old features that have no counterpart in the new ids are discarded. This 32 | can be used to filter vocabulary of a corpus "online":: 33 | 34 | >>> old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep)) 35 | >>> vt = VocabTransform(old2new) 36 | >>> for vec_with_new_ids in vt[corpus_with_old_ids]: 37 | >>> ... 38 | 39 | """ 40 | def __init__(self, old2new, id2token=None): 41 | # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems()) 42 | self.old2new = old2new 43 | self.id2token = id2token 44 | 45 | 46 | def __getitem__(self, bow): 47 | """ 48 | Return representation with the ids transformed. 
49 | """ 50 | # if the input vector is in fact a corpus, return a transformed corpus as a result 51 | is_corpus, bow = utils.is_corpus(bow) 52 | if is_corpus: 53 | return self._apply(bow) 54 | 55 | return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new) 56 | #endclass VocabTransform 57 | -------------------------------------------------------------------------------- /gensim/summarization/pagerank_weighted.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | from numpy import empty as empty_matrix 6 | from scipy.sparse import csr_matrix 7 | from scipy.sparse.linalg import eigs 8 | from six.moves import xrange 9 | 10 | try: 11 | from numpy import VisibleDeprecationWarning 12 | import warnings 13 | warnings.filterwarnings("ignore", category=VisibleDeprecationWarning) 14 | except ImportError: 15 | pass 16 | 17 | 18 | def pagerank_weighted(graph, damping=0.85): 19 | adjacency_matrix = build_adjacency_matrix(graph) 20 | probability_matrix = build_probability_matrix(graph) 21 | 22 | pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix 23 | 24 | vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors? 
25 | 26 | return process_results(graph, vecs.real) 27 | 28 | 29 | def build_adjacency_matrix(graph): 30 | row = [] 31 | col = [] 32 | data = [] 33 | nodes = graph.nodes() 34 | length = len(nodes) 35 | 36 | for i in xrange(length): 37 | current_node = nodes[i] 38 | neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node)) 39 | for j in xrange(length): 40 | edge_weight = float(graph.edge_weight((current_node, nodes[j]))) 41 | if i != j and edge_weight != 0.0: 42 | row.append(i) 43 | col.append(j) 44 | data.append(edge_weight / neighbors_sum) 45 | 46 | return csr_matrix((data, (row, col)), shape=(length, length)) 47 | 48 | 49 | def build_probability_matrix(graph): 50 | dimension = len(graph.nodes()) 51 | matrix = empty_matrix((dimension, dimension)) 52 | 53 | probability = 1.0 / float(dimension) 54 | matrix.fill(probability) 55 | 56 | return matrix 57 | 58 | 59 | def process_results(graph, vecs): 60 | scores = {} 61 | for i, node in enumerate(graph.nodes()): 62 | scores[node] = abs(vecs[i, :]) 63 | 64 | return scores 65 | -------------------------------------------------------------------------------- /gensim/corpora/csvcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2013 Zygmunt Zając 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Corpus in CSV format. 9 | 10 | """ 11 | 12 | 13 | from __future__ import with_statement 14 | 15 | import logging 16 | import csv 17 | import itertools 18 | 19 | from gensim import interfaces, utils 20 | 21 | logger = logging.getLogger('gensim.corpora.csvcorpus') 22 | 23 | 24 | class CsvCorpus(interfaces.CorpusABC): 25 | """ 26 | Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically 27 | based on the file content. 28 | 29 | All row values are expected to be ints/floats. 
30 | 31 | """ 32 | 33 | def __init__(self, fname, labels): 34 | """ 35 | Initialize the corpus from a file. 36 | `labels` = are class labels present in the input file? => skip the first column 37 | 38 | """ 39 | logger.info("loading corpus from %s" % fname) 40 | self.fname = fname 41 | self.length = None 42 | self.labels = labels 43 | 44 | # load the first few lines, to guess the CSV dialect 45 | head = ''.join(itertools.islice(utils.smart_open(self.fname), 5)) 46 | self.headers = csv.Sniffer().has_header(head) 47 | self.dialect = csv.Sniffer().sniff(head) 48 | logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers)) 49 | 50 | def __iter__(self): 51 | """ 52 | Iterate over the corpus, returning one sparse vector at a time. 53 | 54 | """ 55 | reader = csv.reader(utils.smart_open(self.fname), self.dialect) 56 | if self.headers: 57 | next(reader) # skip the headers 58 | 59 | line_no = -1 60 | for line_no, line in enumerate(reader): 61 | if self.labels: 62 | line.pop(0) # ignore the first column = class label 63 | yield list(enumerate(map(float, line))) 64 | 65 | self.length = line_no + 1 # store the total number of CSV rows = documents 66 | 67 | # endclass CsvCorpus 68 | -------------------------------------------------------------------------------- /docs/src/gensim_theme/domainindex.html: -------------------------------------------------------------------------------- 1 | {# 2 | basic/domainindex.html 3 | ~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | Template for domain indices (module index, ...). 6 | 7 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | :license: BSD, see LICENSE for details. 9 | #} 10 | {% extends "layout.html" %} 11 | {% set title = indextitle %} 12 | {% block extrahead %} 13 | {{ super() }} 14 | {% if not embedded and collapse_index %} 15 | 18 | {% endif %} 19 | {% endblock %} 20 | {% block body %} 21 | 22 | {%- set groupid = idgen() %} 23 | 24 |

{{ indextitle }}

25 | 26 |
27 | {%- for (letter, entries) in content %} 28 | {{ letter }} 29 | {%- if not loop.last %} | {% endif %} 30 | {%- endfor %} 31 |
32 | 33 | 34 | {%- for letter, entries in content %} 35 | 36 | 38 | {%- for (name, grouptype, page, anchor, extra, qualifier, description) 39 | in entries %} 40 | 41 | 45 | 52 | {%- endfor %} 53 | {%- endfor %} 54 |
 
37 | {{ letter }}
{% if grouptype == 1 -%} 42 | 44 | {%- endif %}{% if grouptype == 2 %}   {% endif %} 46 | {% if page %}{% endif -%} 47 | {{ name|e }} 48 | {%- if page %}{% endif %} 49 | {%- if extra %} ({{ extra|e }}){% endif -%} 50 | {% if qualifier %}{{ qualifier|e }}:{% endif %} 51 | {{ description|e }}
55 | 56 | {% endblock %} 57 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.txt: -------------------------------------------------------------------------------- 1 | AP880911-0016 2 | AP-NR-09-11-88 0423EDT r i 3 | BC-HurricaneGilbert 09-11 0339 4 | BC-Hurricane Gilbert,0348 5 | Hurricane Gilbert Heads Toward Dominican Coast 6 | By RUDDY GONZALEZ 7 | Associated Press Writer 8 | SANTO DOMINGO, Dominican Republic (AP) 9 | Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas. 10 | The storm was approaching from the southeast with sustained winds of 75 mph gusting to 92 mph. 11 | ``There is no need for alarm,'' Civil Defense Director Eugenio Cabral said in a television alert shortly before midnight Saturday. 12 | Cabral said residents of the province of Barahona should closely follow Gilbert's movement. 13 | An estimated 100,000 people live in the province, including 70,000 in the city of Barahona, about 125 miles west of Santo Domingo. 14 | Tropical Storm Gilbert formed in the eastern Caribbean and strengthened into a hurricane Saturday night. 15 | The National Hurricane Center in Miami reported its position at 2 a.m. Sunday at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, Puerto Rico, and 200 miles southeast of Santo Domingo. 16 | The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving westward at 15 mph with a ``broad area of cloudiness and heavy weather'' rotating around the center of the storm. 17 | The weather service issued a flash flood watch for Puerto Rico and the Virgin Islands until at least 6 p.m. Sunday. 18 | Strong winds associated with the Gilbert brought coastal flooding, strong southeast winds and up to 12 feet feet to Puerto Rico's south coast. 19 | There were no reports of casualties. 
20 | San Juan, on the north coast, had heavy rains and gusts Saturday, but they subsided during the night. 21 | On Saturday, Hurricane Florence was downgraded to a tropical storm and its remnants pushed inland from the U.S. Gulf Coast. 22 | Residents returned home, happy to find little damage from 80 mph winds and sheets of rain. 23 | Florence, the sixth named storm of the 1988 Atlantic storm season, was the second hurricane. 24 | The first, Debby, reached minimal hurricane strength briefly before hitting the Mexican coast last month. -------------------------------------------------------------------------------- /gensim/test/test_hdpmodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Automated tests for checking transformation algorithms (the models package). 
9 | """ 10 | 11 | 12 | import logging 13 | import unittest 14 | import os 15 | import os.path 16 | import tempfile 17 | 18 | import six 19 | import numpy 20 | import scipy.linalg 21 | 22 | from gensim.corpora import mmcorpus, Dictionary 23 | from gensim.models import hdpmodel 24 | from gensim import matutils 25 | 26 | 27 | module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder 28 | datapath = lambda fname: os.path.join(module_path, 'test_data', fname) 29 | 30 | 31 | # set up vars used in testing ("Deerwester" from the web tutorial) 32 | texts = [['human', 'interface', 'computer'], 33 | ['survey', 'user', 'computer', 'system', 'response', 'time'], 34 | ['eps', 'user', 'interface', 'system'], 35 | ['system', 'human', 'system', 'eps'], 36 | ['user', 'response', 'time'], 37 | ['trees'], 38 | ['graph', 'trees'], 39 | ['graph', 'minors', 'trees'], 40 | ['graph', 'minors', 'survey']] 41 | dictionary = Dictionary(texts) 42 | corpus = [dictionary.doc2bow(text) for text in texts] 43 | 44 | 45 | def testfile(): 46 | # temporary data will be stored to this file 47 | return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') 48 | 49 | 50 | 51 | class TestHdpModel(unittest.TestCase): 52 | def setUp(self): 53 | self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) 54 | self.class_ = hdpmodel.HdpModel 55 | self.model = self.class_(corpus, id2word=dictionary) 56 | 57 | def testShowTopics(self): 58 | topics = self.model.show_topics(formatted=False) 59 | 60 | for topic_no, topic in topics: 61 | self.assertTrue(isinstance(topic_no, int)) 62 | self.assertTrue(isinstance(topic, list)) 63 | for k, v in topic: 64 | self.assertTrue(isinstance(k, six.string_types)) 65 | self.assertTrue(isinstance(v, float)) 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 71 | unittest.main() 72 | 
-------------------------------------------------------------------------------- /docs/src/gensim_theme/genindex.html: -------------------------------------------------------------------------------- 1 | {# 2 | basic/genindex.html 3 | ~~~~~~~~~~~~~~~~~~~ 4 | 5 | Template for an "all-in-one" index. 6 | 7 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | :license: BSD, see LICENSE for details. 9 | #} 10 | {% macro indexentries(firstname, links) %} 11 |
12 | {%- if links -%} 13 | 14 | {%- if links[0][0] %}{% endif -%} 15 | {{ firstname|e }} 16 | {%- if links[0][0] %}{% endif -%} 17 | 18 | 19 | {%- for ismain, link in links[1:] -%} 20 | , {% if ismain %}{% endif -%} 21 | [{{ loop.index }}] 22 | {%- if ismain %}{% endif -%} 23 | 24 | {%- endfor %} 25 | {%- else %} 26 | {{ firstname|e }} 27 | {%- endif %} 28 |
29 | {% endmacro %} 30 | 31 | {% extends "layout.html" %} 32 | {% set title = _('Index') %} 33 | {% block body %} 34 | 35 |

{{ _('Index') }}

36 | 37 |
38 | {% for key, dummy in genindexentries -%} 39 | {{ key }} 40 | {% if not loop.last %}| {% endif %} 41 | {%- endfor %} 42 |
43 | 44 | {%- for key, entries in genindexentries %} 45 |

{{ key }}

46 | 47 | {%- for column in entries|slice(2) if column %} 48 | 60 | {%- endfor %} 61 |
49 | {%- for entryname, (links, subitems) in column %} 50 | {{ indexentries(entryname, links) }} 51 | {%- if subitems %} 52 |
53 | {%- for subentryname, subentrylinks in subitems %} 54 | {{ indexentries(subentryname, subentrylinks) }} 55 | {%- endfor %} 56 |
57 | {%- endif -%} 58 | {%- endfor %} 59 |
62 | {% endfor %} 63 | 64 | {% endblock %} 65 | 66 | {% block sidebarrel %} 67 | {% if split_index %} 68 |

{{ _('Index') }}

69 |

{% for key, dummy in genindexentries -%} 70 | {{ key }} 71 | {% if not loop.last %}| {% endif %} 72 | {%- endfor %}

73 | 74 |

{{ _('Full index on one page') }}

75 | {% endif %} 76 | {{ super() }} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /gensim/summarization/bm25.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | import math 7 | from six import iteritems 8 | from six.moves import xrange 9 | 10 | 11 | # BM25 parameters. 12 | PARAM_K1 = 1.5 13 | PARAM_B = 0.75 14 | EPSILON = 0.25 15 | 16 | 17 | class BM25(object): 18 | 19 | def __init__(self, corpus): 20 | self.corpus_size = len(corpus) 21 | self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size 22 | self.corpus = corpus 23 | self.f = [] 24 | self.df = {} 25 | self.idf = {} 26 | self.initialize() 27 | 28 | def initialize(self): 29 | for document in self.corpus: 30 | frequencies = {} 31 | for word in document: 32 | if word not in frequencies: 33 | frequencies[word] = 0 34 | frequencies[word] += 1 35 | self.f.append(frequencies) 36 | 37 | for word, freq in iteritems(frequencies): 38 | if word not in self.df: 39 | self.df[word] = 0 40 | self.df[word] += 1 41 | 42 | for word, freq in iteritems(self.df): 43 | self.idf[word] = math.log(self.corpus_size-freq+0.5) - math.log(freq+0.5) 44 | 45 | def get_score(self, document, index, average_idf): 46 | score = 0 47 | for word in document: 48 | if word not in self.f[index]: 49 | continue 50 | idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf 51 | score += (idf*self.f[index][word]*(PARAM_K1+1) 52 | / (self.f[index][word] + PARAM_K1*(1 - PARAM_B+PARAM_B*self.corpus_size / self.avgdl))) 53 | return score 54 | 55 | def get_scores(self, document, average_idf): 56 | scores = [] 57 | for index in xrange(self.corpus_size): 58 | score = self.get_score(document, index, average_idf) 59 | scores.append(score) 60 | return scores 61 | 62 | 63 | def 
get_bm25_weights(corpus): 64 | bm25 = BM25(corpus) 65 | average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) 66 | 67 | weights = [] 68 | for doc in corpus: 69 | scores = bm25.get_scores(doc, average_idf) 70 | weights.append(scores) 71 | 72 | return weights 73 | -------------------------------------------------------------------------------- /gensim/models/word2vec_inner.pxd: -------------------------------------------------------------------------------- 1 | # 2 | # shared type definitions for word2vec_inner 3 | # used by both word2vec_inner.pyx (automatically) and doc2vec_inner.pyx (by explicit cimport) 4 | # 5 | # Copyright (C) 2013 Radim Rehurek 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.htmlcimport numpy as np 7 | 8 | cdef extern from "voidptr.h": 9 | void* PyCObject_AsVoidPtr(object obj) 10 | 11 | cimport numpy as np 12 | ctypedef np.float32_t REAL_t 13 | 14 | # BLAS routine signatures 15 | ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil 16 | ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 17 | ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 18 | ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 19 | ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil 20 | ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil 21 | 22 | cdef scopy_ptr scopy 23 | cdef saxpy_ptr saxpy 24 | cdef sdot_ptr sdot 25 | cdef dsdot_ptr dsdot 26 | cdef snrm2_ptr snrm2 27 | cdef sscal_ptr sscal 28 | 29 | # precalculated sigmoid table 30 | DEF EXP_TABLE_SIZE = 1000 31 | DEF MAX_EXP = 6 32 | cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE 33 | 34 | # function implementations swapped based 
on BLAS detected in word2vec_inner.pyx init() 35 | ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 36 | ctypedef void (*our_saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 37 | 38 | cdef our_dot_ptr our_dot 39 | cdef our_saxpy_ptr our_saxpy 40 | 41 | # for when fblas.sdot returns a double 42 | cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 43 | 44 | # for when fblas.sdot returns a float 45 | cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 46 | 47 | # for when no blas available 48 | cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 49 | cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 50 | 51 | # to support random draws from negative-sampling cum_table 52 | cdef unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil 53 | 54 | cdef unsigned long long random_int32(unsigned long long *next_random) nogil 55 | -------------------------------------------------------------------------------- /gensim/test/test_big.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2014 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Automated tests for checking processing/storing large inputs. 
9 | """ 10 | 11 | 12 | import logging 13 | import unittest 14 | import os 15 | import itertools 16 | import tempfile 17 | 18 | import numpy 19 | 20 | import gensim 21 | 22 | 23 | def testfile(): 24 | # temporary data will be stored to this file 25 | return os.path.join(tempfile.gettempdir(), 'gensim_big.tst') 26 | 27 | 28 | class BigCorpus(object): 29 | """A corpus of a large number of docs & large vocab""" 30 | def __init__(self, words_only=False, num_terms=200000, num_docs=1000000, doc_len=100): 31 | self.dictionary = gensim.utils.FakeDict(num_terms) 32 | self.words_only = words_only 33 | self.num_docs = num_docs 34 | self.doc_len = doc_len 35 | 36 | def __iter__(self): 37 | for _ in range(self.num_docs): 38 | doc_len = numpy.random.poisson(self.doc_len) 39 | ids = numpy.random.randint(0, len(self.dictionary), doc_len) 40 | if self.words_only: 41 | yield [str(id) for id in ids] 42 | else: 43 | weights = numpy.random.poisson(3, doc_len) 44 | yield sorted(zip(ids, weights)) 45 | 46 | 47 | if os.environ.get('GENSIM_BIG', False): 48 | class TestLargeData(unittest.TestCase): 49 | """Try common operations, using large models. 
You'll need ~8GB RAM to run these tests""" 50 | def testWord2Vec(self): 51 | corpus = BigCorpus(words_only=True, num_docs=100000, num_terms=3000000, doc_len=200) 52 | model = gensim.models.Word2Vec(corpus, size=300, workers=4) 53 | model.save(testfile(), ignore=['syn1']) 54 | del model 55 | model = gensim.models.Word2Vec.load(testfile()) 56 | 57 | def testLsiModel(self): 58 | corpus = BigCorpus(num_docs=50000) 59 | model = gensim.models.LsiModel(corpus, num_topics=500, id2word=corpus.dictionary) 60 | model.save(testfile()) 61 | del model 62 | model = gensim.models.LsiModel.load(testfile()) 63 | 64 | def testLdaModel(self): 65 | corpus = BigCorpus(num_docs=5000) 66 | model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary) 67 | model.save(testfile()) 68 | del model 69 | model = gensim.models.LdaModel.load(testfile()) 70 | 71 | 72 | if __name__ == '__main__': 73 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/gensim_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (C) 2010 Radim Rehurek 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | USAGE: %(program)s LANGUAGE 8 | Process the repository, accepting articles in LANGUAGE (or 'any'). 9 | Store the word co-occurence matrix and id mappings, which are needed for subsequent processing. 
10 | 11 | Example: ./gensim_build.py eng 12 | """ 13 | 14 | 15 | import logging 16 | import sys 17 | import os.path 18 | import re 19 | 20 | 21 | from gensim.corpora import sources, dmlcorpus 22 | 23 | 24 | PREFIX = 'dmlcz' 25 | 26 | AT_HOME = False 27 | 28 | if AT_HOME: 29 | SOURCE_LIST = [ 30 | sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'), 31 | sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'), 32 | sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'), 33 | ] 34 | 35 | # SOURCE_LIST = [ 36 | # sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'), 37 | # ] 38 | 39 | RESULT_DIR = '/Users/kofola/workspace/dml/data/results' 40 | 41 | else: 42 | 43 | SOURCE_LIST = [ 44 | sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), 45 | sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), 46 | sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), 47 | ] 48 | 49 | RESULT_DIR = '/data/dmlcz/xrehurek/results' 50 | 51 | 52 | def buildDmlCorpus(config): 53 | dml = dmlcorpus.DmlCorpus() 54 | dml.processConfig(config, shuffle = True) 55 | dml.buildDictionary() 56 | dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words 57 | 58 | dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) 59 | dml.saveAsText() # save id mappings and documents as text data (matrix market format) 60 | return dml 61 | 62 | 63 | if __name__ == '__main__': 64 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 65 | logging.root.setLevel(level=logging.INFO) 66 | logging.info("running %s" % ' '.join(sys.argv)) 67 | 68 | program = os.path.basename(sys.argv[0]) 69 | 70 | # check and process input arguments 71 | if len(sys.argv) < 2: 72 | print(globals()['__doc__'] % locals()) 73 | sys.exit(1) 74 | language = sys.argv[1] 75 | 76 | # construct the config, which holds information about 
sources, data file filenames etc. 77 | config = dmlcorpus.DmlConfig('%s_%s' % (PREFIX, language), resultDir=RESULT_DIR, acceptLangs=[language]) 78 | for source in SOURCE_LIST: 79 | config.addSource(source) 80 | buildDmlCorpus(config) 81 | 82 | logging.info("finished running %s" % program) 83 | -------------------------------------------------------------------------------- /gensim/test/test_dtm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Automated tests for DTM/DIM model 6 | """ 7 | 8 | 9 | import logging 10 | from subprocess import CalledProcessError 11 | import gensim 12 | import os 13 | import sys 14 | import unittest 15 | from gensim import corpora 16 | 17 | 18 | # needed because sample data files are located in the same folder 19 | module_path = os.path.dirname(__file__) 20 | datapath = lambda fname: os.path.join(module_path, 'test_data', fname) 21 | 22 | 23 | class TestDtmModel(unittest.TestCase): 24 | 25 | def setUp(self): 26 | self.time_slices = [3, 7] 27 | self.corpus = corpora.mmcorpus.MmCorpus(datapath('dtm_test.mm')) 28 | self.id2word = corpora.Dictionary.load(datapath('dtm_test.dict')) 29 | # first you need to setup the environment variable $DTM_PATH for the dtm executable file 30 | self.dtm_path = os.environ.get('DTM_PATH', None) 31 | if self.dtm_path is None: 32 | if sys.version_info >= (2, 7, 0): 33 | self.skipTest("$DTM_PATH is not properly set up.") 34 | else: 35 | logging.warning("$DTM_PATH is not properly set up.") 36 | 37 | def testDtm(self): 38 | if self.dtm_path is not None: 39 | model = gensim.models.wrappers.DtmModel( 40 | self.dtm_path, self.corpus, self.time_slices, num_topics=2, 41 | id2word=self.id2word, model='dtm', initialize_lda=True, 42 | rng_seed=1) 43 | topics = model.show_topics(topics=2, times=2, topn=10) 44 | self.assertEqual(len(topics), 4) 45 | 46 | one_topic = model.show_topic(topicid=1, time=1, topn=10) 47 | 
self.assertEqual(len(one_topic), 10) 48 | self.assertEqual(one_topic[0][1], u'idexx') 49 | 50 | def testDim(self): 51 | if self.dtm_path is not None: 52 | model = gensim.models.wrappers.DtmModel( 53 | self.dtm_path, self.corpus, self.time_slices, num_topics=2, 54 | id2word=self.id2word, model='fixed', initialize_lda=True, 55 | rng_seed=1) 56 | topics = model.show_topics(topics=2, times=2, topn=10) 57 | self.assertEqual(len(topics), 4) 58 | 59 | one_topic = model.show_topic(topicid=1, time=1, topn=10) 60 | self.assertEqual(len(one_topic), 10) 61 | self.assertEqual(one_topic[0][1], u'skills') 62 | 63 | # In stderr expect "Error opening file /tmp/a65419_train_out/initial-lda-ss.dat. Failing." 64 | def testCalledProcessError(self): 65 | if self.dtm_path is not None: 66 | with self.assertRaises(CalledProcessError): 67 | gensim.models.wrappers.DtmModel( 68 | self.dtm_path, self.corpus, self.time_slices, num_topics=2, 69 | id2word=self.id2word, model='dtm', initialize_lda=False, 70 | rng_seed=1) 71 | 72 | if __name__ == '__main__': 73 | logging.basicConfig(level=logging.DEBUG) 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /gensim/test/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | Automated tests for checking various utils functions. 
8 | """ 9 | 10 | 11 | import logging 12 | import unittest 13 | 14 | from gensim import utils 15 | 16 | 17 | class TestIsCorpus(unittest.TestCase): 18 | def test_None(self): 19 | # test None 20 | result = utils.is_corpus(None) 21 | expected = (False, None) 22 | self.assertEqual(expected, result) 23 | 24 | def test_simple_lists_of_tuples(self): 25 | # test list words 26 | 27 | # one document, one word 28 | potentialCorpus = [[(0, 4.)]] 29 | result = utils.is_corpus(potentialCorpus) 30 | expected = (True, potentialCorpus) 31 | self.assertEqual(expected, result) 32 | 33 | # one document, several words 34 | potentialCorpus = [[(0, 4.), (1, 2.)]] 35 | result = utils.is_corpus(potentialCorpus) 36 | expected = (True, potentialCorpus) 37 | self.assertEqual(expected, result) 38 | 39 | potentialCorpus = [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]] 40 | result = utils.is_corpus(potentialCorpus) 41 | expected = (True, potentialCorpus) 42 | self.assertEqual(expected, result) 43 | 44 | # several documents, one word 45 | potentialCorpus = [[(0, 4.)], [(1, 2.)]] 46 | result = utils.is_corpus(potentialCorpus) 47 | expected = (True, potentialCorpus) 48 | self.assertEqual(expected, result) 49 | 50 | potentialCorpus = [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]] 51 | result = utils.is_corpus(potentialCorpus) 52 | expected = (True, potentialCorpus) 53 | self.assertEqual(expected, result) 54 | 55 | def test_int_tuples(self): 56 | potentialCorpus = [[(0, 4)]] 57 | result = utils.is_corpus(potentialCorpus) 58 | expected = (True, potentialCorpus) 59 | self.assertEqual(expected, result) 60 | 61 | def test_invalid_formats(self): 62 | # test invalid formats 63 | # these are no corpus, because they do not consists of 2-tuples with 64 | # the form(int, float). 
65 | potentials = list() 66 | potentials.append(["human"]) 67 | potentials.append("human") 68 | potentials.append(["human", "star"]) 69 | potentials.append([1, 2, 3, 4, 5, 5]) 70 | potentials.append([[(0, 'string')]]) 71 | for noCorpus in potentials: 72 | result = utils.is_corpus(noCorpus) 73 | expected = (False, noCorpus) 74 | self.assertEqual(expected, result) 75 | 76 | 77 | class TestUtils(unittest.TestCase): 78 | def test_decode_entities(self): 79 | # create a string that fails to decode with unichr on narrow python builds 80 | body = u'It’s the Year of the Horse. YES VIN DIESEL 🙌 💯' 81 | expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af' 82 | self.assertEquals(utils.decode_htmlentities(body), expected) 83 | 84 | 85 | if __name__ == '__main__': 86 | logging.root.setLevel(logging.WARNING) 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /gensim/test/test_logentropy_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Automated tests for checking transformation algorithms (the models package). 
9 | """ 10 | 11 | 12 | import logging 13 | import unittest 14 | import os 15 | import os.path 16 | import tempfile 17 | 18 | import six 19 | import numpy 20 | import scipy.linalg 21 | 22 | from gensim.corpora import mmcorpus, Dictionary 23 | from gensim.models import logentropy_model 24 | from gensim import matutils 25 | 26 | module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder 27 | datapath = lambda fname: os.path.join(module_path, 'test_data', fname) 28 | 29 | 30 | # set up vars used in testing ("Deerwester" from the web tutorial) 31 | texts = [['human', 'interface', 'computer'], 32 | ['survey', 'user', 'computer', 'system', 'response', 'time'], 33 | ['eps', 'user', 'interface', 'system'], 34 | ['system', 'human', 'system', 'eps'], 35 | ['user', 'response', 'time'], 36 | ['trees'], 37 | ['graph', 'trees'], 38 | ['graph', 'minors', 'trees'], 39 | ['graph', 'minors', 'survey']] 40 | dictionary = Dictionary(texts) 41 | corpus = [dictionary.doc2bow(text) for text in texts] 42 | 43 | 44 | def testfile(): 45 | # temporary data will be stored to this file 46 | return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') 47 | 48 | 49 | class TestLogEntropyModel(unittest.TestCase): 50 | def setUp(self): 51 | self.corpus_small = mmcorpus.MmCorpus(datapath('test_corpus_small.mm')) 52 | self.corpus_ok = mmcorpus.MmCorpus(datapath('test_corpus_ok.mm')) 53 | 54 | 55 | def testTransform(self): 56 | # create the transformation model 57 | model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=False) 58 | 59 | # transform one document 60 | doc = list(self.corpus_ok)[0] 61 | transformed = model[doc] 62 | 63 | expected = [(0, 0.3748900964125389), 64 | (1, 0.30730215324230725), 65 | (3, 1.20941755462856)] 66 | self.assertTrue(numpy.allclose(transformed, expected)) 67 | 68 | 69 | def testPersistence(self): 70 | fname = testfile() 71 | model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True) 72 | 
model.save(fname) 73 | model2 = logentropy_model.LogEntropyModel.load(fname) 74 | self.assertTrue(model.entr == model2.entr) 75 | tstvec = [] 76 | self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) 77 | 78 | def testPersistenceCompressed(self): 79 | fname = testfile() + '.gz' 80 | model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True) 81 | model.save(fname) 82 | model2 = logentropy_model.LogEntropyModel.load(fname, mmap=None) 83 | self.assertTrue(model.entr == model2.entr) 84 | tstvec = [] 85 | self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) 86 | #endclass TestLogEntropyModel 87 | 88 | 89 | if __name__ == '__main__': 90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 91 | unittest.main() 92 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/gensim_genmodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (C) 2010 Radim Rehurek 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | USAGE: %(program)s LANGUAGE METHOD 8 | Generate topic models for the specified subcorpus. METHOD is currently one \ 9 | of 'tfidf', 'lsi', 'lda', 'rp'. 
# internal method parameters
DIM_RP = 300   # dimensionality for random projections
DIM_LSI = 200  # for latent semantic indexing
DIM_LDA = 100  # for latent dirichlet allocation


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    language, method = sys.argv[1], sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig(
        '%s_%s' % (gensim_build.PREFIX, language),
        resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])

    logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    corpus = MmCorpus(config.resultFile('bow.mm'))

    # train the requested model and persist it next to the corpus files
    if method == 'tfidf':
        model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model.save(config.resultFile('model_tfidf.pkl'))
    elif method == 'lda':
        model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
        model.save(config.resultFile('model_lda.pkl'))
    elif method == 'lsi':
        # weight raw word counts by tf-idf before finding the latent space
        tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI)
        model.save(config.resultFile('model_lsi.pkl'))
    elif method == 'rp':
        # weight raw word counts by tf-idf before finding the latent space
        tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP)
        model.save(config.resultFile('model_rp.pkl'))
    else:
        raise ValueError('unknown topic extraction method: %s' % repr(method))

    # also store the corpus transformed into the chosen model's space
    MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus])

    logging.info("finished running %s" % program)
Il a deux trous rouges au côté droit.""" 33 | 34 | doc5 = """While it is quite useful to be able to search a 35 | large collection of documents almost instantly for a joint 36 | occurrence of a collection of exact words, 37 | for many searching purposes, a little fuzziness would help. """ 38 | 39 | 40 | dataset = map(lambda x: strip_punctuation2(x.lower()), 41 | [doc1, doc2, doc3, doc4]) 42 | # doc1 and doc2 have class 0, doc3 and doc4 avec class 1 43 | classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) 44 | 45 | 46 | 47 | class TestPreprocessing(unittest.TestCase): 48 | 49 | def testStripNumeric(self): 50 | self.assertEqual(strip_numeric("salut les amis du 59"), 51 | "salut les amis du ") 52 | 53 | def testStripShort(self): 54 | self.assertEqual(strip_short("salut les amis du 59", 3), 55 | "salut les amis") 56 | 57 | def testStripTags(self): 58 | self.assertEqual(strip_tags("Hello World!"), 59 | "Hello World!") 60 | 61 | def testStripMultipleWhitespaces(self): 62 | self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"), 63 | "salut les loulous!") 64 | 65 | def testStripNonAlphanum(self): 66 | self.assertEqual(strip_non_alphanum("toto nf-kappa titi"), 67 | "toto nf kappa titi") 68 | 69 | def testSplitAlphanum(self): 70 | self.assertEqual(split_alphanum("toto diet1 titi"), 71 | "toto diet 1 titi") 72 | self.assertEqual(split_alphanum("toto 1diet titi"), 73 | "toto 1 diet titi") 74 | 75 | def testStripStopwords(self): 76 | self.assertEqual(remove_stopwords("the world is square"), 77 | "world square") 78 | 79 | def testStemText(self): 80 | target = "while it is quit us to be abl to search a larg " + \ 81 | "collect of document almost instantli for a joint occurr " + \ 82 | "of a collect of exact words, for mani search purposes, " + \ 83 | "a littl fuzzi would help." 
84 | self.assertEqual(stem_text(doc5), target) 85 | 86 | 87 | if __name__ == "__main__": 88 | logging.basicConfig(level=logging.WARNING) 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # AppVeyor.com is a Continuous Integration service to build and run tests under 2 | # Windows 3 | # https://ci.appveyor.com/project/tmylk/gensim 4 | 5 | environment: 6 | global: 7 | # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the 8 | # /E:ON and /V:ON options are not enabled in the batch script intepreter 9 | # See: http://stackoverflow.com/a/13751649/163740 10 | CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\continuous_integration\\appveyor\\run_with_env.cmd" 11 | WHEELHOUSE_UPLOADER_USERNAME: "Lev.Konstantinovskiy" 12 | WHEELHOUSE_UPLOADER_SECRET: 13 | secure: qXqY3dFmLOqvxa3Om2gQi/BjotTOK+EP2IPLolBNo0c61yDtNWxbmE4wH3up72Be 14 | 15 | matrix: 16 | - PYTHON: "C:\\Python27" 17 | PYTHON_VERSION: "2.7.8" 18 | PYTHON_ARCH: "32" 19 | 20 | - PYTHON: "C:\\Python27-x64" 21 | PYTHON_VERSION: "2.7.8" 22 | PYTHON_ARCH: "64" 23 | 24 | - PYTHON: "C:\\Python35" 25 | PYTHON_VERSION: "3.5.0" 26 | PYTHON_ARCH: "32" 27 | 28 | - PYTHON: "C:\\Python35-x64" 29 | PYTHON_VERSION: "3.5.0" 30 | PYTHON_ARCH: "64" 31 | 32 | 33 | 34 | install: 35 | # Install Python (from the official .msi of http://python.org) and pip when 36 | # not already installed. 37 | - "powershell ./continuous_integration/appveyor/install.ps1" 38 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 39 | 40 | # Check that we have the expected version and architecture for Python 41 | - "python --version" 42 | - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" 43 | 44 | # Install the build and runtime dependencies of the project. 45 | # Install the build and runtime dependencies of the project. 
46 | - "%CMD_IN_ENV% pip install --timeout=60 --trusted-host 28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com -r continuous_integration/appveyor/requirements.txt" 47 | - "%CMD_IN_ENV% python setup.py bdist_wheel bdist_wininst " 48 | - ps: "ls dist" 49 | 50 | # Install the genreated wheel package to test it 51 | - "pip install --pre --no-index --find-links dist/ gensim" 52 | 53 | # Not a .NET project, we build scikit-learn in the install step instead 54 | build: false 55 | 56 | test_script: 57 | # Change to a non-source folder to make sure we run the tests on the 58 | # installed library. 59 | - "mkdir empty_folder" 60 | - "cd empty_folder" 61 | 62 | - "python -c \"import nose; nose.main()\" -s -v gensim" 63 | # Move back to the project folder 64 | - "cd .." 65 | 66 | artifacts: 67 | # Archive the generated wheel package in the ci.appveyor.com build report. 68 | - path: dist\* 69 | on_success: 70 | # Upload the generated wheel package to Rackspace 71 | # On Windows, Apache Libcloud cannot find a standard CA cert bundle so we 72 | # disable the ssl checks. 73 | - "python -m wheelhouse_uploader upload --no-ssl-check --local-folder=dist gensim-windows-wheels" 74 | 75 | notifications: 76 | - provider: Webhook 77 | url: https://webhooks.gitter.im/e/62c44ad26933cd7ed7e8 78 | on_build_success: false 79 | on_build_failure: True 80 | 81 | cache: 82 | # Use the appveyor cache to avoid re-downloading large archives such 83 | # the MKL numpy and scipy wheels mirrored on a rackspace cloud 84 | # container, speed up the appveyor jobs and reduce bandwidth 85 | # usage on our rackspace account. 
class TestRpModel(unittest.TestCase):
    """Tests for the random-projections transformation model."""

    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    def testTransform(self):
        """Transforming one document yields the expected projection."""
        # HACK: fix the seed so that we always get the same random projection
        # matrix (and can compare against precomputed expected results)
        numpy.random.seed(13)
        model = rpmodel.RpModel(self.corpus, num_topics=2)

        # transform the first document of the corpus
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # dense vector, for easier equality tests

        expected = numpy.array([-0.70710677, 0.70710677])
        self.assertTrue(numpy.allclose(vec, expected))  # transformed entries must be equal up to sign

    def _assert_roundtrip(self, fname, **load_kwargs):
        """Save a fresh model to `fname`, reload it and check equivalence.

        `load_kwargs` are forwarded to `RpModel.load` (e.g. mmap=None).
        """
        model = rpmodel.RpModel(self.corpus, num_topics=2)
        model.save(fname)
        model2 = rpmodel.RpModel.load(fname, **load_kwargs)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.projection, model2.projection))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testPersistence(self):
        self._assert_roundtrip(testfile())

    def testPersistenceCompressed(self):
        # a .gz extension triggers transparent compression on save/load
        self._assert_roundtrip(testfile() + '.gz', mmap=None)
#endclass TestRpModel


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | rm -r $(BUILDDIR)/html/_sources 36 | cp -r $(BUILDDIR)/html/* ../ 37 | @echo 38 | @echo "Build finished. The HTML pages are in ../" 39 | 40 | upload: 41 | scp -r _build/html/* rr:public_html/gensim/ 42 | 43 | dirhtml: 44 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 45 | @echo 46 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 47 | 48 | pickle: 49 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 50 | @echo 51 | @echo "Build finished; now you can process the pickle files." 52 | 53 | json: 54 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 55 | @echo 56 | @echo "Build finished; now you can process the JSON files." 
57 | 58 | htmlhelp: 59 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 60 | @echo 61 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 62 | ".hhp project file in $(BUILDDIR)/htmlhelp." 63 | 64 | qthelp: 65 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 66 | @echo 67 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 68 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 69 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/gensim.qhcp" 70 | @echo "To view the help file:" 71 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/gensim.qhc" 72 | 73 | latex: 74 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 75 | @echo 76 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 77 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 78 | "run these through (pdf)latex." 79 | cd $(BUILDDIR)/latex && make all-pdf 80 | cp $(BUILDDIR)/latex/gensim.pdf ../gensim_manual.pdf 81 | 82 | changes: 83 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 84 | @echo 85 | @echo "The overview file is in $(BUILDDIR)/changes." 86 | 87 | linkcheck: 88 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 89 | @echo 90 | @echo "Link check complete; look for any errors in the above output " \ 91 | "or in $(BUILDDIR)/linkcheck/output.txt." 92 | 93 | doctest: 94 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 95 | @echo "Testing of doctests in the sources finished, look at the " \ 96 | "results in $(BUILDDIR)/doctest/output.txt." 
97 | -------------------------------------------------------------------------------- /continuous_integration/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: To build extensions for 64 bit Python 3, we need to configure environment 2 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 3 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 4 | :: 5 | :: To build extensions for 64 bit Python 2, we need to configure environment 6 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 8 | :: 9 | :: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific 10 | :: environment configurations. 11 | :: 12 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 13 | :: cmd interpreter, at least for (SDK v7.0) 14 | :: 15 | :: More details at: 16 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 17 | :: http://stackoverflow.com/a/13751649/163740 18 | :: 19 | :: Author: Olivier Grisel 20 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 21 | :: 22 | :: Notes about batch files for Python people: 23 | :: 24 | :: Quotes in values are literally part of the values: 25 | :: SET FOO="bar" 26 | :: FOO is now five characters long: " b a r " 27 | :: If you don't want quotes, don't include them on the right-hand side. 28 | :: 29 | :: The CALL lines at the end of this file look redundant, but if you move them 30 | :: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y 31 | :: case, I don't know why. 
32 | @ECHO OFF 33 | 34 | SET COMMAND_TO_RUN=%* 35 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 36 | SET WIN_WDK=c:\Program Files (x86)\Windows Kits\10\Include\wdf 37 | 38 | :: Extract the major and minor versions, and allow for the minor version to be 39 | :: more than 9. This requires the version number to have two dots in it. 40 | SET MAJOR_PYTHON_VERSION=%PYTHON_VERSION:~0,1% 41 | IF "%PYTHON_VERSION:~3,1%" == "." ( 42 | SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,1% 43 | ) ELSE ( 44 | SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,2% 45 | ) 46 | 47 | :: Based on the Python version, determine what SDK version to use, and whether 48 | :: to set the SDK for 64-bit. 49 | IF %MAJOR_PYTHON_VERSION% == 2 ( 50 | SET WINDOWS_SDK_VERSION="v7.0" 51 | SET SET_SDK_64=Y 52 | ) ELSE ( 53 | IF %MAJOR_PYTHON_VERSION% == 3 ( 54 | SET WINDOWS_SDK_VERSION="v7.1" 55 | IF %MINOR_PYTHON_VERSION% LEQ 4 ( 56 | SET SET_SDK_64=Y 57 | ) ELSE ( 58 | SET SET_SDK_64=N 59 | IF EXIST "%WIN_WDK%" ( 60 | :: See: https://connect.microsoft.com/VisualStudio/feedback/details/1610302/ 61 | REN "%WIN_WDK%" 0wdf 62 | ) 63 | ) 64 | ) ELSE ( 65 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 66 | EXIT 1 67 | ) 68 | ) 69 | 70 | IF %PYTHON_ARCH% == 64 ( 71 | IF %SET_SDK_64% == Y ( 72 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 73 | SET DISTUTILS_USE_SDK=1 74 | SET MSSdk=1 75 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 76 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 77 | ECHO Executing: %COMMAND_TO_RUN% 78 | call %COMMAND_TO_RUN% || EXIT 1 79 | ) ELSE ( 80 | ECHO Using default MSVC build environment for 64 bit architecture 81 | ECHO Executing: %COMMAND_TO_RUN% 82 | call %COMMAND_TO_RUN% || EXIT 1 83 | ) 84 | ) ELSE ( 85 | ECHO Using default MSVC build environment for 32 bit architecture 86 | ECHO Executing: %COMMAND_TO_RUN% 87 | call 
class TestPhrasesModel(unittest.TestCase):
    """Tests for collocation (bigram) detection in gensim.models.phrases."""

    def testSentenceGeneration(self):
        """Transforming a corpus yields one output sentence per input sentence."""
        bigram = Phrases(sentences)
        # test that we generate the same amount of sentences as the input
        self.assertEqual(len(sentences), len(list(bigram[sentences])))

    def testBigramConstruction(self):
        """With permissive thresholds the expected bigrams are detected."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        # with this setting we should get response_time and graph_minors
        bigram1_seen = False
        bigram2_seen = False

        for s in bigram[sentences]:
            if u'response_time' in s:
                bigram1_seen = True
            if u'graph_minors' in s:
                bigram2_seen = True
        self.assertTrue(bigram1_seen and bigram2_seen)

        # check the same thing, this time using single doc transformation
        self.assertTrue(u'response_time' in bigram[sentences[1]])
        self.assertTrue(u'response_time' in bigram[sentences[4]])
        self.assertTrue(u'graph_minors' in bigram[sentences[-2]])
        self.assertTrue(u'graph_minors' in bigram[sentences[-1]])

    def testBadParameters(self):
        """Non-positive min_count / negative threshold must raise ValueError."""
        self.assertRaises(ValueError, Phrases, sentences, min_count=0)
        self.assertRaises(ValueError, Phrases, sentences, threshold=-1)

    def testEncoding(self):
        """Both utf8 and unicode input work; output must be unicode."""
        expected = [u'survey', u'user', u'computer', u'system', u'response_time']

        bigram_utf8 = Phrases(sentences, min_count=1, threshold=1)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(bigram_utf8[sentences[1]], expected)

        unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences]
        bigram_unicode = Phrases(unicode_sentences, min_count=1, threshold=1)
        self.assertEqual(bigram_unicode[sentences[1]], expected)

        transformed = ' '.join(bigram_utf8[sentences[1]])
        self.assertTrue(isinstance(transformed, unicode))

    def testPruning(self):
        """The max_vocab_size parameter must bound the collected vocabulary."""
        bigram = Phrases(sentences, max_vocab_size=5)
        self.assertTrue(len(bigram.vocab) <= 5)
#endclass TestPhrasesModel


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
class TestTfidfModel(unittest.TestCase):
    """Tests for the tf-idf weighting transformation."""

    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    def testTransform(self):
        """Transforming one document yields the expected normalized weights."""
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)

        # transform the first document of the corpus
        first_doc = list(self.corpus)[0]
        transformed = model[first_doc]

        # 1/sqrt(3) for each of the three distinct terms of the first document
        expected = [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
        self.assertTrue(numpy.allclose(transformed, expected))

    def testInit(self):
        """Building from a corpus and from a dictionary give the same idfs."""
        # create the transformation model by analyzing the global `corpus`
        model1 = tfidfmodel.TfidfModel(corpus)

        # make sure the dfs <-> idfs translation is consistent
        self.assertEqual(model1.dfs, dictionary.dfs)
        self.assertEqual(model1.idfs, tfidfmodel.precompute_idfs(model1.wglobal, dictionary.dfs, len(corpus)))

        # supply the term->docfreq mapping directly via the global `dictionary`
        model2 = tfidfmodel.TfidfModel(dictionary=dictionary)
        self.assertEqual(model1.idfs, model2.idfs)

    def testPersistence(self):
        """A saved and re-loaded model behaves like the original."""
        fname = testfile()
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname)
        self.assertTrue(model.idfs == model2.idfs)
        empty_vec = []
        self.assertTrue(numpy.allclose(model[empty_vec], model2[empty_vec]))  # try projecting an empty vector

    def testPersistenceCompressed(self):
        """Round-trip through a gzip-compressed file."""
        fname = testfile() + '.gz'
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
        self.assertTrue(model.idfs == model2.idfs)
        empty_vec = []
        self.assertTrue(numpy.allclose(model[empty_vec], model2[empty_vec]))  # try projecting an empty vector
#endclass TestTfidfModel


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
class TestKeywordsTest(unittest.TestCase):
    """Reproduces the TextRank keyword-extraction results of Mihalcea and Tarau (2004)."""

    def _read_file(self, fname):
        # All fixtures live in the test_data folder next to this module.
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        with utils.smart_open(os.path.join(pre_path, fname), mode="r") as f:
            return f.read()

    def test_text_keywords(self):
        text = self._read_file("mihalcea_tarau.txt")

        # calculate keywords
        generated_keywords = keywords(text, split=True)

        # To be compared to the reference.
        kw = self._read_file("mihalcea_tarau.kw.txt").strip().split("\n")

        self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))

    def test_text_keywords_words(self):
        text = self._read_file("mihalcea_tarau.txt")

        # Ask for 15 keywords; combined (multi-word) keywords can push the
        # returned count slightly above the requested number -- 16 for this text.
        generated_keywords = keywords(text, words=15, split=True)

        self.assertEqual(len(generated_keywords), 16)

    def test_text_keywords_pos(self):
        text = self._read_file("mihalcea_tarau.txt")

        # calculate keywords using only certain parts of speech
        generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

        # To be compared to the reference.
        kw = self._read_file("mihalcea_tarau.kwpos.txt").strip().split("\n")

        self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))

    def test_text_summarization_raises_exception_on_short_input_text(self):
        # NOTE(review): despite the historical name, `keywords` does not raise
        # on short input -- this only checks that a result is still produced.
        text = self._read_file("testsummarization_unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertTrue(keywords(text) is not None)

    def test_keywords_ratio(self):
        text = self._read_file("mihalcea_tarau.txt")

        # Check the ratio parameter is well behaved. Length is taken on the
        # tokenized clean text, so only the relative sizes of the 10% and 20%
        # extractions are compared (empirically 21/12, not an exact factor of 2).
        # Values of 10% and 20% were carefully selected for this test to avoid
        # numerical instabilities when several keywords have almost the same score.
        selected_docs_12 = keywords(text, ratio=0.1, split=True)
        selected_docs_21 = keywords(text, ratio=0.2, split=True)

        self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)
SEPARATOR = r"@"
# Raw string literals throughout: '\S', '\s', '\w', '\.' are invalid escape
# sequences in plain strings on Python 3.6+ (DeprecationWarning, later an error).
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE)
AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)\s(\w)", re.UNICODE)
AB_ACRONYM_LETTERS = re.compile(r"([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE)
UNDO_AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)" + SEPARATOR + r"(\w)", re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)" + SEPARATOR + r"(\w)", re.UNICODE)


def split_sentences(text):
    """Split `text` into sentences, protecting abbreviations (e.g. "Mr. X")
    from being mistaken for sentence boundaries."""
    processed = replace_abbreviations(text)
    return [undo_replacement(sentence) for sentence in get_sentences(processed)]


def replace_abbreviations(text):
    # Mask the space after known abbreviation patterns with SEPARATOR so the
    # sentence regex does not split there.
    return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


def undo_replacement(sentence):
    # Inverse of `replace_abbreviations`: restore the masked spaces.
    return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


def replace_with_separator(text, separator, regexs):
    """For every regex in `regexs` (each with two capture groups), replace the
    text between the groups with `separator`."""
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:
        result = regex.sub(replacement, result)
    return result


def get_sentences(text):
    """Yield sentence-like chunks of `text`, one RE_SENTENCE match at a time."""
    for match in RE_SENTENCE.finditer(text):
        yield match.group()


def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Pair each original unit with its filtered (preprocessed) counterpart and
    optional POS tag, skipping units that preprocessing reduced to nothing.
    Returns a list of SyntacticUnit."""
    units = []
    # enumerate+zip instead of indexing via xrange(len(...)); `original_units`
    # and `filtered_units` are parallel lists by construction.
    for i, (text, token) in enumerate(zip(original_units, filtered_units)):
        if token == '':
            continue  # the unit was filtered away entirely

        tag = tags[i][1] if tags else None
        sentence = SyntacticUnit(text, token, tag)
        sentence.index = i

        units.append(sentence)

    return units


def join_words(words, separator=" "):
    return separator.join(words)


def clean_text_by_sentences(text):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)


def clean_text_by_word(text):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    # Collapse dotted acronym letter pairs first so their periods don't split tokens.
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)


def tokenize_by_word(text):
    """Tokenize `text` into lowercase, de-accented words, after collapsing
    dotted acronym letter pairs."""
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)
class RpModel(interfaces.TransformationABC):
    """
    Objects of this class allow building and maintaining a model for Random Projections
    (also known as Random Indexing). For theoretical background on RP, see:

      Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis."

    The main methods are:

    1. constructor, which creates the random projection matrix
    2. the [] method, which transforms a simple count representation into the
       random-projection space.

    >>> rp = RpModel(corpus)
    >>> print(rp[some_doc])
    >>> rp.save('/tmp/foo.rp_model')

    Model persistency is achieved via its load/save methods.
    """
    def __init__(self, corpus, id2word=None, num_topics=300):
        """
        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing. If not set, it will be determined from the corpus.
        """
        self.id2word = id2word
        self.num_topics = num_topics
        if corpus is not None:
            self.initialize(corpus)


    def __str__(self):
        return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)


    def initialize(self, corpus):
        """
        Initialize the random projection matrix.
        """
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            # `dict.keys()` is a view object on Python 3 and cannot be
            # concatenated to a list; materialize it first (the bare `keys()`
            # version raises TypeError under Python 3).
            self.num_terms = 1 + max([-1] + list(self.id2word.keys()))

        shape = self.num_topics, self.num_terms
        logger.info("constructing %s random matrix", str(shape))
        # Now construct the projection matrix itself.
        # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
        # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
        randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape)  # convert from 0/1 to +1/-1
        # convert from int32 to floats, for faster multiplications
        self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32)


    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        # report only significant, finite components as (topic_id, value) pairs
        return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
                if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)]


    def __setstate__(self, state):
        """
        This is a hack to work around a bug in numpy, where a FORTRAN-order array
        unpickled from disk segfaults on using it.
        """
        self.__dict__ = state
        if self.projection is not None:
            self.projection = self.projection.copy('F')  # simply making a fresh copy fixes the broken array
#endclass RpModel
or some open-source alternative (GotoBLAS, ATLAS).
73 | 74 | Worker 75 | A process which is created on each node. To remove a node from your cluster, 76 | simply kill its worker process. 77 | 78 | Dispatcher 79 | The dispatcher will be in charge of negotiating all computations, queueing and 80 | distributing ("dispatching") individual jobs to the workers. Computations never 81 | "talk" to worker nodes directly, only through this dispatcher. Unlike workers, 82 | there can only be one active dispatcher at a time in the cluster. 83 | 84 | 85 | Available distributed algorithms 86 | --------------------------------- 87 | 88 | .. toctree:: 89 | :maxdepth: 1 90 | 91 | dist_lsi 92 | dist_lda 93 | -------------------------------------------------------------------------------- /gensim/models/lsi_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s 9 | 10 | Worker ("slave") process used in computing distributed LSI. Run this script \ 11 | on every node in your cluster. If you wish, you may even run it multiple times \ 12 | on a single machine, to make better use of multiple cores (just beware that \ 13 | memory footprint increases accordingly). 
class Worker(object):
    """Pyro-exposed LSI worker: receives chunks of documents from the dispatcher
    and folds them into a local `LsiModel`."""

    def __init__(self):
        self.model = None  # set by `initialize`; guards against premature job requests


    def initialize(self, myid, dispatcher, **model_params):
        """Create the local LSI model; called remotely by the dispatcher."""
        self.lock_update = threading.Lock()
        self.jobsdone = 0  # how many jobs has this worker completed?
        self.myid = myid  # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
        self.dispatcher = dispatcher
        self.finished = False
        logger.info("initializing worker #%s", myid)
        self.model = lsimodel.LsiModel(**model_params)


    @Pyro4.oneway
    def requestjob(self):
        """
        Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called.
        """
        if self.model is None:
            raise RuntimeError("worker must be initialized before receiving jobs")

        job = None
        while job is None and not self.finished:
            try:
                job = self.dispatcher.getjob(self.myid)
            except Queue.Empty:
                # no new job: try again, unless we're finished with all work
                continue
        if job is not None:
            # lazy %-style logger args: the string is only built if the record is emitted
            logger.info("worker #%s received job #%i", self.myid, self.jobsdone)
            self.processjob(job)
            self.dispatcher.jobdone(self.myid)
        else:
            logger.info("worker #%i stopping asking for jobs", self.myid)


    @utils.synchronous('lock_update')
    def processjob(self, job):
        """Fold one chunk of documents into the model (serialized via `lock_update`)."""
        self.model.add_documents(job)
        self.jobsdone += 1
        if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0:
            fname = os.path.join(tempfile.gettempdir(), 'lsi_worker.pkl')
            self.model.save(fname)


    @utils.synchronous('lock_update')
    def getstate(self):
        """Return the accumulated projection and stop accepting further jobs."""
        logger.info("worker #%i returning its state after %s jobs",
                    self.myid, self.jobsdone)
        assert isinstance(self.model.projection, lsimodel.Projection)
        self.finished = True
        return self.model.projection


    @utils.synchronous('lock_update')
    def reset(self):
        """Throw away the accumulated projection, keeping the model parameters."""
        logger.info("resetting worker #%i", self.myid)
        self.model.projection = self.model.projection.empty_like()
        self.finished = False


    @Pyro4.oneway
    def exit(self):
        """Terminate this worker process immediately (hard exit, no cleanup)."""
        logger.info("terminating worker #%i", self.myid)
        os._exit(0)
#endclass Worker



def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s", " ".join(sys.argv))

    program = os.path.basename(sys.argv[0])
    # NOTE(review): `sys.argv` always contains at least the program name, so
    # this usage check can never trigger; kept for symmetry with the other
    # distributed scripts, which do take parameters.
    if len(sys.argv) < 1:
        print(globals()["__doc__"] % locals())
        sys.exit(1)

    utils.pyro_daemon('gensim.lsi_worker', Worker(), random_suffix=True)

    logger.info("finished running %s", program)



if __name__ == '__main__':
    main()
class CorpusMiislita(corpora.TextCorpus):
    """Text corpus over the miislita `.cor` files (one document per line),
    lowercased and filtered through a tiny stopword list."""

    stoplist = set('for a of the and to in on'.split())

    def get_texts(self):
        """
        Parse documents from the .cor file provided in the constructor. Lowercase
        each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.

        """
        with self.getstream() as stream:
            for line in stream:
                tokens = utils.to_unicode(line).lower().split()
                yield [token for token in tokens
                       if token not in CorpusMiislita.stoplist]

    def __len__(self):
        """Define this so we can use `len(corpus)`; the count is computed once and cached."""
        if 'length' not in self.__dict__:
            logger.info("caching corpus size (calculating number of documents)")
            self.length = sum(1 for _ in self.get_texts())
        return self.length


class TestMiislita(unittest.TestCase):
    def test_textcorpus(self):
        """Make sure TextCorpus can be serialized to disk. """
        # construct corpus from file
        miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must create the target file...
        ftmp = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(ftmp, miislita)
        self.assertTrue(os.path.exists(ftmp))

        # ...and deserializing must give back the same documents
        miislita2 = corpora.MmCorpus(ftmp)
        self.assertEqual(list(miislita), list(miislita2))


    def test_save_load_ability(self):
        """
        Make sure we can save and load (un/pickle) TextCorpus objects (as long
        as the underlying input isn't a file-like object; we cannot pickle those).
        """
        # construct corpus from file
        corpus_file = datapath('miIslita.cor')
        original = CorpusMiislita(corpus_file)

        # round-trip through a pickle on disk
        pickle_file = get_tmpfile('tc_test.cpickle')
        original.save(pickle_file)

        restored = CorpusMiislita.load(pickle_file)

        self.assertEqual(len(original), len(restored))
        self.assertEqual(original.dictionary.token2id, restored.dictionary.token2id)


    def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for position, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[position], value, 2)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    unittest.main()
class MalletCorpus(LowCorpus):
    """
    Quoting http://mallet.cs.umass.edu/import.php:

        One file, one instance per line
        Assume the data is in the following format:

        [URL] [language] [text of the page...]

    Or, more generally,
        [document #1 id] [label] [text of the document...]
        [document #2 id] [label] [text of the document...]
        ...
        [document #N id] [label] [text of the document...]

    Note that language/label is *not* considered in Gensim.

    """
    def __init__(self, fname, id2word=None, metadata=False):
        # NOTE(review): `metadata` is assigned before the parent constructor,
        # presumably because LowCorpus.__init__ already parses lines via
        # line2doc -- confirm before reordering.
        self.metadata = metadata
        LowCorpus.__init__(self, fname, id2word)

    def _calculate_num_docs(self):
        # one document per line; generator avoids building a throwaway list
        with utils.smart_open(self.fname) as fin:
            return sum(1 for _ in fin)

    def __iter__(self):
        """
        Iterate over the corpus at the given filename.

        Yields a bag-of-words, a.k.a list of tuples of (word id, word count), based on the given id2word dictionary.
        """
        with utils.smart_open(self.fname) as f:
            for line in f:
                yield self.line2doc(line)

    def line2doc(self, line):
        """Convert one Mallet-format line ("id lang word word ...") into a
        bag-of-words document; with `self.metadata`, also return (id, lang)."""
        # `parts` instead of the ambiguous single-letter name `l` (PEP 8 / E741)
        parts = [word for word in utils.to_unicode(line).strip().split(' ') if word]
        docid, doclang, words = parts[0], parts[1], parts[2:]

        doc = super(MalletCorpus, self).line2doc(' '.join(words))

        if self.metadata:
            return doc, (docid, doclang)
        else:
            return doc

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the Mallet format.

        The document id will be generated by enumerating the corpus.
        That is, it will range between 0 and number of documents in the corpus.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        This function is automatically called by `MalletCorpus.serialize`; don't
        call it directly, call `serialize` instead.

        Returns the list of byte offsets of the documents within `fname`.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s", fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    # Mallet stores plain word repetitions, so non-integer
                    # counts lose precision; keep track of how many.
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning("Mallet format can only save vectors with "
                           "integer elements; %i float entries were truncated to integer value",
                           truncated)

        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass MalletCorpus
_about: 2 | 3 | ============ 4 | About 5 | ============ 6 | 7 | History 8 | -------- 9 | 10 | Gensim started off as a collection of various Python scripts for the Czech Digital Mathematics Library `dml.cz `_ in 2008, 11 | where it served to generate a short list of the most similar articles to a given article (**gensim = "generate similar"**). 12 | I also wanted to try these fancy "Latent Semantic Methods", but the libraries that 13 | realized the necessary computation were `not much fun to work with `_. 14 | 15 | Naturally, I set out to reinvent the wheel. Our `2010 LREC publication `_ 16 | describes the initial design decisions behind gensim (clarity, efficiency and scalability) 17 | and is fairly representative of how gensim works even today. 18 | 19 | Later versions of gensim improved this efficiency and scalability tremendously. In fact, 20 | I made algorithmic scalability of distributional semantics the topic of my `PhD thesis `_. 21 | 22 | By now, gensim is---to my knowledge---the most robust, efficient and hassle-free piece 23 | of software to realize unsupervised semantic modelling from plain text. It stands 24 | in contrast to brittle homework-assignment-implementations that do not scale on one hand, 25 | and robust java-esque projects that take forever just to run "hello world". 26 | 27 | In 2011, I started using `Github `_ for source code hosting 28 | and the gensim website moved to its present domain. In 2013, gensim got its current logo and website design. 29 | 30 | 31 | Licensing 32 | ---------- 33 | 34 | Gensim is licensed under the OSI-approved `GNU LGPL license `_. 35 | This means that it's free for both personal and commercial use, but if you make any 36 | modification to gensim that you distribute to other people, you have to disclose 37 | the source code of these modifications. 38 | 39 | Apart from that, you are free to redistribute gensim in any way you like, though you're 40 | not allowed to modify its license (doh!). 
41 | 42 | My intent here is, of course, to **get more help and community involvement** with the development of gensim. 43 | The legalese is therefore less important to me than your input and contributions. 44 | Contact me if LGPL doesn't fit your bill but you'd still like to use gensim -- we'll work something out. 45 | 46 | .. seealso:: 47 | 48 | I also host a document similarity package `gensim.simserver`. This is a high-level 49 | interface to `gensim` functionality, and offers transactional remote (web-based) 50 | document similarity queries and indexing. It uses gensim to do the heavy lifting: 51 | you don't need the `simserver` to use gensim, but you do need gensim to use the `simserver`. 52 | Note that unlike gensim, `gensim.simserver` is licensed under `Affero GPL `_, 53 | which is much more restrictive for inclusion in commercial projects. 54 | 55 | Contributors 56 | -------------- 57 | 58 | Credit goes to all the people who contributed to gensim, be it in `discussions `_, 59 | ideas, `code contributions `_ or `bug reports `_. 60 | It's really useful and motivating to get feedback, in any shape or form, so big thanks to you all! 61 | 62 | Some honorable mentions are included in the `CHANGELOG.txt `_. 63 | 64 | Academic citing 65 | ---------------- 66 | 67 | Gensim has been used in `many students' final theses as well as research papers `_. 
When citing gensim, 68 | please use `this BibTeX entry `_:: 69 | 70 | @inproceedings{rehurek_lrec, 71 | title = {{Software Framework for Topic Modelling with Large Corpora}}, 72 | author = {Radim {\v R}eh{\r u}{\v r}ek and Petr Sojka}, 73 | booktitle = {{Proceedings of the LREC 2010 Workshop on New 74 | Challenges for NLP Frameworks}}, 75 | pages = {45--50}, 76 | year = 2010, 77 | month = May, 78 | day = 22, 79 | publisher = {ELRA}, 80 | address = {Valletta, Malta}, 81 | note={\url{http://is.muni.cz/publication/884893/en}}, 82 | language={English} 83 | } 84 | 85 | 86 | -------------------------------------------------------------------------------- /gensim/corpora/textcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | Text corpora usually reside on disk, as text files in one format or another 8 | In a common scenario, we need to build a dictionary (a `word->integer id` 9 | mapping), which is then used to construct sparse bag-of-word vectors 10 | (= sequences of `(word_id, word_weight)` 2-tuples). 11 | 12 | This module provides some code scaffolding to simplify this pipeline. For 13 | example, given a corpus where each document is a separate line in file on disk, 14 | you would override the `TextCorpus.get_texts` method to read one line=document 15 | at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence 16 | of words. 17 | 18 | Overriding `get_texts` is enough; you can then initialize the corpus with e.g. 19 | `MyTextCorpus(bz2.BZ2File('mycorpus.txt.bz2'))` and it will behave correctly like a 20 | corpus of sparse vectors. The `__iter__` methods is automatically set up, and 21 | dictionary is automatically populated with all `word->id` mappings. 
22 | 23 | The resulting object can be used as input to all gensim models (TFIDF, LSI, ...), 24 | serialized with any format (Matrix Market, SvmLight, Blei's LDA-C format etc). 25 | 26 | See the `gensim.test.test_miislita.CorpusMiislita` class for a simple example. 27 | """ 28 | 29 | 30 | from __future__ import with_statement 31 | 32 | import logging 33 | 34 | from gensim import interfaces, utils 35 | from six import string_types 36 | from gensim.corpora.dictionary import Dictionary 37 | 38 | logger = logging.getLogger('gensim.corpora.textcorpus') 39 | 40 | 41 | class TextCorpus(interfaces.CorpusABC): 42 | """ 43 | Helper class to simplify the pipeline of getting bag-of-words vectors (= a 44 | gensim corpus) from plain text. 45 | 46 | This is an abstract base class: override the `get_texts()` and `__len__()` 47 | methods to match your particular input. 48 | 49 | Given a filename (or a file-like object) in constructor, the corpus object 50 | will be automatically initialized with a dictionary in `self.dictionary` and 51 | will support the `iter` corpus method. You must only provide a correct `get_texts` 52 | implementation. 53 | 54 | """ 55 | def __init__(self, input=None): 56 | super(TextCorpus, self).__init__() 57 | self.input = input 58 | self.dictionary = Dictionary() 59 | self.metadata = False 60 | if input is not None: 61 | self.dictionary.add_documents(self.get_texts()) 62 | else: 63 | logger.warning("No input document stream provided; assuming " 64 | "dictionary will be initialized some other way.") 65 | 66 | def __iter__(self): 67 | """ 68 | The function that defines a corpus. 69 | 70 | Iterating over the corpus must yield sparse vectors, one for each document. 
71 | """ 72 | for text in self.get_texts(): 73 | if self.metadata: 74 | yield self.dictionary.doc2bow(text[0], allow_update=False), text[1] 75 | else: 76 | yield self.dictionary.doc2bow(text, allow_update=False) 77 | 78 | def getstream(self): 79 | return utils.file_or_filename(self.input) 80 | 81 | def get_texts(self): 82 | """ 83 | Iterate over the collection, yielding one document at a time. A document 84 | is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`. 85 | 86 | Override this function to match your input (parse input files, do any 87 | text preprocessing, lowercasing, tokenizing etc.). There will be no further 88 | preprocessing of the words coming out of this function. 89 | """ 90 | # Instead of raising NotImplementedError, let's provide a sample implementation: 91 | # assume documents are lines in a single file (one document per line). 92 | # Yield each document as a list of lowercase tokens, via `utils.tokenize`. 93 | with self.getstream() as lines: 94 | for lineno, line in enumerate(lines): 95 | if self.metadata: 96 | yield utils.tokenize(line, lowercase=True), (lineno,) 97 | else: 98 | yield utils.tokenize(line, lowercase=True) 99 | 100 | def __len__(self): 101 | if not hasattr(self, 'length'): 102 | # cache the corpus length 103 | self.length = sum(1 for _ in self.get_texts()) 104 | return self.length 105 | 106 | # endclass TextCorpus 107 | -------------------------------------------------------------------------------- /docs/src/changes_080.rst: -------------------------------------------------------------------------------- 1 | .. _changes_080: 2 | 3 | Change Set for 0.8.0 4 | ============================ 5 | 6 | Release 0.8.0 concludes the 0.7.x series, which was about API consolidation and performance. 7 | In 0.8.x, I'd like to extend `gensim` with new functionality and features. 8 | 9 | Codestyle Changes 10 | ------------------ 11 | 12 | Codebase was modified to comply with `PEP8: Style Guide for Python Code `_. 
13 | This means the 0.8.0 API is **backward incompatible** with the 0.7.x series. 14 | 15 | That's not as tragic as it sounds, gensim was almost there anyway. The changes are few and pretty straightforward: 16 | 17 | 1. the `numTopics` parameter is now `num_topics` 18 | 2. `addDocuments()` method becomes `add_documents()` 19 | 3. `toUtf8()` => `to_utf8()` 20 | 4. ... you get the idea: replace `camelCase` with `lowercase_with_underscores`. 21 | 22 | If you stored a model that is affected by this to disk, you'll need to rename its attributes manually: 23 | 24 | >>> lsa = gensim.models.LsiModel.load('/some/path') # load old <0.8.0 model 25 | >>> lsa.num_terms, lsa.num_topics = lsa.numTerms, lsa.numTopics # rename attributes 26 | >>> del lsa.numTerms, lsa.numTopics # clean up old attributes (optional) 27 | >>> lsa.save('/some/path') # save again to disk, as 0.8.0 compatible 28 | 29 | Only attributes (variables) need to be renamed; method names (functions) are not affected, due to the way `pickle` works. 30 | 31 | Similarity Queries 32 | ------------------- 33 | 34 | Improved speed and scalability of :doc:`similarity queries `. 35 | 36 | The `Similarity` class can now index corpora of arbitrary size more efficiently. 37 | Internally, this is done by splitting the index into several smaller pieces ("shards") that fit in RAM 38 | and can be processed independently. In addition, documents can now be added to a `Similarity` index dynamically. 39 | 40 | There is also a new way to query the similarity indexes: 41 | 42 | >>> index = MatrixSimilarity(corpus) # create an index 43 | >>> sims = index[document] # get cosine similarity of query "document" against every document in the index 44 | >>> sims = index[chunk_of_documents] # new syntax! 45 | 46 | Advantage of the last line (querying multiple documents at the same time) is faster execution. 
47 | 48 | This faster execution is also utilized *automatically for you* if you're using the ``for sims in index: ...`` syntax 49 | (which returns pairwise similarities of documents in the index). 50 | 51 | To see the speed-up on your machine, run ``python -m gensim.test.simspeed`` (and compare to my results `here `_ to see how your machine fares). 52 | 53 | .. note:: 54 | This current functionality of querying is as far as I wanted to get with gensim. 55 | More optimizations and smarter indexing are certainly possible, but I'd like to 56 | focus on other features now. Pull requests are still welcome though :) 57 | 58 | Check out the :mod:`updated documentation ` of the similarity classes for more info. 59 | 60 | Simplified Directory Structure 61 | -------------------------------- 62 | 63 | Instead of the java-esque ``ROOT_DIR/src/gensim`` directory structure of gensim, 64 | the packages now reside directly in ``ROOT_DIR/gensim`` (no superfluous ``src``). See the new structure `on github `_. 65 | 66 | Other changes (that you're unlikely to notice unless you look) 67 | ---------------------------------------------------------------------- 68 | 69 | * Improved efficiency of ``lsi[corpus]`` transformations (documents are chunked internally for better performance). 70 | * Large matrices (numpy/scipy.sparse, in `LsiModel`, `Similarity` etc.) are now mmapped to/from disk when doing `save/load`. The `cPickle` approach used previously was too `buggy `_ and `slow `_. 71 | * Renamed `chunks` parameter to `chunksize` (i.e. `LsiModel(corpus, num_topics=100, chunksize=20000)`). This better reflects its purpose: size of a chunk=number of documents to be processed at once. 72 | * Also improved memory efficiency of LSI and LDA model generation (again). 73 | * Removed SciPy 0.6 from the list of supported SciPi versions (need >=0.7 now). 74 | * Added more unit tests. 75 | * Several smaller fixes; see the `commit history `_ for full account. 76 | 77 | .. 
admonition:: Future Directions? 78 | 79 | If you have ideas or proposals for new features for 0.8.x, now is the time to let me know: 80 | `gensim mailing list `_. 81 | -------------------------------------------------------------------------------- /gensim/models/lda_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2011 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s 9 | 10 | Worker ("slave") process used in computing distributed LDA. Run this script \ 11 | on every node in your cluster. If you wish, you may even run it multiple times \ 12 | on a single machine, to make better use of multiple cores (just beware that \ 13 | memory footprint increases accordingly). 14 | 15 | Example: python -m gensim.models.lda_worker 16 | """ 17 | 18 | 19 | from __future__ import with_statement 20 | import os, sys, logging 21 | import threading 22 | import tempfile 23 | try: 24 | import Queue 25 | except ImportError: 26 | import queue as Queue 27 | import Pyro4 28 | from gensim.models import ldamodel 29 | from gensim import utils 30 | 31 | logger = logging.getLogger('gensim.models.lda_worker') 32 | 33 | 34 | # periodically save intermediate models after every SAVE_DEBUG updates (0 for never) 35 | SAVE_DEBUG = 0 36 | 37 | 38 | 39 | class Worker(object): 40 | def __init__(self): 41 | self.model = None 42 | 43 | 44 | def initialize(self, myid, dispatcher, **model_params): 45 | self.lock_update = threading.Lock() 46 | self.jobsdone = 0 # how many jobs has this worker completed? 47 | self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 
48 | self.dispatcher = dispatcher 49 | self.finished = False 50 | logger.info("initializing worker #%s" % myid) 51 | self.model = ldamodel.LdaModel(**model_params) 52 | 53 | 54 | @Pyro4.oneway 55 | def requestjob(self): 56 | """ 57 | Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. 58 | """ 59 | if self.model is None: 60 | raise RuntimeError("worker must be initialized before receiving jobs") 61 | 62 | job = None 63 | while job is None and not self.finished: 64 | try: 65 | job = self.dispatcher.getjob(self.myid) 66 | except Queue.Empty: 67 | # no new job: try again, unless we're finished with all work 68 | continue 69 | if job is not None: 70 | logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) 71 | self.processjob(job) 72 | self.dispatcher.jobdone(self.myid) 73 | else: 74 | logger.info("worker #%i stopping asking for jobs" % self.myid) 75 | 76 | 77 | @utils.synchronous('lock_update') 78 | def processjob(self, job): 79 | logger.debug("starting to process job #%i" % self.jobsdone) 80 | self.model.do_estep(job) 81 | self.jobsdone += 1 82 | if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0: 83 | fname = os.path.join(tempfile.gettempdir(), 'lda_worker.pkl') 84 | self.model.save(fname) 85 | logger.info("finished processing job #%i" % (self.jobsdone - 1)) 86 | 87 | 88 | @utils.synchronous('lock_update') 89 | def getstate(self): 90 | logger.info("worker #%i returning its state after %s jobs" % 91 | (self.myid, self.jobsdone)) 92 | result = self.model.state 93 | assert isinstance(result, ldamodel.LdaState) 94 | self.model.clear() # free up mem in-between two EM cycles 95 | self.finished = True 96 | return result 97 | 98 | 99 | @utils.synchronous('lock_update') 100 | def reset(self, state): 101 | assert state is not None 102 | logger.info("resetting worker #%i" % self.myid) 103 | self.model.state = state 104 | self.model.sync_state() 105 | self.model.state.reset() 106 | self.finished = False 107 | 108 | 109 | 
@Pyro4.oneway 110 | def exit(self): 111 | logger.info("terminating worker #%i" % self.myid) 112 | os._exit(0) 113 | #endclass Worker 114 | 115 | 116 | 117 | def main(): 118 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 119 | logger.info("running %s" % " ".join(sys.argv)) 120 | 121 | program = os.path.basename(sys.argv[0]) 122 | # make sure we have enough cmd line parameters 123 | if len(sys.argv) < 1: 124 | print(globals()["__doc__"] % locals()) 125 | sys.exit(1) 126 | 127 | utils.pyro_daemon('gensim.lda_worker', Worker(), random_suffix=True) 128 | 129 | logger.info("finished running %s" % program) 130 | 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /gensim/models/logentropy_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | import logging 7 | import math 8 | from gensim import interfaces, matutils, utils 9 | 10 | 11 | logger = logging.getLogger('gensim.models.logentropy_model') 12 | 13 | 14 | class LogEntropyModel(interfaces.TransformationABC): 15 | """ 16 | Objects of this class realize the transformation between word-document 17 | co-occurence matrix (integers) into a locally/globally weighted matrix 18 | (positive floats). 19 | 20 | This is done by a log entropy normalization, optionally normalizing the 21 | resulting documents to unit length. 
The following formulas explain how 22 | to compute the log entropy weight for term `i` in document `j`:: 23 | 24 | local_weight_{i,j} = log(frequency_{i,j} + 1) 25 | 26 | P_{i,j} = frequency_{i,j} / sum_j frequency_{i,j} 27 | 28 | sum_j P_{i,j} * log(P_{i,j}) 29 | global_weight_i = 1 + ---------------------------- 30 | log(number_of_documents + 1) 31 | 32 | final_weight_{i,j} = local_weight_{i,j} * global_weight_i 33 | 34 | The main methods are: 35 | 36 | 1. constructor, which calculates the global weighting for all terms in 37 | a corpus. 38 | 2. the [] method, which transforms a simple count representation into the 39 | log entropy normalized space. 40 | 41 | >>> log_ent = LogEntropyModel(corpus) 42 | >>> print(log_ent[some_doc]) 43 | >>> log_ent.save('/tmp/foo.log_ent_model') 44 | 45 | Model persistency is achieved via its load/save methods. 46 | """ 47 | 48 | def __init__(self, corpus, id2word=None, normalize=True): 49 | """ 50 | `normalize` dictates whether the resulting vectors will be 51 | set to unit length. 52 | """ 53 | self.normalize = normalize 54 | self.n_docs = 0 55 | self.n_words = 0 56 | self.entr = {} 57 | if corpus is not None: 58 | self.initialize(corpus) 59 | 60 | def __str__(self): 61 | return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, 62 | self.n_words) 63 | 64 | def initialize(self, corpus): 65 | """ 66 | Initialize internal statistics based on a training corpus. Called 67 | automatically from the constructor. 
68 | """ 69 | logger.info("calculating counts") 70 | glob_freq = {} 71 | glob_num_words, doc_no = 0, -1 72 | for doc_no, bow in enumerate(corpus): 73 | if doc_no % 10000 == 0: 74 | logger.info("PROGRESS: processing document #%i" % doc_no) 75 | glob_num_words += len(bow) 76 | for term_id, term_count in bow: 77 | glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count 78 | 79 | # keep some stats about the training corpus 80 | self.n_docs = doc_no + 1 81 | self.n_words = glob_num_words 82 | 83 | # and finally compute the global weights 84 | logger.info("calculating global log entropy weights for %i " 85 | "documents and %i features (%i matrix non-zeros)" 86 | % (self.n_docs, len(glob_freq), self.n_words)) 87 | logger.debug('iterating over corpus') 88 | for doc_no2, bow in enumerate(corpus): 89 | for key, freq in bow: 90 | p = (float(freq) / glob_freq[key]) * math.log(float(freq) / 91 | glob_freq[key]) 92 | self.entr[key] = self.entr.get(key, 0.0) + p 93 | if doc_no2 != doc_no: 94 | raise ValueError("LogEntropyModel doesn't support generators as training data") 95 | 96 | logger.debug('iterating over keys') 97 | for key in self.entr: 98 | self.entr[key] = 1 + self.entr[key] / math.log(self.n_docs + 1) 99 | 100 | def __getitem__(self, bow): 101 | """ 102 | Return log entropy representation of the input vector and/or corpus. 
103 | """ 104 | # if the input vector is in fact a corpus, return a transformed corpus 105 | is_corpus, bow = utils.is_corpus(bow) 106 | if is_corpus: 107 | return self._apply(bow) 108 | 109 | # unknown (new) terms will be given zero weight (NOT infinity/huge) 110 | vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id)) 111 | for term_id, tf in bow if term_id in self.entr] 112 | if self.normalize: 113 | vector = matutils.unitvec(vector) 114 | return vector 115 | -------------------------------------------------------------------------------- /docs/src/install.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | ============= 4 | Installation 5 | ============= 6 | 7 | Quick install 8 | -------------- 9 | 10 | Run in your terminal:: 11 | 12 | easy_install -U gensim 13 | 14 | or, alternatively:: 15 | 16 | pip install --upgrade gensim 17 | 18 | In case that fails, make sure you're installing into a writeable location (or use `sudo`), or read on. 19 | 20 | ----- 21 | 22 | Dependencies 23 | ------------- 24 | Gensim is known to run on Linux, Windows and Mac OS X and should run on any other 25 | platform that supports Python 2.6+ and NumPy. Gensim depends on the following software: 26 | 27 | * `Python `_ >= 2.6. Tested with versions 2.6, 2.7, 3.3, 3.4 and 3.5. Support for Python 2.5 was discontinued starting gensim 0.10.0; if you *must* use Python 2.5, install gensim 0.9.1. 28 | * `NumPy `_ >= 1.3. Tested with version 1.9.0, 1.7.1, 1.7.0, 1.6.2, 1.6.1rc2, 1.5.0rc1, 1.4.0, 1.3.0, 1.3.0rc2. 29 | * `SciPy `_ >= 0.7. Tested with version 0.14.0, 0.12.0, 0.11.0, 0.10.1, 0.9.0, 0.8.0, 0.8.0b1, 0.7.1, 0.7.0. 30 | 31 | **Windows users** are well advised to try the `Enthought distribution `_, 32 | which conveniently includes Python & NumPy & SciPy in a single bundle, and is free for academic use. 
33 | 34 | 35 | Install Python and `easy_install` 36 | --------------------------------- 37 | 38 | Check what version of Python you have with:: 39 | 40 | python --version 41 | 42 | You can download Python from http://python.org/download. 43 | 44 | .. note:: Gensim requires Python 2.6 / 3.3 or greater, and will not run under earlier versions. 45 | 46 | Next, install the `easy_install utility `_, 47 | which will make installing other Python programs easier. 48 | 49 | Install SciPy & NumPy 50 | ---------------------- 51 | 52 | These are quite popular Python packages, so chances are there are pre-built binary 53 | distributions available for your platform. You can try installing from source using easy_install:: 54 | 55 | easy_install numpy 56 | easy_install scipy 57 | 58 | If that doesn't work or if you'd rather install using a binary package, consult 59 | http://www.scipy.org/Download. 60 | 61 | Install `gensim` 62 | ----------------- 63 | 64 | You can now install (or upgrade) `gensim` with:: 65 | 66 | easy_install --upgrade gensim 67 | 68 | That's it! Congratulations, you can proceed to the :doc:`tutorials `. 69 | 70 | ----- 71 | 72 | If you also want to run the algorithms over a cluster 73 | of computers, in :doc:`distributed`, you should install with:: 74 | 75 | easy_install gensim[distributed] 76 | 77 | The optional `distributed` feature installs `Pyro (PYthon Remote Objects) `_. 78 | If you don't know what distributed computing means, you can ignore it: 79 | `gensim` will work fine for you anyway. 80 | This optional extension can also be installed separately later with:: 81 | 82 | easy_install Pyro4 83 | 84 | ----- 85 | 86 | There are also alternative routes to install: 87 | 88 | 1. If you have downloaded and unzipped the `tar.gz source `_ 89 | for `gensim` (or you're installing `gensim` from `github `_), 90 | you can run:: 91 | 92 | python setup.py install 93 | 94 | to install `gensim` into your ``site-packages`` folder. 95 | 2. 
If you wish to make local changes to the `gensim` code (`gensim` is, after all, a 96 | package which targets research prototyping and modifications), a preferred 97 | way may be installing with:: 98 | 99 | python setup.py develop 100 | 101 | This will only place a symlink into your ``site-packages`` directory. The actual 102 | files will stay wherever you unpacked them. 103 | 3. If you don't have root priviledges (or just don't want to put the package into 104 | your ``site-packages``), simply unpack the source package somewhere and that's it! No 105 | compilation or installation needed. Just don't forget to set your PYTHONPATH 106 | (or modify ``sys.path``), so that Python can find the unpacked package when importing. 107 | 108 | 109 | Testing `gensim` 110 | ---------------- 111 | 112 | To test the package, unzip the `tar.gz source `_ and run:: 113 | 114 | python setup.py test 115 | 116 | Gensim uses Travis CI for continuous integration: |Travis|_ 117 | 118 | .. |Travis| image:: https://api.travis-ci.org/piskvorky/gensim.png?branch=develop 119 | .. _Travis: https://travis-ci.org/piskvorky/gensim 120 | 121 | 122 | Problems? 123 | --------- 124 | 125 | Use the `gensim discussion group `_ for 126 | questions and troubleshooting. See the :doc:`support page `. 127 | -------------------------------------------------------------------------------- /gensim/scripts/make_wikicorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 
14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | * `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump 22 | 23 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 24 | disk space; gensim's corpus iterators can work with compressed input, too. 25 | 26 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 27 | removing tokens that appear in more than 10%% of all documents). Defaults to 28 | 100,000. 29 | 30 | If you have the `pattern` package installed, this script will use a fancy 31 | lemmatization to get a lemma of each token (instead of plain alphabetic 32 | tokenizer). The package is available at https://github.com/clips/pattern . 33 | 34 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 35 | """ 36 | 37 | 38 | import logging 39 | import os.path 40 | import sys 41 | 42 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 43 | from gensim.models import TfidfModel 44 | 45 | 46 | # Wiki is first scanned for all distinct word types (~7M). The types that 47 | # appear in more than 10% of articles are removed and from the rest, the 48 | # DEFAULT_DICT_SIZE most frequent types are kept. 
49 | DEFAULT_DICT_SIZE = 100000 50 | 51 | 52 | if __name__ == '__main__': 53 | program = os.path.basename(sys.argv[0]) 54 | logger = logging.getLogger(program) 55 | 56 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 57 | logging.root.setLevel(level=logging.INFO) 58 | logger.info("running %s" % ' '.join(sys.argv)) 59 | 60 | # check and process input arguments 61 | if len(sys.argv) < 3: 62 | print(globals()['__doc__'] % locals()) 63 | sys.exit(1) 64 | inp, outp = sys.argv[1:3] 65 | if len(sys.argv) > 3: 66 | keep_words = int(sys.argv[3]) 67 | else: 68 | keep_words = DEFAULT_DICT_SIZE 69 | online = 'online' in program 70 | lemmatize = 'lemma' in program 71 | debug = 'nodebug' not in program 72 | 73 | if online: 74 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 75 | dictionary.allow_update = True # start collecting document frequencies 76 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 77 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 78 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 79 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 80 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 81 | wiki.save(outp + '_corpus.pkl.bz2') 82 | dictionary.allow_update = False 83 | else: 84 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 85 | # only keep the most frequent words (out of total ~8.2m unique tokens) 86 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 87 | # save dictionary and bag-of-words (term-document frequency matrix) 88 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 89 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 90 | # load back the id->word mapping directly from file 91 | # this 
seems to save more memory, compared to keeping the wiki.dictionary object from above 92 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') 93 | del wiki 94 | 95 | # initialize corpus reader and word->id mapping 96 | mm = MmCorpus(outp + '_bow.mm') 97 | 98 | # build tfidf, ~50min 99 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 100 | tfidf.save(outp + '.tfidf_model') 101 | 102 | # save tfidf vectors in matrix market format 103 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 104 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 105 | 106 | logger.info("finished running %s" % program) 107 | -------------------------------------------------------------------------------- /docs/src/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Tutorials 4 | ========= 5 | 6 | 7 | The tutorials are organized as a series of examples that highlight various features 8 | of `gensim`. It is assumed that the reader is familiar with the `Python language `_, has :doc:`installed gensim ` 9 | and read the :doc:`introduction `. 10 | 11 | The examples are divided into parts on: 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | tut1 17 | tut2 18 | tut3 19 | wiki 20 | distributed 21 | 22 | Preliminaries 23 | -------------- 24 | 25 | All the examples can be directly copied to your Python interpreter shell. `IPython `_'s ``cpaste`` command is especially handy for copypasting code fragments, including the leading ``>>>`` characters. 26 | 27 | Gensim uses Python's standard :mod:`logging` module to log various stuff at various 28 | priority levels; to activate logging (this is optional), run 29 | 30 | >>> import logging 31 | >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 32 | 33 | 34 | .. 
_first-example: 35 | 36 | Quick Example 37 | ------------- 38 | 39 | First, let's import gensim and create a small corpus of nine documents and twelve features [1]_: 40 | 41 | >>> from gensim import corpora, models, similarities 42 | >>> 43 | >>> corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)], 44 | >>> [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)], 45 | >>> [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)], 46 | >>> [(0, 1.0), (4, 2.0), (7, 1.0)], 47 | >>> [(3, 1.0), (5, 1.0), (6, 1.0)], 48 | >>> [(9, 1.0)], 49 | >>> [(9, 1.0), (10, 1.0)], 50 | >>> [(9, 1.0), (10, 1.0), (11, 1.0)], 51 | >>> [(8, 1.0), (10, 1.0), (11, 1.0)]] 52 | 53 | :dfn:`Corpus` is simply an object which, when iterated over, returns its documents represented 54 | as sparse vectors. If you're not familiar with the `vector space model `_, we'll bridge the gap between **raw strings**, **corpora** and **sparse vectors** in the next tutorial on :doc:`tut1`. 55 | 56 | If you're familiar with the vector space model, you'll probably know that the way you parse your documents and convert them to vectors 57 | has major impact on the quality of any subsequent applications. 58 | 59 | .. note:: 60 | In this example, the whole corpus is stored in memory, as a Python list. However, 61 | the corpus interface only dictates that a corpus must support iteration over its 62 | constituent documents. For very large corpora, it is advantageous to keep the 63 | corpus on disk, and access its documents sequentially, one at a time. All the 64 | operations and transformations are implemented in such a way that makes 65 | them independent of the size of the corpus, memory-wise. 
66 | 67 | 68 | Next, let's initialize a :dfn:`transformation`: 69 | 70 | >>> tfidf = models.TfidfModel(corpus) 71 | 72 | A transformation is used to convert documents from one vector representation into another: 73 | 74 | >>> vec = [(0, 1), (4, 1)] 75 | >>> print(tfidf[vec]) 76 | [(0, 0.8075244), (4, 0.5898342)] 77 | 78 | Here, we used `Tf-Idf `_, a simple 79 | transformation which takes documents represented as bag-of-words counts and applies 80 | a weighting which discounts common terms (or, equivalently, promotes rare terms). 81 | It also scales the resulting vector to unit length (in the `Euclidean norm `_). 82 | 83 | Transformations are covered in detail in the tutorial on :doc:`tut2`. 84 | 85 | To transform the whole corpus via TfIdf and index it, in preparation for similarity queries: 86 | 87 | >>> index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12) 88 | 89 | and to query the similarity of our query vector ``vec`` against every document in the corpus: 90 | 91 | >>> sims = index[tfidf[vec]] 92 | >>> print(list(enumerate(sims))) 93 | [(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)] 94 | 95 | How to read this output? Document number zero (the first document) has a similarity score of 0.466=46.6\%, 96 | the second document has a similarity score of 19.1\% etc. 97 | 98 | Thus, according to TfIdf document representation and cosine similarity measure, 99 | the most similar to our query document `vec` is document no. 3, with a similarity score of 82.1%. 100 | Note that in the TfIdf representation, any documents which do not share any common features 101 | with ``vec`` at all (documents no. 4--8) get a similarity score of 0.0. See the :doc:`tut3` tutorial for more detail. 102 | 103 | ------ 104 | 105 | .. [1] This is the same corpus as used in 106 | `Deerwester et al. (1990): Indexing by Latent Semantic Analysis `_, Table 2. 
class BleiCorpus(IndexedCorpus):
    """
    Corpus in Blei's LDA-C format.

    The corpus is represented as two files: one describing the documents, and another
    describing the mapping between words and their ids.

    Each document is one line::

        N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN

    where ``N`` is the number of ``id:value`` pairs that follow on the line.

    The vocabulary is a file with words, one word per line; word at line K has an
    implicit ``id=K``.
    """

    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname_vocab` is the file with vocabulary; if not specified, several
        conventional locations are tried in turn (``fname.vocab``, a ``vocab.txt``
        next to `fname`, ...) and the first existing file is used.

        Raises `IOError` if no vocabulary file can be found.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                    utils.smart_extension(fname, '.vocab'),
                    utils.smart_extension(fname, '/vocab.txt'),
                    utils.smart_extension(fname_base, '.vocab'),
                    utils.smart_extension(fname_dir, '/vocab.txt'),
                    ]:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop fell through without `break` => none of the candidates exist
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))

    def __iter__(self):
        """
        Iterate over the corpus, returning one sparse vector at a time.
        """
        lineno = -1  # so that an empty file yields length 0 below
        with utils.smart_open(self.fname) as fin:
            for lineno, line in enumerate(fin):
                yield self.line2doc(line)
        self.length = lineno + 1

    def line2doc(self, line):
        """
        Parse one LDA-C line into a sparse document: a list of `(word_id, weight)` tuples.

        Raises `ValueError` if the leading count does not match the number of pairs.
        """
        parts = utils.to_unicode(line).split()
        if int(parts[0]) != len(parts) - 1:
            raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
        doc = [part.rsplit(':', 1) for part in parts[1:]]
        doc = [(int(p1), float(p2)) for p1, p2 in doc]
        return doc

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        Returns the list of byte offsets of the stored documents (used for indexing).

        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            # `dict.keys()` is a view object in Python 3 and cannot be concatenated
            # to a list directly; materialize it first
            num_terms = 1 + max([-1] + list(id2word.keys()))

        logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                # write len(parts), not len(doc): near-zero entries were dropped
                # above, and the leading count must equal the number of id:value
                # pairs actually present (line2doc rejects the line otherwise)
                fout.write(utils.to_utf8("%i %s\n" % (len(parts), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass BleiCorpus
def remove_stopwords(s):
    """Return `s` with every word contained in ``STOPWORDS`` removed."""
    tokens = utils.to_unicode(s).split()
    return " ".join(token for token in tokens if token not in STOPWORDS)


RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)


def strip_punctuation(s):
    """Replace each run of punctuation characters in `s` with a single space."""
    return RE_PUNCT.sub(" ", utils.to_unicode(s))


# unicode.translate cannot delete characters the way str.translate can,
# so the translate-based variant is simply an alias for the regex version
strip_punctuation2 = strip_punctuation


RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE)


def strip_tags(s):
    """Remove HTML/XML-style ``<...>`` tags from `s`."""
    return RE_TAGS.sub("", utils.to_unicode(s))


def strip_short(s, minsize=3):
    """Drop all whitespace-delimited tokens of `s` shorter than `minsize` characters."""
    words = utils.to_unicode(s).split()
    return " ".join(word for word in words if len(word) >= minsize)


RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)


def strip_numeric(s):
    """Delete every run of digits from `s`."""
    return RE_NUMERIC.sub("", utils.to_unicode(s))


RE_NONALPHA = re.compile(r"\W", re.UNICODE)


def strip_non_alphanum(s):
    """Replace each non-alphanumeric character of `s` with a space."""
    return RE_NONALPHA.sub(" ", utils.to_unicode(s))


RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)


def strip_multiple_whitespaces(s):
    """Collapse each run of whitespace characters in `s` into a single space."""
    return RE_WHITESPACE.sub(" ", utils.to_unicode(s))


RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE)
RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE)


def split_alphanum(s):
    """Insert a space between adjacent letter/digit and digit/letter groups in `s`."""
    text = RE_AL_NUM.sub(r"\1 \2", utils.to_unicode(s))
    return RE_NUM_AL.sub(r"\1 \2", text)


def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in utils.to_unicode(text).split())

stem = stem_text


# default preprocessing pipeline, applied in order by preprocess_string()
DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces,
                   strip_numeric, remove_stopwords, strip_short, stem_text]


def preprocess_string(s, filters=DEFAULT_FILTERS):
    """Run `s` through every filter in `filters` (in order), then split on whitespace."""
    processed = utils.to_unicode(s)
    for apply_filter in filters:
        processed = apply_filter(processed)
    return processed.split()


def preprocess_documents(docs):
    """Apply :func:`preprocess_string` to each document in `docs`; return a list of token lists."""
    return [preprocess_string(document) for document in docs]


def read_file(path):
    """Return the entire contents of the file at `path`."""
    with utils.smart_open(path) as fin:
        return fin.read()


def read_files(pattern):
    """Read every file matching the glob `pattern`; return their contents as a list."""
    return [read_file(fname) for fname in glob.glob(pattern)]
class SvmLightCorpus(IndexedCorpus):
    """
    Corpus in SVMlight format.

    Quoting http://svmlight.joachims.org/:
    The input file contains the training examples. The first lines
    may contain comments and are ignored if they start with #. Each of the following
    lines represents one training example and is of the following format::

        <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info>
        <target> .=. +1 | -1 | 0 | <float>
        <feature> .=. <integer> | "qid"
        <value> .=. <float>
        <info> .=. <string>

    The "qid" feature (used for SVMlight ranking), if present, is ignored.

    Although not mentioned in the specification above, SVMlight also expect its
    feature ids to be 1-based (counting starts at 1). We convert features to 0-base
    internally by decrementing all ids when loading a SVMlight input file, and
    increment them again when saving as SVMlight.

    """

    def __init__(self, fname, store_labels=True):
        """
        Initialize the corpus from a file.

        Although vector labels (~SVM target class) are not used in gensim in any way,
        they are parsed and stored in `self.labels` for convenience. Set `store_labels=False`
        to skip storing these labels (e.g. if there are too many vectors to store
        the self.labels array in memory).

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        self.fname = fname  # input file, see class doc for format
        self.length = None  # number of documents; filled in lazily by __iter__
        self.store_labels = store_labels
        self.labels = []

    def __iter__(self):
        """
        Iterate over the corpus, returning one sparse vector at a time.
        """
        lineno = -1  # so that an empty file yields length 0 below
        self.labels = []  # reset: labels are re-collected on every pass
        with utils.smart_open(self.fname) as fin:
            for lineno, line in enumerate(fin):
                doc = self.line2doc(line)
                if doc is not None:  # skip comment/empty lines
                    if self.store_labels:
                        self.labels.append(doc[1])
                    yield doc[0]
        self.length = lineno + 1

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
        """
        Save a corpus in the SVMlight format.

        The SVMlight `<target>` class tag is taken from the `labels` array, or set
        to 0 for all documents if `labels` is not supplied.

        Returns the list of byte offsets of the stored documents (used for indexing).

        This function is automatically called by `SvmLightCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        logger.info("converting corpus to SVMlight format: %s" % fname)

        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for docno, doc in enumerate(corpus):
                label = labels[docno] if labels else 0  # target class is 0 by default
                offsets.append(fout.tell())
                fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())[0]

    def line2doc(self, line):
        """
        Create a document from a single line (string) in SVMlight format.

        Return a `(doc, target)` 2-tuple, or None for comment/empty lines.
        """
        line = utils.to_unicode(line)
        # strip an optional trailing '# comment'. NB: the previous
        # `line[: line.find('#')]` silently chopped off the last character of any
        # line *without* a '#' (find() returns -1), corrupting a final line that
        # lacks a trailing newline; split() handles both cases correctly.
        line = line.split('#', 1)[0].strip()
        if not line:
            return None  # ignore comments and empty lines
        parts = line.split()
        if not parts:
            raise ValueError('invalid line format in %s' % self.fname)
        target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
        # ignore 'qid' features, convert 1-based feature ids to 0-based
        doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid']
        return doc, target

    @staticmethod
    def doc2line(doc, label=0):
        """
        Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
        """
        pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc)  # +1 to convert 0-base to 1-base
        return "%s %s\n" % (label, pairs)

# endclass SvmLightCorpus
9 | 10 | Example: ./gensim_xml.py eng lsi 11 | """ 12 | 13 | 14 | import logging 15 | import sys 16 | import os.path 17 | import re 18 | 19 | 20 | from gensim.corpora import sources, dmlcorpus, MmCorpus 21 | from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity 22 | 23 | import gensim_build 24 | 25 | 26 | # set to True to do everything EXCEPT actually writing out similar.xml files to disk. 27 | # similar.xml files are NOT written if DRY_RUN is true. 28 | DRY_RUN = False 29 | 30 | # how many 'most similar' documents to store in each similar.xml? 31 | MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored) 32 | MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit). 33 | 34 | # if there are no similar articles (after the pruning), do we still want to generate similar.xml? 35 | SAVE_EMPTY = True 36 | 37 | # xml template for similar articles 38 | ARTICLE = """ 39 |
40 | 41 | %(author)s 42 | 43 | %(title)s 44 | %(suffix)s 45 | 46 | 47 | 48 |
""" 49 | 50 | # template for the whole similar.xml file (will be filled with multiple ARTICLE instances) 51 | SIMILAR = """\ 52 | 53 | %s 54 | 55 | """ 56 | 57 | 58 | 59 | def generateSimilar(corpus, index, method): 60 | for docNo, topSims in enumerate(index): # for each document 61 | # store similarities to the following file 62 | outfile = os.path.join(corpus.articleDir(docNo), 'similar_%s.xml' % method) 63 | 64 | articles = [] # collect similars in this list 65 | for docNo2, score in topSims: # for each most similar article 66 | if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) 67 | source, (intId, pathId) = corpus.documents[docNo2] 68 | meta = corpus.getMeta(docNo2) 69 | suffix, author, title = '', meta.get('author', ''), meta.get('title', '') 70 | articles.append(ARTICLE % locals()) # add the similar article to output 71 | if len(articles) >= MAX_SIMILAR: 72 | break 73 | 74 | # now `articles` holds multiple strings in similar_*.xml format 75 | if SAVE_EMPTY or articles: 76 | output = ''.join(articles) # concat all similars to one string 77 | if not DRY_RUN: # only open output files for writing if DRY_RUN is false 78 | logging.info("generating %s (%i similars)" % (outfile, len(articles))) 79 | outfile = open(outfile, 'w') 80 | outfile.write(SIMILAR % output) # add xml headers and print to file 81 | outfile.close() 82 | else: 83 | logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output)) 84 | else: 85 | logging.debug("skipping %s (no similar found)" % outfile) 86 | 87 | 88 | 89 | if __name__ == '__main__': 90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 91 | logging.root.setLevel(level=logging.INFO) 92 | logging.info("running %s" % ' '.join(sys.argv)) 93 | 94 | program = os.path.basename(sys.argv[0]) 95 | 96 | # check and process input arguments 97 | if len(sys.argv) < 3: 98 | print(globals()['__doc__'] % locals()) 99 | 
sys.exit(1) 100 | language = sys.argv[1] 101 | method = sys.argv[2].strip().lower() 102 | 103 | logging.info("loading corpus mappings") 104 | config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), 105 | resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) 106 | 107 | logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) 108 | id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) 109 | logging.info("loaded %i word ids" % len(id2word)) 110 | 111 | corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) 112 | input = MmCorpus(config.resultFile('_%s.mm' % method)) 113 | assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) 114 | 115 | # initialize structure for similarity queries 116 | if method == 'lsi' or method == 'rp': # for these methods, use dense vectors 117 | index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms) 118 | else: 119 | index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1) 120 | 121 | index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) 122 | generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format 123 | 124 | logging.info("finished running %s" % program) 125 | 126 | -------------------------------------------------------------------------------- /docs/src/dist_lda.rst: -------------------------------------------------------------------------------- 1 | .. _dist_lda: 2 | 3 | Distributed Latent Dirichlet Allocation 4 | ============================================ 5 | 6 | 7 | .. note:: 8 | See :doc:`distributed` for an introduction to distributed computing in `gensim`. 
9 | 10 | 11 | Setting up the cluster 12 | _______________________ 13 | 14 | See the tutorial on :doc:`dist_lsi`; setting up a cluster for LDA is completely 15 | analogous, except you want to run `lda_worker` and `lda_dispatcher` scripts instead 16 | of `lsi_worker` and `lsi_dispatcher`. 17 | 18 | Running LDA 19 | ____________ 20 | 21 | Run LDA like you normally would, but turn on the `distributed=True` constructor 22 | parameter:: 23 | 24 | >>> # extract 100 LDA topics, using default parameters 25 | >>> lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, distributed=True) 26 | using distributed version with 4 workers 27 | running online LDA training, 100 topics, 1 passes over the supplied corpus of 3199665 documets, updating model once every 40000 documents 28 | .. 29 | 30 | 31 | In serial mode (no distribution), creating this online LDA :doc:`model of Wikipedia ` 32 | takes 10h56m on my laptop (OS X, C2D 2.53GHz, 4GB RAM with `libVec`). 33 | In distributed mode with four workers (Linux, Xeons of 2Ghz, 4GB RAM 34 | with `ATLAS `_), the wallclock time taken drops to 3h20m. 35 | 36 | To run standard batch LDA (no online updates of mini-batches) instead, you would similarly 37 | call:: 38 | 39 | >>> lda = LdaModel(corpus=mm, id2word=id2token, num_topics=100, update_every=0, passes=20, distributed=True) 40 | using distributed version with 4 workers 41 | running batch LDA training, 100 topics, 20 passes over the supplied corpus of 3199665 documets, updating model once every 3199665 documents 42 | initializing workers 43 | iteration 0, dispatching documents up to #10000/3199665 44 | iteration 0, dispatching documents up to #20000/3199665 45 | ... 
46 | 47 | and then, some two days later:: 48 | 49 | iteration 19, dispatching documents up to #3190000/3199665 50 | iteration 19, dispatching documents up to #3199665/3199665 51 | reached the end of input; now waiting for all remaining jobs to finish 52 | 53 | :: 54 | 55 | >>> lda.print_topics(20) 56 | topic #0: 0.007*disease + 0.006*medical + 0.005*treatment + 0.005*cells + 0.005*cell + 0.005*cancer + 0.005*health + 0.005*blood + 0.004*patients + 0.004*drug 57 | topic #1: 0.024*king + 0.013*ii + 0.013*prince + 0.013*emperor + 0.008*duke + 0.008*empire + 0.007*son + 0.007*china + 0.007*dynasty + 0.007*iii 58 | topic #2: 0.031*film + 0.017*films + 0.005*movie + 0.005*directed + 0.004*man + 0.004*episode + 0.003*character + 0.003*cast + 0.003*father + 0.003*mother 59 | topic #3: 0.022*user + 0.012*edit + 0.009*wikipedia + 0.007*block + 0.007*my + 0.007*here + 0.007*edits + 0.007*blocked + 0.006*revert + 0.006*me 60 | topic #4: 0.045*air + 0.026*aircraft + 0.021*force + 0.018*airport + 0.011*squadron + 0.010*flight + 0.010*military + 0.008*wing + 0.007*aviation + 0.007*f 61 | topic #5: 0.025*sun + 0.022*star + 0.018*moon + 0.015*light + 0.013*stars + 0.012*planet + 0.011*camera + 0.010*mm + 0.009*earth + 0.008*lens 62 | topic #6: 0.037*radio + 0.026*station + 0.022*fm + 0.014*news + 0.014*stations + 0.014*channel + 0.013*am + 0.013*racing + 0.011*tv + 0.010*broadcasting 63 | topic #7: 0.122*image + 0.099*jpg + 0.046*file + 0.038*uploaded + 0.024*png + 0.014*contribs + 0.013*notify + 0.013*logs + 0.013*picture + 0.013*flag 64 | topic #8: 0.036*russian + 0.030*soviet + 0.028*polish + 0.024*poland + 0.022*russia + 0.013*union + 0.012*czech + 0.011*republic + 0.011*moscow + 0.010*finland 65 | topic #9: 0.031*language + 0.014*word + 0.013*languages + 0.009*term + 0.009*words + 0.008*example + 0.007*names + 0.007*meaning + 0.006*latin + 0.006*form 66 | topic #10: 0.029*w + 0.029*toronto + 0.023*l + 0.020*hockey + 0.019*nhl + 0.014*ontario + 0.012*calgary + 0.011*edmonton + 
0.011*hamilton + 0.010*season 67 | topic #11: 0.110*wikipedia + 0.110*articles + 0.030*library + 0.029*wikiproject + 0.028*project + 0.019*data + 0.016*archives + 0.012*needing + 0.009*reference + 0.009*statements 68 | topic #12: 0.032*http + 0.030*your + 0.022*request + 0.017*sources + 0.016*archived + 0.016*modify + 0.015*changes + 0.015*creation + 0.014*www + 0.013*try 69 | topic #13: 0.011*your + 0.010*my + 0.009*we + 0.008*don + 0.008*get + 0.008*know + 0.007*me + 0.006*think + 0.006*question + 0.005*find 70 | topic #14: 0.073*r + 0.066*japanese + 0.062*japan + 0.018*tokyo + 0.008*prefecture + 0.005*osaka + 0.004*j + 0.004*sf + 0.003*kyoto + 0.003*manga 71 | topic #15: 0.045*da + 0.045*fr + 0.027*kategori + 0.026*pl + 0.024*nl + 0.021*pt + 0.017*en + 0.015*categoria + 0.014*es + 0.012*kategorie 72 | topic #16: 0.010*death + 0.005*died + 0.005*father + 0.004*said + 0.004*himself + 0.004*took + 0.004*son + 0.004*killed + 0.003*murder + 0.003*wife 73 | topic #17: 0.027*book + 0.021*published + 0.020*books + 0.014*isbn + 0.010*author + 0.010*magazine + 0.009*press + 0.009*novel + 0.009*writers + 0.008*story 74 | topic #18: 0.027*football + 0.024*players + 0.023*cup + 0.019*club + 0.017*fc + 0.017*footballers + 0.017*league + 0.011*season + 0.007*teams + 0.007*goals 75 | topic #19: 0.032*band + 0.024*album + 0.014*albums + 0.013*guitar + 0.013*rock + 0.011*records + 0.011*vocals + 0.009*live + 0.008*bass + 0.008*track 76 | 77 | 78 | 79 | If you used the distributed LDA implementation in `gensim`, please let me know (my 80 | email is at the bottom of this page). I would like to hear about your application and 81 | the possible (inevitable?) issues that you encountered, to improve `gensim` in the future. 
class TestLdaMallet(unittest.TestCase):
    def setUp(self):
        # corpus fixture + location of the Mallet binary; Mallet is an optional
        # external dependency, so each test silently no-ops when MALLET_HOME is unset
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        mallet_home = os.environ.get('MALLET_HOME', None)
        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None

    def testTransform(self):
        """Train LDA via Mallet and check the inferred topic mixture of one document."""
        if not self.mallet_path:
            return
        passed = False
        for i in range(5):  # restart at most 5 times
            # create the transformation model
            model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
            expected = [0.49, 0.51]
            passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2)  # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                            (i, sorted(vec), sorted(expected)))
        self.assertTrue(passed)

    def testPersistence(self):
        """Save/load round-trip must preserve topics and word-topic counts."""
        if not self.mallet_path:
            return
        fname = testfile()
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
        model.save(fname)
        model2 = ldamallet.LdaMallet.load(fname)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testPersistenceCompressed(self):
        """Same round-trip as testPersistence, but through a gzip-compressed file."""
        if not self.mallet_path:
            return
        fname = testfile() + '.gz'
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
        model.save(fname)
        model2 = ldamallet.LdaMallet.load(fname, mmap=None)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testLargeMmap(self):
        """Large arrays stored separately must load back as numpy memmaps."""
        if not self.mallet_path:
            return
        # use the local `fname` consistently (the original assigned it but then
        # kept calling testfile() again, leaving `fname` unused)
        fname = testfile()
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        model2 = ldamodel.LdaModel.load(fname, mmap='r')
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(isinstance(model2.wordtopics, numpy.memmap))
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testLargeMmapCompressed(self):
        """mmap'ing a compressed file is impossible and must raise IOError."""
        if not self.mallet_path:
            return
        fname = testfile() + '.gz'
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r')
#endclass TestLdaMallet