├── gensim ├── scripts │ ├── __init__.py │ ├── make_wiki.py │ ├── make_wiki_lemma.py │ ├── make_wiki_online.py │ ├── make_wiki_online_lemma.py │ ├── make_wiki_online_nodebug.py │ └── make_wikicorpus.py ├── examples │ └── dmlcz │ │ ├── __init__.py │ │ ├── runall.sh │ │ ├── gensim_build.py │ │ ├── gensim_genmodel.py │ │ └── gensim_xml.py ├── test │ ├── test_data │ │ ├── lee.cor │ │ ├── dtm_test.dict │ │ ├── testcorpus.mm.index │ │ ├── testcorpus.blei.index │ │ ├── testcorpus.low.index │ │ ├── testcorpus.uci.index │ │ ├── head500.noblanks.cor.bz2 │ │ ├── testcorpus.mallet.index │ │ ├── testcorpus.svmlight.index │ │ ├── head500.noblanks.cor_tfidf.model │ │ ├── testcorpus.blei.vocab │ │ ├── testcorpus.uci.vocab │ │ ├── miIslita.cor │ │ ├── test_corpus_small.mm │ │ ├── testcorpus.blei │ │ ├── testcorpus.txt │ │ ├── testcorpus.low │ │ ├── testcorpus.svmlight │ │ ├── test_corpus_ok.mm │ │ ├── mihalcea_tarau.kw.txt │ │ ├── testcorpus.mallet │ │ ├── testcorpus.uci │ │ ├── mihalcea_tarau.kwpos.txt │ │ ├── testcorpus.mm │ │ ├── mihalcea_tarau.summ.txt │ │ ├── ldavowpalwabbit.dict.txt │ │ ├── testsummarization_unrelated.txt │ │ └── mihalcea_tarau.txt │ ├── __init__.py │ ├── test_hdpmodel.py │ ├── test_big.py │ ├── test_dtm.py │ ├── test_utils.py │ ├── test_logentropy_model.py │ ├── test_parsing.py │ ├── test_rpmodel.py │ ├── test_phrases.py │ ├── test_tfidfmodel.py │ ├── test_keywords.py │ ├── test_miislita.py │ └── test_ldamallet_wrapper.py ├── summarization │ ├── __init__.py │ ├── commons.py │ ├── syntactic_unit.py │ ├── pagerank_weighted.py │ ├── bm25.py │ └── textcleaner.py ├── models │ ├── wrappers │ │ └── __init__.py │ ├── voidptr.h │ ├── __init__.py │ ├── word2vec_inner.pxd │ ├── rpmodel.py │ ├── lsi_worker.py │ ├── lda_worker.py │ └── logentropy_model.py ├── parsing │ ├── __init__.py │ └── preprocessing.py ├── similarities │ └── __init__.py ├── corpora │ ├── __init__.py │ ├── mmcorpus.py │ ├── csvcorpus.py │ ├── malletcorpus.py │ ├── textcorpus.py │ ├── bleicorpus.py │ 
└── svmlightcorpus.py ├── __init__.py └── nosy.py ├── docs └── src │ ├── _static │ ├── favicon.ico │ └── images │ │ ├── bg.png │ │ ├── arrows.png │ │ ├── gensim.png │ │ ├── ukazka.png │ │ ├── bullets.png │ │ ├── checker.png │ │ ├── default.png │ │ ├── download.png │ │ ├── favicon.ico │ │ ├── loading.gif │ │ ├── tagline.png │ │ ├── ukazka2.png │ │ ├── gensim_code.png │ │ ├── get-started.png │ │ ├── logo-gensim.png │ │ ├── menubutton.png │ │ ├── twitterbird.png │ │ ├── gensim-footer.png │ │ ├── googlegroups.png │ │ ├── direct-install.png │ │ ├── features │ │ ├── robust.png │ │ ├── support.png │ │ ├── free_lgpl.png │ │ ├── converters.png │ │ ├── memory_independence.png │ │ ├── similarity_queries.png │ │ ├── platform_independence.png │ │ └── efficient_implementations.png │ │ ├── gensim_compact.png │ │ ├── tagline_compact.png │ │ ├── logo-gensim_compact.png │ │ ├── references │ │ ├── logo_dtu.gif │ │ ├── logo_eudml.png │ │ ├── logo_ghent.png │ │ ├── logo_ibcn.png │ │ ├── logo_issuu.jpeg │ │ ├── logo_roistr.png │ │ ├── logo_dynadmic.png │ │ ├── logo_tailwind.png │ │ └── logo_sportsauthority.png │ │ └── forkme_left_white_ffffff.png │ ├── gensim_theme │ ├── page.html │ ├── theme.conf │ ├── search.html │ ├── domainindex.html │ └── genindex.html │ ├── indextoc.rst │ ├── corpora │ ├── corpora.rst │ ├── dictionary.rst │ ├── mmcorpus.rst │ ├── wikicorpus.rst │ ├── bleicorpus.rst │ ├── lowcorpus.rst │ ├── svmlightcorpus.rst │ ├── textcorpus.rst │ ├── hashdictionary.rst │ ├── ucicorpus.rst │ └── indexedcorpus.rst │ ├── matutils.rst │ ├── models │ ├── rpmodel.rst │ ├── models.rst │ ├── lda_worker.rst │ ├── lsi_worker.rst │ ├── tfidfmodel.rst │ ├── lda_dispatcher.rst │ ├── lsi_dispatcher.rst │ ├── lsimodel.rst │ ├── ldamodel.rst │ ├── word2vec.rst │ ├── doc2vec.rst │ ├── logentropy_model.rst │ ├── phrases.rst │ ├── hdpmodel.rst │ ├── wrappers │ │ ├── wrappers.rst │ │ ├── ldamallet.rst │ │ ├── dtmmodel.rst │ │ └── ldavowpalwabbit.rst │ └── ldamulticore.rst │ ├── similarities │ ├── 
simserver.rst │ └── docsim.rst │ ├── utils.rst │ ├── interfaces.rst │ ├── apiref.rst │ ├── support.rst │ ├── Makefile │ ├── distributed.rst │ ├── about.rst │ ├── changes_080.rst │ ├── install.rst │ ├── tutorial.rst │ └── dist_lda.rst ├── setup.cfg ├── CONTRIBUTING.md ├── MANIFEST.in ├── .travis.yml ├── continuous_integration └── appveyor │ ├── requirements.txt │ └── run_with_env.cmd ├── .gitignore └── appveyor.yml /gensim/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /gensim/scripts/make_wiki.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_lemma.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online_lemma.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /gensim/scripts/make_wiki_online_nodebug.py: -------------------------------------------------------------------------------- 1 | make_wikicorpus.py -------------------------------------------------------------------------------- /docs/src/_static/favicon.ico: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/favicon.ico -------------------------------------------------------------------------------- /gensim/test/test_data/lee.cor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/lee.cor -------------------------------------------------------------------------------- /docs/src/_static/images/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/bg.png -------------------------------------------------------------------------------- /gensim/test/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains automated code tests for all other gensim packages. 
3 | """ 4 | -------------------------------------------------------------------------------- /docs/src/_static/images/arrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/arrows.png -------------------------------------------------------------------------------- /docs/src/_static/images/gensim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim.png -------------------------------------------------------------------------------- /docs/src/_static/images/ukazka.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/ukazka.png -------------------------------------------------------------------------------- /docs/src/gensim_theme/page.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | {{ body }} 4 | {% endblock %} 5 | -------------------------------------------------------------------------------- /docs/src/_static/images/bullets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/bullets.png -------------------------------------------------------------------------------- /docs/src/_static/images/checker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/checker.png -------------------------------------------------------------------------------- /docs/src/_static/images/default.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/default.png -------------------------------------------------------------------------------- /docs/src/_static/images/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/download.png -------------------------------------------------------------------------------- /docs/src/_static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/favicon.ico -------------------------------------------------------------------------------- /docs/src/_static/images/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/loading.gif -------------------------------------------------------------------------------- /docs/src/_static/images/tagline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/tagline.png -------------------------------------------------------------------------------- /docs/src/_static/images/ukazka2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/ukazka2.png -------------------------------------------------------------------------------- /gensim/test/test_data/dtm_test.dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/dtm_test.dict -------------------------------------------------------------------------------- /docs/src/_static/images/gensim_code.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim_code.png -------------------------------------------------------------------------------- /docs/src/_static/images/get-started.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/get-started.png -------------------------------------------------------------------------------- /docs/src/_static/images/logo-gensim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/logo-gensim.png -------------------------------------------------------------------------------- /docs/src/_static/images/menubutton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/menubutton.png -------------------------------------------------------------------------------- /docs/src/_static/images/twitterbird.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/twitterbird.png -------------------------------------------------------------------------------- /docs/src/_static/images/gensim-footer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim-footer.png -------------------------------------------------------------------------------- /docs/src/_static/images/googlegroups.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/googlegroups.png -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.mm.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.mm.index -------------------------------------------------------------------------------- /docs/src/_static/images/direct-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/direct-install.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/robust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/robust.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/support.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/support.png -------------------------------------------------------------------------------- /docs/src/_static/images/gensim_compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/gensim_compact.png -------------------------------------------------------------------------------- /docs/src/_static/images/tagline_compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/tagline_compact.png 
-------------------------------------------------------------------------------- /docs/src/gensim_theme/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = none 3 | stylesheet = css/style.css 4 | pygments_style = sphinx 5 | 6 | [options] 7 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.blei.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.blei.index -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.low.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.low.index -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.uci.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.uci.index -------------------------------------------------------------------------------- /docs/src/_static/images/features/free_lgpl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/free_lgpl.png -------------------------------------------------------------------------------- /gensim/test/test_data/head500.noblanks.cor.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/head500.noblanks.cor.bz2 -------------------------------------------------------------------------------- 
/gensim/test/test_data/testcorpus.mallet.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.mallet.index -------------------------------------------------------------------------------- /docs/src/_static/images/features/converters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/converters.png -------------------------------------------------------------------------------- /docs/src/_static/images/logo-gensim_compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/logo-gensim_compact.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_dtu.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_dtu.gif -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_eudml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_eudml.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_ghent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_ghent.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_ibcn.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_ibcn.png -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.svmlight.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/testcorpus.svmlight.index -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_issuu.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_issuu.jpeg -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_roistr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_roistr.png -------------------------------------------------------------------------------- /docs/src/_static/images/forkme_left_white_ffffff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/forkme_left_white_ffffff.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_dynadmic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_dynadmic.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_tailwind.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_tailwind.png -------------------------------------------------------------------------------- /gensim/test/test_data/head500.noblanks.cor_tfidf.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/gensim/test/test_data/head500.noblanks.cor_tfidf.model -------------------------------------------------------------------------------- /docs/src/_static/images/features/memory_independence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/memory_independence.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/similarity_queries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/similarity_queries.png -------------------------------------------------------------------------------- /docs/src/_static/images/features/platform_independence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/platform_independence.png -------------------------------------------------------------------------------- /docs/src/_static/images/references/logo_sportsauthority.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/references/logo_sportsauthority.png -------------------------------------------------------------------------------- 
/docs/src/_static/images/features/efficient_implementations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deliarusu/gensim/develop/docs/src/_static/images/features/efficient_implementations.png -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.blei.vocab: -------------------------------------------------------------------------------- 1 | human 2 | interface 3 | computer 4 | user 5 | system 6 | response 7 | time 8 | eps 9 | survey 10 | trees 11 | graph 12 | minors 13 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.uci.vocab: -------------------------------------------------------------------------------- 1 | human 2 | interface 3 | computer 4 | user 5 | system 6 | response 7 | time 8 | eps 9 | survey 10 | trees 11 | graph 12 | minors 13 | -------------------------------------------------------------------------------- /docs/src/indextoc.rst: -------------------------------------------------------------------------------- 1 | .. 
toctree:: 2 | :hidden: 3 | :maxdepth: 1 4 | 5 | intro 6 | install 7 | tutorial 8 | distributed 9 | support 10 | wiki 11 | apiref 12 | -------------------------------------------------------------------------------- /gensim/summarization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # bring model classes directly into package namespace, to save some typing 3 | from .summarizer import summarize, summarize_corpus 4 | from .keywords import keywords -------------------------------------------------------------------------------- /gensim/test/test_data/miIslita.cor: -------------------------------------------------------------------------------- 1 | LSI tutorials and fast tracks 2 | Books on semantic analysis 3 | Learning latent semantic indexing 4 | Advances in structures and advances in indexing 5 | Analysis of latent structures 6 | -------------------------------------------------------------------------------- /gensim/models/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains wrappers for other topic modeling programs. 3 | """ 4 | 5 | from .ldamallet import LdaMallet 6 | from .dtmmodel import DtmModel 7 | from .ldavowpalwabbit import LdaVowpalWabbit 8 | -------------------------------------------------------------------------------- /docs/src/corpora/corpora.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora` -- Package for corpora I/O 2 | ========================================== 3 | 4 | .. 
automodule:: gensim.corpora 5 | :synopsis: Package for corpora I/O 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains functions to preprocess raw text 3 | """ 4 | 5 | # bring model classes directly into package namespace, to save some typing 6 | from .porter import PorterStemmer 7 | from .preprocessing import * 8 | -------------------------------------------------------------------------------- /gensim/test/test_data/test_corpus_small.mm: -------------------------------------------------------------------------------- 1 | %%matrixmarket matrix coordinate real general 2 | 3 5 9 3 | 1 1 1.000000 4 | 1 2 3.000000 5 | 1 4 5.000000 6 | 2 2 2.000000 7 | 2 3 1.000000 8 | 2 5 4.000000 9 | 3 1 2.000000 10 | 3 2 2.000000 11 | 3 4 1.000000 -------------------------------------------------------------------------------- /docs/src/matutils.rst: -------------------------------------------------------------------------------- 1 | :mod:`matutils` -- Math utils 2 | ============================== 3 | 4 | .. automodule:: gensim.matutils 5 | :synopsis: Math utils 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/rpmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.rpmodel` -- Random Projections 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models.rpmodel 5 | :synopsis: Random Projections 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/similarities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains implementations of pairwise similarity queries. 3 | """ 4 | 5 | # bring classes directly into package namespace, to save some typing 6 | from .docsim import Similarity, MatrixSimilarity, SparseMatrixSimilarity 7 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.blei: -------------------------------------------------------------------------------- 1 | 3 0:1.0 1:1.0 2:1.0 2 | 6 2:1.0 3:1.0 4:1.0 5:1.0 6:1.0 8:1.0 3 | 4 1:1.0 3:1.0 4:1.0 7:1.0 4 | 3 0:1.0 4:2.0 7:1.0 5 | 3 3:1.0 5:1.0 6:1.0 6 | 1 9:1.0 7 | 2 9:1.0 10:1.0 8 | 3 9:1.0 10:1.0 11:1.0 9 | 3 8:1.0 10:1.0 11:1.0 10 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.txt: -------------------------------------------------------------------------------- 1 | computer human interface 2 | computer response survey system time user 3 | interface system user eps 4 | human system system eps 5 | response time user 6 | trees 7 | trees graph 8 | trees graph minors 9 | survey graph minors 10 | -------------------------------------------------------------------------------- /docs/src/models/models.rst: -------------------------------------------------------------------------------- 1 | :mod:`models` -- Package for transformation models 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models 5 | :synopsis: Package for transformation models 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.low: -------------------------------------------------------------------------------- 1 | 9 2 | computer human interface 3 | computer response survey system time user 4 | interface system user eps 5 | human system system eps 6 | response time user 7 | trees 8 | trees graph 9 | trees graph minors 10 | survey graph minors 11 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.svmlight: -------------------------------------------------------------------------------- 1 | 0 1:1.0 2:1.0 3:1.0 2 | 0 1:1.0 4:1.0 5:1.0 6:1.0 7:1.0 8:1.0 3 | 0 3:1.0 6:1.0 8:1.0 9:1.0 4 | 0 2:1.0 6:2.0 9:1.0 5 | 0 4:1.0 7:1.0 8:1.0 6 | 0 10:1.0 7 | 0 10:1.0 11:1.0 8 | 0 10:1.0 11:1.0 12:1.0 9 | 0 5:1.0 11:1.0 12:1.0 10 | -------------------------------------------------------------------------------- /docs/src/similarities/simserver.rst: -------------------------------------------------------------------------------- 1 | :mod:`simserver` -- Document similarity server 2 | ====================================================== 3 | 4 | .. automodule:: simserver.simserver 5 | :synopsis: Document similarity server 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lda_worker.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lda_worker` -- Worker for distributed LDA 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models.lda_worker 5 | :synopsis: Worker for distributed LDA 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lsi_worker.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lsi_worker` -- Worker for distributed LSI 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.lsi_worker 5 | :synopsis: Worker for distributed LSI 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/test/test_data/test_corpus_ok.mm: -------------------------------------------------------------------------------- 1 | %%matrixmarket matrix coordinate real general 2 | 3 5 9 3 | 1 1 1.000000 4 | 1 2 3.000000 5 | 1 4 5.000000 6 | 2 2 2.000000 7 | 2 3 1.000000 8 | 2 5 4.000000 9 | 3 1 2.000000 10 | 3 2 2.000000 11 | 3 3 8.000000 12 | 3 4 1.000000 13 | 3 5 2.000000 -------------------------------------------------------------------------------- /docs/src/utils.rst: -------------------------------------------------------------------------------- 1 | :mod:`utils` -- Various utility functions 2 | ========================================== 3 | 4 | .. automodule:: gensim.utils 5 | :synopsis: Various utility functions 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/interfaces.rst: -------------------------------------------------------------------------------- 1 | :mod:`interfaces` -- Core gensim interfaces 2 | ============================================ 3 | 4 | .. 
automodule:: gensim.interfaces 5 | :synopsis: Core gensim interfaces 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/tfidfmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.tfidfmodel` -- TF-IDF model 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.tfidfmodel 5 | :synopsis: TF-IDF model 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/similarities/docsim.rst: -------------------------------------------------------------------------------- 1 | :mod:`similarities.docsim` -- Document similarity queries 2 | ======================================================================== 3 | 4 | .. automodule:: gensim.similarities.docsim 5 | :synopsis: Document similarity queries 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.kw.txt: -------------------------------------------------------------------------------- 1 | gilbert 2 | hurricane 3 | winds 4 | coast 5 | storm 6 | saturday 7 | flood 8 | flooding 9 | weather 10 | alert 11 | defense alerted 12 | strong 13 | people 14 | pushed 15 | puerto 16 | cabral said 17 | north 18 | associated 19 | south 20 | domingo 21 | residents 22 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.mallet: -------------------------------------------------------------------------------- 1 | 1 en computer human interface 2 | 2 en computer response survey system time user 3 | 3 en interface system user eps 4 | 4 en human system system eps 5 | 5 en response time user 6 | 6 en trees 7 | 7 en trees graph 8 | 8 
en trees graph minors 9 | 9 en survey graph minors 10 | -------------------------------------------------------------------------------- /docs/src/models/lda_dispatcher.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lda_dispatcher` -- Dispatcher for distributed LDA 2 | ================================================================ 3 | 4 | .. automodule:: gensim.models.lda_dispatcher 5 | :synopsis: Dispatcher for distributed LDA 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lsi_dispatcher.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lsi_dispatcher` -- Dispatcher for distributed LSI 2 | =============================================================== 3 | 4 | .. automodule:: gensim.models.lsi_dispatcher 5 | :synopsis: Dispatcher for distributed LSI 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/lsimodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.lsimodel` -- Latent Semantic Indexing 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.lsimodel 5 | :synopsis: Latent Semantic Indexing 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/ldamodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.ldamodel` -- Latent Dirichlet Allocation 2 | ====================================================== 3 | 4 | .. 
automodule:: gensim.models.ldamodel 5 | :synopsis: Latent Dirichlet Allocation 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/word2vec.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.word2vec` -- Deep learning with word2vec 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.word2vec 5 | :synopsis: Deep learning with word2vec 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/doc2vec.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.doc2vec` -- Deep learning with paragraph2vec 2 | ========================================================= 3 | 4 | .. automodule:: gensim.models.doc2vec 5 | :synopsis: Deep learning with doc2vec 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/logentropy_model.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.logentropy_model` -- LogEntropy model 2 | ====================================================== 3 | 4 | .. automodule:: gensim.models.logentropy_model 5 | :synopsis: LogEntropy model 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/phrases.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.phrases` -- Phrase (collocation) detection 2 | ======================================================= 3 | 4 | .. 
automodule:: gensim.models.phrases 5 | :synopsis: Phrase (collocation) detection 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/hdpmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.hdpmodel` -- Hierarchical Dirichlet Process 2 | ======================================================== 3 | 4 | .. automodule:: gensim.models.hdpmodel 5 | :synopsis: Hierarchical Dirichlet Process 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/dictionary.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.dictionary` -- Construct word<->id mappings 2 | ========================================================== 3 | 4 | .. automodule:: gensim.corpora.dictionary 5 | :synopsis: Construct word<->id mappings 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/mmcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.mmcorpus` -- Corpus in Matrix Market format 2 | ========================================================== 3 | 4 | .. automodule:: gensim.corpora.mmcorpus 5 | :synopsis: Corpus in Matrix Market format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/wikicorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.wikicorpus` -- Corpus from a Wikipedia dump 2 | ========================================================== 3 | 4 | .. 
automodule:: gensim.corpora.wikicorpus 5 | :synopsis: Corpus from a Wikipedia dump 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/bleicorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.bleicorpus` -- Corpus in Blei's LDA-C format 2 | ========================================================== 3 | 4 | .. automodule:: gensim.corpora.bleicorpus 5 | :synopsis: Corpus in Blei's LDA-C format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/lowcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.lowcorpus` -- Corpus in List-of-Words format 2 | =========================================================== 3 | 4 | .. automodule:: gensim.corpora.lowcorpus 5 | :synopsis: Corpus in List-of-Words format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/svmlightcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.svmlightcorpus` -- Corpus in SVMlight format 2 | ================================================================== 3 | 4 | .. 
automodule:: gensim.corpora.svmlightcorpus 5 | :synopsis: Corpus in SVMlight format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/textcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.textcorpus` -- Building corpora with dictionaries 2 | ================================================================= 3 | 4 | .. automodule:: gensim.corpora.textcorpus 5 | :synopsis: Building corpora with dictionaries 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/models/wrappers/wrappers.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers` -- Package for transformation models via external programs 2 | ================================================================================= 3 | 4 | .. automodule:: gensim.models.wrappers 5 | :synopsis: Package for transformation models via external programs 6 | :members: 7 | :inherited-members: 8 | 9 | -------------------------------------------------------------------------------- /docs/src/models/ldamulticore.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.ldamulticore` -- parallelized Latent Dirichlet Allocation 2 | ====================================================================== 3 | 4 | .. 
automodule:: gensim.models.ldamulticore 5 | :synopsis: Latent Dirichlet Allocation 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheelhouse_uploader] 2 | artifact_indexes= 3 | # OSX wheels built by travis (only for specific tags): 4 | # https://github.com/MacPython/scikit-learn-wheels 5 | http://wheels.scipy.org 6 | # Windows wheels buit by: 7 | # https://ci.appveyor.com/project/piskvorky/gensim 8 | http://17a25141cb7f75c18ee4-676a79255544e7711e0dd8bccdcdd1cb.r23.cf2.rackcdn.com 9 | -------------------------------------------------------------------------------- /docs/src/corpora/hashdictionary.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.hashdictionary` -- Construct word<->id mappings 2 | ============================================================= 3 | 4 | .. 
automodule:: gensim.corpora.hashdictionary 5 | :synopsis: Construct word<->id mappings on the fly (the "hashing trick") 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /gensim/models/voidptr.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #if PY_VERSION_HEX >= 0x03020000 4 | 5 | /* 6 | ** compatibility with python >= 3.2, which doesn't have CObject anymore 7 | */ 8 | static void * PyCObject_AsVoidPtr(PyObject *obj) 9 | { 10 | void *ret = PyCapsule_GetPointer(obj, NULL); 11 | if (ret == NULL) { 12 | PyErr_Clear(); 13 | } 14 | return ret; 15 | } 16 | 17 | #endif -------------------------------------------------------------------------------- /docs/src/models/wrappers/ldamallet.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers.ldamallet` -- Latent Dirichlet Allocation via Mallet 2 | ========================================================================== 3 | 4 | .. 
automodule:: gensim.models.wrappers.ldamallet 5 | :synopsis: Latent Dirichlet Allocation via Mallet 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.uci: -------------------------------------------------------------------------------- 1 | 9 2 | 12 3 | 28 4 | 1 1 1 5 | 1 2 1 6 | 1 3 1 7 | 2 1 1 8 | 2 4 1 9 | 2 5 1 10 | 2 6 1 11 | 2 7 1 12 | 2 8 1 13 | 3 3 1 14 | 3 6 1 15 | 3 8 1 16 | 3 9 1 17 | 4 2 1 18 | 4 6 2 19 | 4 9 1 20 | 5 4 1 21 | 5 7 1 22 | 5 8 1 23 | 6 10 1 24 | 7 10 1 25 | 7 11 1 26 | 8 10 1 27 | 8 11 1 28 | 8 12 1 29 | 9 5 1 30 | 9 11 1 31 | 9 12 1 32 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.kwpos.txt: -------------------------------------------------------------------------------- 1 | gilbert 2 | hurricane 3 | coast 4 | storm 5 | saturday 6 | winds heavy 7 | flood 8 | flooding 9 | weather 10 | alert 11 | defense alerted 12 | strong 13 | pushed 14 | people 15 | puerto 16 | cabral said 17 | north 18 | associated 19 | south 20 | domingo 21 | residents 22 | dominican 23 | miles 24 | southeast 25 | san 26 | civil 27 | home 28 | reached 29 | juan 30 | named 31 | -------------------------------------------------------------------------------- /docs/src/models/wrappers/dtmmodel.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers.dtmmodel` -- Dynamic Topic Models (DTM) and Dynamic Influence Models (DIM) 2 | ================================================================================================ 3 | 4 | .. 
automodule:: gensim.models.wrappers.dtmmodel 5 | :synopsis: Dynamic Topic Models 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/src/corpora/ucicorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.ucicorpus` -- Corpus in UCI bag-of-words format 2 | ============================================================================================================== 3 | 4 | .. automodule:: gensim.corpora.ucicorpus 5 | :synopsis: Corpus in University of California, Irvine (UCI) bag-of-words format 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Please see [contribution-guide.org](http://www.contribution-guide.org/) for Due Diligence steps we expect from contributors before submitting an issue. 4 | 5 | For open-ended questions the best place is our active [mailing list](https://groups.google.com/forum/#!forum/gensim). 6 | 7 | For Code Style please see our [Developer Page](https://github.com/piskvorky/gensim/wiki/Developer-page#code-style). 8 | 9 | Thanks! 10 | 11 | -------------------------------------------------------------------------------- /docs/src/models/wrappers/ldavowpalwabbit.rst: -------------------------------------------------------------------------------- 1 | :mod:`models.wrappers.ldavowpalwabbit` -- Latent Dirichlet Allocation via Vowpal Wabbit 2 | ======================================================================================= 3 | 4 | .. 
automodule:: gensim.models.wrappers.ldavowpalwabbit 5 | :synopsis: Latent Dirichlet Allocation via Vowpal Wabbit 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include docs * 2 | recursive-include gensim/test/test_data * 3 | recursive-include . *.sh 4 | prune docs/src* 5 | include README.rst 6 | include CHANGELOG.txt 7 | include COPYING 8 | include COPYING.LESSER 9 | include ez_setup.py 10 | include gensim/models/voidptr.h 11 | include gensim/models/word2vec_inner.c 12 | include gensim/models/word2vec_inner.pyx 13 | include gensim/models/doc2vec_inner.c 14 | include gensim/models/doc2vec_inner.pyx 15 | -------------------------------------------------------------------------------- /gensim/test/test_data/testcorpus.mm: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | 9 12 28 3 | 1 1 1.0 4 | 1 2 1.0 5 | 1 3 1.0 6 | 2 1 1.0 7 | 2 4 1.0 8 | 2 5 1.0 9 | 2 6 1.0 10 | 2 7 1.0 11 | 2 8 1.0 12 | 3 3 1.0 13 | 3 6 1.0 14 | 3 8 1.0 15 | 3 9 1.0 16 | 4 2 1.0 17 | 4 6 2.0 18 | 4 9 1.0 19 | 5 4 1.0 20 | 5 7 1.0 21 | 5 8 1.0 22 | 6 10 1.0 23 | 7 10 1.0 24 | 7 11 1.0 25 | 8 10 1.0 26 | 8 11 1.0 27 | 8 12 1.0 28 | 9 5 1.0 29 | 9 11 1.0 30 | 9 12 1.0 31 | -------------------------------------------------------------------------------- /docs/src/corpora/indexedcorpus.rst: -------------------------------------------------------------------------------- 1 | :mod:`corpora.indexedcorpus` -- Random access to corpus documents 2 | ================================================================= 3 | 4 | .. 
automodule:: gensim.corpora.indexedcorpus 5 | :synopsis: Random access to corpus documents 6 | :members: 7 | :inherited-members: 8 | :undoc-members: 9 | :show-inheritance: 10 | 11 | 12 | .. autoclass:: IndexedCorpus 13 | :members: 14 | :inherited-members: 15 | :undoc-members: 16 | :show-inheritance: -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "2.6" 5 | - "2.7" 6 | - "3.3" 7 | - "3.4" 8 | - "3.5" 9 | before_install: 10 | - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh 11 | - chmod +x miniconda.sh 12 | - ./miniconda.sh -b 13 | - export PATH=/home/travis/miniconda2/bin:$PATH 14 | - conda update --yes conda 15 | install: 16 | - conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy 17 | - source activate gensim-test 18 | - python setup.py install 19 | script: python setup.py test 20 | -------------------------------------------------------------------------------- /gensim/summarization/commons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | from gensim.summarization.graph import Graph 7 | 8 | 9 | def build_graph(sequence): 10 | graph = Graph() 11 | for item in sequence: 12 | if not graph.has_node(item): 13 | graph.add_node(item) 14 | return graph 15 | 16 | 17 | def remove_unreachable_nodes(graph): 18 | for node in graph.nodes(): 19 | if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: 20 | graph.del_node(node) 21 | -------------------------------------------------------------------------------- /gensim/summarization/syntactic_unit.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | 7 | class SyntacticUnit(object): 8 | 9 | def __init__(self, text, token=None, tag=None): 10 | self.text = text 11 | self.token = token 12 | self.tag = tag[:2] if tag else None # Just first two letters of tag 13 | self.index = -1 14 | self.score = -1 15 | 16 | def __str__(self): 17 | return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" 18 | 19 | def __repr__(self): 20 | return str(self) 21 | -------------------------------------------------------------------------------- /gensim/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains implementations of various streaming corpus I/O format. 3 | """ 4 | 5 | # bring corpus classes directly into package namespace, to save some typing 6 | from .indexedcorpus import IndexedCorpus # must appear before the other classes 7 | 8 | from .mmcorpus import MmCorpus 9 | from .bleicorpus import BleiCorpus 10 | from .svmlightcorpus import SvmLightCorpus 11 | from .lowcorpus import LowCorpus 12 | from .dictionary import Dictionary 13 | from .hashdictionary import HashDictionary 14 | from .wikicorpus import WikiCorpus 15 | from .textcorpus import TextCorpus 16 | from .ucicorpus import UciCorpus 17 | from .malletcorpus import MalletCorpus 18 | from .sharded_corpus import ShardedCorpus 19 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.summ.txt: -------------------------------------------------------------------------------- 1 | Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas. 
2 | The National Hurricane Center in Miami reported its position at 2 a.m. Sunday at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, Puerto Rico, and 200 miles southeast of Santo Domingo. 3 | The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving westward at 15 mph with a ``broad area of cloudiness and heavy weather'' rotating around the center of the storm. 4 | Strong winds associated with the Gilbert brought coastal flooding, strong southeast winds and up to 12 feet feet to Puerto Rico's south coast. -------------------------------------------------------------------------------- /gensim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains interfaces and functionality to compute pair-wise document 3 | similarities within a corpus of documents. 4 | """ 5 | 6 | from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization 7 | import logging 8 | 9 | try: 10 | __version__ = __import__('pkg_resources').get_distribution('gensim').version 11 | except: 12 | __version__ = '?' 13 | 14 | 15 | class NullHandler(logging.Handler): 16 | """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" 17 | def emit(self, record): 18 | pass 19 | 20 | logger = logging.getLogger('gensim') 21 | if len(logger.handlers) == 0: # To ensure reload() doesn't add another one 22 | logger.addHandler(NullHandler()) 23 | -------------------------------------------------------------------------------- /continuous_integration/appveyor/requirements.txt: -------------------------------------------------------------------------------- 1 | # Fetch numpy and scipy wheels from the sklearn rackspace wheelhouse. 2 | # Those wheels were collected from http://www.lfd.uci.edu/~gohlke/pythonlibs/ 3 | # This is a temporary solution. As soon as numpy and scipy provide official 4 | # wheel for windows we ca delete this --find-links line. 
5 | --find-links http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/ 6 | 7 | # fix the versions of numpy to force the use of numpy and scipy to use the whl 8 | # of the rackspace folder instead of trying to install from more recent 9 | # source tarball published on PyPI 10 | numpy==1.9.3 11 | scipy==0.16.0 12 | cython 13 | six >= 1.5.0 14 | smart_open >= 1.2.1 15 | nose 16 | wheel 17 | wheelhouse_uploader 18 | 19 | -------------------------------------------------------------------------------- /gensim/test/test_data/ldavowpalwabbit.dict.txt: -------------------------------------------------------------------------------- 1 | 28 alex 1 2 | 23 alice 1 3 | 47 bacon 1 4 | 46 beans 1 5 | 25 bob 1 6 | 10 brakes 1 7 | 36 c 1 8 | 12 car 1 9 | 6 cat 1 10 | 40 cereal 1 11 | 0 cheetah 1 12 | 11 clutch 1 13 | 49 coffee 1 14 | 38 cplusplus 1 15 | 37 csharp 1 16 | 18 cylinder 1 17 | 27 dave 1 18 | 48 eggs 1 19 | 19 engine 1 20 | 30 erlang 1 21 | 17 exhaust 1 22 | 34 go 1 23 | 42 ham 1 24 | 24 harry 1 25 | 35 haskell 1 26 | 1 jaguar 1 27 | 39 java 1 28 | 21 jim 1 29 | 41 juice 1 30 | 2 kitten 1 31 | 4 leopard 1 32 | 9 lion 1 33 | 7 lynx 1 34 | 14 motor 1 35 | 3 mouse 1 36 | 44 mushrooms 1 37 | 5 puppy 1 38 | 32 python 1 39 | 26 rachel 1 40 | 22 robert 1 41 | 31 ruby 1 42 | 43 sausages 1 43 | 33 scala 1 44 | 20 sue 1 45 | 16 suspension 1 46 | 45 tea 1 47 | 8 tiger 1 48 | 29 tim 1 49 | 13 tyre 1 50 | 15 wheel 1 51 | -------------------------------------------------------------------------------- /gensim/test/test_data/testsummarization_unrelated.txt: -------------------------------------------------------------------------------- 1 | River lake island mountain area. 2 | Relay athletics metres freestyle hurdles. 3 | Were court law government police. 4 | Courcelles centimeters mattythewhite wine stamps. 5 | Sysop iran pakistan ali arab. 6 | Copyrighted northamerica rihanna cloudz knowles. 7 | Israel sockpuppet jerusalem palestinian ifk. 
8 | Melbourne rovers australian wanderers dinamo. 9 | Film series episode television. 10 | Wrestling chateau ligue discus estonian. 11 | Edits notability archived clearer speedy. 12 | Admins acid molniya chemical compound. 13 | India tamil singh temple kumar. 14 | Bwebs malta hobart basa columella huon. 15 | Rabbi bgwhite lebanese beirut caligari. 16 | German berlin kategorie cross. 17 | System power energy data. 18 | Indonesia malaysia singapore greek jakarta. 19 | Stakes webs futsal whitish thoroughbred racehorse. 20 | Oblast uploaded nordland selsoviet halogaland. -------------------------------------------------------------------------------- /docs/src/apiref.rst: -------------------------------------------------------------------------------- 1 | .. _apiref: 2 | 3 | API Reference 4 | ============= 5 | 6 | Modules: 7 | 8 | .. toctree:: 9 | :maxdepth: 0 10 | 11 | interfaces 12 | utils 13 | matutils 14 | corpora/bleicorpus 15 | corpora/dictionary 16 | corpora/hashdictionary 17 | corpora/lowcorpus 18 | corpora/mmcorpus 19 | corpora/svmlightcorpus 20 | corpora/wikicorpus 21 | corpora/textcorpus 22 | corpora/ucicorpus 23 | corpora/indexedcorpus 24 | models/ldamodel 25 | models/ldamulticore 26 | models/lsimodel 27 | models/tfidfmodel 28 | models/rpmodel 29 | models/hdpmodel 30 | models/logentropy_model 31 | models/lsi_dispatcher 32 | models/lsi_worker 33 | models/lda_dispatcher 34 | models/lda_worker 35 | models/word2vec 36 | models/doc2vec 37 | models/phrases 38 | models/wrappers/ldamallet 39 | models/wrappers/dtmmodel 40 | models/wrappers/ldavowpalwabbit.rst 41 | similarities/docsim 42 | similarities/simserver 43 | 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | *.pyc 10 | 11 | # Packages # 12 | ############ 13 | # it's better 
to unpack these files and commit the raw source 14 | # git has its own built in compression methods 15 | *.7z 16 | *.dmg 17 | *.gz 18 | *.iso 19 | *.jar 20 | *.rar 21 | *.tar 22 | *.zip 23 | 24 | # Logs and databases # 25 | ###################### 26 | *.log 27 | *.sql 28 | *.sqlite 29 | *.pkl 30 | *.bak 31 | *.npy 32 | *.npz 33 | 34 | # OS generated files # 35 | ###################### 36 | .DS_Store? 37 | ehthumbs.db 38 | Icon? 39 | Thumbs.db 40 | 41 | # Other # 42 | ######### 43 | .project 44 | .pydevproject 45 | .ropeproject 46 | .settings/ 47 | .eggs 48 | cython_debug 49 | docs/src/_build/ 50 | docs/_static 51 | dedan_gensim.tmproj 52 | gensim*.egg-info 53 | *,cover 54 | .idea 55 | *.dict 56 | *.index 57 | .coverage 58 | .*.sw[op] 59 | data 60 | *.bak 61 | /build/ 62 | /dist/ 63 | *.prof 64 | *.lprof 65 | *.bin 66 | *.old 67 | *.model 68 | *~ 69 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/runall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # full path to gensim executables 4 | BIN_PATH=~/xrehurek/gensim/dmlcz 5 | 6 | # intermediate data will be stored to this dir 7 | RESULT_PATH=~/xrehurek/results 8 | 9 | # set python path, so that python can find and import gensim modules 10 | export PYTHONPATH=~/xrehurek:$PYTHONPATH 11 | 12 | # Language is set to 'any', meaning all articles are processed for similarity in 13 | # one go, regardless of their language. 14 | # Set language to 'eng', 'fre', 'rus' etc. to only process a specific subset of 15 | # articles (an article's language is determined from its metadata). 
16 | language=any 17 | 18 | 19 | # ========== parse all article sources, build article co-occurence matrix ====== 20 | ${BIN_PATH}/gensim_build.py $language 2>&1 | tee ${RESULT_PATH}/gensim_build.log 21 | 22 | 23 | # ========== build transformation models ======================================= 24 | for method in tfidf rp; 25 | do 26 | ( ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) & 27 | done 28 | wait 29 | 30 | method=lsi 31 | ${BIN_PATH}/gensim_genmodel.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log 32 | 33 | 34 | # =========== generate output xml files ======================================== 35 | # generate xml files for all methods at once, in parallel, to save time. 36 | # NOTE if out of memory, move tfidf out of the loop (tfidf uses a lot of memory here) 37 | for method in tfidf lsi rp; 38 | do 39 | ( ${BIN_PATH}/gensim_xml.py $language $method 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) & 40 | done 41 | wait 42 | -------------------------------------------------------------------------------- /gensim/nosy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A simple testrunner for nose (or anything else). 5 | 6 | Watch for changes in all file types specified in 'EXTENSIONS'. 7 | If changes, run test executable in 'EXECUTABLE', with default 8 | arguments 'DEFAULTARGS'. 9 | 10 | The --with-color option needs the "rudolf" nose plugin. 
See: 11 | http://pypi.python.org/pypi/rudolf/ 12 | 13 | Originally by Jeff Winkler, http://jeffwinkler.net 14 | Forked from wkral http://github.com/wkral/Nosy 15 | """ 16 | 17 | import os 18 | import stat 19 | import time 20 | import datetime 21 | import sys 22 | import fnmatch 23 | 24 | 25 | EXTENSIONS = ['*.py'] 26 | EXECUTABLE = 'nosetests test/' 27 | DEFAULTARGS = '--with-color -exe'# -w tests' 28 | 29 | 30 | def checkSum(): 31 | """ 32 | Return a long which can be used to know if any .py files have changed. 33 | """ 34 | val = 0 35 | for root, dirs, files in os.walk(os.getcwd()): 36 | for extension in EXTENSIONS: 37 | for f in fnmatch.filter(files, extension): 38 | stats = os.stat(os.path.join(root, f)) 39 | val += stats[stat.ST_SIZE] + stats[stat.ST_MTIME] 40 | return val 41 | 42 | if __name__ == '__main__': 43 | val = 0 44 | try: 45 | while True: 46 | if checkSum() != val: 47 | val = checkSum() 48 | os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, 49 | ' '.join(sys.argv[1:]))) 50 | print(datetime.datetime.now().__str__()) 51 | print('=' * 77) 52 | time.sleep(1) 53 | except KeyboardInterrupt: 54 | print('Goodbye') 55 | -------------------------------------------------------------------------------- /gensim/corpora/mmcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | Corpus in the Matrix Market format. 10 | """ 11 | 12 | 13 | import logging 14 | 15 | from gensim import interfaces, matutils 16 | from gensim.corpora import IndexedCorpus 17 | 18 | 19 | logger = logging.getLogger('gensim.corpora.mmcorpus') 20 | 21 | 22 | class MmCorpus(matutils.MmReader, IndexedCorpus): 23 | """ 24 | Corpus in the Matrix Market format. 
25 | """ 26 | def __init__(self, fname): 27 | # avoid calling super(), too confusing 28 | IndexedCorpus.__init__(self, fname) 29 | matutils.MmReader.__init__(self, fname) 30 | 31 | def __iter__(self): 32 | """ 33 | Interpret a matrix in Matrix Market format as a streamed gensim corpus 34 | (yielding one document at a time). 35 | """ 36 | for doc_id, doc in super(MmCorpus, self).__iter__(): 37 | yield doc # get rid of doc id, return the sparse vector only 38 | 39 | @staticmethod 40 | def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): 41 | """ 42 | Save a corpus in the Matrix Market format to disk. 43 | 44 | This function is automatically called by `MmCorpus.serialize`; don't 45 | call it directly, call `serialize` instead. 46 | """ 47 | logger.info("storing corpus in Matrix Market format to %s" % fname) 48 | num_terms = len(id2word) if id2word is not None else None 49 | return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata) 50 | 51 | # endclass MmCorpus 52 | -------------------------------------------------------------------------------- /docs/src/support.rst: -------------------------------------------------------------------------------- 1 | .. _support: 2 | 3 | ============= 4 | Support 5 | ============= 6 | 7 | Open source support 8 | -------------------- 9 | 10 | The main communication channel is the `gensim mailing list `_. 11 | This is the preferred way to **ask for help**, **report problems** and **share insights** with the community. Newbie questions are perfectly fine, just make sure you've read the :doc:`tutorials `. 12 | 13 | I discourage sending private emails, because the mailing list serves as a knowledge base for all gensim users, cutting maintenance efforts needed for support. If you feel your problem is too special, data too sensitive, technical scope too demanding, **see the "business" section below**. 
14 | 15 | When posting on the mailing list, try to include all relevant information, such as what it is you are trying to achieve, what went wrong, relevant gensim logs, package versions etc. 16 | 17 | **FAQ** and some useful **snippets of code** are maintained on GitHub: https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ. 18 | 19 | You can also try asking on StackOverflow, using the `gensim tag `_. 20 | 21 | 22 | Business support 23 | ------------------ 24 | 25 | I also run a consulting business for data mining and information retrieval solutions, `rare-technologies.com `_. 26 | 27 | In case you need commercial support, design validation, technical training or custom system development, `get in touch `_ for a quote. 28 | 29 | Developer support 30 | ------------------ 31 | 32 | Developers who `tweak gensim internals `_ are encouraged to report issues at the `GitHub issue tracker `_. 33 | Note that this is not a medium for discussions or asking open-ended questions; please use the mailing list for that. 34 | -------------------------------------------------------------------------------- /docs/src/gensim_theme/search.html: -------------------------------------------------------------------------------- 1 | {# 2 | basic/search.html 3 | ~~~~~~~~~~~~~~~~~ 4 | 5 | Template for the search page. 6 | 7 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | :license: BSD, see LICENSE for details. 9 | #} 10 | {% extends "layout.html" %} 11 | {% set title = _('Search') %} 12 | {% set script_files = script_files + ['_static/searchtools.js'] %} 13 | {% block extrahead %} 14 | 17 | {{ super() }} 18 | {% endblock %} 19 | {% block body %} 20 |

{{ _('Search') }}

21 |
22 | 23 |

24 | {% trans %}Please activate JavaScript to enable the search 25 | functionality.{% endtrans %} 26 |

27 |
28 |

29 | {% trans %}From here you can search these documents. Enter your search 30 | words into the box below and click "search". Note that the search 31 | function will automatically search for all of the words. Pages 32 | containing fewer words won't appear in the result list.{% endtrans %} 33 |

34 |
35 | 36 | 37 | 38 |
39 | {% if search_performed %} 40 |

{{ _('Search Results') }}

41 | {% if not search_results %} 42 |

{{ _('Your search did not match any results.') }}

43 | {% endif %} 44 | {% endif %} 45 |
46 | {% if search_results %} 47 |
    48 | {% for href, caption, context in search_results %} 49 |
  • {{ caption }} 50 |
    {{ context|e }}
    51 |
  • 52 | {% endfor %} 53 |
54 | {% endif %} 55 |
56 | {% endblock %} 57 | -------------------------------------------------------------------------------- /gensim/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package contains algorithms for extracting document representations from their raw 3 | bag-of-word counts. 4 | """ 5 | 6 | # bring model classes directly into package namespace, to save some typing 7 | from .hdpmodel import HdpModel 8 | from .ldamodel import LdaModel 9 | from .lsimodel import LsiModel 10 | from .tfidfmodel import TfidfModel 11 | from .rpmodel import RpModel 12 | from .logentropy_model import LogEntropyModel 13 | from .word2vec import Word2Vec 14 | from .doc2vec import Doc2Vec 15 | from .ldamulticore import LdaMulticore 16 | from .phrases import Phrases 17 | 18 | from . import wrappers 19 | 20 | from gensim import interfaces, utils 21 | 22 | 23 | class VocabTransform(interfaces.TransformationABC): 24 | """ 25 | Remap feature ids to new values. 26 | 27 | Given a mapping between old ids and new ids (some old ids may be missing = these 28 | features are to be discarded), this will wrap a corpus so that iterating over 29 | `VocabTransform[corpus]` returns the same vectors but with the new ids. 30 | 31 | Old features that have no counterpart in the new ids are discarded. This 32 | can be used to filter vocabulary of a corpus "online":: 33 | 34 | >>> old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep)) 35 | >>> vt = VocabTransform(old2new) 36 | >>> for vec_with_new_ids in vt[corpus_with_old_ids]: 37 | >>> ... 38 | 39 | """ 40 | def __init__(self, old2new, id2token=None): 41 | # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems()) 42 | self.old2new = old2new 43 | self.id2token = id2token 44 | 45 | 46 | def __getitem__(self, bow): 47 | """ 48 | Return representation with the ids transformed. 
49 | """ 50 | # if the input vector is in fact a corpus, return a transformed corpus as a result 51 | is_corpus, bow = utils.is_corpus(bow) 52 | if is_corpus: 53 | return self._apply(bow) 54 | 55 | return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new) 56 | #endclass VocabTransform 57 | -------------------------------------------------------------------------------- /gensim/summarization/pagerank_weighted.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | from numpy import empty as empty_matrix 6 | from scipy.sparse import csr_matrix 7 | from scipy.sparse.linalg import eigs 8 | from six.moves import xrange 9 | 10 | try: 11 | from numpy import VisibleDeprecationWarning 12 | import warnings 13 | warnings.filterwarnings("ignore", category=VisibleDeprecationWarning) 14 | except ImportError: 15 | pass 16 | 17 | 18 | def pagerank_weighted(graph, damping=0.85): 19 | adjacency_matrix = build_adjacency_matrix(graph) 20 | probability_matrix = build_probability_matrix(graph) 21 | 22 | pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix 23 | 24 | vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors? 
25 | 26 | return process_results(graph, vecs.real) 27 | 28 | 29 | def build_adjacency_matrix(graph): 30 | row = [] 31 | col = [] 32 | data = [] 33 | nodes = graph.nodes() 34 | length = len(nodes) 35 | 36 | for i in xrange(length): 37 | current_node = nodes[i] 38 | neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node)) 39 | for j in xrange(length): 40 | edge_weight = float(graph.edge_weight((current_node, nodes[j]))) 41 | if i != j and edge_weight != 0.0: 42 | row.append(i) 43 | col.append(j) 44 | data.append(edge_weight / neighbors_sum) 45 | 46 | return csr_matrix((data, (row, col)), shape=(length, length)) 47 | 48 | 49 | def build_probability_matrix(graph): 50 | dimension = len(graph.nodes()) 51 | matrix = empty_matrix((dimension, dimension)) 52 | 53 | probability = 1.0 / float(dimension) 54 | matrix.fill(probability) 55 | 56 | return matrix 57 | 58 | 59 | def process_results(graph, vecs): 60 | scores = {} 61 | for i, node in enumerate(graph.nodes()): 62 | scores[node] = abs(vecs[i, :]) 63 | 64 | return scores 65 | -------------------------------------------------------------------------------- /gensim/corpora/csvcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2013 Zygmunt Zając 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Corpus in CSV format. 9 | 10 | """ 11 | 12 | 13 | from __future__ import with_statement 14 | 15 | import logging 16 | import csv 17 | import itertools 18 | 19 | from gensim import interfaces, utils 20 | 21 | logger = logging.getLogger('gensim.corpora.csvcorpus') 22 | 23 | 24 | class CsvCorpus(interfaces.CorpusABC): 25 | """ 26 | Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically 27 | based on the file content. 28 | 29 | All row values are expected to be ints/floats. 
30 | 31 | """ 32 | 33 | def __init__(self, fname, labels): 34 | """ 35 | Initialize the corpus from a file. 36 | `labels` = are class labels present in the input file? => skip the first column 37 | 38 | """ 39 | logger.info("loading corpus from %s" % fname) 40 | self.fname = fname 41 | self.length = None 42 | self.labels = labels 43 | 44 | # load the first few lines, to guess the CSV dialect 45 | head = ''.join(itertools.islice(utils.smart_open(self.fname), 5)) 46 | self.headers = csv.Sniffer().has_header(head) 47 | self.dialect = csv.Sniffer().sniff(head) 48 | logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers)) 49 | 50 | def __iter__(self): 51 | """ 52 | Iterate over the corpus, returning one sparse vector at a time. 53 | 54 | """ 55 | reader = csv.reader(utils.smart_open(self.fname), self.dialect) 56 | if self.headers: 57 | next(reader) # skip the headers 58 | 59 | line_no = -1 60 | for line_no, line in enumerate(reader): 61 | if self.labels: 62 | line.pop(0) # ignore the first column = class label 63 | yield list(enumerate(map(float, line))) 64 | 65 | self.length = line_no + 1 # store the total number of CSV rows = documents 66 | 67 | # endclass CsvCorpus 68 | -------------------------------------------------------------------------------- /docs/src/gensim_theme/domainindex.html: -------------------------------------------------------------------------------- 1 | {# 2 | basic/domainindex.html 3 | ~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | Template for domain indices (module index, ...). 6 | 7 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | :license: BSD, see LICENSE for details. 9 | #} 10 | {% extends "layout.html" %} 11 | {% set title = indextitle %} 12 | {% block extrahead %} 13 | {{ super() }} 14 | {% if not embedded and collapse_index %} 15 | 18 | {% endif %} 19 | {% endblock %} 20 | {% block body %} 21 | 22 | {%- set groupid = idgen() %} 23 | 24 |

{{ indextitle }}

25 | 26 |
27 | {%- for (letter, entries) in content %} 28 | {{ letter }} 29 | {%- if not loop.last %} | {% endif %} 30 | {%- endfor %} 31 |
32 | 33 | 34 | {%- for letter, entries in content %} 35 | 36 | 38 | {%- for (name, grouptype, page, anchor, extra, qualifier, description) 39 | in entries %} 40 | 41 | 45 | 52 | {%- endfor %} 53 | {%- endfor %} 54 |
 
37 | {{ letter }}
{% if grouptype == 1 -%} 42 | 44 | {%- endif %}{% if grouptype == 2 %}   {% endif %} 46 | {% if page %}{% endif -%} 47 | {{ name|e }} 48 | {%- if page %}{% endif %} 49 | {%- if extra %} ({{ extra|e }}){% endif -%} 50 | {% if qualifier %}{{ qualifier|e }}:{% endif %} 51 | {{ description|e }}
55 | 56 | {% endblock %} 57 | -------------------------------------------------------------------------------- /gensim/test/test_data/mihalcea_tarau.txt: -------------------------------------------------------------------------------- 1 | AP880911-0016 2 | AP-NR-09-11-88 0423EDT r i 3 | BC-HurricaneGilbert 09-11 0339 4 | BC-Hurricane Gilbert,0348 5 | Hurricane Gilbert Heads Toward Dominican Coast 6 | By RUDDY GONZALEZ 7 | Associated Press Writer 8 | SANTO DOMINGO, Dominican Republic (AP) 9 | Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas. 10 | The storm was approaching from the southeast with sustained winds of 75 mph gusting to 92 mph. 11 | ``There is no need for alarm,'' Civil Defense Director Eugenio Cabral said in a television alert shortly before midnight Saturday. 12 | Cabral said residents of the province of Barahona should closely follow Gilbert's movement. 13 | An estimated 100,000 people live in the province, including 70,000 in the city of Barahona, about 125 miles west of Santo Domingo. 14 | Tropical Storm Gilbert formed in the eastern Caribbean and strengthened into a hurricane Saturday night. 15 | The National Hurricane Center in Miami reported its position at 2 a.m. Sunday at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, Puerto Rico, and 200 miles southeast of Santo Domingo. 16 | The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving westward at 15 mph with a ``broad area of cloudiness and heavy weather'' rotating around the center of the storm. 17 | The weather service issued a flash flood watch for Puerto Rico and the Virgin Islands until at least 6 p.m. Sunday. 18 | Strong winds associated with the Gilbert brought coastal flooding, strong southeast winds and up to 12 feet feet to Puerto Rico's south coast. 19 | There were no reports of casualties. 
20 | San Juan, on the north coast, had heavy rains and gusts Saturday, but they subsided during the night. 21 | On Saturday, Hurricane Florence was downgraded to a tropical storm and its remnants pushed inland from the U.S. Gulf Coast. 22 | Residents returned home, happy to find little damage from 80 mph winds and sheets of rain. 23 | Florence, the sixth named storm of the 1988 Atlantic storm season, was the second hurricane. 24 | The first, Debby, reached minimal hurricane strength briefly before hitting the Mexican coast last month. -------------------------------------------------------------------------------- /gensim/test/test_hdpmodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Automated tests for checking transformation algorithms (the models package). 
9 | """ 10 | 11 | 12 | import logging 13 | import unittest 14 | import os 15 | import os.path 16 | import tempfile 17 | 18 | import six 19 | import numpy 20 | import scipy.linalg 21 | 22 | from gensim.corpora import mmcorpus, Dictionary 23 | from gensim.models import hdpmodel 24 | from gensim import matutils 25 | 26 | 27 | module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder 28 | datapath = lambda fname: os.path.join(module_path, 'test_data', fname) 29 | 30 | 31 | # set up vars used in testing ("Deerwester" from the web tutorial) 32 | texts = [['human', 'interface', 'computer'], 33 | ['survey', 'user', 'computer', 'system', 'response', 'time'], 34 | ['eps', 'user', 'interface', 'system'], 35 | ['system', 'human', 'system', 'eps'], 36 | ['user', 'response', 'time'], 37 | ['trees'], 38 | ['graph', 'trees'], 39 | ['graph', 'minors', 'trees'], 40 | ['graph', 'minors', 'survey']] 41 | dictionary = Dictionary(texts) 42 | corpus = [dictionary.doc2bow(text) for text in texts] 43 | 44 | 45 | def testfile(): 46 | # temporary data will be stored to this file 47 | return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') 48 | 49 | 50 | 51 | class TestHdpModel(unittest.TestCase): 52 | def setUp(self): 53 | self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) 54 | self.class_ = hdpmodel.HdpModel 55 | self.model = self.class_(corpus, id2word=dictionary) 56 | 57 | def testShowTopics(self): 58 | topics = self.model.show_topics(formatted=False) 59 | 60 | for topic_no, topic in topics: 61 | self.assertTrue(isinstance(topic_no, int)) 62 | self.assertTrue(isinstance(topic, list)) 63 | for k, v in topic: 64 | self.assertTrue(isinstance(k, six.string_types)) 65 | self.assertTrue(isinstance(v, float)) 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 71 | unittest.main() 72 | 
-------------------------------------------------------------------------------- /docs/src/gensim_theme/genindex.html: -------------------------------------------------------------------------------- 1 | {# 2 | basic/genindex.html 3 | ~~~~~~~~~~~~~~~~~~~ 4 | 5 | Template for an "all-in-one" index. 6 | 7 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | :license: BSD, see LICENSE for details. 9 | #} 10 | {% macro indexentries(firstname, links) %} 11 |
12 | {%- if links -%} 13 | 14 | {%- if links[0][0] %}{% endif -%} 15 | {{ firstname|e }} 16 | {%- if links[0][0] %}{% endif -%} 17 | 18 | 19 | {%- for ismain, link in links[1:] -%} 20 | , {% if ismain %}{% endif -%} 21 | [{{ loop.index }}] 22 | {%- if ismain %}{% endif -%} 23 | 24 | {%- endfor %} 25 | {%- else %} 26 | {{ firstname|e }} 27 | {%- endif %} 28 |
29 | {% endmacro %} 30 | 31 | {% extends "layout.html" %} 32 | {% set title = _('Index') %} 33 | {% block body %} 34 | 35 |

{{ _('Index') }}

36 | 37 |
38 | {% for key, dummy in genindexentries -%} 39 | {{ key }} 40 | {% if not loop.last %}| {% endif %} 41 | {%- endfor %} 42 |
43 | 44 | {%- for key, entries in genindexentries %} 45 |

{{ key }}

46 | 47 | {%- for column in entries|slice(2) if column %} 48 | 60 | {%- endfor %} 61 |
49 | {%- for entryname, (links, subitems) in column %} 50 | {{ indexentries(entryname, links) }} 51 | {%- if subitems %} 52 |
53 | {%- for subentryname, subentrylinks in subitems %} 54 | {{ indexentries(subentryname, subentrylinks) }} 55 | {%- endfor %} 56 |
57 | {%- endif -%} 58 | {%- endfor %} 59 |
62 | {% endfor %} 63 | 64 | {% endblock %} 65 | 66 | {% block sidebarrel %} 67 | {% if split_index %} 68 |

{{ _('Index') }}

69 |

{% for key, dummy in genindexentries -%} 70 | {{ key }} 71 | {% if not loop.last %}| {% endif %} 72 | {%- endfor %}

73 | 74 |

{{ _('Full index on one page') }}

75 | {% endif %} 76 | {{ super() }} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /gensim/summarization/bm25.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | import math 7 | from six import iteritems 8 | from six.moves import xrange 9 | 10 | 11 | # BM25 parameters. 12 | PARAM_K1 = 1.5 13 | PARAM_B = 0.75 14 | EPSILON = 0.25 15 | 16 | 17 | class BM25(object): 18 | 19 | def __init__(self, corpus): 20 | self.corpus_size = len(corpus) 21 | self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size 22 | self.corpus = corpus 23 | self.f = [] 24 | self.df = {} 25 | self.idf = {} 26 | self.initialize() 27 | 28 | def initialize(self): 29 | for document in self.corpus: 30 | frequencies = {} 31 | for word in document: 32 | if word not in frequencies: 33 | frequencies[word] = 0 34 | frequencies[word] += 1 35 | self.f.append(frequencies) 36 | 37 | for word, freq in iteritems(frequencies): 38 | if word not in self.df: 39 | self.df[word] = 0 40 | self.df[word] += 1 41 | 42 | for word, freq in iteritems(self.df): 43 | self.idf[word] = math.log(self.corpus_size-freq+0.5) - math.log(freq+0.5) 44 | 45 | def get_score(self, document, index, average_idf): 46 | score = 0 47 | for word in document: 48 | if word not in self.f[index]: 49 | continue 50 | idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf 51 | score += (idf*self.f[index][word]*(PARAM_K1+1) 52 | / (self.f[index][word] + PARAM_K1*(1 - PARAM_B+PARAM_B*self.corpus_size / self.avgdl))) 53 | return score 54 | 55 | def get_scores(self, document, average_idf): 56 | scores = [] 57 | for index in xrange(self.corpus_size): 58 | score = self.get_score(document, index, average_idf) 59 | scores.append(score) 60 | return scores 61 | 62 | 63 | def 
get_bm25_weights(corpus): 64 | bm25 = BM25(corpus) 65 | average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) 66 | 67 | weights = [] 68 | for doc in corpus: 69 | scores = bm25.get_scores(doc, average_idf) 70 | weights.append(scores) 71 | 72 | return weights 73 | -------------------------------------------------------------------------------- /gensim/models/word2vec_inner.pxd: -------------------------------------------------------------------------------- 1 | # 2 | # shared type definitions for word2vec_inner 3 | # used by both word2vec_inner.pyx (automatically) and doc2vec_inner.pyx (by explicit cimport) 4 | # 5 | # Copyright (C) 2013 Radim Rehurek 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.htmlcimport numpy as np 7 | 8 | cdef extern from "voidptr.h": 9 | void* PyCObject_AsVoidPtr(object obj) 10 | 11 | cimport numpy as np 12 | ctypedef np.float32_t REAL_t 13 | 14 | # BLAS routine signatures 15 | ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil 16 | ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 17 | ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 18 | ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 19 | ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil 20 | ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil 21 | 22 | cdef scopy_ptr scopy 23 | cdef saxpy_ptr saxpy 24 | cdef sdot_ptr sdot 25 | cdef dsdot_ptr dsdot 26 | cdef snrm2_ptr snrm2 27 | cdef sscal_ptr sscal 28 | 29 | # precalculated sigmoid table 30 | DEF EXP_TABLE_SIZE = 1000 31 | DEF MAX_EXP = 6 32 | cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE 33 | 34 | # function implementations swapped based 
on BLAS detected in word2vec_inner.pyx init() 35 | ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 36 | ctypedef void (*our_saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 37 | 38 | cdef our_dot_ptr our_dot 39 | cdef our_saxpy_ptr our_saxpy 40 | 41 | # for when fblas.sdot returns a double 42 | cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 43 | 44 | # for when fblas.sdot returns a float 45 | cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 46 | 47 | # for when no blas available 48 | cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 49 | cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 50 | 51 | # to support random draws from negative-sampling cum_table 52 | cdef unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil 53 | 54 | cdef unsigned long long random_int32(unsigned long long *next_random) nogil 55 | -------------------------------------------------------------------------------- /gensim/test/test_big.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2014 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Automated tests for checking processing/storing large inputs. 
9 | """ 10 | 11 | 12 | import logging 13 | import unittest 14 | import os 15 | import itertools 16 | import tempfile 17 | 18 | import numpy 19 | 20 | import gensim 21 | 22 | 23 | def testfile(): 24 | # temporary data will be stored to this file 25 | return os.path.join(tempfile.gettempdir(), 'gensim_big.tst') 26 | 27 | 28 | class BigCorpus(object): 29 | """A corpus of a large number of docs & large vocab""" 30 | def __init__(self, words_only=False, num_terms=200000, num_docs=1000000, doc_len=100): 31 | self.dictionary = gensim.utils.FakeDict(num_terms) 32 | self.words_only = words_only 33 | self.num_docs = num_docs 34 | self.doc_len = doc_len 35 | 36 | def __iter__(self): 37 | for _ in range(self.num_docs): 38 | doc_len = numpy.random.poisson(self.doc_len) 39 | ids = numpy.random.randint(0, len(self.dictionary), doc_len) 40 | if self.words_only: 41 | yield [str(id) for id in ids] 42 | else: 43 | weights = numpy.random.poisson(3, doc_len) 44 | yield sorted(zip(ids, weights)) 45 | 46 | 47 | if os.environ.get('GENSIM_BIG', False): 48 | class TestLargeData(unittest.TestCase): 49 | """Try common operations, using large models. 
You'll need ~8GB RAM to run these tests""" 50 | def testWord2Vec(self): 51 | corpus = BigCorpus(words_only=True, num_docs=100000, num_terms=3000000, doc_len=200) 52 | model = gensim.models.Word2Vec(corpus, size=300, workers=4) 53 | model.save(testfile(), ignore=['syn1']) 54 | del model 55 | model = gensim.models.Word2Vec.load(testfile()) 56 | 57 | def testLsiModel(self): 58 | corpus = BigCorpus(num_docs=50000) 59 | model = gensim.models.LsiModel(corpus, num_topics=500, id2word=corpus.dictionary) 60 | model.save(testfile()) 61 | del model 62 | model = gensim.models.LsiModel.load(testfile()) 63 | 64 | def testLdaModel(self): 65 | corpus = BigCorpus(num_docs=5000) 66 | model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary) 67 | model.save(testfile()) 68 | del model 69 | model = gensim.models.LdaModel.load(testfile()) 70 | 71 | 72 | if __name__ == '__main__': 73 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/gensim_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (C) 2010 Radim Rehurek 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | USAGE: %(program)s LANGUAGE 8 | Process the repository, accepting articles in LANGUAGE (or 'any'). 9 | Store the word co-occurence matrix and id mappings, which are needed for subsequent processing. 
10 | 11 | Example: ./gensim_build.py eng 12 | """ 13 | 14 | 15 | import logging 16 | import sys 17 | import os.path 18 | import re 19 | 20 | 21 | from gensim.corpora import sources, dmlcorpus 22 | 23 | 24 | PREFIX = 'dmlcz' 25 | 26 | AT_HOME = False 27 | 28 | if AT_HOME: 29 | SOURCE_LIST = [ 30 | sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'), 31 | sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'), 32 | sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'), 33 | ] 34 | 35 | # SOURCE_LIST = [ 36 | # sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'), 37 | # ] 38 | 39 | RESULT_DIR = '/Users/kofola/workspace/dml/data/results' 40 | 41 | else: 42 | 43 | SOURCE_LIST = [ 44 | sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), 45 | sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), 46 | sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), 47 | ] 48 | 49 | RESULT_DIR = '/data/dmlcz/xrehurek/results' 50 | 51 | 52 | def buildDmlCorpus(config): 53 | dml = dmlcorpus.DmlCorpus() 54 | dml.processConfig(config, shuffle = True) 55 | dml.buildDictionary() 56 | dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words 57 | 58 | dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) 59 | dml.saveAsText() # save id mappings and documents as text data (matrix market format) 60 | return dml 61 | 62 | 63 | if __name__ == '__main__': 64 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 65 | logging.root.setLevel(level=logging.INFO) 66 | logging.info("running %s" % ' '.join(sys.argv)) 67 | 68 | program = os.path.basename(sys.argv[0]) 69 | 70 | # check and process input arguments 71 | if len(sys.argv) < 2: 72 | print(globals()['__doc__'] % locals()) 73 | sys.exit(1) 74 | language = sys.argv[1] 75 | 76 | # construct the config, which holds information about 
sources, data file filenames etc. 77 | config = dmlcorpus.DmlConfig('%s_%s' % (PREFIX, language), resultDir=RESULT_DIR, acceptLangs=[language]) 78 | for source in SOURCE_LIST: 79 | config.addSource(source) 80 | buildDmlCorpus(config) 81 | 82 | logging.info("finished running %s" % program) 83 | -------------------------------------------------------------------------------- /gensim/test/test_dtm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Automated tests for DTM/DIM model 6 | """ 7 | 8 | 9 | import logging 10 | from subprocess import CalledProcessError 11 | import gensim 12 | import os 13 | import sys 14 | import unittest 15 | from gensim import corpora 16 | 17 | 18 | # needed because sample data files are located in the same folder 19 | module_path = os.path.dirname(__file__) 20 | datapath = lambda fname: os.path.join(module_path, 'test_data', fname) 21 | 22 | 23 | class TestDtmModel(unittest.TestCase): 24 | 25 | def setUp(self): 26 | self.time_slices = [3, 7] 27 | self.corpus = corpora.mmcorpus.MmCorpus(datapath('dtm_test.mm')) 28 | self.id2word = corpora.Dictionary.load(datapath('dtm_test.dict')) 29 | # first you need to setup the environment variable $DTM_PATH for the dtm executable file 30 | self.dtm_path = os.environ.get('DTM_PATH', None) 31 | if self.dtm_path is None: 32 | if sys.version_info >= (2, 7, 0): 33 | self.skipTest("$DTM_PATH is not properly set up.") 34 | else: 35 | logging.warning("$DTM_PATH is not properly set up.") 36 | 37 | def testDtm(self): 38 | if self.dtm_path is not None: 39 | model = gensim.models.wrappers.DtmModel( 40 | self.dtm_path, self.corpus, self.time_slices, num_topics=2, 41 | id2word=self.id2word, model='dtm', initialize_lda=True, 42 | rng_seed=1) 43 | topics = model.show_topics(topics=2, times=2, topn=10) 44 | self.assertEqual(len(topics), 4) 45 | 46 | one_topic = model.show_topic(topicid=1, time=1, topn=10) 47 | 
self.assertEqual(len(one_topic), 10) 48 | self.assertEqual(one_topic[0][1], u'idexx') 49 | 50 | def testDim(self): 51 | if self.dtm_path is not None: 52 | model = gensim.models.wrappers.DtmModel( 53 | self.dtm_path, self.corpus, self.time_slices, num_topics=2, 54 | id2word=self.id2word, model='fixed', initialize_lda=True, 55 | rng_seed=1) 56 | topics = model.show_topics(topics=2, times=2, topn=10) 57 | self.assertEqual(len(topics), 4) 58 | 59 | one_topic = model.show_topic(topicid=1, time=1, topn=10) 60 | self.assertEqual(len(one_topic), 10) 61 | self.assertEqual(one_topic[0][1], u'skills') 62 | 63 | # In stderr expect "Error opening file /tmp/a65419_train_out/initial-lda-ss.dat. Failing." 64 | def testCalledProcessError(self): 65 | if self.dtm_path is not None: 66 | with self.assertRaises(CalledProcessError): 67 | gensim.models.wrappers.DtmModel( 68 | self.dtm_path, self.corpus, self.time_slices, num_topics=2, 69 | id2word=self.id2word, model='dtm', initialize_lda=False, 70 | rng_seed=1) 71 | 72 | if __name__ == '__main__': 73 | logging.basicConfig(level=logging.DEBUG) 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /gensim/test/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | Automated tests for checking various utils functions. 
8 | """ 9 | 10 | 11 | import logging 12 | import unittest 13 | 14 | from gensim import utils 15 | 16 | 17 | class TestIsCorpus(unittest.TestCase): 18 | def test_None(self): 19 | # test None 20 | result = utils.is_corpus(None) 21 | expected = (False, None) 22 | self.assertEqual(expected, result) 23 | 24 | def test_simple_lists_of_tuples(self): 25 | # test list words 26 | 27 | # one document, one word 28 | potentialCorpus = [[(0, 4.)]] 29 | result = utils.is_corpus(potentialCorpus) 30 | expected = (True, potentialCorpus) 31 | self.assertEqual(expected, result) 32 | 33 | # one document, several words 34 | potentialCorpus = [[(0, 4.), (1, 2.)]] 35 | result = utils.is_corpus(potentialCorpus) 36 | expected = (True, potentialCorpus) 37 | self.assertEqual(expected, result) 38 | 39 | potentialCorpus = [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]] 40 | result = utils.is_corpus(potentialCorpus) 41 | expected = (True, potentialCorpus) 42 | self.assertEqual(expected, result) 43 | 44 | # several documents, one word 45 | potentialCorpus = [[(0, 4.)], [(1, 2.)]] 46 | result = utils.is_corpus(potentialCorpus) 47 | expected = (True, potentialCorpus) 48 | self.assertEqual(expected, result) 49 | 50 | potentialCorpus = [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]] 51 | result = utils.is_corpus(potentialCorpus) 52 | expected = (True, potentialCorpus) 53 | self.assertEqual(expected, result) 54 | 55 | def test_int_tuples(self): 56 | potentialCorpus = [[(0, 4)]] 57 | result = utils.is_corpus(potentialCorpus) 58 | expected = (True, potentialCorpus) 59 | self.assertEqual(expected, result) 60 | 61 | def test_invalid_formats(self): 62 | # test invalid formats 63 | # these are no corpus, because they do not consists of 2-tuples with 64 | # the form(int, float). 
65 | potentials = list() 66 | potentials.append(["human"]) 67 | potentials.append("human") 68 | potentials.append(["human", "star"]) 69 | potentials.append([1, 2, 3, 4, 5, 5]) 70 | potentials.append([[(0, 'string')]]) 71 | for noCorpus in potentials: 72 | result = utils.is_corpus(noCorpus) 73 | expected = (False, noCorpus) 74 | self.assertEqual(expected, result) 75 | 76 | 77 | class TestUtils(unittest.TestCase): 78 | def test_decode_entities(self): 79 | # create a string that fails to decode with unichr on narrow python builds 80 | body = u'It’s the Year of the Horse. YES VIN DIESEL 🙌 💯' 81 | expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af' 82 | self.assertEquals(utils.decode_htmlentities(body), expected) 83 | 84 | 85 | if __name__ == '__main__': 86 | logging.root.setLevel(logging.WARNING) 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /gensim/test/test_logentropy_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Automated tests for checking transformation algorithms (the models package). 
9 | """ 10 | 11 | 12 | import logging 13 | import unittest 14 | import os 15 | import os.path 16 | import tempfile 17 | 18 | import six 19 | import numpy 20 | import scipy.linalg 21 | 22 | from gensim.corpora import mmcorpus, Dictionary 23 | from gensim.models import logentropy_model 24 | from gensim import matutils 25 | 26 | module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder 27 | datapath = lambda fname: os.path.join(module_path, 'test_data', fname) 28 | 29 | 30 | # set up vars used in testing ("Deerwester" from the web tutorial) 31 | texts = [['human', 'interface', 'computer'], 32 | ['survey', 'user', 'computer', 'system', 'response', 'time'], 33 | ['eps', 'user', 'interface', 'system'], 34 | ['system', 'human', 'system', 'eps'], 35 | ['user', 'response', 'time'], 36 | ['trees'], 37 | ['graph', 'trees'], 38 | ['graph', 'minors', 'trees'], 39 | ['graph', 'minors', 'survey']] 40 | dictionary = Dictionary(texts) 41 | corpus = [dictionary.doc2bow(text) for text in texts] 42 | 43 | 44 | def testfile(): 45 | # temporary data will be stored to this file 46 | return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') 47 | 48 | 49 | class TestLogEntropyModel(unittest.TestCase): 50 | def setUp(self): 51 | self.corpus_small = mmcorpus.MmCorpus(datapath('test_corpus_small.mm')) 52 | self.corpus_ok = mmcorpus.MmCorpus(datapath('test_corpus_ok.mm')) 53 | 54 | 55 | def testTransform(self): 56 | # create the transformation model 57 | model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=False) 58 | 59 | # transform one document 60 | doc = list(self.corpus_ok)[0] 61 | transformed = model[doc] 62 | 63 | expected = [(0, 0.3748900964125389), 64 | (1, 0.30730215324230725), 65 | (3, 1.20941755462856)] 66 | self.assertTrue(numpy.allclose(transformed, expected)) 67 | 68 | 69 | def testPersistence(self): 70 | fname = testfile() 71 | model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True) 72 | 
model.save(fname) 73 | model2 = logentropy_model.LogEntropyModel.load(fname) 74 | self.assertTrue(model.entr == model2.entr) 75 | tstvec = [] 76 | self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) 77 | 78 | def testPersistenceCompressed(self): 79 | fname = testfile() + '.gz' 80 | model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True) 81 | model.save(fname) 82 | model2 = logentropy_model.LogEntropyModel.load(fname, mmap=None) 83 | self.assertTrue(model.entr == model2.entr) 84 | tstvec = [] 85 | self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) 86 | #endclass TestLogEntropyModel 87 | 88 | 89 | if __name__ == '__main__': 90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 91 | unittest.main() 92 | -------------------------------------------------------------------------------- /gensim/examples/dmlcz/gensim_genmodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (C) 2010 Radim Rehurek 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | USAGE: %(program)s LANGUAGE METHOD 8 | Generate topic models for the specified subcorpus. METHOD is currently one \ 9 | of 'tfidf', 'lsi', 'lda', 'rp'. 
# internal method parameters
DIM_RP = 300   # dimensionality for random projections
DIM_LSI = 200  # for latent semantic indexing
DIM_LDA = 100  # for latent dirichlet allocation


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    language, method = sys.argv[1], sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig(
        '%s_%s' % (gensim_build.PREFIX, language),
        resultDir=gensim_build.RESULT_DIR, acceptLangs=[language])

    logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    corpus = MmCorpus(config.resultFile('bow.mm'))

    # train the requested model and persist it next to the corpus files
    if method == 'tfidf':
        model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model.save(config.resultFile('model_tfidf.pkl'))
    elif method == 'lda':
        model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
        model.save(config.resultFile('model_lda.pkl'))
    elif method == 'lsi':
        # weight raw word counts by tf-idf before finding the latent space
        tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, numTopics=DIM_LSI)
        model.save(config.resultFile('model_lsi.pkl'))
    elif method == 'rp':
        # weight raw word counts by tf-idf before finding the latent space
        tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, numTopics=DIM_RP)
        model.save(config.resultFile('model_rp.pkl'))
    else:
        raise ValueError('unknown topic extraction method: %s' % repr(method))

    # also store the corpus transformed into the chosen model's space
    MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus])

    logging.info("finished running %s" % program)
Il a deux trous rouges au côté droit.""" 33 | 34 | doc5 = """While it is quite useful to be able to search a 35 | large collection of documents almost instantly for a joint 36 | occurrence of a collection of exact words, 37 | for many searching purposes, a little fuzziness would help. """ 38 | 39 | 40 | dataset = map(lambda x: strip_punctuation2(x.lower()), 41 | [doc1, doc2, doc3, doc4]) 42 | # doc1 and doc2 have class 0, doc3 and doc4 avec class 1 43 | classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) 44 | 45 | 46 | 47 | class TestPreprocessing(unittest.TestCase): 48 | 49 | def testStripNumeric(self): 50 | self.assertEqual(strip_numeric("salut les amis du 59"), 51 | "salut les amis du ") 52 | 53 | def testStripShort(self): 54 | self.assertEqual(strip_short("salut les amis du 59", 3), 55 | "salut les amis") 56 | 57 | def testStripTags(self): 58 | self.assertEqual(strip_tags("Hello World!"), 59 | "Hello World!") 60 | 61 | def testStripMultipleWhitespaces(self): 62 | self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"), 63 | "salut les loulous!") 64 | 65 | def testStripNonAlphanum(self): 66 | self.assertEqual(strip_non_alphanum("toto nf-kappa titi"), 67 | "toto nf kappa titi") 68 | 69 | def testSplitAlphanum(self): 70 | self.assertEqual(split_alphanum("toto diet1 titi"), 71 | "toto diet 1 titi") 72 | self.assertEqual(split_alphanum("toto 1diet titi"), 73 | "toto 1 diet titi") 74 | 75 | def testStripStopwords(self): 76 | self.assertEqual(remove_stopwords("the world is square"), 77 | "world square") 78 | 79 | def testStemText(self): 80 | target = "while it is quit us to be abl to search a larg " + \ 81 | "collect of document almost instantli for a joint occurr " + \ 82 | "of a collect of exact words, for mani search purposes, " + \ 83 | "a littl fuzzi would help." 
84 | self.assertEqual(stem_text(doc5), target) 85 | 86 | 87 | if __name__ == "__main__": 88 | logging.basicConfig(level=logging.WARNING) 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # AppVeyor.com is a Continuous Integration service to build and run tests under 2 | # Windows 3 | # https://ci.appveyor.com/project/tmylk/gensim 4 | 5 | environment: 6 | global: 7 | # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the 8 | # /E:ON and /V:ON options are not enabled in the batch script intepreter 9 | # See: http://stackoverflow.com/a/13751649/163740 10 | CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\continuous_integration\\appveyor\\run_with_env.cmd" 11 | WHEELHOUSE_UPLOADER_USERNAME: "Lev.Konstantinovskiy" 12 | WHEELHOUSE_UPLOADER_SECRET: 13 | secure: qXqY3dFmLOqvxa3Om2gQi/BjotTOK+EP2IPLolBNo0c61yDtNWxbmE4wH3up72Be 14 | 15 | matrix: 16 | - PYTHON: "C:\\Python27" 17 | PYTHON_VERSION: "2.7.8" 18 | PYTHON_ARCH: "32" 19 | 20 | - PYTHON: "C:\\Python27-x64" 21 | PYTHON_VERSION: "2.7.8" 22 | PYTHON_ARCH: "64" 23 | 24 | - PYTHON: "C:\\Python35" 25 | PYTHON_VERSION: "3.5.0" 26 | PYTHON_ARCH: "32" 27 | 28 | - PYTHON: "C:\\Python35-x64" 29 | PYTHON_VERSION: "3.5.0" 30 | PYTHON_ARCH: "64" 31 | 32 | 33 | 34 | install: 35 | # Install Python (from the official .msi of http://python.org) and pip when 36 | # not already installed. 37 | - "powershell ./continuous_integration/appveyor/install.ps1" 38 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 39 | 40 | # Check that we have the expected version and architecture for Python 41 | - "python --version" 42 | - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" 43 | 44 | # Install the build and runtime dependencies of the project. 45 | # Install the build and runtime dependencies of the project. 
46 | - "%CMD_IN_ENV% pip install --timeout=60 --trusted-host 28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com -r continuous_integration/appveyor/requirements.txt" 47 | - "%CMD_IN_ENV% python setup.py bdist_wheel bdist_wininst " 48 | - ps: "ls dist" 49 | 50 | # Install the genreated wheel package to test it 51 | - "pip install --pre --no-index --find-links dist/ gensim" 52 | 53 | # Not a .NET project, we build scikit-learn in the install step instead 54 | build: false 55 | 56 | test_script: 57 | # Change to a non-source folder to make sure we run the tests on the 58 | # installed library. 59 | - "mkdir empty_folder" 60 | - "cd empty_folder" 61 | 62 | - "python -c \"import nose; nose.main()\" -s -v gensim" 63 | # Move back to the project folder 64 | - "cd .." 65 | 66 | artifacts: 67 | # Archive the generated wheel package in the ci.appveyor.com build report. 68 | - path: dist\* 69 | on_success: 70 | # Upload the generated wheel package to Rackspace 71 | # On Windows, Apache Libcloud cannot find a standard CA cert bundle so we 72 | # disable the ssl checks. 73 | - "python -m wheelhouse_uploader upload --no-ssl-check --local-folder=dist gensim-windows-wheels" 74 | 75 | notifications: 76 | - provider: Webhook 77 | url: https://webhooks.gitter.im/e/62c44ad26933cd7ed7e8 78 | on_build_success: false 79 | on_build_failure: True 80 | 81 | cache: 82 | # Use the appveyor cache to avoid re-downloading large archives such 83 | # the MKL numpy and scipy wheels mirrored on a rackspace cloud 84 | # container, speed up the appveyor jobs and reduce bandwidth 85 | # usage on our rackspace account. 
class TestRpModel(unittest.TestCase):
    """Tests for the random-projections transformation model."""

    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    def testTransform(self):
        """Transforming one document yields the expected projection."""
        # HACK: fix the seed so that we always get the same random projection
        # matrix (and can compare against precomputed expected results)
        numpy.random.seed(13)
        model = rpmodel.RpModel(self.corpus, num_topics=2)

        # transform the first document of the corpus
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # dense vector, for easier equality tests

        expected = numpy.array([-0.70710677, 0.70710677])
        self.assertTrue(numpy.allclose(vec, expected))  # transformed entries must be equal up to sign

    def _assert_roundtrip(self, fname, **load_kwargs):
        """Save a fresh model to `fname`, reload it and check equivalence.

        `load_kwargs` are forwarded to `RpModel.load` (e.g. mmap=None).
        """
        model = rpmodel.RpModel(self.corpus, num_topics=2)
        model.save(fname)
        model2 = rpmodel.RpModel.load(fname, **load_kwargs)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.projection, model2.projection))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testPersistence(self):
        self._assert_roundtrip(testfile())

    def testPersistenceCompressed(self):
        # a .gz extension triggers transparent compression on save/load
        self._assert_roundtrip(testfile() + '.gz', mmap=None)
#endclass TestRpModel


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | rm -r $(BUILDDIR)/html/_sources 36 | cp -r $(BUILDDIR)/html/* ../ 37 | @echo 38 | @echo "Build finished. The HTML pages are in ../" 39 | 40 | upload: 41 | scp -r _build/html/* rr:public_html/gensim/ 42 | 43 | dirhtml: 44 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 45 | @echo 46 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 47 | 48 | pickle: 49 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 50 | @echo 51 | @echo "Build finished; now you can process the pickle files." 52 | 53 | json: 54 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 55 | @echo 56 | @echo "Build finished; now you can process the JSON files." 
57 | 58 | htmlhelp: 59 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 60 | @echo 61 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 62 | ".hhp project file in $(BUILDDIR)/htmlhelp." 63 | 64 | qthelp: 65 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 66 | @echo 67 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 68 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 69 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/gensim.qhcp" 70 | @echo "To view the help file:" 71 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/gensim.qhc" 72 | 73 | latex: 74 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 75 | @echo 76 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 77 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 78 | "run these through (pdf)latex." 79 | cd $(BUILDDIR)/latex && make all-pdf 80 | cp $(BUILDDIR)/latex/gensim.pdf ../gensim_manual.pdf 81 | 82 | changes: 83 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 84 | @echo 85 | @echo "The overview file is in $(BUILDDIR)/changes." 86 | 87 | linkcheck: 88 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 89 | @echo 90 | @echo "Link check complete; look for any errors in the above output " \ 91 | "or in $(BUILDDIR)/linkcheck/output.txt." 92 | 93 | doctest: 94 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 95 | @echo "Testing of doctests in the sources finished, look at the " \ 96 | "results in $(BUILDDIR)/doctest/output.txt." 
97 | -------------------------------------------------------------------------------- /continuous_integration/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: To build extensions for 64 bit Python 3, we need to configure environment 2 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 3 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 4 | :: 5 | :: To build extensions for 64 bit Python 2, we need to configure environment 6 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 8 | :: 9 | :: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific 10 | :: environment configurations. 11 | :: 12 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 13 | :: cmd interpreter, at least for (SDK v7.0) 14 | :: 15 | :: More details at: 16 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 17 | :: http://stackoverflow.com/a/13751649/163740 18 | :: 19 | :: Author: Olivier Grisel 20 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 21 | :: 22 | :: Notes about batch files for Python people: 23 | :: 24 | :: Quotes in values are literally part of the values: 25 | :: SET FOO="bar" 26 | :: FOO is now five characters long: " b a r " 27 | :: If you don't want quotes, don't include them on the right-hand side. 28 | :: 29 | :: The CALL lines at the end of this file look redundant, but if you move them 30 | :: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y 31 | :: case, I don't know why. 
32 | @ECHO OFF 33 | 34 | SET COMMAND_TO_RUN=%* 35 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 36 | SET WIN_WDK=c:\Program Files (x86)\Windows Kits\10\Include\wdf 37 | 38 | :: Extract the major and minor versions, and allow for the minor version to be 39 | :: more than 9. This requires the version number to have two dots in it. 40 | SET MAJOR_PYTHON_VERSION=%PYTHON_VERSION:~0,1% 41 | IF "%PYTHON_VERSION:~3,1%" == "." ( 42 | SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,1% 43 | ) ELSE ( 44 | SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,2% 45 | ) 46 | 47 | :: Based on the Python version, determine what SDK version to use, and whether 48 | :: to set the SDK for 64-bit. 49 | IF %MAJOR_PYTHON_VERSION% == 2 ( 50 | SET WINDOWS_SDK_VERSION="v7.0" 51 | SET SET_SDK_64=Y 52 | ) ELSE ( 53 | IF %MAJOR_PYTHON_VERSION% == 3 ( 54 | SET WINDOWS_SDK_VERSION="v7.1" 55 | IF %MINOR_PYTHON_VERSION% LEQ 4 ( 56 | SET SET_SDK_64=Y 57 | ) ELSE ( 58 | SET SET_SDK_64=N 59 | IF EXIST "%WIN_WDK%" ( 60 | :: See: https://connect.microsoft.com/VisualStudio/feedback/details/1610302/ 61 | REN "%WIN_WDK%" 0wdf 62 | ) 63 | ) 64 | ) ELSE ( 65 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 66 | EXIT 1 67 | ) 68 | ) 69 | 70 | IF %PYTHON_ARCH% == 64 ( 71 | IF %SET_SDK_64% == Y ( 72 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 73 | SET DISTUTILS_USE_SDK=1 74 | SET MSSdk=1 75 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 76 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 77 | ECHO Executing: %COMMAND_TO_RUN% 78 | call %COMMAND_TO_RUN% || EXIT 1 79 | ) ELSE ( 80 | ECHO Using default MSVC build environment for 64 bit architecture 81 | ECHO Executing: %COMMAND_TO_RUN% 82 | call %COMMAND_TO_RUN% || EXIT 1 83 | ) 84 | ) ELSE ( 85 | ECHO Using default MSVC build environment for 32 bit architecture 86 | ECHO Executing: %COMMAND_TO_RUN% 87 | call 
class TestPhrasesModel(unittest.TestCase):
    """Tests for collocation (bigram) detection in gensim.models.phrases."""

    def testSentenceGeneration(self):
        """Transforming a corpus yields one output sentence per input sentence."""
        bigram = Phrases(sentences)
        # test that we generate the same amount of sentences as the input
        self.assertEqual(len(sentences), len(list(bigram[sentences])))

    def testBigramConstruction(self):
        """With permissive thresholds the expected bigrams are detected."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        # with this setting we should get response_time and graph_minors
        bigram1_seen = False
        bigram2_seen = False

        for s in bigram[sentences]:
            if u'response_time' in s:
                bigram1_seen = True
            if u'graph_minors' in s:
                bigram2_seen = True
        self.assertTrue(bigram1_seen and bigram2_seen)

        # check the same thing, this time using single doc transformation
        self.assertTrue(u'response_time' in bigram[sentences[1]])
        self.assertTrue(u'response_time' in bigram[sentences[4]])
        self.assertTrue(u'graph_minors' in bigram[sentences[-2]])
        self.assertTrue(u'graph_minors' in bigram[sentences[-1]])

    def testBadParameters(self):
        """Non-positive min_count / negative threshold must raise ValueError."""
        self.assertRaises(ValueError, Phrases, sentences, min_count=0)
        self.assertRaises(ValueError, Phrases, sentences, threshold=-1)

    def testEncoding(self):
        """Both utf8 and unicode input work; output must be unicode."""
        expected = [u'survey', u'user', u'computer', u'system', u'response_time']

        bigram_utf8 = Phrases(sentences, min_count=1, threshold=1)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(bigram_utf8[sentences[1]], expected)

        unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences]
        bigram_unicode = Phrases(unicode_sentences, min_count=1, threshold=1)
        self.assertEqual(bigram_unicode[sentences[1]], expected)

        transformed = ' '.join(bigram_utf8[sentences[1]])
        self.assertTrue(isinstance(transformed, unicode))

    def testPruning(self):
        """The max_vocab_size parameter must bound the collected vocabulary."""
        bigram = Phrases(sentences, max_vocab_size=5)
        self.assertTrue(len(bigram.vocab) <= 5)
#endclass TestPhrasesModel


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
class TestTfidfModel(unittest.TestCase):
    """Tests for the tf-idf weighting transformation."""

    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    def testTransform(self):
        """Transforming one document yields the expected normalized weights."""
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)

        # transform the first document of the corpus
        first_doc = list(self.corpus)[0]
        transformed = model[first_doc]

        # 1/sqrt(3) for each of the three distinct terms of the first document
        expected = [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
        self.assertTrue(numpy.allclose(transformed, expected))

    def testInit(self):
        """Building from a corpus and from a dictionary give the same idfs."""
        # create the transformation model by analyzing the global `corpus`
        model1 = tfidfmodel.TfidfModel(corpus)

        # make sure the dfs <-> idfs translation is consistent
        self.assertEqual(model1.dfs, dictionary.dfs)
        self.assertEqual(model1.idfs, tfidfmodel.precompute_idfs(model1.wglobal, dictionary.dfs, len(corpus)))

        # supply the term->docfreq mapping directly via the global `dictionary`
        model2 = tfidfmodel.TfidfModel(dictionary=dictionary)
        self.assertEqual(model1.idfs, model2.idfs)

    def testPersistence(self):
        """A saved and re-loaded model behaves like the original."""
        fname = testfile()
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname)
        self.assertTrue(model.idfs == model2.idfs)
        empty_vec = []
        self.assertTrue(numpy.allclose(model[empty_vec], model2[empty_vec]))  # try projecting an empty vector

    def testPersistenceCompressed(self):
        """Round-trip through a gzip-compressed file."""
        fname = testfile() + '.gz'
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
        self.assertTrue(model.idfs == model2.idfs)
        empty_vec = []
        self.assertTrue(numpy.allclose(model[empty_vec], model2[empty_vec]))  # try projecting an empty vector
#endclass TestTfidfModel


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
class TestKeywordsTest(unittest.TestCase):
    """Reproduces the TextRank keyword-extraction results of Mihalcea and Tarau (2004)."""

    def _read_file(self, fname):
        # All fixtures live in the test_data folder next to this module.
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        with utils.smart_open(os.path.join(pre_path, fname), mode="r") as f:
            return f.read()

    def test_text_keywords(self):
        text = self._read_file("mihalcea_tarau.txt")

        # calculate keywords
        generated_keywords = keywords(text, split=True)

        # To be compared to the reference.
        kw = self._read_file("mihalcea_tarau.kw.txt").strip().split("\n")

        self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))

    def test_text_keywords_words(self):
        text = self._read_file("mihalcea_tarau.txt")

        # Ask for 15 keywords; combined (multi-word) keywords can push the
        # returned count slightly above the requested number -- 16 for this text.
        generated_keywords = keywords(text, words=15, split=True)

        self.assertEqual(len(generated_keywords), 16)

    def test_text_keywords_pos(self):
        text = self._read_file("mihalcea_tarau.txt")

        # calculate keywords using only certain parts of speech
        generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

        # To be compared to the reference.
        kw = self._read_file("mihalcea_tarau.kwpos.txt").strip().split("\n")

        self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))

    def test_text_summarization_raises_exception_on_short_input_text(self):
        # NOTE(review): despite the historical name, `keywords` does not raise
        # on short input -- this only checks that a result is still produced.
        text = self._read_file("testsummarization_unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertTrue(keywords(text) is not None)

    def test_keywords_ratio(self):
        text = self._read_file("mihalcea_tarau.txt")

        # Check the ratio parameter is well behaved. Length is taken on the
        # tokenized clean text, so only the relative sizes of the 10% and 20%
        # extractions are compared (empirically 21/12, not an exact factor of 2).
        # Values of 10% and 20% were carefully selected for this test to avoid
        # numerical instabilities when several keywords have almost the same score.
        selected_docs_12 = keywords(text, ratio=0.1, split=True)
        selected_docs_21 = keywords(text, ratio=0.2, split=True)

        self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)
SEPARATOR = r"@"
# Raw string literals throughout: '\S', '\s', '\w', '\.' are invalid escape
# sequences in plain strings on Python 3.6+ (DeprecationWarning, later an error).
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE)
AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)\s(\w)", re.UNICODE)
AB_ACRONYM_LETTERS = re.compile(r"([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE)
UNDO_AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)" + SEPARATOR + r"(\w)", re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)" + SEPARATOR + r"(\w)", re.UNICODE)


def split_sentences(text):
    """Split `text` into sentences, protecting abbreviations (e.g. "Mr. X")
    from being mistaken for sentence boundaries."""
    processed = replace_abbreviations(text)
    return [undo_replacement(sentence) for sentence in get_sentences(processed)]


def replace_abbreviations(text):
    # Mask the space after known abbreviation patterns with SEPARATOR so the
    # sentence regex does not split there.
    return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


def undo_replacement(sentence):
    # Inverse of `replace_abbreviations`: restore the masked spaces.
    return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


def replace_with_separator(text, separator, regexs):
    """For every regex in `regexs` (each with two capture groups), replace the
    text between the groups with `separator`."""
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:
        result = regex.sub(replacement, result)
    return result


def get_sentences(text):
    """Yield sentence-like chunks of `text`, one RE_SENTENCE match at a time."""
    for match in RE_SENTENCE.finditer(text):
        yield match.group()


def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Pair each original unit with its filtered (preprocessed) counterpart and
    optional POS tag, skipping units that preprocessing reduced to nothing.
    Returns a list of SyntacticUnit."""
    units = []
    # enumerate+zip instead of indexing via xrange(len(...)); `original_units`
    # and `filtered_units` are parallel lists by construction.
    for i, (text, token) in enumerate(zip(original_units, filtered_units)):
        if token == '':
            continue  # the unit was filtered away entirely

        tag = tags[i][1] if tags else None
        sentence = SyntacticUnit(text, token, tag)
        sentence.index = i

        units.append(sentence)

    return units


def join_words(words, separator=" "):
    return separator.join(words)


def clean_text_by_sentences(text):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)


def clean_text_by_word(text):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    # Collapse dotted acronym letter pairs first so their periods don't split tokens.
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)


def tokenize_by_word(text):
    """Tokenize `text` into lowercase, de-accented words, after collapsing
    dotted acronym letter pairs."""
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)
class RpModel(interfaces.TransformationABC):
    """
    Objects of this class allow building and maintaining a model for Random Projections
    (also known as Random Indexing). For theoretical background on RP, see:

      Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis."

    The main methods are:

    1. constructor, which creates the random projection matrix
    2. the [] method, which transforms a simple count representation into the
       random-projection space.

    >>> rp = RpModel(corpus)
    >>> print(rp[some_doc])
    >>> rp.save('/tmp/foo.rp_model')

    Model persistency is achieved via its load/save methods.
    """
    def __init__(self, corpus, id2word=None, num_topics=300):
        """
        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing. If not set, it will be determined from the corpus.
        """
        self.id2word = id2word
        self.num_topics = num_topics
        if corpus is not None:
            self.initialize(corpus)


    def __str__(self):
        return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)


    def initialize(self, corpus):
        """
        Initialize the random projection matrix.
        """
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            # `dict.keys()` is a view object on Python 3 and cannot be
            # concatenated to a list; materialize it first (the bare `keys()`
            # version raises TypeError under Python 3).
            self.num_terms = 1 + max([-1] + list(self.id2word.keys()))

        shape = self.num_topics, self.num_terms
        logger.info("constructing %s random matrix", str(shape))
        # Now construct the projection matrix itself.
        # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
        # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
        randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape)  # convert from 0/1 to +1/-1
        # convert from int32 to floats, for faster multiplications
        self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32)


    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        # report only significant, finite components as (topic_id, value) pairs
        return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
                if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)]


    def __setstate__(self, state):
        """
        This is a hack to work around a bug in numpy, where a FORTRAN-order array
        unpickled from disk segfaults on using it.
        """
        self.__dict__ = state
        if self.projection is not None:
            self.projection = self.projection.copy('F')  # simply making a fresh copy fixes the broken array
#endclass RpModel
or some open-source alternative (GotoBLAS, ATLAS).
73 | 74 | Worker 75 | A process which is created on each node. To remove a node from your cluster, 76 | simply kill its worker process. 77 | 78 | Dispatcher 79 | The dispatcher will be in charge of negotiating all computations, queueing and 80 | distributing ("dispatching") individual jobs to the workers. Computations never 81 | "talk" to worker nodes directly, only through this dispatcher. Unlike workers, 82 | there can only be one active dispatcher at a time in the cluster. 83 | 84 | 85 | Available distributed algorithms 86 | --------------------------------- 87 | 88 | .. toctree:: 89 | :maxdepth: 1 90 | 91 | dist_lsi 92 | dist_lda 93 | -------------------------------------------------------------------------------- /gensim/models/lsi_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s 9 | 10 | Worker ("slave") process used in computing distributed LSI. Run this script \ 11 | on every node in your cluster. If you wish, you may even run it multiple times \ 12 | on a single machine, to make better use of multiple cores (just beware that \ 13 | memory footprint increases accordingly). 
class Worker(object):
    """Pyro-exposed LSI worker: receives chunks of documents from the dispatcher
    and folds them into a local `LsiModel`."""

    def __init__(self):
        self.model = None  # set by `initialize`; guards against premature job requests


    def initialize(self, myid, dispatcher, **model_params):
        """Create the local LSI model; called remotely by the dispatcher."""
        self.lock_update = threading.Lock()
        self.jobsdone = 0  # how many jobs has this worker completed?
        self.myid = myid  # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
        self.dispatcher = dispatcher
        self.finished = False
        logger.info("initializing worker #%s", myid)
        self.model = lsimodel.LsiModel(**model_params)


    @Pyro4.oneway
    def requestjob(self):
        """
        Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called.
        """
        if self.model is None:
            raise RuntimeError("worker must be initialized before receiving jobs")

        job = None
        while job is None and not self.finished:
            try:
                job = self.dispatcher.getjob(self.myid)
            except Queue.Empty:
                # no new job: try again, unless we're finished with all work
                continue
        if job is not None:
            # lazy %-style logger args: the string is only built if the record is emitted
            logger.info("worker #%s received job #%i", self.myid, self.jobsdone)
            self.processjob(job)
            self.dispatcher.jobdone(self.myid)
        else:
            logger.info("worker #%i stopping asking for jobs", self.myid)


    @utils.synchronous('lock_update')
    def processjob(self, job):
        """Fold one chunk of documents into the model (serialized via `lock_update`)."""
        self.model.add_documents(job)
        self.jobsdone += 1
        if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0:
            fname = os.path.join(tempfile.gettempdir(), 'lsi_worker.pkl')
            self.model.save(fname)


    @utils.synchronous('lock_update')
    def getstate(self):
        """Return the accumulated projection and stop accepting further jobs."""
        logger.info("worker #%i returning its state after %s jobs",
                    self.myid, self.jobsdone)
        assert isinstance(self.model.projection, lsimodel.Projection)
        self.finished = True
        return self.model.projection


    @utils.synchronous('lock_update')
    def reset(self):
        """Throw away the accumulated projection, keeping the model parameters."""
        logger.info("resetting worker #%i", self.myid)
        self.model.projection = self.model.projection.empty_like()
        self.finished = False


    @Pyro4.oneway
    def exit(self):
        """Terminate this worker process immediately (hard exit, no cleanup)."""
        logger.info("terminating worker #%i", self.myid)
        os._exit(0)
#endclass Worker



def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s", " ".join(sys.argv))

    program = os.path.basename(sys.argv[0])
    # NOTE(review): `sys.argv` always contains at least the program name, so
    # this usage check can never trigger; kept for symmetry with the other
    # distributed scripts, which do take parameters.
    if len(sys.argv) < 1:
        print(globals()["__doc__"] % locals())
        sys.exit(1)

    utils.pyro_daemon('gensim.lsi_worker', Worker(), random_suffix=True)

    logger.info("finished running %s", program)



if __name__ == '__main__':
    main()
class CorpusMiislita(corpora.TextCorpus):
    """Text corpus over the miislita `.cor` files (one document per line),
    lowercased and filtered through a tiny stopword list."""

    stoplist = set('for a of the and to in on'.split())

    def get_texts(self):
        """
        Parse documents from the .cor file provided in the constructor. Lowercase
        each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.

        """
        with self.getstream() as stream:
            for line in stream:
                tokens = utils.to_unicode(line).lower().split()
                yield [token for token in tokens
                       if token not in CorpusMiislita.stoplist]

    def __len__(self):
        """Define this so we can use `len(corpus)`; the count is computed once and cached."""
        if 'length' not in self.__dict__:
            logger.info("caching corpus size (calculating number of documents)")
            self.length = sum(1 for _ in self.get_texts())
        return self.length


class TestMiislita(unittest.TestCase):
    def test_textcorpus(self):
        """Make sure TextCorpus can be serialized to disk. """
        # construct corpus from file
        miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must create the target file...
        ftmp = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(ftmp, miislita)
        self.assertTrue(os.path.exists(ftmp))

        # ...and deserializing must give back the same documents
        miislita2 = corpora.MmCorpus(ftmp)
        self.assertEqual(list(miislita), list(miislita2))


    def test_save_load_ability(self):
        """
        Make sure we can save and load (un/pickle) TextCorpus objects (as long
        as the underlying input isn't a file-like object; we cannot pickle those).
        """
        # construct corpus from file
        corpus_file = datapath('miIslita.cor')
        original = CorpusMiislita(corpus_file)

        # round-trip through a pickle on disk
        pickle_file = get_tmpfile('tc_test.cpickle')
        original.save(pickle_file)

        restored = CorpusMiislita.load(pickle_file)

        self.assertEqual(len(original), len(restored))
        self.assertEqual(original.dictionary.token2id, restored.dictionary.token2id)


    def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for position, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[position], value, 2)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    unittest.main()
class MalletCorpus(LowCorpus):
    """
    Quoting http://mallet.cs.umass.edu/import.php:

        One file, one instance per line
        Assume the data is in the following format:

        [URL] [language] [text of the page...]

    Or, more generally,
        [document #1 id] [label] [text of the document...]
        [document #2 id] [label] [text of the document...]
        ...
        [document #N id] [label] [text of the document...]

    Note that language/label is *not* considered in Gensim.

    """
    def __init__(self, fname, id2word=None, metadata=False):
        # NOTE(review): `metadata` is assigned before the parent constructor,
        # presumably because LowCorpus.__init__ already parses lines via
        # line2doc -- confirm before reordering.
        self.metadata = metadata
        LowCorpus.__init__(self, fname, id2word)

    def _calculate_num_docs(self):
        # one document per line; generator avoids building a throwaway list
        with utils.smart_open(self.fname) as fin:
            return sum(1 for _ in fin)

    def __iter__(self):
        """
        Iterate over the corpus at the given filename.

        Yields a bag-of-words, a.k.a list of tuples of (word id, word count), based on the given id2word dictionary.
        """
        with utils.smart_open(self.fname) as f:
            for line in f:
                yield self.line2doc(line)

    def line2doc(self, line):
        """Convert one Mallet-format line ("id lang word word ...") into a
        bag-of-words document; with `self.metadata`, also return (id, lang)."""
        # `parts` instead of the ambiguous single-letter name `l` (PEP 8 / E741)
        parts = [word for word in utils.to_unicode(line).strip().split(' ') if word]
        docid, doclang, words = parts[0], parts[1], parts[2:]

        doc = super(MalletCorpus, self).line2doc(' '.join(words))

        if self.metadata:
            return doc, (docid, doclang)
        else:
            return doc

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the Mallet format.

        The document id will be generated by enumerating the corpus.
        That is, it will range between 0 and number of documents in the corpus.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        This function is automatically called by `MalletCorpus.serialize`; don't
        call it directly, call `serialize` instead.

        Returns the list of byte offsets of the documents within `fname`.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s", fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    # Mallet stores plain word repetitions, so non-integer
                    # counts lose precision; keep track of how many.
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning("Mallet format can only save vectors with "
                           "integer elements; %i float entries were truncated to integer value",
                           truncated)

        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass MalletCorpus
_about: 2 | 3 | ============ 4 | About 5 | ============ 6 | 7 | History 8 | -------- 9 | 10 | Gensim started off as a collection of various Python scripts for the Czech Digital Mathematics Library `dml.cz `_ in 2008, 11 | where it served to generate a short list of the most similar articles to a given article (**gensim = "generate similar"**). 12 | I also wanted to try these fancy "Latent Semantic Methods", but the libraries that 13 | realized the necessary computation were `not much fun to work with `_. 14 | 15 | Naturally, I set out to reinvent the wheel. Our `2010 LREC publication `_ 16 | describes the initial design decisions behind gensim (clarity, efficiency and scalability) 17 | and is fairly representative of how gensim works even today. 18 | 19 | Later versions of gensim improved this efficiency and scalability tremendously. In fact, 20 | I made algorithmic scalability of distributional semantics the topic of my `PhD thesis `_. 21 | 22 | By now, gensim is---to my knowledge---the most robust, efficient and hassle-free piece 23 | of software to realize unsupervised semantic modelling from plain text. It stands 24 | in contrast to brittle homework-assignment-implementations that do not scale on one hand, 25 | and robust java-esque projects that take forever just to run "hello world". 26 | 27 | In 2011, I started using `Github `_ for source code hosting 28 | and the gensim website moved to its present domain. In 2013, gensim got its current logo and website design. 29 | 30 | 31 | Licensing 32 | ---------- 33 | 34 | Gensim is licensed under the OSI-approved `GNU LGPL license `_. 35 | This means that it's free for both personal and commercial use, but if you make any 36 | modification to gensim that you distribute to other people, you have to disclose 37 | the source code of these modifications. 38 | 39 | Apart from that, you are free to redistribute gensim in any way you like, though you're 40 | not allowed to modify its license (doh!). 
41 | 42 | My intent here is, of course, to **get more help and community involvement** with the development of gensim. 43 | The legalese is therefore less important to me than your input and contributions. 44 | Contact me if LGPL doesn't fit your bill but you'd still like to use gensim -- we'll work something out. 45 | 46 | .. seealso:: 47 | 48 | I also host a document similarity package `gensim.simserver`. This is a high-level 49 | interface to `gensim` functionality, and offers transactional remote (web-based) 50 | document similarity queries and indexing. It uses gensim to do the heavy lifting: 51 | you don't need the `simserver` to use gensim, but you do need gensim to use the `simserver`. 52 | Note that unlike gensim, `gensim.simserver` is licensed under `Affero GPL `_, 53 | which is much more restrictive for inclusion in commercial projects. 54 | 55 | Contributors 56 | -------------- 57 | 58 | Credit goes to all the people who contributed to gensim, be it in `discussions `_, 59 | ideas, `code contributions `_ or `bug reports `_. 60 | It's really useful and motivating to get feedback, in any shape or form, so big thanks to you all! 61 | 62 | Some honorable mentions are included in the `CHANGELOG.txt `_. 63 | 64 | Academic citing 65 | ---------------- 66 | 67 | Gensim has been used in `many students' final theses as well as research papers `_. 
When citing gensim, 68 | please use `this BibTeX entry `_:: 69 | 70 | @inproceedings{rehurek_lrec, 71 | title = {{Software Framework for Topic Modelling with Large Corpora}}, 72 | author = {Radim {\v R}eh{\r u}{\v r}ek and Petr Sojka}, 73 | booktitle = {{Proceedings of the LREC 2010 Workshop on New 74 | Challenges for NLP Frameworks}}, 75 | pages = {45--50}, 76 | year = 2010, 77 | month = May, 78 | day = 22, 79 | publisher = {ELRA}, 80 | address = {Valletta, Malta}, 81 | note={\url{http://is.muni.cz/publication/884893/en}}, 82 | language={English} 83 | } 84 | 85 | 86 | -------------------------------------------------------------------------------- /gensim/corpora/textcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | """ 7 | Text corpora usually reside on disk, as text files in one format or another 8 | In a common scenario, we need to build a dictionary (a `word->integer id` 9 | mapping), which is then used to construct sparse bag-of-word vectors 10 | (= sequences of `(word_id, word_weight)` 2-tuples). 11 | 12 | This module provides some code scaffolding to simplify this pipeline. For 13 | example, given a corpus where each document is a separate line in file on disk, 14 | you would override the `TextCorpus.get_texts` method to read one line=document 15 | at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence 16 | of words. 17 | 18 | Overriding `get_texts` is enough; you can then initialize the corpus with e.g. 19 | `MyTextCorpus(bz2.BZ2File('mycorpus.txt.bz2'))` and it will behave correctly like a 20 | corpus of sparse vectors. The `__iter__` methods is automatically set up, and 21 | dictionary is automatically populated with all `word->id` mappings. 
22 | 23 | The resulting object can be used as input to all gensim models (TFIDF, LSI, ...), 24 | serialized with any format (Matrix Market, SvmLight, Blei's LDA-C format etc). 25 | 26 | See the `gensim.test.test_miislita.CorpusMiislita` class for a simple example. 27 | """ 28 | 29 | 30 | from __future__ import with_statement 31 | 32 | import logging 33 | 34 | from gensim import interfaces, utils 35 | from six import string_types 36 | from gensim.corpora.dictionary import Dictionary 37 | 38 | logger = logging.getLogger('gensim.corpora.textcorpus') 39 | 40 | 41 | class TextCorpus(interfaces.CorpusABC): 42 | """ 43 | Helper class to simplify the pipeline of getting bag-of-words vectors (= a 44 | gensim corpus) from plain text. 45 | 46 | This is an abstract base class: override the `get_texts()` and `__len__()` 47 | methods to match your particular input. 48 | 49 | Given a filename (or a file-like object) in constructor, the corpus object 50 | will be automatically initialized with a dictionary in `self.dictionary` and 51 | will support the `iter` corpus method. You must only provide a correct `get_texts` 52 | implementation. 53 | 54 | """ 55 | def __init__(self, input=None): 56 | super(TextCorpus, self).__init__() 57 | self.input = input 58 | self.dictionary = Dictionary() 59 | self.metadata = False 60 | if input is not None: 61 | self.dictionary.add_documents(self.get_texts()) 62 | else: 63 | logger.warning("No input document stream provided; assuming " 64 | "dictionary will be initialized some other way.") 65 | 66 | def __iter__(self): 67 | """ 68 | The function that defines a corpus. 69 | 70 | Iterating over the corpus must yield sparse vectors, one for each document. 
71 | """ 72 | for text in self.get_texts(): 73 | if self.metadata: 74 | yield self.dictionary.doc2bow(text[0], allow_update=False), text[1] 75 | else: 76 | yield self.dictionary.doc2bow(text, allow_update=False) 77 | 78 | def getstream(self): 79 | return utils.file_or_filename(self.input) 80 | 81 | def get_texts(self): 82 | """ 83 | Iterate over the collection, yielding one document at a time. A document 84 | is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`. 85 | 86 | Override this function to match your input (parse input files, do any 87 | text preprocessing, lowercasing, tokenizing etc.). There will be no further 88 | preprocessing of the words coming out of this function. 89 | """ 90 | # Instead of raising NotImplementedError, let's provide a sample implementation: 91 | # assume documents are lines in a single file (one document per line). 92 | # Yield each document as a list of lowercase tokens, via `utils.tokenize`. 93 | with self.getstream() as lines: 94 | for lineno, line in enumerate(lines): 95 | if self.metadata: 96 | yield utils.tokenize(line, lowercase=True), (lineno,) 97 | else: 98 | yield utils.tokenize(line, lowercase=True) 99 | 100 | def __len__(self): 101 | if not hasattr(self, 'length'): 102 | # cache the corpus length 103 | self.length = sum(1 for _ in self.get_texts()) 104 | return self.length 105 | 106 | # endclass TextCorpus 107 | -------------------------------------------------------------------------------- /docs/src/changes_080.rst: -------------------------------------------------------------------------------- 1 | .. _changes_080: 2 | 3 | Change Set for 0.8.0 4 | ============================ 5 | 6 | Release 0.8.0 concludes the 0.7.x series, which was about API consolidation and performance. 7 | In 0.8.x, I'd like to extend `gensim` with new functionality and features. 8 | 9 | Codestyle Changes 10 | ------------------ 11 | 12 | Codebase was modified to comply with `PEP8: Style Guide for Python Code `_. 
13 | This means the 0.8.0 API is **backward incompatible** with the 0.7.x series. 14 | 15 | That's not as tragic as it sounds, gensim was almost there anyway. The changes are few and pretty straightforward: 16 | 17 | 1. the `numTopics` parameter is now `num_topics` 18 | 2. `addDocuments()` method becomes `add_documents()` 19 | 3. `toUtf8()` => `to_utf8()` 20 | 4. ... you get the idea: replace `camelCase` with `lowercase_with_underscores`. 21 | 22 | If you stored a model that is affected by this to disk, you'll need to rename its attributes manually: 23 | 24 | >>> lsa = gensim.models.LsiModel.load('/some/path') # load old <0.8.0 model 25 | >>> lsa.num_terms, lsa.num_topics = lsa.numTerms, lsa.numTopics # rename attributes 26 | >>> del lsa.numTerms, lsa.numTopics # clean up old attributes (optional) 27 | >>> lsa.save('/some/path') # save again to disk, as 0.8.0 compatible 28 | 29 | Only attributes (variables) need to be renamed; method names (functions) are not affected, due to the way `pickle` works. 30 | 31 | Similarity Queries 32 | ------------------- 33 | 34 | Improved speed and scalability of :doc:`similarity queries `. 35 | 36 | The `Similarity` class can now index corpora of arbitrary size more efficiently. 37 | Internally, this is done by splitting the index into several smaller pieces ("shards") that fit in RAM 38 | and can be processed independently. In addition, documents can now be added to a `Similarity` index dynamically. 39 | 40 | There is also a new way to query the similarity indexes: 41 | 42 | >>> index = MatrixSimilarity(corpus) # create an index 43 | >>> sims = index[document] # get cosine similarity of query "document" against every document in the index 44 | >>> sims = index[chunk_of_documents] # new syntax! 45 | 46 | Advantage of the last line (querying multiple documents at the same time) is faster execution. 
47 | 48 | This faster execution is also utilized *automatically for you* if you're using the ``for sims in index: ...`` syntax 49 | (which returns pairwise similarities of documents in the index). 50 | 51 | To see the speed-up on your machine, run ``python -m gensim.test.simspeed`` (and compare to my results `here `_ to see how your machine fares). 52 | 53 | .. note:: 54 | This current functionality of querying is as far as I wanted to get with gensim. 55 | More optimizations and smarter indexing are certainly possible, but I'd like to 56 | focus on other features now. Pull requests are still welcome though :) 57 | 58 | Check out the :mod:`updated documentation ` of the similarity classes for more info. 59 | 60 | Simplified Directory Structure 61 | -------------------------------- 62 | 63 | Instead of the java-esque ``ROOT_DIR/src/gensim`` directory structure of gensim, 64 | the packages now reside directly in ``ROOT_DIR/gensim`` (no superfluous ``src``). See the new structure `on github `_. 65 | 66 | Other changes (that you're unlikely to notice unless you look) 67 | ---------------------------------------------------------------------- 68 | 69 | * Improved efficiency of ``lsi[corpus]`` transformations (documents are chunked internally for better performance). 70 | * Large matrices (numpy/scipy.sparse, in `LsiModel`, `Similarity` etc.) are now mmapped to/from disk when doing `save/load`. The `cPickle` approach used previously was too `buggy `_ and `slow `_. 71 | * Renamed `chunks` parameter to `chunksize` (i.e. `LsiModel(corpus, num_topics=100, chunksize=20000)`). This better reflects its purpose: size of a chunk=number of documents to be processed at once. 72 | * Also improved memory efficiency of LSI and LDA model generation (again). 73 | * Removed SciPy 0.6 from the list of supported SciPi versions (need >=0.7 now). 74 | * Added more unit tests. 75 | * Several smaller fixes; see the `commit history `_ for full account. 76 | 77 | .. 
admonition:: Future Directions? 78 | 79 | If you have ideas or proposals for new features for 0.8.x, now is the time to let me know: 80 | `gensim mailing list `_. 81 | -------------------------------------------------------------------------------- /gensim/models/lda_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2011 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | USAGE: %(program)s 9 | 10 | Worker ("slave") process used in computing distributed LDA. Run this script \ 11 | on every node in your cluster. If you wish, you may even run it multiple times \ 12 | on a single machine, to make better use of multiple cores (just beware that \ 13 | memory footprint increases accordingly). 14 | 15 | Example: python -m gensim.models.lda_worker 16 | """ 17 | 18 | 19 | from __future__ import with_statement 20 | import os, sys, logging 21 | import threading 22 | import tempfile 23 | try: 24 | import Queue 25 | except ImportError: 26 | import queue as Queue 27 | import Pyro4 28 | from gensim.models import ldamodel 29 | from gensim import utils 30 | 31 | logger = logging.getLogger('gensim.models.lda_worker') 32 | 33 | 34 | # periodically save intermediate models after every SAVE_DEBUG updates (0 for never) 35 | SAVE_DEBUG = 0 36 | 37 | 38 | 39 | class Worker(object): 40 | def __init__(self): 41 | self.model = None 42 | 43 | 44 | def initialize(self, myid, dispatcher, **model_params): 45 | self.lock_update = threading.Lock() 46 | self.jobsdone = 0 # how many jobs has this worker completed? 47 | self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 
48 | self.dispatcher = dispatcher 49 | self.finished = False 50 | logger.info("initializing worker #%s" % myid) 51 | self.model = ldamodel.LdaModel(**model_params) 52 | 53 | 54 | @Pyro4.oneway 55 | def requestjob(self): 56 | """ 57 | Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. 58 | """ 59 | if self.model is None: 60 | raise RuntimeError("worker must be initialized before receiving jobs") 61 | 62 | job = None 63 | while job is None and not self.finished: 64 | try: 65 | job = self.dispatcher.getjob(self.myid) 66 | except Queue.Empty: 67 | # no new job: try again, unless we're finished with all work 68 | continue 69 | if job is not None: 70 | logger.info("worker #%s received job #%i" % (self.myid, self.jobsdone)) 71 | self.processjob(job) 72 | self.dispatcher.jobdone(self.myid) 73 | else: 74 | logger.info("worker #%i stopping asking for jobs" % self.myid) 75 | 76 | 77 | @utils.synchronous('lock_update') 78 | def processjob(self, job): 79 | logger.debug("starting to process job #%i" % self.jobsdone) 80 | self.model.do_estep(job) 81 | self.jobsdone += 1 82 | if SAVE_DEBUG and self.jobsdone % SAVE_DEBUG == 0: 83 | fname = os.path.join(tempfile.gettempdir(), 'lda_worker.pkl') 84 | self.model.save(fname) 85 | logger.info("finished processing job #%i" % (self.jobsdone - 1)) 86 | 87 | 88 | @utils.synchronous('lock_update') 89 | def getstate(self): 90 | logger.info("worker #%i returning its state after %s jobs" % 91 | (self.myid, self.jobsdone)) 92 | result = self.model.state 93 | assert isinstance(result, ldamodel.LdaState) 94 | self.model.clear() # free up mem in-between two EM cycles 95 | self.finished = True 96 | return result 97 | 98 | 99 | @utils.synchronous('lock_update') 100 | def reset(self, state): 101 | assert state is not None 102 | logger.info("resetting worker #%i" % self.myid) 103 | self.model.state = state 104 | self.model.sync_state() 105 | self.model.state.reset() 106 | self.finished = False 107 | 108 | 109 | 
@Pyro4.oneway 110 | def exit(self): 111 | logger.info("terminating worker #%i" % self.myid) 112 | os._exit(0) 113 | #endclass Worker 114 | 115 | 116 | 117 | def main(): 118 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 119 | logger.info("running %s" % " ".join(sys.argv)) 120 | 121 | program = os.path.basename(sys.argv[0]) 122 | # make sure we have enough cmd line parameters 123 | if len(sys.argv) < 1: 124 | print(globals()["__doc__"] % locals()) 125 | sys.exit(1) 126 | 127 | utils.pyro_daemon('gensim.lda_worker', Worker(), random_suffix=True) 128 | 129 | logger.info("finished running %s" % program) 130 | 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /gensim/models/logentropy_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 | 6 | import logging 7 | import math 8 | from gensim import interfaces, matutils, utils 9 | 10 | 11 | logger = logging.getLogger('gensim.models.logentropy_model') 12 | 13 | 14 | class LogEntropyModel(interfaces.TransformationABC): 15 | """ 16 | Objects of this class realize the transformation between word-document 17 | co-occurence matrix (integers) into a locally/globally weighted matrix 18 | (positive floats). 19 | 20 | This is done by a log entropy normalization, optionally normalizing the 21 | resulting documents to unit length. 
The following formulas explain how 22 | to compute the log entropy weight for term `i` in document `j`:: 23 | 24 | local_weight_{i,j} = log(frequency_{i,j} + 1) 25 | 26 | P_{i,j} = frequency_{i,j} / sum_j frequency_{i,j} 27 | 28 | sum_j P_{i,j} * log(P_{i,j}) 29 | global_weight_i = 1 + ---------------------------- 30 | log(number_of_documents + 1) 31 | 32 | final_weight_{i,j} = local_weight_{i,j} * global_weight_i 33 | 34 | The main methods are: 35 | 36 | 1. constructor, which calculates the global weighting for all terms in 37 | a corpus. 38 | 2. the [] method, which transforms a simple count representation into the 39 | log entropy normalized space. 40 | 41 | >>> log_ent = LogEntropyModel(corpus) 42 | >>> print(log_ent[some_doc]) 43 | >>> log_ent.save('/tmp/foo.log_ent_model') 44 | 45 | Model persistency is achieved via its load/save methods. 46 | """ 47 | 48 | def __init__(self, corpus, id2word=None, normalize=True): 49 | """ 50 | `normalize` dictates whether the resulting vectors will be 51 | set to unit length. 52 | """ 53 | self.normalize = normalize 54 | self.n_docs = 0 55 | self.n_words = 0 56 | self.entr = {} 57 | if corpus is not None: 58 | self.initialize(corpus) 59 | 60 | def __str__(self): 61 | return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, 62 | self.n_words) 63 | 64 | def initialize(self, corpus): 65 | """ 66 | Initialize internal statistics based on a training corpus. Called 67 | automatically from the constructor. 
68 | """ 69 | logger.info("calculating counts") 70 | glob_freq = {} 71 | glob_num_words, doc_no = 0, -1 72 | for doc_no, bow in enumerate(corpus): 73 | if doc_no % 10000 == 0: 74 | logger.info("PROGRESS: processing document #%i" % doc_no) 75 | glob_num_words += len(bow) 76 | for term_id, term_count in bow: 77 | glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count 78 | 79 | # keep some stats about the training corpus 80 | self.n_docs = doc_no + 1 81 | self.n_words = glob_num_words 82 | 83 | # and finally compute the global weights 84 | logger.info("calculating global log entropy weights for %i " 85 | "documents and %i features (%i matrix non-zeros)" 86 | % (self.n_docs, len(glob_freq), self.n_words)) 87 | logger.debug('iterating over corpus') 88 | for doc_no2, bow in enumerate(corpus): 89 | for key, freq in bow: 90 | p = (float(freq) / glob_freq[key]) * math.log(float(freq) / 91 | glob_freq[key]) 92 | self.entr[key] = self.entr.get(key, 0.0) + p 93 | if doc_no2 != doc_no: 94 | raise ValueError("LogEntropyModel doesn't support generators as training data") 95 | 96 | logger.debug('iterating over keys') 97 | for key in self.entr: 98 | self.entr[key] = 1 + self.entr[key] / math.log(self.n_docs + 1) 99 | 100 | def __getitem__(self, bow): 101 | """ 102 | Return log entropy representation of the input vector and/or corpus. 
103 | """ 104 | # if the input vector is in fact a corpus, return a transformed corpus 105 | is_corpus, bow = utils.is_corpus(bow) 106 | if is_corpus: 107 | return self._apply(bow) 108 | 109 | # unknown (new) terms will be given zero weight (NOT infinity/huge) 110 | vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id)) 111 | for term_id, tf in bow if term_id in self.entr] 112 | if self.normalize: 113 | vector = matutils.unitvec(vector) 114 | return vector 115 | -------------------------------------------------------------------------------- /docs/src/install.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | ============= 4 | Installation 5 | ============= 6 | 7 | Quick install 8 | -------------- 9 | 10 | Run in your terminal:: 11 | 12 | easy_install -U gensim 13 | 14 | or, alternatively:: 15 | 16 | pip install --upgrade gensim 17 | 18 | In case that fails, make sure you're installing into a writeable location (or use `sudo`), or read on. 19 | 20 | ----- 21 | 22 | Dependencies 23 | ------------- 24 | Gensim is known to run on Linux, Windows and Mac OS X and should run on any other 25 | platform that supports Python 2.6+ and NumPy. Gensim depends on the following software: 26 | 27 | * `Python `_ >= 2.6. Tested with versions 2.6, 2.7, 3.3, 3.4 and 3.5. Support for Python 2.5 was discontinued starting gensim 0.10.0; if you *must* use Python 2.5, install gensim 0.9.1. 28 | * `NumPy `_ >= 1.3. Tested with version 1.9.0, 1.7.1, 1.7.0, 1.6.2, 1.6.1rc2, 1.5.0rc1, 1.4.0, 1.3.0, 1.3.0rc2. 29 | * `SciPy `_ >= 0.7. Tested with version 0.14.0, 0.12.0, 0.11.0, 0.10.1, 0.9.0, 0.8.0, 0.8.0b1, 0.7.1, 0.7.0. 30 | 31 | **Windows users** are well advised to try the `Enthought distribution `_, 32 | which conveniently includes Python & NumPy & SciPy in a single bundle, and is free for academic use. 
33 | 34 | 35 | Install Python and `easy_install` 36 | --------------------------------- 37 | 38 | Check what version of Python you have with:: 39 | 40 | python --version 41 | 42 | You can download Python from http://python.org/download. 43 | 44 | .. note:: Gensim requires Python 2.6 / 3.3 or greater, and will not run under earlier versions. 45 | 46 | Next, install the `easy_install utility `_, 47 | which will make installing other Python programs easier. 48 | 49 | Install SciPy & NumPy 50 | ---------------------- 51 | 52 | These are quite popular Python packages, so chances are there are pre-built binary 53 | distributions available for your platform. You can try installing from source using easy_install:: 54 | 55 | easy_install numpy 56 | easy_install scipy 57 | 58 | If that doesn't work or if you'd rather install using a binary package, consult 59 | http://www.scipy.org/Download. 60 | 61 | Install `gensim` 62 | ----------------- 63 | 64 | You can now install (or upgrade) `gensim` with:: 65 | 66 | easy_install --upgrade gensim 67 | 68 | That's it! Congratulations, you can proceed to the :doc:`tutorials `. 69 | 70 | ----- 71 | 72 | If you also want to run the algorithms over a cluster 73 | of computers, in :doc:`distributed`, you should install with:: 74 | 75 | easy_install gensim[distributed] 76 | 77 | The optional `distributed` feature installs `Pyro (PYthon Remote Objects) `_. 78 | If you don't know what distributed computing means, you can ignore it: 79 | `gensim` will work fine for you anyway. 80 | This optional extension can also be installed separately later with:: 81 | 82 | easy_install Pyro4 83 | 84 | ----- 85 | 86 | There are also alternative routes to install: 87 | 88 | 1. If you have downloaded and unzipped the `tar.gz source `_ 89 | for `gensim` (or you're installing `gensim` from `github `_), 90 | you can run:: 91 | 92 | python setup.py install 93 | 94 | to install `gensim` into your ``site-packages`` folder. 95 | 2. 
If you wish to make local changes to the `gensim` code (`gensim` is, after all, a 96 | package which targets research prototyping and modifications), a preferred 97 | way may be installing with:: 98 | 99 | python setup.py develop 100 | 101 | This will only place a symlink into your ``site-packages`` directory. The actual 102 | files will stay wherever you unpacked them. 103 | 3. If you don't have root priviledges (or just don't want to put the package into 104 | your ``site-packages``), simply unpack the source package somewhere and that's it! No 105 | compilation or installation needed. Just don't forget to set your PYTHONPATH 106 | (or modify ``sys.path``), so that Python can find the unpacked package when importing. 107 | 108 | 109 | Testing `gensim` 110 | ---------------- 111 | 112 | To test the package, unzip the `tar.gz source `_ and run:: 113 | 114 | python setup.py test 115 | 116 | Gensim uses Travis CI for continuous integration: |Travis|_ 117 | 118 | .. |Travis| image:: https://api.travis-ci.org/piskvorky/gensim.png?branch=develop 119 | .. _Travis: https://travis-ci.org/piskvorky/gensim 120 | 121 | 122 | Problems? 123 | --------- 124 | 125 | Use the `gensim discussion group `_ for 126 | questions and troubleshooting. See the :doc:`support page `. 127 | -------------------------------------------------------------------------------- /gensim/scripts/make_wikicorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Copyright (C) 2012 Lars Buitinck 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | 9 | """ 10 | USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] 11 | 12 | Convert articles from a Wikipedia dump to (sparse) vectors. The input is a 13 | bz2-compressed dump of Wikipedia articles, in XML format. 
14 | 15 | This actually creates three files: 16 | 17 | * `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids 18 | * `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in 19 | Matrix Matrix format 20 | * `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation 21 | * `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump 22 | 23 | The output Matrix Market files can then be compressed (e.g., by bzip2) to save 24 | disk space; gensim's corpus iterators can work with compressed input, too. 25 | 26 | `VOCABULARY_SIZE` controls how many of the most frequent words to keep (after 27 | removing tokens that appear in more than 10%% of all documents). Defaults to 28 | 100,000. 29 | 30 | If you have the `pattern` package installed, this script will use a fancy 31 | lemmatization to get a lemma of each token (instead of plain alphabetic 32 | tokenizer). The package is available at https://github.com/clips/pattern . 33 | 34 | Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en 35 | """ 36 | 37 | 38 | import logging 39 | import os.path 40 | import sys 41 | 42 | from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus 43 | from gensim.models import TfidfModel 44 | 45 | 46 | # Wiki is first scanned for all distinct word types (~7M). The types that 47 | # appear in more than 10% of articles are removed and from the rest, the 48 | # DEFAULT_DICT_SIZE most frequent types are kept. 
49 | DEFAULT_DICT_SIZE = 100000 50 | 51 | 52 | if __name__ == '__main__': 53 | program = os.path.basename(sys.argv[0]) 54 | logger = logging.getLogger(program) 55 | 56 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 57 | logging.root.setLevel(level=logging.INFO) 58 | logger.info("running %s" % ' '.join(sys.argv)) 59 | 60 | # check and process input arguments 61 | if len(sys.argv) < 3: 62 | print(globals()['__doc__'] % locals()) 63 | sys.exit(1) 64 | inp, outp = sys.argv[1:3] 65 | if len(sys.argv) > 3: 66 | keep_words = int(sys.argv[3]) 67 | else: 68 | keep_words = DEFAULT_DICT_SIZE 69 | online = 'online' in program 70 | lemmatize = 'lemma' in program 71 | debug = 'nodebug' not in program 72 | 73 | if online: 74 | dictionary = HashDictionary(id_range=keep_words, debug=debug) 75 | dictionary.allow_update = True # start collecting document frequencies 76 | wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) 77 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) 78 | # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` 79 | dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 80 | dictionary.save_as_text(outp + '_wordids.txt.bz2') 81 | wiki.save(outp + '_corpus.pkl.bz2') 82 | dictionary.allow_update = False 83 | else: 84 | wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) 85 | # only keep the most frequent words (out of total ~8.2m unique tokens) 86 | wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) 87 | # save dictionary and bag-of-words (term-document frequency matrix) 88 | MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h 89 | wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') 90 | # load back the id->word mapping directly from file 91 | # this 
seems to save more memory, compared to keeping the wiki.dictionary object from above 92 | dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') 93 | del wiki 94 | 95 | # initialize corpus reader and word->id mapping 96 | mm = MmCorpus(outp + '_bow.mm') 97 | 98 | # build tfidf, ~50min 99 | tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) 100 | tfidf.save(outp + '.tfidf_model') 101 | 102 | # save tfidf vectors in matrix market format 103 | # ~4h; result file is 15GB! bzip2'ed down to 4.5GB 104 | MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) 105 | 106 | logger.info("finished running %s" % program) 107 | -------------------------------------------------------------------------------- /docs/src/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Tutorials 4 | ========= 5 | 6 | 7 | The tutorials are organized as a series of examples that highlight various features 8 | of `gensim`. It is assumed that the reader is familiar with the `Python language `_, has :doc:`installed gensim ` 9 | and read the :doc:`introduction `. 10 | 11 | The examples are divided into parts on: 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | tut1 17 | tut2 18 | tut3 19 | wiki 20 | distributed 21 | 22 | Preliminaries 23 | -------------- 24 | 25 | All the examples can be directly copied to your Python interpreter shell. `IPython `_'s ``cpaste`` command is especially handy for copypasting code fragments, including the leading ``>>>`` characters. 26 | 27 | Gensim uses Python's standard :mod:`logging` module to log various stuff at various 28 | priority levels; to activate logging (this is optional), run 29 | 30 | >>> import logging 31 | >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 32 | 33 | 34 | .. 
_first-example: 35 | 36 | Quick Example 37 | ------------- 38 | 39 | First, let's import gensim and create a small corpus of nine documents and twelve features [1]_: 40 | 41 | >>> from gensim import corpora, models, similarities 42 | >>> 43 | >>> corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)], 44 | >>> [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)], 45 | >>> [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)], 46 | >>> [(0, 1.0), (4, 2.0), (7, 1.0)], 47 | >>> [(3, 1.0), (5, 1.0), (6, 1.0)], 48 | >>> [(9, 1.0)], 49 | >>> [(9, 1.0), (10, 1.0)], 50 | >>> [(9, 1.0), (10, 1.0), (11, 1.0)], 51 | >>> [(8, 1.0), (10, 1.0), (11, 1.0)]] 52 | 53 | :dfn:`Corpus` is simply an object which, when iterated over, returns its documents represented 54 | as sparse vectors. If you're not familiar with the `vector space model `_, we'll bridge the gap between **raw strings**, **corpora** and **sparse vectors** in the next tutorial on :doc:`tut1`. 55 | 56 | If you're familiar with the vector space model, you'll probably know that the way you parse your documents and convert them to vectors 57 | has major impact on the quality of any subsequent applications. 58 | 59 | .. note:: 60 | In this example, the whole corpus is stored in memory, as a Python list. However, 61 | the corpus interface only dictates that a corpus must support iteration over its 62 | constituent documents. For very large corpora, it is advantageous to keep the 63 | corpus on disk, and access its documents sequentially, one at a time. All the 64 | operations and transformations are implemented in such a way that makes 65 | them independent of the size of the corpus, memory-wise. 
66 | 67 | 68 | Next, let's initialize a :dfn:`transformation`: 69 | 70 | >>> tfidf = models.TfidfModel(corpus) 71 | 72 | A transformation is used to convert documents from one vector representation into another: 73 | 74 | >>> vec = [(0, 1), (4, 1)] 75 | >>> print(tfidf[vec]) 76 | [(0, 0.8075244), (4, 0.5898342)] 77 | 78 | Here, we used `Tf-Idf `_, a simple 79 | transformation which takes documents represented as bag-of-words counts and applies 80 | a weighting which discounts common terms (or, equivalently, promotes rare terms). 81 | It also scales the resulting vector to unit length (in the `Euclidean norm `_). 82 | 83 | Transformations are covered in detail in the tutorial on :doc:`tut2`. 84 | 85 | To transform the whole corpus via TfIdf and index it, in preparation for similarity queries: 86 | 87 | >>> index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12) 88 | 89 | and to query the similarity of our query vector ``vec`` against every document in the corpus: 90 | 91 | >>> sims = index[tfidf[vec]] 92 | >>> print(list(enumerate(sims))) 93 | [(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)] 94 | 95 | How to read this output? Document number zero (the first document) has a similarity score of 0.466=46.6\%, 96 | the second document has a similarity score of 19.1\% etc. 97 | 98 | Thus, according to TfIdf document representation and cosine similarity measure, 99 | the most similar to our query document `vec` is document no. 3, with a similarity score of 82.1%. 100 | Note that in the TfIdf representation, any documents which do not share any common features 101 | with ``vec`` at all (documents no. 4--8) get a similarity score of 0.0. See the :doc:`tut3` tutorial for more detail. 102 | 103 | ------ 104 | 105 | .. [1] This is the same corpus as used in 106 | `Deerwester et al. (1990): Indexing by Latent Semantic Analysis `_, Table 2. 
class BleiCorpus(IndexedCorpus):
    """
    Corpus in Blei's LDA-C format.

    The corpus is represented as two files: one describing the documents, and another
    describing the mapping between words and their ids.

    Each document is one line::

        N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN

    where ``N`` is the number of ``id:value`` pairs that follow on the line.

    The vocabulary is a file with words, one word per line; word at line K has an
    implicit ``id=K``.
    """

    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname_vocab` is the file with vocabulary; if not specified, several
        conventional locations are tried in turn (``fname.vocab``, a ``vocab.txt``
        next to `fname`, ...) and the first existing file is used.

        Raises `IOError` if no vocabulary file can be found.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                    utils.smart_extension(fname, '.vocab'),
                    utils.smart_extension(fname, '/vocab.txt'),
                    utils.smart_extension(fname_base, '.vocab'),
                    utils.smart_extension(fname_dir, '/vocab.txt'),
                    ]:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop fell through without `break` => none of the candidates exist
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))

    def __iter__(self):
        """
        Iterate over the corpus, returning one sparse vector at a time.
        """
        lineno = -1  # so that an empty file yields length 0 below
        with utils.smart_open(self.fname) as fin:
            for lineno, line in enumerate(fin):
                yield self.line2doc(line)
        self.length = lineno + 1

    def line2doc(self, line):
        """
        Parse one LDA-C line into a sparse document: a list of `(word_id, weight)` tuples.

        Raises `ValueError` if the leading count does not match the number of pairs.
        """
        parts = utils.to_unicode(line).split()
        if int(parts[0]) != len(parts) - 1:
            raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
        doc = [part.rsplit(':', 1) for part in parts[1:]]
        doc = [(int(p1), float(p2)) for p1, p2 in doc]
        return doc

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        Returns the list of byte offsets of the stored documents (used for indexing).

        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            # `dict.keys()` is a view object in Python 3 and cannot be concatenated
            # to a list directly; materialize it first
            num_terms = 1 + max([-1] + list(id2word.keys()))

        logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                # write len(parts), not len(doc): near-zero entries were dropped
                # above, and the leading count must equal the number of id:value
                # pairs actually present (line2doc rejects the line otherwise)
                fout.write(utils.to_utf8("%i %s\n" % (len(parts), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass BleiCorpus
def remove_stopwords(s):
    """Return `s` with every word contained in ``STOPWORDS`` removed."""
    tokens = utils.to_unicode(s).split()
    return " ".join(token for token in tokens if token not in STOPWORDS)


RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)


def strip_punctuation(s):
    """Replace each run of punctuation characters in `s` with a single space."""
    return RE_PUNCT.sub(" ", utils.to_unicode(s))


# unicode.translate cannot delete characters the way str.translate can,
# so the translate-based variant is simply an alias for the regex version
strip_punctuation2 = strip_punctuation


RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE)


def strip_tags(s):
    """Remove HTML/XML-style ``<...>`` tags from `s`."""
    return RE_TAGS.sub("", utils.to_unicode(s))


def strip_short(s, minsize=3):
    """Drop all whitespace-delimited tokens of `s` shorter than `minsize` characters."""
    words = utils.to_unicode(s).split()
    return " ".join(word for word in words if len(word) >= minsize)


RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)


def strip_numeric(s):
    """Delete every run of digits from `s`."""
    return RE_NUMERIC.sub("", utils.to_unicode(s))


RE_NONALPHA = re.compile(r"\W", re.UNICODE)


def strip_non_alphanum(s):
    """Replace each non-alphanumeric character of `s` with a space."""
    return RE_NONALPHA.sub(" ", utils.to_unicode(s))


RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)


def strip_multiple_whitespaces(s):
    """Collapse each run of whitespace characters in `s` into a single space."""
    return RE_WHITESPACE.sub(" ", utils.to_unicode(s))


RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE)
RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE)


def split_alphanum(s):
    """Insert a space between adjacent letter/digit and digit/letter groups in `s`."""
    text = RE_AL_NUM.sub(r"\1 \2", utils.to_unicode(s))
    return RE_NUM_AL.sub(r"\1 \2", text)


def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in utils.to_unicode(text).split())

stem = stem_text


# default preprocessing pipeline, applied in order by preprocess_string()
DEFAULT_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces,
                   strip_numeric, remove_stopwords, strip_short, stem_text]


def preprocess_string(s, filters=DEFAULT_FILTERS):
    """Run `s` through every filter in `filters` (in order), then split on whitespace."""
    processed = utils.to_unicode(s)
    for apply_filter in filters:
        processed = apply_filter(processed)
    return processed.split()


def preprocess_documents(docs):
    """Apply :func:`preprocess_string` to each document in `docs`; return a list of token lists."""
    return [preprocess_string(document) for document in docs]


def read_file(path):
    """Return the entire contents of the file at `path`."""
    with utils.smart_open(path) as fin:
        return fin.read()


def read_files(pattern):
    """Read every file matching the glob `pattern`; return their contents as a list."""
    return [read_file(fname) for fname in glob.glob(pattern)]
class SvmLightCorpus(IndexedCorpus):
    """
    Corpus in SVMlight format.

    Quoting http://svmlight.joachims.org/:
    The input file contains the training examples. The first lines
    may contain comments and are ignored if they start with #. Each of the following
    lines represents one training example and is of the following format::

        <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info>
        <target> .=. +1 | -1 | 0 | <float>
        <feature> .=. <integer> | "qid"
        <value> .=. <float>
        <info> .=. <string>

    The "qid" feature (used for SVMlight ranking), if present, is ignored.

    Although not mentioned in the specification above, SVMlight also expect its
    feature ids to be 1-based (counting starts at 1). We convert features to 0-base
    internally by decrementing all ids when loading a SVMlight input file, and
    increment them again when saving as SVMlight.

    """

    def __init__(self, fname, store_labels=True):
        """
        Initialize the corpus from a file.

        Although vector labels (~SVM target class) are not used in gensim in any way,
        they are parsed and stored in `self.labels` for convenience. Set `store_labels=False`
        to skip storing these labels (e.g. if there are too many vectors to store
        the self.labels array in memory).

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        self.fname = fname  # input file, see class doc for format
        self.length = None  # number of documents; filled in lazily by __iter__
        self.store_labels = store_labels
        self.labels = []

    def __iter__(self):
        """
        Iterate over the corpus, returning one sparse vector at a time.
        """
        lineno = -1  # so that an empty file yields length 0 below
        self.labels = []  # reset: labels are re-collected on every pass
        with utils.smart_open(self.fname) as fin:
            for lineno, line in enumerate(fin):
                doc = self.line2doc(line)
                if doc is not None:  # skip comment/empty lines
                    if self.store_labels:
                        self.labels.append(doc[1])
                    yield doc[0]
        self.length = lineno + 1

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
        """
        Save a corpus in the SVMlight format.

        The SVMlight `<target>` class tag is taken from the `labels` array, or set
        to 0 for all documents if `labels` is not supplied.

        Returns the list of byte offsets of the stored documents (used for indexing).

        This function is automatically called by `SvmLightCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        logger.info("converting corpus to SVMlight format: %s" % fname)

        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for docno, doc in enumerate(corpus):
                label = labels[docno] if labels else 0  # target class is 0 by default
                offsets.append(fout.tell())
                fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())[0]

    def line2doc(self, line):
        """
        Create a document from a single line (string) in SVMlight format.

        Return a `(doc, target)` 2-tuple, or None for comment/empty lines.
        """
        line = utils.to_unicode(line)
        # strip an optional trailing '# comment'. NB: the previous
        # `line[: line.find('#')]` silently chopped off the last character of any
        # line *without* a '#' (find() returns -1), corrupting a final line that
        # lacks a trailing newline; split() handles both cases correctly.
        line = line.split('#', 1)[0].strip()
        if not line:
            return None  # ignore comments and empty lines
        parts = line.split()
        if not parts:
            raise ValueError('invalid line format in %s' % self.fname)
        target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
        # ignore 'qid' features, convert 1-based feature ids to 0-based
        doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid']
        return doc, target

    @staticmethod
    def doc2line(doc, label=0):
        """
        Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
        """
        pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc)  # +1 to convert 0-base to 1-base
        return "%s %s\n" % (label, pairs)

# endclass SvmLightCorpus
9 | 10 | Example: ./gensim_xml.py eng lsi 11 | """ 12 | 13 | 14 | import logging 15 | import sys 16 | import os.path 17 | import re 18 | 19 | 20 | from gensim.corpora import sources, dmlcorpus, MmCorpus 21 | from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity 22 | 23 | import gensim_build 24 | 25 | 26 | # set to True to do everything EXCEPT actually writing out similar.xml files to disk. 27 | # similar.xml files are NOT written if DRY_RUN is true. 28 | DRY_RUN = False 29 | 30 | # how many 'most similar' documents to store in each similar.xml? 31 | MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored) 32 | MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit). 33 | 34 | # if there are no similar articles (after the pruning), do we still want to generate similar.xml? 35 | SAVE_EMPTY = True 36 | 37 | # xml template for similar articles 38 | ARTICLE = """ 39 |
40 | 41 | %(author)s 42 | 43 | %(title)s 44 | %(suffix)s 45 | 46 | 47 | 48 |
""" 49 | 50 | # template for the whole similar.xml file (will be filled with multiple ARTICLE instances) 51 | SIMILAR = """\ 52 | 53 | %s 54 | 55 | """ 56 | 57 | 58 | 59 | def generateSimilar(corpus, index, method): 60 | for docNo, topSims in enumerate(index): # for each document 61 | # store similarities to the following file 62 | outfile = os.path.join(corpus.articleDir(docNo), 'similar_%s.xml' % method) 63 | 64 | articles = [] # collect similars in this list 65 | for docNo2, score in topSims: # for each most similar article 66 | if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) 67 | source, (intId, pathId) = corpus.documents[docNo2] 68 | meta = corpus.getMeta(docNo2) 69 | suffix, author, title = '', meta.get('author', ''), meta.get('title', '') 70 | articles.append(ARTICLE % locals()) # add the similar article to output 71 | if len(articles) >= MAX_SIMILAR: 72 | break 73 | 74 | # now `articles` holds multiple strings in similar_*.xml format 75 | if SAVE_EMPTY or articles: 76 | output = ''.join(articles) # concat all similars to one string 77 | if not DRY_RUN: # only open output files for writing if DRY_RUN is false 78 | logging.info("generating %s (%i similars)" % (outfile, len(articles))) 79 | outfile = open(outfile, 'w') 80 | outfile.write(SIMILAR % output) # add xml headers and print to file 81 | outfile.close() 82 | else: 83 | logging.info("would be generating %s (%i similars):%s\n" % (outfile, len(articles), output)) 84 | else: 85 | logging.debug("skipping %s (no similar found)" % outfile) 86 | 87 | 88 | 89 | if __name__ == '__main__': 90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') 91 | logging.root.setLevel(level=logging.INFO) 92 | logging.info("running %s" % ' '.join(sys.argv)) 93 | 94 | program = os.path.basename(sys.argv[0]) 95 | 96 | # check and process input arguments 97 | if len(sys.argv) < 3: 98 | print(globals()['__doc__'] % locals()) 99 | 
sys.exit(1) 100 | language = sys.argv[1] 101 | method = sys.argv[2].strip().lower() 102 | 103 | logging.info("loading corpus mappings") 104 | config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), 105 | resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) 106 | 107 | logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) 108 | id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) 109 | logging.info("loaded %i word ids" % len(id2word)) 110 | 111 | corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) 112 | input = MmCorpus(config.resultFile('_%s.mm' % method)) 113 | assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) 114 | 115 | # initialize structure for similarity queries 116 | if method == 'lsi' or method == 'rp': # for these methods, use dense vectors 117 | index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms) 118 | else: 119 | index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1) 120 | 121 | index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) 122 | generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format 123 | 124 | logging.info("finished running %s" % program) 125 | 126 | -------------------------------------------------------------------------------- /docs/src/dist_lda.rst: -------------------------------------------------------------------------------- 1 | .. _dist_lda: 2 | 3 | Distributed Latent Dirichlet Allocation 4 | ============================================ 5 | 6 | 7 | .. note:: 8 | See :doc:`distributed` for an introduction to distributed computing in `gensim`. 
9 | 10 | 11 | Setting up the cluster 12 | _______________________ 13 | 14 | See the tutorial on :doc:`dist_lsi`; setting up a cluster for LDA is completely 15 | analogous, except you want to run `lda_worker` and `lda_dispatcher` scripts instead 16 | of `lsi_worker` and `lsi_dispatcher`. 17 | 18 | Running LDA 19 | ____________ 20 | 21 | Run LDA like you normally would, but turn on the `distributed=True` constructor 22 | parameter:: 23 | 24 | >>> # extract 100 LDA topics, using default parameters 25 | >>> lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, distributed=True) 26 | using distributed version with 4 workers 27 | running online LDA training, 100 topics, 1 passes over the supplied corpus of 3199665 documets, updating model once every 40000 documents 28 | .. 29 | 30 | 31 | In serial mode (no distribution), creating this online LDA :doc:`model of Wikipedia ` 32 | takes 10h56m on my laptop (OS X, C2D 2.53GHz, 4GB RAM with `libVec`). 33 | In distributed mode with four workers (Linux, Xeons of 2Ghz, 4GB RAM 34 | with `ATLAS `_), the wallclock time taken drops to 3h20m. 35 | 36 | To run standard batch LDA (no online updates of mini-batches) instead, you would similarly 37 | call:: 38 | 39 | >>> lda = LdaModel(corpus=mm, id2word=id2token, num_topics=100, update_every=0, passes=20, distributed=True) 40 | using distributed version with 4 workers 41 | running batch LDA training, 100 topics, 20 passes over the supplied corpus of 3199665 documets, updating model once every 3199665 documents 42 | initializing workers 43 | iteration 0, dispatching documents up to #10000/3199665 44 | iteration 0, dispatching documents up to #20000/3199665 45 | ... 
46 | 47 | and then, some two days later:: 48 | 49 | iteration 19, dispatching documents up to #3190000/3199665 50 | iteration 19, dispatching documents up to #3199665/3199665 51 | reached the end of input; now waiting for all remaining jobs to finish 52 | 53 | :: 54 | 55 | >>> lda.print_topics(20) 56 | topic #0: 0.007*disease + 0.006*medical + 0.005*treatment + 0.005*cells + 0.005*cell + 0.005*cancer + 0.005*health + 0.005*blood + 0.004*patients + 0.004*drug 57 | topic #1: 0.024*king + 0.013*ii + 0.013*prince + 0.013*emperor + 0.008*duke + 0.008*empire + 0.007*son + 0.007*china + 0.007*dynasty + 0.007*iii 58 | topic #2: 0.031*film + 0.017*films + 0.005*movie + 0.005*directed + 0.004*man + 0.004*episode + 0.003*character + 0.003*cast + 0.003*father + 0.003*mother 59 | topic #3: 0.022*user + 0.012*edit + 0.009*wikipedia + 0.007*block + 0.007*my + 0.007*here + 0.007*edits + 0.007*blocked + 0.006*revert + 0.006*me 60 | topic #4: 0.045*air + 0.026*aircraft + 0.021*force + 0.018*airport + 0.011*squadron + 0.010*flight + 0.010*military + 0.008*wing + 0.007*aviation + 0.007*f 61 | topic #5: 0.025*sun + 0.022*star + 0.018*moon + 0.015*light + 0.013*stars + 0.012*planet + 0.011*camera + 0.010*mm + 0.009*earth + 0.008*lens 62 | topic #6: 0.037*radio + 0.026*station + 0.022*fm + 0.014*news + 0.014*stations + 0.014*channel + 0.013*am + 0.013*racing + 0.011*tv + 0.010*broadcasting 63 | topic #7: 0.122*image + 0.099*jpg + 0.046*file + 0.038*uploaded + 0.024*png + 0.014*contribs + 0.013*notify + 0.013*logs + 0.013*picture + 0.013*flag 64 | topic #8: 0.036*russian + 0.030*soviet + 0.028*polish + 0.024*poland + 0.022*russia + 0.013*union + 0.012*czech + 0.011*republic + 0.011*moscow + 0.010*finland 65 | topic #9: 0.031*language + 0.014*word + 0.013*languages + 0.009*term + 0.009*words + 0.008*example + 0.007*names + 0.007*meaning + 0.006*latin + 0.006*form 66 | topic #10: 0.029*w + 0.029*toronto + 0.023*l + 0.020*hockey + 0.019*nhl + 0.014*ontario + 0.012*calgary + 0.011*edmonton + 
0.011*hamilton + 0.010*season 67 | topic #11: 0.110*wikipedia + 0.110*articles + 0.030*library + 0.029*wikiproject + 0.028*project + 0.019*data + 0.016*archives + 0.012*needing + 0.009*reference + 0.009*statements 68 | topic #12: 0.032*http + 0.030*your + 0.022*request + 0.017*sources + 0.016*archived + 0.016*modify + 0.015*changes + 0.015*creation + 0.014*www + 0.013*try 69 | topic #13: 0.011*your + 0.010*my + 0.009*we + 0.008*don + 0.008*get + 0.008*know + 0.007*me + 0.006*think + 0.006*question + 0.005*find 70 | topic #14: 0.073*r + 0.066*japanese + 0.062*japan + 0.018*tokyo + 0.008*prefecture + 0.005*osaka + 0.004*j + 0.004*sf + 0.003*kyoto + 0.003*manga 71 | topic #15: 0.045*da + 0.045*fr + 0.027*kategori + 0.026*pl + 0.024*nl + 0.021*pt + 0.017*en + 0.015*categoria + 0.014*es + 0.012*kategorie 72 | topic #16: 0.010*death + 0.005*died + 0.005*father + 0.004*said + 0.004*himself + 0.004*took + 0.004*son + 0.004*killed + 0.003*murder + 0.003*wife 73 | topic #17: 0.027*book + 0.021*published + 0.020*books + 0.014*isbn + 0.010*author + 0.010*magazine + 0.009*press + 0.009*novel + 0.009*writers + 0.008*story 74 | topic #18: 0.027*football + 0.024*players + 0.023*cup + 0.019*club + 0.017*fc + 0.017*footballers + 0.017*league + 0.011*season + 0.007*teams + 0.007*goals 75 | topic #19: 0.032*band + 0.024*album + 0.014*albums + 0.013*guitar + 0.013*rock + 0.011*records + 0.011*vocals + 0.009*live + 0.008*bass + 0.008*track 76 | 77 | 78 | 79 | If you used the distributed LDA implementation in `gensim`, please let me know (my 80 | email is at the bottom of this page). I would like to hear about your application and 81 | the possible (inevitable?) issues that you encountered, to improve `gensim` in the future. 
class TestLdaMallet(unittest.TestCase):
    def setUp(self):
        # corpus fixture + location of the Mallet binary; Mallet is an optional
        # external dependency, so each test silently no-ops when MALLET_HOME is unset
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        mallet_home = os.environ.get('MALLET_HOME', None)
        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None

    def testTransform(self):
        """Train LDA via Mallet and check the inferred topic mixture of one document."""
        if not self.mallet_path:
            return
        passed = False
        for i in range(5):  # restart at most 5 times
            # create the transformation model
            model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
            expected = [0.49, 0.51]
            passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2)  # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                            (i, sorted(vec), sorted(expected)))
        self.assertTrue(passed)

    def testPersistence(self):
        """Save/load round-trip must preserve topics and word-topic counts."""
        if not self.mallet_path:
            return
        fname = testfile()
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
        model.save(fname)
        model2 = ldamallet.LdaMallet.load(fname)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testPersistenceCompressed(self):
        """Same round-trip as testPersistence, but through a gzip-compressed file."""
        if not self.mallet_path:
            return
        fname = testfile() + '.gz'
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
        model.save(fname)
        model2 = ldamallet.LdaMallet.load(fname, mmap=None)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testLargeMmap(self):
        """Large arrays stored separately must load back as numpy memmaps."""
        if not self.mallet_path:
            return
        # use the local `fname` consistently (the original assigned it but then
        # kept calling testfile() again, leaving `fname` unused)
        fname = testfile()
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        model2 = ldamodel.LdaModel.load(fname, mmap='r')
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(isinstance(model2.wordtopics, numpy.memmap))
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testLargeMmapCompressed(self):
        """mmap'ing a compressed file is impossible and must raise IOError."""
        if not self.mallet_path:
            return
        fname = testfile() + '.gz'
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r')
#endclass TestLdaMallet