├── unit_tests ├── __init__.py ├── tests_lsa.py ├── tests_split.py ├── tests_model.py ├── tests_beagleenvironment.py ├── tests_tfviewer.py ├── tests_tfidfviewer.py ├── tests_lsaviewer.py ├── tests_beaglecomposite.py ├── tests_spatial.py ├── tests_beagleviewer.py ├── tests_lda.py ├── tests_tfidf.py ├── tests_tf.py ├── tests_beaglecontext.py ├── tests_viewer_wrappers.py ├── tests_labeleddata.py ├── tests_corpus_util.py ├── tests_ldacgsviewer.py ├── tests_structarr.py ├── tests_beagleorder.py └── tests_corpus.py ├── functional_tests └── tests_tf.py ├── vsm ├── extensions │ ├── __init__.py │ ├── interop │ │ ├── __init__.py │ │ ├── mallet.py │ │ ├── weka.py │ │ └── ldac.py │ ├── clustering │ │ ├── __init__.py │ │ ├── plotting.py │ │ └── manifold.py │ ├── corpusbuilders │ │ ├── __init__.py │ │ └── corpusstreamers.py │ ├── inpho.py │ ├── testdata │ │ └── history_greek_philosophy │ │ │ ├── doc_meta.json │ │ │ ├── __init__.py │ │ │ ├── frontmatter.json │ │ │ └── chapter6.json │ ├── lda_py4j │ │ ├── README.txt │ │ └── org │ │ │ └── knowceans │ │ │ └── gibbstest │ │ │ ├── LDA.java │ │ │ ├── FileReadWrite.py │ │ │ ├── FileArrayProvider.java │ │ │ └── LdaRoutine.py │ ├── editions.py │ ├── mahout │ │ ├── README.txt │ │ └── mahout.py │ ├── trans.py │ └── multi_k.py ├── corpus │ ├── util │ │ ├── __init__.py │ │ └── corpusbuilders.py │ └── __init__.py ├── model │ ├── ldagibbs.py │ ├── __init__.py │ ├── beagleenvironment.py │ ├── beaglecomposite.py │ ├── base.py │ ├── ldaexact.py │ ├── tfidf.py │ ├── lda.py │ ├── ldacgs.py │ ├── _cgs_update.pyx │ ├── lsa.py │ ├── tf.py │ └── beaglecontext.py ├── viewer │ ├── ldagibbsviewer.py │ ├── __init__.py │ ├── types.py │ └── beagleviewer.py ├── exceptions │ ├── __init__.py │ └── compatibility.py ├── __init__.py └── split.py ├── MANIFEST.in ├── doc ├── source │ ├── vsm.corpus.rst │ ├── index.rst │ ├── vsm.viewer.rst │ ├── vsm.model.Lsa.rst │ ├── vsm.model.TfSeq.rst │ ├── vsm.model.TfIdf.rst │ ├── vsm.model.LdaCgsSeq.rst │ ├── vsm.viewer.BeagleViewer.rst │ ├── vsm.model.LdaCgsMulti.rst │ ├── vsm.model.TfMulti.rst │ ├── vsm.model.rst │ ├── vsm.model.BeagleOrderSeq.rst │ ├── vsm.model.BeagleContextSeq.rst │ ├── vsm.model.BeagleComposite.rst │ ├── vsm.model.BeagleOrderMulti.rst │ ├── vsm.model.BeagleEnvironment.rst │ ├── vsm.model.BeagleContextMulti.rst │ ├── vsm.rst │ ├── vsm.viewer.LsaViewer.rst │ ├── vsm.viewer.TfIdfViewer.rst │ ├── vsm.viewer.TfViewer.rst │ └── vsm.viewer.LdaCgsViewer.rst ├── README └── Makefile ├── register.py ├── requirements.txt ├── .gitignore ├── coverage.sh ├── CHANGELOG.md ├── roadmap.txt ├── LICENSE.txt ├── README.md ├── .travis.yml ├── appveyor.yml └── setup.py /unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /functional_tests/tests_tf.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vsm/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vsm/extensions/interop/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include 
vsm/model/_cgs_update.pyx 2 | -------------------------------------------------------------------------------- /vsm/corpus/util/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from vsm.extensions.corpusbuilders import * 5 | -------------------------------------------------------------------------------- /vsm/corpus/util/corpusbuilders.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from vsm.extensions.corpusbuilders import * 5 | -------------------------------------------------------------------------------- /vsm/model/ldagibbs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from ldacgsseq import LdaCgsSeq as LDAGibbs 5 | 6 | 7 | __all__ = [ 'LDAGibbs' ] 8 | -------------------------------------------------------------------------------- /vsm/viewer/ldagibbsviewer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from ldacgsviewer import LdaCgsViewer as LDAGibbsViewer 5 | 6 | 7 | __all__ = [ 'LDAGibbsViewer' ] 8 | -------------------------------------------------------------------------------- /vsm/extensions/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [Documentation concerning the clustering extension] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from .manifold import * 7 | -------------------------------------------------------------------------------- /vsm/extensions/corpusbuilders/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [Documentation about the corpusbuilders extension] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from .corpusbuilders import * 7 | -------------------------------------------------------------------------------- /vsm/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General documentation about the :mod:`corpus` submodule] 3 | """ 4 | 5 | 6 | from vsm.corpus.base import * 7 | import vsm.corpus.base 8 | 9 | 10 | __all__ = base.__all__[:] 11 | -------------------------------------------------------------------------------- /vsm/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General documentation about the :mod:`exceptions` submodule] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from . import compatibility 7 | from .compatibility import * 8 | 9 | __all__ = compatibility.__all__ 10 | -------------------------------------------------------------------------------- /doc/source/vsm.corpus.rst: -------------------------------------------------------------------------------- 1 | vsm.corpus 2 | ========== 3 | 4 | .. automodule:: vsm.corpus 5 | 6 | 7 | 8 | 9 | .. rubric:: Classes 10 | 11 | .. autosummary:: 12 | 13 | Corpus 14 | 15 | 16 | .. 
autoclass:: Corpus 17 | :members: 18 | 19 | -------------------------------------------------------------------------------- /register.py: -------------------------------------------------------------------------------- 1 | import pypandoc 2 | import os 3 | import sys 4 | 5 | pypandoc.convert('README.md', 'rst', outputfile='README.txt') 6 | if sys.argv[-1] == 'test': 7 | os.system("python setup.py register -r pypitest") 8 | else: 9 | os.system("python setup.py register") 10 | os.remove('README.txt') 11 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vsm documentation! 2 | ============================= 3 | 4 | .. autosummary:: 5 | :toctree: 6 | 7 | vsm 8 | vsm.corpus 9 | vsm.model 10 | vsm.viewer 11 | 12 | Indices and tables 13 | ================== 14 | 15 | * :ref:`genindex` 16 | * :ref:`modindex` 17 | * :ref:`search` 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==3.0.4 2 | future==1.0.0 3 | matplotlib>=2.0.2 4 | nltk>=3.0.0,<4.0.0 5 | numpy>=1.12.1,<2.0.0 6 | progressbar2>=3.35.2 7 | py4j==0.10.6 8 | scikit_learn>=0.19.1 9 | scipy>=0.19.0 10 | sortedcontainers>=1.5.7 11 | translate==3.5.0 12 | Unidecode==1.0.22 13 | 14 | backports.tempfile==1.0; python_version=='2.7' 15 | numpy>=1.14.3; python_version=='3.7' 16 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer 2 | ========== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. automodule:: vsm.viewer 7 | 8 | 9 | 10 | .. rubric:: Classes 11 | 12 | .. autosummary:: 13 | :toctree: 14 | 15 | BeagleViewer 16 | LdaCgsViewer 17 | LsaViewer 18 | TfIdfViewer 19 | TfViewer 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /doc/source/vsm.model.Lsa.rst: -------------------------------------------------------------------------------- 1 | vsm.model.Lsa 2 | ============= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: Lsa 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~Lsa.__init__ 12 | ~Lsa.load 13 | ~Lsa.save 14 | ~Lsa.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/source/vsm.model.TfSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.TfSeq 2 | =============== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: TfSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~TfSeq.__init__ 12 | ~TfSeq.load 13 | ~TfSeq.save 14 | ~TfSeq.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | -------------------------------------------------------------------------------- /doc/source/vsm.model.TfIdf.rst: -------------------------------------------------------------------------------- 1 | vsm.model.TfIdf 2 | =============== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: TfIdf 7 | 8 | .. rubric:: Methods 9 | 10 | .. 
autosummary:: 11 | ~TfIdf.__init__ 12 | ~TfIdf.load 13 | ~TfIdf.save 14 | ~TfIdf.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/source/vsm.model.LdaCgsSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.LdaCgsSeq 2 | =================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: LdaCgsSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~LdaCgsSeq.__init__ 12 | ~LdaCgsSeq.load 13 | ~LdaCgsSeq.save 14 | ~LdaCgsSeq.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.so 3 | *.c 4 | *.cpp 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | .eggs 10 | dist 11 | build 12 | eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | 20 | # Installer logs 21 | pip-log.txt 22 | 23 | # Unit test / coverage reports 24 | .coverage 25 | .tox 26 | 27 | #Translations 28 | *.mo 29 | 30 | #Mr Developer 31 | .mr.developer.cfg 32 | 33 | 34 | # API keys 35 | .travis.key 36 | .appveyor.key 37 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.BeagleViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.BeagleViewer 2 | ======================= 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: BeagleViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~BeagleViewer.__init__ 13 | ~BeagleViewer.dismat_word 14 | ~BeagleViewer.dist_word_word 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: dismat_word 19 | .. automethod:: dist_word_word 20 | -------------------------------------------------------------------------------- /doc/source/vsm.model.LdaCgsMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.LdaCgsMulti 2 | ===================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: LdaCgsMulti 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~LdaCgsMulti.__init__ 12 | ~LdaCgsMulti.load 13 | ~LdaCgsMulti.save 14 | ~LdaCgsMulti.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/source/vsm.model.TfMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.TfMulti 2 | ================= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: TfMulti 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~TfMulti.__init__ 12 | ~TfMulti.load 13 | ~TfMulti.save 14 | ~TfMulti.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.rst: -------------------------------------------------------------------------------- 1 | vsm.model 2 | ========= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. automodule:: vsm.model 7 | 8 | .. 
rubric:: Classes 9 | 10 | .. autosummary:: 11 | :toctree: 12 | 13 | BeagleComposite 14 | BeagleContextMulti 15 | BeagleContextSeq 16 | BeagleEnvironment 17 | BeagleOrderMulti 18 | BeagleOrderSeq 19 | LdaCgsSeq 20 | LdaCgsMulti 21 | Lsa 22 | TfIdf 23 | TfMulti 24 | TfSeq 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleOrderSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleOrderSeq 2 | ======================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleOrderSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleOrderSeq.__init__ 12 | ~BeagleOrderSeq.load 13 | ~BeagleOrderSeq.save 14 | ~BeagleOrderSeq.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleContextSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleContextSeq 2 | ========================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleContextSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleContextSeq.__init__ 12 | ~BeagleContextSeq.load 13 | ~BeagleContextSeq.save 14 | ~BeagleContextSeq.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleComposite.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleComposite 2 | ========================= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleComposite 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | BeagleComposite.__init__ 12 | BeagleComposite.load 13 | BeagleComposite.save 14 | BeagleComposite.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleOrderMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleOrderMulti 2 | ========================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleOrderMulti 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleOrderMulti.__init__ 12 | ~BeagleOrderMulti.load 13 | ~BeagleOrderMulti.save 14 | ~BeagleOrderMulti.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleEnvironment.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleEnvironment 2 | =========================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleEnvironment 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleEnvironment.__init__ 12 | ~BeagleEnvironment.load 13 | ~BeagleEnvironment.save 14 | ~BeagleEnvironment.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. 
automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleContextMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleContextMulti 2 | ============================ 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleContextMulti 7 | 8 | 9 | .. rubric:: Methods 10 | 11 | .. autosummary:: 12 | ~BeagleContextMulti.__init__ 13 | ~BeagleContextMulti.load 14 | ~BeagleContextMulti.save 15 | ~BeagleContextMulti.train 16 | 17 | 18 | .. automethod:: __init__ 19 | .. automethod:: load 20 | .. automethod:: save 21 | .. automethod:: train 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /vsm/viewer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General documentation about the :mod:`viewer` submodule] 3 | """ 4 | 5 | 6 | from . import beagleviewer 7 | from .beagleviewer import * 8 | from . import ldacgsviewer 9 | from .ldacgsviewer import * 10 | from . import lsaviewer 11 | from .lsaviewer import * 12 | from . import tfviewer 13 | from .tfviewer import * 14 | from . import tfidfviewer 15 | from .tfidfviewer import * 16 | 17 | __all__ = beagleviewer.__all__[:] 18 | __all__ += ldacgsviewer.__all__ 19 | __all__ += lsaviewer.__all__ 20 | __all__ += tfviewer.__all__ 21 | __all__ += tfidfviewer.__all__ 22 | 23 | -------------------------------------------------------------------------------- /doc/source/vsm.rst: -------------------------------------------------------------------------------- 1 | vsm 2 | === 3 | 4 | .. automodule:: vsm 5 | 6 | 7 | 8 | .. rubric:: Classes 9 | 10 | .. autosummary:: 11 | 12 | BeagleComposite 13 | BeagleContextMulti 14 | BeagleContextSeq 15 | BeagleEnvironment 16 | BeagleOrderMulti 17 | BeagleOrderSeq 18 | BeagleViewer 19 | Corpus 20 | LdaCgsSeq 21 | LdaCgsMulti 22 | LdaCgsViewer 23 | Lsa 24 | LsaViewer 25 | TfIdf 26 | TfIdfViewer 27 | TfMulti 28 | TfSeq 29 | TfViewer 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.LsaViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.LsaViewer 2 | ==================== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: LsaViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~LsaViewer.__init__ 12 | ~LsaViewer.dismat_doc 13 | ~LsaViewer.dismat_word 14 | ~LsaViewer.dist_doc_doc 15 | ~LsaViewer.dist_word_doc 16 | ~LsaViewer.dist_word_word 17 | 18 | .. automethod:: __init__ 19 | .. automethod:: dismat_doc 20 | .. automethod:: dismat_word 21 | .. automethod:: dist_doc_doc 22 | .. automethod:: dist_word_doc 23 | .. automethod:: dist_word_word 24 | 25 | -------------------------------------------------------------------------------- /coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CMD="coverage run -a --source=vsm" 3 | CMD="coverage run -a --source vsm.model,vsm.viewer,vsm.corpus,vsm.spatial,vsm.split,vsm.structarr,vsm.exceptions" 4 | #CMD="coverage run -a --source vsm.model,vsm.viewer,vsm.corpus,vsm.spatial,vsm.split,vsm.structarr,vsm.exceptions --debug trace" 5 | 6 | rm -rf .coverage 7 | coverage debug sys 8 | 9 | $CMD -m pytest unit_tests/* 10 | EXIT=$? 
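# (EXIT records the pytest exit status captured above; it is echoed and used as this script's exit code at the end.)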
11 | 12 | rm -rf ap.ini ap ap.tgz 13 | #pip install --pre topicexplorer 14 | #$CMD -m topicexplorer.demo 15 | #EXIT=$?+$EXIT 16 | #$CMD -m topicexplorer.serve ap.ini 17 | 18 | coverage report 19 | 20 | echo "Test exit code: $EXIT" 21 | exit $EXIT 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. This project adheres to [PEP 440: Version Identification and Dependency Specification](https://www.python.org/dev/peps/pep-0440/), a slight modification of Semantic Versioning. 3 | 4 | ## 0.4.0 5 | - Refactor of `Corpus.__init__()`. Significant memory improvements. 6 | - Refactor of stopword removal. Significant memory improvements. 7 | - Addition of `LdaCgsViewer.labels` property. 8 | - Addition of `LdaCgsViewer.dist(doc1, doc2, dist_fn=JS_dist)` function. 9 | - Addition of `vsm.extensions.comparison.lda` [NOT YET MERGED] 10 | - Addition of `vsm.extensions.interop` [NOT YET MERGED] 11 | 12 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.TfIdfViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.TfIdfViewer 2 | ====================== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: TfIdfViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~TfIdfViewer.__init__ 13 | ~TfIdfViewer.dismat_doc 14 | ~TfIdfViewer.dismat_word 15 | ~TfIdfViewer.dist_doc_doc 16 | ~TfIdfViewer.dist_word_doc 17 | ~TfIdfViewer.dist_word_word 18 | 19 | .. automethod:: __init__ 20 | .. automethod:: dismat_doc 21 | .. automethod:: dismat_word 22 | .. automethod:: dist_doc_doc 23 | .. automethod:: dist_word_doc 24 | .. automethod:: dist_word_word 25 | 26 | 27 | -------------------------------------------------------------------------------- /unit_tests/tests_lsa.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.model.lsa import Lsa 5 | 6 | 7 | class TestLsa(unittest.TestCase): 8 | 9 | def setUp(self): 10 | 11 | self.tfidf_mat = np.array(\ 12 | [[2.0794415, 1.3862944, 0], 13 | [0.86304623, 0.28768209, 0.28768209], 14 | [np.inf, np.inf, np.inf], 15 | [0, 0, 0]]) 16 | self.eigenvalues = np.array(\ 17 | [ 0.35270742, 2.65176495]) 18 | self.doc_matrix = np.array([0.314334, 0.023485]) 19 | 20 | #TODO: Write some actual unit tests for this module 21 | 22 | 23 | #Define and run test suite 24 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLsa) 25 | unittest.TextTestRunner(verbosity=2).run(suite) 26 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.TfViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.TfViewer 2 | =================== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: TfViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~TfViewer.__init__ 13 | ~TfViewer.coll_freq 14 | ~TfViewer.coll_freqs 15 | ~TfViewer.dismat_doc 16 | ~TfViewer.dismat_word 17 | ~TfViewer.dist_doc_doc 18 | ~TfViewer.dist_word_doc 19 | ~TfViewer.dist_word_word 20 | 21 | 22 | .. automethod:: __init__ 23 | .. automethod:: coll_freq 24 | .. automethod:: coll_freqs 25 | .. automethod:: dismat_doc 26 | .. automethod:: dismat_word 27 | .. 
automethod:: dist_doc_doc 28 | .. automethod:: dist_word_doc 29 | .. automethod:: dist_word_word 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /vsm/extensions/inpho.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 4 | from inpho.model import * 5 | 6 | ideas = Session.query(Idea) 7 | words_int = dict([(idea.label, idea.ID) for idea in ideas.all()]) 8 | 9 | def inpho_word_tokenize(document, terms=None): 10 | if terms is None: 11 | terms = ideas[:] 12 | occurrences = [] 13 | 14 | # iterate over terms to be scanned 15 | for term in terms: 16 | # build list of search patterns starting with label 17 | for pattern in term.patterns: 18 | try: 19 | if re.search(pattern, document, flags=re.IGNORECASE): 20 | occurrences.append(str(term.ID)) 21 | break 22 | except re.error: 23 | logging.warning('Term %d (%s) pattern "%s" failed' % 24 | (term.ID, term.label, pattern)) 25 | 26 | return occurrences 27 | 28 | 29 | -------------------------------------------------------------------------------- /roadmap.txt: -------------------------------------------------------------------------------- 1 | ------------------------------- 2 | 2014.10.20 3 | 4 | Roadmap towards a first release 5 | ------------------------------- 6 | 7 | * Update unit tests. At present, these have not been updated with the 8 | code changes. 9 | 10 | * Functional tests 11 | 12 | * Refactoring 13 | 14 | * All models take corpus objects 15 | 16 | * Refactoring corpus builders - documentation, tutorial 17 | 18 | * Refactoring corpus metadata 19 | 20 | * Include sample corpus and use in documentation 21 | 22 | * Sphinx-generated documentation 23 | 24 | * Getting Started page (including how to install) 25 | 26 | * Workflow 27 | 28 | * Tutorials and demos 29 | 30 | Wishlist 31 | -------- 32 | 33 | * Better defaults for LDA priors. 34 | 35 | * More Bayesian models. There are several descendants of LDA with 36 | highly desirable features (e.g., correlated topics, topic change 37 | models) we have yet to implement. 38 | 39 | * Robust plotting and clustering extensions. 
40 | -------------------------------------------------------------------------------- /vsm/extensions/testdata/history_greek_philosophy/doc_meta.json: -------------------------------------------------------------------------------- 1 | [[{ "part_of_book": "Front Matter"}], 2 | [{ "part_of_book": "Chapter 1"}], 3 | [{ "part_of_book": "Chapter 2"}], 4 | [{ "part_of_book": "Chapter 3"}], 5 | [{ "part_of_book": "Chapter 4"}], 6 | [{ "part_of_book": "Chapter 5"}], 7 | [{ "part_of_book": "Chapter 6"}], 8 | [{ "part_of_book": "Chapter 7"}], 9 | [{ "part_of_book": "Chapter 8"}], 10 | [{ "part_of_book": "Chapter 9"}], 11 | [{ "part_of_book": "Chapter 10"}], 12 | [{ "part_of_book": "Chapter 11"}], 13 | [{ "part_of_book": "Chapter 12"}], 14 | [{ "part_of_book": "Chapter 13"}], 15 | [{ "part_of_book": "Chapter 14"}], 16 | [{ "part_of_book": "Chapter 15"}], 17 | [{ "part_of_book": "Chapter 16"}], 18 | [{ "part_of_book": "Chapter 17"}], 19 | [{ "part_of_book": "Chapter 18"}], 20 | [{ "part_of_book": "Chapter 19"}], 21 | [{ "part_of_book": "Chapter 20"}], 22 | [{ "part_of_book": "Chapter 21"}], 23 | [{ "part_of_book": "Chapter 22"}], 24 | [{ "part_of_book": "Back Matter"}]] 25 | -------------------------------------------------------------------------------- /unit_tests/tests_split.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm import * 5 | from vsm.split import * 6 | 7 | class TestCore(unittest.TestCase): 8 | 9 | def test_mp_split_ls(self): 10 | 11 | l = [slice(0,0), slice(0,0), slice(0,0)] 12 | self.assertTrue(len(mp_split_ls(l, 1)) == 1) 13 | self.assertTrue((mp_split_ls(l, 1)[0] == l).all()) 14 | self.assertTrue(len(mp_split_ls(l, 2)) == 2) 15 | self.assertTrue((mp_split_ls(l, 2)[0] == 16 | [slice(0,0), slice(0,0)]).all()) 17 | self.assertTrue((mp_split_ls(l, 2)[1] == [slice(0,0)]).all()) 18 | self.assertTrue(len(mp_split_ls(l, 3)) == 3) 19 | self.assertTrue((mp_split_ls(l, 3)[0] == [slice(0,0)]).all()) 20 | self.assertTrue((mp_split_ls(l, 3)[1] == [slice(0,0)]).all()) 21 | self.assertTrue((mp_split_ls(l, 3)[2] == [slice(0,0)]).all()) 22 | 23 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCore) 24 | unittest.TextTestRunner(verbosity=2).run(suite) 25 | -------------------------------------------------------------------------------- /unit_tests/tests_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.corpus.util.corpusbuilders import random_corpus 5 | from vsm.model.base import BaseModel 6 | 7 | 8 | class TestBaseModel(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.c = random_corpus(1000, 50, 6, 100) 12 | self.m = BaseModel(self.c, 'context') 13 | 14 | 15 | def test_BaseModel_IO(self): 16 | 17 | from tempfile import NamedTemporaryFile as NTF 18 | import os 19 | 20 | c = random_corpus(1000, 50, 6, 100) 21 | with NTF(delete=False, suffix='.npz') as tmp: 22 | m0 = BaseModel(c.corpus, 'context') 23 | m0.save(tmp.name) 24 | m1 = BaseModel.load(tmp.name) 25 | 26 | self.assertEqual(m0.context_type, m1.context_type) 27 | self.assertTrue((m0.matrix == m1.matrix).all()) 28 | 29 | os.remove(tmp.name) 30 | 31 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBaseModel) 32 | unittest.TextTestRunner(verbosity=2).run(suite) 33 | -------------------------------------------------------------------------------- /doc/README: 
-------------------------------------------------------------------------------- 1 | This directory contains the source files to build vsm documentation 2 | with Sphinx. The Makefile has been customized to facilitate updating 3 | the documentation hosted as github pages. 4 | 5 | To update the github pages automatically, invoke 6 | 7 | $ make gh-pages 8 | 9 | in this directory. 10 | 11 | To see what this does concretely, inspect Makefile. In summary, this 12 | command will do the following: 13 | 14 | * build the html documentation in a temporary subdirectory `build`; 15 | 16 | * clone the vsm github repo in a temporary subdirectory `vsm-gh-pages' 17 | and checkout the gh-pages branch; 18 | 19 | * (!) remove almost everything in that branch; 20 | 21 | * move the newly built documentation to `vsm-gh-pages`; 22 | 23 | * add, commit and push the new files (you may need to give your 24 | credentials); 25 | 26 | * remove the subdirectories `build` and `vsm-gh-pages`. 27 | 28 | Currently, this procedure has no special error-handling. If it doesn't 29 | complete, you may have to clean up the temporary subdirectories 30 | yourself. -------------------------------------------------------------------------------- /vsm/model/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General Documentation about :mod:`model` classes] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from . import beaglecomposite 7 | from .beaglecomposite import * 8 | from . import beaglecontext 9 | from .beaglecontext import * 10 | from . import beagleenvironment 11 | from .beagleenvironment import * 12 | from . import beagleorder 13 | from .beagleorder import * 14 | from . import lda 15 | from .lda import * 16 | from . import ldacgsseq 17 | from .ldacgsseq import * 18 | from . import ldacgsmulti 19 | from .ldacgsmulti import * 20 | from . import lsa 21 | from .lsa import * 22 | from . import tf 23 | from .tf import * 24 | from . import tfidf 25 | from .tfidf import * 26 | 27 | 28 | __all__ = beaglecomposite.__all__[:] 29 | __all__ += beaglecontext.__all__ 30 | __all__ += beagleenvironment.__all__ 31 | __all__ += beagleorder.__all__ 32 | __all__ += lda.__all__ 33 | __all__ += ldacgsseq.__all__ 34 | __all__ += ldacgsmulti.__all__ 35 | __all__ += lsa.__all__ 36 | __all__ += tf.__all__ 37 | __all__ += tfidf.__all__ 38 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.LdaCgsViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.LdaCgsViewer 2 | ======================= 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: LdaCgsViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~LdaCgsViewer.__init__ 13 | ~LdaCgsViewer.dismat_doc 14 | ~LdaCgsViewer.dismat_top 15 | ~LdaCgsViewer.dist_doc_doc 16 | ~LdaCgsViewer.dist_top_doc 17 | ~LdaCgsViewer.dist_top_top 18 | ~LdaCgsViewer.dist_word_top 19 | ~LdaCgsViewer.doc_topics 20 | ~LdaCgsViewer.logp_plot 21 | ~LdaCgsViewer.topic_entropies 22 | ~LdaCgsViewer.topic_hist 23 | ~LdaCgsViewer.topics 24 | ~LdaCgsViewer.word_topics 25 | 26 | .. automethod:: __init__ 27 | .. automethod:: dismat_doc 28 | .. automethod:: dismat_top 29 | .. automethod:: dist_doc_doc 30 | .. automethod:: dist_top_doc 31 | .. automethod:: dist_top_top 32 | .. automethod:: dist_word_top 33 | .. automethod:: doc_topics 34 | .. automethod:: logp_plot 35 | .. automethod:: topic_entropies 36 | .. automethod:: topic_hist 37 | .. 
automethod:: topics 38 | .. automethod:: word_topics 39 | 40 | 41 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 The Trustees of Indiana University and 4 | The Indiana Philosophy Ontology (InPhO) Project 5 | http://inpho.cogs.indiana.edu/ -- inpho@indiana.edu 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains python and java code that interacts with 2 | java code retrieved from LdaGibbsSamlper.java at http://knowceans.com. 3 | 4 | 5 | Directions 6 | ---------- 7 | 0. write corpus txtfile: 8 | from FileReadWrite import write_file 9 | write_file(Corpus, ctx_type, 'fname.txt') 10 | 11 | 1. compile: javac -cp py4j0.8.1.jar *.java (in gibbstest dir) 12 | 13 | 2. run: java -cp org/knowceans/gibbstest/py4j0.8.1.jar:. 14 | org.knowceans.gibbstest.LDA 15 | org/knowceans/gibbstest/testcorp.txt 16 | (in parent dir of org) 17 | 18 | 3. python: run LdaRoutine (in ipython) 19 | 20 | 4. exit out of java program to end the server connection. 21 | 22 | Notes 23 | ----- 24 | - directory structure: org/knowceans/gibbstest 25 | 26 | - running java starts the gateway server. This needs to be running for python 27 | code (py4j) to work. 28 | 29 | - needs java version "1.7.0_25" to run correctly. 30 | 31 | - all java files are in package org.knowceans.gibbstest; 32 | 33 | - LDA.java takes written corpus file (from 0.) as args 34 | 35 | - python code works with LdaGibbsSampler java object. 36 | 37 | - LdaRoutine.py depends on vsm, so move LdaRoutine.py and FileReadWrite.py to a location where vsm is importable, if needed. 
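Example (a minimal Python sketch of step 3; this assumes the gateway server
started in step 2 is still running and that LDA.java exposes the
LdaGibbsSampler instance as the py4j entry point -- see LdaRoutine.py for
the actual routine):

    from py4j.java_gateway import JavaGateway

    gateway = JavaGateway()         # connect to the running Java gateway server
    sampler = gateway.entry_point   # assumption: LDA.java registers the sampler here
    # LdaRoutine.py-style code can then call the sampler's methods through py4j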
38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VSM 2 | 3 | [![Build Status](https://img.shields.io/travis/inpho/vsm.svg?label=UNIX+build)](https://travis-ci.org/inpho/vsm) 4 | [![Appveyor](https://img.shields.io/appveyor/ci/JaimieMurdock/vsm.svg?label=Windows+build)](https://ci.appveyor.com/project/JaimieMurdock/vsm) 5 | [![Coveralls](https://img.shields.io/coveralls/inpho/vsm.svg)](https://coveralls.io/r/inpho/vsm) 6 | [![GitHub license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://github.com/inpho/vsm/blob/master/LICENSE.txt) 7 | [![PyPI](https://img.shields.io/pypi/v/vsm.svg)](https://pypi.python.org/pypi/vsm) 8 | 9 | **Note:** More than likely, you are looking for the [InPhO Topic Explorer](http://github.com/inpho/topic-explorer). This library is for the machine learning implementations underlying the Topic Explorer and is updated much less frequently than the user interfaces. If you are using the Topic Explorer, please file issues there and the developers will triage appropriately. 10 | 11 | Vector Space Model Framework developed for the 12 | [InPhO Project](https://inpho.cogs.indiana.edu/). 13 | 14 | Documentation can be found in the module and [here](http://inpho.github.io/vsm/). 15 | 16 | ## Installation 17 | 18 | ``` 19 | pip install Cython==0.29.37 numpy==1.26.1 20 | git clone git@github.com:inpho/vsm.git 21 | cd vsm 22 | pip install -r requirements.txt -e . 23 | ``` 24 | -------------------------------------------------------------------------------- /vsm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`vsm` module provides tools and a workflow for producing 3 | semantic models of textual corpora and analyzing and visualizing these 4 | models. 5 | 6 | The :mod:`vsm` module has been conceived within the SciPy ecosystem. 7 | In a typical work flow, a collection of texts is first transformed 8 | into a Corpus object, whose underlying data structures are NumPy 9 | numerical arrays. The user may then feed a Corpus object to one of the 10 | model classes, which contain the algorithms, implemented in NumPy, 11 | SciPy and IPython.parallel, for training models such as :doc:`TF`, 12 | :doc:`TFIDF`, :doc:`LSA`, 13 | :doc:`BEAGLE`, or :doc:`LDA`. 14 | Finally, the user may examine the 15 | results with a Viewer class specialized to a particular model type. A 16 | Viewer object contains a variety of methods for analysis and 17 | visualization and achieves its full functionality within an IPython 18 | notebook session extended with matplotlib and scikit-learn. 
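
A minimal sketch of this workflow, modeled on the unit tests shipped with
the package (``random_corpus`` builds a small artificial corpus; real
corpora are constructed with the builders in
:mod:`vsm.extensions.corpusbuilders`)::

    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.tf import TfSeq
    from vsm.viewer.tfviewer import TfViewer

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)    # toy corpus of random 'words'
    m = TfSeq(c, 'document')            # term-frequency model over documents
    m.train()
    v = TfViewer(c, m)                  # viewer for analysis and visualization
    v.dist_word_word('0')               # words nearest to the word '0'
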
19 | """ 20 | 21 | 22 | import vsm.corpus 23 | from vsm.corpus import * 24 | import vsm.model 25 | from vsm.model import * 26 | import vsm.viewer 27 | from vsm.viewer import * 28 | 29 | __version__ = '1.0.0b1' 30 | 31 | __all__ = ['__version__'] 32 | __all__ += vsm.corpus.__all__[:] 33 | __all__ += vsm.model.__all__ 34 | __all__ += vsm.viewer.__all__ 35 | 36 | -------------------------------------------------------------------------------- /unit_tests/tests_beagleenvironment.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.model.beagleenvironment import * 5 | 6 | 7 | 8 | class TestBeagleEnvironment(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.extensions.corpusbuilders import random_corpus 13 | 14 | c = random_corpus(1000, 100, 0, 20) 15 | 16 | self.m = BeagleEnvironment(c, n_cols=100) 17 | self.m.train() 18 | 19 | 20 | def test_BeagleEnvironment(self): 21 | 22 | self.assertTrue((self.m.matrix <= 1).all()) 23 | self.assertTrue((self.m.matrix >= -1).all()) 24 | 25 | norms = (self.m.matrix**2).sum(1)**0.5 26 | 27 | self.assertTrue(np.allclose(np.ones(norms.shape[0]), norms)) 28 | 29 | 30 | def test_BE_IO(self): 31 | from tempfile import NamedTemporaryFile 32 | import os 33 | 34 | try: 35 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 36 | self.m.save(tmp.name) 37 | tmp.close() 38 | m1 = BeagleEnvironment.load(tmp.name) 39 | self.assertTrue((self.m.matrix == m1.matrix).all()) 40 | 41 | finally: 42 | os.remove(tmp.name) 43 | 44 | 45 | #Define and run test suite 46 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleEnvironment) 47 | unittest.TextTestRunner(verbosity=2).run(suite) 48 | -------------------------------------------------------------------------------- /unit_tests/tests_tfviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.tfviewer import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | class TestTfViewer(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.corpus.util.corpusbuilders import random_corpus 13 | from vsm.model.tf import TfSeq 14 | 15 | c = random_corpus(1000, 50, 0, 20, context_type='document', metadata=True) 16 | 17 | m = TfSeq(c, 'document') 18 | m.train() 19 | 20 | self.v = TfViewer(c, m) 21 | 22 | #TODO: test for coll_freq 23 | def test_TfViewer(self): 24 | 25 | li = [0,1] 26 | 27 | sww = self.v.dist_word_word('0') 28 | swwl = self.v.dist_word_word(['0','1']) 29 | sdd = self.v.dist_doc_doc(0) 30 | sddl = self.v.dist_doc_doc(li) 31 | cfs = self.v.coll_freqs() 32 | 33 | distmatw = self.v.dismat_word(['0','2','5']) 34 | distmatd = self.v.dismat_doc(li) 35 | 36 | self.assertEqual(type(sww), LabeledColumn) 37 | self.assertEqual(type(swwl), LabeledColumn) 38 | self.assertEqual(type(sdd), LabeledColumn) 39 | self.assertEqual(type(sddl), LabeledColumn) 40 | self.assertEqual(type(cfs), LabeledColumn) 41 | self.assertEqual(type(distmatw), IndexedSymmArray) 42 | self.assertEqual(type(distmatd), IndexedSymmArray) 43 | 44 | 45 | 46 | #Define and run test suite 47 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTfViewer) 48 | unittest.TextTestRunner(verbosity=2).run(suite) 49 | -------------------------------------------------------------------------------- /unit_tests/tests_tfidfviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 
3 | 4 | from vsm.viewer.tfidfviewer import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | class TestTfIdfViewer(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.corpus.util.corpusbuilders import random_corpus 13 | from vsm.model.tf import TfSeq 14 | from vsm.model.tfidf import TfIdf 15 | 16 | c = random_corpus(1000, 50, 0, 10, context_type='document', metadata=True) 17 | 18 | tf = TfSeq(c, 'document') 19 | tf.train() 20 | 21 | m = TfIdf.from_tf(tf) 22 | m.train() 23 | 24 | self.v = TfIdfViewer(c, m) 25 | 26 | def test_TfIdfViewer(self): 27 | 28 | li = [0,1] 29 | 30 | sww = self.v.dist_word_word('0') 31 | swwl = self.v.dist_word_word(['0','1']) 32 | sdd = self.v.dist_doc_doc(0) 33 | sddl = self.v.dist_doc_doc(li) 34 | 35 | dismatw = self.v.dismat_word(['0','2','5']) 36 | dismatd = self.v.dismat_doc(li) 37 | 38 | self.assertEqual(type(sww), LabeledColumn) 39 | self.assertEqual(type(swwl), LabeledColumn) 40 | self.assertEqual(type(sdd), LabeledColumn) 41 | self.assertEqual(type(sddl), LabeledColumn) 42 | 43 | self.assertEqual(type(dismatw), IndexedSymmArray) 44 | self.assertEqual(type(dismatd), IndexedSymmArray) 45 | 46 | 47 | 48 | #Define and run test suite 49 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTfIdfViewer) 50 | unittest.TextTestRunner(verbosity=2).run(suite) 51 | -------------------------------------------------------------------------------- /vsm/extensions/interop/mallet.py: -------------------------------------------------------------------------------- 1 | from finalcorpus import * 2 | import gzip 3 | 4 | start_idx = 0 5 | m = lda_m[20] 6 | metadata = c.view_metadata(m.context_type) 7 | 8 | def export_model(): 9 | with gzip.open('model_to_mallet.gz', 'wb') as f: 10 | f.write("#doc source pos typeindex type topic") 11 | alpha = m.alpha 12 | f.write(alpha) 13 | beta = m.beta 14 | 15 | for end_idx, doc in metadata: 16 | for i in range(start_idx, end_idx): 17 | doc = doc 18 | source = "/" 19 | pos = i 20 | typeIndex = c.corpus[i] 21 | ttype = c.words[c.corpus[i]] 22 | topic = m.Z[i] 23 | line = "{} {} {} {} {} {}\n".format(doc, source, pos, typeIndex, ttype, topic) 24 | f.write(line) 25 | start_idx = end_idx 26 | 27 | 28 | def import_model(): 29 | startPos = [] 30 | corpus = [] 31 | z = [] 32 | words = {} 33 | prevDoc = 0; 34 | 35 | with gzip.open('topic-state.gz', 'rb') as f: 36 | for i, line in enumerate(f, start = -3): 37 | #skip first three lines with header info 38 | if i >= 0: 39 | #columns - #doc source pos typeindex type topic 40 | doc, _, _, typeindex, type, topic = line.split() 41 | corpus.append(typeindex) 42 | z.append(topic) 43 | words[typeindex] = type 44 | if doc != prevDoc: 45 | startPos.append(i) 46 | prevDoc = doc 47 | 48 | -------------------------------------------------------------------------------- /vsm/model/beagleenvironment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from vsm.model.base import BaseModel 4 | 5 | 6 | __all__ = ['BeagleEnvironment'] 7 | 8 | 9 | class BeagleEnvironment(BaseModel): 10 | """ 11 | `BeagleEnvironment` is a randomly generated fixed vectors 12 | representing the environment. 13 | """ 14 | 15 | def __init__(self, corpus, n_cols=2048, dtype=np.float64, 16 | context_type='sentence'): 17 | """ 18 | Initialize BeagleEnvironment. 19 | 20 | :param corpus: Source of observed data. 21 | :type corpus: Corpus 22 | 23 | :param n_cols: Number of columns. Default is 2048. 
24 | :type n_cols: int, optional 25 | 26 | :param dtype: Numpy dtype for matrix attribute. Default is `np.float64`. 27 | :type dtype: np.dtype, optional 28 | 29 | :param context_type: Name of tokenization stored in `corpus` whose 30 | tokens will be treated as documents. Default is `sentence`. 31 | :type context_type: string, optional 32 | """ 33 | self.context_type = context_type 34 | self.shape = (corpus.words.shape[0], n_cols) 35 | self.dtype = dtype 36 | 37 | 38 | def train(self): 39 | """ 40 | Sets a m x n environment matrix where m is the number of words in 41 | `corpus` and n is `n_cols`. The matrix consists of randomly generated 42 | vectors. 43 | """ 44 | self.matrix = np.array(np.random.normal(size=self.shape), 45 | dtype=self.dtype) 46 | # normalize rows 47 | self.matrix /= np.sqrt((self.matrix * self.matrix).sum(1)[:,np.newaxis]) 48 | 49 | 50 | -------------------------------------------------------------------------------- /vsm/extensions/editions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from vsm.corpus import Corpus 3 | 4 | 5 | __all__ = ['eqva', 'new_material'] 6 | 7 | 8 | def eqva(a1, a2): 9 | """ 10 | modified np.array_equal. considers a1 and a2 11 | equal when there is 1 difference. 12 | """ 13 | a1.sort() 14 | a2.sort() 15 | count = 0 16 | a1_, a2_ = a1, a2 17 | if len(a1) > len(a2): 18 | a1_ = a2 19 | a2_ = a1 20 | 21 | for s in a1: 22 | if not s in a2: 23 | count += 1 24 | 25 | return count 26 | 27 | 28 | def find_idx(ind, c1, c2): 29 | """ 30 | finds exact match (1 diff) in c2 and returns the index. 31 | """ 32 | ctx2 = c2.view_contexts('sentence', as_strings=True) 33 | ctx = c1.view_contexts('sentence', as_strings=True)[ind] 34 | 35 | for i in xrange(len(ctx2)): 36 | if eqva(ctx, ctx2[i]) < 2: 37 | return str(i) 38 | return '' 39 | 40 | 41 | def new_material(c1, c2, idx=0): 42 | """ 43 | Return new material in a list. 44 | 'idx' is an optional parameter for cutting off references. 45 | """ 46 | ctx1 = c1.view_contexts('sentence', as_strings=True) 47 | 48 | if idx == 0: 49 | ctx2 = c2.view_contexts('sentence', as_strings=True) 50 | else: 51 | ctx2 = c2.view_contexts('sentence', as_strings=True)[:idx] 52 | len2 = len(ctx2) 53 | 54 | new = [] 55 | for i in xrange(len(ctx1)): 56 | if i < len2: 57 | if len(ctx1[i]) == 0: # empty tokens. 
58 | pass 59 | else: 60 | ind = find_idx(i, c1, c2) 61 | if len(ind) == 0: 62 | new.append(i) 63 | return new 64 | -------------------------------------------------------------------------------- /unit_tests/tests_lsaviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.lsaviewer import LsaViewer 5 | 6 | 7 | class TestLsaViewer(unittest.TestCase): 8 | 9 | def setUp(self): 10 | 11 | from vsm.corpus.util.corpusbuilders import random_corpus 12 | from vsm.model.tf import TfSeq 13 | from vsm.model.tfidf import TfIdf 14 | from vsm.model.lsa import Lsa 15 | 16 | c = random_corpus(10000, 1000, 0, 30, context_type='document', metadata=True) 17 | 18 | tf = TfSeq(c, 'document') 19 | tf.train() 20 | 21 | tfidf = TfIdf.from_tf(tf) 22 | tfidf.train() 23 | 24 | m = Lsa.from_tfidf(tfidf) 25 | m.train() 26 | 27 | self.v = LsaViewer(c, m) 28 | 29 | 30 | def test_Lsaviewer(self): 31 | 32 | from vsm.viewer.labeleddata import LabeledColumn, IndexedSymmArray 33 | 34 | sww = self.v.dist_word_word('1') 35 | swwl = self.v.dist_word_word(['1', '0']) 36 | sdd = self.v.dist_doc_doc(1) 37 | sddl = self.v.dist_doc_doc([1, 0]) 38 | self.assertTrue(type(sww) == LabeledColumn) 39 | self.assertTrue(type(swwl) == LabeledColumn) 40 | self.assertTrue(type(sdd) == LabeledColumn) 41 | self.assertTrue(type(sddl) == LabeledColumn) 42 | 43 | sw = self.v.dismat_word(['2','4','5']) 44 | sd = self.v.dismat_doc([1, 0]) 45 | self.assertTrue(type(sw) == IndexedSymmArray) 46 | self.assertTrue(type(sd) == IndexedSymmArray) 47 | 48 | 49 | 50 | 51 | #Define and run test suite 52 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLsaViewer) 53 | unittest.TextTestRunner(verbosity=2).run(suite) 54 | -------------------------------------------------------------------------------- /vsm/exceptions/compatibility.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from functools import update_wrapper, wraps 3 | import inspect 4 | 5 | __all__ = ['deprecation_warning', 'deprecated_meth'] 6 | 7 | 8 | 9 | def deprecation_warning(old_name, new_name): 10 | """ 11 | Deprecation warning for deprecated functions. 12 | """ 13 | warnings.simplefilter('always', DeprecationWarning) 14 | 15 | message = "{0} is deprecated. Please use {1} instead.".format(old_name, 16 | new_name) 17 | warnings.warn(message, DeprecationWarning) 18 | 19 | 20 | #TODO: a function for deprecated class AND auto generate doc string with 21 | # a note about deprecation. 22 | 23 | def deprecated_meth(new_fn_name): 24 | """ 25 | Decorator to be used for deprecated functions/modules. 26 | Throws a DeprecationWarning. 27 | """ 28 | def wrap(old_fn): 29 | 30 | def wrapper(self, *args, **kwargs): 31 | new_fn = getattr(self, new_fn_name) 32 | deprecation_warning(old_fn.__name__, new_fn.__name__) 33 | 34 | return new_fn(*args, **kwargs) 35 | 36 | #update_wrapper(wrapper, new_fn_) 37 | return wrapper 38 | 39 | return wrap 40 | 41 | 42 | """ 43 | def deprecated_fn(new_fn): 44 | Decorator to be used for deprecated functions/modules. 45 | Throws a DeprecationWarning. 
46 | def wrap(old_fn): 47 | 48 | def wrapper(self, *args, **kwargs): 49 | deprecation_warning(old_fn.__name__, new_fn.__name__) 50 | #new_fn_ = getattr(self, new_fn.__name__) 51 | 52 | return new_fn(self, *args, **kwargs) 53 | 54 | update_wrapper(wrapper, new_fn) 55 | 56 | return wrapper 57 | 58 | return wrap 59 | """ 60 | -------------------------------------------------------------------------------- /vsm/extensions/interop/weka.py: -------------------------------------------------------------------------------- 1 | """ 2 | `vsm.extensions.interop.weka` 3 | 4 | Module containing functions for import/export between VSM and Weka, 5 | a collection of machine learning algorithms for data mining tasks 6 | implemented in Java. Weka is available at: 7 | ``_ 8 | 9 | This module imports and exports corpora to the `ARFF format`_ used 10 | by Weka. ARFF files can then be used for `text categorization with Weka`_. 11 | 12 | 13 | .. _ARFF format: https://weka.wikispaces.com/ARFF 14 | .. _text categorization with Weka: 15 | https://weka.wikispaces.com/Text+categorization+with+Weka 16 | 17 | """ 18 | import os 19 | import os.path 20 | 21 | from scipy.stats import itemfreq 22 | import numpy as np 23 | 24 | from vsm.extensions.corpusbuilders import corpus_fromlist 25 | 26 | 27 | def export_corpus(corpus, outfolder, context_type='document'): 28 | """ 29 | Converts a vsm.corpus.Corpus object into a Weka-compatible `ARFF file`_. 30 | 31 | :param corpus: VSM Corpus object to convert to lda-c file 32 | :type corpus: vsm.corpus.Corpus 33 | 34 | :param outfolder: Directory to output "vocab.txt" and "corpus.dat" 35 | :type string: path 36 | 37 | .. _ARFF file: https://weka.wikispaces.com/ARFF 38 | """ 39 | pass 40 | 41 | 42 | def import_corpus(corpusfilename, vocabfilename, context_type='document'): 43 | """ 44 | Converts an lda-c compatible data file into a VSM Corpus object. 45 | 46 | :param corpusfilename: path to corpus file, as defined in lda-c 47 | documentation. 
48 | :type string: 49 | 50 | :param vocabfilename: path to vocabulary file, one word per line 51 | :type string: 52 | """ 53 | pass 54 | 55 | 56 | def import_model(filename): 57 | pass 58 | 59 | 60 | def export_model(filename): 61 | pass 62 | -------------------------------------------------------------------------------- /unit_tests/tests_beaglecomposite.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import range 3 | import unittest 4 | import numpy as np 5 | 6 | 7 | class TestBeagleComposite(unittest.TestCase): 8 | 9 | def setUp(self): 10 | from vsm.corpus.util.corpusbuilders import random_corpus 11 | from vsm.model.beaglecomposite import BeagleComposite 12 | from vsm.model.beagleenvironment import BeagleEnvironment 13 | from vsm.model.beaglecontext import BeagleContextSeq 14 | from vsm.model.beagleorder import BeagleOrderSeq 15 | 16 | self.ec = random_corpus(1000, 50, 0, 20, context_type='sentence') 17 | self.cc = self.ec.apply_stoplist(stoplist=[str(i) for i in range(0,50,7)]) 18 | 19 | self.e = BeagleEnvironment(self.ec, n_cols=5) 20 | self.e.train() 21 | 22 | self.cm = BeagleContextSeq(self.cc, self.ec, self.e.matrix) 23 | self.cm.train() 24 | 25 | self.om = BeagleOrderSeq(self.ec, self.e.matrix) 26 | self.om.train() 27 | 28 | self.m = BeagleComposite(self.cc, self.cm.matrix, self.ec, self.om.matrix) 29 | self.m.train() 30 | 31 | 32 | def test_BeagleCompositeIO(self): 33 | from tempfile import NamedTemporaryFile 34 | from vsm.model.beaglecomposite import BeagleComposite 35 | import os 36 | 37 | try: 38 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 39 | self.m.save(tmp.name) 40 | tmp.close() 41 | m1 = BeagleComposite.load(tmp.name) 42 | self.assertTrue((self.m.matrix == m1.matrix).all()) 43 | 44 | finally: 45 | os.remove(tmp.name) 46 | 47 | 48 | 49 | 50 | #Define and run test suite 51 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleComposite) 52 | unittest.TextTestRunner(verbosity=2).run(suite) 53 | -------------------------------------------------------------------------------- /vsm/extensions/mahout/README.txt: -------------------------------------------------------------------------------- 1 | mahout.py contains methods that interact with mahout-generated files, 2 | create vsm `Corpus` and `LDAGibbs`. 
3 | 4 | 5 | STEPS TO LDA 6 | ------------ 7 | 1) Convert directory of documents to SequenceFile format 8 | mahout-distribution-0.9/bin/mahout seqdirectory -i inpho/testdata -o inpho/mahout-out 9 | 10 | 2) Creating Vectors from SequenceFile 11 | mahout-distribution-0.9/bin/mahout seq2sparse -i inpho/mahout-out -o inpho/mahout-vect-test 12 | 13 | 3) Creating Matrix from tf-vectors 14 | mahout-distribution-0.9/bin/mahout rowid -i inpho/mahout-vect-test/tf-vectors -o inpho/mahout-mat-test 15 | 16 | 4) Run LDA Collapsed Variable Bayes 17 | mahout-distribution-0.9/bin/mahout cvb -i inpho/mahout-mat-test/matrix -dict inpho/mahout-vect-test/dictionary.file-0 -o inpho/mahout-lda-test -a 0.01 -e 0.01 -dt inpho/mahout-dt-test -mt inpho/mahout-models-test -k 5 -x 100 18 | 19 | 20 | CREATING READABLE FILES 21 | ----------------------- 22 | doori@space:~$ mahout-distribution-0.9/bin/mahout vectordump -i inpho/mahout-vect-test/tf-vectors/part-r-00000 -o inpho/mahout-vect-test/tf-vectors/tf.txt -p true --csv csv 23 | 24 | doori@space:~$ mahout-distribution-0.9/bin/mahout seqdumper -i inpho/mahout-dt/part-m-00000 -o inpho/mahout-dt/doc-topics.txt 25 | 26 | doori@space:~$ mahout-distribution-0.9/bin/mahout seqdumper -i inpho/mahout-lda/part-m-00000 -o inpho/mahout-lda/lda.txt 27 | 28 | 29 | NOTES 30 | ----- 31 | If you are running 'seq2sparse' for building the feature vectors and are using the Lucene StandardAnalyzer (which is the default), the English stopwords should be removed automatically. (-x option to remove *high frequency* words. default is 99) 32 | 33 | REFERENCES 34 | ---------- 35 | https://mahout.apache.org/users/basics/creating-vectors-from-text.html 36 | 37 | http://mahout.apache.org/users/clustering/lda-commandline.html 38 | -------------------------------------------------------------------------------- /unit_tests/tests_spatial.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from past.utils import old_div 3 | import unittest 4 | import numpy as np 5 | 6 | from vsm.spatial import * 7 | 8 | #TODO: add tests for recently added methods. 
9 | def KL(p,q): 10 | return sum(p*np.log2(old_div(p,q))) 11 | def partial_KL(p,q): 12 | return p * np.log2(old_div((2*p), (p+q))) 13 | def JS(p,q): 14 | return 0.5*(KL(p,((p+q)*0.5)) + KL(q,((p+q)*0.5))) 15 | def JSD(p,q): 16 | return (0.5*(KL(p,((p+q)*0.5)) + KL(q,((p+q)*0.5))))**0.5 17 | 18 | 19 | class TestSpatial(unittest.TestCase): 20 | 21 | def setUp(self): 22 | # 2 random distributions 23 | self.p=np.random.random_sample((5,)) 24 | self.q=np.random.random_sample((5,)) 25 | 26 | # normalize 27 | self.p /= self.p.sum() 28 | self.q /= self.q.sum() 29 | 30 | def test_KL_div(self): 31 | self.assertTrue(np.allclose(KL_div(self.p,self.q), KL(self.p,self.q))) 32 | 33 | def test_JS_div(self): 34 | self.assertTrue(np.allclose(JS_div(self.p,self.q), JS(self.p,self.q))) 35 | 36 | def test_JS_dist(self): 37 | self.assertTrue(np.allclose(JS_dist(self.p,self.q), JSD(self.p,self.q))) 38 | 39 | 40 | def test_count_matrix(self): 41 | 42 | arr = [1, 2, 4, 2, 1] 43 | slices = [slice(0,1), slice(1, 3), slice(3,3), slice(3, 5)] 44 | m = 6 45 | result = coo_matrix([[0, 0, 0, 0], 46 | [1, 0, 0, 1], 47 | [0, 1, 0, 1], 48 | [0, 0, 0, 0], 49 | [0, 1, 0, 0], 50 | [0, 0, 0, 0]]) 51 | 52 | self.assertTrue((result.toarray() == 53 | count_matrix(arr, slices, m).toarray()).all()) 54 | 55 | 56 | 57 | 58 | suite = unittest.TestLoader().loadTestsFromTestCase(TestSpatial) 59 | unittest.TextTestRunner(verbosity=2).run(suite) 60 | -------------------------------------------------------------------------------- /unit_tests/tests_beagleviewer.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import range 3 | import unittest 4 | import numpy as np 5 | 6 | from vsm.viewer.beagleviewer import BeagleViewer 7 | from vsm.viewer.labeleddata import * 8 | 9 | 10 | class TestBeagleViewer(unittest.TestCase): 11 | 12 | def setUp(self): 13 | 14 | from vsm.corpus.util.corpusbuilders import random_corpus 15 | from vsm.model.beagleenvironment import BeagleEnvironment 16 | from vsm.model.beaglecontext import BeagleContextSeq 17 | from vsm.model.beagleorder import BeagleOrderSeq 18 | from vsm.model.beaglecomposite import BeagleComposite 19 | 20 | ec = random_corpus(1000, 50, 0, 20, context_type='sentence') 21 | cc = ec.apply_stoplist(stoplist=[str(i) for i in range(0,50,7)]) 22 | e = BeagleEnvironment(ec, n_cols=5) 23 | e.train() 24 | 25 | cm = BeagleContextSeq(cc, ec, e.matrix) 26 | cm.train() 27 | 28 | om = BeagleOrderSeq(ec, e.matrix) 29 | om.train() 30 | 31 | m = BeagleComposite(cc, cm.matrix, ec, om.matrix) 32 | m.train() 33 | 34 | self.venv = BeagleViewer(ec, e) 35 | self.vctx = BeagleViewer(cc, cm) 36 | self.vord = BeagleViewer(ec, om) 37 | self.vcom = BeagleViewer(cc, m) 38 | 39 | 40 | def test_BeagleViewer(self): 41 | 42 | sww = self.venv.dist_word_word('1') 43 | sww1 = self.vord.dist_word_word('0') 44 | self.assertTrue(type(sww) == LabeledColumn) 45 | self.assertTrue(type(sww1) == LabeledColumn) 46 | 47 | smw = self.vcom.dismat_word(['1']) 48 | smw1 = self.vctx.dismat_word(['1']) 49 | self.assertTrue(type(smw) == IndexedSymmArray) 50 | self.assertTrue(type(smw1) == IndexedSymmArray) 51 | 52 | 53 | 54 | #Define and run test suite 55 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleViewer) 56 | unittest.TextTestRunner(verbosity=2).run(suite) 57 | -------------------------------------------------------------------------------- /unit_tests/tests_lda.py: -------------------------------------------------------------------------------- 1 | 
import unittest 2 | import numpy as np 3 | 4 | from vsm.corpus import Corpus 5 | from vsm.corpus.util.corpusbuilders import random_corpus 6 | from vsm.model.ldacgsmulti import * 7 | from vsm.model.ldacgsseq import * 8 | from vsm.model.lda import * 9 | import platform 10 | 11 | 12 | class TestLda(unittest.TestCase): 13 | def setUp(self): 14 | pass 15 | 16 | def test_Lda_LdaCgsSeq(self): 17 | m=LDA(multiprocessing=False) 18 | self.assertTrue(isinstance(m, LdaCgsSeq)) 19 | 20 | def test_Lda_LdaCgsSeq_seed_or_seeds(self): 21 | m=LDA(multiprocessing=False, seed_or_seeds=2) 22 | self.assertTrue(isinstance(m, LdaCgsSeq)) 23 | self.assertTrue(m.seed == 2) 24 | if platform.system() != 'Windows': 25 | with self.assertRaises(ValueError): 26 | m=LDA(multiprocessing=False, seed_or_seeds=[2,4]) 27 | 28 | 29 | def test_Lda_proper_class(self): 30 | m=LDA(multiprocessing=True) 31 | if platform.system() == 'Windows': 32 | self.assertTrue(isinstance(m,LdaCgsSeq)) 33 | else: 34 | self.assertTrue(isinstance(m,LdaCgsMulti)) 35 | 36 | def test_Lda_LdaCgsMulti_seed_or_seeds(self): 37 | m=LDA(multiprocessing=True, seed_or_seeds=[2,4], n_proc=2) 38 | if platform.system() == 'Windows': 39 | self.assertTrue(isinstance(m,LdaCgsSeq)) 40 | self.assertTrue(m.seed == 2) 41 | else: 42 | self.assertTrue(isinstance(m,LdaCgsMulti)) 43 | self.assertTrue(m.seeds == [2,4]) 44 | 45 | # test improper numper of seed_or_seeds with multiprocessing 46 | if platform.system() != 'Windows': 47 | with self.assertRaises(ValueError): 48 | m=LDA(multiprocessing=True, seed_or_seeds=[2], n_proc=2) 49 | 50 | 51 | if __name__ == '__main__': 52 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLdaCgsMulti) 53 | unittest.TextTestRunner(verbosity=2).run(suite) 54 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | #language: python 2 | env: 3 | - PYTHON_VERSION="2.7" NUMPY_VERSION="1.12.1" 4 | - PYTHON_VERSION="3.5" NUMPY_VERSION="1.12.1" 5 | - PYTHON_VERSION="3.6" NUMPY_VERSION="1.12.1" 6 | - PYTHON_VERSION="3.7" NUMPY_VERSION="1.14.6" 7 | # command to install dependencies 8 | os: 9 | - linux 10 | - osx 11 | install: 12 | # Install conda 13 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 14 | if [[ "$PYTHON_VERSION" == "2.7" ]]; then 15 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 16 | else 17 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 18 | fi; else 19 | if [[ "$PYTHON_VERSION" == "2.7" ]]; then 20 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh; 21 | else 22 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 23 | fi 24 | fi 25 | - bash miniconda.sh -b -p $HOME/miniconda 26 | - export PATH="$HOME/miniconda/bin:$PATH" 27 | - hash -r 28 | - conda config --set always_yes yes --set changeps1 no 29 | - conda update -q conda --yes 30 | - conda create -n py -q --yes pip numpy=$NUMPY_VERSION scipy scikit-learn unidecode nltk chardet cython "python=$PYTHON_VERSION" 31 | - source activate py 32 | - which python 33 | - which pip 34 | - python --version 35 | - pip --version 36 | # command to install dependencies 37 | - pip install coveralls 38 | - python -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')" 39 | - pip install . 
40 | # command to run tests 41 | before_script: 42 | - pip install unittest2 nose 43 | script: bash coverage.sh 44 | after_success: 45 | - coveralls 46 | - pwd 47 | - pip install wheel 48 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 49 | python setup.py bdist_egg; 50 | else 51 | python setup.py bdist_wheel --plat-name=macosx_10_7_x86_64; 52 | fi 53 | - ls dist 54 | 55 | addons: 56 | artifacts: 57 | paths: 58 | - $(ls dist/*.whl | tr "\n" ":") 59 | - $(ls dist/*.egg | tr "\n" ":") 60 | target_paths: 61 | - /$TRAVIS_OS_NAME/$TRAVIS_BUILD_NUMBER 62 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | # For Python versions available on Appveyor, see 4 | # http://www.appveyor.com/docs/installed-software#python 5 | - PYTHON: "C:\\Miniconda" 6 | NUMPY: "1.12.1" 7 | - PYTHON: "C:\\Miniconda-x64" 8 | NUMPY: "1.12.1" 9 | - PYTHON: "C:\\Miniconda36" 10 | NUMPY: "1.12.1" 11 | - PYTHON: "C:\\Miniconda36-x64" 12 | NUMPY: "1.12.1" 13 | # Currently doing manual 3.7 builds due to Appveyor issues 14 | # - PYTHON: "C:\\Miniconda37" 15 | # - PYTHON: "C:\\Miniconda37-x64" 16 | 17 | install: 18 | - "setx path \"%path%;%PYTHON%\\Library\\bin\"" 19 | - "%PYTHON%\\python.exe -m conda update -q --yes conda" 20 | - "%PYTHON%\\python.exe -m conda install -q --yes pip numpy=%NUMPY% scipy scikit-learn unidecode nltk chardet cython sqlite" 21 | - "%PYTHON%\\python.exe -c \"import nltk; nltk.download('stopwords'); nltk.download('punkt')\"" 22 | - "%PYTHON%\\python.exe -m pip install unittest2 nose" 23 | - "%PYTHON%\\python.exe -m pip install ." 24 | 25 | build: off 26 | version: '1.0.{build}' 27 | 28 | test_script: 29 | # Put your test command here. 30 | # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, 31 | # you can remove "build.cmd" from the front of the command, as it's 32 | # only needed to support those cases. 33 | # Note that you must use the environment variable %PYTHON% to refer to 34 | # the interpreter you're using - Appveyor does not do anything special 35 | # to put the Python evrsion you want to use on PATH. 36 | - "%PYTHON%\\python.exe setup.py test" 37 | 38 | after_test: 39 | # This step builds your wheels. 40 | # Again, you only need build.cmd if you're building C extensions for 41 | # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct 42 | # interpreter 43 | - "%PYTHON%\\python.exe setup.py bdist_wheel" 44 | 45 | artifacts: 46 | # bdist_wheel puts your built wheel in the dist directory 47 | - path: dist\* 48 | 49 | #on_success: 50 | # You can use this step to upload your artifacts to a public website. 51 | # See Appveyor's documentation for more details. Or you can simply 52 | # access your wheels from the Appveyor "artifacts" tab for your build. 
53 | -------------------------------------------------------------------------------- /unit_tests/tests_tfidf.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from scipy.sparse import coo_matrix 4 | from vsm.model import tfidf 5 | from vsm.model import tf 6 | 7 | 8 | class TestTfIdf(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.corpus = np.array([0, 1, 3, 1, 1, 0, 3, 0, 3, 12 | 3, 0, 1, 0, 13 | 3, 14 | 1, 3]) 15 | self.docs = [slice(0,9), slice(9,13), 16 | slice(13,14), slice(14,16)] 17 | self.V = 4 18 | 19 | self.tf_mat = coo_matrix(np.array([[3, 2, 0, 0], 20 | [3, 1, 0, 1], 21 | [0, 0, 0, 0], 22 | [3, 1, 1, 1]], dtype=int)) 23 | self.tfidf_mat = np.array(\ 24 | [[2.0794415, 1.3862944, 0, 0], 25 | [0.86304623, 0.28768209, 0, 0.28768209], 26 | [0, 0, 0, 0], 27 | [0, 0, 0, 0]]) 28 | self.undefined_rows = [2] 29 | 30 | 31 | def test_TfIdf_train(self): 32 | m = tfidf.TfIdf() 33 | m.train() 34 | self.assertTrue(m.matrix.size == 0) 35 | self.assertTrue(len(m.undefined_rows) == 0) 36 | 37 | m = tfidf.TfIdf(tf_matrix=self.tf_mat) 38 | m.train() 39 | np.testing.assert_almost_equal(self.tfidf_mat, m.matrix.toarray()) 40 | self.assertEqual(m.undefined_rows, self.undefined_rows) 41 | 42 | def test_TfIdf_from_tf(self): 43 | tf_model = tf.TF() 44 | tf_model.corpus = self.corpus 45 | tf_model.docs = self.docs 46 | tf_model.V = self.V 47 | tf_model.train() 48 | self.assertTrue((self.tf_mat == tf_model.matrix.toarray()).all()) 49 | 50 | m = tfidf.TfIdf.from_tf(tf_model) 51 | self.assertTrue((m.matrix == tf_model.matrix.toarray()).all()) 52 | m.train() 53 | np.testing.assert_almost_equal(self.tfidf_mat, m.matrix.toarray()) 54 | self.assertEqual(m.undefined_rows, self.undefined_rows) 55 | 56 | 57 | #Define and run test suite 58 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTfIdf) 59 | unittest.TextTestRunner(verbosity=2).run(suite) 60 | -------------------------------------------------------------------------------- /vsm/viewer/types.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import int 3 | from past.builtins import basestring 4 | import numpy as np 5 | 6 | 7 | __all__ = ['isfloat', 'isint', 'isstr', 8 | 'res_doc_type', 'res_top_type', 'res_word_type'] 9 | 10 | 11 | 12 | # 13 | # Rudimentary type-checking fns 14 | # 15 | 16 | 17 | def isfloat(x): 18 | """ 19 | Returns True if `x` is an instance of a float. 20 | """ 21 | return (isinstance(x, np.inexact) or isinstance(x, float)) 22 | 23 | 24 | def isint(x): 25 | """ 26 | Returns True if `x` is an instance of an int. 27 | """ 28 | return (isinstance(x, np.integer) or isinstance(x, int)) 29 | 30 | 31 | def isstr(x): 32 | """ 33 | Returns True if `x` is an instance of a string. 34 | """ 35 | return isinstance(x, basestring) or isinstance(x, np.flexible) 36 | 37 | 38 | # 39 | # fns to resolve input polymorphism to the dist_*_* fns 40 | # 41 | 42 | 43 | def res_doc_type(corp, context_type, label_name, doc): 44 | """ 45 | If `doc` is a string or a dict, performs a look up for its 46 | associated integer. If `doc` is a dict, looks for its label. 47 | Finally, if `doc` is an integer, stringifies `doc` for use as 48 | a label. 49 | 50 | Returns an integer, string pair: (, ). 
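    (That is, the document's integer index paired with its string label.)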
52 | """ 53 | if isstr(doc): 54 | query = {label_name: doc} 55 | d = corp.meta_int(context_type, query) 56 | elif isinstance(doc, dict): 57 | d = corp.meta_int(context_type, doc) 58 | 59 | #TODO: Define an exception for failed queries in 60 | #vsm.corpus. Use it here. 61 | doc = corp.view_metadata(context_type)[label_name][d] 62 | else: 63 | d, doc = doc, str(doc) 64 | 65 | return d, doc 66 | 67 | 68 | def res_top_type(topic_or_topics): 69 | """ 70 | If `topic_or_topics` is an int, then returns it in a list. 71 | """ 72 | if isint(topic_or_topics): 73 | topic_or_topics = [topic_or_topics] 74 | 75 | return topic_or_topics 76 | 77 | 78 | def res_word_type(corp, word): 79 | """ 80 | If `word` is a string, performs a look up for its associated 81 | integer. Otherwise, stringifies `word`. 82 | 83 | Returns an integer, string pair: (, ). 84 | """ 85 | if isstr(word): 86 | return corp.words_int[word], word 87 | 88 | return word, str(word) 89 | -------------------------------------------------------------------------------- /unit_tests/tests_tf.py: -------------------------------------------------------------------------------- 1 | from builtins import object 2 | import unittest 3 | import numpy as np 4 | 5 | from vsm.model import tf 6 | from multiprocessing import Process 7 | import platform 8 | 9 | class MPTester(object): 10 | def setUp(self): 11 | self.corpus = np.array([0, 1, 3, 1, 1, 0, 3, 0, 3, 12 | 3, 0, 1, 0, 13 | 1, 3]) 14 | self.docs = [slice(0,9), slice(9,13), 15 | slice(13,13), slice(13,15)] 16 | self.V = 4 17 | self.cnt_mat = np.array([[3, 2, 0, 0], 18 | [3, 1, 0, 1], 19 | [0, 0, 0, 0], 20 | [3, 1, 0, 1]]) 21 | 22 | def test_TfMulti_train(self): 23 | self.setUp() 24 | m = tf.TfMulti() 25 | m.corpus = self.corpus 26 | m.docs = self.docs 27 | m.V = self.V 28 | m.train(2) 29 | 30 | assert (self.cnt_mat == m.matrix.toarray()).all() 31 | 32 | class TestTf(unittest.TestCase): 33 | 34 | def setUp(self): 35 | self.corpus = np.array([0, 1, 3, 1, 1, 0, 3, 0, 3, 36 | 3, 0, 1, 0, 37 | 1, 3]) 38 | self.docs = [slice(0,9), slice(9,13), 39 | slice(13,13), slice(13,15)] 40 | self.V = 4 41 | self.cnt_mat = np.array([[3, 2, 0, 0], 42 | [3, 1, 0, 1], 43 | [0, 0, 0, 0], 44 | [3, 1, 0, 1]]) 45 | 46 | def test_TF_proper_class(self): 47 | m = tf.TF(multiprocessing=True) 48 | if platform.system() == 'Windows': 49 | self.assertTrue(isinstance(m,tf.TfSeq)) 50 | else: 51 | self.assertTrue(isinstance(m,tf.TfMulti)) 52 | 53 | def test_TfSeq_train(self): 54 | m = tf.TfSeq() 55 | m.corpus = self.corpus 56 | m.docs = self.docs 57 | m.V = self.V 58 | m.train() 59 | self.assertTrue((self.cnt_mat == m.matrix.toarray()).all()) 60 | 61 | def test_demo_TfMulti_train(self): 62 | t = MPTester() 63 | p = Process(target=t.test_TfMulti_train, args=()) 64 | p.start() 65 | p.join() 66 | 67 | 68 | 69 | #Define and run test suite 70 | if __name__ == '__main__': 71 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTf) 72 | unittest.TextTestRunner(verbosity=2).run(suite) 73 | -------------------------------------------------------------------------------- /vsm/model/beaglecomposite.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | 5 | from vsm.model.base import BaseModel 6 | from vsm.model.beaglecontext import realign_env_mat 7 | 8 | 9 | __all__ = [ 'BeagleComposite' ] 10 | 11 | 12 | class BeagleComposite(BaseModel): 13 | """ 14 | `BeagleComposite` combines the BEAGLE order and context 
model 15 | with a user defined ratio. Default ratio is .5 which weighs 16 | order and context matrices equally. 17 | """ 18 | 19 | def __init__(self, ctx_corp, ctx_matrix, 20 | ord_corp, ord_matrix, context_type='sentence'): 21 | """ 22 | Assume that the context corpus is a subcorpus of the order 23 | corpus and that the eventual composite corpus is the context 24 | corpus. The order matrix is sliced and reordered so that it 25 | aligns with the context matrix. 26 | 27 | :param ctx_corp: Corpus from BEAGLE context model. 28 | :type ctx_corp: :class:`Corpus` 29 | 30 | :param ctx_matrix: BEAGLE context matrix. 31 | :type ctx_matrix: np.ndarray matrix 32 | 33 | :param ord_corp: Corpus from BEAGLE order model. 34 | :type ord_corp: :class:`Corpus` 35 | 36 | :param ord_matrix: BEAGLE order matrix. 37 | :type ord_matrix: np.ndarray matrix 38 | 39 | :param context_type: Name of tokenization stored in `corpus` whose 40 | tokens will be treated as documents. Default is `sentence`. 41 | :type context_type: string, optional 42 | """ 43 | self.ctx_matrix = (ctx_matrix / 44 | ((ctx_matrix**2).sum(1)**0.5)[:,np.newaxis]) 45 | self.ord_matrix = realign_env_mat(ctx_corp, ord_corp, ord_matrix) 46 | self.ord_matrix /= ((self.ord_matrix**2).sum(1)**0.5)[:,np.newaxis] 47 | self.context_type = context_type 48 | 49 | 50 | def train(self, wgt=.5): 51 | """ 52 | Combines the context and order matrices blended by `wgt` ratio. 53 | 54 | :param wgt: The weight of context model. If `wgt` is .7 then 55 | the ratio of context and order model is 7:3. `wgt` should be 56 | a value in [0,1]. Default is .5. 57 | :type wgt: float, optional 58 | 59 | :returns: `None` 60 | """ 61 | print('Summing context and order vectors') 62 | self.matrix = wgt * self.ctx_matrix + (1 - wgt) * self.ord_matrix 63 | 64 | -------------------------------------------------------------------------------- /vsm/split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for splitting lists and arrays 3 | """ 4 | 5 | 6 | from builtins import range 7 | import numpy as np 8 | 9 | 10 | __all__ = ['split_corpus', 'mp_split_ls', 'split_documents'] 11 | 12 | 13 | 14 | def split_corpus(arr, indices): 15 | """ 16 | Splits the given array by the indices into list of sub-arrays. 17 | 18 | :param arr: An array to be split. 19 | :type arr: array 20 | :param indices: 1-dimensional array of integers that indicates 21 | where the array is split. 22 | :type indices: array 23 | 24 | :returns: A list of sub-arrays split at the indices. 25 | 26 | **Examples** 27 | 28 | >>> arr = np.arange(8) 29 | >>> indices = np.array([2,4,7]) 30 | >>> split_corpus(arr, indices) 31 | [array([0,1]), array([2,3]), array([4,5,6]), array([7])] 32 | """ 33 | if len(indices) == 0: 34 | return arr 35 | 36 | if isinstance(indices, list): 37 | indices = np.array(indices) 38 | 39 | out = np.split(arr, indices) 40 | 41 | if (indices >= len(arr)).any(): 42 | out = out[:-1] 43 | try: 44 | for i in range(len(out)): 45 | if out[i].size == 0: 46 | out[i] = np.array([], dtype=arr.dtype) 47 | except AttributeError: 48 | for i in range(len(out)): 49 | if out[i].size == 0: 50 | out[i] = np.array([]) 51 | 52 | return out 53 | 54 | 55 | 56 | def mp_split_ls(ls, n): 57 | """ 58 | Split list into an `n`-length list of arrays. 59 | 60 | :param ls: List to be split. 61 | :type ls: list 62 | 63 | :param n: Number of splits. 64 | :type n: int 65 | 66 | :returns: List of arrays whose length is 'n'. 
67 | 68 | **Examples** 69 | >>> ls = [1,5,6,8,2,8] 70 | >>> mp_split_ls(ls, 4) 71 | [array([1, 5]), array([6, 8]), array([2]), array([8])] 72 | """ 73 | return np.array_split(ls, min(len(ls), n)) 74 | 75 | 76 | def split_documents(corpus, indices, max_partitions): 77 | """ 78 | """ 79 | docs = [(0, indices[0])] 80 | for i in range(len(indices)-1): 81 | docs.append((indices[i], indices[i+1])) 82 | docs = np.array(docs, dtype='i8, i8') 83 | 84 | corpus_chunks = np.array_split(corpus, max_partitions) 85 | chunk_indices = np.cumsum([len(chunk) for chunk in corpus_chunks]) 86 | doc_indices = np.searchsorted(indices, chunk_indices, side='right') 87 | doc_partitions = np.split(docs, doc_indices[:-1]) 88 | 89 | doc_partitions = [part for part in doc_partitions if part.size] 90 | 91 | return doc_partitions 92 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, Command, find_packages 2 | import platform 3 | import numpy 4 | 5 | from Cython.Build import cythonize 6 | 7 | 8 | # find packages in vsm subdirectory 9 | # this will skip the unittests, etc. 10 | packages = ['vsm.'+pkg for pkg in find_packages('vsm')] 11 | packages.append('vsm') 12 | 13 | install_requires=[ 14 | 'chardet', 15 | 'cython', 16 | 'future', 17 | 'matplotlib', 18 | 'nltk', 19 | 'numpy', 20 | 'progressbar2', 21 | 'py4j', 22 | 'scikit_learn', 23 | 'scipy', 24 | 'sortedcontainers', 25 | 'translate', 26 | 'Unidecode', 27 | ] 28 | 29 | if platform.python_version_tuple()[0] == '2': 30 | install_requires.append("futures>=3.0.0") 31 | install_requires.append("backports.tempfile==1.0") 32 | 33 | setup( 34 | name = "vsm", 35 | version = "1.0.0b1", 36 | description = ('Vector Space Semantic Modeling Framework '\ 37 | 'for the Indiana Philosophy Ontology Project'), 38 | author = "The Indiana Philosophy Ontology (InPhO) Project", 39 | author_email = "inpho@indiana.edu", 40 | url = "http://inpho.cogs.indiana.edu/", 41 | download_url = "http://www.github.com/inpho/vsm", 42 | keywords = [], 43 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 44 | classifiers = [ 45 | "Programming Language :: Python", 46 | "Programming Language :: Python :: 2", 47 | "Programming Language :: Python :: 3", 48 | "Development Status :: 5 - Production/Stable", 49 | "Environment :: Console", 50 | "Intended Audience :: Developers", 51 | "Intended Audience :: Science/Research", 52 | "License :: OSI Approved :: MIT License", 53 | "Operating System :: OS Independent", 54 | "Topic :: Software Development :: Libraries :: Python Modules", 55 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 56 | "Topic :: Text Processing :: Linguistic", 57 | ], 58 | install_requires=install_requires, 59 | license = 'MIT', 60 | packages=packages, 61 | ext_modules = cythonize( 62 | Extension( 63 | "vsm.model._cgs_update", 64 | sources=["vsm/model/_cgs_update.pyx"], 65 | include_dirs=[numpy.get_include()] 66 | ) 67 | ), 68 | zip_safe = False, 69 | package_data = {'vsm': ['vsm/model/_cgs_update.pyx']}, 70 | dependency_links=['https://inpho.cogs.indiana.edu/pypi/pymmseg/'], 71 | 72 | test_suite = "unittest2.collector", 73 | tests_require=['unittest2'], 74 | ) 75 | -------------------------------------------------------------------------------- /unit_tests/tests_beaglecontext.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins 
import range 3 | import unittest 4 | import numpy as np 5 | 6 | 7 | 8 | class TestBeagleContext(unittest.TestCase): 9 | 10 | def setUp(self): 11 | from vsm.corpus.util.corpusbuilders import random_corpus 12 | from vsm.model.beaglecontext import BeagleContextSeq, BeagleContextMulti 13 | from vsm.model.beagleenvironment import BeagleEnvironment 14 | 15 | self.ec = random_corpus(1000, 50, 0, 5, context_type='sentence') 16 | self.cc = self.ec.apply_stoplist(stoplist=[str(i) for i in range(0,50,7)]) 17 | 18 | self.e = BeagleEnvironment(self.ec, n_cols=5) 19 | self.e.train() 20 | 21 | self.ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix) 22 | self.ms.train() 23 | ''' 24 | self.mm = BeagleContextMulti(self.cc, self.ec, self.e.matrix) 25 | self.mm.train(n_procs=2) 26 | ''' 27 | 28 | 29 | def test_BeagleContextSeq(self): 30 | from tempfile import NamedTemporaryFile 31 | import os 32 | 33 | from vsm.model.beaglecontext import BeagleContextSeq 34 | try: 35 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 36 | self.ms.save(tmp.name) 37 | tmp.close() 38 | m1 = BeagleContextSeq.load(tmp.name) 39 | self.assertTrue((self.ms.matrix == m1.matrix).all()) 40 | 41 | finally: 42 | os.remove(tmp.name) 43 | 44 | 45 | ''' 46 | def test_BeagleContextMulti(self): 47 | from tempfile import NamedTemporaryFile 48 | import os 49 | 50 | from vsm.model.beaglecontext import BeagleContextMulti 51 | try: 52 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 53 | self.mm.save(tmp.name) 54 | tmp.close() 55 | m1 = BeagleContextMulti.load(tmp.name) 56 | self.assertTrue((self.mm.matrix == m1.matrix).all()) 57 | 58 | finally: 59 | os.remove(tmp.name) 60 | 61 | 62 | 63 | def test_compare(self): 64 | 65 | print 'Training single processor model' 66 | ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix) 67 | ms.train() 68 | 69 | print 'Training multiprocessor model' 70 | mm = BeagleContextMulti(self.cc, self.ec, self.e.matrix) 71 | mm.train() 72 | 73 | self.assertTrue(np.allclose(ms.matrix, mm.matrix)) 74 | ''' 75 | 76 | #Define and run test suite 77 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleContext) 78 | unittest.TextTestRunner(verbosity=2).run(suite) 79 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/org/knowceans/gibbstest/LDA.java: -------------------------------------------------------------------------------- 1 | package org.knowceans.gibbstest; 2 | 3 | import java.text.DecimalFormat; 4 | import java.text.NumberFormat; 5 | import java.io.IOException; 6 | import java.util.*; 7 | import java.io.StringWriter; 8 | 9 | import py4j.GatewayServer; 10 | 11 | 12 | public class LDA { 13 | 14 | private static FileArrayProvider fap; 15 | private static LdaGibbsSampler lda; 16 | 17 | public LDA(String corpFile) throws IOException { 18 | 19 | FileArrayProvider fap = new FileArrayProvider(); 20 | this.fap = fap; 21 | 22 | int[][] documents = fap.readFile(corpFile); 23 | 24 | List vli = new ArrayList(); 25 | for (int[] d : documents) { 26 | for (int i : d) { 27 | if (!vli.contains(i)) { 28 | vli.add(i); 29 | } 30 | } 31 | } 32 | 33 | int V = vli.size(); 34 | int M = documents.length; 35 | System.out.println("V, M "+ V + " " + M); 36 | LdaGibbsSampler lda = new LdaGibbsSampler(documents, V); 37 | 38 | this.lda = lda; 39 | } 40 | 41 | public LdaGibbsSampler getLda() { 42 | return this.lda; 43 | } 44 | 45 | public FileArrayProvider getFap() { 46 | return this.fap; 47 | } 48 | 49 | public static void sample(int iter, int K, double alpha, double 
beta) { 50 | // configure(iter, burnin, thinInterval, sampleLag) default values 51 | // from LdaGibbsSampler example. 52 | lda.configure(iter, 2000, 100, 10); 53 | lda.gibbs(K, alpha, beta); 54 | } 55 | 56 | public static void main(String[] args) throws IOException { 57 | // Note: iter=1000 returns NaN for all phi, theta 58 | String corpfile = args[0]; 59 | LDA ldai = new LDA(corpfile); 60 | GatewayServer gatewayServer = new GatewayServer(ldai); 61 | gatewayServer.start(); 62 | System.out.println("Gateway Server Started!"); 63 | } 64 | 65 | public void writeMeta(int iter, int K, double alpha, double beta, 66 | String metaFile) throws IOException { 67 | 68 | String s = ""; 69 | s += "K," + K + "\n"; 70 | s += "iteration," + iter + "\n"; 71 | s += "m_words," + this.getLda().V + "\n"; 72 | s += "doc_prior," + alpha + "\n"; 73 | s += "top_prior," + beta + "\n"; 74 | // add dummy values 75 | s += "inv_top_sums," + "0.0\n"; 76 | s += "log_probs," + "0.0\n"; 77 | 78 | this.getFap().writeStrFile(s, metaFile); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /vsm/model/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from builtins import object 3 | import numpy as np 4 | 5 | __all__ = ['BaseModel'] 6 | 7 | 8 | 9 | 10 | class BaseModel(object): 11 | """ 12 | Base class for models which store data in a single matrix. 13 | 14 | :param matrix: A two-dimensional numpy array storing the results 15 | of model training. Default is `None`. 16 | :type matrix: numpy.ndarray, optional 17 | 18 | :param context_type: A string specifying the type of context over 19 | which the model trainer is applied. Default is `None`. 20 | :type context_type: string, optional 21 | 22 | :attributes: 23 | Same as parameters. 24 | 25 | :methods: 26 | * **save** 27 | Takes a filename or file object and saves `self.matrix` 28 | in an npz archive. 29 | * **load** 30 | Takes a filename or file object and loads it as an npz 31 | archive into a BaseModel object. 32 | 33 | :See Also: :meth:`numpy.savez`, :meth:`numpy.load` 34 | """ 35 | def __init__(self, matrix=None, context_type=None): 36 | self.matrix = matrix 37 | self.context_type = context_type 38 | 39 | def save(self, f): 40 | """ 41 | Takes a filename or file object and saves `self.matrix` in an 42 | npz archive. 43 | 44 | :param file: Designates the file to which to save data. See 45 | `numpy.savez` for further details. 46 | :type file: str-like or file-like object 47 | 48 | :returns: `None` 49 | 50 | :See Also: :meth:`numpy.savez` 51 | """ 52 | print('Saving model to {}'.format(f)) 53 | np.savez(f, matrix=np.array(self.matrix), context_type=np.array(self.context_type)) 54 | 55 | 56 | @staticmethod 57 | def load(f): 58 | """ 59 | Takes a filename or file object and loads it as an npz archive 60 | into a BaseModel object. 61 | 62 | :param file: Designates the file to read. If `file` is a string 63 | ending in `.gz`, the file is first gunzipped. See `numpy.load` 64 | for further details. 65 | :type file: str-like or file-like object 66 | 67 | :returns: A dictionary storing the data found in `file`. 
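        (Concretely, the `matrix` and `context_type` entries in the archive
        are unpacked into a new `BaseModel` instance.)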
68 | 69 | :See Also: :meth:`numpy.load` 70 | """ 71 | print('Loading model from {}'.format(f)) 72 | npz = np.load(f) 73 | 74 | # The slice [()] is to unwrap sparse matrices, which get saved 75 | # in singleton object arrays 76 | return BaseModel(matrix=npz['matrix'], context_type=npz['context_type']) 77 | -------------------------------------------------------------------------------- /unit_tests/tests_viewer_wrappers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.wrappers import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | 9 | 10 | class TestViewerWrappers(unittest.TestCase): 11 | 12 | # TODO: Rewrite these to be independent of LDA 13 | pass 14 | 15 | # def setUp(self): 16 | 17 | # from vsm.corpus.util.corpusbuilders import random_corpus 18 | # from vsm.model.ldacgsseq import LdaCgsSeq 19 | 20 | # self.c = random_corpus(1000, 50, 0, 20, context_type='sentence', 21 | # metadata=True) 22 | 23 | # self.m = LDAGibbs(self.c, 'sentence', K=40) 24 | # self.m.train(n_iterations=50) 25 | 26 | 27 | # def test_dist_(self): 28 | 29 | # li = [0,1] 30 | 31 | # swt = dist_word_top(self.c, self.m.word_top.T, '0') 32 | # swtl = dist_word_top(self.c, self.m.word_top.T, ['0','1'], order='i') 33 | # sww = dist_word_word(self.c, self.m.word_top, '0') 34 | # swwl = dist_word_word(self.c, self.m.word_top, ['0','1'], order='i') 35 | # std = dist_top_doc(self.c, self.m.top_doc.T, 0, 'sentence', order='i') 36 | # stdl = dist_top_doc(self.c, self.m.top_doc.T, li, 'sentence') 37 | # sdd = dist_doc_doc(self.c, self.m.top_doc, self.m.context_type, 0) 38 | # sddl = dist_doc_doc(self.c, self.m.top_doc, self.m.context_type, li) 39 | # stt = dist_top_top(self.m.word_top.T, 1) 40 | # sttl = dist_top_top(self.m.word_top.T, li) 41 | 42 | # self.assertEqual(type(swt), LabeledColumn) 43 | # self.assertEqual(type(swtl), LabeledColumn) 44 | # self.assertEqual(type(sww), LabeledColumn) 45 | # self.assertEqual(type(swwl), LabeledColumn) 46 | # self.assertEqual(type(std), LabeledColumn) 47 | # self.assertEqual(type(stdl), LabeledColumn) 48 | # self.assertEqual(type(sdd), LabeledColumn) 49 | # self.assertEqual(type(sddl), LabeledColumn) 50 | # self.assertEqual(type(stt), LabeledColumn) 51 | # self.assertEqual(type(sttl), LabeledColumn) 52 | 53 | 54 | # def test_dismat_(self): 55 | 56 | # dismatw = dismat_word(['0','2','5'], self.c, self.m.word_top) 57 | # dismatd = dismat_doc([0,1,2], self.c, self.m.context_type, 58 | # self.m.top_doc) 59 | # dismatt = dismat_top([0,1,2], self.m.word_top) 60 | 61 | # self.assertEqual(type(dismatw), IndexedSymmArray) 62 | # self.assertEqual(type(dismatd), IndexedSymmArray) 63 | # self.assertEqual(type(dismatt), IndexedSymmArray) 64 | 65 | 66 | 67 | #Define and run test suite 68 | suite = unittest.TestLoader().loadTestsFromTestCase(TestViewerWrappers) 69 | unittest.TextTestRunner(verbosity=2).run(suite) 70 | -------------------------------------------------------------------------------- /unit_tests/tests_labeleddata.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import zip 3 | from builtins import range 4 | from past.builtins import basestring 5 | 6 | import unittest 7 | import numpy as np 8 | 9 | from vsm.viewer.labeleddata import * 10 | 11 | 12 | class TestLabeleddata(unittest.TestCase): 13 | 14 | def setUp(self): 15 | 16 | words = ['row', 'row', 'row', 'your', 'boat', 'gently', 'down', 'the', 17 | 
'stream', 'merrily', 'merrily', 'merrily', 'merrily', 'life', 18 | 'is', 'but', 'a', 'dream'] 19 | values = [np.random.random() for t in words] 20 | d = [('i', np.array(words).dtype), 21 | ('value', np.array(values).dtype)] 22 | self.v = np.array(list(zip(words, values)), dtype=d) 23 | 24 | 25 | 26 | def test_LabeledColumn(self): 27 | 28 | arr = self.v.view(LabeledColumn) 29 | arr.subcol_headers = ['Word', 'Value'] 30 | arr.col_header = 'Song lets make this longer than subcol headers' 31 | arr.col_len = 10 32 | arr1 = self.v.view(LabeledColumn) 33 | 34 | self.assertTrue(isinstance(arr.__str__(), basestring)) 35 | self.assertTrue(sum(arr.subcol_widths) <= arr.col_width) 36 | self.assertEqual(arr.shape[0], arr1.col_len) 37 | self.assertFalse(arr1.col_header) 38 | self.assertFalse(arr1.subcol_headers) 39 | 40 | 41 | def test_DataTable(self): 42 | 43 | v = LabeledColumn(self.v) 44 | v.subcol_widths = [30, 20] 45 | v.col_len = 10 46 | t = [] 47 | for i in range(5): 48 | t.append(v.copy()) 49 | t[i].col_header = 'Iteration ' + str(i) 50 | 51 | schc = ['Topic', 'Word'] 52 | schf = ['Word', 'Value'] 53 | t = DataTable(t, 'Song', subcolhdr_compact=schc, subcolhdr_full=schf) 54 | 55 | self.assertTrue(isinstance(t.__str__(), basestring)) 56 | self.assertTrue('Song', t.table_header) 57 | 58 | t.compact_view = False 59 | self.assertTrue(isinstance(t.__str__(), basestring)) 60 | self.assertTrue('Song', t.table_header) 61 | 62 | 63 | 64 | def test_IndexedSymmArray(self): 65 | 66 | from vsm.corpus.util.corpusbuilders import random_corpus 67 | from vsm.model.ldacgsseq import LdaCgsSeq 68 | from vsm.viewer.ldacgsviewer import LdaCgsViewer 69 | 70 | c = random_corpus(50000, 1000, 0, 50) 71 | m = LdaCgsSeq(c, 'document', K=20) 72 | viewer = LdaCgsViewer(c, m) 73 | 74 | li = [0, 1, 10] 75 | isa = viewer.dismat_top(li) 76 | 77 | self.assertEqual(isa.shape[0], len(li)) 78 | 79 | 80 | 81 | 82 | #Define and run test suite 83 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLabeleddata) 84 | unittest.TextTestRunner(verbosity=2).run(suite) 85 | -------------------------------------------------------------------------------- /vsm/extensions/trans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | import nltk 5 | import re 6 | import os 7 | from unidecode import unidecode 8 | from translate import Translator as Ts 9 | from vsm.corpus.util import * 10 | 11 | """ 12 | Uses `translate` python module from https://pypi.python.org/pypi/translate 13 | """ 14 | 15 | def sent_tokenize(text, lang='english'): 16 | tokenizer = nltk.data.load('tokenizers/punkt/{0}.pickle'.format(lang)) 17 | return tokenizer.tokenize(text) 18 | 19 | def cleanup(s): 20 | s = rehyph(s) 21 | s = s.strip('\n') 22 | def replace(match): 23 | if match: 24 | if match.group(0).startswith(r'\*'): 25 | return unidecode(match.group(0)) 26 | else: 27 | return '' 28 | 29 | return re.sub(r"[\x90-\xff]", replace, s) 30 | 31 | 32 | def transwrapper(text, from_lang, to_lang): 33 | 34 | if from_lang == 'en': 35 | lang = 'english' 36 | elif from_lang == 'fr': 37 | lang = 'french' 38 | elif from_lang == 'de': 39 | lang = 'german' 40 | sli = sent_tokenize(text, lang=lang) 41 | 42 | out = '' 43 | for sent in sli: 44 | sent = cleanup(sent) 45 | 46 | ts = Ts(from_lang=from_lang, to_lang=to_lang) 47 | target = ts.translate(sent) 48 | out += target 49 | 50 | return out 51 | 52 | 53 | if __name__=="__main__": 54 | frompath = 'darwin-de/' 55 | topath = 
'darwin-de-translate/' 56 | 57 | books = os.listdir(frompath) 58 | books.sort() 59 | 60 | for book in books: 61 | book_path = os.path.join(frompath, book) 62 | print(book_path) 63 | pages = os.listdir(book_path) 64 | pages.sort() 65 | 66 | for page in pages: 67 | page_name = os.path.join(book_path, page) 68 | 69 | with open(page_name, 'r') as f: 70 | try: 71 | out = transwrapper(f.read(), 'de', 'en') 72 | out = out.encode('utf-8') 73 | except: 74 | out = '' 75 | print(page_name, ' failed translation.') 76 | 77 | try: 78 | os.mkdir(os.path.join(topath, book)) 79 | except OSError: 80 | pass 81 | topage = os.path.join(topath, book, page) 82 | #with open(topage, 'w') as fout: 83 | """ 84 | # for individual pages 85 | fin = 'darwin-de/wu.89101307601/00000636.txt' 86 | fout = 'darwin-de-translate/wu.89101307601/00000636.txt' 87 | 88 | with open(fin, 'r') as f: 89 | out = transwrapper(f.read(), 'de', 'en') 90 | out = out.encode('utf-8') 91 | with open(fout, 'w') as fo: 92 | fo.write(out) 93 | fout.write(out)""" 94 | -------------------------------------------------------------------------------- /vsm/extensions/clustering/plotting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def gen_colors(clusters): 4 | """ 5 | Takes 'clusters' and creates a list of colors so a cluster has a color. 6 | 7 | :param clusters: A flat list of integers where an integer represents which 8 | cluster the information belongs to. 9 | :type clusters: list 10 | 11 | :returns: colorm : list 12 | A list of colors obtained from matplotlib colormap cm.hsv. The 13 | length of 'colorm' is the same as the number of distinct 14 | clusters. 15 | """ 16 | import matplotlib.cm as cm 17 | 18 | n = len(set(clusters)) 19 | colorm = [cm.hsv(i * 1.0 /n, 1) for i in xrange(n)] 20 | return colorm 21 | 22 | 23 | def plot_clusters(arr, labels, clusters=[], size=[]): 24 | """ 25 | Takes 2-dimensional array(simmat), list of clusters, list of labels, 26 | and list of marker size. 'clusters' should be a flat list which can be 27 | obtained from cluster_topics(by_cluster=False). 28 | Plots each clusters in different colors. 29 | 30 | :type arr: 2-dimensional array 31 | :param arr: Array has x, y coordinates to be plotted on a 2-dimensional 32 | space. 33 | 34 | :param labels: List of labels to be displayed in the graph. 35 | :type labels: list 36 | 37 | :param clusters: A flat list of integers where an integer represents which 38 | cluster the information belongs to. If not given, it returns a 39 | basic plot with no color variation. Default is an empty list. 40 | :type clusters: list, optional 41 | 42 | :param size: List of markersize for points where markersize can note the 43 | importance of the point. If not given, 'size' is a list of 44 | fixed markersize, 40. Default is an empty list. 45 | :type size: list, optional 46 | 47 | :returns: plt : maplotlit.pyplot object 48 | A graph with scatter plots from 'arr'. 
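    **Examples**

    A minimal sketch (run under Python 2, since this module still uses
    `xrange`): five 2-D points split across two clusters.

    >>> arr = np.random.random((5, 2))
    >>> labels = ['a', 'b', 'c', 'd', 'e']
    >>> plot_clusters(arr, labels, clusters=[0, 0, 0, 1, 1])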
49 | """ 50 | import matplotlib.pyplot as plt 51 | 52 | n = arr.shape[0] 53 | X = arr[:,0] 54 | Y = arr[:,1] 55 | 56 | if len(size) == 0: 57 | size = [40 for i in xrange(n)] 58 | 59 | fig = plt.figure(figsize=(10,10)) 60 | ax = plt.subplot(111) 61 | 62 | if len(clusters) == 0: 63 | plt.scatter(X, Y, size) 64 | else: 65 | colors = gen_colors(clusters) 66 | colors = [colors[i] for i in clusters] 67 | 68 | for i in xrange(n): 69 | plt.scatter(X[i], Y[i], size, color=colors[i]) 70 | 71 | ax.set_xlim(np.min(X) - .1, np.max(X) + .1) 72 | ax.set_ylim(np.min(Y) - .1, np.max(Y) + .1) 73 | ax.set_xticks([]) 74 | ax.set_yticks([]) 75 | 76 | for label, x, y in zip(labels, X, Y): 77 | plt.annotate(label, xy = (x, y), xytext=(-2, 3), 78 | textcoords='offset points', fontsize=10) 79 | 80 | plt.show() 81 | -------------------------------------------------------------------------------- /unit_tests/tests_corpus_util.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | import unittest 3 | 4 | from vsm.corpus import add_metadata 5 | from vsm.extensions.corpusbuilders.util import * 6 | import numpy as np 7 | 8 | class TestCorpusUtil(unittest.TestCase): 9 | 10 | def test_strip_punc(self): 11 | 12 | tsent = ['foo-foo',',','3','foo','bars','bar_foo','2to1','.'] 13 | out = strip_punc(tsent) 14 | self.assertEqual(out, ['foofoo','3','foo','bars','barfoo','2to1']) 15 | 16 | 17 | def test_rem_num(self): 18 | 19 | tsent = ['foo-foo',',','3','foo','bars','2-parts','2-to-1','3words','.'] 20 | out = rem_num(tsent) 21 | self.assertEqual(out, ['foo-foo',',','foo','bars','-parts','-to-','words','.']) 22 | 23 | def test_rehyph(self): 24 | 25 | sent = 'foo foo 3 foo--bars barfoo -- 2to1.' 26 | out = rehyph(sent) 27 | self.assertEqual(out, 'foo foo 3 foo - bars barfoo - 2to1.') 28 | 29 | def test_add_metadata(self): 30 | 31 | from vsm.corpus.util.corpusbuilders import random_corpus 32 | 33 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) 34 | n = c.view_metadata('sentence').size 35 | meta = ['m_{0}'.format(i) for i in range(n)] 36 | new_c = add_metadata(c, 'sentence', 'new_meta', meta) 37 | 38 | self.assertEqual(new_c.view_metadata('sentence')['new_meta'].tolist(), meta) 39 | 40 | 41 | def test_apply_stoplist(self): 42 | 43 | from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist 44 | 45 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) 46 | new_c = apply_stoplist(c, nltk_stop=False, add_stop=['0','1'], 47 | freq=0, in_place=False) 48 | 49 | li = [[],['he','said'],['he','said','bar'],['bar','ate'],['I','foo']] 50 | wc = corpus_fromlist(li, context_type='sentence') 51 | new_wc = apply_stoplist(wc, nltk_stop=True, freq=1, in_place=False) 52 | 53 | self.assertTrue('0' in c.words) 54 | self.assertTrue('1' in c.words) 55 | self.assertFalse('0' in new_c.words) 56 | self.assertFalse('1' in new_c.words) 57 | 58 | self.assertTrue('said' in new_wc.words) 59 | self.assertTrue('bar' in new_wc.words) 60 | self.assertFalse('he' in new_wc.words) 61 | self.assertFalse('foo' in new_wc.words) 62 | self.assertFalse('ate' in new_wc.words) 63 | 64 | 65 | def test_filter_by_suffix(self): 66 | 67 | li = ['a.txt', 'b.json', 'c.txt'] 68 | filtered = filter_by_suffix(li, ['.txt']) 69 | filtered1 = filter_by_suffix(li, ['.json']) 70 | filtered2 = filter_by_suffix(li, ['.csv']) 71 | 72 | self.assertEqual(filtered, ['b.json']) 73 | self.assertEqual(filtered1, ['a.txt','c.txt']) 74 | self.assertEqual(filtered2, li) 75 
| 76 | 77 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCorpusUtil) 78 | unittest.TextTestRunner(verbosity=2).run(suite) 79 | -------------------------------------------------------------------------------- /vsm/extensions/testdata/history_greek_philosophy/__init__.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | 3 | 4 | __all__ = [ 'doc_files', 'doc_meta_file', 5 | 'documents', 'document_metadata', 6 | 'corpus', 'paragraphs', 'doc_label_fn' ] 7 | 8 | 9 | 10 | _doc_files = [ 'frontmatter.json', 'chapter1.json', 'chapter2.json', 11 | 'chapter3.json', 'chapter4.json', 'chapter5.json', 12 | 'chapter6.json', 'chapter7.json', 'chapter8.json', 13 | 'chapter9.json', 'chapter10.json', 'chapter11.json', 14 | 'chapter12.json', 'chapter13.json', 'chapter14.json', 15 | 'chapter15.json', 'chapter16.json', 'chapter17.json', 16 | 'chapter18.json', 'chapter19.json', 'chapter20.json', 17 | 'chapter21.json', 'chapter22.json', 'backmatter.json' ] 18 | 19 | doc_files = [os.path.join(os.path.dirname(__file__), f) 20 | for f in _doc_files] 21 | 22 | 23 | doc_meta_file = os.path.join(os.path.dirname(__file__), 'doc_meta.json') 24 | 25 | 26 | def document_metadata(): 27 | """Returns an iterator over document metadata in corpus. 28 | 29 | """ 30 | with open(doc_meta_file, 'r') as f: 31 | doc_meta_all = json.load(f) 32 | for docs_meta in doc_meta_all: 33 | for doc_meta in docs_meta: 34 | yield doc_meta 35 | 36 | 37 | def documents(): 38 | """Returns an iterator over documents paired with their metadata. 39 | 40 | """ 41 | m = document_metadata() 42 | 43 | for doc_file in doc_files: 44 | with open(doc_file, 'r') as f: 45 | docs = json.load(f) 46 | for doc in docs: 47 | yield doc, m.next() 48 | 49 | 50 | def paragraphs(): 51 | """Returns iterator over paragraphs and associated metadata. 52 | 53 | """ 54 | import copy 55 | import vsm.ext.corpusbuilders.util as util 56 | 57 | docs = documents() 58 | for doc, meta in docs: 59 | p = 0 60 | pars = util.paragraph_tokenize(doc) 61 | for par in pars: 62 | par_meta = copy.deepcopy(meta) 63 | par_meta['paragraph'] = p 64 | p += 1 65 | yield par, par_meta 66 | 67 | 68 | def corpus(doc_type='document', unidecode=True, nltk_stop=True, 69 | stop_freq=0, add_stop=None): 70 | """Returns Corpus object containing text data and metadata. 71 | 72 | """ 73 | from vsm.ext.corpusbuilders import corpus_from_strings 74 | 75 | if doc_type=='document': 76 | docs = documents() 77 | elif doc_type=='paragraphs': 78 | docs = paragraphs() 79 | else: 80 | raise Exception('Unrecognized document type given.') 81 | 82 | docs, meta = zip(*list(docs)) 83 | 84 | return corpus_from_strings(docs, meta, 85 | unidecode=unidecode, 86 | nltk_stop=nltk_stop, 87 | stop_freq=stop_freq, 88 | add_stop=add_stop) 89 | 90 | 91 | def doc_label_fn(metadata): 92 | label = metadata['part_of_book'] 93 | return label 94 | -------------------------------------------------------------------------------- /vsm/model/ldaexact.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | from itertools import product 5 | from ldacgsseq import LdaCgsSeq 6 | 7 | 8 | __all__ = [ 'LdaExact' ] 9 | 10 | 11 | 12 | def uniquify(l): 13 | """Takes a list `l` and returns a list of the unique elements in `l` 14 | in the order in which they appeared. 
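    For example:

    >>> uniquify([3, 1, 3, 2, 1])
    [3, 1, 2]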
15 | 16 | """ 17 | mem = set([]) 18 | out = [] 19 | for e in l: 20 | if e not in mem: 21 | mem.add(e) 22 | out.append(e) 23 | return out 24 | 25 | 26 | def productoid(A, n): 27 | 28 | prod = product(A, repeat=n) 29 | d = dict((i, A[:i]) for i in xrange(1, len(A)+1)) 30 | 31 | for t in prod: 32 | elems = uniquify(t) 33 | if elems == d[len(elems)]: 34 | yield t 35 | 36 | 37 | class LdaExact(LdaCgsSeq): 38 | 39 | 40 | @property 41 | def arg_maxima(self): 42 | if hasattr(self, '_arg_maxima'): 43 | return self._arg_maxima 44 | return [] 45 | 46 | 47 | @arg_maxima.setter 48 | def arg_maxima(self, l): 49 | self._arg_maxima = l 50 | 51 | 52 | def _Z_values(self): 53 | 54 | A = range(self.K) 55 | p = productoid(A, len(self.corpus)) 56 | for t in p: 57 | yield np.array(t, dtype=int) 58 | 59 | 60 | def _init_model(self, Z): 61 | m = LdaCgsSeq(context_type=self.context_type, 62 | K=self.K, V=self.V, alpha=self.alpha, beta=self.beta) 63 | m.corpus = self.corpus 64 | m.V = self.V 65 | m.indices = self.indices 66 | m.Z = Z 67 | m._compute_top_doc() 68 | m._compute_word_top() 69 | m.inv_top_sums = 1. / self.word_top.sum(0) 70 | m.iteration = 1 71 | m.log_probs = [(1, m._compute_log_prob())] 72 | return m 73 | 74 | 75 | def _log_probs(self): 76 | 77 | Z = self._Z_values() 78 | 79 | for next_Z in Z: 80 | m = self._init_model(next_Z) 81 | yield (next_Z, m.log_probs[0][1]) 82 | 83 | 84 | def arg_max(self, verbose=1): 85 | 86 | max_log_prob = -np.inf 87 | maxima = [] 88 | 89 | log_probs = self._log_probs() 90 | for (Z, log_prob) in log_probs: 91 | if log_prob == max_log_prob: 92 | maxima.append((Z, log_prob)) 93 | elif log_prob > max_log_prob: 94 | max_log_prob = log_prob 95 | maxima = [(Z, log_prob)] 96 | 97 | self.arg_maxima = maxima 98 | self.Z = maxima[0][0] 99 | self._compute_top_doc() 100 | self._compute_word_top() 101 | self.inv_top_sums = 1. / self.word_top.sum(0) 102 | self.iteration = 1 103 | self.log_probs = [(1, max_log_prob)] 104 | 105 | if verbose > 0: 106 | print('Number of maxima:', len(self.arg_maxima)) 107 | 108 | 109 | def all_estimates(self): 110 | 111 | for (Z, log_prob) in self.arg_maxima: 112 | yield self._init_model(Z) 113 | 114 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/org/knowceans/gibbstest/FileReadWrite.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | 5 | def lda_save(ctx_type, phifile, thetafile, zfile, restfile, modelfile): 6 | # Reads data from too many files. 
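    # Assembles the sampler output scattered across these files into the
    # .npz layout used by vsm's LdaCgs models (iteration, log_probs, Z,
    # top_doc, word_top, inv_top_sums, context_type, K, m_words,
    # doc_prior, top_prior) and writes it out with np.savez.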
7 | dic = file_to_dict(restfile) 8 | top_doc = file_to_mat(thetafile) 9 | word_top = file_to_mat(phifile) 10 | 11 | arrays_out = dict() 12 | arrays_out['iteration'] = int(dic['iteration']) 13 | dt = dtype=[('i', int), ('v', float)] 14 | logs = [float(dic['log_probs'])] * int(dic['iteration']) 15 | indices = range(0, int(dic['iteration'])) 16 | arrays_out['log_probs'] = np.array(zip(indices, logs), dtype=dt) 17 | arrays_out['Z'] = list(file_to_arrli(zfile)) 18 | arrays_out['top_doc'] = top_doc.T 19 | arrays_out['word_top'] = word_top.T 20 | arrays_out['inv_top_sums'] = np.array([float(dic['inv_top_sums'])] 21 | * word_top.shape[1]) 22 | arrays_out['context_type'] = ctx_type 23 | arrays_out['K'] = int(dic['K']) 24 | arrays_out['m_words'] = int(dic['m_words']) 25 | arrays_out['doc_prior'] = np.array([float(dic['doc_prior'])] 26 | * top_doc.size)#.reshape(top_doc.shape) 27 | arrays_out['top_prior'] = np.array([float(dic['top_prior'])] 28 | * word_top.size)#.reshape(word_top.shape) 29 | 30 | print('Saving LDA model to', modelfile) 31 | np.savez(modelfile, **arrays_out) 32 | 33 | 34 | def file_to_dict(filename): 35 | """ 36 | Reads a file where each line is 'k,v' 37 | and returns a dictionary of k,v. 38 | """ 39 | dic = dict() 40 | with open(filename, 'r') as f: 41 | lines = f.readlines() 42 | for l in lines: 43 | l = l.strip('\n') 44 | li = l.split(',') 45 | dic[li[0]] = li[1] 46 | return dic 47 | 48 | 49 | def write_file(corpus, ctx_type, filename): 50 | """ 51 | Writes corpus.view_contexts(ctx_type) to a file txt. 52 | """ 53 | ctx = corpus.view_contexts(ctx_type) # [arrays,] 54 | 55 | with open(filename, 'w') as f: 56 | for arr in ctx: 57 | for i in arr: 58 | f.write(str(i)) 59 | f.write('\n') 60 | f.write('\n') 61 | 62 | 63 | def file_to_mat(filename): 64 | """ 65 | Data to an array. works for theta, phi. 66 | Removes automatically added 'missing values' at the end of the rows. 67 | """ 68 | arr = np.genfromtxt(filename, delimiter=',') 69 | 70 | return arr[:,:-1] 71 | 72 | 73 | def file_to_arrli(filename, dtype='int'): 74 | """ 75 | for Z, list of arrays where each array represents a document 76 | and the array has topic assignment for each word position in the document. 77 | Length of the array varies as it depends on the length of the 78 | corresponding document. 
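    Each line of the file is a comma-separated list of topic assignments,
    e.g. '0,3,3,1' for a four-word document.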
79 | """ 80 | 81 | with open(filename, 'r') as f: 82 | lines = f.readlines() 83 | 84 | docs = [] 85 | for l in lines: 86 | l = l.strip('\n') 87 | arr = np.fromstring(l, dtype=dtype, sep=',') 88 | docs.append(arr) 89 | 90 | return docs 91 | 92 | 93 | -------------------------------------------------------------------------------- /unit_tests/tests_ldacgsviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.ldacgsviewer import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | class TestLdaCgsViewer(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.corpus.util.corpusbuilders import random_corpus 13 | from vsm.model.ldacgsseq import LdaCgsSeq 14 | 15 | c = random_corpus(1000, 50, 0, 20, context_type='document', 16 | metadata=True) 17 | 18 | m = LdaCgsSeq(c, 'document', K=10) 19 | m.train(n_iterations=50, verbose=0) 20 | 21 | self.ldav = LdaCgsViewer(c, m) 22 | 23 | 24 | def test_LdaCgsViewer(self): 25 | 26 | li = [0,1] 27 | 28 | t = self.ldav.topics(compact_view=False) 29 | te = self.ldav.topic_entropies() 30 | swt = self.ldav.dist_word_top('0', compact_view=False) 31 | 32 | dt = self.ldav.doc_topics(0) 33 | dt_ = self.ldav.doc_topics(0) 34 | wt = self.ldav.word_topics('0') 35 | stt = self.ldav.dist_top_top(1) 36 | sttl = self.ldav.dist_top_top(li) 37 | std = self.ldav.dist_top_doc(0) 38 | stdl = self.ldav.dist_top_doc(li) 39 | sdd = self.ldav.dist_doc_doc(0) 40 | sddl = self.ldav.dist_doc_doc(li) 41 | 42 | t_c = self.ldav.topics() 43 | te_c = self.ldav.topic_entropies() 44 | swt_c = self.ldav.dist_word_top('1') 45 | 46 | dismatd = self.ldav.dismat_doc() 47 | dismatt = self.ldav.dismat_top() 48 | 49 | self.assertEqual(type(t), DataTable) 50 | self.assertEqual(type(te), LabeledColumn) 51 | self.assertEqual(type(swt), DataTable) 52 | 53 | self.assertEqual(type(dt), LabeledColumn) 54 | self.assertEqual(type(dt_), LabeledColumn) 55 | self.assertEqual(type(wt), LabeledColumn) 56 | self.assertEqual(type(stt), DataTable) 57 | self.assertEqual(type(sttl), DataTable) 58 | self.assertEqual(type(std), LabeledColumn) 59 | self.assertEqual(type(stdl), LabeledColumn) 60 | self.assertEqual(type(sdd), LabeledColumn) 61 | self.assertEqual(type(sddl), LabeledColumn) 62 | 63 | self.assertEqual(type(t_c), DataTable) 64 | self.assertEqual(type(te_c), LabeledColumn) 65 | self.assertEqual(type(swt_c), DataTable) 66 | 67 | self.assertEqual(type(dismatd), IndexedSymmArray) 68 | self.assertEqual(type(dismatt), IndexedSymmArray) 69 | 70 | def test_LdaCgsViewer_topics_args(self): 71 | # test calls of ldav.topics() 72 | t = self.ldav.topics() 73 | self.assertEqual(type(t), DataTable) 74 | self.assertEqual(len(t), self.ldav.model.K) 75 | 76 | with self.assertRaises(ValueError): 77 | self.ldav.topics(2) 78 | 79 | t=self.ldav.topics([2]) 80 | self.assertEqual(type(t), DataTable) 81 | self.assertEqual(len(t), 1) 82 | 83 | t = self.ldav.topics([2,4]) 84 | self.assertEqual(type(t), DataTable) 85 | self.assertEqual(len(t), 2) 86 | 87 | 88 | 89 | #Define and run test suite 90 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLdaCgsViewer) 91 | unittest.TextTestRunner(verbosity=2).run(suite) 92 | -------------------------------------------------------------------------------- /unit_tests/tests_structarr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm import * 5 | from vsm.structarr import * 6 | 7 | class 
TestCore(unittest.TestCase): 8 | 9 | def test_arr_add_field(self): 10 | 11 | arr = np.array([(1, '1'), (2, '2'), (3, '3')], 12 | dtype=[('i', int), ('c', '|S1')]) 13 | new_arr = np.array([(1, '1', 0), (2, '2', 0), (3, '3', 0)], 14 | dtype=[('i', int), ('c', '|S1'), ('new', int)]) 15 | 16 | new_field = 'new' 17 | vals = np.zeros(3, dtype=int) 18 | 19 | test_arr = arr_add_field(arr, new_field, vals) 20 | 21 | self.assertTrue(np.array_equiv(new_arr, test_arr)) 22 | self.assertTrue(new_arr.dtype==test_arr.dtype) 23 | 24 | def test_enum_matrix(self): 25 | 26 | arr = np.array([[6,3,7], [2,0,4]], dtype=int) 27 | em1 = enum_matrix(arr) 28 | em2 = enum_matrix(arr, 29 | indices=[10,20,30], 30 | field_name='tens') 31 | 32 | self.assertTrue(np.array_equiv(em1, np.array([[(2,7), (0,6), (1, 3)],[(2,4), (0,2), (1,0)]], 33 | dtype=[('i', int), ('value', int)]))) 34 | self.assertTrue(np.array_equiv(em2, np.array([[(30,7), (10,6), (20, 3)],[(30,4), (10,2), (20,0)]], 35 | dtype=[('tens', int), ('value', int)]))) 36 | 37 | 38 | 39 | def test_enum_sort(self): 40 | 41 | arr = np.array([7,3,1,8,2]) 42 | sorted_arr = enum_sort(arr) 43 | sorted_arr1 = enum_sort(arr, indices=[10,20,30,40,50]) 44 | 45 | self.assertTrue(np.array_equiv(sorted_arr, 46 | np.array([(3, 8), (0, 7), (1, 3), (4, 2), (2, 1)], 47 | dtype=[('i', int), ('value', int)]))) 48 | 49 | self.assertTrue(np.array_equiv(sorted_arr1, 50 | np.array([(40, 8), (10, 7), (20, 3), (50, 2), (30, 1)], 51 | dtype=[('i', int), ('value', int)]))) 52 | 53 | 54 | def test_enum_array(self): 55 | 56 | arr1 = np.array([7,3,1,8,2]) 57 | ea1 = enum_array(arr1) 58 | arr2 = np.array([6,3,7,2,0,4]) 59 | ea2 = enum_array(arr2) 60 | 61 | self.assertTrue(np.array_equiv(ea1, 62 | np.array([(0,7), (1,3), (2,1), (3,8), (4,2)], 63 | dtype=[('i', int), ('value', int)]))) 64 | self.assertTrue(np.array_equiv(ea2, 65 | np.array([(0,6), (1,3), (2,7), (3,2), (4,0), (5,4)], 66 | dtype=[('i', int), ('value', int)]))) 67 | 68 | 69 | def test_zip_arr(self): 70 | 71 | arr1 = np.array([[2,4], [6,8]], dtype=int) 72 | arr2 = np.array([[1,3], [5,7]], dtype=int) 73 | 74 | zipped = zip_arr(arr1, arr2, field_names=['even', 'odd']) 75 | self.assertTrue(np.array_equiv(zipped, np.array([[(2,1), (4,3)], [(6,5), (8,7)]], 76 | dtype=[('even', int), ('odd', int)]))) 77 | 78 | 79 | def test_map_strarr(self): 80 | 81 | arr = np.array([(0, 1.), (1, 2.)], 82 | dtype=[('i', 'i4'), ('v', 'f4')]) 83 | m = ['foo', 'bar'] 84 | arr = map_strarr(arr, m, 'i', new_k='str') 85 | 86 | self.assertTrue(np.array_equal(arr['str'], 87 | np.array(m, dtype=np.array(m).dtype))) 88 | self.assertTrue(np.array_equal(arr['v'], np.array([1., 2.], dtype='f4'))) 89 | 90 | 91 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCore) 92 | unittest.TextTestRunner(verbosity=2).run(suite) 93 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/org/knowceans/gibbstest/FileArrayProvider.java: -------------------------------------------------------------------------------- 1 | package org.knowceans.gibbstest; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | 11 | public class FileArrayProvider { 12 | 13 | public static int[][] readFile(String filename) throws IOException { 14 | /* 15 | Reads file that contains corpus.view_contexts(ctx_type), 16 | list of arrays. This returns the int[][] for LdaGibbsSampler 17 | documents. 
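        Expected format: one integer word id per line, with a blank line
        marking the end of each document (the format written by
        FileReadWrite.write_file).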
18 | */ 19 | FileReader fileReader = new FileReader(filename); 20 | BufferedReader bufferedReader = new BufferedReader(fileReader); 21 | 22 | // List lines = new ArrayList(); 23 | List> lines = new ArrayList>(); 24 | String line = null; 25 | 26 | List a = new ArrayList(); 27 | while ((line = bufferedReader.readLine()) != null) { 28 | if (line.length() > 0) { // String 29 | int item = Integer.parseInt(line); 30 | a.add(item); 31 | } else { 32 | lines.add(a); 33 | a = new ArrayList(); 34 | } 35 | } 36 | bufferedReader.close(); 37 | 38 | int[][] arr = new int[lines.size()][]; 39 | for (int i=0; i subli = lines.get(i); 42 | int[] blankarr = new int[subli.size()]; 43 | for (int j=0; j Z 53 | """ 54 | r = len(samples) 55 | mat = np.zeros((r, r)) 56 | 57 | for n_ in xrange(n): 58 | # select integer d from distr[0] to distr[1] 59 | d = randrange(distr[0],distr[1]+1) 60 | km = KMeans(n_clusters=d, init='k-means++', 61 | max_iter=100, n_init=1,verbose=False) 62 | km.fit(samples) 63 | labels = km.labels_ 64 | 65 | for i in range(r): 66 | for j in range(i,r): 67 | if labels[i] == labels[j]: 68 | mat[i][j] += 1 69 | 70 | mat = mat + mat.T 71 | mat /= n 72 | 73 | cutplot = np.zeros((n+1 ,2), dtype='f2<') 74 | for l in xrange(n+1): 75 | # Construct graph for which mat[i][j] > l/n 76 | graph = mat > 1.0 * l /n 77 | 78 | n_comp, labels = cs.cs_graph_components(graph) 79 | cutplot[l][0] = l * 1.0 /n 80 | cutplot[l][1] = n_comp 81 | return mat, cutplot 82 | 83 | 84 | def find_cutoff(cutplot, n_cls=None): 85 | """ 86 | Finds the weight cutoff based on the longest run in cutplot. 87 | If n_cls is provided, finds the cutoff point where n_cls 88 | clusters are formed.""" 89 | from itertools import groupby 90 | 91 | if n_cls != None: 92 | for c in cutplot: 93 | if c[1] == n_cls: 94 | return c[0] 95 | 96 | group = groupby(cutplot[:,1]) 97 | val = max(group, key=lambda k: len(list(k[1])))[0] 98 | 99 | for c in cutplot: 100 | if c[1] == val: 101 | return c[0] + 0.01 102 | 103 | 104 | def category_mat(samples, mat, cutplot, cutoff=None): 105 | """ 106 | Predicts the category for each data point 107 | """ 108 | if cutoff == None: 109 | cutoff = find_cutoff(cutplot) 110 | # Build a new graph on samples with edges mat[i][j] > cutoff 111 | newG = mat > cutoff 112 | n_comp, labels = cs.cs_graph_components(newG) 113 | 114 | return labels 115 | 116 | -------------------------------------------------------------------------------- /vsm/model/tfidf.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from builtins import range 3 | 4 | import numpy as np 5 | from scipy.sparse import csr_matrix 6 | 7 | from vsm.model.base import BaseModel 8 | 9 | 10 | __all__ = [ 'TfIdf' ] 11 | 12 | 13 | class TfIdf(BaseModel): 14 | """ 15 | Transforms a term-frequency model into a term-frequency 16 | inverse-document-frequency model. 17 | 18 | A TF-IDF model is term frequency model whose rows, corresponding 19 | to word types, are scaled by IDF values. The idea is that a word 20 | type which occurs in most of the contexts (i.e., documents) does 21 | less to distinguish the contexts semantically than does a word 22 | type which occurs in few of the contexts. The document frequency 23 | is the number of documents in which a word occurs divided by the 24 | number of documents. The IDF is the log of the inverse of the 25 | document frequency. 26 | 27 | As with a term-frequency model, word types correspond to matrix 28 | rows and contexts correspond to matrix columns. 
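    As a small worked example of the weighting implemented in `train`
    below: with 100 documents, a word type occurring in 10 of them has
    its row scaled by log(100/10) ~= 2.3, whereas a word type occurring
    in all 100 documents is scaled by log(1) = 0.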
29 | 30 | The data structure is a sparse float matrix. 31 | 32 | :See Also: :class:`vsm.model.TfSeq`, :class:`vsm.model.base`, 33 | :class:`scipy.sparse.coo_matrix` 34 | 35 | :notes: 36 | A zero in the matrix might arise in two ways: (1) the word type 37 | occurs in every document, in which case the IDF value is 0; (2) 38 | the word type occurs in no document at all, in which case the IDF 39 | value is undefined. 40 | """ 41 | def __init__(self, corpus=None, context_type=None, tf_matrix=None): 42 | """ 43 | Initialize TfIdf. 44 | 45 | :param corpus: A Corpus object containing the training data. 46 | :type corpus: Corpus 47 | 48 | :param context_type: A string specifying the type of context over 49 | which the model trainer is applied. 50 | :type context_type: string 51 | 52 | :param tf_matrix: A matrix containing the term-frequency data. 53 | :type tf_matrix: scipy.sparse matrix 54 | """ 55 | 56 | self.context_type = context_type 57 | if corpus is not None: 58 | self.corpus = corpus.corpus 59 | else: 60 | self.corpus = [] 61 | 62 | if tf_matrix is None: 63 | self.matrix = csr_matrix([], dtype=np.float64) 64 | else: 65 | self.matrix = tf_matrix.copy() 66 | self.matrix = self.matrix.tocsr() 67 | self.matrix = self.matrix.astype(np.float64) 68 | 69 | self.undefined_rows = [] 70 | 71 | 72 | def train(self): 73 | """ 74 | Computes the IDF values for the input term-frequency matrix, 75 | scales the rows by these values and stores the results in 76 | `self.matrix`. 77 | """ 78 | if self.matrix.size > 0: 79 | n_docs = np.float64(self.matrix.shape[1]) 80 | 81 | for i in range(self.matrix.indptr.shape[0] - 1): 82 | 83 | start = self.matrix.indptr[i] 84 | stop = self.matrix.indptr[i + 1] 85 | 86 | if start == stop: 87 | self.undefined_rows.append(i) 88 | else: 89 | row = self.matrix.data[start:stop] 90 | row *= np.log(n_docs / np.count_nonzero(row)) 91 | start = stop 92 | 93 | @staticmethod 94 | def from_tf(tf_model): 95 | """ 96 | Takes a `Tf` model object and generates a `TfIdf` model. 97 | """ 98 | model = TfIdf(tf_matrix=tf_model.matrix) 99 | model.corpus = tf_model.corpus 100 | model.context_type = tf_model.context_type 101 | return model 102 | -------------------------------------------------------------------------------- /vsm/extensions/interop/ldac.py: -------------------------------------------------------------------------------- 1 | """ 2 | `vsm.extensions.interop.ldac` 3 | 4 | Module containing functions for import/export between VSM and lda-c, which is 5 | the original LDA implementation referenced in Blei, Ng, and Jordan (2003). 6 | lda-c is available at: `` 7 | """ 8 | import os 9 | import os.path 10 | 11 | from scipy.stats import itemfreq 12 | import numpy as np 13 | 14 | from vsm.extensions.corpusbuilders import corpus_fromlist 15 | 16 | 17 | def export_corpus(corpus, outfolder, context_type='document'): 18 | """ 19 | Converts a vsm.corpus.Corpus object into a lda-c compatible data file. 20 | Creates two files: 21 | 1. "vocab.txt" - contains the integer-word mappings 22 | 2. "corpus.dat" - contains the corpus object in the format described in 23 | the `lda-c documentation`_: 24 | 25 | Under LDA, the words of each document are assumed exchangeable. 26 | Thus, each document is succinctly represented as a sparse vector 27 | of word counts. The data is a file where each line is of the form: 28 | 29 | [M] [term_1]:[count] [term_2]:[count] ... 
[term_N]:[count] 30 | 31 | where [M] is the number of unique terms in the document, and the 32 | [count] associated with each term is how many times that term 33 | appeared in the document. Note that [term_1] is an integer 34 | which indexes the term; it is not a string. 35 | 36 | :param corpus: VSM Corpus object to convert to lda-c file 37 | :type corpus: vsm.corpus.Corpus 38 | 39 | :param outfolder: Directory to output "vocab.txt" and "corpus.dat" 40 | :type string: path 41 | 42 | .. _lda-c documentation: http://www.cs.princeton.edu/~blei/lda-c/readme.txt 43 | """ 44 | if not os.path.exists(outfolder): 45 | os.makedirs(outfolder) 46 | 47 | vocabfilename = os.path.join(outfolder, 'vocab.txt') 48 | with open(vocabfilename, 'w') as vocabfile: 49 | for word in corpus.words: 50 | vocabfile.write(word + '\n') 51 | 52 | corpusfilename = os.path.join(outfolder, 'corpus.dat') 53 | with open(corpusfilename, 'w') as corpusfile: 54 | for ctx in corpus.view_contexts(context_type): 55 | M = len(np.unique(ctx)) 56 | corpusfile.write("{0}".format(M)) 57 | 58 | for token in itemfreq(ctx): 59 | corpusfile.write(" {term}:{count}".format( 60 | term=token[0], count=token[1])) 61 | 62 | corpusfile.write("\n") 63 | 64 | 65 | def import_corpus(corpusfilename, vocabfilename, context_type='document'): 66 | """ 67 | Converts an lda-c compatible data file into a VSM Corpus object. 68 | 69 | :param corpusfilename: path to corpus file, as defined in lda-c 70 | documentation. 71 | :type string: 72 | 73 | :param vocabfilename: path to vocabulary file, one word per line 74 | :type string: 75 | """ 76 | # process vocabulary file 77 | with open(vocabfilename) as vocabfile: 78 | vocab = [line.strip() for line in vocabfile] 79 | 80 | # process corpus file 81 | corpus = [] 82 | with open(corpusfilename) as corpusfile: 83 | for line in corpusfile: 84 | tokens = line.split()[1:] 85 | ctx = [] 86 | for token in tokens: 87 | id, count = token.split(':') 88 | id = int(id) 89 | count = int(count) 90 | ctx.extend([vocab[id]] * count) 91 | corpus.append(ctx) 92 | 93 | return corpus_fromlist(corpus, context_type=context_type) 94 | 95 | 96 | def import_model(filename): 97 | pass 98 | 99 | 100 | def export_model(filename): 101 | pass 102 | -------------------------------------------------------------------------------- /vsm/model/lda.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides a convenient alias for the LdaCgs* classes 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | from builtins import str 7 | from builtins import object 8 | import platform # For Windows workaround 9 | import warnings 10 | 11 | 12 | __all__ = [ 'LDA' ] 13 | 14 | class LDA(object): 15 | """ 16 | Depending on the boolean parameter `multiprocessing`, returns and 17 | initializes an instance of either LdaCgsSeq or LdaCgsMulti. 18 | 19 | Note that on Windows platforms, `multiprocessing` is not implemented. 20 | In contrast to LdaCgsMulti, LDA always returns a valid object. Instead 21 | of raising a NotImplementedError, LDA issues a RuntimeWarning, notifying 22 | the user the sequental algorithm is being used. When `seed_or_seeds` is a 23 | list in this instance, only the first seed is used. 
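    A minimal usage sketch (hedged: `random_corpus` and the training call
    mirror this repository's unit tests, and the exact `train` keywords of
    LdaCgsMulti may differ)::

        from vsm.extensions.corpusbuilders import random_corpus

        c = random_corpus(1000, 50, 0, 20, context_type='document')
        m = LDA(c, 'document', K=10, multiprocessing=True)  # LdaCgsMulti off Windows
        m.train(n_iterations=50)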
24 | """ 25 | def __new__(cls, 26 | corpus=None, context_type=None, 27 | K=20, V=0, alpha=[], beta=[], 28 | multiprocessing=False, seed_or_seeds=None, n_proc=None): 29 | 30 | kwargs = dict(corpus=corpus, context_type=context_type, 31 | K=K, V=V, alpha=alpha, beta=beta) 32 | 33 | if multiprocessing and platform.system() != 'Windows': 34 | if n_proc is not None: 35 | kwargs['n_proc'] = n_proc 36 | if seed_or_seeds is not None and not isinstance(seed_or_seeds, int): 37 | kwargs['seeds'] = seed_or_seeds 38 | 39 | 40 | from .ldacgsmulti import LdaCgsMulti 41 | return LdaCgsMulti(**kwargs) 42 | 43 | else: 44 | if multiprocessing and platform.system() == 'Windows': 45 | warnings.warn("""Multiprocessing is not implemented on Windows. 46 | Defaulting to sequential algorithm.""", RuntimeWarning) 47 | 48 | # extract single seed 49 | if seed_or_seeds is not None and not isinstance(seed_or_seeds, int): 50 | seed_or_seeds = seed_or_seeds[0] 51 | warnings.warn("Windows is using only the first seed: " + 52 | str(seed_or_seeds), RuntimeWarning) 53 | 54 | # parse seed_or_seeds argument 55 | if isinstance(seed_or_seeds, int): 56 | kwargs['seed'] = seed_or_seeds 57 | elif seed_or_seeds is not None: 58 | raise ValueError("LDA(seed_or_seeds, ...) must take an" + 59 | "integer in single-threaded mode.") 60 | 61 | from .ldacgsseq import LdaCgsSeq 62 | return LdaCgsSeq(**kwargs) 63 | 64 | @staticmethod 65 | def load(filename, multiprocessing=False, n_proc=None): 66 | """ 67 | A static method for loading a saved LdaCgsMulti model. 68 | 69 | :param filename: Name of a saved model to be loaded. 70 | :type filename: string 71 | 72 | :returns: m : LdaCgsMulti object 73 | 74 | :See Also: :class:`numpy.load` 75 | """ 76 | from .ldafunctions import load_lda 77 | from .ldacgsmulti import LdaCgsMulti 78 | from .ldacgsseq import LdaCgsSeq 79 | 80 | if multiprocessing and platform.system() != 'Windows': 81 | return load_lda(filename, LdaCgsMulti) 82 | else: 83 | if multiprocessing and platform.system() == 'Windows': 84 | warnings.warn("""Multiprocessing is not implemented on Windows. 85 | Defaulting to sequential algorithm.""", RuntimeWarning) 86 | m = load_lda(filename, LdaCgsSeq) 87 | try: 88 | if m.n_proc: 89 | print("reloading with multiprocessing support") 90 | m = load_lda(filename, LdaCgsMulti) 91 | except AttributeError: 92 | pass 93 | 94 | return m 95 | -------------------------------------------------------------------------------- /vsm/model/ldacgs.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import time 5 | from ldafunctions import load_lda, save_lda, init_priors 6 | 7 | # import pyximport; pyximport.install() 8 | from _ldacgs import cgs 9 | 10 | 11 | __all__ = [ 'LdaCgs' ] 12 | 13 | 14 | 15 | class LdaCgs(object): 16 | """ 17 | """ 18 | def __init__(self, corpus=None, context_type=None, 19 | K=20, V=0, alpha=[], beta=[]): 20 | """ 21 | Initialize LdaCgs. 22 | 23 | :param corpus: Source of observed data. 24 | :type corpus: `Corpus` 25 | 26 | :param context_type: Name of tokenization stored in `corpus` whose tokens 27 | will be treated as documents. 28 | :type context_type: string, optional 29 | 30 | :param K: Number of topics. Default is `20`. 31 | :type K: int, optional 32 | 33 | :param beta: Topic priors. Default is 0.01 for all words. 34 | :type beta: list, optional 35 | 36 | :param alpha: Document priors. Default is a flat prior of 0.01 37 | for all topics. 
38 | :type alpha: list, optional 39 | """ 40 | 41 | self.context_type = context_type 42 | self.K = K 43 | 44 | if corpus: 45 | self.V = corpus.words.size 46 | self.indices = corpus.view_contexts(self.context_type, 47 | as_indices=True) 48 | self.indices = np.array(self.indices, dtype=('i')) 49 | self.corpus = np.array(corpus.corpus, dtype=('i')) 50 | else: 51 | self.V = V 52 | self.indices = np.array([], dtype=('i')) 53 | self.corpus = np.array([], dtype=('i')) 54 | 55 | priors = init_priors(self.V, self.K, beta, alpha) 56 | self.beta, self.alpha = priors 57 | 58 | self.Z = None 59 | self.word_top = None 60 | self.top_doc = None 61 | 62 | self.log_probs = None 63 | self.iteration = 0 64 | 65 | 66 | def train(self, n_iterations=100, n_threads=1, verbose=1): 67 | 68 | seed = np.uint64(0) 69 | 70 | results = cgs(self.K, 71 | self.V, 72 | self.indices, 73 | self.corpus, 74 | self.alpha.reshape(-1,), 75 | self.beta.reshape(-1,), 76 | n_iterations, 77 | n_threads, 78 | seed) 79 | 80 | self.Z = results['Z'] 81 | self.word_top = results['word_top'] 82 | self.top_doc = results['top_doc'] 83 | #TODO: Manage log_probs so that training continuations can be done. 84 | self.log_probs = results['log_probs'] 85 | 86 | 87 | @staticmethod 88 | def load(filename): 89 | return load_lda(filename, LdaCgsSeq) 90 | 91 | 92 | def save(self, filename): 93 | save_lda(self, filename) 94 | 95 | 96 | 97 | ################################################################# 98 | # Demos 99 | ################################################################# 100 | 101 | 102 | def demo_LdaCgs(doc_len=500, V=100000, n_docs=100, 103 | K=20, n_iterations=5, n_threads=1): 104 | 105 | from vsm.extensions.corpusbuilders import random_corpus 106 | 107 | print('Words per document:', doc_len) 108 | print('Words in vocabulary:', V) 109 | print('Documents in corpus:', n_docs) 110 | print('Number of topics:', K) 111 | print('Iterations:', n_iterations) 112 | 113 | c = random_corpus(n_docs*doc_len, V, doc_len, doc_len+1) 114 | 115 | print('Random corpus generated. 
Initializing model.') 116 | m = LdaCgs(c, 'document', K=K) 117 | 118 | print('Begin estimation.') 119 | m.train(n_iterations=n_iterations, n_threads=n_threads) 120 | 121 | return m 122 | -------------------------------------------------------------------------------- /vsm/model/_cgs_update.pyx: -------------------------------------------------------------------------------- 1 | # cython: binding=True 2 | # cython: wraparound=False 3 | # cython: boundscheck=False 4 | # cython: cdivision=True 5 | 6 | import cython 7 | cimport cython 8 | 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | ctypedef np.float32_t NP_FLOAT_t 13 | #TODO: figure out how to use np types in python code 14 | #ctypedef fused NP_FLOAT_t: 15 | # np.float32_t 16 | # np.float64_t 17 | 18 | ctypedef fused CORPUS_t: 19 | unsigned int 20 | unsigned short 21 | 22 | ctypedef fused TOPIC_t: 23 | unsigned short 24 | unsigned char 25 | 26 | cdef extern from "math.h": 27 | float logf(float n) 28 | 29 | @cython.wraparound(False) 30 | @cython.boundscheck(False) 31 | @cython.cdivision(True) 32 | def cgs_update(int itr, 33 | CORPUS_t [:] corpus, 34 | np.ndarray[NP_FLOAT_t, ndim=2] word_top, 35 | np.ndarray[NP_FLOAT_t] inv_top_sums, 36 | np.ndarray[NP_FLOAT_t, ndim=2] top_doc, 37 | TOPIC_t [:] Z, 38 | int [:] indices, 39 | str mtrand_str, 40 | unsigned int [:] mtrand_keys, 41 | int mtrand_pos, 42 | int mtrand_has_gauss, 43 | float mtrand_cached_gaussian): 44 | 45 | cdef int first, last 46 | cdef long stop, doc_len, offset 47 | cdef NP_FLOAT_t r, s 48 | cdef Py_ssize_t i, j, idx, w, t, k 49 | 50 | cdef int V = corpus.shape[0] 51 | cdef int N = indices.shape[0] 52 | cdef int K = word_top.shape[1] 53 | cdef int W = word_top.shape[0] 54 | 55 | cdef NP_FLOAT_t log_p = 0 56 | cdef np.ndarray[NP_FLOAT_t, ndim=2] log_wk = np.log(word_top * inv_top_sums) 57 | cdef np.ndarray[NP_FLOAT_t, ndim=2] log_kd = np.log(top_doc / top_doc.sum(0)) 58 | 59 | cdef object np_random_state = np.random.RandomState() 60 | np_random_state.set_state((mtrand_str, mtrand_keys, 61 | mtrand_pos, mtrand_has_gauss, 62 | mtrand_cached_gaussian)) 63 | cdef np.ndarray[NP_FLOAT_t] samples = np_random_state.uniform(size=V).astype(np.float32) 64 | cdef np.ndarray[NP_FLOAT_t] dist = np.zeros((K,), dtype=np.float32) 65 | 66 | cdef object mtrand_state = np_random_state.get_state() 67 | 68 | 69 | with nogil: 70 | for i in range(N): 71 | 72 | if i==0: 73 | doc_len = indices[0] 74 | offset = 0 75 | else: 76 | offset = indices[i-1] 77 | stop = indices[i] 78 | doc_len = stop - offset 79 | 80 | for j in range(doc_len): 81 | 82 | idx = offset + j 83 | w,k = corpus[idx], Z[idx] 84 | 85 | log_p += log_wk[w, k] + log_kd[k, i] 86 | 87 | if itr > 0: 88 | word_top[w, k] -= 1 89 | s = inv_top_sums[k] 90 | inv_top_sums[k] = s / (1 - s) 91 | top_doc[k, i] -= 1 92 | 93 | t = 0 94 | dist[t] = (inv_top_sums[t] * word_top[w,t] * top_doc[t,i]) 95 | for t in range(1,K): 96 | dist[t] = dist[t-1] + (inv_top_sums[t] * word_top[w,t] * top_doc[t,i]) 97 | 98 | r = samples[idx] * dist[K-1] 99 | for k in range(K): 100 | if r < dist[k]: 101 | break 102 | """ 103 | # This code implements binary search for the right insertion 104 | # point for the probability in the cumulative distribution 105 | first = 0 106 | last = K - 1 107 | while first < last: 108 | k = (first + last) / 2 109 | if r < dist[k]: 110 | last = k 111 | else: 112 | first = k + 1 113 | """ 114 | 115 | word_top[w, k] += 1 116 | s = inv_top_sums[k] 117 | inv_top_sums[k] = s / (1 + s) 118 | top_doc[k, i] += 1 119 | 120 | Z[idx] = (k) 121 | 122 
| return (np.asarray(word_top), np.asarray(inv_top_sums), 123 | np.asarray(top_doc), np.asarray(Z), log_p, 124 | mtrand_state[0], mtrand_state[1], mtrand_state[2], 125 | mtrand_state[3], mtrand_state[4]) 126 | -------------------------------------------------------------------------------- /unit_tests/tests_beagleorder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.model.beagleorder import * 5 | from vsm.model.beagleorder import (reduce_ngrams, rand_pt_unit_sphere, 6 | two_rand_perm) 7 | 8 | 9 | class TestBeagleOrder(unittest.TestCase): 10 | 11 | def setUp(self): 12 | 13 | from vsm.corpus.util.corpusbuilders import random_corpus 14 | from vsm.model.beagleenvironment import BeagleEnvironment 15 | 16 | self.c = random_corpus(1000, 50, 0, 10, context_type='sentence') 17 | 18 | self.e = BeagleEnvironment(self.c, n_cols=100) 19 | self.e.train() 20 | 21 | self.ms = BeagleOrderSeq(self.c, self.e.matrix) 22 | self.ms.train() 23 | ''' 24 | self.mm = BeagleOrderMulti(self.c, self.e.matrix) 25 | self.mm.train(2) 26 | ''' 27 | 28 | 29 | def test_BeagleOrderSeq(self): 30 | from tempfile import NamedTemporaryFile 31 | import os 32 | 33 | try: 34 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 35 | self.ms.save(tmp.name) 36 | tmp.close() 37 | m1 = BeagleOrderSeq.load(tmp.name) 38 | self.assertTrue((self.ms.matrix == m1.matrix).all()) 39 | 40 | finally: 41 | os.remove(tmp.name) 42 | 43 | 44 | ''' 45 | def test_BeagleOrderMulti(self): 46 | 47 | from tempfile import NamedTemporaryFile 48 | import os 49 | 50 | try: 51 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 52 | self.mm.save(tmp.name) 53 | tmp.close() 54 | m1 = BeagleOrderMulti.load(tmp.name) 55 | self.assertTrue((self.mm.matrix == m1.matrix).all()) 56 | 57 | finally: 58 | os.remove(tmp.name) 59 | ''' 60 | 61 | #TODO: Construct a reference result for both models 62 | # def test_compare(self): 63 | 64 | # psi = rand_pt_unit_sphere(self.e.shape[1]) 65 | 66 | # rand_perm = two_rand_perm(self.e.shape[1]) 67 | 68 | # print 'Training single processor model' 69 | # ms = BeagleOrderSeq(self.c, self.e.matrix, psi=psi, rand_perm=rand_perm) 70 | # ms.train() 71 | 72 | # print 'Training multiprocessor model' 73 | # mm = BeagleOrderMulti(self.c, self.e.matrix, psi=psi, rand_perm=rand_perm) 74 | # mm.train() 75 | 76 | # self.assertTrue(np.allclose(ms.matrix, mm.matrix), (ms.matrix, mm.matrix 77 | # )) 78 | 79 | 80 | #TODO: Make into actual unit tests 81 | # def test10(self): 82 | 83 | # import pprint 84 | 85 | # def fn(x,y): 86 | # if isinstance(x, tuple): 87 | # return x + (y,) 88 | # return (x, y) 89 | 90 | # a = np.arange(5) 91 | # print 'array length', a.shape[0] 92 | 93 | # for i in xrange(a.shape[0]): 94 | # n = 3 95 | # print 'ngram length', n 96 | # print 'index', i 97 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 98 | 99 | # for i in xrange(a.shape[0]): 100 | # n = 4 101 | # print 'ngram length', n 102 | # print 'index', i 103 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 104 | 105 | # for i in xrange(a.shape[0]): 106 | # n = 5 107 | # print 'ngram length', n 108 | # print 'index', i 109 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 110 | 111 | 112 | # def test11(self): 113 | 114 | # import pprint 115 | 116 | # def fn(x,y): 117 | # return x + y 118 | 119 | # a = np.arange(5) 120 | # print 'array length', a.shape[0] 121 | 122 | # for i in xrange(a.shape[0]): 123 | # n = 3 124 | # print 'ngram length', n 125 | # print 'index', i 126 | # 
pprint.pprint(reduce_ngrams(fn, a, n, i)) 127 | 128 | # for i in xrange(a.shape[0]): 129 | # n = 4 130 | # print 'ngram length', n 131 | # print 'index', i 132 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 133 | 134 | # for i in xrange(a.shape[0]): 135 | # n = 5 136 | # print 'ngram length', n 137 | # print 'index', i 138 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 139 | 140 | 141 | 142 | #Define and run test suite 143 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleOrder) 144 | unittest.TextTestRunner(verbosity=2).run(suite) 145 | -------------------------------------------------------------------------------- /vsm/extensions/corpusbuilders/corpusstreamers.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | import sys 3 | if sys.version_info[0] == 2: 4 | import backports.tempfile 5 | 6 | from codecs import open 7 | from concurrent.futures import as_completed, ProcessPoolExecutor 8 | import pickle 9 | import tempfile 10 | import os 11 | 12 | from progressbar import ProgressBar, Bar, Percentage 13 | from unidecode import unidecode 14 | 15 | from vsm.extensions.corpusbuilders import corpus_fromlist 16 | from vsm.extensions.corpusbuilders.util import (apply_stoplist, 17 | detect_encoding, word_tokenize) 18 | 19 | IGNORE = ['.json','.log','.pickle', '.DS_Store', '.err', '.npz'] 20 | 21 | def read_file(filename, encoding='utf8', decode=False): 22 | if encoding == 'detect': 23 | encoding = detect_encoding(filename) 24 | 25 | try: 26 | if decode: 27 | with open(filename, mode='r', encoding=encoding) as f: 28 | data = unidecode(f.read()) 29 | else: 30 | with open(filename, mode='r', encoding=encoding) as f: 31 | data = f.read() 32 | except UnicodeDecodeError: 33 | encoding = detect_encoding(filename) 34 | if decode: 35 | with open(filename, mode='r', encoding=encoding) as f: 36 | data = unidecode(f.read()) 37 | else: 38 | with open(filename, mode='r', encoding=encoding) as f: 39 | data = f.read() 40 | 41 | return data 42 | 43 | def tokenize_and_pickle_file(filename, pickle_dir=None, 44 | tokenizer=word_tokenize, encoding='utf8', decode=False): 45 | """ 46 | Tokenizes a file and returns a filename of a PickledWords instance. 
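    Sketch of the intended round trip (the path and pickle directory are
    illustrative only; `corpus_from_files` below is the real caller)::

        fname = tokenize_and_pickle_file('doc1.txt', pickle_dir='/tmp/vsm-x')
        words = PickledWords(fname)  # lazily re-reads the pickled token list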
47 | """ 48 | data = read_file(filename, encoding=encoding, decode=decode) 49 | 50 | corpus = tokenizer(data) 51 | 52 | # dump to picklefile 53 | with tempfile.NamedTemporaryFile(dir=pickle_dir, delete=False) as fp: 54 | pickle.dump(corpus, fp) 55 | filename = fp.name 56 | del corpus 57 | 58 | return filename 59 | 60 | 61 | def corpus_from_files(dir_or_filenames, encoding='utf8', ignore=IGNORE, 62 | nltk_stop=False, stop_freq=0, add_stop=None, decode=False, 63 | verbose=True, simple=False, tokenizer=word_tokenize): 64 | if os.path.isdir(dir_or_filenames): 65 | # go through files in directory, filter hidden files 66 | filenames = [os.path.join(root, path) 67 | for root, dirs, files in os.walk(dir_or_filenames) 68 | for path in files 69 | if not path.startswith('.') 70 | and not any(path.endswith(i) for i in ignore)] 71 | labels = [filename.replace(dir_or_filenames + '/', '') for filename in filenames] 72 | else: 73 | filenames = dir_or_filenames 74 | labels = filenames[:] 75 | 76 | if verbose: 77 | pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(filenames)) 78 | pbar = pbar.start() 79 | n = 0 80 | 81 | if sys.version_info[0] == 2: 82 | TD = backports.tempfile.TemporaryDirectory 83 | else: 84 | TD = tempfile.TemporaryDirectory 85 | with TD(prefix='vsm-') as pickle_dir: 86 | with ProcessPoolExecutor() as executor: 87 | corpus = [executor.submit(tokenize_and_pickle_file, filename, pickle_dir, tokenizer) 88 | for filename in filenames] 89 | if verbose: 90 | for f in as_completed(corpus): 91 | n += 1 92 | pbar.update(n) 93 | 94 | pbar.finish() 95 | corpus = [f.result() for f in corpus] 96 | 97 | corpus = [PickledWords(f) for f in corpus] 98 | corpus = corpus_fromlist(corpus, context_type='document', remove_empty=False) 99 | corpus.context_data[0]['document_label'][:] = labels 100 | 101 | corpus = apply_stoplist(corpus, nltk_stop=nltk_stop, freq=stop_freq) 102 | 103 | return corpus 104 | 105 | class PickledWords: 106 | def __init__(self, filename): 107 | self.file = filename 108 | 109 | with open(self.file, 'rb') as fp: 110 | self.list = pickle.load(fp) 111 | self.len = len(self.list) 112 | del self.list 113 | 114 | def __iter__(self): 115 | with open(self.file, 'rb') as fp: 116 | self.list = pickle.load(fp) 117 | 118 | for i in range(len(self.list)): 119 | yield self.list[i] 120 | 121 | del self.list 122 | 123 | return 124 | 125 | def __len__(self): 126 | return self.len 127 | 128 | def __copy__(self): 129 | return PickledWords(self.file) 130 | -------------------------------------------------------------------------------- /vsm/model/lsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse 3 | 4 | 5 | __all__ = [ 'Lsa' ] 6 | 7 | 8 | class Lsa(object): 9 | """ 10 | """ 11 | 12 | def __init__(self, corpus=None, context_type=None, td_matrix=None): 13 | """ 14 | Initialize Lsa. 15 | 16 | :param corpus: A Corpus object containing the training data. 17 | :type corpus: Corpus, optional 18 | 19 | :param context_type: Name of tokenization whose tokens will be 20 | treated as documents. Default is `None`. 21 | :type context_type: string, optional 22 | 23 | :param td_matrix: Term-Document matrix. Default is `None`. 
24 | :type td_matrix: np.array, optional 25 | """ 26 | 27 | self.word_matrix = None 28 | self.doc_matrix = None 29 | self.eigenvalues = None 30 | self.context_type = context_type 31 | if corpus is not None: 32 | self.corpus = corpus.corpus 33 | else: 34 | self.corpus = [] 35 | 36 | if td_matrix is None: 37 | self.td_matrix = np.array([]) 38 | else: 39 | td_matrix = sparse.coo_matrix(td_matrix) 40 | 41 | # Removing infinite values for SVD 42 | finite_mask = np.isfinite(td_matrix.data) 43 | coo_in = (td_matrix.data[finite_mask], 44 | (td_matrix.row[finite_mask], 45 | td_matrix.col[finite_mask])) 46 | 47 | td_matrix = sparse.coo_matrix(coo_in, shape=td_matrix.shape, 48 | dtype=np.float64) 49 | self.td_matrix = td_matrix.tocsr() 50 | 51 | 52 | def train(self, k_factors=300): 53 | """ 54 | Trains the model. 55 | 56 | :param k_factors: Default is 300. 57 | :type k_factors: int, optional 58 | """ 59 | from scipy.sparse import linalg as linalgs 60 | 61 | u,s,v = np.array([]), np.array([]), np.array([]) 62 | 63 | if self.td_matrix.size > 0: 64 | s = min(self.td_matrix.shape) 65 | if s < k_factors: 66 | k_factors = s - 1 67 | 68 | # print 'Performing sparse SVD' 69 | u, s, v = linalgs.svds(self.td_matrix, k=k_factors) 70 | 71 | indices = s.argsort()[::-1] 72 | self.word_matrix = u[:, indices] 73 | self.eigenvalues = s[indices] 74 | self.doc_matrix = v[indices, :] 75 | 76 | 77 | def save(self, f): 78 | """ 79 | Saves model data as a numpy archive file with extension `npz`. 80 | The keys for the component matrices are `word_matrix`, 81 | `eigenvalues` and `doc_matrix`. 82 | 83 | :param f: Designates the file to which to save data. See 84 | `numpy.savez` for further details. 85 | :type f: str-like or file-like object 86 | 87 | :See Also: :meth:`numpy.savez` 88 | """ 89 | arrays_out = dict() 90 | arrays_out['word_matrix'] = self.word_matrix 91 | arrays_out['eigenvalues'] = self.eigenvalues 92 | arrays_out['doc_matrix'] = self.doc_matrix 93 | arrays_out['context_type'] = self.context_type 94 | np.savez(f, **arrays_out) 95 | 96 | 97 | @staticmethod 98 | def load(f): 99 | """ 100 | Loads LSA model data from a numpy archive file with extension 101 | `npz`. The expected keys for the component matrices are 102 | `word_matrix`, `eigenvalues` and `doc_matrix`. 103 | 104 | :param f: Designates the file from which to load data. See 105 | `numpy.load` for further details. 106 | :type f: str-like or file-like object 107 | 108 | :returns: a saved Lsa model. 109 | 110 | :See Also: :meth:`numpy.load` 111 | """ 112 | arrays_in = np.load(f) 113 | m = Lsa(context_type=arrays_in['context_type']) 114 | m.word_matrix=arrays_in['word_matrix'] 115 | m.eigenvalues=arrays_in['eigenvalues'] 116 | m.doc_matrix=arrays_in['doc_matrix'] 117 | return m 118 | 119 | @staticmethod 120 | def from_tf(tf_model): 121 | """ 122 | Takes a `Tf` model object and generates a `TfIdf` model. 123 | """ 124 | model = Lsa(td_matrix=tf_model.matrix) 125 | model.corpus = tf_model.corpus 126 | model.context_type = tf_model.context_type 127 | return model 128 | 129 | @staticmethod 130 | def from_tfidf(tfidf_model): 131 | """ 132 | Takes a `Tf` model object and generates a `TfIdf` model. 
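        Note that, despite the wording above, the returned object is an
        `Lsa` model whose `td_matrix` is taken from `tfidf_model.matrix`,
        e.g. (illustrative)::

            lsa = Lsa.from_tfidf(tfidf_model)
            lsa.train(k_factors=100)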
133 | """ 134 | model = Lsa(td_matrix=tfidf_model.matrix) 135 | model.corpus = tfidf_model.corpus 136 | model.context_type = tfidf_model.context_type 137 | return model 138 | -------------------------------------------------------------------------------- /vsm/extensions/clustering/manifold.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from builtins import object 3 | import numpy as np 4 | from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering 5 | from sklearn.manifold import Isomap, MDS 6 | from .plotting import plot_clusters 7 | 8 | 9 | __all__ = [ 'Manifold' ] 10 | 11 | 12 | class Manifold(object): 13 | def __init__(self, dismat, labels=None, cls=[], pos=[]): 14 | self.dismat = np.asarray(dismat) 15 | self.labels = labels 16 | self._cls = cls # Clusters info 17 | self.pos = pos 18 | 19 | 20 | def __str__(self): 21 | return self.dismat.__str__() 22 | 23 | 24 | @property 25 | def cls(self): 26 | """ 27 | views clusters as lists 28 | """ 29 | return [[self.labels[i] for i,lab in enumerate(self._cls) if lab == x] 30 | for x in set(self._cls)] 31 | 32 | 33 | # 34 | # Clustering methods 35 | # 36 | def KMeans(self, n_clusters=10, init='k-means++', max_iter=100, 37 | n_init=1, verbose=1, show=True): 38 | """ 39 | Clusters the objects in `dismat` using k-means algorithm. This requires 40 | `pos` be precomputed by `mds` or `isomap`. For parameters of the 41 | algorithms see: 42 | http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans. 43 | html#sklearn.cluster.KMeans 44 | 45 | :param n_clusters: Number of clusters used as the parameter for K-means. 46 | :type n_clusters: int, optional 47 | 48 | :param show: Shows the resulting clusters if true. 49 | :type n_clusters: boolean, optional 50 | """ 51 | 52 | if len(self.pos)==0: 53 | raise Exception('K-Means requires low dimentional coordinates. Try mds() or isomap() first.') 54 | 55 | model = KMeans(n_clusters=n_clusters, init=init, max_iter=max_iter, 56 | n_init=n_init,verbose=verbose).fit(self.pos) 57 | self._cls = model.labels_ 58 | 59 | if show: 60 | return self.cls 61 | 62 | 63 | 64 | def AffinityPropagation(self, show=True): 65 | """ 66 | Clusters objects in `dismat` using affinity propagation algorithm. 67 | 68 | :param show: Shows the resulting clusters if true. 69 | :type n_clusters: boolean, optional 70 | """ 71 | 72 | model = AffinityPropagation(affinity='precomputed').fit(self.dismat) 73 | self._cls = model.labels_ 74 | 75 | if show: 76 | return self.cls 77 | 78 | 79 | 80 | def SpectralClustering(self, n_clusters=10, show=True): 81 | """ 82 | Clusters objects in `dismat` using spectral clustering. 83 | 84 | :param n_clusters: Number of clusters used as the parameter for K-means. 85 | :type n_clusters: int, optional 86 | 87 | :param show: Shows the resulting clusters if true. 88 | :type n_clusters: boolean, optional 89 | """ 90 | 91 | model = SpectralClustering(n_clusters=n_clusters, 92 | affinity='precomputed').fit(self.dismat) 93 | self._cls = model.labels_ 94 | 95 | if show: 96 | return self.cls 97 | 98 | 99 | 100 | # 101 | # Manifold learning methods 102 | # 103 | 104 | def mds(self, n_components=2, dissimilarity='precomputed', show=False): 105 | """ 106 | Calculates lower dimention coordinates using the mds algorithm. 107 | This requires sklearn ver 0.14 due to the dissimilarity argument. 108 | 109 | :param n_components: dimentionality of the reduced space. 
110 | :type n_components: int, optional 111 | 112 | :param show: Shows the calculated coordinates if true. 113 | :type show: boolean, optional 114 | """ 115 | model = MDS(n_components=n_components, dissimilarity=dissimilarity, max_iter=100) 116 | self.pos = model.fit_transform(self.dismat) 117 | 118 | if show: 119 | return self.pos 120 | 121 | 122 | 123 | def isomap(self, n_components=2, n_neighbors=3, show=False): 124 | """ 125 | Calculates lower dimention coordinates using the isomap algorithm. 126 | 127 | :param n_components: dimentionality of the reduced space 128 | :type n_components: int, optional 129 | 130 | :param n_neighbors: Used by isomap to determine the number of neighbors 131 | for each point. Large neighbor size tends to produce a denser map. 132 | :type n_neighbors: int, optional 133 | 134 | :param show: Shows the calculated coordinates if true. 135 | :type show: boolean, optional 136 | """ 137 | 138 | model = Isomap(n_components=n_components, n_neighbors=n_neighbors) 139 | self.pos = model.fit(self.dismat).embedding_ 140 | 141 | if show: 142 | return self.pos 143 | 144 | 145 | 146 | def plot(self, xy = (0,1)): 147 | """ 148 | Outputs 2d embeded plot based on `pos` 149 | 150 | :param xy: specifies the dimsntions of pos to be plotted. 151 | :type xy: tuple, optional 152 | 153 | """ 154 | return plot_clusters(self.pos[:,[xy[0],xy[1]]], self.labels, clusters=self._cls) 155 | -------------------------------------------------------------------------------- /vsm/extensions/mahout/mahout.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import re 4 | import numpy as np 5 | 6 | 7 | def load_help(txtfile): 8 | """ 9 | Returns a list of strings split with ': ' 10 | """ 11 | with open(txtfile, 'r') as f: 12 | s = f.read() 13 | s = re.sub('\n',': ', s) 14 | li = s.split(': ') 15 | return li 16 | 17 | 18 | def load_vals(txtfile): 19 | """ 20 | Loads data from mahout-generated txtfile(topic-term or doc-topic). 21 | Returns a list of dictionaries. 22 | """ 23 | import ast 24 | 25 | data = [] 26 | li = load_help(txtfile) 27 | 28 | for i in xrange(len(li)): 29 | if li[i] == 'Value' and i < len(li)-1: 30 | dic = ast.literal_eval(li[i+1]) 31 | data.append(dic) 32 | return data 33 | 34 | 35 | def build_arr(dictli): 36 | """ 37 | dictli : list of dictionaries 38 | """ 39 | r = len(dictli) 40 | c = len(dictli[0]) 41 | 42 | arr = np.zeros((r,c)) 43 | 44 | for i in xrange(r): 45 | arr[i] = dictli[i].values() 46 | 47 | return arr 48 | 49 | 50 | def load_kv(txtfile): 51 | """ 52 | Returns dictionary equivalent to Corpus.word_int 53 | """ 54 | dic = {} 55 | li = load_help(txtfile) 56 | 57 | for i in xrange(len(li)): 58 | if li[i] == 'Key': 59 | dic[li[i+1]] = int(li[i+3]) 60 | 61 | return dic 62 | 63 | 64 | def make_corpus(txtfile, word_int, as_strings=False): 65 | """ 66 | Returns a list of arrays that represent documents. 
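    Illustrative call (file paths are placeholders; the workflow sketched
    at the bottom of this module shows the intended end-to-end use)::

        words_int = load_kv('dict.txt')
        docs = make_corpus('tokenized-documents/tdocs.txt', words_int)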
67 | """ 68 | corp = [] 69 | li = load_help(txtfile) 70 | 71 | for i in xrange(len(li)): 72 | if li[i] == 'Value': 73 | doc = li[i+1] 74 | doc = doc.strip() 75 | doc = doc.strip('[') 76 | doc = doc.strip(']') 77 | doc = doc.split(', ') 78 | doc = [str(w) for w in doc] 79 | 80 | idoc = [] 81 | for w in doc: 82 | try: 83 | i = word_int[w] 84 | if as_strings: 85 | idoc.append(w) 86 | else: 87 | idoc.append(int(i)) 88 | except: 89 | pass 90 | 91 | corp.append(np.array(idoc)) 92 | 93 | return corp 94 | 95 | 96 | def stopwords(corp, topword): 97 | """ 98 | corp : `Corpus` object 99 | topword : topword (list of dictionaries) from model. 100 | """ 101 | ind = topword[0].keys() 102 | 103 | rem = [] 104 | for w in corp.words: 105 | i = corp.words_int[w] 106 | if i not in ind: 107 | rem.append(w) 108 | 109 | return rem 110 | 111 | 112 | def savez(fname, ctx_type, itr, K, alpha, beta, doc_top, top_word, W): 113 | arrays_out = dict() 114 | 115 | V = top_word.shape[1] 116 | # use mahout-vect-test/tokenized-documents 117 | # mahout-vect-test/dictionary.file-0 118 | corp = np.array(np.hstack(W)) 119 | arrays_out['W_corpus'] = corp 120 | arrays_out['W_indices'] = np.cumsum([a.size for a in W]) 121 | arrays_out['V'] = V # num of Vocabs 122 | 123 | # next 3 lines are dummy values 124 | arrays_out['Z_corpus'] = np.zeros(corp.shape[0]) 125 | arrays_out['Z_indices'] = np.cumsum([a.size for a in W]) 126 | arrays_out['log_prob_init'] = False 127 | 128 | arrays_out['doc_top'] = doc_top 129 | arrays_out['top_word'] = top_word 130 | arrays_out['sum_word_top'] = (V * beta) + np.zeros(K) 131 | 132 | arrays_out['context_type'] = ctx_type 133 | arrays_out['K'] = K 134 | arrays_out['iterations'] = itr 135 | arrays_out['alpha'] = alpha 136 | arrays_out['beta'] = beta 137 | 138 | print('Saving LDA model to ', fname) 139 | np.savez(fname, **arrays_out) 140 | 141 | 142 | """ 143 | if __name__=='__main__': 144 | # workflow 145 | # from vsm.corpus.util.corpupsbuilders import corpus_fromlist 146 | 147 | # Return topword, doctop information from the txt file as arrays. 148 | top_word = load_vals('../../../mahout-lda-test/lda.txt') 149 | doc_top = load_vals('../../../mahout-dt-test/doc-topics.txt') 150 | 151 | arrtw = build_arr(top_word) 152 | arrdt = build_arr(doc_top) 153 | 154 | # dicionary that corresponds to Corpus.words_int 155 | words_int = load_kv('../../../mahout-vect-test/dict.txt') 156 | 157 | # list of arrays that represent documents. 158 | # `wcorp` can be an input to `corpus_fromlist()` to create a `Corpus`. 159 | wcorp = make_corpus('../../../mahout-vect-test/tokenized-documents/tdocs.txt', 160 | words_int, as_strings=True) 161 | 162 | # make `Corpus` object and apply_stoplist to ensure the words 163 | # are exactly the same as the ones in the topword. 164 | wc = corpus_fromlist(wcorp, 'document') 165 | rem = stopwords(wc, top_word) 166 | wc_ = wc.apply_stoplist(rem) 167 | 168 | # Save `Corpus` and LDA model. 
169 | wc_.save('mahout-test.npz') 170 | savez('mahout-test-K5-100.npz', 'document', 100, 5, 0.01, 0.01, arrdt, 171 | arrtw, wc_.view_contexts('document')) 172 | 173 | """ 174 | -------------------------------------------------------------------------------- /vsm/viewer/beagleviewer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from vsm.spatial import angle 4 | from vsm.exceptions import * 5 | 6 | from vsm.viewer.wrappers import * 7 | 8 | 9 | __all__ = ['BeagleViewer'] 10 | 11 | 12 | class BeagleViewer(object): 13 | """ 14 | A class for viewing BEAGLE models. 15 | """ 16 | 17 | def __init__(self, corpus, model): 18 | """ 19 | Initialize BeagleViewer. 20 | 21 | :param corpus: Source of observed data. 22 | :type corpus: :class:`Corpus` 23 | 24 | :param model: One of the Beagle objects. 25 | :type model: BEAGLE model 26 | """ 27 | self.corpus = corpus 28 | self.model = model 29 | 30 | def dist_word_word(self, word_or_words, weights=[], 31 | filter_nan=True, print_len=10, as_strings=True, 32 | dist_fn=angle, order='i'): 33 | """ 34 | Computes and sorts the distances between word(s) and every word. 35 | 36 | :param word_or_words: Query word(s) to which distances are calculated. 37 | :type word_or_words: string or list of strings 38 | 39 | :param weights: Specify weights for each query word in `word_or_words`. 40 | Default uses equal weights (i.e. arithmetic mean) 41 | :type weights: list of floating point, optional 42 | 43 | :param filter_nan: If `True` not a number entries are filtered. 44 | Default is `True`. 45 | :type filter_nan: boolean, optional 46 | 47 | :param print_len: Number of words to be displayed. Default is 10. 48 | :type print_len: int, optional 49 | 50 | :param as_strings: If `True`, returns a list of words as strings rather 51 | than their integer representations. Default is `True`. 52 | :type as_strings: boolean, optional 53 | 54 | :param dist_fn: A distance function from functions in vsm.spatial. 55 | Default is :meth:`angle`. 56 | :type dist_fn: string, optional 57 | 58 | :param order: Order of sorting. 'i' for increasing and 'd' for 59 | decreasing order. Default is 'i'. 60 | :type order: string, optional 61 | 62 | :returns: an instance of :class:`LabeledColumn`. 63 | A 2-dim array containing words and their distances to 64 | `word_or_words`. 65 | 66 | :See Also: :meth:`vsm.viewer.wrappers.dist_word_word` 67 | """ 68 | return dist_word_word(word_or_words, self.corpus, 69 | self.model.matrix.T, weights=weights, 70 | filter_nan=filter_nan, 71 | print_len=print_len, as_strings=True, 72 | dist_fn=dist_fn, order=order) 73 | 74 | 75 | @deprecated_meth("dist_word_word") 76 | def sim_word_word(self, word_or_words, weights=[], 77 | filter_nan=True, print_len=10, as_strings=True, 78 | dist_fn=angle, order='i'): 79 | 80 | self.sim_word_word.__func__.__doc__ = dist_word_word.__doc__ 81 | pass 82 | 83 | 84 | @deprecated_meth("dismat_word") 85 | def simmat_word(self, word_list, dist_fn=angle): 86 | pass 87 | 88 | def dismat_word(self, word_list, dist_fn=angle): 89 | """ 90 | Calculates a distance matrix for a given list of words. 91 | 92 | :param word_list: A list of words whose distance matrix is to be 93 | computed. 94 | :type word_list: list of strings 95 | 96 | :param dist_fn: A distance function from functions in vsm.spatial. 97 | Default is :meth:`angle`. 98 | :type dist_fn: string, optional 99 | 100 | :returns: an instance of :class:`IndexedSymmArray`. 
101 | A n x n matrix containing floats where n is the number of words 102 | in `word_list`. 103 | 104 | :See Also: :meth:`vsm.viewer.wrappers.dismat_word` 105 | """ 106 | 107 | return dismat_word(word_list, self.corpus, 108 | self.model.matrix.T, dist_fn=dist_fn) 109 | 110 | 111 | 112 | # # This is a quick adaptation of the isomap_docs function from 113 | # # ldagibbsviewer. This should be abstracted and moved to 114 | # # similarity.py or something equivalent. 115 | # def isomap_words(self, words, weights=[], thres=.8, 116 | # n_neighbors=5, scale=True, trim=20): 117 | # """ 118 | # """ 119 | # from sklearn import manifold 120 | # from math import ceil 121 | # from vsm.ext.clustering.plotting import ( 122 | # gen_colors as _gen_colors_, 123 | # plot_clusters as _plot_clusters_) 124 | 125 | # # create a list to be plotted 126 | # word_list = self.dist_word_word(words, weights=weights) 127 | 128 | # # cut down the list by the threshold 129 | # labels, size = zip(*[(w,s) for (w,s) in word_list if s < thres]) 130 | # print size 131 | # # calculate coordinates 132 | # dismat = self.dismat_words(labels) 133 | # dismat = np.clip(dismat, 0, 2) # cut off values outside [0, 1] 134 | # imap = manifold.Isomap(n_components=2, n_neighbors=n_neighbors) 135 | # pos = imap.fit(dismat).embedding_ 136 | 137 | # # set graphic parameters 138 | # # - scale point size 139 | # if scale: 140 | # size = [s+0.5 if s == 0 else s for s in size] # for given word which has 0.0 141 | # # value to be visible. 142 | # size = [s**2*150 for s in size] 143 | # else: 144 | # size = np.ones_like(size) * 50 145 | # # - trim labels 146 | # if trim: 147 | # labels = [lab[:trim] for lab in labels] 148 | 149 | # # hack for unidecode issues in matplotlib 150 | # labels = [label.decode('utf-8', 'ignore') for label in labels] 151 | 152 | # return _plot_clusters_(pos, labels, size=size) 153 | 154 | -------------------------------------------------------------------------------- /vsm/extensions/testdata/history_greek_philosophy/frontmatter.json: -------------------------------------------------------------------------------- 1 | ["Project Gutenberg's A Short History of Greek Philosophy, by John Marshall\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever. You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: A Short History of Greek Philosophy\n\nAuthor: John Marshall\n\nRelease Date: February 1, 2007 [EBook #20500]\n\nLanguage: English\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK GREEK PHILOSOPHY ***\n\n\n\n\nProduced by Al Haines\n\n\n\n\n\nA SHORT HISTORY\n\nOF\n\nGREEK PHILOSOPHY\n\n\nBY\n\nJOHN MARSHALL\n\nM.A. OXON., LL.D. 
EDIN.\n\n\nRECTOR OF THE ROYAL HIGH SCHOOL, EDINBURGH\n\nFORMERLY PROFESSOR OF CLASSICAL LITERATURE AND PHILOSOPHY\n\nIN THE YORKSHIRE COLLEGE, LEEDS\n\n\n\n\nLONDON\n\nPERCIVAL AND CO.\n\n1891\n\n_All rights reserved_\n\n\n\n\nPREFACE\n\nThe main purpose which I have had in view in writing this book has been\nto present an account of Greek philosophy which, within strict limits\nof brevity, shall be at once authentic and interesting--_authentic_, as\nbeing based on the original works themselves, and not on any secondary\nsources; _interesting_, as presenting to the ordinary English reader,\nin language freed as far as possible from technicality and\nabstruseness, the great thoughts of the greatest men of antiquity on\nquestions of permanent significance and value. There has been no\nattempt to shirk the really philosophic problems which these men tried\nin their day to solve; but I have endeavoured to show, by a sympathetic\ntreatment of them, that these problems were no mere wars of words, but\nthat in fact the philosophers of twenty-four centuries ago were dealing\nwith exactly similar difficulties as to the bases of belief and of\nright action as, under different forms, beset thoughtful men and women\nto-day.\n\nIn the general treatment of the subject, I have followed in the main\nthe order, and drawn chiefly on the selection of passages, in Ritter\nand Preller's _Historia Philosophiae Graecae_. It is hoped that in\nthis way the little book may be found useful at the universities, as a\nrunning commentary on that excellent work; and the better to aid\nstudents in the use of it for that purpose, the corresponding sections\nin Ritter and Preller are indicated by the figures in the margin.\n\nIn the sections on Plato, and occasionally elsewhere, I have drawn to\nsome extent, by the kind permission of the Delegates of the Clarendon\nPress and his own, on Professor Jowett's great commentary and\ntranslation.\n\nJOHN MARSHALL.\n\n\n\n\nTranscriber's notes:\n\nThe passage numbers in the Ritter-Preller book mentioned in the second\nparagraph above are indicated in this book with square brackets, e.g.\n\"[10]\". In the original book they were formatted as sidenotes. In\nthis e-book they are embedded in the text approximately where they\nappear in the original book, unless they are at the start of a\nparagraph, in which case they appear immediately before that paragraph.\n\nPage numbers are indicated with curly brackets, e.g. \"{5}\". They are\nembedded into the text where page breaks occurred in the original book.\n\nIn the original book, pages had headings that varied with the material\nbeing discussed on that pair of pages. In this e-book, those headings\nhave been collected into an \"introductory\" paragraph at the beginning\nof each chapter.\n\n\nThe original book uses several Greek words. 
These words, the chapters\nthey are used in, and their transliterations are as follows:\n\nChapter I (pages 3, 4, 12) - \"arche\" - alpha (with the soft-breathing\nmark), rho, chi, eta; \"phloios\" - phi, lambda, omicron, iota, omicron,\nfinal sigma.\n\nChapter III (page 28) - \"soma\" - sigma, omega, mu, alpha; \"sema\" -\nsigma, eta, mu, alpha.\n\nChapter IV (page 33, 34 - \"doxa\" - delta, omicron, xi, alpha; \"Peri\" -\nPI, epsilon, rho, iota; \"Phueos\" - PHI, upsilon, sigma, epsilon, omega,\nfinal sigma.\n\nChapter V (page 48) - \"logos\" - lambda, omicron, gamma, omicron, final\nsigma; \"hule\" - upsilon with rough breathing mark, lambda, eta.\n\n\n\n\nCONTENTS\n\n\nCHAP.\n\n I.--THE SCHOOL OF MILETUS--\n I. Thales . . . . . . . . . . . . . . . . . . . 1\n II. Anaximander . . . . . . . . . . . . . . . . . 7\n\n II.--THE SCHOOL OF MILETUS (_concluded_)--\n III. Anaximenes . . . . . . . . . . . . . . . . . 14\n IV. Heraclitus . . . . . . . . . . . . . . . . . 15\n\n III.--PYTHAGORAS AND THE PYTHAGOREANS . . . . . . . . . 22\n\n IV.--THE ELEATICS--\n I. Xenophanes . . . . . . . . . . . . . . . . . 31\n II. Parmenides . . . . . . . . . . . . . . . . . 33\n\n V.--THE ELEATICS (_concluded_)--\n III. Zeno . . . . . . . . . . . . . . . . . . . . 42\n IV. Melissus . . . . . . . . . . . . . . . . . . 46\n\n VI.--THE ATOMISTS--\n I. Anaxagoras . . . . . . . . . . . . . . . . . 52\n\n VII.--THE ATOMISTS (_continued_)--\n II. Empedocles . . . . . . . . . . . . . . . . . 58\n\n VIII.--THE ATOMISTS (_concluded_)--\n III. Leucippus and Democritus . . . . . . . . . . 74\n\n IX.--THE SOPHISTS--\n I. Protagoras . . . . . . . . . . . . . . . . . 85\n\n X.--THE SOPHISTS (_concluded_)--\n II. Gorgias . . . . . . . . . . . . . . . . . . . 92\n\n XI.--SOCRATES . . . . . . . . . . . . . . . . . . . . . 101\n\n XII.--SOCRATES (concluded) . . . . . . . . . . . . . . . 116\n\n XIII.--THE INCOMPLETE SOCRATICS--\n I. Aristippus and the Cyrenaics . . . . . . . . 124\n II. Antisthenes and the Cynics . . . . . . . . . 128\n III. Euclides and the Megarics . . . . . . . . . . 132\n\n XIV.--PLATO . . . . . . . . . . . . . . . . . . . . . . . 134\n\n XV.--PLATO (_continued_) . . . . . . . . . . . . . . . . 146\n\n XVI.--PLATO (_continued_) . . . . . . . . . . . . . . . . 154\n\n XVII.--PLATO (_concluded_) . . . . . . . . . . . . . . . . 162\n\n XVIII.--ARISTOTLE . . . . . . . . . . . . . . . . . . . . . 172\n\n XIX.--ARISTOTLE (_continued_) . . . . . . . . . . . . . . 187\n\n XX.--ARISTOTLE (_concluded_) . . . . . . . . . . . . . . 199\n\n XXI.--THE SCEPTICS AND EPICUREANS . . . . . . . . . . . . 210\n\n XXII.--THE STOICS . . . . . . . . . . . . . . . . . . . . 238\n\n INDEX . . . . . . . . . . . . . . . . . . . . . . . 245\n"] -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/vsm.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/vsm.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 
88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/vsm" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/vsm" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | 155 | gh-pages: 156 | make html 157 | git clone https://github.com/inpho/vsm vsm_gh_pages 158 | cd vsm_gh_pages \ 159 | && git checkout gh-pages \ 160 | && rm -rf * _static _sources \ 161 | && mv ../$(BUILDDIR)/html/* . \ 162 | && git add -A \ 163 | && git commit -m "Generated gh-pages" \ 164 | && git push origin gh-pages 165 | rm -rf $(BUILDDIR) vsm_gh_pages 166 | -------------------------------------------------------------------------------- /vsm/model/tf.py: -------------------------------------------------------------------------------- 1 | from builtins import object 2 | import multiprocessing as mp 3 | import platform, warnings 4 | 5 | import numpy as np 6 | from scipy.sparse import hstack 7 | 8 | from vsm.spatial import count_matrix 9 | from vsm.split import * 10 | from vsm.model.base import * 11 | 12 | 13 | __all__ = ['TF', 'TfSeq', 'TfMulti'] 14 | 15 | 16 | 17 | class TfSeq(BaseModel): 18 | """ 19 | Trains a term-frequency model. 
20 | 21 | In a term-frequency model, the number of occurrences of a word 22 | type in a context is counted for all word types and documents. Word 23 | types correspond to matrix rows and documents correspond to matrix 24 | columns. 25 | 26 | :See Also: :class:`vsm.model.base`, :class:`vsm.corpus.Corpus`, 27 | :class:`scipy.sparse.coo_matrix` 28 | """ 29 | 30 | def __init__(self, corpus=None, context_type=None): 31 | """ 32 | Initialize TfSeq. 33 | 34 | :param corpus: A Corpus object containing the training data. 35 | :type corpus: Corpus 36 | 37 | :param context_type: A string specifying the type of context over which 38 | the model trainer is applied. 39 | :type context_type: string 40 | """ 41 | 42 | self.context_type = context_type 43 | if corpus: 44 | self.corpus = corpus.corpus 45 | self.docs = corpus.view_contexts(context_type, as_slices=True) 46 | self.V = corpus.words.size 47 | else: 48 | self.corpus = [] 49 | self.docs = [] 50 | self.V = 0 51 | 52 | 53 | def train(self): 54 | """ 55 | Counts word-type occurrences per context and stores the results in 56 | `self.matrix`. 57 | """ 58 | self.matrix = count_matrix(self.corpus, self.docs, self.V) 59 | 60 | 61 | 62 | class TfMulti(TfSeq): 63 | """ 64 | Trains a term-frequency model. 65 | 66 | In a term-frequency model, the number of occurrences of a word 67 | type in a context is counted for all word types and documents. Word 68 | types correspond to matrix rows and documents correspond to matrix 69 | columns. 70 | 71 | The data structure is a sparse integer matrix. 72 | 73 | :See Also: :class:`vsm.model.base.BaseModel`, :class:`vsm.corpus.Corpus`, 74 | :class:`scipy.sparse.coo_matrix` 75 | """ 76 | def __init__(self, corpus=None, context_type=None): 77 | """ 78 | Initialize TfMulti. 79 | 80 | :param corpus: A Corpus object containing the training data 81 | :type corpus: Corpus, optional 82 | 83 | :param context_type: A string specifying the type of context over which 84 | the model trainer is applied. 85 | :type context_type: string, optional 86 | """ 87 | self._read_globals = False 88 | self._write_globals = False 89 | 90 | super(TfMulti, self).__init__(corpus=corpus, context_type=context_type) 91 | 92 | 93 | def _move_globals_to_locals(self): 94 | 95 | self._write_globals = False 96 | self.V = self.V 97 | self.corpus = self.corpus 98 | self._read_globals = False 99 | global _V, _corpus 100 | del _V, _corpus 101 | 102 | 103 | def _move_locals_to_globals(self): 104 | 105 | self._write_globals = True 106 | self.V = self.V 107 | self.corpus = self.corpus 108 | self._read_globals = True 109 | del self._V_local, self._corpus_local 110 | 111 | 112 | @property 113 | def corpus(self): 114 | if self._read_globals: 115 | return np.frombuffer(_corpus, np.int32) 116 | return self._corpus_local 117 | 118 | @corpus.setter 119 | def corpus(self, a): 120 | if self._write_globals: 121 | global _corpus 122 | if not '_corpus' in globals(): 123 | _corpus = mp.Array('i', len(a), lock=False) 124 | _corpus[:] = a 125 | else: 126 | self._corpus_local = a 127 | 128 | @property 129 | def V(self): 130 | if self._read_globals: 131 | return _V.value 132 | return self._V_local 133 | 134 | @V.setter 135 | def V(self, V): 136 | if self._write_globals: 137 | global _V 138 | _V = mp.Value('i', V, lock=False) 139 | else: 140 | self._V_local = V 141 | 142 | 143 | 144 | def train(self, n_proc=2): 145 | """ 146 | Takes a number of processes `n_proc` over which to map and reduce. 147 | 148 | :param n_procs: Number of processors. 
149 | :type n_procs: int 150 | """ 151 | self._move_locals_to_globals() 152 | 153 | doc_indices = mp_split_ls(self.docs, n_proc) 154 | 155 | p=mp.Pool(n_proc) 156 | cnt_mats = p.map(tf_fn, doc_indices) 157 | p.close() 158 | 159 | self.matrix = hstack(cnt_mats, format='coo') 160 | 161 | self._move_globals_to_locals() 162 | 163 | 164 | 165 | def tf_fn(ctx_sbls): 166 | """ 167 | The map function for vsm.model.TfMulti. Takes a list of documents 168 | as slices and returns a count matrix. 169 | 170 | :param ctx_sbls: list of documents as slices. 171 | :type ctx_sbls: list of slices 172 | 173 | :returns: a count matrix 174 | """ 175 | offset = ctx_sbls[0].start 176 | corpus = _corpus[offset: ctx_sbls[-1].stop] 177 | slices = [slice(s.start-offset, s.stop-offset) for s in ctx_sbls] 178 | return count_matrix(corpus, slices, _V.value) 179 | 180 | 181 | class TF(object): 182 | """ 183 | Depending on the boolean parameter `multiprocessing`, returns and 184 | initializes an instance of either TfSeq or TfMulti. 185 | 186 | Note that on Windows platforms, `multiprocessing` is not implemented. 187 | In contrast to LdaCgsMulti, TF always returns a valid object. Instead 188 | of raising a NotImplementedError, TF issues a RuntimeWarning, notifying 189 | the user that the sequential algorithm is being used. 190 | """ 191 | def __new__(cls, corpus=None, context_type=None, multiprocessing=False): 192 | 193 | kwargs = dict(corpus=corpus, context_type=context_type) 194 | 195 | if multiprocessing and platform.system() != 'Windows': 196 | return TfMulti(**kwargs) 197 | else: 198 | if platform.system() == 'Windows': 199 | warnings.warn("""Multiprocessing is not implemented on Windows. 200 | Defaulting to sequential algorithm.""", RuntimeWarning) 201 | return TfSeq(**kwargs) 202 | -------------------------------------------------------------------------------- /vsm/model/beaglecontext.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import absolute_import 3 | from future import standard_library 4 | standard_library.install_aliases() 5 | from builtins import zip 6 | from builtins import str 7 | from builtins import range 8 | import os 9 | import shutil 10 | import tempfile 11 | import multiprocessing as mp 12 | import pickle as cpickle 13 | 14 | import numpy as np 15 | 16 | from vsm.model.base import BaseModel 17 | 18 | 19 | __all__ = [ 'BeagleContextSeq', 'BeagleContextMulti' ] 20 | 21 | 22 | def realign_env_mat(corpus, env_corpus, env_matrix): 23 | """ 24 | """ 25 | words = corpus.words 26 | indices = [env_corpus.words_int[w] for w in words] 27 | return env_matrix[indices] 28 | 29 | 30 | 31 | class BeagleContextSeq(BaseModel): 32 | """ 33 | 34 | """ 35 | def __init__(self, corpus, env_corpus, env_matrix, 36 | context_type='sentence'): 37 | """ 38 | Initialize BeagleContextSeq. 39 | 40 | :param corpus: Source of observed data. 41 | :type corpus: class:`Corpus` 42 | 43 | :param env_corpus: BEAGLE environment corpus. 44 | :type env_corpus: class:`Corpus` 45 | 46 | :param env_matrix: Matrix from BEAGLE environment model. 47 | :type env_matrix: 2-D array 48 | 49 | :param context_type: Name of tokenization stored in `corpus` whose 50 | tokens will be treated as documents. Default is `sentence`.
51 | :type context_type: string, optional 52 | """ 53 | self.context_type = context_type 54 | self.sents = corpus.view_contexts(context_type) 55 | self.env_matrix = realign_env_mat(corpus, env_corpus, env_matrix) 56 | 57 | 58 | def train(self): 59 | """ 60 | Trains the model. 61 | """ 62 | self.matrix = np.zeros_like(self.env_matrix) 63 | 64 | for sent in self.sents: 65 | 66 | if sent.shape[0] > 1: 67 | 68 | left_sums = np.cumsum(self.env_matrix[sent[:-1]], axis=0) 69 | right_sums = np.cumsum(self.env_matrix[sent[:0:-1]], axis=0) 70 | 71 | for i,word in enumerate(sent): 72 | 73 | if i == 0: 74 | ctx_vector = right_sums[-1] 75 | 76 | elif i == sent.shape[0] - 1: 77 | ctx_vector = left_sums[-1] 78 | 79 | else: 80 | ctx_vector = left_sums[i - 1] + right_sums[-i - 1] 81 | 82 | self.matrix[word, :] += ctx_vector 83 | 84 | 85 | 86 | class BeagleContextMulti(BaseModel): 87 | """ 88 | 89 | """ 90 | 91 | def __init__(self, corpus, env_corpus, env_matrix, 92 | context_type='sentence'): 93 | """ 94 | Initialize BeagleContextMulti. 95 | 96 | :param corpus: Souce of observed data. 97 | :type corpus: class:`Corpus` 98 | 99 | :param env_corpus: BEAGLE environment corpus. 100 | :type env_corpus: class:`Corpus` 101 | 102 | :param env_matrix: Matrix from BEAGLE environment model. 103 | :type env_matrix: 2-D array 104 | 105 | :param context_type: Name of tokenization stored in `corpus` whose 106 | tokens will be treated as documents. Default is `sentence`. 107 | :type context_type: string, optional 108 | """ 109 | self.context_type = context_type 110 | self.sents = corpus.view_contexts(context_type) 111 | self.dtype = env_matrix.dtype 112 | env_matrix = realign_env_mat(corpus, env_corpus, env_matrix) 113 | 114 | global _shape 115 | _shape = mp.Array('i', 2, lock=False) 116 | _shape[:] = env_matrix.shape 117 | 118 | print('Copying env matrix to shared mp array') 119 | global _env_matrix 120 | _env_matrix = mp.Array('d', env_matrix.size, lock=False) 121 | _env_matrix[:] = env_matrix.ravel()[:] 122 | 123 | 124 | def train(self, n_procs=2): 125 | """ 126 | Takes an optional argument `n_procs`, number of processors, 127 | and trains the model on the number of processors. `n_procs` 128 | is 2 by default. 129 | 130 | :param n_procs: Number of processors. Default is 2. 
131 | :type n_procs: int, optional 132 | 133 | :returs: `None` 134 | """ 135 | sent_lists = np.array_split(self.sents, n_procs-1) 136 | if len(sent_lists) != n_procs: 137 | sent_lists = np.array_split(self.sents, n_procs) 138 | 139 | tmp_dir = tempfile.mkdtemp() 140 | tmp_files = [os.path.join(tmp_dir, 'tmp_' + str(i)) 141 | for i in range(len(sent_lists))] 142 | 143 | sent_lists = list(zip(sent_lists, tmp_files)) 144 | del self.sents 145 | 146 | try: 147 | print('Forking') 148 | # For debugging 149 | # tmp_files = map(mpfn, sent_lists) 150 | 151 | p = mp.Pool(n_procs) 152 | tmp_files = p.map(mpfn, sent_lists, 1) 153 | p.close() 154 | 155 | print('Reducing') 156 | self.matrix = np.zeros(tuple(_shape), dtype=self.dtype) 157 | 158 | for filename in tmp_files: 159 | 160 | with open(filename, 'rb') as f: 161 | result = cpickle.load(f) 162 | 163 | for k,v in result.items(): 164 | self.matrix[k, :] += v 165 | 166 | finally: 167 | print('Removing {}'.format(tmp_dir)) 168 | shutil.rmtree(tmp_dir) 169 | 170 | 171 | 172 | def mpfn(sents_filename): 173 | """ 174 | """ 175 | sents, filename = sents_filename 176 | result = dict() 177 | 178 | for sent in sents: 179 | if sent.shape[0] > 1: 180 | 181 | env = np.empty((sent.size, _shape[1]), dtype=np.float64) 182 | for i,w in enumerate(sent): 183 | env[i, :] = _env_matrix[w*_shape[1]: (w+1)*_shape[1]] 184 | 185 | left_sums = np.cumsum(env[:-1], axis=0) 186 | right_sums = np.cumsum(env[:0:-1], axis=0) 187 | 188 | for i,t in enumerate(sent): 189 | 190 | if i == 0: 191 | ctx_vector = right_sums[-1] 192 | 193 | elif i == sent.shape[0] - 1: 194 | ctx_vector = left_sums[-1] 195 | 196 | else: 197 | ctx_vector = left_sums[i - 1] + right_sums[-i - 1] 198 | 199 | if t in result: 200 | result[t] += ctx_vector 201 | else: 202 | result[t] = ctx_vector 203 | 204 | with open(filename, 'wb') as f: 205 | cpickle.dump(result, f) 206 | 207 | return filename 208 | 209 | -------------------------------------------------------------------------------- /unit_tests/tests_corpus.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | import unittest 3 | import numpy as np 4 | import os 5 | from vsm.corpus import * 6 | from vsm.split import split_corpus 7 | from tempfile import NamedTemporaryFile 8 | 9 | 10 | class TestCorpus(unittest.TestCase): 11 | 12 | def setUp(self): 13 | corpus = np.array([0, 3, 2, 1, 0, 3, 0, 2, 3, 0, 2, 3, 1, 2, 0, 3, 14 | 2, 1, 2, 2], dtype=int) 15 | contextData = np.array([(3, 'doc0'), (5, 'doc1'), (7,'doc2'), (11,'doc3'), 16 | (11,'doc4'), (15,'doc5'), (18,'doc6'), (20,'doc7')], 17 | dtype=[('idx', '
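
Editor's note (illustrative, not part of the repository): the `TF` factory in `vsm/model/tf.py` above dispatches to `TfSeq` (sequential) or `TfMulti` (shared-memory multiprocessing) depending on the `multiprocessing` flag and the platform. The minimal sketch below shows the intended call pattern; the `Corpus` construction is modelled on `unit_tests/tests_corpus.py`, but the dtypes and the `document_label` field name are assumptions rather than values taken from the (truncated) test above.

# Minimal usage sketch for the TF factory. The Corpus construction is an
# assumption modelled on the unit tests; field names and dtypes are illustrative.
import numpy as np
from vsm.corpus import Corpus
from vsm.model.tf import TF

tokens = np.array([0, 3, 2, 1, 0, 3, 0, 2, 3, 0, 2, 3], dtype=int)
# 'idx' is the cumulative end index of each document within the token array.
context_data = np.array([(4, 'doc0'), (8, 'doc1'), (12, 'doc2')],
                        dtype=[('idx', '<i8'), ('document_label', '<U8')])
c = Corpus(tokens, context_types=['document'], context_data=[context_data])

m = TF(corpus=c, context_type='document', multiprocessing=False)  # -> TfSeq
m.train()
print(m.matrix.toarray())   # rows are word types, columns are documents

# With multiprocessing=True on a non-Windows platform, TF returns a TfMulti
# instance whose train() takes the number of worker processes:
#   m = TF(corpus=c, context_type='document', multiprocessing=True)
#   m.train(n_proc=2)

As the listing shows, `TfMulti.train` copies `corpus` and `V` into lock-free module-level `multiprocessing.Array`/`Value` objects before forking, so the `tf_fn` workers read shared memory instead of pickled copies; each worker then rebuilds its document slices relative to its chunk's offset before calling `count_matrix`.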
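
Editor's note (illustrative, not part of the repository): `BeagleContextSeq` and `BeagleContextMulti` in `vsm/model/beaglecontext.py` both consume an environment matrix trained elsewhere. The rough pipeline sketch below assumes a `Corpus` `c` (like the one in the previous sketch) that also carries a 'sentence' tokenization; `BeagleEnvironment` lives in `vsm/model/beagleenvironment.py`, which is not shown in this listing, so its `n_cols` keyword should be treated as an assumption.

# Rough pipeline sketch: build environment vectors, then accumulate context
# vectors over sentences. `c` and the n_cols keyword are assumptions.
from vsm.model.beagleenvironment import BeagleEnvironment
from vsm.model.beaglecontext import BeagleContextSeq

env = BeagleEnvironment(c, n_cols=256)   # one random environment vector per word type
env.train()

ctx = BeagleContextSeq(c, c, env.matrix, context_type='sentence')
ctx.train()
# ctx.matrix[w] now holds the summed environment vectors of every word that
# co-occurs with word type w inside a sentence.

`BeagleContextMulti` exposes the same constructor; its `train(n_procs=2)` splits the sentence list across worker processes, which write partial results to temporary pickle files that are then reduced into `self.matrix`.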
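
Editor's note (illustrative, not part of the repository): the inner loop of `BeagleContextSeq.train` (and of `mpfn`) avoids re-summing environment vectors for every position in a sentence. It precomputes cumulative sums from the left (`left_sums`) and from the right (`right_sums`) and reads position `i`'s context vector as `left_sums[i-1] + right_sums[-i-1]`. The standalone NumPy check below confirms that this equals the direct sum of the environment vectors of all other words in the sentence.

# Standalone check of the prefix-sum identity used by BeagleContextSeq.train.
import numpy as np

env = np.random.rand(6, 4)        # toy environment vectors: 6 word types, 4 dimensions
sent = np.array([2, 0, 5, 1])     # a toy sentence of word-type indices

left_sums = np.cumsum(env[sent[:-1]], axis=0)
right_sums = np.cumsum(env[sent[:0:-1]], axis=0)

for i in range(len(sent)):
    if i == 0:
        ctx_vector = right_sums[-1]
    elif i == len(sent) - 1:
        ctx_vector = left_sums[-1]
    else:
        ctx_vector = left_sums[i - 1] + right_sums[-i - 1]
    # Direct computation: sum the environment vectors of every word except position i.
    direct = env[np.delete(sent, i)].sum(axis=0)
    assert np.allclose(ctx_vector, direct)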