├── unit_tests ├── __init__.py ├── tests_lsa.py ├── tests_split.py ├── tests_model.py ├── tests_beagleenvironment.py ├── tests_tfviewer.py ├── tests_tfidfviewer.py ├── tests_lsaviewer.py ├── tests_beaglecomposite.py ├── tests_spatial.py ├── tests_beagleviewer.py ├── tests_lda.py ├── tests_tfidf.py ├── tests_tf.py ├── tests_beaglecontext.py ├── tests_viewer_wrappers.py ├── tests_labeleddata.py ├── tests_corpus_util.py ├── tests_ldacgsviewer.py ├── tests_structarr.py ├── tests_beagleorder.py └── tests_corpus.py ├── functional_tests └── tests_tf.py ├── vsm ├── extensions │ ├── __init__.py │ ├── interop │ │ ├── __init__.py │ │ ├── mallet.py │ │ ├── weka.py │ │ └── ldac.py │ ├── clustering │ │ ├── __init__.py │ │ ├── plotting.py │ │ └── manifold.py │ ├── corpusbuilders │ │ ├── __init__.py │ │ └── corpusstreamers.py │ ├── inpho.py │ ├── testdata │ │ └── history_greek_philosophy │ │ │ ├── doc_meta.json │ │ │ ├── __init__.py │ │ │ ├── frontmatter.json │ │ │ └── chapter6.json │ ├── lda_py4j │ │ ├── README.txt │ │ └── org │ │ │ └── knowceans │ │ │ └── gibbstest │ │ │ ├── LDA.java │ │ │ ├── FileReadWrite.py │ │ │ ├── FileArrayProvider.java │ │ │ └── LdaRoutine.py │ ├── editions.py │ ├── mahout │ │ ├── README.txt │ │ └── mahout.py │ ├── trans.py │ └── multi_k.py ├── corpus │ ├── util │ │ ├── __init__.py │ │ └── corpusbuilders.py │ └── __init__.py ├── model │ ├── ldagibbs.py │ ├── __init__.py │ ├── beagleenvironment.py │ ├── beaglecomposite.py │ ├── base.py │ ├── ldaexact.py │ ├── tfidf.py │ ├── lda.py │ ├── ldacgs.py │ ├── _cgs_update.pyx │ ├── lsa.py │ ├── tf.py │ └── beaglecontext.py ├── viewer │ ├── ldagibbsviewer.py │ ├── __init__.py │ ├── types.py │ └── beagleviewer.py ├── exceptions │ ├── __init__.py │ └── compatibility.py ├── __init__.py └── split.py ├── MANIFEST.in ├── doc ├── source │ ├── vsm.corpus.rst │ ├── index.rst │ ├── vsm.viewer.rst │ ├── vsm.model.Lsa.rst │ ├── vsm.model.TfSeq.rst │ ├── vsm.model.TfIdf.rst │ ├── vsm.model.LdaCgsSeq.rst │ ├── vsm.viewer.BeagleViewer.rst │ ├── vsm.model.LdaCgsMulti.rst │ ├── vsm.model.TfMulti.rst │ ├── vsm.model.rst │ ├── vsm.model.BeagleOrderSeq.rst │ ├── vsm.model.BeagleContextSeq.rst │ ├── vsm.model.BeagleComposite.rst │ ├── vsm.model.BeagleOrderMulti.rst │ ├── vsm.model.BeagleEnvironment.rst │ ├── vsm.model.BeagleContextMulti.rst │ ├── vsm.rst │ ├── vsm.viewer.LsaViewer.rst │ ├── vsm.viewer.TfIdfViewer.rst │ ├── vsm.viewer.TfViewer.rst │ └── vsm.viewer.LdaCgsViewer.rst ├── README └── Makefile ├── register.py ├── requirements.txt ├── .gitignore ├── coverage.sh ├── CHANGELOG.md ├── roadmap.txt ├── LICENSE.txt ├── README.md ├── .travis.yml ├── appveyor.yml └── setup.py /unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /functional_tests/tests_tf.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vsm/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vsm/extensions/interop/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include 
vsm/model/_cgs_update.pyx 2 | -------------------------------------------------------------------------------- /vsm/corpus/util/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from vsm.extensions.corpusbuilders import * 5 | -------------------------------------------------------------------------------- /vsm/corpus/util/corpusbuilders.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from vsm.extensions.corpusbuilders import * 5 | -------------------------------------------------------------------------------- /vsm/model/ldagibbs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from ldacgsseq import LdaCgsSeq as LDAGibbs 5 | 6 | 7 | __all__ = [ 'LDAGibbs' ] 8 | -------------------------------------------------------------------------------- /vsm/viewer/ldagibbsviewer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility submodule 3 | """ 4 | from ldacgsviewer import LdaCgsViewer as LDAGibbsViewer 5 | 6 | 7 | __all__ = [ 'LDAGibbsViewer' ] 8 | -------------------------------------------------------------------------------- /vsm/extensions/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [Documentation concerning the clustering extension] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from .manifold import * 7 | -------------------------------------------------------------------------------- /vsm/extensions/corpusbuilders/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [Documentation about the corpusbuilders extension] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from .corpusbuilders import * 7 | -------------------------------------------------------------------------------- /vsm/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General documentation about the :mod:`corpus` submodule] 3 | """ 4 | 5 | 6 | from vsm.corpus.base import * 7 | import vsm.corpus.base 8 | 9 | 10 | __all__ = base.__all__[:] 11 | -------------------------------------------------------------------------------- /vsm/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General documentation about the :mod:`exceptions` submodule] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from . import compatibility 7 | from .compatibility import * 8 | 9 | __all__ = compatibility.__all__ 10 | -------------------------------------------------------------------------------- /doc/source/vsm.corpus.rst: -------------------------------------------------------------------------------- 1 | vsm.corpus 2 | ========== 3 | 4 | .. automodule:: vsm.corpus 5 | 6 | 7 | 8 | 9 | .. rubric:: Classes 10 | 11 | .. autosummary:: 12 | 13 | Corpus 14 | 15 | 16 | .. 
autoclass:: Corpus 17 | :members: 18 | 19 | -------------------------------------------------------------------------------- /register.py: -------------------------------------------------------------------------------- 1 | import pypandoc 2 | import os 3 | import sys 4 | 5 | pypandoc.convert('README.md', 'rst', outputfile='README.txt') 6 | if sys.argv[-1] == 'test': 7 | os.system("python setup.py register -r pypitest") 8 | else: 9 | os.system("python setup.py register") 10 | os.remove('README.txt') 11 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vsm documentation! 2 | ============================= 3 | 4 | .. autosummary:: 5 | :toctree: 6 | 7 | vsm 8 | vsm.corpus 9 | vsm.model 10 | vsm.viewer 11 | 12 | Indices and tables 13 | ================== 14 | 15 | * :ref:`genindex` 16 | * :ref:`modindex` 17 | * :ref:`search` 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==3.0.4 2 | future==1.0.0 3 | matplotlib>=2.0.2 4 | nltk>=3.0.0,<4.0.0 5 | numpy>=1.12.1,<2.0.0 6 | progressbar2>=3.35.2 7 | py4j==0.10.6 8 | scikit_learn>=0.19.1 9 | scipy>=0.19.0 10 | sortedcontainers>=1.5.7 11 | translate==3.5.0 12 | Unidecode==1.0.22 13 | 14 | backports.tempfile==1.0; python_version=='2.7' 15 | numpy>=1.14.3; python_version=='3.7' 16 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer 2 | ========== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. automodule:: vsm.viewer 7 | 8 | 9 | 10 | .. rubric:: Classes 11 | 12 | .. autosummary:: 13 | :toctree: 14 | 15 | BeagleViewer 16 | LdaCgsViewer 17 | LsaViewer 18 | TfIdfViewer 19 | TfViewer 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /doc/source/vsm.model.Lsa.rst: -------------------------------------------------------------------------------- 1 | vsm.model.Lsa 2 | ============= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: Lsa 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~Lsa.__init__ 12 | ~Lsa.load 13 | ~Lsa.save 14 | ~Lsa.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/source/vsm.model.TfSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.TfSeq 2 | =============== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: TfSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~TfSeq.__init__ 12 | ~TfSeq.load 13 | ~TfSeq.save 14 | ~TfSeq.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | -------------------------------------------------------------------------------- /doc/source/vsm.model.TfIdf.rst: -------------------------------------------------------------------------------- 1 | vsm.model.TfIdf 2 | =============== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: TfIdf 7 | 8 | .. rubric:: Methods 9 | 10 | .. 
autosummary:: 11 | ~TfIdf.__init__ 12 | ~TfIdf.load 13 | ~TfIdf.save 14 | ~TfIdf.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/source/vsm.model.LdaCgsSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.LdaCgsSeq 2 | =================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: LdaCgsSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~LdaCgsSeq.__init__ 12 | ~LdaCgsSeq.load 13 | ~LdaCgsSeq.save 14 | ~LdaCgsSeq.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.so 3 | *.c 4 | *.cpp 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | .eggs 10 | dist 11 | build 12 | eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | 20 | # Installer logs 21 | pip-log.txt 22 | 23 | # Unit test / coverage reports 24 | .coverage 25 | .tox 26 | 27 | #Translations 28 | *.mo 29 | 30 | #Mr Developer 31 | .mr.developer.cfg 32 | 33 | 34 | # API keys 35 | .travis.key 36 | .appveyor.key 37 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.BeagleViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.BeagleViewer 2 | ======================= 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: BeagleViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~BeagleViewer.__init__ 13 | ~BeagleViewer.dismat_word 14 | ~BeagleViewer.dist_word_word 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: dismat_word 19 | .. automethod:: dist_word_word 20 | -------------------------------------------------------------------------------- /doc/source/vsm.model.LdaCgsMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.LdaCgsMulti 2 | ===================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: LdaCgsMulti 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~LdaCgsMulti.__init__ 12 | ~LdaCgsMulti.load 13 | ~LdaCgsMulti.save 14 | ~LdaCgsMulti.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | -------------------------------------------------------------------------------- /doc/source/vsm.model.TfMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.TfMulti 2 | ================= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: TfMulti 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~TfMulti.__init__ 12 | ~TfMulti.load 13 | ~TfMulti.save 14 | ~TfMulti.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.rst: -------------------------------------------------------------------------------- 1 | vsm.model 2 | ========= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. automodule:: vsm.model 7 | 8 | .. 
rubric:: Classes 9 | 10 | .. autosummary:: 11 | :toctree: 12 | 13 | BeagleComposite 14 | BeagleContextMulti 15 | BeagleContextSeq 16 | BeagleEnvironment 17 | BeagleOrderMulti 18 | BeagleOrderSeq 19 | LdaCgsSeq 20 | LdaCgsMulti 21 | Lsa 22 | TfIdf 23 | TfMulti 24 | TfSeq 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleOrderSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleOrderSeq 2 | ======================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleOrderSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleOrderSeq.__init__ 12 | ~BeagleOrderSeq.load 13 | ~BeagleOrderSeq.save 14 | ~BeagleOrderSeq.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleContextSeq.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleContextSeq 2 | ========================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleContextSeq 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleContextSeq.__init__ 12 | ~BeagleContextSeq.load 13 | ~BeagleContextSeq.save 14 | ~BeagleContextSeq.train 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: load 18 | .. automethod:: save 19 | .. automethod:: train 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleComposite.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleComposite 2 | ========================= 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleComposite 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | BeagleComposite.__init__ 12 | BeagleComposite.load 13 | BeagleComposite.save 14 | BeagleComposite.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleOrderMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleOrderMulti 2 | ========================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleOrderMulti 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleOrderMulti.__init__ 12 | ~BeagleOrderMulti.load 13 | ~BeagleOrderMulti.save 14 | ~BeagleOrderMulti.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleEnvironment.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleEnvironment 2 | =========================== 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleEnvironment 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~BeagleEnvironment.__init__ 12 | ~BeagleEnvironment.load 13 | ~BeagleEnvironment.save 14 | ~BeagleEnvironment.train 15 | 16 | 17 | .. automethod:: __init__ 18 | .. 
automethod:: load 19 | .. automethod:: save 20 | .. automethod:: train 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doc/source/vsm.model.BeagleContextMulti.rst: -------------------------------------------------------------------------------- 1 | vsm.model.BeagleContextMulti 2 | ============================ 3 | 4 | .. currentmodule:: vsm.model 5 | 6 | .. autoclass:: BeagleContextMulti 7 | 8 | 9 | .. rubric:: Methods 10 | 11 | .. autosummary:: 12 | ~BeagleContextMulti.__init__ 13 | ~BeagleContextMulti.load 14 | ~BeagleContextMulti.save 15 | ~BeagleContextMulti.train 16 | 17 | 18 | .. automethod:: __init__ 19 | .. automethod:: load 20 | .. automethod:: save 21 | .. automethod:: train 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /vsm/viewer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General documentation about the :mod:`viewer` submodule] 3 | """ 4 | 5 | 6 | from . import beagleviewer 7 | from .beagleviewer import * 8 | from . import ldacgsviewer 9 | from .ldacgsviewer import * 10 | from . import lsaviewer 11 | from .lsaviewer import * 12 | from . import tfviewer 13 | from .tfviewer import * 14 | from . import tfidfviewer 15 | from .tfidfviewer import * 16 | 17 | __all__ = beagleviewer.__all__[:] 18 | __all__ += ldacgsviewer.__all__ 19 | __all__ += lsaviewer.__all__ 20 | __all__ += tfviewer.__all__ 21 | __all__ += tfidfviewer.__all__ 22 | 23 | -------------------------------------------------------------------------------- /doc/source/vsm.rst: -------------------------------------------------------------------------------- 1 | vsm 2 | === 3 | 4 | .. automodule:: vsm 5 | 6 | 7 | 8 | .. rubric:: Classes 9 | 10 | .. autosummary:: 11 | 12 | BeagleComposite 13 | BeagleContextMulti 14 | BeagleContextSeq 15 | BeagleEnvironment 16 | BeagleOrderMulti 17 | BeagleOrderSeq 18 | BeagleViewer 19 | Corpus 20 | LdaCgsSeq 21 | LdaCgsMulti 22 | LdaCgsViewer 23 | Lsa 24 | LsaViewer 25 | TfIdf 26 | TfIdfViewer 27 | TfMulti 28 | TfSeq 29 | TfViewer 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.LsaViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.LsaViewer 2 | ==================== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: LsaViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | ~LsaViewer.__init__ 12 | ~LsaViewer.dismat_doc 13 | ~LsaViewer.dismat_word 14 | ~LsaViewer.dist_doc_doc 15 | ~LsaViewer.dist_word_doc 16 | ~LsaViewer.dist_word_word 17 | 18 | .. automethod:: __init__ 19 | .. automethod:: dismat_doc 20 | .. automethod:: dismat_word 21 | .. automethod:: dist_doc_doc 22 | .. automethod:: dist_word_doc 23 | .. automethod:: dist_word_word 24 | 25 | -------------------------------------------------------------------------------- /coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CMD="coverage run -a --source=vsm" 3 | CMD="coverage run -a --source vsm.model,vsm.viewer,vsm.corpus,vsm.spatial,vsm.split,vsm.structarr,vsm.exceptions" 4 | #CMD="coverage run -a --source vsm.model,vsm.viewer,vsm.corpus,vsm.spatial,vsm.split,vsm.structarr,vsm.exceptions --debug trace" 5 | 6 | rm -rf .coverage 7 | coverage debug sys 8 | 9 | $CMD -m pytest unit_tests/* 10 | EXIT=$? 
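# (EXIT records the pytest exit status captured above; it is echoed and used as this script's exit code at the end.)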
11 | 12 | rm -rf ap.ini ap ap.tgz 13 | #pip install --pre topicexplorer 14 | #$CMD -m topicexplorer.demo 15 | #EXIT=$?+$EXIT 16 | #$CMD -m topicexplorer.serve ap.ini 17 | 18 | coverage report 19 | 20 | echo "Test exit code: $EXIT" 21 | exit $EXIT 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. This project adheres to [PEP 440: Version Identification and Dependency Specification](https://www.python.org/dev/peps/pep-0440/), a slight modification of Semantic Versioning. 3 | 4 | ## 0.4.0 5 | - Refactor of `Corpus.__init__()`. Significant memory improvements. 6 | - Refactor of stopword removal. Significant memory improvements. 7 | - Addition of `LdaCgsViewer.labels` property. 8 | - Addition of `LdaCgsViewer.dist(doc1, doc2, dist_fn=JS_dist)` function. 9 | - Addition of `vsm.extensions.comparison.lda` [NOT YET MERGED] 10 | - Addition of `vsm.extensions.interop` [NOT YET MERGED] 11 | 12 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.TfIdfViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.TfIdfViewer 2 | ====================== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: TfIdfViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~TfIdfViewer.__init__ 13 | ~TfIdfViewer.dismat_doc 14 | ~TfIdfViewer.dismat_word 15 | ~TfIdfViewer.dist_doc_doc 16 | ~TfIdfViewer.dist_word_doc 17 | ~TfIdfViewer.dist_word_word 18 | 19 | .. automethod:: __init__ 20 | .. automethod:: dismat_doc 21 | .. automethod:: dismat_word 22 | .. automethod:: dist_doc_doc 23 | .. automethod:: dist_word_doc 24 | .. automethod:: dist_word_word 25 | 26 | 27 | -------------------------------------------------------------------------------- /unit_tests/tests_lsa.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.model.lsa import Lsa 5 | 6 | 7 | class TestLsa(unittest.TestCase): 8 | 9 | def setUp(self): 10 | 11 | self.tfidf_mat = np.array(\ 12 | [[2.0794415, 1.3862944, 0], 13 | [0.86304623, 0.28768209, 0.28768209], 14 | [np.inf, np.inf, np.inf], 15 | [0, 0, 0]]) 16 | self.eigenvalues = np.array(\ 17 | [ 0.35270742, 2.65176495]) 18 | self.doc_matrix = np.array([0.314334, 0.023485]) 19 | 20 | #TODO: Write some actual unit tests for this module 21 | 22 | 23 | #Define and run test suite 24 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLsa) 25 | unittest.TextTestRunner(verbosity=2).run(suite) 26 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.TfViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.TfViewer 2 | =================== 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: TfViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~TfViewer.__init__ 13 | ~TfViewer.coll_freq 14 | ~TfViewer.coll_freqs 15 | ~TfViewer.dismat_doc 16 | ~TfViewer.dismat_word 17 | ~TfViewer.dist_doc_doc 18 | ~TfViewer.dist_word_doc 19 | ~TfViewer.dist_word_word 20 | 21 | 22 | .. automethod:: __init__ 23 | .. automethod:: coll_freq 24 | .. automethod:: coll_freqs 25 | .. automethod:: dismat_doc 26 | .. automethod:: dismat_word 27 | .. 
automethod:: dist_doc_doc 28 | .. automethod:: dist_word_doc 29 | .. automethod:: dist_word_word 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /vsm/extensions/inpho.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 4 | from inpho.model import * 5 | 6 | ideas = Session.query(Idea) 7 | words_int = dict([(idea.label, idea.ID) for idea in ideas.all()]) 8 | 9 | def inpho_word_tokenize(document, terms=None): 10 | if terms is None: 11 | terms = ideas[:] 12 | occurrences = [] 13 | 14 | # iterate over terms to be scanned 15 | for term in terms: 16 | # build list of search patterns starting with label 17 | for pattern in term.patterns: 18 | try: 19 | if re.search(pattern, document, flags=re.IGNORECASE): 20 | occurrences.append(str(term.ID)) 21 | break 22 | except re.error: 23 | logging.warning('Term %d (%s) pattern "%s" failed' % 24 | (term.ID, term.label, pattern)) 25 | 26 | return occurrences 27 | 28 | 29 | -------------------------------------------------------------------------------- /roadmap.txt: -------------------------------------------------------------------------------- 1 | ------------------------------- 2 | 2014.10.20 3 | 4 | Roadmap towards a first release 5 | ------------------------------- 6 | 7 | * Update unit tests. At present, these have not been updated with the 8 | code changes. 9 | 10 | * Functional tests 11 | 12 | * Refactoring 13 | 14 | * All models take corpus objects 15 | 16 | * Refactoring corpus builders - documentation, tutorial 17 | 18 | * Refactoring corpus metadata 19 | 20 | * Include sample corpus and use in documentation 21 | 22 | * Sphinx-generated documentation 23 | 24 | * Getting Started page (including how to install) 25 | 26 | * Workflow 27 | 28 | * Tutorials and demos 29 | 30 | Wishlist 31 | -------- 32 | 33 | * Better defaults for LDA priors. 34 | 35 | * More Bayesian models. There are several descendants of LDA with 36 | highly desirable features (e.g., correlated topics, topic change 37 | models) we have yet to implement. 38 | 39 | * Robust plotting and clustering extensions. 
40 | -------------------------------------------------------------------------------- /vsm/extensions/testdata/history_greek_philosophy/doc_meta.json: -------------------------------------------------------------------------------- 1 | [[{ "part_of_book": "Front Matter"}], 2 | [{ "part_of_book": "Chapter 1"}], 3 | [{ "part_of_book": "Chapter 2"}], 4 | [{ "part_of_book": "Chapter 3"}], 5 | [{ "part_of_book": "Chapter 4"}], 6 | [{ "part_of_book": "Chapter 5"}], 7 | [{ "part_of_book": "Chapter 6"}], 8 | [{ "part_of_book": "Chapter 7"}], 9 | [{ "part_of_book": "Chapter 8"}], 10 | [{ "part_of_book": "Chapter 9"}], 11 | [{ "part_of_book": "Chapter 10"}], 12 | [{ "part_of_book": "Chapter 11"}], 13 | [{ "part_of_book": "Chapter 12"}], 14 | [{ "part_of_book": "Chapter 13"}], 15 | [{ "part_of_book": "Chapter 14"}], 16 | [{ "part_of_book": "Chapter 15"}], 17 | [{ "part_of_book": "Chapter 16"}], 18 | [{ "part_of_book": "Chapter 17"}], 19 | [{ "part_of_book": "Chapter 18"}], 20 | [{ "part_of_book": "Chapter 19"}], 21 | [{ "part_of_book": "Chapter 20"}], 22 | [{ "part_of_book": "Chapter 21"}], 23 | [{ "part_of_book": "Chapter 22"}], 24 | [{ "part_of_book": "Back Matter"}]] 25 | -------------------------------------------------------------------------------- /unit_tests/tests_split.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm import * 5 | from vsm.split import * 6 | 7 | class TestCore(unittest.TestCase): 8 | 9 | def test_mp_split_ls(self): 10 | 11 | l = [slice(0,0), slice(0,0), slice(0,0)] 12 | self.assertTrue(len(mp_split_ls(l, 1)) == 1) 13 | self.assertTrue((mp_split_ls(l, 1)[0] == l).all()) 14 | self.assertTrue(len(mp_split_ls(l, 2)) == 2) 15 | self.assertTrue((mp_split_ls(l, 2)[0] == 16 | [slice(0,0), slice(0,0)]).all()) 17 | self.assertTrue((mp_split_ls(l, 2)[1] == [slice(0,0)]).all()) 18 | self.assertTrue(len(mp_split_ls(l, 3)) == 3) 19 | self.assertTrue((mp_split_ls(l, 3)[0] == [slice(0,0)]).all()) 20 | self.assertTrue((mp_split_ls(l, 3)[1] == [slice(0,0)]).all()) 21 | self.assertTrue((mp_split_ls(l, 3)[2] == [slice(0,0)]).all()) 22 | 23 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCore) 24 | unittest.TextTestRunner(verbosity=2).run(suite) 25 | -------------------------------------------------------------------------------- /unit_tests/tests_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.corpus.util.corpusbuilders import random_corpus 5 | from vsm.model.base import BaseModel 6 | 7 | 8 | class TestBaseModel(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.c = random_corpus(1000, 50, 6, 100) 12 | self.m = BaseModel(self.c, 'context') 13 | 14 | 15 | def test_BaseModel_IO(self): 16 | 17 | from tempfile import NamedTemporaryFile as NTF 18 | import os 19 | 20 | c = random_corpus(1000, 50, 6, 100) 21 | with NTF(delete=False, suffix='.npz') as tmp: 22 | m0 = BaseModel(c.corpus, 'context') 23 | m0.save(tmp.name) 24 | m1 = BaseModel.load(tmp.name) 25 | 26 | self.assertEqual(m0.context_type, m1.context_type) 27 | self.assertTrue((m0.matrix == m1.matrix).all()) 28 | 29 | os.remove(tmp.name) 30 | 31 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBaseModel) 32 | unittest.TextTestRunner(verbosity=2).run(suite) 33 | -------------------------------------------------------------------------------- /doc/README: 
-------------------------------------------------------------------------------- 1 | This directory contains the source files to build vsm documentation 2 | with Sphinx. The Makefile has been customized to facilitate updating 3 | the documentation hosted as github pages. 4 | 5 | To update the github pages automatically, invoke 6 | 7 | $ make gh-pages 8 | 9 | in this directory. 10 | 11 | To see what this does concretely, inspect Makefile. In summary, this 12 | command will do the following: 13 | 14 | * build the html documentation in a temporary subdirectory `build`; 15 | 16 | * clone the vsm github repo in a temporary subdirectory `vsm-gh-pages' 17 | and checkout the gh-pages branch; 18 | 19 | * (!) remove almost everything in that branch; 20 | 21 | * move the newly built documentation to `vsm-gh-pages`; 22 | 23 | * add, commit and push the new files (you may need to give your 24 | credentials); 25 | 26 | * remove the subdirectories `build` and `vsm-gh-pages`. 27 | 28 | Currently, this procedure has no special error-handling. If it doesn't 29 | complete, you may have to clean up the temporary subdirectories 30 | yourself. -------------------------------------------------------------------------------- /vsm/model/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | [General Documentation about :mod:`model` classes] 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from . import beaglecomposite 7 | from .beaglecomposite import * 8 | from . import beaglecontext 9 | from .beaglecontext import * 10 | from . import beagleenvironment 11 | from .beagleenvironment import * 12 | from . import beagleorder 13 | from .beagleorder import * 14 | from . import lda 15 | from .lda import * 16 | from . import ldacgsseq 17 | from .ldacgsseq import * 18 | from . import ldacgsmulti 19 | from .ldacgsmulti import * 20 | from . import lsa 21 | from .lsa import * 22 | from . import tf 23 | from .tf import * 24 | from . import tfidf 25 | from .tfidf import * 26 | 27 | 28 | __all__ = beaglecomposite.__all__[:] 29 | __all__ += beaglecontext.__all__ 30 | __all__ += beagleenvironment.__all__ 31 | __all__ += beagleorder.__all__ 32 | __all__ += lda.__all__ 33 | __all__ += ldacgsseq.__all__ 34 | __all__ += ldacgsmulti.__all__ 35 | __all__ += lsa.__all__ 36 | __all__ += tf.__all__ 37 | __all__ += tfidf.__all__ 38 | -------------------------------------------------------------------------------- /doc/source/vsm.viewer.LdaCgsViewer.rst: -------------------------------------------------------------------------------- 1 | vsm.viewer.LdaCgsViewer 2 | ======================= 3 | 4 | .. currentmodule:: vsm.viewer 5 | 6 | .. autoclass:: LdaCgsViewer 7 | 8 | .. rubric:: Methods 9 | 10 | .. autosummary:: 11 | 12 | ~LdaCgsViewer.__init__ 13 | ~LdaCgsViewer.dismat_doc 14 | ~LdaCgsViewer.dismat_top 15 | ~LdaCgsViewer.dist_doc_doc 16 | ~LdaCgsViewer.dist_top_doc 17 | ~LdaCgsViewer.dist_top_top 18 | ~LdaCgsViewer.dist_word_top 19 | ~LdaCgsViewer.doc_topics 20 | ~LdaCgsViewer.logp_plot 21 | ~LdaCgsViewer.topic_entropies 22 | ~LdaCgsViewer.topic_hist 23 | ~LdaCgsViewer.topics 24 | ~LdaCgsViewer.word_topics 25 | 26 | .. automethod:: __init__ 27 | .. automethod:: dismat_doc 28 | .. automethod:: dismat_top 29 | .. automethod:: dist_doc_doc 30 | .. automethod:: dist_top_doc 31 | .. automethod:: dist_top_top 32 | .. automethod:: dist_word_top 33 | .. automethod:: doc_topics 34 | .. automethod:: logp_plot 35 | .. automethod:: topic_entropies 36 | .. automethod:: topic_hist 37 | .. 
automethod:: topics 38 | .. automethod:: word_topics 39 | 40 | 41 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 The Trustees of Indiana University and 4 | The Indiana Philosophy Ontology (InPhO) Project 5 | http://inpho.cogs.indiana.edu/ -- inpho@indiana.edu 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains python and java code that interacts with 2 | java code retrieved from LdaGibbsSamlper.java at http://knowceans.com. 3 | 4 | 5 | Directions 6 | ---------- 7 | 0. write corpus txtfile: 8 | from FileReadWrite import write_file 9 | write_file(Corpus, ctx_type, 'fname.txt') 10 | 11 | 1. compile: javac -cp py4j0.8.1.jar *.java (in gibbstest dir) 12 | 13 | 2. run: java -cp org/knowceans/gibbstest/py4j0.8.1.jar:. 14 | org.knowceans.gibbstest.LDA 15 | org/knowceans/gibbstest/testcorp.txt 16 | (in parent dir of org) 17 | 18 | 3. python: run LdaRoutine (in ipython) 19 | 20 | 4. exit out of java program to end the server connection. 21 | 22 | Notes 23 | ----- 24 | - directory structure: org/knowceans/gibbstest 25 | 26 | - running java starts the gateway server. This needs to be running for python 27 | code (py4j) to work. 28 | 29 | - needs java version "1.7.0_25" to run correctly. 30 | 31 | - all java files are in package org.knowceans.gibbstest; 32 | 33 | - LDA.java takes written corpus file (from 0.) as args 34 | 35 | - python code works with LdaGibbsSampler java object. 36 | 37 | - LdaRoutine.py depends on vsm, so move LdaRoutine.py and FileReadWrite.py to a location where vsm is importable, if needed. 
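Example (a minimal Python sketch of step 3; this assumes the gateway server
started in step 2 is still running and that LDA.java exposes the
LdaGibbsSampler instance as the py4j entry point -- see LdaRoutine.py for
the actual routine):

    from py4j.java_gateway import JavaGateway

    gateway = JavaGateway()         # connect to the running Java gateway server
    sampler = gateway.entry_point   # assumption: LDA.java registers the sampler here
    # LdaRoutine.py-style code can then call the sampler's methods through py4j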
38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VSM 2 | 3 | [![Build Status](https://img.shields.io/travis/inpho/vsm.svg?label=UNIX+build)](https://travis-ci.org/inpho/vsm) 4 | [![Appveyor](https://img.shields.io/appveyor/ci/JaimieMurdock/vsm.svg?label=Windows+build)](https://ci.appveyor.com/project/JaimieMurdock/vsm) 5 | [![Coveralls](https://img.shields.io/coveralls/inpho/vsm.svg)](https://coveralls.io/r/inpho/vsm) 6 | [![GitHub license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://github.com/inpho/vsm/blob/master/LICENSE.txt) 7 | [![PyPI](https://img.shields.io/pypi/v/vsm.svg)](https://pypi.python.org/pypi/vsm) 8 | 9 | **Note:** More than likely, you are looking for the [InPhO Topic Explorer](http://github.com/inpho/topic-explorer). This library is for the machine learning implementations underlying the Topic Explorer and is updated much less frequently than the user interfaces. If you are using the Topic Explorer, please file issues there and the developers will triage appropriately. 10 | 11 | Vector Space Model Framework developed for the 12 | [InPhO Project](https://inpho.cogs.indiana.edu/). 13 | 14 | Documentation can be found in the module and [here](http://inpho.github.io/vsm/). 15 | 16 | ## Installation 17 | 18 | ``` 19 | pip install Cython==0.29.37 numpy==1.26.1 20 | git clone git@github.com:inpho/vsm.git 21 | cd vsm 22 | pip install -r requirements.txt -e . 23 | ``` 24 | -------------------------------------------------------------------------------- /vsm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`vsm` module provides tools and a workflow for producing 3 | semantic models of textual corpora and analyzing and visualizing these 4 | models. 5 | 6 | The :mod:`vsm` module has been conceived within the SciPy ecosystem. 7 | In a typical work flow, a collection of texts is first transformed 8 | into a Corpus object, whose underlying data structures are NumPy 9 | numerical arrays. The user may then feed a Corpus object to one of the 10 | model classes, which contain the algorithms, implemented in NumPy, 11 | SciPy and IPython.parallel, for training models such as :doc:`TF`, 12 | :doc:`TFIDF`, :doc:`LSA`, 13 | :doc:`BEAGLE`, or :doc:`LDA`. 14 | Finally, the user may examine the 15 | results with a Viewer class specialized to a particular model type. A 16 | Viewer object contains a variety of methods for analysis and 17 | visualization and achieves its full functionality within an IPython 18 | notebook session extended with matplotlib and scikit-learn. 
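
A minimal sketch of this workflow, modeled on the unit tests shipped with
the package (``random_corpus`` builds a small artificial corpus; real
corpora are constructed with the builders in
:mod:`vsm.extensions.corpusbuilders`)::

    from vsm.corpus.util.corpusbuilders import random_corpus
    from vsm.model.tf import TfSeq
    from vsm.viewer.tfviewer import TfViewer

    c = random_corpus(1000, 50, 0, 20, context_type='document',
                      metadata=True)    # toy corpus of random 'words'
    m = TfSeq(c, 'document')            # term-frequency model over documents
    m.train()
    v = TfViewer(c, m)                  # viewer for analysis and visualization
    v.dist_word_word('0')               # words nearest to the word '0'
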
19 | """ 20 | 21 | 22 | import vsm.corpus 23 | from vsm.corpus import * 24 | import vsm.model 25 | from vsm.model import * 26 | import vsm.viewer 27 | from vsm.viewer import * 28 | 29 | __version__ = '1.0.0b1' 30 | 31 | __all__ = ['__version__'] 32 | __all__ += vsm.corpus.__all__[:] 33 | __all__ += vsm.model.__all__ 34 | __all__ += vsm.viewer.__all__ 35 | 36 | -------------------------------------------------------------------------------- /unit_tests/tests_beagleenvironment.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.model.beagleenvironment import * 5 | 6 | 7 | 8 | class TestBeagleEnvironment(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.extensions.corpusbuilders import random_corpus 13 | 14 | c = random_corpus(1000, 100, 0, 20) 15 | 16 | self.m = BeagleEnvironment(c, n_cols=100) 17 | self.m.train() 18 | 19 | 20 | def test_BeagleEnvironment(self): 21 | 22 | self.assertTrue((self.m.matrix <= 1).all()) 23 | self.assertTrue((self.m.matrix >= -1).all()) 24 | 25 | norms = (self.m.matrix**2).sum(1)**0.5 26 | 27 | self.assertTrue(np.allclose(np.ones(norms.shape[0]), norms)) 28 | 29 | 30 | def test_BE_IO(self): 31 | from tempfile import NamedTemporaryFile 32 | import os 33 | 34 | try: 35 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 36 | self.m.save(tmp.name) 37 | tmp.close() 38 | m1 = BeagleEnvironment.load(tmp.name) 39 | self.assertTrue((self.m.matrix == m1.matrix).all()) 40 | 41 | finally: 42 | os.remove(tmp.name) 43 | 44 | 45 | #Define and run test suite 46 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleEnvironment) 47 | unittest.TextTestRunner(verbosity=2).run(suite) 48 | -------------------------------------------------------------------------------- /unit_tests/tests_tfviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.tfviewer import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | class TestTfViewer(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.corpus.util.corpusbuilders import random_corpus 13 | from vsm.model.tf import TfSeq 14 | 15 | c = random_corpus(1000, 50, 0, 20, context_type='document', metadata=True) 16 | 17 | m = TfSeq(c, 'document') 18 | m.train() 19 | 20 | self.v = TfViewer(c, m) 21 | 22 | #TODO: test for coll_freq 23 | def test_TfViewer(self): 24 | 25 | li = [0,1] 26 | 27 | sww = self.v.dist_word_word('0') 28 | swwl = self.v.dist_word_word(['0','1']) 29 | sdd = self.v.dist_doc_doc(0) 30 | sddl = self.v.dist_doc_doc(li) 31 | cfs = self.v.coll_freqs() 32 | 33 | distmatw = self.v.dismat_word(['0','2','5']) 34 | distmatd = self.v.dismat_doc(li) 35 | 36 | self.assertEqual(type(sww), LabeledColumn) 37 | self.assertEqual(type(swwl), LabeledColumn) 38 | self.assertEqual(type(sdd), LabeledColumn) 39 | self.assertEqual(type(sddl), LabeledColumn) 40 | self.assertEqual(type(cfs), LabeledColumn) 41 | self.assertEqual(type(distmatw), IndexedSymmArray) 42 | self.assertEqual(type(distmatd), IndexedSymmArray) 43 | 44 | 45 | 46 | #Define and run test suite 47 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTfViewer) 48 | unittest.TextTestRunner(verbosity=2).run(suite) 49 | -------------------------------------------------------------------------------- /unit_tests/tests_tfidfviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 
3 | 4 | from vsm.viewer.tfidfviewer import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | class TestTfIdfViewer(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.corpus.util.corpusbuilders import random_corpus 13 | from vsm.model.tf import TfSeq 14 | from vsm.model.tfidf import TfIdf 15 | 16 | c = random_corpus(1000, 50, 0, 10, context_type='document', metadata=True) 17 | 18 | tf = TfSeq(c, 'document') 19 | tf.train() 20 | 21 | m = TfIdf.from_tf(tf) 22 | m.train() 23 | 24 | self.v = TfIdfViewer(c, m) 25 | 26 | def test_TfIdfViewer(self): 27 | 28 | li = [0,1] 29 | 30 | sww = self.v.dist_word_word('0') 31 | swwl = self.v.dist_word_word(['0','1']) 32 | sdd = self.v.dist_doc_doc(0) 33 | sddl = self.v.dist_doc_doc(li) 34 | 35 | dismatw = self.v.dismat_word(['0','2','5']) 36 | dismatd = self.v.dismat_doc(li) 37 | 38 | self.assertEqual(type(sww), LabeledColumn) 39 | self.assertEqual(type(swwl), LabeledColumn) 40 | self.assertEqual(type(sdd), LabeledColumn) 41 | self.assertEqual(type(sddl), LabeledColumn) 42 | 43 | self.assertEqual(type(dismatw), IndexedSymmArray) 44 | self.assertEqual(type(dismatd), IndexedSymmArray) 45 | 46 | 47 | 48 | #Define and run test suite 49 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTfIdfViewer) 50 | unittest.TextTestRunner(verbosity=2).run(suite) 51 | -------------------------------------------------------------------------------- /vsm/extensions/interop/mallet.py: -------------------------------------------------------------------------------- 1 | from finalcorpus import * 2 | import gzip 3 | 4 | start_idx = 0 5 | m = lda_m[20] 6 | metadata = c.view_metadata(m.context_type) 7 | 8 | def export_model(): 9 | with gzip.open('model_to_mallet.gz', 'wb') as f: 10 | f.write("#doc source pos typeindex type topic") 11 | alpha = m.alpha 12 | f.write(alpha) 13 | beta = m.beta 14 | 15 | for end_idx, doc in metadata: 16 | for i in range(start_idx, end_idx): 17 | doc = doc 18 | source = "/" 19 | pos = i 20 | typeIndex = c.corpus[i] 21 | ttype = c.words[c.corpus[i]] 22 | topic = m.Z[i] 23 | line = "{} {} {} {} {} {}\n".format(doc, source, pos, typeIndex, ttype, topic) 24 | f.write(line) 25 | start_idx = end_idx 26 | 27 | 28 | def import_model(): 29 | startPos = [] 30 | corpus = [] 31 | z = [] 32 | words = {} 33 | prevDoc = 0; 34 | 35 | with gzip.open('topic-state.gz', 'rb') as f: 36 | for i, line in enumerate(f, start = -3): 37 | #skip first three lines with header info 38 | if i >= 0: 39 | #columns - #doc source pos typeindex type topic 40 | doc, _, _, typeindex, type, topic = line.split() 41 | corpus.append(typeindex) 42 | z.append(topic) 43 | words[typeindex] = type 44 | if doc != prevDoc: 45 | startPos.append(i) 46 | prevDoc = doc 47 | 48 | -------------------------------------------------------------------------------- /vsm/model/beagleenvironment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from vsm.model.base import BaseModel 4 | 5 | 6 | __all__ = ['BeagleEnvironment'] 7 | 8 | 9 | class BeagleEnvironment(BaseModel): 10 | """ 11 | `BeagleEnvironment` is a randomly generated fixed vectors 12 | representing the environment. 13 | """ 14 | 15 | def __init__(self, corpus, n_cols=2048, dtype=np.float64, 16 | context_type='sentence'): 17 | """ 18 | Initialize BeagleEnvironment. 19 | 20 | :param corpus: Source of observed data. 21 | :type corpus: Corpus 22 | 23 | :param n_cols: Number of columns. Default is 2048. 
24 | :type n_cols: int, optional 25 | 26 | :param dtype: Numpy dtype for matrix attribute. Default is `np.float64`. 27 | :type dtype: np.dtype, optional 28 | 29 | :param context_type: Name of tokenization stored in `corpus` whose 30 | tokens will be treated as documents. Default is `sentence`. 31 | :type context_type: string, optional 32 | """ 33 | self.context_type = context_type 34 | self.shape = (corpus.words.shape[0], n_cols) 35 | self.dtype = dtype 36 | 37 | 38 | def train(self): 39 | """ 40 | Sets a m x n environment matrix where m is the number of words in 41 | `corpus` and n is `n_cols`. The matrix consists of randomly generated 42 | vectors. 43 | """ 44 | self.matrix = np.array(np.random.normal(size=self.shape), 45 | dtype=self.dtype) 46 | # normalize rows 47 | self.matrix /= np.sqrt((self.matrix * self.matrix).sum(1)[:,np.newaxis]) 48 | 49 | 50 | -------------------------------------------------------------------------------- /vsm/extensions/editions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from vsm.corpus import Corpus 3 | 4 | 5 | __all__ = ['eqva', 'new_material'] 6 | 7 | 8 | def eqva(a1, a2): 9 | """ 10 | modified np.array_equal. considers a1 and a2 11 | equal when there is 1 difference. 12 | """ 13 | a1.sort() 14 | a2.sort() 15 | count = 0 16 | a1_, a2_ = a1, a2 17 | if len(a1) > len(a2): 18 | a1_ = a2 19 | a2_ = a1 20 | 21 | for s in a1: 22 | if not s in a2: 23 | count += 1 24 | 25 | return count 26 | 27 | 28 | def find_idx(ind, c1, c2): 29 | """ 30 | finds exact match (1 diff) in c2 and returns the index. 31 | """ 32 | ctx2 = c2.view_contexts('sentence', as_strings=True) 33 | ctx = c1.view_contexts('sentence', as_strings=True)[ind] 34 | 35 | for i in xrange(len(ctx2)): 36 | if eqva(ctx, ctx2[i]) < 2: 37 | return str(i) 38 | return '' 39 | 40 | 41 | def new_material(c1, c2, idx=0): 42 | """ 43 | Return new material in a list. 44 | 'idx' is an optional parameter for cutting off references. 45 | """ 46 | ctx1 = c1.view_contexts('sentence', as_strings=True) 47 | 48 | if idx == 0: 49 | ctx2 = c2.view_contexts('sentence', as_strings=True) 50 | else: 51 | ctx2 = c2.view_contexts('sentence', as_strings=True)[:idx] 52 | len2 = len(ctx2) 53 | 54 | new = [] 55 | for i in xrange(len(ctx1)): 56 | if i < len2: 57 | if len(ctx1[i]) == 0: # empty tokens. 
58 | pass 59 | else: 60 | ind = find_idx(i, c1, c2) 61 | if len(ind) == 0: 62 | new.append(i) 63 | return new 64 | -------------------------------------------------------------------------------- /unit_tests/tests_lsaviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.lsaviewer import LsaViewer 5 | 6 | 7 | class TestLsaViewer(unittest.TestCase): 8 | 9 | def setUp(self): 10 | 11 | from vsm.corpus.util.corpusbuilders import random_corpus 12 | from vsm.model.tf import TfSeq 13 | from vsm.model.tfidf import TfIdf 14 | from vsm.model.lsa import Lsa 15 | 16 | c = random_corpus(10000, 1000, 0, 30, context_type='document', metadata=True) 17 | 18 | tf = TfSeq(c, 'document') 19 | tf.train() 20 | 21 | tfidf = TfIdf.from_tf(tf) 22 | tfidf.train() 23 | 24 | m = Lsa.from_tfidf(tfidf) 25 | m.train() 26 | 27 | self.v = LsaViewer(c, m) 28 | 29 | 30 | def test_Lsaviewer(self): 31 | 32 | from vsm.viewer.labeleddata import LabeledColumn, IndexedSymmArray 33 | 34 | sww = self.v.dist_word_word('1') 35 | swwl = self.v.dist_word_word(['1', '0']) 36 | sdd = self.v.dist_doc_doc(1) 37 | sddl = self.v.dist_doc_doc([1, 0]) 38 | self.assertTrue(type(sww) == LabeledColumn) 39 | self.assertTrue(type(swwl) == LabeledColumn) 40 | self.assertTrue(type(sdd) == LabeledColumn) 41 | self.assertTrue(type(sddl) == LabeledColumn) 42 | 43 | sw = self.v.dismat_word(['2','4','5']) 44 | sd = self.v.dismat_doc([1, 0]) 45 | self.assertTrue(type(sw) == IndexedSymmArray) 46 | self.assertTrue(type(sd) == IndexedSymmArray) 47 | 48 | 49 | 50 | 51 | #Define and run test suite 52 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLsaViewer) 53 | unittest.TextTestRunner(verbosity=2).run(suite) 54 | -------------------------------------------------------------------------------- /vsm/exceptions/compatibility.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from functools import update_wrapper, wraps 3 | import inspect 4 | 5 | __all__ = ['deprecation_warning', 'deprecated_meth'] 6 | 7 | 8 | 9 | def deprecation_warning(old_name, new_name): 10 | """ 11 | Deprecation warning for deprecated functions. 12 | """ 13 | warnings.simplefilter('always', DeprecationWarning) 14 | 15 | message = "{0} is deprecated. Please use {1} instead.".format(old_name, 16 | new_name) 17 | warnings.warn(message, DeprecationWarning) 18 | 19 | 20 | #TODO: a function for deprecated class AND auto generate doc string with 21 | # a note about deprecation. 22 | 23 | def deprecated_meth(new_fn_name): 24 | """ 25 | Decorator to be used for deprecated functions/modules. 26 | Throws a DeprecationWarning. 27 | """ 28 | def wrap(old_fn): 29 | 30 | def wrapper(self, *args, **kwargs): 31 | new_fn = getattr(self, new_fn_name) 32 | deprecation_warning(old_fn.__name__, new_fn.__name__) 33 | 34 | return new_fn(*args, **kwargs) 35 | 36 | #update_wrapper(wrapper, new_fn_) 37 | return wrapper 38 | 39 | return wrap 40 | 41 | 42 | """ 43 | def deprecated_fn(new_fn): 44 | Decorator to be used for deprecated functions/modules. 45 | Throws a DeprecationWarning. 
46 | def wrap(old_fn): 47 | 48 | def wrapper(self, *args, **kwargs): 49 | deprecation_warning(old_fn.__name__, new_fn.__name__) 50 | #new_fn_ = getattr(self, new_fn.__name__) 51 | 52 | return new_fn(self, *args, **kwargs) 53 | 54 | update_wrapper(wrapper, new_fn) 55 | 56 | return wrapper 57 | 58 | return wrap 59 | """ 60 | -------------------------------------------------------------------------------- /vsm/extensions/interop/weka.py: -------------------------------------------------------------------------------- 1 | """ 2 | `vsm.extensions.interop.weka` 3 | 4 | Module containing functions for import/export between VSM and Weka, 5 | a collection of machine learning algorithms for data mining tasks 6 | implemented in Java. Weka is available at: 7 | ``_ 8 | 9 | This module imports and exports corpora to the `ARFF format`_ used 10 | by Weka. ARFF files can then be used for `text categorization with Weka`_. 11 | 12 | 13 | .. _ARFF format: https://weka.wikispaces.com/ARFF 14 | .. _text categorization with Weka: 15 | https://weka.wikispaces.com/Text+categorization+with+Weka 16 | 17 | """ 18 | import os 19 | import os.path 20 | 21 | from scipy.stats import itemfreq 22 | import numpy as np 23 | 24 | from vsm.extensions.corpusbuilders import corpus_fromlist 25 | 26 | 27 | def export_corpus(corpus, outfolder, context_type='document'): 28 | """ 29 | Converts a vsm.corpus.Corpus object into a Weka-compatible `ARFF file`_. 30 | 31 | :param corpus: VSM Corpus object to convert to lda-c file 32 | :type corpus: vsm.corpus.Corpus 33 | 34 | :param outfolder: Directory to output "vocab.txt" and "corpus.dat" 35 | :type string: path 36 | 37 | .. _ARFF file: https://weka.wikispaces.com/ARFF 38 | """ 39 | pass 40 | 41 | 42 | def import_corpus(corpusfilename, vocabfilename, context_type='document'): 43 | """ 44 | Converts an lda-c compatible data file into a VSM Corpus object. 45 | 46 | :param corpusfilename: path to corpus file, as defined in lda-c 47 | documentation. 
48 | :type string: 49 | 50 | :param vocabfilename: path to vocabulary file, one word per line 51 | :type string: 52 | """ 53 | pass 54 | 55 | 56 | def import_model(filename): 57 | pass 58 | 59 | 60 | def export_model(filename): 61 | pass 62 | -------------------------------------------------------------------------------- /unit_tests/tests_beaglecomposite.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import range 3 | import unittest 4 | import numpy as np 5 | 6 | 7 | class TestBeagleComposite(unittest.TestCase): 8 | 9 | def setUp(self): 10 | from vsm.corpus.util.corpusbuilders import random_corpus 11 | from vsm.model.beaglecomposite import BeagleComposite 12 | from vsm.model.beagleenvironment import BeagleEnvironment 13 | from vsm.model.beaglecontext import BeagleContextSeq 14 | from vsm.model.beagleorder import BeagleOrderSeq 15 | 16 | self.ec = random_corpus(1000, 50, 0, 20, context_type='sentence') 17 | self.cc = self.ec.apply_stoplist(stoplist=[str(i) for i in range(0,50,7)]) 18 | 19 | self.e = BeagleEnvironment(self.ec, n_cols=5) 20 | self.e.train() 21 | 22 | self.cm = BeagleContextSeq(self.cc, self.ec, self.e.matrix) 23 | self.cm.train() 24 | 25 | self.om = BeagleOrderSeq(self.ec, self.e.matrix) 26 | self.om.train() 27 | 28 | self.m = BeagleComposite(self.cc, self.cm.matrix, self.ec, self.om.matrix) 29 | self.m.train() 30 | 31 | 32 | def test_BeagleCompositeIO(self): 33 | from tempfile import NamedTemporaryFile 34 | from vsm.model.beaglecomposite import BeagleComposite 35 | import os 36 | 37 | try: 38 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 39 | self.m.save(tmp.name) 40 | tmp.close() 41 | m1 = BeagleComposite.load(tmp.name) 42 | self.assertTrue((self.m.matrix == m1.matrix).all()) 43 | 44 | finally: 45 | os.remove(tmp.name) 46 | 47 | 48 | 49 | 50 | #Define and run test suite 51 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleComposite) 52 | unittest.TextTestRunner(verbosity=2).run(suite) 53 | -------------------------------------------------------------------------------- /vsm/extensions/mahout/README.txt: -------------------------------------------------------------------------------- 1 | mahout.py contains methods that interact with mahout-generated files, 2 | create vsm `Corpus` and `LDAGibbs`. 
3 | 4 | 5 | STEPS TO LDA 6 | ------------ 7 | 1) Convert directory of documents to SequenceFile format 8 | mahout-distribution-0.9/bin/mahout seqdirectory -i inpho/testdata -o inpho/mahout-out 9 | 10 | 2) Creating Vectors from SequenceFile 11 | mahout-distribution-0.9/bin/mahout seq2sparse -i inpho/mahout-out -o inpho/mahout-vect-test 12 | 13 | 3) Creating Matrix from tf-vectors 14 | mahout-distribution-0.9/bin/mahout rowid -i inpho/mahout-vect-test/tf-vectors -o inpho/mahout-mat-test 15 | 16 | 4) Run LDA Collapsed Variable Bayes 17 | mahout-distribution-0.9/bin/mahout cvb -i inpho/mahout-mat-test/matrix -dict inpho/mahout-vect-test/dictionary.file-0 -o inpho/mahout-lda-test -a 0.01 -e 0.01 -dt inpho/mahout-dt-test -mt inpho/mahout-models-test -k 5 -x 100 18 | 19 | 20 | CREATING READABLE FILES 21 | ----------------------- 22 | doori@space:~$ mahout-distribution-0.9/bin/mahout vectordump -i inpho/mahout-vect-test/tf-vectors/part-r-00000 -o inpho/mahout-vect-test/tf-vectors/tf.txt -p true --csv csv 23 | 24 | doori@space:~$ mahout-distribution-0.9/bin/mahout seqdumper -i inpho/mahout-dt/part-m-00000 -o inpho/mahout-dt/doc-topics.txt 25 | 26 | doori@space:~$ mahout-distribution-0.9/bin/mahout seqdumper -i inpho/mahout-lda/part-m-00000 -o inpho/mahout-lda/lda.txt 27 | 28 | 29 | NOTES 30 | ----- 31 | If you are running 'seq2sparse' for building the feature vectors and are using the Lucene StandardAnalyzer (which is the default), the English stopwords should be removed automatically. (-x option to remove *high frequency* words. default is 99) 32 | 33 | REFERENCES 34 | ---------- 35 | https://mahout.apache.org/users/basics/creating-vectors-from-text.html 36 | 37 | http://mahout.apache.org/users/clustering/lda-commandline.html 38 | -------------------------------------------------------------------------------- /unit_tests/tests_spatial.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from past.utils import old_div 3 | import unittest 4 | import numpy as np 5 | 6 | from vsm.spatial import * 7 | 8 | #TODO: add tests for recently added methods. 
9 | def KL(p,q): 10 | return sum(p*np.log2(old_div(p,q))) 11 | def partial_KL(p,q): 12 | return p * np.log2(old_div((2*p), (p+q))) 13 | def JS(p,q): 14 | return 0.5*(KL(p,((p+q)*0.5)) + KL(q,((p+q)*0.5))) 15 | def JSD(p,q): 16 | return (0.5*(KL(p,((p+q)*0.5)) + KL(q,((p+q)*0.5))))**0.5 17 | 18 | 19 | class TestSpatial(unittest.TestCase): 20 | 21 | def setUp(self): 22 | # 2 random distributions 23 | self.p=np.random.random_sample((5,)) 24 | self.q=np.random.random_sample((5,)) 25 | 26 | # normalize 27 | self.p /= self.p.sum() 28 | self.q /= self.q.sum() 29 | 30 | def test_KL_div(self): 31 | self.assertTrue(np.allclose(KL_div(self.p,self.q), KL(self.p,self.q))) 32 | 33 | def test_JS_div(self): 34 | self.assertTrue(np.allclose(JS_div(self.p,self.q), JS(self.p,self.q))) 35 | 36 | def test_JS_dist(self): 37 | self.assertTrue(np.allclose(JS_dist(self.p,self.q), JSD(self.p,self.q))) 38 | 39 | 40 | def test_count_matrix(self): 41 | 42 | arr = [1, 2, 4, 2, 1] 43 | slices = [slice(0,1), slice(1, 3), slice(3,3), slice(3, 5)] 44 | m = 6 45 | result = coo_matrix([[0, 0, 0, 0], 46 | [1, 0, 0, 1], 47 | [0, 1, 0, 1], 48 | [0, 0, 0, 0], 49 | [0, 1, 0, 0], 50 | [0, 0, 0, 0]]) 51 | 52 | self.assertTrue((result.toarray() == 53 | count_matrix(arr, slices, m).toarray()).all()) 54 | 55 | 56 | 57 | 58 | suite = unittest.TestLoader().loadTestsFromTestCase(TestSpatial) 59 | unittest.TextTestRunner(verbosity=2).run(suite) 60 | -------------------------------------------------------------------------------- /unit_tests/tests_beagleviewer.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import range 3 | import unittest 4 | import numpy as np 5 | 6 | from vsm.viewer.beagleviewer import BeagleViewer 7 | from vsm.viewer.labeleddata import * 8 | 9 | 10 | class TestBeagleViewer(unittest.TestCase): 11 | 12 | def setUp(self): 13 | 14 | from vsm.corpus.util.corpusbuilders import random_corpus 15 | from vsm.model.beagleenvironment import BeagleEnvironment 16 | from vsm.model.beaglecontext import BeagleContextSeq 17 | from vsm.model.beagleorder import BeagleOrderSeq 18 | from vsm.model.beaglecomposite import BeagleComposite 19 | 20 | ec = random_corpus(1000, 50, 0, 20, context_type='sentence') 21 | cc = ec.apply_stoplist(stoplist=[str(i) for i in range(0,50,7)]) 22 | e = BeagleEnvironment(ec, n_cols=5) 23 | e.train() 24 | 25 | cm = BeagleContextSeq(cc, ec, e.matrix) 26 | cm.train() 27 | 28 | om = BeagleOrderSeq(ec, e.matrix) 29 | om.train() 30 | 31 | m = BeagleComposite(cc, cm.matrix, ec, om.matrix) 32 | m.train() 33 | 34 | self.venv = BeagleViewer(ec, e) 35 | self.vctx = BeagleViewer(cc, cm) 36 | self.vord = BeagleViewer(ec, om) 37 | self.vcom = BeagleViewer(cc, m) 38 | 39 | 40 | def test_BeagleViewer(self): 41 | 42 | sww = self.venv.dist_word_word('1') 43 | sww1 = self.vord.dist_word_word('0') 44 | self.assertTrue(type(sww) == LabeledColumn) 45 | self.assertTrue(type(sww1) == LabeledColumn) 46 | 47 | smw = self.vcom.dismat_word(['1']) 48 | smw1 = self.vctx.dismat_word(['1']) 49 | self.assertTrue(type(smw) == IndexedSymmArray) 50 | self.assertTrue(type(smw1) == IndexedSymmArray) 51 | 52 | 53 | 54 | #Define and run test suite 55 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleViewer) 56 | unittest.TextTestRunner(verbosity=2).run(suite) 57 | -------------------------------------------------------------------------------- /unit_tests/tests_lda.py: -------------------------------------------------------------------------------- 1 | 
import unittest 2 | import numpy as np 3 | 4 | from vsm.corpus import Corpus 5 | from vsm.corpus.util.corpusbuilders import random_corpus 6 | from vsm.model.ldacgsmulti import * 7 | from vsm.model.ldacgsseq import * 8 | from vsm.model.lda import * 9 | import platform 10 | 11 | 12 | class TestLda(unittest.TestCase): 13 | def setUp(self): 14 | pass 15 | 16 | def test_Lda_LdaCgsSeq(self): 17 | m=LDA(multiprocessing=False) 18 | self.assertTrue(isinstance(m, LdaCgsSeq)) 19 | 20 | def test_Lda_LdaCgsSeq_seed_or_seeds(self): 21 | m=LDA(multiprocessing=False, seed_or_seeds=2) 22 | self.assertTrue(isinstance(m, LdaCgsSeq)) 23 | self.assertTrue(m.seed == 2) 24 | if platform.system() != 'Windows': 25 | with self.assertRaises(ValueError): 26 | m=LDA(multiprocessing=False, seed_or_seeds=[2,4]) 27 | 28 | 29 | def test_Lda_proper_class(self): 30 | m=LDA(multiprocessing=True) 31 | if platform.system() == 'Windows': 32 | self.assertTrue(isinstance(m,LdaCgsSeq)) 33 | else: 34 | self.assertTrue(isinstance(m,LdaCgsMulti)) 35 | 36 | def test_Lda_LdaCgsMulti_seed_or_seeds(self): 37 | m=LDA(multiprocessing=True, seed_or_seeds=[2,4], n_proc=2) 38 | if platform.system() == 'Windows': 39 | self.assertTrue(isinstance(m,LdaCgsSeq)) 40 | self.assertTrue(m.seed == 2) 41 | else: 42 | self.assertTrue(isinstance(m,LdaCgsMulti)) 43 | self.assertTrue(m.seeds == [2,4]) 44 | 45 | # test improper numper of seed_or_seeds with multiprocessing 46 | if platform.system() != 'Windows': 47 | with self.assertRaises(ValueError): 48 | m=LDA(multiprocessing=True, seed_or_seeds=[2], n_proc=2) 49 | 50 | 51 | if __name__ == '__main__': 52 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLdaCgsMulti) 53 | unittest.TextTestRunner(verbosity=2).run(suite) 54 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | #language: python 2 | env: 3 | - PYTHON_VERSION="2.7" NUMPY_VERSION="1.12.1" 4 | - PYTHON_VERSION="3.5" NUMPY_VERSION="1.12.1" 5 | - PYTHON_VERSION="3.6" NUMPY_VERSION="1.12.1" 6 | - PYTHON_VERSION="3.7" NUMPY_VERSION="1.14.6" 7 | # command to install dependencies 8 | os: 9 | - linux 10 | - osx 11 | install: 12 | # Install conda 13 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 14 | if [[ "$PYTHON_VERSION" == "2.7" ]]; then 15 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 16 | else 17 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 18 | fi; else 19 | if [[ "$PYTHON_VERSION" == "2.7" ]]; then 20 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh; 21 | else 22 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 23 | fi 24 | fi 25 | - bash miniconda.sh -b -p $HOME/miniconda 26 | - export PATH="$HOME/miniconda/bin:$PATH" 27 | - hash -r 28 | - conda config --set always_yes yes --set changeps1 no 29 | - conda update -q conda --yes 30 | - conda create -n py -q --yes pip numpy=$NUMPY_VERSION scipy scikit-learn unidecode nltk chardet cython "python=$PYTHON_VERSION" 31 | - source activate py 32 | - which python 33 | - which pip 34 | - python --version 35 | - pip --version 36 | # command to install dependencies 37 | - pip install coveralls 38 | - python -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')" 39 | - pip install . 
40 | # command to run tests 41 | before_script: 42 | - pip install unittest2 nose 43 | script: bash coverage.sh 44 | after_success: 45 | - coveralls 46 | - pwd 47 | - pip install wheel 48 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then 49 | python setup.py bdist_egg; 50 | else 51 | python setup.py bdist_wheel --plat-name=macosx_10_7_x86_64; 52 | fi 53 | - ls dist 54 | 55 | addons: 56 | artifacts: 57 | paths: 58 | - $(ls dist/*.whl | tr "\n" ":") 59 | - $(ls dist/*.egg | tr "\n" ":") 60 | target_paths: 61 | - /$TRAVIS_OS_NAME/$TRAVIS_BUILD_NUMBER 62 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | # For Python versions available on Appveyor, see 4 | # http://www.appveyor.com/docs/installed-software#python 5 | - PYTHON: "C:\\Miniconda" 6 | NUMPY: "1.12.1" 7 | - PYTHON: "C:\\Miniconda-x64" 8 | NUMPY: "1.12.1" 9 | - PYTHON: "C:\\Miniconda36" 10 | NUMPY: "1.12.1" 11 | - PYTHON: "C:\\Miniconda36-x64" 12 | NUMPY: "1.12.1" 13 | # Currently doing manual 3.7 builds due to Appveyor issues 14 | # - PYTHON: "C:\\Miniconda37" 15 | # - PYTHON: "C:\\Miniconda37-x64" 16 | 17 | install: 18 | - "setx path \"%path%;%PYTHON%\\Library\\bin\"" 19 | - "%PYTHON%\\python.exe -m conda update -q --yes conda" 20 | - "%PYTHON%\\python.exe -m conda install -q --yes pip numpy=%NUMPY% scipy scikit-learn unidecode nltk chardet cython sqlite" 21 | - "%PYTHON%\\python.exe -c \"import nltk; nltk.download('stopwords'); nltk.download('punkt')\"" 22 | - "%PYTHON%\\python.exe -m pip install unittest2 nose" 23 | - "%PYTHON%\\python.exe -m pip install ." 24 | 25 | build: off 26 | version: '1.0.{build}' 27 | 28 | test_script: 29 | # Put your test command here. 30 | # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, 31 | # you can remove "build.cmd" from the front of the command, as it's 32 | # only needed to support those cases. 33 | # Note that you must use the environment variable %PYTHON% to refer to 34 | # the interpreter you're using - Appveyor does not do anything special 35 | # to put the Python evrsion you want to use on PATH. 36 | - "%PYTHON%\\python.exe setup.py test" 37 | 38 | after_test: 39 | # This step builds your wheels. 40 | # Again, you only need build.cmd if you're building C extensions for 41 | # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct 42 | # interpreter 43 | - "%PYTHON%\\python.exe setup.py bdist_wheel" 44 | 45 | artifacts: 46 | # bdist_wheel puts your built wheel in the dist directory 47 | - path: dist\* 48 | 49 | #on_success: 50 | # You can use this step to upload your artifacts to a public website. 51 | # See Appveyor's documentation for more details. Or you can simply 52 | # access your wheels from the Appveyor "artifacts" tab for your build. 
53 | -------------------------------------------------------------------------------- /unit_tests/tests_tfidf.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from scipy.sparse import coo_matrix 4 | from vsm.model import tfidf 5 | from vsm.model import tf 6 | 7 | 8 | class TestTfIdf(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.corpus = np.array([0, 1, 3, 1, 1, 0, 3, 0, 3, 12 | 3, 0, 1, 0, 13 | 3, 14 | 1, 3]) 15 | self.docs = [slice(0,9), slice(9,13), 16 | slice(13,14), slice(14,16)] 17 | self.V = 4 18 | 19 | self.tf_mat = coo_matrix(np.array([[3, 2, 0, 0], 20 | [3, 1, 0, 1], 21 | [0, 0, 0, 0], 22 | [3, 1, 1, 1]], dtype=int)) 23 | self.tfidf_mat = np.array(\ 24 | [[2.0794415, 1.3862944, 0, 0], 25 | [0.86304623, 0.28768209, 0, 0.28768209], 26 | [0, 0, 0, 0], 27 | [0, 0, 0, 0]]) 28 | self.undefined_rows = [2] 29 | 30 | 31 | def test_TfIdf_train(self): 32 | m = tfidf.TfIdf() 33 | m.train() 34 | self.assertTrue(m.matrix.size == 0) 35 | self.assertTrue(len(m.undefined_rows) == 0) 36 | 37 | m = tfidf.TfIdf(tf_matrix=self.tf_mat) 38 | m.train() 39 | np.testing.assert_almost_equal(self.tfidf_mat, m.matrix.toarray()) 40 | self.assertEqual(m.undefined_rows, self.undefined_rows) 41 | 42 | def test_TfIdf_from_tf(self): 43 | tf_model = tf.TF() 44 | tf_model.corpus = self.corpus 45 | tf_model.docs = self.docs 46 | tf_model.V = self.V 47 | tf_model.train() 48 | self.assertTrue((self.tf_mat == tf_model.matrix.toarray()).all()) 49 | 50 | m = tfidf.TfIdf.from_tf(tf_model) 51 | self.assertTrue((m.matrix == tf_model.matrix.toarray()).all()) 52 | m.train() 53 | np.testing.assert_almost_equal(self.tfidf_mat, m.matrix.toarray()) 54 | self.assertEqual(m.undefined_rows, self.undefined_rows) 55 | 56 | 57 | #Define and run test suite 58 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTfIdf) 59 | unittest.TextTestRunner(verbosity=2).run(suite) 60 | -------------------------------------------------------------------------------- /vsm/viewer/types.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import int 3 | from past.builtins import basestring 4 | import numpy as np 5 | 6 | 7 | __all__ = ['isfloat', 'isint', 'isstr', 8 | 'res_doc_type', 'res_top_type', 'res_word_type'] 9 | 10 | 11 | 12 | # 13 | # Rudimentary type-checking fns 14 | # 15 | 16 | 17 | def isfloat(x): 18 | """ 19 | Returns True if `x` is an instance of a float. 20 | """ 21 | return (isinstance(x, np.inexact) or isinstance(x, float)) 22 | 23 | 24 | def isint(x): 25 | """ 26 | Returns True if `x` is an instance of an int. 27 | """ 28 | return (isinstance(x, np.integer) or isinstance(x, int)) 29 | 30 | 31 | def isstr(x): 32 | """ 33 | Returns True if `x` is an instance of a string. 34 | """ 35 | return isinstance(x, basestring) or isinstance(x, np.flexible) 36 | 37 | 38 | # 39 | # fns to resolve input polymorphism to the dist_*_* fns 40 | # 41 | 42 | 43 | def res_doc_type(corp, context_type, label_name, doc): 44 | """ 45 | If `doc` is a string or a dict, performs a look up for its 46 | associated integer. If `doc` is a dict, looks for its label. 47 | Finally, if `doc` is an integer, stringifies `doc` for use as 48 | a label. 49 | 50 | Returns an integer, string pair: (, ). 
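    (That is, the document's integer index paired with its string label.)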
52 | """ 53 | if isstr(doc): 54 | query = {label_name: doc} 55 | d = corp.meta_int(context_type, query) 56 | elif isinstance(doc, dict): 57 | d = corp.meta_int(context_type, doc) 58 | 59 | #TODO: Define an exception for failed queries in 60 | #vsm.corpus. Use it here. 61 | doc = corp.view_metadata(context_type)[label_name][d] 62 | else: 63 | d, doc = doc, str(doc) 64 | 65 | return d, doc 66 | 67 | 68 | def res_top_type(topic_or_topics): 69 | """ 70 | If `topic_or_topics` is an int, then returns it in a list. 71 | """ 72 | if isint(topic_or_topics): 73 | topic_or_topics = [topic_or_topics] 74 | 75 | return topic_or_topics 76 | 77 | 78 | def res_word_type(corp, word): 79 | """ 80 | If `word` is a string, performs a look up for its associated 81 | integer. Otherwise, stringifies `word`. 82 | 83 | Returns an integer, string pair: (, ). 84 | """ 85 | if isstr(word): 86 | return corp.words_int[word], word 87 | 88 | return word, str(word) 89 | -------------------------------------------------------------------------------- /unit_tests/tests_tf.py: -------------------------------------------------------------------------------- 1 | from builtins import object 2 | import unittest 3 | import numpy as np 4 | 5 | from vsm.model import tf 6 | from multiprocessing import Process 7 | import platform 8 | 9 | class MPTester(object): 10 | def setUp(self): 11 | self.corpus = np.array([0, 1, 3, 1, 1, 0, 3, 0, 3, 12 | 3, 0, 1, 0, 13 | 1, 3]) 14 | self.docs = [slice(0,9), slice(9,13), 15 | slice(13,13), slice(13,15)] 16 | self.V = 4 17 | self.cnt_mat = np.array([[3, 2, 0, 0], 18 | [3, 1, 0, 1], 19 | [0, 0, 0, 0], 20 | [3, 1, 0, 1]]) 21 | 22 | def test_TfMulti_train(self): 23 | self.setUp() 24 | m = tf.TfMulti() 25 | m.corpus = self.corpus 26 | m.docs = self.docs 27 | m.V = self.V 28 | m.train(2) 29 | 30 | assert (self.cnt_mat == m.matrix.toarray()).all() 31 | 32 | class TestTf(unittest.TestCase): 33 | 34 | def setUp(self): 35 | self.corpus = np.array([0, 1, 3, 1, 1, 0, 3, 0, 3, 36 | 3, 0, 1, 0, 37 | 1, 3]) 38 | self.docs = [slice(0,9), slice(9,13), 39 | slice(13,13), slice(13,15)] 40 | self.V = 4 41 | self.cnt_mat = np.array([[3, 2, 0, 0], 42 | [3, 1, 0, 1], 43 | [0, 0, 0, 0], 44 | [3, 1, 0, 1]]) 45 | 46 | def test_TF_proper_class(self): 47 | m = tf.TF(multiprocessing=True) 48 | if platform.system() == 'Windows': 49 | self.assertTrue(isinstance(m,tf.TfSeq)) 50 | else: 51 | self.assertTrue(isinstance(m,tf.TfMulti)) 52 | 53 | def test_TfSeq_train(self): 54 | m = tf.TfSeq() 55 | m.corpus = self.corpus 56 | m.docs = self.docs 57 | m.V = self.V 58 | m.train() 59 | self.assertTrue((self.cnt_mat == m.matrix.toarray()).all()) 60 | 61 | def test_demo_TfMulti_train(self): 62 | t = MPTester() 63 | p = Process(target=t.test_TfMulti_train, args=()) 64 | p.start() 65 | p.join() 66 | 67 | 68 | 69 | #Define and run test suite 70 | if __name__ == '__main__': 71 | suite = unittest.TestLoader().loadTestsFromTestCase(TestTf) 72 | unittest.TextTestRunner(verbosity=2).run(suite) 73 | -------------------------------------------------------------------------------- /vsm/model/beaglecomposite.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | 5 | from vsm.model.base import BaseModel 6 | from vsm.model.beaglecontext import realign_env_mat 7 | 8 | 9 | __all__ = [ 'BeagleComposite' ] 10 | 11 | 12 | class BeagleComposite(BaseModel): 13 | """ 14 | `BeagleComposite` combines the BEAGLE order and context 
model 15 | with a user defined ratio. Default ratio is .5 which weighs 16 | order and context matrices equally. 17 | """ 18 | 19 | def __init__(self, ctx_corp, ctx_matrix, 20 | ord_corp, ord_matrix, context_type='sentence'): 21 | """ 22 | Assume that the context corpus is a subcorpus of the order 23 | corpus and that the eventual composite corpus is the context 24 | corpus. The order matrix is sliced and reordered so that it 25 | aligns with the context matrix. 26 | 27 | :param ctx_corp: Corpus from BEAGLE context model. 28 | :type ctx_corp: :class:`Corpus` 29 | 30 | :param ctx_matrix: BEAGLE context matrix. 31 | :type ctx_matrix: np.ndarray matrix 32 | 33 | :param ord_corp: Corpus from BEAGLE order model. 34 | :type ord_corp: :class:`Corpus` 35 | 36 | :param ord_matrix: BEAGLE order matrix. 37 | :type ord_matrix: np.ndarray matrix 38 | 39 | :param context_type: Name of tokenization stored in `corpus` whose 40 | tokens will be treated as documents. Default is `sentence`. 41 | :type context_type: string, optional 42 | """ 43 | self.ctx_matrix = (ctx_matrix / 44 | ((ctx_matrix**2).sum(1)**0.5)[:,np.newaxis]) 45 | self.ord_matrix = realign_env_mat(ctx_corp, ord_corp, ord_matrix) 46 | self.ord_matrix /= ((self.ord_matrix**2).sum(1)**0.5)[:,np.newaxis] 47 | self.context_type = context_type 48 | 49 | 50 | def train(self, wgt=.5): 51 | """ 52 | Combines the context and order matrices blended by `wgt` ratio. 53 | 54 | :param wgt: The weight of context model. If `wgt` is .7 then 55 | the ratio of context and order model is 7:3. `wgt` should be 56 | a value in [0,1]. Default is .5. 57 | :type wgt: float, optional 58 | 59 | :returns: `None` 60 | """ 61 | print('Summing context and order vectors') 62 | self.matrix = wgt * self.ctx_matrix + (1 - wgt) * self.ord_matrix 63 | 64 | -------------------------------------------------------------------------------- /vsm/split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for splitting lists and arrays 3 | """ 4 | 5 | 6 | from builtins import range 7 | import numpy as np 8 | 9 | 10 | __all__ = ['split_corpus', 'mp_split_ls', 'split_documents'] 11 | 12 | 13 | 14 | def split_corpus(arr, indices): 15 | """ 16 | Splits the given array by the indices into list of sub-arrays. 17 | 18 | :param arr: An array to be split. 19 | :type arr: array 20 | :param indices: 1-dimensional array of integers that indicates 21 | where the array is split. 22 | :type indices: array 23 | 24 | :returns: A list of sub-arrays split at the indices. 25 | 26 | **Examples** 27 | 28 | >>> arr = np.arange(8) 29 | >>> indices = np.array([2,4,7]) 30 | >>> split_corpus(arr, indices) 31 | [array([0,1]), array([2,3]), array([4,5,6]), array([7])] 32 | """ 33 | if len(indices) == 0: 34 | return arr 35 | 36 | if isinstance(indices, list): 37 | indices = np.array(indices) 38 | 39 | out = np.split(arr, indices) 40 | 41 | if (indices >= len(arr)).any(): 42 | out = out[:-1] 43 | try: 44 | for i in range(len(out)): 45 | if out[i].size == 0: 46 | out[i] = np.array([], dtype=arr.dtype) 47 | except AttributeError: 48 | for i in range(len(out)): 49 | if out[i].size == 0: 50 | out[i] = np.array([]) 51 | 52 | return out 53 | 54 | 55 | 56 | def mp_split_ls(ls, n): 57 | """ 58 | Split list into an `n`-length list of arrays. 59 | 60 | :param ls: List to be split. 61 | :type ls: list 62 | 63 | :param n: Number of splits. 64 | :type n: int 65 | 66 | :returns: List of arrays whose length is 'n'. 
67 | 68 | **Examples** 69 | >>> ls = [1,5,6,8,2,8] 70 | >>> mp_split_ls(ls, 4) 71 | [array([1, 5]), array([6, 8]), array([2]), array([8])] 72 | """ 73 | return np.array_split(ls, min(len(ls), n)) 74 | 75 | 76 | def split_documents(corpus, indices, max_partitions): 77 | """ 78 | """ 79 | docs = [(0, indices[0])] 80 | for i in range(len(indices)-1): 81 | docs.append((indices[i], indices[i+1])) 82 | docs = np.array(docs, dtype='i8, i8') 83 | 84 | corpus_chunks = np.array_split(corpus, max_partitions) 85 | chunk_indices = np.cumsum([len(chunk) for chunk in corpus_chunks]) 86 | doc_indices = np.searchsorted(indices, chunk_indices, side='right') 87 | doc_partitions = np.split(docs, doc_indices[:-1]) 88 | 89 | doc_partitions = [part for part in doc_partitions if part.size] 90 | 91 | return doc_partitions 92 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, Command, find_packages 2 | import platform 3 | import numpy 4 | 5 | from Cython.Build import cythonize 6 | 7 | 8 | # find packages in vsm subdirectory 9 | # this will skip the unittests, etc. 10 | packages = ['vsm.'+pkg for pkg in find_packages('vsm')] 11 | packages.append('vsm') 12 | 13 | install_requires=[ 14 | 'chardet', 15 | 'cython', 16 | 'future', 17 | 'matplotlib', 18 | 'nltk', 19 | 'numpy', 20 | 'progressbar2', 21 | 'py4j', 22 | 'scikit_learn', 23 | 'scipy', 24 | 'sortedcontainers', 25 | 'translate', 26 | 'Unidecode', 27 | ] 28 | 29 | if platform.python_version_tuple()[0] == '2': 30 | install_requires.append("futures>=3.0.0") 31 | install_requires.append("backports.tempfile==1.0") 32 | 33 | setup( 34 | name = "vsm", 35 | version = "1.0.0b1", 36 | description = ('Vector Space Semantic Modeling Framework '\ 37 | 'for the Indiana Philosophy Ontology Project'), 38 | author = "The Indiana Philosophy Ontology (InPhO) Project", 39 | author_email = "inpho@indiana.edu", 40 | url = "http://inpho.cogs.indiana.edu/", 41 | download_url = "http://www.github.com/inpho/vsm", 42 | keywords = [], 43 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 44 | classifiers = [ 45 | "Programming Language :: Python", 46 | "Programming Language :: Python :: 2", 47 | "Programming Language :: Python :: 3", 48 | "Development Status :: 5 - Production/Stable", 49 | "Environment :: Console", 50 | "Intended Audience :: Developers", 51 | "Intended Audience :: Science/Research", 52 | "License :: OSI Approved :: MIT License", 53 | "Operating System :: OS Independent", 54 | "Topic :: Software Development :: Libraries :: Python Modules", 55 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 56 | "Topic :: Text Processing :: Linguistic", 57 | ], 58 | install_requires=install_requires, 59 | license = 'MIT', 60 | packages=packages, 61 | ext_modules = cythonize( 62 | Extension( 63 | "vsm.model._cgs_update", 64 | sources=["vsm/model/_cgs_update.pyx"], 65 | include_dirs=[numpy.get_include()] 66 | ) 67 | ), 68 | zip_safe = False, 69 | package_data = {'vsm': ['vsm/model/_cgs_update.pyx']}, 70 | dependency_links=['https://inpho.cogs.indiana.edu/pypi/pymmseg/'], 71 | 72 | test_suite = "unittest2.collector", 73 | tests_require=['unittest2'], 74 | ) 75 | -------------------------------------------------------------------------------- /unit_tests/tests_beaglecontext.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins 
import range 3 | import unittest 4 | import numpy as np 5 | 6 | 7 | 8 | class TestBeagleContext(unittest.TestCase): 9 | 10 | def setUp(self): 11 | from vsm.corpus.util.corpusbuilders import random_corpus 12 | from vsm.model.beaglecontext import BeagleContextSeq, BeagleContextMulti 13 | from vsm.model.beagleenvironment import BeagleEnvironment 14 | 15 | self.ec = random_corpus(1000, 50, 0, 5, context_type='sentence') 16 | self.cc = self.ec.apply_stoplist(stoplist=[str(i) for i in range(0,50,7)]) 17 | 18 | self.e = BeagleEnvironment(self.ec, n_cols=5) 19 | self.e.train() 20 | 21 | self.ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix) 22 | self.ms.train() 23 | ''' 24 | self.mm = BeagleContextMulti(self.cc, self.ec, self.e.matrix) 25 | self.mm.train(n_procs=2) 26 | ''' 27 | 28 | 29 | def test_BeagleContextSeq(self): 30 | from tempfile import NamedTemporaryFile 31 | import os 32 | 33 | from vsm.model.beaglecontext import BeagleContextSeq 34 | try: 35 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 36 | self.ms.save(tmp.name) 37 | tmp.close() 38 | m1 = BeagleContextSeq.load(tmp.name) 39 | self.assertTrue((self.ms.matrix == m1.matrix).all()) 40 | 41 | finally: 42 | os.remove(tmp.name) 43 | 44 | 45 | ''' 46 | def test_BeagleContextMulti(self): 47 | from tempfile import NamedTemporaryFile 48 | import os 49 | 50 | from vsm.model.beaglecontext import BeagleContextMulti 51 | try: 52 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 53 | self.mm.save(tmp.name) 54 | tmp.close() 55 | m1 = BeagleContextMulti.load(tmp.name) 56 | self.assertTrue((self.mm.matrix == m1.matrix).all()) 57 | 58 | finally: 59 | os.remove(tmp.name) 60 | 61 | 62 | 63 | def test_compare(self): 64 | 65 | print 'Training single processor model' 66 | ms = BeagleContextSeq(self.cc, self.ec, self.e.matrix) 67 | ms.train() 68 | 69 | print 'Training multiprocessor model' 70 | mm = BeagleContextMulti(self.cc, self.ec, self.e.matrix) 71 | mm.train() 72 | 73 | self.assertTrue(np.allclose(ms.matrix, mm.matrix)) 74 | ''' 75 | 76 | #Define and run test suite 77 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleContext) 78 | unittest.TextTestRunner(verbosity=2).run(suite) 79 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/org/knowceans/gibbstest/LDA.java: -------------------------------------------------------------------------------- 1 | package org.knowceans.gibbstest; 2 | 3 | import java.text.DecimalFormat; 4 | import java.text.NumberFormat; 5 | import java.io.IOException; 6 | import java.util.*; 7 | import java.io.StringWriter; 8 | 9 | import py4j.GatewayServer; 10 | 11 | 12 | public class LDA { 13 | 14 | private static FileArrayProvider fap; 15 | private static LdaGibbsSampler lda; 16 | 17 | public LDA(String corpFile) throws IOException { 18 | 19 | FileArrayProvider fap = new FileArrayProvider(); 20 | this.fap = fap; 21 | 22 | int[][] documents = fap.readFile(corpFile); 23 | 24 | List vli = new ArrayList(); 25 | for (int[] d : documents) { 26 | for (int i : d) { 27 | if (!vli.contains(i)) { 28 | vli.add(i); 29 | } 30 | } 31 | } 32 | 33 | int V = vli.size(); 34 | int M = documents.length; 35 | System.out.println("V, M "+ V + " " + M); 36 | LdaGibbsSampler lda = new LdaGibbsSampler(documents, V); 37 | 38 | this.lda = lda; 39 | } 40 | 41 | public LdaGibbsSampler getLda() { 42 | return this.lda; 43 | } 44 | 45 | public FileArrayProvider getFap() { 46 | return this.fap; 47 | } 48 | 49 | public static void sample(int iter, int K, double alpha, double 
beta) { 50 | // configure(iter, burnin, thinInterval, sampleLag) default values 51 | // from LdaGibbsSampler example. 52 | lda.configure(iter, 2000, 100, 10); 53 | lda.gibbs(K, alpha, beta); 54 | } 55 | 56 | public static void main(String[] args) throws IOException { 57 | // Note: iter=1000 returns NaN for all phi, theta 58 | String corpfile = args[0]; 59 | LDA ldai = new LDA(corpfile); 60 | GatewayServer gatewayServer = new GatewayServer(ldai); 61 | gatewayServer.start(); 62 | System.out.println("Gateway Server Started!"); 63 | } 64 | 65 | public void writeMeta(int iter, int K, double alpha, double beta, 66 | String metaFile) throws IOException { 67 | 68 | String s = ""; 69 | s += "K," + K + "\n"; 70 | s += "iteration," + iter + "\n"; 71 | s += "m_words," + this.getLda().V + "\n"; 72 | s += "doc_prior," + alpha + "\n"; 73 | s += "top_prior," + beta + "\n"; 74 | // add dummy values 75 | s += "inv_top_sums," + "0.0\n"; 76 | s += "log_probs," + "0.0\n"; 77 | 78 | this.getFap().writeStrFile(s, metaFile); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /vsm/model/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from builtins import object 3 | import numpy as np 4 | 5 | __all__ = ['BaseModel'] 6 | 7 | 8 | 9 | 10 | class BaseModel(object): 11 | """ 12 | Base class for models which store data in a single matrix. 13 | 14 | :param matrix: A two-dimensional numpy array storing the results 15 | of model training. Default is `None`. 16 | :type matrix: numpy.ndarray, optional 17 | 18 | :param context_type: A string specifying the type of context over 19 | which the model trainer is applied. Default is `None`. 20 | :type context_type: string, optional 21 | 22 | :attributes: 23 | Same as parameters. 24 | 25 | :methods: 26 | * **save** 27 | Takes a filename or file object and saves `self.matrix` 28 | in an npz archive. 29 | * **load** 30 | Takes a filename or file object and loads it as an npz 31 | archive into a BaseModel object. 32 | 33 | :See Also: :meth:`numpy.savez`, :meth:`numpy.load` 34 | """ 35 | def __init__(self, matrix=None, context_type=None): 36 | self.matrix = matrix 37 | self.context_type = context_type 38 | 39 | def save(self, f): 40 | """ 41 | Takes a filename or file object and saves `self.matrix` in an 42 | npz archive. 43 | 44 | :param file: Designates the file to which to save data. See 45 | `numpy.savez` for further details. 46 | :type file: str-like or file-like object 47 | 48 | :returns: `None` 49 | 50 | :See Also: :meth:`numpy.savez` 51 | """ 52 | print('Saving model to {}'.format(f)) 53 | np.savez(f, matrix=np.array(self.matrix), context_type=np.array(self.context_type)) 54 | 55 | 56 | @staticmethod 57 | def load(f): 58 | """ 59 | Takes a filename or file object and loads it as an npz archive 60 | into a BaseModel object. 61 | 62 | :param file: Designates the file to read. If `file` is a string 63 | ending in `.gz`, the file is first gunzipped. See `numpy.load` 64 | for further details. 65 | :type file: str-like or file-like object 66 | 67 | :returns: A dictionary storing the data found in `file`. 
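        (Concretely, the `matrix` and `context_type` entries in the archive
        are unpacked into a new `BaseModel` instance.)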
68 | 69 | :See Also: :meth:`numpy.load` 70 | """ 71 | print('Loading model from {}'.format(f)) 72 | npz = np.load(f) 73 | 74 | # The slice [()] is to unwrap sparse matrices, which get saved 75 | # in singleton object arrays 76 | return BaseModel(matrix=npz['matrix'], context_type=npz['context_type']) 77 | -------------------------------------------------------------------------------- /unit_tests/tests_viewer_wrappers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.wrappers import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | 9 | 10 | class TestViewerWrappers(unittest.TestCase): 11 | 12 | # TODO: Rewrite these to be independent of LDA 13 | pass 14 | 15 | # def setUp(self): 16 | 17 | # from vsm.corpus.util.corpusbuilders import random_corpus 18 | # from vsm.model.ldacgsseq import LdaCgsSeq 19 | 20 | # self.c = random_corpus(1000, 50, 0, 20, context_type='sentence', 21 | # metadata=True) 22 | 23 | # self.m = LDAGibbs(self.c, 'sentence', K=40) 24 | # self.m.train(n_iterations=50) 25 | 26 | 27 | # def test_dist_(self): 28 | 29 | # li = [0,1] 30 | 31 | # swt = dist_word_top(self.c, self.m.word_top.T, '0') 32 | # swtl = dist_word_top(self.c, self.m.word_top.T, ['0','1'], order='i') 33 | # sww = dist_word_word(self.c, self.m.word_top, '0') 34 | # swwl = dist_word_word(self.c, self.m.word_top, ['0','1'], order='i') 35 | # std = dist_top_doc(self.c, self.m.top_doc.T, 0, 'sentence', order='i') 36 | # stdl = dist_top_doc(self.c, self.m.top_doc.T, li, 'sentence') 37 | # sdd = dist_doc_doc(self.c, self.m.top_doc, self.m.context_type, 0) 38 | # sddl = dist_doc_doc(self.c, self.m.top_doc, self.m.context_type, li) 39 | # stt = dist_top_top(self.m.word_top.T, 1) 40 | # sttl = dist_top_top(self.m.word_top.T, li) 41 | 42 | # self.assertEqual(type(swt), LabeledColumn) 43 | # self.assertEqual(type(swtl), LabeledColumn) 44 | # self.assertEqual(type(sww), LabeledColumn) 45 | # self.assertEqual(type(swwl), LabeledColumn) 46 | # self.assertEqual(type(std), LabeledColumn) 47 | # self.assertEqual(type(stdl), LabeledColumn) 48 | # self.assertEqual(type(sdd), LabeledColumn) 49 | # self.assertEqual(type(sddl), LabeledColumn) 50 | # self.assertEqual(type(stt), LabeledColumn) 51 | # self.assertEqual(type(sttl), LabeledColumn) 52 | 53 | 54 | # def test_dismat_(self): 55 | 56 | # dismatw = dismat_word(['0','2','5'], self.c, self.m.word_top) 57 | # dismatd = dismat_doc([0,1,2], self.c, self.m.context_type, 58 | # self.m.top_doc) 59 | # dismatt = dismat_top([0,1,2], self.m.word_top) 60 | 61 | # self.assertEqual(type(dismatw), IndexedSymmArray) 62 | # self.assertEqual(type(dismatd), IndexedSymmArray) 63 | # self.assertEqual(type(dismatt), IndexedSymmArray) 64 | 65 | 66 | 67 | #Define and run test suite 68 | suite = unittest.TestLoader().loadTestsFromTestCase(TestViewerWrappers) 69 | unittest.TextTestRunner(verbosity=2).run(suite) 70 | -------------------------------------------------------------------------------- /unit_tests/tests_labeleddata.py: -------------------------------------------------------------------------------- 1 | from builtins import str 2 | from builtins import zip 3 | from builtins import range 4 | from past.builtins import basestring 5 | 6 | import unittest 7 | import numpy as np 8 | 9 | from vsm.viewer.labeleddata import * 10 | 11 | 12 | class TestLabeleddata(unittest.TestCase): 13 | 14 | def setUp(self): 15 | 16 | words = ['row', 'row', 'row', 'your', 'boat', 'gently', 'down', 'the', 17 | 
'stream', 'merrily', 'merrily', 'merrily', 'merrily', 'life', 18 | 'is', 'but', 'a', 'dream'] 19 | values = [np.random.random() for t in words] 20 | d = [('i', np.array(words).dtype), 21 | ('value', np.array(values).dtype)] 22 | self.v = np.array(list(zip(words, values)), dtype=d) 23 | 24 | 25 | 26 | def test_LabeledColumn(self): 27 | 28 | arr = self.v.view(LabeledColumn) 29 | arr.subcol_headers = ['Word', 'Value'] 30 | arr.col_header = 'Song lets make this longer than subcol headers' 31 | arr.col_len = 10 32 | arr1 = self.v.view(LabeledColumn) 33 | 34 | self.assertTrue(isinstance(arr.__str__(), basestring)) 35 | self.assertTrue(sum(arr.subcol_widths) <= arr.col_width) 36 | self.assertEqual(arr.shape[0], arr1.col_len) 37 | self.assertFalse(arr1.col_header) 38 | self.assertFalse(arr1.subcol_headers) 39 | 40 | 41 | def test_DataTable(self): 42 | 43 | v = LabeledColumn(self.v) 44 | v.subcol_widths = [30, 20] 45 | v.col_len = 10 46 | t = [] 47 | for i in range(5): 48 | t.append(v.copy()) 49 | t[i].col_header = 'Iteration ' + str(i) 50 | 51 | schc = ['Topic', 'Word'] 52 | schf = ['Word', 'Value'] 53 | t = DataTable(t, 'Song', subcolhdr_compact=schc, subcolhdr_full=schf) 54 | 55 | self.assertTrue(isinstance(t.__str__(), basestring)) 56 | self.assertTrue('Song', t.table_header) 57 | 58 | t.compact_view = False 59 | self.assertTrue(isinstance(t.__str__(), basestring)) 60 | self.assertTrue('Song', t.table_header) 61 | 62 | 63 | 64 | def test_IndexedSymmArray(self): 65 | 66 | from vsm.corpus.util.corpusbuilders import random_corpus 67 | from vsm.model.ldacgsseq import LdaCgsSeq 68 | from vsm.viewer.ldacgsviewer import LdaCgsViewer 69 | 70 | c = random_corpus(50000, 1000, 0, 50) 71 | m = LdaCgsSeq(c, 'document', K=20) 72 | viewer = LdaCgsViewer(c, m) 73 | 74 | li = [0, 1, 10] 75 | isa = viewer.dismat_top(li) 76 | 77 | self.assertEqual(isa.shape[0], len(li)) 78 | 79 | 80 | 81 | 82 | #Define and run test suite 83 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLabeleddata) 84 | unittest.TextTestRunner(verbosity=2).run(suite) 85 | -------------------------------------------------------------------------------- /vsm/extensions/trans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | import nltk 5 | import re 6 | import os 7 | from unidecode import unidecode 8 | from translate import Translator as Ts 9 | from vsm.corpus.util import * 10 | 11 | """ 12 | Uses `translate` python module from https://pypi.python.org/pypi/translate 13 | """ 14 | 15 | def sent_tokenize(text, lang='english'): 16 | tokenizer = nltk.data.load('tokenizers/punkt/{0}.pickle'.format(lang)) 17 | return tokenizer.tokenize(text) 18 | 19 | def cleanup(s): 20 | s = rehyph(s) 21 | s = s.strip('\n') 22 | def replace(match): 23 | if match: 24 | if match.group(0).startswith(r'\*'): 25 | return unidecode(match.group(0)) 26 | else: 27 | return '' 28 | 29 | return re.sub(r"[\x90-\xff]", replace, s) 30 | 31 | 32 | def transwrapper(text, from_lang, to_lang): 33 | 34 | if from_lang == 'en': 35 | lang = 'english' 36 | elif from_lang == 'fr': 37 | lang = 'french' 38 | elif from_lang == 'de': 39 | lang = 'german' 40 | sli = sent_tokenize(text, lang=lang) 41 | 42 | out = '' 43 | for sent in sli: 44 | sent = cleanup(sent) 45 | 46 | ts = Ts(from_lang=from_lang, to_lang=to_lang) 47 | target = ts.translate(sent) 48 | out += target 49 | 50 | return out 51 | 52 | 53 | if __name__=="__main__": 54 | frompath = 'darwin-de/' 55 | topath = 
'darwin-de-translate/' 56 | 57 | books = os.listdir(frompath) 58 | books.sort() 59 | 60 | for book in books: 61 | book_path = os.path.join(frompath, book) 62 | print(book_path) 63 | pages = os.listdir(book_path) 64 | pages.sort() 65 | 66 | for page in pages: 67 | page_name = os.path.join(book_path, page) 68 | 69 | with open(page_name, 'r') as f: 70 | try: 71 | out = transwrapper(f.read(), 'de', 'en') 72 | out = out.encode('utf-8') 73 | except: 74 | out = '' 75 | print(page_name, ' failed translation.') 76 | 77 | try: 78 | os.mkdir(os.path.join(topath, book)) 79 | except OSError: 80 | pass 81 | topage = os.path.join(topath, book, page) 82 | #with open(topage, 'w') as fout: 83 | """ 84 | # for individual pages 85 | fin = 'darwin-de/wu.89101307601/00000636.txt' 86 | fout = 'darwin-de-translate/wu.89101307601/00000636.txt' 87 | 88 | with open(fin, 'r') as f: 89 | out = transwrapper(f.read(), 'de', 'en') 90 | out = out.encode('utf-8') 91 | with open(fout, 'w') as fo: 92 | fo.write(out) 93 | fout.write(out)""" 94 | -------------------------------------------------------------------------------- /vsm/extensions/clustering/plotting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def gen_colors(clusters): 4 | """ 5 | Takes 'clusters' and creates a list of colors so a cluster has a color. 6 | 7 | :param clusters: A flat list of integers where an integer represents which 8 | cluster the information belongs to. 9 | :type clusters: list 10 | 11 | :returns: colorm : list 12 | A list of colors obtained from matplotlib colormap cm.hsv. The 13 | length of 'colorm' is the same as the number of distinct 14 | clusters. 15 | """ 16 | import matplotlib.cm as cm 17 | 18 | n = len(set(clusters)) 19 | colorm = [cm.hsv(i * 1.0 /n, 1) for i in xrange(n)] 20 | return colorm 21 | 22 | 23 | def plot_clusters(arr, labels, clusters=[], size=[]): 24 | """ 25 | Takes 2-dimensional array(simmat), list of clusters, list of labels, 26 | and list of marker size. 'clusters' should be a flat list which can be 27 | obtained from cluster_topics(by_cluster=False). 28 | Plots each clusters in different colors. 29 | 30 | :type arr: 2-dimensional array 31 | :param arr: Array has x, y coordinates to be plotted on a 2-dimensional 32 | space. 33 | 34 | :param labels: List of labels to be displayed in the graph. 35 | :type labels: list 36 | 37 | :param clusters: A flat list of integers where an integer represents which 38 | cluster the information belongs to. If not given, it returns a 39 | basic plot with no color variation. Default is an empty list. 40 | :type clusters: list, optional 41 | 42 | :param size: List of markersize for points where markersize can note the 43 | importance of the point. If not given, 'size' is a list of 44 | fixed markersize, 40. Default is an empty list. 45 | :type size: list, optional 46 | 47 | :returns: plt : maplotlit.pyplot object 48 | A graph with scatter plots from 'arr'. 
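    **Examples**

    A minimal sketch (run under Python 2, since this module still uses
    `xrange`): five 2-D points split across two clusters.

    >>> arr = np.random.random((5, 2))
    >>> labels = ['a', 'b', 'c', 'd', 'e']
    >>> plot_clusters(arr, labels, clusters=[0, 0, 0, 1, 1])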
49 | """ 50 | import matplotlib.pyplot as plt 51 | 52 | n = arr.shape[0] 53 | X = arr[:,0] 54 | Y = arr[:,1] 55 | 56 | if len(size) == 0: 57 | size = [40 for i in xrange(n)] 58 | 59 | fig = plt.figure(figsize=(10,10)) 60 | ax = plt.subplot(111) 61 | 62 | if len(clusters) == 0: 63 | plt.scatter(X, Y, size) 64 | else: 65 | colors = gen_colors(clusters) 66 | colors = [colors[i] for i in clusters] 67 | 68 | for i in xrange(n): 69 | plt.scatter(X[i], Y[i], size, color=colors[i]) 70 | 71 | ax.set_xlim(np.min(X) - .1, np.max(X) + .1) 72 | ax.set_ylim(np.min(Y) - .1, np.max(Y) + .1) 73 | ax.set_xticks([]) 74 | ax.set_yticks([]) 75 | 76 | for label, x, y in zip(labels, X, Y): 77 | plt.annotate(label, xy = (x, y), xytext=(-2, 3), 78 | textcoords='offset points', fontsize=10) 79 | 80 | plt.show() 81 | -------------------------------------------------------------------------------- /unit_tests/tests_corpus_util.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | import unittest 3 | 4 | from vsm.corpus import add_metadata 5 | from vsm.extensions.corpusbuilders.util import * 6 | import numpy as np 7 | 8 | class TestCorpusUtil(unittest.TestCase): 9 | 10 | def test_strip_punc(self): 11 | 12 | tsent = ['foo-foo',',','3','foo','bars','bar_foo','2to1','.'] 13 | out = strip_punc(tsent) 14 | self.assertEqual(out, ['foofoo','3','foo','bars','barfoo','2to1']) 15 | 16 | 17 | def test_rem_num(self): 18 | 19 | tsent = ['foo-foo',',','3','foo','bars','2-parts','2-to-1','3words','.'] 20 | out = rem_num(tsent) 21 | self.assertEqual(out, ['foo-foo',',','foo','bars','-parts','-to-','words','.']) 22 | 23 | def test_rehyph(self): 24 | 25 | sent = 'foo foo 3 foo--bars barfoo -- 2to1.' 26 | out = rehyph(sent) 27 | self.assertEqual(out, 'foo foo 3 foo - bars barfoo - 2to1.') 28 | 29 | def test_add_metadata(self): 30 | 31 | from vsm.corpus.util.corpusbuilders import random_corpus 32 | 33 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) 34 | n = c.view_metadata('sentence').size 35 | meta = ['m_{0}'.format(i) for i in range(n)] 36 | new_c = add_metadata(c, 'sentence', 'new_meta', meta) 37 | 38 | self.assertEqual(new_c.view_metadata('sentence')['new_meta'].tolist(), meta) 39 | 40 | 41 | def test_apply_stoplist(self): 42 | 43 | from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist 44 | 45 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) 46 | new_c = apply_stoplist(c, nltk_stop=False, add_stop=['0','1'], 47 | freq=0, in_place=False) 48 | 49 | li = [[],['he','said'],['he','said','bar'],['bar','ate'],['I','foo']] 50 | wc = corpus_fromlist(li, context_type='sentence') 51 | new_wc = apply_stoplist(wc, nltk_stop=True, freq=1, in_place=False) 52 | 53 | self.assertTrue('0' in c.words) 54 | self.assertTrue('1' in c.words) 55 | self.assertFalse('0' in new_c.words) 56 | self.assertFalse('1' in new_c.words) 57 | 58 | self.assertTrue('said' in new_wc.words) 59 | self.assertTrue('bar' in new_wc.words) 60 | self.assertFalse('he' in new_wc.words) 61 | self.assertFalse('foo' in new_wc.words) 62 | self.assertFalse('ate' in new_wc.words) 63 | 64 | 65 | def test_filter_by_suffix(self): 66 | 67 | li = ['a.txt', 'b.json', 'c.txt'] 68 | filtered = filter_by_suffix(li, ['.txt']) 69 | filtered1 = filter_by_suffix(li, ['.json']) 70 | filtered2 = filter_by_suffix(li, ['.csv']) 71 | 72 | self.assertEqual(filtered, ['b.json']) 73 | self.assertEqual(filtered1, ['a.txt','c.txt']) 74 | self.assertEqual(filtered2, li) 75 
| 76 | 77 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCorpusUtil) 78 | unittest.TextTestRunner(verbosity=2).run(suite) 79 | -------------------------------------------------------------------------------- /vsm/extensions/testdata/history_greek_philosophy/__init__.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | 3 | 4 | __all__ = [ 'doc_files', 'doc_meta_file', 5 | 'documents', 'document_metadata', 6 | 'corpus', 'paragraphs', 'doc_label_fn' ] 7 | 8 | 9 | 10 | _doc_files = [ 'frontmatter.json', 'chapter1.json', 'chapter2.json', 11 | 'chapter3.json', 'chapter4.json', 'chapter5.json', 12 | 'chapter6.json', 'chapter7.json', 'chapter8.json', 13 | 'chapter9.json', 'chapter10.json', 'chapter11.json', 14 | 'chapter12.json', 'chapter13.json', 'chapter14.json', 15 | 'chapter15.json', 'chapter16.json', 'chapter17.json', 16 | 'chapter18.json', 'chapter19.json', 'chapter20.json', 17 | 'chapter21.json', 'chapter22.json', 'backmatter.json' ] 18 | 19 | doc_files = [os.path.join(os.path.dirname(__file__), f) 20 | for f in _doc_files] 21 | 22 | 23 | doc_meta_file = os.path.join(os.path.dirname(__file__), 'doc_meta.json') 24 | 25 | 26 | def document_metadata(): 27 | """Returns an iterator over document metadata in corpus. 28 | 29 | """ 30 | with open(doc_meta_file, 'r') as f: 31 | doc_meta_all = json.load(f) 32 | for docs_meta in doc_meta_all: 33 | for doc_meta in docs_meta: 34 | yield doc_meta 35 | 36 | 37 | def documents(): 38 | """Returns an iterator over documents paired with their metadata. 39 | 40 | """ 41 | m = document_metadata() 42 | 43 | for doc_file in doc_files: 44 | with open(doc_file, 'r') as f: 45 | docs = json.load(f) 46 | for doc in docs: 47 | yield doc, m.next() 48 | 49 | 50 | def paragraphs(): 51 | """Returns iterator over paragraphs and associated metadata. 52 | 53 | """ 54 | import copy 55 | import vsm.ext.corpusbuilders.util as util 56 | 57 | docs = documents() 58 | for doc, meta in docs: 59 | p = 0 60 | pars = util.paragraph_tokenize(doc) 61 | for par in pars: 62 | par_meta = copy.deepcopy(meta) 63 | par_meta['paragraph'] = p 64 | p += 1 65 | yield par, par_meta 66 | 67 | 68 | def corpus(doc_type='document', unidecode=True, nltk_stop=True, 69 | stop_freq=0, add_stop=None): 70 | """Returns Corpus object containing text data and metadata. 71 | 72 | """ 73 | from vsm.ext.corpusbuilders import corpus_from_strings 74 | 75 | if doc_type=='document': 76 | docs = documents() 77 | elif doc_type=='paragraphs': 78 | docs = paragraphs() 79 | else: 80 | raise Exception('Unrecognized document type given.') 81 | 82 | docs, meta = zip(*list(docs)) 83 | 84 | return corpus_from_strings(docs, meta, 85 | unidecode=unidecode, 86 | nltk_stop=nltk_stop, 87 | stop_freq=stop_freq, 88 | add_stop=add_stop) 89 | 90 | 91 | def doc_label_fn(metadata): 92 | label = metadata['part_of_book'] 93 | return label 94 | -------------------------------------------------------------------------------- /vsm/model/ldaexact.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | from itertools import product 5 | from ldacgsseq import LdaCgsSeq 6 | 7 | 8 | __all__ = [ 'LdaExact' ] 9 | 10 | 11 | 12 | def uniquify(l): 13 | """Takes a list `l` and returns a list of the unique elements in `l` 14 | in the order in which they appeared. 
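    For example:

    >>> uniquify([3, 1, 3, 2, 1])
    [3, 1, 2]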
15 | 16 | """ 17 | mem = set([]) 18 | out = [] 19 | for e in l: 20 | if e not in mem: 21 | mem.add(e) 22 | out.append(e) 23 | return out 24 | 25 | 26 | def productoid(A, n): 27 | 28 | prod = product(A, repeat=n) 29 | d = dict((i, A[:i]) for i in xrange(1, len(A)+1)) 30 | 31 | for t in prod: 32 | elems = uniquify(t) 33 | if elems == d[len(elems)]: 34 | yield t 35 | 36 | 37 | class LdaExact(LdaCgsSeq): 38 | 39 | 40 | @property 41 | def arg_maxima(self): 42 | if hasattr(self, '_arg_maxima'): 43 | return self._arg_maxima 44 | return [] 45 | 46 | 47 | @arg_maxima.setter 48 | def arg_maxima(self, l): 49 | self._arg_maxima = l 50 | 51 | 52 | def _Z_values(self): 53 | 54 | A = range(self.K) 55 | p = productoid(A, len(self.corpus)) 56 | for t in p: 57 | yield np.array(t, dtype=int) 58 | 59 | 60 | def _init_model(self, Z): 61 | m = LdaCgsSeq(context_type=self.context_type, 62 | K=self.K, V=self.V, alpha=self.alpha, beta=self.beta) 63 | m.corpus = self.corpus 64 | m.V = self.V 65 | m.indices = self.indices 66 | m.Z = Z 67 | m._compute_top_doc() 68 | m._compute_word_top() 69 | m.inv_top_sums = 1. / self.word_top.sum(0) 70 | m.iteration = 1 71 | m.log_probs = [(1, m._compute_log_prob())] 72 | return m 73 | 74 | 75 | def _log_probs(self): 76 | 77 | Z = self._Z_values() 78 | 79 | for next_Z in Z: 80 | m = self._init_model(next_Z) 81 | yield (next_Z, m.log_probs[0][1]) 82 | 83 | 84 | def arg_max(self, verbose=1): 85 | 86 | max_log_prob = -np.inf 87 | maxima = [] 88 | 89 | log_probs = self._log_probs() 90 | for (Z, log_prob) in log_probs: 91 | if log_prob == max_log_prob: 92 | maxima.append((Z, log_prob)) 93 | elif log_prob > max_log_prob: 94 | max_log_prob = log_prob 95 | maxima = [(Z, log_prob)] 96 | 97 | self.arg_maxima = maxima 98 | self.Z = maxima[0][0] 99 | self._compute_top_doc() 100 | self._compute_word_top() 101 | self.inv_top_sums = 1. / self.word_top.sum(0) 102 | self.iteration = 1 103 | self.log_probs = [(1, max_log_prob)] 104 | 105 | if verbose > 0: 106 | print('Number of maxima:', len(self.arg_maxima)) 107 | 108 | 109 | def all_estimates(self): 110 | 111 | for (Z, log_prob) in self.arg_maxima: 112 | yield self._init_model(Z) 113 | 114 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/org/knowceans/gibbstest/FileReadWrite.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | 5 | def lda_save(ctx_type, phifile, thetafile, zfile, restfile, modelfile): 6 | # Reads data from too many files. 
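    # Assembles the sampler output scattered across these files into the
    # .npz layout used by vsm's LdaCgs models (iteration, log_probs, Z,
    # top_doc, word_top, inv_top_sums, context_type, K, m_words,
    # doc_prior, top_prior) and writes it out with np.savez.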
7 | dic = file_to_dict(restfile) 8 | top_doc = file_to_mat(thetafile) 9 | word_top = file_to_mat(phifile) 10 | 11 | arrays_out = dict() 12 | arrays_out['iteration'] = int(dic['iteration']) 13 | dt = dtype=[('i', int), ('v', float)] 14 | logs = [float(dic['log_probs'])] * int(dic['iteration']) 15 | indices = range(0, int(dic['iteration'])) 16 | arrays_out['log_probs'] = np.array(zip(indices, logs), dtype=dt) 17 | arrays_out['Z'] = list(file_to_arrli(zfile)) 18 | arrays_out['top_doc'] = top_doc.T 19 | arrays_out['word_top'] = word_top.T 20 | arrays_out['inv_top_sums'] = np.array([float(dic['inv_top_sums'])] 21 | * word_top.shape[1]) 22 | arrays_out['context_type'] = ctx_type 23 | arrays_out['K'] = int(dic['K']) 24 | arrays_out['m_words'] = int(dic['m_words']) 25 | arrays_out['doc_prior'] = np.array([float(dic['doc_prior'])] 26 | * top_doc.size)#.reshape(top_doc.shape) 27 | arrays_out['top_prior'] = np.array([float(dic['top_prior'])] 28 | * word_top.size)#.reshape(word_top.shape) 29 | 30 | print('Saving LDA model to', modelfile) 31 | np.savez(modelfile, **arrays_out) 32 | 33 | 34 | def file_to_dict(filename): 35 | """ 36 | Reads a file where each line is 'k,v' 37 | and returns a dictionary of k,v. 38 | """ 39 | dic = dict() 40 | with open(filename, 'r') as f: 41 | lines = f.readlines() 42 | for l in lines: 43 | l = l.strip('\n') 44 | li = l.split(',') 45 | dic[li[0]] = li[1] 46 | return dic 47 | 48 | 49 | def write_file(corpus, ctx_type, filename): 50 | """ 51 | Writes corpus.view_contexts(ctx_type) to a file txt. 52 | """ 53 | ctx = corpus.view_contexts(ctx_type) # [arrays,] 54 | 55 | with open(filename, 'w') as f: 56 | for arr in ctx: 57 | for i in arr: 58 | f.write(str(i)) 59 | f.write('\n') 60 | f.write('\n') 61 | 62 | 63 | def file_to_mat(filename): 64 | """ 65 | Data to an array. works for theta, phi. 66 | Removes automatically added 'missing values' at the end of the rows. 67 | """ 68 | arr = np.genfromtxt(filename, delimiter=',') 69 | 70 | return arr[:,:-1] 71 | 72 | 73 | def file_to_arrli(filename, dtype='int'): 74 | """ 75 | for Z, list of arrays where each array represents a document 76 | and the array has topic assignment for each word position in the document. 77 | Length of the array varies as it depends on the length of the 78 | corresponding document. 
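    Each line of the file is a comma-separated list of topic assignments,
    e.g. '0,3,3,1' for a four-word document.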
79 | """ 80 | 81 | with open(filename, 'r') as f: 82 | lines = f.readlines() 83 | 84 | docs = [] 85 | for l in lines: 86 | l = l.strip('\n') 87 | arr = np.fromstring(l, dtype=dtype, sep=',') 88 | docs.append(arr) 89 | 90 | return docs 91 | 92 | 93 | -------------------------------------------------------------------------------- /unit_tests/tests_ldacgsviewer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.viewer.ldacgsviewer import * 5 | from vsm.viewer.labeleddata import * 6 | 7 | 8 | class TestLdaCgsViewer(unittest.TestCase): 9 | 10 | def setUp(self): 11 | 12 | from vsm.corpus.util.corpusbuilders import random_corpus 13 | from vsm.model.ldacgsseq import LdaCgsSeq 14 | 15 | c = random_corpus(1000, 50, 0, 20, context_type='document', 16 | metadata=True) 17 | 18 | m = LdaCgsSeq(c, 'document', K=10) 19 | m.train(n_iterations=50, verbose=0) 20 | 21 | self.ldav = LdaCgsViewer(c, m) 22 | 23 | 24 | def test_LdaCgsViewer(self): 25 | 26 | li = [0,1] 27 | 28 | t = self.ldav.topics(compact_view=False) 29 | te = self.ldav.topic_entropies() 30 | swt = self.ldav.dist_word_top('0', compact_view=False) 31 | 32 | dt = self.ldav.doc_topics(0) 33 | dt_ = self.ldav.doc_topics(0) 34 | wt = self.ldav.word_topics('0') 35 | stt = self.ldav.dist_top_top(1) 36 | sttl = self.ldav.dist_top_top(li) 37 | std = self.ldav.dist_top_doc(0) 38 | stdl = self.ldav.dist_top_doc(li) 39 | sdd = self.ldav.dist_doc_doc(0) 40 | sddl = self.ldav.dist_doc_doc(li) 41 | 42 | t_c = self.ldav.topics() 43 | te_c = self.ldav.topic_entropies() 44 | swt_c = self.ldav.dist_word_top('1') 45 | 46 | dismatd = self.ldav.dismat_doc() 47 | dismatt = self.ldav.dismat_top() 48 | 49 | self.assertEqual(type(t), DataTable) 50 | self.assertEqual(type(te), LabeledColumn) 51 | self.assertEqual(type(swt), DataTable) 52 | 53 | self.assertEqual(type(dt), LabeledColumn) 54 | self.assertEqual(type(dt_), LabeledColumn) 55 | self.assertEqual(type(wt), LabeledColumn) 56 | self.assertEqual(type(stt), DataTable) 57 | self.assertEqual(type(sttl), DataTable) 58 | self.assertEqual(type(std), LabeledColumn) 59 | self.assertEqual(type(stdl), LabeledColumn) 60 | self.assertEqual(type(sdd), LabeledColumn) 61 | self.assertEqual(type(sddl), LabeledColumn) 62 | 63 | self.assertEqual(type(t_c), DataTable) 64 | self.assertEqual(type(te_c), LabeledColumn) 65 | self.assertEqual(type(swt_c), DataTable) 66 | 67 | self.assertEqual(type(dismatd), IndexedSymmArray) 68 | self.assertEqual(type(dismatt), IndexedSymmArray) 69 | 70 | def test_LdaCgsViewer_topics_args(self): 71 | # test calls of ldav.topics() 72 | t = self.ldav.topics() 73 | self.assertEqual(type(t), DataTable) 74 | self.assertEqual(len(t), self.ldav.model.K) 75 | 76 | with self.assertRaises(ValueError): 77 | self.ldav.topics(2) 78 | 79 | t=self.ldav.topics([2]) 80 | self.assertEqual(type(t), DataTable) 81 | self.assertEqual(len(t), 1) 82 | 83 | t = self.ldav.topics([2,4]) 84 | self.assertEqual(type(t), DataTable) 85 | self.assertEqual(len(t), 2) 86 | 87 | 88 | 89 | #Define and run test suite 90 | suite = unittest.TestLoader().loadTestsFromTestCase(TestLdaCgsViewer) 91 | unittest.TextTestRunner(verbosity=2).run(suite) 92 | -------------------------------------------------------------------------------- /unit_tests/tests_structarr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm import * 5 | from vsm.structarr import * 6 | 7 | class 
TestCore(unittest.TestCase): 8 | 9 | def test_arr_add_field(self): 10 | 11 | arr = np.array([(1, '1'), (2, '2'), (3, '3')], 12 | dtype=[('i', int), ('c', '|S1')]) 13 | new_arr = np.array([(1, '1', 0), (2, '2', 0), (3, '3', 0)], 14 | dtype=[('i', int), ('c', '|S1'), ('new', int)]) 15 | 16 | new_field = 'new' 17 | vals = np.zeros(3, dtype=int) 18 | 19 | test_arr = arr_add_field(arr, new_field, vals) 20 | 21 | self.assertTrue(np.array_equiv(new_arr, test_arr)) 22 | self.assertTrue(new_arr.dtype==test_arr.dtype) 23 | 24 | def test_enum_matrix(self): 25 | 26 | arr = np.array([[6,3,7], [2,0,4]], dtype=int) 27 | em1 = enum_matrix(arr) 28 | em2 = enum_matrix(arr, 29 | indices=[10,20,30], 30 | field_name='tens') 31 | 32 | self.assertTrue(np.array_equiv(em1, np.array([[(2,7), (0,6), (1, 3)],[(2,4), (0,2), (1,0)]], 33 | dtype=[('i', int), ('value', int)]))) 34 | self.assertTrue(np.array_equiv(em2, np.array([[(30,7), (10,6), (20, 3)],[(30,4), (10,2), (20,0)]], 35 | dtype=[('tens', int), ('value', int)]))) 36 | 37 | 38 | 39 | def test_enum_sort(self): 40 | 41 | arr = np.array([7,3,1,8,2]) 42 | sorted_arr = enum_sort(arr) 43 | sorted_arr1 = enum_sort(arr, indices=[10,20,30,40,50]) 44 | 45 | self.assertTrue(np.array_equiv(sorted_arr, 46 | np.array([(3, 8), (0, 7), (1, 3), (4, 2), (2, 1)], 47 | dtype=[('i', int), ('value', int)]))) 48 | 49 | self.assertTrue(np.array_equiv(sorted_arr1, 50 | np.array([(40, 8), (10, 7), (20, 3), (50, 2), (30, 1)], 51 | dtype=[('i', int), ('value', int)]))) 52 | 53 | 54 | def test_enum_array(self): 55 | 56 | arr1 = np.array([7,3,1,8,2]) 57 | ea1 = enum_array(arr1) 58 | arr2 = np.array([6,3,7,2,0,4]) 59 | ea2 = enum_array(arr2) 60 | 61 | self.assertTrue(np.array_equiv(ea1, 62 | np.array([(0,7), (1,3), (2,1), (3,8), (4,2)], 63 | dtype=[('i', int), ('value', int)]))) 64 | self.assertTrue(np.array_equiv(ea2, 65 | np.array([(0,6), (1,3), (2,7), (3,2), (4,0), (5,4)], 66 | dtype=[('i', int), ('value', int)]))) 67 | 68 | 69 | def test_zip_arr(self): 70 | 71 | arr1 = np.array([[2,4], [6,8]], dtype=int) 72 | arr2 = np.array([[1,3], [5,7]], dtype=int) 73 | 74 | zipped = zip_arr(arr1, arr2, field_names=['even', 'odd']) 75 | self.assertTrue(np.array_equiv(zipped, np.array([[(2,1), (4,3)], [(6,5), (8,7)]], 76 | dtype=[('even', int), ('odd', int)]))) 77 | 78 | 79 | def test_map_strarr(self): 80 | 81 | arr = np.array([(0, 1.), (1, 2.)], 82 | dtype=[('i', 'i4'), ('v', 'f4')]) 83 | m = ['foo', 'bar'] 84 | arr = map_strarr(arr, m, 'i', new_k='str') 85 | 86 | self.assertTrue(np.array_equal(arr['str'], 87 | np.array(m, dtype=np.array(m).dtype))) 88 | self.assertTrue(np.array_equal(arr['v'], np.array([1., 2.], dtype='f4'))) 89 | 90 | 91 | suite = unittest.TestLoader().loadTestsFromTestCase(TestCore) 92 | unittest.TextTestRunner(verbosity=2).run(suite) 93 | -------------------------------------------------------------------------------- /vsm/extensions/lda_py4j/org/knowceans/gibbstest/FileArrayProvider.java: -------------------------------------------------------------------------------- 1 | package org.knowceans.gibbstest; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | 11 | public class FileArrayProvider { 12 | 13 | public static int[][] readFile(String filename) throws IOException { 14 | /* 15 | Reads file that contains corpus.view_contexts(ctx_type), 16 | list of arrays. This returns the int[][] for LdaGibbsSampler 17 | documents. 
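        Expected format: one integer word id per line, with a blank line
        marking the end of each document (the format written by
        FileReadWrite.write_file).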
18 | */ 19 | FileReader fileReader = new FileReader(filename); 20 | BufferedReader bufferedReader = new BufferedReader(fileReader); 21 | 22 | // List lines = new ArrayList(); 23 | List> lines = new ArrayList>(); 24 | String line = null; 25 | 26 | List a = new ArrayList(); 27 | while ((line = bufferedReader.readLine()) != null) { 28 | if (line.length() > 0) { // String 29 | int item = Integer.parseInt(line); 30 | a.add(item); 31 | } else { 32 | lines.add(a); 33 | a = new ArrayList(); 34 | } 35 | } 36 | bufferedReader.close(); 37 | 38 | int[][] arr = new int[lines.size()][]; 39 | for (int i=0; i subli = lines.get(i); 42 | int[] blankarr = new int[subli.size()]; 43 | for (int j=0; j Z 53 | """ 54 | r = len(samples) 55 | mat = np.zeros((r, r)) 56 | 57 | for n_ in xrange(n): 58 | # select integer d from distr[0] to distr[1] 59 | d = randrange(distr[0],distr[1]+1) 60 | km = KMeans(n_clusters=d, init='k-means++', 61 | max_iter=100, n_init=1,verbose=False) 62 | km.fit(samples) 63 | labels = km.labels_ 64 | 65 | for i in range(r): 66 | for j in range(i,r): 67 | if labels[i] == labels[j]: 68 | mat[i][j] += 1 69 | 70 | mat = mat + mat.T 71 | mat /= n 72 | 73 | cutplot = np.zeros((n+1 ,2), dtype='f2<') 74 | for l in xrange(n+1): 75 | # Construct graph for which mat[i][j] > l/n 76 | graph = mat > 1.0 * l /n 77 | 78 | n_comp, labels = cs.cs_graph_components(graph) 79 | cutplot[l][0] = l * 1.0 /n 80 | cutplot[l][1] = n_comp 81 | return mat, cutplot 82 | 83 | 84 | def find_cutoff(cutplot, n_cls=None): 85 | """ 86 | Finds the weight cutoff based on the longest run in cutplot. 87 | If n_cls is provided, finds the cutoff point where n_cls 88 | clusters are formed.""" 89 | from itertools import groupby 90 | 91 | if n_cls != None: 92 | for c in cutplot: 93 | if c[1] == n_cls: 94 | return c[0] 95 | 96 | group = groupby(cutplot[:,1]) 97 | val = max(group, key=lambda k: len(list(k[1])))[0] 98 | 99 | for c in cutplot: 100 | if c[1] == val: 101 | return c[0] + 0.01 102 | 103 | 104 | def category_mat(samples, mat, cutplot, cutoff=None): 105 | """ 106 | Predicts the category for each data point 107 | """ 108 | if cutoff == None: 109 | cutoff = find_cutoff(cutplot) 110 | # Build a new graph on samples with edges mat[i][j] > cutoff 111 | newG = mat > cutoff 112 | n_comp, labels = cs.cs_graph_components(newG) 113 | 114 | return labels 115 | 116 | -------------------------------------------------------------------------------- /vsm/model/tfidf.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from builtins import range 3 | 4 | import numpy as np 5 | from scipy.sparse import csr_matrix 6 | 7 | from vsm.model.base import BaseModel 8 | 9 | 10 | __all__ = [ 'TfIdf' ] 11 | 12 | 13 | class TfIdf(BaseModel): 14 | """ 15 | Transforms a term-frequency model into a term-frequency 16 | inverse-document-frequency model. 17 | 18 | A TF-IDF model is term frequency model whose rows, corresponding 19 | to word types, are scaled by IDF values. The idea is that a word 20 | type which occurs in most of the contexts (i.e., documents) does 21 | less to distinguish the contexts semantically than does a word 22 | type which occurs in few of the contexts. The document frequency 23 | is the number of documents in which a word occurs divided by the 24 | number of documents. The IDF is the log of the inverse of the 25 | document frequency. 26 | 27 | As with a term-frequency model, word types correspond to matrix 28 | rows and contexts correspond to matrix columns. 
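    As a small worked example of the weighting implemented in `train`
    below: with 100 documents, a word type occurring in 10 of them has
    its row scaled by log(100/10) ~= 2.3, whereas a word type occurring
    in all 100 documents is scaled by log(1) = 0.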
29 | 30 | The data structure is a sparse float matrix. 31 | 32 | :See Also: :class:`vsm.model.TfSeq`, :class:`vsm.model.base`, 33 | :class:`scipy.sparse.coo_matrix` 34 | 35 | :notes: 36 | A zero in the matrix might arise in two ways: (1) the word type 37 | occurs in every document, in which case the IDF value is 0; (2) 38 | the word type occurs in no document at all, in which case the IDF 39 | value is undefined. 40 | """ 41 | def __init__(self, corpus=None, context_type=None, tf_matrix=None): 42 | """ 43 | Initialize TfIdf. 44 | 45 | :param corpus: A Corpus object containing the training data. 46 | :type corpus: Corpus 47 | 48 | :param context_type: A string specifying the type of context over 49 | which the model trainer is applied. 50 | :type context_type: string 51 | 52 | :param tf_matrix: A matrix containing the term-frequency data. 53 | :type tf_matrix: scipy.sparse matrix 54 | """ 55 | 56 | self.context_type = context_type 57 | if corpus is not None: 58 | self.corpus = corpus.corpus 59 | else: 60 | self.corpus = [] 61 | 62 | if tf_matrix is None: 63 | self.matrix = csr_matrix([], dtype=np.float64) 64 | else: 65 | self.matrix = tf_matrix.copy() 66 | self.matrix = self.matrix.tocsr() 67 | self.matrix = self.matrix.astype(np.float64) 68 | 69 | self.undefined_rows = [] 70 | 71 | 72 | def train(self): 73 | """ 74 | Computes the IDF values for the input term-frequency matrix, 75 | scales the rows by these values and stores the results in 76 | `self.matrix`. 77 | """ 78 | if self.matrix.size > 0: 79 | n_docs = np.float64(self.matrix.shape[1]) 80 | 81 | for i in range(self.matrix.indptr.shape[0] - 1): 82 | 83 | start = self.matrix.indptr[i] 84 | stop = self.matrix.indptr[i + 1] 85 | 86 | if start == stop: 87 | self.undefined_rows.append(i) 88 | else: 89 | row = self.matrix.data[start:stop] 90 | row *= np.log(n_docs / np.count_nonzero(row)) 91 | start = stop 92 | 93 | @staticmethod 94 | def from_tf(tf_model): 95 | """ 96 | Takes a `Tf` model object and generates a `TfIdf` model. 97 | """ 98 | model = TfIdf(tf_matrix=tf_model.matrix) 99 | model.corpus = tf_model.corpus 100 | model.context_type = tf_model.context_type 101 | return model 102 | -------------------------------------------------------------------------------- /vsm/extensions/interop/ldac.py: -------------------------------------------------------------------------------- 1 | """ 2 | `vsm.extensions.interop.ldac` 3 | 4 | Module containing functions for import/export between VSM and lda-c, which is 5 | the original LDA implementation referenced in Blei, Ng, and Jordan (2003). 6 | lda-c is available at: `` 7 | """ 8 | import os 9 | import os.path 10 | 11 | from scipy.stats import itemfreq 12 | import numpy as np 13 | 14 | from vsm.extensions.corpusbuilders import corpus_fromlist 15 | 16 | 17 | def export_corpus(corpus, outfolder, context_type='document'): 18 | """ 19 | Converts a vsm.corpus.Corpus object into a lda-c compatible data file. 20 | Creates two files: 21 | 1. "vocab.txt" - contains the integer-word mappings 22 | 2. "corpus.dat" - contains the corpus object in the format described in 23 | the `lda-c documentation`_: 24 | 25 | Under LDA, the words of each document are assumed exchangeable. 26 | Thus, each document is succinctly represented as a sparse vector 27 | of word counts. The data is a file where each line is of the form: 28 | 29 | [M] [term_1]:[count] [term_2]:[count] ... 
[term_N]:[count] 30 | 31 | where [M] is the number of unique terms in the document, and the 32 | [count] associated with each term is how many times that term 33 | appeared in the document. Note that [term_1] is an integer 34 | which indexes the term; it is not a string. 35 | 36 | :param corpus: VSM Corpus object to convert to lda-c file 37 | :type corpus: vsm.corpus.Corpus 38 | 39 | :param outfolder: Directory to output "vocab.txt" and "corpus.dat" 40 | :type string: path 41 | 42 | .. _lda-c documentation: http://www.cs.princeton.edu/~blei/lda-c/readme.txt 43 | """ 44 | if not os.path.exists(outfolder): 45 | os.makedirs(outfolder) 46 | 47 | vocabfilename = os.path.join(outfolder, 'vocab.txt') 48 | with open(vocabfilename, 'w') as vocabfile: 49 | for word in corpus.words: 50 | vocabfile.write(word + '\n') 51 | 52 | corpusfilename = os.path.join(outfolder, 'corpus.dat') 53 | with open(corpusfilename, 'w') as corpusfile: 54 | for ctx in corpus.view_contexts(context_type): 55 | M = len(np.unique(ctx)) 56 | corpusfile.write("{0}".format(M)) 57 | 58 | for token in itemfreq(ctx): 59 | corpusfile.write(" {term}:{count}".format( 60 | term=token[0], count=token[1])) 61 | 62 | corpusfile.write("\n") 63 | 64 | 65 | def import_corpus(corpusfilename, vocabfilename, context_type='document'): 66 | """ 67 | Converts an lda-c compatible data file into a VSM Corpus object. 68 | 69 | :param corpusfilename: path to corpus file, as defined in lda-c 70 | documentation. 71 | :type string: 72 | 73 | :param vocabfilename: path to vocabulary file, one word per line 74 | :type string: 75 | """ 76 | # process vocabulary file 77 | with open(vocabfilename) as vocabfile: 78 | vocab = [line.strip() for line in vocabfile] 79 | 80 | # process corpus file 81 | corpus = [] 82 | with open(corpusfilename) as corpusfile: 83 | for line in corpusfile: 84 | tokens = line.split()[1:] 85 | ctx = [] 86 | for token in tokens: 87 | id, count = token.split(':') 88 | id = int(id) 89 | count = int(count) 90 | ctx.extend([vocab[id]] * count) 91 | corpus.append(ctx) 92 | 93 | return corpus_fromlist(corpus, context_type=context_type) 94 | 95 | 96 | def import_model(filename): 97 | pass 98 | 99 | 100 | def export_model(filename): 101 | pass 102 | -------------------------------------------------------------------------------- /vsm/model/lda.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides a convenient alias for the LdaCgs* classes 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import print_function 6 | from builtins import str 7 | from builtins import object 8 | import platform # For Windows workaround 9 | import warnings 10 | 11 | 12 | __all__ = [ 'LDA' ] 13 | 14 | class LDA(object): 15 | """ 16 | Depending on the boolean parameter `multiprocessing`, returns and 17 | initializes an instance of either LdaCgsSeq or LdaCgsMulti. 18 | 19 | Note that on Windows platforms, `multiprocessing` is not implemented. 20 | In contrast to LdaCgsMulti, LDA always returns a valid object. Instead 21 | of raising a NotImplementedError, LDA issues a RuntimeWarning, notifying 22 | the user the sequental algorithm is being used. When `seed_or_seeds` is a 23 | list in this instance, only the first seed is used. 
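    A minimal usage sketch (hedged: `random_corpus` and the training call
    mirror this repository's unit tests, and the exact `train` keywords of
    LdaCgsMulti may differ)::

        from vsm.extensions.corpusbuilders import random_corpus

        c = random_corpus(1000, 50, 0, 20, context_type='document')
        m = LDA(c, 'document', K=10, multiprocessing=True)  # LdaCgsMulti off Windows
        m.train(n_iterations=50)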
24 | """ 25 | def __new__(cls, 26 | corpus=None, context_type=None, 27 | K=20, V=0, alpha=[], beta=[], 28 | multiprocessing=False, seed_or_seeds=None, n_proc=None): 29 | 30 | kwargs = dict(corpus=corpus, context_type=context_type, 31 | K=K, V=V, alpha=alpha, beta=beta) 32 | 33 | if multiprocessing and platform.system() != 'Windows': 34 | if n_proc is not None: 35 | kwargs['n_proc'] = n_proc 36 | if seed_or_seeds is not None and not isinstance(seed_or_seeds, int): 37 | kwargs['seeds'] = seed_or_seeds 38 | 39 | 40 | from .ldacgsmulti import LdaCgsMulti 41 | return LdaCgsMulti(**kwargs) 42 | 43 | else: 44 | if multiprocessing and platform.system() == 'Windows': 45 | warnings.warn("""Multiprocessing is not implemented on Windows. 46 | Defaulting to sequential algorithm.""", RuntimeWarning) 47 | 48 | # extract single seed 49 | if seed_or_seeds is not None and not isinstance(seed_or_seeds, int): 50 | seed_or_seeds = seed_or_seeds[0] 51 | warnings.warn("Windows is using only the first seed: " + 52 | str(seed_or_seeds), RuntimeWarning) 53 | 54 | # parse seed_or_seeds argument 55 | if isinstance(seed_or_seeds, int): 56 | kwargs['seed'] = seed_or_seeds 57 | elif seed_or_seeds is not None: 58 | raise ValueError("LDA(seed_or_seeds, ...) must take an" + 59 | "integer in single-threaded mode.") 60 | 61 | from .ldacgsseq import LdaCgsSeq 62 | return LdaCgsSeq(**kwargs) 63 | 64 | @staticmethod 65 | def load(filename, multiprocessing=False, n_proc=None): 66 | """ 67 | A static method for loading a saved LdaCgsMulti model. 68 | 69 | :param filename: Name of a saved model to be loaded. 70 | :type filename: string 71 | 72 | :returns: m : LdaCgsMulti object 73 | 74 | :See Also: :class:`numpy.load` 75 | """ 76 | from .ldafunctions import load_lda 77 | from .ldacgsmulti import LdaCgsMulti 78 | from .ldacgsseq import LdaCgsSeq 79 | 80 | if multiprocessing and platform.system() != 'Windows': 81 | return load_lda(filename, LdaCgsMulti) 82 | else: 83 | if multiprocessing and platform.system() == 'Windows': 84 | warnings.warn("""Multiprocessing is not implemented on Windows. 85 | Defaulting to sequential algorithm.""", RuntimeWarning) 86 | m = load_lda(filename, LdaCgsSeq) 87 | try: 88 | if m.n_proc: 89 | print("reloading with multiprocessing support") 90 | m = load_lda(filename, LdaCgsMulti) 91 | except AttributeError: 92 | pass 93 | 94 | return m 95 | -------------------------------------------------------------------------------- /vsm/model/ldacgs.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import time 5 | from ldafunctions import load_lda, save_lda, init_priors 6 | 7 | # import pyximport; pyximport.install() 8 | from _ldacgs import cgs 9 | 10 | 11 | __all__ = [ 'LdaCgs' ] 12 | 13 | 14 | 15 | class LdaCgs(object): 16 | """ 17 | """ 18 | def __init__(self, corpus=None, context_type=None, 19 | K=20, V=0, alpha=[], beta=[]): 20 | """ 21 | Initialize LdaCgs. 22 | 23 | :param corpus: Source of observed data. 24 | :type corpus: `Corpus` 25 | 26 | :param context_type: Name of tokenization stored in `corpus` whose tokens 27 | will be treated as documents. 28 | :type context_type: string, optional 29 | 30 | :param K: Number of topics. Default is `20`. 31 | :type K: int, optional 32 | 33 | :param beta: Topic priors. Default is 0.01 for all words. 34 | :type beta: list, optional 35 | 36 | :param alpha: Document priors. Default is a flat prior of 0.01 37 | for all topics. 
38 | :type alpha: list, optional 39 | """ 40 | 41 | self.context_type = context_type 42 | self.K = K 43 | 44 | if corpus: 45 | self.V = corpus.words.size 46 | self.indices = corpus.view_contexts(self.context_type, 47 | as_indices=True) 48 | self.indices = np.array(self.indices, dtype=('i')) 49 | self.corpus = np.array(corpus.corpus, dtype=('i')) 50 | else: 51 | self.V = V 52 | self.indices = np.array([], dtype=('i')) 53 | self.corpus = np.array([], dtype=('i')) 54 | 55 | priors = init_priors(self.V, self.K, beta, alpha) 56 | self.beta, self.alpha = priors 57 | 58 | self.Z = None 59 | self.word_top = None 60 | self.top_doc = None 61 | 62 | self.log_probs = None 63 | self.iteration = 0 64 | 65 | 66 | def train(self, n_iterations=100, n_threads=1, verbose=1): 67 | 68 | seed = np.uint64(0) 69 | 70 | results = cgs(self.K, 71 | self.V, 72 | self.indices, 73 | self.corpus, 74 | self.alpha.reshape(-1,), 75 | self.beta.reshape(-1,), 76 | n_iterations, 77 | n_threads, 78 | seed) 79 | 80 | self.Z = results['Z'] 81 | self.word_top = results['word_top'] 82 | self.top_doc = results['top_doc'] 83 | #TODO: Manage log_probs so that training continuations can be done. 84 | self.log_probs = results['log_probs'] 85 | 86 | 87 | @staticmethod 88 | def load(filename): 89 | return load_lda(filename, LdaCgsSeq) 90 | 91 | 92 | def save(self, filename): 93 | save_lda(self, filename) 94 | 95 | 96 | 97 | ################################################################# 98 | # Demos 99 | ################################################################# 100 | 101 | 102 | def demo_LdaCgs(doc_len=500, V=100000, n_docs=100, 103 | K=20, n_iterations=5, n_threads=1): 104 | 105 | from vsm.extensions.corpusbuilders import random_corpus 106 | 107 | print('Words per document:', doc_len) 108 | print('Words in vocabulary:', V) 109 | print('Documents in corpus:', n_docs) 110 | print('Number of topics:', K) 111 | print('Iterations:', n_iterations) 112 | 113 | c = random_corpus(n_docs*doc_len, V, doc_len, doc_len+1) 114 | 115 | print('Random corpus generated. 
Initializing model.') 116 | m = LdaCgs(c, 'document', K=K) 117 | 118 | print('Begin estimation.') 119 | m.train(n_iterations=n_iterations, n_threads=n_threads) 120 | 121 | return m 122 | -------------------------------------------------------------------------------- /vsm/model/_cgs_update.pyx: -------------------------------------------------------------------------------- 1 | # cython: binding=True 2 | # cython: wraparound=False 3 | # cython: boundscheck=False 4 | # cython: cdivision=True 5 | 6 | import cython 7 | cimport cython 8 | 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | ctypedef np.float32_t NP_FLOAT_t 13 | #TODO: figure out how to use np types in python code 14 | #ctypedef fused NP_FLOAT_t: 15 | # np.float32_t 16 | # np.float64_t 17 | 18 | ctypedef fused CORPUS_t: 19 | unsigned int 20 | unsigned short 21 | 22 | ctypedef fused TOPIC_t: 23 | unsigned short 24 | unsigned char 25 | 26 | cdef extern from "math.h": 27 | float logf(float n) 28 | 29 | @cython.wraparound(False) 30 | @cython.boundscheck(False) 31 | @cython.cdivision(True) 32 | def cgs_update(int itr, 33 | CORPUS_t [:] corpus, 34 | np.ndarray[NP_FLOAT_t, ndim=2] word_top, 35 | np.ndarray[NP_FLOAT_t] inv_top_sums, 36 | np.ndarray[NP_FLOAT_t, ndim=2] top_doc, 37 | TOPIC_t [:] Z, 38 | int [:] indices, 39 | str mtrand_str, 40 | unsigned int [:] mtrand_keys, 41 | int mtrand_pos, 42 | int mtrand_has_gauss, 43 | float mtrand_cached_gaussian): 44 | 45 | cdef int first, last 46 | cdef long stop, doc_len, offset 47 | cdef NP_FLOAT_t r, s 48 | cdef Py_ssize_t i, j, idx, w, t, k 49 | 50 | cdef int V = corpus.shape[0] 51 | cdef int N = indices.shape[0] 52 | cdef int K = word_top.shape[1] 53 | cdef int W = word_top.shape[0] 54 | 55 | cdef NP_FLOAT_t log_p = 0 56 | cdef np.ndarray[NP_FLOAT_t, ndim=2] log_wk = np.log(word_top * inv_top_sums) 57 | cdef np.ndarray[NP_FLOAT_t, ndim=2] log_kd = np.log(top_doc / top_doc.sum(0)) 58 | 59 | cdef object np_random_state = np.random.RandomState() 60 | np_random_state.set_state((mtrand_str, mtrand_keys, 61 | mtrand_pos, mtrand_has_gauss, 62 | mtrand_cached_gaussian)) 63 | cdef np.ndarray[NP_FLOAT_t] samples = np_random_state.uniform(size=V).astype(np.float32) 64 | cdef np.ndarray[NP_FLOAT_t] dist = np.zeros((K,), dtype=np.float32) 65 | 66 | cdef object mtrand_state = np_random_state.get_state() 67 | 68 | 69 | with nogil: 70 | for i in range(N): 71 | 72 | if i==0: 73 | doc_len = indices[0] 74 | offset = 0 75 | else: 76 | offset = indices[i-1] 77 | stop = indices[i] 78 | doc_len = stop - offset 79 | 80 | for j in range(doc_len): 81 | 82 | idx = offset + j 83 | w,k = corpus[idx], Z[idx] 84 | 85 | log_p += log_wk[w, k] + log_kd[k, i] 86 | 87 | if itr > 0: 88 | word_top[w, k] -= 1 89 | s = inv_top_sums[k] 90 | inv_top_sums[k] = s / (1 - s) 91 | top_doc[k, i] -= 1 92 | 93 | t = 0 94 | dist[t] = (inv_top_sums[t] * word_top[w,t] * top_doc[t,i]) 95 | for t in range(1,K): 96 | dist[t] = dist[t-1] + (inv_top_sums[t] * word_top[w,t] * top_doc[t,i]) 97 | 98 | r = samples[idx] * dist[K-1] 99 | for k in range(K): 100 | if r < dist[k]: 101 | break 102 | """ 103 | # This code implements binary search for the right insertion 104 | # point for the probability in the cumulative distribution 105 | first = 0 106 | last = K - 1 107 | while first < last: 108 | k = (first + last) / 2 109 | if r < dist[k]: 110 | last = k 111 | else: 112 | first = k + 1 113 | """ 114 | 115 | word_top[w, k] += 1 116 | s = inv_top_sums[k] 117 | inv_top_sums[k] = s / (1 + s) 118 | top_doc[k, i] += 1 119 | 120 | Z[idx] = (k) 121 | 122 
| return (np.asarray(word_top), np.asarray(inv_top_sums), 123 | np.asarray(top_doc), np.asarray(Z), log_p, 124 | mtrand_state[0], mtrand_state[1], mtrand_state[2], 125 | mtrand_state[3], mtrand_state[4]) 126 | -------------------------------------------------------------------------------- /unit_tests/tests_beagleorder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from vsm.model.beagleorder import * 5 | from vsm.model.beagleorder import (reduce_ngrams, rand_pt_unit_sphere, 6 | two_rand_perm) 7 | 8 | 9 | class TestBeagleOrder(unittest.TestCase): 10 | 11 | def setUp(self): 12 | 13 | from vsm.corpus.util.corpusbuilders import random_corpus 14 | from vsm.model.beagleenvironment import BeagleEnvironment 15 | 16 | self.c = random_corpus(1000, 50, 0, 10, context_type='sentence') 17 | 18 | self.e = BeagleEnvironment(self.c, n_cols=100) 19 | self.e.train() 20 | 21 | self.ms = BeagleOrderSeq(self.c, self.e.matrix) 22 | self.ms.train() 23 | ''' 24 | self.mm = BeagleOrderMulti(self.c, self.e.matrix) 25 | self.mm.train(2) 26 | ''' 27 | 28 | 29 | def test_BeagleOrderSeq(self): 30 | from tempfile import NamedTemporaryFile 31 | import os 32 | 33 | try: 34 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 35 | self.ms.save(tmp.name) 36 | tmp.close() 37 | m1 = BeagleOrderSeq.load(tmp.name) 38 | self.assertTrue((self.ms.matrix == m1.matrix).all()) 39 | 40 | finally: 41 | os.remove(tmp.name) 42 | 43 | 44 | ''' 45 | def test_BeagleOrderMulti(self): 46 | 47 | from tempfile import NamedTemporaryFile 48 | import os 49 | 50 | try: 51 | tmp = NamedTemporaryFile(delete=False, suffix='.npz') 52 | self.mm.save(tmp.name) 53 | tmp.close() 54 | m1 = BeagleOrderMulti.load(tmp.name) 55 | self.assertTrue((self.mm.matrix == m1.matrix).all()) 56 | 57 | finally: 58 | os.remove(tmp.name) 59 | ''' 60 | 61 | #TODO: Construct a reference result for both models 62 | # def test_compare(self): 63 | 64 | # psi = rand_pt_unit_sphere(self.e.shape[1]) 65 | 66 | # rand_perm = two_rand_perm(self.e.shape[1]) 67 | 68 | # print 'Training single processor model' 69 | # ms = BeagleOrderSeq(self.c, self.e.matrix, psi=psi, rand_perm=rand_perm) 70 | # ms.train() 71 | 72 | # print 'Training multiprocessor model' 73 | # mm = BeagleOrderMulti(self.c, self.e.matrix, psi=psi, rand_perm=rand_perm) 74 | # mm.train() 75 | 76 | # self.assertTrue(np.allclose(ms.matrix, mm.matrix), (ms.matrix, mm.matrix 77 | # )) 78 | 79 | 80 | #TODO: Make into actual unit tests 81 | # def test10(self): 82 | 83 | # import pprint 84 | 85 | # def fn(x,y): 86 | # if isinstance(x, tuple): 87 | # return x + (y,) 88 | # return (x, y) 89 | 90 | # a = np.arange(5) 91 | # print 'array length', a.shape[0] 92 | 93 | # for i in xrange(a.shape[0]): 94 | # n = 3 95 | # print 'ngram length', n 96 | # print 'index', i 97 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 98 | 99 | # for i in xrange(a.shape[0]): 100 | # n = 4 101 | # print 'ngram length', n 102 | # print 'index', i 103 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 104 | 105 | # for i in xrange(a.shape[0]): 106 | # n = 5 107 | # print 'ngram length', n 108 | # print 'index', i 109 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 110 | 111 | 112 | # def test11(self): 113 | 114 | # import pprint 115 | 116 | # def fn(x,y): 117 | # return x + y 118 | 119 | # a = np.arange(5) 120 | # print 'array length', a.shape[0] 121 | 122 | # for i in xrange(a.shape[0]): 123 | # n = 3 124 | # print 'ngram length', n 125 | # print 'index', i 126 | # 
pprint.pprint(reduce_ngrams(fn, a, n, i)) 127 | 128 | # for i in xrange(a.shape[0]): 129 | # n = 4 130 | # print 'ngram length', n 131 | # print 'index', i 132 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 133 | 134 | # for i in xrange(a.shape[0]): 135 | # n = 5 136 | # print 'ngram length', n 137 | # print 'index', i 138 | # pprint.pprint(reduce_ngrams(fn, a, n, i)) 139 | 140 | 141 | 142 | #Define and run test suite 143 | suite = unittest.TestLoader().loadTestsFromTestCase(TestBeagleOrder) 144 | unittest.TextTestRunner(verbosity=2).run(suite) 145 | -------------------------------------------------------------------------------- /vsm/extensions/corpusbuilders/corpusstreamers.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | import sys 3 | if sys.version_info[0] == 2: 4 | import backports.tempfile 5 | 6 | from codecs import open 7 | from concurrent.futures import as_completed, ProcessPoolExecutor 8 | import pickle 9 | import tempfile 10 | import os 11 | 12 | from progressbar import ProgressBar, Bar, Percentage 13 | from unidecode import unidecode 14 | 15 | from vsm.extensions.corpusbuilders import corpus_fromlist 16 | from vsm.extensions.corpusbuilders.util import (apply_stoplist, 17 | detect_encoding, word_tokenize) 18 | 19 | IGNORE = ['.json','.log','.pickle', '.DS_Store', '.err', '.npz'] 20 | 21 | def read_file(filename, encoding='utf8', decode=False): 22 | if encoding == 'detect': 23 | encoding = detect_encoding(filename) 24 | 25 | try: 26 | if decode: 27 | with open(filename, mode='r', encoding=encoding) as f: 28 | data = unidecode(f.read()) 29 | else: 30 | with open(filename, mode='r', encoding=encoding) as f: 31 | data = f.read() 32 | except UnicodeDecodeError: 33 | encoding = detect_encoding(filename) 34 | if decode: 35 | with open(filename, mode='r', encoding=encoding) as f: 36 | data = unidecode(f.read()) 37 | else: 38 | with open(filename, mode='r', encoding=encoding) as f: 39 | data = f.read() 40 | 41 | return data 42 | 43 | def tokenize_and_pickle_file(filename, pickle_dir=None, 44 | tokenizer=word_tokenize, encoding='utf8', decode=False): 45 | """ 46 | Tokenizes a file and returns a filename of a PickledWords instance. 
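    Sketch of the intended round trip (the path and pickle directory are
    illustrative only; `corpus_from_files` below is the real caller)::

        fname = tokenize_and_pickle_file('doc1.txt', pickle_dir='/tmp/vsm-x')
        words = PickledWords(fname)  # lazily re-reads the pickled token list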
47 | """ 48 | data = read_file(filename, encoding=encoding, decode=decode) 49 | 50 | corpus = tokenizer(data) 51 | 52 | # dump to picklefile 53 | with tempfile.NamedTemporaryFile(dir=pickle_dir, delete=False) as fp: 54 | pickle.dump(corpus, fp) 55 | filename = fp.name 56 | del corpus 57 | 58 | return filename 59 | 60 | 61 | def corpus_from_files(dir_or_filenames, encoding='utf8', ignore=IGNORE, 62 | nltk_stop=False, stop_freq=0, add_stop=None, decode=False, 63 | verbose=True, simple=False, tokenizer=word_tokenize): 64 | if os.path.isdir(dir_or_filenames): 65 | # go through files in directory, filter hidden files 66 | filenames = [os.path.join(root, path) 67 | for root, dirs, files in os.walk(dir_or_filenames) 68 | for path in files 69 | if not path.startswith('.') 70 | and not any(path.endswith(i) for i in ignore)] 71 | labels = [filename.replace(dir_or_filenames + '/', '') for filename in filenames] 72 | else: 73 | filenames = dir_or_filenames 74 | labels = filenames[:] 75 | 76 | if verbose: 77 | pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(filenames)) 78 | pbar = pbar.start() 79 | n = 0 80 | 81 | if sys.version_info[0] == 2: 82 | TD = backports.tempfile.TemporaryDirectory 83 | else: 84 | TD = tempfile.TemporaryDirectory 85 | with TD(prefix='vsm-') as pickle_dir: 86 | with ProcessPoolExecutor() as executor: 87 | corpus = [executor.submit(tokenize_and_pickle_file, filename, pickle_dir, tokenizer) 88 | for filename in filenames] 89 | if verbose: 90 | for f in as_completed(corpus): 91 | n += 1 92 | pbar.update(n) 93 | 94 | pbar.finish() 95 | corpus = [f.result() for f in corpus] 96 | 97 | corpus = [PickledWords(f) for f in corpus] 98 | corpus = corpus_fromlist(corpus, context_type='document', remove_empty=False) 99 | corpus.context_data[0]['document_label'][:] = labels 100 | 101 | corpus = apply_stoplist(corpus, nltk_stop=nltk_stop, freq=stop_freq) 102 | 103 | return corpus 104 | 105 | class PickledWords: 106 | def __init__(self, filename): 107 | self.file = filename 108 | 109 | with open(self.file, 'rb') as fp: 110 | self.list = pickle.load(fp) 111 | self.len = len(self.list) 112 | del self.list 113 | 114 | def __iter__(self): 115 | with open(self.file, 'rb') as fp: 116 | self.list = pickle.load(fp) 117 | 118 | for i in range(len(self.list)): 119 | yield self.list[i] 120 | 121 | del self.list 122 | 123 | return 124 | 125 | def __len__(self): 126 | return self.len 127 | 128 | def __copy__(self): 129 | return PickledWords(self.file) 130 | -------------------------------------------------------------------------------- /vsm/model/lsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse 3 | 4 | 5 | __all__ = [ 'Lsa' ] 6 | 7 | 8 | class Lsa(object): 9 | """ 10 | """ 11 | 12 | def __init__(self, corpus=None, context_type=None, td_matrix=None): 13 | """ 14 | Initialize Lsa. 15 | 16 | :param corpus: A Corpus object containing the training data. 17 | :type corpus: Corpus, optional 18 | 19 | :param context_type: Name of tokenization whose tokens will be 20 | treated as documents. Default is `None`. 21 | :type context_type: string, optional 22 | 23 | :param td_matrix: Term-Document matrix. Default is `None`. 
24 | :type td_matrix: np.array, optional 25 | """ 26 | 27 | self.word_matrix = None 28 | self.doc_matrix = None 29 | self.eigenvalues = None 30 | self.context_type = context_type 31 | if corpus is not None: 32 | self.corpus = corpus.corpus 33 | else: 34 | self.corpus = [] 35 | 36 | if td_matrix is None: 37 | self.td_matrix = np.array([]) 38 | else: 39 | td_matrix = sparse.coo_matrix(td_matrix) 40 | 41 | # Removing infinite values for SVD 42 | finite_mask = np.isfinite(td_matrix.data) 43 | coo_in = (td_matrix.data[finite_mask], 44 | (td_matrix.row[finite_mask], 45 | td_matrix.col[finite_mask])) 46 | 47 | td_matrix = sparse.coo_matrix(coo_in, shape=td_matrix.shape, 48 | dtype=np.float64) 49 | self.td_matrix = td_matrix.tocsr() 50 | 51 | 52 | def train(self, k_factors=300): 53 | """ 54 | Trains the model. 55 | 56 | :param k_factors: Default is 300. 57 | :type k_factors: int, optional 58 | """ 59 | from scipy.sparse import linalg as linalgs 60 | 61 | u,s,v = np.array([]), np.array([]), np.array([]) 62 | 63 | if self.td_matrix.size > 0: 64 | s = min(self.td_matrix.shape) 65 | if s < k_factors: 66 | k_factors = s - 1 67 | 68 | # print 'Performing sparse SVD' 69 | u, s, v = linalgs.svds(self.td_matrix, k=k_factors) 70 | 71 | indices = s.argsort()[::-1] 72 | self.word_matrix = u[:, indices] 73 | self.eigenvalues = s[indices] 74 | self.doc_matrix = v[indices, :] 75 | 76 | 77 | def save(self, f): 78 | """ 79 | Saves model data as a numpy archive file with extension `npz`. 80 | The keys for the component matrices are `word_matrix`, 81 | `eigenvalues` and `doc_matrix`. 82 | 83 | :param f: Designates the file to which to save data. See 84 | `numpy.savez` for further details. 85 | :type f: str-like or file-like object 86 | 87 | :See Also: :meth:`numpy.savez` 88 | """ 89 | arrays_out = dict() 90 | arrays_out['word_matrix'] = self.word_matrix 91 | arrays_out['eigenvalues'] = self.eigenvalues 92 | arrays_out['doc_matrix'] = self.doc_matrix 93 | arrays_out['context_type'] = self.context_type 94 | np.savez(f, **arrays_out) 95 | 96 | 97 | @staticmethod 98 | def load(f): 99 | """ 100 | Loads LSA model data from a numpy archive file with extension 101 | `npz`. The expected keys for the component matrices are 102 | `word_matrix`, `eigenvalues` and `doc_matrix`. 103 | 104 | :param f: Designates the file from which to load data. See 105 | `numpy.load` for further details. 106 | :type f: str-like or file-like object 107 | 108 | :returns: a saved Lsa model. 109 | 110 | :See Also: :meth:`numpy.load` 111 | """ 112 | arrays_in = np.load(f) 113 | m = Lsa(context_type=arrays_in['context_type']) 114 | m.word_matrix=arrays_in['word_matrix'] 115 | m.eigenvalues=arrays_in['eigenvalues'] 116 | m.doc_matrix=arrays_in['doc_matrix'] 117 | return m 118 | 119 | @staticmethod 120 | def from_tf(tf_model): 121 | """ 122 | Takes a `Tf` model object and generates a `TfIdf` model. 123 | """ 124 | model = Lsa(td_matrix=tf_model.matrix) 125 | model.corpus = tf_model.corpus 126 | model.context_type = tf_model.context_type 127 | return model 128 | 129 | @staticmethod 130 | def from_tfidf(tfidf_model): 131 | """ 132 | Takes a `Tf` model object and generates a `TfIdf` model. 
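        Note that, despite the wording above, the returned object is an
        `Lsa` model whose `td_matrix` is taken from `tfidf_model.matrix`,
        e.g. (illustrative)::

            lsa = Lsa.from_tfidf(tfidf_model)
            lsa.train(k_factors=100)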
133 | """ 134 | model = Lsa(td_matrix=tfidf_model.matrix) 135 | model.corpus = tfidf_model.corpus 136 | model.context_type = tfidf_model.context_type 137 | return model 138 | -------------------------------------------------------------------------------- /vsm/extensions/clustering/manifold.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from builtins import object 3 | import numpy as np 4 | from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering 5 | from sklearn.manifold import Isomap, MDS 6 | from .plotting import plot_clusters 7 | 8 | 9 | __all__ = [ 'Manifold' ] 10 | 11 | 12 | class Manifold(object): 13 | def __init__(self, dismat, labels=None, cls=[], pos=[]): 14 | self.dismat = np.asarray(dismat) 15 | self.labels = labels 16 | self._cls = cls # Clusters info 17 | self.pos = pos 18 | 19 | 20 | def __str__(self): 21 | return self.dismat.__str__() 22 | 23 | 24 | @property 25 | def cls(self): 26 | """ 27 | views clusters as lists 28 | """ 29 | return [[self.labels[i] for i,lab in enumerate(self._cls) if lab == x] 30 | for x in set(self._cls)] 31 | 32 | 33 | # 34 | # Clustering methods 35 | # 36 | def KMeans(self, n_clusters=10, init='k-means++', max_iter=100, 37 | n_init=1, verbose=1, show=True): 38 | """ 39 | Clusters the objects in `dismat` using k-means algorithm. This requires 40 | `pos` be precomputed by `mds` or `isomap`. For parameters of the 41 | algorithms see: 42 | http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans. 43 | html#sklearn.cluster.KMeans 44 | 45 | :param n_clusters: Number of clusters used as the parameter for K-means. 46 | :type n_clusters: int, optional 47 | 48 | :param show: Shows the resulting clusters if true. 49 | :type n_clusters: boolean, optional 50 | """ 51 | 52 | if len(self.pos)==0: 53 | raise Exception('K-Means requires low dimentional coordinates. Try mds() or isomap() first.') 54 | 55 | model = KMeans(n_clusters=n_clusters, init=init, max_iter=max_iter, 56 | n_init=n_init,verbose=verbose).fit(self.pos) 57 | self._cls = model.labels_ 58 | 59 | if show: 60 | return self.cls 61 | 62 | 63 | 64 | def AffinityPropagation(self, show=True): 65 | """ 66 | Clusters objects in `dismat` using affinity propagation algorithm. 67 | 68 | :param show: Shows the resulting clusters if true. 69 | :type n_clusters: boolean, optional 70 | """ 71 | 72 | model = AffinityPropagation(affinity='precomputed').fit(self.dismat) 73 | self._cls = model.labels_ 74 | 75 | if show: 76 | return self.cls 77 | 78 | 79 | 80 | def SpectralClustering(self, n_clusters=10, show=True): 81 | """ 82 | Clusters objects in `dismat` using spectral clustering. 83 | 84 | :param n_clusters: Number of clusters used as the parameter for K-means. 85 | :type n_clusters: int, optional 86 | 87 | :param show: Shows the resulting clusters if true. 88 | :type n_clusters: boolean, optional 89 | """ 90 | 91 | model = SpectralClustering(n_clusters=n_clusters, 92 | affinity='precomputed').fit(self.dismat) 93 | self._cls = model.labels_ 94 | 95 | if show: 96 | return self.cls 97 | 98 | 99 | 100 | # 101 | # Manifold learning methods 102 | # 103 | 104 | def mds(self, n_components=2, dissimilarity='precomputed', show=False): 105 | """ 106 | Calculates lower dimention coordinates using the mds algorithm. 107 | This requires sklearn ver 0.14 due to the dissimilarity argument. 108 | 109 | :param n_components: dimentionality of the reduced space. 
110 | :type n_components: int, optional 111 | 112 | :param show: Shows the calculated coordinates if true. 113 | :type show: boolean, optional 114 | """ 115 | model = MDS(n_components=n_components, dissimilarity=dissimilarity, max_iter=100) 116 | self.pos = model.fit_transform(self.dismat) 117 | 118 | if show: 119 | return self.pos 120 | 121 | 122 | 123 | def isomap(self, n_components=2, n_neighbors=3, show=False): 124 | """ 125 | Calculates lower dimention coordinates using the isomap algorithm. 126 | 127 | :param n_components: dimentionality of the reduced space 128 | :type n_components: int, optional 129 | 130 | :param n_neighbors: Used by isomap to determine the number of neighbors 131 | for each point. Large neighbor size tends to produce a denser map. 132 | :type n_neighbors: int, optional 133 | 134 | :param show: Shows the calculated coordinates if true. 135 | :type show: boolean, optional 136 | """ 137 | 138 | model = Isomap(n_components=n_components, n_neighbors=n_neighbors) 139 | self.pos = model.fit(self.dismat).embedding_ 140 | 141 | if show: 142 | return self.pos 143 | 144 | 145 | 146 | def plot(self, xy = (0,1)): 147 | """ 148 | Outputs 2d embeded plot based on `pos` 149 | 150 | :param xy: specifies the dimsntions of pos to be plotted. 151 | :type xy: tuple, optional 152 | 153 | """ 154 | return plot_clusters(self.pos[:,[xy[0],xy[1]]], self.labels, clusters=self._cls) 155 | -------------------------------------------------------------------------------- /vsm/extensions/mahout/mahout.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import re 4 | import numpy as np 5 | 6 | 7 | def load_help(txtfile): 8 | """ 9 | Returns a list of strings split with ': ' 10 | """ 11 | with open(txtfile, 'r') as f: 12 | s = f.read() 13 | s = re.sub('\n',': ', s) 14 | li = s.split(': ') 15 | return li 16 | 17 | 18 | def load_vals(txtfile): 19 | """ 20 | Loads data from mahout-generated txtfile(topic-term or doc-topic). 21 | Returns a list of dictionaries. 22 | """ 23 | import ast 24 | 25 | data = [] 26 | li = load_help(txtfile) 27 | 28 | for i in xrange(len(li)): 29 | if li[i] == 'Value' and i < len(li)-1: 30 | dic = ast.literal_eval(li[i+1]) 31 | data.append(dic) 32 | return data 33 | 34 | 35 | def build_arr(dictli): 36 | """ 37 | dictli : list of dictionaries 38 | """ 39 | r = len(dictli) 40 | c = len(dictli[0]) 41 | 42 | arr = np.zeros((r,c)) 43 | 44 | for i in xrange(r): 45 | arr[i] = dictli[i].values() 46 | 47 | return arr 48 | 49 | 50 | def load_kv(txtfile): 51 | """ 52 | Returns dictionary equivalent to Corpus.word_int 53 | """ 54 | dic = {} 55 | li = load_help(txtfile) 56 | 57 | for i in xrange(len(li)): 58 | if li[i] == 'Key': 59 | dic[li[i+1]] = int(li[i+3]) 60 | 61 | return dic 62 | 63 | 64 | def make_corpus(txtfile, word_int, as_strings=False): 65 | """ 66 | Returns a list of arrays that represent documents. 
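    Illustrative call (file paths are placeholders; the workflow sketched
    at the bottom of this module shows the intended end-to-end use)::

        words_int = load_kv('dict.txt')
        docs = make_corpus('tokenized-documents/tdocs.txt', words_int)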
67 | """ 68 | corp = [] 69 | li = load_help(txtfile) 70 | 71 | for i in xrange(len(li)): 72 | if li[i] == 'Value': 73 | doc = li[i+1] 74 | doc = doc.strip() 75 | doc = doc.strip('[') 76 | doc = doc.strip(']') 77 | doc = doc.split(', ') 78 | doc = [str(w) for w in doc] 79 | 80 | idoc = [] 81 | for w in doc: 82 | try: 83 | i = word_int[w] 84 | if as_strings: 85 | idoc.append(w) 86 | else: 87 | idoc.append(int(i)) 88 | except: 89 | pass 90 | 91 | corp.append(np.array(idoc)) 92 | 93 | return corp 94 | 95 | 96 | def stopwords(corp, topword): 97 | """ 98 | corp : `Corpus` object 99 | topword : topword (list of dictionaries) from model. 100 | """ 101 | ind = topword[0].keys() 102 | 103 | rem = [] 104 | for w in corp.words: 105 | i = corp.words_int[w] 106 | if i not in ind: 107 | rem.append(w) 108 | 109 | return rem 110 | 111 | 112 | def savez(fname, ctx_type, itr, K, alpha, beta, doc_top, top_word, W): 113 | arrays_out = dict() 114 | 115 | V = top_word.shape[1] 116 | # use mahout-vect-test/tokenized-documents 117 | # mahout-vect-test/dictionary.file-0 118 | corp = np.array(np.hstack(W)) 119 | arrays_out['W_corpus'] = corp 120 | arrays_out['W_indices'] = np.cumsum([a.size for a in W]) 121 | arrays_out['V'] = V # num of Vocabs 122 | 123 | # next 3 lines are dummy values 124 | arrays_out['Z_corpus'] = np.zeros(corp.shape[0]) 125 | arrays_out['Z_indices'] = np.cumsum([a.size for a in W]) 126 | arrays_out['log_prob_init'] = False 127 | 128 | arrays_out['doc_top'] = doc_top 129 | arrays_out['top_word'] = top_word 130 | arrays_out['sum_word_top'] = (V * beta) + np.zeros(K) 131 | 132 | arrays_out['context_type'] = ctx_type 133 | arrays_out['K'] = K 134 | arrays_out['iterations'] = itr 135 | arrays_out['alpha'] = alpha 136 | arrays_out['beta'] = beta 137 | 138 | print('Saving LDA model to ', fname) 139 | np.savez(fname, **arrays_out) 140 | 141 | 142 | """ 143 | if __name__=='__main__': 144 | # workflow 145 | # from vsm.corpus.util.corpupsbuilders import corpus_fromlist 146 | 147 | # Return topword, doctop information from the txt file as arrays. 148 | top_word = load_vals('../../../mahout-lda-test/lda.txt') 149 | doc_top = load_vals('../../../mahout-dt-test/doc-topics.txt') 150 | 151 | arrtw = build_arr(top_word) 152 | arrdt = build_arr(doc_top) 153 | 154 | # dicionary that corresponds to Corpus.words_int 155 | words_int = load_kv('../../../mahout-vect-test/dict.txt') 156 | 157 | # list of arrays that represent documents. 158 | # `wcorp` can be an input to `corpus_fromlist()` to create a `Corpus`. 159 | wcorp = make_corpus('../../../mahout-vect-test/tokenized-documents/tdocs.txt', 160 | words_int, as_strings=True) 161 | 162 | # make `Corpus` object and apply_stoplist to ensure the words 163 | # are exactly the same as the ones in the topword. 164 | wc = corpus_fromlist(wcorp, 'document') 165 | rem = stopwords(wc, top_word) 166 | wc_ = wc.apply_stoplist(rem) 167 | 168 | # Save `Corpus` and LDA model. 
169 | wc_.save('mahout-test.npz') 170 | savez('mahout-test-K5-100.npz', 'document', 100, 5, 0.01, 0.01, arrdt, 171 | arrtw, wc_.view_contexts('document')) 172 | 173 | """ 174 | -------------------------------------------------------------------------------- /vsm/viewer/beagleviewer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from vsm.spatial import angle 4 | from vsm.exceptions import * 5 | 6 | from vsm.viewer.wrappers import * 7 | 8 | 9 | __all__ = ['BeagleViewer'] 10 | 11 | 12 | class BeagleViewer(object): 13 | """ 14 | A class for viewing BEAGLE models. 15 | """ 16 | 17 | def __init__(self, corpus, model): 18 | """ 19 | Initialize BeagleViewer. 20 | 21 | :param corpus: Source of observed data. 22 | :type corpus: :class:`Corpus` 23 | 24 | :param model: One of the Beagle objects. 25 | :type model: BEAGLE model 26 | """ 27 | self.corpus = corpus 28 | self.model = model 29 | 30 | def dist_word_word(self, word_or_words, weights=[], 31 | filter_nan=True, print_len=10, as_strings=True, 32 | dist_fn=angle, order='i'): 33 | """ 34 | Computes and sorts the distances between word(s) and every word. 35 | 36 | :param word_or_words: Query word(s) to which distances are calculated. 37 | :type word_or_words: string or list of strings 38 | 39 | :param weights: Specify weights for each query word in `word_or_words`. 40 | Default uses equal weights (i.e. arithmetic mean) 41 | :type weights: list of floating point, optional 42 | 43 | :param filter_nan: If `True` not a number entries are filtered. 44 | Default is `True`. 45 | :type filter_nan: boolean, optional 46 | 47 | :param print_len: Number of words to be displayed. Default is 10. 48 | :type print_len: int, optional 49 | 50 | :param as_strings: If `True`, returns a list of words as strings rather 51 | than their integer representations. Default is `True`. 52 | :type as_strings: boolean, optional 53 | 54 | :param dist_fn: A distance function from functions in vsm.spatial. 55 | Default is :meth:`angle`. 56 | :type dist_fn: string, optional 57 | 58 | :param order: Order of sorting. 'i' for increasing and 'd' for 59 | decreasing order. Default is 'i'. 60 | :type order: string, optional 61 | 62 | :returns: an instance of :class:`LabeledColumn`. 63 | A 2-dim array containing words and their distances to 64 | `word_or_words`. 65 | 66 | :See Also: :meth:`vsm.viewer.wrappers.dist_word_word` 67 | """ 68 | return dist_word_word(word_or_words, self.corpus, 69 | self.model.matrix.T, weights=weights, 70 | filter_nan=filter_nan, 71 | print_len=print_len, as_strings=True, 72 | dist_fn=dist_fn, order=order) 73 | 74 | 75 | @deprecated_meth("dist_word_word") 76 | def sim_word_word(self, word_or_words, weights=[], 77 | filter_nan=True, print_len=10, as_strings=True, 78 | dist_fn=angle, order='i'): 79 | 80 | self.sim_word_word.__func__.__doc__ = dist_word_word.__doc__ 81 | pass 82 | 83 | 84 | @deprecated_meth("dismat_word") 85 | def simmat_word(self, word_list, dist_fn=angle): 86 | pass 87 | 88 | def dismat_word(self, word_list, dist_fn=angle): 89 | """ 90 | Calculates a distance matrix for a given list of words. 91 | 92 | :param word_list: A list of words whose distance matrix is to be 93 | computed. 94 | :type word_list: list of strings 95 | 96 | :param dist_fn: A distance function from functions in vsm.spatial. 97 | Default is :meth:`angle`. 98 | :type dist_fn: string, optional 99 | 100 | :returns: an instance of :class:`IndexedSymmArray`. 
101 | A n x n matrix containing floats where n is the number of words 102 | in `word_list`. 103 | 104 | :See Also: :meth:`vsm.viewer.wrappers.dismat_word` 105 | """ 106 | 107 | return dismat_word(word_list, self.corpus, 108 | self.model.matrix.T, dist_fn=dist_fn) 109 | 110 | 111 | 112 | # # This is a quick adaptation of the isomap_docs function from 113 | # # ldagibbsviewer. This should be abstracted and moved to 114 | # # similarity.py or something equivalent. 115 | # def isomap_words(self, words, weights=[], thres=.8, 116 | # n_neighbors=5, scale=True, trim=20): 117 | # """ 118 | # """ 119 | # from sklearn import manifold 120 | # from math import ceil 121 | # from vsm.ext.clustering.plotting import ( 122 | # gen_colors as _gen_colors_, 123 | # plot_clusters as _plot_clusters_) 124 | 125 | # # create a list to be plotted 126 | # word_list = self.dist_word_word(words, weights=weights) 127 | 128 | # # cut down the list by the threshold 129 | # labels, size = zip(*[(w,s) for (w,s) in word_list if s < thres]) 130 | # print size 131 | # # calculate coordinates 132 | # dismat = self.dismat_words(labels) 133 | # dismat = np.clip(dismat, 0, 2) # cut off values outside [0, 1] 134 | # imap = manifold.Isomap(n_components=2, n_neighbors=n_neighbors) 135 | # pos = imap.fit(dismat).embedding_ 136 | 137 | # # set graphic parameters 138 | # # - scale point size 139 | # if scale: 140 | # size = [s+0.5 if s == 0 else s for s in size] # for given word which has 0.0 141 | # # value to be visible. 142 | # size = [s**2*150 for s in size] 143 | # else: 144 | # size = np.ones_like(size) * 50 145 | # # - trim labels 146 | # if trim: 147 | # labels = [lab[:trim] for lab in labels] 148 | 149 | # # hack for unidecode issues in matplotlib 150 | # labels = [label.decode('utf-8', 'ignore') for label in labels] 151 | 152 | # return _plot_clusters_(pos, labels, size=size) 153 | 154 | -------------------------------------------------------------------------------- /vsm/extensions/testdata/history_greek_philosophy/frontmatter.json: -------------------------------------------------------------------------------- 1 | ["Project Gutenberg's A Short History of Greek Philosophy, by John Marshall\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever. You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: A Short History of Greek Philosophy\n\nAuthor: John Marshall\n\nRelease Date: February 1, 2007 [EBook #20500]\n\nLanguage: English\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK GREEK PHILOSOPHY ***\n\n\n\n\nProduced by Al Haines\n\n\n\n\n\nA SHORT HISTORY\n\nOF\n\nGREEK PHILOSOPHY\n\n\nBY\n\nJOHN MARSHALL\n\nM.A. OXON., LL.D. 
EDIN.\n\n\nRECTOR OF THE ROYAL HIGH SCHOOL, EDINBURGH\n\nFORMERLY PROFESSOR OF CLASSICAL LITERATURE AND PHILOSOPHY\n\nIN THE YORKSHIRE COLLEGE, LEEDS\n\n\n\n\nLONDON\n\nPERCIVAL AND CO.\n\n1891\n\n_All rights reserved_\n\n\n\n\nPREFACE\n\nThe main purpose which I have had in view in writing this book has been\nto present an account of Greek philosophy which, within strict limits\nof brevity, shall be at once authentic and interesting--_authentic_, as\nbeing based on the original works themselves, and not on any secondary\nsources; _interesting_, as presenting to the ordinary English reader,\nin language freed as far as possible from technicality and\nabstruseness, the great thoughts of the greatest men of antiquity on\nquestions of permanent significance and value. There has been no\nattempt to shirk the really philosophic problems which these men tried\nin their day to solve; but I have endeavoured to show, by a sympathetic\ntreatment of them, that these problems were no mere wars of words, but\nthat in fact the philosophers of twenty-four centuries ago were dealing\nwith exactly similar difficulties as to the bases of belief and of\nright action as, under different forms, beset thoughtful men and women\nto-day.\n\nIn the general treatment of the subject, I have followed in the main\nthe order, and drawn chiefly on the selection of passages, in Ritter\nand Preller's _Historia Philosophiae Graecae_. It is hoped that in\nthis way the little book may be found useful at the universities, as a\nrunning commentary on that excellent work; and the better to aid\nstudents in the use of it for that purpose, the corresponding sections\nin Ritter and Preller are indicated by the figures in the margin.\n\nIn the sections on Plato, and occasionally elsewhere, I have drawn to\nsome extent, by the kind permission of the Delegates of the Clarendon\nPress and his own, on Professor Jowett's great commentary and\ntranslation.\n\nJOHN MARSHALL.\n\n\n\n\nTranscriber's notes:\n\nThe passage numbers in the Ritter-Preller book mentioned in the second\nparagraph above are indicated in this book with square brackets, e.g.\n\"[10]\". In the original book they were formatted as sidenotes. In\nthis e-book they are embedded in the text approximately where they\nappear in the original book, unless they are at the start of a\nparagraph, in which case they appear immediately before that paragraph.\n\nPage numbers are indicated with curly brackets, e.g. \"{5}\". They are\nembedded into the text where page breaks occurred in the original book.\n\nIn the original book, pages had headings that varied with the material\nbeing discussed on that pair of pages. In this e-book, those headings\nhave been collected into an \"introductory\" paragraph at the beginning\nof each chapter.\n\n\nThe original book uses several Greek words. 
These words, the chapters\nthey are used in, and their transliterations are as follows:\n\nChapter I (pages 3, 4, 12) - \"arche\" - alpha (with the soft-breathing\nmark), rho, chi, eta; \"phloios\" - phi, lambda, omicron, iota, omicron,\nfinal sigma.\n\nChapter III (page 28) - \"soma\" - sigma, omega, mu, alpha; \"sema\" -\nsigma, eta, mu, alpha.\n\nChapter IV (page 33, 34 - \"doxa\" - delta, omicron, xi, alpha; \"Peri\" -\nPI, epsilon, rho, iota; \"Phueos\" - PHI, upsilon, sigma, epsilon, omega,\nfinal sigma.\n\nChapter V (page 48) - \"logos\" - lambda, omicron, gamma, omicron, final\nsigma; \"hule\" - upsilon with rough breathing mark, lambda, eta.\n\n\n\n\nCONTENTS\n\n\nCHAP.\n\n I.--THE SCHOOL OF MILETUS--\n I. Thales . . . . . . . . . . . . . . . . . . . 1\n II. Anaximander . . . . . . . . . . . . . . . . . 7\n\n II.--THE SCHOOL OF MILETUS (_concluded_)--\n III. Anaximenes . . . . . . . . . . . . . . . . . 14\n IV. Heraclitus . . . . . . . . . . . . . . . . . 15\n\n III.--PYTHAGORAS AND THE PYTHAGOREANS . . . . . . . . . 22\n\n IV.--THE ELEATICS--\n I. Xenophanes . . . . . . . . . . . . . . . . . 31\n II. Parmenides . . . . . . . . . . . . . . . . . 33\n\n V.--THE ELEATICS (_concluded_)--\n III. Zeno . . . . . . . . . . . . . . . . . . . . 42\n IV. Melissus . . . . . . . . . . . . . . . . . . 46\n\n VI.--THE ATOMISTS--\n I. Anaxagoras . . . . . . . . . . . . . . . . . 52\n\n VII.--THE ATOMISTS (_continued_)--\n II. Empedocles . . . . . . . . . . . . . . . . . 58\n\n VIII.--THE ATOMISTS (_concluded_)--\n III. Leucippus and Democritus . . . . . . . . . . 74\n\n IX.--THE SOPHISTS--\n I. Protagoras . . . . . . . . . . . . . . . . . 85\n\n X.--THE SOPHISTS (_concluded_)--\n II. Gorgias . . . . . . . . . . . . . . . . . . . 92\n\n XI.--SOCRATES . . . . . . . . . . . . . . . . . . . . . 101\n\n XII.--SOCRATES (concluded) . . . . . . . . . . . . . . . 116\n\n XIII.--THE INCOMPLETE SOCRATICS--\n I. Aristippus and the Cyrenaics . . . . . . . . 124\n II. Antisthenes and the Cynics . . . . . . . . . 128\n III. Euclides and the Megarics . . . . . . . . . . 132\n\n XIV.--PLATO . . . . . . . . . . . . . . . . . . . . . . . 134\n\n XV.--PLATO (_continued_) . . . . . . . . . . . . . . . . 146\n\n XVI.--PLATO (_continued_) . . . . . . . . . . . . . . . . 154\n\n XVII.--PLATO (_concluded_) . . . . . . . . . . . . . . . . 162\n\n XVIII.--ARISTOTLE . . . . . . . . . . . . . . . . . . . . . 172\n\n XIX.--ARISTOTLE (_continued_) . . . . . . . . . . . . . . 187\n\n XX.--ARISTOTLE (_concluded_) . . . . . . . . . . . . . . 199\n\n XXI.--THE SCEPTICS AND EPICUREANS . . . . . . . . . . . . 210\n\n XXII.--THE STOICS . . . . . . . . . . . . . . . . . . . . 238\n\n INDEX . . . . . . . . . . . . . . . . . . . . . . . 245\n"] -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/vsm.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/vsm.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 
88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/vsm" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/vsm" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | 155 | gh-pages: 156 | make html 157 | git clone https://github.com/inpho/vsm vsm_gh_pages 158 | cd vsm_gh_pages \ 159 | && git checkout gh-pages \ 160 | && rm -rf * _static _sources \ 161 | && mv ../$(BUILDDIR)/html/* . \ 162 | && git add -A \ 163 | && git commit -m "Generated gh-pages" \ 164 | && git push origin gh-pages 165 | rm -rf $(BUILDDIR) vsm_gh_pages 166 | -------------------------------------------------------------------------------- /vsm/model/tf.py: -------------------------------------------------------------------------------- 1 | from builtins import object 2 | import multiprocessing as mp 3 | import platform, warnings 4 | 5 | import numpy as np 6 | from scipy.sparse import hstack 7 | 8 | from vsm.spatial import count_matrix 9 | from vsm.split import * 10 | from vsm.model.base import * 11 | 12 | 13 | __all__ = ['TF', 'TfSeq', 'TfMulti'] 14 | 15 | 16 | 17 | class TfSeq(BaseModel): 18 | """ 19 | Trains a term-frequency model. 
20 | 21 | In a term-frequency model, the number of occurrences of a word 22 | type in a context is counted for all word types and documents. Word 23 | types correspond to matrix rows and documents correspond to matrix 24 | columns. 25 | 26 | :See Also: :class:`vsm.model.base`, :class:`vsm.corpus.Corpus`, 27 | :class:`scipy.sparse.coo_matrix` 28 | """ 29 | 30 | def __init__(self, corpus=None, context_type=None): 31 | """ 32 | Initialize TfSeq. 33 | 34 | :param corpus: A Corpus object containing the training data. 35 | :type corpus: Corpus 36 | 37 | :param context_type: A string specifying the type of context over which 38 | the model trainer is applied. 39 | :type context_type: string 40 | """ 41 | 42 | self.context_type = context_type 43 | if corpus: 44 | self.corpus = corpus.corpus 45 | self.docs = corpus.view_contexts(context_type, as_slices=True) 46 | self.V = corpus.words.size 47 | else: 48 | self.corpus = [] 49 | self.docs = [] 50 | self.V = 0 51 | 52 | 53 | def train(self): 54 | """ 55 | Counts word-type occurrences per context and stores the results in 56 | `self.matrix`. 57 | """ 58 | self.matrix = count_matrix(self.corpus, self.docs, self.V) 59 | 60 | 61 | 62 | class TfMulti(TfSeq): 63 | """ 64 | Trains a term-frequency model. 65 | 66 | In a term-frequency model, the number of occurrences of a word 67 | type in a context is counted for all word types and documents. Word 68 | types correspond to matrix rows and documents correspond to matrix 69 | columns. 70 | 71 | The data structure is a sparse integer matrix. 72 | 73 | :See Also: :class:`vsm.model.base.BaseModel`, :class:`vsm.corpus.Corpus`, 74 | :class:`scipy.sparse.coo_matrix` 75 | """ 76 | def __init__(self, corpus=None, context_type=None): 77 | """ 78 | Initialize TfMulti. 79 | 80 | :param corpus: A Corpus object containing the training data 81 | :type corpus: Corpus, optional 82 | 83 | :param context_type: A string specifying the type of context over which 84 | the model trainer is applied. 85 | :type context_type: string, optional 86 | """ 87 | self._read_globals = False 88 | self._write_globals = False 89 | 90 | super(TfMulti, self).__init__(corpus=corpus, context_type=context_type) 91 | 92 | 93 | def _move_globals_to_locals(self): 94 | 95 | self._write_globals = False 96 | self.V = self.V 97 | self.corpus = self.corpus 98 | self._read_globals = False 99 | global _V, _corpus 100 | del _V, _corpus 101 | 102 | 103 | def _move_locals_to_globals(self): 104 | 105 | self._write_globals = True 106 | self.V = self.V 107 | self.corpus = self.corpus 108 | self._read_globals = True 109 | del self._V_local, self._corpus_local 110 | 111 | 112 | @property 113 | def corpus(self): 114 | if self._read_globals: 115 | return np.frombuffer(_corpus, np.int32) 116 | return self._corpus_local 117 | 118 | @corpus.setter 119 | def corpus(self, a): 120 | if self._write_globals: 121 | global _corpus 122 | if not '_corpus' in globals(): 123 | _corpus = mp.Array('i', len(a), lock=False) 124 | _corpus[:] = a 125 | else: 126 | self._corpus_local = a 127 | 128 | @property 129 | def V(self): 130 | if self._read_globals: 131 | return _V.value 132 | return self._V_local 133 | 134 | @V.setter 135 | def V(self, V): 136 | if self._write_globals: 137 | global _V 138 | _V = mp.Value('i', V, lock=False) 139 | else: 140 | self._V_local = V 141 | 142 | 143 | 144 | def train(self, n_proc=2): 145 | """ 146 | Takes a number of processes `n_proc` over which to map and reduce. 147 | 148 | :param n_procs: Number of processors. 
149 | :type n_procs: int 150 | """ 151 | self._move_locals_to_globals() 152 | 153 | doc_indices = mp_split_ls(self.docs, n_proc) 154 | 155 | p=mp.Pool(n_proc) 156 | cnt_mats = p.map(tf_fn, doc_indices) 157 | p.close() 158 | 159 | self.matrix = hstack(cnt_mats, format='coo') 160 | 161 | self._move_globals_to_locals() 162 | 163 | 164 | 165 | def tf_fn(ctx_sbls): 166 | """ 167 | The map function for vsm.model.TfMulti. Takes a list of documents 168 | as slices and returns a count matrix. 169 | 170 | :param ctx_sbls: list of documents as slices. 171 | :type ctx_sbls: list of slices 172 | 173 | :returns: a count matrix 174 | """ 175 | offset = ctx_sbls[0].start 176 | corpus = _corpus[offset: ctx_sbls[-1].stop] 177 | slices = [slice(s.start-offset, s.stop-offset) for s in ctx_sbls] 178 | return count_matrix(corpus, slices, _V.value) 179 | 180 | 181 | class TF(object): 182 | """ 183 | Depending on the boolean parameter `multiprocessing`, returns and 184 | initializes an instance of either TfSeq or TfMulti. 185 | 186 | Note that on Windows platforms, `multiprocessing` is not implemented. 187 | In contrast to LdaCgsMulti, TF always returns a valid object. Instead 188 | of raising a NotImplementedError, TF issues a RuntimeWarning, notifying 189 | the user that the sequential algorithm is being used. 190 | """ 191 | def __new__(cls, corpus=None, context_type=None, multiprocessing=False): 192 | 193 | kwargs = dict(corpus=corpus, context_type=context_type) 194 | 195 | if multiprocessing and platform.system() != 'Windows': 196 | return TfMulti(**kwargs) 197 | else: 198 | if platform.system() == 'Windows': 199 | warnings.warn("""Multiprocessing is not implemented on Windows. 200 | Defaulting to sequential algorithm.""", RuntimeWarning) 201 | return TfSeq(**kwargs) 202 | -------------------------------------------------------------------------------- /vsm/model/beaglecontext.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import absolute_import 3 | from future import standard_library 4 | standard_library.install_aliases() 5 | from builtins import zip 6 | from builtins import str 7 | from builtins import range 8 | import os 9 | import shutil 10 | import tempfile 11 | import multiprocessing as mp 12 | import pickle as cpickle 13 | 14 | import numpy as np 15 | 16 | from vsm.model.base import BaseModel 17 | 18 | 19 | __all__ = [ 'BeagleContextSeq', 'BeagleContextMulti' ] 20 | 21 | 22 | def realign_env_mat(corpus, env_corpus, env_matrix): 23 | """ 24 | """ 25 | words = corpus.words 26 | indices = [env_corpus.words_int[w] for w in words] 27 | return env_matrix[indices] 28 | 29 | 30 | 31 | class BeagleContextSeq(BaseModel): 32 | """ 33 | 34 | """ 35 | def __init__(self, corpus, env_corpus, env_matrix, 36 | context_type='sentence'): 37 | """ 38 | Initialize BeagleContextSeq. 39 | 40 | :param corpus: Source of observed data. 41 | :type corpus: class:`Corpus` 42 | 43 | :param env_corpus: BEAGLE environment corpus. 44 | :type env_corpus: class:`Corpus` 45 | 46 | :param env_matrix: Matrix from BEAGLE environment model. 47 | :type env_matrix: 2-D array 48 | 49 | :param context_type: Name of tokenization stored in `corpus` whose 50 | tokens will be treated as documents. Default is `sentence`.
51 | :type context_type: string, optional 52 | """ 53 | self.context_type = context_type 54 | self.sents = corpus.view_contexts(context_type) 55 | self.env_matrix = realign_env_mat(corpus, env_corpus, env_matrix) 56 | 57 | 58 | def train(self): 59 | """ 60 | Trains the model. 61 | """ 62 | self.matrix = np.zeros_like(self.env_matrix) 63 | 64 | for sent in self.sents: 65 | 66 | if sent.shape[0] > 1: 67 | 68 | left_sums = np.cumsum(self.env_matrix[sent[:-1]], axis=0) 69 | right_sums = np.cumsum(self.env_matrix[sent[:0:-1]], axis=0) 70 | 71 | for i,word in enumerate(sent): 72 | 73 | if i == 0: 74 | ctx_vector = right_sums[-1] 75 | 76 | elif i == sent.shape[0] - 1: 77 | ctx_vector = left_sums[-1] 78 | 79 | else: 80 | ctx_vector = left_sums[i - 1] + right_sums[-i - 1] 81 | 82 | self.matrix[word, :] += ctx_vector 83 | 84 | 85 | 86 | class BeagleContextMulti(BaseModel): 87 | """ 88 | 89 | """ 90 | 91 | def __init__(self, corpus, env_corpus, env_matrix, 92 | context_type='sentence'): 93 | """ 94 | Initialize BeagleContextMulti. 95 | 96 | :param corpus: Souce of observed data. 97 | :type corpus: class:`Corpus` 98 | 99 | :param env_corpus: BEAGLE environment corpus. 100 | :type env_corpus: class:`Corpus` 101 | 102 | :param env_matrix: Matrix from BEAGLE environment model. 103 | :type env_matrix: 2-D array 104 | 105 | :param context_type: Name of tokenization stored in `corpus` whose 106 | tokens will be treated as documents. Default is `sentence`. 107 | :type context_type: string, optional 108 | """ 109 | self.context_type = context_type 110 | self.sents = corpus.view_contexts(context_type) 111 | self.dtype = env_matrix.dtype 112 | env_matrix = realign_env_mat(corpus, env_corpus, env_matrix) 113 | 114 | global _shape 115 | _shape = mp.Array('i', 2, lock=False) 116 | _shape[:] = env_matrix.shape 117 | 118 | print('Copying env matrix to shared mp array') 119 | global _env_matrix 120 | _env_matrix = mp.Array('d', env_matrix.size, lock=False) 121 | _env_matrix[:] = env_matrix.ravel()[:] 122 | 123 | 124 | def train(self, n_procs=2): 125 | """ 126 | Takes an optional argument `n_procs`, number of processors, 127 | and trains the model on the number of processors. `n_procs` 128 | is 2 by default. 129 | 130 | :param n_procs: Number of processors. Default is 2. 
131 | :type n_procs: int, optional 132 | 133 | :returs: `None` 134 | """ 135 | sent_lists = np.array_split(self.sents, n_procs-1) 136 | if len(sent_lists) != n_procs: 137 | sent_lists = np.array_split(self.sents, n_procs) 138 | 139 | tmp_dir = tempfile.mkdtemp() 140 | tmp_files = [os.path.join(tmp_dir, 'tmp_' + str(i)) 141 | for i in range(len(sent_lists))] 142 | 143 | sent_lists = list(zip(sent_lists, tmp_files)) 144 | del self.sents 145 | 146 | try: 147 | print('Forking') 148 | # For debugging 149 | # tmp_files = map(mpfn, sent_lists) 150 | 151 | p = mp.Pool(n_procs) 152 | tmp_files = p.map(mpfn, sent_lists, 1) 153 | p.close() 154 | 155 | print('Reducing') 156 | self.matrix = np.zeros(tuple(_shape), dtype=self.dtype) 157 | 158 | for filename in tmp_files: 159 | 160 | with open(filename, 'rb') as f: 161 | result = cpickle.load(f) 162 | 163 | for k,v in result.items(): 164 | self.matrix[k, :] += v 165 | 166 | finally: 167 | print('Removing {}'.format(tmp_dir)) 168 | shutil.rmtree(tmp_dir) 169 | 170 | 171 | 172 | def mpfn(sents_filename): 173 | """ 174 | """ 175 | sents, filename = sents_filename 176 | result = dict() 177 | 178 | for sent in sents: 179 | if sent.shape[0] > 1: 180 | 181 | env = np.empty((sent.size, _shape[1]), dtype=np.float64) 182 | for i,w in enumerate(sent): 183 | env[i, :] = _env_matrix[w*_shape[1]: (w+1)*_shape[1]] 184 | 185 | left_sums = np.cumsum(env[:-1], axis=0) 186 | right_sums = np.cumsum(env[:0:-1], axis=0) 187 | 188 | for i,t in enumerate(sent): 189 | 190 | if i == 0: 191 | ctx_vector = right_sums[-1] 192 | 193 | elif i == sent.shape[0] - 1: 194 | ctx_vector = left_sums[-1] 195 | 196 | else: 197 | ctx_vector = left_sums[i - 1] + right_sums[-i - 1] 198 | 199 | if t in result: 200 | result[t] += ctx_vector 201 | else: 202 | result[t] = ctx_vector 203 | 204 | with open(filename, 'wb') as f: 205 | cpickle.dump(result, f) 206 | 207 | return filename 208 | 209 | -------------------------------------------------------------------------------- /unit_tests/tests_corpus.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | import unittest 3 | import numpy as np 4 | import os 5 | from vsm.corpus import * 6 | from vsm.split import split_corpus 7 | from tempfile import NamedTemporaryFile 8 | 9 | 10 | class TestCorpus(unittest.TestCase): 11 | 12 | def setUp(self): 13 | corpus = np.array([0, 3, 2, 1, 0, 3, 0, 2, 3, 0, 2, 3, 1, 2, 0, 3, 14 | 2, 1, 2, 2], dtype=int) 15 | contextData = np.array([(3, 'doc0'), (5, 'doc1'), (7,'doc2'), (11,'doc3'), 16 | (11,'doc4'), (15,'doc5'), (18,'doc6'), (20,'doc7')], 17 | dtype=[('idx', '
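
Editor's note (illustrative, not part of the repository): the `TF` factory in `vsm/model/tf.py` above dispatches to `TfSeq` (sequential) or `TfMulti` (shared-memory multiprocessing) depending on the `multiprocessing` flag and the platform. The minimal sketch below shows the intended call pattern; the `Corpus` construction is modelled on `unit_tests/tests_corpus.py`, but the dtypes and the `document_label` field name are assumptions rather than values taken from the (truncated) test above.

# Minimal usage sketch for the TF factory. The Corpus construction is an
# assumption modelled on the unit tests; field names and dtypes are illustrative.
import numpy as np
from vsm.corpus import Corpus
from vsm.model.tf import TF

tokens = np.array([0, 3, 2, 1, 0, 3, 0, 2, 3, 0, 2, 3], dtype=int)
# 'idx' is the cumulative end index of each document within the token array.
context_data = np.array([(4, 'doc0'), (8, 'doc1'), (12, 'doc2')],
                        dtype=[('idx', '<i8'), ('document_label', '<U8')])
c = Corpus(tokens, context_types=['document'], context_data=[context_data])

m = TF(corpus=c, context_type='document', multiprocessing=False)  # -> TfSeq
m.train()
print(m.matrix.toarray())   # rows are word types, columns are documents

# With multiprocessing=True on a non-Windows platform, TF returns a TfMulti
# instance whose train() takes the number of worker processes:
#   m = TF(corpus=c, context_type='document', multiprocessing=True)
#   m.train(n_proc=2)

As the listing shows, `TfMulti.train` copies `corpus` and `V` into lock-free module-level `multiprocessing.Array`/`Value` objects before forking, so the `tf_fn` workers read shared memory instead of pickled copies; each worker then rebuilds its document slices relative to its chunk's offset before calling `count_matrix`.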
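
Editor's note (illustrative, not part of the repository): `BeagleContextSeq` and `BeagleContextMulti` in `vsm/model/beaglecontext.py` both consume an environment matrix trained elsewhere. The rough pipeline sketch below assumes a `Corpus` `c` (like the one in the previous sketch) that also carries a 'sentence' tokenization; `BeagleEnvironment` lives in `vsm/model/beagleenvironment.py`, which is not shown in this listing, so its `n_cols` keyword should be treated as an assumption.

# Rough pipeline sketch: build environment vectors, then accumulate context
# vectors over sentences. `c` and the n_cols keyword are assumptions.
from vsm.model.beagleenvironment import BeagleEnvironment
from vsm.model.beaglecontext import BeagleContextSeq

env = BeagleEnvironment(c, n_cols=256)   # one random environment vector per word type
env.train()

ctx = BeagleContextSeq(c, c, env.matrix, context_type='sentence')
ctx.train()
# ctx.matrix[w] now holds the summed environment vectors of every word that
# co-occurs with word type w inside a sentence.

`BeagleContextMulti` exposes the same constructor; its `train(n_procs=2)` splits the sentence list across worker processes, which write partial results to temporary pickle files that are then reduced into `self.matrix`.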
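
Editor's note (illustrative, not part of the repository): the inner loop of `BeagleContextSeq.train` (and of `mpfn`) avoids re-summing environment vectors for every position in a sentence. It precomputes cumulative sums from the left (`left_sums`) and from the right (`right_sums`) and reads position `i`'s context vector as `left_sums[i-1] + right_sums[-i-1]`. The standalone NumPy check below confirms that this equals the direct sum of the environment vectors of all other words in the sentence.

# Standalone check of the prefix-sum identity used by BeagleContextSeq.train.
import numpy as np

env = np.random.rand(6, 4)        # toy environment vectors: 6 word types, 4 dimensions
sent = np.array([2, 0, 5, 1])     # a toy sentence of word-type indices

left_sums = np.cumsum(env[sent[:-1]], axis=0)
right_sums = np.cumsum(env[sent[:0:-1]], axis=0)

for i in range(len(sent)):
    if i == 0:
        ctx_vector = right_sums[-1]
    elif i == len(sent) - 1:
        ctx_vector = left_sums[-1]
    else:
        ctx_vector = left_sums[i - 1] + right_sums[-i - 1]
    # Direct computation: sum the environment vectors of every word except position i.
    direct = env[np.delete(sent, i)].sum(axis=0)
    assert np.allclose(ctx_vector, direct)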