├── .gitignore ├── .rtd ├── .travis.yml ├── LICENSE ├── README.rst ├── TODO ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.rst ├── lda2vec │ ├── corpus.rst │ ├── dirichlet_likelihood.rst │ ├── embed_mixture.rst │ ├── fake_data.rst │ ├── lda2vec.rst │ ├── preprocess.rst │ └── tracking.rst └── make.bat ├── examples ├── hacker_news │ ├── README.md │ ├── data │ │ └── preprocess.py │ └── lda2vec │ │ ├── lda2vec.ipynb │ │ ├── lda2vec_model.py │ │ ├── lda2vec_run.py │ │ └── word_vectors.ipynb └── twenty_newsgroups │ ├── README.md │ ├── data │ └── preprocess.py │ ├── lda │ ├── lda.py │ ├── lda_run.py │ └── topics.pyldavis.npz │ ├── lda2vec │ ├── lda2vec.ipynb │ ├── lda2vec_model.py │ ├── lda2vec_run.py │ └── topics.pyldavis.npz │ ├── nslda │ ├── nslda.py │ └── nslda_run.py │ └── nvdm │ ├── nvdm.py │ └── nvdm_run.py ├── images ├── img00_word2vec.png ├── img01_lda.png ├── img02_lda_topics.png ├── img03_lda2vec_topics01.png ├── img04_lda2vec_topics02.png ├── img05_lda2vec_topics03_supervised.png └── img06_pyldavis.gif ├── lda2vec ├── __init__.py ├── corpus.py ├── dirichlet_likelihood.py ├── embed_mixture.py ├── fake_data.py ├── negative_sampling.py ├── preprocess.py ├── topics.py ├── tracking.py └── utils.py ├── lda2vec_network_publish_text.gif ├── pytest.ini ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── test_dirichlet_likelihood.py ├── test_embed_mixture.py ├── test_fake_data.py ├── test_preprocess.py └── test_topics.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.egg/ 3 | *.pyc 4 | *.pyo 5 | *.cpp 6 | *.so 7 | *.swp 8 | build 9 | \#*\# 10 | .\#* 11 | .coverage 12 | .eggs/ 13 | *.egg-info/ 14 | dist/ 15 | htmlcov/ 16 | -------------------------------------------------------------------------------- /.rtd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/.rtd -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Much of this script was adapted from astropy/astropy. 2 | 3 | language: python 4 | 5 | env: 6 | global: 7 | - NUMPY_VERSION=1.10 8 | 9 | matrix: 10 | include: 11 | # All the versions of Python. 
12 | - python: 2.7 13 | 14 | before_install: 15 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 16 | - chmod +x miniconda.sh 17 | - ./miniconda.sh -b 18 | - export PATH=/home/travis/miniconda2/bin:$PATH 19 | install: 20 | - conda create --yes -n testing python=$TRAVIS_PYTHON_VERSION 21 | - source activate testing 22 | - conda install --yes numpy=$NUMPY_VERSION nose pip numba cython scikit-learn h5py 23 | - pip install chainer pytest spacy codecov coveralls pytest-cov 24 | - python -m spacy.en.download --force all 25 | - python setup.py install 26 | 27 | script: 28 | - ls 29 | - pwd 30 | - env | sort 31 | - py.test --cov=lda2vec tests/ lda2vec --ignore=lda2vec/preprocess.py 32 | - # python examples/twenty_newsgroups/lda.py 33 | after_success: 34 | - coveralls 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Christopher Erick Moody 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | lda2vec: Tools for interpreting natural language 2 | ================================================= 3 | 4 | **This lda2vec works on python 3.** 5 | 6 | .. image:: http://img.shields.io/badge/license-MIT-blue.svg?style=flat 7 | :target: https://github.com/cemoody/lda2vec/blob/master/LICENSE 8 | 9 | .. image:: https://readthedocs.org/projects/lda2vec/badge/?version=latest 10 | :target: http://lda2vec.readthedocs.org/en/latest/?badge=latest 11 | 12 | .. image:: https://travis-ci.org/cemoody/lda2vec.svg?branch=master 13 | :target: https://travis-ci.org/cemoody/lda2vec 14 | 15 | .. image:: https://img.shields.io/badge/coverage-93%25-green.svg 16 |    :target: https://travis-ci.org/cemoody/lda2vec 17 | 18 | .. image:: https://img.shields.io/twitter/follow/chrisemoody.svg?style=social 19 | :target: https://twitter.com/intent/follow?screen_name=chrisemoody 20 | 21 | .. image:: lda2vec_network_publish_text.gif 22 | 23 | 24 | The lda2vec model tries to mix the best parts of word2vec and LDA 25 | into a single framework. word2vec captures powerful relationships 26 | between words, but the resulting vectors are largely uninterpretable 27 | and don't represent documents. 
LDA on the other hand is quite 28 | interpretable by humans, but doesn't model local word relationships 29 | like word2vec. We build a model that learns both word and document 30 | topics, makes them interpretable, builds topics over clients, times, 31 | and documents, and allows those topics to be supervised. 32 | 33 | *Warning*: this code is a big series of experiments. It's research software, 34 | and we've tried to make it simple to modify lda2vec and to play around with 35 | your own custom topic models. However, it's still research software. 36 | I wouldn't run this in production or on Windows, and I'd only use it after you've 37 | decided both word2vec and LDA are inadequate and you'd like to tinker with your 38 | own cool models :) That said, I don't want to discourage experimentation: 39 | there's some limited documentation, a modicum of unit tests, and some 40 | interactive examples to get you started. 41 | 42 | 43 | Resources 44 | --------- 45 | See the research paper `Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec <https://arxiv.org/abs/1605.02019>`_ 46 | 47 | See this `Jupyter Notebook `_ 48 | for an example of an end-to-end demonstration. 49 | 50 | See this `slide deck `_ 51 | or this `youtube video `_ 52 | for a presentation focused on the benefits of word2vec, LDA, and lda2vec. 53 | 54 | See the `API reference docs `_ 55 | 56 | 57 | About 58 | ----- 59 | 60 | .. image:: images/img00_word2vec.png 61 | 62 | Word2vec tries to model word-to-word relationships. 63 | 64 | .. image:: images/img01_lda.png 65 | 66 | LDA models document-to-word relationships. 67 | 68 | .. image:: images/img02_lda_topics.png 69 | 70 | LDA yields topics over each document. 71 | 72 | .. image:: images/img03_lda2vec_topics01.png 73 | 74 | lda2vec yields topics not just over documents, but also over regions. 75 | 76 | .. image:: images/img04_lda2vec_topics02.png 77 | 78 | lda2vec also yields topics over clients. 79 | 80 | .. image:: images/img05_lda2vec_topics03_supervised.png 81 | 82 | In lda2vec, topics can be 'supervised' and forced to predict another target. 83 | 84 | lda2vec also includes more contexts and features than LDA. LDA dictates that 85 | words are generated by a document vector; but we might have all kinds of 86 | 'side-information' that should influence our topics. For example, a single 87 | client comment is about a particular item ID, written at a particular time 88 | and in a particular region. In this case, lda2vec gives you topics over all 89 | items (separating jeans from shirts, for example), times (winter versus summer), 90 | regions (desert versus coastal), and clients (sporty vs professional attire). 91 | 92 | Ultimately, the topics are interpreted using the excellent pyLDAvis library: 93 | 94 | .. image:: images/img06_pyldavis.gif 95 | 96 | 97 | Requirements 98 | ------------ 99 | 100 | I tested the twenty-newsgroups example with the following requirements: 101 | 102 | - Python 3.5.2 103 | - NumPy 1.16.0 104 | - Chainer 5.1.0 105 | - spaCy 1.9.0 106 | - pyxDamerauLevenshtein 1.5.2 107 | - pyLDAvis 2.1.2 108 | 109 | 110 | Requirements for some features: 111 | 112 | - CUDA support 113 | - Testing utilities: py.test 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | Add tests for target 2 | Add tests for global targets 3 | Add examples of specific documents to 20ng example 4 | Add better naming to categorical variables, e.g.
like target variables 5 | Keep track of doc counts between model serializations 6 | Add bigramming 7 | Add better README 8 | Add an example script for HN with doc id, client id, and predicted score 9 | Add super simple explanatory models 10 | Remove spacy dep 11 | Change EmbedMixture naming to possible values and n latent factors 12 | Print out topics while training 13 | Add doctests to lda2vec main classes 14 | Randomize chunking order on fit 15 | Add loss tracking and reporting classes to code 16 | Finish filling out docstrings 17 | Add multiple targets for one component 18 | Add convergence criterion 19 | 20 | Add docs on: 21 | Installation 22 | HN Tutorial 23 | Parse document into vector 24 | Setup LDA for document 25 | Measure perplexity 26 | Visualize topics 27 | Add supervised component 28 | Measure perplexity 29 | Visualize topics 30 | Add another component for time 31 | Measure perplexity 32 | Visualize topics 33 | Visualize topics, changing temperature 34 | Data formats 35 | Loose 36 | Compact 37 | Flat 38 | Contexts 39 | Categorical contexts 40 | Other contexts TBA 41 | Targets 42 | RMSE 43 | Logistic 44 | Softmax 45 | Advanced 46 | Options 47 | GPU 48 | Gradient Clipping 49 | Online learning, fraction argument 50 | Logging progress 51 | Perplexity 52 | Model saving, prediction 53 | Dropout fractions 54 | 55 | Nomenclature 56 | Categorical Feature 57 | Each category in set has n_possible_values 58 | Each feature has n_latent_factors 59 | Each feature has a single target 60 | Components 61 | Each component defines the total number of documents and the number of topics 62 | Each component may also have supervised targets 63 | 64 | Done: 65 | Add BoW mode 66 | Add logger 67 | Add fake data generator 68 | Add perplexity measurements 69 | Add tracking utility 70 | Add utilities for converting corpora 71 | Put license 72 | Add masks / skips / pads 73 | Add reindexing on the fly 74 | Convert docstrings to numpy format 75 | Implement corpus loose to dense and vice versa 76 | Add fit function for all data at once 77 | Add CI & coverage & license icons 78 | Add readthedocs support 79 | Add examples to CI 80 | Add dropout 81 | Change component naming to 'categorical feature' 82 | Add linear layers between input latent and output context 83 | Merge skipgram branch 84 | Add topic numbers to topic print out 85 | Try higher importance to the prior 86 | Change prob model to just model prob of word in topic 87 | Add word dropout 88 | Add an example script with 20 newsgroups -- LDA 89 | Add visualization for topic-word 90 | Implement skipgram contexts 91 | Prevent mixing between documents 92 | Add temperature to perplexity measurements 93 | Add temperature to viz 94 | Add model saving 95 | Add model predicting 96 | Hook up RTD to docstrings 97 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH.
If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/lda2vec.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/lda2vec.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/lda2vec" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/lda2vec" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 
173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | lda2vec package 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :glob: 7 | 8 | lda2vec/* 9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # lda2vec documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Jan 24 18:22:13 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | try: 18 | from unittest.mock import MagicMock 19 | except ImportError: 20 | from mock import Mock as MagicMock 21 | 22 | 23 | class Mock(MagicMock): 24 | @classmethod 25 | def __getattr__(cls, name): 26 | return Mock() 27 | 28 | 29 | MOCK_MODULES = ['sklearn', 'chainer', 'chainer.functions', 'chainer.links', 30 | 'chainer.optimizers', 'spacy', 'numpy', 'pyLDAvis', 31 | 'sklearn.linear_model', 'spacy.en', 'sklearn.datasets', 32 | 'numpy.random', 'spacy.attrs'] 33 | if os.environ.get('READTHEDOCS', None) == 'True': 34 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 35 | 36 | 37 | # If extensions (or modules to document with autodoc) are in another directory, 38 | # add these directories to sys.path here. If the directory is relative to the 39 | # documentation root, use os.path.abspath to make it absolute, like shown here. 40 | sys.path.insert(0, os.path.abspath('.')) 41 | sys.path.insert(0, os.path.abspath('../')) 42 | 43 | # -- General configuration ------------------------------------------------ 44 | 45 | # If your documentation needs a minimal Sphinx version, state it here. 46 | #needs_sphinx = '1.0' 47 | 48 | # Add any Sphinx extension module names here, as strings. They can be 49 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 50 | # ones. 51 | extensions = [ 52 | 'sphinx.ext.autodoc', 53 | 'sphinx.ext.doctest', 54 | 'sphinx.ext.mathjax', 55 | 'sphinx.ext.napoleon' 56 | ] 57 | 58 | # Add any paths that contain templates here, relative to this directory. 59 | templates_path = ['_templates'] 60 | 61 | # The suffix(es) of source filenames. 62 | # You can specify multiple suffix as a list of string: 63 | # source_suffix = ['.rst', '.md'] 64 | source_suffix = '.rst' 65 | 66 | # The encoding of source files. 
67 | #source_encoding = 'utf-8-sig' 68 | 69 | # The master toctree document. 70 | master_doc = 'index' 71 | 72 | # General information about the project. 73 | project = u'lda2vec' 74 | copyright = u'2016, Christopher Erick Moody' 75 | author = u'Christopher Erick Moody' 76 | 77 | # The version info for the project you're documenting, acts as replacement for 78 | # |version| and |release|, also used in various other places throughout the 79 | # built documents. 80 | # 81 | # The short X.Y version. 82 | version = u'0.01' 83 | # The full version, including alpha/beta/rc tags. 84 | release = u'0.01' 85 | 86 | # The language for content autogenerated by Sphinx. Refer to documentation 87 | # for a list of supported languages. 88 | # 89 | # This is also used if you do content translation via gettext catalogs. 90 | # Usually you set "language" from the command line for these cases. 91 | language = None 92 | 93 | # There are two options for replacing |today|: either, you set today to some 94 | # non-false value, then it is used: 95 | #today = '' 96 | # Else, today_fmt is used as the format for a strftime call. 97 | #today_fmt = '%B %d, %Y' 98 | 99 | # List of patterns, relative to source directory, that match files and 100 | # directories to ignore when looking for source files. 101 | exclude_patterns = ['_build'] 102 | 103 | # The reST default role (used for this markup: `text`) to use for all 104 | # documents. 105 | #default_role = None 106 | 107 | # If true, '()' will be appended to :func: etc. cross-reference text. 108 | #add_function_parentheses = True 109 | 110 | # If true, the current module name will be prepended to all description 111 | # unit titles (such as .. function::). 112 | #add_module_names = True 113 | 114 | # If true, sectionauthor and moduleauthor directives will be shown in the 115 | # output. They are ignored by default. 116 | #show_authors = False 117 | 118 | # The name of the Pygments (syntax highlighting) style to use. 119 | pygments_style = 'sphinx' 120 | 121 | # A list of ignored prefixes for module index sorting. 122 | #modindex_common_prefix = [] 123 | 124 | # If true, keep warnings as "system message" paragraphs in the built documents. 125 | #keep_warnings = False 126 | 127 | # If true, `todo` and `todoList` produce output, else they produce nothing. 128 | todo_include_todos = False 129 | 130 | 131 | # -- Options for HTML output ---------------------------------------------- 132 | 133 | # The theme to use for HTML and HTML Help pages. See the documentation for 134 | # a list of builtin themes. 135 | html_theme = 'sphinx_rtd_theme' 136 | 137 | # Theme options are theme-specific and customize the look and feel of a theme 138 | # further. For a list of options available for each theme, see the 139 | # documentation. 140 | #html_theme_options = {} 141 | 142 | # Add any paths that contain custom themes here, relative to this directory. 143 | #html_theme_path = [] 144 | 145 | # The name for this set of Sphinx documents. If None, it defaults to 146 | # " v documentation". 147 | #html_title = None 148 | 149 | # A shorter title for the navigation bar. Default is the same as html_title. 150 | #html_short_title = None 151 | 152 | # The name of an image file (relative to this directory) to place at the top 153 | # of the sidebar. 154 | #html_logo = None 155 | 156 | # The name of an image file (within the static path) to use as favicon of the 157 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 158 | # pixels large. 
159 | #html_favicon = None 160 | 161 | # Add any paths that contain custom static files (such as style sheets) here, 162 | # relative to this directory. They are copied after the builtin static files, 163 | # so a file named "default.css" will overwrite the builtin "default.css". 164 | html_static_path = ['_static'] 165 | 166 | # Add any extra paths that contain custom files (such as robots.txt or 167 | # .htaccess) here, relative to this directory. These files are copied 168 | # directly to the root of the documentation. 169 | #html_extra_path = [] 170 | 171 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 172 | # using the given strftime format. 173 | #html_last_updated_fmt = '%b %d, %Y' 174 | 175 | # If true, SmartyPants will be used to convert quotes and dashes to 176 | # typographically correct entities. 177 | #html_use_smartypants = True 178 | 179 | # Custom sidebar templates, maps document names to template names. 180 | #html_sidebars = {} 181 | 182 | # Additional templates that should be rendered to pages, maps page names to 183 | # template names. 184 | #html_additional_pages = {} 185 | 186 | # If false, no module index is generated. 187 | #html_domain_indices = True 188 | 189 | # If false, no index is generated. 190 | #html_use_index = True 191 | 192 | # If true, the index is split into individual pages for each letter. 193 | #html_split_index = False 194 | 195 | # If true, links to the reST sources are added to the pages. 196 | #html_show_sourcelink = True 197 | 198 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 199 | #html_show_sphinx = True 200 | 201 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 202 | #html_show_copyright = True 203 | 204 | # If true, an OpenSearch description file will be output, and all pages will 205 | # contain a tag referring to it. The value of this option must be the 206 | # base URL from which the finished HTML is served. 207 | #html_use_opensearch = '' 208 | 209 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 210 | #html_file_suffix = None 211 | 212 | # Language to be used for generating the HTML full-text search index. 213 | # Sphinx supports the following languages: 214 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 215 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 216 | #html_search_language = 'en' 217 | 218 | # A dictionary with options for the search language support, empty by default. 219 | # Now only 'ja' uses this config value 220 | #html_search_options = {'type': 'default'} 221 | 222 | # The name of a javascript file (relative to the configuration directory) that 223 | # implements a search results scorer. If empty, the default will be used. 224 | #html_search_scorer = 'scorer.js' 225 | 226 | # Output file base name for HTML help builder. 227 | htmlhelp_basename = 'lda2vecdoc' 228 | 229 | # -- Options for LaTeX output --------------------------------------------- 230 | 231 | latex_elements = { 232 | # The paper size ('letterpaper' or 'a4paper'). 233 | #'papersize': 'letterpaper', 234 | 235 | # The font size ('10pt', '11pt' or '12pt'). 236 | #'pointsize': '10pt', 237 | 238 | # Additional stuff for the LaTeX preamble. 239 | #'preamble': '', 240 | 241 | # Latex figure (float) alignment 242 | #'figure_align': 'htbp', 243 | } 244 | 245 | # Grouping the document tree into LaTeX files. List of tuples 246 | # (source start file, target name, title, 247 | # author, documentclass [howto, manual, or own class]). 
248 | latex_documents = [ 249 | (master_doc, 'lda2vec.tex', u'lda2vec Documentation', 250 | u'Christopher Erick Moody', 'manual'), 251 | ] 252 | 253 | # The name of an image file (relative to this directory) to place at the top of 254 | # the title page. 255 | #latex_logo = None 256 | 257 | # For "manual" documents, if this is true, then toplevel headings are parts, 258 | # not chapters. 259 | #latex_use_parts = False 260 | 261 | # If true, show page references after internal links. 262 | #latex_show_pagerefs = False 263 | 264 | # If true, show URL addresses after external links. 265 | #latex_show_urls = False 266 | 267 | # Documents to append as an appendix to all manuals. 268 | #latex_appendices = [] 269 | 270 | # If false, no module index is generated. 271 | #latex_domain_indices = True 272 | 273 | 274 | # -- Options for manual page output --------------------------------------- 275 | 276 | # One entry per manual page. List of tuples 277 | # (source start file, name, description, authors, manual section). 278 | man_pages = [ 279 | (master_doc, 'lda2vec', u'lda2vec Documentation', 280 | [author], 1) 281 | ] 282 | 283 | # If true, show URL addresses after external links. 284 | #man_show_urls = False 285 | 286 | 287 | # -- Options for Texinfo output ------------------------------------------- 288 | 289 | # Grouping the document tree into Texinfo files. List of tuples 290 | # (source start file, target name, title, author, 291 | # dir menu entry, description, category) 292 | texinfo_documents = [ 293 | (master_doc, 'lda2vec', u'lda2vec Documentation', 294 | author, 'lda2vec', 'One line description of project.', 295 | 'Miscellaneous'), 296 | ] 297 | 298 | # Documents to append as an appendix to all manuals. 299 | #texinfo_appendices = [] 300 | 301 | # If false, no module index is generated. 302 | #texinfo_domain_indices = True 303 | 304 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 305 | #texinfo_show_urls = 'footnote' 306 | 307 | # If true, do not generate a @detailmenu in the "Top" node's menu. 308 | #texinfo_no_detailmenu = False 309 | 310 | 311 | source_suffix = ['.rst'] 312 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ============================================== 2 | lda2vec -- flexible & interpretable NLP models 3 | ============================================== 4 | 5 | This is the documentation for lda2vec, a framework for useful 6 | flexible and interpretable NLP models. 7 | 8 | Defining the model is simple and quick:: 9 | 10 | model = LDA2Vec(n_words, max_length, n_hidden, counts) 11 | model.add_component(n_docs, n_topics, name='document id') 12 | model.fit(clean, components=[doc_ids]) 13 | 14 | While visualizing the feature is similarly straightforward:: 15 | 16 | topics = model.prepare_topics('document_id', vocab) 17 | prepared = pyLDAvis.prepare(topics) 18 | pyLDAvis.display(prepared) 19 | 20 | Resources 21 | --------- 22 | See this `Jupyter Notebook `_ 23 | for an example of an end-to-end demonstration. 24 | 25 | See this `presentation `_ 26 | for a presentation focused on the benefits of word2vec, LDA, and lda2vec. 27 | 28 | See the `API reference docs `_ 29 | 30 | See the `GitHub repo `_ 31 | 32 | API 33 | === 34 | .. 
toctree:: 35 | 36 | api 37 | 38 | 39 | Indices and tables 40 | ================== 41 | 42 | * :ref:`genindex` 43 | * :ref:`modindex` 44 | * :ref:`search` 45 | -------------------------------------------------------------------------------- /docs/lda2vec/corpus.rst: -------------------------------------------------------------------------------- 1 | lda2vec.corpus module 2 | --------------------- 3 | 4 | .. automodule:: lda2vec.corpus 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/dirichlet_likelihood.rst: -------------------------------------------------------------------------------- 1 | lda2vec.dirichlet_likelihood module 2 | ----------------------------------- 3 | 4 | .. automodule:: lda2vec.dirichlet_likelihood 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/embed_mixture.rst: -------------------------------------------------------------------------------- 1 | lda2vec.embed_mixture module 2 | ---------------------------- 3 | 4 | .. automodule:: lda2vec.embed_mixture 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/fake_data.rst: -------------------------------------------------------------------------------- 1 | lda2vec.fake_data module 2 | ------------------------ 3 | 4 | .. automodule:: lda2vec.fake_data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/lda2vec.rst: -------------------------------------------------------------------------------- 1 | lda2vec.lda2vec module 2 | ---------------------- 3 | 4 | .. automodule:: lda2vec.lda2vec 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/preprocess.rst: -------------------------------------------------------------------------------- 1 | lda2vec.preprocess module 2 | ------------------------- 3 | 4 | .. automodule:: lda2vec.preprocess 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/tracking.rst: -------------------------------------------------------------------------------- 1 | lda2vec.tracking module 2 | ----------------------- 3 | 4 | .. automodule:: lda2vec.tracking 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. 
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 
128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\lda2vec.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\lda2vec.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 
244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /examples/hacker_news/README.md: -------------------------------------------------------------------------------- 1 | # Hacker News Comments with lda2vec example 2 | This example trains a multi-component lda2vec model on a corpus of Hacker News 3 | comments. The goal is to model how Hacker News stories have changed in time, 4 | how they correlate with the number of comments posted, and what individual 5 | commenter topics are. 6 | 7 | ### Running the model 8 | 9 | To run this example, first run `preprocess.py` which will download the Hacker 10 | News comments CSV, tokenize it, and quickly build a vocabulary. Once finished, 11 | it saves the training data to file. 12 | 13 | Then run `model.py` which will train the lda2vec model. 14 | 15 | Finally, `visualize.py` helps the human interpret what the topics mean. 16 | 17 | ### The HN Comment Data 18 | 19 | The corpus has been slightly filtered. We've removed comments made by 20 | infrequent users (e.g. having fewer than 10 comments ever) and removed stories 21 | with fewer than 10 comments. The training corpus is available at 22 | [Zenodo](https://zenodo.org/record/45901#.Vrv5jJMrLMU). 23 | 24 | ### Preparing the HN Comment Data 25 | 26 | You shouldn't need to repeat any of the Google BigQuery work. 
If you would like 27 | to nevertheless, the rough steps are outlined below: 28 | 29 | The raw HN data is available on Google BigQuery, see for example these resources: 30 | 31 | - Previous analysis on this [dataset](https://github.com/fhoffa/notebooks/blob/master/analyzing%20hacker%20news.ipynb) 32 | 33 | - Dataset [shared here](https://bigquery.cloud.google.com/table/fh-bigquery:hackernews.comments) 34 | 35 | #### Data Preparation 36 | 37 | #### Query 1 38 | 39 |     SELECT p0.id AS id 40 |     , p0.text as text 41 |     , p0.author AS author 42 |     , p0.ranking AS ranking 43 |     , p0.time 44 |     , p0.time_ts 45 |     , COALESCE(p7.parent, p6.parent, p5.parent, p4.parent, p3.parent, p2.parent, p1.parent, p0.parent) story_id 46 |     , GREATEST( IF(p7.parent IS null, -1, 7) 47 |     , IF(p6.parent IS null, -1, 6) 48 |     , IF(p5.parent IS null, -1, 5) 49 |     , IF(p4.parent IS null, -1, 4) 50 |     , IF(p3.parent IS null, -1, 3) 51 |     , IF(p2.parent IS null, -1, 2) 52 |     , IF(p1.parent IS null, -1, 1) 53 |     , 0) level 54 |     FROM [fh-bigquery:hackernews.comments] p0 55 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p1 ON p1.id=p0.parent 56 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p2 ON p2.id=p1.parent 57 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p3 ON p3.id=p2.parent 58 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p4 ON p4.id=p3.parent 59 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p5 ON p5.id=p4.parent 60 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p6 ON p6.id=p5.parent 61 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p7 ON p7.id=p6.parent 62 |     WHERE p0.deleted IS NULL 63 |     AND p0.dead IS NULL 64 |     AND LENGTH(p0.text) > 5 65 |     HAVING level = 0 66 | 67 | #### Query 2 68 | 69 |     SELECT s.id AS story_id 70 |     , s.time AS story_time 71 |     , s.url AS story_url 72 |     , s.text AS story_text 73 |     , s.author AS story_author 74 |     , c.id AS comment_id 75 |     , c.text AS comment_text 76 |     , c.author AS comment_author 77 |     , c.ranking as comment_ranking 78 |     , author_counts.n_comments AS author_comment_count 79 |     , story_counts.n_comments AS story_comment_count 80 |     FROM [lda2vec-v02:data.comment_to_story_id] c 81 |     JOIN (SELECT story_id 82 |     , COUNT(story_id) AS n_comments 83 |     FROM [lda2vec-v02:data.comment_to_story_id] 84 |     GROUP BY story_id 85 |     ) AS story_counts 86 |     ON c.story_id = story_counts.story_id 87 |     JOIN (SELECT author 88 |     , COUNT(author) AS n_comments 89 |     FROM [lda2vec-v02:data.comment_to_story_id] 90 |     GROUP BY author 91 |     ) AS author_counts 92 |     ON c.author = author_counts.author 93 |     JOIN [fh-bigquery:hackernews.stories] s 94 |     ON s.id = c.story_id 95 |     WHERE story_counts.n_comments > 10 96 |     AND author_counts.n_comments > 10 97 | -------------------------------------------------------------------------------- /examples/hacker_news/data/preprocess.py: -------------------------------------------------------------------------------- 1 | # Author: Chris Moody 2 | # License: MIT 3 | 4 | # This example loads a large 800MB Hacker News comments dataset 5 | # and preprocesses it. This can take a few hours, and a lot of 6 | # memory, so please be patient!
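# Note: as written, this script uses Python 2 idioms (print statements and the cPickle module); running it under Python 3 would mean swapping in the built-in pickle module and print() calls.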
7 | 8 | from lda2vec import preprocess, Corpus 9 | import numpy as np 10 | import pandas as pd 11 | import logging 12 | import cPickle as pickle 13 | import os.path 14 | 15 | logging.basicConfig() 16 | 17 | max_length = 250 # Limit of 250 words per comment 18 | min_author_comments = 50 # Exclude authors with fewer comments 19 | nrows = None # Number of rows of file to read; None reads in full file 20 | 21 | fn = "hacker_news_comments.csv" 22 | url = "https://zenodo.org/record/45901/files/hacker_news_comments.csv" 23 | if not os.path.exists(fn): 24 | import requests 25 | response = requests.get(url, stream=True, timeout=2400) 26 | with open(fn, 'w') as fh: 27 | # Iterate over 1MB chunks 28 | for data in response.iter_content(1024**2): 29 | fh.write(data) 30 | 31 | 32 | features = [] 33 | # Convert to unicode (spaCy only works with unicode) 34 | features = pd.read_csv(fn, encoding='utf8', nrows=nrows) 35 | # Convert all integer arrays to int32 36 | for col, dtype in zip(features.columns, features.dtypes): 37 | if dtype is np.dtype('int64'): 38 | features[col] = features[col].astype('int32') 39 | 40 | # Tokenize the texts 41 | # If this fails it's likely spacy. Install a recent spacy version. 42 | # Only the most recent versions have tokenization of noun phrases 43 | # I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a32ff 44 | # Also try running python -m spacy.en.download all --force 45 | texts = features.pop('comment_text').values 46 | tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4, 47 | merge=True) 48 | del texts 49 | 50 | # Make a ranked list of rare vs frequent words 51 | corpus = Corpus() 52 | corpus.update_word_count(tokens) 53 | corpus.finalize() 54 | 55 | # The tokenization uses spaCy indices, and so may have gaps 56 | # between indices for words that aren't present in our dataset. 
57 | # This builds a new compact index 58 | compact = corpus.to_compact(tokens) 59 | # Remove extremely rare words 60 | pruned = corpus.filter_count(compact, min_count=10) 61 | # Words tend to have power law frequency, so selectively 62 | # downsample the most prevalent words 63 | clean = corpus.subsample_frequent(pruned) 64 | print "n_words", np.unique(clean).max() 65 | 66 | # Extract numpy arrays over the fields we want covered by topics 67 | # Convert to categorical variables 68 | author_counts = features['comment_author'].value_counts() 69 | to_remove = author_counts[author_counts < min_author_comments].index 70 | mask = features['comment_author'].isin(to_remove).values 71 | author_name = features['comment_author'].values.copy() 72 | author_name[mask] = 'infrequent_author' 73 | features['comment_author'] = author_name 74 | authors = pd.Categorical(features['comment_author']) 75 | author_id = authors.codes 76 | author_name = authors.categories 77 | story_id = pd.Categorical(features['story_id']).codes 78 | # Chop timestamps into days 79 | story_time = pd.to_datetime(features['story_time'], unit='s') 80 | days_since = (story_time - story_time.min()) / pd.Timedelta('1 day') 81 | time_id = days_since.astype('int32') 82 | features['story_id_codes'] = story_id 83 | features['author_id_codes'] = story_id 84 | features['time_id_codes'] = time_id 85 | 86 | print "n_authors", author_id.max() 87 | print "n_stories", story_id.max() 88 | print "n_times", time_id.max() 89 | 90 | # Extract outcome supervised features 91 | ranking = features['comment_ranking'].values 92 | score = features['story_comment_count'].values 93 | 94 | # Now flatten a 2D array of document per row and word position 95 | # per column to a 1D array of words. This will also remove skips 96 | # and OoV words 97 | feature_arrs = (story_id, author_id, time_id, ranking, score) 98 | flattened, features_flat = corpus.compact_to_flat(pruned, *feature_arrs) 99 | # Flattened feature arrays 100 | (story_id_f, author_id_f, time_id_f, ranking_f, score_f) = features_flat 101 | 102 | # Save the data 103 | pickle.dump(corpus, open('corpus', 'w'), protocol=2) 104 | pickle.dump(vocab, open('vocab', 'w'), protocol=2) 105 | features.to_pickle('features.pd') 106 | data = dict(flattened=flattened, story_id=story_id_f, author_id=author_id_f, 107 | time_id=time_id_f, ranking=ranking_f, score=score_f, 108 | author_name=author_name, author_index=author_id) 109 | np.savez('data', **data) 110 | np.save(open('tokens', 'w'), tokens) 111 | -------------------------------------------------------------------------------- /examples/hacker_news/lda2vec/lda2vec_model.py: -------------------------------------------------------------------------------- 1 | from lda2vec import EmbedMixture 2 | from lda2vec import dirichlet_likelihood 3 | from lda2vec.utils import move 4 | 5 | from chainer import Chain 6 | import chainer.links as L 7 | import chainer.functions as F 8 | 9 | import numpy as np 10 | 11 | 12 | class LDA2Vec(Chain): 13 | def __init__(self, n_stories=100, n_story_topics=10, 14 | n_authors=100, n_author_topics=10, 15 | n_units=256, n_vocab=1000, dropout_ratio=0.5, train=True, 16 | counts=None, n_samples=15, word_dropout_ratio=0.0): 17 | em1 = EmbedMixture(n_stories, n_story_topics, n_units, 18 | dropout_ratio=dropout_ratio) 19 | em2 = EmbedMixture(n_authors, n_author_topics, n_units, 20 | dropout_ratio=dropout_ratio) 21 | kwargs = {} 22 | kwargs['mixture_sty'] = em1 23 | kwargs['mixture_aut'] = em2 24 | kwargs['sampler'] = L.NegativeSampling(n_units, counts, 
n_samples) 25 | super(LDA2Vec, self).__init__(**kwargs) 26 | rand = np.random.random(self.sampler.W.data.shape) 27 | self.sampler.W.data[:, :] = rand[:, :] 28 | self.n_units = n_units 29 | self.train = train 30 | self.dropout_ratio = dropout_ratio 31 | self.word_dropout_ratio = word_dropout_ratio 32 | self.n_samples = n_samples 33 | 34 | def prior(self): 35 | dl1 = dirichlet_likelihood(self.mixture_sty.weights) 36 | dl2 = dirichlet_likelihood(self.mixture_aut.weights) 37 | return dl1 + dl2 38 | 39 | def fit_partial(self, rsty_ids, raut_ids, rwrd_ids, window=5): 40 | sty_ids, aut_ids, wrd_ids = move(self.xp, rsty_ids, raut_ids, rwrd_ids) 41 | pivot_idx = next(move(self.xp, rwrd_ids[window: -window])) 42 | pivot = F.embed_id(pivot_idx, self.sampler.W) 43 | sty_at_pivot = rsty_ids[window: -window] 44 | aut_at_pivot = raut_ids[window: -window] 45 | sty = self.mixture_sty(next(move(self.xp, sty_at_pivot))) 46 | aut = self.mixture_aut(next(move(self.xp, aut_at_pivot))) 47 | loss = 0.0 48 | start, end = window, rwrd_ids.shape[0] - window 49 | context = sty + aut + F.dropout(pivot, self.dropout_ratio) 50 | for frame in range(-window, window + 1): 51 | # Skip predicting the current pivot 52 | if frame == 0: 53 | continue 54 | # Predict word given context and pivot word 55 | # The target starts before the pivot 56 | targetidx = rwrd_ids[start + frame: end + frame] 57 | sty_at_target = rsty_ids[start + frame: end + frame] 58 | aut_at_target = raut_ids[start + frame: end + frame] 59 | sty_is_same = sty_at_target == sty_at_pivot 60 | aut_is_same = aut_at_target == aut_at_pivot 61 | # Randomly dropout words (default is to never do this) 62 | rand = np.random.uniform(0, 1, sty_is_same.shape[0]) 63 | mask = (rand > self.word_dropout_ratio).astype('bool') 64 | sty_and_aut_are_same = np.logical_and(sty_is_same, aut_is_same) 65 | weight = np.logical_and(sty_and_aut_are_same, mask).astype('int32') 66 | # If weight is 1.0 then targetidx 67 | # If weight is 0.0 then -1 68 | targetidx = targetidx * weight + -1 * (1 - weight) 69 | target, = move(self.xp, targetidx) 70 | loss = self.sampler(context, target) 71 | loss.backward() 72 | return loss.data 73 | -------------------------------------------------------------------------------- /examples/hacker_news/lda2vec/lda2vec_run.py: -------------------------------------------------------------------------------- 1 | # Author: Chris Moody 2 | # License: MIT 3 | 4 | # This simple example loads the newsgroups data from sklearn 5 | # and train an LDA-like model on it 6 | import os.path 7 | import pickle 8 | import time 9 | 10 | import chainer 11 | from chainer import cuda 12 | from chainer import serializers 13 | import chainer.optimizers as O 14 | import numpy as np 15 | 16 | from lda2vec import utils 17 | from lda2vec import prepare_topics, print_top_words_per_topic 18 | from lda2vec_model import LDA2Vec 19 | 20 | gpu_id = int(os.getenv('CUDA_GPU', 0)) 21 | cuda.get_device(gpu_id).use() 22 | print "Using GPU " + str(gpu_id) 23 | 24 | # You must run preprocess.py before this data becomes available 25 | vocab = pickle.load(open('../data/vocab', 'r')) 26 | corpus = pickle.load(open('../data/corpus', 'r')) 27 | data = np.load(open('../data/data.npz', 'r')) 28 | flattened = data['flattened'] 29 | story_id = data['story_id'] 30 | author_id = data['author_id'] 31 | time_id = data['time_id'] 32 | ranking = data['ranking'].astype('float32') 33 | score = data['score'].astype('float32') 34 | 35 | 36 | # Model Parameters 37 | # Number of documents 38 | n_stories = story_id.max() + 
1 39 | # Number of users 40 | n_authors = author_id.max() + 1 41 | # Number of unique words in the vocabulary 42 | n_vocab = flattened.max() + 1 43 | # Number of dimensions in a single word vector 44 | n_units = 256 45 | # Number of topics to fit 46 | n_story_topics = 40 47 | n_author_topics = 20 48 | batchsize = 4096 49 | # Get the string representation for every compact key 50 | words = corpus.word_list(vocab)[:n_vocab] 51 | 52 | # How many tokens are in each story 53 | sty_idx, lengths = np.unique(story_id, return_counts=True) 54 | sty_len = np.zeros(sty_idx.max() + 1, dtype='int32') 55 | sty_len[sty_idx] = lengths 56 | 57 | # How many tokens are in each author 58 | aut_idx, lengths = np.unique(author_id, return_counts=True) 59 | aut_len = np.zeros(aut_idx.max() + 1, dtype='int32') 60 | aut_len[aut_idx] = lengths 61 | 62 | # Count all token frequencies 63 | tok_idx, freq = np.unique(flattened, return_counts=True) 64 | term_frequency = np.zeros(n_vocab, dtype='int32') 65 | term_frequency[tok_idx] = freq 66 | 67 | model = LDA2Vec(n_stories=n_stories, n_story_topics=n_story_topics, 68 | n_authors=n_authors, n_author_topics=n_author_topics, 69 | n_units=n_units, n_vocab=n_vocab, counts=term_frequency, 70 | n_samples=15) 71 | if os.path.exists('lda2vec.hdf5'): 72 | print "Reloading from saved" 73 | serializers.load_hdf5("lda2vec.hdf5", model) 74 | model.to_gpu() 75 | optimizer = O.Adam() 76 | optimizer.setup(model) 77 | clip = chainer.optimizer.GradientClipping(5.0) 78 | optimizer.add_hook(clip) 79 | 80 | j = 0 81 | epoch = 0 82 | fraction = batchsize * 1.0 / flattened.shape[0] 83 | for epoch in range(5000): 84 | ts = prepare_topics(cuda.to_cpu(model.mixture_sty.weights.W.data).copy(), 85 | cuda.to_cpu(model.mixture_sty.factors.W.data).copy(), 86 | cuda.to_cpu(model.sampler.W.data).copy(), 87 | words) 88 | print_top_words_per_topic(ts) 89 | ts['doc_lengths'] = sty_len 90 | ts['term_frequency'] = term_frequency 91 | np.savez('topics.story.pyldavis', **ts) 92 | ta = prepare_topics(cuda.to_cpu(model.mixture_aut.weights.W.data).copy(), 93 | cuda.to_cpu(model.mixture_aut.factors.W.data).copy(), 94 | cuda.to_cpu(model.sampler.W.data).copy(), 95 | words) 96 | print_top_words_per_topic(ta) 97 | ta['doc_lengths'] = aut_len 98 | ta['term_frequency'] = term_frequency 99 | np.savez('topics.author.pyldavis', **ta) 100 | for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened): 101 | t0 = time.time() 102 | optimizer.zero_grads() 103 | l = model.fit_partial(s.copy(), a.copy(), f.copy()) 104 | prior = model.prior() 105 | loss = prior * fraction 106 | loss.backward() 107 | optimizer.update() 108 | msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} " 109 | "P:{prior:1.3e} R:{rate:1.3e}") 110 | prior.to_cpu() 111 | loss.to_cpu() 112 | t1 = time.time() 113 | dt = t1 - t0 114 | rate = batchsize / dt 115 | logs = dict(loss=float(l), epoch=epoch, j=j, 116 | prior=float(prior.data), rate=rate) 117 | print msg.format(**logs) 118 | j += 1 119 | serializers.save_hdf5("lda2vec.hdf5", model) 120 | -------------------------------------------------------------------------------- /examples/hacker_news/lda2vec/word_vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Vector Math" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this notebook we'll demo that word2vec-like properties are kept. 
You can download the vectors, follow along at home, and make your own queries if you'd like.\n", 15 | "\n", 16 | "Sums:\n", 17 | "\n", 18 | "1. `silicon valley ~ california + technology` \n", 19 | "2. `uber ~ taxis + company`\n", 20 | "3. `baidu ~ china + search engine`\n", 21 | "\n", 22 | "Analogies:\n", 23 | "\n", 24 | "1. `Mark Zuckerberg - Facebook + Amazon = Jeff Bezos`\n", 25 | "1. `Hacker News - story + article = StackOverflow`\n", 26 | "1. `VIM - terminal + graphics = Photoshop`\n", 27 | "\n", 28 | "And slightly more whimsically:\n", 29 | "\n", 30 | "1. `vegeables - eat + drink = tea`\n", 31 | "2. `scala - features + simple = haskell`" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 37, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "--2016-04-17 12:56:06-- https://zenodo.org/record/49903/files/vocab.npy\n", 46 | "Resolving zenodo.org (zenodo.org)... 188.184.66.202\n", 47 | "Connecting to zenodo.org (zenodo.org)|188.184.66.202|:443... connected.\n", 48 | "HTTP request sent, awaiting response... 200 OK\n", 49 | "Length: 81754640 (78M) [application/octet-stream]\n", 50 | "Saving to: ‘vocab.npy’\n", 51 | "\n", 52 | "vocab.npy 100%[=====================>] 77.97M 9.21MB/s in 23s \n", 53 | "\n", 54 | "2016-04-17 12:56:32 (3.37 MB/s) - ‘vocab.npy’ saved [81754640/81754640]\n", 55 | "\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "!wget https://zenodo.org/record/49903/files/vocab.npy" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 36, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "--2016-04-17 12:55:41-- https://zenodo.org/record/49903/files/word_vectors.npy\n", 75 | "Resolving zenodo.org (zenodo.org)... 188.184.66.202\n", 76 | "Connecting to zenodo.org (zenodo.org)|188.184.66.202|:443... connected.\n", 77 | "HTTP request sent, awaiting response... 200 OK\n", 78 | "Length: 116273232 (111M) [application/octet-stream]\n", 79 | "Saving to: ‘word_vectors.npy’\n", 80 | "\n", 81 | "word_vectors.npy 100%[=====================>] 110.89M 6.64MB/s in 21s \n", 82 | "\n", 83 | "2016-04-17 12:56:06 (5.31 MB/s) - ‘word_vectors.npy’ saved [116273232/116273232]\n", 84 | "\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "!wget https://zenodo.org/record/49903/files/word_vectors.npy" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "You don't need to run the code below unless you've trained your own model. Otherwise, just download the word vectors from the URL above." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 32, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "#from lda2vec_model import LDA2Vec\n", 108 | "#from chainer import serializers\n", 109 | "#import numpy as np\n", 110 | "#import pandas as pd\n", 111 | "#import pickle\n", 112 | "#\n", 113 | "#features = pd.read_pickle(\"../data/features.pd\")\n", 114 | "#vocab = np.load(\"../data/vocab\")\n", 115 | "#npz = np.load(open('topics.story.pyldavis.npz', 'r'))\n", 116 | "#dat = {k: v for (k, v) in npz.iteritems()}\n", 117 | "#vocab = dat['vocab'].tolist()\n", 118 | "#dat = np.load(\"../data/data.npz\")\n", 119 | "#n_stories = features.story_id_codes.max() + 1\n", 120 | "#n_units = 256\n", 121 | "#n_vocab = dat['flattened'].max() + 1\n", 122 | "#model = LDA2Vec(n_stories=n_stories, n_story_topics=40,\n", 123 | "# n_authors=5664, n_author_topics=20,\n", 124 | "# n_units=n_units, n_vocab=n_vocab, counts=np.zeros(n_vocab),\n", 125 | "# n_samples=15)\n", 126 | "#serializers.load_hdf5(\"/home/chris/lda2vec-12/examples/hacker_news/lda2vec/lda2vec.hdf5\", model)\n", 127 | "#np.save(\"word_vectors\", model.sampler.W.data)\n", 128 | "#np.save(\"vocab\", vocab)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 2, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "import numpy as np\n", 140 | "word_vectors_raw = np.load(\"word_vectors.npy\")\n", 141 | "vocab = np.load(\"vocab.npy\").tolist()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "L2 Normalize the word vectors" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 15, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "word_vectors = word_vectors_raw / np.linalg.norm(word_vectors_raw, axis=-1)[:, None]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 16, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "def get_vector(token):\n", 171 | " index = vocab.index(token)\n", 172 | " return word_vectors[index, :].copy()\n", 173 | "\n", 174 | "def most_similar(token, n=20):\n", 175 | " word_vector = get_vector(token)\n", 176 | " similarities = np.dot(word_vectors, word_vector)\n", 177 | " top = np.argsort(similarities)[::-1][:n]\n", 178 | " return [vocab[i] for i in top]\n", 179 | "\n", 180 | "# This is Levy & Goldberg's 3Cosmul Metric\n", 181 | "# Based on the Gensim implementation: https://github.com/piskvorky/gensim/blob/master/gensim/models/word2vec.py\n", 182 | "def cosmul(positives, negatives, topn=20):\n", 183 | " positive = [get_vector(p) for p in positives]\n", 184 | " negative = [get_vector(n) for n in negatives]\n", 185 | " pos_dists = [((1 + np.dot(word_vectors, term)) / 2.) for term in positive]\n", 186 | " neg_dists = [((1 + np.dot(word_vectors, term)) / 2.) 
for term in negative]\n", 187 | " dists = np.prod(pos_dists, axis=0) / (np.prod(neg_dists, axis=0) + 1e-6)\n", 188 | " idxs = np.argsort(dists)[::-1][:topn]\n", 189 | " return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]\n", 190 | "def most_similar_posneg(positives, negatives, topn=20):\n", 191 | " positive = np.sum([get_vector(p) for p in positives], axis=0)\n", 192 | " negative = np.sum([get_vector(n) for n in negatives], axis=0)\n", 193 | " vector = positive - negative\n", 194 | " dists = np.dot(word_vectors, vector)\n", 195 | " idxs = np.argsort(dists)[::-1][:topn]\n", 196 | " return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 17, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "[u'san francisco',\n", 210 | " u'new york',\n", 211 | " u'nyc',\n", 212 | " u'palo alto',\n", 213 | " u'mountain view',\n", 214 | " u'boston',\n", 215 | " u'seattle',\n", 216 | " u'sf',\n", 217 | " u'los angeles',\n", 218 | " u'new york city',\n", 219 | " u'london',\n", 220 | " u'ny',\n", 221 | " u'brooklyn',\n", 222 | " u'chicago',\n", 223 | " u'austin',\n", 224 | " u'atlanta',\n", 225 | " u'portland',\n", 226 | " u'san jose',\n", 227 | " u'san mateo',\n", 228 | " u'sunnyvale']" 229 | ] 230 | }, 231 | "execution_count": 17, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "most_similar('san francisco')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 18, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "[u'silicon valley',\n", 251 | " u'in',\n", 252 | " u'new york',\n", 253 | " u'u.s.',\n", 254 | " u'west',\n", 255 | " u'tech',\n", 256 | " u'usa',\n", 257 | " u'san francisco',\n", 258 | " u'japan',\n", 259 | " u'america',\n", 260 | " u'dc',\n", 261 | " u'industry',\n", 262 | " u'canada',\n", 263 | " u'new york city',\n", 264 | " u'nyc',\n", 265 | " u'area',\n", 266 | " u'valley',\n", 267 | " u'china']" 268 | ] 269 | }, 270 | "execution_count": 18, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "cosmul(['california', 'technology'], [], topn=20)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 19, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "[u'currencies',\n", 290 | " u'bitcoin',\n", 291 | " u'goods',\n", 292 | " u'physical',\n", 293 | " u'gold',\n", 294 | " u'fiat',\n", 295 | " u'trading',\n", 296 | " u'cryptocurrency',\n", 297 | " u'bitcoins',\n", 298 | " u'electronic',\n", 299 | " u'analog',\n", 300 | " u'transfers',\n", 301 | " u'banking',\n", 302 | " u'commodity',\n", 303 | " u'mining',\n", 304 | " u'virtual currency',\n", 305 | " u'other currencies',\n", 306 | " u'media']" 307 | ] 308 | }, 309 | "execution_count": 19, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "cosmul(['digital', 'currency'], [], topn=20)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 20, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "[u'vim',\n", 329 | " u'emacs',\n", 330 | " u'editor',\n", 331 | " u'sublime',\n", 332 | 
" u'tmux',\n", 333 | " u'shell',\n", 334 | " u'iterm',\n", 335 | " u'vi',\n", 336 | " u'ide',\n", 337 | " u'debugger',\n", 338 | " u'latex',\n", 339 | " u'gui',\n", 340 | " u'gvim',\n", 341 | " u'notepad',\n", 342 | " u'eclipse',\n", 343 | " u'command line',\n", 344 | " u'terminal.app',\n", 345 | " u'window manager']" 346 | ] 347 | }, 348 | "execution_count": 20, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "cosmul(['text editor', 'terminal'], [], topn=20)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 35, 360 | "metadata": { 361 | "collapsed": false 362 | }, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "[u'russia',\n", 368 | " u'india',\n", 369 | " u'japan',\n", 370 | " u'africa',\n", 371 | " u'korea',\n", 372 | " u'germany',\n", 373 | " u'other countries',\n", 374 | " u'asia',\n", 375 | " u'ukraine',\n", 376 | " u'iran',\n", 377 | " u'brazil',\n", 378 | " u'israel',\n", 379 | " u'usa',\n", 380 | " u'vietnam',\n", 381 | " u'france',\n", 382 | " u'countries',\n", 383 | " u'south korea',\n", 384 | " u'hong kong',\n", 385 | " u'europe']" 386 | ] 387 | }, 388 | "execution_count": 35, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "cosmul(['china'], [], topn=20)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 21, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "[u'baidu',\n", 408 | " u'google',\n", 409 | " u'google search',\n", 410 | " u'india',\n", 411 | " u'russia',\n", 412 | " u'japan',\n", 413 | " u'iran',\n", 414 | " u'country',\n", 415 | " u'yandex',\n", 416 | " u'africa',\n", 417 | " u'duckduckgo',\n", 418 | " u'south korea',\n", 419 | " u'bing',\n", 420 | " u'france',\n", 421 | " u'beijing',\n", 422 | " u'hong kong',\n", 423 | " u'great firewall',\n", 424 | " u'search engines']" 425 | ] 426 | }, 427 | "execution_count": 21, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "cosmul(['china', 'search engine'], [], topn=20)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 22, 439 | "metadata": { 440 | "collapsed": false 441 | }, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "[u'apple',\n", 447 | " u'ms',\n", 448 | " u'msft',\n", 449 | " u'google',\n", 450 | " u'nokia',\n", 451 | " u'adobe',\n", 452 | " u'samsung',\n", 453 | " u'hp',\n", 454 | " u'rim',\n", 455 | " u'oracle',\n", 456 | " u'valve',\n", 457 | " u'mozilla',\n", 458 | " u'ibm',\n", 459 | " u'motorola',\n", 460 | " u'oems',\n", 461 | " u'ballmer',\n", 462 | " u'intel',\n", 463 | " u'ms.',\n", 464 | " u'canonical']" 465 | ] 466 | }, 467 | "execution_count": 22, 468 | "metadata": {}, 469 | "output_type": "execute_result" 470 | } 471 | ], 472 | "source": [ 473 | "cosmul(['microsoft'], [], topn=20)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 23, 479 | "metadata": { 480 | "collapsed": false 481 | }, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "[u'apple',\n", 487 | " u'google',\n", 488 | " u'enterprise',\n", 489 | " u'azure',\n", 490 | " u'ms',\n", 491 | " u'skydrive',\n", 492 | " u'sharepoint',\n", 493 | " u'walled garden',\n", 494 | " u'icloud',\n", 495 | " u'oracle',\n", 496 | " u'chrome os',\n", 497 | " u'cloud services',\n", 498 | " u'android market',\n", 499 | " u'adobe',\n", 500 | " u'app 
store',\n", 501 | " u'rackspace',\n", 502 | " u'hp',\n", 503 | " u'samsung']" 504 | ] 505 | }, 506 | "execution_count": 23, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | } 510 | ], 511 | "source": [ 512 | "cosmul(['microsoft', 'cloud'], [], topn=20)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "Queen is several rankings down, so not exactly the same as out of the box word2vec!" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 24, 525 | "metadata": { 526 | "collapsed": false 527 | }, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "[u'professional context',\n", 533 | " u'female',\n", 534 | " u'pawn',\n", 535 | " u'content farm',\n", 536 | " u'queen',\n", 537 | " u'career trajectory',\n", 538 | " u'real risk',\n", 539 | " u'philadelphia',\n", 540 | " u'teen',\n", 541 | " u'shitty place',\n", 542 | " u'prussia',\n", 543 | " u'criminal offense',\n", 544 | " u'main theme',\n", 545 | " u'she',\n", 546 | " u'magician',\n", 547 | " u'gray area',\n", 548 | " u'herself',\n", 549 | " u'best site']" 550 | ] 551 | }, 552 | "execution_count": 24, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "cosmul(['king', 'woman'], ['man'], topn=20)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 25, 564 | "metadata": { 565 | "collapsed": false 566 | }, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "Most similar\n", 573 | "mark zuckerberg\n", 574 | "bill gates\n", 575 | "zuckerberg\n", 576 | "larry page\n", 577 | "zuck\n", 578 | "steve jobs\n", 579 | "sergey brin\n", 580 | "jeff bezos\n", 581 | "gates\n", 582 | "warren buffet\n", 583 | "ceo\n", 584 | "peter thiel\n", 585 | "paul allen\n", 586 | "sean parker\n", 587 | "jack dorsey\n", 588 | "paul graham\n", 589 | "richard branson\n", 590 | "sergey\n", 591 | "linus torvalds\n", 592 | "larry ellison\n", 593 | "\n", 594 | "Cosmul\n", 595 | "jeff bezos\n", 596 | "elon musk\n", 597 | "warren buffet\n", 598 | "bezos\n", 599 | "michael dell\n", 600 | "bill gates\n", 601 | "musk\n", 602 | "hp\n", 603 | "toshiba\n", 604 | "dell\n", 605 | "richard branson\n", 606 | "elon\n", 607 | "buffet\n", 608 | "john carmack\n", 609 | "steve wozniak\n", 610 | "asus\n", 611 | "ford\n", 612 | "morgan\n", 613 | "\n", 614 | "Traditional Similarity\n", 615 | "jeff bezos\n", 616 | "bill gates\n", 617 | "elon musk\n", 618 | "bezos\n", 619 | "warren buffet\n", 620 | "michael dell\n", 621 | "hp\n", 622 | "musk\n", 623 | "richard branson\n", 624 | "dell\n", 625 | "toshiba\n", 626 | "john carmack\n", 627 | "buffet\n", 628 | "peter thiel\n", 629 | "steve wozniak\n", 630 | "gates\n", 631 | "steve jobs\n", 632 | "ford\n" 633 | ] 634 | } 635 | ], 636 | "source": [ 637 | "print 'Most similar'\n", 638 | "print '\\n'.join(most_similar('mark zuckerberg'))\n", 639 | "print '\\nCosmul'\n", 640 | "pos = ['mark zuckerberg', 'amazon']\n", 641 | "neg = ['facebook']\n", 642 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 643 | "print '\\nTraditional Similarity'\n", 644 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 26, 650 | "metadata": { 651 | "collapsed": false 652 | }, 653 | "outputs": [ 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "Most similar\n", 659 | "hacker news\n", 660 | "hn\n", 661 | "hn.\n", 662 | 
"reddit\n", 663 | "front page\n", 664 | "hackernews\n", 665 | "commenting\n", 666 | "posted\n", 667 | "frontpage\n", 668 | "comment\n", 669 | "posting\n", 670 | "upvoted\n", 671 | "slashdot\n", 672 | "news.yc\n", 673 | "comments\n", 674 | "posts\n", 675 | "proggit\n", 676 | "post\n", 677 | "techcrunch\n", 678 | "top story\n", 679 | "\n", 680 | "Cosmul\n", 681 | "stack overflow\n", 682 | "stackoverflow\n", 683 | "answers\n", 684 | "answering\n", 685 | "answer\n", 686 | "questions\n", 687 | "quora\n", 688 | "answered\n", 689 | "ask\n", 690 | "hn\n", 691 | "other questions\n", 692 | "other question\n", 693 | "programming questions\n", 694 | "asking\n", 695 | "stackexchange\n", 696 | "stack exchange\n", 697 | "why\n", 698 | "basic questions\n", 699 | "\n", 700 | "Traditional Similarity\n", 701 | "stack overflow\n", 702 | "answer\n", 703 | "stackoverflow\n", 704 | "answering\n", 705 | "answers\n", 706 | "hn\n", 707 | "questions\n", 708 | "answered\n", 709 | "quora\n", 710 | "ask\n", 711 | "asking\n", 712 | "other question\n", 713 | "other questions\n", 714 | "first question\n", 715 | "stackexchange\n", 716 | "hn.\n", 717 | "programming questions\n", 718 | "hackernews\n" 719 | ] 720 | } 721 | ], 722 | "source": [ 723 | "pos = ['hacker news', 'question']\n", 724 | "neg = ['story']\n", 725 | "\n", 726 | "print 'Most similar'\n", 727 | "print '\\n'.join(most_similar(pos[0]))\n", 728 | "print '\\nCosmul'\n", 729 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 730 | "print '\\nTraditional Similarity'\n", 731 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 27, 737 | "metadata": { 738 | "collapsed": false 739 | }, 740 | "outputs": [ 741 | { 742 | "name": "stdout", 743 | "output_type": "stream", 744 | "text": [ 745 | "Most similar\n", 746 | "san francisco\n", 747 | "new york\n", 748 | "nyc\n", 749 | "palo alto\n", 750 | "mountain view\n", 751 | "boston\n", 752 | "seattle\n", 753 | "sf\n", 754 | "los angeles\n", 755 | "new york city\n", 756 | "london\n", 757 | "ny\n", 758 | "brooklyn\n", 759 | "chicago\n", 760 | "austin\n", 761 | "atlanta\n", 762 | "portland\n", 763 | "san jose\n", 764 | "san mateo\n", 765 | "sunnyvale\n", 766 | "\n", 767 | "Cosmul\n", 768 | "new york\n", 769 | "nyc\n", 770 | "palo alto\n", 771 | "mountain view\n", 772 | "boston\n", 773 | "seattle\n", 774 | "sf\n", 775 | "los angeles\n", 776 | "new york city\n", 777 | "london\n", 778 | "ny\n", 779 | "brooklyn\n", 780 | "chicago\n", 781 | "austin\n", 782 | "atlanta\n", 783 | "portland\n", 784 | "san jose\n", 785 | "san mateo\n", 786 | "sunnyvale\n", 787 | "\n", 788 | "Traditional Similarity\n", 789 | "new york\n", 790 | "nyc\n", 791 | "palo alto\n", 792 | "mountain view\n", 793 | "boston\n", 794 | "seattle\n", 795 | "sf\n", 796 | "los angeles\n", 797 | "new york city\n", 798 | "london\n", 799 | "ny\n", 800 | "brooklyn\n", 801 | "chicago\n", 802 | "austin\n", 803 | "atlanta\n", 804 | "portland\n", 805 | "san jose\n", 806 | "san mateo\n", 807 | "sunnyvale\n" 808 | ] 809 | } 810 | ], 811 | "source": [ 812 | "pos = ['san francisco']\n", 813 | "neg = []\n", 814 | "\n", 815 | "print 'Most similar'\n", 816 | "print '\\n'.join(most_similar(pos[0]))\n", 817 | "print '\\nCosmul'\n", 818 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 819 | "print '\\nTraditional Similarity'\n", 820 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 28, 826 | "metadata": { 827 | 
"collapsed": false 828 | }, 829 | "outputs": [ 830 | { 831 | "name": "stdout", 832 | "output_type": "stream", 833 | "text": [ 834 | "Most similar\n", 835 | "nlp\n", 836 | "machine learning\n", 837 | "data mining\n", 838 | "computer vision\n", 839 | "natural language processing\n", 840 | "ml\n", 841 | "image processing\n", 842 | "analytics\n", 843 | "classification\n", 844 | "algorithms\n", 845 | "data science\n", 846 | "hadoop\n", 847 | "analysis\n", 848 | "ai\n", 849 | "clustering\n", 850 | "mapreduce\n", 851 | "algorithm design\n", 852 | "information retrieval\n", 853 | "data analysis\n", 854 | "statistical\n", 855 | "\n", 856 | "Cosmul\n", 857 | "computer vision\n", 858 | "machine learning\n", 859 | "data mining\n", 860 | "image processing\n", 861 | "ai\n", 862 | "analytics\n", 863 | "algorithm\n", 864 | "randomized\n", 865 | "classification\n", 866 | "natural language processing\n", 867 | "hadoop\n", 868 | "engine\n", 869 | "statistical\n", 870 | "analysis\n", 871 | "machine\n", 872 | "clustering\n", 873 | "ml\n", 874 | "artificial intelligence\n", 875 | "neo4j\n", 876 | "\n", 877 | "Traditional Similarity\n", 878 | "computer vision\n", 879 | "machine learning\n", 880 | "data mining\n", 881 | "image processing\n", 882 | "ai\n", 883 | "analytics\n", 884 | "algorithm\n", 885 | "natural language processing\n", 886 | "classification\n", 887 | "randomized\n", 888 | "analysis\n", 889 | "ml\n", 890 | "hadoop\n", 891 | "engine\n", 892 | "machine\n", 893 | "statistical\n", 894 | "clustering\n", 895 | "visualization\n" 896 | ] 897 | } 898 | ], 899 | "source": [ 900 | "pos = ['nlp', 'image']\n", 901 | "neg = ['text']\n", 902 | "\n", 903 | "print 'Most similar'\n", 904 | "print '\\n'.join(most_similar(pos[0]))\n", 905 | "print '\\nCosmul'\n", 906 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 907 | "print '\\nTraditional Similarity'\n", 908 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 29, 914 | "metadata": { 915 | "collapsed": false 916 | }, 917 | "outputs": [ 918 | { 919 | "name": "stdout", 920 | "output_type": "stream", 921 | "text": [ 922 | "Most similar\n", 923 | "vim\n", 924 | "emacs\n", 925 | "vi\n", 926 | "sublime\n", 927 | "tmux\n", 928 | "textmate\n", 929 | "eclipse\n", 930 | "sublime text\n", 931 | "macvim\n", 932 | "zsh\n", 933 | "org-mode\n", 934 | "terminal\n", 935 | "st2\n", 936 | "bbedit\n", 937 | "intellij\n", 938 | "text editor\n", 939 | "latex\n", 940 | "notepad++\n", 941 | "netbeans\n", 942 | "other editors\n", 943 | "\n", 944 | "Cosmul\n", 945 | "photoshop\n", 946 | "animations\n", 947 | "typography\n", 948 | "programming\n", 949 | "layout\n", 950 | "textures\n", 951 | "web design\n", 952 | "fonts\n", 953 | "coding\n", 954 | "illustrator\n", 955 | "common lisp\n", 956 | "design\n", 957 | "prototyping\n", 958 | "canvas\n", 959 | "css.\n", 960 | "css\n", 961 | "diagrams\n", 962 | "vector graphics\n", 963 | "usability\n", 964 | "\n", 965 | "Traditional Similarity\n", 966 | "photoshop\n", 967 | "animations\n", 968 | "textures\n", 969 | "layout\n", 970 | "typography\n", 971 | "programming\n", 972 | "fonts\n", 973 | "coding\n", 974 | "illustrator\n", 975 | "design\n", 976 | "web design\n", 977 | "common lisp\n", 978 | "canvas\n", 979 | "photography\n", 980 | "ides\n", 981 | "visual\n", 982 | "animation\n", 983 | "css\n" 984 | ] 985 | } 986 | ], 987 | "source": [ 988 | "pos = ['vim', 'graphics']\n", 989 | "neg = ['terminal']\n", 990 | "\n", 991 | "print 'Most similar'\n", 992 | "print 
'\\n'.join(most_similar(pos[0]))\n", 993 | "print '\\nCosmul'\n", 994 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 995 | "print '\\nTraditional Similarity'\n", 996 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 30, 1002 | "metadata": { 1003 | "collapsed": false 1004 | }, 1005 | "outputs": [ 1006 | { 1007 | "name": "stdout", 1008 | "output_type": "stream", 1009 | "text": [ 1010 | "Most similar\n", 1011 | "vegetables\n", 1012 | "meat\n", 1013 | "rice\n", 1014 | "meats\n", 1015 | "fruit\n", 1016 | "veggies\n", 1017 | "pasta\n", 1018 | "salads\n", 1019 | "eat\n", 1020 | "fruits\n", 1021 | "cheese\n", 1022 | "carrots\n", 1023 | "potatoes\n", 1024 | "beans\n", 1025 | "seafood\n", 1026 | "soy\n", 1027 | "yogurt\n", 1028 | "spices\n", 1029 | "dairy\n", 1030 | "fats\n", 1031 | "\n", 1032 | "Cosmul\n", 1033 | "tea\n", 1034 | "coffee\n", 1035 | "beer\n", 1036 | "drinking\n", 1037 | "red wine\n", 1038 | "soda\n", 1039 | "cup\n", 1040 | "alcohol\n", 1041 | "cups\n", 1042 | "vodka\n", 1043 | "rice\n", 1044 | "fruit\n", 1045 | "whisky\n", 1046 | "orange juice\n", 1047 | "milk\n", 1048 | "espresso\n", 1049 | "drinks\n", 1050 | "carrots\n", 1051 | "\n", 1052 | "Traditional Similarity\n", 1053 | "tea\n", 1054 | "coffee\n", 1055 | "beer\n", 1056 | "drinking\n", 1057 | "soda\n", 1058 | "red wine\n", 1059 | "cup\n", 1060 | "alcohol\n", 1061 | "rice\n", 1062 | "cups\n", 1063 | "fruit\n", 1064 | "vodka\n", 1065 | "milk\n", 1066 | "drinks\n", 1067 | "orange juice\n", 1068 | "carrots\n", 1069 | "whisky\n", 1070 | "pasta\n" 1071 | ] 1072 | } 1073 | ], 1074 | "source": [ 1075 | "pos = ['vegetables', 'drink']\n", 1076 | "neg = ['eat']\n", 1077 | "\n", 1078 | "print 'Most similar'\n", 1079 | "print '\\n'.join(most_similar(pos[0]))\n", 1080 | "print '\\nCosmul'\n", 1081 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 1082 | "print '\\nTraditional Similarity'\n", 1083 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": 31, 1089 | "metadata": { 1090 | "collapsed": false 1091 | }, 1092 | "outputs": [ 1093 | { 1094 | "name": "stdout", 1095 | "output_type": "stream", 1096 | "text": [ 1097 | "Most similar\n", 1098 | "lda\n", 1099 | "linear\n", 1100 | "kmeans\n", 1101 | "clustering\n", 1102 | "-2\n", 1103 | "176\n", 1104 | "classification\n", 1105 | "svm\n", 1106 | "10000000\n", 1107 | "minaway\n", 1108 | "mb/s\n", 1109 | "statistical\n", 1110 | "173\n", 1111 | "ans\n", 1112 | "joiner\n", 1113 | "stdev\n", 1114 | "because:


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/data/preprocess.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import logging
 7 | import pickle
 8 | 
 9 | from sklearn.datasets import fetch_20newsgroups
10 | import numpy as np
11 | 
12 | from lda2vec import preprocess, Corpus
13 | 
14 | logging.basicConfig()
15 | 
16 | # Fetch data
17 | remove = ('headers', 'footers', 'quotes')
18 | texts = fetch_20newsgroups(subset='train', remove=remove).data
19 | # Remove tokens with these substrings
20 | bad = set(["ax>", '`@("', '---', '===', '^^^'])
21 | 
22 | 
23 | def clean(line):
24 |     return ' '.join(w for w in line.split() if not any(t in w for t in bad))
25 | 
26 | # Preprocess data
27 | max_length = 10000   # Limit of 10k words per document
28 | # Convert to unicode (spaCy only works with unicode)
29 | texts = [clean(d) for d in texts]
30 | tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,
31 |                                     n_threads=4)
32 | corpus = Corpus()
33 | # Make a ranked list of rare vs frequent words
34 | corpus.update_word_count(tokens)
35 | corpus.finalize()
36 | # The tokenization uses spaCy indices, and so may have gaps
37 | # between indices for words that aren't present in our dataset.
38 | # This builds a new compact index
39 | compact = corpus.to_compact(tokens)
40 | # Remove extremely rare words
41 | pruned = corpus.filter_count(compact, min_count=30)
42 | # Convert the compactified arrays into bag of words arrays
43 | bow = corpus.compact_to_bow(pruned)
44 | # Words tend to have power law frequency, so selectively
45 | # downsample the most prevalent words
46 | clean = corpus.subsample_frequent(pruned)
47 | # Now flatten a 2D array of document per row and word position
48 | # per column to a 1D array of words. This will also remove skips
49 | # and OoV words
50 | doc_ids = np.arange(pruned.shape[0])
51 | flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
52 | assert flattened.min() >= 0
53 | # Fill in the pretrained word vectors
54 | n_dim = 300
55 | fn_wordvc = 'GoogleNews-vectors-negative300.bin'
56 | vectors, s, f = corpus.compact_word_vectors(vocab, filename=fn_wordvc)
57 | # Save all of the preprocessed files
58 | pickle.dump(vocab, open('vocab.pkl', 'wb'))
59 | pickle.dump(corpus, open('corpus.pkl', 'wb'))
60 | np.save("flattened", flattened)
61 | np.save("doc_ids", doc_ids)
62 | np.save("pruned", pruned)
63 | np.save("bow", bow)
64 | np.save("vectors", vectors)
65 | 
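
The script above saves several aligned artifacts (vocab.pkl, corpus.pkl, flattened.npy, doc_ids.npy, pruned.npy, bow.npy, vectors.npy) that the training scripts below reload. A small sanity-check sketch, assuming preprocess.py has already been run in this directory (exact shapes depend on the spaCy model and the min_count pruning):

import numpy as np

# `flattened` holds one compact word index per token; `doc_ids` holds the
# document each token came from, so the two arrays must stay aligned.
flattened = np.load("flattened.npy")
doc_ids = np.load("doc_ids.npy")
assert flattened.shape == doc_ids.shape
assert flattened.min() >= 0

# The training scripts derive the vocabulary size from the data itself.
n_vocab = flattened.max() + 1

# The pretrained vectors are indexed by the same compact vocabulary, so they
# need at least n_vocab rows before being copied into the sampler weights.
vectors = np.load("vectors.npy")
print(n_vocab, flattened.shape, vectors.shape)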


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda/lda.py:
--------------------------------------------------------------------------------
 1 | import chainer
 2 | import chainer.links as L
 3 | import chainer.functions as F
 4 | 
 5 | from lda2vec import utils, dirichlet_likelihood
 6 | 
 7 | import numpy as np
 8 | 
 9 | 
10 | class LDA(chainer.Chain):
11 |     def __init__(self, n_docs, n_topics, n_dim, n_vocab):
12 |         factors = np.random.random((n_topics, n_dim)).astype('float32')
13 |         super(LDA, self).__init__(proportions=L.EmbedID(n_docs, n_topics),
14 |                                   factors=L.Parameter(factors),
15 |                                   embedding=L.Linear(n_dim, n_vocab))
16 |         self.n_docs = n_docs
17 |         self.n_topics = n_topics
18 |         self.n_vocab = n_vocab
19 |         self.n_dim = n_dim
20 | 
21 |     def forward(self, ids, bow):
22 |         bow, ids = utils.move(self.xp, bow, ids)
23 |         proportions = self.proportions(ids)
24 |         ld = dirichlet_likelihood(proportions)
25 |         doc = F.matmul(F.softmax(proportions), self.factors())
26 |         logp = F.dropout(self.embedding(doc))
27 |         # loss = -F.sum(bow * F.log_softmax(logp))
28 |         sources, targets, counts = [], [], []
29 |         lpi =  F.sum(bow * F.log_softmax(logp), axis=1)
30 |         loss = -F.sum(lpi)
31 |         return loss, ld
32 | 
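
The forward pass above mixes per-document topic proportions with shared topic factors and scores the document's bag of words against the resulting word distribution. A minimal plain-NumPy sketch of that computation with toy sizes, ignoring dropout and the Dirichlet term (not the Chain itself; the names mirror the links above):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

n_docs, n_topics, n_dim, n_vocab = 4, 3, 8, 50
rng = np.random.RandomState(0)

proportions = rng.randn(n_docs, n_topics)      # one row per document (the EmbedID table)
factors = rng.randn(n_topics, n_dim)           # shared topic vectors (L.Parameter)
embedding = rng.randn(n_dim, n_vocab)          # linear map from topic space to the vocabulary

doc = softmax(proportions) @ factors           # each document is a convex mix of topic vectors
logp = np.log(softmax(doc @ embedding))        # log-probability of every word in every document

bow = rng.poisson(1.0, size=(n_docs, n_vocab)) # toy word counts
loss = -(bow * logp).sum()                     # same reconstruction term as `-F.sum(lpi)` above
print(loss)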


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda/lda_run.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody 
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import os.path
 7 | import pickle
 8 | import time
 9 | 
10 | from chainer import serializers
11 | from chainer import cuda
12 | import chainer.optimizers as O
13 | import chainer.link as L
14 | import numpy as np
15 | 
16 | # from lda2vec import prepare_topics, print_top_words_per_topic
17 | # from lda2vec import utils
18 | from lda2vec import topics, utils
19 | from lda import LDA
20 | 
21 | gpu_id = int(os.getenv('CUDA_GPU', 0))
22 | cuda.get_device(gpu_id).use()
23 | print("Using GPU ", str(gpu_id))
24 | 
25 | vocab = pickle.load(open('../data/vocab.pkl', 'rb'))
26 | corpus = pickle.load(open('../data/corpus.pkl', 'rb'))
27 | bow = np.load("../data/bow.npy").astype('float32')
28 | # Remove bow counts on the first two tokens, which are the skip and out-of-vocabulary markers
29 | bow[:, :2] = 0
30 | # Normalize bag of words to be a probability
31 | # bow = bow / bow.sum(axis=1)[:, None]
32 | 
33 | # Number of docs
34 | n_docs = bow.shape[0]
35 | # Number of unique words in the vocabulary
36 | n_vocab = bow.shape[1]
37 | # Number of dimensions in a single word vector
38 | n_units = 256
39 | # number of topics
40 | n_topics = 20
41 | batchsize = 128
42 | counts = corpus.keys_counts[:n_vocab]
43 | # Get the string representation for every compact key
44 | words = corpus.word_list(vocab)[:n_vocab]
45 | 
46 | model = LDA(n_docs, n_topics, n_units, n_vocab)
47 | if os.path.exists('lda.hdf5'):
48 |     print("Reloading from saved")
49 |     serializers.load_hdf5("lda.hdf5", model)
50 | model.to_gpu()
51 | optimizer = O.Adam()
52 | optimizer.setup(model)
53 | 
54 | j = 0
55 | fraction = batchsize * 1.0 / bow.shape[0]
56 | for epoch in range(50000000):
57 |     if epoch % 100 == 0:
58 |         p = cuda.to_cpu(model.proportions.W.data).copy()
59 |         f = cuda.to_cpu(model.factors.W.data).copy()
60 |         w = cuda.to_cpu(model.embedding.W.data).copy()
61 |         d = topics.prepare_topics(p, f, w, words)
62 |         topics.print_top_words_per_topic(d)
63 |     for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
64 |         t0 = time.time()
65 |         # optimizer.zero_grads()
66 |         model.cleargrads()
67 |         rec, ld = model.forward(ids, batch)
68 |         l = rec + ld
69 |         l.backward()
70 |         optimizer.update()
71 |         msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
72 |                "P:{ld:1.3e} R:{rate:1.3e}")
73 |         l.to_cpu()
74 |         rec.to_cpu()
75 |         ld.to_cpu()
76 |         t1 = time.time()
77 |         dt = t1 - t0
78 |         rate = batchsize / dt
79 |         logs = dict(rec=float(rec.data), epoch=epoch, j=j,
80 |                     ld=float(ld.data), rate=rate)
81 |         print(msg.format(**logs))
82 |         j += 1
83 |     if epoch % 100 == 0:
84 |         serializers.save_hdf5("lda.hdf5", model)
85 | 


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda/topics.pyldavis.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/examples/twenty_newsgroups/lda/topics.pyldavis.npz


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda2vec/lda2vec_model.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import EmbedMixture
 2 | from lda2vec import dirichlet_likelihood
 3 | from lda2vec.utils import move
 4 | 
 5 | from chainer import Chain
 6 | import chainer.links as L
 7 | import chainer.functions as F
 8 | 
 9 | import numpy as np
10 | 
11 | 
12 | class LDA2Vec(Chain):
13 |     def __init__(self, n_documents=100, n_document_topics=10,
14 |                  n_units=256, n_vocab=1000, dropout_ratio=0.5, train=True,
15 |                  counts=None, n_samples=15, word_dropout_ratio=0.0,
16 |                  power=0.75, temperature=1.0):
17 |         em = EmbedMixture(n_documents, n_document_topics, n_units,
18 |                           dropout_ratio=dropout_ratio, temperature=temperature)
19 |         kwargs = {}
20 |         kwargs['mixture'] = em
21 |         kwargs['sampler'] = L.NegativeSampling(n_units, counts, n_samples,
22 |                                                power=power)
23 |         super(LDA2Vec, self).__init__(**kwargs)
24 |         rand = np.random.random(self.sampler.W.data.shape)
25 |         self.sampler.W.data[:, :] = rand[:, :]
26 |         self.n_units = n_units
27 |         self.train = train
28 |         self.dropout_ratio = dropout_ratio
29 |         self.word_dropout_ratio = word_dropout_ratio
30 |         self.n_samples = n_samples
31 | 
32 |     def prior(self):
33 |         dl1 = dirichlet_likelihood(self.mixture.weights)
34 |         return dl1
35 | 
36 |     def fit_partial(self, rdoc_ids, rword_indices, window=5,
37 |                     update_only_docs=False):
38 |         doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
39 |         pivot_idx = next(move(self.xp, rword_indices[window: -window]))
40 |         pivot = F.embed_id(pivot_idx, self.sampler.W)
41 |         if update_only_docs:
42 |             pivot.unchain_backward()
43 |         doc_at_pivot = rdoc_ids[window: -window]
44 |         doc = self.mixture(next(move(self.xp, doc_at_pivot)),
45 |                            update_only_docs=update_only_docs)
46 |         loss = 0.0
47 |         start, end = window, rword_indices.shape[0] - window
48 |         context = (F.dropout(doc, self.dropout_ratio) +
49 |                    F.dropout(pivot, self.dropout_ratio))
50 |         for frame in range(-window, window + 1):
51 |             # Skip predicting the current pivot
52 |             if frame == 0:
53 |                 continue
54 |             # Predict word given context and pivot word
55 |             # The target starts before the pivot
56 |             targetidx = rword_indices[start + frame: end + frame]
57 |             doc_at_target = rdoc_ids[start + frame: end + frame]
58 |             doc_is_same = doc_at_target == doc_at_pivot
59 |             rand = np.random.uniform(0, 1, doc_is_same.shape[0])
60 |             mask = (rand > self.word_dropout_ratio).astype('bool')
61 |             weight = np.logical_and(doc_is_same, mask).astype('int32')
62 |             # If weight is 1.0 then targetidx
63 |             # If weight is 0.0 then -1
64 |             targetidx = targetidx * weight + -1 * (1 - weight)
65 |             target, = move(self.xp, targetidx)
66 |             loss = self.sampler(context, target)
67 |             loss.backward()
68 |             if update_only_docs:
69 |                 # Wipe out any gradient accumulation on word vectors
70 |                 self.sampler.W.grad *= 0.0
71 |         return loss.data
72 | 
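
fit_partial above masks out context targets that belong to a different document than the pivot, or that lose the word-dropout draw, by rewriting their index to -1 (the ignore value the inline comments refer to). A toy NumPy illustration of just that masking step, with made-up document and word ids:

import numpy as np

rng = np.random.RandomState(0)
word_dropout_ratio = 0.25

# One document id and one word id per token, as in fit_partial with window=2, frame=+1
rdoc_ids  = np.array([0, 0, 0, 0, 1, 1, 1, 1], dtype='int32')
rword_idx = np.array([5, 8, 2, 9, 4, 7, 3, 6], dtype='int32')
window, frame = 2, 1
start, end = window, rword_idx.shape[0] - window

doc_at_pivot  = rdoc_ids[window:-window]
doc_at_target = rdoc_ids[start + frame: end + frame]
targetidx     = rword_idx[start + frame: end + frame]

doc_is_same = doc_at_target == doc_at_pivot
mask = rng.uniform(0, 1, doc_is_same.shape[0]) > word_dropout_ratio
weight = np.logical_and(doc_is_same, mask).astype('int32')

# Kept targets stay as-is; dropped or cross-document targets become -1,
# the "weight is 0.0" case described in the comments above.
targetidx = targetidx * weight + -1 * (1 - weight)
print(targetidx)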


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda2vec/lda2vec_run.py:
--------------------------------------------------------------------------------
  1 | # Author: Chris Moody 
  2 | # License: MIT
  3 | 
  4 | # This simple example loads the newsgroups data from sklearn
  5 | # and trains an LDA-like model on it
  6 | import os
  7 | import os.path
  8 | import pdb
  9 | import pickle
 10 | import time
 11 | import shelve
 12 | 
 13 | import chainer
 14 | from chainer import cuda
 15 | from chainer import serializers
 16 | import chainer.optimizers as O
 17 | import numpy as np
 18 | 
 19 | from lda2vec import topics, utils
 20 | # from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
 21 | from lda2vec_model import LDA2Vec
 22 | 
 23 | gpu_id = int(os.getenv('CUDA_GPU', 0))
 24 | cuda.get_device(gpu_id).use()
 25 | print ("Using GPU ", str(gpu_id))
 26 | 
 27 | data_dir = os.getenv('data_dir', '../data/')
 28 | fn_vocab = '{data_dir:s}/vocab.pkl'.format(data_dir=data_dir)
 29 | fn_corpus = '{data_dir:s}/corpus.pkl'.format(data_dir=data_dir)
 30 | fn_flatnd = '{data_dir:s}/flattened.npy'.format(data_dir=data_dir)
 31 | fn_docids = '{data_dir:s}/doc_ids.npy'.format(data_dir=data_dir)
 32 | fn_vectors = '{data_dir:s}/vectors.npy'.format(data_dir=data_dir)
 33 | vocab = pickle.load(open(fn_vocab, 'rb'))
 34 | corpus = pickle.load(open(fn_corpus, 'rb'))
 35 | flattened = np.load(fn_flatnd)
 36 | doc_ids = np.load(fn_docids)
 37 | vectors = np.load(fn_vectors)
 38 | 
 39 | # Model Parameters
 40 | # Number of documents
 41 | n_docs = doc_ids.max() + 1
 42 | # Number of unique words in the vocabulary
 43 | n_vocab = flattened.max() + 1
 44 | # 'Strength' of the Dirichlet prior; 200.0 seems to work well
 45 | clambda = 200.0
 46 | # Number of topics to fit
 47 | n_topics = int(os.getenv('n_topics', 20))
 48 | batchsize = 4096
 49 | # Power for neg sampling
 50 | power = float(os.getenv('power', 0.75))
 51 | # Initialize with pretrained word vectors
 52 | pretrained = bool(int(os.getenv('pretrained', True)))
 53 | # Sampling temperature
 54 | temperature = float(os.getenv('temperature', 1.0))
 55 | # Number of dimensions in a single word vector
 56 | n_units = int(os.getenv('n_units', 300))
 57 | # Get the string representation for every compact key
 58 | words = corpus.word_list(vocab)[:n_vocab]
 59 | # How many tokens are in each document
 60 | doc_idx, lengths = np.unique(doc_ids, return_counts=True)
 61 | doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
 62 | doc_lengths[doc_idx] = lengths
 63 | # Count all token frequencies
 64 | tok_idx, freq = np.unique(flattened, return_counts=True)
 65 | term_frequency = np.zeros(n_vocab, dtype='int32')
 66 | term_frequency[tok_idx] = freq
 67 | 
 68 | for key in sorted(locals().keys()):
 69 |     val = locals()[key]
 70 |     if len(str(val)) < 100 and '<' not in str(val):
 71 |         print(key, val)
 72 | 
 73 | model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
 74 |                 n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
 75 |                 n_samples=15, power=power, temperature=temperature)
 76 | if os.path.exists('lda2vec.hdf5'):
 77 |     print("Reloading from saved")
 78 |     serializers.load_hdf5("lda2vec.hdf5", model)
 79 | if pretrained:
 80 |     model.sampler.W.data[:, :] = vectors[:n_vocab, :]
 81 | model.to_gpu()
 82 | optimizer = O.Adam()
 83 | optimizer.setup(model)
 84 | clip = chainer.optimizer.GradientClipping(5.0)
 85 | optimizer.add_hook(clip)
 86 | 
 87 | j = 0
 88 | epoch = 0
 89 | fraction = batchsize * 1.0 / flattened.shape[0]
 90 | progress = shelve.open('progress.shelve')
 91 | for epoch in range(200):
 92 |     data = topics.prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
 93 |                           cuda.to_cpu(model.mixture.factors.W.data).copy(),
 94 |                           cuda.to_cpu(model.sampler.W.data).copy(),
 95 |                           words)
 96 |     top_words = topics.print_top_words_per_topic(data)
 97 |     if j % 100 == 0 and j > 100:
 98 |         coherence = topics.topic_coherence(top_words)
 99 |         for t in range(n_topics):
100 |             print(t, coherence[(t, 'cv')])
101 |         kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
102 |         progress[str(epoch)] = pickle.dumps(kw)
103 |     data['doc_lengths'] = doc_lengths
104 |     data['term_frequency'] = term_frequency
105 |     np.savez('topics.pyldavis', **data)
106 |     for d, f in utils.chunks(batchsize, doc_ids, flattened):
107 |         t0 = time.time()
108 |         # optimizer.zero_grads()
109 |         model.cleargrads()
110 |         l = model.fit_partial(d.copy(), f.copy())
111 |         prior = model.prior()
112 |         loss = prior * fraction
113 |         loss.backward()
114 |         optimizer.update()
115 |         msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
116 |                "P:{prior:1.3e} R:{rate:1.3e}")
117 |         prior.to_cpu()
118 |         loss.to_cpu()
119 |         t1 = time.time()
120 |         dt = t1 - t0
121 |         rate = batchsize / dt
122 |         logs = dict(loss=float(l), epoch=epoch, j=j,
123 |                     prior=float(prior.data), rate=rate)
124 |         print(msg.format(**logs))
125 |         j += 1
126 |     serializers.save_hdf5("lda2vec.hdf5", model)
127 | 
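
Each epoch the run script dumps topic/document/term arrays to topics.pyldavis.npz. A sketch of how such a file is typically visualized, assuming the keys written by prepare_topics line up with pyLDAvis.prepare's keyword arguments (which the .pyldavis naming suggests) and that a 'vocab' key is stored alongside the distributions:

import numpy as np
import pyLDAvis

npz = np.load('topics.pyldavis.npz')
data = {name: npz[name] for name in npz.files}
# pyLDAvis expects the vocabulary as a plain list of strings.
data['vocab'] = data['vocab'].tolist()

vis = pyLDAvis.prepare(**data)
pyLDAvis.save_html(vis, 'lda2vec_topics.html')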


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda2vec/topics.pyldavis.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/examples/twenty_newsgroups/lda2vec/topics.pyldavis.npz


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nslda/nslda.py:
--------------------------------------------------------------------------------
 1 | import chainer
 2 | import chainer.links as L
 3 | import chainer.functions as F
 4 | 
 5 | from lda2vec import utils, dirichlet_likelihood
 6 | 
 7 | import numpy as np
 8 | 
 9 | 
10 | class NSLDA(chainer.Chain):
11 |     def __init__(self, counts, n_docs, n_topics, n_dim, n_vocab, n_samples=5):
12 |         factors = np.random.random((n_topics, n_dim)).astype('float32')
13 |         loss_func = L.NegativeSampling(n_dim, counts, n_samples)
14 |         loss_func.W.data[:, :] = np.random.randn(*loss_func.W.data.shape)
15 |         loss_func.W.data[:, :] /= np.sqrt(np.prod(loss_func.W.data.shape))
16 |         super(NSLDA, self).__init__(proportions=L.EmbedID(n_docs, n_topics),
17 |                                     factors=L.Parameter(factors),
18 |                                     loss_func=loss_func)
19 |         self.n_docs = n_docs
20 |         self.n_topics = n_topics
21 |         self.n_vocab = n_vocab
22 |         self.n_dim = n_dim
23 | 
24 |     def forward(self, doc, wrd, window=5):
25 |         doc, wrd = utils.move(self.xp, doc, wrd)
26 |         proportions = self.proportions(doc)
27 |         ld = dirichlet_likelihood(self.proportions.W)
28 |         context = F.matmul(F.softmax(proportions), self.factors())
29 |         loss = self.loss_func(context, wrd)
30 |         return loss, ld
31 | 
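
NSLDA builds one context vector per token by looking up that token's document proportions, softmaxing them, and mixing the shared topic factors; the negative-sampling loss then asks that context to predict the token. A small NumPy sketch of the context construction only, with toy sizes (the loss itself is left to chainer):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.RandomState(0)
n_docs, n_topics, n_dim = 3, 4, 8

proportions = rng.randn(n_docs, n_topics)   # the EmbedID table, one row per document
factors = rng.randn(n_topics, n_dim)        # shared topic vectors

# One entry per token: which document it belongs to
doc = np.array([0, 0, 0, 1, 1, 2], dtype='int32')

# Gather each token's document proportions, then mix the topic factors;
# every token from the same document gets the same context vector.
context = softmax(proportions[doc]) @ factors
print(context.shape)   # (n_tokens, n_dim)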


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nslda/nslda_run.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody 
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import os.path
 7 | import pickle
 8 | import time
 9 | 
10 | from chainer import serializers
11 | from chainer import cuda
12 | import chainer.optimizers as O
13 | import numpy as np
14 | 
15 | from lda2vec import prepare_topics, print_top_words_per_topic
16 | from lda2vec import utils
17 | from nslda import NSLDA
18 | 
19 | gpu_id = int(os.getenv('CUDA_GPU', 0))
20 | cuda.get_device(gpu_id).use()
21 | print("Using GPU " + str(gpu_id))
22 | 
23 | vocab = pickle.load(open('../data/vocab.pkl', 'rb'))
24 | corpus = pickle.load(open('../data/corpus.pkl', 'rb'))
25 | doc_id = np.load("../data/doc_ids.npy")
26 | flattened = np.load("../data/flattened.npy")
27 | 
28 | # Number of docs
29 | n_docs = doc_id.max() + 1
30 | # Number of unique words in the vocabulary
31 | n_vocab = flattened.max() + 1
32 | # Number of dimensions in a single word vector
33 | n_units = 256
34 | # number of topics
35 | n_topics = 20
36 | batchsize = 4096 * 8
37 | # Strength of Dirichlet prior
38 | strength = 1.0
39 | counts = corpus.keys_counts[:n_vocab]
40 | # Get the string representation for every compact key
41 | words = corpus.word_list(vocab)[:n_vocab]
42 | 
43 | model = NSLDA(counts, n_docs, n_topics, n_units, n_vocab)
44 | if os.path.exists('nslda.hdf5'):
45 |     print("Reloading from saved")
46 |     serializers.load_hdf5("nslda.hdf5", model)
47 | model.to_gpu()
48 | optimizer = O.Adam()
49 | optimizer.setup(model)
50 | 
51 | j = 0
52 | fraction = batchsize * 1.0 / flattened.shape[0]
53 | for epoch in range(50000000):
54 |     p = cuda.to_cpu(model.proportions.W.data).copy()
55 |     f = cuda.to_cpu(model.factors.W.data).copy()
56 |     w = cuda.to_cpu(model.loss_func.W.data).copy()
57 |     d = prepare_topics(p, f, w, words)
58 |     print_top_words_per_topic(d)
59 |     for (doc_ids, flat) in utils.chunks(batchsize, doc_id, flattened):
60 |         t0 = time.time()
61 |         optimizer.zero_grads()
62 |         rec, ld = model.forward(doc_ids, flat)
63 |         l = rec + ld * fraction * strength
64 |         l.backward()
65 |         optimizer.update()
66 |         msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
67 |                "P:{ld:1.3e} R:{rate:1.3e}")
68 |         l.to_cpu()
69 |         rec.to_cpu()
70 |         ld.to_cpu()
71 |         t1 = time.time()
72 |         dt = t1 - t0
73 |         rate = batchsize / dt
74 |         logs = dict(rec=float(rec.data), epoch=epoch, j=j,
75 |                     ld=float(ld.data), rate=rate)
76 |         print(msg.format(**logs))
77 |         j += 1
78 |     if epoch % 100 == 0:
79 |         serializers.save_hdf5("nslda.hdf5", model)
80 | 


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nvdm/nvdm.py:
--------------------------------------------------------------------------------
 1 | import chainer
 2 | import chainer.links as L
 3 | import chainer.functions as F
 4 | 
 5 | from lda2vec import utils
 6 | 
 7 | 
 8 | class NVDM(chainer.Chain):
 9 |     def __init__(self, n_vocab, n_dim):
10 |         super(NVDM, self).__init__(l1=L.Linear(n_vocab, n_dim),
11 |                                    l2=L.Linear(n_dim, n_dim),
12 |                                    mu_logsigma=L.Linear(n_dim, n_dim * 2),
13 |                                    embedding=L.Linear(n_dim, n_vocab))
14 |         self.n_vocab = n_vocab
15 |         self.n_dim = n_dim
16 | 
17 |     def encode(self, bow):
18 |         """ Convert the bag of words vector of shape (n_docs, n_vocab)
19 |         into latent mean and log-variance vectors.
20 |         """
21 |         lam = F.relu(self.l1(bow))
22 |         pi = F.relu(self.l2(lam))
23 |         mu, log_sigma = F.split_axis(self.mu_logsigma(pi), 2, 1)
24 |         sample = F.gaussian(mu, log_sigma)
25 |         loss = F.gaussian_kl_divergence(mu, log_sigma)
26 |         return sample, loss
27 | 
28 |     def decode(self, sample, bow):
29 |         """ Decode latent document vectors back into word counts
30 |         (n_docs, n_vocab).
31 |         """
32 |         logprob = F.log_softmax(self.embedding(sample))
33 |         # This is equivalent to a softmax_cross_entropy where instead of
34 |         # guessing 1 of N words we have repeated observations
35 |         # Normal softmax for guessing the next word is:
36 |         # t log softmax(x), where t is 0 or 1
37 |         # Softmax for guessing word counts is simply doing
38 |         # the above more times, so multiply by the count
39 |         # count log softmax(x)
40 |         loss = -F.sum(bow * logprob)
41 |         return loss
42 | 
43 |     def observe(self, bow):
44 |         bow, = utils.move(self.xp, bow * 1.0)
45 |         sample, kl = self.encode(bow)
46 |         rec = self.decode(sample, bow)
47 |         return rec, kl
48 | 
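
The comments in decode argue that weighting log-softmax by the word count is equivalent to applying the usual next-word softmax loss once per repeated observation. A standalone NumPy check of that identity with toy logits and counts (independent of the Chain above):

import numpy as np

def log_softmax(x):
    x = x - x.max()
    return x - np.log(np.exp(x).sum())

rng = np.random.RandomState(0)
n_vocab = 6
logits = rng.randn(n_vocab)
counts = np.array([3, 0, 1, 2, 0, 4])   # bag-of-words counts for one document

logprob = log_softmax(logits)

# "count log softmax(x)" form used in NVDM.decode
loss_counts = -(counts * logprob).sum()

# One softmax cross-entropy term per repeated observation of each word
observations = np.repeat(np.arange(n_vocab), counts)
loss_repeated = -logprob[observations].sum()

assert np.allclose(loss_counts, loss_repeated)
print(loss_counts, loss_repeated)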


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nvdm/nvdm_run.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody 
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import os.path
 7 | import pickle
 8 | import time
 9 | 
10 | from chainer import serializers
11 | import chainer.optimizers as O
12 | import numpy as np
13 | 
14 | from lda2vec import utils
15 | from nvdm import NVDM
16 | 
17 | vocab = pickle.load(open('vocab.pkl', 'rb'))
18 | corpus = pickle.load(open('corpus.pkl', 'rb'))
19 | bow = np.load("bow.npy").astype('float32')
21 | # Remove bow counts on the first two tokens, which are the skip and out-of-vocabulary markers
21 | bow[:, :2] = 0
22 | # Normalize bag of words to be a probability
23 | bow = bow / bow.sum(axis=1)[:, None]
24 | 
25 | # Number of unique words in the vocabulary
26 | n_vocab = bow.shape[1]
27 | # Number of dimensions in a single word vector
28 | n_units = 256
29 | batchsize = 128
30 | counts = corpus.keys_counts[:n_vocab]
31 | # Get the string representation for every compact key
32 | words = corpus.word_list(vocab)[:n_vocab]
33 | 
34 | model = NVDM(n_vocab, n_units)
35 | if os.path.exists('nvdm.hdf5'):
36 |     print("Reloading from saved")
37 |     serializers.load_hdf5("nvdm.hdf5", model)
38 | # model.to_gpu()
39 | optimizer = O.Adam()
40 | optimizer.setup(model)
41 | 
42 | j = 0
43 | fraction = batchsize * 1.0 / bow.shape[0]
44 | for epoch in range(500):
45 |     for (batch,) in utils.chunks(batchsize, bow):
46 |         t0 = time.time()
47 |         rec, kl = model.observe(batch)
48 |         optimizer.zero_grads()
49 |         l = rec + kl
50 |         l.backward()
51 |         optimizer.update()
52 |         msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
53 |                "P:{kl:1.3e} R:{rate:1.3e}")
54 |         l.to_cpu()
55 |         rec.to_cpu()
56 |         kl.to_cpu()
57 |         t1 = time.time()
58 |         dt = t1 - t0
59 |         rate = batchsize / dt
60 |         logs = dict(rec=float(rec.data), epoch=epoch, j=j,
61 |                     kl=float(kl.data), rate=rate)
62 |         print(msg.format(**logs))
63 |         j += 1
64 |     serializers.save_hdf5("nvdm.hdf5", model)
65 | 


--------------------------------------------------------------------------------
/images/img00_word2vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img00_word2vec.png


--------------------------------------------------------------------------------
/images/img01_lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img01_lda.png


--------------------------------------------------------------------------------
/images/img02_lda_topics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img02_lda_topics.png


--------------------------------------------------------------------------------
/images/img03_lda2vec_topics01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img03_lda2vec_topics01.png


--------------------------------------------------------------------------------
/images/img04_lda2vec_topics02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img04_lda2vec_topics02.png


--------------------------------------------------------------------------------
/images/img05_lda2vec_topics03_supervised.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img05_lda2vec_topics03_supervised.png


--------------------------------------------------------------------------------
/images/img06_pyldavis.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img06_pyldavis.gif


--------------------------------------------------------------------------------
/lda2vec/__init__.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | assert sys.version_info.major == 3, "Must use Python 3!"
 5 | 
 6 | sys.path.append(os.path.dirname(__file__))
 7 | 
 8 | import dirichlet_likelihood
 9 | import embed_mixture
10 | # import tracking
11 | # import preprocess
12 | import corpus
13 | # import topics
14 | import negative_sampling
15 | 
16 | dirichlet_likelihood = dirichlet_likelihood.dirichlet_likelihood
17 | EmbedMixture = embed_mixture.EmbedMixture
18 | # Tracking = tracking.Tracking
19 | # tokenize = preprocess.tokenize
20 | Corpus = corpus.Corpus
21 | # prepare_topics = topics.prepare_topics
22 | # print_top_words_per_topic = topics.print_top_words_per_topic
23 | # negative_sampling = negative_sampling.negative_sampling
24 | NegativeSampling = negative_sampling.negative_sampling
25 | # topic_coherence = topics.topic_coherence
26 | 
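
The assignments above define the package's public names, so downstream scripts can import them directly, e.g.:

    from lda2vec import Corpus, EmbedMixture, NegativeSampling, dirichlet_likelihood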


--------------------------------------------------------------------------------
/lda2vec/corpus.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import numpy as np
  3 | import difflib
  4 | import pandas as pd
  5 | 
  6 | try:
  7 |     from pyxdameraulevenshtein import damerau_levenshtein_distance_ndarray
  8 | except ImportError:
  9 |     pass
 10 | 
 11 | 
 12 | class Corpus():
 13 |     _keys_frequency = None
 14 | 
 15 |     def __init__(self, out_of_vocabulary=-1, skip=-2):
 16 |         """ The Corpus helps with tasks involving integer representations of
 17 |         words. This object is used to filter, subsample, and convert loose
 18 |         word indices to compact word indices.
 19 | 
 20 |         'Loose' word arrays are word indices given by a tokenizer. The word
 21 |         index is not necessarily representative of the word's frequency rank, and
 22 |         so loose arrays tend to have 'gaps' of unused indices, which can make
 23 |         models less memory efficient. As a result, this class helps convert
 24 |         a loose array to a 'compact' one where the most common words have low
 25 |         indices, and the most infrequent have high indices.
 26 | 
 27 |         Corpus maintains a count of how many of each word it has seen so
 28 |         that it can later selectively filter frequent or rare words. However,
 29 |         since word popularity rank could change with incoming data the word
 30 |         index count must be updated fully and `self.finalize()` must be called
 31 |         before any filtering and subsampling operations can happen.
 32 | 
 33 |         Arguments
 34 |         ---------
 35 |         out_of_vocabulary : int, default=-1
 36 |             Token index to replace whenever we encounter a rare or unseen word.
 37 |             Instead of skipping the token, we mark it as an out-of-vocabulary
 38 |             word.
 39 |         skip : int, default=-2
 40 |             Token index to replace whenever we want to skip the current frame.
 41 |             Particularly useful when subsampling words or when padding a
 42 |             sentence.
 43 | 
 44 |         Examples
 45 |         --------
 46 |         >>> corpus = Corpus()
 47 |         >>> words_raw = np.random.randint(100, size=25)
 48 |         >>> corpus.update_word_count(words_raw)
 49 |         >>> corpus.finalize()
 50 |         >>> words_compact = corpus.to_compact(words_raw)
 51 |         >>> words_pruned = corpus.filter_count(words_compact, min_count=2)
 52 |         >>> # words_sub = corpus.subsample_frequent(words_pruned, thresh=1e-5)
 53 |         >>> words_loose = corpus.to_loose(words_pruned)
 54 |         >>> not_oov = words_loose > -1
 55 |         >>> np.all(words_loose[not_oov] == words_raw[not_oov])
 56 |         True
 57 |         """
 58 |         self.counts_loose = defaultdict(int)
 59 |         self._finalized = False
 60 |         self.specials = dict(out_of_vocabulary=out_of_vocabulary,
 61 |                              skip=skip)
 62 | 
 63 |     @property
 64 |     def n_specials(self):
 65 |         return len(self.specials)
 66 | 
 67 |     def update_word_count(self, loose_array):
 68 |         """ Update the corpus word counts given a loose array of word indices.
 69 |         Can be called multiple times, but once `finalize` is called the word
 70 |         counts cannot be updated.
 71 | 
 72 |         Arguments
 73 |         ---------
 74 |         loose_array : int array
 75 |             Array of word indices.
 76 | 
 77 |         Examples
 78 |         --------
 79 |         >>> corpus = Corpus()
 80 |         >>> corpus.update_word_count(np.arange(10))
 81 |         >>> corpus.update_word_count(np.arange(8))
 82 |         >>> corpus.counts_loose[0]
 83 |         2
 84 |         >>> corpus.counts_loose[9]
 85 |         1
 86 |         """
 87 |         self._check_unfinalized()
 88 |         uniques, counts = np.unique(np.ravel(loose_array), return_counts=True)
 89 |         msg = "Loose arrays cannot have elements below the values of special "
 90 |         msg += "tokens as these indices are reserved"
 91 |         assert uniques.min() >= min(self.specials.values()), msg
 92 |         for k, v in zip(uniques, counts):
 93 |             self.counts_loose[k] += v
 94 | 
 95 |     def _loose_keys_ordered(self):
 96 |         """ Get the loose keys in order of decreasing frequency"""
 97 |         loose_counts = sorted(self.counts_loose.items(), key=lambda x: x[1],
 98 |                               reverse=True)
 99 |         keys = np.array(loose_counts)[:, 0]
100 |         counts = np.array(loose_counts)[:, 1]
101 |         order = np.argsort(counts)[::-1].astype('int32')
102 |         keys, counts = keys[order], counts[order]
103 |         # Add in the specials as a prefix to the other keys
104 |         specials = np.sort(list(self.specials.values()))
105 |         keys = np.concatenate((specials, keys))
106 |         empty = np.zeros(len(specials), dtype='int32')
107 |         counts = np.concatenate((empty, counts))
108 |         n_keys = keys.shape[0]
109 |         assert counts.min() >= 0
110 |         return keys, counts, n_keys
111 | 
112 |     def finalize(self):
113 |         """ Call `finalize` once done updating word counts. This means the
114 |         object will no longer accept new word count data, but the loose
115 |         to compact index mapping can be computed. This frees the object to
116 |         filter, subsample, and compactify incoming word arrays.
117 | 
118 |         Examples
119 |         --------
120 |         >>> corpus = Corpus()
121 |         >>> # We'll update the word counts, making sure that word index 2
122 |         >>> # is the most common word index.
123 |         >>> corpus.update_word_count(np.arange(1) + 2)
124 |         >>> corpus.update_word_count(np.arange(3) + 2)
125 |         >>> corpus.update_word_count(np.arange(10) + 2)
126 |         >>> corpus.update_word_count(np.arange(8) + 2)
127 |         >>> corpus.counts_loose[2]
128 |         4
129 |         >>> # The corpus has not been finalized yet, and so the compact mapping
130 |         >>> # has not yet been computed.
131 |         >>> corpus.keys_counts[0]
132 |         Traceback (most recent call last):
133 |             ...
134 |         AttributeError: 'Corpus' object has no attribute 'keys_counts'
135 |         >>> corpus.finalize()
136 |         >>> corpus.n_specials
137 |         2
138 |         >>> # The special tokens are mapped to the first compact indices
139 |         >>> corpus.compact_to_loose[0]
140 |         -2
141 |         >>> corpus.compact_to_loose[0] == corpus.specials['skip']
142 |         True
143 |         >>> corpus.compact_to_loose[1] == corpus.specials['out_of_vocabulary']
144 |         True
145 |         >>> corpus.compact_to_loose[2]  # Most popular token is mapped next
146 |         2
147 |         >>> corpus.loose_to_compact[3]  # 2nd most popular token is mapped next
148 |         4
149 |         >>> first_non_special = corpus.n_specials
150 |         >>> corpus.keys_counts[first_non_special] # First normal token
151 |         4
152 |         """
153 |         # Return the loose keys and counts in descending count order
154 |         # so that the counts arrays is already in compact order
155 |         self.keys_loose, self.keys_counts, n_keys = self._loose_keys_ordered()
156 |         self.keys_compact = np.arange(n_keys).astype('int32')
157 |         self.loose_to_compact = {l: c for l, c in
158 |                                  zip(self.keys_loose, self.keys_compact)}
159 |         self.compact_to_loose = {c: l for l, c in
160 |                                  self.loose_to_compact.items()}
161 |         self.specials_to_compact = {s: self.loose_to_compact[i]
162 |                                     for s, i in self.specials.items()}
163 |         self.compact_to_special = {c: s for s, c in
164 |                                    self.specials_to_compact.items()}
165 |         self._finalized = True
166 | 
167 |     @property
168 |     def keys_frequency(self):
169 |         if self._keys_frequency is None:
170 |             f = self.keys_counts * 1.0 / np.sum(self.keys_counts)
171 |             self._keys_frequency = f
172 |         return self._keys_frequency
173 | 
174 |     def _check_finalized(self):
175 |         msg = "self.finalize() must be called before any other array ops"
176 |         assert self._finalized, msg
177 | 
178 |     def _check_unfinalized(self):
179 |         msg = "Cannot update word counts after self.finalize() "
180 |         msg += "has been called"
181 |         assert not self._finalized, msg
182 | 
183 |     def filter_count(self, words_compact, min_count=15, max_count=0,
184 |                      max_replacement=None, min_replacement=None):
185 |         """ Replace words occurring fewer than min_count times with the
186 |         out-of-vocabulary index.
186 | 
187 |         Arguments
188 |         ---------
189 |         words_compact: int array
190 |             Source array whose values will be replaced. This is assumed to
191 |             already be converted into a compact array with `to_compact`.
192 |         min_count : int
193 |             Replace words occurring less frequently than this count. This
194 |             defines the threshold for what counts as a very rare word.
195 |         max_count : int
196 |             Replace words occurring more frequently than this count. This
197 |             defines the threshold for very frequent words.
198 |         min_replacement : int, default is out_of_vocabulary
199 |             Replace words less than min_count with this.
200 |         max_replacement : int, default is out_of_vocabulary
201 |             Replace words greater than max_count with this.
202 | 
203 |         Examples
204 |         --------
205 |         >>> corpus = Corpus()
206 |         >>> # Make 1000 word indices with index < 100 and
207 |         >>> # update the word counts.
208 |         >>> word_indices = np.random.randint(100, size=1000)
209 |         >>> corpus.update_word_count(word_indices)
210 |         >>> corpus.finalize()  # any word indices above 99 will be filtered
211 |         >>> # Now create a new text, but with some indices above 100
212 |         >>> word_indices = np.random.randint(200, size=1000)
213 |         >>> word_indices.max() < 100
214 |         False
215 |         >>> # Remove words that have never appeared in the original corpus.
216 |         >>> filtered = corpus.filter_count(word_indices, min_count=1)
217 |         >>> filtered.max() < 100
218 |         True
219 |         >>> # We can also remove highly frequent words.
220 |         >>> filtered = corpus.filter_count(word_indices, max_count=2)
221 |         >>> len(np.unique(word_indices)) > len(np.unique(filtered))
222 |         True
223 |         """
224 |         self._check_finalized()
225 |         ret = words_compact.copy()
226 |         if min_replacement is None:
227 |             min_replacement = self.specials_to_compact['out_of_vocabulary']
228 |         if max_replacement is None:
229 |             max_replacement = self.specials_to_compact['out_of_vocabulary']
230 |         not_specials = np.ones(self.keys_counts.shape[0], dtype='bool')
231 |         not_specials[:self.n_specials] = False
232 |         if min_count:
233 |             # Find first index with count less than min_count
234 |             min_idx = np.argmax(not_specials & (self.keys_counts < min_count))
235 |             # Replace all indices greater than min_idx
236 |             ret[ret > min_idx] = min_replacement
237 |         if max_count:
238 |             # Find first index with count less than max_count
239 |             max_idx = np.argmax(not_specials & (self.keys_counts < max_count))
240 |             # Replace all indices less than max_idx
241 |             ret[ret < max_idx] = max_replacement
242 |         return ret
243 | 
244 |     def subsample_frequent(self, words_compact, threshold=1e-5):
245 |         """ Subsample the most frequent words. This aggressively
246 |         replaces words with frequencies higher than `threshold`. Words
247 |         are replaced with the out_of_vocabulary token.
248 | 
249 |         Words will be replaced with probability as a function of their
250 |         frequency in the training corpus:
251 | 
252 |         .. math::
253 |             p(w) = 1.0 - \sqrt{threshold \over f(w)} - {threshold \over f(w)}
254 | 
255 |         Arguments
256 |         ---------
257 |         words_compact: int array
258 |             The input array to subsample.
259 |         threshold: float in [0, 1]
260 |             Words with frequencies higher than this will be increasingly
261 |             subsampled.
262 | 
263 |         Examples
264 |         --------
265 |         >>> corpus = Corpus()
266 |         >>> word_indices = (np.random.power(5.0, size=1000) * 100).astype('i')
267 |         >>> corpus.update_word_count(word_indices)
268 |         >>> corpus.finalize()
269 |         >>> compact = corpus.to_compact(word_indices)
270 |         >>> sampled = corpus.subsample_frequent(compact, threshold=1e-2)
271 |         >>> skip = corpus.specials_to_compact['skip']
272 |         >>> np.sum(compact == skip)  # No skips in the compact tokens
273 |         0
274 |         >>> np.sum(sampled == skip) > 0  # Many skips in the sampled tokens
275 |         True
276 | 
277 |         .. [1] Distributed Representations of Words and Phrases and
278 |                their Compositionality. Mikolov, Tomas and Sutskever, Ilya
279 |                and Chen, Kai and Corrado, Greg S and Dean, Jeff
280 |                Advances in Neural Information Processing Systems 26
281 |         """
282 |         self._check_finalized()
283 |         freq = self.keys_frequency + 1e-10
284 |         pw = 1.0 - (np.sqrt(threshold / freq) + threshold / freq)
285 |         prob = fast_replace(words_compact, self.keys_compact, pw)
286 |         draw = np.random.uniform(size=prob.shape)
287 |         ret = words_compact.copy()
288 |         # If probability greater than draw, skip the word
289 |         ret[prob > draw] = self.specials_to_compact['skip']
290 |         return ret
291 | 
292 |     def to_compact(self, word_loose):
293 |         """ Convert a loose word index matrix to a compact array using
294 |         a fixed loose to dense mapping. Out of vocabulary word indices
295 |         will be replaced by the out of vocabulary index. The most common
296 |         index will be mapped to 0, the next most common to 1, and so on.
297 | 
298 |         Arguments
299 |         ---------
300 |         word_loose : int array
301 |             Input loose word array to be converted into a compact array.
302 | 
303 | 
304 |         Examples
305 |         --------
306 |         >>> corpus = Corpus()
307 |         >>> word_indices = np.random.randint(100, size=1000)
308 |         >>> n_words = len(np.unique(word_indices))
309 |         >>> corpus.update_word_count(word_indices)
310 |         >>> corpus.finalize()
311 |         >>> word_compact = corpus.to_compact(word_indices)
312 |         >>> # The most common word in the training set will be mapped to be
313 |         >>> # right after all the special tokens, so 2 in this case.
314 |         >>> np.argmax(np.bincount(word_compact)) == 2
315 |         True
316 |         >>> most_common = np.argmax(np.bincount(word_indices))
317 |         >>> corpus.loose_to_compact[most_common] == 2
318 |         True
319 |         >>> # Out of vocabulary indices will be mapped to 1
320 |         >>> word_indices = np.random.randint(150, size=1000)
321 |         >>> word_compact_oov = corpus.to_compact(word_indices)
322 |         >>> oov = corpus.specials_to_compact['out_of_vocabulary']
323 |         >>> oov
324 |         1
325 |         >>> oov in word_compact
326 |         False
327 |         >>> oov in word_compact_oov
328 |         True
329 |         """
330 |         self._check_finalized()
331 |         keys = self.keys_loose
332 |         reps = self.keys_compact
333 |         uniques = np.unique(word_loose)
334 |         # Find the out of vocab indices
335 |         oov = np.setdiff1d(uniques, keys, assume_unique=True)
336 |         oov_token = self.specials_to_compact['out_of_vocabulary']
337 |         keys = np.concatenate((keys, oov))
338 |         reps = np.concatenate((reps, np.zeros_like(oov) + oov_token))
339 |         compact = fast_replace(word_loose, keys, reps)
340 |         msg = "Error: all compact indices should be non-negative"
341 |         assert compact.min() >= 0, msg
342 |         return compact
343 | 
344 |     def to_loose(self, word_compact):
345 |         """ Convert a compacted array back into a loose array.
346 | 
347 |         Arguments
348 |         ---------
349 |         word_compact : int array
350 |             Input compacted word array to be converted into a loose array.
351 | 
352 | 
353 |         Examples
354 |         --------
355 |         >>> corpus = Corpus()
356 |         >>> word_indices = np.random.randint(100, size=1000)
357 |         >>> corpus.update_word_count(word_indices)
358 |         >>> corpus.finalize()
359 |         >>> word_compact = corpus.to_compact(word_indices)
360 |         >>> word_loose = corpus.to_loose(word_compact)
361 |         >>> np.all(word_loose == word_indices)
362 |         True
363 |         """
364 |         self._check_finalized()
365 |         uniques = np.unique(word_compact)
366 |         # Find the out of vocab indices
367 |         oov = np.setdiff1d(uniques, self.keys_compact, assume_unique=True)
368 |         msg = "Found keys in `word_compact` not present in the "
369 |         msg += "training corpus. Is this actually a compacted array?"
370 |         assert np.all(oov < 0), msg
371 |         loose = fast_replace(word_compact, self.keys_compact, self.keys_loose)
372 |         return loose
373 | 
374 |     def compact_to_flat(self, word_compact, *components):
375 |         """ Ravel a 2D compact array of documents (rows) and word
376 |         positions (columns) into a 1D array of words. Leave out special
377 |         tokens and ravel the component arrays in the same fashion.
378 | 
379 |         Arguments
380 |         ---------
381 |         word_compact : int array
382 |             Array of word indices in documents. Has shape (n_docs, max_length)
383 |         components : list of arrays
384 |             A list of arrays detailing per-document properties. Each array
385 |             must be n_docs long.
386 | 
387 |         Returns
388 |         -------
389 |         flat : int array
390 |             An array of all words unravelled into a 1D shape
391 |         components : list of arrays
392 |             Each array here is also unravelled into the same shape
393 | 
394 |         Examples
395 |         --------
396 |         >>> corpus = Corpus()
397 |         >>> word_indices = np.random.randint(100, size=1000)
398 |         >>> corpus.update_word_count(word_indices)
399 |         >>> corpus.finalize()
400 |         >>> doc_texts = np.arange(8).reshape((2, 4))
401 |         >>> doc_texts[:, -1] = -2  # Mark as skips
402 |         >>> doc_ids = np.arange(2)
403 |         >>> compact = corpus.to_compact(doc_texts)
404 |         >>> oov = corpus.specials_to_compact['out_of_vocabulary']
405 |         >>> compact[1, 3] = oov  # Mark the last word as OOV
406 |         >>> flat = corpus.compact_to_flat(compact)
407 |         >>> flat.shape[0] == 6  # 2 special tokens were dropped from 8 words
408 |         True
409 |         >>> flat[-1] == corpus.loose_to_compact[doc_texts[1, 2]]
410 |         True
411 |         >>> flat, (flat_id,) = corpus.compact_to_flat(compact, doc_ids)
412 |         >>> flat_id
413 |         array([0, 0, 0, 1, 1, 1])
414 |         """
415 |         self._check_finalized()
416 |         n_docs = word_compact.shape[0]
417 |         max_length = word_compact.shape[1]
418 |         idx = word_compact >= self.n_specials  # specials fill indices [0, n_specials)
419 |         components_raveled = []
420 |         msg = "Length of each component must match `word_compact` size"
421 |         for component in components:
422 |             raveled = np.tile(component[:, None], max_length)[idx]
423 |             components_raveled.append(raveled)
424 |             assert len(component) == n_docs, msg
425 |         if len(components_raveled) == 0:
426 |             return word_compact[idx]
427 |         else:
428 |             return word_compact[idx], components_raveled
429 | 
430 |     def word_list(self, vocab, max_compact_index=None, oov_token=''):
431 |         """ Translate compact keys back into string representations for a word.
432 | 
433 |         Arguments
434 |         ---------
435 |         vocab : dict
436 |             The vocab object has loose indices as keys and word strings as
437 |             values.
438 | 
439 |         max_compact_index : int
440 |             Only return words up to this index. If None, defaults to the number
441 |             of compact indices available
442 | 
443 |         oov_token : str
444 |             Returns this string if a compact index does not have a word in the
445 |             vocab dictionary provided.
446 | 
447 |         Returns
448 |         -------
449 |         word_list : list
450 |             A list of string representations corresponding to word indices
451 |             zero to `max_compact_index`
452 | 
453 |         Examples
454 |         --------
455 | 
456 |         >>> vocab = {0: 'But', 1: 'the', 2: 'night', 3: 'was', 4: 'warm'}
457 |         >>> word_indices = np.zeros(50).astype('int32')
458 |         >>> word_indices[:25] = 0  # 'But' appears 25 times
459 |         >>> word_indices[25:35] = 1  # 'the' appears 10 times
460 |         >>> word_indices[40:46] = 2  # 'night' appears 6 times
461 |         >>> word_indices[46:49] = 3  # 'was' appears 3 times
462 |         >>> word_indices[49:] = 4  # 'warm' appears once
463 |         >>> corpus = Corpus()
464 |         >>> corpus.update_word_count(word_indices)
465 |         >>> corpus.finalize()
466 |         >>> # Build a vocabulary of word indices
467 |         >>> corpus.word_list(vocab)
468 |         ['skip', 'out_of_vocabulary', 'But', 'the', 'night', 'was', 'warm']
469 |         """
470 |         # Translate the compact keys into string words
471 |         oov = self.specials['out_of_vocabulary']
472 |         words = []
473 |         if max_compact_index is None:
474 |             max_compact_index = self.keys_compact.shape[0]
475 |         index_to_special = {i: s for s, i in self.specials.items()}
476 |         for compact_index in range(max_compact_index):
477 |             loose_index = self.compact_to_loose.get(compact_index, oov)
478 |             special = index_to_special.get(loose_index, oov_token)
479 |             string = vocab.get(loose_index, special)
480 |             words.append(string)
481 |         return words
482 | 
483 |     def compact_word_vectors(self, vocab, filename=None, array=None,
484 |                              top=20000):
485 |         """ Retrieve pretrained word vectors for our vocabulary.
486 |         The returned word array has row indices corresponding to the
487 |         compact index of a word, and columns corresponding to the word
488 |         vector.
489 | 
490 |         Arguments
491 |         ---------
492 |         vocab : dict
493 |             Dictionary where keys are the loose index, and values are
494 |             the word string.
495 | 
496 |         filename : str
497 |             Path to word2vec-format binary word vectors, loaded via gensim.
498 | 
499 |         array : numpy float array, optional
500 |             If given, fill this preallocated array with vectors rather than
501 |             allocating a new one.
502 | 
503 |         Returns
504 |         -------
505 |         data : numpy float array
506 |             Array such that data[compact_index, :] = word_vector
507 | 
508 |         Examples
509 |         --------
510 |         >>> import numpy.linalg as nl
511 |         >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
512 |         >>> word_indices = np.zeros(50).astype('int32')
513 |         >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
514 |         >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
515 |         >>> word_indices[40:46] = 7  # 'cold' appears 6 times
516 |         >>> word_indices[46:] = 3  # 'hot' appears 4 times
517 |         >>> corpus = Corpus()
518 |         >>> corpus.update_word_count(word_indices)
519 |         >>> corpus.finalize()
520 |         >>> v, s, f = corpus.compact_word_vectors(vocab)
521 |         >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
522 |         >>> vocab[corpus.compact_to_loose[2]]
523 |         'shuttle'
524 |         >>> vocab[corpus.compact_to_loose[3]]
525 |         'astronomy'
526 |         >>> vocab[corpus.compact_to_loose[4]]
527 |         'cold'
528 |         >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
529 |         >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
530 |         >>> sim_shuttle_astro > sim_shuttle_cold
531 |         True
532 |         """
533 |         n_words = len(self.compact_to_loose)
534 |         from gensim.models.keyedvectors import Word2VecKeyedVectors
535 |         model = Word2VecKeyedVectors.load_word2vec_format(filename, binary=True)
536 |         n_dim = model.syn0.shape[1]
537 |         data = np.random.normal(size=(n_words, n_dim)).astype('float32')
538 |         data -= data.mean()
539 |         data += model.syn0.mean()
540 |         data /= data.std()
541 |         data *= model.syn0.std()
542 |         if array is not None:
543 |             data = array
544 |             n_words = data.shape[0]
545 |         keys_raw = list(model.vocab.keys())
546 |         keys = [s.encode('ascii', 'ignore') for s in keys_raw]
547 |         lens = [len(s) for s in model.vocab.keys()]
548 |         choices = np.array(keys, dtype='S')
549 |         lengths = np.array(lens, dtype='int32')
550 |         s, f = 0, 0
551 |         rep0 = lambda w: w
552 |         rep1 = lambda w: w.replace(' ', '_')
553 |         rep2 = lambda w: w.title().replace(' ', '_')
554 |         reps = [rep0, rep1, rep2]
555 |         for compact in np.arange(top):
556 |             loose = self.compact_to_loose.get(compact, None)
557 |             if loose is None:
558 |                 continue
559 |             word = vocab.get(loose, None)
560 |             if word is None:
561 |                 continue
562 |             word = word.strip()
563 |             vector = None
564 |             for rep in reps:
565 |                 clean = rep(word)
566 |                 if clean in model.vocab:
567 |                     vector = model[clean]
568 |                     break
569 |             if vector is None:
570 |                 try:
571 |                     idx = lengths >= len(word) - 3
572 |                     idx &= lengths <= len(word) + 3
573 |                     sel = choices[idx]
574 |                     d = damerau_levenshtein_distance_ndarray(word, sel)
575 |                     choice = np.array(keys_raw)[idx][np.argmin(d)]
576 |                     # choice = difflib.get_close_matches(word, choices)[0]
577 |                     vector = model[choice]
578 |                     print(compact, word, ' --> ', choice)
579 |                 except IndexError:
580 |                     pass
581 |             if vector is None:
582 |                 f += 1
583 |                 continue
584 |             s += 1
585 |             data[compact, :] = vector[:]
586 |         return data, s, f
587 | 
588 |     def compact_to_bow(self, word_compact, max_compact_index=None):
589 |         """ Given a 2D array of compact indices, return the bag of words
590 |         representation where the column is the word index, row is the document
591 |         index, and the value is the number of times that word appears in that
592 |         document.
593 | 
594 |         >>> import numpy.linalg as nl
595 |         >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
596 |         >>> word_indices = np.zeros(50).astype('int32')
597 |         >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
598 |         >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
599 |         >>> word_indices[40:46] = 7  # 'cold' appears 6 times
600 |         >>> word_indices[46:] = 3  # 'hot' appears 4 times
601 |         >>> corpus = Corpus()
602 |         >>> corpus.update_word_count(word_indices)
603 |         >>> corpus.finalize()
604 |         >>> v = corpus.compact_to_bow(word_indices)
605 |         >>> len(v)
606 |         20
607 |         >>> v[:6]
608 |         array([ 5,  0,  0,  4,  0, 10])
609 |         >>> v[19]
610 |         25
611 |         >>> v.sum()
612 |         50
613 |         >>> words = [[0, 0, 0, 3, 4], [1, 1, 1, 4, 5]]
614 |         >>> words = np.array(words)
615 |         >>> bow = corpus.compact_to_bow(words)
616 |         >>> bow.shape
617 |         (2, 6)
618 |         """
619 |         if max_compact_index is None:
620 |             max_compact_index = word_compact.max()
621 | 
622 |         def bincount(x):
623 |             return np.bincount(x, minlength=max_compact_index + 1)
624 |         axis = len(word_compact.shape) - 1
625 |         bow = np.apply_along_axis(bincount, axis, word_compact)
626 |         return bow
627 | 
628 |     def compact_to_coocurrence(self, word_compact, indices, window_size=10):
629 |         """ From an array of compact tokens and aligned array of document indices
630 |         compute (word, word, document) co-occurrences within a moving window.
631 | 
632 |         Arguments
633 |         ---------
634 |         word_compact: int array
635 |             Sequence of compact word tokens.
636 | 
637 |         indices: dict of int arrays
638 |             Maps a name (e.g. 'doc') to an array giving, for every token,
639 |             the index of the document it came from.
640 | 
641 |         window_size: int
642 |             Size of the moving window around each token within which
643 |             co-occurrences are counted.
644 | 
645 |         Returns
646 |         -------
647 |         counts : DataFrame
648 |             A DataFrame with two columns for the word indices A and B,
649 |             one extra column for each index passed in `indices`, and a final
650 |             column with the co-occurrence counts for that key.
651 | 
652 |         >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0])
653 |         >>> doc_idx = np.array([0, 0, 0, 0, 1, 1, 1, 1])
654 |         >>> corpus = Corpus()
655 |         >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx})
656 |         >>> counts.counts.sum()
657 |         24
658 |         >>> counts.query('doc == 0').counts.values
659 |         array([3, 3, 6])
660 |         >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0])
661 |         >>> doc_idx = np.array([0, 0, 0, 1, 1, 2, 2, 2])
662 |         >>> corpus = Corpus()
663 |         >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx})
664 |         >>> counts.counts.sum()
665 |         14
666 |         >>> counts.query('doc == 0').word_index_x.values
667 |         array([0, 1, 1])
668 |         >>> counts.query('doc == 0').word_index_y.values
669 |         array([1, 0, 1])
670 |         >>> counts.query('doc == 0').counts.values
671 |         array([2, 2, 2])
672 |         >>> counts.query('doc == 1').counts.values
673 |         array([1, 1])
674 |         """
675 |         tokens = pd.DataFrame(dict(word_index=word_compact)).reset_index()
676 |         for name, index in indices.items():
677 |             tokens[name] = index
678 |         a, b = tokens.copy(), tokens.copy()
679 |         mask = lambda x: np.prod([x[k + '_x'] == x[k + '_y']
680 |                                   for k in indices.keys()], axis=0)
681 |         group_keys = ['word_index_x', 'word_index_y', ]
682 |         group_keys += [k + '_x' for k in indices.keys()]
683 |         total = []
684 |         a['frame'] = a['index'].copy()
685 |         for frame in range(-window_size, window_size + 1):
686 |             if frame == 0:
687 |                 continue
688 |             b['frame'] = b['index'] + frame
689 |             matches = (a.merge(b, on='frame')
690 |                         .assign(same_doc=mask)
691 |                         .pipe(lambda df: df[df['same_doc'] == 1])
692 |                         .groupby(group_keys)['frame']
693 |                         .count()
694 |                         .reset_index())
695 |             total.append(matches)
696 |         counts = (pd.concat(total)
697 |                     .groupby(group_keys)['frame']
698 |                     .sum()
699 |                     .reset_index()
700 |                     .rename(columns={k + '_x': k for k in indices.keys()})
701 |                     .rename(columns=dict(frame='counts')))
702 |         return counts
703 | 
704 | 
705 | def fast_replace(data, keys, values, skip_checks=False):
706 |     """ Do a search-and-replace in array `data`.
707 | 
708 |     Arguments
709 |     ---------
710 |     data : int array
711 |         Array of integers
712 |     keys : int array
713 |         Array of keys inside of `data` to be replaced
714 |     values : int array
715 |         Array of values that replace the `keys` array
716 |     skip_checks : bool, default=False
717 |         Optionally skip sanity checking the input.
718 | 
719 |     Examples
720 |     --------
721 |     >>> fast_replace(np.arange(5), np.arange(5), np.arange(5)[::-1])
722 |     array([4, 3, 2, 1, 0])
723 |     """
724 |     assert np.allclose(keys.shape, values.shape)
725 |     if not skip_checks:
726 |         msg = "data has elements not in keys"
727 |         assert data.max() <= keys.max(), msg
728 |     sdx = np.argsort(keys)
729 |     keys, values = keys[sdx], values[sdx]
730 |     idx = np.digitize(data, keys, right=True)
731 |     new_data = values[idx]
732 |     return new_data
733 | 
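
Tying several of the methods above together, here is a short sketch on toy data (the sizes, placeholder vocabulary, and threshold are arbitrary). It goes from per-document loose indices to subsampled compact tokens, a flattened token array with aligned document ids, and the compact-index-to-word list.

    import numpy as np

    from lda2vec import Corpus

    # Toy data: 4 documents, 10 word positions each, loose indices under 50
    vocab = {i: 'word_%d' % i for i in range(50)}
    docs_loose = np.random.randint(50, size=(4, 10)).astype('int32')

    corpus = Corpus()
    corpus.update_word_count(docs_loose)
    corpus.finalize()

    compact = corpus.to_compact(docs_loose)
    compact = corpus.subsample_frequent(compact, threshold=1e-2)
    doc_ids = np.arange(4).astype('int32')
    # Drop the special tokens and keep an aligned document id for every word
    flat, (flat_doc_ids,) = corpus.compact_to_flat(compact, doc_ids)
    words = corpus.word_list(vocab)  # compact index -> string, specials first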


--------------------------------------------------------------------------------
/lda2vec/dirichlet_likelihood.py:
--------------------------------------------------------------------------------
 1 | import chainer.functions as F
 2 | from chainer import Variable
 3 | 
 4 | 
 5 | def dirichlet_likelihood(weights, alpha=None):
 6 |     """ Calculate the log likelihood of the observed topic proportions
 7 |     under a Dirichlet prior, returned with a flipped sign for use as a loss.
 8 | 
 9 |     Args:
10 |         weights (chainer.Variable): Unnormalized weight vector. The vector
11 |             will be passed through a softmax function that will map the input
12 |             onto a probability simplex.
13 |         alpha (float): The Dirichlet concentration parameter. Alpha
14 |             greater than 1.0 results in very dense topic weights such
15 |             that each document belongs to many topics. Alpha < 1.0 results
16 |             in sparser topic weights. The default is to set alpha to
17 |             1.0 / n_topics, effectively encoding the prior belief that a
18 |             document belongs to only a few topics at once.
19 | 
20 |     Returns:
21 |         ~chainer.Variable: Output loss variable.
22 |     """
23 |     if type(weights) is Variable:
24 |         n_topics = weights.data.shape[1]
25 |     else:
26 |         n_topics = weights.W.data.shape[1]
27 |     if alpha is None:
28 |         alpha = 1.0 / n_topics
29 |     if type(weights) is Variable:
30 |         log_proportions = F.log_softmax(weights)
31 |     else:
32 |         log_proportions = F.log_softmax(weights.W)
33 |     loss = (alpha - 1.0) * log_proportions
34 |     return -F.sum(loss)
35 | 
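
For illustration only (random weights, arbitrary sizes), the prior term can be evaluated on its own:

    import numpy as np
    from chainer import Variable

    from lda2vec import dirichlet_likelihood

    # Unnormalized topic weights for 5 documents over 3 topics
    weights = Variable(np.random.randn(5, 3).astype('float32'))
    prior_loss = dirichlet_likelihood(weights, alpha=1.0 / 3)
    print(prior_loss.data)  # scalar prior term that can be added to a training loss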


--------------------------------------------------------------------------------
/lda2vec/embed_mixture.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | import chainer
  4 | import chainer.links as L
  5 | import chainer.functions as F
  6 | from chainer import Variable
  7 | 
  8 | 
  9 | def _orthogonal_matrix(shape):
 10 |     # Stolen from blocks:
 11 |     # github.com/mila-udem/blocks/blob/master/blocks/initialization.py
 12 |     M1 = np.random.randn(shape[0], shape[0])
 13 |     M2 = np.random.randn(shape[1], shape[1])
 14 | 
 15 |     # QR decomposition of matrix with entries in N(0, 1) is random
 16 |     Q1, R1 = np.linalg.qr(M1)
 17 |     Q2, R2 = np.linalg.qr(M2)
 18 |     # Correct that NumPy doesn't force diagonal of R to be non-negative
 19 |     Q1 = Q1 * np.sign(np.diag(R1))
 20 |     Q2 = Q2 * np.sign(np.diag(R2))
 21 | 
 22 |     n_min = min(shape[0], shape[1])
 23 |     return np.dot(Q1[:, :n_min], Q2[:n_min, :])
 24 | 
 25 | 
 26 | class EmbedMixture(chainer.Chain):
 27 |     r""" A single document is encoded as a multinomial mixture of latent topics.
 28 |     The mixture is defined on a simplex, so that mixture weights always sum
 29 |     to 100%. The latent topic vectors resemble word vectors whose elements are
 30 |     defined over all real numbers.
 31 | 
 32 |     For example, a single document mix may be :math:`[0.9, 0.1]`, indicating
 33 |     that it is 90% in the first topic, 10% in the second. An example topic
 34 |     vector looks like :math:`[1.5e1, -1.3e0, +3.4e0, -0.2e0]`, which is
 35 |     largely uninterpretable until you measure the words most similar to this
 36 |     topic vector.
 37 | 
 38 |     A single document vector :math:`\vec{e}` is composed as weights :math:`c_j`
 39 |     over topic vectors :math:`\vec{T_j}`:
 40 | 
 41 |     .. math::
 42 | 
 43 |         \vec{e}=\Sigma_{j=0}^{j=n\_topics}c_j\vec{T_j}
 44 | 
 45 |     This is usually paired with regularization on the weights :math:`c_j`.
 46 |     If using a Dirichlet prior with low alpha, these weights will be sparse.
 47 | 
 48 |     Args:
 49 |         n_documents (int): Total number of documents
 50 |         n_topics (int): Number of topics per document
 51 |         n_dim (int): Number of dimensions per topic vector (should match word
 52 |             vector size)
 53 | 
 54 |     Attributes:
 55 |         weights : chainer.links.EmbedID
 56 |             Unnormalized topic weights (:math:`c_j`). To normalize these
 57 |             weights, use `F.softmax(weights)`.
 58 |         factors : chainer.links.Parameter
 59 |             Topic vector matrix (:math:`T_j`)
 60 | 
 61 |     .. seealso:: :func:`lda2vec.dirichlet_likelihood`
 62 |     """
 63 | 
 64 |     def __init__(self, n_documents, n_topics, n_dim, dropout_ratio=0.2,
 65 |                  temperature=1.0):
 66 |         self.n_documents = n_documents
 67 |         self.n_topics = n_topics
 68 |         self.n_dim = n_dim
 69 |         self.dropout_ratio = dropout_ratio
 70 |         factors = _orthogonal_matrix((n_topics, n_dim)).astype('float32')
 71 |         factors /= np.sqrt(n_topics + n_dim)
 72 |         super(EmbedMixture, self).__init__(
 73 |             weights=L.EmbedID(n_documents, n_topics),
 74 |             factors=L.Parameter(factors))
 75 |         self.temperature = temperature
 76 |         self.weights.W.data[...] /= np.sqrt(n_documents + n_topics)
 77 | 
 78 |     def __call__(self, doc_ids, update_only_docs=False):
 79 |         """ Given an array of document integer indices, returns a vector
 80 |         for each document. The vector is composed of topic weights projected
 81 |         onto topic vectors.
 82 | 
 83 |         Args:
 84 |             doc_ids : chainer.Variable
 85 |                 One-dimensional batch vectors of IDs
 86 | 
 87 |         Returns:
 88 |             doc_vector : chainer.Variable
 89 |                 Two-dimensional batch with one n_dim embedding per document.
 90 |         """
 91 |         # (batchsize, ) --> (batchsize, multinomial)
 92 |         proportions = self.proportions(doc_ids, softmax=True)
 93 |         # (batchsize, n_factors) * (n_factors, n_dim) --> (batchsize, n_dim)
 94 |         factors = F.dropout(self.factors(), ratio=self.dropout_ratio)
 95 |         if update_only_docs:
 96 |             factors.unchain_backward()
 97 |         w_sum = F.matmul(proportions, factors)
 98 |         return w_sum
 99 | 
100 |     def proportions(self, doc_ids, softmax=False):
101 |         """ Given an array of document indices, return a vector
102 |         for each document of just the unnormalized topic weights.
103 | 
104 |         Returns:
105 |             doc_weights : chainer.Variable
106 |                 Two dimensional topic weights of each document.
107 |         """
108 |         w = self.weights(doc_ids)
109 |         if softmax:
110 |             size = w.data.shape
111 |             mask = self.xp.random.random_integers(0, 1, size=size)
112 |             y = (F.softmax(w * self.temperature) *
113 |                  Variable(mask.astype('float32')))
114 |             norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y)
115 |             return y / (norm + 1e-7)
116 |         else:
117 |             return w
118 | 
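
A small usage sketch with the Chainer version this repository targets (all sizes are arbitrary): construct the mixture, then look up a batch of document vectors and their topic proportions.

    import numpy as np
    from chainer import Variable

    from lda2vec import EmbedMixture

    n_documents, n_topics, n_dim = 100, 10, 32
    mixture = EmbedMixture(n_documents, n_topics, n_dim)

    doc_ids = Variable(np.arange(8).astype('int32'))           # batch of document indices
    doc_vectors = mixture(doc_ids)                             # (8, n_dim) mixtures of topic vectors
    proportions = mixture.proportions(doc_ids, softmax=True)   # (8, n_topics) normalized weights

Note that `proportions(..., softmax=True)` also applies a random binary mask to the weights, so repeated calls give different results.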


--------------------------------------------------------------------------------
/lda2vec/fake_data.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from numpy.random import random_sample
 3 | 
 4 | 
 5 | def orthogonal_matrix(shape):
 6 |     # Stolen from blocks:
 7 |     # github.com/mila-udem/blocks/blob/master/blocks/initialization.py
 8 |     M1 = np.random.randn(shape[0], shape[0])
 9 |     M2 = np.random.randn(shape[1], shape[1])
10 | 
11 |     # QR decomposition of matrix with entries in N(0, 1) is random
12 |     Q1, R1 = np.linalg.qr(M1)
13 |     Q2, R2 = np.linalg.qr(M2)
14 |     # Correct that NumPy doesn't force diagonal of R to be non-negative
15 |     Q1 = Q1 * np.sign(np.diag(R1))
16 |     Q2 = Q2 * np.sign(np.diag(R2))
17 | 
18 |     n_min = min(shape[0], shape[1])
19 |     return np.dot(Q1[:, :n_min], Q2[:n_min, :])
20 | 
21 | 
22 | def softmax(w):
23 |     # https://gist.github.com/stober/1946926
24 |     w = np.array(w)
25 |     maxes = np.amax(w, axis=1)
26 |     maxes = maxes.reshape(maxes.shape[0], 1)
27 |     e = np.exp(w - maxes)
28 |     dist = e / np.sum(e, axis=1)[:, None]
29 |     return dist
30 | 
31 | 
32 | def sample(values, probabilities, size):
33 |     assert np.allclose(np.sum(probabilities, axis=-1), 1.0)
34 |     bins = np.add.accumulate(probabilities)
35 |     return values[np.digitize(random_sample(size), bins)]
36 | 
37 | 
38 | def fake_data(n_docs, n_words, n_sent_length, n_topics):
39 |     """ Generate latent topic vectors for words and documents
40 |     and then draw a sentence for each document. Each word in a
41 |     document is drawn with probability proportional to the dot product
42 |     of the document and word topic vectors, normalized with a softmax.
43 | 
44 |     Arguments
45 |     ---------
46 |     n_docs : int
47 |         Number of documents
48 |     n_words : int
49 |         Number of words in the vocabulary
50 |     n_sent_length : int
51 |         Number of words to draw for each document
52 |     n_topics : int
53 |         Number of topics that a single document can belong to.
54 | 
55 |     Returns
56 |     -------
57 |     sentences : int array
58 |         Array of word indices of shape (n_docs, n_sent_length).
59 | 
60 |     """
61 |     # These are log ratios for the doc & word topics
62 |     doc_topics = orthogonal_matrix([n_docs, n_topics])
63 |     wrd_topics = orthogonal_matrix([n_topics, n_words])
64 |     # Multiply log ratios and softmax to get prob of word in doc
65 |     doc_to_wrds = softmax(np.dot(doc_topics, wrd_topics))
66 |     # Now sample from doc_to_wrd to get realizations
67 |     indices = np.arange(n_words).astype('int32')
68 |     sentences = []
69 |     for doc_to_wrd in doc_to_wrds:
70 |         words = sample(indices, doc_to_wrd, n_sent_length)
71 |         sentences.append(words)
72 |     sentences = np.array(sentences)
73 |     return sentences.astype('int32')
74 | 
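
For example, a tiny synthetic corpus (all sizes arbitrary) can be drawn like this:

    from lda2vec.fake_data import fake_data

    # 20 documents, a 50-word vocabulary, 10 words per document, 5 topics
    sentences = fake_data(n_docs=20, n_words=50, n_sent_length=10, n_topics=5)
    print(sentences.shape)  # (20, 10) int32 word indices, one row per document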


--------------------------------------------------------------------------------
/lda2vec/negative_sampling.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import six
  3 | 
  4 | from chainer import cuda
  5 | from chainer import function
  6 | from chainer.utils import type_check
  7 | 
  8 | 
  9 | class NegativeSamplingFunction(function.Function):
 10 | 
 11 |     ignore_label = -1
 12 | 
 13 |     def __init__(self, sampler, sample_size):
 14 |         self.sampler = sampler
 15 |         self.sample_size = sample_size
 16 | 
 17 |     def _make_samples(self, t):
 18 |         if hasattr(self, 'samples'):
 19 |             return self.samples  # for testing
 20 | 
 21 |         size = int(t.shape[0])
 22 |         # first one is the positive, and others are sampled negatives
 23 |         samples = self.sampler((size, self.sample_size + 1))
 24 |         samples[:, 0] = t
 25 |         self.samples = samples
 26 | 
 27 |     def check_type_forward(self, in_types):
 28 |         type_check.expect(in_types.size() == 3)
 29 |         x_type, t_type, w_type = in_types
 30 | 
 31 |         type_check.expect(
 32 |             x_type.dtype == numpy.float32,
 33 |             x_type.ndim == 2,
 34 |             t_type.dtype == numpy.int32,
 35 |             t_type.ndim == 1,
 36 |             x_type.shape[0] == t_type.shape[0],
 37 |             w_type.dtype == numpy.float32,
 38 |             w_type.ndim == 2,
 39 |         )
 40 | 
 41 |     def forward_cpu(self, inputs):
 42 |         x, t, W = inputs
 43 |         self.ignore_mask = (t != self.ignore_label)
 44 |         self._make_samples(t)
 45 | 
 46 |         loss = numpy.float32(0.0)
 47 |         for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask],
 48 |                                     self.samples[self.ignore_mask])):
 49 |             w = W[k]
 50 |             f = w.dot(ix)
 51 |             f[0] *= -1  # positive sample
 52 |             loss += numpy.sum(numpy.logaddexp(f, 0))
 53 |         return numpy.array(loss, numpy.float32),
 54 | 
 55 |     def forward_gpu(self, inputs):
 56 |         x, t, W = inputs
 57 |         self.ignore_mask = (t != self.ignore_label)
 58 |         n_in = x.shape[1]
 59 |         self._make_samples(t)
 60 | 
 61 |         self.wx = cuda.elementwise(
 62 |             'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx',
 63 |             '''
 64 |             T f = 0;
 65 |             if (mask == 1){
 66 |                 for (int j = 0; j < c; ++j) {
 67 |                   int x_ind[] = {(i / m), j};
 68 |                   int w_ind[] = {k, j};
 69 |                   f += x[x_ind] * W[w_ind];
 70 |                 }
 71 |             }
 72 |             wx = f;
 73 |             ''',
 74 |             'negative_sampling_wx'
 75 |             )(W, x, self.ignore_mask[:, None], self.samples, n_in,
 76 |               self.sample_size + 1)
 77 | 
 78 |         y = cuda.elementwise(
 79 |             'T wx, int32 c, int32 m', 'T y',
 80 |             '''
 81 |             T f = wx;
 82 |             if (i % m == 0) {
 83 |               f = -f;
 84 |             }
 85 |             T loss;
 86 |             if (f < 0) {
 87 |               loss = __logf(1 + __expf(f));
 88 |             } else {
 89 |               loss = f + __logf(1 + __expf(-f));
 90 |             }
 91 |             y = loss;
 92 |             ''',
 93 |             'negative_sampling_forward'
 94 |         )(self.wx, n_in, self.sample_size + 1)
 95 |         # TODO(okuta): merge elementwise
 96 |         loss = cuda.cupy.sum(y * self.ignore_mask[:, None].astype('float32'))
 97 |         return loss,
 98 | 
 99 |     def backward_cpu(self, inputs, grads):
100 |         x, t, W = inputs
101 |         gloss, = grads
102 | 
103 |         gx = numpy.zeros_like(x)
104 |         gW = numpy.zeros_like(W)
105 |         for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask],
106 |                                     self.samples[self.ignore_mask])):
107 |             w = W[k]
108 |             f = w.dot(ix)
109 | 
110 |             # g == -y * gloss / (1 + exp(yf))
111 |             f[0] *= -1
112 |             g = gloss / (1 + numpy.exp(-f))
113 |             g[0] *= -1
114 | 
115 |             gx[i] = g.dot(w)
116 |             for ik, ig in six.moves.zip(k, g):
117 |                 gW[ik] += ig * ix
118 |         return gx, None, gW
119 | 
120 |     def backward_gpu(self, inputs, grads):
121 |         cupy = cuda.cupy
122 |         x, t, W = inputs
123 |         gloss, = grads
124 | 
125 |         n_in = x.shape[1]
126 |         g = cuda.elementwise(
127 |             'T wx, raw T gloss, int32 m', 'T g',
128 |             '''
129 |             T y;
130 |             if (i % m == 0) {
131 |               y = 1;
132 |             } else {
133 |               y = -1;
134 |             }
135 | 
136 |             g = -y * gloss[0] / (1.0f + __expf(wx * y));
137 |             ''',
138 |             'negative_sampling_calculate_g'
139 |         )(self.wx, gloss, self.sample_size + 1)
140 |         gx = cupy.zeros_like(x)
141 |         cuda.elementwise(
142 |             'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx',
143 |             '''
144 |             int d = i / c;
145 |             T w = 0;
146 |             if (mask == 1){
147 |                 for (int j = 0; j < m; ++j) {
148 |                   w += g[d * m + j] * W[k[d * m + j] * c + i % c];
149 |                 }
150 |             }
151 |             gx = w;
152 |             ''',
153 |             'negative_sampling_calculate_gx'
154 |             )(g, W, self.ignore_mask[:, None], self.samples, n_in,
155 |               self.sample_size + 1, gx)
156 |         gW = cupy.zeros_like(W)
157 |         cuda.elementwise(
158 |             'T g, raw T x, S k, bool mask, int32 c, int32 m',
159 |             'raw T gW',
160 |             '''
161 |             T gi = g;
162 |             if (mask == 1) {
163 |                 for (int j = 0; j < c; ++j) {
164 |                   atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
165 |                 }
166 |             }
167 |             ''',
168 |             'negative_sampling_calculate_gw'
169 |             )(g, x, self.samples, self.ignore_mask[:, None], n_in,
170 |               self.sample_size + 1, gW)
171 |         return gx, None, gW
172 | 
173 | 
174 | def negative_sampling(x, t, W, sampler, sample_size):
175 |     """Negative sampling loss function.
176 | 
177 |     In natural language processing, especially language modeling, the number of
178 |     words in a vocabulary can be very large.
179 |     Therefore, you need to spend a lot of time calculating the gradient of the
180 |     embedding matrix.
181 | 
182 |     By using the negative sampling trick you only need to calculate the
183 |     gradient for a few sampled negative examples.
184 | 
185 |     The objective function is below:
186 | 
187 |     .. math::
188 | 
189 |        f(x, p) = \\log \\sigma(x^\\top w_p) + \\
190 |        k E_{i \\sim P(i)}[\\log \\sigma(- x^\\top w_i)],
191 | 
192 |     where :math:`\sigma(\cdot)` is a sigmoid function, :math:`w_i` is the
193 |     weight vector for the word :math:`i`, and :math:`p` is a positive example.
194 |     It is approximated with :math:`k` examples :math:`N` sampled from
195 |     probability :math:`P(i)`, like this:
196 | 
197 |     .. math::
198 | 
199 |        f(x, p) \\approx \\log \\sigma(x^\\top w_p) + \\
200 |        \\sum_{n \\in N} \\log \\sigma(-x^\\top w_n).
201 | 
202 |     Each sample of :math:`N` is drawn from the word distribution :math:`P(w)`.
203 |     This is calculated as :math:`P(w) = \\frac{1}{Z} c(w)^\\alpha`, where
204 |     :math:`c(w)` is the unigram count of the word :math:`w`, :math:`\\alpha` is
205 |     a hyper-parameter, and :math:`Z` is the normalization constant.
206 | 
207 |     Args:
208 |         x (~chainer.Variable): Batch of input vectors.
209 |         t (~chainer.Variable): Vector of groundtruth labels.
210 |         W (~chainer.Variable): Weight matrix.
211 |         sampler (function): Sampling function. It takes a shape and returns an
212 |             integer array of the shape. Each element of this array is a sample
213 |             from the word distribution. A :class:`~chainer.utils.WalkerAlias`
214 |             object built with the power distribution of word frequency is
215 |             recommended.
216 |         sample_size (int): Number of samples.
217 | 
218 |     See: `Distributed Representations of Words and Phrases and their\
219 |          Compositionality <https://arxiv.org/abs/1310.4546>`_
220 | 
221 |     .. seealso:: :class:`~chainer.links.NegativeSampling`.
222 | 
223 |     """
224 |     return NegativeSamplingFunction(sampler, sample_size)(x, t, W)
225 | 
226 | 
227 | # Monkey-patch the chainer code to replace the negative sampling
228 | # with the one used here
229 | import chainer.links as L
230 | import chainer.functions as F
231 | negative_sampling.patched = True
232 | L.NegativeSampling.negative_sampling = negative_sampling
233 | F.negative_sampling = negative_sampling
234 | 
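
A sketch of calling the loss directly with Chainer's WalkerAlias sampler built from unigram counts; every size and count below is made up, and the snippet assumes the Chainer version this repository targets.

    import numpy as np
    from chainer import Variable
    from chainer.utils import WalkerAlias

    from lda2vec.negative_sampling import negative_sampling

    n_vocab, n_dim, batchsize = 1000, 64, 8
    counts = np.random.randint(1, 100, size=n_vocab)
    probs = counts ** 0.75 / (counts ** 0.75).sum()  # power-law word distribution
    sampler = WalkerAlias(probs).sample              # callable: shape -> sampled word ids

    x = Variable(np.random.randn(batchsize, n_dim).astype('float32'))       # context vectors
    t = Variable(np.random.randint(0, n_vocab, batchsize).astype('int32'))  # target word ids
    W = Variable(np.random.randn(n_vocab, n_dim).astype('float32'))         # output word matrix
    loss = negative_sampling(x, t, W, sampler, sample_size=5)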


--------------------------------------------------------------------------------
/lda2vec/preprocess.py:
--------------------------------------------------------------------------------
  1 | from spacy.en import English
  2 | from spacy.attrs import LOWER, LIKE_URL, LIKE_EMAIL
  3 | 
  4 | import numpy as np
  5 | 
  6 | 
  7 | def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
  8 |              **kwargs):
  9 |     """ Uses spaCy to quickly tokenize text and return an array
 10 |     of indices.
 11 | 
 12 |     This method stores a global NLP object in memory, and takes
 13 |     up to a minute to run the first time. Later calls will have the
 14 |     tokenizer in memory.
 15 | 
 16 |     Parameters
 17 |     ----------
 18 |     texts : list of unicode strings
 19 |         These are the input documents. There can be multiple sentences per
 20 |         item in the list.
 21 |     max_length : int
 22 |         This is the maximum number of words per document. If the document is
 23 |         shorter than this number it will be padded to this length.
 24 |     skip : int, optional
 25 |         Short documents will be padded with this variable up until max_length.
 26 |     attr : int, from spacy.attrs
 27 |         What to transform the token to. Choice must be in spacy.attrs;
 28 |         common choices are LOWER and LEMMA.
 29 |     merge : bool, optional
 30 |         Merge noun phrases into a single token. Useful for turning 'New York'
 31 |         into a single token.
 32 |     nlp : spaCy NLP object, optional
 33 |         A spaCy NLP object. Useful for not reinstantiating the object multiple
 34 |         times.
 35 |     kwargs : dict, optional
 36 |         Any further argument will be sent to the spaCy tokenizer. For extra
 37 |         speed consider setting tag=False, parse=False, entity=False, or
 38 |         n_threads=8.
 39 | 
 40 |     Returns
 41 |     -------
 42 |     arr : 2D array of ints
 43 |         Has shape (len(texts), max_length). Each value represents
 44 |         the word index.
 45 |     vocab : dict
 46 |         Keys are the word indices, and values are the strings. The pad index
 47 |         gets mapped to the empty string.
 48 | 
 49 |     >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
 50 |     >>> arr, vocab = tokenize(sents, 10, merge=True)
 51 |     >>> arr.shape[0]
 52 |     2
 53 |     >>> arr.shape[1]
 54 |     10
 55 |     >>> w2i = {w: i for i, w in vocab.items()}
 56 |     >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
 57 |     True
 58 |     >>> arr[0, 1] == w2i[u'you']
 59 |     True
 60 |     >>> arr[0, -1]  # last word in 0th document is a pad word
 61 |     -2
 62 |     >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
 63 |     True
 64 |     >>> arr[1, 1]  # The URL token is thrown out
 65 |     -2
 66 |     """
 67 |     if nlp is None:
 68 |         nlp = English()
 69 |     data = np.zeros((len(texts), max_length), dtype='int32')
 70 |     data[:] = skip
 71 |     bad_deps = ('amod', 'compound')
 72 |     for row, doc in enumerate(nlp.pipe(texts, **kwargs)):
 73 |         if merge:
 74 |             # from the spaCy blog, an example on how to merge
 75 |             # noun phrases into single tokens
 76 |             for phrase in doc.noun_chunks:
 77 |                 # Only keep adjectives and nouns, e.g. "good ideas"
 78 |                 while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
 79 |                     phrase = phrase[1:]
 80 |                 if len(phrase) > 1:
 81 |                     # Merge the tokens, e.g. good_ideas
 82 |                     phrase.merge(phrase.root.tag_, phrase.text,
 83 |                                  phrase.root.ent_type_)
 84 |                 # Iterate over named entities
 85 |                 for ent in doc.ents:
 86 |                     if len(ent) > 1:
 87 |                         # Merge them into single tokens
 88 |                         ent.merge(ent.root.tag_, ent.text, ent.label_)
 89 |         dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
 90 |         if len(dat) > 0:
 91 |             dat = dat.astype('int32')
 92 |             msg = "Negative indices reserved for special tokens"
 93 |             assert dat.min() >= 0, msg
 94 |             # Replace email and URL tokens
 95 |             idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
 96 |             dat[idx] = skip
 97 |             length = min(len(dat), max_length)
 98 |             data[row, :length] = dat[:length, 0].ravel()
 99 |     uniques = np.unique(data)
100 |     vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
101 |     vocab[skip] = ''
102 |     return data, vocab
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     import doctest
107 |     doctest.testmod()
108 | 
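
For reference, here is a small sketch (illustrative arrays only, not part of the module) of turning the padded index array returned by tokenize() into the doc_lengths and term_frequency arrays that prepare_topics() in lda2vec/topics.py later expects.

import numpy as np

skip = -2
# Stand-in for the (n_docs, max_length) output of tokenize()
arr = np.array([[3, 7, 3, skip, skip],
                [5, 3, skip, skip, skip]], dtype='int32')

mask = arr != skip                    # ignore pad tokens
doc_lengths = mask.sum(axis=1)        # number of real words per document
tokens = arr[mask]
# Re-index the spaCy vocabulary ids to a compact 0..n_words-1 range
uniques, compact = np.unique(tokens, return_inverse=True)
term_frequency = np.bincount(compact)  # overall count of each token
print(doc_lengths, term_frequency)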


--------------------------------------------------------------------------------
/lda2vec/topics.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import requests
  3 | import multiprocessing
  4 | 
  5 | 
  6 | def _softmax(x):
  7 |     e_x = np.exp(x - np.max(x))
  8 |     out = e_x / e_x.sum()
  9 |     return out
 10 | 
 11 | 
 12 | def _softmax_2d(x):
 13 |     y = x - x.max(axis=1, keepdims=True)
 14 |     np.exp(y, out=y)
 15 |     y /= y.sum(axis=1, keepdims=True)
 16 |     return y
 17 | 
 18 | 
 19 | def prob_words(context, vocab, temperature=1.0):
 20 |     """ This calculates a softmax over the vocabulary as a function
 21 |     of the dot product of context and word.
 22 |     """
 23 |     dot = np.dot(vocab, context)
 24 |     prob = _softmax(dot / temperature)
 25 |     return prob
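
As a quick, illustrative check of the temperature argument (random toy vectors, not module code): lower temperatures concentrate the softmax on fewer words, while higher temperatures flatten it.

import numpy as np

rng = np.random.RandomState(0)
vocab_vectors = rng.randn(50, 8)   # toy word vectors
context = rng.randn(8)             # toy context vector
sharp = prob_words(context, vocab_vectors, temperature=0.5)
flat = prob_words(context, vocab_vectors, temperature=10.0)
assert sharp.max() > flat.max()    # low temperature concentrates probability mass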
 26 | 
 27 | 
 28 | def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
 29 |                    doc_lengths=None, term_frequency=None, normalize=False):
 30 |     """ Collects a dictionary of word, document and topic distributions.
 31 | 
 32 |     Arguments
 33 |     ---------
 34 |     weights : float array
 35 |         This must be an array of unnormalized log-odds of document-to-topic
 36 |         weights. Shape should be [n_documents, n_topics]
 37 |     factors : float array
 38 |         Should be an array of topic vectors. These topic vectors live in the
 39 |         same space as word vectors and will be used to find the most similar
 40 |         words to each topic. Shape should be [n_topics, n_dim].
 41 |     word_vectors : float array
 42 |         This must be a matrix of word vectors. Should be of shape
 43 |         [n_words, n_dim]
 44 |     vocab : list of str
 45 |         These must be the strings for words corresponding to
 46 |         indices [0, n_words).
 47 |     temperature : float
 48 |         Used to calculate the probability of a word. Higher
 49 |         temperatures make rare words more likely.
 50 |     doc_lengths : int array
 51 |         An array indicating the number of words in the nth document.
 52 |         Must be of shape [n_documents]. Required by pyLDAvis.
 53 |     term_frequency : int array
 54 |         An array indicating the overall number of times each token appears
 55 |         in the corpus. Must be of shape [n_words]. Required by pyLDAvis.
 56 |     normalize : bool
 57 |         If True, the word vectors are L2-normalized before computing word
 58 |         probabilities.
 56 | 
 57 |     Returns
 58 |     -------
 59 |     data : dict
 60 |         This dictionary is readily consumed by pyLDAvis for topic
 61 |         visualization.
 62 |     """
 63 |     # Map each factor vector to a word
 64 |     topic_to_word = []
 65 |     msg = "Vocabulary size did not match size of word vectors"
 66 |     assert len(vocab) == word_vectors.shape[0], msg
 67 |     if normalize:
 68 |         word_vectors /= np.linalg.norm(word_vectors, axis=1)[:, None]
 69 |     # factors = factors / np.linalg.norm(factors, axis=1)[:, None]
 70 |     for factor_vector in factors:
 71 |         factor_to_word = prob_words(factor_vector, word_vectors,
 72 |                                     temperature=temperature)
 73 |         topic_to_word.append(np.ravel(factor_to_word))
 74 |     topic_to_word = np.array(topic_to_word)
 75 |     msg = "Not all rows in topic_to_word sum to 1"
 76 |     assert np.allclose(np.sum(topic_to_word, axis=1), 1), msg
 77 |     # Collect document-to-topic distributions, e.g. theta
 78 |     doc_to_topic = _softmax_2d(weights)
 79 |     msg = "Not all rows in doc_to_topic sum to 1"
 80 |     assert np.allclose(np.sum(doc_to_topic, axis=1), 1), msg
 81 |     data = {'topic_term_dists': topic_to_word,
 82 |             'doc_topic_dists': doc_to_topic,
 83 |             'doc_lengths': doc_lengths,
 84 |             'vocab': vocab,
 85 |             'term_frequency': term_frequency}
 86 |     return data
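
The returned dictionary is keyed to match pyLDAvis' prepare() keyword arguments. A hedged usage sketch with toy random inputs (illustrative sizes and names only) could look like this; the pyLDAvis lines are left commented since that step needs the optional dependency installed.

import numpy as np
# from lda2vec import topics  # when using the installed package
# import pyLDAvis             # only needed for the visualization step

n_docs, n_topics, n_words, n_dim = 4, 2, 6, 8
weights = np.random.randn(n_docs, n_topics)      # unnormalized doc-topic log-odds
factors = np.random.randn(n_topics, n_dim)       # topic vectors
word_vectors = np.random.randn(n_words, n_dim)   # word vectors
vocab = ['w%i' % i for i in range(n_words)]
doc_lengths = np.random.randint(10, 100, n_docs)
term_frequency = np.random.randint(1, 50, n_words)

data = prepare_topics(weights, factors, word_vectors, vocab,
                      doc_lengths=doc_lengths, term_frequency=term_frequency)
# vis = pyLDAvis.prepare(**data)   # keys match pyLDAvis.prepare's signature
# pyLDAvis.save_html(vis, 'topics.html')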
 87 | 
 88 | 
 89 | def print_top_words_per_topic(data, top_n=10, do_print=True):
 90 |     """ Given a pyLDAvis data array, print out the top words in every topic.
 91 | 
 92 |     Arguments
 93 |     ---------
 94 |     data : dict
 95 |         A dict object that summarizes topic data and has been made using
 96 |         `prepare_topics`.
 97 |     top_n : int
 98 |         Number of top words to report for each topic.
 99 |     do_print : bool
100 |         If True, print the top words for every topic.
101 | 
102 |     Returns
103 |     -------
104 |     lists : list of list of str
105 |         The top words for each topic.
106 |     """
107 |     lists = []
100 |     for j, topic_to_word in enumerate(data['topic_term_dists']):
101 |         top = np.argsort(topic_to_word)[::-1][:top_n]
102 |         prefix = "Top words in topic %i " % j
103 |         top_words = [data['vocab'][i].strip().replace(' ', '_') for i in top]
104 |         msg = ' '.join(top_words)
105 |         if do_print:
106 |             print(prefix, msg)
107 |         lists.append(top_words)
108 |     return lists
109 | 
110 | 
111 | def get_request(url):
112 |     for _ in range(5):
113 |         try:
114 |             return float(requests.get(url, timeout=30).text)
115 |         except (requests.RequestException, ValueError):
116 |             pass
117 |     return None
118 | 
119 | 
120 | def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci',
121 |                                      'umass']):
122 |     """ Requests the topic coherence from AKSW Palmetto
123 | 
124 |     Arguments
125 |     ---------
126 |     lists : list of lists
127 |         A list of lists with one list of top words for each topic.
128 | 
129 |     >>> topic_words = [['cake', 'apple', 'banana', 'cherry', 'chocolate']]
130 |     >>> topic_coherence(topic_words, services=['cv'])
131 |     {(0, 'cv'): 0.5678879445677241}
132 |     """
133 |     url = u'http://palmetto.aksw.org/palmetto-webapp/service/{}?words={}'
134 |     reqs = [url.format(s, '%20'.join(top[:10])) for s in services for top in lists]
135 |     pool = multiprocessing.Pool()
136 |     coherences = pool.map(get_request, reqs)
137 |     pool.close()
138 |     pool.terminate()
139 |     pool.join()
140 |     del pool
141 |     args = [(j, s, top) for s in services for j, top in enumerate(lists)]
142 |     ans = {}
143 |     for ((j, s, t), tc) in zip(args, coherences):
144 |         ans[(j, s)] = tc
145 |     return ans
146 | 
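
Chaining the coherence helper above might look like the sketch below. It requires network access to the Palmetto service, and the topic word lists here are made up for illustration.

# Values come back as floats, or None when the service cannot be reached.
topic_words = [['cake', 'apple', 'banana', 'cherry', 'chocolate'],
               ['data', 'model', 'topic', 'vector', 'word']]
scores = topic_coherence(topic_words, services=['cv'])
for (topic_id, service), value in sorted(scores.items()):
    print("topic %i  %s coherence: %s" % (topic_id, service, value))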


--------------------------------------------------------------------------------
/lda2vec/tracking.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.linear_model import LinearRegression
 3 | 
 4 | 
 5 | class Tracking:
 6 |     def __init__(self, n=5000):
 7 |         """ The tracking class keeps a most-recently-used cache of values.
 8 | 
 9 |         Parameters
10 |         ----------
11 |         n : int
12 |             Number of items to keep.
13 |         """
14 |         self.n = n
15 |         # Keep state per instance so that separate trackers
16 |         # do not share a single class-level cache.
17 |         self.cache = {}
18 |         self.calls = 0
19 |         self.slope = 0.0
19 | 
20 |     def add(self, key, item):
21 |         """ Add an item with a particular to the cache.
22 | 
23 |         >>> tracker = Tracking()
24 |         >>> tracker.add('log_perplexity', 55.6)
25 |         >>> tracker.cache['log_perplexity']
26 |         [55.6]
27 |         >>> tracker.add('log_perplexity', 55.2)
28 |         >>> tracker.add('loss', -12.1)
29 |         >>> tracker.cache['log_perplexity']
30 |         [55.6, 55.2]
31 |         >>> tracker.cache['loss']
32 |         [-12.1]
33 |         """
34 |         if key not in self.cache:
35 |             self.cache[key] = []
36 |         self.cache[key].append(item)
37 |         if len(self.cache[key]) > self.n:
38 |             self.cache[key] = self.cache[key][-self.n:]
39 | 
40 |     def stats(self, key):
41 |         """ Get the statistics for items with a particular key
42 | 
43 |         >>> tracker = Tracking()
44 |         >>> tracker.add('log_perplexity', 55.6)
45 |         >>> tracker.add('log_perplexity', 55.2)
46 |         >>> tracker.stats('log_perplexity')
47 |         (55.400000000000006, 0.19999999999999929, 0.0)
48 |         """
49 |         data = self.cache[key]
50 |         mean = np.mean(data)
51 |         std = np.std(data)
52 |         slope = self.slope
53 |         if self.calls % 100 == 0:
54 |             lr = LinearRegression()
55 |             x = np.arange(len(data)).astype('float32')
56 |             lr.fit(x[:, None], np.array(data))
57 |             self.slope = lr.coef_[0]
58 |         self.calls += 1
59 |         return mean, std, slope
60 | 
61 | if __name__ == "__main__":
62 |     import doctest
63 |     doctest.testmod()
64 | 
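
A sketch of how this tracker might sit inside a training loop; the loop and loss values below are illustrative only.

tracker = Tracking(n=1000)
for iteration in range(500):
    loss = 1.0 / (iteration + 1)          # stand-in for a real training loss
    tracker.add('loss', loss)
    if iteration % 100 == 0:
        mean, std, slope = tracker.stats('loss')
        print("iter %i  loss %.4f +/- %.4f  trend %.2e"
              % (iteration, mean, std, slope))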


--------------------------------------------------------------------------------
/lda2vec/utils.py:
--------------------------------------------------------------------------------
 1 | from chainer import Variable
 2 | import random
 3 | import numpy as np
 4 | 
 5 | 
 6 | def move(xp, *args):
 7 |     for arg in args:
 8 |         if 'float' in str(arg.dtype):
 9 |             yield Variable(xp.asarray(arg, dtype='float32'))
10 |         else:
11 |             assert 'int' in str(arg.dtype)
12 |             yield Variable(xp.asarray(arg, dtype='int32'))
13 | 
14 | 
15 | def most_similar(embeddings, word_index):
16 |     # Dot-product similarity between the query word and every embedding row
17 |     input_vector = embeddings.W.data[word_index]
18 |     similarities = embeddings.W.data.dot(input_vector)
19 |     return similarities
19 | 
20 | 
21 | def chunks(n, *args):
22 |     """Yield successive n-sized chunks from l."""
23 |     # From stackoverflow question 312443
24 |     keypoints = []
25 |     for i in range(0, len(args[0]), n):
26 |         keypoints.append((i, i + n))
27 |     random.shuffle(keypoints)
28 |     for a, b in keypoints:
29 |         yield [arg[a: b] for arg in args]
30 | 
31 | 
32 | class MovingAverage():
33 |     def __init__(self, lastn=100):
34 |         self.points = np.array([])
35 |         self.lastn = lastn
36 | 
37 |     def add(self, x):
38 |         self.points = np.append(self.points, x)
39 | 
40 |     def mean(self):
41 |         return np.mean(self.points[-self.lastn:])
42 | 
43 |     def std(self):
44 |         return np.std(self.points[-self.lastn:])
45 | 
46 |     def get_stats(self):
47 |         return (np.mean(self.points[-self.lastn:]),
48 |                 np.std(self.points[-self.lastn:]))
49 | 
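
A short sketch combining chunks() and MovingAverage() in a minibatch loop; the arrays and sizes are made up for illustration.

import numpy as np

doc_ids = np.arange(10, dtype='int32')
words = np.arange(40, dtype='int32').reshape(10, 4)

avg = MovingAverage(lastn=5)
for batch_docs, batch_words in chunks(3, doc_ids, words):
    # Both arrays are sliced with the same (shuffled) chunk boundaries
    avg.add(float(batch_words.mean()))
print(avg.get_stats())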


--------------------------------------------------------------------------------
/lda2vec_network_publish_text.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/lda2vec_network_publish_text.gif


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --doctest-modules
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Python >= 3.5.2 is required (declared via python_requires in setup.py)
2 | chainer>=5.1.0
3 | numpy>=1.16.0
4 | spacy>=1.9.0
5 | scipy>=1.0.0
6 | pyxDamerauLevenshtein==1.5.2
7 | pyLDAvis==2.1.2
8 | scikit-learn
9 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | from setuptools import find_packages
 3 | import os
 4 | 
 5 | with open('requirements.txt') as f:
 6 |     install_requires = [line.strip() for line in f
 7 |                         if line.strip() and not line.lstrip().startswith('#')]
 7 | 
 8 | # If building on RTD, don't install anything
 9 | if os.environ.get('READTHEDOCS', None) == 'True':
10 |     install_requires = []
11 | 
12 | kw = dict(
13 |     name='lda2vec',
14 |     version='0.1',
15 |     description='Tools for interpreting natural language',
16 |     author='Christopher E Moody',
17 |     author_email='chrisemoody@gmail.com',
18 |     install_requires=install_requires,
19 |     python_requires='>=3.5.2',
20 |     packages=find_packages(),
21 |     url='')
21 | 
22 | setup(**kw)
23 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_dirichlet_likelihood.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import chainer.links as L
 3 | from chainer import Variable
 4 | 
 5 | from lda2vec import dirichlet_likelihood
 6 | 
 7 | 
 8 | def test_concentration():
 9 |     """ Test that alpha > 1.0 on a dense vector has a higher likelihood
10 |     than alpha < 1.0 on a dense vector, and test that a sparse vector
11 |     has the opposite character. """
12 | 
13 |     dense = np.random.randn(5, 10).astype('float32')
14 |     sparse = np.random.randn(5, 10).astype('float32')
15 |     sparse[:, 1:] /= 1e5
16 |     weights = Variable(dense)
17 |     dhl_dense_10 = dirichlet_likelihood(weights, alpha=10.0).data
18 |     dhl_dense_01 = dirichlet_likelihood(weights, alpha=0.1).data
19 |     weights = Variable(sparse)
20 |     dhl_sparse_10 = dirichlet_likelihood(weights, alpha=10.0).data
21 |     dhl_sparse_01 = dirichlet_likelihood(weights, alpha=0.1).data
22 | 
23 |     msg = "Sparse vector has higher likelihood than dense with alpha=0.1"
24 |     assert dhl_sparse_01 > dhl_dense_01, msg
25 |     msg = "Dense vector has higher likelihood than sparse with alpha=10.0"
26 |     assert dhl_dense_10 > dhl_sparse_10, msg
27 | 
28 | 
29 | def test_embed():
30 |     """ Test that embedding is treated like a Variable"""
31 | 
32 |     embed_dense = L.EmbedID(5, 10)
33 |     embed_sparse = L.EmbedID(5, 10)
34 |     embed_dense.W.data[:] = np.random.randn(5, 10).astype('float32')
35 |     embed_sparse.W.data[:] = np.random.randn(5, 10).astype('float32')
36 |     embed_sparse.W.data[:, 1:] /= 1e5
37 |     dhl_dense_01 = dirichlet_likelihood(embed_dense, alpha=0.1).data
38 |     dhl_sparse_01 = dirichlet_likelihood(embed_sparse, alpha=0.1).data
39 | 
40 |     msg = "Sparse vector has higher likelihood than dense with alpha=0.1"
41 |     assert dhl_sparse_01 > dhl_dense_01, msg
42 | 


--------------------------------------------------------------------------------
/tests/test_embed_mixture.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from chainer import Variable
 3 | 
 4 | from lda2vec import EmbedMixture
 5 | 
 6 | 
 7 | def softmax(v):
 8 |     return np.exp(v) / np.sum(np.exp(v))
 9 | 
10 | 
11 | def test_embed_mixture():
12 |     """ Manually test the projection logic between topic weights and vectors"""
13 |     # Ten documents, two topics, five hidden dimensions
14 |     em = EmbedMixture(10, 2, 5, dropout_ratio=0.0)
15 |     doc_ids = Variable(np.arange(1, dtype='int32'))
16 |     doc_vector = em(doc_ids).data
17 |     # weights -- (n_topics)
18 |     weights = softmax(em.weights.W.data[0, :])
19 |     un_weights = softmax(em.unnormalized_weights(doc_ids).data[0, :])
20 |     # (n_hidden) = (n_topics) . (n_topics, n_hidden)
21 |     test = np.sum(weights * em.factors.W.data.T, axis=1)
22 |     assert np.allclose(doc_vector, test)
23 |     assert np.allclose(un_weights, weights)
24 | 


--------------------------------------------------------------------------------
/tests/test_fake_data.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import fake_data
 2 | from chainer import Variable
 3 | from chainer.functions import cross_covariance
 4 | import numpy as np
 5 | 
 6 | 
 7 | def test_orthogonal_matrix():
 8 |     msg = "Orthogonal matrices have equal inverse and transpose"
 9 |     arr = fake_data.orthogonal_matrix([20, 20])
10 |     assert np.allclose(np.linalg.inv(arr), arr.T), msg
11 | 
12 | 
13 | def test_orthogonal_matrix_covariance():
14 |     msg = "Orthogonal matrix should have less covariance than a random matrix"
15 |     orth = Variable(fake_data.orthogonal_matrix([20, 20]).astype('float32'))
16 |     rand = Variable(np.random.randn(20, 20).astype('float32'))
17 |     orth_cc = cross_covariance(orth, orth).data
18 |     rand_cc = cross_covariance(rand, rand).data
19 |     assert orth_cc < rand_cc, msg
20 | 
21 | 
22 | def test_softmax():
23 |     arr = np.random.randn(100, 15)
24 |     probs = fake_data.softmax(arr)
25 |     norms = np.sum(probs, axis=1)
26 |     assert np.allclose(norms, np.ones_like(norms))
27 | 
28 | 
29 | def test_sample():
30 |     n_categories = 10
31 |     idx = 4
32 |     probs = np.zeros(n_categories)
33 |     probs[idx] = 1.0
35 |     values = np.arange(n_categories)
36 |     size = 10
37 |     draws = fake_data.sample(values, probs, size)
38 |     assert np.all(draws == idx)
39 | 
40 | 
41 | def test_fake_data():
42 |     n_docs = 100
43 |     n_words = 10
44 |     n_hidden = 2
45 |     n_sent_length = 5
46 |     data = fake_data.fake_data(n_docs, n_words, n_sent_length, n_hidden)
47 |     assert data.dtype == np.dtype('int32')
48 |     assert data.shape[0] == n_docs
49 |     assert data.shape[1] == n_sent_length
50 |     assert np.max(data) <= n_words - 1
51 | 


--------------------------------------------------------------------------------
/tests/test_preprocess.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import preprocess
 2 | import numpy as np
 3 | import pytest
 4 | import os
 5 | 
 6 | on_ci = os.environ.get('CI', False) == 'true'
 7 | 
 8 | 
 9 | @pytest.mark.skipif(on_ci, reason='SpaCy install fails on TravisCI')
10 | def test_tokenize():
11 |     texts = [u'Do you recall, not long ago']
12 |     texts += [u'We would walk on the sidewalk?']
13 |     arr, vocab = preprocess.tokenize(texts, 10)
14 |     assert arr[0, 0] != arr[0, 1]
15 |     assert arr.shape[0] == 2
16 |     assert arr.shape[1] == 10
17 |     assert arr[0, -1] == -2
18 |     assert arr.dtype == np.dtype('int32')
19 |     first_word = texts[0].split(' ')[0].lower()
20 |     first_lowr = vocab[arr[0, 0]]
21 |     assert first_word == first_lowr
22 | 


--------------------------------------------------------------------------------
/tests/test_topics.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import topics
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
 6 | def exp_entropy(log_p):
 7 |     return -np.nansum(np.exp(log_p + 1e-12) * (log_p + 1e-12))
 8 | 
 9 | 
10 | def test_prob_words():
11 |     context = np.random.randn(3)
12 |     vocab = np.random.randn(10, 3)
13 |     lo = topics.prob_words(context, vocab, temperature=1)
14 |     hi = topics.prob_words(context, vocab, temperature=1e6)
15 |     msg = "Lower temperatures should be lower entropy and more concentrated"
16 |     assert exp_entropy(np.log(lo)) < exp_entropy(np.log(hi)), msg
17 | 
18 | 
19 | def prepare_topics():
20 |     # One document in two topics, unnormalized
21 |     weights = np.array([[0.5, -0.1]])
22 |     # Two topics in 4 dimensions
23 |     factors = np.array([[0.1, 0.1, 0.1, 5.0],
24 |                         [5.1, 0.1, 0.1, 0.0]])
25 |     # Three words in 4 dimensions
26 |     vectors = np.array([[5.0, 0.1, 0.1, 0.1],
27 |                         [0.0, 0.1, 0.1, 5.0],
28 |                         [2.0, 0.1, 0.1, -.9]])
29 |     vocab = ['a', 'b', 'c']
30 |     data = topics.prepare_topics(weights, factors, vectors, vocab)
31 |     return data
32 | 
33 | 
34 | def test_prepare_topics():
35 |     data = prepare_topics()
36 |     t2w = data['topic_term_dists']
37 |     msg = "Topic 0 should be most similar to 2nd token"
38 |     assert t2w[0].argsort()[::-1][0] == 1, msg
39 |     msg = "Topic 1 should be most similar to 1st token"
40 |     assert t2w[1].argsort()[::-1][0] == 0, msg
41 | 
42 | 
43 | def test_print_top_words_per_topic():
44 |     data = prepare_topics()
45 |     top_words = topics.print_top_words_per_topic(data, do_print=False)
46 |     assert len(top_words) == 2
47 |     for words in top_words:
48 |         assert len(words) == 3
49 | 


--------------------------------------------------------------------------------