├── .gitignore ├── .rtd ├── .travis.yml ├── LICENSE ├── README.rst ├── TODO ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.rst ├── lda2vec │ ├── corpus.rst │ ├── dirichlet_likelihood.rst │ ├── embed_mixture.rst │ ├── fake_data.rst │ ├── lda2vec.rst │ ├── preprocess.rst │ └── tracking.rst └── make.bat ├── examples ├── hacker_news │ ├── README.md │ ├── data │ │ └── preprocess.py │ └── lda2vec │ │ ├── lda2vec.ipynb │ │ ├── lda2vec_model.py │ │ ├── lda2vec_run.py │ │ └── word_vectors.ipynb └── twenty_newsgroups │ ├── README.md │ ├── data │ └── preprocess.py │ ├── lda │ ├── lda.py │ ├── lda_run.py │ └── topics.pyldavis.npz │ ├── lda2vec │ ├── lda2vec.ipynb │ ├── lda2vec_model.py │ ├── lda2vec_run.py │ └── topics.pyldavis.npz │ ├── nslda │ ├── nslda.py │ └── nslda_run.py │ └── nvdm │ ├── nvdm.py │ └── nvdm_run.py ├── images ├── img00_word2vec.png ├── img01_lda.png ├── img02_lda_topics.png ├── img03_lda2vec_topics01.png ├── img04_lda2vec_topics02.png ├── img05_lda2vec_topics03_supervised.png └── img06_pyldavis.gif ├── lda2vec ├── __init__.py ├── corpus.py ├── dirichlet_likelihood.py ├── embed_mixture.py ├── fake_data.py ├── negative_sampling.py ├── preprocess.py ├── topics.py ├── tracking.py └── utils.py ├── lda2vec_network_publish_text.gif ├── pytest.ini ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── test_dirichlet_likelihood.py ├── test_embed_mixture.py ├── test_fake_data.py ├── test_preprocess.py └── test_topics.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.egg/ 3 | *.pyc 4 | *.pyo 5 | *.cpp 6 | *.so 7 | *.swp 8 | build 9 | \#*\# 10 | .\#* 11 | .coverage 12 | .eggs/ 13 | *.egg-info/ 14 | dist/ 15 | htmlcov/ 16 | -------------------------------------------------------------------------------- /.rtd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/.rtd -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Much of this script was adapted from astropy/astropy. 2 | 3 | language: python 4 | 5 | env: 6 | global: 7 | - NUMPY_VERSION=1.10 8 | 9 | matrix: 10 | include: 11 | # All the versions of Python. 
12 | - python: 2.7 13 | 14 | before_install: 15 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 16 | - chmod +x miniconda.sh 17 | - ./miniconda.sh -b 18 | - export PATH=/home/travis/miniconda2/bin:$PATH 19 | install: 20 | - conda create --yes -n testing python=$TRAVIS_PYTHON_VERSION 21 | - source activate testing 22 | - conda install --yes numpy=$NUMPY_VERSION nose pip numba cython scikit-learn h5py 23 | - pip install chainer pytest spacy codecov coveralls pytest-cov 24 | - python -m spacy.en.download --force all 25 | - python setup.py install 26 | 27 | script: 28 | - ls 29 | - pwd 30 | - env | sort 31 | - py.test --cov=lda2vec tests/ lda2vec --ignore=lda2vec/preprocess.py 32 | - # python examples/twenty_newsgroups/lda.py 33 | after_success: 34 | - coveralls 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Christopher Erick Moody 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | lda2vec: Tools for interpreting natural language 2 | ================================================= 3 | 4 | **This lda2vec works on python 3.** 5 | 6 | .. image:: http://img.shields.io/badge/license-MIT-blue.svg?style=flat 7 | :target: https://github.com/cemoody/lda2vec/blob/master/LICENSE 8 | 9 | .. image:: https://readthedocs.org/projects/lda2vec/badge/?version=latest 10 | :target: http://lda2vec.readthedocs.org/en/latest/?badge=latest 11 | 12 | .. image:: https://travis-ci.org/cemoody/lda2vec.svg?branch=master 13 | :target: https://travis-ci.org/cemoody/lda2vec 14 | 15 | .. image:: https://img.shields.io/badge/coverage-93%25-green.svg 16 |    :target: https://travis-ci.org/cemoody/lda2vec 17 | 18 | .. image:: https://img.shields.io/twitter/follow/chrisemoody.svg?style=social 19 | :target: https://twitter.com/intent/follow?screen_name=chrisemoody 20 | 21 | .. image:: lda2vec_network_publish_text.gif 22 | 23 | 24 | The lda2vec model tries to mix the best parts of word2vec and LDA 25 | into a single framework. word2vec captures powerful relationships 26 | between words, but the resulting vectors are largely uninterpretable 27 | and don't represent documents. 
LDA on the other hand is quite 28 | interpretable by humans, but doesn't model local word relationships 29 | like word2vec. We build a model that learns both word and document 30 | topics, makes them interpretable, builds topics over clients, times, 31 | and documents, and allows those topics to be supervised. 32 | 33 | *Warning*: this code is a big series of experiments. It's research software, 34 | and we've tried to make it simple to modify lda2vec and to play around with 35 | your own custom topic models. However, it's still research software. 36 | I wouldn't run this in production or on Windows, and I'd only use it after you've 37 | decided both word2vec and LDA are inadequate and you'd like to tinker with your 38 | own cool models :) That said, I don't want to discourage experimentation: 39 | there's some limited documentation, a modicum of unit tests, and some 40 | interactive examples to get you started. 41 | 42 | 43 | Resources 44 | --------- 45 | See the research paper `Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec <https://arxiv.org/abs/1605.02019>`_ 46 | 47 | See this `Jupyter Notebook `_ 48 | for an example of an end-to-end demonstration. 49 | 50 | See this `slide deck `_ 51 | or this `youtube video `_ 52 | for a presentation focused on the benefits of word2vec, LDA, and lda2vec. 53 | 54 | See the `API reference docs `_ 55 | 56 | 57 | About 58 | ----- 59 | 60 | .. image:: images/img00_word2vec.png 61 | 62 | Word2vec tries to model word-to-word relationships. 63 | 64 | .. image:: images/img01_lda.png 65 | 66 | LDA models document-to-word relationships. 67 | 68 | .. image:: images/img02_lda_topics.png 69 | 70 | LDA yields topics over each document. 71 | 72 | .. image:: images/img03_lda2vec_topics01.png 73 | 74 | lda2vec yields topics not just over documents, but also over regions. 75 | 76 | .. image:: images/img04_lda2vec_topics02.png 77 | 78 | lda2vec also yields topics over clients. 79 | 80 | .. image:: images/img05_lda2vec_topics03_supervised.png 81 | 82 | In lda2vec, topics can be 'supervised' and forced to predict another target. 83 | 84 | lda2vec also includes more contexts and features than LDA. LDA dictates that 85 | words are generated by a document vector; but we might have all kinds of 86 | 'side-information' that should influence our topics. For example, a single 87 | client comment is about a particular item ID, written at a particular time 88 | and in a particular region. In this case, lda2vec gives you topics over all 89 | items (separating jeans from shirts, for example), times (winter versus summer), 90 | regions (desert versus coastal), and clients (sporty vs professional attire). 91 | 92 | Ultimately, the topics are interpreted using the excellent pyLDAvis library: 93 | 94 | .. image:: images/img06_pyldavis.gif 95 | 96 | 97 | Requirements 98 | ------------ 99 | 100 | I tested the twenty-newsgroups example with the following requirements: 101 | 102 | - Python 3.5.2 103 | - NumPy 1.16.0 104 | - Chainer 5.1.0 105 | - spaCy 1.9.0 106 | - pyxDamerauLevenshtein 1.5.2 107 | - pyLDAvis 2.1.2 108 | 109 | 110 | Requirements for some features: 111 | 112 | - CUDA support 113 | - Testing utilities: py.test 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | Add tests for target 2 | Add tests for global targets 3 | Add examples of specific documents to 20ng example 4 | Add better naming to categorical variables, e.g.
like target variables 5 | Keep track of doc counts between model serializations 6 | Add bigramming 7 | Add better README 8 | Add an example script for HN with doc id, client id, and predicted score 9 | Add super simple explanatory models 10 | Remove spacy dep 11 | Change EmbedMixture naming to possible values and n latent factors 12 | Print out topics while training 13 | Add doctests to lda2vec main classes 14 | Randomize chunking order on fit 15 | Add loss tracking and reporting classes to code 16 | Finish filling out docstrings 17 | Add multiple targets for one component 18 | Add convergence criterion 19 | 20 | Add docs on: 21 | Installation 22 | HN Tutorial 23 | Parse document into vector 24 | Setup LDA for document 25 | Measure perplexity 26 | Visualize topics 27 | Add supervised component 28 | Measure perplexity 29 | Visualize topics 30 | Add another component for time 31 | Measure perplexity 32 | Visualize topics 33 | Visualize topics, changing temperature 34 | Data formats 35 | Loose 36 | Compact 37 | Flat 38 | Contexts 39 | Categorical contexts 40 | Other contexts TBA 41 | Targets 42 | RMSE 43 | Logistic 44 | Softmax 45 | Advanced 46 | Options 47 | GPU 48 | Gradient Clipping 49 | Online learning, fraction argument 50 | Logging progress 51 | Perplexity 52 | Model saving, prediction 53 | Dropout fractions 54 | 55 | Nomenclature 56 | Categorical Feature 57 | Each category in set has n_possible_values 58 | Each feature has n_latent_factors 59 | Each feature has a single target 60 | Components 61 | Each component defines the total number of documents and the number of topics 62 | Each component may also have supervised targets 63 | 64 | Done: 65 | Add BoW mode 66 | Add logger 67 | Add fake data generator 68 | Add perplexity measurements 69 | Add tracking utility 70 | Add utilities for converting corpora 71 | Put license 72 | Add masks / skips / pads 73 | Add reindexing on the fly 74 | Convert docstrings to numpy format 75 | Implement corpus loose to dense and vice versa 76 | Add fit function for all data at once 77 | Add CI & coverage & license icons 78 | Add readthedocs support 79 | Add examples to CI 80 | Add dropout 81 | Change component naming to 'categorical feature' 82 | Add linear layers between input latent and output context 83 | Merge skipgram branch 84 | Add topic numbers to topic print out 85 | Try higher importance to the prior 86 | Change prob model to just model prob of word in topic 87 | Add word dropout 88 | Add an example script with 20 newsgroups -- LDA 89 | Add visualization for topic-word 90 | Implement skipgram contexts 91 | Prevent mixing between documents 92 | Add temperature to perplexity measurements 93 | Add temperature to viz 94 | Add model saving 95 | Add model predicting 96 | Hook up RTD to docstrings 97 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH.
If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/lda2vec.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/lda2vec.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/lda2vec" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/lda2vec" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 
173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | lda2vec package 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :glob: 7 | 8 | lda2vec/* 9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # lda2vec documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Jan 24 18:22:13 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | try: 18 | from unittest.mock import MagicMock 19 | except ImportError: 20 | from mock import Mock as MagicMock 21 | 22 | 23 | class Mock(MagicMock): 24 | @classmethod 25 | def __getattr__(cls, name): 26 | return Mock() 27 | 28 | 29 | MOCK_MODULES = ['sklearn', 'chainer', 'chainer.functions', 'chainer.links', 30 | 'chainer.optimizers', 'spacy', 'numpy', 'pyLDAvis', 31 | 'sklearn.linear_model', 'spacy.en', 'sklearn.datasets', 32 | 'numpy.random', 'spacy.attrs'] 33 | if os.environ.get('READTHEDOCS', None) == 'True': 34 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 35 | 36 | 37 | # If extensions (or modules to document with autodoc) are in another directory, 38 | # add these directories to sys.path here. If the directory is relative to the 39 | # documentation root, use os.path.abspath to make it absolute, like shown here. 40 | sys.path.insert(0, os.path.abspath('.')) 41 | sys.path.insert(0, os.path.abspath('../')) 42 | 43 | # -- General configuration ------------------------------------------------ 44 | 45 | # If your documentation needs a minimal Sphinx version, state it here. 46 | #needs_sphinx = '1.0' 47 | 48 | # Add any Sphinx extension module names here, as strings. They can be 49 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 50 | # ones. 51 | extensions = [ 52 | 'sphinx.ext.autodoc', 53 | 'sphinx.ext.doctest', 54 | 'sphinx.ext.mathjax', 55 | 'sphinx.ext.napoleon' 56 | ] 57 | 58 | # Add any paths that contain templates here, relative to this directory. 59 | templates_path = ['_templates'] 60 | 61 | # The suffix(es) of source filenames. 62 | # You can specify multiple suffix as a list of string: 63 | # source_suffix = ['.rst', '.md'] 64 | source_suffix = '.rst' 65 | 66 | # The encoding of source files. 
67 | #source_encoding = 'utf-8-sig' 68 | 69 | # The master toctree document. 70 | master_doc = 'index' 71 | 72 | # General information about the project. 73 | project = u'lda2vec' 74 | copyright = u'2016, Christopher Erick Moody' 75 | author = u'Christopher Erick Moody' 76 | 77 | # The version info for the project you're documenting, acts as replacement for 78 | # |version| and |release|, also used in various other places throughout the 79 | # built documents. 80 | # 81 | # The short X.Y version. 82 | version = u'0.01' 83 | # The full version, including alpha/beta/rc tags. 84 | release = u'0.01' 85 | 86 | # The language for content autogenerated by Sphinx. Refer to documentation 87 | # for a list of supported languages. 88 | # 89 | # This is also used if you do content translation via gettext catalogs. 90 | # Usually you set "language" from the command line for these cases. 91 | language = None 92 | 93 | # There are two options for replacing |today|: either, you set today to some 94 | # non-false value, then it is used: 95 | #today = '' 96 | # Else, today_fmt is used as the format for a strftime call. 97 | #today_fmt = '%B %d, %Y' 98 | 99 | # List of patterns, relative to source directory, that match files and 100 | # directories to ignore when looking for source files. 101 | exclude_patterns = ['_build'] 102 | 103 | # The reST default role (used for this markup: `text`) to use for all 104 | # documents. 105 | #default_role = None 106 | 107 | # If true, '()' will be appended to :func: etc. cross-reference text. 108 | #add_function_parentheses = True 109 | 110 | # If true, the current module name will be prepended to all description 111 | # unit titles (such as .. function::). 112 | #add_module_names = True 113 | 114 | # If true, sectionauthor and moduleauthor directives will be shown in the 115 | # output. They are ignored by default. 116 | #show_authors = False 117 | 118 | # The name of the Pygments (syntax highlighting) style to use. 119 | pygments_style = 'sphinx' 120 | 121 | # A list of ignored prefixes for module index sorting. 122 | #modindex_common_prefix = [] 123 | 124 | # If true, keep warnings as "system message" paragraphs in the built documents. 125 | #keep_warnings = False 126 | 127 | # If true, `todo` and `todoList` produce output, else they produce nothing. 128 | todo_include_todos = False 129 | 130 | 131 | # -- Options for HTML output ---------------------------------------------- 132 | 133 | # The theme to use for HTML and HTML Help pages. See the documentation for 134 | # a list of builtin themes. 135 | html_theme = 'sphinx_rtd_theme' 136 | 137 | # Theme options are theme-specific and customize the look and feel of a theme 138 | # further. For a list of options available for each theme, see the 139 | # documentation. 140 | #html_theme_options = {} 141 | 142 | # Add any paths that contain custom themes here, relative to this directory. 143 | #html_theme_path = [] 144 | 145 | # The name for this set of Sphinx documents. If None, it defaults to 146 | # " v documentation". 147 | #html_title = None 148 | 149 | # A shorter title for the navigation bar. Default is the same as html_title. 150 | #html_short_title = None 151 | 152 | # The name of an image file (relative to this directory) to place at the top 153 | # of the sidebar. 154 | #html_logo = None 155 | 156 | # The name of an image file (within the static path) to use as favicon of the 157 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 158 | # pixels large. 
159 | #html_favicon = None 160 | 161 | # Add any paths that contain custom static files (such as style sheets) here, 162 | # relative to this directory. They are copied after the builtin static files, 163 | # so a file named "default.css" will overwrite the builtin "default.css". 164 | html_static_path = ['_static'] 165 | 166 | # Add any extra paths that contain custom files (such as robots.txt or 167 | # .htaccess) here, relative to this directory. These files are copied 168 | # directly to the root of the documentation. 169 | #html_extra_path = [] 170 | 171 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 172 | # using the given strftime format. 173 | #html_last_updated_fmt = '%b %d, %Y' 174 | 175 | # If true, SmartyPants will be used to convert quotes and dashes to 176 | # typographically correct entities. 177 | #html_use_smartypants = True 178 | 179 | # Custom sidebar templates, maps document names to template names. 180 | #html_sidebars = {} 181 | 182 | # Additional templates that should be rendered to pages, maps page names to 183 | # template names. 184 | #html_additional_pages = {} 185 | 186 | # If false, no module index is generated. 187 | #html_domain_indices = True 188 | 189 | # If false, no index is generated. 190 | #html_use_index = True 191 | 192 | # If true, the index is split into individual pages for each letter. 193 | #html_split_index = False 194 | 195 | # If true, links to the reST sources are added to the pages. 196 | #html_show_sourcelink = True 197 | 198 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 199 | #html_show_sphinx = True 200 | 201 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 202 | #html_show_copyright = True 203 | 204 | # If true, an OpenSearch description file will be output, and all pages will 205 | # contain a tag referring to it. The value of this option must be the 206 | # base URL from which the finished HTML is served. 207 | #html_use_opensearch = '' 208 | 209 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 210 | #html_file_suffix = None 211 | 212 | # Language to be used for generating the HTML full-text search index. 213 | # Sphinx supports the following languages: 214 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 215 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 216 | #html_search_language = 'en' 217 | 218 | # A dictionary with options for the search language support, empty by default. 219 | # Now only 'ja' uses this config value 220 | #html_search_options = {'type': 'default'} 221 | 222 | # The name of a javascript file (relative to the configuration directory) that 223 | # implements a search results scorer. If empty, the default will be used. 224 | #html_search_scorer = 'scorer.js' 225 | 226 | # Output file base name for HTML help builder. 227 | htmlhelp_basename = 'lda2vecdoc' 228 | 229 | # -- Options for LaTeX output --------------------------------------------- 230 | 231 | latex_elements = { 232 | # The paper size ('letterpaper' or 'a4paper'). 233 | #'papersize': 'letterpaper', 234 | 235 | # The font size ('10pt', '11pt' or '12pt'). 236 | #'pointsize': '10pt', 237 | 238 | # Additional stuff for the LaTeX preamble. 239 | #'preamble': '', 240 | 241 | # Latex figure (float) alignment 242 | #'figure_align': 'htbp', 243 | } 244 | 245 | # Grouping the document tree into LaTeX files. List of tuples 246 | # (source start file, target name, title, 247 | # author, documentclass [howto, manual, or own class]). 
248 | latex_documents = [ 249 | (master_doc, 'lda2vec.tex', u'lda2vec Documentation', 250 | u'Christopher Erick Moody', 'manual'), 251 | ] 252 | 253 | # The name of an image file (relative to this directory) to place at the top of 254 | # the title page. 255 | #latex_logo = None 256 | 257 | # For "manual" documents, if this is true, then toplevel headings are parts, 258 | # not chapters. 259 | #latex_use_parts = False 260 | 261 | # If true, show page references after internal links. 262 | #latex_show_pagerefs = False 263 | 264 | # If true, show URL addresses after external links. 265 | #latex_show_urls = False 266 | 267 | # Documents to append as an appendix to all manuals. 268 | #latex_appendices = [] 269 | 270 | # If false, no module index is generated. 271 | #latex_domain_indices = True 272 | 273 | 274 | # -- Options for manual page output --------------------------------------- 275 | 276 | # One entry per manual page. List of tuples 277 | # (source start file, name, description, authors, manual section). 278 | man_pages = [ 279 | (master_doc, 'lda2vec', u'lda2vec Documentation', 280 | [author], 1) 281 | ] 282 | 283 | # If true, show URL addresses after external links. 284 | #man_show_urls = False 285 | 286 | 287 | # -- Options for Texinfo output ------------------------------------------- 288 | 289 | # Grouping the document tree into Texinfo files. List of tuples 290 | # (source start file, target name, title, author, 291 | # dir menu entry, description, category) 292 | texinfo_documents = [ 293 | (master_doc, 'lda2vec', u'lda2vec Documentation', 294 | author, 'lda2vec', 'One line description of project.', 295 | 'Miscellaneous'), 296 | ] 297 | 298 | # Documents to append as an appendix to all manuals. 299 | #texinfo_appendices = [] 300 | 301 | # If false, no module index is generated. 302 | #texinfo_domain_indices = True 303 | 304 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 305 | #texinfo_show_urls = 'footnote' 306 | 307 | # If true, do not generate a @detailmenu in the "Top" node's menu. 308 | #texinfo_no_detailmenu = False 309 | 310 | 311 | source_suffix = ['.rst'] 312 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ============================================== 2 | lda2vec -- flexible & interpretable NLP models 3 | ============================================== 4 | 5 | This is the documentation for lda2vec, a framework for useful 6 | flexible and interpretable NLP models. 7 | 8 | Defining the model is simple and quick:: 9 | 10 | model = LDA2Vec(n_words, max_length, n_hidden, counts) 11 | model.add_component(n_docs, n_topics, name='document id') 12 | model.fit(clean, components=[doc_ids]) 13 | 14 | While visualizing the feature is similarly straightforward:: 15 | 16 | topics = model.prepare_topics('document_id', vocab) 17 | prepared = pyLDAvis.prepare(topics) 18 | pyLDAvis.display(prepared) 19 | 20 | Resources 21 | --------- 22 | See this `Jupyter Notebook `_ 23 | for an example of an end-to-end demonstration. 24 | 25 | See this `presentation `_ 26 | for a presentation focused on the benefits of word2vec, LDA, and lda2vec. 27 | 28 | See the `API reference docs `_ 29 | 30 | See the `GitHub repo `_ 31 | 32 | API 33 | === 34 | .. 
toctree:: 35 | 36 | api 37 | 38 | 39 | Indices and tables 40 | ================== 41 | 42 | * :ref:`genindex` 43 | * :ref:`modindex` 44 | * :ref:`search` 45 | -------------------------------------------------------------------------------- /docs/lda2vec/corpus.rst: -------------------------------------------------------------------------------- 1 | lda2vec.corpus module 2 | --------------------- 3 | 4 | .. automodule:: lda2vec.corpus 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/dirichlet_likelihood.rst: -------------------------------------------------------------------------------- 1 | lda2vec.dirichlet_likelihood module 2 | ----------------------------------- 3 | 4 | .. automodule:: lda2vec.dirichlet_likelihood 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/embed_mixture.rst: -------------------------------------------------------------------------------- 1 | lda2vec.embed_mixture module 2 | ---------------------------- 3 | 4 | .. automodule:: lda2vec.embed_mixture 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/fake_data.rst: -------------------------------------------------------------------------------- 1 | lda2vec.fake_data module 2 | ------------------------ 3 | 4 | .. automodule:: lda2vec.fake_data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/lda2vec.rst: -------------------------------------------------------------------------------- 1 | lda2vec.lda2vec module 2 | ---------------------- 3 | 4 | .. automodule:: lda2vec.lda2vec 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/preprocess.rst: -------------------------------------------------------------------------------- 1 | lda2vec.preprocess module 2 | ------------------------- 3 | 4 | .. automodule:: lda2vec.preprocess 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lda2vec/tracking.rst: -------------------------------------------------------------------------------- 1 | lda2vec.tracking module 2 | ----------------------- 3 | 4 | .. automodule:: lda2vec.tracking 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. 
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 
128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\lda2vec.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\lda2vec.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 
244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /examples/hacker_news/README.md: -------------------------------------------------------------------------------- 1 | # Hacker News Comments with lda2vec example 2 | This example trains a multi-component lda2vec model on a corpus of Hacker News 3 | comments. The goal is to model how Hacker News stories have changed in time, 4 | how they correlate with the number of comments posted, and what individual 5 | commenter topics are. 6 | 7 | ### Running the model 8 | 9 | To run this example, first run `preprocess.py` which will download the Hacker 10 | News comments CSV, tokenize it, and quickly build a vocabulary. Once finished, 11 | it saves the training data to file. 12 | 13 | Then run `model.py` which will train the lda2vec model. 14 | 15 | Finally, `visualize.py` helps the human interpret what the topics mean. 16 | 17 | ### The HN Comment Data 18 | 19 | The corpus has been slightly filtered. We've removed comments made by 20 | infrequent users (e.g. having fewer than 10 comments ever) and removed stories 21 | with fewer than 10 comments. The training corpus is available at 22 | [Zenodo](https://zenodo.org/record/45901#.Vrv5jJMrLMU). 23 | 24 | ### Preparing the HN Comment Data 25 | 26 | You shouldn't need to repeat any of the Google BigQuery work. 
If you would like 27 | to nevertheless, the rough steps are outlined below: 28 | 29 | The raw HN data is available on Google BigQuery, see for example these resources: 30 | 31 | - Previous analysis on this [dataset](https://github.com/fhoffa/notebooks/blob/master/analyzing%20hacker%20news.ipynb) 32 | 33 | - Dataset [shared here](https://bigquery.cloud.google.com/table/fh-bigquery:hackernews.comments) 34 | 35 | #### Data Preparation 36 | 37 | #### Query 1 38 | 39 |     SELECT p0.id AS id 40 |     , p0.text as text 41 |     , p0.author AS author 42 |     , p0.ranking AS ranking 43 |     , p0.time 44 |     , p0.time_ts 45 |     , COALESCE(p7.parent, p6.parent, p5.parent, p4.parent, p3.parent, p2.parent, p1.parent, p0.parent) story_id 46 |     , GREATEST( IF(p7.parent IS null, -1, 7) 47 |     , IF(p6.parent IS null, -1, 6) 48 |     , IF(p5.parent IS null, -1, 5) 49 |     , IF(p4.parent IS null, -1, 4) 50 |     , IF(p3.parent IS null, -1, 3) 51 |     , IF(p2.parent IS null, -1, 2) 52 |     , IF(p1.parent IS null, -1, 1) 53 |     , 0) level 54 |     FROM [fh-bigquery:hackernews.comments] p0 55 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p1 ON p1.id=p0.parent 56 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p2 ON p2.id=p1.parent 57 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p3 ON p3.id=p2.parent 58 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p4 ON p4.id=p3.parent 59 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p5 ON p5.id=p4.parent 60 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p6 ON p6.id=p5.parent 61 |     LEFT JOIN EACH [fh-bigquery:hackernews.comments] p7 ON p7.id=p6.parent 62 |     WHERE p0.deleted IS NULL 63 |     AND p0.dead IS NULL 64 |     AND LENGTH(p0.text) > 5 65 |     HAVING level = 0 66 | 67 | #### Query 2 68 | 69 |     SELECT s.id AS story_id 70 |     , s.time AS story_time 71 |     , s.url AS story_url 72 |     , s.text AS story_text 73 |     , s.author AS story_author 74 |     , c.id AS comment_id 75 |     , c.text AS comment_text 76 |     , c.author AS comment_author 77 |     , c.ranking as comment_ranking 78 |     , author_counts.n_comments AS author_comment_count 79 |     , story_counts.n_comments AS story_comment_count 80 |     FROM [lda2vec-v02:data.comment_to_story_id] c 81 |     JOIN (SELECT story_id 82 |     , COUNT(story_id) AS n_comments 83 |     FROM [lda2vec-v02:data.comment_to_story_id] 84 |     GROUP BY story_id 85 |     ) AS story_counts 86 |     ON c.story_id = story_counts.story_id 87 |     JOIN (SELECT author 88 |     , COUNT(author) AS n_comments 89 |     FROM [lda2vec-v02:data.comment_to_story_id] 90 |     GROUP BY author 91 |     ) AS author_counts 92 |     ON c.author = author_counts.author 93 |     JOIN [fh-bigquery:hackernews.stories] s 94 |     ON s.id = c.story_id 95 |     WHERE story_counts.n_comments > 10 96 |     AND author_counts.n_comments > 10 97 | -------------------------------------------------------------------------------- /examples/hacker_news/data/preprocess.py: -------------------------------------------------------------------------------- 1 | # Author: Chris Moody 2 | # License: MIT 3 | 4 | # This example loads a large 800MB Hacker News comments dataset 5 | # and preprocesses it. This can take a few hours, and a lot of 6 | # memory, so please be patient!
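# Note: as written, this script uses Python 2 idioms (print statements and the cPickle module); running it under Python 3 would mean swapping in the built-in pickle module and print() calls.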
7 | 8 | from lda2vec import preprocess, Corpus 9 | import numpy as np 10 | import pandas as pd 11 | import logging 12 | import cPickle as pickle 13 | import os.path 14 | 15 | logging.basicConfig() 16 | 17 | max_length = 250 # Limit of 250 words per comment 18 | min_author_comments = 50 # Exclude authors with fewer comments 19 | nrows = None # Number of rows of file to read; None reads in full file 20 | 21 | fn = "hacker_news_comments.csv" 22 | url = "https://zenodo.org/record/45901/files/hacker_news_comments.csv" 23 | if not os.path.exists(fn): 24 | import requests 25 | response = requests.get(url, stream=True, timeout=2400) 26 | with open(fn, 'w') as fh: 27 | # Iterate over 1MB chunks 28 | for data in response.iter_content(1024**2): 29 | fh.write(data) 30 | 31 | 32 | features = [] 33 | # Convert to unicode (spaCy only works with unicode) 34 | features = pd.read_csv(fn, encoding='utf8', nrows=nrows) 35 | # Convert all integer arrays to int32 36 | for col, dtype in zip(features.columns, features.dtypes): 37 | if dtype is np.dtype('int64'): 38 | features[col] = features[col].astype('int32') 39 | 40 | # Tokenize the texts 41 | # If this fails it's likely spacy. Install a recent spacy version. 42 | # Only the most recent versions have tokenization of noun phrases 43 | # I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a32ff 44 | # Also try running python -m spacy.en.download all --force 45 | texts = features.pop('comment_text').values 46 | tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4, 47 | merge=True) 48 | del texts 49 | 50 | # Make a ranked list of rare vs frequent words 51 | corpus = Corpus() 52 | corpus.update_word_count(tokens) 53 | corpus.finalize() 54 | 55 | # The tokenization uses spaCy indices, and so may have gaps 56 | # between indices for words that aren't present in our dataset. 
57 | # This builds a new compact index 58 | compact = corpus.to_compact(tokens) 59 | # Remove extremely rare words 60 | pruned = corpus.filter_count(compact, min_count=10) 61 | # Words tend to have power law frequency, so selectively 62 | # downsample the most prevalent words 63 | clean = corpus.subsample_frequent(pruned) 64 | print "n_words", np.unique(clean).max() 65 | 66 | # Extract numpy arrays over the fields we want covered by topics 67 | # Convert to categorical variables 68 | author_counts = features['comment_author'].value_counts() 69 | to_remove = author_counts[author_counts < min_author_comments].index 70 | mask = features['comment_author'].isin(to_remove).values 71 | author_name = features['comment_author'].values.copy() 72 | author_name[mask] = 'infrequent_author' 73 | features['comment_author'] = author_name 74 | authors = pd.Categorical(features['comment_author']) 75 | author_id = authors.codes 76 | author_name = authors.categories 77 | story_id = pd.Categorical(features['story_id']).codes 78 | # Chop timestamps into days 79 | story_time = pd.to_datetime(features['story_time'], unit='s') 80 | days_since = (story_time - story_time.min()) / pd.Timedelta('1 day') 81 | time_id = days_since.astype('int32') 82 | features['story_id_codes'] = story_id 83 | features['author_id_codes'] = story_id 84 | features['time_id_codes'] = time_id 85 | 86 | print "n_authors", author_id.max() 87 | print "n_stories", story_id.max() 88 | print "n_times", time_id.max() 89 | 90 | # Extract outcome supervised features 91 | ranking = features['comment_ranking'].values 92 | score = features['story_comment_count'].values 93 | 94 | # Now flatten a 2D array of document per row and word position 95 | # per column to a 1D array of words. This will also remove skips 96 | # and OoV words 97 | feature_arrs = (story_id, author_id, time_id, ranking, score) 98 | flattened, features_flat = corpus.compact_to_flat(pruned, *feature_arrs) 99 | # Flattened feature arrays 100 | (story_id_f, author_id_f, time_id_f, ranking_f, score_f) = features_flat 101 | 102 | # Save the data 103 | pickle.dump(corpus, open('corpus', 'w'), protocol=2) 104 | pickle.dump(vocab, open('vocab', 'w'), protocol=2) 105 | features.to_pickle('features.pd') 106 | data = dict(flattened=flattened, story_id=story_id_f, author_id=author_id_f, 107 | time_id=time_id_f, ranking=ranking_f, score=score_f, 108 | author_name=author_name, author_index=author_id) 109 | np.savez('data', **data) 110 | np.save(open('tokens', 'w'), tokens) 111 | -------------------------------------------------------------------------------- /examples/hacker_news/lda2vec/lda2vec_model.py: -------------------------------------------------------------------------------- 1 | from lda2vec import EmbedMixture 2 | from lda2vec import dirichlet_likelihood 3 | from lda2vec.utils import move 4 | 5 | from chainer import Chain 6 | import chainer.links as L 7 | import chainer.functions as F 8 | 9 | import numpy as np 10 | 11 | 12 | class LDA2Vec(Chain): 13 | def __init__(self, n_stories=100, n_story_topics=10, 14 | n_authors=100, n_author_topics=10, 15 | n_units=256, n_vocab=1000, dropout_ratio=0.5, train=True, 16 | counts=None, n_samples=15, word_dropout_ratio=0.0): 17 | em1 = EmbedMixture(n_stories, n_story_topics, n_units, 18 | dropout_ratio=dropout_ratio) 19 | em2 = EmbedMixture(n_authors, n_author_topics, n_units, 20 | dropout_ratio=dropout_ratio) 21 | kwargs = {} 22 | kwargs['mixture_sty'] = em1 23 | kwargs['mixture_aut'] = em2 24 | kwargs['sampler'] = L.NegativeSampling(n_units, counts, 
n_samples) 25 | super(LDA2Vec, self).__init__(**kwargs) 26 | rand = np.random.random(self.sampler.W.data.shape) 27 | self.sampler.W.data[:, :] = rand[:, :] 28 | self.n_units = n_units 29 | self.train = train 30 | self.dropout_ratio = dropout_ratio 31 | self.word_dropout_ratio = word_dropout_ratio 32 | self.n_samples = n_samples 33 | 34 | def prior(self): 35 | dl1 = dirichlet_likelihood(self.mixture_sty.weights) 36 | dl2 = dirichlet_likelihood(self.mixture_aut.weights) 37 | return dl1 + dl2 38 | 39 | def fit_partial(self, rsty_ids, raut_ids, rwrd_ids, window=5): 40 | sty_ids, aut_ids, wrd_ids = move(self.xp, rsty_ids, raut_ids, rwrd_ids) 41 | pivot_idx = next(move(self.xp, rwrd_ids[window: -window])) 42 | pivot = F.embed_id(pivot_idx, self.sampler.W) 43 | sty_at_pivot = rsty_ids[window: -window] 44 | aut_at_pivot = raut_ids[window: -window] 45 | sty = self.mixture_sty(next(move(self.xp, sty_at_pivot))) 46 | aut = self.mixture_aut(next(move(self.xp, aut_at_pivot))) 47 | loss = 0.0 48 | start, end = window, rwrd_ids.shape[0] - window 49 | context = sty + aut + F.dropout(pivot, self.dropout_ratio) 50 | for frame in range(-window, window + 1): 51 | # Skip predicting the current pivot 52 | if frame == 0: 53 | continue 54 | # Predict word given context and pivot word 55 | # The target starts before the pivot 56 | targetidx = rwrd_ids[start + frame: end + frame] 57 | sty_at_target = rsty_ids[start + frame: end + frame] 58 | aut_at_target = raut_ids[start + frame: end + frame] 59 | sty_is_same = sty_at_target == sty_at_pivot 60 | aut_is_same = aut_at_target == aut_at_pivot 61 | # Randomly dropout words (default is to never do this) 62 | rand = np.random.uniform(0, 1, sty_is_same.shape[0]) 63 | mask = (rand > self.word_dropout_ratio).astype('bool') 64 | sty_and_aut_are_same = np.logical_and(sty_is_same, aut_is_same) 65 | weight = np.logical_and(sty_and_aut_are_same, mask).astype('int32') 66 | # If weight is 1.0 then targetidx 67 | # If weight is 0.0 then -1 68 | targetidx = targetidx * weight + -1 * (1 - weight) 69 | target, = move(self.xp, targetidx) 70 | loss = self.sampler(context, target) 71 | loss.backward() 72 | return loss.data 73 | -------------------------------------------------------------------------------- /examples/hacker_news/lda2vec/lda2vec_run.py: -------------------------------------------------------------------------------- 1 | # Author: Chris Moody 2 | # License: MIT 3 | 4 | # This simple example loads the newsgroups data from sklearn 5 | # and train an LDA-like model on it 6 | import os.path 7 | import pickle 8 | import time 9 | 10 | import chainer 11 | from chainer import cuda 12 | from chainer import serializers 13 | import chainer.optimizers as O 14 | import numpy as np 15 | 16 | from lda2vec import utils 17 | from lda2vec import prepare_topics, print_top_words_per_topic 18 | from lda2vec_model import LDA2Vec 19 | 20 | gpu_id = int(os.getenv('CUDA_GPU', 0)) 21 | cuda.get_device(gpu_id).use() 22 | print "Using GPU " + str(gpu_id) 23 | 24 | # You must run preprocess.py before this data becomes available 25 | vocab = pickle.load(open('../data/vocab', 'r')) 26 | corpus = pickle.load(open('../data/corpus', 'r')) 27 | data = np.load(open('../data/data.npz', 'r')) 28 | flattened = data['flattened'] 29 | story_id = data['story_id'] 30 | author_id = data['author_id'] 31 | time_id = data['time_id'] 32 | ranking = data['ranking'].astype('float32') 33 | score = data['score'].astype('float32') 34 | 35 | 36 | # Model Parameters 37 | # Number of documents 38 | n_stories = story_id.max() + 
1 39 | # Number of users 40 | n_authors = author_id.max() + 1 41 | # Number of unique words in the vocabulary 42 | n_vocab = flattened.max() + 1 43 | # Number of dimensions in a single word vector 44 | n_units = 256 45 | # Number of topics to fit 46 | n_story_topics = 40 47 | n_author_topics = 20 48 | batchsize = 4096 49 | # Get the string representation for every compact key 50 | words = corpus.word_list(vocab)[:n_vocab] 51 | 52 | # How many tokens are in each story 53 | sty_idx, lengths = np.unique(story_id, return_counts=True) 54 | sty_len = np.zeros(sty_idx.max() + 1, dtype='int32') 55 | sty_len[sty_idx] = lengths 56 | 57 | # How many tokens are in each author 58 | aut_idx, lengths = np.unique(author_id, return_counts=True) 59 | aut_len = np.zeros(aut_idx.max() + 1, dtype='int32') 60 | aut_len[aut_idx] = lengths 61 | 62 | # Count all token frequencies 63 | tok_idx, freq = np.unique(flattened, return_counts=True) 64 | term_frequency = np.zeros(n_vocab, dtype='int32') 65 | term_frequency[tok_idx] = freq 66 | 67 | model = LDA2Vec(n_stories=n_stories, n_story_topics=n_story_topics, 68 | n_authors=n_authors, n_author_topics=n_author_topics, 69 | n_units=n_units, n_vocab=n_vocab, counts=term_frequency, 70 | n_samples=15) 71 | if os.path.exists('lda2vec.hdf5'): 72 | print "Reloading from saved" 73 | serializers.load_hdf5("lda2vec.hdf5", model) 74 | model.to_gpu() 75 | optimizer = O.Adam() 76 | optimizer.setup(model) 77 | clip = chainer.optimizer.GradientClipping(5.0) 78 | optimizer.add_hook(clip) 79 | 80 | j = 0 81 | epoch = 0 82 | fraction = batchsize * 1.0 / flattened.shape[0] 83 | for epoch in range(5000): 84 | ts = prepare_topics(cuda.to_cpu(model.mixture_sty.weights.W.data).copy(), 85 | cuda.to_cpu(model.mixture_sty.factors.W.data).copy(), 86 | cuda.to_cpu(model.sampler.W.data).copy(), 87 | words) 88 | print_top_words_per_topic(ts) 89 | ts['doc_lengths'] = sty_len 90 | ts['term_frequency'] = term_frequency 91 | np.savez('topics.story.pyldavis', **ts) 92 | ta = prepare_topics(cuda.to_cpu(model.mixture_aut.weights.W.data).copy(), 93 | cuda.to_cpu(model.mixture_aut.factors.W.data).copy(), 94 | cuda.to_cpu(model.sampler.W.data).copy(), 95 | words) 96 | print_top_words_per_topic(ta) 97 | ta['doc_lengths'] = aut_len 98 | ta['term_frequency'] = term_frequency 99 | np.savez('topics.author.pyldavis', **ta) 100 | for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened): 101 | t0 = time.time() 102 | optimizer.zero_grads() 103 | l = model.fit_partial(s.copy(), a.copy(), f.copy()) 104 | prior = model.prior() 105 | loss = prior * fraction 106 | loss.backward() 107 | optimizer.update() 108 | msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} " 109 | "P:{prior:1.3e} R:{rate:1.3e}") 110 | prior.to_cpu() 111 | loss.to_cpu() 112 | t1 = time.time() 113 | dt = t1 - t0 114 | rate = batchsize / dt 115 | logs = dict(loss=float(l), epoch=epoch, j=j, 116 | prior=float(prior.data), rate=rate) 117 | print msg.format(**logs) 118 | j += 1 119 | serializers.save_hdf5("lda2vec.hdf5", model) 120 | -------------------------------------------------------------------------------- /examples/hacker_news/lda2vec/word_vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Vector Math" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this notebook we'll demo that word2vec-like properties are kept. 
You can download the vectors, follow along at home, and make your own queries if you'd like.\n", 15 | "\n", 16 | "Sums:\n", 17 | "\n", 18 | "1. `silicon valley ~ california + technology` \n", 19 | "2. `uber ~ taxis + company`\n", 20 | "3. `baidu ~ china + search engine`\n", 21 | "\n", 22 | "Analogies:\n", 23 | "\n", 24 | "1. `Mark Zuckerberg - Facebook + Amazon = Jeff Bezos`\n", 25 | "1. `Hacker News - story + article = StackOverflow`\n", 26 | "1. `VIM - terminal + graphics = Photoshop`\n", 27 | "\n", 28 | "And slightly more whimsically:\n", 29 | "\n", 30 | "1. `vegeables - eat + drink = tea`\n", 31 | "2. `scala - features + simple = haskell`" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 37, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "--2016-04-17 12:56:06-- https://zenodo.org/record/49903/files/vocab.npy\n", 46 | "Resolving zenodo.org (zenodo.org)... 188.184.66.202\n", 47 | "Connecting to zenodo.org (zenodo.org)|188.184.66.202|:443... connected.\n", 48 | "HTTP request sent, awaiting response... 200 OK\n", 49 | "Length: 81754640 (78M) [application/octet-stream]\n", 50 | "Saving to: ‘vocab.npy’\n", 51 | "\n", 52 | "vocab.npy 100%[=====================>] 77.97M 9.21MB/s in 23s \n", 53 | "\n", 54 | "2016-04-17 12:56:32 (3.37 MB/s) - ‘vocab.npy’ saved [81754640/81754640]\n", 55 | "\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "!wget https://zenodo.org/record/49903/files/vocab.npy" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 36, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "--2016-04-17 12:55:41-- https://zenodo.org/record/49903/files/word_vectors.npy\n", 75 | "Resolving zenodo.org (zenodo.org)... 188.184.66.202\n", 76 | "Connecting to zenodo.org (zenodo.org)|188.184.66.202|:443... connected.\n", 77 | "HTTP request sent, awaiting response... 200 OK\n", 78 | "Length: 116273232 (111M) [application/octet-stream]\n", 79 | "Saving to: ‘word_vectors.npy’\n", 80 | "\n", 81 | "word_vectors.npy 100%[=====================>] 110.89M 6.64MB/s in 21s \n", 82 | "\n", 83 | "2016-04-17 12:56:06 (5.31 MB/s) - ‘word_vectors.npy’ saved [116273232/116273232]\n", 84 | "\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "!wget https://zenodo.org/record/49903/files/word_vectors.npy" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "You don't need to run the code below unless you've trained your own model. Otherwise, just download the word vectors from the URL above." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 32, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "#from lda2vec_model import LDA2Vec\n", 108 | "#from chainer import serializers\n", 109 | "#import numpy as np\n", 110 | "#import pandas as pd\n", 111 | "#import pickle\n", 112 | "#\n", 113 | "#features = pd.read_pickle(\"../data/features.pd\")\n", 114 | "#vocab = np.load(\"../data/vocab\")\n", 115 | "#npz = np.load(open('topics.story.pyldavis.npz', 'r'))\n", 116 | "#dat = {k: v for (k, v) in npz.iteritems()}\n", 117 | "#vocab = dat['vocab'].tolist()\n", 118 | "#dat = np.load(\"../data/data.npz\")\n", 119 | "#n_stories = features.story_id_codes.max() + 1\n", 120 | "#n_units = 256\n", 121 | "#n_vocab = dat['flattened'].max() + 1\n", 122 | "#model = LDA2Vec(n_stories=n_stories, n_story_topics=40,\n", 123 | "# n_authors=5664, n_author_topics=20,\n", 124 | "# n_units=n_units, n_vocab=n_vocab, counts=np.zeros(n_vocab),\n", 125 | "# n_samples=15)\n", 126 | "#serializers.load_hdf5(\"/home/chris/lda2vec-12/examples/hacker_news/lda2vec/lda2vec.hdf5\", model)\n", 127 | "#np.save(\"word_vectors\", model.sampler.W.data)\n", 128 | "#np.save(\"vocab\", vocab)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 2, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "import numpy as np\n", 140 | "word_vectors_raw = np.load(\"word_vectors.npy\")\n", 141 | "vocab = np.load(\"vocab.npy\").tolist()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "L2 Normalize the word vectors" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 15, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "word_vectors = word_vectors_raw / np.linalg.norm(word_vectors_raw, axis=-1)[:, None]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 16, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "def get_vector(token):\n", 171 | " index = vocab.index(token)\n", 172 | " return word_vectors[index, :].copy()\n", 173 | "\n", 174 | "def most_similar(token, n=20):\n", 175 | " word_vector = get_vector(token)\n", 176 | " similarities = np.dot(word_vectors, word_vector)\n", 177 | " top = np.argsort(similarities)[::-1][:n]\n", 178 | " return [vocab[i] for i in top]\n", 179 | "\n", 180 | "# This is Levy & Goldberg's 3Cosmul Metric\n", 181 | "# Based on the Gensim implementation: https://github.com/piskvorky/gensim/blob/master/gensim/models/word2vec.py\n", 182 | "def cosmul(positives, negatives, topn=20):\n", 183 | " positive = [get_vector(p) for p in positives]\n", 184 | " negative = [get_vector(n) for n in negatives]\n", 185 | " pos_dists = [((1 + np.dot(word_vectors, term)) / 2.) for term in positive]\n", 186 | " neg_dists = [((1 + np.dot(word_vectors, term)) / 2.) 
for term in negative]\n", 187 | " dists = np.prod(pos_dists, axis=0) / (np.prod(neg_dists, axis=0) + 1e-6)\n", 188 | " idxs = np.argsort(dists)[::-1][:topn]\n", 189 | " return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]\n", 190 | "def most_similar_posneg(positives, negatives, topn=20):\n", 191 | " positive = np.sum([get_vector(p) for p in positives], axis=0)\n", 192 | " negative = np.sum([get_vector(n) for n in negatives], axis=0)\n", 193 | " vector = positive - negative\n", 194 | " dists = np.dot(word_vectors, vector)\n", 195 | " idxs = np.argsort(dists)[::-1][:topn]\n", 196 | " return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 17, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "[u'san francisco',\n", 210 | " u'new york',\n", 211 | " u'nyc',\n", 212 | " u'palo alto',\n", 213 | " u'mountain view',\n", 214 | " u'boston',\n", 215 | " u'seattle',\n", 216 | " u'sf',\n", 217 | " u'los angeles',\n", 218 | " u'new york city',\n", 219 | " u'london',\n", 220 | " u'ny',\n", 221 | " u'brooklyn',\n", 222 | " u'chicago',\n", 223 | " u'austin',\n", 224 | " u'atlanta',\n", 225 | " u'portland',\n", 226 | " u'san jose',\n", 227 | " u'san mateo',\n", 228 | " u'sunnyvale']" 229 | ] 230 | }, 231 | "execution_count": 17, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "most_similar('san francisco')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 18, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "[u'silicon valley',\n", 251 | " u'in',\n", 252 | " u'new york',\n", 253 | " u'u.s.',\n", 254 | " u'west',\n", 255 | " u'tech',\n", 256 | " u'usa',\n", 257 | " u'san francisco',\n", 258 | " u'japan',\n", 259 | " u'america',\n", 260 | " u'dc',\n", 261 | " u'industry',\n", 262 | " u'canada',\n", 263 | " u'new york city',\n", 264 | " u'nyc',\n", 265 | " u'area',\n", 266 | " u'valley',\n", 267 | " u'china']" 268 | ] 269 | }, 270 | "execution_count": 18, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "cosmul(['california', 'technology'], [], topn=20)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 19, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "[u'currencies',\n", 290 | " u'bitcoin',\n", 291 | " u'goods',\n", 292 | " u'physical',\n", 293 | " u'gold',\n", 294 | " u'fiat',\n", 295 | " u'trading',\n", 296 | " u'cryptocurrency',\n", 297 | " u'bitcoins',\n", 298 | " u'electronic',\n", 299 | " u'analog',\n", 300 | " u'transfers',\n", 301 | " u'banking',\n", 302 | " u'commodity',\n", 303 | " u'mining',\n", 304 | " u'virtual currency',\n", 305 | " u'other currencies',\n", 306 | " u'media']" 307 | ] 308 | }, 309 | "execution_count": 19, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "cosmul(['digital', 'currency'], [], topn=20)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 20, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "[u'vim',\n", 329 | " u'emacs',\n", 330 | " u'editor',\n", 331 | " u'sublime',\n", 332 | 
" u'tmux',\n", 333 | " u'shell',\n", 334 | " u'iterm',\n", 335 | " u'vi',\n", 336 | " u'ide',\n", 337 | " u'debugger',\n", 338 | " u'latex',\n", 339 | " u'gui',\n", 340 | " u'gvim',\n", 341 | " u'notepad',\n", 342 | " u'eclipse',\n", 343 | " u'command line',\n", 344 | " u'terminal.app',\n", 345 | " u'window manager']" 346 | ] 347 | }, 348 | "execution_count": 20, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "cosmul(['text editor', 'terminal'], [], topn=20)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 35, 360 | "metadata": { 361 | "collapsed": false 362 | }, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "[u'russia',\n", 368 | " u'india',\n", 369 | " u'japan',\n", 370 | " u'africa',\n", 371 | " u'korea',\n", 372 | " u'germany',\n", 373 | " u'other countries',\n", 374 | " u'asia',\n", 375 | " u'ukraine',\n", 376 | " u'iran',\n", 377 | " u'brazil',\n", 378 | " u'israel',\n", 379 | " u'usa',\n", 380 | " u'vietnam',\n", 381 | " u'france',\n", 382 | " u'countries',\n", 383 | " u'south korea',\n", 384 | " u'hong kong',\n", 385 | " u'europe']" 386 | ] 387 | }, 388 | "execution_count": 35, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "cosmul(['china'], [], topn=20)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 21, 400 | "metadata": { 401 | "collapsed": false 402 | }, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "[u'baidu',\n", 408 | " u'google',\n", 409 | " u'google search',\n", 410 | " u'india',\n", 411 | " u'russia',\n", 412 | " u'japan',\n", 413 | " u'iran',\n", 414 | " u'country',\n", 415 | " u'yandex',\n", 416 | " u'africa',\n", 417 | " u'duckduckgo',\n", 418 | " u'south korea',\n", 419 | " u'bing',\n", 420 | " u'france',\n", 421 | " u'beijing',\n", 422 | " u'hong kong',\n", 423 | " u'great firewall',\n", 424 | " u'search engines']" 425 | ] 426 | }, 427 | "execution_count": 21, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "cosmul(['china', 'search engine'], [], topn=20)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 22, 439 | "metadata": { 440 | "collapsed": false 441 | }, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "[u'apple',\n", 447 | " u'ms',\n", 448 | " u'msft',\n", 449 | " u'google',\n", 450 | " u'nokia',\n", 451 | " u'adobe',\n", 452 | " u'samsung',\n", 453 | " u'hp',\n", 454 | " u'rim',\n", 455 | " u'oracle',\n", 456 | " u'valve',\n", 457 | " u'mozilla',\n", 458 | " u'ibm',\n", 459 | " u'motorola',\n", 460 | " u'oems',\n", 461 | " u'ballmer',\n", 462 | " u'intel',\n", 463 | " u'ms.',\n", 464 | " u'canonical']" 465 | ] 466 | }, 467 | "execution_count": 22, 468 | "metadata": {}, 469 | "output_type": "execute_result" 470 | } 471 | ], 472 | "source": [ 473 | "cosmul(['microsoft'], [], topn=20)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 23, 479 | "metadata": { 480 | "collapsed": false 481 | }, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "[u'apple',\n", 487 | " u'google',\n", 488 | " u'enterprise',\n", 489 | " u'azure',\n", 490 | " u'ms',\n", 491 | " u'skydrive',\n", 492 | " u'sharepoint',\n", 493 | " u'walled garden',\n", 494 | " u'icloud',\n", 495 | " u'oracle',\n", 496 | " u'chrome os',\n", 497 | " u'cloud services',\n", 498 | " u'android market',\n", 499 | " u'adobe',\n", 500 | " u'app 
store',\n", 501 | " u'rackspace',\n", 502 | " u'hp',\n", 503 | " u'samsung']" 504 | ] 505 | }, 506 | "execution_count": 23, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | } 510 | ], 511 | "source": [ 512 | "cosmul(['microsoft', 'cloud'], [], topn=20)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "Queen is several rankings down, so not exactly the same as out of the box word2vec!" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 24, 525 | "metadata": { 526 | "collapsed": false 527 | }, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "[u'professional context',\n", 533 | " u'female',\n", 534 | " u'pawn',\n", 535 | " u'content farm',\n", 536 | " u'queen',\n", 537 | " u'career trajectory',\n", 538 | " u'real risk',\n", 539 | " u'philadelphia',\n", 540 | " u'teen',\n", 541 | " u'shitty place',\n", 542 | " u'prussia',\n", 543 | " u'criminal offense',\n", 544 | " u'main theme',\n", 545 | " u'she',\n", 546 | " u'magician',\n", 547 | " u'gray area',\n", 548 | " u'herself',\n", 549 | " u'best site']" 550 | ] 551 | }, 552 | "execution_count": 24, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "cosmul(['king', 'woman'], ['man'], topn=20)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 25, 564 | "metadata": { 565 | "collapsed": false 566 | }, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "Most similar\n", 573 | "mark zuckerberg\n", 574 | "bill gates\n", 575 | "zuckerberg\n", 576 | "larry page\n", 577 | "zuck\n", 578 | "steve jobs\n", 579 | "sergey brin\n", 580 | "jeff bezos\n", 581 | "gates\n", 582 | "warren buffet\n", 583 | "ceo\n", 584 | "peter thiel\n", 585 | "paul allen\n", 586 | "sean parker\n", 587 | "jack dorsey\n", 588 | "paul graham\n", 589 | "richard branson\n", 590 | "sergey\n", 591 | "linus torvalds\n", 592 | "larry ellison\n", 593 | "\n", 594 | "Cosmul\n", 595 | "jeff bezos\n", 596 | "elon musk\n", 597 | "warren buffet\n", 598 | "bezos\n", 599 | "michael dell\n", 600 | "bill gates\n", 601 | "musk\n", 602 | "hp\n", 603 | "toshiba\n", 604 | "dell\n", 605 | "richard branson\n", 606 | "elon\n", 607 | "buffet\n", 608 | "john carmack\n", 609 | "steve wozniak\n", 610 | "asus\n", 611 | "ford\n", 612 | "morgan\n", 613 | "\n", 614 | "Traditional Similarity\n", 615 | "jeff bezos\n", 616 | "bill gates\n", 617 | "elon musk\n", 618 | "bezos\n", 619 | "warren buffet\n", 620 | "michael dell\n", 621 | "hp\n", 622 | "musk\n", 623 | "richard branson\n", 624 | "dell\n", 625 | "toshiba\n", 626 | "john carmack\n", 627 | "buffet\n", 628 | "peter thiel\n", 629 | "steve wozniak\n", 630 | "gates\n", 631 | "steve jobs\n", 632 | "ford\n" 633 | ] 634 | } 635 | ], 636 | "source": [ 637 | "print 'Most similar'\n", 638 | "print '\\n'.join(most_similar('mark zuckerberg'))\n", 639 | "print '\\nCosmul'\n", 640 | "pos = ['mark zuckerberg', 'amazon']\n", 641 | "neg = ['facebook']\n", 642 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 643 | "print '\\nTraditional Similarity'\n", 644 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 26, 650 | "metadata": { 651 | "collapsed": false 652 | }, 653 | "outputs": [ 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "Most similar\n", 659 | "hacker news\n", 660 | "hn\n", 661 | "hn.\n", 662 | 
"reddit\n", 663 | "front page\n", 664 | "hackernews\n", 665 | "commenting\n", 666 | "posted\n", 667 | "frontpage\n", 668 | "comment\n", 669 | "posting\n", 670 | "upvoted\n", 671 | "slashdot\n", 672 | "news.yc\n", 673 | "comments\n", 674 | "posts\n", 675 | "proggit\n", 676 | "post\n", 677 | "techcrunch\n", 678 | "top story\n", 679 | "\n", 680 | "Cosmul\n", 681 | "stack overflow\n", 682 | "stackoverflow\n", 683 | "answers\n", 684 | "answering\n", 685 | "answer\n", 686 | "questions\n", 687 | "quora\n", 688 | "answered\n", 689 | "ask\n", 690 | "hn\n", 691 | "other questions\n", 692 | "other question\n", 693 | "programming questions\n", 694 | "asking\n", 695 | "stackexchange\n", 696 | "stack exchange\n", 697 | "why\n", 698 | "basic questions\n", 699 | "\n", 700 | "Traditional Similarity\n", 701 | "stack overflow\n", 702 | "answer\n", 703 | "stackoverflow\n", 704 | "answering\n", 705 | "answers\n", 706 | "hn\n", 707 | "questions\n", 708 | "answered\n", 709 | "quora\n", 710 | "ask\n", 711 | "asking\n", 712 | "other question\n", 713 | "other questions\n", 714 | "first question\n", 715 | "stackexchange\n", 716 | "hn.\n", 717 | "programming questions\n", 718 | "hackernews\n" 719 | ] 720 | } 721 | ], 722 | "source": [ 723 | "pos = ['hacker news', 'question']\n", 724 | "neg = ['story']\n", 725 | "\n", 726 | "print 'Most similar'\n", 727 | "print '\\n'.join(most_similar(pos[0]))\n", 728 | "print '\\nCosmul'\n", 729 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 730 | "print '\\nTraditional Similarity'\n", 731 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 27, 737 | "metadata": { 738 | "collapsed": false 739 | }, 740 | "outputs": [ 741 | { 742 | "name": "stdout", 743 | "output_type": "stream", 744 | "text": [ 745 | "Most similar\n", 746 | "san francisco\n", 747 | "new york\n", 748 | "nyc\n", 749 | "palo alto\n", 750 | "mountain view\n", 751 | "boston\n", 752 | "seattle\n", 753 | "sf\n", 754 | "los angeles\n", 755 | "new york city\n", 756 | "london\n", 757 | "ny\n", 758 | "brooklyn\n", 759 | "chicago\n", 760 | "austin\n", 761 | "atlanta\n", 762 | "portland\n", 763 | "san jose\n", 764 | "san mateo\n", 765 | "sunnyvale\n", 766 | "\n", 767 | "Cosmul\n", 768 | "new york\n", 769 | "nyc\n", 770 | "palo alto\n", 771 | "mountain view\n", 772 | "boston\n", 773 | "seattle\n", 774 | "sf\n", 775 | "los angeles\n", 776 | "new york city\n", 777 | "london\n", 778 | "ny\n", 779 | "brooklyn\n", 780 | "chicago\n", 781 | "austin\n", 782 | "atlanta\n", 783 | "portland\n", 784 | "san jose\n", 785 | "san mateo\n", 786 | "sunnyvale\n", 787 | "\n", 788 | "Traditional Similarity\n", 789 | "new york\n", 790 | "nyc\n", 791 | "palo alto\n", 792 | "mountain view\n", 793 | "boston\n", 794 | "seattle\n", 795 | "sf\n", 796 | "los angeles\n", 797 | "new york city\n", 798 | "london\n", 799 | "ny\n", 800 | "brooklyn\n", 801 | "chicago\n", 802 | "austin\n", 803 | "atlanta\n", 804 | "portland\n", 805 | "san jose\n", 806 | "san mateo\n", 807 | "sunnyvale\n" 808 | ] 809 | } 810 | ], 811 | "source": [ 812 | "pos = ['san francisco']\n", 813 | "neg = []\n", 814 | "\n", 815 | "print 'Most similar'\n", 816 | "print '\\n'.join(most_similar(pos[0]))\n", 817 | "print '\\nCosmul'\n", 818 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 819 | "print '\\nTraditional Similarity'\n", 820 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 28, 826 | "metadata": { 827 | 
"collapsed": false 828 | }, 829 | "outputs": [ 830 | { 831 | "name": "stdout", 832 | "output_type": "stream", 833 | "text": [ 834 | "Most similar\n", 835 | "nlp\n", 836 | "machine learning\n", 837 | "data mining\n", 838 | "computer vision\n", 839 | "natural language processing\n", 840 | "ml\n", 841 | "image processing\n", 842 | "analytics\n", 843 | "classification\n", 844 | "algorithms\n", 845 | "data science\n", 846 | "hadoop\n", 847 | "analysis\n", 848 | "ai\n", 849 | "clustering\n", 850 | "mapreduce\n", 851 | "algorithm design\n", 852 | "information retrieval\n", 853 | "data analysis\n", 854 | "statistical\n", 855 | "\n", 856 | "Cosmul\n", 857 | "computer vision\n", 858 | "machine learning\n", 859 | "data mining\n", 860 | "image processing\n", 861 | "ai\n", 862 | "analytics\n", 863 | "algorithm\n", 864 | "randomized\n", 865 | "classification\n", 866 | "natural language processing\n", 867 | "hadoop\n", 868 | "engine\n", 869 | "statistical\n", 870 | "analysis\n", 871 | "machine\n", 872 | "clustering\n", 873 | "ml\n", 874 | "artificial intelligence\n", 875 | "neo4j\n", 876 | "\n", 877 | "Traditional Similarity\n", 878 | "computer vision\n", 879 | "machine learning\n", 880 | "data mining\n", 881 | "image processing\n", 882 | "ai\n", 883 | "analytics\n", 884 | "algorithm\n", 885 | "natural language processing\n", 886 | "classification\n", 887 | "randomized\n", 888 | "analysis\n", 889 | "ml\n", 890 | "hadoop\n", 891 | "engine\n", 892 | "machine\n", 893 | "statistical\n", 894 | "clustering\n", 895 | "visualization\n" 896 | ] 897 | } 898 | ], 899 | "source": [ 900 | "pos = ['nlp', 'image']\n", 901 | "neg = ['text']\n", 902 | "\n", 903 | "print 'Most similar'\n", 904 | "print '\\n'.join(most_similar(pos[0]))\n", 905 | "print '\\nCosmul'\n", 906 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 907 | "print '\\nTraditional Similarity'\n", 908 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 29, 914 | "metadata": { 915 | "collapsed": false 916 | }, 917 | "outputs": [ 918 | { 919 | "name": "stdout", 920 | "output_type": "stream", 921 | "text": [ 922 | "Most similar\n", 923 | "vim\n", 924 | "emacs\n", 925 | "vi\n", 926 | "sublime\n", 927 | "tmux\n", 928 | "textmate\n", 929 | "eclipse\n", 930 | "sublime text\n", 931 | "macvim\n", 932 | "zsh\n", 933 | "org-mode\n", 934 | "terminal\n", 935 | "st2\n", 936 | "bbedit\n", 937 | "intellij\n", 938 | "text editor\n", 939 | "latex\n", 940 | "notepad++\n", 941 | "netbeans\n", 942 | "other editors\n", 943 | "\n", 944 | "Cosmul\n", 945 | "photoshop\n", 946 | "animations\n", 947 | "typography\n", 948 | "programming\n", 949 | "layout\n", 950 | "textures\n", 951 | "web design\n", 952 | "fonts\n", 953 | "coding\n", 954 | "illustrator\n", 955 | "common lisp\n", 956 | "design\n", 957 | "prototyping\n", 958 | "canvas\n", 959 | "css.\n", 960 | "css\n", 961 | "diagrams\n", 962 | "vector graphics\n", 963 | "usability\n", 964 | "\n", 965 | "Traditional Similarity\n", 966 | "photoshop\n", 967 | "animations\n", 968 | "textures\n", 969 | "layout\n", 970 | "typography\n", 971 | "programming\n", 972 | "fonts\n", 973 | "coding\n", 974 | "illustrator\n", 975 | "design\n", 976 | "web design\n", 977 | "common lisp\n", 978 | "canvas\n", 979 | "photography\n", 980 | "ides\n", 981 | "visual\n", 982 | "animation\n", 983 | "css\n" 984 | ] 985 | } 986 | ], 987 | "source": [ 988 | "pos = ['vim', 'graphics']\n", 989 | "neg = ['terminal']\n", 990 | "\n", 991 | "print 'Most similar'\n", 992 | "print 
'\\n'.join(most_similar(pos[0]))\n", 993 | "print '\\nCosmul'\n", 994 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 995 | "print '\\nTraditional Similarity'\n", 996 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 30, 1002 | "metadata": { 1003 | "collapsed": false 1004 | }, 1005 | "outputs": [ 1006 | { 1007 | "name": "stdout", 1008 | "output_type": "stream", 1009 | "text": [ 1010 | "Most similar\n", 1011 | "vegetables\n", 1012 | "meat\n", 1013 | "rice\n", 1014 | "meats\n", 1015 | "fruit\n", 1016 | "veggies\n", 1017 | "pasta\n", 1018 | "salads\n", 1019 | "eat\n", 1020 | "fruits\n", 1021 | "cheese\n", 1022 | "carrots\n", 1023 | "potatoes\n", 1024 | "beans\n", 1025 | "seafood\n", 1026 | "soy\n", 1027 | "yogurt\n", 1028 | "spices\n", 1029 | "dairy\n", 1030 | "fats\n", 1031 | "\n", 1032 | "Cosmul\n", 1033 | "tea\n", 1034 | "coffee\n", 1035 | "beer\n", 1036 | "drinking\n", 1037 | "red wine\n", 1038 | "soda\n", 1039 | "cup\n", 1040 | "alcohol\n", 1041 | "cups\n", 1042 | "vodka\n", 1043 | "rice\n", 1044 | "fruit\n", 1045 | "whisky\n", 1046 | "orange juice\n", 1047 | "milk\n", 1048 | "espresso\n", 1049 | "drinks\n", 1050 | "carrots\n", 1051 | "\n", 1052 | "Traditional Similarity\n", 1053 | "tea\n", 1054 | "coffee\n", 1055 | "beer\n", 1056 | "drinking\n", 1057 | "soda\n", 1058 | "red wine\n", 1059 | "cup\n", 1060 | "alcohol\n", 1061 | "rice\n", 1062 | "cups\n", 1063 | "fruit\n", 1064 | "vodka\n", 1065 | "milk\n", 1066 | "drinks\n", 1067 | "orange juice\n", 1068 | "carrots\n", 1069 | "whisky\n", 1070 | "pasta\n" 1071 | ] 1072 | } 1073 | ], 1074 | "source": [ 1075 | "pos = ['vegetables', 'drink']\n", 1076 | "neg = ['eat']\n", 1077 | "\n", 1078 | "print 'Most similar'\n", 1079 | "print '\\n'.join(most_similar(pos[0]))\n", 1080 | "print '\\nCosmul'\n", 1081 | "print '\\n'.join(cosmul(pos, neg, topn=20))\n", 1082 | "print '\\nTraditional Similarity'\n", 1083 | "print '\\n'.join(most_similar_posneg(pos, neg, topn=20))" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": 31, 1089 | "metadata": { 1090 | "collapsed": false 1091 | }, 1092 | "outputs": [ 1093 | { 1094 | "name": "stdout", 1095 | "output_type": "stream", 1096 | "text": [ 1097 | "Most similar\n", 1098 | "lda\n", 1099 | "linear\n", 1100 | "kmeans\n", 1101 | "clustering\n", 1102 | "-2\n", 1103 | "176\n", 1104 | "classification\n", 1105 | "svm\n", 1106 | "10000000\n", 1107 | "minaway\n", 1108 | "mb/s\n", 1109 | "statistical\n", 1110 | "173\n", 1111 | "ans\n", 1112 | "joiner\n", 1113 | "stdev\n", 1114 | "because:


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/data/preprocess.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import logging
 7 | import pickle
 8 | 
 9 | from sklearn.datasets import fetch_20newsgroups
10 | import numpy as np
11 | 
12 | from lda2vec import preprocess, Corpus
13 | 
14 | logging.basicConfig()
15 | 
16 | # Fetch data
17 | remove = ('headers', 'footers', 'quotes')
18 | texts = fetch_20newsgroups(subset='train', remove=remove).data
19 | # Remove tokens with these substrings
20 | bad = set(["ax>", '`@("', '---', '===', '^^^'])
21 | 
22 | 
23 | def clean(line):
24 |     return ' '.join(w for w in line.split() if not any(t in w for t in bad))
25 | 
26 | # Preprocess data
27 | max_length = 10000   # Limit of 10k words per document
28 | # Convert to unicode (spaCy only works with unicode)
29 | texts = [clean(d) for d in texts]
30 | tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,
31 |                                     n_threads=4)
32 | corpus = Corpus()
33 | # Make a ranked list of rare vs frequent words
34 | corpus.update_word_count(tokens)
35 | corpus.finalize()
36 | # The tokenization uses spaCy indices, and so may have gaps
37 | # between indices for words that aren't present in our dataset.
38 | # This builds a new compact index
39 | compact = corpus.to_compact(tokens)
40 | # Remove extremely rare words
41 | pruned = corpus.filter_count(compact, min_count=30)
42 | # Convert the compactified arrays into bag of words arrays
43 | bow = corpus.compact_to_bow(pruned)
44 | # Words tend to have power law frequency, so selectively
45 | # downsample the most prevalent words
46 | clean = corpus.subsample_frequent(pruned)
47 | # Now flatten a 2D array of document per row and word position
48 | # per column to a 1D array of words. This will also remove skips
49 | # and OoV words
50 | doc_ids = np.arange(pruned.shape[0])
51 | flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
52 | assert flattened.min() >= 0
53 | # Fill in the pretrained word vectors
54 | n_dim = 300
55 | fn_wordvc = 'GoogleNews-vectors-negative300.bin'
56 | vectors, s, f = corpus.compact_word_vectors(vocab, filename=fn_wordvc)
57 | # Save all of the preprocessed files
58 | pickle.dump(vocab, open('vocab.pkl', 'wb'))
59 | pickle.dump(corpus, open('corpus.pkl', 'wb'))
60 | np.save("flattened", flattened)
61 | np.save("doc_ids", doc_ids)
62 | np.save("pruned", pruned)
63 | np.save("bow", bow)
64 | np.save("vectors", vectors)
65 | 
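
The script above saves several aligned artifacts (vocab.pkl, corpus.pkl, flattened.npy, doc_ids.npy, pruned.npy, bow.npy, vectors.npy) that the training scripts below reload. A small sanity-check sketch, assuming preprocess.py has already been run in this directory (exact shapes depend on the spaCy model and the min_count pruning):

import numpy as np

# `flattened` holds one compact word index per token; `doc_ids` holds the
# document each token came from, so the two arrays must stay aligned.
flattened = np.load("flattened.npy")
doc_ids = np.load("doc_ids.npy")
assert flattened.shape == doc_ids.shape
assert flattened.min() >= 0

# The training scripts derive the vocabulary size from the data itself.
n_vocab = flattened.max() + 1

# The pretrained vectors are indexed by the same compact vocabulary, so they
# need at least n_vocab rows before being copied into the sampler weights.
vectors = np.load("vectors.npy")
print(n_vocab, flattened.shape, vectors.shape)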


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda/lda.py:
--------------------------------------------------------------------------------
 1 | import chainer
 2 | import chainer.links as L
 3 | import chainer.functions as F
 4 | 
 5 | from lda2vec import utils, dirichlet_likelihood
 6 | 
 7 | import numpy as np
 8 | 
 9 | 
10 | class LDA(chainer.Chain):
11 |     def __init__(self, n_docs, n_topics, n_dim, n_vocab):
12 |         factors = np.random.random((n_topics, n_dim)).astype('float32')
13 |         super(LDA, self).__init__(proportions=L.EmbedID(n_docs, n_topics),
14 |                                   factors=L.Parameter(factors),
15 |                                   embedding=L.Linear(n_dim, n_vocab))
16 |         self.n_docs = n_docs
17 |         self.n_topics = n_topics
18 |         self.n_vocab = n_vocab
19 |         self.n_dim = n_dim
20 | 
21 |     def forward(self, ids, bow):
22 |         bow, ids = utils.move(self.xp, bow, ids)
23 |         proportions = self.proportions(ids)
24 |         ld = dirichlet_likelihood(proportions)
25 |         doc = F.matmul(F.softmax(proportions), self.factors())
26 |         logp = F.dropout(self.embedding(doc))
27 |         # loss = -F.sum(bow * F.log_softmax(logp))
28 |         sources, targets, counts = [], [], []
29 |         lpi =  F.sum(bow * F.log_softmax(logp), axis=1)
30 |         loss = -F.sum(lpi)
31 |         return loss, ld
32 | 
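
The forward pass above mixes per-document topic proportions with shared topic factors and scores the document's bag of words against the resulting word distribution. A minimal plain-NumPy sketch of that computation with toy sizes, ignoring dropout and the Dirichlet term (not the Chain itself; the names mirror the links above):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

n_docs, n_topics, n_dim, n_vocab = 4, 3, 8, 50
rng = np.random.RandomState(0)

proportions = rng.randn(n_docs, n_topics)      # one row per document (the EmbedID table)
factors = rng.randn(n_topics, n_dim)           # shared topic vectors (L.Parameter)
embedding = rng.randn(n_dim, n_vocab)          # linear map from topic space to the vocabulary

doc = softmax(proportions) @ factors           # each document is a convex mix of topic vectors
logp = np.log(softmax(doc @ embedding))        # log-probability of every word in every document

bow = rng.poisson(1.0, size=(n_docs, n_vocab)) # toy word counts
loss = -(bow * logp).sum()                     # same reconstruction term as `-F.sum(lpi)` above
print(loss)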


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda/lda_run.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody 
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import os.path
 7 | import pickle
 8 | import time
 9 | 
10 | from chainer import serializers
11 | from chainer import cuda
12 | import chainer.optimizers as O
13 | import chainer.link as L
14 | import numpy as np
15 | 
16 | # from lda2vec import prepare_topics, print_top_words_per_topic
17 | # from lda2vec import utils
18 | from lda2vec import topics, utils
19 | from lda import LDA
20 | 
21 | gpu_id = int(os.getenv('CUDA_GPU', 0))
22 | cuda.get_device(gpu_id).use()
23 | print("Using GPU ", str(gpu_id))
24 | 
25 | vocab = pickle.load(open('../data/vocab.pkl', 'rb'))
26 | corpus = pickle.load(open('../data/corpus.pkl', 'rb'))
27 | bow = np.load("../data/bow.npy").astype('float32')
28 | # Remove bow counts on the first two tokens, which are the skip and out-of-vocabulary markers
29 | bow[:, :2] = 0
30 | # Normalize bag of words to be a probability
31 | # bow = bow / bow.sum(axis=1)[:, None]
32 | 
33 | # Number of docs
34 | n_docs = bow.shape[0]
35 | # Number of unique words in the vocabulary
36 | n_vocab = bow.shape[1]
37 | # Number of dimensions in a single word vector
38 | n_units = 256
39 | # number of topics
40 | n_topics = 20
41 | batchsize = 128
42 | counts = corpus.keys_counts[:n_vocab]
43 | # Get the string representation for every compact key
44 | words = corpus.word_list(vocab)[:n_vocab]
45 | 
46 | model = LDA(n_docs, n_topics, n_units, n_vocab)
47 | if os.path.exists('lda.hdf5'):
48 |     print("Reloading from saved")
49 |     serializers.load_hdf5("lda.hdf5", model)
50 | model.to_gpu()
51 | optimizer = O.Adam()
52 | optimizer.setup(model)
53 | 
54 | j = 0
55 | fraction = batchsize * 1.0 / bow.shape[0]
56 | for epoch in range(50000000):
57 |     if epoch % 100 == 0:
58 |         p = cuda.to_cpu(model.proportions.W.data).copy()
59 |         f = cuda.to_cpu(model.factors.W.data).copy()
60 |         w = cuda.to_cpu(model.embedding.W.data).copy()
61 |         d = topics.prepare_topics(p, f, w, words)
62 |         topics.print_top_words_per_topic(d)
63 |     for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
64 |         t0 = time.time()
65 |         # optimizer.zero_grads()
66 |         model.cleargrads()
67 |         rec, ld = model.forward(ids, batch)
68 |         l = rec + ld
69 |         l.backward()
70 |         optimizer.update()
71 |         msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
72 |                "P:{ld:1.3e} R:{rate:1.3e}")
73 |         l.to_cpu()
74 |         rec.to_cpu()
75 |         ld.to_cpu()
76 |         t1 = time.time()
77 |         dt = t1 - t0
78 |         rate = batchsize / dt
79 |         logs = dict(rec=float(rec.data), epoch=epoch, j=j,
80 |                     ld=float(ld.data), rate=rate)
81 |         print(msg.format(**logs))
82 |         j += 1
83 |     if epoch % 100 == 0:
84 |         serializers.save_hdf5("lda.hdf5", model)
85 | 


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda/topics.pyldavis.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/examples/twenty_newsgroups/lda/topics.pyldavis.npz


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda2vec/lda2vec_model.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import EmbedMixture
 2 | from lda2vec import dirichlet_likelihood
 3 | from lda2vec.utils import move
 4 | 
 5 | from chainer import Chain
 6 | import chainer.links as L
 7 | import chainer.functions as F
 8 | 
 9 | import numpy as np
10 | 
11 | 
12 | class LDA2Vec(Chain):
13 |     def __init__(self, n_documents=100, n_document_topics=10,
14 |                  n_units=256, n_vocab=1000, dropout_ratio=0.5, train=True,
15 |                  counts=None, n_samples=15, word_dropout_ratio=0.0,
16 |                  power=0.75, temperature=1.0):
17 |         em = EmbedMixture(n_documents, n_document_topics, n_units,
18 |                           dropout_ratio=dropout_ratio, temperature=temperature)
19 |         kwargs = {}
20 |         kwargs['mixture'] = em
21 |         kwargs['sampler'] = L.NegativeSampling(n_units, counts, n_samples,
22 |                                                power=power)
23 |         super(LDA2Vec, self).__init__(**kwargs)
24 |         rand = np.random.random(self.sampler.W.data.shape)
25 |         self.sampler.W.data[:, :] = rand[:, :]
26 |         self.n_units = n_units
27 |         self.train = train
28 |         self.dropout_ratio = dropout_ratio
29 |         self.word_dropout_ratio = word_dropout_ratio
30 |         self.n_samples = n_samples
31 | 
32 |     def prior(self):
33 |         dl1 = dirichlet_likelihood(self.mixture.weights)
34 |         return dl1
35 | 
36 |     def fit_partial(self, rdoc_ids, rword_indices, window=5,
37 |                     update_only_docs=False):
38 |         doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
39 |         pivot_idx = next(move(self.xp, rword_indices[window: -window]))
40 |         pivot = F.embed_id(pivot_idx, self.sampler.W)
41 |         if update_only_docs:
42 |             pivot.unchain_backward()
43 |         doc_at_pivot = rdoc_ids[window: -window]
44 |         doc = self.mixture(next(move(self.xp, doc_at_pivot)),
45 |                            update_only_docs=update_only_docs)
46 |         loss = 0.0
47 |         start, end = window, rword_indices.shape[0] - window
48 |         context = (F.dropout(doc, self.dropout_ratio) +
49 |                    F.dropout(pivot, self.dropout_ratio))
50 |         for frame in range(-window, window + 1):
51 |             # Skip predicting the current pivot
52 |             if frame == 0:
53 |                 continue
54 |             # Predict word given context and pivot word
55 |             # The target starts before the pivot
56 |             targetidx = rword_indices[start + frame: end + frame]
57 |             doc_at_target = rdoc_ids[start + frame: end + frame]
58 |             doc_is_same = doc_at_target == doc_at_pivot
59 |             rand = np.random.uniform(0, 1, doc_is_same.shape[0])
60 |             mask = (rand > self.word_dropout_ratio).astype('bool')
61 |             weight = np.logical_and(doc_is_same, mask).astype('int32')
62 |             # If weight is 1.0 then targetidx
63 |             # If weight is 0.0 then -1
64 |             targetidx = targetidx * weight + -1 * (1 - weight)
65 |             target, = move(self.xp, targetidx)
66 |             loss = self.sampler(context, target)
67 |             loss.backward()
68 |             if update_only_docs:
69 |                 # Wipe out any gradient accumulation on word vectors
70 |                 self.sampler.W.grad *= 0.0
71 |         return loss.data
72 | 
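
fit_partial above masks out context targets that belong to a different document than the pivot, or that lose the word-dropout draw, by rewriting their index to -1 (the ignore value the inline comments refer to). A toy NumPy illustration of just that masking step, with made-up document and word ids:

import numpy as np

rng = np.random.RandomState(0)
word_dropout_ratio = 0.25

# One document id and one word id per token, as in fit_partial with window=2, frame=+1
rdoc_ids  = np.array([0, 0, 0, 0, 1, 1, 1, 1], dtype='int32')
rword_idx = np.array([5, 8, 2, 9, 4, 7, 3, 6], dtype='int32')
window, frame = 2, 1
start, end = window, rword_idx.shape[0] - window

doc_at_pivot  = rdoc_ids[window:-window]
doc_at_target = rdoc_ids[start + frame: end + frame]
targetidx     = rword_idx[start + frame: end + frame]

doc_is_same = doc_at_target == doc_at_pivot
mask = rng.uniform(0, 1, doc_is_same.shape[0]) > word_dropout_ratio
weight = np.logical_and(doc_is_same, mask).astype('int32')

# Kept targets stay as-is; dropped or cross-document targets become -1,
# the "weight is 0.0" case described in the comments above.
targetidx = targetidx * weight + -1 * (1 - weight)
print(targetidx)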


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda2vec/lda2vec_run.py:
--------------------------------------------------------------------------------
  1 | # Author: Chris Moody 
  2 | # License: MIT
  3 | 
  4 | # This simple example loads the newsgroups data from sklearn
  5 | # and trains an LDA-like model on it
  6 | import os
  7 | import os.path
  8 | import pdb
  9 | import pickle
 10 | import time
 11 | import shelve
 12 | 
 13 | import chainer
 14 | from chainer import cuda
 15 | from chainer import serializers
 16 | import chainer.optimizers as O
 17 | import numpy as np
 18 | 
 19 | from lda2vec import topics, utils
 20 | # from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
 21 | from lda2vec_model import LDA2Vec
 22 | 
 23 | gpu_id = int(os.getenv('CUDA_GPU', 0))
 24 | cuda.get_device(gpu_id).use()
 25 | print ("Using GPU ", str(gpu_id))
 26 | 
 27 | data_dir = os.getenv('data_dir', '../data/')
 28 | fn_vocab = '{data_dir:s}/vocab.pkl'.format(data_dir=data_dir)
 29 | fn_corpus = '{data_dir:s}/corpus.pkl'.format(data_dir=data_dir)
 30 | fn_flatnd = '{data_dir:s}/flattened.npy'.format(data_dir=data_dir)
 31 | fn_docids = '{data_dir:s}/doc_ids.npy'.format(data_dir=data_dir)
 32 | fn_vectors = '{data_dir:s}/vectors.npy'.format(data_dir=data_dir)
 33 | vocab = pickle.load(open(fn_vocab, 'rb'))
 34 | corpus = pickle.load(open(fn_corpus, 'rb'))
 35 | flattened = np.load(fn_flatnd)
 36 | doc_ids = np.load(fn_docids)
 37 | vectors = np.load(fn_vectors)
 38 | 
 39 | # Model Parameters
 40 | # Number of documents
 41 | n_docs = doc_ids.max() + 1
 42 | # Number of unique words in the vocabulary
 43 | n_vocab = flattened.max() + 1
 44 | # 'Strength' of the Dirichlet prior; 200.0 seems to work well
 45 | clambda = 200.0
 46 | # Number of topics to fit
 47 | n_topics = int(os.getenv('n_topics', 20))
 48 | batchsize = 4096
 49 | # Power for neg sampling
 50 | power = float(os.getenv('power', 0.75))
 51 | # Initialize with pretrained word vectors
 52 | pretrained = bool(int(os.getenv('pretrained', True)))
 53 | # Sampling temperature
 54 | temperature = float(os.getenv('temperature', 1.0))
 55 | # Number of dimensions in a single word vector
 56 | n_units = int(os.getenv('n_units', 300))
 57 | # Get the string representation for every compact key
 58 | words = corpus.word_list(vocab)[:n_vocab]
 59 | # How many tokens are in each document
 60 | doc_idx, lengths = np.unique(doc_ids, return_counts=True)
 61 | doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
 62 | doc_lengths[doc_idx] = lengths
 63 | # Count all token frequencies
 64 | tok_idx, freq = np.unique(flattened, return_counts=True)
 65 | term_frequency = np.zeros(n_vocab, dtype='int32')
 66 | term_frequency[tok_idx] = freq
 67 | 
 68 | for key in sorted(locals().keys()):
 69 |     val = locals()[key]
 70 |     if len(str(val)) < 100 and '<' not in str(val):
 71 |         print(key, val)
 72 | 
 73 | model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
 74 |                 n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
 75 |                 n_samples=15, power=power, temperature=temperature)
 76 | if os.path.exists('lda2vec.hdf5'):
 77 |     print("Reloading from saved")
 78 |     serializers.load_hdf5("lda2vec.hdf5", model)
 79 | if pretrained:
 80 |     model.sampler.W.data[:, :] = vectors[:n_vocab, :]
 81 | model.to_gpu()
 82 | optimizer = O.Adam()
 83 | optimizer.setup(model)
 84 | clip = chainer.optimizer.GradientClipping(5.0)
 85 | optimizer.add_hook(clip)
 86 | 
 87 | j = 0
 88 | epoch = 0
 89 | fraction = batchsize * 1.0 / flattened.shape[0]
 90 | progress = shelve.open('progress.shelve')
 91 | for epoch in range(200):
 92 |     data = topics.prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
 93 |                           cuda.to_cpu(model.mixture.factors.W.data).copy(),
 94 |                           cuda.to_cpu(model.sampler.W.data).copy(),
 95 |                           words)
 96 |     top_words = topics.print_top_words_per_topic(data)
 97 |     if j % 100 == 0 and j > 100:
 98 |         coherence = topics.topic_coherence(top_words)
 99 |         for t in range(n_topics):
100 |             print(t, coherence[(t, 'cv')])
101 |         kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
102 |         progress[str(epoch)] = pickle.dumps(kw)
103 |     data['doc_lengths'] = doc_lengths
104 |     data['term_frequency'] = term_frequency
105 |     np.savez('topics.pyldavis', **data)
106 |     for d, f in utils.chunks(batchsize, doc_ids, flattened):
107 |         t0 = time.time()
108 |         # optimizer.zero_grads()
109 |         model.cleargrads()
110 |         l = model.fit_partial(d.copy(), f.copy())
111 |         prior = model.prior()
112 |         loss = prior * fraction
113 |         loss.backward()
114 |         optimizer.update()
115 |         msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
116 |                "P:{prior:1.3e} R:{rate:1.3e}")
117 |         prior.to_cpu()
118 |         loss.to_cpu()
119 |         t1 = time.time()
120 |         dt = t1 - t0
121 |         rate = batchsize / dt
122 |         logs = dict(loss=float(l), epoch=epoch, j=j,
123 |                     prior=float(prior.data), rate=rate)
124 |         print(msg.format(**logs))
125 |         j += 1
126 |     serializers.save_hdf5("lda2vec.hdf5", model)
127 | 
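
Each epoch the run script dumps topic/document/term arrays to topics.pyldavis.npz. A sketch of how such a file is typically visualized, assuming the keys written by prepare_topics line up with pyLDAvis.prepare's keyword arguments (which the .pyldavis naming suggests) and that a 'vocab' key is stored alongside the distributions:

import numpy as np
import pyLDAvis

npz = np.load('topics.pyldavis.npz')
data = {name: npz[name] for name in npz.files}
# pyLDAvis expects the vocabulary as a plain list of strings.
data['vocab'] = data['vocab'].tolist()

vis = pyLDAvis.prepare(**data)
pyLDAvis.save_html(vis, 'lda2vec_topics.html')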


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/lda2vec/topics.pyldavis.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/examples/twenty_newsgroups/lda2vec/topics.pyldavis.npz


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nslda/nslda.py:
--------------------------------------------------------------------------------
 1 | import chainer
 2 | import chainer.links as L
 3 | import chainer.functions as F
 4 | 
 5 | from lda2vec import utils, dirichlet_likelihood
 6 | 
 7 | import numpy as np
 8 | 
 9 | 
10 | class NSLDA(chainer.Chain):
11 |     def __init__(self, counts, n_docs, n_topics, n_dim, n_vocab, n_samples=5):
12 |         factors = np.random.random((n_topics, n_dim)).astype('float32')
13 |         loss_func = L.NegativeSampling(n_dim, counts, n_samples)
14 |         loss_func.W.data[:, :] = np.random.randn(*loss_func.W.data.shape)
15 |         loss_func.W.data[:, :] /= np.sqrt(np.prod(loss_func.W.data.shape))
16 |         super(NSLDA, self).__init__(proportions=L.EmbedID(n_docs, n_topics),
17 |                                     factors=L.Parameter(factors),
18 |                                     loss_func=loss_func)
19 |         self.n_docs = n_docs
20 |         self.n_topics = n_topics
21 |         self.n_vocab = n_vocab
22 |         self.n_dim = n_dim
23 | 
24 |     def forward(self, doc, wrd, window=5):
25 |         doc, wrd = utils.move(self.xp, doc, wrd)
26 |         proportions = self.proportions(doc)
27 |         ld = dirichlet_likelihood(self.proportions.W)
28 |         context = F.matmul(F.softmax(proportions), self.factors())
29 |         loss = self.loss_func(context, wrd)
30 |         return loss, ld
31 | 
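
NSLDA builds one context vector per token by looking up that token's document proportions, softmaxing them, and mixing the shared topic factors; the negative-sampling loss then asks that context to predict the token. A small NumPy sketch of the context construction only, with toy sizes (the loss itself is left to chainer):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.RandomState(0)
n_docs, n_topics, n_dim = 3, 4, 8

proportions = rng.randn(n_docs, n_topics)   # the EmbedID table, one row per document
factors = rng.randn(n_topics, n_dim)        # shared topic vectors

# One entry per token: which document it belongs to
doc = np.array([0, 0, 0, 1, 1, 2], dtype='int32')

# Gather each token's document proportions, then mix the topic factors;
# every token from the same document gets the same context vector.
context = softmax(proportions[doc]) @ factors
print(context.shape)   # (n_tokens, n_dim)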


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nslda/nslda_run.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody 
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import os.path
 7 | import pickle
 8 | import time
 9 | 
10 | from chainer import serializers
11 | from chainer import cuda
12 | import chainer.optimizers as O
13 | import numpy as np
14 | 
15 | from lda2vec import prepare_topics, print_top_words_per_topic
16 | from lda2vec import utils
17 | from nslda import NSLDA
18 | 
19 | gpu_id = int(os.getenv('CUDA_GPU', 0))
20 | cuda.get_device(gpu_id).use()
21 | print("Using GPU " + str(gpu_id))
22 | 
23 | vocab = pickle.load(open('../data/vocab.pkl', 'rb'))
24 | corpus = pickle.load(open('../data/corpus.pkl', 'rb'))
25 | doc_id = np.load("../data/doc_ids.npy")
26 | flattened = np.load("../data/flattened.npy")
27 | 
28 | # Number of docs
29 | n_docs = doc_id.max() + 1
30 | # Number of unique words in the vocabulary
31 | n_vocab = flattened.max() + 1
32 | # Number of dimensions in a single word vector
33 | n_units = 256
34 | # number of topics
35 | n_topics = 20
36 | batchsize = 4096 * 8
37 | # Strength of Dirichlet prior
38 | strength = 1.0
39 | counts = corpus.keys_counts[:n_vocab]
40 | # Get the string representation for every compact key
41 | words = corpus.word_list(vocab)[:n_vocab]
42 | 
43 | model = NSLDA(counts, n_docs, n_topics, n_units, n_vocab)
44 | if os.path.exists('nslda.hdf5'):
45 |     print("Reloading from saved")
46 |     serializers.load_hdf5("nslda.hdf5", model)
47 | model.to_gpu()
48 | optimizer = O.Adam()
49 | optimizer.setup(model)
50 | 
51 | j = 0
52 | fraction = batchsize * 1.0 / flattened.shape[0]
53 | for epoch in range(50000000):
54 |     p = cuda.to_cpu(model.proportions.W.data).copy()
55 |     f = cuda.to_cpu(model.factors.W.data).copy()
56 |     w = cuda.to_cpu(model.loss_func.W.data).copy()
57 |     d = prepare_topics(p, f, w, words)
58 |     print_top_words_per_topic(d)
59 |     for (doc_ids, flat) in utils.chunks(batchsize, doc_id, flattened):
60 |         t0 = time.time()
61 |         optimizer.zero_grads()
62 |         rec, ld = model.forward(doc_ids, flat)
63 |         l = rec + ld * fraction * strength
64 |         l.backward()
65 |         optimizer.update()
66 |         msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
67 |                "P:{ld:1.3e} R:{rate:1.3e}")
68 |         l.to_cpu()
69 |         rec.to_cpu()
70 |         ld.to_cpu()
71 |         t1 = time.time()
72 |         dt = t1 - t0
73 |         rate = batchsize / dt
74 |         logs = dict(rec=float(rec.data), epoch=epoch, j=j,
75 |                     ld=float(ld.data), rate=rate)
76 |         print(msg.format(**logs))
77 |         j += 1
78 |     if epoch % 100 == 0:
79 |         serializers.save_hdf5("nslda.hdf5", model)
80 | 


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nvdm/nvdm.py:
--------------------------------------------------------------------------------
 1 | import chainer
 2 | import chainer.links as L
 3 | import chainer.functions as F
 4 | 
 5 | from lda2vec import utils
 6 | 
 7 | 
 8 | class NVDM(chainer.Chain):
 9 |     def __init__(self, n_vocab, n_dim):
10 |         super(NVDM, self).__init__(l1=L.Linear(n_vocab, n_dim),
11 |                                    l2=L.Linear(n_dim, n_dim),
12 |                                    mu_logsigma=L.Linear(n_dim, n_dim * 2),
13 |                                    embedding=L.Linear(n_dim, n_vocab))
14 |         self.n_vocab = n_vocab
15 |         self.n_dim = n_dim
16 | 
17 |     def encode(self, bow):
18 |         """ Convert the bag of words vector of shape (n_docs, n_vocab)
19 |         into latent mean and log-variance vectors.
20 |         """
21 |         lam = F.relu(self.l1(bow))
22 |         pi = F.relu(self.l2(lam))
23 |         mu, log_sigma = F.split_axis(self.mu_logsigma(pi), 2, 1)
24 |         sample = F.gaussian(mu, log_sigma)
25 |         loss = F.gaussian_kl_divergence(mu, log_sigma)
26 |         return sample, loss
27 | 
28 |     def decode(self, sample, bow):
29 |         """ Decode latent document vectors back into word counts
30 |         (n_docs, n_vocab).
31 |         """
32 |         logprob = F.log_softmax(self.embedding(sample))
33 |         # This is equivalent to a softmax_cross_entropy where instead of
34 |         # guessing 1 of N words we have repeated observations
35 |         # Normal softmax for guessing the next word is:
36 |         # t log softmax(x), where t is 0 or 1
37 |         # Softmax for guessing word counts is simply doing
38 |         # the above more times, so multiply by the count
39 |         # count log softmax(x)
40 |         loss = -F.sum(bow * logprob)
41 |         return loss
42 | 
43 |     def observe(self, bow):
44 |         bow, = utils.move(self.xp, bow * 1.0)
45 |         sample, kl = self.encode(bow)
46 |         rec = self.decode(sample, bow)
47 |         return rec, kl
48 | 
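
The comments in decode argue that weighting log-softmax by the word count is equivalent to applying the usual next-word softmax loss once per repeated observation. A standalone NumPy check of that identity with toy logits and counts (independent of the Chain above):

import numpy as np

def log_softmax(x):
    x = x - x.max()
    return x - np.log(np.exp(x).sum())

rng = np.random.RandomState(0)
n_vocab = 6
logits = rng.randn(n_vocab)
counts = np.array([3, 0, 1, 2, 0, 4])   # bag-of-words counts for one document

logprob = log_softmax(logits)

# "count log softmax(x)" form used in NVDM.decode
loss_counts = -(counts * logprob).sum()

# One softmax cross-entropy term per repeated observation of each word
observations = np.repeat(np.arange(n_vocab), counts)
loss_repeated = -logprob[observations].sum()

assert np.allclose(loss_counts, loss_repeated)
print(loss_counts, loss_repeated)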


--------------------------------------------------------------------------------
/examples/twenty_newsgroups/nvdm/nvdm_run.py:
--------------------------------------------------------------------------------
 1 | # Author: Chris Moody 
 2 | # License: MIT
 3 | 
 4 | # This simple example loads the newsgroups data from sklearn
 5 | # and trains an LDA-like model on it
 6 | import os.path
 7 | import pickle
 8 | import time
 9 | 
10 | from chainer import serializers
11 | import chainer.optimizers as O
12 | import numpy as np
13 | 
14 | from lda2vec import utils
15 | from nvdm import NVDM
16 | 
17 | vocab = pickle.load(open('vocab.pkl', 'rb'))
18 | corpus = pickle.load(open('corpus.pkl', 'rb'))
19 | bow = np.load("bow.npy").astype('float32')
21 | # Remove bow counts on the first two tokens, which are the skip and out-of-vocabulary markers
21 | bow[:, :2] = 0
22 | # Normalize bag of words to be a probability
23 | bow = bow / bow.sum(axis=1)[:, None]
24 | 
25 | # Number of unique words in the vocabulary
26 | n_vocab = bow.shape[1]
27 | # Number of dimensions in a single word vector
28 | n_units = 256
29 | batchsize = 128
30 | counts = corpus.keys_counts[:n_vocab]
31 | # Get the string representation for every compact key
32 | words = corpus.word_list(vocab)[:n_vocab]
33 | 
34 | model = NVDM(n_vocab, n_units)
35 | if os.path.exists('nvdm.hdf5'):
36 |     print("Reloading from saved")
37 |     serializers.load_hdf5("nvdm.hdf5", model)
38 | # model.to_gpu()
39 | optimizer = O.Adam()
40 | optimizer.setup(model)
41 | 
42 | j = 0
43 | fraction = batchsize * 1.0 / bow.shape[0]
44 | for epoch in range(500):
45 |     for (batch,) in utils.chunks(batchsize, bow):
46 |         t0 = time.time()
47 |         rec, kl = model.observe(batch)
48 |         optimizer.zero_grads()
49 |         l = rec + kl
50 |         l.backward()
51 |         optimizer.update()
52 |         msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
53 |                "P:{kl:1.3e} R:{rate:1.3e}")
54 |         l.to_cpu()
55 |         rec.to_cpu()
56 |         kl.to_cpu()
57 |         t1 = time.time()
58 |         dt = t1 - t0
59 |         rate = batchsize / dt
60 |         logs = dict(rec=float(rec.data), epoch=epoch, j=j,
61 |                     kl=float(kl.data), rate=rate)
62 |         print(msg.format(**logs))
63 |         j += 1
64 |     serializers.save_hdf5("nvdm.hdf5", model)
65 | 


--------------------------------------------------------------------------------
/images/img00_word2vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img00_word2vec.png


--------------------------------------------------------------------------------
/images/img01_lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img01_lda.png


--------------------------------------------------------------------------------
/images/img02_lda_topics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img02_lda_topics.png


--------------------------------------------------------------------------------
/images/img03_lda2vec_topics01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img03_lda2vec_topics01.png


--------------------------------------------------------------------------------
/images/img04_lda2vec_topics02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img04_lda2vec_topics02.png


--------------------------------------------------------------------------------
/images/img05_lda2vec_topics03_supervised.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img05_lda2vec_topics03_supervised.png


--------------------------------------------------------------------------------
/images/img06_pyldavis.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/images/img06_pyldavis.gif


--------------------------------------------------------------------------------
/lda2vec/__init__.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | assert sys.version_info.major == 3, "Must use Python 3!"
 5 | 
 6 | sys.path.append(os.path.dirname(__file__))
 7 | 
 8 | import dirichlet_likelihood
 9 | import embed_mixture
10 | # import tracking
11 | # import preprocess
12 | import corpus
13 | # import topics
14 | import negative_sampling
15 | 
16 | dirichlet_likelihood = dirichlet_likelihood.dirichlet_likelihood
17 | EmbedMixture = embed_mixture.EmbedMixture
18 | # Tracking = tracking.Tracking
19 | # tokenize = preprocess.tokenize
20 | Corpus = corpus.Corpus
21 | # prepare_topics = topics.prepare_topics
22 | # print_top_words_per_topic = topics.print_top_words_per_topic
23 | # negative_sampling = negative_sampling.negative_sampling
24 | NegativeSampling = negative_sampling.negative_sampling
25 | # topic_coherence = topics.topic_coherence
26 | 
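
The assignments above define the package's public names, so downstream scripts can import them directly, e.g.:

    from lda2vec import Corpus, EmbedMixture, NegativeSampling, dirichlet_likelihood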


--------------------------------------------------------------------------------
/lda2vec/corpus.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import numpy as np
  3 | import difflib
  4 | import pandas as pd
  5 | 
  6 | try:
  7 |     from pyxdameraulevenshtein import damerau_levenshtein_distance_ndarray
  8 | except ImportError:
  9 |     pass
 10 | 
 11 | 
 12 | class Corpus():
 13 |     _keys_frequency = None
 14 | 
 15 |     def __init__(self, out_of_vocabulary=-1, skip=-2):
 16 |         """ The Corpus helps with tasks involving integer representations of
 17 |         words. This object is used to filter, subsample, and convert loose
 18 |         word indices to compact word indices.
 19 | 
 20 |         'Loose' word arrays are word indices given by a tokenizer. The word
 21 |         index is not necessarily representative of the word's frequency rank, and
 22 |         so loose arrays tend to have 'gaps' of unused indices, which can make
 23 |         models less memory efficient. As a result, this class helps convert
 24 |         a loose array to a 'compact' one where the most common words have low
 25 |         indices, and the most infrequent have high indices.
 26 | 
 27 |         Corpus maintains a count of how many of each word it has seen so
 28 |         that it can later selectively filter frequent or rare words. However,
 29 |         since word popularity rank could change with incoming data the word
 30 |         index count must be updated fully and `self.finalize()` must be called
 31 |         before any filtering and subsampling operations can happen.
 32 | 
 33 |         Arguments
 34 |         ---------
 35 |         out_of_vocabulary : int, default=-1
 36 |             Token index to replace whenever we encounter a rare or unseen word.
 37 |             Instead of skipping the token, we mark it as an out-of-vocabulary
 38 |             word.
 39 |         skip : int, default=-2
 40 |             Token index to replace whenever we want to skip the current frame.
 41 |             Particularly useful when subsampling words or when padding a
 42 |             sentence.
 43 | 
 44 |         Examples
 45 |         --------
 46 |         >>> corpus = Corpus()
 47 |         >>> words_raw = np.random.randint(100, size=25)
 48 |         >>> corpus.update_word_count(words_raw)
 49 |         >>> corpus.finalize()
 50 |         >>> words_compact = corpus.to_compact(words_raw)
 51 |         >>> words_pruned = corpus.filter_count(words_compact, min_count=2)
 52 |         >>> # words_sub = corpus.subsample_frequent(words_pruned, thresh=1e-5)
 53 |         >>> words_loose = corpus.to_loose(words_pruned)
 54 |         >>> not_oov = words_loose > -1
 55 |         >>> np.all(words_loose[not_oov] == words_raw[not_oov])
 56 |         True
 57 |         """
 58 |         self.counts_loose = defaultdict(int)
 59 |         self._finalized = False
 60 |         self.specials = dict(out_of_vocabulary=out_of_vocabulary,
 61 |                              skip=skip)
 62 | 
 63 |     @property
 64 |     def n_specials(self):
 65 |         return len(self.specials)
 66 | 
 67 |     def update_word_count(self, loose_array):
 68 |         """ Update the corpus word counts given a loose array of word indices.
 69 |         Can be called multiple times, but once `finalize` is called the word
 70 |         counts cannot be updated.
 71 | 
 72 |         Arguments
 73 |         ---------
 74 |         loose_array : int array
 75 |             Array of word indices.
 76 | 
 77 |         Examples
 78 |         --------
 79 |         >>> corpus = Corpus()
 80 |         >>> corpus.update_word_count(np.arange(10))
 81 |         >>> corpus.update_word_count(np.arange(8))
 82 |         >>> corpus.counts_loose[0]
 83 |         2
 84 |         >>> corpus.counts_loose[9]
 85 |         1
 86 |         """
 87 |         self._check_unfinalized()
 88 |         uniques, counts = np.unique(np.ravel(loose_array), return_counts=True)
 89 |         msg = "Loose arrays cannot have elements below the values of special "
 90 |         msg += "tokens as these indices are reserved"
 91 |         assert uniques.min() >= min(self.specials.values()), msg
 92 |         for k, v in zip(uniques, counts):
 93 |             self.counts_loose[k] += v
 94 | 
 95 |     def _loose_keys_ordered(self):
 96 |         """ Get the loose keys in order of decreasing frequency"""
 97 |         loose_counts = sorted(self.counts_loose.items(), key=lambda x: x[1],
 98 |                               reverse=True)
 99 |         keys = np.array(loose_counts)[:, 0]
100 |         counts = np.array(loose_counts)[:, 1]
101 |         order = np.argsort(counts)[::-1].astype('int32')
102 |         keys, counts = keys[order], counts[order]
103 |         # Add in the specials as a prefix to the other keys
104 |         specials = np.sort(list(self.specials.values()))
105 |         keys = np.concatenate((specials, keys))
106 |         empty = np.zeros(len(specials), dtype='int32')
107 |         counts = np.concatenate((empty, counts))
108 |         n_keys = keys.shape[0]
109 |         assert counts.min() >= 0
110 |         return keys, counts, n_keys
111 | 
112 |     def finalize(self):
113 |         """ Call `finalize` once done updating word counts. This means the
114 |         object will no longer accept new word count data, but the loose
115 |         to compact index mapping can be computed. This frees the object to
116 |         filter, subsample, and compactify incoming word arrays.
117 | 
118 |         Examples
119 |         --------
120 |         >>> corpus = Corpus()
121 |         >>> # We'll update the word counts, making sure that word index 2
122 |         >>> # is the most common word index.
123 |         >>> corpus.update_word_count(np.arange(1) + 2)
124 |         >>> corpus.update_word_count(np.arange(3) + 2)
125 |         >>> corpus.update_word_count(np.arange(10) + 2)
126 |         >>> corpus.update_word_count(np.arange(8) + 2)
127 |         >>> corpus.counts_loose[2]
128 |         4
129 |         >>> # The corpus has not been finalized yet, and so the compact mapping
130 |         >>> # has not yet been computed.
131 |         >>> corpus.keys_counts[0]
132 |         Traceback (most recent call last):
133 |             ...
134 |         AttributeError: 'Corpus' object has no attribute 'keys_counts'
135 |         >>> corpus.finalize()
136 |         >>> corpus.n_specials
137 |         2
138 |         >>> # The special tokens are mapped to the first compact indices
139 |         >>> corpus.compact_to_loose[0]
140 |         -2
141 |         >>> corpus.compact_to_loose[0] == corpus.specials['skip']
142 |         True
143 |         >>> corpus.compact_to_loose[1] == corpus.specials['out_of_vocabulary']
144 |         True
145 |         >>> corpus.compact_to_loose[2]  # Most popular token is mapped next
146 |         2
147 |         >>> corpus.loose_to_compact[3]  # 2nd most popular token is mapped next
148 |         4
149 |         >>> first_non_special = corpus.n_specials
150 |         >>> corpus.keys_counts[first_non_special] # First normal token
151 |         4
152 |         """
153 |         # Return the loose keys and counts in descending count order
154 |         # so that the counts arrays is already in compact order
155 |         self.keys_loose, self.keys_counts, n_keys = self._loose_keys_ordered()
156 |         self.keys_compact = np.arange(n_keys).astype('int32')
157 |         self.loose_to_compact = {l: c for l, c in
158 |                                  zip(self.keys_loose, self.keys_compact)}
159 |         self.compact_to_loose = {c: l for l, c in
160 |                                  self.loose_to_compact.items()}
161 |         self.specials_to_compact = {s: self.loose_to_compact[i]
162 |                                     for s, i in self.specials.items()}
163 |         self.compact_to_special = {c: s for s, c in
164 |                                    self.specials_to_compact.items()}
165 |         self._finalized = True
166 | 
167 |     @property
168 |     def keys_frequency(self):
169 |         if self._keys_frequency is None:
170 |             f = self.keys_counts * 1.0 / np.sum(self.keys_counts)
171 |             self._keys_frequency = f
172 |         return self._keys_frequency
173 | 
174 |     def _check_finalized(self):
175 |         msg = "self.finalize() must be called before any other array ops"
176 |         assert self._finalized, msg
177 | 
178 |     def _check_unfinalized(self):
179 |         msg = "Cannot update word counts after self.finalize() "
180 |         msg += "has been called"
181 |         assert not self._finalized, msg
182 | 
183 |     def filter_count(self, words_compact, min_count=15, max_count=0,
184 |                      max_replacement=None, min_replacement=None):
185 |         """ Replace words occurring fewer than min_count times with the
186 |         out-of-vocabulary index.
186 | 
187 |         Arguments
188 |         ---------
189 |         words_compact: int array
190 |             Source array whose values will be replaced. This is assumed to
191 |             already be converted into a compact array with `to_compact`.
192 |         min_count : int
193 |             Replace words occurring less frequently than this count. This
194 |             defines the threshold for what counts as a very rare word.
195 |         max_count : int
196 |             Replace words occurring more frequently than this count. This
197 |             defines the threshold for very frequent words.
198 |         min_replacement : int, default is out_of_vocabulary
199 |             Replace words less than min_count with this.
200 |         max_replacement : int, default is out_of_vocabulary
201 |             Replace words greater than max_count with this.
202 | 
203 |         Examples
204 |         --------
205 |         >>> corpus = Corpus()
206 |         >>> # Make 1000 word indices with index < 100 and
207 |         >>> # update the word counts.
208 |         >>> word_indices = np.random.randint(100, size=1000)
209 |         >>> corpus.update_word_count(word_indices)
210 |         >>> corpus.finalize()  # any word indices above 99 will be filtered
211 |         >>> # Now create a new text, but with some indices above 100
212 |         >>> word_indices = np.random.randint(200, size=1000)
213 |         >>> word_indices.max() < 100
214 |         False
215 |         >>> # Remove words that have never appeared in the original corpus.
216 |         >>> filtered = corpus.filter_count(word_indices, min_count=1)
217 |         >>> filtered.max() < 100
218 |         True
219 |         >>> # We can also remove highly frequent words.
220 |         >>> filtered = corpus.filter_count(word_indices, max_count=2)
221 |         >>> len(np.unique(word_indices)) > len(np.unique(filtered))
222 |         True
223 |         """
224 |         self._check_finalized()
225 |         ret = words_compact.copy()
226 |         if min_replacement is None:
227 |             min_replacement = self.specials_to_compact['out_of_vocabulary']
228 |         if max_replacement is None:
229 |             max_replacement = self.specials_to_compact['out_of_vocabulary']
230 |         not_specials = np.ones(self.keys_counts.shape[0], dtype='bool')
231 |         not_specials[:self.n_specials] = False
232 |         if min_count:
233 |             # Find first index with count less than min_count
234 |             min_idx = np.argmax(not_specials & (self.keys_counts < min_count))
235 |             # Replace all indices greater than min_idx
236 |             ret[ret > min_idx] = min_replacement
237 |         if max_count:
238 |             # Find first index with count less than max_count
239 |             max_idx = np.argmax(not_specials & (self.keys_counts < max_count))
240 |             # Replace all indices less than max_idx
241 |             ret[ret < max_idx] = max_replacement
242 |         return ret
243 | 
244 |     def subsample_frequent(self, words_compact, threshold=1e-5):
245 |         """ Subsample the most frequent words. This aggressively
246 |         replaces words with frequencies higher than `threshold`. Words
247 |         are replaced with the out_of_vocabulary token.
248 | 
249 |         Words will be replaced with probability as a function of their
250 |         frequency in the training corpus:
251 | 
252 |         .. math::
253 |             p(w) = 1.0 - \sqrt{threshold \over f(w)} - {threshold \over f(w)}
254 | 
255 |         Arguments
256 |         ---------
257 |         words_compact: int array
258 |             The input array to subsample.
259 |         threshold: float in [0, 1]
260 |             Words with frequencies higher than this will be increasingly
261 |             subsampled.
262 | 
263 |         Examples
264 |         --------
265 |         >>> corpus = Corpus()
266 |         >>> word_indices = (np.random.power(5.0, size=1000) * 100).astype('i')
267 |         >>> corpus.update_word_count(word_indices)
268 |         >>> corpus.finalize()
269 |         >>> compact = corpus.to_compact(word_indices)
270 |         >>> sampled = corpus.subsample_frequent(compact, threshold=1e-2)
271 |         >>> skip = corpus.specials_to_compact['skip']
272 |         >>> np.sum(compact == skip)  # No skips in the compact tokens
273 |         0
274 |         >>> np.sum(sampled == skip) > 0  # Many skips in the sampled tokens
275 |         True
276 | 
277 |         .. [1] Distributed Representations of Words and Phrases and
278 |                their Compositionality. Mikolov, Tomas and Sutskever, Ilya
279 |                and Chen, Kai and Corrado, Greg S and Dean, Jeff
280 |                Advances in Neural Information Processing Systems 26
281 |         """
282 |         self._check_finalized()
283 |         freq = self.keys_frequency + 1e-10
284 |         pw = 1.0 - (np.sqrt(threshold / freq) + threshold / freq)
285 |         prob = fast_replace(words_compact, self.keys_compact, pw)
286 |         draw = np.random.uniform(size=prob.shape)
287 |         ret = words_compact.copy()
288 |         # If probability greater than draw, skip the word
289 |         ret[prob > draw] = self.specials_to_compact['skip']
290 |         return ret
291 | 
292 |     def to_compact(self, word_loose):
293 |         """ Convert a loose word index matrix to a compact array using
294 |         a fixed loose to dense mapping. Out of vocabulary word indices
295 |         will be replaced by the out of vocabulary index. The most common
296 |         index will be mapped to 0, the next most common to 1, and so on.
297 | 
298 |         Arguments
299 |         ---------
300 |         word_loose : int array
301 |             Input loose word array to be converted into a compact array.
302 | 
303 | 
304 |         Examples
305 |         --------
306 |         >>> corpus = Corpus()
307 |         >>> word_indices = np.random.randint(100, size=1000)
308 |         >>> n_words = len(np.unique(word_indices))
309 |         >>> corpus.update_word_count(word_indices)
310 |         >>> corpus.finalize()
311 |         >>> word_compact = corpus.to_compact(word_indices)
312 |         >>> # The most common word in the training set will be mapped to be
313 |         >>> # right after all the special tokens, so 2 in this case.
314 |         >>> np.argmax(np.bincount(word_compact)) == 2
315 |         True
316 |         >>> most_common = np.argmax(np.bincount(word_indices))
317 |         >>> corpus.loose_to_compact[most_common] == 2
318 |         True
319 |         >>> # Out of vocabulary indices will be mapped to 1
320 |         >>> word_indices = np.random.randint(150, size=1000)
321 |         >>> word_compact_oov = corpus.to_compact(word_indices)
322 |         >>> oov = corpus.specials_to_compact['out_of_vocabulary']
323 |         >>> oov
324 |         1
325 |         >>> oov in word_compact
326 |         False
327 |         >>> oov in word_compact_oov
328 |         True
329 |         """
330 |         self._check_finalized()
331 |         keys = self.keys_loose
332 |         reps = self.keys_compact
333 |         uniques = np.unique(word_loose)
334 |         # Find the out of vocab indices
335 |         oov = np.setdiff1d(uniques, keys, assume_unique=True)
336 |         oov_token = self.specials_to_compact['out_of_vocabulary']
337 |         keys = np.concatenate((keys, oov))
338 |         reps = np.concatenate((reps, np.zeros_like(oov) + oov_token))
339 |         compact = fast_replace(word_loose, keys, reps)
340 |         msg = "Error: all compact indices should be non-negative"
341 |         assert compact.min() >= 0, msg
342 |         return compact
343 | 
344 |     def to_loose(self, word_compact):
345 |         """ Convert a compacted array back into a loose array.
346 | 
347 |         Arguments
348 |         ---------
349 |         word_compact : int array
350 |             Input compacted word array to be converted into a loose array.
351 | 
352 | 
353 |         Examples
354 |         --------
355 |         >>> corpus = Corpus()
356 |         >>> word_indices = np.random.randint(100, size=1000)
357 |         >>> corpus.update_word_count(word_indices)
358 |         >>> corpus.finalize()
359 |         >>> word_compact = corpus.to_compact(word_indices)
360 |         >>> word_loose = corpus.to_loose(word_compact)
361 |         >>> np.all(word_loose == word_indices)
362 |         True
363 |         """
364 |         self._check_finalized()
365 |         uniques = np.unique(word_compact)
366 |         # Find the out of vocab indices
367 |         oov = np.setdiff1d(uniques, self.keys_compact, assume_unique=True)
368 |         msg = "Found keys in `word_compact` not present in the "
369 |         msg += "training corpus. Is this actually a compacted array?"
370 |         assert np.all(oov < 0), msg
371 |         loose = fast_replace(word_compact, self.keys_compact, self.keys_loose)
372 |         return loose
373 | 
374 |     def compact_to_flat(self, word_compact, *components):
375 |         """ Ravel a 2D compact array of documents (rows) and word
376 |         positions (columns) into a 1D array of words. Leave out special
377 |         tokens and ravel the component arrays in the same fashion.
378 | 
379 |         Arguments
380 |         ---------
381 |         word_compact : int array
382 |             Array of word indices in documents. Has shape (n_docs, max_length)
383 |         components : list of arrays
384 |             A list of arrays detailing per-document properties. Each array
385 |             must be n_docs long.
386 | 
387 |         Returns
388 |         -------
389 |         flat : int array
390 |             An array of all words unravelled into a 1D shape
391 |         components : list of arrays
392 |             Each array here is also unravelled into the same shape
393 | 
394 |         Examples
395 |         --------
396 |         >>> corpus = Corpus()
397 |         >>> word_indices = np.random.randint(100, size=1000)
398 |         >>> corpus.update_word_count(word_indices)
399 |         >>> corpus.finalize()
400 |         >>> doc_texts = np.arange(8).reshape((2, 4))
401 |         >>> doc_texts[:, -1] = -2  # Mark as skips
402 |         >>> doc_ids = np.arange(2)
403 |         >>> compact = corpus.to_compact(doc_texts)
404 |         >>> oov = corpus.specials_to_compact['out_of_vocabulary']
405 |         >>> compact[1, 3] = oov  # Mark the last word as OOV
406 |         >>> flat = corpus.compact_to_flat(compact)
407 |         >>> flat.shape[0] == 6  # 2 special tokens were dropped from 8 words
408 |         True
409 |         >>> flat[-1] == corpus.loose_to_compact[doc_texts[1, 2]]
410 |         True
411 |         >>> flat, (flat_id,) = corpus.compact_to_flat(compact, doc_ids)
412 |         >>> flat_id
413 |         array([0, 0, 0, 1, 1, 1])
414 |         """
415 |         self._check_finalized()
416 |         n_docs = word_compact.shape[0]
417 |         max_length = word_compact.shape[1]
418 |         idx = word_compact >= self.n_specials  # specials fill indices [0, n_specials)
419 |         components_raveled = []
420 |         msg = "Length of each component must match `word_compact` size"
421 |         for component in components:
422 |             raveled = np.tile(component[:, None], max_length)[idx]
423 |             components_raveled.append(raveled)
424 |             assert len(component) == n_docs, msg
425 |         if len(components_raveled) == 0:
426 |             return word_compact[idx]
427 |         else:
428 |             return word_compact[idx], components_raveled
429 | 
430 |     def word_list(self, vocab, max_compact_index=None, oov_token=''):
431 |         """ Translate compact keys back into string representations for a word.
432 | 
433 |         Arguments
434 |         ---------
435 |         vocab : dict
436 |             The vocab object has loose indices as keys and word strings as
437 |             values.
438 | 
439 |         max_compact_index : int
440 |             Only return words up to this index. If None, defaults to the number
441 |             of compact indices available
442 | 
443 |         oov_token : str
444 |             Returns this string if a compact index does not have a word in the
445 |             vocab dictionary provided.
446 | 
447 |         Returns
448 |         -------
449 |         word_list : list
450 |             A list of string representations corresponding to word indices
451 |             zero to `max_compact_index`
452 | 
453 |         Examples
454 |         --------
455 | 
456 |         >>> vocab = {0: 'But', 1: 'the', 2: 'night', 3: 'was', 4: 'warm'}
457 |         >>> word_indices = np.zeros(50).astype('int32')
458 |         >>> word_indices[:25] = 0  # 'But' appears 25 times
459 |         >>> word_indices[25:35] = 1  # 'the' appears 10 times
460 |         >>> word_indices[40:46] = 2  # 'night' appears 6 times
461 |         >>> word_indices[46:49] = 3  # 'was' appears 3 times
462 |         >>> word_indices[49:] = 4  # 'warm' appears once
463 |         >>> corpus = Corpus()
464 |         >>> corpus.update_word_count(word_indices)
465 |         >>> corpus.finalize()
466 |         >>> # Build a vocabulary of word indices
467 |         >>> corpus.word_list(vocab)
468 |         ['skip', 'out_of_vocabulary', 'But', 'the', 'night', 'was', 'warm']
469 |         """
470 |         # Translate the compact keys into string words
471 |         oov = self.specials['out_of_vocabulary']
472 |         words = []
473 |         if max_compact_index is None:
474 |             max_compact_index = self.keys_compact.shape[0]
475 |         index_to_special = {i: s for s, i in self.specials.items()}
476 |         for compact_index in range(max_compact_index):
477 |             loose_index = self.compact_to_loose.get(compact_index, oov)
478 |             special = index_to_special.get(loose_index, oov_token)
479 |             string = vocab.get(loose_index, special)
480 |             words.append(string)
481 |         return words
482 | 
483 |     def compact_word_vectors(self, vocab, filename=None, array=None,
484 |                              top=20000):
485 |         """ Retrieve pretrained word vectors for our vocabulary.
486 |         The returned word array has row indices corresponding to the
487 |         compact index of a word, and columns corresponding to the word
488 |         vector.
489 | 
490 |         Arguments
491 |         ---------
492 |         vocab : dict
493 |             Dictionary where keys are the loose index, and values are
494 |             the word string.
495 | 
496 |         filename : str
497 |             Path to word2vec-format binary word vectors, loaded via gensim.
498 | 
499 |         array : numpy float array, optional
500 |             If given, fill this preallocated array with vectors rather than
501 |             allocating a new one.
502 | 
503 |         Returns
504 |         -------
505 |         data : numpy float array
506 |             Array such that data[compact_index, :] = word_vector
507 | 
508 |         Examples
509 |         --------
510 |         >>> import numpy.linalg as nl
511 |         >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
512 |         >>> word_indices = np.zeros(50).astype('int32')
513 |         >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
514 |         >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
515 |         >>> word_indices[40:46] = 7  # 'cold' appears 6 times
516 |         >>> word_indices[46:] = 3  # 'hot' appears 4 times
517 |         >>> corpus = Corpus()
518 |         >>> corpus.update_word_count(word_indices)
519 |         >>> corpus.finalize()
520 |         >>> v, s, f = corpus.compact_word_vectors(vocab)
521 |         >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
522 |         >>> vocab[corpus.compact_to_loose[2]]
523 |         'shuttle'
524 |         >>> vocab[corpus.compact_to_loose[3]]
525 |         'astronomy'
526 |         >>> vocab[corpus.compact_to_loose[4]]
527 |         'cold'
528 |         >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
529 |         >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
530 |         >>> sim_shuttle_astro > sim_shuttle_cold
531 |         True
532 |         """
533 |         n_words = len(self.compact_to_loose)
534 |         from gensim.models.keyedvectors import Word2VecKeyedVectors
535 |         model = Word2VecKeyedVectors.load_word2vec_format(filename, binary=True)
536 |         n_dim = model.syn0.shape[1]
537 |         data = np.random.normal(size=(n_words, n_dim)).astype('float32')
538 |         data -= data.mean()
539 |         data += model.syn0.mean()
540 |         data /= data.std()
541 |         data *= model.syn0.std()
542 |         if array is not None:
543 |             data = array
544 |             n_words = data.shape[0]
545 |         keys_raw = list(model.vocab.keys())
546 |         keys = [s.encode('ascii', 'ignore') for s in keys_raw]
547 |         lens = [len(s) for s in model.vocab.keys()]
548 |         choices = np.array(keys, dtype='S')
549 |         lengths = np.array(lens, dtype='int32')
550 |         s, f = 0, 0
551 |         rep0 = lambda w: w
552 |         rep1 = lambda w: w.replace(' ', '_')
553 |         rep2 = lambda w: w.title().replace(' ', '_')
554 |         reps = [rep0, rep1, rep2]
555 |         for compact in np.arange(top):
556 |             loose = self.compact_to_loose.get(compact, None)
557 |             if loose is None:
558 |                 continue
559 |             word = vocab.get(loose, None)
560 |             if word is None:
561 |                 continue
562 |             word = word.strip()
563 |             vector = None
564 |             for rep in reps:
565 |                 clean = rep(word)
566 |                 if clean in model.vocab:
567 |                     vector = model[clean]
568 |                     break
569 |             if vector is None:
570 |                 try:
571 |                     idx = lengths >= len(word) - 3
572 |                     idx &= lengths <= len(word) + 3
573 |                     sel = choices[idx]
574 |                     d = damerau_levenshtein_distance_ndarray(word, sel)
575 |                     choice = np.array(keys_raw)[idx][np.argmin(d)]
576 |                     # choice = difflib.get_close_matches(word, choices)[0]
577 |                     vector = model[choice]
578 |                     print(compact, word, ' --> ', choice)
579 |                 except IndexError:
580 |                     pass
581 |             if vector is None:
582 |                 f += 1
583 |                 continue
584 |             s += 1
585 |             data[compact, :] = vector[:]
586 |         return data, s, f
587 | 
588 |     def compact_to_bow(self, word_compact, max_compact_index=None):
589 |         """ Given a 2D array of compact indices, return the bag of words
590 |         representation where the column is the word index, row is the document
591 |         index, and the value is the number of times that word appears in that
592 |         document.
593 | 
594 |         >>> import numpy.linalg as nl
595 |         >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
596 |         >>> word_indices = np.zeros(50).astype('int32')
597 |         >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
598 |         >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
599 |         >>> word_indices[40:46] = 7  # 'cold' appears 6 times
600 |         >>> word_indices[46:] = 3  # 'hot' appears 4 times
601 |         >>> corpus = Corpus()
602 |         >>> corpus.update_word_count(word_indices)
603 |         >>> corpus.finalize()
604 |         >>> v = corpus.compact_to_bow(word_indices)
605 |         >>> len(v)
606 |         20
607 |         >>> v[:6]
608 |         array([ 5,  0,  0,  4,  0, 10])
609 |         >>> v[19]
610 |         25
611 |         >>> v.sum()
612 |         50
613 |         >>> words = [[0, 0, 0, 3, 4], [1, 1, 1, 4, 5]]
614 |         >>> words = np.array(words)
615 |         >>> bow = corpus.compact_to_bow(words)
616 |         >>> bow.shape
617 |         (2, 6)
618 |         """
619 |         if max_compact_index is None:
620 |             max_compact_index = word_compact.max()
621 | 
622 |         def bincount(x):
623 |             return np.bincount(x, minlength=max_compact_index + 1)
624 |         axis = len(word_compact.shape) - 1
625 |         bow = np.apply_along_axis(bincount, axis, word_compact)
626 |         return bow
627 | 
628 |     def compact_to_coocurrence(self, word_compact, indices, window_size=10):
629 |         """ From an array of compact tokens and aligned array of document indices
630 |         compute (word, word, document) co-occurrences within a moving window.
631 | 
632 |         Arguments
633 |         ---------
634 |         word_compact: int array
635 |             Sequence of compact word tokens.
636 | 
637 |         indices: dict of int arrays
638 |             Maps a name (e.g. 'doc') to an array giving, for every token,
639 |             the index of the document it came from.
640 | 
641 |         window_size: int
642 |             Size of the moving window around each token within which
643 |             co-occurrences are counted.
644 | 
645 |         Returns
646 |         -------
647 |         counts : DataFrame
648 |             A DataFrame with two columns for the word indices A and B,
649 |             one extra column for each index passed in `indices`, and a final
650 |             column with the co-occurrence counts for that key.
651 | 
652 |         >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0])
653 |         >>> doc_idx = np.array([0, 0, 0, 0, 1, 1, 1, 1])
654 |         >>> corpus = Corpus()
655 |         >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx})
656 |         >>> counts.counts.sum()
657 |         24
658 |         >>> counts.query('doc == 0').counts.values
659 |         array([3, 3, 6])
660 |         >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0])
661 |         >>> doc_idx = np.array([0, 0, 0, 1, 1, 2, 2, 2])
662 |         >>> corpus = Corpus()
663 |         >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx})
664 |         >>> counts.counts.sum()
665 |         14
666 |         >>> counts.query('doc == 0').word_index_x.values
667 |         array([0, 1, 1])
668 |         >>> counts.query('doc == 0').word_index_y.values
669 |         array([1, 0, 1])
670 |         >>> counts.query('doc == 0').counts.values
671 |         array([2, 2, 2])
672 |         >>> counts.query('doc == 1').counts.values
673 |         array([1, 1])
674 |         """
675 |         tokens = pd.DataFrame(dict(word_index=word_compact)).reset_index()
676 |         for name, index in indices.items():
677 |             tokens[name] = index
678 |         a, b = tokens.copy(), tokens.copy()
679 |         mask = lambda x: np.prod([x[k + '_x'] == x[k + '_y']
680 |                                   for k in indices.keys()], axis=0)
681 |         group_keys = ['word_index_x', 'word_index_y', ]
682 |         group_keys += [k + '_x' for k in indices.keys()]
683 |         total = []
684 |         a['frame'] = a['index'].copy()
685 |         for frame in range(-window_size, window_size + 1):
686 |             if frame == 0:
687 |                 continue
688 |             b['frame'] = b['index'] + frame
689 |             matches = (a.merge(b, on='frame')
690 |                         .assign(same_doc=mask)
691 |                         .pipe(lambda df: df[df['same_doc'] == 1])
692 |                         .groupby(group_keys)['frame']
693 |                         .count()
694 |                         .reset_index())
695 |             total.append(matches)
696 |         counts = (pd.concat(total)
697 |                     .groupby(group_keys)['frame']
698 |                     .sum()
699 |                     .reset_index()
700 |                     .rename(columns={k + '_x': k for k in indices.keys()})
701 |                     .rename(columns=dict(frame='counts')))
702 |         return counts
703 | 
704 | 
705 | def fast_replace(data, keys, values, skip_checks=False):
706 |     """ Do a search-and-replace in array `data`.
707 | 
708 |     Arguments
709 |     ---------
710 |     data : int array
711 |         Array of integers
712 |     keys : int array
713 |         Array of keys inside of `data` to be replaced
714 |     values : int array
715 |         Array of values that replace the `keys` array
716 |     skip_checks : bool, default=False
717 |         Optionally skip sanity checking the input.
718 | 
719 |     Examples
720 |     --------
721 |     >>> fast_replace(np.arange(5), np.arange(5), np.arange(5)[::-1])
722 |     array([4, 3, 2, 1, 0])
723 |     """
724 |     assert np.allclose(keys.shape, values.shape)
725 |     if not skip_checks:
726 |         msg = "data has elements not in keys"
727 |         assert data.max() <= keys.max(), msg
728 |     sdx = np.argsort(keys)
729 |     keys, values = keys[sdx], values[sdx]
730 |     idx = np.digitize(data, keys, right=True)
731 |     new_data = values[idx]
732 |     return new_data
733 | 
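
Tying several of the methods above together, here is a short sketch on toy data (the sizes, placeholder vocabulary, and threshold are arbitrary). It goes from per-document loose indices to subsampled compact tokens, a flattened token array with aligned document ids, and the compact-index-to-word list.

    import numpy as np

    from lda2vec import Corpus

    # Toy data: 4 documents, 10 word positions each, loose indices under 50
    vocab = {i: 'word_%d' % i for i in range(50)}
    docs_loose = np.random.randint(50, size=(4, 10)).astype('int32')

    corpus = Corpus()
    corpus.update_word_count(docs_loose)
    corpus.finalize()

    compact = corpus.to_compact(docs_loose)
    compact = corpus.subsample_frequent(compact, threshold=1e-2)
    doc_ids = np.arange(4).astype('int32')
    # Drop the special tokens and keep an aligned document id for every word
    flat, (flat_doc_ids,) = corpus.compact_to_flat(compact, doc_ids)
    words = corpus.word_list(vocab)  # compact index -> string, specials first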


--------------------------------------------------------------------------------
/lda2vec/dirichlet_likelihood.py:
--------------------------------------------------------------------------------
 1 | import chainer.functions as F
 2 | from chainer import Variable
 3 | 
 4 | 
 5 | def dirichlet_likelihood(weights, alpha=None):
 6 |     """ Calculate the log likelihood of the observed topic proportions
 7 |     under a Dirichlet prior, returned with a flipped sign for use as a loss.
 8 | 
 9 |     Args:
10 |         weights (chainer.Variable): Unnormalized weight vector. The vector
11 |             will be passed through a softmax function that will map the input
12 |             onto a probability simplex.
13 |         alpha (float): The Dirichlet concentration parameter. Alpha
14 |             greater than 1.0 results in very dense topic weights such
15 |             that each document belongs to many topics. Alpha < 1.0 results
16 |             in sparser topic weights. The default is to set alpha to
17 |             1.0 / n_topics, effectively encoding the prior belief that a
18 |             document belongs to only a few topics at once.
19 | 
20 |     Returns:
21 |         ~chainer.Variable: Output loss variable.
22 |     """
23 |     if type(weights) is Variable:
24 |         n_topics = weights.data.shape[1]
25 |     else:
26 |         n_topics = weights.W.data.shape[1]
27 |     if alpha is None:
28 |         alpha = 1.0 / n_topics
29 |     if type(weights) is Variable:
30 |         log_proportions = F.log_softmax(weights)
31 |     else:
32 |         log_proportions = F.log_softmax(weights.W)
33 |     loss = (alpha - 1.0) * log_proportions
34 |     return -F.sum(loss)
35 | 
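
For illustration only (random weights, arbitrary sizes), the prior term can be evaluated on its own:

    import numpy as np
    from chainer import Variable

    from lda2vec import dirichlet_likelihood

    # Unnormalized topic weights for 5 documents over 3 topics
    weights = Variable(np.random.randn(5, 3).astype('float32'))
    prior_loss = dirichlet_likelihood(weights, alpha=1.0 / 3)
    print(prior_loss.data)  # scalar prior term that can be added to a training loss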


--------------------------------------------------------------------------------
/lda2vec/embed_mixture.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | import chainer
  4 | import chainer.links as L
  5 | import chainer.functions as F
  6 | from chainer import Variable
  7 | 
  8 | 
  9 | def _orthogonal_matrix(shape):
 10 |     # Stolen from blocks:
 11 |     # github.com/mila-udem/blocks/blob/master/blocks/initialization.py
 12 |     M1 = np.random.randn(shape[0], shape[0])
 13 |     M2 = np.random.randn(shape[1], shape[1])
 14 | 
 15 |     # QR decomposition of matrix with entries in N(0, 1) is random
 16 |     Q1, R1 = np.linalg.qr(M1)
 17 |     Q2, R2 = np.linalg.qr(M2)
 18 |     # Correct that NumPy doesn't force diagonal of R to be non-negative
 19 |     Q1 = Q1 * np.sign(np.diag(R1))
 20 |     Q2 = Q2 * np.sign(np.diag(R2))
 21 | 
 22 |     n_min = min(shape[0], shape[1])
 23 |     return np.dot(Q1[:, :n_min], Q2[:n_min, :])
 24 | 
 25 | 
 26 | class EmbedMixture(chainer.Chain):
 27 |     r""" A single document is encoded as a multinomial mixture of latent topics.
 28 |     The mixture is defined on a simplex, so that mixture weights always sum
 29 |     to 100%. The latent topic vectors resemble word vectors whose elements are
 30 |     defined over all real numbers.
 31 | 
 32 |     For example, a single document mix may be :math:`[0.9, 0.1]`, indicating
 33 |     that it is 90% in the first topic, 10% in the second. An example topic
 34 |     vector looks like :math:`[1.5e1, -1.3e0, +3.4e0, -0.2e0]`, which is
 35 |     largely uninterpretable until you measure the words most similar to this
 36 |     topic vector.
 37 | 
 38 |     A single document vector :math:`\vec{e}` is composed as weights :math:`c_j`
 39 |     over topic vectors :math:`\vec{T_j}`:
 40 | 
 41 |     .. math::
 42 | 
 43 |         \vec{e}=\Sigma_{j=0}^{j=n\_topics}c_j\vec{T_j}
 44 | 
 45 |     This is usually paired with regularization on the weights :math:`c_j`.
 46 |     If using a Dirichlet prior with low alpha, these weights will be sparse.
 47 | 
 48 |     Args:
 49 |         n_documents (int): Total number of documents
 50 |         n_topics (int): Number of topics per document
 51 |         n_dim (int): Number of dimensions per topic vector (should match word
 52 |             vector size)
 53 | 
 54 |     Attributes:
 55 |         weights : chainer.links.EmbedID
 56 |             Unnormalized topic weights (:math:`c_j`). To normalize these
 57 |             weights, use `F.softmax(weights)`.
 58 |         factors : chainer.links.Parameter
 59 |             Topic vector matrix (:math:`T_j`)
 60 | 
 61 |     .. seealso:: :func:`lda2vec.dirichlet_likelihood`
 62 |     """
 63 | 
 64 |     def __init__(self, n_documents, n_topics, n_dim, dropout_ratio=0.2,
 65 |                  temperature=1.0):
 66 |         self.n_documents = n_documents
 67 |         self.n_topics = n_topics
 68 |         self.n_dim = n_dim
 69 |         self.dropout_ratio = dropout_ratio
 70 |         factors = _orthogonal_matrix((n_topics, n_dim)).astype('float32')
 71 |         factors /= np.sqrt(n_topics + n_dim)
 72 |         super(EmbedMixture, self).__init__(
 73 |             weights=L.EmbedID(n_documents, n_topics),
 74 |             factors=L.Parameter(factors))
 75 |         self.temperature = temperature
 76 |         self.weights.W.data[...] /= np.sqrt(n_documents + n_topics)
 77 | 
 78 |     def __call__(self, doc_ids, update_only_docs=False):
 79 |         """ Given an array of document integer indices, returns a vector
 80 |         for each document. The vector is composed of topic weights projected
 81 |         onto topic vectors.
 82 | 
 83 |         Args:
 84 |             doc_ids : chainer.Variable
 85 |                 One-dimensional batch vectors of IDs
 86 | 
 87 |         Returns:
 88 |             doc_vector : chainer.Variable
 89 |                 Two-dimensional batch with one n_dim embedding per document.
 90 |         """
 91 |         # (batchsize, ) --> (batchsize, multinomial)
 92 |         proportions = self.proportions(doc_ids, softmax=True)
 93 |         # (batchsize, n_factors) * (n_factors, n_dim) --> (batchsize, n_dim)
 94 |         factors = F.dropout(self.factors(), ratio=self.dropout_ratio)
 95 |         if update_only_docs:
 96 |             factors.unchain_backward()
 97 |         w_sum = F.matmul(proportions, factors)
 98 |         return w_sum
 99 | 
100 |     def proportions(self, doc_ids, softmax=False):
101 |         """ Given an array of document indices, return a vector
102 |         for each document of just the unnormalized topic weights.
103 | 
104 |         Returns:
105 |             doc_weights : chainer.Variable
106 |                 Two dimensional topic weights of each document.
107 |         """
108 |         w = self.weights(doc_ids)
109 |         if softmax:
110 |             size = w.data.shape
111 |             mask = self.xp.random.random_integers(0, 1, size=size)
112 |             y = (F.softmax(w * self.temperature) *
113 |                  Variable(mask.astype('float32')))
114 |             norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y)
115 |             return y / (norm + 1e-7)
116 |         else:
117 |             return w
118 | 
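
A small usage sketch with the Chainer version this repository targets (all sizes are arbitrary): construct the mixture, then look up a batch of document vectors and their topic proportions.

    import numpy as np
    from chainer import Variable

    from lda2vec import EmbedMixture

    n_documents, n_topics, n_dim = 100, 10, 32
    mixture = EmbedMixture(n_documents, n_topics, n_dim)

    doc_ids = Variable(np.arange(8).astype('int32'))           # batch of document indices
    doc_vectors = mixture(doc_ids)                             # (8, n_dim) mixtures of topic vectors
    proportions = mixture.proportions(doc_ids, softmax=True)   # (8, n_topics) normalized weights

Note that `proportions(..., softmax=True)` also applies a random binary mask to the weights, so repeated calls give different results.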


--------------------------------------------------------------------------------
/lda2vec/fake_data.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from numpy.random import random_sample
 3 | 
 4 | 
 5 | def orthogonal_matrix(shape):
 6 |     # Stolen from blocks:
 7 |     # github.com/mila-udem/blocks/blob/master/blocks/initialization.py
 8 |     M1 = np.random.randn(shape[0], shape[0])
 9 |     M2 = np.random.randn(shape[1], shape[1])
10 | 
11 |     # QR decomposition of matrix with entries in N(0, 1) is random
12 |     Q1, R1 = np.linalg.qr(M1)
13 |     Q2, R2 = np.linalg.qr(M2)
14 |     # Correct that NumPy doesn't force diagonal of R to be non-negative
15 |     Q1 = Q1 * np.sign(np.diag(R1))
16 |     Q2 = Q2 * np.sign(np.diag(R2))
17 | 
18 |     n_min = min(shape[0], shape[1])
19 |     return np.dot(Q1[:, :n_min], Q2[:n_min, :])
20 | 
21 | 
22 | def softmax(w):
23 |     # https://gist.github.com/stober/1946926
24 |     w = np.array(w)
25 |     maxes = np.amax(w, axis=1)
26 |     maxes = maxes.reshape(maxes.shape[0], 1)
27 |     e = np.exp(w - maxes)
28 |     dist = e / np.sum(e, axis=1)[:, None]
29 |     return dist
30 | 
31 | 
32 | def sample(values, probabilities, size):
33 |     assert np.allclose(np.sum(probabilities, axis=-1), 1.0)
34 |     bins = np.add.accumulate(probabilities)
35 |     return values[np.digitize(random_sample(size), bins)]
36 | 
37 | 
38 | def fake_data(n_docs, n_words, n_sent_length, n_topics):
39 |     """ Generate latent topic vectors for words and documents
40 |     and then draw a sentence for each document. Each word in a
41 |     document is drawn with probability proportional to the dot product
42 |     of the document and word topic vectors, normalized with a softmax.
43 | 
44 |     Arguments
45 |     ---------
46 |     n_docs : int
47 |         Number of documents
48 |     n_words : int
49 |         Number of words in the vocabulary
50 |     n_sent_length : int
51 |         Number of words to draw for each document
52 |     n_topics : int
53 |         Number of topics that a single document can belong to.
54 | 
55 |     Returns
56 |     -------
57 |     sentences : int array
58 |         Array of word indices of shape (n_docs, n_sent_length).
59 | 
60 |     """
61 |     # These are log ratios for the doc & word topics
62 |     doc_topics = orthogonal_matrix([n_docs, n_topics])
63 |     wrd_topics = orthogonal_matrix([n_topics, n_words])
64 |     # Multiply log ratios and softmax to get prob of word in doc
65 |     doc_to_wrds = softmax(np.dot(doc_topics, wrd_topics))
66 |     # Now sample from doc_to_wrd to get realizations
67 |     indices = np.arange(n_words).astype('int32')
68 |     sentences = []
69 |     for doc_to_wrd in doc_to_wrds:
70 |         words = sample(indices, doc_to_wrd, n_sent_length)
71 |         sentences.append(words)
72 |     sentences = np.array(sentences)
73 |     return sentences.astype('int32')
74 | 
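
For example, a tiny synthetic corpus (all sizes arbitrary) can be drawn like this:

    from lda2vec.fake_data import fake_data

    # 20 documents, a 50-word vocabulary, 10 words per document, 5 topics
    sentences = fake_data(n_docs=20, n_words=50, n_sent_length=10, n_topics=5)
    print(sentences.shape)  # (20, 10) int32 word indices, one row per document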


--------------------------------------------------------------------------------
/lda2vec/negative_sampling.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import six
  3 | 
  4 | from chainer import cuda
  5 | from chainer import function
  6 | from chainer.utils import type_check
  7 | 
  8 | 
  9 | class NegativeSamplingFunction(function.Function):
 10 | 
 11 |     ignore_label = -1
 12 | 
 13 |     def __init__(self, sampler, sample_size):
 14 |         self.sampler = sampler
 15 |         self.sample_size = sample_size
 16 | 
 17 |     def _make_samples(self, t):
 18 |         if hasattr(self, 'samples'):
 19 |             return self.samples  # for testing
 20 | 
 21 |         size = int(t.shape[0])
 22 |         # first one is the positive, and others are sampled negatives
 23 |         samples = self.sampler((size, self.sample_size + 1))
 24 |         samples[:, 0] = t
 25 |         self.samples = samples
 26 | 
 27 |     def check_type_forward(self, in_types):
 28 |         type_check.expect(in_types.size() == 3)
 29 |         x_type, t_type, w_type = in_types
 30 | 
 31 |         type_check.expect(
 32 |             x_type.dtype == numpy.float32,
 33 |             x_type.ndim == 2,
 34 |             t_type.dtype == numpy.int32,
 35 |             t_type.ndim == 1,
 36 |             x_type.shape[0] == t_type.shape[0],
 37 |             w_type.dtype == numpy.float32,
 38 |             w_type.ndim == 2,
 39 |         )
 40 | 
 41 |     def forward_cpu(self, inputs):
 42 |         x, t, W = inputs
 43 |         self.ignore_mask = (t != self.ignore_label)
 44 |         self._make_samples(t)
 45 | 
 46 |         loss = numpy.float32(0.0)
 47 |         for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask],
 48 |                                     self.samples[self.ignore_mask])):
 49 |             w = W[k]
 50 |             f = w.dot(ix)
 51 |             f[0] *= -1  # positive sample
 52 |             loss += numpy.sum(numpy.logaddexp(f, 0))
 53 |         return numpy.array(loss, numpy.float32),
 54 | 
 55 |     def forward_gpu(self, inputs):
 56 |         x, t, W = inputs
 57 |         self.ignore_mask = (t != self.ignore_label)
 58 |         n_in = x.shape[1]
 59 |         self._make_samples(t)
 60 | 
 61 |         self.wx = cuda.elementwise(
 62 |             'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx',
 63 |             '''
 64 |             T f = 0;
 65 |             if (mask == 1){
 66 |                 for (int j = 0; j < c; ++j) {
 67 |                   int x_ind[] = {(i / m), j};
 68 |                   int w_ind[] = {k, j};
 69 |                   f += x[x_ind] * W[w_ind];
 70 |                 }
 71 |             }
 72 |             wx = f;
 73 |             ''',
 74 |             'negative_sampling_wx'
 75 |             )(W, x, self.ignore_mask[:, None], self.samples, n_in,
 76 |               self.sample_size + 1)
 77 | 
 78 |         y = cuda.elementwise(
 79 |             'T wx, int32 c, int32 m', 'T y',
 80 |             '''
 81 |             T f = wx;
 82 |             if (i % m == 0) {
 83 |               f = -f;
 84 |             }
 85 |             T loss;
 86 |             if (f < 0) {
 87 |               loss = __logf(1 + __expf(f));
 88 |             } else {
 89 |               loss = f + __logf(1 + __expf(-f));
 90 |             }
 91 |             y = loss;
 92 |             ''',
 93 |             'negative_sampling_forward'
 94 |         )(self.wx, n_in, self.sample_size + 1)
 95 |         # TODO(okuta): merge elementwise
 96 |         loss = cuda.cupy.sum(y * self.ignore_mask[:, None].astype('float32'))
 97 |         return loss,
 98 | 
 99 |     def backward_cpu(self, inputs, grads):
100 |         x, t, W = inputs
101 |         gloss, = grads
102 | 
103 |         gx = numpy.zeros_like(x)
104 |         gW = numpy.zeros_like(W)
105 |         for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask],
106 |                                     self.samples[self.ignore_mask])):
107 |             w = W[k]
108 |             f = w.dot(ix)
109 | 
110 |             # g == -y * gloss / (1 + exp(yf))
111 |             f[0] *= -1
112 |             g = gloss / (1 + numpy.exp(-f))
113 |             g[0] *= -1
114 | 
115 |             gx[i] = g.dot(w)
116 |             for ik, ig in six.moves.zip(k, g):
117 |                 gW[ik] += ig * ix
118 |         return gx, None, gW
119 | 
120 |     def backward_gpu(self, inputs, grads):
121 |         cupy = cuda.cupy
122 |         x, t, W = inputs
123 |         gloss, = grads
124 | 
125 |         n_in = x.shape[1]
126 |         g = cuda.elementwise(
127 |             'T wx, raw T gloss, int32 m', 'T g',
128 |             '''
129 |             T y;
130 |             if (i % m == 0) {
131 |               y = 1;
132 |             } else {
133 |               y = -1;
134 |             }
135 | 
136 |             g = -y * gloss[0] / (1.0f + __expf(wx * y));
137 |             ''',
138 |             'negative_sampling_calculate_g'
139 |         )(self.wx, gloss, self.sample_size + 1)
140 |         gx = cupy.zeros_like(x)
141 |         cuda.elementwise(
142 |             'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx',
143 |             '''
144 |             int d = i / c;
145 |             T w = 0;
146 |             if (mask == 1){
147 |                 for (int j = 0; j < m; ++j) {
148 |                   w += g[d * m + j] * W[k[d * m + j] * c + i % c];
149 |                 }
150 |             }
151 |             gx = w;
152 |             ''',
153 |             'negative_sampling_calculate_gx'
154 |             )(g, W, self.ignore_mask[:, None], self.samples, n_in,
155 |               self.sample_size + 1, gx)
156 |         gW = cupy.zeros_like(W)
157 |         cuda.elementwise(
158 |             'T g, raw T x, S k, bool mask, int32 c, int32 m',
159 |             'raw T gW',
160 |             '''
161 |             T gi = g;
162 |             if (mask == 1) {
163 |                 for (int j = 0; j < c; ++j) {
164 |                   atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
165 |                 }
166 |             }
167 |             ''',
168 |             'negative_sampling_calculate_gw'
169 |             )(g, x, self.samples, self.ignore_mask[:, None], n_in,
170 |               self.sample_size + 1, gW)
171 |         return gx, None, gW
172 | 
173 | 
174 | def negative_sampling(x, t, W, sampler, sample_size):
175 |     """Negative sampling loss function.
176 | 
177 |     In natural language processing, especially language modeling, the number of
178 |     words in a vocabulary can be very large.
179 |     Therefore, you need to spend a lot of time calculating the gradient of the
180 |     embedding matrix.
181 | 
182 |     By using the negative sampling trick you only need to calculate the
183 |     gradient for a few sampled negative examples.
184 | 
185 |     The objective function is below:
186 | 
187 |     .. math::
188 | 
189 |        f(x, p) = \\log \\sigma(x^\\top w_p) + \\
190 |        k E_{i \\sim P(i)}[\\log \\sigma(- x^\\top w_i)],
191 | 
192 |     where :math:`\sigma(\cdot)` is a sigmoid function, :math:`w_i` is the
193 |     weight vector for the word :math:`i`, and :math:`p` is a positive example.
194 |     It is approximated with :math:`k` examples :math:`N` sampled from
195 |     probability :math:`P(i)`, like this:
196 | 
197 |     .. math::
198 | 
199 |        f(x, p) \\approx \\log \\sigma(x^\\top w_p) + \\
200 |        \\sum_{n \\in N} \\log \\sigma(-x^\\top w_n).
201 | 
202 |     Each sample of :math:`N` is drawn from the word distribution :math:`P(w)`.
203 |     This is calculated as :math:`P(w) = \\frac{1}{Z} c(w)^\\alpha`, where
204 |     :math:`c(w)` is the unigram count of the word :math:`w`, :math:`\\alpha` is
205 |     a hyper-parameter, and :math:`Z` is the normalization constant.
206 | 
207 |     Args:
208 |         x (~chainer.Variable): Batch of input vectors.
209 |         t (~chainer.Variable): Vector of groundtruth labels.
210 |         W (~chainer.Variable): Weight matrix.
211 |         sampler (function): Sampling function. It takes a shape and returns an
212 |             integer array of the shape. Each element of this array is a sample
213 |             from the word distribution. A :class:`~chainer.utils.WalkerAlias`
214 |             object built with the power distribution of word frequency is
215 |             recommended.
216 |         sample_size (int): Number of samples.
217 | 
218 |     See: `Distributed Representations of Words and Phrases and their\
219 |          Compositionality <https://arxiv.org/abs/1310.4546>`_
220 | 
221 |     .. seealso:: :class:`~chainer.links.NegativeSampling`.
222 | 
223 |     """
224 |     return NegativeSamplingFunction(sampler, sample_size)(x, t, W)
225 | 
226 | 
227 | # Monkey-patch the chainer code to replace the negative sampling
228 | # with the one used here
229 | import chainer.links as L
230 | import chainer.functions as F
231 | negative_sampling.patched = True
232 | L.NegativeSampling.negative_sampling = negative_sampling
233 | F.negative_sampling = negative_sampling
234 | 
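
A sketch of calling the loss directly with Chainer's WalkerAlias sampler built from unigram counts; every size and count below is made up, and the snippet assumes the Chainer version this repository targets.

    import numpy as np
    from chainer import Variable
    from chainer.utils import WalkerAlias

    from lda2vec.negative_sampling import negative_sampling

    n_vocab, n_dim, batchsize = 1000, 64, 8
    counts = np.random.randint(1, 100, size=n_vocab)
    probs = counts ** 0.75 / (counts ** 0.75).sum()  # power-law word distribution
    sampler = WalkerAlias(probs).sample              # callable: shape -> sampled word ids

    x = Variable(np.random.randn(batchsize, n_dim).astype('float32'))       # context vectors
    t = Variable(np.random.randint(0, n_vocab, batchsize).astype('int32'))  # target word ids
    W = Variable(np.random.randn(n_vocab, n_dim).astype('float32'))         # output word matrix
    loss = negative_sampling(x, t, W, sampler, sample_size=5)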


--------------------------------------------------------------------------------
/lda2vec/preprocess.py:
--------------------------------------------------------------------------------
  1 | from spacy.en import English
  2 | from spacy.attrs import LOWER, LIKE_URL, LIKE_EMAIL
  3 | 
  4 | import numpy as np
  5 | 
  6 | 
  7 | def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
  8 |              **kwargs):
  9 |     """ Uses spaCy to quickly tokenize text and return an array
 10 |     of indices.
 11 | 
 12 |     This method stores a global NLP object in memory, and takes
 13 |     up to a minute to run the first time. Later calls will have the
 14 |     tokenizer in memory.
 15 | 
 16 |     Parameters
 17 |     ----------
 18 |     texts : list of unicode strings
 19 |         These are the input documents. There can be multiple sentences per
 20 |         item in the list.
 21 |     max_length : int
 22 |         This is the maximum number of words per document. If the document is
 23 |         shorter than this number it will be padded to this length.
 24 |     skip : int, optional
 25 |         Short documents will be padded with this variable up until max_length.
 26 |     attr : int, from spacy.attrs
 27 |         What to transform the token to. Choice must be in spacy.attrs;
 28 |         common choices are LOWER and LEMMA.
 29 |     merge : bool, optional
 30 |         Merge noun phrases into a single token. Useful for turning 'New York'
 31 |         into a single token.
 32 |     nlp : spaCy NLP object, optional
 33 |         A spaCy NLP object. Useful for not reinstantiating the object multiple
 34 |         times.
 35 |     kwargs : dict, optional
 36 |         Any further argument will be sent to the spaCy tokenizer. For extra
 37 |         speed consider setting tag=False, parse=False, entity=False, or
 38 |         n_threads=8.
 39 | 
 40 |     Returns
 41 |     -------
 42 |     arr : 2D array of ints
 43 |         Has shape (len(texts), max_length). Each value represents
 44 |         the word index.
 45 |     vocab : dict
 46 |         Keys are the word indices, and values are the strings. The pad index
 47 |         gets mapped to the empty string.
 48 | 
 49 |     >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
 50 |     >>> arr, vocab = tokenize(sents, 10, merge=True)
 51 |     >>> arr.shape[0]
 52 |     2
 53 |     >>> arr.shape[1]
 54 |     10
 55 |     >>> w2i = {w: i for i, w in vocab.items()}
 56 |     >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
 57 |     True
 58 |     >>> arr[0, 1] == w2i[u'you']
 59 |     True
 60 |     >>> arr[0, -1]  # last word in 0th document is a pad word
 61 |     -2
 62 |     >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
 63 |     True
 64 |     >>> arr[1, 1]  # The URL token is thrown out
 65 |     -2
 66 |     """
 67 |     if nlp is None:
 68 |         nlp = English()
 69 |     data = np.zeros((len(texts), max_length), dtype='int32')
 70 |     data[:] = skip
 71 |     bad_deps = ('amod', 'compound')
 72 |     for row, doc in enumerate(nlp.pipe(texts, **kwargs)):
 73 |         if merge:
 74 |             # from the spaCy blog, an example on how to merge
 75 |             # noun phrases into single tokens
 76 |             for phrase in doc.noun_chunks:
 77 |                 # Only keep adjectives and nouns, e.g. "good ideas"
 78 |                 while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
 79 |                     phrase = phrase[1:]
 80 |                 if len(phrase) > 1:
 81 |                     # Merge the tokens, e.g. good_ideas
 82 |                     phrase.merge(phrase.root.tag_, phrase.text,
 83 |                                  phrase.root.ent_type_)
 84 |                 # Iterate over named entities
 85 |                 for ent in doc.ents:
 86 |                     if len(ent) > 1:
 87 |                         # Merge them into single tokens
 88 |                         ent.merge(ent.root.tag_, ent.text, ent.label_)
 89 |         dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
 90 |         if len(dat) > 0:
 91 |             dat = dat.astype('int32')
 92 |             msg = "Negative indices reserved for special tokens"
 93 |             assert dat.min() >= 0, msg
 94 |             # Replace email and URL tokens
 95 |             idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
 96 |             dat[idx] = skip
 97 |             length = min(len(dat), max_length)
 98 |             data[row, :length] = dat[:length, 0].ravel()
 99 |     uniques = np.unique(data)
100 |     vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
101 |     vocab[skip] = ''
102 |     return data, vocab
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     import doctest
107 |     doctest.testmod()
108 | 
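
For reference, here is a small sketch (illustrative arrays only, not part of the module) of turning the padded index array returned by tokenize() into the doc_lengths and term_frequency arrays that prepare_topics() in lda2vec/topics.py later expects.

import numpy as np

skip = -2
# Stand-in for the (n_docs, max_length) output of tokenize()
arr = np.array([[3, 7, 3, skip, skip],
                [5, 3, skip, skip, skip]], dtype='int32')

mask = arr != skip                    # ignore pad tokens
doc_lengths = mask.sum(axis=1)        # number of real words per document
tokens = arr[mask]
# Re-index the spaCy vocabulary ids to a compact 0..n_words-1 range
uniques, compact = np.unique(tokens, return_inverse=True)
term_frequency = np.bincount(compact)  # overall count of each token
print(doc_lengths, term_frequency)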


--------------------------------------------------------------------------------
/lda2vec/topics.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import requests
  3 | import multiprocessing
  4 | 
  5 | 
  6 | def _softmax(x):
  7 |     e_x = np.exp(x - np.max(x))
  8 |     out = e_x / e_x.sum()
  9 |     return out
 10 | 
 11 | 
 12 | def _softmax_2d(x):
 13 |     y = x - x.max(axis=1, keepdims=True)
 14 |     np.exp(y, out=y)
 15 |     y /= y.sum(axis=1, keepdims=True)
 16 |     return y
 17 | 
 18 | 
 19 | def prob_words(context, vocab, temperature=1.0):
 20 |     """ This calculates a softmax over the vocabulary as a function
 21 |     of the dot product of context and word.
 22 |     """
 23 |     dot = np.dot(vocab, context)
 24 |     prob = _softmax(dot / temperature)
 25 |     return prob
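
As a quick, illustrative check of the temperature argument (random toy vectors, not module code): lower temperatures concentrate the softmax on fewer words, while higher temperatures flatten it.

import numpy as np

rng = np.random.RandomState(0)
vocab_vectors = rng.randn(50, 8)   # toy word vectors
context = rng.randn(8)             # toy context vector
sharp = prob_words(context, vocab_vectors, temperature=0.5)
flat = prob_words(context, vocab_vectors, temperature=10.0)
assert sharp.max() > flat.max()    # low temperature concentrates probability mass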
 26 | 
 27 | 
 28 | def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
 29 |                    doc_lengths=None, term_frequency=None, normalize=False):
 30 |     """ Collects a dictionary of word, document and topic distributions.
 31 | 
 32 |     Arguments
 33 |     ---------
 34 |     weights : float array
 35 |         This must be an array of unnormalized log-odds of document-to-topic
 36 |         weights. Shape should be [n_documents, n_topics]
 37 |     factors : float array
 38 |         Should be an array of topic vectors. These topic vectors live in the
 39 |         same space as word vectors and will be used to find the most similar
 40 |         words to each topic. Shape should be [n_topics, n_dim].
 41 |     word_vectors : float array
 42 |         This must be a matrix of word vectors. Should be of shape
 43 |         [n_words, n_dim]
 44 |     vocab : list of str
 45 |         These must be the strings for words corresponding to
 46 |         indices [0, n_words).
 47 |     temperature : float
 48 |         Used to calculate the probability of a word. Higher
 49 |         temperatures make rare words more likely.
 50 |     doc_lengths : int array
 51 |         An array indicating the number of words in the nth document.
 52 |         Must be of shape [n_documents]. Required by pyLDAvis.
 53 |     term_frequency : int array
 54 |         An array indicating the overall number of times each token appears
 55 |         in the corpus. Must be of shape [n_words]. Required by pyLDAvis.
 56 |     normalize : bool
 57 |         If True, the word vectors are L2-normalized before computing word
 58 |         probabilities.
 56 | 
 57 |     Returns
 58 |     -------
 59 |     data : dict
 60 |         This dictionary is readily consumed by pyLDAvis for topic
 61 |         visualization.
 62 |     """
 63 |     # Map each factor vector to a word
 64 |     topic_to_word = []
 65 |     msg = "Vocabulary size did not match size of word vectors"
 66 |     assert len(vocab) == word_vectors.shape[0], msg
 67 |     if normalize:
 68 |         word_vectors /= np.linalg.norm(word_vectors, axis=1)[:, None]
 69 |     # factors = factors / np.linalg.norm(factors, axis=1)[:, None]
 70 |     for factor_vector in factors:
 71 |         factor_to_word = prob_words(factor_vector, word_vectors,
 72 |                                     temperature=temperature)
 73 |         topic_to_word.append(np.ravel(factor_to_word))
 74 |     topic_to_word = np.array(topic_to_word)
 75 |     msg = "Not all rows in topic_to_word sum to 1"
 76 |     assert np.allclose(np.sum(topic_to_word, axis=1), 1), msg
 77 |     # Collect document-to-topic distributions, e.g. theta
 78 |     doc_to_topic = _softmax_2d(weights)
 79 |     msg = "Not all rows in doc_to_topic sum to 1"
 80 |     assert np.allclose(np.sum(doc_to_topic, axis=1), 1), msg
 81 |     data = {'topic_term_dists': topic_to_word,
 82 |             'doc_topic_dists': doc_to_topic,
 83 |             'doc_lengths': doc_lengths,
 84 |             'vocab': vocab,
 85 |             'term_frequency': term_frequency}
 86 |     return data
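
The returned dictionary is keyed to match pyLDAvis' prepare() keyword arguments. A hedged usage sketch with toy random inputs (illustrative sizes and names only) could look like this; the pyLDAvis lines are left commented since that step needs the optional dependency installed.

import numpy as np
# from lda2vec import topics  # when using the installed package
# import pyLDAvis             # only needed for the visualization step

n_docs, n_topics, n_words, n_dim = 4, 2, 6, 8
weights = np.random.randn(n_docs, n_topics)      # unnormalized doc-topic log-odds
factors = np.random.randn(n_topics, n_dim)       # topic vectors
word_vectors = np.random.randn(n_words, n_dim)   # word vectors
vocab = ['w%i' % i for i in range(n_words)]
doc_lengths = np.random.randint(10, 100, n_docs)
term_frequency = np.random.randint(1, 50, n_words)

data = prepare_topics(weights, factors, word_vectors, vocab,
                      doc_lengths=doc_lengths, term_frequency=term_frequency)
# vis = pyLDAvis.prepare(**data)   # keys match pyLDAvis.prepare's signature
# pyLDAvis.save_html(vis, 'topics.html')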
 87 | 
 88 | 
 89 | def print_top_words_per_topic(data, top_n=10, do_print=True):
 90 |     """ Given a pyLDAvis data array, print out the top words in every topic.
 91 | 
 92 |     Arguments
 93 |     ---------
 94 |     data : dict
 95 |         A dict object that summarizes topic data and has been made using
 96 |         `prepare_topics`.
 97 |     top_n : int
 98 |         Number of top words to report for each topic.
 99 |     do_print : bool
100 |         If True, print the top words for every topic.
101 | 
102 |     Returns
103 |     -------
104 |     lists : list of list of str
105 |         The top words for each topic.
106 |     """
107 |     lists = []
100 |     for j, topic_to_word in enumerate(data['topic_term_dists']):
101 |         top = np.argsort(topic_to_word)[::-1][:top_n]
102 |         prefix = "Top words in topic %i " % j
103 |         top_words = [data['vocab'][i].strip().replace(' ', '_') for i in top]
104 |         msg = ' '.join(top_words)
105 |         if do_print:
106 |             print(prefix, msg)
107 |         lists.append(top_words)
108 |     return lists
109 | 
110 | 
111 | def get_request(url):
112 |     for _ in range(5):
113 |         try:
114 |             return float(requests.get(url, timeout=30).text)
115 |         except (requests.RequestException, ValueError):
116 |             pass
117 |     return None
118 | 
119 | 
120 | def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci',
121 |                                      'umass']):
122 |     """ Requests the topic coherence from AKSW Palmetto
123 | 
124 |     Arguments
125 |     ---------
126 |     lists : list of lists
127 |         A list of lists with one list of top words for each topic.
128 | 
129 |     >>> topic_words = [['cake', 'apple', 'banana', 'cherry', 'chocolate']]
130 |     >>> topic_coherence(topic_words, services=['cv'])
131 |     {(0, 'cv'): 0.5678879445677241}
132 |     """
133 |     url = u'http://palmetto.aksw.org/palmetto-webapp/service/{}?words={}'
134 |     reqs = [url.format(s, '%20'.join(top[:10])) for s in services for top in lists]
135 |     pool = multiprocessing.Pool()
136 |     coherences = pool.map(get_request, reqs)
137 |     pool.close()
138 |     pool.terminate()
139 |     pool.join()
140 |     del pool
141 |     args = [(j, s, top) for s in services for j, top in enumerate(lists)]
142 |     ans = {}
143 |     for ((j, s, t), tc) in zip(args, coherences):
144 |         ans[(j, s)] = tc
145 |     return ans
146 | 
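
Chaining the coherence helper above might look like the sketch below. It requires network access to the Palmetto service, and the topic word lists here are made up for illustration.

# Values come back as floats, or None when the service cannot be reached.
topic_words = [['cake', 'apple', 'banana', 'cherry', 'chocolate'],
               ['data', 'model', 'topic', 'vector', 'word']]
scores = topic_coherence(topic_words, services=['cv'])
for (topic_id, service), value in sorted(scores.items()):
    print("topic %i  %s coherence: %s" % (topic_id, service, value))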


--------------------------------------------------------------------------------
/lda2vec/tracking.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.linear_model import LinearRegression
 3 | 
 4 | 
 5 | class Tracking:
 6 |     def __init__(self, n=5000):
 7 |         """ The tracking class keeps a most-recently-used cache of values.
 8 | 
 9 |         Parameters
10 |         ----------
11 |         n : int
12 |             Number of items to keep.
13 |         """
14 |         self.n = n
15 |         # Keep state per instance so that separate trackers
16 |         # do not share a single class-level cache.
17 |         self.cache = {}
18 |         self.calls = 0
19 |         self.slope = 0.0
19 | 
20 |     def add(self, key, item):
21 |         """ Add an item with a particular to the cache.
22 | 
23 |         >>> tracker = Tracking()
24 |         >>> tracker.add('log_perplexity', 55.6)
25 |         >>> tracker.cache['log_perplexity']
26 |         [55.6]
27 |         >>> tracker.add('log_perplexity', 55.2)
28 |         >>> tracker.add('loss', -12.1)
29 |         >>> tracker.cache['log_perplexity']
30 |         [55.6, 55.2]
31 |         >>> tracker.cache['loss']
32 |         [-12.1]
33 |         """
34 |         if key not in self.cache:
35 |             self.cache[key] = []
36 |         self.cache[key].append(item)
37 |         if len(self.cache[key]) > self.n:
38 |             self.cache[key] = self.cache[key][-self.n:]
39 | 
40 |     def stats(self, key):
41 |         """ Get the statistics for items with a particular key
42 | 
43 |         >>> tracker = Tracking()
44 |         >>> tracker.add('log_perplexity', 55.6)
45 |         >>> tracker.add('log_perplexity', 55.2)
46 |         >>> tracker.stats('log_perplexity')
47 |         (55.400000000000006, 0.19999999999999929, 0.0)
48 |         """
49 |         data = self.cache[key]
50 |         mean = np.mean(data)
51 |         std = np.std(data)
52 |         slope = self.slope
53 |         if self.calls % 100 == 0:
54 |             lr = LinearRegression()
55 |             x = np.arange(len(data)).astype('float32')
56 |             lr.fit(x[:, None], np.array(data))
57 |             self.slope = lr.coef_[0]
58 |         self.calls += 1
59 |         return mean, std, slope
60 | 
61 | if __name__ == "__main__":
62 |     import doctest
63 |     doctest.testmod()
64 | 
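
A sketch of how this tracker might sit inside a training loop; the loop and loss values below are illustrative only.

tracker = Tracking(n=1000)
for iteration in range(500):
    loss = 1.0 / (iteration + 1)          # stand-in for a real training loss
    tracker.add('loss', loss)
    if iteration % 100 == 0:
        mean, std, slope = tracker.stats('loss')
        print("iter %i  loss %.4f +/- %.4f  trend %.2e"
              % (iteration, mean, std, slope))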


--------------------------------------------------------------------------------
/lda2vec/utils.py:
--------------------------------------------------------------------------------
 1 | from chainer import Variable
 2 | import random
 3 | import numpy as np
 4 | 
 5 | 
 6 | def move(xp, *args):
 7 |     for arg in args:
 8 |         if 'float' in str(arg.dtype):
 9 |             yield Variable(xp.asarray(arg, dtype='float32'))
10 |         else:
11 |             assert 'int' in str(arg.dtype)
12 |             yield Variable(xp.asarray(arg, dtype='int32'))
13 | 
14 | 
15 | def most_similar(embeddings, word_index):
16 |     # Dot-product similarity between the query word and every embedding row
17 |     input_vector = embeddings.W.data[word_index]
18 |     similarities = embeddings.W.data.dot(input_vector)
19 |     return similarities
19 | 
20 | 
21 | def chunks(n, *args):
22 |     """Yield successive n-sized chunks from l."""
23 |     # From stackoverflow question 312443
24 |     keypoints = []
25 |     for i in range(0, len(args[0]), n):
26 |         keypoints.append((i, i + n))
27 |     random.shuffle(keypoints)
28 |     for a, b in keypoints:
29 |         yield [arg[a: b] for arg in args]
30 | 
31 | 
32 | class MovingAverage():
33 |     def __init__(self, lastn=100):
34 |         self.points = np.array([])
35 |         self.lastn = lastn
36 | 
37 |     def add(self, x):
38 |         self.points = np.append(self.points, x)
39 | 
40 |     def mean(self):
41 |         return np.mean(self.points[-self.lastn:])
42 | 
43 |     def std(self):
44 |         return np.std(self.points[-self.lastn:])
45 | 
46 |     def get_stats(self):
47 |         return (np.mean(self.points[-self.lastn:]),
48 |                 np.std(self.points[-self.lastn:]))
49 | 
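
A short sketch combining chunks() and MovingAverage() in a minibatch loop; the arrays and sizes are made up for illustration.

import numpy as np

doc_ids = np.arange(10, dtype='int32')
words = np.arange(40, dtype='int32').reshape(10, 4)

avg = MovingAverage(lastn=5)
for batch_docs, batch_words in chunks(3, doc_ids, words):
    # Both arrays are sliced with the same (shuffled) chunk boundaries
    avg.add(float(batch_words.mean()))
print(avg.get_stats())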


--------------------------------------------------------------------------------
/lda2vec_network_publish_text.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/lda2vec_network_publish_text.gif


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --doctest-modules
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Python >= 3.5.2 is required (declared via python_requires in setup.py)
2 | chainer>=5.1.0
3 | numpy>=1.16.0
4 | spacy>=1.9.0
5 | scipy>=1.0.0
6 | pyxDamerauLevenshtein==1.5.2
7 | pyLDAvis==2.1.2
8 | scikit-learn
9 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | from setuptools import find_packages
 3 | import os
 4 | 
 5 | with open('requirements.txt') as f:
 6 |     install_requires = [line.strip() for line in f
 7 |                         if line.strip() and not line.lstrip().startswith('#')]
 7 | 
 8 | # If building on RTD, don't install anything
 9 | if os.environ.get('READTHEDOCS', None) == 'True':
10 |     install_requires = []
11 | 
12 | kw = dict(
13 |     name='lda2vec',
14 |     version='0.1',
15 |     description='Tools for interpreting natural language',
16 |     author='Christopher E Moody',
17 |     author_email='chrisemoody@gmail.com',
18 |     install_requires=install_requires,
19 |     python_requires='>=3.5.2',
20 |     packages=find_packages(),
21 |     url='')
21 | 
22 | setup(**kw)
23 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whcjimmy/lda2vec/359ac33fa24e959ecbf493cd6fe2fa98677d7663/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_dirichlet_likelihood.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import chainer.links as L
 3 | from chainer import Variable
 4 | 
 5 | from lda2vec import dirichlet_likelihood
 6 | 
 7 | 
 8 | def test_concentration():
 9 |     """ Test that alpha > 1.0 on a dense vector has a higher likelihood
10 |     than alpha < 1.0 on a dense vector, and test that a sparse vector
11 |     has the opposite character. """
12 | 
13 |     dense = np.random.randn(5, 10).astype('float32')
14 |     sparse = np.random.randn(5, 10).astype('float32')
15 |     sparse[:, 1:] /= 1e5
16 |     weights = Variable(dense)
17 |     dhl_dense_10 = dirichlet_likelihood(weights, alpha=10.0).data
18 |     dhl_dense_01 = dirichlet_likelihood(weights, alpha=0.1).data
19 |     weights = Variable(sparse)
20 |     dhl_sparse_10 = dirichlet_likelihood(weights, alpha=10.0).data
21 |     dhl_sparse_01 = dirichlet_likelihood(weights, alpha=0.1).data
22 | 
23 |     msg = "Sparse vector has higher likelihood than dense with alpha=0.1"
24 |     assert dhl_sparse_01 > dhl_dense_01, msg
25 |     msg = "Dense vector has higher likelihood than sparse with alpha=10.0"
26 |     assert dhl_dense_10 > dhl_sparse_10, msg
27 | 
28 | 
29 | def test_embed():
30 |     """ Test that embedding is treated like a Variable"""
31 | 
32 |     embed_dense = L.EmbedID(5, 10)
33 |     embed_sparse = L.EmbedID(5, 10)
34 |     embed_dense.W.data[:] = np.random.randn(5, 10).astype('float32')
35 |     embed_sparse.W.data[:] = np.random.randn(5, 10).astype('float32')
36 |     embed_sparse.W.data[:, 1:] /= 1e5
37 |     dhl_dense_01 = dirichlet_likelihood(embed_dense, alpha=0.1).data
38 |     dhl_sparse_01 = dirichlet_likelihood(embed_sparse, alpha=0.1).data
39 | 
40 |     msg = "Sparse vector has higher likelihood than dense with alpha=0.1"
41 |     assert dhl_sparse_01 > dhl_dense_01, msg
42 | 


--------------------------------------------------------------------------------
/tests/test_embed_mixture.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from chainer import Variable
 3 | 
 4 | from lda2vec import EmbedMixture
 5 | 
 6 | 
 7 | def softmax(v):
 8 |     return np.exp(v) / np.sum(np.exp(v))
 9 | 
10 | 
11 | def test_embed_mixture():
12 |     """ Manually test the projection logic between topic weights and vectors"""
13 |     # Ten documents, two topics, five hidden dimensions
14 |     em = EmbedMixture(10, 2, 5, dropout_ratio=0.0)
15 |     doc_ids = Variable(np.arange(1, dtype='int32'))
16 |     doc_vector = em(doc_ids).data
17 |     # weights -- (n_topics)
18 |     weights = softmax(em.weights.W.data[0, :])
19 |     un_weights = softmax(em.unnormalized_weights(doc_ids).data[0, :])
20 |     # (n_hidden) = (n_topics) . (n_topics, n_hidden)
21 |     test = np.sum(weights * em.factors.W.data.T, axis=1)
22 |     assert np.allclose(doc_vector, test)
23 |     assert np.allclose(un_weights, weights)
24 | 


--------------------------------------------------------------------------------
/tests/test_fake_data.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import fake_data
 2 | from chainer import Variable
 3 | from chainer.functions import cross_covariance
 4 | import numpy as np
 5 | 
 6 | 
 7 | def test_orthogonal_matrix():
 8 |     msg = "Orthogonal matrices have equal inverse and transpose"
 9 |     arr = fake_data.orthogonal_matrix([20, 20])
10 |     assert np.allclose(np.linalg.inv(arr), arr.T), msg
11 | 
12 | 
13 | def test_orthogonal_matrix_covariance():
14 |     msg = "Orthogonal matrix should have less covariance than a random matrix"
15 |     orth = Variable(fake_data.orthogonal_matrix([20, 20]).astype('float32'))
16 |     rand = Variable(np.random.randn(20, 20).astype('float32'))
17 |     orth_cc = cross_covariance(orth, orth).data
18 |     rand_cc = cross_covariance(rand, rand).data
19 |     assert orth_cc < rand_cc, msg
20 | 
21 | 
22 | def test_softmax():
23 |     arr = np.random.randn(100, 15)
24 |     probs = fake_data.softmax(arr)
25 |     norms = np.sum(probs, axis=1)
26 |     assert np.allclose(norms, np.ones_like(norms))
27 | 
28 | 
29 | def test_sample():
30 |     n_categories = 10
31 |     idx = 4
32 |     probs = np.zeros(n_categories)
33 |     probs[idx] = 1.0
35 |     values = np.arange(n_categories)
36 |     size = 10
37 |     draws = fake_data.sample(values, probs, size)
38 |     assert np.all(draws == idx)
39 | 
40 | 
41 | def test_fake_data():
42 |     n_docs = 100
43 |     n_words = 10
44 |     n_hidden = 2
45 |     n_sent_length = 5
46 |     data = fake_data.fake_data(n_docs, n_words, n_sent_length, n_hidden)
47 |     assert data.dtype == np.dtype('int32')
48 |     assert data.shape[0] == n_docs
49 |     assert data.shape[1] == n_sent_length
50 |     assert np.max(data) <= n_words - 1
51 | 


--------------------------------------------------------------------------------
/tests/test_preprocess.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import preprocess
 2 | import numpy as np
 3 | import pytest
 4 | import os
 5 | 
 6 | on_ci = os.environ.get('CI', False) == 'true'
 7 | 
 8 | 
 9 | @pytest.mark.skipif(on_ci, reason='SpaCy install fails on TravisCI')
10 | def test_tokenize():
11 |     texts = [u'Do you recall, not long ago']
12 |     texts += [u'We would walk on the sidewalk?']
13 |     arr, vocab = preprocess.tokenize(texts, 10)
14 |     assert arr[0, 0] != arr[0, 1]
15 |     assert arr.shape[0] == 2
16 |     assert arr.shape[1] == 10
17 |     assert arr[0, -1] == -2
18 |     assert arr.dtype == np.dtype('int32')
19 |     first_word = texts[0].split(' ')[0].lower()
20 |     first_lowr = vocab[arr[0, 0]]
21 |     assert first_word == first_lowr
22 | 


--------------------------------------------------------------------------------
/tests/test_topics.py:
--------------------------------------------------------------------------------
 1 | from lda2vec import topics
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
 6 | def exp_entropy(log_p):
 7 |     return -np.nansum(np.exp(log_p + 1e-12) * (log_p + 1e-12))
 8 | 
 9 | 
10 | def test_prob_words():
11 |     context = np.random.randn(3)
12 |     vocab = np.random.randn(10, 3)
13 |     lo = topics.prob_words(context, vocab, temperature=1)
14 |     hi = topics.prob_words(context, vocab, temperature=1e6)
15 |     msg = "Lower temperatures should be lower entropy and more concentrated"
16 |     assert exp_entropy(np.log(lo)) < exp_entropy(np.log(hi)), msg
17 | 
18 | 
19 | def prepare_topics():
20 |     # One document in two topics, unnormalized
21 |     weights = np.array([[0.5, -0.1]])
22 |     # Two topics in 4 dimensions
23 |     factors = np.array([[0.1, 0.1, 0.1, 5.0],
24 |                         [5.1, 0.1, 0.1, 0.0]])
25 |     # Three words in 4 dimensions
26 |     vectors = np.array([[5.0, 0.1, 0.1, 0.1],
27 |                         [0.0, 0.1, 0.1, 5.0],
28 |                         [2.0, 0.1, 0.1, -.9]])
29 |     vocab = ['a', 'b', 'c']
30 |     data = topics.prepare_topics(weights, factors, vectors, vocab)
31 |     return data
32 | 
33 | 
34 | def test_prepare_topics():
35 |     data = prepare_topics()
36 |     t2w = data['topic_term_dists']
37 |     msg = "Topic 0 should be most similar to 2nd token"
38 |     assert t2w[0].argsort()[::-1][0] == 1, msg
39 |     msg = "Topic 1 should be most similar to 1st token"
40 |     assert t2w[1].argsort()[::-1][0] == 0, msg
41 | 
42 | 
43 | def test_print_top_words_per_topic():
44 |     data = prepare_topics()
45 |     top_words = topics.print_top_words_per_topic(data, do_print=False)
46 |     assert len(top_words) == 2
47 |     for words in top_words:
48 |         assert len(words) == 3
49 | 


--------------------------------------------------------------------------------