├── .flake8 ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── Pipfile ├── Pipfile.lock ├── README.rst ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── make.bat ├── modules │ └── API.rst └── readme.rst ├── notebooks ├── Gensim Newsgroup.ipynb ├── GraphLab.ipynb ├── LDA model.ipynb ├── Movie Reviews, AP News, and Jeopardy.ipynb ├── data │ ├── ap_input.json │ ├── jeopardy_input.json │ └── movie_reviews_input.json ├── pyLDAvis └── pyLDAvis_overview.ipynb ├── pyLDAvis ├── __init__.py ├── _display.py ├── _prepare.py ├── _server.py ├── gensim_models.py ├── graphlab.py ├── js │ ├── d3.v5.min.js │ ├── ldavis.css │ ├── ldavis.js │ ├── ldavis.v1.0.0.css │ ├── ldavis.v1.0.0.js │ └── ldavis.v3.0.0.js ├── lda_model.py ├── urls.py └── utils.py ├── pyproject.toml ├── requirements.txt ├── setup.py ├── tests ├── data │ ├── .gitattributes │ ├── export_data.R │ ├── movie_reviews_input.json │ └── movie_reviews_output.json └── pyLDAvis │ ├── test_gensim_models.py │ └── test_prepare.py └── tox.ini /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | exclude = mypy-stubs 4 | ignore = W,E731,F403 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | .eggs 9 | *.egg-info 10 | dist 11 | build 12 | eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib 20 | lib64 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | htmlcov 30 | test/data/*.json 31 | 32 | # Translations 33 | *.mo 34 | 35 | # Mr Developer 36 | .mr.developer.cfg 37 | .project 38 | .pydevproject 39 | 40 | # Complexity 41 | output/*.html 42 | output/*/index.html 43 | 44 | # Sphinx 45 | docs/_build 46 | 47 | # IPython 48 | .ipynb_checkpoints 49 | 50 | # JetBrains 51 | .idea 52 | 53 | # Mac OS 54 | .DS_Store 55 | 56 | ### Python.VirtualEnv Stack ### 57 | # Virtualenv 58 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 59 | [Bb]in 60 | [Ii]nclude 61 | [Ll]ib 62 | [Ll]ib64 63 | [Ll]ocal 64 | [Ss]cripts 65 | pyvenv.cfg 66 | pip-selfcheck.json 67 | 68 | # Pip 69 | # Pipfile 70 | # Pipfile.lock 71 | pypi_package.iml 72 | # pyproject.toml 73 | 74 | # Distribution / packaging 75 | .Python 76 | build/ 77 | develop-eggs/ 78 | dist/ 79 | downloads/ 80 | eggs/ 81 | .eggs/ 82 | lib/ 83 | lib64/ 84 | parts/ 85 | sdist/ 86 | var/ 87 | wheels/ 88 | *.egg-info/ 89 | .installed.cfg 90 | *.egg 91 | MANIFEST 92 | 93 | # Notebooks 94 | notebooks/data 95 | notebooks/newsgroups* 96 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "3.11" 7 | - "3.10" 8 | - "3.9" 9 | 10 | env: 11 | - DEPS="pytest gensim smart_open==2.0.0" 12 | 13 | before_install: 14 | # conda instructions from http://conda.pydata.org/docs/travis.html 15 | - sudo apt-get update 16 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 17 | - bash miniconda.sh -b -p $HOME/miniconda 18 | - source 
"$HOME/miniconda/etc/profile.d/conda.sh" 19 | - hash -r 20 | - conda config --set always_yes yes --set changeps1 no 21 | - conda update -q conda 22 | # Useful for debugging any issues with conda 23 | - conda info -a 24 | - export BOTO_CONFIG=/dev/null 25 | install: 26 | # download JSON data from github since travis does not have git-lfs rolled out yet 27 | - (cd tests/data; curl -L -O https://github.com/bmabey/pyLDAvis/raw/master/tests/data/movie_reviews_input.json && curl -L -O https://github.com/bmabey/pyLDAvis/raw/master/tests/data/movie_reviews_output.json) 28 | - ls -la tests/data/ 29 | - conda create -n testenv --yes python=$TRAVIS_PYTHON_VERSION $DEPS 30 | - conda activate testenv 31 | - pip install . 32 | 33 | # command to run tests, e.g. pytest 34 | script: 35 | - pytest 36 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Ben Mabey 9 | 10 | Contributors 11 | ------------ 12 | 13 | * Paul English - JS and CSS fixes and improvements. 14 | * Mark Susol - Python and JSS improvements. 15 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/bmabey/pyLDAvis/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" 28 | is open to whoever wants to implement it. 29 | 30 | Implement Features 31 | ~~~~~~~~~~~~~~~~~~ 32 | 33 | Look through the GitHub issues for features. Anything tagged with "feature" 34 | is open to whoever wants to implement it. 35 | 36 | Write Documentation 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | pyLDAvis could always use more documentation, whether as part of the 40 | official pyLDAvis docs, in docstrings, or even on the web in blog posts, 41 | articles, and such. 42 | 43 | Submit Feedback 44 | ~~~~~~~~~~~~~~~ 45 | 46 | The best way to send feedback is to file an issue at https://github.com/bmabey/pyLDAvis/issues. 47 | 48 | If you are proposing a feature: 49 | 50 | * Explain in detail how it would work. 51 | * Keep the scope as narrow as possible, to make it easier to implement. 52 | * Remember that this is a volunteer-driven project, and that contributions 53 | are welcome :) 54 | 55 | Get Started! 56 | ------------ 57 | 58 | Ready to contribute? Here's how to set up `pyLDAvis` for local development. 59 | 60 | 1. Fork the `pyLDAvis` repo on GitHub. 61 | 2. Clone your fork locally:: 62 | 63 | $ git clone git@github.com:your_name_here/pyLDAvis.git 64 | 65 | 3. Install your local copy into a virtualenv. 
Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 66 | 67 | $ mkvirtualenv pyLDAvis 68 | $ cd pyLDAvis/ 69 | $ python setup.py develop 70 | 71 | 4. Create a branch for local development:: 72 | 73 | $ git checkout -b name-of-your-bugfix-or-feature 74 | 75 | Now you can make your changes locally. 76 | 77 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 78 | 79 | $ python -m flake8 pyLDAvis tests 80 | $ python -m pytest 81 | $ python -m tox 82 | 83 | To get flake8 and tox, just pip install them into your virtualenv. 84 | 85 | 6. Commit your changes and push your branch to GitHub:: 86 | 87 | $ git add . 88 | $ git commit -m "Your detailed description of your changes." 89 | $ git push origin name-of-your-bugfix-or-feature 90 | 91 | 7. Submit a pull request through the GitHub website. 92 | 93 | Pull Request Guidelines 94 | ----------------------- 95 | 96 | Before you submit a pull request, check that it meets these guidelines: 97 | 98 | 1. The pull request should include tests. 99 | 2. If the pull request adds functionality, the docs should be updated. Put 100 | your new functionality into a function with a docstring, and add the 101 | feature to the list in README.rst. 102 | 3. The pull request should work for Python 3.9, 3.10, 3.11, and for PyPI. Check 103 | https://travis-ci.org/bmabey/pyLDAvis/pull_requests 104 | and make sure that the tests pass for all supported Python versions. 105 | 106 | Maintainers 107 | ------------ 108 | 109 | Ready to publish a new version to PyPi? Here's how the workflow to follow. 110 | 111 | 1. Ensure you are in the pyLDAvis directory 112 | 2. Pipenv workflow:: 113 | 114 | $ pipenv install -e . 115 | $ pipenv install --dev 116 | $ pipenv shell 117 | (pyLDAvis) $ flake8 pyLDAvis tests 118 | (pyLDAvis) $ pytest 119 | (pyLDAvis) $ tox 120 | 121 | -- TestPyPi 122 | (pyLDAvis) $ python setup.py sdist bdist_wheel 123 | (pyLDAvis) $ twine check dist/* 124 | (pyLDAvis) $ twine upload --repository testpypi dist/* 125 | 126 | -- Publish 127 | (pyLDAvis) $ twine upload --repository-url https://upload.pypi.org/legacy/ dist/* 128 | (pyLDAvis) $ rm dist/* 129 | 130 | Note: MacOS Big Sur is both 10.16 and 11.0 – it’s official (https://eclecticlight.co/2020/07/21/big-sur-is-both-10-16-and-11-0-its-official/) :: 131 | 132 | $ export SYSTEM_VERSION_COMPAT=1 133 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | 3.4.1 (2023-04-23) 7 | ~~~~~~~~~~~~~~~~~~ 8 | * Pandas 2.x release, the drop shall use .drop(saliency, axis=1) #247 9 | 10 | 3.4.0 (2023-02-12) 11 | ~~~~~~~~~~~~~~~~~~ 12 | 13 | * Adding testing for Python 3.10, 3.11, move default version to Python 3.10. 14 | * Tox testing: No module named 'sklearn.manifold'; 'sklearn' is not a package. 15 | * Rename sklearn.py to lda_model.py. 16 | * ValueError: The parameter init="pca" cannot be used with metric="precomputed". 17 | * Update sklearn.py #239. 18 | * fixes error of get_feature_names removal #235. 19 | * Remove "sklearn" from requirements #234 20 | * Bump joblib from 1.0.1 to 1.2.0 dependencies #231. 21 | * Fixing for small number of topics #229. 22 | * Bump numpy from 1.20.1 to 1.22.0 dependenciesv #227. 23 | * License correction #224. 24 | * Fix background color in Notebooks with dark themes #222. 
25 | * Start building Wheels alongside sdist #221 26 | 27 | 3.3.1 (2021-03-24) 28 | ~~~~~~~~~~~~~~~~~~ 29 | 30 | * Restored x-axis scale labels for term bars #200. 31 | * import pyLDAvis.gensim_models as gensimvis 32 | * Deleted orphaned files. 33 | * Update .gitignore for notebooks/* models, data. 34 | 35 | 3.3.0 (2021-03-16) 36 | ~~~~~~~~~~~~~~~~~~ 37 | 38 | * Python 3.7, 3.8, 3.9: dropped 2.7, 3.5, 3.6 support. 39 | * RuntimeWarning: divide by zero encountered in log #174. 40 | * Deprecation warning due to invalid escape sequences #166. 41 | * `python setup.py test` is deprecated. 42 | * FutureWarning: pandas.util.testing is deprecated. 43 | 44 | 3.2.2 (2021-02-19) 45 | ~~~~~~~~~~~~~~~~~~ 46 | 47 | * Fix browser caching of cdn.jsdelivr files. 48 | * update ldavis.js to match ldavis.3.0.0.js 49 | 50 | 3.2.1 (2021-02-17) 51 | ~~~~~~~~~~~~~~~~~~ 52 | 53 | * Fix missing labels and other D3.V3 to D3.V5 issues. 54 | * Revert the indexing changes i.e. (startIndex - 1). 55 | * Removed some unused GLOBALs. 56 | 57 | 3.2.0 (2021-02-10) 58 | ~~~~~~~~~~~~~~~~~~ 59 | 60 | * Switches the CDN to cdn.jsdelivr to get accurate mime types. 61 | 62 | 3.1.0 (2021-02-07) 63 | ~~~~~~~~~~~~~~~~~~ 64 | 65 | * Replaces rawgit CDN since it has been sunset. 66 | 67 | 3.0.0 (2021-02-06) 68 | ~~~~~~~~~~~~~~~~~~ 69 | 70 | * Upgrades D3 code to use the d3.v5. 71 | 72 | 2.1.2 (2018-02-06) 73 | ~~~~~~~~~~~~~~~~~~ 74 | 75 | * Fix pandas deprecation warnings. 76 | 77 | 2.1.1 (2017-02-13) 78 | ~~~~~~~~~~~~~~~~~~ 79 | 80 | * Fix `gensim` module to work with a sparse corpus #82. 81 | 82 | 2.1.0 (2016-06-30) 83 | ~~~~~~~~~~~~~~~~~~ 84 | 85 | * Added missing dependency on `scipy`. 86 | * Fixed term sorting that was incompatible with pandas 0.19.x. 87 | 88 | 2.0.0 (2016-06-30) 89 | ~~~~~~~~~~~~~~~~~~ 90 | 91 | * Removed dependency on `scikit-bio` by adding an internal PCoA implementation. 92 | * Added helper functions for scikit-learn LDA model! See the new notebook for details. 93 | * Extended gensim helper functions to work with HDP models. 94 | * Added scikit-learn's Multi-dimensional scaling as another MDS option when scikit-learn is installed. 95 | 96 | 1.5.1 (2016-04-15) 97 | ~~~~~~~~~~~~~~~~~~ 98 | 99 | * Add sort_topics option to prepare function to allow disabling of topic re-ordering. 100 | 101 | 102 | 1.5.0 (2016-02-20) 103 | ~~~~~~~~~~~~~~~~~~ 104 | 105 | * Red Bar Width bug fix 106 | 107 | In some cases, the widths of the red topic-term bars did not decrease (as they should have) from term \#1 to 108 | term \#R under the relevance ranking with $\lambda = 1$. In other words, when $\lambda = 1$, there were topics 109 | in which a narrow red bar was displayed above a wider red bar, which should never happen. The issue had to do 110 | with the way topic-term bar widths are computed, and is discussed in detail in #32. 111 | 112 | 113 | In the end, we implemented a quick fix in which we compute term frequencies implicitly, rather than using those 114 | supplied in the createJSON() function. The upside is that the red bar widths are now explicitly controlled to 115 | produce the correct visualization. The downside is that the blue bar widths do not necessarily match the 116 | user-supplied term frequencies exactly -- in fact, the new version of LDAvis ignores the user-supplied term 117 | frequencies entirely. In a few experiments, the differences are small, and decrease (as a proportion of the true 118 | term frequencies) as the true term frequencies increase. 
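(For context: the relevance ranking referred to above is the one defined in the LDAvis paper by Sievert & Shirley (2014), which the README links to. A sketch of that definition, written in the paper's notation rather than taken from this codebase, is

.. math::

   r(w, t \mid \lambda) = \lambda \log(\phi_{wt}) + (1 - \lambda)\log\frac{\phi_{wt}}{p_w}

where $\phi_{wt}$ is the probability of term $w$ within topic $t$ and $p_w$ is the marginal probability of term $w$ in the corpus. With $\lambda = 1$ the ranking reduces to sorting terms by $\phi_{wt}$, which is why the red topic-term bars should never widen going from term \#1 down to term \#R.)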
119 | 120 | 121 | 122 | 1.4.1 (2016-01-31) 123 | ~~~~~~~~~~~~~~~~~~ 124 | 125 | * Included requirements.txt in MANIFEST to (hopefully) fix bad release. 126 | 127 | 1.4.0 (2016-01-31) 128 | ~~~~~~~~~~~~~~~~~~ 129 | 130 | * Updated to newest version of skibio for PCoA mds. 131 | * requirements.txt cleanup 132 | * New 'tsne' option for prepare, see docs and notebook for more info. 133 | 134 | 135 | 1.3.5 (2015-12-18) 136 | ~~~~~~~~~~~~~~~~~~ 137 | 138 | * Add explicit version info for scikit-bio since the API has changed. 139 | 140 | 141 | 1.3.4 (2015-11-16) 142 | ~~~~~~~~~~~~~~~~~~ 143 | 144 | * Gensim Python typo fix in imports. :/ 145 | 146 | 1.3.3 (2015-11-13) 147 | ~~~~~~~~~~~~~~~~~~ 148 | 149 | * Gensim Python 2.x fix for absolute imports. 150 | 151 | 1.3.2 (2015-11-09) 152 | ~~~~~~~~~~~~~~~~~~ 153 | 154 | * Gensim prepare 25% speed increase, thanks @mattilyra! 155 | * Pandas deprecation warnings are now gone. 156 | * Pandas v0.17 is now being used. 157 | 158 | 1.3.1 (2015-11-02) 159 | ~~~~~~~~~~~~~~~~~~ 160 | 161 | * Updates gensim and other logic to be python 3 compatible. 162 | 163 | 1.3.0 (2015-08-20) 164 | ~~~~~~~~~~~~~~~~~~ 165 | 166 | * Fixes gensim logic and makes it more robust. 167 | * Faster graphlab processing. 168 | * kargs for gensim and graphlab are passed down to underlying prepare function. 169 | * Requires recent version of pandas to avoid problems with our use of the newer `DataFrame.to_dict` API. 170 | 171 | 1.2.0 (2015-06-13) 172 | ~~~~~~~~~~~~~~~~~~ 173 | 174 | * Updates gensim logic to be clearer and work with Python 3.x. 175 | 176 | 1.1.0 (2015-06-02) 177 | ~~~~~~~~~~~~~~~~~~ 178 | 179 | * Fixes bug with GraphLab function that was producing bogus visualizations. 180 | 181 | 1.0.0 (2015-05-29) 182 | ~~~~~~~~~~~~~~~~~~ 183 | 184 | * First release on PyPI. Faithful port of R version with IPython support and helper functions for GraphLab & gensim. 185 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Ben Mabey 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | * Neither the name of pyLDAvis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | include requirements.txt 7 | 8 | recursive-include tests *.py 9 | recursive-include notebooks *.ipynb 10 | recursive-exclude notebooks/.ipynb_checkpoints * 11 | recursive-exclude * __pycache__ 12 | recursive-exclude * *.py[co] 13 | include tests/data/movie_reviews_input.json tests/data/movie_reviews_output.json tests/data/export_data.R 14 | 15 | recursive-include docs *.rst conf.py Makefile make.bat 16 | recursive-include pyLDAvis *.py *.js *.css 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | @echo "coverage - check code coverage quickly with the default Python" 12 | @echo "docs - generate Sphinx HTML documentation, including API docs" 13 | @echo "release - package and upload a release" 14 | @echo "dist - package" 15 | @echo "install - install the package to the active Python's site-packages" 16 | 17 | clean: clean-build clean-pyc clean-test 18 | 19 | clean-build: 20 | rm -rf build/ 21 | rm -rf dist/ 22 | rm -rf .eggs/ 23 | find . -name '*.egg-info' -exec rm -rf {} + 24 | find . -name '*.egg' -exec rm -rf {} + 25 | 26 | clean-pyc: 27 | find . -name '*.pyc' -exec rm -f {} + 28 | find . -name '*.pyo' -exec rm -f {} + 29 | find . -name '*~' -exec rm -f {} + 30 | find . 
-name '__pycache__' -exec rm -rf {} + 31 | 32 | clean-test: 33 | rm -rf .tox/ 34 | rm -f .coverage 35 | rm -rf htmlcov/ 36 | 37 | lint: 38 | flake8 pyLDAvis tests 39 | 40 | test: 41 | pytest 42 | 43 | test-all: 44 | tox 45 | 46 | coverage: 47 | coverage run --source pyLDAvis setup.py test 48 | coverage report -m 49 | coverage html 50 | open htmlcov/index.html 51 | 52 | docs: 53 | rm -f docs/pyLDAvis.rst 54 | rm -f docs/modules.rst 55 | sphinx-apidoc -o docs/ pyLDAvis 56 | $(MAKE) -C docs clean 57 | $(MAKE) -C docs html 58 | open docs/_build/html/index.html 59 | 60 | release: clean 61 | python setup.py sdist upload 62 | #python setup.py bdist_wheel upload 63 | 64 | dist: clean 65 | python setup.py sdist 66 | #python setup.py bdist_wheel 67 | ls -l dist 68 | 69 | install: clean 70 | python setup.py install 71 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pyLDAvis = {editable = true, path = "."} 8 | numpy = ">=1.24.2" 9 | scipy = "*" 10 | pandas = ">=2.0.0" 11 | joblib = ">=1.2.0" 12 | numexpr = "*" 13 | funcy = "*" 14 | scikit-learn = ">=1.0.0" 15 | gensim = "*" 16 | Jinja2 = "*" 17 | reproducer = {editable = true, path = "."} 18 | 19 | [dev-packages] 20 | pytest = ">=3.9" 21 | 22 | [requires] 23 | python_version = "3.11" 24 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "e27e982c64fe58fb5878295a2e6ca15753d1a5dab60d17674314b8a04a450ee4" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.9" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "funcy": { 20 | "hashes": [ 21 | "sha256:1d3fc5d42cf7564a6b2be04042d0df7a50c77903cf760a34786d0c9ebd659b25", 22 | "sha256:2775409b7dc9106283f1224d97e6df5f2c02e7291c8caed72764f5a115dffb50" 23 | ], 24 | "index": "pypi", 25 | "version": "==1.16" 26 | }, 27 | "gensim": { 28 | "hashes": [ 29 | "sha256:1932c257de4eccbb64cc40d46e8577a25f5f47b94b96019a969fb36150f11d15", 30 | "sha256:1ff0171ec5b7473facb1441426a6b41e8ec4599fd62e1820868ab965804e3d4c", 31 | "sha256:36222dbf89aa57909131fc79654e92c918e1075b9ebd00532c3d23b76b6ce8eb", 32 | "sha256:39139be83c3128e234216189a094f959ac2b052a808911b0b56d980d5f96981f", 33 | "sha256:3e34cfe767a8db52812826136d6e94863081fd1456726bd1ff40b4e25965fbb5", 34 | "sha256:58d9ab570b225f3aafec55286864560a25701f7446af9dbc0ad51aa5f61712fa", 35 | "sha256:615d2a57efeaf97cd847e95f83b2fc168f9d22f4922aaa9cda9350f05648560c", 36 | "sha256:66a9574f9f2bbf8fd8e6d7a120443793b96bfd4c153b41f266b6299aa3362de7", 37 | "sha256:7bbc3d6c80c9fd97b89dfee2f44562b75542f72141f5fbacb91334597485f55c", 38 | "sha256:804e18d76d9034bc70f93b8407680b7956c99f03914e85e31dd8b296623dc0ed", 39 | "sha256:8bd89b791e6729a9dd1c345d32fc9e2ba51348cf54fbaa8d49259eb92e719084", 40 | "sha256:8c6a4b271f4d554fdf14b9cb34d4da6cde7084f7f581c5c6dd5fcac648db35be", 41 | "sha256:8d0bf4074ff467a0b22c5e4cecfb7d12afcca6246dac515d5a06ab7e4c775f8e", 42 | "sha256:d4b4ca5d1408e2d89e0ac45cd8a432abf747d5b62eea68e6dccacefa03d759c9", 43 | "sha256:d812dcdf2bfaf527a09ecf867303c117d6f497233db08f1d8209ffb71aaf3fdb", 44 | 
"sha256:ea47999c7da97472fce8f0831a63e4089d85539c8e0cdb895f087aea1eed4a3b", 45 | "sha256:f6133b0f76d0c262231465936cded8920df88edf079df1e7bfe95f049ad8301e" 46 | ], 47 | "index": "pypi", 48 | "version": "==4.1.2" 49 | }, 50 | "jinja2": { 51 | "hashes": [ 52 | "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", 53 | "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" 54 | ], 55 | "index": "pypi", 56 | "version": "==3.0.3" 57 | }, 58 | "joblib": { 59 | "hashes": [ 60 | "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385", 61 | "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018" 62 | ], 63 | "index": "pypi", 64 | "version": "==1.2.0" 65 | }, 66 | "markupsafe": { 67 | "hashes": [ 68 | "sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed", 69 | "sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc", 70 | "sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2", 71 | "sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460", 72 | "sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7", 73 | "sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0", 74 | "sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1", 75 | "sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa", 76 | "sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03", 77 | "sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323", 78 | "sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65", 79 | "sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013", 80 | "sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036", 81 | "sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f", 82 | "sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4", 83 | "sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419", 84 | "sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2", 85 | "sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619", 86 | "sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a", 87 | "sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a", 88 | "sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd", 89 | "sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7", 90 | "sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666", 91 | "sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65", 92 | "sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859", 93 | "sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625", 94 | "sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff", 95 | "sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156", 96 | "sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd", 97 | "sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba", 98 | "sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f", 99 | "sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1", 100 | "sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094", 101 | "sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a", 102 | 
"sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513", 103 | "sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed", 104 | "sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d", 105 | "sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3", 106 | "sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147", 107 | "sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c", 108 | "sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603", 109 | "sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601", 110 | "sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a", 111 | "sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1", 112 | "sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d", 113 | "sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3", 114 | "sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54", 115 | "sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2", 116 | "sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6", 117 | "sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58" 118 | ], 119 | "markers": "python_version >= '3.7'", 120 | "version": "==2.1.2" 121 | }, 122 | "numexpr": { 123 | "hashes": [ 124 | "sha256:05b97b19e864a5d1a0b106933b1637233a2444fd375685bead264a818f847ef2", 125 | "sha256:0732c9989bff8568ee78fa461f3698166d4ac79363860be22ff49eae1dcd15e7", 126 | "sha256:23718ac5f2ebae995f5899509624781b375da568f2b645b5d1fd6dbb17f41a56", 127 | "sha256:24cdb8c0e93f31387a4c2ddd09a687874c006e6139fd68bcf77b96e51d17cb01", 128 | "sha256:2e14b44a79030fbe25f16393162a4d21ced14056fac49ff73856f661a78db731", 129 | "sha256:3daa55515ee3cb40bf5ab8263c0c13fff8d484d64d107a9c414e8ca151dc08a6", 130 | "sha256:43616529f9b7d1afc83386f943dc66c4da5e052f00217ba7e3ad8dd1b5f3a825", 131 | "sha256:4527a0a7b04f858a73c348c9c4ce8441b7a54965db74a32ba808c51d9d53b7cd", 132 | "sha256:51277a530a353e0f94665b44615249d7e7075f0c73f78d4743da632fc44bc648", 133 | "sha256:5223a519f48754dd350723d9fbcadbcd0476881bc954a281a09a6538ecabfc27", 134 | "sha256:5d6dbf050a9b8ebff0b7706ebeaf1cd57d64ef4dfe61aef3790851b481daf6b5", 135 | "sha256:5f4122bd58aa4e4891814c2f72bd47b1cdb202c9d863ea96c5394dffb72a16e2", 136 | "sha256:602df9b5c500d0a887dc96b4cfd16fb60ae7ef39ccd6f013f4df2ee11ae70553", 137 | "sha256:618259287b8b81a352a7d088ad03fe3b393a842ccb45f0b3cfc6a712d41b7595", 138 | "sha256:74df157ab4577bfc83c14f4e39d14781b06ade5406d3efef049f90c88d8c28ea", 139 | "sha256:785065819ce98e3d3dd853794244e0de190d7ba36ab42c8fd79e0e9cd40de7af", 140 | "sha256:7ab40e2b438f4ea2ea8234c63639cdf5072cdb29d0ac521307854efe0281a567", 141 | "sha256:833a363c86266424349467b53f4060f77aaa7ec03c1e6f38c54e69c65ceebf30", 142 | "sha256:8b76bcca930cbf0db0fe98b6a51d6286dff77d525dad670cb7750e29a138d434", 143 | "sha256:8fc23a49f4266c24a23310c0cb92ff54c4b4f535635f90372b3a2d5cb1f83329", 144 | "sha256:90ea6d5813e1906bb203ef220a600b30d83e75aea2607a7e7037cceae9e93346", 145 | "sha256:97753d17d1ea39e082b1907b99b6cb63cac7d1dfa512d2ff5079eb7bfab1ea88", 146 | "sha256:99472731bc1111f5d73285dd2a4c228b5bfb176f785a567872e0fbfec6584f2b", 147 | "sha256:a3f1cec8657bd3920869a2ea27f98d68ac3000334f366d844a9670ae671fe4bd", 148 | "sha256:a8e0e48d72391543b68d0471fac2e31c614efdce4036e2a0a8a182fde1edb0e0", 149 | "sha256:aae4ce158da53ebc47df053de90fed9d0d51fa0df8cc481abc8a901ea4f0cec7", 150 | 
"sha256:b0a9124a66a61b05ea84b832358d6aa5561c30e69b4dcaea819b296f4f025f89", 151 | "sha256:c2605e5665b0d7362e0d2b92683387c12e15c7440daf702a7637f7502a967810", 152 | "sha256:c9218aeb76717768f617362b72a87e9219da95ba7cdec0732ccecc4a4719124c", 153 | "sha256:c978c49bd9dded6a4ba6b3501e3a34e3aba9312cbb7d800bed7ac6fcd2d5949d", 154 | "sha256:d14ae09318ad86579e35aacf1596c83d5db1139cd68615967ee23605e11f5d82", 155 | "sha256:d423441593a952ac56d1f774068b81fb22f514fb68873c066578345a6af74c0d", 156 | "sha256:dc707486b1f3dda18a39bc4d06a0a09d3c0ea47bd6b99fdb98adb26d1277253f", 157 | "sha256:dfdca3d1f4c83fa8fd3ee7573110efd13e838543896641b89367622ec6a67eb4", 158 | "sha256:e000570a6a704c594832ff4fc45f18864b721b7b444a185b365dbb03d3fe3abb", 159 | "sha256:e985026e64350dd59fd91a09bc364edf706d58b12e01362ddfa63829878bd434", 160 | "sha256:eeeb6325df6cf3f3ab7d9dbabf3bc03ac88b7e2f2aed21419c31e23c3048dce1", 161 | "sha256:f9df0a74d39616fd011071c5850418f244bac414f24ed55c00dcf3c5385e8374" 162 | ], 163 | "index": "pypi", 164 | "version": "==2.7.3" 165 | }, 166 | "numpy": { 167 | "hashes": [ 168 | "sha256:0cfe07133fd00b27edee5e6385e333e9eeb010607e8a46e1cd673f05f8596595", 169 | "sha256:11a1f3816ea82eed4178102c56281782690ab5993251fdfd75039aad4d20385f", 170 | "sha256:2762331de395739c91f1abb88041f94a080cb1143aeec791b3b223976228af3f", 171 | "sha256:283d9de87c0133ef98f93dfc09fad3fb382f2a15580de75c02b5bb36a5a159a5", 172 | "sha256:3d22662b4b10112c545c91a0741f2436f8ca979ab3d69d03d19322aa970f9695", 173 | "sha256:41388e32e40b41dd56eb37fcaa7488b2b47b0adf77c66154d6b89622c110dfe9", 174 | "sha256:42c16cec1c8cf2728f1d539bd55aaa9d6bb48a7de2f41eb944697293ef65a559", 175 | "sha256:47ee7a839f5885bc0c63a74aabb91f6f40d7d7b639253768c4199b37aede7982", 176 | "sha256:5a311ee4d983c487a0ab546708edbdd759393a3dc9cd30305170149fedd23c88", 177 | "sha256:5dc65644f75a4c2970f21394ad8bea1a844104f0fe01f278631be1c7eae27226", 178 | "sha256:6ed0d073a9c54ac40c41a9c2d53fcc3d4d4ed607670b9e7b0de1ba13b4cbfe6f", 179 | "sha256:76ba7c40e80f9dc815c5e896330700fd6e20814e69da9c1267d65a4d051080f1", 180 | "sha256:818b9be7900e8dc23e013a92779135623476f44a0de58b40c32a15368c01d471", 181 | "sha256:a024181d7aef0004d76fb3bce2a4c9f2e67a609a9e2a6ff2571d30e9976aa383", 182 | "sha256:a955e4128ac36797aaffd49ab44ec74a71c11d6938df83b1285492d277db5397", 183 | "sha256:a97a954a8c2f046d3817c2bce16e3c7e9a9c2afffaf0400f5c16df5172a67c9c", 184 | "sha256:a97e82c39d9856fe7d4f9b86d8a1e66eff99cf3a8b7ba48202f659703d27c46f", 185 | "sha256:b55b953a1bdb465f4dc181758570d321db4ac23005f90ffd2b434cc6609a63dd", 186 | "sha256:bb02929b0d6bfab4c48a79bd805bd7419114606947ec8284476167415171f55b", 187 | "sha256:bece0a4a49e60e472a6d1f70ac6cdea00f9ab80ff01132f96bd970cdd8a9e5a9", 188 | "sha256:e41e8951749c4b5c9a2dc5fdbc1a4eec6ab2a140fdae9b460b0f557eed870f4d", 189 | "sha256:f71d57cc8645f14816ae249407d309be250ad8de93ef61d9709b45a0ddf4050c" 190 | ], 191 | "index": "pypi", 192 | "version": "==1.22.0" 193 | }, 194 | "pandas": { 195 | "hashes": [ 196 | "sha256:003ba92db58b71a5f8add604a17a059f3068ef4e8c0c365b088468d0d64935fd", 197 | "sha256:10e10a2527db79af6e830c3d5842a4d60383b162885270f8cffc15abca4ba4a9", 198 | "sha256:22808afb8f96e2269dcc5b846decacb2f526dd0b47baebc63d913bf847317c8f", 199 | "sha256:2d1dc09c0013d8faa7474574d61b575f9af6257ab95c93dcf33a14fd8d2c1bab", 200 | "sha256:35c77609acd2e4d517da41bae0c11c70d31c87aae8dd1aabd2670906c6d2c143", 201 | "sha256:372d72a3d8a5f2dbaf566a5fa5fa7f230842ac80f29a931fb4b071502cf86b9a", 202 | "sha256:42493f8ae67918bf129869abea8204df899902287a7f5eaf596c8e54e0ac7ff4", 203 | 
"sha256:4acc28364863127bca1029fb72228e6f473bb50c32e77155e80b410e2068eeac", 204 | "sha256:5298a733e5bfbb761181fd4672c36d0c627320eb999c59c65156c6a90c7e1b4f", 205 | "sha256:5ba0aac1397e1d7b654fccf263a4798a9e84ef749866060d19e577e927d66e1b", 206 | "sha256:9707bdc1ea9639c886b4d3be6e2a45812c1ac0c2080f94c31b71c9fa35556f9b", 207 | "sha256:a2aa18d3f0b7d538e21932f637fbfe8518d085238b429e4790a35e1e44a96ffc", 208 | "sha256:a388960f979665b447f0847626e40f99af8cf191bce9dc571d716433130cb3a7", 209 | "sha256:a51528192755f7429c5bcc9e80832c517340317c861318fea9cea081b57c9afd", 210 | "sha256:b528e126c13816a4374e56b7b18bfe91f7a7f6576d1aadba5dee6a87a7f479ae", 211 | "sha256:c1aa4de4919358c5ef119f6377bc5964b3a7023c23e845d9db7d9016fa0c5b1c", 212 | "sha256:c2646458e1dce44df9f71a01dc65f7e8fa4307f29e5c0f2f92c97f47a5bf22f5", 213 | "sha256:c2f44425594ae85e119459bb5abb0748d76ef01d9c08583a667e3339e134218e", 214 | "sha256:d47750cf07dee6b55d8423471be70d627314277976ff2edd1381f02d52dbadf9", 215 | "sha256:d99d2350adb7b6c3f7f8f0e5dfb7d34ff8dd4bc0a53e62c445b7e43e163fce63", 216 | "sha256:dd324f8ee05925ee85de0ea3f0d66e1362e8c80799eb4eb04927d32335a3e44a", 217 | "sha256:eaca36a80acaacb8183930e2e5ad7f71539a66805d6204ea88736570b2876a7b", 218 | "sha256:f567e972dce3bbc3a8076e0b675273b4a9e8576ac629149cf8286ee13c259ae5", 219 | "sha256:fe48e4925455c964db914b958f6e7032d285848b7538a5e1b19aeb26ffaea3ec" 220 | ], 221 | "index": "pypi", 222 | "version": "==1.3.4" 223 | }, 224 | "pyldavis": { 225 | "editable": true, 226 | "path": "." 227 | }, 228 | "python-dateutil": { 229 | "hashes": [ 230 | "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", 231 | "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" 232 | ], 233 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 234 | "version": "==2.8.2" 235 | }, 236 | "pytz": { 237 | "hashes": [ 238 | "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0", 239 | "sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a" 240 | ], 241 | "version": "==2022.7.1" 242 | }, 243 | "reproducer": { 244 | "editable": true, 245 | "path": "." 246 | }, 247 | "sanitized-package": { 248 | "editable": true, 249 | "path": "." 
250 | }, 251 | "scikit-learn": { 252 | "hashes": [ 253 | "sha256:02aee3b257617da0ec98dee9572b10523dc00c25b68c195ddf100c1a93b1854b", 254 | "sha256:059c5be0c0365321ddbcac7abf0db806fad8ecb64ee6c7cbcd58313c7d61634d", 255 | "sha256:116e05fd990d9b363fc29bd3699ec2117d7da9088f6ca9a90173b240c5a063f1", 256 | "sha256:11a57405c1c3514227d0c6a0bee561c94cd1284b41e236f7a1d76b3975f77593", 257 | "sha256:32d941f12fd7e245f01da2b82943c5ce6f1133fa5375eb80caa51457532b3e7e", 258 | "sha256:46248cc6a8b72490f723c73ff2e65e62633d14cafe9d2df3a7b3f87d332a6f7e", 259 | "sha256:515b227f01f569145dc9f86e56f4cea9f00a613fc4d074bbfc0a92ca00bff467", 260 | "sha256:538f3a85c4980c7572f3e754f0ba8489363976ef3e7f6a94e8f1af5ae45f6f6a", 261 | "sha256:53bb7c605427ab187869d7a05cd3f524a3015a90e351c1788fc3a662e7f92b69", 262 | "sha256:59b1d6df8724003fa16b7365a3b43449ee152aa6e488dd7a19f933640bb2d7fb", 263 | "sha256:62ce4e3ddb6e6e9dcdb3e5ac7f0575dbaf56f79ce2b2edee55192b12b52df5be", 264 | "sha256:648f4dbfdd0a1b45bf6e2e4afe3f431774c55dee05e2d28f8394d6648296f373", 265 | "sha256:944f47b2d881b9d24aee40d643bfdc4bd2b6dc3d25b62964411c6d8882f940a1", 266 | "sha256:a51fdbc116974d9715957366df73e5ec6f0a7a2afa017864c2e5f5834e6f494d", 267 | "sha256:a800665527c1a63f7395a0baae3c89b0d97b54d2c23769c1c9879061bb80bc19", 268 | "sha256:ac2ca9dbb754d61cfe1c83ba8483498ef951d29b93ec09d6f002847f210a99da", 269 | "sha256:bd78a2442c948536f677e2744917c37cff014559648102038822c23863741c27", 270 | "sha256:c604a813df8e7d6dfca3ae0db0a8fd7e5dff4ea9d94081ab263c81bf0b61ab4b", 271 | "sha256:c6b9510fd2e1642314efb7aa951a0d05d963f3523e01c30b2dadde2395ebe6b4", 272 | "sha256:ebbe4275556d3c02707bd93ae8b96d9651acd4165126e0ae64b336afa2a6dcb1", 273 | "sha256:ee59da47e18b703f6de17d5d51b16ce086c50969d5a83db5217f0ae9372de232", 274 | "sha256:fb7214103f6c36c1371dd8c166897e3528264a28f2e2e42573ba8c61ed4d7142", 275 | "sha256:fc60e0371e521995a6af2ef3f5d911568506124c272889b318b8b6e497251231", 276 | "sha256:fc75f81571137b39f9b31766e15a0e525331637e7fe8f8000a3fbfba7da3add9", 277 | "sha256:fecb5102f0a36c16c1361ec519a7bb0260776ef40e17393a81f530569c916a7b" 278 | ], 279 | "index": "pypi", 280 | "version": "==1.0.1" 281 | }, 282 | "scipy": { 283 | "hashes": [ 284 | "sha256:1437073f1d4664990879aa8f9547524764372e0fef84a077be4b19e82bba7a8d", 285 | "sha256:17fd991a275e4283453f89d404209aa92059ac68d76d804b4bc1716a3742e1b5", 286 | "sha256:1ea6233f5a365cb7945b4304bd06323ece3ece85d6a3fa8598d2f53e513467c9", 287 | "sha256:2d25272c03ee3c0fe5e0dff1bb7889280bb6c9e1766fa9c7bde81ad8a5f78694", 288 | "sha256:30bdda199667e74b50208a793eb1ba47a04e5e3fa16f5ff06c6f7969ae78e4da", 289 | "sha256:359b60a0cccd17723b9d5e329a5212a710e771a3ddde800e472fb93732756c46", 290 | "sha256:39f838ea5ce8da868785193d88d05cf5a6d5c390804ec99de29a28e1dcdd53e6", 291 | "sha256:4d175ba93e00d8eef8f7cd70d4d88a9106a86800c82ea03cf2268c36d6545483", 292 | "sha256:5273d832fb9cd5724ee0d335c16a903b923441107dd973d27fc4293075a9f4e3", 293 | "sha256:54951f51d731c832b1b8885e0a92e89f33d087de7e40d02078bf0d49c7cbdbb5", 294 | "sha256:74f518ce542533054695f743e4271cb8986b63f95bb51d70fcee4f3929cbff7d", 295 | "sha256:7b1d0f5f524518f1a86f288443528e4ff4a739c0966db663af4129b7ac7849f8", 296 | "sha256:82c5befebf54d799d77e5f0205c03030f57f69ba2541baa44d2e6ad138c28cd3", 297 | "sha256:8482c8e45857ab0a5446eb7460d2307a27cbbe659d6d2257820c6d6eb950fd0f", 298 | "sha256:87cf3964db0f1cce17aeed5bfc1b89a6b4b07dbfc48e50d21fa3549e00456803", 299 | "sha256:8b5726a0fedeaa6beb1095e4466998bdd1d1e960b28db9b5a16c89cbd7b2ebf1", 300 | "sha256:97eb573e361a73a553b915dc195c6f72a08249964b1a33f157f9659f3b6210d1", 
301 | "sha256:a80eb01c43fd98257ec7a49ff5cec0edba32031b5f86503f55399a48cb2c5379", 302 | "sha256:cac71d5476a6f56b50459da21f6221707e0051ebd428b2137db32ef4a43bb15e", 303 | "sha256:d86abd1ddf421dea5e9cebfeb4de0d205b3dc04e78249afedba9c6c3b2227ff2", 304 | "sha256:dc2d1bf41294e63c7302bf499973ac0c7f73c93c01763db43055f6525234bf11", 305 | "sha256:e08b81fcd9bf98740b58dc6fdd7879e33a64dcb682201c1135f7d4a75216bb05", 306 | "sha256:e3efe7ef75dfe627b354ab0af0dbc918eadee97cc80ff1aabea6d3e01114ebdd", 307 | "sha256:fa2dbabaaecdb502641b0b3c00dec05fb475ae48655c66da16c9ed24eda1e711" 308 | ], 309 | "index": "pypi", 310 | "version": "==1.7.2" 311 | }, 312 | "six": { 313 | "hashes": [ 314 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", 315 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" 316 | ], 317 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 318 | "version": "==1.16.0" 319 | }, 320 | "smart-open": { 321 | "hashes": [ 322 | "sha256:b4c9ae193ad6d3e7add50944b86afa0d150bd821ab8ec21edb26d9a06b66f6a8", 323 | "sha256:d5238825fe9a9340645fac3d75b287c08fbb99fb2b422477de781c9f5f09e019" 324 | ], 325 | "markers": "python_version >= '3.6' and python_version < '4.0'", 326 | "version": "==6.3.0" 327 | }, 328 | "threadpoolctl": { 329 | "hashes": [ 330 | "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b", 331 | "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380" 332 | ], 333 | "markers": "python_version >= '3.6'", 334 | "version": "==3.1.0" 335 | } 336 | }, 337 | "develop": { 338 | "attrs": { 339 | "hashes": [ 340 | "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836", 341 | "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99" 342 | ], 343 | "markers": "python_version >= '3.6'", 344 | "version": "==22.2.0" 345 | }, 346 | "importlib-metadata": { 347 | "hashes": [ 348 | "sha256:53ccfd5c134223e497627b9815d5030edf77d2ed573922f7a0b8f8bb81a1c100", 349 | "sha256:75bdec14c397f528724c1bfd9709d660b33a4d2e77387a3358f20b848bb5e5fb" 350 | ], 351 | "markers": "python_version < '3.8'", 352 | "version": "==4.8.2" 353 | }, 354 | "iniconfig": { 355 | "hashes": [ 356 | "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", 357 | "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374" 358 | ], 359 | "markers": "python_version >= '3.7'", 360 | "version": "==2.0.0" 361 | }, 362 | "packaging": { 363 | "hashes": [ 364 | "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2", 365 | "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97" 366 | ], 367 | "markers": "python_version >= '3.7'", 368 | "version": "==23.0" 369 | }, 370 | "pluggy": { 371 | "hashes": [ 372 | "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", 373 | "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" 374 | ], 375 | "markers": "python_version >= '3.6'", 376 | "version": "==1.0.0" 377 | }, 378 | "py": { 379 | "hashes": [ 380 | "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", 381 | "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" 382 | ], 383 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 384 | "version": "==1.11.0" 385 | }, 386 | "pyparsing": { 387 | "hashes": [ 388 | "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4", 389 | 
"sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81" 390 | ], 391 | "markers": "python_version >= '3.6'", 392 | "version": "==3.0.6" 393 | }, 394 | "pytest": { 395 | "hashes": [ 396 | "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89", 397 | "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134" 398 | ], 399 | "index": "pypi", 400 | "version": "==6.2.5" 401 | }, 402 | "toml": { 403 | "hashes": [ 404 | "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", 405 | "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" 406 | ], 407 | "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", 408 | "version": "==0.10.2" 409 | }, 410 | "typing-extensions": { 411 | "hashes": [ 412 | "sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed", 413 | "sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9" 414 | ], 415 | "markers": "python_version < '3.8'", 416 | "version": "==4.0.0" 417 | }, 418 | "zipp": { 419 | "hashes": [ 420 | "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832", 421 | "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc" 422 | ], 423 | "markers": "python_version >= '3.6'", 424 | "version": "==3.6.0" 425 | } 426 | } 427 | } 428 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pyLDAvis 2 | ======== 3 | 4 | Python library for interactive topic model visualization. 5 | This is a port of the fabulous `R package `_ by `Carson Sievert `__ and `Kenny Shirley `__. 6 | 7 | .. figure:: http://www.kennyshirley.com/figures/ldavis-pic.png 8 | :alt: LDAvis icon 9 | 10 | **pyLDAvis** is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data. The package extracts information from a fitted LDA topic model to inform an interactive web-based visualization. 11 | 12 | The visualization is intended to be used within an IPython notebook but can also be saved to a stand-alone HTML file for easy sharing. 13 | 14 | Note: LDA stands for `latent Dirichlet allocation `_. 15 | 16 | |version status| |build status| |docs| 17 | 18 | Installation 19 | ~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | - Stable version using pip: 22 | 23 | :: 24 | 25 | pip install pyldavis 26 | 27 | - Development version on GitHub 28 | 29 | Clone the repository and run ``python setup.py`` 30 | 31 | Usage 32 | ~~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | The best way to learn how to use **pyLDAvis** is to see it in action. 35 | Check out this `notebook for an overview `__. 36 | Refer to the `documentation `__ for details. 37 | 38 | For a concise explanation of the visualization see this 39 | `vignette `__ from the LDAvis R package. 40 | 41 | Video demos 42 | ~~~~~~~~~~~ 43 | 44 | Ben Mabey walked through the visualization in this short talk using a Hacker News corpus: 45 | 46 | - `Visualizing Topic Models `__ 47 | - `Notebook and visualization used in the demo `__ 48 | - `Slide deck `__ 49 | 50 | 51 | `Carson Sievert `__ created a video demoing the R package. 
The visualization is the same and so it applies equally to pyLDAvis: 52 | 53 | - `Visualizing & Exploring the Twenty Newsgroup Data `__ 54 | 55 | More documentation 56 | ~~~~~~~~~~~~~~~~~~ 57 | 58 | To read about the methodology behind pyLDAvis, see `the original 59 | paper `__, 60 | which was presented at the `2014 ACL Workshop on Interactive Language 61 | Learning, Visualization, and 62 | Interfaces `__ in Baltimore 63 | on June 27, 2014. 64 | 65 | 66 | 67 | 68 | .. |version status| image:: https://img.shields.io/pypi/v/pyLDAvis.svg 69 | :target: https://pypi.python.org/pypi/pyLDAvis 70 | .. |build status| image:: https://travis-ci.org/bmabey/pyLDAvis.png?branch=master 71 | :target: https://travis-ci.org/bmabey/pyLDAvis 72 | .. |docs| image:: https://readthedocs.org/projects/pyldavis/badge/?version=latest 73 | :target: https://pyLDAvis.readthedocs.org 74 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyLDAvis.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyLDAvis.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pyLDAvis" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyLDAvis" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # pyLDAvis documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Jul 9 22:26:36 2013. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 
9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | import mock 20 | 21 | # If extensions (or modules to document with autodoc) are in another directory, 22 | # add these directories to sys.path here. If the directory is relative to the 23 | # documentation root, use os.path.abspath to make it absolute, like shown here. 24 | #sys.path.insert(0, os.path.abspath('.')) 25 | 26 | # Get the project root dir, which is the parent dir of this 27 | cwd = os.getcwd() 28 | project_root = os.path.dirname(cwd) 29 | 30 | # Insert the project root dir as the first element in the PYTHONPATH. 31 | # This lets us ensure that the source package is imported, and that its 32 | # version is used. 33 | sys.path.insert(0, project_root) 34 | 35 | MOCK_MODULES = ['numpy','joblib', 'funcy', 'scipy', 'scipy.stats', 'scipy.spatial', 36 | 'scipy.spatial.distance', 'pandas', 'skbio', 'skbio.stats', 37 | 'skbio.stats.distance', 'skbio.stats.ordination'] 38 | for mod_name in MOCK_MODULES: 39 | sys.modules[mod_name] = mock.Mock() 40 | 41 | import pyLDAvis 42 | 43 | 44 | 45 | # -- General configuration --------------------------------------------- 46 | 47 | # If your documentation needs a minimal Sphinx version, state it here. 48 | #needs_sphinx = '1.0' 49 | 50 | # Add any Sphinx extension module names here, as strings. They can be 51 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 52 | extensions = [ 53 | 'sphinx.ext.autodoc', 54 | 'sphinx.ext.autosummary', 55 | 'sphinx.ext.doctest', 56 | 'sphinx.ext.coverage', 57 | 'sphinx.ext.viewcode', 58 | 'numpydoc'] 59 | 60 | # Add any paths that contain templates here, relative to this directory. 61 | templates_path = ['_templates'] 62 | 63 | # The suffix of source filenames. 64 | source_suffix = '.rst' 65 | 66 | # The encoding of source files. 67 | #source_encoding = 'utf-8-sig' 68 | 69 | # The master toctree document. 70 | master_doc = 'index' 71 | 72 | # General information about the project. 73 | project = u'pyLDAvis' 74 | copyright = u'2015, Ben Mabey' 75 | 76 | # The version info for the project you're documenting, acts as replacement 77 | # for |version| and |release|, also used in various other places throughout 78 | # the built documents. 79 | # 80 | # The short X.Y version. 81 | version = pyLDAvis.__version__ 82 | # The full version, including alpha/beta/rc tags. 83 | release = pyLDAvis.__version__ 84 | 85 | # The language for content autogenerated by Sphinx. Refer to documentation 86 | # for a list of supported languages. 87 | #language = None 88 | 89 | # There are two options for replacing |today|: either, you set today to 90 | # some non-false value, then it is used: 91 | #today = '' 92 | # Else, today_fmt is used as the format for a strftime call. 93 | #today_fmt = '%B %d, %Y' 94 | 95 | # List of patterns, relative to source directory, that match files and 96 | # directories to ignore when looking for source files. 97 | exclude_patterns = ['_build'] 98 | 99 | # The reST default role (used for this markup: `text`) to use for all 100 | # documents. 101 | #default_role = None 102 | 103 | # If true, '()' will be appended to :func: etc. cross-reference text. 104 | #add_function_parentheses = True 105 | 106 | # If true, the current module name will be prepended to all description 107 | # unit titles (such as .. function::). 
108 | #add_module_names = True 109 | 110 | # If true, sectionauthor and moduleauthor directives will be shown in the 111 | # output. They are ignored by default. 112 | #show_authors = False 113 | 114 | # The name of the Pygments (syntax highlighting) style to use. 115 | pygments_style = 'sphinx' 116 | 117 | # A list of ignored prefixes for module index sorting. 118 | #modindex_common_prefix = [] 119 | 120 | # If true, keep warnings as "system message" paragraphs in the built 121 | # documents. 122 | #keep_warnings = False 123 | 124 | 125 | # -- Options for HTML output ------------------------------------------- 126 | 127 | # The theme to use for HTML and HTML Help pages. See the documentation for 128 | # a list of builtin themes. 129 | html_theme = 'default' 130 | 131 | # Theme options are theme-specific and customize the look and feel of a 132 | # theme further. For a list of options available for each theme, see the 133 | # documentation. 134 | #html_theme_options = {} 135 | 136 | # Add any paths that contain custom themes here, relative to this directory. 137 | #html_theme_path = [] 138 | 139 | # The name for this set of Sphinx documents. If None, it defaults to 140 | # " v documentation". 141 | #html_title = None 142 | 143 | # A shorter title for the navigation bar. Default is the same as 144 | # html_title. 145 | #html_short_title = None 146 | 147 | # The name of an image file (relative to this directory) to place at the 148 | # top of the sidebar. 149 | #html_logo = None 150 | 151 | # The name of an image file (within the static path) to use as favicon 152 | # of the docs. This file should be a Windows icon file (.ico) being 153 | # 16x16 or 32x32 pixels large. 154 | #html_favicon = None 155 | 156 | # Add any paths that contain custom static files (such as style sheets) 157 | # here, relative to this directory. They are copied after the builtin 158 | # static files, so a file named "default.css" will overwrite the builtin 159 | # "default.css". 160 | html_static_path = ['_static'] 161 | 162 | # If not '', a 'Last updated on:' timestamp is inserted at every page 163 | # bottom, using the given strftime format. 164 | #html_last_updated_fmt = '%b %d, %Y' 165 | 166 | # If true, SmartyPants will be used to convert quotes and dashes to 167 | # typographically correct entities. 168 | #html_use_smartypants = True 169 | 170 | # Custom sidebar templates, maps document names to template names. 171 | #html_sidebars = {} 172 | 173 | # Additional templates that should be rendered to pages, maps page names 174 | # to template names. 175 | #html_additional_pages = {} 176 | 177 | # If false, no module index is generated. 178 | #html_domain_indices = True 179 | 180 | # If false, no index is generated. 181 | #html_use_index = True 182 | 183 | # If true, the index is split into individual pages for each letter. 184 | #html_split_index = False 185 | 186 | # If true, links to the reST sources are added to the pages. 187 | #html_show_sourcelink = True 188 | 189 | # If true, "Created using Sphinx" is shown in the HTML footer. 190 | # Default is True. 191 | #html_show_sphinx = True 192 | 193 | # If true, "(C) Copyright ..." is shown in the HTML footer. 194 | # Default is True. 195 | #html_show_copyright = True 196 | 197 | # If true, an OpenSearch description file will be output, and all pages 198 | # will contain a tag referring to it. The value of this option 199 | # must be the base URL from which the finished HTML is served. 
200 | #html_use_opensearch = '' 201 | 202 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 203 | #html_file_suffix = None 204 | 205 | # Output file base name for HTML help builder. 206 | htmlhelp_basename = 'pyLDAvisdoc' 207 | 208 | 209 | # -- Options for LaTeX output ------------------------------------------ 210 | 211 | latex_elements = { 212 | # The paper size ('letterpaper' or 'a4paper'). 213 | #'papersize': 'letterpaper', 214 | 215 | # The font size ('10pt', '11pt' or '12pt'). 216 | #'pointsize': '10pt', 217 | 218 | # Additional stuff for the LaTeX preamble. 219 | #'preamble': '', 220 | } 221 | 222 | # Grouping the document tree into LaTeX files. List of tuples 223 | # (source start file, target name, title, author, documentclass 224 | # [howto/manual]). 225 | latex_documents = [ 226 | ('index', 'pyLDAvis.tex', 227 | u'pyLDAvis Documentation', 228 | u'Ben Mabey', 'manual'), 229 | ] 230 | 231 | # The name of an image file (relative to this directory) to place at 232 | # the top of the title page. 233 | #latex_logo = None 234 | 235 | # For "manual" documents, if this is true, then toplevel headings 236 | # are parts, not chapters. 237 | #latex_use_parts = False 238 | 239 | # If true, show page references after internal links. 240 | #latex_show_pagerefs = False 241 | 242 | # If true, show URL addresses after external links. 243 | #latex_show_urls = False 244 | 245 | # Documents to append as an appendix to all manuals. 246 | #latex_appendices = [] 247 | 248 | # If false, no module index is generated. 249 | #latex_domain_indices = True 250 | 251 | 252 | # -- Options for manual page output ------------------------------------ 253 | 254 | # One entry per manual page. List of tuples 255 | # (source start file, name, description, authors, manual section). 256 | man_pages = [ 257 | ('index', 'pyLDAvis', 258 | u'pyLDAvis Documentation', 259 | [u'Ben Mabey'], 1) 260 | ] 261 | 262 | # If true, show URL addresses after external links. 263 | #man_show_urls = False 264 | 265 | 266 | # -- Options for Texinfo output ---------------------------------------- 267 | 268 | # Grouping the document tree into Texinfo files. List of tuples 269 | # (source start file, target name, title, author, 270 | # dir menu entry, description, category) 271 | texinfo_documents = [ 272 | ('index', 'pyLDAvis', 273 | u'pyLDAvis Documentation', 274 | u'Ben Mabey', 275 | 'pyLDAvis', 276 | 'One line description of project.', 277 | 'Miscellaneous'), 278 | ] 279 | 280 | # Documents to append as an appendix to all manuals. 281 | #texinfo_appendices = [] 282 | 283 | # If false, no module index is generated. 284 | #texinfo_domain_indices = True 285 | 286 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 287 | #texinfo_show_urls = 'footnote' 288 | 289 | # If true, do not generate a @detailmenu in the "Top" node's menu. 290 | #texinfo_no_detailmenu = False 291 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
pyLDAvis documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pyLDAvis's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | readme 15 | installation 16 | usage 17 | contributing 18 | authors 19 | modules/API 20 | history 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | * :ref:`search` 28 | 29 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 
76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyLDAvis.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyLDAvis.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. 
The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/modules/API.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | .. automodule:: pyLDAvis 5 | :members: 6 | 7 | .. automodule:: pyLDAvis.gensim 8 | :members: 9 | 10 | .. automodule:: pyLDAvis.graphlab 11 | :members: 12 | 13 | .. automodule:: pyLDAvis.utils 14 | :members: 15 | 16 | .. automodule:: pyLDAvis.urls 17 | :members: 18 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /notebooks/data/ap_input.json: -------------------------------------------------------------------------------- 1 | ../../tests/data/ap_input.json -------------------------------------------------------------------------------- /notebooks/data/jeopardy_input.json: -------------------------------------------------------------------------------- 1 | ../../tests/data/jeopardy_input.json -------------------------------------------------------------------------------- /notebooks/data/movie_reviews_input.json: -------------------------------------------------------------------------------- 1 | ../../tests/data/movie_reviews_input.json -------------------------------------------------------------------------------- /notebooks/pyLDAvis: -------------------------------------------------------------------------------- 1 | ../pyLDAvis -------------------------------------------------------------------------------- /pyLDAvis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Topic Models (e.g. 
LDA) visualization using D3 4 | ============================================= 5 | 6 | Functions: General Use 7 | ---------------------- 8 | :func:`prepare` 9 | transform and prepare a LDA model's data for visualization 10 | 11 | :func:`prepared_data_to_html` 12 | convert prepared data to an html string 13 | 14 | :func:`show` 15 | launch a web server to view the visualization 16 | 17 | :func:`save_html` 18 | save a visualization to a standalone html file 19 | 20 | :func:`save_json` 21 | save the visualization JSON data of to a file 22 | 23 | 24 | Functions: IPython Notebook 25 | --------------------------- 26 | :func:`display` 27 | display a figure in an IPython notebook 28 | 29 | :func:`enable_notebook` 30 | enable automatic D3 display of prepared model data in the IPython notebook. 31 | 32 | :func:`disable_notebook` 33 | disable automatic D3 display of prepared model data in the IPython notebook. 34 | """ 35 | 36 | __all__ = ["__version__", 37 | "prepare", "js_PCoA", 38 | "PreparedData", "prepared_data_to_html", 39 | "display", "show", "save_html", "save_json", 40 | "enable_notebook", "disable_notebook"] 41 | 42 | __version__ = "3.4.1" 43 | 44 | from pyLDAvis._display import * 45 | from pyLDAvis._prepare import prepare, js_PCoA, PreparedData 46 | -------------------------------------------------------------------------------- /pyLDAvis/_display.py: -------------------------------------------------------------------------------- 1 | # this file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_display.py 2 | # Copyright (c) 2013, Jake Vanderplas 3 | # It was adapted for pyLDAvis by Ben Mabey 4 | import warnings 5 | import random 6 | import json 7 | import jinja2 8 | import re 9 | from pyLDAvis._server import serve 10 | from pyLDAvis.utils import get_id, write_ipynb_local_js, NumPyEncoder 11 | from pyLDAvis._prepare import PreparedData 12 | import pyLDAvis.urls as urls 13 | 14 | __all__ = ["prepared_data_to_html", "display", 15 | "show", "save_html", "save_json", 16 | "enable_notebook", "disable_notebook"] 17 | 18 | 19 | # Simple HTML template. This works in standalone web pages for single visualizations, 20 | # but will not work within the IPython notebook due to the presence of 21 | # requirejs 22 | SIMPLE_HTML = jinja2.Template(""" 23 | 24 | 25 | 26 | 27 |
28 | 33 | """) 34 | 35 | 36 | # RequireJS template. If requirejs and jquery are not defined, this will 37 | # result in an error. This is suitable for use within the IPython notebook. 38 | REQUIREJS_HTML = jinja2.Template(""" 39 | 40 | 41 | 42 |
43 | 60 | """) 61 | 62 | 63 | # General HTML template. This should work correctly whether or not requirejs 64 | # is defined, and whether it's embedded in a notebook or in a standalone 65 | # HTML page. 66 | GENERAL_HTML = jinja2.Template(""" 67 | 68 | 69 | 70 |
71 | 107 | """) 108 | 109 | TEMPLATE_DICT = {"simple": SIMPLE_HTML, 110 | "notebook": REQUIREJS_HTML, 111 | "general": GENERAL_HTML} 112 | 113 | 114 | def prepared_data_to_html(data, d3_url=None, ldavis_url=None, ldavis_css_url=None, 115 | template_type="general", visid=None, use_http=False): 116 | """Output HTML with embedded visualization 117 | 118 | Parameters 119 | ---------- 120 | data : PreparedData, created using :func:`prepare` 121 | The data for the visualization. 122 | d3_url : string (optional) 123 | The URL of the d3 library. If not specified, a standard web path 124 | will be used. 125 | ldavis_url : string (optional) 126 | The URL of the LDAvis library. If not specified, a standard web path 127 | will be used. 128 | template_type : string 129 | string specifying the type of HTML template to use. Options are: 130 | 131 | ``"simple"`` 132 | suitable for a simple html page with one visualization. Will 133 | fail if require.js is available on the page. 134 | ``"notebook"`` 135 | assumes require.js and jquery are available. 136 | ``"general"`` 137 | more complicated, but works both in and out of the 138 | notebook, whether or not require.js and jquery are available 139 | visid : string (optional) 140 | The html/css id of the visualization div, which must not contain spaces. 141 | If not specified, a random id will be generated. 142 | use_http : boolean (optional) 143 | If true, use http:// instead of https:// for d3_url and ldavis_url. 144 | 145 | Returns 146 | ------- 147 | vis_html : string 148 | the HTML visualization 149 | 150 | See Also 151 | -------- 152 | :func:`save_json`: save json representation of visualization to file 153 | :func:`save_html` : save html representation of a visualization to file 154 | :func:`show` : launch a local server and show a visualization in a browser 155 | :func:`display` : embed visualization within the IPython notebook 156 | :func:`enable_notebook` : automatically embed visualizations in IPython notebook 157 | """ 158 | template = TEMPLATE_DICT[template_type] 159 | 160 | d3_url = d3_url or urls.D3_URL 161 | ldavis_url = ldavis_url or urls.LDAVIS_URL 162 | ldavis_css_url = ldavis_css_url or urls.LDAVIS_CSS_URL 163 | 164 | if use_http: 165 | d3_url = d3_url.replace('https://', 'http://') 166 | ldavis_url = ldavis_url.replace('https://', 'http://') 167 | 168 | if visid is None: 169 | visid = 'ldavis_' + get_id(data) + str(int(random.random() * 1E10)) 170 | elif re.search(r'\s', visid): 171 | raise ValueError("visid must not contain spaces") 172 | 173 | return template.render(visid=json.dumps(visid), 174 | visid_raw=visid, 175 | d3_url=d3_url, 176 | ldavis_url=ldavis_url, 177 | vis_json=data.to_json(), 178 | ldavis_css_url=ldavis_css_url) 179 | 180 | 181 | def display(data, local=False, **kwargs): 182 | """Display visualization in IPython notebook via the HTML display hook 183 | 184 | Parameters 185 | ---------- 186 | data : PreparedData, created using :func:`prepare` 187 | The data for the visualization. 188 | local : boolean (optional, default=False) 189 | if True, then copy the d3 & mpld3 libraries to a location visible to 190 | the notebook server, and source them from there. See Notes below. 191 | **kwargs : 192 | additional keyword arguments are passed through to :func:`prepared_data_to_html`. 193 | 194 | Returns 195 | ------- 196 | vis_d3 : IPython.display.HTML object 197 | the IPython HTML rich display of the visualization. 
198 | 199 | Notes 200 | ----- 201 | Known issues: using ``local=True`` may not work correctly in certain cases: 202 | 203 | - In IPython < 2.0, ``local=True`` may fail if the current working 204 | directory is changed within the notebook (e.g. with the %cd command). 205 | - In IPython 2.0+, ``local=True`` may fail if a url prefix is added 206 | (e.g. by setting NotebookApp.base_url). 207 | 208 | See Also 209 | -------- 210 | :func:`show` : launch a local server and show a visualization in a browser 211 | :func:`enable_notebook` : automatically embed visualizations in IPython notebook 212 | """ 213 | # import here, in case users don't have requirements installed 214 | from IPython.display import HTML 215 | 216 | if local: 217 | if 'ldavis_url' in kwargs or 'd3_url' in kwargs: 218 | warnings.warn( 219 | "display: specified urls are ignored when local=True") 220 | kwargs['d3_url'], kwargs['ldavis_url'], kwargs['ldavis_css_url'] = write_ipynb_local_js() 221 | 222 | return HTML(prepared_data_to_html(data, **kwargs)) 223 | 224 | 225 | def show(data, ip='127.0.0.1', port=8888, n_retries=50, 226 | local=True, open_browser=True, http_server=None, **kwargs): 227 | """Starts a local webserver and opens the visualization in a browser. 228 | 229 | Parameters 230 | ---------- 231 | data : PreparedData, created using :func:`prepare` 232 | The data for the visualization. 233 | ip : string, default = '127.0.0.1' 234 | the ip address used for the local server 235 | port : int, default = 8888 236 | the port number to use for the local server. If already in use, 237 | a nearby open port will be found (see n_retries) 238 | n_retries : int, default = 50 239 | the maximum number of ports to try when locating an empty port. 240 | local : bool, default = True 241 | if True, use the local d3 & LDAvis javascript versions, within the 242 | js/ folder. If False, use the standard urls. 243 | open_browser : bool (optional) 244 | if True (default), then open a web browser to the given HTML 245 | http_server : class (optional) 246 | optionally specify an HTTPServer class to use for showing the 247 | visualization. The default is Python's basic HTTPServer. 248 | **kwargs : 249 | additional keyword arguments are passed through to :func:`prepared_data_to_html` 250 | 251 | See Also 252 | -------- 253 | :func:`display` : embed visualization within the IPython notebook 254 | :func:`enable_notebook` : automatically embed visualizations in IPython notebook 255 | """ 256 | files = None 257 | if local: 258 | kwargs['ldavis_url'] = urls.LDAVIS_URL 259 | kwargs['d3_url'] = urls.D3_URL 260 | kwargs['ldavis_css_url'] = urls.LDAVIS_CSS_URL 261 | files = {'/LDAvis.js': ["text/javascript", open(urls.LDAVIS_LOCAL, 'r').read()], 262 | '/LDAvis.css': ["text/css", open(urls.LDAVIS_CSS_URL, 'r').read()], 263 | '/d3.js': ["text/javascript", open(urls.D3_URL, 'r').read()]} 264 | html = prepared_data_to_html(data, **kwargs) 265 | serve(html, ip=ip, port=port, n_retries=n_retries, files=files, 266 | open_browser=open_browser, http_server=http_server) 267 | 268 | 269 | def enable_notebook(local=False, **kwargs): 270 | """Enable the automatic display of visualizations in the IPython Notebook. 271 | 272 | Parameters 273 | ---------- 274 | local : boolean (optional, default=False) 275 | if True, then copy the d3 & LDAvis libraries to a location visible to 276 | the notebook server, and source them from there. See Notes below. 
277 | **kwargs : 278 | all keyword parameters are passed through to :func:`prepared_data_to_html` 279 | 280 | Notes 281 | ----- 282 | Known issues: using ``local=True`` may not work correctly in certain cases: 283 | 284 | - In IPython < 2.0, ``local=True`` may fail if the current working 285 | directory is changed within the notebook (e.g. with the %cd command). 286 | - In IPython 2.0+, ``local=True`` may fail if a url prefix is added 287 | (e.g. by setting NotebookApp.base_url). 288 | 289 | See Also 290 | -------- 291 | :func:`disable_notebook` : undo the action of enable_notebook 292 | :func:`display` : embed visualization within the IPython notebook 293 | :func:`show` : launch a local server and show a visualization in a browser 294 | """ 295 | try: 296 | from IPython.core.getipython import get_ipython 297 | except ImportError: 298 | raise ImportError('This feature requires IPython 1.0+') 299 | 300 | if local: 301 | if 'ldavis_url' in kwargs or 'd3_url' in kwargs: 302 | warnings.warn("enable_notebook: specified urls are ignored when local=True") 303 | kwargs['d3_url'], kwargs['ldavis_url'], kwargs['ldavis_css_url'] = write_ipynb_local_js() 304 | 305 | ip = get_ipython() 306 | formatter = ip.display_formatter.formatters['text/html'] 307 | formatter.for_type(PreparedData, 308 | lambda data, kwds=kwargs: prepared_data_to_html(data, **kwds)) 309 | 310 | 311 | def disable_notebook(): 312 | """Disable the automatic display of visualizations in the IPython Notebook. 313 | 314 | See Also 315 | -------- 316 | :func:`enable_notebook` : automatically embed visualizations in IPython notebook 317 | """ 318 | try: 319 | from IPython.core.getipython import get_ipython 320 | except ImportError: 321 | raise ImportError('This feature requires IPython 1.0+') 322 | ip = get_ipython() 323 | formatter = ip.display_formatter.formatters['text/html'] 324 | formatter.type_printers.pop(PreparedData, None) 325 | 326 | 327 | def save_html(data, fileobj, **kwargs): 328 | """Save an embedded visualization to file. 329 | 330 | This will produce a self-contained HTML file. Internet access is still required 331 | for the D3 and LDAvis libraries. 332 | 333 | Parameters 334 | ---------- 335 | data : PreparedData, created using :func:`prepare` 336 | The data for the visualization. 337 | fileobj : filename or file object 338 | The filename or file-like object in which to write the HTML 339 | representation of the visualization. 340 | **kwargs : 341 | additional keyword arguments will be passed to :func:`prepared_data_to_html` 342 | 343 | See Also 344 | -------- 345 | :func:`save_json`: save json representation of a visualization to file 346 | :func:`prepared_data_to_html` : output html representation of the visualization 347 | :func:`fig_to_dict` : output dictionary representation of the visualization 348 | """ 349 | try: 350 | if isinstance(fileobj, basestring): 351 | fileobj = open(fileobj, 'w') 352 | except NameError: 353 | if isinstance(fileobj, str): 354 | fileobj = open(fileobj, 'w') 355 | if not hasattr(fileobj, 'write'): 356 | raise ValueError("fileobj should be a filename or a writable file") 357 | fileobj.write(prepared_data_to_html(data, **kwargs)) 358 | 359 | 360 | def save_json(data, fileobj): 361 | """Save the visualization's data a json file. 362 | 363 | Parameters 364 | ---------- 365 | data : PreparedData, created using :func:`prepare` 366 | The data for the visualization. 
367 | fileobj : filename or file object 368 | The filename or file-like object in which to write the HTML 369 | representation of the visualization. 370 | 371 | See Also 372 | -------- 373 | :func:`save_html` : save html representation of a visualization to file 374 | :func:`prepared_data_to_html` : output html representation of the visualization 375 | """ 376 | try: 377 | if isinstance(fileobj, basestring): 378 | fileobj = open(fileobj, 'w') 379 | except NameError: 380 | if isinstance(fileobj, str): 381 | fileobj = open(fileobj, 'w') 382 | if not hasattr(fileobj, 'write'): 383 | raise ValueError("fileobj should be a filename or a writable file") 384 | json.dump(data.to_dict(), fileobj, cls=NumPyEncoder) 385 | -------------------------------------------------------------------------------- /pyLDAvis/_prepare.py: -------------------------------------------------------------------------------- 1 | """ 2 | pyLDAvis Prepare 3 | =============== 4 | Main transformation functions for preparing LDAdata to the visualization's data structures 5 | """ 6 | import json 7 | import logging 8 | import numpy as np 9 | import pandas as pd 10 | from collections import namedtuple 11 | from joblib import Parallel, delayed, cpu_count 12 | from scipy.stats import entropy 13 | from scipy.spatial.distance import pdist, squareform 14 | from sklearn.manifold import MDS, TSNE 15 | 16 | from pyLDAvis.utils import NumPyEncoder 17 | 18 | 19 | def __num_dist_rows__(array, ndigits=2): 20 | return array.shape[0] - int((pd.DataFrame(array).sum(axis=1) < 0.999).sum()) 21 | 22 | 23 | class ValidationError(ValueError): 24 | pass 25 | 26 | 27 | def _input_check(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency): 28 | ttds = topic_term_dists.shape 29 | dtds = doc_topic_dists.shape 30 | errors = [] 31 | 32 | def err(msg): 33 | errors.append(msg) 34 | 35 | if dtds[1] != ttds[0]: 36 | err_msg = ('Number of rows of topic_term_dists does not match number of columns of ' 37 | 'doc_topic_dists; both should be equal to the number of topics in the model.') 38 | err(err_msg) 39 | 40 | if len(doc_lengths) != dtds[0]: 41 | err_msg = ('Length of doc_lengths not equal to the number of rows in doc_topic_dists;' 42 | 'both should be equal to the number of documents in the data.') 43 | err(err_msg) 44 | 45 | W = len(vocab) 46 | if ttds[1] != W: 47 | err_msg = ('Number of terms in vocabulary does not match the number of columns of ' 48 | 'topic_term_dists (where each row of topic_term_dists is a probability ' 49 | 'distribution of terms for a given topic)') 50 | err(err_msg) 51 | if len(term_frequency) != W: 52 | err_msg = ('Length of term_frequency not equal to the number of terms in the ' 53 | 'number of terms in the vocabulary (len of vocab)') 54 | err(err_msg) 55 | 56 | if __num_dist_rows__(topic_term_dists) != ttds[0]: 57 | err('Not all rows (distributions) in topic_term_dists sum to 1.') 58 | 59 | if __num_dist_rows__(doc_topic_dists) != dtds[0]: 60 | err('Not all rows (distributions) in doc_topic_dists sum to 1.') 61 | 62 | if len(errors) > 0: 63 | return errors 64 | 65 | 66 | def _input_validate(*args): 67 | res = _input_check(*args) 68 | if res: 69 | raise ValidationError('\n' + '\n'.join([' * ' + s for s in res])) 70 | 71 | 72 | def _jensen_shannon(_P, _Q): 73 | _M = 0.5 * (_P + _Q) 74 | return 0.5 * (entropy(_P, _M) + entropy(_Q, _M)) 75 | 76 | 77 | def _pcoa(pair_dists, n_components=2): 78 | """Principal Coordinate Analysis, 79 | aka Classical Multidimensional Scaling 80 | """ 81 | # code referenced from 
skbio.stats.ordination.pcoa 82 | # https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py 83 | 84 | # pairwise distance matrix is assumed symmetric 85 | pair_dists = np.asarray(pair_dists, np.float64) 86 | 87 | # perform SVD on double centred distance matrix 88 | n = pair_dists.shape[0] 89 | H = np.eye(n) - np.ones((n, n)) / n 90 | B = - H.dot(pair_dists ** 2).dot(H) / 2 91 | eigvals, eigvecs = np.linalg.eig(B) 92 | 93 | # Take first n_components of eigenvalues and eigenvectors 94 | # sorted in decreasing order 95 | ix = eigvals.argsort()[::-1][:n_components] 96 | eigvals = eigvals[ix] 97 | eigvecs = eigvecs[:, ix] 98 | 99 | # replace any remaining negative eigenvalues and associated eigenvectors with zeroes 100 | # at least 1 eigenvalue must be zero 101 | eigvals[np.isclose(eigvals, 0)] = 0 102 | if np.any(eigvals < 0): 103 | ix_neg = eigvals < 0 104 | eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape) 105 | eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape) 106 | 107 | return np.sqrt(eigvals) * eigvecs 108 | 109 | 110 | def js_PCoA(distributions): 111 | """Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis 112 | (aka Classical Multidimensional Scaling) 113 | 114 | Parameters 115 | ---------- 116 | distributions : array-like, shape (`n_dists`, `k`) 117 | Matrix of distributions probabilities. 118 | 119 | Returns 120 | ------- 121 | pcoa : array, shape (`n_dists`, 2) 122 | """ 123 | dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon)) 124 | return _pcoa(dist_matrix) 125 | 126 | 127 | def js_MMDS(distributions, **kwargs): 128 | """Dimension reduction via Jensen-Shannon Divergence & Metric Multidimensional Scaling 129 | 130 | Parameters 131 | ---------- 132 | distributions : array-like, shape (`n_dists`, `k`) 133 | Matrix of distributions probabilities. 134 | 135 | **kwargs : Keyword argument to be passed to `sklearn.manifold.MDS()` 136 | 137 | Returns 138 | ------- 139 | mmds : array, shape (`n_dists`, 2) 140 | """ 141 | dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon)) 142 | model = MDS(n_components=2, random_state=0, dissimilarity='precomputed', **kwargs) 143 | return model.fit_transform(dist_matrix) 144 | 145 | 146 | def js_TSNE(distributions, **kwargs): 147 | """Dimension reduction via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding 148 | 149 | Parameters 150 | ---------- 151 | distributions : array-like, shape (`n_dists`, `k`) 152 | Matrix of distributions probabilities. 
153 | 154 | **kwargs : Keyword argument to be passed to `sklearn.manifold.TSNE()` 155 | 156 | Returns 157 | ------- 158 | tsne : array, shape (`n_dists`, 2) 159 | """ 160 | dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon)) 161 | model = TSNE(n_components=2, random_state=0, metric='precomputed', init='random', 162 | perplexity=min(len(dist_matrix) - 1, 30), **kwargs) 163 | return model.fit_transform(dist_matrix) 164 | 165 | 166 | def _df_with_names(data, index_name, columns_name): 167 | if type(data) == pd.DataFrame: 168 | # we want our index to be numbered 169 | df = pd.DataFrame(data.values) 170 | else: 171 | df = pd.DataFrame(data) 172 | df.index.name = index_name 173 | df.columns.name = columns_name 174 | return df 175 | 176 | 177 | def _series_with_name(data, name): 178 | if type(data) == pd.Series: 179 | data.name = name 180 | # ensures a numeric index 181 | return data.reset_index()[name] 182 | else: 183 | return pd.Series(data, name=name) 184 | 185 | 186 | def _topic_coordinates(mds, topic_term_dists, topic_proportion, start_index=1): 187 | K = topic_term_dists.shape[0] 188 | mds_res = mds(topic_term_dists) 189 | assert mds_res.shape == (K, 2) 190 | mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1], 191 | 'topics': range(start_index, K + start_index), 192 | 'cluster': 1, 'Freq': topic_proportion * 100}) 193 | # note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26 194 | return mds_df 195 | 196 | 197 | def _chunks(lambda_seq, n): 198 | """ Yield successive n-sized chunks from lambda_seq. 199 | """ 200 | for i in range(0, len(lambda_seq), n): 201 | yield lambda_seq[i:i + n] 202 | 203 | 204 | def _job_chunks(lambda_seq, n_jobs): 205 | n_chunks = n_jobs 206 | if n_jobs < 0: 207 | # so, have n chunks if we are using all n cores/cpus 208 | n_chunks = cpu_count() + 1 - n_jobs 209 | 210 | return _chunks(lambda_seq, n_chunks) 211 | 212 | 213 | def _find_relevance(log_ttd, log_lift, R, lambda_): 214 | relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift 215 | return relevance.T.apply(lambda topic: topic.nlargest(R).index) 216 | 217 | 218 | def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq): 219 | return pd.concat([_find_relevance(log_ttd, log_lift, R, seq) for seq in lambda_seq]) 220 | 221 | 222 | def _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, 223 | vocab, lambda_step, R, n_jobs, start_index=1): 224 | # marginal distribution over terms (width of blue bars) 225 | term_proportion = term_frequency / term_frequency.sum() 226 | 227 | # compute the distinctiveness and saliency of the terms: 228 | # this determines the R terms that are displayed when no topic is selected. 229 | # TODO(msusol): Make flake8 test pass here with 'unused' variables. 
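    # What the next few lines compute (following the saliency/distinctiveness
    # definitions used by LDAvis, after Chuang et al. 2012): for each term w,
    #   distinctiveness(w) = sum_k P(topic k | w) * log(P(topic k | w) / P(topic k))
    #   saliency(w)        = P(w) * distinctiveness(w)
    # topic_given_term holds P(topic | term), log_1 holds the log-ratio factor, and
    # the pd.eval() calls are just a vectorised way of writing these formulas.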
230 | tt_sum = topic_term_dists.sum() 231 | topic_given_term = pd.eval("topic_term_dists / tt_sum") 232 | log_1 = np.log(pd.eval("topic_given_term.T / topic_proportion")) 233 | kernel = pd.eval("topic_given_term * log_1.T") 234 | distinctiveness = kernel.sum() 235 | saliency = term_proportion * distinctiveness 236 | # Order the terms for the "default" view by decreasing saliency: 237 | default_term_info = pd.DataFrame({ 238 | 'saliency': saliency, 239 | 'Term': vocab, 240 | 'Freq': term_frequency, 241 | 'Total': term_frequency, 242 | 'Category': 'Default'}) 243 | default_term_info = default_term_info.sort_values( 244 | by='saliency', ascending=False).head(R).drop('saliency', axis=1) 245 | # Rounding Freq and Total to integer values to match LDAvis code: 246 | default_term_info['Freq'] = np.floor(default_term_info['Freq']) 247 | default_term_info['Total'] = np.floor(default_term_info['Total']) 248 | ranks = np.arange(R, 0, -1) 249 | default_term_info['logprob'] = default_term_info['loglift'] = ranks 250 | default_term_info = default_term_info.reindex(columns=[ 251 | "Term", "Freq", "Total", "Category", "logprob", "loglift" 252 | ]) 253 | 254 | # compute relevance and top terms for each topic 255 | log_lift = np.log(pd.eval("topic_term_dists / term_proportion")).astype("float64") 256 | log_ttd = np.log(pd.eval("topic_term_dists")).astype("float64") 257 | lambda_seq = np.arange(0, 1 + lambda_step, lambda_step) 258 | 259 | def topic_top_term_df(tup): 260 | new_topic_id, (original_topic_id, topic_terms) = tup 261 | term_ix = topic_terms.unique() 262 | df = pd.DataFrame({'Term': vocab[term_ix], 263 | 'Freq': term_topic_freq.loc[original_topic_id, term_ix], 264 | 'Total': term_frequency[term_ix], 265 | 'Category': 'Topic%d' % new_topic_id, 266 | 'logprob': log_ttd.loc[original_topic_id, term_ix].round(4), 267 | 'loglift': log_lift.loc[original_topic_id, term_ix].round(4), 268 | }) 269 | return df.reindex(columns=[ 270 | "Term", "Freq", "Total", "Category", "logprob", "loglift" 271 | ]) 272 | 273 | top_terms = pd.concat(Parallel(n_jobs=n_jobs) 274 | (delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls) 275 | for ls in _job_chunks(lambda_seq, n_jobs))) 276 | topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), start_index)) 277 | return pd.concat([default_term_info] + list(topic_dfs)) 278 | 279 | 280 | def _token_table(topic_info, term_topic_freq, vocab, term_frequency, start_index=1): 281 | # last, to compute the areas of the circles when a term is highlighted 282 | # we must gather all unique terms that could show up (for every combination 283 | # of topic and value of lambda) and compute its distribution over topics. 
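    # The resulting table has one row per (topic, term) pair with columns
    # 'Topic', 'Freq' and 'Term'; after the normalisation below, 'Freq' is the
    # share of a term's total frequency that falls within each topic, which the
    # front end uses to resize the topic circles when a term is selected.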
284 | 285 | # term-topic frequency table of unique terms across all topics and all values of lambda 286 | term_ix = topic_info.index.unique() 287 | term_ix = np.sort(term_ix) 288 | 289 | top_topic_terms_freq = term_topic_freq[term_ix] 290 | # use the new ordering for the topics 291 | K = len(term_topic_freq) 292 | top_topic_terms_freq.index = range(start_index, K + start_index) 293 | top_topic_terms_freq.index.name = 'Topic' 294 | 295 | # we filter to Freq >= 0.5 to avoid sending too much data to the browser 296 | token_table = pd.DataFrame({'Freq': top_topic_terms_freq.unstack()})\ 297 | .reset_index().set_index('term').query('Freq >= 0.5') 298 | 299 | token_table['Freq'] = token_table['Freq'].round() 300 | token_table['Term'] = vocab[token_table.index.values].values 301 | # Normalize token frequencies: 302 | token_table['Freq'] = token_table.Freq / term_frequency[token_table.index] 303 | return token_table.sort_values(by=['Term', 'Topic']) 304 | 305 | 306 | def prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, 307 | R=30, lambda_step=0.01, mds=js_PCoA, n_jobs=-1, 308 | plot_opts=None, sort_topics=True, start_index=1): 309 | """Transforms the topic model distributions and related corpus data into 310 | the data structures needed for the visualization. 311 | 312 | Parameters 313 | ---------- 314 | topic_term_dists : array-like, shape (`n_topics`, `n_terms`) 315 | Matrix of topic-term probabilities. Where `n_terms` is `len(vocab)`. 316 | doc_topic_dists : array-like, shape (`n_docs`, `n_topics`) 317 | Matrix of document-topic probabilities. 318 | doc_lengths : array-like, shape `n_docs` 319 | The length of each document, i.e. the number of words in each document. 320 | The order of the numbers should be consistent with the ordering of the 321 | docs in `doc_topic_dists`. 322 | vocab : array-like, shape `n_terms` 323 | List of all the words in the corpus used to train the model. 324 | term_frequency : array-like, shape `n_terms` 325 | The count of each particular term over the entire corpus. The ordering 326 | of these counts should correspond with `vocab` and `topic_term_dists`. 327 | R : int 328 | The number of terms to display in the barcharts of the visualization. 329 | Default is 30. Recommended to be roughly between 10 and 50. 330 | lambda_step : float, between 0 and 1 331 | Determines the interstep distance in the grid of lambda values over 332 | which to iterate when computing relevance. 333 | Default is 0.01. Recommended to be between 0.01 and 0.1. 334 | mds : function or a string representation of function 335 | A function that takes `topic_term_dists` as an input and outputs a 336 | `n_topics` by `2` distance matrix. The output approximates the distance 337 | between topics. See :func:`js_PCoA` for details on the default function. 338 | A string representation currently accepts `pcoa` (or upper case variant), 339 | `mmds` (or upper case variant) and `tsne` (or upper case variant), 340 | if `sklearn` package is installed for the latter two. 341 | n_jobs : int 342 | The number of cores to be used to do the computations. The regular 343 | joblib conventions are followed so `-1`, which is the default, will 344 | use all cores. 345 | plot_opts : dict, with keys 'xlab' and `ylab` 346 | Dictionary of plotting options, right now only used for the axis labels. 347 | sort_topics : sort topics by topic proportion (percentage of tokens covered). Set to false to 348 | to keep original topic order. 349 | start_index: how to number topics for prepared data. 
Defaults to one-based indexing. 350 | Set to 0 for zero-based indexing. 351 | 352 | Returns 353 | ------- 354 | prepared_data : PreparedData 355 | A named tuple containing all the data structures required to create 356 | the visualization. To be passed on to functions like :func:`display`. 357 | This named tuple can be represented as json or a python dictionary. 358 | There is a helper function 'sorted_terms' that can be used to get 359 | the terms of a topic using lambda to rank their relevance. 360 | 361 | 362 | Notes 363 | ----- 364 | This implements the method of `Sievert, C. and Shirley, K. (2014): 365 | LDAvis: A Method for Visualizing and Interpreting Topics, ACL Workshop on 366 | Interactive Language Learning, Visualization, and Interfaces.` 367 | 368 | http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf 369 | 370 | See Also 371 | -------- 372 | :func:`save_json`: save json representation of a figure to file 373 | :func:`save_html` : save html representation of a figure to file 374 | :func:`show` : launch a local server and show a figure in a browser 375 | :func:`display` : embed figure within the IPython notebook 376 | :func:`enable_notebook` : automatically embed visualizations in IPython notebook 377 | """ 378 | if plot_opts is None: 379 | plot_opts = {'xlab': 'PC1', 'ylab': 'PC2'} 380 | 381 | # parse mds 382 | if isinstance(mds, str): 383 | mds = mds.lower() 384 | if mds == 'pcoa': 385 | mds = js_PCoA 386 | elif mds in ('mmds', 'tsne'): 387 | mds_opts = {'mmds': js_MMDS, 'tsne': js_TSNE} 388 | mds = mds_opts[mds] 389 | else: 390 | logging.warning('Unknown mds `%s`, switch to PCoA' % mds) 391 | mds = js_PCoA 392 | 393 | # Conceptually, the items in `topic_term_dists` end up as individual rows in the 394 | # DataFrame, but we can speed up ingestion by treating them as columns and 395 | # transposing at the end. (This is especially true when the number of terms far 396 | # exceeds the number of topics.) 397 | topic_term_dist_cols = [ 398 | pd.Series(topic_term_dist, dtype="float64") 399 | for topic_term_dist in topic_term_dists 400 | ] 401 | topic_term_dists = pd.concat(topic_term_dist_cols, axis=1).T 402 | 403 | topic_term_dists = _df_with_names(topic_term_dists, 'topic', 'term') 404 | doc_topic_dists = _df_with_names(doc_topic_dists, 'doc', 'topic') 405 | term_frequency = _series_with_name(term_frequency, 'term_frequency') 406 | doc_lengths = _series_with_name(doc_lengths, 'doc_length') 407 | vocab = _series_with_name(vocab, 'vocab') 408 | _input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency) 409 | R = min(R, len(vocab)) 410 | 411 | topic_freq = doc_topic_dists.mul(doc_lengths, axis="index").sum() 412 | # topic_freq = np.dot(doc_topic_dists.T, doc_lengths) 413 | if (sort_topics): 414 | topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False) 415 | else: 416 | topic_proportion = (topic_freq / topic_freq.sum()) 417 | 418 | topic_order = topic_proportion.index 419 | # reorder all data based on new ordering of topics 420 | topic_freq = topic_freq[topic_order] 421 | topic_term_dists = topic_term_dists.iloc[topic_order] 422 | # Unused: doc_topic_dists = doc_topic_dists[topic_order] 423 | 424 | # token counts for each term-topic combination (widths of red bars) 425 | term_topic_freq = (topic_term_dists.T * topic_freq).T 426 | # Quick fix for red bar width bug. 
We calculate the 427 | # term frequencies internally, using the topic term distributions and the 428 | # topic frequencies, rather than using the user-supplied term frequencies. 429 | # For a detailed discussion, see: https://github.com/cpsievert/LDAvis/pull/41 430 | term_frequency = np.sum(term_topic_freq, axis=0) 431 | 432 | topic_info = _topic_info(topic_term_dists, topic_proportion, 433 | term_frequency, term_topic_freq, vocab, lambda_step, R, 434 | n_jobs, start_index) 435 | token_table = _token_table(topic_info, term_topic_freq, vocab, term_frequency, start_index) 436 | topic_coordinates = _topic_coordinates(mds, topic_term_dists, topic_proportion, start_index) 437 | client_topic_order = [x + start_index for x in topic_order] 438 | 439 | return PreparedData(topic_coordinates, topic_info, 440 | token_table, R, lambda_step, plot_opts, client_topic_order) 441 | 442 | 443 | class PreparedData(namedtuple('PreparedData', ['topic_coordinates', 'topic_info', 'token_table', 444 | 'R', 'lambda_step', 'plot_opts', 'topic_order'])): 445 | 446 | def sorted_terms(self, topic=1, _lambda=1): 447 | """Returns a dataframe using _lambda to calculate term relevance of a given topic.""" 448 | tdf = pd.DataFrame(self.topic_info[self.topic_info.Category == 'Topic' + str(topic)]) 449 | if _lambda < 0 or _lambda > 1: 450 | _lambda = 1 451 | stdf = tdf.assign(relevance=_lambda * tdf['logprob'] + (1 - _lambda) * tdf['loglift']) 452 | return stdf.sort_values('relevance', ascending=False) 453 | 454 | def to_dict(self): 455 | return {'mdsDat': self.topic_coordinates.to_dict(orient='list'), 456 | 'tinfo': self.topic_info.to_dict(orient='list'), 457 | 'token.table': self.token_table.to_dict(orient='list'), 458 | 'R': self.R, 459 | 'lambda.step': self.lambda_step, 460 | 'plot.opts': self.plot_opts, 461 | 'topic.order': self.topic_order} 462 | 463 | def to_json(self): 464 | return json.dumps(self.to_dict(), cls=NumPyEncoder) 465 | -------------------------------------------------------------------------------- /pyLDAvis/_server.py: -------------------------------------------------------------------------------- 1 | # this file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_server.py 2 | # Copyright (c) 2013, Jake Vanderplas 3 | """ 4 | A Simple server used to serve LDAvis visualizations 5 | """ 6 | import sys 7 | import threading 8 | import webbrowser 9 | import socket 10 | import itertools 11 | import random 12 | from http import server 13 | 14 | IPYTHON_WARNING = """ 15 | Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command 16 | to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook(). 17 | See more information at http://pyLDAvis.github.io/quickstart.html . 
18 | 19 | You must interrupt the kernel to end this command 20 | """ 21 | 22 | 23 | def generate_handler(html, files=None): 24 | if files is None: 25 | files = {} 26 | 27 | class MyHandler(server.BaseHTTPRequestHandler): 28 | def do_GET(self): 29 | """Respond to a GET request.""" 30 | if self.path == '/': 31 | self.send_response(200) 32 | self.send_header("Content-type", "text/html") 33 | self.end_headers() 34 | self.wfile.write("" 35 | "LDAvis" 36 | "\n".encode()) 37 | self.wfile.write(html.encode()) 38 | self.wfile.write("".encode()) 39 | elif self.path in files: 40 | content_type, content = files[self.path] 41 | self.send_response(200) 42 | self.send_header("Content-type", content_type) 43 | self.end_headers() 44 | self.wfile.write(content.encode()) 45 | else: 46 | self.send_error(404) 47 | 48 | return MyHandler 49 | 50 | 51 | def find_open_port(ip, port, n=50): 52 | """Find an open port near the specified port""" 53 | ports = itertools.chain((port + i for i in range(n)), 54 | (port + random.randint(-2 * n, 2 * n))) 55 | 56 | for port in ports: 57 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 58 | result = s.connect_ex((ip, port)) 59 | s.close() 60 | if result != 0: 61 | return port 62 | raise ValueError("no open ports found") 63 | 64 | 65 | def serve(html, ip='127.0.0.1', port=8888, n_retries=50, files=None, 66 | ipython_warning=False, open_browser=True, http_server=None): 67 | """Start a server serving the given HTML, and (optionally) open a 68 | browser 69 | 70 | Parameters 71 | ---------- 72 | html : string 73 | HTML to serve 74 | ip : string (default = '127.0.0.1') 75 | ip address at which the HTML will be served. 76 | port : int (default = 8888) 77 | the port at which to serve the HTML 78 | n_retries : int (default = 50) 79 | the number of nearby ports to search if the specified port is in use. 80 | files : dictionary (optional) 81 | dictionary of extra content to serve 82 | ipython_warning : bool (optional) 83 | if True (default), then print a warning if this is used within IPython 84 | open_browser : bool (optional) 85 | if True (default), then open a web browser to the given HTML 86 | http_server : class (optional) 87 | optionally specify an HTTPServer class to use for showing the 88 | figure. The default is Python's basic HTTPServer. 
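    Examples
    --------
    A minimal sketch, assuming ``prepared`` is a ``PreparedData`` object built
    with :func:`pyLDAvis.prepare`::

        import pyLDAvis

        html = pyLDAvis.prepared_data_to_html(prepared)
        serve(html, port=8889, open_browser=False)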
89 | """ 90 | port = find_open_port(ip, port, n_retries) 91 | Handler = generate_handler(html, files) 92 | 93 | if http_server is None: 94 | srvr = server.HTTPServer((ip, port), Handler) 95 | else: 96 | srvr = http_server((ip, port), Handler) 97 | 98 | if ipython_warning: 99 | print(IPYTHON_WARNING) 100 | 101 | # Start the server 102 | print("Serving to http://{0}:{1}/ [Ctrl-C to exit]".format(ip, port)) 103 | sys.stdout.flush() 104 | 105 | if open_browser: 106 | # Use a thread to open a web browser pointing to the server 107 | b = lambda: webbrowser.open('http://{0}:{1}'.format(ip, port)) 108 | threading.Thread(target=b).start() 109 | 110 | try: 111 | srvr.serve_forever() 112 | except (KeyboardInterrupt, SystemExit): 113 | print("\nstopping Server...") 114 | 115 | srvr.server_close() 116 | -------------------------------------------------------------------------------- /pyLDAvis/gensim_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | pyLDAvis Gensim 3 | =============== 4 | Helper functions to visualize LDA models trained by Gensim 5 | """ 6 | 7 | import funcy as fp 8 | import numpy as np 9 | from scipy.sparse import issparse 10 | import pyLDAvis._prepare 11 | 12 | 13 | def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None): 14 | import gensim 15 | 16 | if not gensim.matutils.ismatrix(corpus): 17 | corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary)) 18 | else: 19 | corpus_csc = corpus 20 | # Need corpus to be a streaming gensim list corpus for len and inference functions below: 21 | corpus = gensim.matutils.Sparse2Corpus(corpus_csc) 22 | 23 | vocab = list(dictionary.token2id.keys()) 24 | # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm.. 25 | # for now, I'll just make sure we don't ever get zeros... 26 | beta = 0.01 27 | fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_) 28 | term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort] 29 | term_freqs[term_freqs == 0] = beta 30 | doc_lengths = corpus_csc.sum(axis=0).A.ravel() 31 | 32 | assert term_freqs.shape[0] == len(dictionary),\ 33 | 'Term frequencies and dictionary have different shape {} != {}'.format( 34 | term_freqs.shape[0], len(dictionary)) 35 | assert doc_lengths.shape[0] == len(corpus),\ 36 | 'Document lengths and corpus have different sizes {} != {}'.format( 37 | doc_lengths.shape[0], len(corpus)) 38 | 39 | if hasattr(topic_model, 'lda_alpha'): 40 | num_topics = len(topic_model.lda_alpha) 41 | else: 42 | num_topics = topic_model.num_topics 43 | 44 | if doc_topic_dists is None: 45 | # If its an HDP model. 
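        # (`lda_beta` is used here as the marker for an HDP-style model; for those,
        # inference() is assumed to return gamma directly, whereas LdaModel.inference()
        # returns a (gamma, sufficient-stats) tuple, which is why only gamma is
        # unpacked in the else branch.)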
46 | if hasattr(topic_model, 'lda_beta'): 47 | gamma = topic_model.inference(corpus) 48 | else: 49 | gamma, _ = topic_model.inference(corpus) 50 | doc_topic_dists = gamma / gamma.sum(axis=1)[:, None] 51 | else: 52 | if isinstance(doc_topic_dists, list): 53 | doc_topic_dists = gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T 54 | elif issparse(doc_topic_dists): 55 | doc_topic_dists = doc_topic_dists.T.todense() 56 | doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1) 57 | 58 | assert doc_topic_dists.shape[1] == num_topics,\ 59 | 'Document topics and number of topics do not match {} != {}'.format( 60 | doc_topic_dists.shape[1], num_topics) 61 | 62 | # get the topic-term distribution straight from gensim without 63 | # iterating over tuples 64 | if hasattr(topic_model, 'lda_beta'): 65 | topic = topic_model.lda_beta 66 | else: 67 | topic = topic_model.state.get_lambda() 68 | topic = topic / topic.sum(axis=1)[:, None] 69 | topic_term_dists = topic[:, fnames_argsort] 70 | 71 | assert topic_term_dists.shape[0] == doc_topic_dists.shape[1] 72 | 73 | return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists, 74 | 'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs} 75 | 76 | 77 | def prepare(topic_model, corpus, dictionary, doc_topic_dist=None, **kwargs): 78 | """Transforms the Gensim TopicModel and related corpus and dictionary into 79 | the data structures needed for the visualization. 80 | 81 | Parameters 82 | ---------- 83 | topic_model : gensim.models.ldamodel.LdaModel 84 | An already trained Gensim LdaModel. The other gensim model types are 85 | not supported (PRs welcome). 86 | 87 | corpus : array-like list of bag of word docs in tuple form or scipy CSC matrix 88 | The corpus in bag of word form, the same docs used to train the model. 89 | The corpus is transformed into a csc matrix internally, if you intend to 90 | call prepare multiple times it is a good idea to first call 91 | `gensim.matutils.corpus2csc(corpus)` and pass in the csc matrix instead. 92 | 93 | For example: [(50, 3), (63, 5), ....] 94 | 95 | dictionary: gensim.corpora.Dictionary 96 | The dictionary object used to create the corpus. Needed to extract the 97 | actual terms (not ids). 98 | 99 | doc_topic_dist (optional): Document topic distribution from LDA (default=None) 100 | The document topic distribution that is eventually visualised, if you will 101 | be calling `prepare` multiple times it's a good idea to explicitly pass in 102 | `doc_topic_dist` as inferring this for large corpora can be quite 103 | expensive. 104 | 105 | **kwargs : 106 | additional keyword arguments are passed through to :func:`pyldavis.prepare`. 107 | 108 | Returns 109 | ------- 110 | prepared_data : PreparedData 111 | the data structures used in the visualization 112 | 113 | Example 114 | -------- 115 | For example usage please see this notebook: 116 | http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb 117 | 118 | See 119 | ------ 120 | See `pyLDAvis.prepare` for **kwargs. 
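    A minimal end-to-end sketch (``texts`` here stands in for your own list of
    tokenised documents)::

        from gensim.corpora import Dictionary
        from gensim.models import LdaModel
        import pyLDAvis
        import pyLDAvis.gensim_models

        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(doc) for doc in texts]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)

        vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
        pyLDAvis.save_html(vis, 'lda_visualization.html')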
121 | """ 122 | opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs) 123 | return pyLDAvis.prepare(**opts) 124 | -------------------------------------------------------------------------------- /pyLDAvis/graphlab.py: -------------------------------------------------------------------------------- 1 | """ 2 | pyLDAvis GraphLab 3 | =============== 4 | Helper functions to visualize GraphLab Create's TopicModel (an implementation of LDA) 5 | """ 6 | 7 | import funcy as fp 8 | import numpy as np 9 | import pandas as pd 10 | import graphlab as gl 11 | import pyLDAvis 12 | 13 | 14 | def _topics_as_df(topic_model): 15 | tdf = topic_model['topics'].to_dataframe() 16 | return pd.DataFrame(np.vstack(tdf['topic_probabilities'].values), index=tdf['vocabulary']) 17 | 18 | 19 | def _sum_sarray_dicts(sarray): 20 | counts_sf = gl.SFrame({ 21 | 'count_dicts': sarray}).stack('count_dicts').groupby( 22 | key_columns='X1', 23 | operations={'count': gl.aggregate.SUM('X2')}) 24 | return counts_sf.unstack(column=['X1', 'count'])[0].values()[0] 25 | 26 | 27 | def _extract_doc_data(docs): 28 | doc_lengths = list(docs.apply(lambda d: np.array(d.values()).sum())) 29 | term_freqs_dict = _sum_sarray_dicts(docs) 30 | 31 | vocab = term_freqs_dict.keys() 32 | term_freqs = term_freqs_dict.values() 33 | 34 | return {'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs} 35 | 36 | 37 | def _extract_model_data(topic_model, docs, vocab): 38 | doc_topic_dists = np.vstack(topic_model.predict(docs, output_type='probabilities')) 39 | 40 | topics = _topics_as_df(topic_model) 41 | topic_term_dists = topics.T[vocab].values 42 | 43 | return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists} 44 | 45 | 46 | def _extract_data(topic_model, docs): 47 | doc_data = _extract_doc_data(docs) 48 | model_data = _extract_model_data(topic_model, docs, doc_data['vocab']) 49 | return fp.merge(doc_data, model_data) 50 | 51 | 52 | def prepare(topic_model, docs, **kargs): 53 | """Transforms the GraphLab TopicModel and related corpus data into 54 | the data structures needed for the visualization. 55 | 56 | Parameters 57 | ---------- 58 | topic_model : graphlab.toolkits.topic_model.topic_model.TopicModel 59 | An already trained GraphLab topic model. 60 | docs : SArray of dicts 61 | The corpus in bag of word form, the same docs used to train the model. 62 | **kwargs : 63 | additional keyword arguments are passed through to :func:`pyldavis.prepare`. 
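A minimal usage sketch; ``topic_model`` and ``docs`` are placeholders for an already trained GraphLab Create topic model and the SArray of bag-of-words dicts it was trained on (both are assumed to exist, they are not created by this snippet)::

    import pyLDAvis
    import pyLDAvis.graphlab

    # topic_model and docs are assumed to exist already (see the parameters above).
    vis_data = pyLDAvis.graphlab.prepare(topic_model, docs)
    pyLDAvis.display(vis_data)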
64 | 65 | Returns 66 | ------- 67 | prepared_data : PreparedData 68 | the data structures used in the visualization 69 | 70 | Example 71 | -------- 72 | For example usage please see this notebook: 73 | http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/GraphLab.ipynb 74 | """ 75 | opts = fp.merge(_extract_data(topic_model, docs), kargs) 76 | return pyLDAvis.prepare(**opts) 77 | -------------------------------------------------------------------------------- /pyLDAvis/js/ldavis.css: -------------------------------------------------------------------------------- 1 | /* Taken from https://github.com/cpsievert/LDAvis */ 2 | /* Copyright 2013, AT&T Intellectual Property */ 3 | /* MIT Licence */ 4 | 5 | path { 6 | fill: none; 7 | stroke: none; 8 | } 9 | 10 | .xaxis .tick.major { 11 | fill: black; 12 | stroke: black; 13 | stroke-width: 0.1; 14 | opacity: 0.7; 15 | } 16 | 17 | .slideraxis { 18 | fill: black; 19 | stroke: black; 20 | stroke-width: 0.4; 21 | opacity: 1; 22 | } 23 | 24 | text { 25 | font-family: sans-serif; 26 | font-size: 11px; 27 | } 28 | 29 | /* IPython Notebook CSS to allow visualization to fit */ 30 | /* I'm open to a better way of accomplishing this goal... */ 31 | .container { width:1350px !important; } 32 | /* This is for nbviewer's benefit since the above wasn't enough... */ 33 | .output_area { width:1450px !important; } 34 | -------------------------------------------------------------------------------- /pyLDAvis/js/ldavis.js: -------------------------------------------------------------------------------- 1 | /* Original code taken from https://github.com/cpsievert/LDAvis */ 2 | /* Copyright 2013, AT&T Intellectual Property */ 3 | /* MIT Licence */ 4 | 5 | 'use strict'; 6 | 7 | var LDAvis = function(to_select, data_or_file_name, color1, color2) { 8 | 9 | // This section sets up the logic for event handling 10 | var current_clicked = { 11 | what: "nothing", 12 | element: undefined 13 | }, 14 | current_hover = { 15 | what: "nothing", 16 | element: undefined 17 | }, 18 | old_winning_state = { 19 | what: "nothing", 20 | element: undefined 21 | }, 22 | vis_state = { 23 | lambda: 1, 24 | topic: 0, 25 | term: "" 26 | }; 27 | 28 | // Set up a few 'global' variables to hold the data: 29 | var K, // number of topics 30 | R, // number of terms to display in bar chart 31 | mdsData, // (x,y) locations and topic proportions 32 | mdsData3, // topic proportions for all terms in the viz 33 | lamData, // all terms that are among the top-R most relevant for all topics, lambda values 34 | lambda = { 35 | old: 1, 36 | current: 1 37 | }, 38 | color1 = typeof color1 !=='undefined' ? color1: "#1f77b4", // baseline color for default topic circles and overall term frequencies 39 | color2 = typeof color2 !=='undefined' ? 
color2: "#d62728"; // 'highlight' color for selected topics and term-topic frequencies 40 | 41 | // Set the duration of each half of the transition: 42 | var duration = 750; 43 | 44 | // Set global margins used for everything 45 | var margin = { 46 | top: 30, 47 | right: 30, 48 | bottom: 70, 49 | left: 30 50 | }, 51 | 52 | mdswidth = 530, 53 | mdsheight = 530, 54 | barwidth = 530, 55 | barheight = 530, 56 | termwidth = 90, // width to add between two panels to display terms 57 | mdsarea = mdsheight * mdswidth; 58 | // controls how big the maximum circle can be 59 | // doesn't depend on data, only on mds width and height: 60 | var rMax = 60; 61 | 62 | // proportion of area of MDS plot to which the sum of default topic circle areas is set 63 | var circle_prop = 0.25; 64 | var word_prop = 0.25; 65 | 66 | // opacity of topic circles: 67 | var base_opacity = 0.2, 68 | highlight_opacity = 0.6; 69 | 70 | // topic/lambda selection names are specific to *this* vis 71 | var topic_select = to_select + "-topic"; 72 | var lambda_select = to_select + "-lambda"; 73 | 74 | // get rid of the # in the to_select (useful) for setting ID values 75 | var visID = to_select.replace("#", ""); 76 | var topicID = visID + "-topic"; 77 | var lambdaID = visID + "-lambda"; 78 | var termID = visID + "-term"; 79 | var topicDown = topicID + "-down"; 80 | var topicUp = topicID + "-up"; 81 | var topicClear = topicID + "-clear"; 82 | 83 | var leftPanelID = visID + "-leftpanel"; 84 | var barFreqsID = visID + "-bar-freqs"; 85 | var topID = visID + "-top"; 86 | var lambdaInputID = visID + "-lambdaInput"; 87 | var lambdaZeroID = visID + "-lambdaZero"; 88 | var sliderDivID = visID + "-sliderdiv"; 89 | var lambdaLabelID = visID + "-lamlabel"; 90 | 91 | ////////////////////////////////////////////////////////////////////////////// 92 | 93 | // sort array according to a specified object key name 94 | // Note that default is decreasing sort, set decreasing = -1 for increasing 95 | // adapted from http://stackoverflow.com/questions/16648076/sort-array-on-key-value 96 | function fancysort(key_name, decreasing) { 97 | decreasing = (typeof decreasing === "undefined") ? 1 : decreasing; 98 | return function(a, b) { 99 | if (a[key_name] < b[key_name]) 100 | return 1 * decreasing; 101 | if (a[key_name] > b[key_name]) 102 | return -1 * decreasing; 103 | return 0; 104 | }; 105 | } 106 | 107 | 108 | function visualize(data) { 109 | // set the number of topics to global variable K: 110 | K = data['mdsDat'].x.length; 111 | 112 | // R is the number of top relevant (or salient) words whose bars we display 113 | R = Math.min(data['R'], 30); 114 | 115 | // a (K x 5) matrix with columns x, y, topics, Freq, cluster (where x and y are locations for left panel) 116 | mdsData = []; 117 | for (var i = 0; i < K; i++) { 118 | var obj = {}; 119 | for (var key in data['mdsDat']) { 120 | obj[key] = data['mdsDat'][key][i]; 121 | } 122 | mdsData.push(obj); 123 | } 124 | 125 | // a huge matrix with 3 columns: Term, Topic, Freq, where Freq is all non-zero probabilities of topics given terms 126 | // for the terms that appear in the bar-charts for this data 127 | mdsData3 = []; 128 | for (var i = 0; i < data['token.table'].Term.length; i++) { 129 | var obj = {}; 130 | for (var key in data['token.table']) { 131 | obj[key] = data['token.table'][key][i]; 132 | } 133 | mdsData3.push(obj); 134 | }; 135 | 136 | // large data for the widths of bars in bar-charts. 
6 columns: Term, logprob, loglift, Freq, Total, Category 137 | // Contains all possible terms for topics in (1, 2, ..., k) and lambda in the user-supplied grid of lambda values 138 | // which defaults to (0, 0.01, 0.02, ..., 0.99, 1). 139 | lamData = []; 140 | for (var i = 0; i < data['tinfo'].Term.length; i++) { 141 | var obj = {}; 142 | for (var key in data['tinfo']) { 143 | obj[key] = data['tinfo'][key][i]; 144 | } 145 | lamData.push(obj); 146 | } 147 | var dat3 = lamData.slice(0, R); 148 | 149 | // Create the topic input & lambda slider forms. Inspired from: 150 | // http://bl.ocks.org/d3noob/10632804 151 | // http://bl.ocks.org/d3noob/10633704 152 | init_forms(topicID, lambdaID, visID); 153 | 154 | // When the value of lambda changes, update the visualization 155 | console.log('lambda_select', lambda_select); 156 | d3.select(lambda_select) 157 | .on("mouseup", function() { 158 | console.log('lambda_select mouseup'); 159 | // store the previous lambda value 160 | lambda.old = lambda.current; 161 | lambda.current = document.getElementById(lambdaID).value; 162 | vis_state.lambda = +this.value; 163 | // adjust the text on the range slider 164 | d3.select(lambda_select).property("value", vis_state.lambda); 165 | d3.select(lambda_select + "-value").text(vis_state.lambda); 166 | // transition the order of the bars 167 | var increased = lambda.old < vis_state.lambda; 168 | if (vis_state.topic > 0) reorder_bars(increased); 169 | // store the current lambda value 170 | state_save(true); 171 | document.getElementById(lambdaID).value = vis_state.lambda; 172 | }); 173 | 174 | d3.select("#" + topicUp) 175 | .on("click", function() { 176 | // remove term selection if it exists (from a saved URL) 177 | var termElem = document.getElementById(termID + vis_state.term); 178 | if (termElem !== undefined) term_off(termElem); 179 | vis_state.term = ""; 180 | var value_old = document.getElementById(topicID).value; 181 | var value_new = Math.min(K, +value_old + 1).toFixed(0); 182 | // increment the value in the input box 183 | document.getElementById(topicID).value = value_new; 184 | topic_off(document.getElementById(topicID + value_old)); 185 | topic_on(document.getElementById(topicID + value_new)); 186 | vis_state.topic = value_new; 187 | state_save(true); 188 | }); 189 | 190 | d3.select("#" + topicDown) 191 | .on("click", function() { 192 | // remove term selection if it exists (from a saved URL) 193 | var termElem = document.getElementById(termID + vis_state.term); 194 | if (termElem !== undefined) term_off(termElem); 195 | vis_state.term = ""; 196 | var value_old = document.getElementById(topicID).value; 197 | var value_new = Math.max(0, +value_old - 1).toFixed(0); 198 | // increment the value in the input box 199 | document.getElementById(topicID).value = value_new; 200 | topic_off(document.getElementById(topicID + value_old)); 201 | topic_on(document.getElementById(topicID + value_new)); 202 | vis_state.topic = value_new; 203 | state_save(true); 204 | }); 205 | 206 | d3.select("#" + topicID) 207 | .on("keyup", function() { 208 | // remove term selection if it exists (from a saved URL) 209 | var termElem = document.getElementById(termID + vis_state.term); 210 | if (termElem !== undefined) term_off(termElem); 211 | vis_state.term = ""; 212 | topic_off(document.getElementById(topicID + vis_state.topic)); 213 | var value_new = document.getElementById(topicID).value; 214 | if (!isNaN(value_new) && value_new > 0) { 215 | value_new = Math.min(K, Math.max(1, value_new)); 216 | 
topic_on(document.getElementById(topicID + value_new)); 217 | vis_state.topic = value_new; 218 | state_save(true); 219 | document.getElementById(topicID).value = vis_state.topic; 220 | } 221 | }); 222 | 223 | d3.select("#" + topicClear) 224 | .on("click", function() { 225 | state_reset(); 226 | state_save(true); 227 | }); 228 | 229 | // create linear scaling to pixels (and add some padding on outer region of scatterplot) 230 | var xrange = d3.extent(mdsData, function(d) { 231 | return d.x; 232 | }); //d3.extent returns min and max of an array 233 | var xdiff = xrange[1] - xrange[0], 234 | xpad = 0.05; 235 | var yrange = d3.extent(mdsData, function(d) { 236 | return d.y; 237 | }); 238 | var ydiff = yrange[1] - yrange[0], 239 | ypad = 0.05; 240 | 241 | if (xdiff > ydiff) { 242 | var xScale = d3.scaleLinear() 243 | .range([0, mdswidth]) 244 | .domain([xrange[0] - xpad * xdiff, xrange[1] + xpad * xdiff]); 245 | 246 | var yScale = d3.scaleLinear() 247 | .range([mdsheight, 0]) 248 | .domain([yrange[0] - 0.5*(xdiff - ydiff) - ypad*xdiff, yrange[1] + 0.5*(xdiff - ydiff) + ypad*xdiff]); 249 | } else { 250 | var xScale = d3.scaleLinear() 251 | .range([0, mdswidth]) 252 | .domain([xrange[0] - 0.5*(ydiff - xdiff) - xpad*ydiff, xrange[1] + 0.5*(ydiff - xdiff) + xpad*ydiff]); 253 | 254 | var yScale = d3.scaleLinear() 255 | .range([mdsheight, 0]) 256 | .domain([yrange[0] - ypad * ydiff, yrange[1] + ypad * ydiff]); 257 | } 258 | 259 | // Create new svg element (that will contain everything): 260 | var svg = d3.select(to_select).append("svg") 261 | .attr("width", mdswidth + barwidth + margin.left + termwidth + margin.right) 262 | .attr("height", mdsheight + 2 * margin.top + margin.bottom + 2 * rMax); 263 | 264 | // Create a group for the mds plot 265 | var mdsplot = svg.append("g") 266 | .attr("id", leftPanelID) 267 | .attr("class", "points") 268 | .attr("transform", "translate(" + margin.left + "," + 2 * margin.top + ")"); 269 | 270 | // Clicking on the mdsplot should clear the selection 271 | mdsplot.append("rect") 272 | .attr("x", 0) 273 | .attr("y", 0) 274 | .attr("height", mdsheight) 275 | .attr("width", mdswidth) 276 | .style("fill", color1) 277 | .attr("opacity", 0) 278 | .on("click", function() { 279 | state_reset(); 280 | state_save(true); 281 | }); 282 | 283 | mdsplot.append("line") // draw x-axis 284 | .attr("x1", 0) 285 | .attr("x2", mdswidth) 286 | .attr("y1", mdsheight / 2) 287 | .attr("y2", mdsheight / 2) 288 | .attr("stroke", "gray") 289 | .attr("opacity", 0.3); 290 | mdsplot.append("text") // label x-axis 291 | .attr("x", 0) 292 | .attr("y", mdsheight/2 - 5) 293 | .text(data['plot.opts'].xlab) 294 | .attr("fill", "gray"); 295 | 296 | mdsplot.append("line") // draw y-axis 297 | .attr("x1", mdswidth / 2) 298 | .attr("x2", mdswidth / 2) 299 | .attr("y1", 0) 300 | .attr("y2", mdsheight) 301 | .attr("stroke", "gray") 302 | .attr("opacity", 0.3); 303 | mdsplot.append("text") // label y-axis 304 | .attr("x", mdswidth/2 + 5) 305 | .attr("y", 7) 306 | .text(data['plot.opts'].ylab) 307 | .attr("fill", "gray"); 308 | 309 | // new definitions based on fixing the sum of the areas of the default topic circles: 310 | var newSmall = Math.sqrt(0.02*mdsarea*circle_prop/Math.PI); 311 | var newMedium = Math.sqrt(0.05*mdsarea*circle_prop/Math.PI); 312 | var newLarge = Math.sqrt(0.10*mdsarea*circle_prop/Math.PI); 313 | var cx = 10 + newLarge, 314 | cx2 = cx + 1.5 * newLarge; 315 | 316 | // circle guide inspired from 317 | // 
http://www.nytimes.com/interactive/2012/02/13/us/politics/2013-budget-proposal-graphic.html?_r=0 318 | var circleGuide = function(rSize, size) { 319 | d3.select("#" + leftPanelID).append("circle") 320 | .attr('class', "circleGuide" + size) 321 | .attr('r', rSize) 322 | .attr('cx', cx) 323 | .attr('cy', mdsheight + rSize) 324 | .style('fill', 'none') 325 | .style('stroke-dasharray', '2 2') 326 | .style('stroke', '#999'); 327 | d3.select("#" + leftPanelID).append("line") 328 | .attr('class', "lineGuide" + size) 329 | .attr("x1", cx) 330 | .attr("x2", cx2) 331 | .attr("y1", mdsheight + 2 * rSize) 332 | .attr("y2", mdsheight + 2 * rSize) 333 | .style("stroke", "gray") 334 | .style("opacity", 0.3); 335 | }; 336 | 337 | circleGuide(newSmall, "Small"); 338 | circleGuide(newMedium, "Medium"); 339 | circleGuide(newLarge, "Large"); 340 | 341 | var defaultLabelSmall = "2%"; 342 | var defaultLabelMedium = "5%"; 343 | var defaultLabelLarge = "10%"; 344 | 345 | d3.select("#" + leftPanelID).append("text") 346 | .attr("x", 10) 347 | .attr("y", mdsheight - 10) 348 | .attr('class', "circleGuideTitle") 349 | .style("text-anchor", "left") 350 | .style("fontWeight", "bold") 351 | .text("Marginal topic distribution"); 352 | d3.select("#" + leftPanelID).append("text") 353 | .attr("x", cx2 + 10) 354 | .attr("y", mdsheight + 2 * newSmall) 355 | .attr('class', "circleGuideLabelSmall") 356 | .style("text-anchor", "start") 357 | .text(defaultLabelSmall); 358 | d3.select("#" + leftPanelID).append("text") 359 | .attr("x", cx2 + 10) 360 | .attr("y", mdsheight + 2 * newMedium) 361 | .attr('class', "circleGuideLabelMedium") 362 | .style("text-anchor", "start") 363 | .text(defaultLabelMedium); 364 | d3.select("#" + leftPanelID).append("text") 365 | .attr("x", cx2 + 10) 366 | .attr("y", mdsheight + 2 * newLarge) 367 | .attr('class', "circleGuideLabelLarge") 368 | .style("text-anchor", "start") 369 | .text(defaultLabelLarge); 370 | 371 | // bind mdsData to the points in the left panel: 372 | var points = mdsplot.selectAll("points") 373 | .data(mdsData) 374 | .enter(); 375 | 376 | // text to indicate topic 377 | points.append("text") 378 | .attr("class", "txt") 379 | .attr("x", function(d) { 380 | return (xScale(+d.x)); 381 | }) 382 | .attr("y", function(d) { 383 | return (yScale(+d.y) + 4); 384 | }) 385 | .attr("stroke", "black") 386 | .attr("opacity", 1) 387 | .style("text-anchor", "middle") 388 | .style("font-size", "11px") 389 | .style("fontWeight", 100) 390 | .text(function(d) { 391 | return d.topics; 392 | }); 393 | 394 | // draw circles 395 | points.append("circle") 396 | .attr("class", "dot") 397 | .style("opacity", 0.2) 398 | .style("fill", color1) 399 | .attr("r", function(d) { 400 | return (Math.sqrt((d.Freq/100)*mdswidth*mdsheight*circle_prop/Math.PI)); 401 | }) 402 | .attr("cx", function(d) { 403 | return (xScale(+d.x)); 404 | }) 405 | .attr("cy", function(d) { 406 | return (yScale(+d.y)); 407 | }) 408 | .attr("stroke", "black") 409 | .attr("id", function(d) { 410 | return (topicID + d.topics); 411 | }) 412 | .on("mouseover", function(d) { 413 | var old_topic = topicID + vis_state.topic; 414 | if (vis_state.topic > 0 && old_topic != this.id) { 415 | topic_off(document.getElementById(old_topic)); 416 | } 417 | topic_on(this); 418 | }) 419 | .on("click", function(d) { 420 | // prevent click event defined on the div container from firing 421 | // http://bl.ocks.org/jasondavies/3186840 422 | d3.event.stopPropagation(); 423 | var old_topic = topicID + vis_state.topic; 424 | if (vis_state.topic > 0 && old_topic != 
this.id) { 425 | topic_off(document.getElementById(old_topic)); 426 | } 427 | // make sure topic input box value and fragment reflects clicked selection 428 | document.getElementById(topicID).value = vis_state.topic = d.topics; 429 | state_save(true); 430 | topic_on(this); 431 | }) 432 | .on("mouseout", function(d) { 433 | if (vis_state.topic != d.topics) topic_off(this); 434 | if (vis_state.topic > 0) topic_on(document.getElementById(topicID + vis_state.topic)); 435 | }); 436 | 437 | svg.append("text") 438 | .text("Intertopic Distance Map (via multidimensional scaling)") 439 | .attr("x", mdswidth/2 + margin.left) 440 | .attr("y", 30) 441 | .style("font-size", "16px") 442 | .style("text-anchor", "middle"); 443 | 444 | // establish layout and vars for bar chart 445 | var barDefault2 = dat3.filter(function(d) { 446 | return d.Category == "Default"; 447 | }); 448 | 449 | var y = d3.scaleBand() 450 | .domain(barDefault2.map(function(d) { 451 | return d.Term; 452 | })) 453 | .rangeRound([0, barheight]) 454 | .padding(0.15); 455 | 456 | var x = d3.scaleLinear() 457 | .domain([1, d3.max(barDefault2, function(d) { 458 | return d.Total; 459 | })]) 460 | .range([0, barwidth]) 461 | .nice(); 462 | var yAxis = d3.axisLeft(y); 463 | 464 | // Add a group for the bar chart 465 | var chart = svg.append("g") 466 | .attr("transform", "translate(" + +(mdswidth + margin.left + termwidth) + "," + 2 * margin.top + ")") 467 | .attr("id", barFreqsID); 468 | 469 | // bar chart legend/guide: 470 | var barguide = {"width": 100, "height": 15}; 471 | d3.select("#" + barFreqsID).append("rect") 472 | .attr("x", 0) 473 | .attr("y", mdsheight + 10) 474 | .attr("height", barguide.height) 475 | .attr("width", barguide.width) 476 | .style("fill", color1) 477 | .attr("opacity", 0.4); 478 | d3.select("#" + barFreqsID).append("text") 479 | .attr("x", barguide.width + 5) 480 | .attr("y", mdsheight + 10 + barguide.height/2) 481 | .style("dominant-baseline", "middle") 482 | .text("Overall term frequency"); 483 | 484 | d3.select("#" + barFreqsID).append("rect") 485 | .attr("x", 0) 486 | .attr("y", mdsheight + 10 + barguide.height + 5) 487 | .attr("height", barguide.height) 488 | .attr("width", barguide.width/2) 489 | .style("fill", color2) 490 | .attr("opacity", 0.8); 491 | d3.select("#" + barFreqsID).append("text") 492 | .attr("x", barguide.width/2 + 5) 493 | .attr("y", mdsheight + 10 + (3/2)*barguide.height + 5) 494 | .style("dominant-baseline", "middle") 495 | .text("Estimated term frequency within the selected topic"); 496 | 497 | // footnotes: 498 | d3.select("#" + barFreqsID) 499 | .append("a") 500 | .attr("xlink:href", "http://vis.stanford.edu/files/2012-Termite-AVI.pdf") 501 | .attr("target", "_blank") 502 | .append("text") 503 | .attr("x", 0) 504 | .attr("y", mdsheight + 10 + (6/2)*barguide.height + 5) 505 | .style("dominant-baseline", "middle") 506 | .text("1. saliency(term w) = frequency(w) * [sum_t p(t | w) * log(p(t | w)/p(t))] for topics t; see Chuang et. al (2012)"); 507 | d3.select("#" + barFreqsID) 508 | .append("a") 509 | .attr("xlink:href", "http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf") 510 | .attr("target", "_blank") 511 | .append("text") 512 | .attr("x", 0) 513 | .attr("y", mdsheight + 10 + (8/2)*barguide.height + 5) 514 | .style("dominant-baseline", "middle") 515 | .text("2. 
relevance(term w | topic t) = \u03BB * p(w | t) + (1 - \u03BB) * p(w | t)/p(w); see Sievert & Shirley (2014)"); 516 | 517 | // Bind 'default' data to 'default' bar chart 518 | var basebars = chart.selectAll(to_select + " .bar-totals") 519 | .data(barDefault2) 520 | .enter(); 521 | 522 | // Draw the gray background bars defining the overall frequency of each word 523 | basebars.append("rect") 524 | .attr("class", "bar-totals") 525 | .attr("x", 0) 526 | .attr("y", function(d) { 527 | return y(d.Term); 528 | }) 529 | .attr("height", y.bandwidth()) 530 | .attr("width", function(d) { 531 | return x(d.Total); 532 | }) 533 | .style("fill", color1) 534 | .attr("opacity", 0.4); 535 | 536 | // Add word labels to the side of each bar 537 | basebars.append("text") 538 | .attr("x", -5) 539 | .attr("class", "terms") 540 | .attr("y", function(d) { 541 | return y(d.Term) + 12; 542 | }) 543 | .attr("cursor", "pointer") 544 | .attr("id", function(d) { 545 | return (termID + d.Term); 546 | }) 547 | .style("text-anchor", "end") // right align text - use 'middle' for center alignment 548 | .text(function(d) { 549 | return d.Term; 550 | }) 551 | .on("mouseover", function() { 552 | term_hover(this); 553 | }) 554 | .on("mouseout", function() { 555 | vis_state.term = ""; 556 | term_off(this); 557 | state_save(true); 558 | }); 559 | 560 | var title = chart.append("text") 561 | .attr("x", barwidth/2) 562 | .attr("y", -30) 563 | .attr("class", "bubble-tool") // set class so we can remove it when highlight_off is called 564 | .style("text-anchor", "middle") 565 | .style("font-size", "16px") 566 | .text("Top-" + R + " Most Salient Terms"); 567 | 568 | title.append("tspan") 569 | .attr("baseline-shift", "super") 570 | .attr("font-size", "12px") 571 | .text("(1)"); 572 | 573 | // barchart axis adapted from http://bl.ocks.org/mbostock/1166403 574 | var xAxis = d3.axisTop(x) 575 | .tickSize(-barheight) 576 | .ticks(6); 577 | 578 | // dynamically create the topic and lambda input forms at the top of the page: 579 | function init_forms(topicID, lambdaID, visID) { 580 | 581 | // create container div for topic and lambda input: 582 | var inputDiv = document.createElement("div"); 583 | inputDiv.setAttribute("id", topID); 584 | inputDiv.setAttribute("style", "width: 1210px"); // to match the width of the main svg element 585 | document.getElementById(visID).appendChild(inputDiv); 586 | 587 | // topic input container: 588 | var topicDiv = document.createElement("div"); 589 | topicDiv.setAttribute("style", "padding: 5px; background-color: #e8e8e8; display: inline-block; width: " + mdswidth + "px; height: 50px; float: left"); 590 | inputDiv.appendChild(topicDiv); 591 | 592 | var topicLabel = document.createElement("label"); 593 | topicLabel.setAttribute("for", topicID); 594 | topicLabel.setAttribute("style", "font-family: sans-serif; font-size: 14px"); 595 | topicLabel.innerHTML = "Selected Topic: "; 596 | topicDiv.appendChild(topicLabel); 597 | 598 | var topicInput = document.createElement("input"); 599 | topicInput.setAttribute("style", "width: 50px"); 600 | topicInput.type = "text"; 601 | topicInput.min = "0"; 602 | topicInput.max = K; // assumes the data has already been read in 603 | topicInput.value = "0"; // a value of 0 indicates no topic is selected 604 | topicInput.step = "1"; 605 | topicInput.id = topicID; 606 | topicDiv.appendChild(topicInput); 607 | 608 | var previous = document.createElement("button"); 609 | previous.setAttribute("id", topicDown); 610 | previous.setAttribute("style", "margin-left: 5px"); 611 | 
previous.innerHTML = "Previous Topic"; 612 | topicDiv.appendChild(previous); 613 | 614 | var next = document.createElement("button"); 615 | next.setAttribute("id", topicUp); 616 | next.setAttribute("style", "margin-left: 5px"); 617 | next.innerHTML = "Next Topic"; 618 | topicDiv.appendChild(next); 619 | 620 | var clear = document.createElement("button"); 621 | clear.setAttribute("id", topicClear); 622 | clear.setAttribute("style", "margin-left: 5px"); 623 | clear.innerHTML = "Clear Topic"; 624 | topicDiv.appendChild(clear); 625 | 626 | // lambda inputs 627 | var lambdaDivWidth = barwidth; 628 | var lambdaDiv = document.createElement("div"); 629 | lambdaDiv.setAttribute("id", lambdaInputID); 630 | lambdaDiv.setAttribute("style", "padding: 5px; background-color: #e8e8e8; display: inline-block; height: 50px; width: " + lambdaDivWidth + "px; float: right; margin-right: 30px"); 631 | inputDiv.appendChild(lambdaDiv); 632 | 633 | var lambdaZero = document.createElement("div"); 634 | lambdaZero.setAttribute("style", "padding: 5px; height: 20px; width: 220px; font-family: sans-serif; float: left"); 635 | lambdaZero.setAttribute("id", lambdaZeroID); 636 | lambdaDiv.appendChild(lambdaZero); 637 | var xx = d3.select("#" + lambdaZeroID) 638 | .append("text") 639 | .attr("x", 0) 640 | .attr("y", 0) 641 | .style("font-size", "14px") 642 | .text("Slide to adjust relevance metric:"); 643 | var yy = d3.select("#" + lambdaZeroID) 644 | .append("text") 645 | .attr("x", 125) 646 | .attr("y", -5) 647 | .style("font-size", "10px") 648 | .style("position", "absolute") 649 | .text("(2)"); 650 | 651 | var sliderDiv = document.createElement("div"); 652 | sliderDiv.setAttribute("id", sliderDivID); 653 | sliderDiv.setAttribute("style", "padding: 5px; height: 40px; width: 250px; float: right; margin-top: -5px; margin-right: 10px"); 654 | lambdaDiv.appendChild(sliderDiv); 655 | 656 | var lambdaInput = document.createElement("input"); 657 | lambdaInput.setAttribute("style", "width: 250px; margin-left: 0px; margin-right: 0px"); 658 | lambdaInput.type = "range"; 659 | lambdaInput.min = 0; 660 | lambdaInput.max = 1; 661 | lambdaInput.step = data['lambda.step']; 662 | lambdaInput.value = vis_state.lambda; 663 | lambdaInput.id = lambdaID; 664 | lambdaInput.setAttribute("list", "ticks"); // to enable automatic ticks (with no labels, see below) 665 | sliderDiv.appendChild(lambdaInput); 666 | 667 | var lambdaLabel = document.createElement("label"); 668 | lambdaLabel.setAttribute("id", lambdaLabelID); 669 | lambdaLabel.setAttribute("for", lambdaID); 670 | lambdaLabel.setAttribute("style", "height: 20px; width: 60px; font-family: sans-serif; font-size: 14px; margin-left: 80px"); 671 | lambdaLabel.innerHTML = "λ = " + vis_state.lambda + ""; 672 | lambdaDiv.appendChild(lambdaLabel); 673 | 674 | // Create the svg to contain the slider scale: 675 | var scaleContainer = d3.select("#" + sliderDivID).append("svg") 676 | .attr("width", 250) 677 | .attr("height", 25); 678 | 679 | var sliderScale = d3.scaleLinear() 680 | .domain([0, 1]) 681 | .range([7.5, 242.5]) // trimmed by 7.5px on each side to match the input type=range slider: 682 | .nice(); 683 | 684 | // adapted from http://bl.ocks.org/mbostock/1166403 685 | var sliderAxis = d3.axisBottom(sliderScale) 686 | .tickSize(10) 687 | .ticks(6); 688 | 689 | // group to contain the elements of the slider axis: 690 | var sliderAxisGroup = scaleContainer.append("g") 691 | .attr("class", "slideraxis") 692 | .attr("margin-top", "-10px") 693 | .call(sliderAxis); 694 | } 695 | 696 | // function to 
re-order the bars (gray and red), and terms: 697 | function reorder_bars(increase) { 698 | // grab the bar-chart data for this topic only: 699 | var dat2 = lamData.filter(function(d) { 700 | return d.Category == "Topic" + vis_state.topic; 701 | }); 702 | // define relevance: 703 | for (var i = 0; i < dat2.length; i++) { 704 | dat2[i].relevance = vis_state.lambda * dat2[i].logprob + 705 | (1 - vis_state.lambda) * dat2[i].loglift; 706 | } 707 | 708 | // sort by relevance: 709 | dat2.sort(fancysort("relevance")); 710 | 711 | // truncate to the top R tokens: 712 | var dat3 = dat2.slice(0, R); 713 | 714 | var y = d3.scaleBand() 715 | .domain(dat3.map(function(d) { 716 | return d.Term; 717 | })) 718 | .rangeRound([0, barheight]) 719 | .padding(0.15); 720 | 721 | var x = d3.scaleLinear() 722 | .domain([1, d3.max(dat3, function(d) { 723 | return d.Total; 724 | })]) 725 | .range([0, barwidth]) 726 | .nice(); 727 | 728 | // Change Total Frequency bars 729 | var graybars = d3.select("#" + barFreqsID) 730 | .selectAll(to_select + " .bar-totals") 731 | .data(dat3, function(d) { 732 | return d.Term; 733 | }); 734 | 735 | // Change word labels 736 | var labels = d3.select("#" + barFreqsID) 737 | .selectAll(to_select + " .terms") 738 | .data(dat3, function(d) { 739 | return d.Term; 740 | }); 741 | 742 | // Create red bars (drawn over the gray ones) to signify the frequency under the selected topic 743 | var redbars = d3.select("#" + barFreqsID) 744 | .selectAll(to_select + " .overlay") 745 | .data(dat3, function(d) { 746 | return d.Term; 747 | }); 748 | 749 | // adapted from http://bl.ocks.org/mbostock/1166403 750 | var xAxis = d3.axisTop(x) 751 | .tickSize(-barheight) 752 | .ticks(6); 753 | 754 | // New axis definition: 755 | var newaxis = d3.selectAll(to_select + " .xaxis"); 756 | 757 | // define the new elements to enter: 758 | var graybarsEnter = graybars.enter().append("rect") 759 | .attr("class", "bar-totals") 760 | .attr("x", 0) 761 | .attr("y", function(d) { 762 | return y(d.Term) + barheight + margin.bottom + 2 * rMax; 763 | }) 764 | .attr("height", y.bandwidth()) 765 | .style("fill", color1) 766 | .attr("opacity", 0.4); 767 | 768 | var labelsEnter = labels.enter() 769 | .append("text") 770 | .attr("x", -5) 771 | .attr("class", "terms") 772 | .attr("y", function(d) { 773 | return y(d.Term) + 12 + barheight + margin.bottom + 2 * rMax; 774 | }) 775 | .attr("cursor", "pointer") 776 | .style("text-anchor", "end") 777 | .attr("id", function(d) { 778 | return (termID + d.Term); 779 | }) 780 | .text(function(d) { 781 | return d.Term; 782 | }) 783 | .on("mouseover", function() { 784 | term_hover(this); 785 | }) 786 | .on("mouseout", function() { 787 | vis_state.term = ""; 788 | term_off(this); 789 | state_save(true); 790 | }); 791 | 792 | var redbarsEnter = redbars.enter().append("rect") 793 | .attr("class", "overlay") 794 | .attr("x", 0) 795 | .attr("y", function(d) { 796 | return y(d.Term) + barheight + margin.bottom + 2 * rMax; 797 | }) 798 | .attr("height", y.bandwidth()) 799 | .style("fill", color2) 800 | .attr("opacity", 0.8); 801 | 802 | 803 | if (increase) { 804 | graybarsEnter 805 | .attr("width", function(d) { 806 | return x(d.Total); 807 | }) 808 | .transition().duration(duration) 809 | .delay(duration) 810 | .attr("y", function(d) { 811 | return y(d.Term); 812 | }); 813 | labelsEnter 814 | .transition().duration(duration) 815 | .delay(duration) 816 | .attr("y", function(d) { 817 | return y(d.Term) + 12; 818 | }); 819 | redbarsEnter 820 | .attr("width", function(d) { 821 | return x(d.Freq); 
822 | }) 823 | .transition().duration(duration) 824 | .delay(duration) 825 | .attr("y", function(d) { 826 | return y(d.Term); 827 | }); 828 | 829 | graybars.transition().duration(duration) 830 | .attr("width", function(d) { 831 | return x(d.Total); 832 | }) 833 | .transition().duration(duration) 834 | .attr("y", function(d) { 835 | return y(d.Term); 836 | }); 837 | labels.transition().duration(duration) 838 | .delay(duration) 839 | .attr("y", function(d) { 840 | return y(d.Term) + 12; 841 | }); 842 | redbars.transition().duration(duration) 843 | .attr("width", function(d) { 844 | return x(d.Freq); 845 | }) 846 | .transition().duration(duration) 847 | .attr("y", function(d) { 848 | return y(d.Term); 849 | }); 850 | 851 | // Transition exiting rectangles to the bottom of the barchart: 852 | graybars.exit() 853 | .transition().duration(duration) 854 | .attr("width", function(d) { 855 | return x(d.Total); 856 | }) 857 | .transition().duration(duration) 858 | .attr("y", function(d, i) { 859 | return barheight + margin.bottom + 6 + i * 18; 860 | }) 861 | .remove(); 862 | labels.exit() 863 | .transition().duration(duration) 864 | .delay(duration) 865 | .attr("y", function(d, i) { 866 | return barheight + margin.bottom + 18 + i * 18; 867 | }) 868 | .remove(); 869 | redbars.exit() 870 | .transition().duration(duration) 871 | .attr("width", function(d) { 872 | return x(d.Freq); 873 | }) 874 | .transition().duration(duration) 875 | .attr("y", function(d, i) { 876 | return barheight + margin.bottom + 6 + i * 18; 877 | }) 878 | .remove(); 879 | // https://github.com/mbostock/d3/wiki/Transitions#wiki-d3_ease 880 | newaxis.transition().duration(duration) 881 | .call(xAxis) 882 | .transition().duration(duration); 883 | } else { 884 | graybarsEnter 885 | .attr("width", 100) // FIXME by looking up old width of these bars 886 | .transition().duration(duration) 887 | .attr("y", function(d) { 888 | return y(d.Term); 889 | }) 890 | .transition().duration(duration) 891 | .attr("width", function(d) { 892 | return x(d.Total); 893 | }); 894 | labelsEnter 895 | .transition().duration(duration) 896 | .attr("y", function(d) { 897 | return y(d.Term) + 12; 898 | }); 899 | redbarsEnter 900 | .attr("width", 50) // FIXME by looking up old width of these bars 901 | .transition().duration(duration) 902 | .attr("y", function(d) { 903 | return y(d.Term); 904 | }) 905 | .transition().duration(duration) 906 | .attr("width", function(d) { 907 | return x(d.Freq); 908 | }); 909 | 910 | graybars.transition().duration(duration) 911 | .attr("y", function(d) { 912 | return y(d.Term); 913 | }) 914 | .transition().duration(duration) 915 | .attr("width", function(d) { 916 | return x(d.Total); 917 | }); 918 | labels.transition().duration(duration) 919 | .attr("y", function(d) { 920 | return y(d.Term) + 12; 921 | }); 922 | redbars.transition().duration(duration) 923 | .attr("y", function(d) { 924 | return y(d.Term); 925 | }) 926 | .transition().duration(duration) 927 | .attr("width", function(d) { 928 | return x(d.Freq); 929 | }); 930 | 931 | // Transition exiting rectangles to the bottom of the barchart: 932 | graybars.exit() 933 | .transition().duration(duration) 934 | .attr("y", function(d, i) { 935 | return barheight + margin.bottom + 6 + i * 18 + 2 * rMax; 936 | }) 937 | .remove(); 938 | labels.exit() 939 | .transition().duration(duration) 940 | .attr("y", function(d, i) { 941 | return barheight + margin.bottom + 18 + i * 18 + 2 * rMax; 942 | }) 943 | .remove(); 944 | redbars.exit() 945 | .transition().duration(duration) 946 | 
.attr("y", function(d, i) { 947 | return barheight + margin.bottom + 6 + i * 18 + 2 * rMax; 948 | }) 949 | .remove(); 950 | 951 | // https://github.com/mbostock/d3/wiki/Transitions#wiki-d3_ease 952 | newaxis.transition().duration(duration) 953 | .transition().duration(duration) 954 | .call(xAxis); 955 | } 956 | } 957 | 958 | ////////////////////////////////////////////////////////////////////////////// 959 | 960 | // function to update bar chart when a topic is selected 961 | // the circle argument should be the appropriate circle element 962 | function topic_on(circle) { 963 | if (circle == null) return null; 964 | 965 | // grab data bound to this element 966 | var d = circle.__data__; 967 | var Freq = Math.round(d.Freq * 10) / 10, 968 | topics = d.topics; 969 | 970 | // change opacity and fill of the selected circle 971 | circle.style.opacity = highlight_opacity; 972 | circle.style.fill = color2; 973 | 974 | // Remove 'old' bar chart title 975 | var text = d3.select(to_select + " .bubble-tool"); 976 | text.remove(); 977 | 978 | // append text with info relevant to topic of interest 979 | d3.select("#" + barFreqsID) 980 | .append("text") 981 | .attr("x", barwidth/2) 982 | .attr("y", -30) 983 | .attr("class", "bubble-tool") // set class so we can remove it when highlight_off is called 984 | .style("text-anchor", "middle") 985 | .style("font-size", "16px") 986 | .text("Top-" + R + " Most Relevant Terms for Topic " + topics + " (" + Freq + "% of tokens)"); 987 | 988 | // grab the bar-chart data for this topic only: 989 | var dat2 = lamData.filter(function(d) { 990 | return d.Category == "Topic" + topics; 991 | }); 992 | 993 | // define relevance: 994 | for (var i = 0; i < dat2.length; i++) { 995 | dat2[i].relevance = lambda.current * dat2[i].logprob + 996 | (1 - lambda.current) * dat2[i].loglift; 997 | } 998 | 999 | // sort by relevance: 1000 | dat2.sort(fancysort("relevance")); 1001 | 1002 | // truncate to the top R tokens: 1003 | var dat3 = dat2.slice(0, R); 1004 | 1005 | // scale the bars to the top R terms: 1006 | var y = d3.scaleBand() 1007 | .domain(dat3.map(function(d) { 1008 | return d.Term; 1009 | })) 1010 | .rangeRound([0, barheight]) 1011 | .padding(0.1); 1012 | 1013 | var x = d3.scaleLinear() 1014 | .domain([1, d3.max(dat3, function(d) { 1015 | return d.Total; 1016 | })]) 1017 | .range([0, barwidth]) 1018 | .nice(); 1019 | 1020 | // remove the red bars if there are any: 1021 | d3.selectAll(to_select + " .overlay").remove(); 1022 | 1023 | // Change Total Frequency bars 1024 | d3.selectAll(to_select + " .bar-totals") 1025 | .data(dat3) 1026 | .attr("x", 0) 1027 | .attr("y", function(d) { 1028 | return y(d.Term); 1029 | }) 1030 | .attr("height", y.bandwidth()) 1031 | .attr("width", function(d) { 1032 | return x(d.Total); 1033 | }) 1034 | .style("fill", color1) 1035 | .attr("opacity", 0.4); 1036 | 1037 | // Change word labels 1038 | d3.selectAll(to_select + " .terms") 1039 | .data(dat3) 1040 | .attr("x", -5) 1041 | .attr("y", function(d) { 1042 | return y(d.Term) + 12; 1043 | }) 1044 | .attr("id", function(d) { 1045 | return (termID + d.Term); 1046 | }) 1047 | .style("text-anchor", "end") // right align text - use 'middle' for center alignment 1048 | .text(function(d) { 1049 | return d.Term; 1050 | }); 1051 | 1052 | // Create red bars (drawn over the gray ones) to signify the frequency under the selected topic 1053 | d3.select("#" + barFreqsID).selectAll(to_select + " .overlay") 1054 | .data(dat3) 1055 | .enter() 1056 | .append("rect") 1057 | .attr("class", "overlay") 1058 | 
.attr("x", 0) 1059 | .attr("y", function(d) { 1060 | return y(d.Term); 1061 | }) 1062 | .attr("height", y.bandwidth()) 1063 | .attr("width", function(d) { 1064 | return x(d.Freq); 1065 | }) 1066 | .style("fill", color2) 1067 | .attr("opacity", 0.8); 1068 | 1069 | // adapted from http://bl.ocks.org/mbostock/1166403 1070 | var xAxis = d3.axisTop(x) 1071 | .tickSize(-barheight) 1072 | .ticks(6); 1073 | 1074 | // redraw x-axis 1075 | d3.selectAll(to_select + " .xaxis") 1076 | .call(xAxis); 1077 | } 1078 | 1079 | 1080 | function topic_off(circle) { 1081 | if (circle == null) return circle; 1082 | // go back to original opacity/fill 1083 | circle.style.opacity = base_opacity; 1084 | circle.style.fill = color1; 1085 | 1086 | var title = d3.selectAll(to_select + " .bubble-tool") 1087 | .text("Top-" + R + " Most Salient Terms"); 1088 | title.append("tspan") 1089 | .attr("baseline-shift", "super") 1090 | .attr("font-size", 12) 1091 | .text(1); 1092 | 1093 | // remove the red bars 1094 | d3.selectAll(to_select + " .overlay").remove(); 1095 | 1096 | // go back to 'default' bar chart 1097 | var dat2 = lamData.filter(function(d) { 1098 | return d.Category == "Default"; 1099 | }); 1100 | 1101 | var y = d3.scaleBand() 1102 | .domain(dat2.map(function(d) { 1103 | return d.Term; 1104 | })) 1105 | .rangeRound([0, barheight]) 1106 | .padding(0.15); 1107 | 1108 | var x = d3.scaleLinear() 1109 | .domain([1, d3.max(dat2, function(d) { 1110 | return d.Total; 1111 | })]) 1112 | .range([0, barwidth]) 1113 | .nice(); 1114 | 1115 | // Change Total Frequency bars 1116 | d3.selectAll(to_select + " .bar-totals") 1117 | .data(dat2) 1118 | .attr("x", 0) 1119 | .attr("y", function(d) { 1120 | return y(d.Term); 1121 | }) 1122 | .attr("height", y.bandwidth()) 1123 | .attr("width", function(d) { 1124 | return x(d.Total); 1125 | }) 1126 | .style("fill", color1) 1127 | .attr("opacity", 0.4); 1128 | 1129 | //Change word labels 1130 | d3.selectAll(to_select + " .terms") 1131 | .data(dat2) 1132 | .attr("x", -5) 1133 | .attr("y", function(d) { 1134 | return y(d.Term) + 12; 1135 | }) 1136 | .style("text-anchor", "end") // right align text - use 'middle' for center alignment 1137 | .text(function(d) { 1138 | return d.Term; 1139 | }); 1140 | 1141 | // adapted from http://bl.ocks.org/mbostock/1166403 1142 | var xAxis = d3.axisTop(x) 1143 | .tickSize(-barheight) 1144 | .ticks(6); 1145 | 1146 | // redraw x-axis 1147 | d3.selectAll(to_select + " .xaxis") 1148 | .attr("class", "xaxis") 1149 | .call(xAxis); 1150 | } 1151 | 1152 | // event definition for mousing over a term 1153 | function term_hover(term) { 1154 | var old_term = termID + vis_state.term; 1155 | if (vis_state.term != "" && old_term != term.id) { 1156 | term_off(document.getElementById(old_term)); 1157 | } 1158 | vis_state.term = term.innerHTML; 1159 | term_on(term); 1160 | state_save(true); 1161 | } 1162 | // updates vis when a term is selected via click or hover 1163 | function term_on(term) { 1164 | if (term == null) return null; 1165 | term.style["fontWeight"] = "bold"; 1166 | var d = term.__data__; 1167 | var Term = d.Term; 1168 | var dat2 = mdsData3.filter(function(d2) { 1169 | return d2.Term == Term; 1170 | }); 1171 | 1172 | var k = dat2.length; // number of topics for this token with non-zero frequency 1173 | 1174 | var radius = []; 1175 | for (var i = 0; i < K; ++i) { 1176 | radius[i] = 0; 1177 | } 1178 | for (i = 0; i < k; i++) { 1179 | radius[dat2[i].Topic - 1] = dat2[i].Freq; 1180 | } 1181 | 1182 | var size = []; 1183 | for (var i = 0; i < K; ++i) { 1184 | size[i] 
= 0; 1185 | } 1186 | for (i = 0; i < k; i++) { 1187 | // If we want to also re-size the topic number labels, do it here 1188 | // 11 is the default, so leaving this as 11 won't change anything. 1189 | size[dat2[i].Topic - 1] = 11; 1190 | } 1191 | 1192 | var rScaleCond = d3.scaleSqrt() 1193 | .domain([0, 1]).range([0, rMax]); 1194 | 1195 | // Change size of bubbles according to the word's distribution over topics 1196 | d3.selectAll(to_select + " .dot") 1197 | .data(radius) 1198 | .transition() 1199 | .attr("r", function(d) { 1200 | return (Math.sqrt(d*mdswidth*mdsheight*word_prop/Math.PI)); 1201 | }); 1202 | 1203 | // re-bind mdsData so we can handle multiple selection 1204 | d3.selectAll(to_select + " .dot") 1205 | .data(mdsData); 1206 | 1207 | // Change sizes of topic numbers: 1208 | d3.selectAll(to_select + " .txt") 1209 | .data(size) 1210 | .transition() 1211 | .style("font-size", function(d) { 1212 | return +d; 1213 | }); 1214 | 1215 | // Alter the guide 1216 | d3.select(to_select + " .circleGuideTitle") 1217 | .text("Conditional topic distribution given term = '" + term.innerHTML + "'"); 1218 | } 1219 | 1220 | function term_off(term) { 1221 | if (term == null) return null; 1222 | term.style["fontWeight"] = "normal"; 1223 | 1224 | d3.selectAll(to_select + " .dot") 1225 | .data(mdsData) 1226 | .transition() 1227 | .attr("r", function(d) { 1228 | return (Math.sqrt((d.Freq/100)*mdswidth*mdsheight*circle_prop/Math.PI)); 1229 | }); 1230 | 1231 | // Change sizes of topic numbers: 1232 | d3.selectAll(to_select + " .txt") 1233 | .transition() 1234 | .style("font-size", "11px"); 1235 | 1236 | // Go back to the default guide 1237 | d3.select(to_select + " .circleGuideTitle") 1238 | .text("Marginal topic distribution"); 1239 | d3.select(to_select + " .circleGuideLabelLarge") 1240 | .text(defaultLabelLarge); 1241 | d3.select(to_select + " .circleGuideLabelSmall") 1242 | .attr("y", mdsheight + 2 * newSmall) 1243 | .text(defaultLabelSmall); 1244 | d3.select(to_select + " .circleGuideSmall") 1245 | .attr("r", newSmall) 1246 | .attr("cy", mdsheight + newSmall); 1247 | d3.select(to_select + " .lineGuideSmall") 1248 | .attr("y1", mdsheight + 2 * newSmall) 1249 | .attr("y2", mdsheight + 2 * newSmall); 1250 | } 1251 | 1252 | 1253 | // serialize the visualization state using fragment identifiers -- http://en.wikipedia.org/wiki/Fragment_identifier 1254 | // location.hash holds the address information 1255 | 1256 | var params = location.hash.split("&"); 1257 | if (params.length > 1) { 1258 | vis_state.topic = params[0].split("=")[1]; 1259 | vis_state.lambda = params[1].split("=")[1]; 1260 | vis_state.term = params[2].split("=")[1]; 1261 | 1262 | // Idea: write a function to parse the URL string 1263 | // only accept values in [0,1] for lambda, {0, 1, ..., K} for topics (any string is OK for term) 1264 | // Allow for subsets of the three to be entered: 1265 | // (1) topic only (lambda = 1 term = "") 1266 | // (2) lambda only (topic = 0 term = "") visually the same but upon hovering a topic, the effect of lambda will be seen 1267 | // (3) term only (topic = 0 lambda = 1) only fires when the term is among the R most salient 1268 | // (4) topic + lambda (term = "") 1269 | // (5) topic + term (lambda = 1) 1270 | // (6) lambda + term (topic = 0) visually lambda doesn't make a difference unless a topic is hovered 1271 | // (7) topic + lambda + term 1272 | 1273 | // Short-term: assume format of "#topic=k&lambda=l&term=s" where k, l, and s are strings (b/c they're from a URL) 1274 | 1275 | // Force k (topic 
identifier) to be an integer between 0 and K: 1276 | vis_state.topic = Math.round(Math.min(K, Math.max(0, vis_state.topic))); 1277 | 1278 | // Force l (lambda identifier) to be in [0, 1]: 1279 | vis_state.lambda = Math.min(1, Math.max(0, vis_state.lambda)); 1280 | 1281 | // impose the value of lambda: 1282 | document.getElementById(lambdaID).value = vis_state.lambda; 1283 | document.getElementById(lambdaID + "-value").innerHTML = vis_state.lambda; 1284 | 1285 | // select the topic and transition the order of the bars (if appropriate) 1286 | if (!isNaN(vis_state.topic)) { 1287 | document.getElementById(topicID).value = vis_state.topic; 1288 | if (vis_state.topic > 0) { 1289 | topic_on(document.getElementById(topicID + vis_state.topic)); 1290 | } 1291 | if (vis_state.lambda < 1 && vis_state.topic > 0) { 1292 | reorder_bars(false); 1293 | } 1294 | } 1295 | lambda.current = vis_state.lambda; 1296 | var termElem = document.getElementById(termID + vis_state.term); 1297 | if (termElem !== undefined) term_on(termElem); 1298 | } 1299 | 1300 | function state_url() { 1301 | return location.origin + location.pathname + "#topic=" + vis_state.topic + 1302 | "&lambda=" + vis_state.lambda + "&term=" + vis_state.term; 1303 | } 1304 | 1305 | function state_save(replace) { 1306 | if (replace) 1307 | history.replaceState(vis_state, "Query", state_url()); 1308 | else 1309 | history.pushState(vis_state, "Query", state_url()); 1310 | } 1311 | 1312 | function state_reset() { 1313 | if (vis_state.topic > 0) { 1314 | topic_off(document.getElementById(topicID + vis_state.topic)); 1315 | } 1316 | if (vis_state.term != "") { 1317 | term_off(document.getElementById(termID + vis_state.term)); 1318 | } 1319 | vis_state.term = ""; 1320 | document.getElementById(topicID).value = vis_state.topic = 0; 1321 | state_save(true); 1322 | } 1323 | 1324 | } 1325 | 1326 | if (typeof data_or_file_name === 'string') 1327 | d3.json(data_or_file_name, function(error, data) {visualize(data);}); 1328 | else 1329 | visualize(data_or_file_name); 1330 | }; 1331 | -------------------------------------------------------------------------------- /pyLDAvis/js/ldavis.v1.0.0.css: -------------------------------------------------------------------------------- 1 | /* Taken from https://github.com/cpsievert/LDAvis */ 2 | /* Copyright 2013, AT&T Intellectual Property */ 3 | /* MIT Licence */ 4 | 5 | .slideraxis path { 6 | fill: none; 7 | stroke: none; 8 | } 9 | 10 | .xaxis .tick.major { 11 | fill: black; 12 | stroke: black; 13 | stroke-width: 0.1; 14 | opacity: 0.7; 15 | } 16 | 17 | .slideraxis { 18 | fill: black; 19 | stroke: black; 20 | stroke-width: 0.4; 21 | opacity: 1; 22 | } 23 | 24 | text { 25 | font-family: sans-serif; 26 | font-size: 11px; 27 | } 28 | 29 | /* IPython Notebook CSS to allow visualization to fit */ 30 | /* I'm open to a better way of accomplishing this goal... */ 31 | .container { width:1350px !important; } 32 | /* This is for nbviewer's benefit since the above wasn't enough... 
*/ 33 | .output_area { width:1450px !important; } 34 | -------------------------------------------------------------------------------- /pyLDAvis/lda_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | pyLDAvis lda_model 3 | ================== 4 | Helper functions to visualize sklearn's LatentDirichletAllocation models 5 | """ 6 | 7 | import funcy as fp 8 | import pyLDAvis 9 | import numpy as np 10 | 11 | 12 | def _get_doc_lengths(dtm): 13 | if isinstance(dtm, np.ndarray): 14 | return dtm.sum(axis=1).ravel() 15 | if isinstance(dtm, np.matrix): 16 | return dtm.sum(axis=1).getA1() 17 | raise TypeError(str(type(dtm))) 18 | 19 | 20 | def _get_term_freqs(dtm): 21 | if isinstance(dtm, np.ndarray): 22 | return dtm.sum(axis=0).ravel() 23 | if isinstance(dtm, np.matrix): 24 | return dtm.sum(axis=0).getA1() 25 | raise TypeError(str(type(dtm))) 26 | 27 | 28 | def _get_vocab(vectorizer): 29 | return vectorizer.get_feature_names_out() 30 | 31 | 32 | def _row_norm(dists): 33 | # row normalization function required 34 | # for doc_topic_dists and topic_term_dists 35 | return dists / dists.sum(axis=1)[:, None] 36 | 37 | 38 | def _get_doc_topic_dists(lda_model, dtm): 39 | return _row_norm(lda_model.transform(dtm)) 40 | 41 | 42 | def _get_topic_term_dists(lda_model): 43 | return _row_norm(lda_model.components_) 44 | 45 | 46 | def _extract_data(lda_model, dtm, vectorizer): 47 | vocab = _get_vocab(vectorizer) 48 | doc_lengths = _get_doc_lengths(dtm) 49 | term_freqs = _get_term_freqs(dtm) 50 | topic_term_dists = _get_topic_term_dists(lda_model) 51 | err_msg = ('Topic-term distributions and document-term matrix ' 52 | 'have different number of columns, {} != {}.') 53 | 54 | assert term_freqs.shape[0] == len(vocab), \ 55 | ('Term frequencies and vocabulary are of different sizes, {} != {}.' 56 | .format(term_freqs.shape[0], len(vocab))) 57 | 58 | assert topic_term_dists.shape[1] == dtm.shape[1], \ 59 | (err_msg.format(topic_term_dists.shape[1], dtm.shape[1])) 60 | 61 | # column dimensions of document-term matrix and topic-term distributions 62 | # must match first before transforming to document-topic distributions 63 | doc_topic_dists = _get_doc_topic_dists(lda_model, dtm) 64 | return {'vocab': vocab, 65 | 'doc_lengths': doc_lengths.tolist(), 66 | 'term_frequency': term_freqs.tolist(), 67 | 'doc_topic_dists': doc_topic_dists.tolist(), 68 | 'topic_term_dists': topic_term_dists.tolist()} 69 | 70 | 71 | def prepare(lda_model, dtm, vectorizer, **kwargs): 72 | """Create Prepared Data from sklearn's LatentDirichletAllocation and CountVectorizer. 73 | 74 | Parameters 75 | ---------- 76 | lda_model : sklearn.decomposition.LatentDirichletAllocation. 77 | Latent Dirichlet Allocation model from sklearn fitted with `dtm` 78 | 79 | dtm : array-like or sparse matrix, shape=(n_samples, n_features) 80 | Document-term matrix used to fit the LatentDirichletAllocation model (`lda_model`) 81 | 82 | vectorizer : sklearn.feature_extraction.text.(CountVectorizer, TfidfVectorizer).
83 |         vectorizer used to convert raw documents to document-term matrix (`dtm`)
84 |
85 |     **kwargs: Keyword arguments to be passed to pyLDAvis.prepare()
86 |
87 |
88 |     Returns
89 |     -------
90 |     prepared_data : PreparedData
91 |         the data structures used in the visualization
92 |
93 |
94 |     Example
95 |     --------
96 |     For example usage please see this notebook:
97 |     http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/LDA%20model.ipynb
98 |
99 |     See
100 |     ------
101 |     See `pyLDAvis.prepare` for **kwargs.
102 |     """
103 |     opts = fp.merge(_extract_data(lda_model, dtm, vectorizer), kwargs)
104 |     return pyLDAvis.prepare(**opts)
105 |
--------------------------------------------------------------------------------
/pyLDAvis/urls.py:
--------------------------------------------------------------------------------
1 | """
2 | LDAvis URLs
3 | ==========
4 | URLs and filepaths for the LDAvis javascript libraries
5 | """
6 |
7 | import os
8 | from . import __path__, __version__
9 |
10 | __all__ = ["D3_URL", "LDAVIS_URL", "LDAVIS_CSS_URL",
11 |            "D3_LOCAL", "LDAVIS_LOCAL", "LDAVIS_CSS_LOCAL"]
12 |
13 | D3_URL = "https://d3js.org/d3.v5.js"
14 |
15 | DEV = 'git' in __version__
16 | LOCAL_JS_DIR = os.path.join(__path__[0], "js")
17 | D3_LOCAL = os.path.join(LOCAL_JS_DIR, "d3.v5.min.js")
18 |
19 | # Avoid browser caching with @version in the URL.
20 | WWW_JS_DIR = "https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@{0}/pyLDAvis/js/".format(__version__)
21 |
22 | JS_VERSION = '1.0.0'
23 | if not DEV and int(__version__[0]) >= 3:
24 |     JS_VERSION = '3.0.0'
25 | CSS_VERSION = '1.0.0'
26 |
27 | LDAVIS_URL = WWW_JS_DIR + "ldavis.v{0}.js".format(JS_VERSION)
28 | LDAVIS_CSS_URL = WWW_JS_DIR + "ldavis.v{0}.css".format(CSS_VERSION)
29 |
30 | LDAVIS_LOCAL = os.path.join(LOCAL_JS_DIR, "ldavis.v{0}.js".format(JS_VERSION))
31 | LDAVIS_CSS_LOCAL = os.path.join(LOCAL_JS_DIR, "ldavis.v{0}.css".format(CSS_VERSION))
32 |
33 | if DEV:
34 |     LDAVIS_URL = WWW_JS_DIR + "ldavis.js"
35 |     LDAVIS_CSS_URL = WWW_JS_DIR + "ldavis.css"
36 |
37 |     LDAVIS_LOCAL = os.path.join(LOCAL_JS_DIR, "ldavis.js")
38 |     LDAVIS_CSS_LOCAL = os.path.join(LOCAL_JS_DIR, "ldavis.css")
39 |
--------------------------------------------------------------------------------
/pyLDAvis/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | pyLDAvis Utilities
3 | ===============
4 | Utility routines for the pyLDAvis package
5 | """
6 |
7 | import json
8 | import os
9 | import re
10 | import shutil
11 | import warnings
12 | import numpy as np
13 | import pyLDAvis.urls
14 |
15 | # Make sure that DeprecationWarning gets printed
16 | warnings.simplefilter("always", DeprecationWarning)
17 |
18 |
19 | def html_id_ok(objid, html5=False):
20 |     """Check whether objid is valid as an HTML id attribute.
21 |
22 |     If html5 == True, then use the more liberal html5 rules.
23 |     """
24 |     if html5:
25 |         return not re.search(r'\s', objid)
26 |     else:
27 |         return bool(re.match(r"^[a-zA-Z][a-zA-Z0-9\-\.\:\_]*$", objid))
28 |
29 |
30 | def get_id(obj, suffix="", prefix="el", warn_on_invalid=True):
31 |     """Get a unique id for the object"""
32 |     if not suffix:
33 |         suffix = ""
34 |     if not prefix:
35 |         prefix = ""
36 |
37 |     objid = prefix + str(os.getpid()) + str(id(obj)) + suffix
38 |
39 |     if warn_on_invalid and not html_id_ok(objid):
40 |         warnings.warn('"{0}" is not a valid html ID. This may cause problems'.format(objid))
41 |
42 |     return objid
43 |
44 |
45 | def write_ipynb_local_js(location=None, d3_src=None, ldavis_src=None, ldavis_css=None):
46 |     """
47 |     Write the pyLDAvis and d3 javascript libraries to the given file location.
48 |
49 |     This utility is used by the IPython notebook tools to enable easy use
50 |     of pyLDAvis with no web connection.
51 |
52 |     Parameters
53 |     ----------
54 |     location : string (optional)
55 |         the directory in which the d3 and pyLDAvis javascript libraries will be
56 |         written. If not specified, the IPython nbextensions directory will be
57 |         used. If IPython doesn't support nbextensions (< 2.0),
58 |         the current working directory will be used.
59 |     d3_src : string (optional)
60 |         the source location of the d3 library. If not specified, the standard
61 |         path in pyLDAvis.urls.D3_LOCAL will be used.
62 |     ldavis_src : string (optional)
63 |         the source location of the pyLDAvis library. If not specified, the
64 |         standard path in pyLDAvis.urls.LDAVIS_LOCAL will be used.
65 |
66 |     Returns
67 |     -------
68 |     d3_url, ldavis_url, ldavis_css_url : string
69 |         The URLs to be used for loading these js and css files.
70 |     """
71 |     nbextension = False
72 |     if location is None:
73 |         try:
74 |             # Later IPython versions
75 |             from notebook.nbextensions import install_nbextension
76 |             nbextension = True
77 |         except ImportError:
78 |             try:
79 |                 # Older IPython versions
80 |                 from IPython.html import install_nbextension
81 |                 nbextension = True
82 |             except ImportError:
83 |                 location = os.getcwd()
84 |
85 |     if d3_src is None:
86 |         d3_src = pyLDAvis.urls.D3_LOCAL
87 |     if ldavis_src is None:
88 |         ldavis_src = pyLDAvis.urls.LDAVIS_LOCAL
89 |     if ldavis_css is None:
90 |         ldavis_css = pyLDAvis.urls.LDAVIS_CSS_LOCAL
91 |
92 |     d3js = os.path.basename(d3_src)
93 |     ldavisjs = os.path.basename(ldavis_src)
94 |     ldaviscss = os.path.basename(ldavis_css)
95 |
96 |     if not os.path.exists(d3_src):
97 |         raise ValueError("d3 src not found at '{0}'".format(d3_src))
98 |     if not os.path.exists(ldavis_src):
99 |         raise ValueError("pyLDAvis src not found at '{0}'".format(ldavis_src))
100 |     if not os.path.exists(ldavis_css):
101 |         raise ValueError("pyLDAvis css not found at '{0}'".format(ldavis_css))
102 |
103 |     if nbextension:
104 |         # IPython 2.0+.
105 |         # This will not work if a url prefix is added
106 |         prefix = '/nbextensions/'
107 |
108 |         try:
109 |             [install_nbextension(ext) for ext in [d3_src, ldavis_src, ldavis_css]]
110 |         except IOError:
111 |             # files may be read only. We'll try deleting them and re-installing
112 |             from IPython.utils.path import get_ipython_dir
113 |             nbext = os.path.join(get_ipython_dir(), "nbextensions")
114 |
115 |             for src in [d3_src, ldavis_src]:
116 |                 dest = os.path.join(nbext, os.path.basename(src))
117 |                 if os.path.exists(dest):
118 |                     os.remove(dest)
119 |             [install_nbextension(ext) for ext in [d3_src, ldavis_src, ldavis_css]]
120 |
121 |     else:
122 |         # IPython < 2.0 or explicit path.
123 |         # This won't work if users have changed the kernel directory.
124 |         prefix = '/files/'
125 |
126 |         d3_dest = os.path.join(location, d3js)
127 |         ldavis_dest = os.path.join(location, ldavisjs)
128 |         ldavis_css_dest = os.path.join(location, ldaviscss)
129 |
130 |         for src, dest in [(d3_src, d3_dest),
131 |                           (ldavis_src, ldavis_dest),
132 |                           (ldavis_css, ldavis_css_dest)]:
133 |             try:
134 |                 shutil.copyfile(src, dest)
135 |             except IOError:
136 |                 # file may be read only. We'll try deleting it first
137 |                 if os.path.exists(dest):
138 |                     os.remove(dest)
139 |                 shutil.copyfile(src, dest)
140 |
141 |     return prefix + d3js, prefix + ldavisjs, prefix + ldaviscss
142 |
143 |
144 | class NumPyEncoder(json.JSONEncoder):
145 |     def default(self, obj):
146 |         if isinstance(obj, np.int64) or isinstance(obj, np.int32):
147 |             return int(obj)
148 |         if isinstance(obj, np.float64) or isinstance(obj, np.float32):
149 |             return float(obj)
150 |         return json.JSONEncoder.default(self, obj)
151 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | pandas>=2.0.0
4 | joblib>=1.2.0
5 | jinja2
6 | numexpr
7 | funcy
8 | scikit-learn>=1.0.0
9 | gensim
10 | setuptools
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import setup
4 |
5 | with open('README.rst') as readme_file:
6 |     readme = readme_file.read()
7 |
8 | with open('HISTORY.rst') as history_file:
9 |     history = history_file.read().replace('.. :changelog:', '')
10 |
11 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
12 | if on_rtd:
13 |     print('Being built on ReadTheDocs so we are avoiding pulling in scikit-bio since it imports numpy...')
14 |     requirements = []
15 | else:
16 |     with open('requirements.txt') as f:
17 |         requirements = f.read().splitlines()
18 |
19 | setup(
20 |     name='pyLDAvis',
21 |     version='3.4.1',
22 |     description='Interactive topic model visualization. Port of the R package.',
23 |     long_description_content_type="text/x-rst",
24 |     long_description=readme,
25 |     author='Ben Mabey',
26 |     author_email='ben@benmabey.com',
27 |     url='https://github.com/bmabey/pyLDAvis',
28 |     download_url='https://github.com/bmabey/pyLDAvis/tarball/3.4.1',
29 |     packages=['pyLDAvis'],
30 |     package_dir={'pyLDAvis': 'pyLDAvis'},
31 |     tests_require=['pytest'],
32 |     python_requires=">=3.9",
33 |     include_package_data=True,
34 |     install_requires=requirements,
35 |     license='BSD-3-Clause',
36 |     zip_safe=False,
37 |     keywords=['data science', 'visualization'],
38 |     classifiers=[
39 |         'Development Status :: 5 - Production/Stable',
40 |         'Intended Audience :: Developers',
41 |         'Intended Audience :: Science/Research',
42 |         'License :: OSI Approved :: BSD License',
43 |         'Natural Language :: English',
44 |         'Programming Language :: Python :: 3',
45 |         'Programming Language :: Python :: 3.9',
46 |         'Programming Language :: Python :: 3.10',
47 |         'Programming Language :: Python :: 3.11',
48 |     ]
49 | )
--------------------------------------------------------------------------------
/tests/data/.gitattributes:
--------------------------------------------------------------------------------
1 | movie_reviews_input.json filter=lfs diff=lfs merge=lfs -crlf
2 | movie_reviews_output.json filter=lfs diff=lfs merge=lfs -crlf
3 |
--------------------------------------------------------------------------------
/tests/data/export_data.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ensure.packages <- function(packages) {
4 |   packages.not.installed <- Filter(function(p) !(p %in% installed.packages()), packages)
5 |   if(length(packages.not.installed) > 0) {
6 |     install.packages(packages.not.installed, dep = T)
7 | }}
8 |
9 | ensure.packages(c('LDAvis', 'LDAvisData', 'jsonlite'))
10 |
11 | library(LDAvis)
12 | library(LDAvisData)
13 | # RJSONIO did not roundtrip cleanly so it was annoying to use
14 | library(jsonlite)
15 |
16 | export <- function(data, name, out.dir='.') {
17 |   input.name <- paste0(name, "_input.json")
18 |   if(!file.exists(input.name))
19 |   {
20 |     cat(paste0('Exporting ', name, '...\n'))
21 |     input <- jsonlite::toJSON(data, digits=50)
22 |     cat(input, file = file.path(out.dir, input.name))
23 |   }
24 |
25 |   output.name <- paste0(name, "_output.json")
26 |   if(!file.exists(output.name))
27 |   {
28 |     # roundtrip the JSON so both libraries are using the same precision
29 |     data <- jsonlite::fromJSON(input)
30 |     output <- createJSON(data$phi, data$theta, data$doc.length, data$vocab, data$term.frequency)
31 |     cat(output, file = file.path(out.dir, output.name))
32 |     cat(paste0(input.name, ' and ', output.name, ' have been written.\n'))
33 |   }
34 | }
35 |
36 |
37 | export(AP, 'ap')
38 | export(Jeopardy, 'jeopardy')
39 | export(MovieReviews, 'movie_reviews')
40 |
--------------------------------------------------------------------------------
/tests/data/movie_reviews_input.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0f8753ac5b6e89031fc56623e9f71a61ebaca0e3382956944ad05c0844580298
3 | size 7087084
4 |
--------------------------------------------------------------------------------
/tests/data/movie_reviews_output.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:777815470c09a2d852ee027047a9c0fff1d3683498e20c3c60b7dc10ef51cf8f
3 | size 159501
4 |
--------------------------------------------------------------------------------
/tests/pyLDAvis/test_gensim_models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 |
5 | from gensim.models import LdaModel, HdpModel
6 | from gensim.corpora.dictionary import Dictionary
7 |
8 | import pyLDAvis
9 | import pyLDAvis.gensim_models as gensim_models
10 |
11 |
12 | def get_corpus_dictionary():
13 |     """Crafts a toy corpus and the associated dictionary."""
14 |     corpus = [
15 |         ['carrot', 'salad', 'tomato'],
16 |         ['carrot', 'salad', 'dish'],
17 |         ['tomato', 'dish'],
18 |         ['tomato', 'salad'],
19 |
20 |         ['car', 'break', 'highway'],
21 |         ['highway', 'accident', 'car'],
22 |         ['moto', 'break'],
23 |         ['accident', 'moto', 'car']
24 |     ]
25 |     dictionary = Dictionary(corpus)
26 |
27 |     # Transforming corpus with dictionary.
28 |     corpus = [dictionary.doc2bow(doc) for doc in corpus]
29 |
30 |     # Building reverse index.
31 |     for (token, uid) in dictionary.token2id.items():
32 |         dictionary.id2token[uid] = token
33 |
34 |     return corpus, dictionary
35 |
36 |
37 | def test_lda():
38 |     """Trains an LDA model and tests the html outputs."""
39 |     corpus, dictionary = get_corpus_dictionary()
40 |     lda = LdaModel(corpus=corpus, num_topics=2)
41 |
42 |     data = gensim_models.prepare(lda, corpus, dictionary)
43 |     pyLDAvis.save_html(data, 'index_lda.html')
44 |     os.remove('index_lda.html')
45 |
46 |
47 | def test_hdp():
48 |     """Trains an HDP model and tests the html outputs."""
49 |     corpus, dictionary = get_corpus_dictionary()
50 |     hdp = HdpModel(corpus, dictionary.id2token)
51 |
52 |     data = gensim_models.prepare(hdp, corpus, dictionary)
53 |     pyLDAvis.save_html(data, 'index_hdp.html')
54 |     os.remove('index_hdp.html')
55 |
56 |
57 | def test_sorted_terms():
58 |     """This tests that we can get the terms of a given topic using lambda
59 |     to calculate the relevance ranking. A common workflow is that once we
60 |     visualize the topics we modify the lambda slider and we are interested
61 |     in a particular lambda value, then with this function we can get the
62 |     terms in that order.
63 |     """
64 |     corpus, dictionary = get_corpus_dictionary()
65 |     lda = LdaModel(corpus=corpus, num_topics=2)
66 |
67 |     data = gensim_models.prepare(lda, corpus, dictionary)
68 |     # https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
69 |     # lambda = 0 should rank the terms by loglift
70 |     # lambda = 1 should rank them by logprob.
71 |     sorted_terms = data.sorted_terms(topic=1, _lambda=1).to_dict()
72 |     assert (sorted_terms['logprob'] == sorted_terms['relevance'])
73 |     sorted_terms = data.sorted_terms(topic=1, _lambda=0).to_dict()
74 |     assert (sorted_terms['loglift'] == sorted_terms['relevance'])
75 |
76 |
77 | if __name__ == "__main__":
78 |     test_lda()
79 |     test_hdp()
80 |     test_sorted_terms()
81 |
--------------------------------------------------------------------------------
/tests/pyLDAvis/test_prepare.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import json
4 | import os.path as path
5 | import funcy as fp
6 | from numpy.testing import assert_array_equal
7 | import numpy as np
8 | import pandas as pd
9 | from pandas.testing import assert_frame_equal
10 |
11 | from pyLDAvis import prepare
12 |
13 | roundtrip = fp.compose(json.loads, lambda d: d.to_json(), prepare)
14 |
15 | DATA_DIR = path.join(path.dirname(path.realpath(__file__)), "../data/")
16 |
17 |
18 | def load_dataset(name):
19 |     with open(path.join(DATA_DIR, '%s_input.json' % name), 'r') as j:
20 |         data_input = json.load(j)
21 |
22 |     with open(path.join(DATA_DIR, '%s_output.json' % name), 'r') as j:
23 |         expected = json.load(j)
24 |
25 |     return data_input, expected
26 |
27 |
28 | def remove_col_suffixes(df):
29 |     df.columns = [w.split('_')[0] for w in df.columns]
30 |     return df
31 |
32 |
33 | def test_end_to_end_with_R_examples():
34 |     data_input, expected = load_dataset('movie_reviews')
35 |     output = roundtrip(topic_term_dists=data_input['phi'],
36 |                        doc_topic_dists=data_input['theta'],
37 |                        doc_lengths=data_input['doc.length'],
38 |                        vocab=data_input['vocab'],
39 |                        term_frequency=data_input['term.frequency'], R=30, lambda_step=0.01)
40 |
41 |     assert_array_equal(np.array(expected['topic.order']), np.array(output['topic.order']))
42 |
43 |     def both(f):
44 |         return f(expected), f(output)
45 |
46 |     assert set(expected['tinfo']['Category']) == set(output['tinfo']['Category'])
47 |     etinfo, otinfo = both(lambda d: pd.DataFrame(d['tinfo']))
48 |
49 |     eddf = etinfo.query('Category == "Default"')
50 |     eddf = eddf.reindex(sorted(eddf.columns), axis=1)
51 |
52 |     oddf = otinfo.query('Category == "Default"')
53 |     oddf = oddf.reindex(sorted(oddf.columns), axis=1)
54 |     assert_frame_equal(eddf, oddf)
55 |
56 |     joined = pd.merge(otinfo, etinfo, how='inner', on=['Term', 'Category'], suffixes=['_o', '_e'])
57 |     ejoined = remove_col_suffixes(joined[['Term', 'Category', 'Freq_e',
58 |                                           'Total_e', 'loglift_e', 'logprob_e']])
59 |     ojoined = remove_col_suffixes(joined[['Term', 'Category', 'Freq_o', 'Total_o',
60 |                                           'loglift_o', 'logprob_o']])
61 |
62 |     join_percent = float(len(joined)) / len(etinfo)
63 |     print('Topic Info join was %.0f%%' % (100 * join_percent))
64 |     assert_frame_equal(ejoined, ojoined, check_exact=False, rtol=0.1)
65 |     assert join_percent > 0.95
66 |
67 |     def abs_basis(df):
68 |         df.x = df.x.abs()
69 |         df.y = df.y.abs()
70 |         return df
71 |
72 |     emds, omds = both(lambda r: abs_basis(pd.DataFrame(r['mdsDat'])))
73 |     assert_frame_equal(emds.reindex(sorted(oddf.columns), axis=1),
74 |                        omds.reindex(sorted(oddf.columns), axis=1), check_exact=False, rtol=0.1)
75 |
76 |     def rounded_token_table(r):
77 |         tt = pd.DataFrame(r['token.table'])
78 |         tt.Freq = tt.Freq.round(5)
79 |         return tt
80 |     ett, ott = both(rounded_token_table)
81 |     joined = pd.DataFrame(pd.merge(ott, ett, on=['Freq', 'Term'],
82 |                                    suffixes=['_o', '_e'], how='inner')
83 |                           .groupby('Topic_o')['Topic_e'].value_counts())
84 |     joined.columns = ['count']
85 |     most_likely_map = joined.query('count > 100')
86 |     most_likely_map.index.names = ['Topic_o', 'Topic_e']
87 |     df = pd.DataFrame(most_likely_map).reset_index()
88 |     assert_array_equal(df['Topic_o'].values, df['Topic_e'].values)
89 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py39, py310, py311
3 |
4 | [testenv]
5 | install_command = pip3 install {opts} {packages}
6 | whitelist_externals = sh, pytest
7 | setenv = PYTHONPATH = {toxinidir}:{toxinidir}/pyLDAvis
8 | commands = pytest {posargs} # substitute with tox' positional arguments
9 | deps =
10 |     pytest
11 |     -r{toxinidir}/requirements.txt
--------------------------------------------------------------------------------
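
Usage sketch for pyLDAvis/lda_model.py. This is a minimal, assumption-laden example rather than code from the repository: the `docs` list, the two-topic model size, and the output filename `lda.html` are made up, and the document-term matrix is densified with `.toarray()` so it satisfies the ndarray/np.matrix checks in `_get_doc_lengths` and `_get_term_freqs`. Any extra keyword arguments to `prepare` are forwarded to `pyLDAvis.prepare`.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import pyLDAvis
import pyLDAvis.lda_model

# Hypothetical toy corpus; substitute any list of raw documents.
docs = [
    "carrot salad tomato",
    "carrot salad dish",
    "tomato dish",
    "car break highway",
    "highway accident car",
    "accident moto car",
]

vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(docs).toarray()  # dense ndarray, as expected by the helpers above
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(dtm)

vis = pyLDAvis.lda_model.prepare(lda, dtm, vectorizer)
pyLDAvis.save_html(vis, "lda.html")  # or pyLDAvis.display(vis) inside a notebook

In a notebook, calling `pyLDAvis.enable_notebook()` once and then `pyLDAvis.display(vis)` renders the same prepared data inline instead of writing an HTML file.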
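
`write_ipynb_local_js` from pyLDAvis/utils.py can also be called directly when the bundled d3/LDAvis assets should live next to a notebook. A small sketch under the assumption that an explicit `location="."` (the current directory) is acceptable:

import pyLDAvis.utils

# Copy the packaged d3, ldavis js and ldavis css files into the current directory
# and get back the URLs a notebook server would use to load them.
d3_url, ldavis_url, ldavis_css_url = pyLDAvis.utils.write_ipynb_local_js(location=".")
print(d3_url, ldavis_url, ldavis_css_url)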
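
`NumPyEncoder` from pyLDAvis/utils.py is likewise usable on its own whenever NumPy scalar types end up in data destined for `json.dumps`; a short sketch with a hypothetical payload:

import json

import numpy as np

from pyLDAvis.utils import NumPyEncoder

payload = {"topic": np.int64(3), "weight": np.float32(0.25)}

# The stdlib encoder raises TypeError on NumPy scalars; NumPyEncoder converts them first.
print(json.dumps(payload, cls=NumPyEncoder))  # {"topic": 3, "weight": 0.25}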