├── LICENSE ├── README.md ├── env.yml ├── environment.yml ├── requirements-conda.txt ├── requirements.txt └── topic_space ├── __init__.py ├── app ├── .gitignore ├── DejaVuSans.ttf ├── __init__.py ├── app.py ├── config.py ├── static │ ├── ldavis │ │ ├── d3.v3.js │ │ ├── index.html │ │ ├── lda.css │ │ └── ldavis.js │ └── termite │ │ ├── angular.min.js │ │ ├── bokeh-0.8.2.min.css │ │ ├── bokeh-0.8.2.min.js │ │ ├── bootstrap.min.css │ │ ├── bootstrap.min.js │ │ └── jquery.min.js ├── templates │ ├── histogram.html │ ├── ldavis.html │ ├── termite.html │ └── wordcloud.html ├── viz.py └── wordcloud_generator.py ├── models.py └── run.py /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Topic Space 2 | 3 | A RESTful web application that provides a topic-modeling service. 
4 | 5 | ## License 6 | 7 | [Apache License, version 2](http://www.apache.org/licenses/LICENSE-2.0) 8 | -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | name: topic_space 2 | channels: 3 | - chdoig 4 | dependencies: 5 | - flask 6 | - cython 7 | - scipy 8 | - pillow 9 | - numpy 10 | - pandas 11 | - gensim 12 | - bokeh 13 | - matplotlib 14 | - topik 15 | - pip 16 | - pip: 17 | - git+https://github.com/amueller/word_cloud.git 18 | - pattern -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: topic_space 2 | dependencies: 3 | - bcolz=0.7.2.dev=np19py27_0 4 | - blaze=0.7.0=np19py27_0 5 | - bokeh=0.7.1=np19py27_0 6 | - cffi=0.8.6=py27_0 7 | - colorama=0.3.1=py27_0 8 | - cryptography=0.5.4=py27_0 9 | - cssselect=0.9.1=py27_0 10 | - cytoolz=0.7.1=py27_0 11 | - datashape=0.4.2=np19py27_0 12 | - dateutil=2.1=py27_2 13 | - decorator=3.4.0=py27_0 14 | - flask=0.10.1=py27_1 15 | - gevent=1.0.1=py27_0 16 | - gevent-websocket=0.9.3=py27_0 17 | - greenlet=0.4.5=py27_0 18 | - h5py=2.3.1=np19py27_1 19 | - hdf5=1.8.14=0 20 | - into=0.1.3=np19py27_0 21 | - ipython=2.3.1=py27_0 22 | - itsdangerous=0.24=py27_0 23 | - jinja2=2.7.3=py27_1 24 | - libsodium=0.4.5=0 25 | - libxml2=2.9.0=1 26 | - libxslt=1.1.28=2 27 | - lxml=3.4.1=py27_0 28 | - markupsafe=0.23=py27_0 29 | - matplotlib=1.4.2=np19py27_0 30 | - mkl-rt=11.1=p0 31 | - mkl-service=1.0.0=py27_p1 32 | - multipledispatch=0.4.7=py27_0 33 | - networkx=1.9.1=py27_0 34 | - nose=1.3.4=py27_0 35 | - numexpr=2.3.1=np19py27_0 36 | - numpy=1.9.1=py27_0 37 | - openssl=1.0.1k=0 38 | - pandas=0.15.2=np19py27_0 39 | - pip=6.0.6=py27_0 40 | - psutil=2.1.1=py27_0 41 | - pycparser=2.10=py27_0 42 | - pygments=2.0.1=py27_0 43 | - pyopenssl=0.14=py27_0 44 | - pytables=3.1.1=np19py27_2 45 | - 
python=2.7.9=1 46 | - python.app=1.2=py27_3 47 | - pytz=2014.9=py27_0 48 | - pyyaml=3.11=py27_0 49 | - pyzmq=14.5.0=py27_0 50 | - queuelib=1.2.2=py27_0 51 | - readline=6.2=2 52 | - requests=2.5.1=py27_0 53 | - scikit-learn=0.15.0b1=np18py27_0 54 | - scipy=0.15.1=np19py27_0 55 | - scrapy=0.24.4=py27_0 56 | - setuptools=12.0.3=py27_0 57 | - six=1.9.0=py27_0 58 | - sqlalchemy=0.9.8=py27_0 59 | - sqlite=3.8.4.1=0 60 | - ssl_match_hostname=3.4.0.2=py27_0 61 | - tk=8.5.15=0 62 | - toolz=0.7.1=py27_0 63 | - tornado=4.0.2=py27_0 64 | - twisted=14.0.0=py27_0 65 | - ujson=1.33=py27_0 66 | - w3lib=1.8.1=py27_1 67 | - werkzeug=0.9.6=py27_1 68 | - xz=5.0.5=0 69 | - yaml=0.1.4=1 70 | - zeromq=4.0.4=0 71 | - zlib=1.2.8=0 72 | - zope.interface=4.1.1=py27_0 73 | - pip: 74 | - backports.ssl-match-hostname==3.4.0.2 75 | - beautifulsoup4==4.3.2 76 | - cython==0.21.2 77 | - django==1.7.3 78 | - gensim==0.10.3 79 | - image==1.3.4 80 | - nltk==3.0.1 81 | - pattern==2.6 82 | - pillow==2.7.0 83 | - python-dateutil==1.5 84 | - tables==3.1.1 85 | - textblob==0.9.0 86 | - wordcloud==1.0.0 87 | - git+git://github.com/amueller/word_cloud.git 88 | -------------------------------------------------------------------------------- /requirements-conda.txt: -------------------------------------------------------------------------------- 1 | flask 2 | cython 3 | scipy 4 | pillow 5 | numpy 6 | pandas 7 | gensim 8 | bokeh 9 | matplotlib 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/amueller/word_cloud.git 2 | pattern 3 | -------------------------------------------------------------------------------- /topic_space/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/topic_space/1c863c0f1dc266bf22ce8179135e54e46cd63d54/topic_space/__init__.py 
-------------------------------------------------------------------------------- /topic_space/app/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # IDEA 57 | .idea/ -------------------------------------------------------------------------------- /topic_space/app/DejaVuSans.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/topic_space/1c863c0f1dc266bf22ce8179135e54e46cd63d54/topic_space/app/DejaVuSans.ttf -------------------------------------------------------------------------------- /topic_space/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa-jpl-memex/topic_space/1c863c0f1dc266bf22ce8179135e54e46cd63d54/topic_space/app/__init__.py -------------------------------------------------------------------------------- /topic_space/app/app.py: -------------------------------------------------------------------------------- 1 | from StringIO 
"""Flask views and per-request state for the topic_space word-cloud service."""
from io import BytesIO
import os
from collections import Counter, OrderedDict
from uuid import uuid1

from flask import Flask, send_file, request, render_template
from bokeh.embed import components
from bokeh.models import HoverTool
from bokeh.plotting import figure, ColumnDataSource
from bokeh.resources import INLINE
from bokeh.templates import RESOURCES
from wordcloud import WordCloud

from wordcloud_generator import FONT_PATH, load_docs
from viz import Termite

tmpl_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates')
app = Flask(__name__, template_folder=tmpl_dir)

# Loaded once at import time; every request filters this frame on its 'year' column.
DOCS_DF = load_docs()
TERMITE_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'termite.csv')

print("Loaded documents")

# Upper bound on remembered requests so the cache cannot grow without limit.
MAX_CACHED_REQUESTS = 1000


class RequestData(object):
    """Parameters of one word-cloud request plus the queries derived from them.

    The year range ``year1``..``year2`` is cut into ``num_intervals`` intervals
    of ``interval_len`` years each; every interval gets its own word cloud and
    frequency plot.
    """

    def __init__(self, year1, year2, stop_words=None, percent1=0, percent2=100, num_intervals=1,
                 side_by_side=False):
        """Store the request parameters and derive the interval layout.

        :param year1: first year of the range (anything ``int()`` accepts).
        :param year2: last year of the range (anything ``int()`` accepts).
        :param stop_words: words to drop from the frequency counts.
        :param percent1: lower percentile bound of the frequency ranking to keep.
        :param percent2: upper percentile bound of the frequency ranking to keep.
        :param num_intervals: requested number of intervals; clamped to the
            range ``1..(year2 - year1)``.
        :param side_by_side: when true, images are rendered 400px wide instead of 800px.
        :raises ValueError: if a year or ``num_intervals`` is not integer-like.
        """
        self.year1 = int(year1)
        self.year2 = int(year2)
        self.stop_words = stop_words if stop_words is not None else []
        self.percent1 = percent1
        self.percent2 = percent2
        span = self.year2 - self.year1
        # Never more intervals than years in the span, and never fewer than one:
        # a zero count (year1 == year2, or intervals=0) would divide by zero below.
        self.num_intervals = max(1, min(int(num_intervals), span))
        # Floor division keeps the length an int so it can be fed to range().
        self.interval_len = span // self.num_intervals
        self.side_by_side = side_by_side
        self.image_width = 400 if side_by_side else 800

    def get_interval_data(self, interval_id):
        """Return ``(begin_year, end_year, years)`` for one interval.

        ``years`` lists every year from begin to end *inclusive* as strings, so
        neighbouring intervals share their boundary year.
        """
        interval_id = int(interval_id)
        interval_begin = self.interval_len * interval_id + self.year1
        interval_end = min(self.year2, interval_begin + self.interval_len)
        # Years are compared as strings -- presumably DOCS_DF['year'] holds
        # strings; verify against load_docs().
        year_list = [str(year) for year in range(interval_begin, interval_end + 1)]
        return interval_begin, interval_end, year_list

    def get_interval_num_docs(self, interval_id):
        """Return the sum of the 'num_docs' column over the interval's years."""
        _, _, year_list = self.get_interval_data(interval_id)
        return DOCS_DF[DOCS_DF['year'].isin(year_list)]['num_docs'].sum()

    def get_num_docs(self):
        """Return a list with the document count of every interval, in order."""
        return [self.get_interval_num_docs(interval_id)
                for interval_id in range(self.num_intervals)]

    def get_word_frequencies(self, interval_id):
        """Return up to 100 ``(word, count)`` pairs for one interval.

        Words are lower-cased, stop words are removed, the remainder is sorted
        by ascending count and cut to the ``percent1``..``percent2`` slice of
        that ranking; the 100 most frequent survivors are returned, still in
        ascending order.
        """
        _, _, year_list = self.get_interval_data(interval_id)
        text = DOCS_DF[DOCS_DF['year'].isin(year_list)]['lsa_abs'].sum()
        if not hasattr(text, 'split'):
            # An empty selection sums to a number rather than a string
            # (0 or NaN depending on the pandas version): nothing to count.
            return []
        stop_words = set(word.strip().lower() for word in self.stop_words)
        counts = Counter(token.strip().lower() for token in text.split())
        text_freq = [(word, count) for word, count in counts.items()
                     if word not in stop_words]
        text_freq.sort(key=lambda pair: pair[1])
        low_count = int(len(text_freq) * (self.percent1 * .01))
        high_count = int(len(text_freq) * (self.percent2 * .01))
        text_freq = text_freq[low_count:high_count]
        return text_freq[-100:]

    def get_wordcloud_img(self, interval_id):
        """Render the interval's word cloud and return it as an in-memory JPEG.

        The returned binary stream is rewound to position 0 so it can be handed
        straight to ``send_file``.
        """
        text_freq = self.get_word_frequencies(interval_id)
        cloud = WordCloud(font_path=FONT_PATH, width=self.image_width,
                          height=int(self.image_width * .75))
        # fit_words receives the pairs most-frequent first.
        cloud.fit_words(list(reversed(text_freq)))
        # JPEG data is binary, so it needs a bytes buffer, not a text one.
        img_io = BytesIO()
        cloud.to_image().save(img_io, 'JPEG', quality=70)
        img_io.seek(0)
        return img_io

    def get_bokeh_word_frequencies(self):
        """Build one Bokeh bar chart of word frequencies per interval.

        :returns: ``(plot_scripts, plot_divs)`` -- two parallel lists holding
            the script and div produced by ``bokeh.embed.components`` for each
            interval.
        """
        plot_scripts = []
        plot_divs = []
        for interval_id in range(self.num_intervals):
            text_freq = self.get_word_frequencies(interval_id)
            fig = figure(title="Word frequency", title_text_font_size="12pt",
                         plot_width=self.image_width, plot_height=150,
                         outline_line_color=None, tools="hover")
            # One bar per word: 0.7 units wide, starting at the word's rank.
            source = ColumnDataSource(
                data=dict(
                    left=list(range(len(text_freq))),
                    right=[i + 0.7 for i in range(len(text_freq))],
                    top=[count for _, count in text_freq],
                    word=[word for word, _ in text_freq],
                )
            )
            fig.quad("left", "right", "top", 0, source=source)
            fig.toolbar_location = None
            fig.grid.grid_line_color = None
            fig.xaxis.axis_line_color = None
            fig.xaxis.major_tick_line_color = None
            fig.xaxis.minor_tick_line_color = None
            fig.xaxis.major_label_text_color = None
            fig.yaxis.minor_tick_line_color = None
            hover = fig.select(dict(type=HoverTool))
            hover.tooltips = [
                ("word", "@word"),
                ("frequency", "@top"),
                ("fill color", "$color[hex, swatch]:fill_color"),
            ]
            script, div = components(fig, INLINE)
            plot_scripts.append(script)
            plot_divs.append(div)
        return plot_scripts, plot_divs


# Insertion-ordered so the oldest request can be evicted first. Entry 0 holds
# the default request and is never evicted.
REQUESTS = OrderedDict([(0, RequestData('1980', '2014', []))])


@app.route('/topic_space/')
def hello_world():
    """Trivial landing page."""
    return 'Hello World!'


@app.route("/topic_space/termite/")
def termite():
    """Render the termite plot built from TERMITE_FILE."""
    viz = Termite(TERMITE_FILE)
    script, div = viz.plot()
    return render_template('termite.html', script=script, div=div)


@app.route("/topic_space/diversity/")
def diversity():
    """Serve the static diversity page from the templates directory."""
    # NOTE(review): no diversity.html appears in the repository's templates
    # listing -- confirm the file is actually deployed.
    return send_file(os.path.join(tmpl_dir, 'diversity.html'))


@app.route("/topic_space/ldavis/")
def ldavis():
    """Render the LDAvis page."""
    return render_template("ldavis.html")


def _parse_percents(raw):
    """Parse a ``"LOW% - HIGH%"`` string into two ints; ``(0, 100)`` if malformed."""
    try:
        low, high = [int(part.strip()) for part in raw.strip().replace("%", '').split('-')]
    except ValueError:
        # Covers both non-numeric parts and a wrong number of parts.
        return 0, 100
    return low, high


def _store_request(req_id, req):
    """Remember *req* under *req_id*, evicting the oldest entries past the cap."""
    REQUESTS[req_id] = req
    while len(REQUESTS) > MAX_CACHED_REQUESTS:
        oldest = next(key for key in REQUESTS if key != 0)
        del REQUESTS[oldest]


def cache_request():
    """Build a RequestData from the current Flask request and cache it.

    Reads ``year1``, ``year2``, ``words`` (newline-separated stop words),
    ``percents``, ``intervals`` and ``side_by_side`` from the request values.
    Malformed numeric values fall back to their defaults instead of failing
    the request.

    :returns: the integer id under which the request was stored in ``REQUESTS``.
    """
    try:
        year1 = int(request.values.get('year1', '1980'))
        year2 = int(request.values.get('year2', '2014'))
    except ValueError:
        year1, year2 = 1980, 2014
    stop_words = [word.strip() for word in request.values.get('words', '').split('\n')]
    percent1, percent2 = _parse_percents(request.values.get("percents", "0% - 100%"))
    try:
        num_intervals = int(request.values.get('intervals', 1))
    except ValueError:
        num_intervals = 1
    # First field of a time-based UUID (its 32-bit time_low part) is the id.
    req_id = uuid1().fields[0]
    side_by_side = request.values.get("side_by_side", False) == "side_by_side"
    _store_request(req_id, RequestData(year1, year2, stop_words, percent1, percent2,
                                       num_intervals, side_by_side))
    return req_id
@app.route('/topic_space/wordcloud/', methods=["GET", "POST"]) 154 | def wordcloud(): 155 | req_id = cache_request() 156 | req = REQUESTS[req_id] 157 | 158 | start_years = [] 159 | end_years = [] 160 | for i in range(req.num_intervals): 161 | start_years.append(i*req.interval_len + req.year1) 162 | end_years.append(min(req.year2, start_years[-1] + req.interval_len)) 163 | plot_resources = RESOURCES.render( 164 | js_raw=INLINE.js_raw, 165 | css_raw=INLINE.css_raw, 166 | js_files=INLINE.js_files, 167 | css_files=INLINE.css_files, 168 | ) 169 | 170 | plot_scripts, plot_divs = req.get_bokeh_word_frequencies() 171 | num_docs = req.get_num_docs() 172 | return render_template('wordcloud.html', year1=req.year1, year2=req.year2, words=req.stop_words, req_id=req_id, 173 | percent1=req.percent1, percent2=req.percent2, num_intervals=req.num_intervals, 174 | start_years=start_years, end_years=end_years, 175 | plot_resources=plot_resources, plot_scripts=plot_scripts, plot_divs=plot_divs, 176 | num_docs=num_docs, side_by_side=req.side_by_side) 177 | 178 | 179 | @app.route('/topic_space///get_wordcloud.jpg') 180 | def wordcloud_img(req_id, interval_id): 181 | req = REQUESTS.get(int(req_id), REQUESTS[0]) 182 | return send_file(req.get_wordcloud_img(interval_id), mimetype='image/jpeg') 183 | 184 | 185 | if __name__ == '__main__': 186 | app.run(debug=True, host='0.0.0.0', port=8017) 187 | -------------------------------------------------------------------------------- /topic_space/app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | ELASTICSEARCH_HOST = "http://10.3.2.56:9200" 5 | ELASTICSEARCH_INDEX = "dig-mrs-dev16" -------------------------------------------------------------------------------- /topic_space/app/static/ldavis/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | LDAvis 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | 16 | 17 |
18 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /topic_space/app/static/ldavis/lda.css: -------------------------------------------------------------------------------- 1 | /* http://stackoverflow.com/questions/25194631/is-it-possible-to-always-show-up-down-arrows-for-input-number */ 2 | input[type=number]::-webkit-inner-spin-button, 3 | input[type=number]::-webkit-outer-spin-button { 4 | opacity: 1; 5 | } 6 | 7 | div .tab-content { 8 | overflow:hidden; 9 | } 10 | 11 | /* 12 | http://bl.ocks.org/mbostock/1212215 13 | */ 14 | 15 | .points:not(:hover) .docsize { 16 | display: none; 17 | } 18 | 19 | /* 20 | http://alignedleft.com/tutorials/d3/axes/ 21 | .axis path, 22 | .axis line { 23 | stroke: black; 24 | shape-rendering: crispEdges; 25 | } 26 | */ 27 | 28 | text { 29 | font-family: sans-serif; 30 | font-size: 11px; 31 | } 32 | 33 | .axis { 34 | shape-rendering: crispEdges; 35 | } 36 | 37 | /* 38 | this is the major grid line 39 | .x.axis line { 40 | stroke: lightgrey; 41 | } 42 | */ 43 | 44 | .xaxis .tick.major { 45 | fill: black; 46 | stroke: black; 47 | stroke-width: 0.1; 48 | opacity: 0.7; 49 | } 50 | 51 | .xaxis .tick.minor { 52 | display: none; 53 | } 54 | 55 | .xaxis line { 56 | opacity: 0.1; 57 | stroke-width: 1; 58 | } 59 | 60 | .xaxis path { 61 | display: none; 62 | } 63 | 64 | .inlineForm { 65 | display: inline-block; 66 | } 67 | 68 | .slideraxis .tick.major { 69 | fill: black; 70 | stroke: black; 71 | stroke-width: 0.4; 72 | opacity: 1; 73 | } 74 | 75 | .slideraxis .tick.minor { 76 | fill: black; 77 | stroke: black; 78 | stroke-width: 0.4; 79 | opacity: 1; 80 | } 81 | 82 | .slideraxis path { 83 | display: none; 84 | } 85 | -------------------------------------------------------------------------------- /topic_space/app/static/ldavis/ldavis.js: -------------------------------------------------------------------------------- 1 | LDAvis = function(to_select, json_file) { 2 | 3 | // This section sets 
up the logic for event handling 4 | var current_clicked = { 5 | what: "nothing", 6 | element: undefined 7 | }, 8 | current_hover = { 9 | what: "nothing", 10 | element: undefined 11 | }, 12 | old_winning_state = { 13 | what: "nothing", 14 | element: undefined 15 | }, 16 | vis_state = { 17 | lambda: 1, 18 | topic: 0, 19 | term: "" 20 | }; 21 | 22 | // Set up a few 'global' variables to hold the data: 23 | var K, // number of topics 24 | R, // number of terms to display in bar chart 25 | mdsData, // (x,y) locations and topic proportions 26 | mdsData3, // topic proportions for all terms in the viz 27 | lamData, // all terms that are among the top-R most relevant for all topics, lambda values 28 | lambda = { 29 | old: 1, 30 | current: 1 31 | }, 32 | color1 = "#1f77b4", // baseline color for default topic circles and overall term frequencies 33 | color2 = "#d62728"; // 'highlight' color for selected topics and term-topic frequencies 34 | 35 | // Set the duration of each half of the transition: 36 | var duration = 750; 37 | 38 | // Set global margins used for everything 39 | var margin = { 40 | top: 30, 41 | right: 30, 42 | bottom: 70, 43 | left: 30 44 | }, 45 | mdswidth = 530, 46 | mdsheight = 530, 47 | barwidth = 530, 48 | barheight = 530, 49 | termwidth = 90, // width to add between two panels to display terms 50 | mdsarea = mdsheight * mdswidth; 51 | // controls how big the maximum circle can be 52 | // doesn't depend on data, only on mds width and height: 53 | var rMax = 60; 54 | 55 | // proportion of area of MDS plot to which the sum of default topic circle areas is set 56 | var circle_prop = 0.25; 57 | var word_prop = 0.25; 58 | 59 | // opacity of topic circles: 60 | var base_opacity = 0.2, 61 | highlight_opacity = 0.6; 62 | 63 | // topic/lambda selection names are specific to *this* vis 64 | var topic_select = to_select + "-topic"; 65 | var lambda_select = to_select + "-lambda"; 66 | 67 | // get rid of the # in the to_select (useful) for setting ID values 68 | var 
parts = to_select.split("#"); 69 | var visID = parts[parts.length - 1]; 70 | var topicID = visID + "-topic"; 71 | var lambdaID = visID + "-lambda"; 72 | var termID = visID + "-term"; 73 | var topicDown = topicID + "-down"; 74 | var topicUp = topicID + "-up"; 75 | var topicClear = topicID + "-clear"; 76 | 77 | ////////////////////////////////////////////////////////////////////////////// 78 | 79 | // sort array according to a specified object key name 80 | // Note that default is decreasing sort, set decreasing = -1 for increasing 81 | // adpated from http://stackoverflow.com/questions/16648076/sort-array-on-key-value 82 | function fancysort(key_name, decreasing) { 83 | decreasing = (typeof decreasing === "undefined") ? 1 : decreasing; 84 | return function(a, b) { 85 | if (a[key_name] < b[key_name]) 86 | return 1 * decreasing; 87 | if (a[key_name] > b[key_name]) 88 | return -1 * decreasing; 89 | return 0; 90 | }; 91 | } 92 | 93 | 94 | // The actual read-in of the data and main code: 95 | d3.json(json_file, function(error, data) { 96 | 97 | // set the number of topics to global variable K: 98 | K = data['mdsDat'].x.length; 99 | 100 | // R is the number of top relevant (or salient) words whose bars we display 101 | R = data['R']; 102 | 103 | // a (K x 5) matrix with columns x, y, topics, Freq, cluster (where x and y are locations for left panel) 104 | mdsData = []; 105 | for (var i = 0; i < K; i++) { 106 | var obj = {}; 107 | for (var key in data['mdsDat']) { 108 | obj[key] = data['mdsDat'][key][i]; 109 | } 110 | mdsData.push(obj); 111 | } 112 | 113 | // a huge matrix with 3 columns: Term, Topic, Freq, where Freq is all non-zero probabilities of topics given terms 114 | // for the terms that appear in the barcharts for this data 115 | mdsData3 = []; 116 | for (var i = 0; i < data['token.table'].Term.length; i++) { 117 | var obj = {}; 118 | for (var key in data['token.table']) { 119 | obj[key] = data['token.table'][key][i]; 120 | } 121 | mdsData3.push(obj); 122 | } 123 
| 124 | // large data for the widths of bars in bar-charts. 6 columns: Term, logprob, loglift, Freq, Total, Category 125 | // Contains all possible terms for topics in (1, 2, ..., k) and lambda in the user-supplied grid of lambda values 126 | // which defaults to (0, 0.01, 0.02, ..., 0.99, 1). 127 | lamData = []; 128 | for (var i = 0; i < data['tinfo'].Term.length; i++) { 129 | var obj = {}; 130 | for (var key in data['tinfo']) { 131 | obj[key] = data['tinfo'][key][i]; 132 | } 133 | lamData.push(obj); 134 | } 135 | 136 | // Create the topic input & lambda slider forms. Inspired from: 137 | // http://bl.ocks.org/d3noob/10632804 138 | // http://bl.ocks.org/d3noob/10633704 139 | init_forms(topicID, lambdaID, visID); 140 | 141 | // When the value of lambda changes, update the visualization 142 | d3.select(lambda_select) 143 | .on("mouseup", function() { 144 | // store the previous lambda value 145 | lambda.old = lambda.current; 146 | lambda.current = document.getElementById(lambdaID).value; 147 | vis_state.lambda = +this.value; 148 | // adjust the text on the range slider 149 | d3.select(lambda_select).property("value", vis_state.lambda); 150 | d3.select(lambda_select + "-value").text(vis_state.lambda); 151 | // transition the order of the bars 152 | var increased = lambda.old < vis_state.lambda; 153 | if (vis_state.topic > 0) reorder_bars(increased); 154 | // store the current lambda value 155 | state_save(true); 156 | document.getElementById(lambdaID).value = vis_state.lambda; 157 | }); 158 | 159 | d3.select("#" + topicUp) 160 | .on("click", function() { 161 | // remove term selection if it exists (from a saved URL) 162 | var termElem = document.getElementById(termID + vis_state.term); 163 | if (termElem !== undefined) term_off(termElem); 164 | vis_state.term = ""; 165 | var value_old = document.getElementById(topicID).value; 166 | var value_new = Math.min(K, +value_old + 1).toFixed(0); 167 | // increment the value in the input box 168 | 
document.getElementById(topicID).value = value_new; 169 | topic_off(document.getElementById(topicID + value_old)); 170 | topic_on(document.getElementById(topicID + value_new)); 171 | vis_state.topic = value_new; 172 | state_save(true); 173 | }) 174 | 175 | d3.select("#" + topicDown) 176 | .on("click", function() { 177 | // remove term selection if it exists (from a saved URL) 178 | var termElem = document.getElementById(termID + vis_state.term); 179 | if (termElem !== undefined) term_off(termElem); 180 | vis_state.term = ""; 181 | var value_old = document.getElementById(topicID).value; 182 | var value_new = Math.max(0, +value_old - 1).toFixed(0); 183 | // increment the value in the input box 184 | document.getElementById(topicID).value = value_new; 185 | topic_off(document.getElementById(topicID + value_old)); 186 | topic_on(document.getElementById(topicID + value_new)); 187 | vis_state.topic = value_new; 188 | state_save(true); 189 | }) 190 | 191 | d3.select("#" + topicID) 192 | .on("keyup", function() { 193 | // remove term selection if it exists (from a saved URL) 194 | var termElem = document.getElementById(termID + vis_state.term); 195 | if (termElem !== undefined) term_off(termElem); 196 | vis_state.term = ""; 197 | topic_off(document.getElementById(topicID + vis_state.topic)) 198 | var value_new = document.getElementById(topicID).value; 199 | if (!isNaN(value_new) && value_new > 0) { 200 | value_new = Math.min(K, Math.max(1, value_new)) 201 | topic_on(document.getElementById(topicID + value_new)); 202 | vis_state.topic = value_new; 203 | state_save(true); 204 | document.getElementById(topicID).value = vis_state.topic; 205 | } 206 | }) 207 | 208 | d3.select("#" + topicClear) 209 | .on("click", function() { 210 | state_reset(); 211 | state_save(true); 212 | }) 213 | 214 | // create linear scaling to pixels (and add some padding on outer region of scatterplot) 215 | var xrange = d3.extent(mdsData, function(d) { 216 | return d.x; 217 | }); //d3.extent returns 
min and max of an array 218 | var xdiff = xrange[1] - xrange[0], 219 | xpad = 0.05; 220 | var yrange = d3.extent(mdsData, function(d) { 221 | return d.y; 222 | }); 223 | var ydiff = yrange[1] - yrange[0], 224 | ypad = 0.05; 225 | 226 | if (xdiff > ydiff) { 227 | var xScale = d3.scale.linear() 228 | .range([0, mdswidth]) 229 | .domain([xrange[0] - xpad * xdiff, xrange[1] + xpad * xdiff]); 230 | 231 | var yScale = d3.scale.linear() 232 | .range([mdsheight, 0]) 233 | .domain([yrange[0] - 0.5*(xdiff - ydiff) - ypad*xdiff, yrange[1] + 0.5*(xdiff - ydiff) + ypad*xdiff]); 234 | } else { 235 | var xScale = d3.scale.linear() 236 | .range([0, mdswidth]) 237 | .domain([xrange[0] - 0.5*(ydiff - xdiff) - xpad*ydiff, xrange[1] + 0.5*(ydiff - xdiff) + xpad*ydiff]); 238 | 239 | var yScale = d3.scale.linear() 240 | .range([mdsheight, 0]) 241 | .domain([yrange[0] - ypad * ydiff, yrange[1] + ypad * ydiff]); 242 | } 243 | 244 | // Create new svg element (that will contain everything): 245 | var svg = d3.select(to_select).append("svg") 246 | .attr("width", mdswidth + barwidth + margin.left + termwidth + margin.right) 247 | .attr("height", mdsheight + 2 * margin.top + margin.bottom + 2 * rMax); 248 | 249 | // Create a group for the mds plot 250 | var mdsplot = svg.append("g") 251 | .attr("id", "leftpanel") 252 | .attr("class", "points") 253 | .attr("transform", "translate(" + margin.left + "," + 2 * margin.top + ")"); 254 | 255 | // Clicking on the mdsplot should clear the selection 256 | mdsplot 257 | .append("rect") 258 | .attr("x", 0) 259 | .attr("y", 0) 260 | .attr("height", mdsheight) 261 | .attr("width", mdswidth) 262 | .style("fill", color1) 263 | .attr("opacity", 0) 264 | .on("click", function() { 265 | state_reset(); 266 | state_save(true); 267 | }); 268 | 269 | mdsplot.append("line") // draw x-axis 270 | .attr("x1", 0) 271 | .attr("x2", mdswidth) 272 | .attr("y1", mdsheight / 2) 273 | .attr("y2", mdsheight / 2) 274 | .attr("stroke", "gray") 275 | .attr("opacity", 0.3); 276 | 
mdsplot.append("text") // label x-axis 277 | .attr("x", 0) 278 | .attr("y", mdsheight/2 - 5) 279 | .text(data['plot.opts'].xlab) 280 | .attr("fill", "gray"); 281 | 282 | mdsplot.append("line") // draw y-axis 283 | .attr("x1", mdswidth / 2) 284 | .attr("x2", mdswidth / 2) 285 | .attr("y1", 0) 286 | .attr("y2", mdsheight) 287 | .attr("stroke", "gray") 288 | .attr("opacity", 0.3); 289 | mdsplot.append("text") // label y-axis 290 | .attr("x", mdswidth/2 + 5) 291 | .attr("y", 7) 292 | .text(data['plot.opts'].ylab) 293 | .attr("fill", "gray"); 294 | 295 | // new definitions based on fixing the sum of the areas of the default topic circles: 296 | var newSmall = Math.sqrt(0.02*mdsarea*circle_prop/Math.PI); 297 | var newMedium = Math.sqrt(0.05*mdsarea*circle_prop/Math.PI); 298 | var newLarge = Math.sqrt(0.10*mdsarea*circle_prop/Math.PI); 299 | var cx = 10 + newLarge, 300 | cx2 = cx + 1.5 * newLarge; 301 | 302 | // circle guide inspired from 303 | // http://www.nytimes.com/interactive/2012/02/13/us/politics/2013-budget-proposal-graphic.html?_r=0 304 | circleGuide = function(rSize, size) { 305 | d3.select("#leftpanel").append("circle") 306 | .attr('class', "circleGuide" + size) 307 | .attr('r', rSize) 308 | .attr('cx', cx) 309 | .attr('cy', mdsheight + rSize) 310 | .style('fill', 'none') 311 | .style('stroke-dasharray', '2 2') 312 | .style('stroke', '#999'); 313 | d3.select("#leftpanel").append("line") 314 | .attr('class', "lineGuide" + size) 315 | .attr("x1", cx) 316 | .attr("x2", cx2) 317 | .attr("y1", mdsheight + 2 * rSize) 318 | .attr("y2", mdsheight + 2 * rSize) 319 | .style("stroke", "gray") 320 | .style("opacity", 0.3); 321 | } 322 | 323 | circleGuide(newSmall, "Small"); 324 | circleGuide(newMedium, "Medium"); 325 | circleGuide(newLarge, "Large"); 326 | 327 | var defaultLabelSmall = "2%"; 328 | var defaultLabelMedium = "5%"; 329 | var defaultLabelLarge = "10%"; 330 | 331 | d3.select("#leftpanel").append("text") 332 | .attr("x", 10) 333 | .attr("y", mdsheight - 10) 334 
| .attr('class', "circleGuideTitle") 335 | .style("text-anchor", "left") 336 | .style("fontWeight", "bold") 337 | .text("Marginal topic distribtion"); 338 | d3.select("#leftpanel").append("text") 339 | .attr("x", cx2 + 10) 340 | .attr("y", mdsheight + 2 * newSmall) 341 | .attr('class', "circleGuideLabelSmall") 342 | .style("text-anchor", "start") 343 | .text(defaultLabelSmall); 344 | d3.select("#leftpanel").append("text") 345 | .attr("x", cx2 + 10) 346 | .attr("y", mdsheight + 2 * newMedium) 347 | .attr('class', "circleGuideLabelMedium") 348 | .style("text-anchor", "start") 349 | .text(defaultLabelMedium); 350 | d3.select("#leftpanel").append("text") 351 | .attr("x", cx2 + 10) 352 | .attr("y", mdsheight + 2 * newLarge) 353 | .attr('class', "circleGuideLabelLarge") 354 | .style("text-anchor", "start") 355 | .text(defaultLabelLarge); 356 | 357 | // bind mdsData to the points in the left panel: 358 | var points = mdsplot.selectAll("points") 359 | .data(mdsData) 360 | .enter(); 361 | 362 | // text to indicate topic 363 | points.append("text") 364 | .attr("class", "txt") 365 | .attr("x", function(d) { 366 | return (xScale(+d.x)); 367 | }) 368 | .attr("y", function(d) { 369 | return (yScale(+d.y) + 4); 370 | }) 371 | .attr("stroke", "black") 372 | .attr("opacity", 1) 373 | .style("text-anchor", "middle") 374 | .style("font-size", "11px") 375 | .style("fontWeight", 100) 376 | .text(function(d) { 377 | return d.topics; 378 | }); 379 | 380 | // draw circles 381 | points.append("circle") 382 | .attr("class", "dot") 383 | .style("opacity", 0.2) 384 | .style("fill", color1) 385 | .attr("r", function(d) { 386 | //return (rScaleMargin(+d.Freq)); 387 | return (Math.sqrt((d.Freq/100)*mdswidth*mdsheight*circle_prop/Math.PI)); 388 | }) 389 | .attr("cx", function(d) { 390 | return (xScale(+d.x)); 391 | }) 392 | .attr("cy", function(d) { 393 | return (yScale(+d.y)); 394 | }) 395 | .attr("stroke", "black") 396 | .attr("id", function(d) { 397 | return (topicID + d.topics) 398 | }) 399 | 
.on("mouseover", function(d) { 400 | var old_topic = topicID + vis_state.topic; 401 | if (vis_state.topic > 0 && old_topic != this.id) { 402 | topic_off(document.getElementById(old_topic)); 403 | } 404 | topic_on(this); 405 | }) 406 | .on("click", function(d) { 407 | // prevent click event defined on the div container from firing 408 | // http://bl.ocks.org/jasondavies/3186840 409 | d3.event.stopPropagation(); 410 | var old_topic = topicID + vis_state.topic; 411 | if (vis_state.topic > 0 && old_topic != this.id) { 412 | topic_off(document.getElementById(old_topic)); 413 | } 414 | // make sure topic input box value and fragment reflects clicked selection 415 | document.getElementById(topicID).value = vis_state.topic = d.topics; 416 | state_save(true); 417 | topic_on(this); 418 | }) 419 | .on("mouseout", function(d) { 420 | if (vis_state.topic != d.topics) topic_off(this); 421 | if (vis_state.topic > 0) topic_on(document.getElementById(topicID + vis_state.topic)); 422 | }); 423 | 424 | svg.append("text") 425 | .text("Intertopic Distance Map (via multidimensional scaling)") 426 | .attr("x", mdswidth/2 + margin.left) 427 | .attr("y", 30) 428 | .style("font-size", "16px") 429 | .style("text-anchor", "middle"); 430 | 431 | // establish layout and vars for bar chart 432 | var barDefault2 = lamData.filter(function(d) { 433 | return d.Category == "Default" 434 | }); 435 | 436 | var y = d3.scale.ordinal() 437 | .domain(barDefault2.map(function(d) { 438 | return d.Term; 439 | })) 440 | .rangeRoundBands([0, barheight], 0.15); 441 | var x = d3.scale.linear() 442 | .domain([1, d3.max(barDefault2, function(d) { 443 | return d.Total; 444 | })]) 445 | .range([0, barwidth]) 446 | .nice(); 447 | var yAxis = d3.svg.axis() 448 | .scale(y); 449 | 450 | // Add a group for the bar chart 451 | var chart = svg.append("g") 452 | .attr("transform", "translate(" + +(mdswidth + margin.left + termwidth) + "," + 2 * margin.top + ")") 453 | .attr("id", "bar-freqs"); 454 | 455 | // bar chart 
legend/guide: 456 | var barguide = {"width": 100, "height": 15}; 457 | d3.select("#bar-freqs").append("rect") 458 | .attr("x", 0) 459 | .attr("y", mdsheight + 10) 460 | .attr("height", barguide.height) 461 | .attr("width", barguide.width) 462 | .style("fill", color1) 463 | .attr("opacity", 0.4); 464 | d3.select("#bar-freqs").append("text") 465 | .attr("x", barguide.width + 5) 466 | .attr("y", mdsheight + 10 + barguide.height/2) 467 | .style("dominant-baseline", "middle") 468 | .text("Overall term frequency"); 469 | 470 | d3.select("#bar-freqs").append("rect") 471 | .attr("x", 0) 472 | .attr("y", mdsheight + 10 + barguide.height + 5) 473 | .attr("height", barguide.height) 474 | .attr("width", barguide.width/2) 475 | .style("fill", color2) 476 | .attr("opacity", 0.8); 477 | d3.select("#bar-freqs").append("text") 478 | .attr("x", barguide.width/2 + 5) 479 | .attr("y", mdsheight + 10 + (3/2)*barguide.height + 5) 480 | .style("dominant-baseline", "middle") 481 | .text("Estimated term frequency within the selected topic"); 482 | 483 | // footnotes: 484 | d3.select("#bar-freqs") 485 | .append("a") 486 | .attr("xlink:href", "http://vis.stanford.edu/files/2012-Termite-AVI.pdf") 487 | .attr("target", "_blank") 488 | .append("text") 489 | .attr("x", 0) 490 | .attr("y", mdsheight + 10 + (6/2)*barguide.height + 5) 491 | .style("dominant-baseline", "middle") 492 | .text("1. saliency(term w) = frequency(w) * [sum_t p(t | w) * log(p(t | w)/p(t))] for topics t; see Chuang et. al (2012)"); 493 | d3.select("#bar-freqs") 494 | .append("a") 495 | .attr("xlink:href", "http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf") 496 | .attr("target", "_blank") 497 | .append("text") 498 | .attr("x", 0) 499 | .attr("y", mdsheight + 10 + (8/2)*barguide.height + 5) 500 | .style("dominant-baseline", "middle") 501 | .text("2. 
relevance(term w | topic t) = \u03BB * p(w | t) + (1 - \u03BB) * p(w | t)/p(w); see Sievert & Shirley (2014)"); 502 | 503 | // Bind 'default' data to 'default' bar chart 504 | var basebars = chart.selectAll(".bar-totals") 505 | .data(barDefault2) 506 | .enter(); 507 | 508 | // Draw the gray background bars defining the overall frequency of each word 509 | basebars 510 | .append("rect") 511 | .attr("class", "bar-totals") 512 | .attr("x", 0) 513 | .attr("y", function(d) { 514 | return y(d.Term); 515 | }) 516 | .attr("height", y.rangeBand()) 517 | .attr("width", function(d) { 518 | return x(d.Total); 519 | }) 520 | .style("fill", color1) 521 | .attr("opacity", 0.4); 522 | 523 | // Add word labels to the side of each bar 524 | basebars 525 | .append("text") 526 | .attr("x", -5) 527 | .attr("class", "terms") 528 | .attr("y", function(d) { 529 | return y(d.Term) + 12; 530 | }) 531 | .attr("cursor", "pointer") 532 | .attr("id", function(d) { 533 | return (termID + d.Term) 534 | }) 535 | .style("text-anchor", "end") // right align text - use 'middle' for center alignment 536 | .text(function(d) { 537 | return d.Term; 538 | }) 539 | .on("mouseover", function() { 540 | term_hover(this); 541 | }) 542 | // .on("click", function(d) { 543 | // var old_term = termID + vis_state.term; 544 | // if (vis_state.term != "" && old_term != this.id) { 545 | // term_off(document.getElementById(old_term)); 546 | // } 547 | // vis_state.term = d.Term; 548 | // state_save(true); 549 | // term_on(this); 550 | // debugger; 551 | // }) 552 | .on("mouseout", function() { 553 | vis_state.term = ""; 554 | term_off(this); 555 | state_save(true); 556 | }); 557 | 558 | var title = chart.append("text") 559 | .attr("x", barwidth/2) 560 | .attr("y", -30) 561 | .attr("class", "bubble-tool") // set class so we can remove it when highlight_off is called 562 | .style("text-anchor", "middle") 563 | .style("font-size", "16px") 564 | .text("Top-" + R + " Most Salient Terms"); 565 | 566 | title.append("tspan") 
567 | .attr("baseline-shift", "super") 568 | .attr("font-size", "12px") 569 | .text("(1)"); 570 | 571 | // barchart axis adapted from http://bl.ocks.org/mbostock/1166403 572 | var xAxis = d3.svg.axis().scale(x) 573 | .orient("top") 574 | .tickSize(-barheight) 575 | .tickSubdivide(true) 576 | .ticks(6); 577 | 578 | chart.attr("class", "xaxis") 579 | .call(xAxis); 580 | 581 | // dynamically create the topic and lambda input forms at the top of the page: 582 | function init_forms(topicID, lambdaID, visID) { 583 | 584 | // create container div for topic and lambda input: 585 | var inputDiv = document.createElement("div"); 586 | inputDiv.setAttribute("id", "top"); 587 | 588 | // insert the input container just before the vis: 589 | var visDiv = document.getElementById(visID); 590 | document.body.insertBefore(inputDiv, visDiv); 591 | 592 | // topic input container: 593 | var topicDiv = document.createElement("div"); 594 | topicDiv.setAttribute("style", "padding: 5px; background-color: #e8e8e8; position: absolute; top: 10px; left: 38px; height: 40px; width: " + mdswidth + "px; display: inline-block"); 595 | inputDiv.appendChild(topicDiv); 596 | 597 | var topicLabel = document.createElement("label"); 598 | topicLabel.setAttribute("for", topicID); 599 | topicLabel.setAttribute("style", "font-family: sans-serif; font-size: 14px"); 600 | topicLabel.innerHTML = "Selected Topic: "; 601 | topicDiv.appendChild(topicLabel); 602 | 603 | var topicInput = document.createElement("input"); 604 | topicInput.setAttribute("style", "width: 50px"); 605 | topicInput.type = "text"; 606 | topicInput.min = "0"; 607 | topicInput.max = K; // assumes the data has already been read in 608 | topicInput.step = "1"; 609 | topicInput.value = "0"; // a value of 0 indicates no topic is selected 610 | topicInput.id = topicID; 611 | topicDiv.appendChild(topicInput); 612 | 613 | var previous = document.createElement("button"); 614 | previous.setAttribute("id", topicDown); 615 | 
previous.setAttribute("style", "margin-left: 5px"); 616 | previous.innerHTML = "Previous Topic"; 617 | topicDiv.appendChild(previous); 618 | 619 | var next = document.createElement("button"); 620 | next.setAttribute("id", topicUp); 621 | next.setAttribute("style", "margin-left: 5px"); 622 | next.innerHTML = "Next Topic"; 623 | topicDiv.appendChild(next); 624 | 625 | var clear = document.createElement("button"); 626 | clear.setAttribute("id", topicClear); 627 | clear.setAttribute("style", "margin-left: 5px"); 628 | clear.innerHTML = "Clear Topic"; 629 | topicDiv.appendChild(clear); 630 | 631 | // lambda inputs 632 | var lambdaDivLeft = 8 + mdswidth + margin.left + termwidth; 633 | var lambdaDivWidth = barwidth; 634 | var lambdaDiv = document.createElement("div"); 635 | lambdaDiv.setAttribute("id", "lambdaInput"); 636 | lambdaDiv.setAttribute("style", "padding: 5px; background-color: #e8e8e8; position: absolute; top: 10px; left: " + 637 | lambdaDivLeft + "px; height: 50px; width: " + lambdaDivWidth + "px"); 638 | inputDiv.appendChild(lambdaDiv); 639 | 640 | var lambdaZero = document.createElement("div"); 641 | lambdaZero.setAttribute("style", "padding: 5px; height: 20px; width: 220px; font-family: sans-serif; position: absolute; top: 0px; left: 0px;"); 642 | lambdaZero.setAttribute("id", "lambdaZero"); 643 | lambdaDiv.appendChild(lambdaZero); 644 | var xx = d3.select("#lambdaZero") 645 | .append("text") 646 | .attr("x", 0) 647 | .attr("y", 0) 648 | .style("font-size", "14px") 649 | .text("Slide to adjust relevance metric:"); 650 | var yy = d3.select("#lambdaZero") 651 | .append("text") 652 | .attr("x", 125) 653 | .attr("y", -5) 654 | .style("font-size", "10px") 655 | .style("position", "absolute") 656 | .text("(2)"); 657 | 658 | var lambdaLabel = document.createElement("label"); 659 | lambdaLabel.setAttribute("for", lambdaID); 660 | lambdaLabel.setAttribute("style", "height: 20px; width: 60px; position: absolute; top: 25px; left: 90px; font-family: sans-serif; 
font-size: 14px"); 661 | lambdaLabel.innerHTML = "λ = " + vis_state.lambda + ""; 662 | lambdaDiv.appendChild(lambdaLabel); 663 | 664 | var sliderDiv = document.createElement("div"); 665 | sliderDiv.setAttribute("id", "sliderdiv"); 666 | sliderDiv.setAttribute("style", "padding: 5px; height: 40px; position: absolute; top:0px; left: 240px; width: 250px"); 667 | lambdaDiv.appendChild(sliderDiv); 668 | 669 | var lambdaInput = document.createElement("input"); 670 | lambdaInput.setAttribute("style", "width: 250px; margin-top: -20px; margin-left: 0px; margin-right: 0px"); 671 | lambdaInput.type = "range"; 672 | lambdaInput.min = 0; 673 | lambdaInput.max = 1; 674 | lambdaInput.step = data['lambda.step']; 675 | lambdaInput.value = vis_state.lambda; 676 | lambdaInput.id = lambdaID; 677 | lambdaInput.setAttribute("list", "ticks"); // to enable automatic ticks (with no labels, see below) 678 | sliderDiv.appendChild(lambdaInput); 679 | 680 | // Create the svg to contain the slider scale: 681 | var scaleContainer = d3.select("#sliderdiv").append("svg") 682 | .attr("width", 250) 683 | .attr("height", 25); 684 | 685 | var sliderScale = d3.scale.linear() 686 | .domain([0, 1]) 687 | .range([7.5, 242.5]) // trimmed by 7.5px on each side to match the input type=range slider: 688 | .nice(); 689 | 690 | // adapted from http://bl.ocks.org/mbostock/1166403 691 | var sliderAxis = d3.svg.axis() 692 | .scale(sliderScale) 693 | .orient("bottom") 694 | .tickSize(10) 695 | .tickSubdivide(true) 696 | .ticks(6); 697 | 698 | // group to contain the elements of the slider axis: 699 | var sliderAxisGroup = scaleContainer.append("g") 700 | .attr("class", "slideraxis") 701 | .attr("margin-top", "-10px") 702 | .call(sliderAxis); 703 | 704 | // Another strategy for tick marks on the slider; simpler, but not labels 705 | // var sliderTicks = document.createElement("datalist"); 706 | // sliderTicks.setAttribute("id", "ticks"); 707 | // for (var tick = 0; tick <= 10; tick++) { 708 | // var tickOption = 
document.createElement("option"); 709 | // //tickOption.value = tick/10; 710 | // tickOption.innerHTML = tick/10; 711 | // sliderTicks.appendChild(tickOption); 712 | // } 713 | // append the forms to the containers 714 | //lambdaDiv.appendChild(sliderTicks); 715 | 716 | } 717 | 718 | // function to re-order the bars (gray and red), and terms: 719 | function reorder_bars(increase) { 720 | // grab the bar-chart data for this topic only: 721 | var dat2 = lamData.filter(function(d) { 722 | //return d.Category == "Topic" + Math.min(K, Math.max(0, vis_state.topic)) // fails for negative topic numbers... 723 | return d.Category == "Topic" + vis_state.topic; 724 | }); 725 | // define relevance: 726 | for (var i = 0; i < dat2.length; i++) { 727 | dat2[i].relevance = vis_state.lambda * dat2[i].logprob + 728 | (1 - vis_state.lambda) * dat2[i].loglift; 729 | } 730 | 731 | // sort by relevance: 732 | dat2.sort(fancysort("relevance")); 733 | 734 | // truncate to the top R tokens: 735 | var dat3 = dat2.slice(0, R); 736 | 737 | var y = d3.scale.ordinal() 738 | .domain(dat3.map(function(d) { 739 | return d.Term; 740 | })) 741 | .rangeRoundBands([0, barheight], 0.15); 742 | var x = d3.scale.linear() 743 | .domain([1, d3.max(dat3, function(d) { 744 | return d.Total; 745 | })]) 746 | .range([0, barwidth]) 747 | .nice(); 748 | 749 | // Change Total Frequency bars 750 | var graybars = d3.select("#bar-freqs") 751 | .selectAll(".bar-totals") 752 | .data(dat3, function(d) { 753 | return d.Term; 754 | }); 755 | 756 | // Change word labels 757 | var labels = d3.select("#bar-freqs") 758 | .selectAll(".terms") 759 | .data(dat3, function(d) { 760 | return d.Term; 761 | }); 762 | 763 | // Create red bars (drawn over the gray ones) to signify the frequency under the selected topic 764 | var redbars = d3.select("#bar-freqs") 765 | .selectAll(".overlay") 766 | .data(dat3, function(d) { 767 | return d.Term; 768 | }); 769 | 770 | // adapted from http://bl.ocks.org/mbostock/1166403 771 | var xAxis = 
d3.svg.axis().scale(x) 772 | .orient("top") 773 | .tickSize(-barheight) 774 | .tickSubdivide(true) 775 | .ticks(6); 776 | 777 | // New axis definition: 778 | var newaxis = d3.selectAll(".xaxis"); 779 | 780 | // define the new elements to enter: 781 | var graybarsEnter = graybars.enter().append("rect") 782 | .attr("class", "bar-totals") 783 | .attr("x", 0) 784 | .attr("y", function(d) { 785 | return y(d.Term) + barheight + margin.bottom + 2 * rMax; 786 | }) 787 | .attr("height", y.rangeBand()) 788 | .style("fill", color1) 789 | .attr("opacity", 0.4); 790 | 791 | var labelsEnter = labels.enter() 792 | .append("text") 793 | .attr("x", -5) 794 | .attr("class", "terms") 795 | .attr("y", function(d) { 796 | return y(d.Term) + 12 + barheight + margin.bottom + 2 * rMax; 797 | }) 798 | .attr("cursor", "pointer") 799 | .style("text-anchor", "end") 800 | .attr("id", function(d) { 801 | return (termID + d.Term) 802 | }) 803 | .text(function(d) { 804 | return d.Term; 805 | }) 806 | .on("mouseover", function() { 807 | term_hover(this); 808 | }) 809 | // .on("click", function(d) { 810 | // var old_term = termID + vis_state.term; 811 | // if (vis_state.term != "" && old_term != this.id) { 812 | // term_off(document.getElementById(old_term)); 813 | // } 814 | // vis_state.term = d.Term; 815 | // state_save(true); 816 | // term_on(this); 817 | // }) 818 | .on("mouseout", function() { 819 | vis_state.term = ""; 820 | term_off(this); 821 | state_save(true); 822 | }); 823 | 824 | var redbarsEnter = redbars.enter().append("rect") 825 | .attr("class", "overlay") 826 | .attr("x", 0) 827 | .attr("y", function(d) { 828 | return y(d.Term) + barheight + margin.bottom + 2 * rMax; 829 | }) 830 | .attr("height", y.rangeBand()) 831 | .style("fill", color2) 832 | .attr("opacity", 0.8); 833 | 834 | 835 | if (increase) { 836 | graybarsEnter 837 | .attr("width", function(d) { 838 | return x(d.Total); 839 | }) 840 | .transition().duration(duration) 841 | .delay(duration) 842 | .attr("y", function(d) { 
843 | return y(d.Term); 844 | }); 845 | labelsEnter 846 | .transition().duration(duration) 847 | .delay(duration) 848 | .attr("y", function(d) { 849 | return y(d.Term) + 12; 850 | }); 851 | redbarsEnter 852 | .attr("width", function(d) { 853 | return x(d.Freq); 854 | }) 855 | .transition().duration(duration) 856 | .delay(duration) 857 | .attr("y", function(d) { 858 | return y(d.Term); 859 | }); 860 | 861 | graybars.transition().duration(duration) 862 | .attr("width", function(d) { 863 | return x(d.Total); 864 | }) 865 | .transition().duration(duration) 866 | .attr("y", function(d) { 867 | return y(d.Term); 868 | }); 869 | labels.transition().duration(duration) 870 | .delay(duration) 871 | .attr("y", function(d) { 872 | return y(d.Term) + 12; 873 | }); 874 | redbars.transition().duration(duration) 875 | .attr("width", function(d) { 876 | return x(d.Freq); 877 | }) 878 | .transition().duration(duration) 879 | .attr("y", function(d) { 880 | return y(d.Term); 881 | }); 882 | 883 | // Transition exiting rectangles to the bottom of the barchart: 884 | graybars.exit() 885 | .transition().duration(duration) 886 | .attr("width", function(d) { 887 | return x(d.Total); 888 | }) 889 | .transition().duration(duration) 890 | .attr("y", function(d, i) { 891 | return barheight + margin.bottom + 6 + i * 18; 892 | }) 893 | .remove(); 894 | labels.exit() 895 | .transition().duration(duration) 896 | .delay(duration) 897 | .attr("y", function(d, i) { 898 | return barheight + margin.bottom + 18 + i * 18; 899 | }) 900 | .remove(); 901 | redbars.exit() 902 | .transition().duration(duration) 903 | .attr("width", function(d) { 904 | return x(d.Freq); 905 | }) 906 | .transition().duration(duration) 907 | .attr("y", function(d, i) { 908 | return barheight + margin.bottom + 6 + i * 18; 909 | }) 910 | .remove(); 911 | // https://github.com/mbostock/d3/wiki/Transitions#wiki-d3_ease 912 | newaxis.transition().duration(duration) 913 | .call(xAxis) 914 | .transition().duration(duration); 915 | } 
else { 916 | graybarsEnter 917 | .attr("width", 100) // FIXME by looking up old width of these bars 918 | .transition().duration(duration) 919 | .attr("y", function(d) { 920 | return y(d.Term); 921 | }) 922 | .transition().duration(duration) 923 | .attr("width", function(d) { 924 | return x(d.Total); 925 | }); 926 | labelsEnter 927 | .transition().duration(duration) 928 | .attr("y", function(d) { 929 | return y(d.Term) + 12; 930 | }); 931 | redbarsEnter 932 | .attr("width", 50) // FIXME by looking up old width of these bars 933 | .transition().duration(duration) 934 | .attr("y", function(d) { 935 | return y(d.Term); 936 | }) 937 | .transition().duration(duration) 938 | .attr("width", function(d) { 939 | return x(d.Freq); 940 | }); 941 | 942 | graybars.transition().duration(duration) 943 | .attr("y", function(d) { 944 | return y(d.Term); 945 | }) 946 | .transition().duration(duration) 947 | .attr("width", function(d) { 948 | return x(d.Total); 949 | }); 950 | labels.transition().duration(duration) 951 | .attr("y", function(d) { 952 | return y(d.Term) + 12; 953 | }); 954 | redbars.transition().duration(duration) 955 | .attr("y", function(d) { 956 | return y(d.Term); 957 | }) 958 | .transition().duration(duration) 959 | .attr("width", function(d) { 960 | return x(d.Freq); 961 | }); 962 | 963 | // Transition exiting rectangles to the bottom of the barchart: 964 | graybars.exit() 965 | .transition().duration(duration) 966 | .attr("y", function(d, i) { 967 | return barheight + margin.bottom + 6 + i * 18 + 2 * rMax; 968 | }) 969 | .remove(); 970 | labels.exit() 971 | .transition().duration(duration) 972 | .attr("y", function(d, i) { 973 | return barheight + margin.bottom + 18 + i * 18 + 2 * rMax; 974 | }) 975 | .remove(); 976 | redbars.exit() 977 | .transition().duration(duration) 978 | .attr("y", function(d, i) { 979 | return barheight + margin.bottom + 6 + i * 18 + 2 * rMax; 980 | }) 981 | .remove(); 982 | 983 | // 
https://github.com/mbostock/d3/wiki/Transitions#wiki-d3_ease 984 | newaxis.transition().duration(duration) 985 | .transition().duration(duration) 986 | .call(xAxis); 987 | } 988 | } 989 | 990 | ////////////////////////////////////////////////////////////////////////////// 991 | 992 | // function to update bar chart when a topic is selected 993 | // the circle argument should be the appropriate circle element 994 | function topic_on(circle) { 995 | if (circle == null) return null; 996 | 997 | // grab data bound to this element 998 | var d = circle.__data__ 999 | var Freq = Math.round(d.Freq * 10) / 10, 1000 | topics = d.topics; 1001 | 1002 | // change opacity and fill of the selected circle 1003 | circle.style.opacity = highlight_opacity; 1004 | circle.style.fill = color2; 1005 | 1006 | // Remove 'old' bar chart title 1007 | var text = d3.select(".bubble-tool"); 1008 | text.remove(); 1009 | 1010 | // append text with info relevant to topic of interest 1011 | d3.select("#bar-freqs") 1012 | .append("text") 1013 | .attr("x", barwidth/2) 1014 | .attr("y", -30) 1015 | .attr("class", "bubble-tool") // set class so we can remove it when highlight_off is called 1016 | .style("text-anchor", "middle") 1017 | .style("font-size", "16px") 1018 | .text("Top-" + R + " Most Relevant Terms for Topic " + topics + " (" + Freq + "% of tokens)"); 1019 | 1020 | // grab the bar-chart data for this topic only: 1021 | var dat2 = lamData.filter(function(d) { 1022 | return d.Category == "Topic" + topics 1023 | }); 1024 | 1025 | // define relevance: 1026 | for (var i = 0; i < dat2.length; i++) { 1027 | dat2[i].relevance = lambda.current * dat2[i].logprob + 1028 | (1 - lambda.current) * dat2[i].loglift; 1029 | } 1030 | 1031 | // sort by relevance: 1032 | dat2.sort(fancysort("relevance")); 1033 | 1034 | // truncate to the top R tokens: 1035 | var dat3 = dat2.slice(0, R); 1036 | 1037 | // scale the bars to the top R terms: 1038 | var y = d3.scale.ordinal() 1039 | .domain(dat3.map(function(d) { 
1040 | return d.Term; 1041 | })) 1042 | .rangeRoundBands([0, barheight], 0.15); 1043 | var x = d3.scale.linear() 1044 | .domain([1, d3.max(dat3, function(d) { 1045 | return d.Total; 1046 | })]) 1047 | .range([0, barwidth]) 1048 | .nice(); 1049 | 1050 | // remove the red bars if there are any: 1051 | d3.selectAll(".overlay").remove(); 1052 | 1053 | // Change Total Frequency bars 1054 | d3.selectAll(".bar-totals") 1055 | .data(dat3) 1056 | .attr("x", 0) 1057 | .attr("y", function(d) { 1058 | return y(d.Term); 1059 | }) 1060 | .attr("height", y.rangeBand()) 1061 | .attr("width", function(d) { 1062 | return x(d.Total); 1063 | }) 1064 | .style("fill", color1) 1065 | .attr("opacity", 0.4); 1066 | 1067 | // Change word labels 1068 | d3.selectAll(".terms") 1069 | .data(dat3) 1070 | .attr("x", -5) 1071 | .attr("y", function(d) { 1072 | return y(d.Term) + 12; 1073 | }) 1074 | .attr("id", function(d) { 1075 | return (termID + d.Term) 1076 | }) 1077 | .style("text-anchor", "end") // right align text - use 'middle' for center alignment 1078 | .text(function(d) { 1079 | return d.Term; 1080 | }); 1081 | 1082 | // Create red bars (drawn over the gray ones) to signify the frequency under the selected topic 1083 | d3.select("#bar-freqs").selectAll(".overlay") 1084 | .data(dat3) 1085 | .enter() 1086 | .append("rect") 1087 | .attr("class", "overlay") 1088 | .attr("x", 0) 1089 | .attr("y", function(d) { 1090 | return y(d.Term); 1091 | }) 1092 | .attr("height", y.rangeBand()) 1093 | .attr("width", function(d) { 1094 | return x(d.Freq); 1095 | }) 1096 | .style("fill", color2) 1097 | .attr("opacity", 0.8); 1098 | 1099 | // adapted from http://bl.ocks.org/mbostock/1166403 1100 | var xAxis = d3.svg.axis().scale(x) 1101 | .orient("top") 1102 | .tickSize(-barheight) 1103 | .tickSubdivide(true) 1104 | .ticks(6); 1105 | 1106 | // redraw x-axis 1107 | d3.selectAll(".xaxis") 1108 | //.attr("class", "xaxis") 1109 | .call(xAxis); 1110 | } 1111 | 1112 | 1113 | function topic_off(circle) { 1114 | if 
(circle == null) return circle; 1115 | // go back to original opacity/fill 1116 | circle.style.opacity = base_opacity; 1117 | circle.style.fill = color1; 1118 | 1119 | var title = d3.selectAll(".bubble-tool") 1120 | .text("Top-" + R + " Most Salient Terms"); 1121 | title.append("tspan") 1122 | .attr("baseline-shift", "super") 1123 | .attr("font-size", 12) 1124 | .text(1); 1125 | 1126 | // remove the red bars 1127 | d3.selectAll(".overlay").remove(); 1128 | 1129 | // go back to 'default' bar chart 1130 | var dat2 = lamData.filter(function(d) { 1131 | return d.Category == "Default" 1132 | }); 1133 | 1134 | var y = d3.scale.ordinal() 1135 | .domain(dat2.map(function(d) { 1136 | return d.Term; 1137 | })) 1138 | .rangeRoundBands([0, barheight], 0.15); 1139 | var x = d3.scale.linear() 1140 | .domain([1, d3.max(dat2, function(d) { 1141 | return d.Total; 1142 | })]) 1143 | .range([0, barwidth]) 1144 | .nice(); 1145 | 1146 | // Change Total Frequency bars 1147 | d3.selectAll(".bar-totals") 1148 | .data(dat2) 1149 | .attr("x", 0) 1150 | .attr("y", function(d) { 1151 | return y(d.Term); 1152 | }) 1153 | .attr("height", y.rangeBand()) 1154 | .attr("width", function(d) { 1155 | return x(d.Total); 1156 | }) 1157 | .style("fill", color1) 1158 | .attr("opacity", 0.4); 1159 | 1160 | //Change word labels 1161 | d3.selectAll(".terms") 1162 | .data(dat2) 1163 | .attr("x", -5) 1164 | .attr("y", function(d) { 1165 | return y(d.Term) + 12; 1166 | }) 1167 | .style("text-anchor", "end") // right align text - use 'middle' for center alignment 1168 | .text(function(d) { 1169 | return d.Term; 1170 | }); 1171 | 1172 | // adapted from http://bl.ocks.org/mbostock/1166403 1173 | var xAxis = d3.svg.axis().scale(x) 1174 | .orient("top") 1175 | .tickSize(-barheight) 1176 | .tickSubdivide(true) 1177 | .ticks(6); 1178 | 1179 | // redraw x-axis 1180 | d3.selectAll(".xaxis") 1181 | .attr("class", "xaxis") 1182 | .call(xAxis); 1183 | } 1184 | 1185 | // event definition for mousing over a term 1186 | 
function term_hover(term) {
    // `term` is the hovered label element. If a different term is still
    // recorded as active, switch it off before activating this one.
    var old_term = termID + vis_state.term;
    if (vis_state.term != "" && old_term != term.id) {
        term_off(document.getElementById(old_term));
    }
    // textContent (not innerHTML) yields the raw term: innerHTML would return
    // markup-escaped text (e.g. "&amp;" for "&"), which no longer matches the
    // label ids built from d.Term
    vis_state.term = term.textContent;
    term_on(term);
    state_save(true);
}
// updates vis when a term is selected via click or hover
//
// Bolds the label, resizes every topic circle according to the Freq values
// mdsData3 holds for the term, shrinks the topic-number labels of topics that
// have no entry for the term to size 0, and retitles the circle guide.
// Returns null when term is null/undefined; otherwise returns nothing.
function term_on(term) {
    if (term == null) return null;
    term.style["fontWeight"] = "bold";
    var d = term.__data__
    var Term = d.Term;
    var dat2 = mdsData3.filter(function(d2) {
        return d2.Term == Term
    });

    var k = dat2.length; // number of topics for this token with non-zero frequency

    // one radius entry per topic (Topic is 1-based), 0 where the term is absent
    var radius = [];
    for (var i = 0; i < K; ++i) {
        radius[i] = 0;
    }
    for (i = 0; i < k; i++) {
        radius[dat2[i].Topic - 1] = dat2[i].Freq;
    }

    // one label size per topic, 0 where the term is absent
    var size = [];
    for (var i = 0; i < K; ++i) {
        size[i] = 0;
    }
    for (i = 0; i < k; i++) {
        // If we want to also re-size the topic number labels, do it here
        // 11 is the default, so leaving this as 11 won't change anything.
        size[dat2[i].Topic - 1] = 11;
    }

    // Change size of bubbles according to the word's distribution over topics
    d3.selectAll(".dot")
        .data(radius)
        .transition()
        .attr("r", function(d) {
            //return (rScaleCond(d));
            return (Math.sqrt(d*mdswidth*mdsheight*word_prop/Math.PI));
        });

    // re-bind mdsData so we can handle multiple selection
    d3.selectAll(".dot")
        .data(mdsData)

    // Change sizes of topic numbers:
    d3.selectAll(".txt")
        .data(size)
        .transition()
        .style("font-size", function(d) {
            // a CSS length needs a unit: a bare non-zero number such as "11"
            // is not a valid font-size value; term_off uses "11px" as well
            return d + "px";
        });

    // Alter the guide
    d3.select(".circleGuideTitle")
        .text("Conditional topic distribution given term = '" + term.textContent + "'");
}

// undoes term_on: restores the label weight, the circle radii (computed from
// each topic's Freq in mdsData), the topic-number label size and the default
// circle guide.
// Returns null when term is null/undefined; otherwise returns nothing.
function term_off(term) {
    if (term == null) return null;
    term.style["fontWeight"] = "normal";

    d3.selectAll(".dot")
        .data(mdsData)
        .transition()
        .attr("r", function(d) {
            //return (rScaleMargin(+d.Freq));
            return (Math.sqrt((d.Freq/100)*mdswidth*mdsheight*circle_prop/Math.PI));
        });

    // Change sizes of topic numbers:
    d3.selectAll(".txt")
        .transition()
        .style("font-size", "11px");

    // Go back to the default guide
    d3.select(".circleGuideTitle")
        .text("Marginal topic distribution");
    d3.select(".circleGuideLabelLarge")
        .text(defaultLabelLarge);
    d3.select(".circleGuideLabelSmall")
        .attr("y", mdsheight + 2 * newSmall)
        .text(defaultLabelSmall);
    d3.select(".circleGuideSmall")
        .attr("r", newSmall)
        .attr("cy", mdsheight + newSmall);
    d3.select(".lineGuideSmall")
        .attr("y1", mdsheight + 2 * newSmall)
        .attr("y2", mdsheight + 2 * newSmall);
}


// 
serialize the visualization state using fragment identifiers -- http://en.wikipedia.org/wiki/Fragment_identifier 1289 | // location.hash holds the address information 1290 | 1291 | var params = location.hash.split("&"); 1292 | if (params.length > 1) { 1293 | vis_state.topic = params[0].split("=")[1]; 1294 | vis_state.lambda = params[1].split("=")[1]; 1295 | vis_state.term = params[2].split("=")[1]; 1296 | 1297 | // Idea: write a function to parse the URL string 1298 | // only accept values in [0,1] for lambda, {0, 1, ..., K} for topics (any string is OK for term) 1299 | // Allow for subsets of the three to be entered: 1300 | // (1) topic only (lambda = 1 term = "") 1301 | // (2) lambda only (topic = 0 term = "") visually the same but upon hovering a topic, the effect of lambda will be seen 1302 | // (3) term only (topic = 0 lambda = 1) only fires when the term is among the R most salient 1303 | // (4) topic + lambda (term = "") 1304 | // (5) topic + term (lambda = 1) 1305 | // (6) lambda + term (topic = 0) visually lambda doesn't make a difference unless a topic is hovered 1306 | // (7) topic + lambda + term 1307 | 1308 | // Short-term: assume format of "#topic=k&lambda=l&term=s" where k, l, and s are strings (b/c they're from a URL) 1309 | 1310 | // Force k (topic identifier) to be an integer between 0 and K: 1311 | vis_state.topic = Math.round(Math.min(K, Math.max(0, vis_state.topic))); 1312 | 1313 | // Force l (lambda identifier) to be in [0, 1]: 1314 | vis_state.lambda = Math.min(1, Math.max(0, vis_state.lambda)); 1315 | 1316 | // impose the value of lambda: 1317 | document.getElementById(lambdaID).value = vis_state.lambda; 1318 | document.getElementById(lambdaID + "-value").innerHTML = vis_state.lambda; 1319 | 1320 | // select the topic and transition the order of the bars (if approporiate) 1321 | if (!isNaN(vis_state.topic)) { 1322 | document.getElementById(topicID).value = vis_state.topic; 1323 | if (vis_state.topic > 0) { 1324 | 
topic_on(document.getElementById(topicID + vis_state.topic)); 1325 | } 1326 | if (vis_state.lambda < 1 && vis_state.topic > 0) { 1327 | reorder_bars(false); 1328 | } 1329 | } 1330 | lambda.current = vis_state.lambda; 1331 | var termElem = document.getElementById(termID + vis_state.term); 1332 | if (termElem !== undefined) term_on(termElem); 1333 | } 1334 | 1335 | function state_url() { 1336 | return location.origin + location.pathname + "#topic=" + vis_state.topic + 1337 | "&lambda=" + vis_state.lambda + "&term=" + vis_state.term; 1338 | } 1339 | 1340 | function state_save(replace) { 1341 | if (replace) 1342 | history.replaceState(vis_state, "Query", state_url()); 1343 | else 1344 | history.pushState(vis_state, "Query", state_url()); 1345 | } 1346 | 1347 | function state_reset() { 1348 | if (vis_state.topic > 0) { 1349 | topic_off(document.getElementById(topicID + vis_state.topic)); 1350 | } 1351 | if (vis_state.term != "") { 1352 | term_off(document.getElementById(termID + vis_state.term)); 1353 | } 1354 | vis_state.term = ""; 1355 | document.getElementById(topicID).value = vis_state.topic = 0; 1356 | state_save(true); 1357 | } 1358 | 1359 | }); 1360 | // var current_clicked = { 1361 | // what: "nothing", 1362 | // element: undefined 1363 | // }, 1364 | 1365 | //debugger; 1366 | 1367 | } 1368 | 1369 | -------------------------------------------------------------------------------- /topic_space/app/static/termite/bootstrap.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap v3.3.5 (http://getbootstrap.com) 3 | * Copyright 2011-2015 Twitter, Inc. 
4 | * Licensed under the MIT license 5 | */ 6 | if("undefined"==typeof jQuery)throw new Error("Bootstrap's JavaScript requires jQuery");+function(a){"use strict";var b=a.fn.jquery.split(" ")[0].split(".");if(b[0]<2&&b[1]<9||1==b[0]&&9==b[1]&&b[2]<1)throw new Error("Bootstrap's JavaScript requires jQuery version 1.9.1 or higher")}(jQuery),+function(a){"use strict";function b(){var a=document.createElement("bootstrap"),b={WebkitTransition:"webkitTransitionEnd",MozTransition:"transitionend",OTransition:"oTransitionEnd otransitionend",transition:"transitionend"};for(var c in b)if(void 0!==a.style[c])return{end:b[c]};return!1}a.fn.emulateTransitionEnd=function(b){var c=!1,d=this;a(this).one("bsTransitionEnd",function(){c=!0});var e=function(){c||a(d).trigger(a.support.transition.end)};return setTimeout(e,b),this},a(function(){a.support.transition=b(),a.support.transition&&(a.event.special.bsTransitionEnd={bindType:a.support.transition.end,delegateType:a.support.transition.end,handle:function(b){return a(b.target).is(this)?b.handleObj.handler.apply(this,arguments):void 0}})})}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var c=a(this),e=c.data("bs.alert");e||c.data("bs.alert",e=new d(this)),"string"==typeof b&&e[b].call(c)})}var c='[data-dismiss="alert"]',d=function(b){a(b).on("click",c,this.close)};d.VERSION="3.3.5",d.TRANSITION_DURATION=150,d.prototype.close=function(b){function c(){g.detach().trigger("closed.bs.alert").remove()}var e=a(this),f=e.attr("data-target");f||(f=e.attr("href"),f=f&&f.replace(/.*(?=#[^\s]*$)/,""));var g=a(f);b&&b.preventDefault(),g.length||(g=e.closest(".alert")),g.trigger(b=a.Event("close.bs.alert")),b.isDefaultPrevented()||(g.removeClass("in"),a.support.transition&&g.hasClass("fade")?g.one("bsTransitionEnd",c).emulateTransitionEnd(d.TRANSITION_DURATION):c())};var e=a.fn.alert;a.fn.alert=b,a.fn.alert.Constructor=d,a.fn.alert.noConflict=function(){return 
a.fn.alert=e,this},a(document).on("click.bs.alert.data-api",c,d.prototype.close)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.button"),f="object"==typeof b&&b;e||d.data("bs.button",e=new c(this,f)),"toggle"==b?e.toggle():b&&e.setState(b)})}var c=function(b,d){this.$element=a(b),this.options=a.extend({},c.DEFAULTS,d),this.isLoading=!1};c.VERSION="3.3.5",c.DEFAULTS={loadingText:"loading..."},c.prototype.setState=function(b){var c="disabled",d=this.$element,e=d.is("input")?"val":"html",f=d.data();b+="Text",null==f.resetText&&d.data("resetText",d[e]()),setTimeout(a.proxy(function(){d[e](null==f[b]?this.options[b]:f[b]),"loadingText"==b?(this.isLoading=!0,d.addClass(c).attr(c,c)):this.isLoading&&(this.isLoading=!1,d.removeClass(c).removeAttr(c))},this),0)},c.prototype.toggle=function(){var a=!0,b=this.$element.closest('[data-toggle="buttons"]');if(b.length){var c=this.$element.find("input");"radio"==c.prop("type")?(c.prop("checked")&&(a=!1),b.find(".active").removeClass("active"),this.$element.addClass("active")):"checkbox"==c.prop("type")&&(c.prop("checked")!==this.$element.hasClass("active")&&(a=!1),this.$element.toggleClass("active")),c.prop("checked",this.$element.hasClass("active")),a&&c.trigger("change")}else this.$element.attr("aria-pressed",!this.$element.hasClass("active")),this.$element.toggleClass("active")};var d=a.fn.button;a.fn.button=b,a.fn.button.Constructor=c,a.fn.button.noConflict=function(){return a.fn.button=d,this},a(document).on("click.bs.button.data-api",'[data-toggle^="button"]',function(c){var d=a(c.target);d.hasClass("btn")||(d=d.closest(".btn")),b.call(d,"toggle"),a(c.target).is('input[type="radio"]')||a(c.target).is('input[type="checkbox"]')||c.preventDefault()}).on("focus.bs.button.data-api blur.bs.button.data-api",'[data-toggle^="button"]',function(b){a(b.target).closest(".btn").toggleClass("focus",/^focus(in)?$/.test(b.type))})}(jQuery),+function(a){"use strict";function 
b(b){return this.each(function(){var d=a(this),e=d.data("bs.carousel"),f=a.extend({},c.DEFAULTS,d.data(),"object"==typeof b&&b),g="string"==typeof b?b:f.slide;e||d.data("bs.carousel",e=new c(this,f)),"number"==typeof b?e.to(b):g?e[g]():f.interval&&e.pause().cycle()})}var c=function(b,c){this.$element=a(b),this.$indicators=this.$element.find(".carousel-indicators"),this.options=c,this.paused=null,this.sliding=null,this.interval=null,this.$active=null,this.$items=null,this.options.keyboard&&this.$element.on("keydown.bs.carousel",a.proxy(this.keydown,this)),"hover"==this.options.pause&&!("ontouchstart"in document.documentElement)&&this.$element.on("mouseenter.bs.carousel",a.proxy(this.pause,this)).on("mouseleave.bs.carousel",a.proxy(this.cycle,this))};c.VERSION="3.3.5",c.TRANSITION_DURATION=600,c.DEFAULTS={interval:5e3,pause:"hover",wrap:!0,keyboard:!0},c.prototype.keydown=function(a){if(!/input|textarea/i.test(a.target.tagName)){switch(a.which){case 37:this.prev();break;case 39:this.next();break;default:return}a.preventDefault()}},c.prototype.cycle=function(b){return b||(this.paused=!1),this.interval&&clearInterval(this.interval),this.options.interval&&!this.paused&&(this.interval=setInterval(a.proxy(this.next,this),this.options.interval)),this},c.prototype.getItemIndex=function(a){return this.$items=a.parent().children(".item"),this.$items.index(a||this.$active)},c.prototype.getItemForDirection=function(a,b){var c=this.getItemIndex(b),d="prev"==a&&0===c||"next"==a&&c==this.$items.length-1;if(d&&!this.options.wrap)return b;var e="prev"==a?-1:1,f=(c+e)%this.$items.length;return this.$items.eq(f)},c.prototype.to=function(a){var b=this,c=this.getItemIndex(this.$active=this.$element.find(".item.active"));return a>this.$items.length-1||0>a?void 0:this.sliding?this.$element.one("slid.bs.carousel",function(){b.to(a)}):c==a?this.pause().cycle():this.slide(a>c?"next":"prev",this.$items.eq(a))},c.prototype.pause=function(b){return b||(this.paused=!0),this.$element.find(".next, 
.prev").length&&a.support.transition&&(this.$element.trigger(a.support.transition.end),this.cycle(!0)),this.interval=clearInterval(this.interval),this},c.prototype.next=function(){return this.sliding?void 0:this.slide("next")},c.prototype.prev=function(){return this.sliding?void 0:this.slide("prev")},c.prototype.slide=function(b,d){var e=this.$element.find(".item.active"),f=d||this.getItemForDirection(b,e),g=this.interval,h="next"==b?"left":"right",i=this;if(f.hasClass("active"))return this.sliding=!1;var j=f[0],k=a.Event("slide.bs.carousel",{relatedTarget:j,direction:h});if(this.$element.trigger(k),!k.isDefaultPrevented()){if(this.sliding=!0,g&&this.pause(),this.$indicators.length){this.$indicators.find(".active").removeClass("active");var l=a(this.$indicators.children()[this.getItemIndex(f)]);l&&l.addClass("active")}var m=a.Event("slid.bs.carousel",{relatedTarget:j,direction:h});return a.support.transition&&this.$element.hasClass("slide")?(f.addClass(b),f[0].offsetWidth,e.addClass(h),f.addClass(h),e.one("bsTransitionEnd",function(){f.removeClass([b,h].join(" ")).addClass("active"),e.removeClass(["active",h].join(" ")),i.sliding=!1,setTimeout(function(){i.$element.trigger(m)},0)}).emulateTransitionEnd(c.TRANSITION_DURATION)):(e.removeClass("active"),f.addClass("active"),this.sliding=!1,this.$element.trigger(m)),g&&this.cycle(),this}};var d=a.fn.carousel;a.fn.carousel=b,a.fn.carousel.Constructor=c,a.fn.carousel.noConflict=function(){return a.fn.carousel=d,this};var e=function(c){var d,e=a(this),f=a(e.attr("data-target")||(d=e.attr("href"))&&d.replace(/.*(?=#[^\s]+$)/,""));if(f.hasClass("carousel")){var g=a.extend({},f.data(),e.data()),h=e.attr("data-slide-to");h&&(g.interval=!1),b.call(f,g),h&&f.data("bs.carousel").to(h),c.preventDefault()}};a(document).on("click.bs.carousel.data-api","[data-slide]",e).on("click.bs.carousel.data-api","[data-slide-to]",e),a(window).on("load",function(){a('[data-ride="carousel"]').each(function(){var 
c=a(this);b.call(c,c.data())})})}(jQuery),+function(a){"use strict";function b(b){var c,d=b.attr("data-target")||(c=b.attr("href"))&&c.replace(/.*(?=#[^\s]+$)/,"");return a(d)}function c(b){return this.each(function(){var c=a(this),e=c.data("bs.collapse"),f=a.extend({},d.DEFAULTS,c.data(),"object"==typeof b&&b);!e&&f.toggle&&/show|hide/.test(b)&&(f.toggle=!1),e||c.data("bs.collapse",e=new d(this,f)),"string"==typeof b&&e[b]()})}var d=function(b,c){this.$element=a(b),this.options=a.extend({},d.DEFAULTS,c),this.$trigger=a('[data-toggle="collapse"][href="#'+b.id+'"],[data-toggle="collapse"][data-target="#'+b.id+'"]'),this.transitioning=null,this.options.parent?this.$parent=this.getParent():this.addAriaAndCollapsedClass(this.$element,this.$trigger),this.options.toggle&&this.toggle()};d.VERSION="3.3.5",d.TRANSITION_DURATION=350,d.DEFAULTS={toggle:!0},d.prototype.dimension=function(){var a=this.$element.hasClass("width");return a?"width":"height"},d.prototype.show=function(){if(!this.transitioning&&!this.$element.hasClass("in")){var b,e=this.$parent&&this.$parent.children(".panel").children(".in, .collapsing");if(!(e&&e.length&&(b=e.data("bs.collapse"),b&&b.transitioning))){var f=a.Event("show.bs.collapse");if(this.$element.trigger(f),!f.isDefaultPrevented()){e&&e.length&&(c.call(e,"hide"),b||e.data("bs.collapse",null));var g=this.dimension();this.$element.removeClass("collapse").addClass("collapsing")[g](0).attr("aria-expanded",!0),this.$trigger.removeClass("collapsed").attr("aria-expanded",!0),this.transitioning=1;var h=function(){this.$element.removeClass("collapsing").addClass("collapse in")[g](""),this.transitioning=0,this.$element.trigger("shown.bs.collapse")};if(!a.support.transition)return h.call(this);var i=a.camelCase(["scroll",g].join("-"));this.$element.one("bsTransitionEnd",a.proxy(h,this)).emulateTransitionEnd(d.TRANSITION_DURATION)[g](this.$element[0][i])}}}},d.prototype.hide=function(){if(!this.transitioning&&this.$element.hasClass("in")){var 
b=a.Event("hide.bs.collapse");if(this.$element.trigger(b),!b.isDefaultPrevented()){var c=this.dimension();this.$element[c](this.$element[c]())[0].offsetHeight,this.$element.addClass("collapsing").removeClass("collapse in").attr("aria-expanded",!1),this.$trigger.addClass("collapsed").attr("aria-expanded",!1),this.transitioning=1;var e=function(){this.transitioning=0,this.$element.removeClass("collapsing").addClass("collapse").trigger("hidden.bs.collapse")};return a.support.transition?void this.$element[c](0).one("bsTransitionEnd",a.proxy(e,this)).emulateTransitionEnd(d.TRANSITION_DURATION):e.call(this)}}},d.prototype.toggle=function(){this[this.$element.hasClass("in")?"hide":"show"]()},d.prototype.getParent=function(){return a(this.options.parent).find('[data-toggle="collapse"][data-parent="'+this.options.parent+'"]').each(a.proxy(function(c,d){var e=a(d);this.addAriaAndCollapsedClass(b(e),e)},this)).end()},d.prototype.addAriaAndCollapsedClass=function(a,b){var c=a.hasClass("in");a.attr("aria-expanded",c),b.toggleClass("collapsed",!c).attr("aria-expanded",c)};var e=a.fn.collapse;a.fn.collapse=c,a.fn.collapse.Constructor=d,a.fn.collapse.noConflict=function(){return a.fn.collapse=e,this},a(document).on("click.bs.collapse.data-api",'[data-toggle="collapse"]',function(d){var e=a(this);e.attr("data-target")||d.preventDefault();var f=b(e),g=f.data("bs.collapse"),h=g?"toggle":e.data();c.call(f,h)})}(jQuery),+function(a){"use strict";function b(b){var c=b.attr("data-target");c||(c=b.attr("href"),c=c&&/#[A-Za-z]/.test(c)&&c.replace(/.*(?=#[^\s]*$)/,""));var d=c&&a(c);return d&&d.length?d:b.parent()}function c(c){c&&3===c.which||(a(e).remove(),a(f).each(function(){var 
d=a(this),e=b(d),f={relatedTarget:this};e.hasClass("open")&&(c&&"click"==c.type&&/input|textarea/i.test(c.target.tagName)&&a.contains(e[0],c.target)||(e.trigger(c=a.Event("hide.bs.dropdown",f)),c.isDefaultPrevented()||(d.attr("aria-expanded","false"),e.removeClass("open").trigger("hidden.bs.dropdown",f))))}))}function d(b){return this.each(function(){var c=a(this),d=c.data("bs.dropdown");d||c.data("bs.dropdown",d=new g(this)),"string"==typeof b&&d[b].call(c)})}var e=".dropdown-backdrop",f='[data-toggle="dropdown"]',g=function(b){a(b).on("click.bs.dropdown",this.toggle)};g.VERSION="3.3.5",g.prototype.toggle=function(d){var e=a(this);if(!e.is(".disabled, :disabled")){var f=b(e),g=f.hasClass("open");if(c(),!g){"ontouchstart"in document.documentElement&&!f.closest(".navbar-nav").length&&a(document.createElement("div")).addClass("dropdown-backdrop").insertAfter(a(this)).on("click",c);var h={relatedTarget:this};if(f.trigger(d=a.Event("show.bs.dropdown",h)),d.isDefaultPrevented())return;e.trigger("focus").attr("aria-expanded","true"),f.toggleClass("open").trigger("shown.bs.dropdown",h)}return!1}},g.prototype.keydown=function(c){if(/(38|40|27|32)/.test(c.which)&&!/input|textarea/i.test(c.target.tagName)){var d=a(this);if(c.preventDefault(),c.stopPropagation(),!d.is(".disabled, :disabled")){var e=b(d),g=e.hasClass("open");if(!g&&27!=c.which||g&&27==c.which)return 27==c.which&&e.find(f).trigger("focus"),d.trigger("click");var h=" li:not(.disabled):visible a",i=e.find(".dropdown-menu"+h);if(i.length){var j=i.index(c.target);38==c.which&&j>0&&j--,40==c.which&&jdocument.documentElement.clientHeight;this.$element.css({paddingLeft:!this.bodyIsOverflowing&&a?this.scrollbarWidth:"",paddingRight:this.bodyIsOverflowing&&!a?this.scrollbarWidth:""})},c.prototype.resetAdjustments=function(){this.$element.css({paddingLeft:"",paddingRight:""})},c.prototype.checkScrollbar=function(){var a=window.innerWidth;if(!a){var 
b=document.documentElement.getBoundingClientRect();a=b.right-Math.abs(b.left)}this.bodyIsOverflowing=document.body.clientWidth
',trigger:"hover focus",title:"",delay:0,html:!1,container:!1,viewport:{selector:"body",padding:0}},c.prototype.init=function(b,c,d){if(this.enabled=!0,this.type=b,this.$element=a(c),this.options=this.getOptions(d),this.$viewport=this.options.viewport&&a(a.isFunction(this.options.viewport)?this.options.viewport.call(this,this.$element):this.options.viewport.selector||this.options.viewport),this.inState={click:!1,hover:!1,focus:!1},this.$element[0]instanceof document.constructor&&!this.options.selector)throw new Error("`selector` option must be specified when initializing "+this.type+" on the window.document object!");for(var e=this.options.trigger.split(" "),f=e.length;f--;){var g=e[f];if("click"==g)this.$element.on("click."+this.type,this.options.selector,a.proxy(this.toggle,this));else if("manual"!=g){var h="hover"==g?"mouseenter":"focusin",i="hover"==g?"mouseleave":"focusout";this.$element.on(h+"."+this.type,this.options.selector,a.proxy(this.enter,this)),this.$element.on(i+"."+this.type,this.options.selector,a.proxy(this.leave,this))}}this.options.selector?this._options=a.extend({},this.options,{trigger:"manual",selector:""}):this.fixTitle()},c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.getOptions=function(b){return b=a.extend({},this.getDefaults(),this.$element.data(),b),b.delay&&"number"==typeof b.delay&&(b.delay={show:b.delay,hide:b.delay}),b},c.prototype.getDelegateOptions=function(){var b={},c=this.getDefaults();return this._options&&a.each(this._options,function(a,d){c[a]!=d&&(b[a]=d)}),b},c.prototype.enter=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),b instanceof 
a.Event&&(c.inState["focusin"==b.type?"focus":"hover"]=!0),c.tip().hasClass("in")||"in"==c.hoverState?void(c.hoverState="in"):(clearTimeout(c.timeout),c.hoverState="in",c.options.delay&&c.options.delay.show?void(c.timeout=setTimeout(function(){"in"==c.hoverState&&c.show()},c.options.delay.show)):c.show())},c.prototype.isInStateTrue=function(){for(var a in this.inState)if(this.inState[a])return!0;return!1},c.prototype.leave=function(b){var c=b instanceof this.constructor?b:a(b.currentTarget).data("bs."+this.type);return c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c)),b instanceof a.Event&&(c.inState["focusout"==b.type?"focus":"hover"]=!1),c.isInStateTrue()?void 0:(clearTimeout(c.timeout),c.hoverState="out",c.options.delay&&c.options.delay.hide?void(c.timeout=setTimeout(function(){"out"==c.hoverState&&c.hide()},c.options.delay.hide)):c.hide())},c.prototype.show=function(){var b=a.Event("show.bs."+this.type);if(this.hasContent()&&this.enabled){this.$element.trigger(b);var d=a.contains(this.$element[0].ownerDocument.documentElement,this.$element[0]);if(b.isDefaultPrevented()||!d)return;var e=this,f=this.tip(),g=this.getUID(this.type);this.setContent(),f.attr("id",g),this.$element.attr("aria-describedby",g),this.options.animation&&f.addClass("fade");var h="function"==typeof this.options.placement?this.options.placement.call(this,f[0],this.$element[0]):this.options.placement,i=/\s?auto?\s?/i,j=i.test(h);j&&(h=h.replace(i,"")||"top"),f.detach().css({top:0,left:0,display:"block"}).addClass(h).data("bs."+this.type,this),this.options.container?f.appendTo(this.options.container):f.insertAfter(this.$element),this.$element.trigger("inserted.bs."+this.type);var k=this.getPosition(),l=f[0].offsetWidth,m=f[0].offsetHeight;if(j){var n=h,o=this.getPosition(this.$viewport);h="bottom"==h&&k.bottom+m>o.bottom?"top":"top"==h&&k.top-mo.width?"left":"left"==h&&k.left-lg.top+g.height&&(e.top=g.top+g.height-i)}else{var 
j=b.left-f,k=b.left+f+c;jg.right&&(e.left=g.left+g.width-k)}return e},c.prototype.getTitle=function(){var a,b=this.$element,c=this.options;return a=b.attr("data-original-title")||("function"==typeof c.title?c.title.call(b[0]):c.title)},c.prototype.getUID=function(a){do a+=~~(1e6*Math.random());while(document.getElementById(a));return a},c.prototype.tip=function(){if(!this.$tip&&(this.$tip=a(this.options.template),1!=this.$tip.length))throw new Error(this.type+" `template` option must consist of exactly 1 top-level element!");return this.$tip},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".tooltip-arrow")},c.prototype.enable=function(){this.enabled=!0},c.prototype.disable=function(){this.enabled=!1},c.prototype.toggleEnabled=function(){this.enabled=!this.enabled},c.prototype.toggle=function(b){var c=this;b&&(c=a(b.currentTarget).data("bs."+this.type),c||(c=new this.constructor(b.currentTarget,this.getDelegateOptions()),a(b.currentTarget).data("bs."+this.type,c))),b?(c.inState.click=!c.inState.click,c.isInStateTrue()?c.enter(c):c.leave(c)):c.tip().hasClass("in")?c.leave(c):c.enter(c)},c.prototype.destroy=function(){var a=this;clearTimeout(this.timeout),this.hide(function(){a.$element.off("."+a.type).removeData("bs."+a.type),a.$tip&&a.$tip.detach(),a.$tip=null,a.$arrow=null,a.$viewport=null})};var d=a.fn.tooltip;a.fn.tooltip=b,a.fn.tooltip.Constructor=c,a.fn.tooltip.noConflict=function(){return a.fn.tooltip=d,this}}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.popover"),f="object"==typeof b&&b;(e||!/destroy|hide/.test(b))&&(e||d.data("bs.popover",e=new c(this,f)),"string"==typeof b&&e[b]())})}var c=function(a,b){this.init("popover",a,b)};if(!a.fn.tooltip)throw new Error("Popover requires 
tooltip.js");c.VERSION="3.3.5",c.DEFAULTS=a.extend({},a.fn.tooltip.Constructor.DEFAULTS,{placement:"right",trigger:"click",content:"",template:''}),c.prototype=a.extend({},a.fn.tooltip.Constructor.prototype),c.prototype.constructor=c,c.prototype.getDefaults=function(){return c.DEFAULTS},c.prototype.setContent=function(){var a=this.tip(),b=this.getTitle(),c=this.getContent();a.find(".popover-title")[this.options.html?"html":"text"](b),a.find(".popover-content").children().detach().end()[this.options.html?"string"==typeof c?"html":"append":"text"](c),a.removeClass("fade top bottom left right in"),a.find(".popover-title").html()||a.find(".popover-title").hide()},c.prototype.hasContent=function(){return this.getTitle()||this.getContent()},c.prototype.getContent=function(){var a=this.$element,b=this.options;return a.attr("data-content")||("function"==typeof b.content?b.content.call(a[0]):b.content)},c.prototype.arrow=function(){return this.$arrow=this.$arrow||this.tip().find(".arrow")};var d=a.fn.popover;a.fn.popover=b,a.fn.popover.Constructor=c,a.fn.popover.noConflict=function(){return a.fn.popover=d,this}}(jQuery),+function(a){"use strict";function b(c,d){this.$body=a(document.body),this.$scrollElement=a(a(c).is(document.body)?window:c),this.options=a.extend({},b.DEFAULTS,d),this.selector=(this.options.target||"")+" .nav li > a",this.offsets=[],this.targets=[],this.activeTarget=null,this.scrollHeight=0,this.$scrollElement.on("scroll.bs.scrollspy",a.proxy(this.process,this)),this.refresh(),this.process()}function c(c){return this.each(function(){var d=a(this),e=d.data("bs.scrollspy"),f="object"==typeof c&&c;e||d.data("bs.scrollspy",e=new b(this,f)),"string"==typeof c&&e[c]()})}b.VERSION="3.3.5",b.DEFAULTS={offset:10},b.prototype.getScrollHeight=function(){return this.$scrollElement[0].scrollHeight||Math.max(this.$body[0].scrollHeight,document.documentElement.scrollHeight)},b.prototype.refresh=function(){var 
b=this,c="offset",d=0;this.offsets=[],this.targets=[],this.scrollHeight=this.getScrollHeight(),a.isWindow(this.$scrollElement[0])||(c="position",d=this.$scrollElement.scrollTop()),this.$body.find(this.selector).map(function(){var b=a(this),e=b.data("target")||b.attr("href"),f=/^#./.test(e)&&a(e);return f&&f.length&&f.is(":visible")&&[[f[c]().top+d,e]]||null}).sort(function(a,b){return a[0]-b[0]}).each(function(){b.offsets.push(this[0]),b.targets.push(this[1])})},b.prototype.process=function(){var a,b=this.$scrollElement.scrollTop()+this.options.offset,c=this.getScrollHeight(),d=this.options.offset+c-this.$scrollElement.height(),e=this.offsets,f=this.targets,g=this.activeTarget;if(this.scrollHeight!=c&&this.refresh(),b>=d)return g!=(a=f[f.length-1])&&this.activate(a);if(g&&b=e[a]&&(void 0===e[a+1]||b .dropdown-menu > .active").removeClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!1),b.addClass("active").find('[data-toggle="tab"]').attr("aria-expanded",!0),h?(b[0].offsetWidth,b.addClass("in")):b.removeClass("fade"),b.parent(".dropdown-menu").length&&b.closest("li.dropdown").addClass("active").end().find('[data-toggle="tab"]').attr("aria-expanded",!0),e&&e()}var g=d.find("> .active"),h=e&&a.support.transition&&(g.length&&g.hasClass("fade")||!!d.find("> .fade").length);g.length&&h?g.one("bsTransitionEnd",f).emulateTransitionEnd(c.TRANSITION_DURATION):f(),g.removeClass("in")};var d=a.fn.tab;a.fn.tab=b,a.fn.tab.Constructor=c,a.fn.tab.noConflict=function(){return a.fn.tab=d,this};var e=function(c){c.preventDefault(),b.call(a(this),"show")};a(document).on("click.bs.tab.data-api",'[data-toggle="tab"]',e).on("click.bs.tab.data-api",'[data-toggle="pill"]',e)}(jQuery),+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.affix"),f="object"==typeof b&&b;e||d.data("bs.affix",e=new c(this,f)),"string"==typeof b&&e[b]()})}var 
c=function(b,d){this.options=a.extend({},c.DEFAULTS,d),this.$target=a(this.options.target).on("scroll.bs.affix.data-api",a.proxy(this.checkPosition,this)).on("click.bs.affix.data-api",a.proxy(this.checkPositionWithEventLoop,this)),this.$element=a(b),this.affixed=null,this.unpin=null,this.pinnedOffset=null,this.checkPosition()};c.VERSION="3.3.5",c.RESET="affix affix-top affix-bottom",c.DEFAULTS={offset:0,target:window},c.prototype.getState=function(a,b,c,d){var e=this.$target.scrollTop(),f=this.$element.offset(),g=this.$target.height();if(null!=c&&"top"==this.affixed)return c>e?"top":!1;if("bottom"==this.affixed)return null!=c?e+this.unpin<=f.top?!1:"bottom":a-d>=e+g?!1:"bottom";var h=null==this.affixed,i=h?e:f.top,j=h?g:b;return null!=c&&c>=e?"top":null!=d&&i+j>=a-d?"bottom":!1},c.prototype.getPinnedOffset=function(){if(this.pinnedOffset)return this.pinnedOffset;this.$element.removeClass(c.RESET).addClass("affix");var a=this.$target.scrollTop(),b=this.$element.offset();return this.pinnedOffset=b.top-a},c.prototype.checkPositionWithEventLoop=function(){setTimeout(a.proxy(this.checkPosition,this),1)},c.prototype.checkPosition=function(){if(this.$element.is(":visible")){var b=this.$element.height(),d=this.options.offset,e=d.top,f=d.bottom,g=Math.max(a(document).height(),a(document.body).height());"object"!=typeof d&&(f=e=d),"function"==typeof e&&(e=d.top(this.$element)),"function"==typeof f&&(f=d.bottom(this.$element));var h=this.getState(g,b,e,f);if(this.affixed!=h){null!=this.unpin&&this.$element.css("top","");var i="affix"+(h?"-"+h:""),j=a.Event(i+".bs.affix");if(this.$element.trigger(j),j.isDefaultPrevented())return;this.affixed=h,this.unpin="bottom"==h?this.getPinnedOffset():null,this.$element.removeClass(c.RESET).addClass(i).trigger(i.replace("affix","affixed")+".bs.affix")}"bottom"==h&&this.$element.offset({top:g-b-f})}};var d=a.fn.affix;a.fn.affix=b,a.fn.affix.Constructor=c,a.fn.affix.noConflict=function(){return 
a.fn.affix=d,this},a(window).on("load",function(){a('[data-spy="affix"]').each(function(){var c=a(this),d=c.data();d.offset=d.offset||{},null!=d.offsetBottom&&(d.offset.bottom=d.offsetBottom),null!=d.offsetTop&&(d.offset.top=d.offsetTop),b.call(c,d)})})}(jQuery); -------------------------------------------------------------------------------- /topic_space/app/templates/histogram.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Topic Space WordCloud Generator 6 | 7 | 8 | 9 | 10 | 11 | 31 | 32 | 33 | 37 | 38 | {{ plot_resources | indent(4) | safe }} 39 | 40 | {{ plot_script | indent(4) | safe }} 41 | 42 | 43 | 44 | 45 | 46 | 47 | 54 | 55 |
56 | 57 | {{ plot_div | indent(4) | safe }} 58 | 59 | 60 | 61 |
62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /topic_space/app/templates/ldavis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Topic Space LDAvis 6 | 7 | 8 | 9 | 10 | 11 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 54 | 55 |
56 | 57 |
58 | 59 | 60 | 61 |

LDAVis

LDAVis allows you to explore the different topics in Material Science.

These are the results of running LDA with n=10 topics on the ~74,000 documents, tokenized with the EntitiesTokenizer.

62 |
63 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /topic_space/app/templates/termite.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Topic Space Termite Plot 5 | 6 | 7 | 8 | 9 | 10 | 11 | {{ script | safe }} 12 | 13 | 14 | 15 | 16 | 30 | 31 |
32 |

Termite Plot

A Termite plot allows you to explore the different topics in Autonomy.

These are the results of running LDA with n=10 topics on the ~115,000 documents, tokenized with the EntitiesTokenizer.

33 | 34 | {{ div | safe }} 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /topic_space/app/templates/wordcloud.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Topic Space WordCloud Generator 5 | 6 | 7 | 8 | 9 | 10 | 11 | 31 | 32 | 33 | 37 | 38 | 39 | 40 | 41 | 42 | 56 | 57 | 58 | {{ plot_resources | indent(4) | safe }} 59 | 60 | {% for i in range(num_intervals) %} 61 | {{ plot_scripts[i] | indent(4) | safe }} 62 | {% endfor %} 63 | 64 | 65 | 66 | 67 | 68 | 82 | 83 |
84 | 85 | 140 | 141 |
142 |

Displaying from {{year1}} to {{year2}} in {{num_intervals}} intervals

143 |

Without: {{ words | join(",") }}

144 |

Percent filter: {{percent1}}% to {{percent2}}%

145 | {% if side_by_side %} 146 |
147 | {% for i in range(num_intervals) %} 148 |
149 |

{{start_years[i]}} to {{end_years[i]}} from {{num_docs[i]}} docs

150 | {{ plot_divs[i] | indent(4) | safe }} 151 |
152 |
153 | {% endfor %} 154 |
155 | 156 | {% else %} 157 | {% for i in range(num_intervals) %} 158 |
159 |

Top words in years {{start_years[i]}} to {{end_years[i]}} from {{num_docs[i]}} documents

160 | {{ plot_divs[i] | indent(4) | safe }} 161 |
162 | {% endfor %} 163 | {% endif %} 164 |
165 | 166 | 167 |
168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /topic_space/app/viz.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | """ 3 | Draw a termite plot to visualize topics and words from an LDA. 4 | """ 5 | 6 | import logging 7 | 8 | import blaze as blz 9 | from odo import into 10 | import pandas as pd 11 | import bokeh.plotting as plt 12 | from bokeh.models.sources import ColumnDataSource 13 | from bokeh.embed import components 14 | from bokeh.resources import CDN 15 | 16 | class Termite(object): 17 | 18 | def __init__(self, input_file): 19 | self.input_file = input_file 20 | 21 | def plot(self): 22 | t = blz.Data(self.input_file) 23 | df = pd.read_csv(self.input_file) 24 | 25 | MAX = blz.compute(t.weight.max()) 26 | MIN = blz.compute(t.weight.min()) 27 | 28 | # Create a size variable to define the size of the the circle for the plot. 
29 | t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50) 30 | 31 | WORDS = t['word'].distinct() 32 | WORDS = into(list, WORDS) 33 | topics = t['topic'].distinct() 34 | topics = into(list, topics) 35 | # Convert topics to strings 36 | TOPICS = [str(i) for i in topics] 37 | 38 | source = into(pd.DataFrame, t) 39 | 40 | data_source = ColumnDataSource(source) 41 | 42 | p = plt.figure(x_range=TOPICS, y_range=WORDS, 43 | plot_width=1000, plot_height=1700, title=None) 44 | 45 | p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source) 46 | #p.xaxis().major_label_orientation = np.pi/3 47 | logging.info("generating termite plot for file %s" % self.input_file) 48 | 49 | script, div = components(p, CDN) 50 | 51 | return script, div 52 | -------------------------------------------------------------------------------- /topic_space/app/wordcloud_generator.py: -------------------------------------------------------------------------------- 1 | """Visualizations of the material science research files""" 2 | 3 | import cPickle 4 | import os 5 | import os.path 6 | 7 | import pandas as pd 8 | import pattern.vector as pv 9 | from wordcloud import WordCloud 10 | 11 | from elasticsearch import Elasticsearch 12 | from elasticsearch.helpers import scan 13 | 14 | from config import ELASTICSEARCH_HOST, ELASTICSEARCH_INDEX 15 | 16 | FONT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "DejaVuSans.ttf") 17 | PKL_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "docs_by_year.pkl") 18 | 19 | 20 | 21 | def make_output_dir(dir_name="output"): 22 | try: 23 | os.mkdir(dir_name) 24 | except OSError: 25 | pass 26 | 27 | 28 | #USED 29 | def read_elasticsearch(): 30 | es = Elasticsearch([ELASTICSEARCH_HOST]) 31 | years = [] 32 | abstracts = [] 33 | if es.indices.exists(ELASTICSEARCH_INDEX): 34 | query = {"query": {"match_all": {}}} 35 | scanner = scan(es, index=ELASTICSEARCH_INDEX, query=query, size=1000) 36 | for res in scanner: 37 | 
try: 38 | res = res["_source"] 39 | year = res["dateCreated"].split('-')[0] 40 | abstract = res["hasAbstractPart"]["text"] 41 | years.append(year) 42 | abstracts.append(abstract) 43 | except KeyError: 44 | pass 45 | else: 46 | raise RuntimeError("Could not connect to ELASTICSEARCH_INDEX: %s" % ELASTICSEARCH_INDEX) 47 | return pd.DataFrame({"year": years, "abstract": abstracts}) 48 | 49 | 50 | def lsa_apply(df): 51 | print("Building model") 52 | m = pv.Model([pv.Document(a) for a in df['abstract']], weight=pv.TFIDF) 53 | print("Returning reduction") 54 | return m.reduce(2) 55 | 56 | 57 | def get_lsa_by_year(df): 58 | return df[['year', 'abstract']].groupby('year').apply(lsa_apply) 59 | 60 | 61 | def get_word_cloud_image(text): 62 | wordcloud = WordCloud(font_path=FONT_PATH).generate(text) 63 | return wordcloud.to_image() 64 | 65 | 66 | def generate_word_cloud_image(text, filename="output/wordcloud.jpg"): 67 | get_word_cloud_image().save(filename, "JPEG") 68 | 69 | 70 | def interesting_words_1(lsa, n=3): 71 | lsa_df = pd.DataFrame.from_dict(lsa.concepts) 72 | res = [] 73 | for row, series in lsa_df.iterrows(): 74 | s = sorted([(abs(y), x) for x, y in series.iterkv()], reverse=True) 75 | res.extend([x for y,x in s[:n]]) 76 | return set(res) 77 | 78 | 79 | def get_docs_by_year(): 80 | print("reading data") 81 | df = read_elasticsearch() 82 | 83 | print("computing lsa") 84 | lsas = get_lsa_by_year(df) 85 | print("concating texts") 86 | texts = df[['year', 'abstract']].groupby('year').sum() 87 | counts = df[['year', 'abstract']].groupby('year').count() 88 | print("building dataframe") 89 | doc_dicts = [] 90 | for year, lsa in lsas.iterkv(): 91 | text = texts.ix[year]['abstract'] 92 | words = interesting_words_1(lsa, 100) 93 | lsa_terms = set(words) 94 | processed_texts = " ".join([w for w in text.split() if w in lsa_terms]) 95 | doc_dicts.append({"year": year, 96 | "lsa_abs": processed_texts, 97 | "num_docs": counts.ix[year]['abstract'], 98 | }) 99 | doc_df = 
pd.DataFrame(doc_dicts) 100 | return doc_df 101 | 102 | 103 | def main_msr_wordclouds(): 104 | make_output_dir("output/lsa_abs") 105 | doc_df = get_docs_by_year() 106 | for row, (abs, year) in doc_df.iterrows(): 107 | generate_word_cloud_image(abs, "output/lsa_abs/"+year+".jpg") 108 | 109 | 110 | def read_court_files(folder_path): 111 | import glob 112 | files = glob.glob(folder_path + "/*") 113 | texts = [] 114 | for name in files: # 'file' is a builtin type, 'name' is a less-ambiguous variable name. 115 | try: 116 | with open(name) as f: # No need to specify 'r': this is the default. 117 | texts.append(f.read()) 118 | except IOError as exc: 119 | pass 120 | return texts 121 | 122 | 123 | def get_lsa(texts): 124 | docs = [pv.Document(a) for a in texts] 125 | model = pv.Model(docs, weight=pv.TFIDF) 126 | lsa = model.reduce(2) 127 | return lsa 128 | 129 | 130 | def flatten_list(l): 131 | return [item for sublist in l for item in sublist] 132 | 133 | 134 | def main_court_lsa_words(): 135 | texts = read_court_files("court") 136 | lsa_terms = " ".join(get_lsa(texts).terms) 137 | generate_word_cloud_image(lsa_terms, "output/court_doc") 138 | 139 | 140 | def main_court_minus_lsa_words(): 141 | texts = read_court_files("court") 142 | words = interesting_words_1(get_lsa(texts), 100) 143 | lsa_terms = set(words) 144 | processed_texts = [w for w in flatten_list([t.split() for t in texts]) if w in lsa_terms] 145 | doc = " ".join(processed_texts) 146 | generate_word_cloud_image(doc, "output/court_doc.jpg") 147 | 148 | def test_wordcloud(): 149 | generate_word_cloud_image("ABC ABC ABD ABD", "test.jpg") 150 | 151 | 152 | def create_docs(): 153 | df = get_docs_by_year() 154 | cPickle.dump(df, open(PKL_FILE, 'w'), protocol=2) 155 | 156 | 157 | def load_docs(): 158 | if not os.path.exists(PKL_FILE): 159 | create_docs() 160 | return cPickle.load(open(PKL_FILE)) 161 | 162 | if __name__ == "__main__": 163 | #main_example() 164 | #test_wordcloud() 165 | create_docs() 166 | 
-------------------------------------------------------------------------------- /topic_space/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import os 4 | import time 5 | import subprocess 6 | import logging 7 | import shutil 8 | import webbrowser 9 | 10 | import numpy as np 11 | 12 | from topik.readers import iter_elastic_query 13 | from topik.tokenizers import EntitiesTokenizer 14 | from topik.vectorizers import CorpusBOW 15 | from topik.models import LDA 16 | from topik.viz import Termite 17 | from topik.utils import to_r_ldavis, generate_csv_output_file 18 | 19 | 20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 21 | 22 | BASEDIR = os.path.abspath(os.path.dirname(__file__)) 23 | 24 | ES_INSTANCE = os.environ['ES_INSTANCE'] 25 | ES_INDEX = os.environ['ES_INDEX'] 26 | 27 | def run_topic_model(field, subfield, output_dir, n_topics, seed=42): 28 | 29 | np.random.seed(seed) 30 | 31 | documents = iter_elastic_query(ES_INSTANCE, ES_INDEX, field, subfield) 32 | 33 | corpus = EntitiesTokenizer(documents) 34 | 35 | if os.path.isdir(output_dir): 36 | shutil.rmtree(output_dir) 37 | 38 | os.makedirs(output_dir) 39 | 40 | # Create dictionary 41 | corpus_bow = CorpusBOW(corpus) 42 | corpus_dict = corpus_bow.save_dict(os.path.join(output_dir, 'corpus.dict')) 43 | # Serialize and store the corpus 44 | corpus_file = corpus_bow.serialize(os.path.join(output_dir, 'corpus.mm')) 45 | # Create LDA model from corpus and dictionary 46 | lda = LDA(os.path.join(output_dir, 'corpus.mm'), os.path.join(output_dir,'corpus.dict'), n_topics, 47 | update_every=1, chuncksize=10000, passes=1) 48 | # Generate the input for the termite plot 49 | lda.termite_data(os.path.join(output_dir,'termite.csv')) 50 | # Get termite plot for this model 51 | 52 | termite = Termite(os.path.join(output_dir,'termite.csv'), "Termite Plot") 53 | 
termite.plot(os.path.join(output_dir,'termite.html')) 54 | 55 | df_results = generate_csv_output_file(documents, corpus, corpus_bow, lda.model) 56 | 57 | to_r_ldavis(corpus_bow, dir_name=os.path.join(output_dir, 'ldavis'), lda=lda) 58 | os.environ["LDAVIS_DIR"] = os.path.join(output_dir, 'ldavis') 59 | try: 60 | subprocess.call(['Rscript', os.path.join(BASEDIR,'R/runLDAvis.R')]) 61 | except ValueError: 62 | logging.warning("Unable to run runLDAvis.R") 63 | -------------------------------------------------------------------------------- /topic_space/run.py: -------------------------------------------------------------------------------- 1 | from app.app import app 2 | 3 | if __name__ == '__main__': 4 | app.run(debug=True, host='0.0.0.0', port=8017) 5 | 6 | --------------------------------------------------------------------------------