├── .gitignore ├── LICENSE ├── README.md ├── dat ├── PeerRead │ └── proc │ │ └── arxiv-all.tf_record └── reddit │ └── README.md └── src ├── .gitignore ├── .idea ├── encodings.xml ├── misc.xml ├── modules.xml ├── src.iml └── vcs.xml ├── PeerRead ├── ScienceParse │ ├── Paper.py │ ├── README.md │ ├── Review.py │ ├── ScienceParse.py │ ├── ScienceParseReader.py │ └── __init__.py ├── __init__.py ├── data_cleaning │ ├── PeerRead_hand_features.py │ ├── __init__.py │ ├── clean_PeerRead.py │ ├── extra_vocab.py │ ├── process_PeerRead_abstracts.py │ └── scripts │ │ ├── clean_PeerRead.sh │ │ ├── clean_nips_prefix.sh │ │ └── merge_train_dev_test.sh ├── dataset │ ├── __init__.py │ ├── array_from_dataset.py │ ├── dataset.py │ └── sentence_masking.py ├── model │ ├── __init__.py │ ├── bert_multiclass.py │ ├── run_causal_bert.py │ └── run_multiclass.py └── submit_scripts │ ├── run_model.sh │ └── run_unsupervised.sh ├── __init__.py ├── bert ├── README ├── __init__.py ├── create_pretraining_data.py ├── modeling.py ├── optimization.py └── tokenization.py ├── causal_bert ├── __init__.py ├── bert_predictors.py ├── bert_unsupervised.py └── logging.py ├── data_cleaning └── reddit_posts.py ├── lda_baseline ├── helpers.py ├── peerread_fit_topics.py ├── peerread_get_abstracts.py ├── peerread_output_att.py ├── reddit_fit_topics.py ├── reddit_output_att.py └── scripts │ └── sweep_over_sims.sh ├── model_checking └── plot_adjustment.py ├── plot_treatment_model.ipynb ├── reddit ├── __init__.py ├── data_cleaning │ ├── BigQuery_get_data │ ├── __init__.py │ ├── process_reddit.py │ ├── reddit_gender_sentiment.ipynb │ └── reddit_posts.py ├── dataset │ ├── __init__.py │ ├── array_from_dataset.py │ ├── dataset.py │ └── sentence_masking.py ├── model │ ├── __init__.py │ ├── run_causal_bert.py │ ├── run_subreddit_classifier.py │ ├── run_unsupervised_pretraining.py │ └── subreddit_predictors.py └── submit_scripts │ ├── run_model.sh │ └── run_unsupervised.sh ├── result_processing ├── compute_ate.py ├── compute_att.py ├── helpers.py ├── process_predictions.py ├── prop_sim_plotting.py └── test_cond_indep.py ├── semi_parametric_estimation ├── __init__.py ├── ate.py ├── att.py └── helpers.py ├── supervised_lda ├── add_split_to_simulations.ipynb ├── compute_estimates.py ├── helpers.py ├── peerread_output_att.py ├── reddit_output_att.py ├── run_supervised_tm.py ├── submit_scripts │ ├── peerread-exps │ │ ├── run_peerread_simulation.sh │ │ ├── submit_no_sup.sh │ │ ├── submit_no_unsup.sh │ │ ├── submit_nonlinear.sh │ │ └── submit_peerread_simulation.sh │ └── reddit-exps │ │ ├── run_reddit_simulation.sh │ │ ├── submit_no_sup.sh │ │ ├── submit_no_unsup.sh │ │ ├── submit_nonlinear.sh │ │ ├── submit_reddit_simulation.sh │ │ └── submit_reddit_test.sh ├── supervised_topic_model.py └── test_slda.ipynb └── words_baseline ├── helpers.py ├── peerread_output_ate.py ├── reddit_output_att.py └── scripts └── sweep_over_sims.sh /.gitignore: -------------------------------------------------------------------------------- 1 | logdir/** 2 | **/tmp/** 3 | output/** 4 | dat/** 5 | dat/gender-text-corpus 6 | .DS_Store 7 | **/.DS_Store 8 | **/*.pyc 9 | **/*.pyo 10 | *checkpoint* 11 | *aux 12 | *log 13 | *.out 14 | *.synct* 15 | *__pycache__* 16 | 17 | ################################# 18 | # Victor's standard gitignore 19 | # mostly python and tex 20 | ################################# 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | 
.Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | 127 | # JetBrains (PyCharm) stuff 128 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 129 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 130 | 131 | # User-specific stuff 132 | .idea/**/workspace.xml 133 | .idea/**/tasks.xml 134 | .idea/**/usage.statistics.xml 135 | .idea/**/dictionaries 136 | .idea/**/shelf 137 | 138 | # Generated files 139 | .idea/**/contentModel.xml 140 | 141 | # Sensitive or high-churn files 142 | .idea/**/dataSources/ 143 | .idea/**/dataSources.ids 144 | .idea/**/dataSources.local.xml 145 | .idea/**/sqlDataSources.xml 146 | .idea/**/dynamic.xml 147 | .idea/**/uiDesigner.xml 148 | .idea/**/dbnavigator.xml 149 | 150 | # Gradle 151 | .idea/**/gradle.xml 152 | .idea/**/libraries 153 | 154 | # Gradle and Maven with auto-import 155 | # When using Gradle or Maven with auto-import, you should exclude module files, 156 | # since they will be recreated, and may cause churn. Uncomment if using 157 | # auto-import. 
158 | # .idea/modules.xml 159 | # .idea/*.iml 160 | # .idea/modules 161 | 162 | # CMake 163 | cmake-build-*/ 164 | 165 | # Mongo Explorer plugin 166 | .idea/**/mongoSettings.xml 167 | 168 | # File-based project format 169 | *.iws 170 | 171 | # IntelliJ 172 | out/ 173 | 174 | # mpeltonen/sbt-idea plugin 175 | .idea_modules/ 176 | 177 | # JIRA plugin 178 | atlassian-ide-plugin.xml 179 | 180 | # Cursive Clojure plugin 181 | .idea/replstate.xml 182 | 183 | # Crashlytics plugin (for Android Studio and IntelliJ) 184 | com_crashlytics_export_strings.xml 185 | crashlytics.properties 186 | crashlytics-build.properties 187 | fabric.properties 188 | 189 | # Editor-based Rest Client 190 | .idea/httpRequests 191 | 192 | # Android studio 3.1+ serialized cache file 193 | .idea/caches/build_file_checksums.ser 194 | 195 | # text 196 | *.pdf 197 | 198 | # linux backup files 199 | *~ 200 | *# 201 | 202 | ## Core latex/pdflatex auxiliary files: 203 | *.aux 204 | *.lof 205 | *.log 206 | *.lot 207 | *.fls 208 | *.out 209 | *.toc 210 | *.fmt 211 | *.fot 212 | *.cb 213 | *.cb2 214 | 215 | ## Intermediate documents: 216 | *.dvi 217 | *-converted-to.* 218 | # these rules might exclude image files for figures etc. 219 | # *.ps 220 | # *.eps 221 | # *.pdf 222 | 223 | ## Generated if empty string is given at "Please type another file name for output:" 224 | .pdf 225 | 226 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 227 | *.bbl 228 | *.bcf 229 | *.blg 230 | *-blx.aux 231 | *-blx.bib 232 | *.run.xml 233 | 234 | ## Build tool auxiliary files: 235 | *.fdb_latexmk 236 | *.synctex 237 | *.synctex(busy) 238 | *.synctex.gz 239 | *.synctex.gz(busy) 240 | *.pdfsync 241 | 242 | ## Auxiliary and intermediate files from other packages: 243 | # algorithms 244 | *.alg 245 | *.loa 246 | 247 | # achemso 248 | acs-*.bib 249 | 250 | # amsthm 251 | *.thm 252 | 253 | # beamer 254 | *.nav 255 | *.pre 256 | *.snm 257 | *.vrb 258 | 259 | # changes 260 | *.soc 261 | 262 | # cprotect 263 | *.cpt 264 | 265 | # elsarticle (documentclass of Elsevier journals) 266 | *.spl 267 | 268 | # endnotes 269 | *.ent 270 | 271 | # fixme 272 | *.lox 273 | 274 | # feynmf/feynmp 275 | *.mf 276 | *.mp 277 | *.t[1-9] 278 | *.t[1-9][0-9] 279 | *.tfm 280 | 281 | #(r)(e)ledmac/(r)(e)ledpar 282 | *.end 283 | *.?end 284 | *.[1-9] 285 | *.[1-9][0-9] 286 | *.[1-9][0-9][0-9] 287 | *.[1-9]R 288 | *.[1-9][0-9]R 289 | *.[1-9][0-9][0-9]R 290 | *.eledsec[1-9] 291 | *.eledsec[1-9]R 292 | *.eledsec[1-9][0-9] 293 | *.eledsec[1-9][0-9]R 294 | *.eledsec[1-9][0-9][0-9] 295 | *.eledsec[1-9][0-9][0-9]R 296 | 297 | # glossaries 298 | *.acn 299 | *.acr 300 | *.glg 301 | *.glo 302 | *.gls 303 | *.glsdefs 304 | 305 | # gnuplottex 306 | *-gnuplottex-* 307 | 308 | # gregoriotex 309 | *.gaux 310 | *.gtex 311 | 312 | # hyperref 313 | *.brf 314 | 315 | # knitr 316 | *-concordance.tex 317 | # TODO Comment the next line if you want to keep your tikz graphics files 318 | *.tikz 319 | *-tikzDictionary 320 | 321 | # listings 322 | *.lol 323 | 324 | # makeidx 325 | *.idx 326 | *.ilg 327 | *.ind 328 | *.ist 329 | 330 | # minitoc 331 | *.maf 332 | *.mlf 333 | *.mlt 334 | *.mtc[0-9]* 335 | *.slf[0-9]* 336 | *.slt[0-9]* 337 | *.stc[0-9]* 338 | 339 | # minted 340 | _minted* 341 | *.pyg 342 | 343 | # morewrites 344 | *.mw 345 | 346 | # nomencl 347 | *.nlo 348 | 349 | # pax 350 | *.pax 351 | 352 | # pdfpcnotes 353 | *.pdfpc 354 | 355 | # sagetex 356 | *.sagetex.sage 357 | *.sagetex.py 358 | *.sagetex.scmd 359 | 360 | # scrwfile 361 | *.wrt 362 | 363 | # sympy 364 | *.sout 365 | *.sympy 366 
| sympy-plots-for-*.tex/ 367 | 368 | # pdfcomment 369 | *.upa 370 | *.upb 371 | 372 | # pythontex 373 | *.pytxcode 374 | pythontex-files-*/ 375 | 376 | # thmtools 377 | *.loe 378 | 379 | # TikZ & PGF 380 | *.dpth 381 | *.md5 382 | *.auxlock 383 | 384 | # todonotes 385 | *.tdo 386 | 387 | # easy-todo 388 | *.lod 389 | 390 | # xindy 391 | *.xdy 392 | 393 | # xypic precompiled matrices 394 | *.xyc 395 | 396 | # endfloat 397 | *.ttt 398 | *.fff 399 | 400 | # Latexian 401 | TSWLatexianTemp* 402 | 403 | ## Editors: 404 | # WinEdt 405 | *.bak 406 | *.sav 407 | 408 | # Texpad 409 | .texpadtmp 410 | 411 | # Kile 412 | *.backup 413 | 414 | # KBibTeX 415 | *~[0-9]* 416 | 417 | # auto folder when using emacs and auctex 418 | auto/* 419 | 420 | # auto folder when using emacs and auctex 421 | auto 422 | 423 | # expex forward references with \gathertags 424 | *-tags.tex 425 | 426 | # os x stuff 427 | .DS_Store 428 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Blei Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains software and data for "Using Text Embeddings for Causal Inference" ([arxiv.org/abs/1905.12741](https://arxiv.org/abs/1905.12741)). 4 | The paper describes a method for causal inference with text documents. For example, does adding a 5 | theorem to a paper affect its chance of acceptance? The method adapts deep language models to address the causal problem. 6 | 7 | This software builds on 8 | 1. Bert: [github.com/google-research/bert](https://github.com/google-research/bert), and on 9 | 2. PeerRead: [github.com/allenai/PeerRead](https://github.com/allenai/PeerRead) 10 | 11 | We include pre-processed PeerRead arxiv data for convenience. 12 | 13 | There is also a [reference implementation in pytorch.](https://github.com/rpryzant/causal-bert-pytorch) 14 | 15 | # Tensorflow 2 16 | For new projects, we recommend building on the [reference tensorflow 2 implementation](https://github.com/vveitch/causal-text-embeddings-tf2). 17 | 18 | # Requirements and setup 19 | 20 | 1. You'll need to download a pre-trained BERT model (following the above github link). 
We use `uncased_L-12_H-768_A-12`. 21 | 2. Install Tensorflow 1.12. 22 | 23 | # Data 24 | 25 | 1. We include a pre-processed copy of PeerRead data for convenience. 26 | This data is a collection of arXiv papers submitted to computer science conferences, the accept/reject decisions for these papers, 27 | and their abstracts. 28 | The raw PeerRead data contains significantly more information. 29 | You can get the raw data by following instructions at [github.com/allenai/PeerRead](https://github.com/allenai/PeerRead). 30 | Running the included pre-processing scripts in the PeerRead folder will recreate the included tfrecord file. 31 | 32 | 2. The reddit data can be downloaded at [archive.org/details/reddit_posts_2018](https://archive.org/details/reddit_posts_2018). 33 | This data includes all top-level reddit comments where the gender of the poster was annotated in some fashion. 34 | Each post has meta information (score, date, username, etc.) and includes the text for the first reply. 35 | The processed data used in the paper can be recreated by running the pre-processing scripts in the `reddit` folder. 36 | 37 | You can also re-collect the data from Google BigQuery. 38 | The SQL command to do this is in `reddit/data_cleaning/BigQuery_get_data`. 39 | Modifying this script will allow you to change collection parameters (e.g., the year, which responses are included). 40 | 41 | 42 | # Reproducing the PeerRead experiments 43 | 44 | The default settings for the code match the settings used in the paper. 45 | These match the default settings used by BERT, except 46 | 1. we reduce batch size to allow training on a Titan X, and 47 | 2. we adjust the learning rate to account for this. 48 | 49 | You'll run the code from `src` as 50 | `./PeerRead/submit_scripts/run_model.sh` 51 | Before doing this, you'll need to edit `run_model.sh` to change 52 | `BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12` 53 | to 54 | `BERT_BASE_DIR=[path to BERT_pre-trained]/uncased_L-12_H-768_A-12`. 55 | 56 | The flag 57 | `--treatment=theorem_referenced` 58 | controls the experiment. 59 | The flag 60 | `--simulated=real` 61 | controls whether to use the real effect or one of the semi-synthetic modes. 62 | 63 | The effect estimates can be reproduced by running `python -m result_processing.compute_ate`. 64 | This takes in the predictions of the BERT model (in tsv format) and passes them into downstream estimators 65 | of the causal effect. 66 | 67 | To reproduce the baselines, you'll need to produce a tsv for each simulated dataset you want to test on. To do this, you can run `python -m PeerRead.dataset.array_from_dataset` from `src`. The flag `--beta1=1.0` controls the strength of the confounding. (The other flags control other simulation parameters not used in the paper.) 68 | 69 | # Misc. 70 | 71 | The experiments in the paper use a version of BERT that was further pre-trained on the PeerRead corpus 72 | using an unsupervised objective. 73 | This can be replicated with `./PeerRead/submit_scripts/run_unsupervised.sh`. 74 | This takes about 24 hours on a single Titan Xp. 75 | To use a pre-trained BERT, uncomment the `INIT_DIR` options in `run_model.sh`. 76 | 77 | # Reproducing the Reddit experiment 78 | 79 | 1. First, get the data by following the instructions above and save it as `dat/reddit/2018.json`. 80 | 2. Run data pre-processing with `python -m reddit.data_cleaning.process_reddit`. 81 | 3. Once the data is processed, the instructions for running the experiments are essentially the same as for PeerRead; a consolidated command sketch follows below.
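The following is a minimal sketch that consolidates the commands already listed in this README into one PeerRead run from `src`. It only restates steps described above; which flags each submit script actually accepts, and whether they are set on the command line or by editing the script, should be checked against the scripts themselves, so treat the flag placement shown here as an assumption.

```bash
# Assumes uncased_L-12_H-768_A-12 has been downloaded and BERT_BASE_DIR in the
# submit scripts has been pointed at it (see "Requirements and setup" above).
# Run from src/.

# 1. (Optional) further pre-train BERT on the PeerRead corpus with the
#    unsupervised objective; roughly 24 hours on a single Titan Xp.
./PeerRead/submit_scripts/run_unsupervised.sh

# 2. Train the causal BERT model. The treatment and simulation settings
#    (--treatment=theorem_referenced, --simulated=real) control the experiment.
./PeerRead/submit_scripts/run_model.sh

# 3. Convert the model's tsv predictions into downstream effect estimates.
python -m result_processing.compute_ate

# 4. For the baselines, write out a tsv per simulated dataset;
#    --beta1 controls the strength of the confounding.
python -m PeerRead.dataset.array_from_dataset --beta1=1.0
```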
82 | 83 | # Maintainers 84 | [Dhanya Sridhar](https://github.com/dsridhar91) and [Victor Veitch](https://github.com/vveitch) 85 | 86 | -------------------------------------------------------------------------------- /dat/PeerRead/proc/arxiv-all.tf_record: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/dat/PeerRead/proc/arxiv-all.tf_record -------------------------------------------------------------------------------- /dat/reddit/README.md: -------------------------------------------------------------------------------- 1 | This folder is the expected location for the reddit data, "2018.json". 2 | 3 | Follow the instructions in the top-level README to get this data and save it here. 4 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | logdir/** 2 | **/tmp/** 3 | output/** 4 | dat/** 5 | dat/gender-text-corpus 6 | .DS_Store 7 | **/.DS_Store 8 | **/*.pyc 9 | **/*.pyo 10 | *checkpoint* 11 | *aux 12 | *log 13 | *.out 14 | *.synct* 15 | *__pycache__* 16 | 17 | ################################# 18 | # Victor's standard gitignore 19 | # mostly python and tex 20 | ################################# 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | 127 | # JetBrains (PyCharm) stuff 128 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 129 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 130 | 131 | # User-specific stuff 132 | .idea/**/workspace.xml 133 | .idea/**/tasks.xml 134 | .idea/**/usage.statistics.xml 135 | .idea/**/dictionaries 136 | .idea/**/shelf 137 | 138 | # Generated files 139 | .idea/**/contentModel.xml 140 | 141 | # Sensitive or high-churn files 142 | .idea/**/dataSources/ 143 | .idea/**/dataSources.ids 144 | .idea/**/dataSources.local.xml 145 | .idea/**/sqlDataSources.xml 146 | .idea/**/dynamic.xml 147 | .idea/**/uiDesigner.xml 148 | .idea/**/dbnavigator.xml 149 | 150 | # Gradle 151 | .idea/**/gradle.xml 152 | .idea/**/libraries 153 | 154 | # Gradle and Maven with auto-import 155 | # When using Gradle or Maven with auto-import, you should exclude module files, 156 | # since they will be recreated, and may cause churn. Uncomment if using 157 | # auto-import. 158 | # .idea/modules.xml 159 | # .idea/*.iml 160 | # .idea/modules 161 | 162 | # CMake 163 | cmake-build-*/ 164 | 165 | # Mongo Explorer plugin 166 | .idea/**/mongoSettings.xml 167 | 168 | # File-based project format 169 | *.iws 170 | 171 | # IntelliJ 172 | out/ 173 | 174 | # mpeltonen/sbt-idea plugin 175 | .idea_modules/ 176 | 177 | # JIRA plugin 178 | atlassian-ide-plugin.xml 179 | 180 | # Cursive Clojure plugin 181 | .idea/replstate.xml 182 | 183 | # Crashlytics plugin (for Android Studio and IntelliJ) 184 | com_crashlytics_export_strings.xml 185 | crashlytics.properties 186 | crashlytics-build.properties 187 | fabric.properties 188 | 189 | # Editor-based Rest Client 190 | .idea/httpRequests 191 | 192 | # Android studio 3.1+ serialized cache file 193 | .idea/caches/build_file_checksums.ser 194 | 195 | # text 196 | *.pdf 197 | 198 | # linux backup files 199 | *~ 200 | *# 201 | 202 | ## Core latex/pdflatex auxiliary files: 203 | *.aux 204 | *.lof 205 | *.log 206 | *.lot 207 | *.fls 208 | *.out 209 | *.toc 210 | *.fmt 211 | *.fot 212 | *.cb 213 | *.cb2 214 | 215 | ## Intermediate documents: 216 | *.dvi 217 | *-converted-to.* 218 | # these rules might exclude image files for figures etc. 
219 | # *.ps 220 | # *.eps 221 | # *.pdf 222 | 223 | ## Generated if empty string is given at "Please type another file name for output:" 224 | .pdf 225 | 226 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 227 | *.bbl 228 | *.bcf 229 | *.blg 230 | *-blx.aux 231 | *-blx.bib 232 | *.run.xml 233 | 234 | ## Build tool auxiliary files: 235 | *.fdb_latexmk 236 | *.synctex 237 | *.synctex(busy) 238 | *.synctex.gz 239 | *.synctex.gz(busy) 240 | *.pdfsync 241 | 242 | ## Auxiliary and intermediate files from other packages: 243 | # algorithms 244 | *.alg 245 | *.loa 246 | 247 | # achemso 248 | acs-*.bib 249 | 250 | # amsthm 251 | *.thm 252 | 253 | # beamer 254 | *.nav 255 | *.pre 256 | *.snm 257 | *.vrb 258 | 259 | # changes 260 | *.soc 261 | 262 | # cprotect 263 | *.cpt 264 | 265 | # elsarticle (documentclass of Elsevier journals) 266 | *.spl 267 | 268 | # endnotes 269 | *.ent 270 | 271 | # fixme 272 | *.lox 273 | 274 | # feynmf/feynmp 275 | *.mf 276 | *.mp 277 | *.t[1-9] 278 | *.t[1-9][0-9] 279 | *.tfm 280 | 281 | #(r)(e)ledmac/(r)(e)ledpar 282 | *.end 283 | *.?end 284 | *.[1-9] 285 | *.[1-9][0-9] 286 | *.[1-9][0-9][0-9] 287 | *.[1-9]R 288 | *.[1-9][0-9]R 289 | *.[1-9][0-9][0-9]R 290 | *.eledsec[1-9] 291 | *.eledsec[1-9]R 292 | *.eledsec[1-9][0-9] 293 | *.eledsec[1-9][0-9]R 294 | *.eledsec[1-9][0-9][0-9] 295 | *.eledsec[1-9][0-9][0-9]R 296 | 297 | # glossaries 298 | *.acn 299 | *.acr 300 | *.glg 301 | *.glo 302 | *.gls 303 | *.glsdefs 304 | 305 | # gnuplottex 306 | *-gnuplottex-* 307 | 308 | # gregoriotex 309 | *.gaux 310 | *.gtex 311 | 312 | # hyperref 313 | *.brf 314 | 315 | # knitr 316 | *-concordance.tex 317 | # TODO Comment the next line if you want to keep your tikz graphics files 318 | *.tikz 319 | *-tikzDictionary 320 | 321 | # listings 322 | *.lol 323 | 324 | # makeidx 325 | *.idx 326 | *.ilg 327 | *.ind 328 | *.ist 329 | 330 | # minitoc 331 | *.maf 332 | *.mlf 333 | *.mlt 334 | *.mtc[0-9]* 335 | *.slf[0-9]* 336 | *.slt[0-9]* 337 | *.stc[0-9]* 338 | 339 | # minted 340 | _minted* 341 | *.pyg 342 | 343 | # morewrites 344 | *.mw 345 | 346 | # nomencl 347 | *.nlo 348 | 349 | # pax 350 | *.pax 351 | 352 | # pdfpcnotes 353 | *.pdfpc 354 | 355 | # sagetex 356 | *.sagetex.sage 357 | *.sagetex.py 358 | *.sagetex.scmd 359 | 360 | # scrwfile 361 | *.wrt 362 | 363 | # sympy 364 | *.sout 365 | *.sympy 366 | sympy-plots-for-*.tex/ 367 | 368 | # pdfcomment 369 | *.upa 370 | *.upb 371 | 372 | # pythontex 373 | *.pytxcode 374 | pythontex-files-*/ 375 | 376 | # thmtools 377 | *.loe 378 | 379 | # TikZ & PGF 380 | *.dpth 381 | *.md5 382 | *.auxlock 383 | 384 | # todonotes 385 | *.tdo 386 | 387 | # easy-todo 388 | *.lod 389 | 390 | # xindy 391 | *.xdy 392 | 393 | # xypic precompiled matrices 394 | *.xyc 395 | 396 | # endfloat 397 | *.ttt 398 | *.fff 399 | 400 | # Latexian 401 | TSWLatexianTemp* 402 | 403 | ## Editors: 404 | # WinEdt 405 | *.bak 406 | *.sav 407 | 408 | # Texpad 409 | .texpadtmp 410 | 411 | # Kile 412 | *.backup 413 | 414 | # KBibTeX 415 | *~[0-9]* 416 | 417 | # auto folder when using emacs and auctex 418 | auto/* 419 | 420 | # auto folder when using emacs and auctex 421 | auto 422 | 423 | # expex forward references with \gathertags 424 | *-tags.tex 425 | 426 | # os x stuff 427 | .DS_Store 428 | -------------------------------------------------------------------------------- /src/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- 
/src/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | -------------------------------------------------------------------------------- /src/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/.idea/src.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /src/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/Paper.py: -------------------------------------------------------------------------------- 1 | import re,io,json,sys 2 | from .Review import Review 3 | 4 | class Paper: 5 | """A paper class, which contains relevant fields and a list of reviews""" 6 | def __init__(self, TITLE, ABSTRACT, ID, REVIEWS, AUTHORS=None, CONFERENCE=None, ACCEPTED=None, SCORE=None, 7 | PUBLICATION_TYPE=None, SCIENCEPARSE=None, KEYWORDS=None, AUTHOR_EMAILS=None, DATE_OF_SUBMISSION=None, 8 | SUBJECTS=None,COMMENTS=None,VERSION=None,HISTORIES=None): 9 | self.TITLE = TITLE 10 | self.ABSTRACT = re.sub("\\n", " ", ABSTRACT) 11 | self.ID = ID 12 | self.AUTHORS = AUTHORS 13 | self.REVIEWS = REVIEWS 14 | self.SCIENCEPARSE = SCIENCEPARSE 15 | self.CONFERENCE = CONFERENCE 16 | self.ACCEPTED = ACCEPTED 17 | self.SCORE = SCORE 18 | self.PUBLICATION_TYPE = PUBLICATION_TYPE 19 | self.KEYWORDS = KEYWORDS 20 | self.AUTHOR_EMAILS = AUTHOR_EMAILS 21 | self.DATE_OF_SUBMISSION = DATE_OF_SUBMISSION 22 | 23 | # additional properties for arxiv papers 24 | self.SUBJECTS = SUBJECTS 25 | self.COMMENTS = COMMENTS 26 | self.VERSION = VERSION 27 | self.HISTORIES = HISTORIES #[(version,date,link,comments),...] 
28 | 29 | @staticmethod 30 | def from_softconf_dump(json_file, conference=None): 31 | with io.open(json_file, "r", encoding="utf8") as ifh: 32 | json_str = ifh.read() 33 | 34 | # print (json_str) 35 | json_data = json.loads(json_str)["submissions"] 36 | 37 | papers = [] 38 | for i in range(len(json_data)): 39 | reviews = [] 40 | for k in range(len(json_data[i]["reviews"])): 41 | # print(json_data[i]["reviews"][k]) 42 | review_data = [] 43 | 44 | review = Review.from_json_object(json_data[i]["reviews"][k], k==i==0) 45 | #review = None 46 | 47 | reviews.append(review) 48 | 49 | authors = json_data[i]["authors"] if "authors" in json_data[i] else None 50 | score = json_data[i]["score"] if "score" in json_data[i] else None 51 | accepted = json_data[i]["accepted"] if "accepted" in json_data[i] else None 52 | publication_type = json_data[i]["publication_type"] if "publication_type" in json_data[i] else None 53 | keywords = json_data[i]["KEYWORDS"] if "KEYWORDS" in json_data[i] else None 54 | author_emails = json_data[i]["AUTHOR_EMAILS"] if "AUTHOR_EMAILS" in json_data[i] else None 55 | date_of_submission = json_data[i]["DATE_OF_SUBMISSION"] if "DATE_OF_SUBMISSION" in json_data[i] else None 56 | 57 | paper = Paper(json_data[i]["title"], json_data[i]["abstract"], json_data[i]["id"], reviews, authors, \ 58 | conference, accepted, score, publication_type, None, keywords, author_emails, \ 59 | date_of_submission) 60 | 61 | papers.append(paper) 62 | # break 63 | 64 | return papers 65 | 66 | @staticmethod 67 | def from_json(json_filename, from_annotated = False): 68 | paper = Paper('', '', None, []) 69 | 70 | datas = [] 71 | with io.open(json_filename, mode='rt', encoding='utf8') as json_file: 72 | for line in json_file: 73 | try: 74 | data = json.loads(line.strip()) 75 | datas.append(data) 76 | except Exception as e: 77 | print(line) 78 | continue 79 | if len(datas)==0: return None 80 | data = datas[-1] 81 | 82 | # Read required fields. 83 | assert 'title' in data 84 | assert 'abstract' in data 85 | paper.TITLE = data['title'] 86 | paper.ABSTRACT = data['abstract'] 87 | 88 | if 'id' in data: 89 | if data['id'] == "": 90 | paper.ID = json_filename.split("/")[-1].split(".")[0] 91 | else: 92 | paper.ID = data['id'] 93 | else: 94 | paper.ID = json_filename.split("/")[-1].split(".")[0] 95 | 96 | # Read optional fields. 97 | paper.AUTHORS = data['authors'] if 'authors' in data else None 98 | paper.CONFERENCE = data['conference'] if 'conference' in data else None 99 | paper.ACCEPTED = data['accepted'] if 'accepted' in data else None 100 | paper.SCORE = data['score'] if 'score' in data else None 101 | paper.PUBLICATION_TYPE = data['publication_type'] if 'publication_type' in data else None 102 | paper.SCIENCEPARSE = data['scienceparse'] if 'scienceparse' in data else None 103 | paper.KEYWORDS = data['keywords'] if 'keywords' in data else None 104 | paper.AUTHOR_EMAILS = data['author_emails'] if 'author_emails' in data else None 105 | 106 | paper.DATE_OF_SUBMISSION = data['DATE_OF_SUBMISSION'] if 'DATE_OF_SUBMISSION' in data else None 107 | 108 | paper.SUBJECTS = data['SUBJECTS'] if 'SUBJECTS' in data else None 109 | paper.COMMENTS = data['COMMENTS'] if 'COMMENTS' in data else None 110 | paper.VERSION = data['VERSION'] if 'VERSION' in data else None 111 | paper.HISTORIES = data['histories'] if 'histories' in data else None 112 | 113 | # Read reviews (mandatory). 
114 | assert 'reviews' in data 115 | for review_data in data['reviews']: 116 | review = Review.from_json_object(review_data) 117 | paper.REVIEWS.append(review) 118 | return paper 119 | 120 | 121 | 122 | def to_json_object(self): 123 | data = dict() 124 | 125 | data["title"] = self.get_title() 126 | data["abstract"] = self.get_abstract() 127 | data["id"] = self.get_id() 128 | 129 | if self.AUTHORS is not None: 130 | data["authors"] = self.get_authors() 131 | 132 | if self.CONFERENCE is not None: 133 | data["conference"] = self.get_conference() 134 | 135 | if self.ACCEPTED is not None: 136 | data["accepted"] = self.get_accepted() 137 | 138 | if self.SCORE is not None: 139 | data["SCORE"] = self.get_score() 140 | 141 | if self.PUBLICATION_TYPE is not None: 142 | data["publication_type"] = self.get_publication_type() 143 | 144 | if self.SCIENCEPARSE is not None: 145 | data["SCIENCEPARSE"] = self.get_scienceparse() 146 | 147 | if self.AUTHOR_EMAILS is not None: 148 | data["AUTHOR_EMAILS"] = self.get_author_emails() 149 | 150 | if self.KEYWORDS is not None: 151 | data["KEYWORDS"] = self.get_keywords() 152 | 153 | if self.DATE_OF_SUBMISSION is not None: 154 | data["DATE_OF_SUBMISSION"] = self.get_date_of_submission() 155 | 156 | data["reviews"] = [] 157 | 158 | for r in self.get_reviews(): 159 | data["reviews"].append(r.to_json_object()) 160 | 161 | # added for arxiv papers 162 | 163 | if self.SUBJECTS is not None: 164 | data["SUBJECTS"] = self.get_subjects() 165 | 166 | if self.COMMENTS is not None: 167 | data["COMMENTS"] = self.get_comments() 168 | 169 | if self.VERSION is not None: 170 | data["VERSION"] = self.get_version() 171 | 172 | data["histories"] = [] 173 | if self.HISTORIES is not None: 174 | for h in self.get_histories(): 175 | if h is not None: 176 | v,d,l,p = h 177 | data["histories"].append((v,d,l, p if p else None)) 178 | 179 | return data 180 | 181 | def to_json(self, json_file, mode='a'): 182 | 183 | data = self.to_json_object() 184 | 185 | with open(json_file, mode) as ofh: 186 | json.dump(data, ofh) 187 | ofh.write("\n") 188 | 189 | 190 | def get_subjects(self): 191 | return self.SUBJECTS 192 | def get_comments(self): 193 | return self.COMMENTS 194 | def get_version(self): 195 | return self.VERSION 196 | def get_histories(self): 197 | return self.HISTORIES 198 | 199 | 200 | def get_title(self): 201 | return self.TITLE 202 | 203 | def get_abstract(self): 204 | return self.ABSTRACT 205 | 206 | def abstract_contains_a_term(self, term): 207 | return (term in self.ABSTRACT) 208 | 209 | def get_id(self): 210 | return self.ID 211 | 212 | def get_authors(self): 213 | return self.AUTHORS 214 | 215 | def get_reviews(self): 216 | return self.REVIEWS 217 | 218 | def get_scienceparse(self): 219 | return self.SCIENCEPARSE 220 | 221 | def get_title_len(self): 222 | return len(self.TITLE) 223 | 224 | def get_abstract_len(self): 225 | return len(self.ABSTRACT) 226 | 227 | def get_conference(self): 228 | return self.CONFERENCE 229 | 230 | def get_score(self): 231 | return self.SCORE 232 | 233 | def get_accepted(self): 234 | return self.ACCEPTED 235 | 236 | def get_publication_type(self): 237 | return self.PUBLICATION_TYPE 238 | 239 | def get_author_emails(self): 240 | return self.AUTHOR_EMAILS 241 | 242 | def get_keywords(self): 243 | return self.KEYWORDS 244 | 245 | def get_date_of_submission(self): 246 | return self.DATE_OF_SUBMISSION 247 | 248 | def main(args): 249 | papers = Paper.from_softconf_dump('../../data/conll16/reviews.json') 250 | for paper in papers: 251 | 
paper.to_json('../../data/conll16_new/{}.json'.format(paper.ID)) 252 | 253 | if __name__ == "__main__": 254 | sys.exit(main(sys.argv)) 255 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/README.md: -------------------------------------------------------------------------------- 1 | Code from ScienceParse (via PeerRead) 2 | 3 | TODO: determine liscense and add it -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/Review.py: -------------------------------------------------------------------------------- 1 | 2 | class Review: 3 | 4 | """A review class, contains all bunch of relevant fields""" 5 | def __init__(self, RECOMMENDATION, COMMENTS, REPLICABILITY=None, PRESENTATION_FORMAT=None, \ 6 | CLARITY=None, MEANINGFUL_COMPARISON=None, SUBSTANCE=None, REVIEWER_CONFIDENCE=None, \ 7 | SOUNDNESS_CORRECTNESS=None, APPROPRIATENESS=None, IMPACT=None, ORIGINALITY=None, OTHER_KEYS=None, \ 8 | IS_META_REVIEW=False, TITLE=None, DATE=None, RECOMMENDATION_UNOFFICIAL=None, IS_ANNOTATED=False): 9 | self.RECOMMENDATION = RECOMMENDATION 10 | self.RECOMMENDATION_UNOFFICIAL = RECOMMENDATION_UNOFFICIAL #None # only for aspect prediction 11 | self.IS_ANNOTATED = IS_ANNOTATED 12 | 13 | self.COMMENTS = COMMENTS 14 | self.REPLICABILITY = REPLICABILITY 15 | self.PRESENTATION_FORMAT = PRESENTATION_FORMAT 16 | self.CLARITY = CLARITY 17 | self.MEANINGFUL_COMPARISON = MEANINGFUL_COMPARISON 18 | self.SUBSTANCE = SUBSTANCE 19 | self.REVIEWER_CONFIDENCE = REVIEWER_CONFIDENCE 20 | self.SOUNDNESS_CORRECTNESS = SOUNDNESS_CORRECTNESS 21 | self.APPROPRIATENESS = APPROPRIATENESS 22 | self.IMPACT = IMPACT 23 | self.ORIGINALITY = ORIGINALITY 24 | self.OTHER_KEYS = OTHER_KEYS 25 | self.IS_META_REVIEW = IS_META_REVIEW 26 | self.TITLE = TITLE 27 | self.DATE = DATE 28 | 29 | @staticmethod 30 | def get_json_string(json_object, string, missing_fields): 31 | if string in json_object: 32 | return json_object[string] 33 | elif missing_fields is not None: 34 | missing_fields.append(string) 35 | 36 | return None 37 | 38 | @staticmethod 39 | def from_json_object(json_object, print_missing_fields=False): 40 | assert "comments" in json_object 41 | comments = json_object["comments"] 42 | 43 | missing_fields = None 44 | 45 | if print_missing_fields: 46 | missing_fields = [] 47 | 48 | recommendation = Review.get_json_string(json_object, "RECOMMENDATION", missing_fields) 49 | 50 | 51 | recommendation_unofficial = Review.get_json_string(json_object, "RECOMMENDATION_UNOFFICIAL", missing_fields) 52 | 53 | is_annotated = Review.get_json_string(json_object, "IS_ANNOTATED", missing_fields) 54 | 55 | replicability = Review.get_json_string(json_object, "REPLICABILITY", missing_fields) 56 | clarity = Review.get_json_string(json_object, "CLARITY", missing_fields) 57 | substance = Review.get_json_string(json_object, "SUBSTANCE", missing_fields) 58 | appropriateness = Review.get_json_string(json_object, "APPROPRIATENESS", missing_fields) 59 | originality = Review.get_json_string(json_object, "ORIGINALITY", missing_fields) 60 | presentation_format = Review.get_json_string(json_object, "PRESENTATION_FORMAT", missing_fields) 61 | meaningful_comparison = Review.get_json_string(json_object, "MEANINGFUL_COMPARISON", missing_fields) 62 | reviewer_confidence = Review.get_json_string(json_object, "REVIEWER_CONFIDENCE", missing_fields) 63 | soundness_correctness = Review.get_json_string(json_object, "SOUNDNESS_CORRECTNESS", missing_fields) 64 | impact = 
Review.get_json_string(json_object, "IMPACT", missing_fields) 65 | is_meta_review = Review.get_json_string(json_object, "IS_META_REVIEW", missing_fields) 66 | date = Review.get_json_string(json_object, "DATE", missing_fields) 67 | title = Review.get_json_string(json_object, "TITLE", missing_fields) 68 | other_keys = Review.get_json_string(json_object, "OTHER_KEYS", missing_fields) 69 | 70 | if print_missing_fields and len(missing_fields): 71 | print("The following fields are missing in json input file:",missing_fields) 72 | return Review(recommendation, comments, replicability, presentation_format, clarity, meaningful_comparison, \ 73 | substance, reviewer_confidence, soundness_correctness, appropriateness, impact, originality, \ 74 | other_keys, is_meta_review, title, date, recommendation_unofficial, is_annotated ) 75 | 76 | def to_json_object(self): 77 | data = dict() 78 | 79 | data["comments"] = self.get_comments().decode('cp1252', errors='ignore').encode('utf-8') 80 | 81 | if self.RECOMMENDATION is not None: 82 | data["RECOMMENDATION"] = self.get_recommendation() 83 | 84 | if self.RECOMMENDATION_UNOFFICIAL is not None: 85 | data["RECOMMENDATION_UNOFFICIAL"] = self.get_recommendation_unofficial() 86 | if self.IS_ANNOTATED is not None: 87 | data["IS_ANNOTATED"] = self.get_is_annotated() 88 | 89 | 90 | if self.REPLICABILITY is not None: 91 | data["REPLICABILITY"] = self.get_replicability() 92 | if self.PRESENTATION_FORMAT is not None: 93 | data["PRESENTATION_FORMAT"] = self.get_presentation_format() 94 | if self.CLARITY is not None: 95 | data["CLARITY"] = self.get_clarity() 96 | if self.MEANINGFUL_COMPARISON is not None: 97 | data["MEANINGFUL_COMPARISON"] = self.get_meaningful_comparison() 98 | if self.SUBSTANCE is not None: 99 | data["SUBSTANCE"] = self.get_substance() 100 | if self.REVIEWER_CONFIDENCE is not None: 101 | data["REVIEWER_CONFIDENCE"] = self.get_reviewer_confidence() 102 | if self.SOUNDNESS_CORRECTNESS is not None: 103 | data["SOUNDNESS_CORRECTNESS"] = self.get_soundness_correctness() 104 | if self.APPROPRIATENESS is not None: 105 | data["APPROPRIATENESS"] = self.get_appropriateness() 106 | if self.IMPACT is not None: 107 | data["IMPACT"] = self.get_impact() 108 | if self.ORIGINALITY is not None: 109 | data["ORIGINALITY"] = self.get_originality() 110 | if self.OTHER_KEYS is not None: 111 | data["OTHER_KEYS"] = self.get_other_keys() 112 | if self.IS_META_REVIEW is not None: 113 | data["IS_META_REVIEW"] = self.is_meta_review() 114 | if self.TITLE is not None: 115 | data["TITLE"] = self.get_title() 116 | if self.DATE is not None: 117 | data["DATE"] = self.get_date() 118 | 119 | 120 | return data 121 | 122 | def get_recommendation(self): 123 | return self.RECOMMENDATION 124 | 125 | def get_recommendation_unofficial(self): 126 | return self.RECOMMENDATION_UNOFFICIAL 127 | 128 | def get_is_annotated(self): 129 | return self.IS_ANNOTATED 130 | 131 | def get_comments(self): 132 | return self.COMMENTS 133 | 134 | def get_replicability(self): 135 | return self.REPLICABILITY 136 | 137 | def get_presentation_format(self): 138 | return self.PRESENTATION_FORMAT 139 | 140 | def get_clarity(self): 141 | return self.CLARITY 142 | 143 | def get_meaningful_comparison(self): 144 | return self.MEANINGFUL_COMPARISON 145 | 146 | def get_substance(self): 147 | return self.SUBSTANCE 148 | 149 | def get_reviewer_confidence(self): 150 | return self.REVIEWER_CONFIDENCE 151 | 152 | def get_soundness_correctness(self): 153 | return self.SOUNDNESS_CORRECTNESS 154 | 155 | def get_appropriateness(self): 
156 | return self.APPROPRIATENESS 157 | 158 | def get_impact(self): 159 | return self.IMPACT 160 | 161 | def get_originality(self): 162 | return self.ORIGINALITY 163 | 164 | def get_other_keys(self): 165 | return self.OTHER_KEYS 166 | 167 | def is_meta_review(self): 168 | return self.IS_META_REVIEW 169 | 170 | def get_title(self): 171 | return self.TITLE 172 | 173 | def get_date(self): 174 | return self.DATE 175 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/ScienceParse.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | 4 | class ScienceParse: 5 | """ 6 | A data structure for paper fields extracted by ScienceParse 7 | """ 8 | def __init__(self, title, abstract, sections, reference_titles, reference_venues, reference_years, reference_mention_contexts, 9 | reference_num_mentions, authors=None, emails = None, other_keys=None): 10 | self.title = title 11 | self.abstract = abstract 12 | self.sections = sections 13 | self.reference_titles = reference_titles 14 | self.reference_venues = reference_venues 15 | self.reference_years = reference_years 16 | self.reference_mention_contexts = reference_mention_contexts 17 | self.reference_num_mentions = reference_num_mentions 18 | self.authors = authors 19 | self.emails = emails 20 | 21 | def get_sections_dict(self): 22 | return self.sections 23 | 24 | def get_reference_title_dict(self): 25 | return self.reference_titles 26 | 27 | def get_reference_venues_dict(self): 28 | return self.reference_venues 29 | 30 | def get_reference_years_dict(self): 31 | return self.reference_years 32 | 33 | def get_reference_mention_contexts_dict(self): 34 | return self.reference_mention_contexts 35 | 36 | def get_reference_num_mentions_dict(self): 37 | return self.reference_num_mentions 38 | 39 | def get_num_references(self): 40 | return len(self.get_reference_years_dict()) 41 | 42 | def get_num_refmentions(self): 43 | num_refmentions = 0 44 | for refid in self.reference_num_mentions: 45 | num_refmentions = num_refmentions + self.reference_num_mentions[refid] 46 | return num_refmentions 47 | 48 | def get_most_recent_reference_year(self): 49 | most_recent = 0 50 | for refid in self.reference_years: 51 | if self.reference_years[refid] > most_recent: 52 | most_recent = self.reference_years[refid] 53 | return most_recent 54 | 55 | def get_avg_length_reference_mention_contexts(self): 56 | sum_length = 0.0 57 | for refid in self.reference_mention_contexts: 58 | sum_length = sum_length + len(self.reference_mention_contexts[refid]) 59 | avg_length = 0 60 | if len(self.reference_mention_contexts) > 0: 61 | avg_length = sum_length / len(self.reference_mention_contexts) 62 | return avg_length 63 | 64 | def get_paper_content(self): 65 | content = self.title + " " + self.abstract + " " + self.get_author_names_string() + " " + \ 66 | self.get_domains_from_emails() 67 | for sect_id in sorted(self.sections): 68 | # print("###",str(sect_id)) 69 | content = content + " " + self.sections[sect_id] 70 | content = re.sub("\n([0-9]*\n)+", "\n", content) 71 | return content 72 | 73 | def get_tagged_paper_content(self): 74 | content = self.get_paper_content() 75 | 76 | nlp = spacy.load('en', parser=False) 77 | 78 | doc = nlp(content) 79 | 80 | return " ".join([x.text+"_"+x.tag_ for x in doc]) 81 | 82 | def get_frequent_words_proportion(self, hfws, most_frequent_words, least_frequent_words): 83 | content = self.get_paper_content().split() 84 | 85 | n = 0 86 | t = 0 87 
| # print(str(most_frequent_words).encode('utf8')) 88 | for w in content: 89 | if w not in hfws and w not in least_frequent_words: 90 | t += 1 91 | n += w in most_frequent_words 92 | 93 | # print (n,len(content),1.*n/t) 94 | 95 | return 1.*n/t 96 | 97 | # #papers referred from -5 years from year of submission 98 | def get_num_recent_references(self, submission_year): 99 | num_recent_references = 0 100 | for refid in self.reference_years: 101 | if (submission_year - self.reference_years[refid] < 5): 102 | num_recent_references = num_recent_references + 1 103 | return num_recent_references 104 | 105 | # word offset of figure 1 106 | def get_word_offset_of_first_fig_reference(self): 107 | content_words = self.get_paper_content().split(" ") 108 | indices = [i for i, x in enumerate(content_words) if x == "Figure"] 109 | return indices[0] 110 | 111 | # num references to #figures 112 | def get_num_ref_to_figures(self): 113 | content_words = self.get_paper_content().split(" ") 114 | figure_indices = [i for i, x in enumerate(content_words) if x == "Figure"] 115 | return len(figure_indices) 116 | 117 | # num references to #tables 118 | def get_num_ref_to_tables(self): 119 | content_words = self.get_paper_content().split(" ") 120 | table_indices = [i for i, x in enumerate(content_words) if x == "Table"] 121 | return len(table_indices) 122 | 123 | # # of references to Section 124 | def get_num_ref_to_sections(self): 125 | content_words = self.get_paper_content().split(" ") 126 | section_indices = [i for i, x in enumerate(content_words) if x == "Section"] 127 | return len(section_indices) 128 | 129 | # related work at front/back 130 | # #unique words 131 | def get_num_uniq_words(self): 132 | return len(set(self.get_paper_content().split(" "))) 133 | 134 | # num of sections 135 | def get_num_sections(self): 136 | return len(self.sections) 137 | 138 | # avg length of sentences 139 | def get_avg_sentence_length(self): 140 | sentences = self.get_paper_content().split(". 
") 141 | sentence_lengths = [len(s.split(" ")) for s in sentences] 142 | return (1.0 * sum(sentence_lengths))/len(sentence_lengths) 143 | 144 | # whether paper has appendix 145 | def get_contains_appendix(self): 146 | content_words = self.get_paper_content().split(" ") 147 | figure_indices = [i for i, x in enumerate(content_words) if x == "Appendix"] 148 | return int(len(figure_indices) > 0) 149 | 150 | # publishing a dataset / code 151 | def get_contains_appendix(self): 152 | content_words = self.get_paper_content().split(" ") 153 | figure_indices = [i for i, x in enumerate(content_words) if x == "Appendix"] 154 | return int(len(figure_indices) > 0) 155 | 156 | # #authors 157 | def get_num_authors(self): 158 | if self.authors == None: 159 | return 0 160 | return len(self.authors) 161 | 162 | # get author names as a string 163 | def get_author_names_string(self): 164 | if self.authors == None: 165 | return "" 166 | return str.join(' ', self.authors) 167 | 168 | # get domains from emails 169 | def get_domains_from_emails(self): 170 | domains = [] 171 | for email in self.emails: 172 | domains.append(email.split('@')[1].replace(".", "_")) 173 | return str.join(' ', domains) 174 | 175 | # num references to equations 176 | def get_num_ref_to_equations(self): 177 | content_words = self.get_paper_content().split(" ") 178 | equation_indices = [i for i, x in enumerate(content_words) if x == "Equation"] 179 | return len(equation_indices) 180 | 181 | # num references to theorems 182 | def get_num_ref_to_theorems(self): 183 | content_words = self.get_paper_content().split(" ") 184 | theorem_indices = [i for i, x in enumerate(content_words) if x == "Theorem"] 185 | return len(theorem_indices) 186 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/ScienceParseReader.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import io 3 | import os 4 | import json 5 | from PeerRead.ScienceParse.ScienceParse import ScienceParse 6 | 7 | class ScienceParseReader: 8 | """ 9 | This class reads the output of the science parse library and stores it in theScienceParseclass 10 | """ 11 | 12 | @staticmethod 13 | def read_science_parse(paperid, title, abstract, scienceparse_dir): 14 | scienceparse_file = io.open(os.path.join(scienceparse_dir, '{0}.pdf.json'.format(paperid))) 15 | # scienceparse_file = io.open('%s%s.pdf.json'%(scienceparse_dir,paperid), "r", encoding="utf8") 16 | scienceparse_str = scienceparse_file.read() 17 | scienceparse_data = json.loads(scienceparse_str) 18 | 19 | #read scienceparse 20 | scienceparse_map = {} 21 | 22 | sections = {} 23 | reference_years = {} 24 | reference_titles = {} 25 | reference_venues = {} 26 | reference_mention_contexts = {} 27 | reference_num_mentions = {} 28 | 29 | name = scienceparse_data["name"] 30 | metadata = scienceparse_data["metadata"] 31 | 32 | if metadata["sections"] is not None: 33 | for sectid in range(len(metadata["sections"])): 34 | heading = metadata["sections"][sectid]["heading"] 35 | text = metadata["sections"][sectid]["text"] 36 | sections[str(heading)] = text 37 | 38 | for refid in range(len(metadata["references"])): 39 | reference_titles[refid] = metadata["references"][refid]["title"] 40 | reference_years[refid] = metadata["references"][refid]["year"] 41 | reference_venues[refid] = metadata["references"][refid]["venue"] 42 | 43 | for menid in range(len(metadata["referenceMentions"])): 44 | refid = 
metadata["referenceMentions"][menid]["referenceID"] 45 | context = metadata["referenceMentions"][menid]["context"] 46 | oldContext = reference_mention_contexts.get(refid, "") 47 | reference_mention_contexts[refid] = oldContext + "\t" + context 48 | count = reference_num_mentions.get(refid, 0) 49 | reference_num_mentions[refid] = count + 1 50 | 51 | authors = metadata["authors"] 52 | emails = metadata["emails"] 53 | #print(authors) 54 | #print(emails) 55 | 56 | science_parse = ScienceParse(title, abstract, sections, reference_titles, reference_venues, reference_years, reference_mention_contexts, reference_num_mentions, authors, emails) 57 | return science_parse 58 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/ScienceParse/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/PeerRead_hand_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | create (hand-authored and lexical) features for baselines classifiers and save to under dataset folder in each split 3 | """ 4 | 5 | import sys, os, random, glob 6 | 7 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 8 | from PeerRead.ScienceParse.Paper import Paper 9 | from PeerRead.ScienceParse.ScienceParseReader import ScienceParseReader 10 | 11 | 12 | def get_PeerRead_hand_features(paper): 13 | sp = paper.get_scienceparse() 14 | 15 | hand_features = {} 16 | 17 | hand_features["accepted"] = paper.get_accepted() 18 | 19 | hand_features["most_recent_reference_year"] = sp.get_most_recent_reference_year() - 2000 20 | hand_features["num_recent_references"] = sp.get_num_recent_references(2017) 21 | hand_features["num_references"] = sp.get_num_references() 22 | hand_features["num_refmentions"] = sp.get_num_refmentions() 23 | hand_features["avg_length_reference_mention_contexts"] = sp.get_avg_length_reference_mention_contexts() 24 | 25 | hand_features["num_ref_to_figures"] = sp.get_num_ref_to_figures() 26 | hand_features["num_ref_to_tables"] = sp.get_num_ref_to_tables() 27 | hand_features["num_ref_to_sections"] = sp.get_num_ref_to_sections() 28 | 29 | hand_features["num_uniq_words"] = sp.get_num_uniq_words() 30 | hand_features["num_sections"] = sp.get_num_sections() 31 | hand_features["avg_sentence_length"] = sp.get_avg_sentence_length() 32 | 33 | hand_features["contains_appendix"] = sp.get_contains_appendix() 34 | 35 | hand_features["title_length"] = paper.get_title_len() 36 | hand_features["num_authors"] = sp.get_num_authors() 37 | hand_features["num_ref_to_equations"] = sp.get_num_ref_to_equations() 38 | hand_features["num_ref_to_theorems"] = sp.get_num_ref_to_theorems() 39 | 40 | abstract = str.lower(paper.ABSTRACT) 41 | hand_features["abstract_contains_deep"] = "deep" in abstract 42 | hand_features["abstract_contains_neural"] = "neural" in abstract 43 | hand_features["abstract_contains_embedding"] = "embedding" in abstract 44 | 
hand_features["abstract_contains_outperform"] = "outperform" in abstract 45 | hand_features["abstract_contains_novel"] = "novel" in abstract 46 | hand_features["abstract_contains_state-of-the-art"] = \ 47 | "state-of-the-art" in abstract or "state of the art" in abstract 48 | 49 | title = str.lower(paper.TITLE) 50 | hand_features["title_contains_deep"] = "deep" in title 51 | hand_features["title_contains_neural"] = "neural" in title 52 | hand_features["title_contains_embedding"] = "embed" in title 53 | hand_features["title_contains_gan"] = ("gan" in title) or ("adversarial net" in title) 54 | 55 | return hand_features 56 | 57 | 58 | def main(args): 59 | 60 | paper_json_dir = args[1] # train/reviews 61 | scienceparse_dir = args[2] # train/parsed_pdfs 62 | 63 | 64 | ################################ 65 | # read reviews 66 | ################################ 67 | print('Reading reviews from...', paper_json_dir) 68 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(paper_json_dir))) 69 | papers = [] 70 | for paper_json_filename in paper_json_filenames: 71 | paper = Paper.from_json(paper_json_filename) 72 | paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(paper.ID, paper.TITLE, paper.ABSTRACT, 73 | scienceparse_dir) 74 | papers.append(paper) 75 | random.shuffle(papers) 76 | print('Total number of reviews', len(papers)) 77 | 78 | id = 1 79 | for p in papers: 80 | rec = int(p.get_accepted() == True) 81 | 82 | handy = get_PeerRead_hand_features(p) 83 | 84 | id += 1 85 | 86 | 87 | if __name__ == "__main__": 88 | main(sys.argv) 89 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/data_cleaning/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/clean_PeerRead.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import bert.tokenization as tokenization 5 | from PeerRead.data_cleaning.process_PeerRead_abstracts import clean_PeerRead_dataset 6 | 7 | dataset_names = ['acl_2017', 8 | 'arxiv.cs.ai_2007-2017', 9 | 'arxiv.cs.cl_2007-2017', 10 | 'arxiv.cs.lg_2007-2017', 11 | 'conll_2016', 12 | 'iclr_2017', 13 | 'nips_2013', 14 | 'nips_2014', 15 | 'nips_2015', 16 | 'nips_2016', 17 | 'nips_2017' 18 | ] 19 | 20 | dataset_paths = ['acl_2017', 21 | 'arxiv.cs.ai_2007-2017', 22 | 'arxiv.cs.cl_2007-2017', 23 | 'arxiv.cs.lg_2007-2017', 24 | 'conll_2016', 25 | 'iclr_2017', 26 | 'nips_2013-2017/2013', 27 | 'nips_2013-2017/2014', 28 | 'nips_2013-2017/2015', 29 | 'nips_2013-2017/2016', 30 | 'nips_2013-2017/2017' 31 | ] 32 | 33 | dataset_paths = dict(zip(dataset_names, dataset_paths)) 34 | 35 | dataset_years = {'acl_2017': 2017, 36 | 'conll_2016': 2016, 37 | 'iclr_2017': 2017, 38 | 'arxiv.cs.ai_2007-2017': None, 39 | 'arxiv.cs.cl_2007-2017': None, 40 | 'arxiv.cs.lg_2007-2017': None, 41 | 'nips_2013': 2013, 42 | 'nips_2014': 2014, 43 | 'nips_2015': 2015, 44 | 'nips_2016': 2016, 45 | 'nips_2017': 2017} 46 | 47 | # dataset_venues = {k: v for v,k in enumerate(dataset_names)} 48 | 49 | dataset_venues = {'acl_2017': 0, 50 | 'conll_2016': 1, 51 | 'iclr_2017': 2, 52 | 'nips_2013': 3, 53 | 'nips_2014': 3, 54 | 'nips_2015': 3, 55 | 'nips_2016': 3, 56 | 'nips_2017': 3, 57 | 'arxiv.cs.ai_2007-2017': 4, 58 
| 'arxiv.cs.cl_2007-2017': 5, 59 | 'arxiv.cs.lg_2007-2017': 6, 60 | } 61 | 62 | 63 | def main(): 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead') 66 | parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt') 67 | args = parser.parse_args() 68 | 69 | datasets_dir = args.datasets_dir 70 | tokenizer = tokenization.FullTokenizer( 71 | vocab_file=args.vocab_file, do_lower_case=True) 72 | 73 | def proc_dataset(dataset): 74 | all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all') 75 | review_json_dir = os.path.join(all_dir, 'reviews') 76 | parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs') 77 | 78 | venue = dataset_venues[dataset] 79 | year = dataset_years[dataset] 80 | 81 | out_dir = os.path.join(datasets_dir, 'proc') 82 | out_file = dataset + '.tf_record' 83 | max_abs_len = 250 84 | 85 | clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir, venue, year, out_dir, out_file, max_abs_len, 86 | tokenizer) 87 | 88 | # pool = mp.Pool(4) 89 | # pool.map(proc_dataset, dataset_names) 90 | 91 | for dataset in dataset_names: 92 | proc_dataset(dataset) 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/extra_vocab.py: -------------------------------------------------------------------------------- 1 | """ 2 | vv: wrote this to inspect what bert's tokenizer does with vocabulary terms it doesn't know. 3 | The answer is: it splits them into word pieces where it has embeddings for each piece. Example: 4 | 5 | tokenizer.tokenize('embedding') 6 | ['em', '##bed', '##ding'] 7 | 8 | tokenizer.convert_tokens_to_ids(['em', '##bed', '##ding']) 9 | [7861, 8270, 4667] 10 | 11 | Accordingly, the meaning of embedding can be learned so long as there's a suitably rich training corpus 12 | """ 13 | 14 | import argparse 15 | import glob 16 | import random 17 | 18 | import io 19 | import json 20 | 21 | import bert.tokenization as tokenization 22 | 23 | rng = random.Random(0) 24 | 25 | def main(): 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--review-json-dir', type=str, default=None) 29 | parser.add_argument('--vocab-file', type=str, default=None) 30 | 31 | args = parser.parse_args() 32 | 33 | tokenizer = tokenization.FullTokenizer( 34 | vocab_file=args.vocab_file, do_lower_case=True) 35 | 36 | review_json_dir = args.review_json_dir 37 | 38 | print('Reading reviews from...', review_json_dir) 39 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir))) 40 | 41 | paper_json_filename = paper_json_filenames[0] 42 | with io.open(paper_json_filename) as json_file: 43 | loaded = json.load(json_file) 44 | abstract = loaded['abstract'] 45 | print(abstract) 46 | tokens = tokenizer.tokenize(abstract) 47 | print(tokens) 48 | print(tokenizer.convert_tokens_to_ids(tokens)) 49 | 50 | # for idx, paper_json_filename in enumerate(paper_json_filenames): 51 | # with io.open(paper_json_filename) as json_file: 52 | # loaded = json.load(json_file) 53 | # 54 | # print(loaded['abstract']) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/scripts/clean_PeerRead.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Process all PeerRead data into tf_record format to feed 
into Bert 4 | 5 | PeerDir=../dat/PeerRead/ 6 | 7 | for dataset in $PeerDir*/; do 8 | echo $dataset 9 | # python -m data_cleaning.process_PeerRead_abstracts \ 10 | # --review-json-dir \ 11 | # --parsedpdf-json-dir \ 12 | # --out-dir \ 13 | # --out-file \ 14 | # --vocab_file \ 15 | # --max_abs_len 16 | done -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/scripts/clean_nips_prefix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PeerDir=../dat/PeerRead/nips_2013-2017 4 | PARSE_DIR=$PeerDir/2017/all/parsed_pdfs 5 | 6 | for pdf in $PARSE_DIR/*; do 7 | # echo $pdf 8 | mv $pdf $PARSE_DIR/"${pdf#*/pdfs}" 9 | done 10 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/scripts/merge_train_dev_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Each Peer read dataset is pre-divided into train/dev/test. Merge these into "all" 4 | 5 | #PeerDir=../dat/PeerRead 6 | PeerDir=../dat/PeerRead/nips_2013-2017 7 | 8 | for dir in $PeerDir*/; do 9 | for subdir in $dir*/; do 10 | echo $subdir; 11 | cp -RT $subdir/ $dir/all/ 12 | done 13 | done 14 | -------------------------------------------------------------------------------- /src/PeerRead/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/dataset/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/dataset/array_from_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | helpers to take samples from the dataset and turn them into numpy arrays 3 | (for ease of inspection and use with baselines) 4 | """ 5 | import argparse 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | import os 10 | try: 11 | import mkl_random as random 12 | except ImportError: 13 | import numpy.random as random 14 | 15 | import bert.tokenization as tokenization 16 | from PeerRead.dataset.dataset import make_input_fn_from_file, make_buzzy_based_simulated_labeler 17 | 18 | 19 | def dataset_fn_to_df(dataset_fn): 20 | 21 | params = {'batch_size': 1} 22 | dataset = dataset_fn(params) 23 | 24 | itr = dataset.make_one_shot_iterator() 25 | 26 | samples = [] 27 | 28 | for i in range(25000): 29 | try: 30 | sample = itr.get_next() 31 | for k in sample: 32 | sample[k] = sample[k].numpy()[0] 33 | samples += [sample] 34 | # print("year: {}".format(sample['year'])) 35 | except: 36 | print(i) 37 | break 38 | 39 | df = pd.DataFrame(samples) 40 | 41 | return df 42 | 43 | def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0, 44 | base_output_dir='../dat/sim/peerread_buzzytitle_based/'): 45 | 46 | labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed) 47 | 48 | num_splits = 10 49 | dev_splits = [0] 50 | test_splits = [0] 51 | 52 | # data_file = '../dat/reddit/proc.tf_record' 53 | # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt" 54 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True) 55 | 56 | input_dataset_from_filenames = make_input_fn_from_file(data_file, 57 | 250, 58 | num_splits, 59 | dev_splits, 60 | 
test_splits, 61 | tokenizer, 62 | is_training=False, 63 | filter_test=False, 64 | shuffle_buffer_size=25000, 65 | seed=seed, 66 | labeler=labeler) 67 | 68 | output_df = dataset_fn_to_df(input_dataset_from_filenames) 69 | output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'}) 70 | 71 | output_dir = os.path.join(base_output_dir, "mode{}".format(setting)) 72 | os.makedirs(output_dir, exist_ok=True) 73 | output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level)) 74 | 75 | output_df.to_csv(output_path, '\t') 76 | 77 | 78 | def main(): 79 | tf.enable_eager_execution() 80 | 81 | buzzy_title_based_sim_dfs(treat_strength=beta0, con_strength=beta1, noise_level=gamma, setting=mode, seed=0, 82 | base_output_dir=base_output_dir) 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument("--data-file", action="store", default='../dat/PeerRead/proc/arxiv-all.tf_record') 87 | parser.add_argument("--vocab-file", action="store", default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt') 88 | parser.add_argument("--base-output-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 89 | parser.add_argument("--mode", action="store", default="simple") 90 | parser.add_argument("--beta0", action="store", default='1.0') 91 | parser.add_argument("--beta1", action="store", default='1.0') 92 | parser.add_argument("--gamma", action="store", default='1.0') 93 | args = parser.parse_args() 94 | 95 | data_file = args.data_file 96 | vocab_file = args.vocab_file 97 | base_output_dir = args.base_output_dir 98 | mode = args.mode 99 | beta0 = float(args.beta0) 100 | beta1 = float(args.beta1) 101 | gamma = float(args.gamma) 102 | 103 | main() -------------------------------------------------------------------------------- /src/PeerRead/dataset/sentence_masking.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific PeerRead governing permissions and 14 | # limitations under the License. 15 | """Create masked LM TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | 23 | import tensorflow as tf 24 | 25 | 26 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 27 | ["index", "label"]) 28 | 29 | 30 | def create_masked_lm_predictions(token_ids, masked_lm_prob, max_predictions_per_seq, vocab, seed): 31 | """Creates the predictions for the masked LM objective. 
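Illustrative behaviour on a hypothetical 7-token sequence, assuming the random draw
happens to select w2 and w3 and max_predictions_per_seq = 3:

    token_ids           = [CLS] w1  w2     w3     [SEP] 0 0
    output_ids          = [CLS] w1  [MASK] [MASK] [SEP] 0 0
    masked_lm_positions = [2, 3, 0]            # padded out to max_predictions_per_seq
    masked_lm_ids       = [id(w2), id(w3), 0]
    masked_lm_weights   = [1.0, 1.0, 0.0]      # 0.0 marks the padding entry

[CLS], [SEP] and padding (token id 0) are never candidates for masking.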
32 | 33 | This should be essentially equivalent to the bits that Bert loads from pre-processed tfrecords 34 | 35 | Except: we just include masks instead of randomly letting the words through or randomly replacing 36 | """ 37 | 38 | basic_mask = tf.less( 39 | tf.random_uniform(token_ids.shape, minval=0, maxval=1, dtype=tf.float32, seed=seed), 40 | masked_lm_prob) 41 | 42 | # don't mask special characters or padding 43 | cand_indexes = tf.logical_and(tf.not_equal(token_ids, vocab["[CLS]"]), 44 | tf.not_equal(token_ids, vocab["[SEP]"])) 45 | cand_indexes = tf.logical_and(cand_indexes, tf.not_equal(token_ids, 0)) 46 | mask = tf.logical_and(cand_indexes, basic_mask) 47 | 48 | # truncate to max predictions for ease of padding 49 | masked_lm_positions = tf.where(mask) 50 | # TODO: it should be essentially impossible for me to see this bug (very unlikely), but I do... symptom of :( ? 51 | # very rare event: nothing gets picked for mask, causing an irritating bug 52 | # in this case, just mask the first candidate index 53 | mlm_shape = tf.shape(masked_lm_positions)[0] 54 | masked_lm_positions = tf.cond(mlm_shape > 1, 55 | lambda: masked_lm_positions, 56 | lambda: tf.where(cand_indexes)[0:2]) 57 | 58 | masked_lm_positions = tf.squeeze(masked_lm_positions)[0:max_predictions_per_seq] 59 | masked_lm_positions = tf.cast(masked_lm_positions, dtype=tf.int32) 60 | masked_lm_ids = tf.gather(token_ids, masked_lm_positions) 61 | 62 | mask = tf.cast( 63 | tf.scatter_nd(tf.expand_dims(masked_lm_positions, 1), tf.ones_like(masked_lm_positions), token_ids.shape), 64 | bool) 65 | 66 | output_ids = tf.where(mask, vocab["[MASK]"]*tf.ones_like(token_ids), token_ids) 67 | 68 | # pad out to max_predictions_per_seq 69 | masked_lm_weights = tf.ones_like(masked_lm_ids, dtype=tf.float32) # tracks padding 70 | add_pad = [[0, max_predictions_per_seq - tf.shape(masked_lm_positions)[0]]] 71 | masked_lm_weights = tf.pad(masked_lm_weights, add_pad, 'constant') 72 | masked_lm_positions = tf.pad(masked_lm_positions, add_pad, 'constant') 73 | masked_lm_ids = tf.pad(masked_lm_ids, add_pad, 'constant') 74 | 75 | return output_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights 76 | 77 | 78 | def main(_): 79 | pass 80 | 81 | 82 | if __name__ == "__main__": 83 | main() -------------------------------------------------------------------------------- /src/PeerRead/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/model/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/model/bert_multiclass.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper to check which categorical attributes of PeerRead are predictable from the text 3 | """ 4 | 5 | import tensorflow as tf 6 | import bert.modeling as modeling 7 | import bert.optimization as optimization 8 | from causal_bert.bert_unsupervised import get_masked_lm_output 9 | from causal_bert.logging import make_label_binary_prediction_summaries, binary_label_eval_metric_fn 10 | 11 | 12 | def _create_unsupervised_only_model(bert, bert_config, features): 13 | # PeerRead v. 
reddit inconsistency 14 | if "op_masked_lm_positions" in features: 15 | masked_lm_positions = features["op_masked_lm_positions"] 16 | masked_lm_ids = features["op_masked_lm_ids"] 17 | masked_lm_weights = features["op_masked_lm_weights"] 18 | else: 19 | masked_lm_positions = features["masked_lm_positions"] 20 | masked_lm_ids = features["masked_lm_ids"] 21 | masked_lm_weights = features["masked_lm_weights"] 22 | 23 | masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = get_masked_lm_output( 24 | bert_config, bert.get_sequence_output(), bert.get_embedding_table(), 25 | masked_lm_positions, masked_lm_ids, masked_lm_weights) 26 | return masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs 27 | 28 | 29 | def _make_feedforward_classifier(embedding, labels, num_labels, split, num_hidden_layers, extra_features=None, 30 | label_smoothing=0.01): 31 | regularizer = tf.contrib.layers.l2_regularizer(scale=1e-6) 32 | if extra_features is None: 33 | full_embedding = embedding 34 | else: 35 | full_embedding = tf.concat([embedding, extra_features], axis=1) 36 | 37 | if num_hidden_layers == 0: 38 | logits = tf.layers.dense(full_embedding, num_labels, activation=None, 39 | kernel_regularizer=regularizer, bias_regularizer=regularizer) 40 | 41 | else: 42 | layer = tf.layers.dense(full_embedding, 200, activation=tf.nn.elu) 43 | for _ in range(num_hidden_layers - 1): 44 | layer = tf.layers.dense(layer, 200, activation=tf.nn.elu, 45 | kernel_regularizer=regularizer, bias_regularizer=regularizer) 46 | 47 | if extra_features is None: 48 | final_embedding = layer 49 | else: 50 | final_embedding = tf.concat([layer, extra_features], axis=1) 51 | 52 | logits = tf.layers.dense(final_embedding, num_labels, activation=None, 53 | kernel_regularizer=regularizer, bias_regularizer=regularizer) 54 | 55 | with tf.name_scope("loss"): 56 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32, 57 | on_value=1. 
- label_smoothing, off_value=label_smoothing) 58 | log_probs = tf.nn.log_softmax(logits, axis=-1) 59 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 60 | censored_per_example_loss = split * per_example_loss 61 | loss = tf.reduce_sum(censored_per_example_loss) 62 | 63 | probabilities = tf.nn.softmax(logits, axis=-1)[:, 1] # P(T=1) 64 | 65 | return loss, per_example_loss, logits, probabilities 66 | 67 | 68 | def _get_getter(ema): 69 | def ema_getter(getter, name, *args, **kwargs): 70 | var = getter(name, *args, **kwargs) 71 | ema_var = ema.average(var) 72 | return ema_var # if ema_var else var 73 | 74 | return ema_getter 75 | 76 | 77 | def multiclass_model_fn_builder(bert_config, init_checkpoint, learning_rate, 78 | num_train_steps, num_warmup_steps, use_tpu, 79 | use_one_hot_embeddings, label_pred=True, unsupervised=False, 80 | polyak=False, use_extra_features=False): 81 | """Returns `model_fn` closure for TPUEstimator.""" 82 | 83 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 84 | """The `model_fn` for TPUEstimator.""" 85 | 86 | tf.logging.info("*** Features ***") 87 | for name in sorted(features.keys()): 88 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 89 | 90 | target_name = params['target_name'] 91 | num_labels = params['num_labels'] 92 | 93 | labels = features[target_name] 94 | 95 | # because reddit and peerread use slightly different text and pre-training structure 96 | if "op_token_ids" in features: 97 | token_mask = features["op_token_mask"] 98 | maybe_masked_token_ids = features["op_maybe_masked_input_ids"] 99 | else: 100 | token_mask = features["token_mask"] 101 | maybe_masked_token_ids = features["maybe_masked_input_ids"] 102 | 103 | index = features['index'] 104 | in_train = features['in_train'] 105 | in_dev = features['in_dev'] 106 | in_test = features['in_test'] 107 | 108 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 109 | 110 | # Predictive Model 111 | 112 | bert = modeling.BertModel( 113 | config=bert_config, 114 | is_training=is_training, 115 | input_ids=maybe_masked_token_ids, 116 | input_mask=token_mask, 117 | token_type_ids=None, 118 | use_one_hot_embeddings=use_one_hot_embeddings) 119 | 120 | masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = \ 121 | _create_unsupervised_only_model(bert, bert_config, features) 122 | 123 | bert_embedding = bert.get_pooled_output() 124 | 125 | label_loss, per_example_loss, logits, probabilities = \ 126 | _make_feedforward_classifier(bert_embedding, labels, num_labels, in_train, num_hidden_layers=0, 127 | extra_features=None, label_smoothing=0.01) 128 | 129 | tf.losses.add_loss(masked_lm_loss) 130 | tf.losses.add_loss(0.1 * label_loss) 131 | 132 | tf.summary.scalar('masked_lm_loss', masked_lm_loss, family='loss') 133 | tf.summary.scalar('label_loss', label_loss, family='loss') 134 | 135 | total_loss = masked_lm_loss + 0.1 * label_loss 136 | 137 | # some logging 138 | make_label_binary_prediction_summaries(per_example_loss, logits, labels, in_train, "train") 139 | make_label_binary_prediction_summaries(per_example_loss, logits, labels, in_dev, "dev") 140 | 141 | # pre-trained model loading 142 | tvars = tf.trainable_variables() 143 | initialized_variable_names = {} 144 | scaffold_fn = None 145 | if init_checkpoint: 146 | (assignment_map, initialized_variable_names 147 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 148 | if use_tpu: 149 | 150 | def tpu_scaffold(): 151 | 
tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 152 | return tf.train.Scaffold() 153 | 154 | scaffold_fn = tpu_scaffold 155 | else: 156 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 157 | 158 | tf.logging.info("**** Trainable Variables ****") 159 | for var in tvars: 160 | init_string = "" 161 | if var.name in initialized_variable_names: 162 | init_string = ", *INIT_FROM_CKPT*" 163 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 164 | init_string) 165 | 166 | output_spec = None 167 | if mode == tf.estimator.ModeKeys.TRAIN: 168 | 169 | # sgd_opt = tf.train.GradientDescentOptimizer(learning_rate) 170 | # train_op = sgd_opt.minimize(total_loss, global_step=tf.train.get_global_step()) 171 | 172 | train_op = optimization.create_optimizer( 173 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 174 | 175 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 176 | mode=mode, 177 | loss=total_loss, 178 | train_op=train_op, 179 | scaffold_fn=scaffold_fn) 180 | 181 | elif mode == tf.estimator.ModeKeys.EVAL: 182 | pass 183 | 184 | else: 185 | pass 186 | 187 | return output_spec 188 | 189 | return model_fn 190 | -------------------------------------------------------------------------------- /src/PeerRead/submit_scripts/run_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12 4 | DATA_FILE=../dat/PeerRead/proc/arxiv-all.tf_record 5 | OUTPUT_DIR=../output/PeerRead/local_test 6 | #INIT_DIR=../../output/unsupervised_PeerRead_embeddings/ 7 | #INIT_FILE=$INIT_DIR/model.ckpt-175000 8 | 9 | 10 | #rm -rf $OUTPUT_DIR 11 | 12 | python -m PeerRead.model.run_causal_bert \ 13 | --seed=0 \ 14 | --do_train=true \ 15 | --do_eval=false \ 16 | --do_predict=true \ 17 | --input_files_or_glob=$DATA_FILE \ 18 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 19 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 20 | --max_seq_length=250 \ 21 | --output_dir=$OUTPUT_DIR \ 22 | --train_batch_size=16 \ 23 | --learning_rate=3e-5 \ 24 | --num_warmup_steps 200 \ 25 | --num_train_steps=4500 \ 26 | --save_checkpoint_steps=3000 \ 27 | --unsupervised=True \ 28 | --label_pred=True \ 29 | --num_splits=10 \ 30 | --test_splits=0 \ 31 | --dev_splits=0 \ 32 | --simulated='real' \ 33 | --treatment='buzzy_title' 34 | # --init_checkpoint=${INIT_FILE} 35 | -------------------------------------------------------------------------------- /src/PeerRead/submit_scripts/run_unsupervised.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BERT_BASE_DIR=../../BERT_pre-trained/uncased_L-12_H-768_A-12 4 | DATA_FILE=../dat/PeerRead/proc/arxiv-all.tf_record 5 | OUTPUT_DIR=../../output/unsupervised_PeerRead_embeddings/ 6 | 7 | #rm -rf $OUTPUT_DIR 8 | python -m PeerRead.model.run_causal_bert \ 9 | --seed=0 \ 10 | --do_train=true \ 11 | --input_files_or_glob=${DATA_FILE} \ 12 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \ 13 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \ 14 | --output_dir=${OUTPUT_DIR} \ 15 | --max_seq_length=250 \ 16 | --train_batch_size=16 \ 17 | --learning_rate=3e-5 \ 18 | --num_warmup_steps 200 \ 19 | --num_train_steps=175000 \ 20 | --save_checkpoints_steps 5000 \ 21 | --keep_checkpoints 3 \ 22 | --unsupervised=True -------------------------------------------------------------------------------- /src/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/__init__.py -------------------------------------------------------------------------------- /src/bert/README: -------------------------------------------------------------------------------- 1 | Chunks of google's Bert code, https://github.com/google-research/bert 2 | 3 | pre-trained presumed to be in: 4 | '../../bert/pre-trained/uncased_L-12_H-768_A-12' -------------------------------------------------------------------------------- /src/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/bert/__init__.py -------------------------------------------------------------------------------- /src/bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 
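  # Worked example of the schedule above, assuming the values used in
  # PeerRead/submit_scripts/run_model.sh (init_lr=3e-5, num_warmup_steps=200,
  # num_train_steps=4500):
  #   step 100:    warmup, lr = 3e-5 * 100/200 = 1.5e-5
  #   step >= 200: linear decay, lr = 3e-5 * (1 - step/4500)
  #                e.g. step 2250 -> 1.5e-5, step 4500 -> 0.0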
59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 85 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 86 | 87 | def __init__(self, 88 | learning_rate, 89 | weight_decay_rate=0.0, 90 | beta_1=0.9, 91 | beta_2=0.999, 92 | epsilon=1e-6, 93 | exclude_from_weight_decay=None, 94 | name="AdamWeightDecayOptimizer"): 95 | """Constructs a AdamWeightDecayOptimizer.""" 96 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 97 | 98 | self.learning_rate = learning_rate 99 | self.weight_decay_rate = weight_decay_rate 100 | self.beta_1 = beta_1 101 | self.beta_2 = beta_2 102 | self.epsilon = epsilon 103 | self.exclude_from_weight_decay = exclude_from_weight_decay 104 | 105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 106 | """See base class.""" 107 | assignments = [] 108 | for (grad, param) in grads_and_vars: 109 | if grad is None or param is None: 110 | continue 111 | 112 | param_name = self._get_variable_name(param.name) 113 | 114 | m = tf.get_variable( 115 | name=param_name + "/adam_m", 116 | shape=param.shape.as_list(), 117 | dtype=tf.float32, 118 | trainable=False, 119 | initializer=tf.zeros_initializer()) 120 | v = tf.get_variable( 121 | name=param_name + "/adam_v", 122 | shape=param.shape.as_list(), 123 | dtype=tf.float32, 124 | trainable=False, 125 | initializer=tf.zeros_initializer()) 126 | 127 | # Standard Adam update. 128 | next_m = ( 129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 130 | next_v = ( 131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 132 | tf.square(grad))) 133 | 134 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want ot decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
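      # Putting the update together (lr = self.learning_rate, wd = self.weight_decay_rate):
      #   update     = next_m / (sqrt(next_v) + epsilon)   # Adam step, no bias correction
      #   update    += wd * param                          # only for params not excluded below
      #   next_param = param - lr * update
      # The decay acts directly on the weights ("decoupled" weight decay, as in AdamW),
      # rather than being added to the loss as an L2 penalty.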
143 | if self._do_use_weight_decay(param_name): 144 | update += self.weight_decay_rate * param 145 | 146 | update_with_lr = self.learning_rate * update 147 | 148 | next_param = param - update_with_lr 149 | 150 | assignments.extend( 151 | [param.assign(next_param), 152 | m.assign(next_m), 153 | v.assign(next_v)]) 154 | return tf.group(*assignments, name=name) 155 | 156 | def _do_use_weight_decay(self, param_name): 157 | """Whether to use L2 weight decay for `param_name`.""" 158 | if not self.weight_decay_rate: 159 | return False 160 | if self.exclude_from_weight_decay: 161 | for r in self.exclude_from_weight_decay: 162 | if re.search(r, param_name) is not None: 163 | return False 164 | return True 165 | 166 | def _get_variable_name(self, param_name): 167 | """Get the variable name from the tensor name.""" 168 | m = re.match("^(.*):\\d+$", param_name) 169 | if m is not None: 170 | param_name = m.group(1) 171 | return param_name 172 | -------------------------------------------------------------------------------- /src/causal_bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/causal_bert/__init__.py -------------------------------------------------------------------------------- /src/causal_bert/logging.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def batch_random_agreement(labels, predictions, weights, name=None): 5 | """ Computes the probability of random agreement between the 6 | labels and predictions assuming independence. 7 | 8 | Parameters 9 | ---------- 10 | labels: a tensor of any shape taking values in {0, 1}. 11 | predictions: a tensor of the same shape as labels taking values in {0, 1}. 12 | weights: a tensor that can be broadcasted to labels. 13 | name: an optional name for the operation. 14 | 15 | Returns 16 | ------- 17 | random_agreement: a scalar tensor representing the probability of random 18 | agreement. 19 | """ 20 | with tf.name_scope(name, 'batch_random_agreement', [labels, predictions, weights]): 21 | weights_mean = tf.reduce_mean(weights) 22 | weights_mean = tf.where(tf.not_equal(weights_mean, 0), weights_mean, 1) 23 | 24 | labels = tf.to_float(labels) 25 | predictions = tf.to_float(predictions) 26 | 27 | p_labels = tf.metrics.mean(labels * weights / weights_mean)[1] 28 | p_predictions = tf.metrics.mean(predictions * weights / weights_mean)[1] 29 | 30 | random_agreement = tf.identity( 31 | p_labels * p_predictions + (1 - p_labels) * (1 - p_predictions), 32 | name='random_agreement') 33 | 34 | print(random_agreement.name) 35 | 36 | return random_agreement 37 | 38 | 39 | def batch_kappa(labels, predictions, weights, name=None): 40 | """ Computes Cohen's kappa on the given batch of predictions. 41 | 42 | Parameters 43 | ---------- 44 | labels: a tensor of any shape taking values in {0, 1}. 45 | predictions: a tensor of the same shape as labels taking values in {0, 1}. 46 | weights: a tensor that can be broadcasted to labels. 47 | name: an optional name for the operation. 48 | 49 | Returns 50 | ------- 51 | kappa: a scalar tensor representing the Kappa measure of agreement 52 | between labels and predictions. 
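    Concretely, this is computed as

        kappa = (accuracy - p_e) / (1 - p_e)

    where p_e = p_labels * p_predictions + (1 - p_labels) * (1 - p_predictions) is the
    chance-agreement probability from batch_random_agreement above. For example,
    accuracy = 0.8 with p_e = 0.5 gives kappa = 0.6.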
53 | """ 54 | with tf.name_scope(name, 'batch_kappa', [labels, predictions, weights]): 55 | accuracy = tf.metrics.accuracy(labels, predictions, weights=weights)[1] 56 | random_agreement = batch_random_agreement(labels, predictions, weights) 57 | 58 | # hack for small batch sizes 59 | random_agreement = tf.clip_by_value(random_agreement, 0.001, 0.999) 60 | 61 | kappa = tf.divide( 62 | accuracy - random_agreement, 1 - random_agreement, 63 | name='kappa') 64 | 65 | return kappa 66 | 67 | 68 | def make_label_binary_prediction_summaries(per_example_loss, logits, label_ids, split, family): 69 | with tf.name_scope("summary"+"/"+family): 70 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32, name='predictions') 71 | 72 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels') 73 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 74 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 75 | kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 76 | 77 | loss = tf.metrics.mean(per_example_loss, weights=split) 78 | # censored_per_example_loss = split * per_example_loss 79 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split) 80 | 81 | tf.summary.scalar('accuracy', accuracy[1], family=family) 82 | tf.summary.scalar('precision', precision[1], family=family) 83 | tf.summary.scalar('recall', recall[1], family=family) 84 | tf.summary.scalar('kappa', kappa, family=family) 85 | tf.summary.scalar('loss', loss[1], family=family) 86 | 87 | 88 | def make_label_multiclass_prediction_summaries(per_example_loss, logits, one_hot_label, split, family): 89 | with tf.name_scope("summary"+"/"+family): 90 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32, name='predictions') 91 | label_ids = tf.argmax(one_hot_label, axis=-1, output_type=tf.int32) 92 | 93 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels') 94 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 95 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 96 | kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 97 | 98 | loss = tf.metrics.mean(per_example_loss, weights=split) 99 | # censored_per_example_loss = split * per_example_loss 100 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split) 101 | 102 | tf.summary.scalar('accuracy', accuracy[1], family=family) 103 | tf.summary.scalar('precision', precision[1], family=family) 104 | tf.summary.scalar('recall', recall[1], family=family) 105 | tf.summary.scalar('kappa', kappa, family=family) 106 | tf.summary.scalar('loss', loss[1], family=family) 107 | 108 | 109 | 110 | def make_label_regression_prediction_summaries(per_example_loss, split, family): 111 | with tf.name_scope("summary"+"/"+family): 112 | 113 | loss = tf.metrics.mean(per_example_loss, weights=split) 114 | # censored_per_example_loss = split * per_example_loss 115 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split) 116 | 117 | tf.summary.scalar('loss', loss[1], family=family) 118 | 119 | 120 | def cont_label_eval_metric_fn(per_example_loss, outcome, split=None, family=''): 121 | loss = tf.metrics.mean(per_example_loss, weights=split) 122 | 123 | return { 124 | family+"/eval_loss": loss 125 | } 126 | 127 | 128 | def 
binary_label_eval_metric_fn(per_example_loss, label_ids, logits, split=None, family=''): 129 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 130 | 131 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split) 132 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 133 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 134 | # kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 135 | loss = tf.metrics.mean(per_example_loss, weights=split) 136 | 137 | return { 138 | family+"/eval_accuracy": accuracy, 139 | family+"/eval_precision": precision, 140 | family+"/eval_recall": recall, 141 | family+"/eval_loss": loss 142 | } 143 | 144 | 145 | def multiclass_label_eval_metric_fn(per_example_loss, logits, one_hot_label, split=None, family=''): 146 | 147 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 148 | label_ids = tf.argmax(one_hot_label, axis=-1, output_type=tf.int32) 149 | 150 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels') 151 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 152 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 153 | # kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 154 | loss = tf.metrics.mean(per_example_loss, weights=split) 155 | 156 | return { 157 | family+"/eval_accuracy": accuracy, 158 | family+"/eval_precision": precision, 159 | family+"/eval_recall": recall, 160 | family+"/eval_loss": loss 161 | } 162 | 163 | 164 | def unsupervised_eval_metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 165 | masked_lm_weights): 166 | """Computes the loss and accuracy of the model.""" 167 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs, 168 | [-1, masked_lm_log_probs.shape[-1]]) 169 | masked_lm_predictions = tf.argmax( 170 | masked_lm_log_probs, axis=-1, output_type=tf.int32) 171 | masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) 172 | masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) 173 | masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) 174 | masked_lm_accuracy = tf.metrics.accuracy( 175 | labels=masked_lm_ids, 176 | predictions=masked_lm_predictions, 177 | weights=masked_lm_weights) 178 | masked_lm_mean_loss = tf.metrics.mean( 179 | values=masked_lm_example_loss, weights=masked_lm_weights) 180 | 181 | return { 182 | "masked_lm_accuracy": masked_lm_accuracy, 183 | "masked_lm_loss": masked_lm_mean_loss, 184 | } -------------------------------------------------------------------------------- /src/lda_baseline/helpers.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | 9 | class LemmaTokenizer(object): 10 | def __init__(self): 11 | self.wnl = WordNetLemmatizer() 12 | def __call__(self, articles): 13 | stop = stopwords.words('english') 14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop] 15 | 16 | def filter_by_subreddit(reddit, subs=None): 17 | if not subs: 18 | return reddit.index.values 19 | else: 20 
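        # keep only rows whose subreddit is in `subs`; the index values returned here
        # are later used (e.g. in reddit_fit_topics.main) to slice the term-count matrix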
| return reddit[reddit.subreddit.isin(subs)].index.values 21 | 22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.001): 23 | from nltk.corpus import stopwords 24 | ''' 25 | From a list of documents raw text build a matrix DxV 26 | D: number of docs 27 | V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs 28 | ''' 29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0) 30 | corpus = count_vect.fit_transform(documents) 31 | vocabulary = count_vect.get_feature_names() 32 | 33 | return corpus,vocabulary,count_vect 34 | 35 | def assign_dev_split(num_docs, percentage=0.05): 36 | indices = np.arange(num_docs) 37 | np.random.shuffle(indices) 38 | size = int(indices.shape[0]*percentage) 39 | dev = indices[:size] 40 | return dev 41 | 42 | def learn_topics(X, X_dev, K=50): 43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1) 44 | print("Fitting", K, "topics...") 45 | lda.fit(X) 46 | score = lda.perplexity(X_dev) 47 | print("Log likelihood:", score) 48 | topics = lda.components_ 49 | return score, lda, topics 50 | 51 | def show_topics(vocab, topics, n_words=20): 52 | topic_keywords = [] 53 | for topic_weights in topics: 54 | top_keyword_locs = (-topic_weights).argsort()[:n_words] 55 | topic_keywords.append(vocab.take(top_keyword_locs)) 56 | 57 | df_topic_keywords = pd.DataFrame(topic_keywords) 58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])] 59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])] 60 | return df_topic_keywords 61 | 62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'): 63 | filtered_indices = filtered_df[on].values 64 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 65 | embeddings = doc_embeddings[doc_idx, :] 66 | return embeddings 67 | 68 | def make_index_mapping(df, on='post_index', convert_to_int=True): 69 | if on=='index': 70 | indices = df.index.values 71 | else: 72 | indices = df[on].values 73 | 74 | if convert_to_int: 75 | return {int(ind):i for (i,ind) in enumerate(indices)} 76 | 77 | return {ind:i for (i,ind) in enumerate(indices)} 78 | 79 | def assign_split(df, num_splits=10, col_to_add='split'): 80 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 81 | return df 82 | -------------------------------------------------------------------------------- /src/lda_baseline/peerread_fit_topics.py: -------------------------------------------------------------------------------- 1 | from .helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics, filter_by_subreddit 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | from scipy import sparse 6 | import argparse 7 | import sys 8 | 9 | def load_peerread(path='../dat/PeerRead/'): 10 | return pd.read_csv(path + 'proc_abstracts.csv') 11 | 12 | 13 | def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'): 14 | count_filename = path + 'term_counts' 15 | vocab_filename = path + 'vocab' 16 | 17 | if os.path.exists(count_filename + '.npz') and not force_redo: 18 | return sparse.load_npz(count_filename + '.npz'), np.load(vocab_filename + '.npy') 19 | 20 | post_docs = df[text_col].values 21 | counts, vocab, _ = tokenize_documents(post_docs) 22 | sparse.save_npz(count_filename, counts) 23 | np.save(vocab_filename, vocab) 24 | return counts, np.array(vocab) 25 | 26 | def main(): 27 | if not 
os.path.exists(os.path.join(out_dir, 'topics.npy')) or redo_lda: 28 | if dat_dir: 29 | peerread = load_peerread(path=dat_dir) 30 | terms, vocab = load_term_counts(peerread, path=dat_dir, force_redo=redo_proc) 31 | else: 32 | peerread = load_peerread() 33 | terms, vocab = load_term_counts(peerread, force_redo=redo_proc) 34 | 35 | N = terms.shape[0] 36 | indices = np.arange(N) 37 | dev_idx = assign_dev_split(N) 38 | train_idx = np.setdiff1d(indices, dev_idx) 39 | X_tr = terms[train_idx, :] 40 | X_dev = terms[dev_idx, :] 41 | K_vals = [50] 42 | validation_scores = np.zeros(len(K_vals)) 43 | all_topics = [] 44 | models = [] 45 | for i,k in enumerate(K_vals): 46 | score, lda_obj, topics = learn_topics(X_tr, X_dev, K=k) 47 | validation_scores[i] = score 48 | all_topics.append(topics) 49 | models.append(lda_obj) 50 | k_idx = np.argsort(validation_scores)[0]#[-1] 51 | best_k = K_vals[k_idx] 52 | best_topics = all_topics[k_idx] 53 | best_model = models[k_idx] 54 | best_doc_prop = best_model.transform(terms) 55 | np.save(os.path.join(out_dir, 'topics'), best_topics) 56 | np.save(os.path.join(out_dir, 'document_proportions'), best_doc_prop) 57 | else: 58 | best_topics = np.load(os.path.join(out_dir, 'topics.npy')) 59 | vocab = np.load(os.path.join(out_dir, 'vocab.npy')) 60 | 61 | print("Best topic") 62 | topics = show_topics(vocab, best_topics, n_words=10) 63 | print(topics) 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--dat-dir", action="store", default=None) 68 | parser.add_argument("--out-dir", action="store", default="../dat/PeerRead/") 69 | parser.add_argument("--redo-lda", action="store_true") 70 | parser.add_argument("--redo-proc", action="store_true") 71 | parser.add_argument("--test", action="store_true") 72 | args = parser.parse_args() 73 | out_dir = args.out_dir 74 | redo_lda = args.redo_lda 75 | redo_proc = args.redo_proc 76 | dat_dir = args.dat_dir 77 | 78 | main() -------------------------------------------------------------------------------- /src/lda_baseline/peerread_get_abstracts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple pre-processing for PeerRead papers. 
3 | Takes in JSON formatted data from ScienceParse and outputs a tfrecord 4 | 5 | 6 | Reference example: 7 | https://github.com/tensorlayer/tensorlayer/blob/9528da50dfcaf9f0f81fba9453e488a1e6c8ee8f/examples/data_process/tutorial_tfrecord3.py 8 | """ 9 | 10 | import argparse 11 | import glob 12 | import os 13 | import random 14 | import pandas as pd 15 | import io 16 | import json 17 | from dateutil.parser import parse as parse_date 18 | from PeerRead.ScienceParse.Paper import Paper 19 | 20 | rng = random.Random(0) 21 | 22 | 23 | def process_json_paper(paper_json_filename, scienceparse_dir, tokenizer): 24 | paper = Paper.from_json(paper_json_filename) 25 | return paper.ABSTRACT 26 | 27 | 28 | def output_PeerRead_text(review_json_dir, parsedpdf_json_dir, 29 | out_dir, out_file): 30 | 31 | if not os.path.exists(out_dir): 32 | os.makedirs(out_dir) 33 | 34 | paper_data = [] 35 | print('Reading reviews from...', review_json_dir) 36 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir))) 37 | for idx, paper_json_filename in enumerate(paper_json_filenames): 38 | paper = Paper.from_json(paper_json_filename) 39 | paper_data.append([paper.ID, paper.ABSTRACT]) 40 | 41 | df = pd.DataFrame(paper_data, columns=['paper_id', 'abstract_text']) 42 | df.to_csv(out_dir + 'proc_abstracts.csv') 43 | 44 | def main(): 45 | parser = argparse.ArgumentParser() 46 | 47 | parser.add_argument('--review-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/reviews') 48 | parser.add_argument('--parsedpdf-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/parsed_pdfs') 49 | parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/') 50 | parser.add_argument('--out-file', type=str, default='proc_text.csv') 51 | 52 | args = parser.parse_args() 53 | 54 | output_PeerRead_text(args.review_json_dir, args.parsedpdf_json_dir, 55 | args.out_dir, args.out_file) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /src/lda_baseline/peerread_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.ate import ate_estimates 2 | from .peerread_fit_topics import load_peerread 3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from sklearn.linear_model import LogisticRegression, LinearRegression 8 | from sklearn.metrics import mean_squared_error as mse 9 | import argparse 10 | import sys 11 | from scipy.special import logit 12 | 13 | def compute_ground_truth_treatment_effect(df): 14 | y1 = df['y1'] 15 | y0 = df['y0'] 16 | return y1.mean() - y0.mean() 17 | 18 | def get_log_outcomes(outcomes): 19 | #relu 20 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 21 | return np.log(outcomes) 22 | 23 | def predict_expected_outcomes(model, doc_embeddings): 24 | features = logit(doc_embeddings) 25 | return model.predict_proba(features)[:,1] 26 | 27 | def fit_conditional_expected_outcomes(outcomes, doc_embeddings): 28 | model = LogisticRegression(solver='liblinear') 29 | features = logit(doc_embeddings) 30 | model.fit(features, outcomes) 31 | if verbose: 32 | print("Training accuracy:", model.score(features, outcomes)) 33 | return model 34 | 35 | def predict_treatment_probability(labels, doc_embeddings): 36 | model = LogisticRegression(solver='liblinear') 37 | features = logit(doc_embeddings) 38 | model.fit(features, labels) 39 | 
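    # The fitted logistic regression serves as the propensity-score model below:
    # predict_proba(features)[:, 1] estimates the probability of treatment given the
    # (logit-transformed) LDA document proportions.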
if verbose: 40 | print("Training accuracy:", model.score(features, labels)) 41 | treatment_probability = model.predict_proba(features)[:,1] 42 | return treatment_probability 43 | 44 | def load_simulated_data(): 45 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 46 | return sim_df 47 | 48 | def load_document_proportions(path='../dat/PeerRead/'): 49 | return np.load(path + 'document_proportions.npy') 50 | 51 | def main(): 52 | peerread = load_peerread() 53 | indices = peerread['paper_id'].values 54 | index_mapping = make_index_mapping(peerread, on='index') 55 | 56 | if not dat_dir: 57 | doc_embeddings = load_document_proportions() 58 | else: 59 | doc_embeddings = load_document_proportions(path=dat_dir) 60 | 61 | sim_df = load_simulated_data() 62 | num_reps = 10 63 | mean_estimates = {} 64 | 65 | for rep in range(num_reps): 66 | bootstrap_sim_df = assign_split(sim_df, num_splits=2) 67 | bootstrap_sim_df = bootstrap_sim_df[bootstrap_sim_df.split==0] 68 | treatment_labels = bootstrap_sim_df.treatment.values 69 | filtered_doc_embeddings = filter_document_embeddings(bootstrap_sim_df, doc_embeddings, index_mapping, on='id') 70 | treatment_probability = predict_treatment_probability(treatment_labels, filtered_doc_embeddings) 71 | 72 | treated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==1] 73 | untreated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==0] 74 | 75 | all_outcomes = bootstrap_sim_df.outcome.values 76 | outcomes_st_treated = treated_sim.outcome.values 77 | outcomes_st_not_treated = untreated_sim.outcome.values 78 | 79 | doc_embed_st_treated = filter_document_embeddings(treated_sim, doc_embeddings, index_mapping, on='id') 80 | doc_embed_st_not_treated = filter_document_embeddings(untreated_sim, doc_embeddings, index_mapping, on='id') 81 | 82 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, doc_embed_st_treated) 83 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, doc_embed_st_not_treated) 84 | 85 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, filtered_doc_embeddings) 86 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, filtered_doc_embeddings) 87 | 88 | estimates = ate_estimates(expected_outcome_st_not_treated, expected_outcome_st_treated, 89 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03) 90 | 91 | for est, ate in estimates.items(): 92 | if est in mean_estimates: 93 | mean_estimates[est].append(ate) 94 | else: 95 | mean_estimates[est] = [ate] 96 | 97 | ground_truth_ate = compute_ground_truth_treatment_effect(sim_df) 98 | mean_estimates.update({'ground_truth_ate':ground_truth_ate}) 99 | if verbose: 100 | for est, ates in mean_estimates.items(): 101 | print(est, np.mean(ates), np.std(ates)) 102 | else: 103 | config = ';'.join([str(mode)] + params) 104 | log_file = os.path.join(sim_dir, 'two-stage-lda-estimates.out') 105 | with open(log_file, 'a') as h: 106 | h.write(config + '\n') 107 | for est, ates in mean_estimates.items(): 108 | h.write(est + ',' + str(np.mean(ates)) + ',' + str(np.std(ates)) + '\n') 109 | 110 | 111 | if __name__ == '__main__': 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--dat-dir", action="store", default=None) 114 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 115 | parser.add_argument("--mode", action="store", default="simple") 116 | parser.add_argument("--params", action="store", 
default="1.0") 117 | parser.add_argument("--verbose", action='store_true') 118 | args = parser.parse_args() 119 | 120 | sim_dir = args.sim_dir 121 | dat_dir = args.dat_dir 122 | verbose = args.verbose 123 | params = args.params 124 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0' 125 | mode = args.mode 126 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv" 127 | 128 | main() -------------------------------------------------------------------------------- /src/lda_baseline/reddit_fit_topics.py: -------------------------------------------------------------------------------- 1 | from reddit.data_cleaning.reddit_posts import load_reddit 2 | from .helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics, filter_by_subreddit 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | from scipy import sparse 7 | import argparse 8 | import sys 9 | 10 | def load_term_counts(reddit, path='../dat/reddit/', force_redo=False): 11 | count_filename = path + 'term_counts' 12 | vocab_filename = path + 'vocab' 13 | 14 | if os.path.exists(count_filename + '.npz') and not force_redo: 15 | return sparse.load_npz(count_filename + '.npz'), np.load(vocab_filename + '.npy') 16 | 17 | post_docs = reddit['post_text'].values 18 | counts, vocab, _ = tokenize_documents(post_docs) 19 | sparse.save_npz(count_filename, counts) 20 | np.save(vocab_filename, vocab) 21 | return counts, np.array(vocab) 22 | 23 | def main(): 24 | if not os.path.exists(os.path.join(out_dir, 'topics.npy')) or redo_lda: 25 | 26 | subreddits = {'keto', 'OkCupid', 'childfree'} 27 | reddit = load_reddit() 28 | filtered_indices = filter_by_subreddit(reddit, subs=subreddits) 29 | 30 | if dat_dir: 31 | terms, vocab = load_term_counts(reddit, path=dat_dir, force_redo=redo_proc) 32 | else: 33 | terms, vocab = load_term_counts(reddit, force_redo=redo_proc) 34 | 35 | terms = terms[filtered_indices, :] 36 | N = terms.shape[0] 37 | indices = np.arange(N) 38 | dev_idx = assign_dev_split(N) 39 | train_idx = np.setdiff1d(indices, dev_idx) 40 | X_tr = terms[train_idx, :] 41 | X_dev = terms[dev_idx, :] 42 | print(dev_idx.shape) 43 | 44 | K_vals = [100] 45 | validation_scores = np.zeros(len(K_vals)) 46 | all_topics = [] 47 | models = [] 48 | for i,k in enumerate(K_vals): 49 | score, lda_obj, topics = learn_topics(X_tr, X_dev, K=k) 50 | validation_scores[i] = score 51 | all_topics.append(topics) 52 | models.append(lda_obj) 53 | k_idx = np.argsort(validation_scores)[0]#[-1] 54 | best_k = K_vals[k_idx] 55 | best_topics = all_topics[k_idx] 56 | best_model = models[k_idx] 57 | best_doc_prop = best_model.transform(terms) 58 | np.save(os.path.join(out_dir, 'topics'), best_topics) 59 | np.save(os.path.join(out_dir, 'document_proportions'), best_doc_prop) 60 | else: 61 | best_topics = np.load(os.path.join(out_dir, 'topics.npy')) 62 | vocab = np.load(os.path.join(out_dir, 'vocab.npy')) 63 | 64 | # print("Best topic") 65 | # topics = show_topics(vocab, best_topics, n_words=10) 66 | # print(topics) 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument("--dat-dir", action="store", default=None) 71 | parser.add_argument("--out-dir", action="store", default="../dat/reddit/") 72 | parser.add_argument("--redo-lda", action="store_true") 73 | parser.add_argument("--redo-proc", action="store_true") 74 | parser.add_argument("--test", action="store_true") 75 | args = parser.parse_args() 76 | out_dir = args.out_dir 77 | redo_lda = args.redo_lda 78 | redo_proc = args.redo_proc 
79 | dat_dir = args.dat_dir 80 | test = args.test 81 | 82 | main() -------------------------------------------------------------------------------- /src/lda_baseline/reddit_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed 3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from sklearn.linear_model import LogisticRegression, LinearRegression 8 | from sklearn.metrics import mean_squared_error as mse 9 | import argparse 10 | import sys 11 | from scipy.special import logit 12 | 13 | def get_log_outcomes(outcomes): 14 | #relu 15 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 16 | return np.log(outcomes) 17 | 18 | def predict_expected_outcomes(model, doc_embeddings): 19 | features = logit(doc_embeddings) 20 | return model.predict(features) 21 | 22 | def fit_conditional_expected_outcomes(outcomes, doc_embeddings): 23 | model = LinearRegression() 24 | features = logit(doc_embeddings) 25 | model.fit(features, outcomes) 26 | predict = model.predict(features) 27 | if verbose: 28 | print("Training MSE:", mse(outcomes, predict)) 29 | return model 30 | 31 | def predict_treatment_probability(labels, doc_embeddings): 32 | model = LogisticRegression(solver='liblinear') 33 | features = logit(doc_embeddings) 34 | model.fit(features, labels) 35 | if verbose: 36 | print("Training accuracy:", model.score(features, labels)) 37 | treatment_probability = model.predict_proba(features)[:,1] 38 | return treatment_probability 39 | 40 | def load_simulated_data(): 41 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 42 | sim_df = sim_df.rename(columns={'index':'post_index'}) 43 | return sim_df 44 | 45 | def load_document_proportions(path='../dat/reddit/'): 46 | return np.load(path + 'document_proportions.npy') 47 | 48 | def main(): 49 | reddit = load_reddit_processed() 50 | if subs: 51 | reddit = reddit[reddit.subreddit.isin(subs)] 52 | 53 | index_mapping = make_index_mapping(reddit, on='orig_index') 54 | if not dat_dir: 55 | doc_embeddings = load_document_proportions() 56 | else: 57 | doc_embeddings = load_document_proportions(path=dat_dir) 58 | 59 | sim_df = load_simulated_data() 60 | num_reps = 10 61 | mean_estimates = {} 62 | 63 | for rep in range(num_reps): 64 | bootstrap_sim_df = assign_split(sim_df, num_splits=2) 65 | bootstrap_sim_df = bootstrap_sim_df[bootstrap_sim_df.split==0] 66 | treatment_labels = bootstrap_sim_df.treatment.values 67 | filtered_doc_embeddings = filter_document_embeddings(bootstrap_sim_df, doc_embeddings, index_mapping) 68 | treatment_probability = predict_treatment_probability(treatment_labels, filtered_doc_embeddings) 69 | 70 | treated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==1] 71 | untreated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==0] 72 | 73 | all_outcomes = bootstrap_sim_df.outcome.values 74 | outcomes_st_treated = treated_sim.outcome.values 75 | outcomes_st_not_treated = untreated_sim.outcome.values 76 | 77 | doc_embed_st_treated = filter_document_embeddings(treated_sim, doc_embeddings, index_mapping) 78 | doc_embed_st_not_treated = filter_document_embeddings(untreated_sim, doc_embeddings, index_mapping) 79 | 80 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, doc_embed_st_treated) 81 | model_outcome_st_not_treated = 
fit_conditional_expected_outcomes(outcomes_st_not_treated, doc_embed_st_not_treated) 82 | 83 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, filtered_doc_embeddings) 84 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, filtered_doc_embeddings) 85 | 86 | estimates = att_estimates(expected_outcome_st_not_treated, expected_outcome_st_treated, 87 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean()) 88 | 89 | for est, ate in estimates.items(): 90 | if est in mean_estimates: 91 | mean_estimates[est].append(ate) 92 | else: 93 | mean_estimates[est] = [ate] 94 | if verbose: 95 | for est, ates in mean_estimates.items(): 96 | print(est, np.mean(ates), np.std(ates)) 97 | else: 98 | config = ';'.join([str(mode)] + params) 99 | log_file = os.path.join(sim_dir, 'two-stage-lda-estimates.out') 100 | with open(log_file, 'a') as h: 101 | h.write(config + '\n') 102 | for est, ates in mean_estimates.items(): 103 | h.write(est + ',' + str(np.mean(ates)) + ',' + str(np.std(ates)) + '\n') 104 | 105 | 106 | if __name__ == '__main__': 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument("--dat-dir", action="store", default=None) 109 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 110 | parser.add_argument("--subs", action="store", default='13,6,8') 111 | parser.add_argument("--mode", action="store", default="simple") 112 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0") 113 | parser.add_argument("--verbose", action='store_true') 114 | args = parser.parse_args() 115 | 116 | sim_dir = args.sim_dir 117 | dat_dir = args.dat_dir 118 | subs = None 119 | if args.subs != '': 120 | subs = [int(s) for s in args.subs.split(',')] 121 | verbose = args.verbose 122 | params = args.params.split(',') 123 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2] 124 | subs_string = ', '.join(args.subs.split(',')) 125 | mode = args.mode 126 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv" 127 | 128 | main() -------------------------------------------------------------------------------- /src/lda_baseline/scripts/sweep_over_sims.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #NUM_SEED=2 3 | #SEEDS=$(seq 0 $NUM_SEED) 4 | rm ../dat/sim/reddit_subreddit_based/two-stage-lda-estimates.out 5 | export SUBREDDITS=13,6,8 6 | export BETA0=1.0 7 | declare -a SIMMODES=('simple') 8 | declare -a BETA1S=(1.0 10.0 100.0) 9 | declare -a GAMMAS=(1.0 4.0) 10 | 11 | for SIMMODEj in "${SIMMODES[@]}"; do 12 | for BETA1j in "${BETA1S[@]}"; do 13 | for GAMMAj in "${GAMMAS[@]}"; do 14 | python -m lda_baseline.reddit_output_att \ 15 | --subs=${SUBREDDITS} \ 16 | --mode=${SIMMODEj} \ 17 | --params=${BETA0},${BETA1j},${GAMMAj} 18 | done 19 | done 20 | done -------------------------------------------------------------------------------- /src/model_checking/plot_adjustment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.linear_model import LogisticRegression, LinearRegression 6 | from scipy.special import logit 7 | from result_processing.helpers import convert_str_columns_to_float, assign_split, filter_imbalanced_terms 8 | from sklearn.metrics import mean_squared_error as 
mse 9 | from scipy.sparse import load_npz 10 | import matplotlib.pyplot as plt 11 | from scipy.stats import gaussian_kde 12 | 13 | def get_prediction_file(): 14 | predict_df = pd.read_csv(log_file, delimiter='\t') 15 | predict_df = predict_df.rename(columns={'index':'post_index'}) 16 | return predict_df 17 | 18 | def fit_treatment(features, labels, verbose=False, coeff_offset=1): 19 | model = LogisticRegression(solver='liblinear') 20 | model.fit(features, labels) 21 | coeffs = np.array(model.coef_).flatten()[coeff_offset:] 22 | if verbose: 23 | print("Model accuracy:", model.score(features, labels)) 24 | print("Mean and std. of the word coeffs:", coeffs.mean(), coeffs.std()) 25 | return coeffs 26 | 27 | def truncate(df, truncate_level=0.1): 28 | df = df[(df.treatment_probability >= truncate_level) & (df.treatment_probability <= 1.0-truncate_level)] 29 | return df 30 | 31 | def plot_density(unadjusted, adjusted, permuted): 32 | density = gaussian_kde(adjusted.mean(axis=0)) 33 | permutation_density = gaussian_kde(permuted.mean(axis=0)) 34 | missing_z_density = gaussian_kde(unadjusted.mean(axis=0)) 35 | xs = np.linspace(-0.5,0.5,1000) 36 | plt.plot(xs,density(xs), label='Adjusted model (not permuted)') 37 | plt.plot(xs, permutation_density(xs), label='Permuted model') 38 | plt.plot(xs, missing_z_density(xs), label='Unadjusted model') 39 | plt.xlabel('Coefficient values for words') 40 | plt.legend() 41 | 42 | if not os.path.exists(out_dir): 43 | os.makedirs(out_dir) 44 | # plt.tight_layout() 45 | plt.savefig(out_dir + out_file, dpi=300) 46 | 47 | def load_terms(data): 48 | termfile = '../dat/' + data + '/term_counts.npz' 49 | if data == 'reddit': 50 | termfile = '../dat/' + data + '_term_counts.npz' 51 | term_counts = load_npz(termfile).toarray() 52 | if drop_terms: 53 | term_indices = np.arange(term_counts.shape[1]) 54 | random_indices = np.random.choice(term_indices, 1000) 55 | term_counts = term_counts[:,random_indices] 56 | return term_counts 57 | 58 | def main(): 59 | predict_df = get_prediction_file() 60 | term_counts = load_terms(dataset) 61 | print(predict_df.shape, term_counts.shape) 62 | if dataset == 'reddit': 63 | imbalanced_terms = filter_imbalanced_terms(predict_df, term_counts) 64 | term_counts = term_counts[:,imbalanced_terms] 65 | print(term_counts.shape) 66 | 67 | n_bootstraps = 10 68 | n_w = term_counts.shape[1] 69 | 70 | adjusted = np.zeros((n_bootstraps, n_w)) 71 | permuted = np.zeros((n_bootstraps, n_w)) 72 | unadjusted = np.zeros((n_bootstraps, n_w)) 73 | 74 | for i in range(n_bootstraps): 75 | sample = assign_split(predict_df,num_splits=2) 76 | sample = sample[sample.split==0] 77 | indices = sample.post_index.values 78 | labels = sample.treatment.values 79 | words = term_counts[indices, :] 80 | propensity_score = logit(sample.treatment_probability.values) 81 | all_features = np.column_stack((propensity_score, words)) 82 | unadjusted[i,:] = fit_treatment(words, labels, coeff_offset=0) 83 | adjusted[i,:] = fit_treatment(all_features, labels) 84 | np.random.shuffle(words) 85 | permuted_features = np.column_stack((propensity_score, words)) 86 | permuted[i,:] = fit_treatment(permuted_features, labels) 87 | 88 | plot_density(unadjusted, adjusted, permuted) 89 | 90 | if __name__ == '__main__': 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--out-dir", action="store", default='../figures/') 93 | parser.add_argument("--out-file", action="store", default='reddit.pdf') 94 | parser.add_argument("--log-file", action="store", 
default='../logdir/reddit/modesimple/beta01.0.beta110.0.gamma1.0/predict/test_results_all.tsv') 95 | parser.add_argument("--drop-terms", action="store_true") 96 | parser.add_argument("--dataset", action="store", default='reddit') 97 | args = parser.parse_args() 98 | log_file = args.log_file 99 | drop_terms = args.drop_terms 100 | dataset = args.dataset 101 | out_dir = args.out_dir 102 | out_file = args.out_file 103 | main() -------------------------------------------------------------------------------- /src/reddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/__init__.py -------------------------------------------------------------------------------- /src/reddit/data_cleaning/BigQuery_get_data: -------------------------------------------------------------------------------- 1 | ```/* 2 | based on https://www.reddit.com/r/bigquery/comments/4f2yp7/best_way_to_look_at_conversation_chains_in_reddit/ 3 | and on https://nbviewer.jupyter.org/github/bburky/subredditgenderratios/blob/master/Subreddit%20Gender%20Ratios.ipynb 4 | 5 | KNOWN LIMITATIONS: 6 | does not look for male/female (zodiac?) symbols. 7 | I didn't do a fresh search over subreddits w/ gender, so some may be missing 8 | up/downs was mostly null, so I omitted this field 9 | */ 10 | 11 | WITH 12 | reddit_comments AS ( 13 | SELECT 14 | body, author, author_flair_text, created_utc, link_id, parent_id, score, controversiality, gilded, id, subreddit, author_flair_css_class 15 | FROM 16 | `fh-bigquery.reddit_comments.2018*` 17 | -- `reddit-gender.comment_response_tuples.gendered_2018` 18 | ), 19 | replies AS ( 20 | SELECT 21 | REGEXP_EXTRACT(parent_id, r'[a-zA-Z0-9]+$') as parent_id, 22 | -- MIN(subreddit) AS subreddit, 23 | ARRAY_AGG(STRUCT(body, author, created_utc, id) ORDER BY created_utc ASC) AS reply 24 | FROM 25 | reddit_comments 26 | WHERE 27 | --parent id starting w t1_ indicates not-top-level comment 28 | REGEXP_CONTAINS(parent_id, r'^(t1_)') 29 | GROUP BY 30 | parent_id 31 | ), 32 | ops AS ( 33 | SELECT 34 | gender, body, author, author_flair_text, created_utc, link_id, score, controversiality, gilded, id, subreddit, author_flair_css_class 35 | FROM 36 | ( 37 | -- male/female 38 | SELECT 39 | *, 40 | REGEXP_EXTRACT( 41 | LOWER(author_flair_css_class), 42 | '(?:fe)?male') AS gender 43 | FROM 44 | reddit_comments 45 | WHERE 46 | subreddit IN ( 47 | 'AskMen', 48 | 'AskWomen', 49 | 'AskMenOver30', 50 | 'AskWomenOver30', 51 | 'sexover30') 52 | UNION ALL 53 | -- pink/blue 54 | SELECT 55 | *, 56 | CASE 57 | WHEN author_flair_css_class = 'blue' THEN 'male' 58 | WHEN author_flair_css_class = 'pink' THEN 'female' 59 | END AS gender 60 | FROM 61 | reddit_comments 62 | WHERE 63 | subreddit IN ( 64 | 'tall', 65 | 'short') 66 | UNION ALL 67 | -- A/S/L 68 | SELECT 69 | -- need to do this one manually because of asl 70 | body, author, author_flair_text, created_utc, link_id, parent_id, score, controversiality, gilded, id, subreddit, author_flair_css_class, 71 | CASE 72 | WHEN asl = 'm' THEN 'male' 73 | WHEN asl = 'f' THEN 'female' 74 | END AS gender 75 | FROM ( 76 | SELECT 77 | *, 78 | REGEXP_EXTRACT( 79 | LOWER(author_flair_text), 80 | "(?:^|[^\\p{L}0-9'\\.\\$])\\s*(?:\\d\\d)?\\s*(f|m)\\s*(?:\\d\\d)?\\s*(?:$|[^\\p{L}0-9'\\.])") AS asl 81 | FROM 82 | reddit_comments 83 | WHERE 84 | subreddit IN ( 85 | 'OkCupid', 86 | 'keto', 87 | 'childfree', 88 | 'xxketo', 89 | 'LGBTeens', 90 | 
'loseit', 91 | 'Tinder', 92 | 'proED', 93 | 'fatlogic', 94 | 'financialindependence', 95 | 'infj', 96 | 'infertility', 97 | '100DaysofKeto')) ) 98 | WHERE 99 | gender IS NOT NULL AND 100 | --parent id starting w t3_ indicates top-level comment 101 | REGEXP_CONTAINS(parent_id, r'^(t3_)') 102 | ) 103 | 104 | SELECT 105 | ops.*, 106 | replies.* 107 | FROM 108 | ops INNER JOIN replies ON ops.id = replies.parent_id``` -------------------------------------------------------------------------------- /src/reddit/data_cleaning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/data_cleaning/__init__.py -------------------------------------------------------------------------------- /src/reddit/data_cleaning/reddit_gender_sentiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 22, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import json\n", 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 23, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "datafile = os.path.join('..', 'dat', '2018')\n", 22 | "\n", 23 | "with open(datafile, 'r') as f:\n", 24 | " record_dicts = []\n", 25 | " for line in f.readlines():\n", 26 | " record = json.loads(line)\n", 27 | " reply_list = record['reply']\n", 28 | " earliest_reply_text = None\n", 29 | " for reply_dict in sorted(reply_list, key=lambda x: x['created_utc']):\n", 30 | " if reply_dict['body'] != '[deleted]' and reply_dict['body'] != '[removed]':\n", 31 | " earliest_reply_text = reply_dict['body']\n", 32 | " if earliest_reply_text:\n", 33 | " break\n", 34 | " if earliest_reply_text:\n", 35 | " record.pop('reply')\n", 36 | " record['reply_text'] = earliest_reply_text\n", 37 | " record_dicts.append(record)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 24, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "reddit_df = pd.DataFrame(record_dicts)\n", 47 | "reddit_df = reddit_df[reddit_df.body != '[deleted]']\n", 48 | "reddit_df = reddit_df.astype({'score':np.int64, 'controversiality':np.int64, 'gilded':np.int64, 'created_utc':np.int64})" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 25, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from google.cloud import language\n", 58 | "from google.cloud.language import enums\n", 59 | "from google.cloud.language import types\n", 60 | "client = language.LanguageServiceClient()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 61, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "replies = reddit_df[['body','reply_text']].values\n", 70 | "indices = np.arange(len(replies))\n", 71 | "np.random.shuffle(indices)\n", 72 | "random_idx = indices[:10]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 63, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "OP: How much our bridal party could drink when given the news they had an open bar tab. 
That was a couple thousand extra we didn’t expect to spend.\n", 85 | "\n", 86 | "\n", 87 | "Actual married life is exactly the same as before we were married since we lived together and shared finances well before marrying. \n", 88 | "Text: I wonder if it would be a good idea to just say it's a discounted bar or something and then at the end reveal it was an open bar.\n", 89 | "Sentiment: -0.30000001192092896, 0.30000001192092896\n", 90 | "****************************************\n", 91 | "OP: excuse me but this is a christian subreddit\n", 92 | "Text: But they said no homo\n", 93 | "Sentiment: 0.30000001192092896, 0.30000001192092896\n", 94 | "****************************************\n", 95 | "OP: I don't buy that the inches=pounds thing is real but if you want to add some scientific information to this, for me, one inch was equal to 6.1lbs when I took my starting weight and measurements.\n", 96 | "Text: Until I get an accurate scale, I think I'm going to try to do an average of the three numbers I've heard so far. That will at least give me a starting point so I can chart how far I've come. Thanks!\n", 97 | "Sentiment: 0.10000000149011612, 0.5\n", 98 | "****************************************\n", 99 | "OP: My SO referred to my mixed race roommate as \"half-caste\". He didn't realise that was considered offensive by some, it was what everyone at his school said.\n", 100 | "Text: Did you explain to him why it was offensive? I’ve noticed a lot of ppl say things that others say around them. \n", 101 | "Sentiment: -0.10000000149011612, 1.0\n", 102 | "****************************************\n", 103 | "OP: I get like that every shark week!! If you have to, up your calories to maintenance :)\n", 104 | "Text: How do I adjust my macros? I don't want to eat too much fat lol\n", 105 | "Sentiment: -0.10000000149011612, 0.20000000298023224\n", 106 | "****************************************\n", 107 | "OP: Only problem I have with it is the repetition/inconsistency of “ask(ed)”. Aside from that, seems like a real conversation I could see people having. Nice work :)\n", 108 | "Text: I'm trying to cut down on my repetition and more on letting the reader assume it was a question rather than having it say that instead. Thanks. \n", 109 | "Sentiment: 0.0, 0.5\n", 110 | "****************************************\n", 111 | "OP: This week I'm listening David Bowie, Nesrin Sipahi, Run DMC.\n", 112 | "Text: Run DMC :) YES! \n", 113 | "Sentiment: 0.30000001192092896, 0.6000000238418579\n", 114 | "****************************************\n", 115 | "OP: Pursuing the things you want to pursue, whether that’s love, fun, success, or anything else. Being willing to take risks in that pursuit.\n", 116 | "Text: Lots of ppl mentioning taking risks in this thread. What exactly do you mean by that?\n", 117 | "Sentiment: 0.0, 0.10000000149011612\n", 118 | "****************************************\n", 119 | "OP: It's a toss up between my diploma and my wedding ring.\n", 120 | "Text: I still have yet to pick up my diploma from my college and I graduated in 2012...\n", 121 | "Sentiment: 0.30000001192092896, 0.30000001192092896\n", 122 | "****************************************\n", 123 | "OP: Hate it. Partly to do with my other mental illnesses, but have trouble with hygiene in general.\n", 124 | "Text: How do you mean? If you don't mind sharing, that is. 
\n", 125 | "Sentiment: 0.0, 0.10000000149011612\n", 126 | "****************************************\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "for idx in random_idx:\n", 132 | " op = replies[idx][0]\n", 133 | " post = replies[idx][1]\n", 134 | " lines = post.split('\\n')\n", 135 | " for text in lines:\n", 136 | " if text == '':\n", 137 | " continue\n", 138 | " document = types.Document(\n", 139 | " content=text,\n", 140 | " type=enums.Document.Type.PLAIN_TEXT)\n", 141 | " sentiment = client.analyze_sentiment(document=document).document_sentiment\n", 142 | " print(\"OP:\", op)\n", 143 | " print(\"Text:\", text)\n", 144 | " print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))\n", 145 | " print(\"*\"*40)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.6.6" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /src/reddit/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/dataset/__init__.py -------------------------------------------------------------------------------- /src/reddit/dataset/array_from_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | helpers to take samples from the dataset and turn them into numpy arrays 3 | (for ease of inspection and use with baselines) 4 | """ 5 | import argparse 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | try: 11 | import mkl_random as random 12 | except ImportError: 13 | import numpy.random as random 14 | 15 | import bert.tokenization as tokenization 16 | from reddit.dataset.dataset import make_input_fn_from_file, make_subreddit_based_simulated_labeler 17 | 18 | 19 | def dataset_fn_to_df(dataset_fn): 20 | 21 | params = {'batch_size': 1} 22 | dataset = dataset_fn(params) 23 | 24 | itr = dataset.make_one_shot_iterator() 25 | 26 | samples = [] 27 | 28 | for i in range(250000): 29 | try: 30 | sample = itr.get_next() 31 | for k in sample: 32 | sample[k] = sample[k].numpy()[0] 33 | samples += [sample] 34 | # print("year: {}".format(sample['year'])) 35 | except: 36 | print(i) 37 | break 38 | 39 | df = pd.DataFrame(samples) 40 | 41 | return df 42 | 43 | 44 | def subreddit_based_sim_dfs(subreddits, treat_strength, con_strength, noise_level, setting="simple", seed=0, 45 | base_output_dir='../dat/sim/reddit_subreddit_based/'): 46 | 47 | labeler = make_subreddit_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed) 48 | 49 | num_splits = 10 50 | dev_splits = [0] 51 | test_splits = [0] 52 | 53 | # data_file = '../dat/reddit/proc.tf_record' 54 | # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt" 55 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, 
do_lower_case=True) 56 | 57 | input_dataset_from_filenames = make_input_fn_from_file(data_file, 58 | 250, 59 | num_splits, 60 | dev_splits, 61 | test_splits, 62 | tokenizer, 63 | subreddits=subreddits, 64 | is_training=False, 65 | filter_test=False, 66 | shuffle_buffer_size=25000, 67 | seed=seed, 68 | labeler=labeler) 69 | 70 | all_data = dataset_fn_to_df(input_dataset_from_filenames) 71 | output_df = all_data[['index', 'gender','outcome', 'y0', 'y1']] 72 | output_df = output_df.rename(index=str, columns={'gender': 'treatment'}) 73 | 74 | output_dir = os.path.join(base_output_dir, "subreddits{}".format(subreddits), "mode{}".format(setting)) 75 | os.makedirs(output_dir, exist_ok=True) 76 | output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level)) 77 | 78 | output_df.to_csv(output_path, '\t') 79 | 80 | 81 | def main(): 82 | tf.enable_eager_execution() 83 | 84 | 85 | subreddit_based_sim_dfs(subreddits=subs, treat_strength=beta0, con_strength=beta1, noise_level=gamma, setting=mode, seed=0, 86 | base_output_dir=base_output_dir) 87 | 88 | 89 | 90 | # print(itr.get_next()["token_ids"].name) 91 | # for i in range(1000): 92 | # sample = itr.get_next() 93 | 94 | # 95 | # print(np.unique(df['year'])) 96 | # print(df.groupby(['year'])['buzzy_title'].agg(np.mean)) 97 | # print(df.groupby(['year'])['theorem_referenced'].agg(np.mean)) 98 | # print(df.groupby(['year'])['accepted'].agg(np.mean)) 99 | 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument("--data-file", action="store", default='../dat/reddit/proc.tf_record') 105 | parser.add_argument("--vocab-file", action="store", default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt') 106 | parser.add_argument("--base-output-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 107 | parser.add_argument("--subs", action="store", default='13,8,6') 108 | parser.add_argument("--mode", action="store", default="simple") 109 | parser.add_argument("--beta0", action="store", default='1.0') 110 | parser.add_argument("--beta1", action="store", default='1.0') 111 | parser.add_argument("--gamma", action="store", default='1.0') 112 | args = parser.parse_args() 113 | 114 | data_file = args.data_file 115 | vocab_file = args.vocab_file 116 | base_output_dir = args.base_output_dir 117 | subs = [int(s) for s in args.subs.split(',')] 118 | mode = args.mode 119 | beta0 = float(args.beta0) 120 | beta1 = float(args.beta1) 121 | gamma = float(args.gamma) 122 | 123 | # pass 124 | main() -------------------------------------------------------------------------------- /src/reddit/dataset/sentence_masking.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Create masked LM TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | 23 | import tensorflow as tf 24 | 25 | 26 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 27 | ["index", "label"]) 28 | 29 | 30 | def create_masked_lm_predictions(token_ids, masked_lm_prob, max_predictions_per_seq, vocab, seed): 31 | """Creates the predictions for the masked LM objective. 32 | 33 | This should be essentially equivalent to the bits that Bert loads from pre-processed tfrecords 34 | 35 | Except: we just include masks instead of randomly letting the words through or randomly replacing 36 | """ 37 | 38 | basic_mask = tf.less( 39 | tf.random_uniform(token_ids.shape, minval=0, maxval=1, dtype=tf.float32, seed=seed), 40 | masked_lm_prob) 41 | 42 | # don't mask special characters or padding 43 | cand_indexes = tf.logical_and(tf.not_equal(token_ids, vocab["[CLS]"]), 44 | tf.not_equal(token_ids, vocab["[SEP]"])) 45 | cand_indexes = tf.logical_and(cand_indexes, tf.not_equal(token_ids, 0)) 46 | mask = tf.logical_and(cand_indexes, basic_mask) 47 | 48 | # truncate to max predictions for ease of padding 49 | masked_lm_positions = tf.where(mask) 50 | # TODO: it should be essentially impossible for me to see this bug (very unlikely), but I do... symptom of :( ? 51 | # very rare event: nothing gets picked for mask, causing an irritating bug 52 | # in this case, just mask the first candidate index 53 | mlm_shape = tf.shape(masked_lm_positions)[0] 54 | masked_lm_positions = tf.cond(mlm_shape > 1, 55 | lambda: masked_lm_positions, 56 | lambda: tf.where(cand_indexes)[0:2]) 57 | 58 | masked_lm_positions = tf.squeeze(masked_lm_positions)[0:max_predictions_per_seq] 59 | masked_lm_positions = tf.cast(masked_lm_positions, dtype=tf.int32) 60 | masked_lm_ids = tf.gather(token_ids, masked_lm_positions) 61 | 62 | mask = tf.cast( 63 | tf.scatter_nd(tf.expand_dims(masked_lm_positions, 1), tf.ones_like(masked_lm_positions), token_ids.shape), 64 | bool) 65 | 66 | output_ids = tf.where(mask, vocab["[MASK]"]*tf.ones_like(token_ids), token_ids) 67 | 68 | # pad out to max_predictions_per_seq 69 | masked_lm_weights = tf.ones_like(masked_lm_ids, dtype=tf.float32) # tracks padding 70 | add_pad = [[0, max_predictions_per_seq - tf.shape(masked_lm_positions)[0]]] 71 | masked_lm_weights = tf.pad(masked_lm_weights, add_pad, 'constant') 72 | masked_lm_positions = tf.pad(masked_lm_positions, add_pad, 'constant') 73 | masked_lm_ids = tf.pad(masked_lm_ids, add_pad, 'constant') 74 | 75 | return output_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights 76 | 77 | 78 | def main(_): 79 | pass 80 | 81 | 82 | if __name__ == "__main__": 83 | flags.mark_flag_as_required("input_file") 84 | flags.mark_flag_as_required("output_file") 85 | flags.mark_flag_as_required("vocab_file") 86 | tf.app.run() 87 | -------------------------------------------------------------------------------- /src/reddit/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/model/__init__.py -------------------------------------------------------------------------------- /src/reddit/submit_scripts/run_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export 
BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12 4 | export INIT_FILE=../dat/reddit/model.ckpt-400000 5 | export DATA_FILE=../dat/reddit/proc.tf_record 6 | export OUTPUT_DIR=../output/reddit_embeddings/ 7 | 8 | #13,6,8 are keto, okcupid, childfree 9 | export SUBREDDITS=13,6,8 10 | export USE_SUB_FLAG=false 11 | export BETA0=1.0 12 | export BETA1=1.0 13 | export GAMMA=1.0 14 | 15 | python -m reddit.model.run_causal_bert \ 16 | --seed=0 \ 17 | --do_train=true \ 18 | --do_eval=false \ 19 | --do_predict=true \ 20 | --label_pred=true \ 21 | --unsupervised=true \ 22 | --input_files_or_glob=${DATA_FILE} \ 23 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \ 24 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \ 25 | --output_dir=${OUTPUT_DIR} \ 26 | --dev_splits=0 \ 27 | --test_splits=0 \ 28 | --max_seq_length=128 \ 29 | --train_batch_size=16 \ 30 | --learning_rate=3e-5 \ 31 | --num_warmup_steps 1000 \ 32 | --num_train_steps=10000 \ 33 | --save_checkpoints_steps=5000 \ 34 | --keep_checkpoints=1 \ 35 | --subreddits=${SUBREDDITS} \ 36 | --beta0=${BETA0} \ 37 | --beta1=${BETA1} \ 38 | --gamma=${GAMMA} 39 | # --init_checkpoint=${INIT_FILE} -------------------------------------------------------------------------------- /src/reddit/submit_scripts/run_unsupervised.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12 4 | 5 | export DATA_FILE=../dat/reddit/proc.tf_record 6 | export OUTPUT_DIR=../output/reddit_embeddings/ 7 | 8 | #rm -rf $OUTPUT_DIR 9 | python -m model.run_unsupervised_pretraining \ 10 | --seed=0 \ 11 | --do_train=true \ 12 | --input_file=${DATA_FILE} \ 13 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \ 14 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \ 15 | --output_dir=${OUTPUT_DIR} \ 16 | --max_seq_length=256 \ 17 | --train_batch_size=16 \ 18 | --learning_rate=3e-5 \ 19 | --num_warmup_steps 200 \ 20 | --num_train_steps=175000 \ 21 | --save_checkpoints_steps 5000 \ 22 | --keep_checkpoints 3 -------------------------------------------------------------------------------- /src/result_processing/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | np.random.seed(0) 4 | 5 | def convert_str_columns_to_float(df): 6 | df['expected_outcome_st_treatment'] = df['expected_outcome_st_treatment'].str[1:-1] 7 | df['expected_outcome_st_treatment'] = df['expected_outcome_st_treatment'].astype(np.float64) 8 | 9 | df['expected_outcome_st_no_treatment'] = df['expected_outcome_st_no_treatment'].str[1:-1] 10 | df['expected_outcome_st_no_treatment'] = df['expected_outcome_st_no_treatment'].astype(np.float64) 11 | return df 12 | 13 | 14 | def tokenize_documents(documents,max_df0=0.8, min_df0=0.01,print_vocabulary=False,outfolder=None,output_vocabulary_fname='vocabulary.dat'): 15 | from nltk.corpus import stopwords 16 | ''' 17 | From a list of documents raw text build a matrix DxV 18 | D: number of docs 19 | V: size of the vocabulary, i.e. 
number of unique terms found in the whole set of docs 20 | ''' 21 | stop = stopwords.words('english') 22 | count_vect = CountVectorizer(stop_words=stop,max_df=max_df0, min_df=min_df0) 23 | corpus = count_vect.fit_transform(documents) 24 | vocabulary = count_vect.get_feature_names() 25 | 26 | return corpus,vocabulary,count_vect 27 | 28 | 29 | def assign_split(df, num_splits=10, col_to_add='split'): 30 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 31 | return df 32 | 33 | 34 | def filter_imbalanced_terms(df, term_counts, imbalance=0.1, key='post_index'): 35 | t_indices = [] 36 | n_terms = term_counts.shape[1] 37 | for t in range(n_terms): 38 | ind_occur = np.nonzero(term_counts[:,t])[0] 39 | subset = df[df[key].isin(ind_occur)] 40 | if subset.shape[0] != 0: 41 | prop_men = subset[subset.treatment==1].shape[0]/subset.shape[0] 42 | prop_women = subset[subset.treatment==0].shape[0]/subset.shape[0] 43 | if abs(prop_women-prop_men)>=imbalance: 44 | t_indices.append(t) 45 | return t_indices 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/result_processing/process_predictions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from reddit.data_cleaning import load_reddit, process_text_length 4 | import pandas as pd 5 | import numpy as np 6 | from random import sample 7 | import matplotlib.pyplot as plt 8 | from scipy.stats import pearsonr 9 | 10 | from .helpers import convert_str_columns_to_float 11 | 12 | 13 | def plot_covariate_proportions_per_stratum(treated, control, num_bins, covariate='subreddit'): 14 | cov_vals = treated[covariate].values 15 | n_groups = num_bins 16 | 17 | for val in cov_vals: 18 | # data to plot 19 | treat_props = treated.loc[treated[covariate] == val, 'count'].values 20 | control_props = control.loc[control[covariate] == val, 'count'].values 21 | 22 | # create plot 23 | fig, ax = plt.subplots() 24 | index = np.arange(n_groups) 25 | bar_width = 0.3 26 | opacity = 0.8 27 | 28 | rects1 = plt.bar(index, treat_props, bar_width, 29 | alpha=opacity, 30 | color='b', 31 | label='Treated Units') 32 | 33 | rects2 = plt.bar(index + bar_width, control_props, bar_width, 34 | alpha=opacity, 35 | color='g', 36 | label='Control Units') 37 | 38 | plt.ylim((0.0,1.0)) 39 | plt.xlabel('Stratas') 40 | plt.ylabel('Proportions of posts in ' + covariate + ':' + val) 41 | plt.xticks(index + bar_width, tuple(range(1,num_bins+1))) 42 | plt.legend() 43 | 44 | plt.tight_layout() 45 | plt.savefig(os.path.join(log_dir, 'proportions_for_' + covariate + '_' + val + '.png')) 46 | 47 | def normalize(df, col): 48 | vals = df[col].values 49 | min_col = vals.min() 50 | max_col = vals.max() 51 | df[col] = (df[col] - min_col)/(max_col-min_col) 52 | return df 53 | 54 | 55 | def get_covariate_proportions(stratified_df, covariate='subreddit'): 56 | counts_df = stratified_df.groupby(['strata', covariate]).size().reset_index(name="count") 57 | total_by_strata = stratified_df.groupby("strata").size().reset_index(name="total") 58 | counts_df = counts_df.merge(total_by_strata, how='inner', on='strata') 59 | counts_df['count'] /= counts_df['total'] 60 | return counts_df 61 | 62 | 63 | def get_text_results(reddit_df, result_df, sub=None): 64 | indices = result_df['index'].values 65 | result_df = reddit_df.loc[indices, ['subreddit', 'post_text', 'author']] 66 | 67 | if sub: 68 | result_df = result_df[result_df.subreddit.isin([sub])] 69 | 70 | return result_df 71 | 
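# print_example_posts (below) prints a small random sample of (subreddit, post text, author) tuples for qualitative spot checks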
72 | 73 | def print_example_posts(sub_text_df, n=10): 74 | post_list = [tuple(val) for val in sub_text_df.values] 75 | random_posts = sample(post_list, n) 76 | print("*"*10 + "Examples" + "*"*10) 77 | for post in random_posts: 78 | print("Subreddit:", post[0]) 79 | print("-"*40) 80 | print("Text:", post[1]) 81 | print("-"*40) 82 | print("Author:", post[2]) 83 | print("*"*40) 84 | 85 | 86 | def stratify_by_value(df, num_bins=10, sort_by='treatment_probability', col_to_add='strata'): 87 | values = df[sort_by].values 88 | min_val = values.min() 89 | max_val = values.max() 90 | interval = (max_val-min_val)/num_bins 91 | bins = np.arange(min_val, max_val, step=interval) 92 | bin_indices = np.digitize(values, bins) 93 | df[col_to_add] = bin_indices 94 | return df 95 | 96 | 97 | def main(): 98 | num_examples_to_print=5 99 | num_bins = 5 100 | 101 | predictions_file = os.path.join(log_dir, 'predict', 'test_results_all.tsv') 102 | predict_df = pd.read_csv(predictions_file, delimiter='\t') 103 | predict_df = convert_str_columns_to_float(predict_df) 104 | predict_df = predict_df.rename(columns={'index':'post_index'}) 105 | print(predict_df) 106 | 107 | treated = predict_df[predict_df.treatment == 1] 108 | control = predict_df[predict_df.treatment == 0] 109 | 110 | treated_stratified = stratify_by_value(treated, num_bins=num_bins) 111 | control_stratified = stratify_by_value(control, num_bins=num_bins) 112 | 113 | if res_type == 'subreddit': 114 | treated_cov_prop = get_covariate_proportions(treated_stratified) 115 | control_cov_prop = get_covariate_proportions(control_stratified) 116 | 117 | plot_covariate_proportions_per_stratum(treated_cov_prop, control_cov_prop, num_bins) 118 | 119 | for i in range(1,num_bins+1): 120 | print("*"*20, "Proportions for stratum:", i, "*"*20) 121 | print("-"*10, "Treated:", "-"*10) 122 | print(treated_cov_prop[treated_cov_prop.strata == i]) 123 | 124 | print("-"*10, "Control:", "-"*10) 125 | print(control_cov_prop[control_cov_prop.strata == i]) 126 | 127 | elif res_type == 'length': 128 | text = load_reddit() 129 | text = process_text_length(text) 130 | text = normalize(text, 'post_length') 131 | 132 | treated = treated.merge(text, left_on='post_index', right_index=True, how='inner') 133 | control = control.merge(text, left_on='post_index', right_index=True, how='inner') 134 | 135 | treated_corr = pearsonr(treated.post_length.values, treated.treatment_probability.values) 136 | control_corr = pearsonr(control.post_length.values, control.treatment_probability.values) 137 | print("Corr. between treated and post length", treated_corr) 138 | print("Corr. 
between control and post length", control_corr) 139 | 140 | 141 | # binned_post_length = stratify_by_value(text, num_bins=20, sort_by='post_length', col_to_add='length_bin') 142 | 143 | # columns_to_keep = treated_stratified.columns.tolist().extend('length_bin') 144 | # treated_text = treated_stratified.merge(binned_post_length, left_on='post_index', right_index=True, how='inner')# [columns_to_keep] 145 | # control_text = control_stratified.merge(binned_post_length, left_on='post_index', right_index=True, how='inner')#[columns_to_keep] 146 | 147 | # treated_cov_prop = get_covariate_proportions(treated_text, covariate='length_bin') 148 | # control_cov_prop = get_covariate_proportions(control_text, covariate='length_bin') 149 | 150 | # for i in range(1,num_bins+1): 151 | # print("*"*20, "Proportions for stratum:", i, "*"*20) 152 | # print("-"*10, "Treated:", "-"*10) 153 | # print(treated_cov_prop[treated_cov_prop.strata == i]) 154 | 155 | # print("-"*10, "Control:", "-"*10) 156 | # print(control_cov_prop[control_cov_prop.strata == i]) 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument("--log-dir", action="store", default="../logdir/simulated_training_1.0_1.0_1.0") 163 | parser.add_argument("--result-type", action="store", default="subreddit") 164 | args = parser.parse_args() 165 | log_dir = args.log_dir 166 | res_type = args.result_type 167 | 168 | main() -------------------------------------------------------------------------------- /src/result_processing/prop_sim_plotting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | import result_processing.compute_att as att 5 | import pandas as pd 6 | 7 | 8 | def make_reddit_prop_plt(): 9 | sns.set() 10 | prop_expt = pd.DataFrame(att.process_propensity_experiment()) 11 | 12 | prop_expt = prop_expt[['exog', 'plugin', 'one_step_tmle', 'very_naive']] 13 | prop_expt = prop_expt.rename(index=str, columns={'exog': 'Exogeneity', 14 | 'very_naive': 'Unadjusted', 15 | 'plugin': 'Plug-in', 16 | 'one_step_tmle': 'TMLE'}) 17 | prop_expt = prop_expt.set_index('Exogeneity') 18 | 19 | plt.figure(figsize=(4.75, 3.00)) 20 | # plt.figure(figsize=(2.37, 1.5)) 21 | sns.scatterplot(data=prop_expt, legend='brief', s=75) 22 | plt.xlabel("Exogeneity", fontfamily='monospace') 23 | plt.ylabel("NDE Estimate", fontfamily='monospace') 24 | plt.tight_layout() 25 | 26 | fig_dir = '../output/figures' 27 | os.makedirs(fig_dir, exist_ok=True) 28 | plt.savefig(os.path.join(fig_dir,'reddit_propensity.pdf')) 29 | 30 | 31 | def main(): 32 | make_reddit_prop_plt() 33 | 34 | 35 | if __name__ == '__main__': 36 | main() -------------------------------------------------------------------------------- /src/semi_parametric_estimation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/semi_parametric_estimation/__init__.py -------------------------------------------------------------------------------- /src/semi_parametric_estimation/ate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import logit, expit 3 | from scipy.optimize import minimize 4 | 5 | from .helpers import truncate_by_g, mse, cross_entropy, truncate_all_by_g 6 | from .att import att_estimates 7 | 8 | """ 9 | Note: 
the standard deviations reported by this methods are actually standard deviations conditioned on the nuisance 10 | parameters. 11 | 12 | That is, we do not account for variability in the estimation of Q and g 13 | """ 14 | 15 | 16 | def _perturbed_model_bin_outcome(q_t0, q_t1, g, t, eps): 17 | """ 18 | Helper for psi_tmle_bin_outcome 19 | 20 | Returns q_\eps (t,x) 21 | (i.e., value of perturbed predictor at t, eps, x; where q_t0, q_t1, g are all evaluated at x 22 | """ 23 | h = t * (1./g) - (1.-t) / (1. - g) 24 | full_lq = (1.-t)*logit(q_t0) + t*logit(q_t1) # logit predictions from unperturbed model 25 | logit_perturb = full_lq + eps * h 26 | return expit(logit_perturb) 27 | 28 | 29 | def psi_tmle_bin_outcome(q_t0, q_t1, g, t, y, truncate_level=0.05): 30 | # TODO: make me useable 31 | # solve the perturbation problem 32 | 33 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 34 | 35 | eps_hat = minimize(lambda eps: cross_entropy(y, _perturbed_model_bin_outcome(q_t0, q_t1, g, t, eps)) 36 | , 0., method='Nelder-Mead') 37 | 38 | eps_hat = eps_hat.x[0] 39 | 40 | def q1(t_cf): 41 | return _perturbed_model_bin_outcome(q_t0, q_t1, g, t_cf, eps_hat) 42 | 43 | ite = q1(np.ones_like(t)) - q1(np.zeros_like(t)) 44 | 45 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 46 | 47 | 48 | def psi_tmle_cont_outcome(q_t0, q_t1, g, t, y, eps_hat=None, truncate_level=0.05): 49 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 50 | 51 | g_loss = mse(g, t) 52 | h = t * (1.0/g) - (1.0-t) / (1.0 - g) 53 | full_q = (1.0-t)*q_t0 + t*q_t1 # predictions from unperturbed model 54 | 55 | if eps_hat is None: 56 | eps_hat = np.sum(h*(y-full_q)) / np.sum(np.square(h)) 57 | 58 | def q1(t_cf): 59 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g) 60 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model 61 | return full_q + eps_hat * h_cf 62 | 63 | ite = q1(np.ones_like(t)) - q1(np.zeros_like(t)) 64 | psi_tmle = np.mean(ite) 65 | 66 | # standard deviation computation relies on asymptotic expansion of non-parametric estimator, see van der Laan and Rose p 96 67 | ic = h*(y-q1(t)) + ite - psi_tmle 68 | psi_tmle_std = np.std(ic) / np.sqrt(t.shape[0]) 69 | initial_loss = np.mean(np.square(full_q-y)) 70 | final_loss = np.mean(np.square(q1(t)-y)) 71 | 72 | # print("tmle epsilon_hat: ", eps_hat) 73 | # print("initial risk: {}".format(initial_loss)) 74 | # print("final risk: {}".format(final_loss)) 75 | 76 | return psi_tmle, psi_tmle_std, eps_hat, initial_loss, final_loss, g_loss 77 | 78 | 79 | def psi_iptw(q_t0, q_t1, g, t, y, truncate_level=0.05): 80 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 81 | 82 | ite=(t / g - (1-t) / (1-g))*y 83 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 84 | 85 | 86 | def psi_aiptw(q_t0, q_t1, g, t, y, truncate_level=0.05): 87 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 88 | 89 | full_q = q_t0 * (1 - t) + q_t1 * t 90 | h = t * (1.0 / g) - (1.0 - t) / (1.0 - g) 91 | ite = h * (y - full_q) + q_t1 - q_t0 92 | 93 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 94 | 95 | 96 | def psi_q_only(q_t0, q_t1, g, t, y, truncate_level=0.): 97 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 98 | ite = (q_t1 - q_t0) 99 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 100 | 101 | 102 | def psi_very_naive(t, y): 103 | psi_hat = y[t == 1].mean() - y[t == 0].mean() 104 | psi_std = np.sqrt(np.var(y[t == 1]) 
/ np.sum(t) + np.var(y[t == 0]) / np.sum(1-t)) 105 | return psi_hat, psi_std 106 | 107 | 108 | def ates_from_atts(q_t0, q_t1, g, t, y, truncate_level=0.05): 109 | """ 110 | Sanity check code: ATE = ATT_1*P(T=1) + ATT_0*P(T=0) 111 | 112 | :param q_t0: 113 | :param q_t1: 114 | :param g: 115 | :param t: 116 | :param y: 117 | :param truncate_level: 118 | :return: 119 | """ 120 | 121 | prob_t = t.mean() 122 | 123 | att = att_estimates(q_t0, q_t1, g, t, y, prob_t, truncate_level=truncate_level) 124 | att_flip = att_estimates(q_t1, q_t0, 1.-g, 1-t, y, 1.-prob_t, truncate_level=truncate_level) 125 | 126 | ates = {} 127 | for k in att.keys(): 128 | # note: minus because the flip computes E[Y^0 - Y^1 | T=0] 129 | ates[k] = att[k]*prob_t - att_flip[k]*(1.-prob_t) 130 | # ates[k] = att_flip[k] 131 | 132 | return ates 133 | 134 | 135 | def ate_estimates(q_t0, q_t1, g, t, y, truncate_level=0.05): 136 | 137 | very_naive = psi_very_naive(t,y) 138 | q_only = psi_q_only(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 139 | iptw = psi_iptw(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 140 | aiptw = psi_aiptw(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 141 | tmle = psi_tmle_cont_outcome(q_t0, q_t1, g, t, y, truncate_level=truncate_level)[0:1] 142 | bin_tmle = psi_tmle_bin_outcome(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 143 | 144 | estimates = {'very_naive': very_naive, 145 | 'q_only': q_only, 146 | 'iptw': iptw, 147 | 'tmle': tmle, 148 | 'bin-tmle': bin_tmle, 149 | 'aiptw': aiptw} 150 | 151 | return estimates 152 | 153 | 154 | 155 | def main(): 156 | pass 157 | 158 | 159 | if __name__ == "__main__": 160 | main() 161 | -------------------------------------------------------------------------------- /src/semi_parametric_estimation/att.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import logit, expit 3 | from scipy.optimize import minimize 4 | 5 | from .helpers import truncate_all_by_g, cross_entropy, mse 6 | 7 | 8 | def _perturbed_model(q_t0, q_t1, g, t, q, eps): 9 | # helper function for psi_tmle 10 | 11 | h1 = t / q - ((1 - t) * g) / (q * (1 - g)) 12 | full_q = (1.0 - t) * q_t0 + t * q_t1 13 | perturbed_q = full_q - eps * h1 14 | 15 | def q1(t_cf, epsilon): 16 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g) 17 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model 18 | return full_q - epsilon * h_cf 19 | 20 | psi_init = np.mean(t * (q1(np.ones_like(t), eps) - q1(np.zeros_like(t), eps))) / q 21 | h2 = (q_t1 - q_t0 - psi_init) / q 22 | perturbed_g = expit(logit(g) - eps * h2) 23 | 24 | return perturbed_q, perturbed_g 25 | 26 | 27 | def psi_tmle(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 28 | """ 29 | Near canonical van der Laan TMLE, except we use a 30 | 1 dimension epsilon shared between the Q and g update models 31 | 32 | """ 33 | 34 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 35 | 36 | def _perturbed_loss(eps): 37 | pert_q, pert_g = _perturbed_model(q_t0, q_t1, g, t, prob_t, eps) 38 | loss = (np.square(y - pert_q)).mean() + cross_entropy(t, pert_g) 39 | return loss 40 | 41 | eps_hat = minimize(_perturbed_loss, 0.) 
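# scipy.optimize.minimize returns an OptimizeResult; the fitted shared epsilon is read off its .x array below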
42 | eps_hat = eps_hat.x[0] 43 | 44 | def q2(t_cf, epsilon): 45 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g) 46 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model 47 | return full_q - epsilon * h_cf 48 | 49 | psi_tmle = np.mean(t * (q2(np.ones_like(t), eps_hat) - q2(np.zeros_like(t), eps_hat))) / prob_t 50 | return psi_tmle 51 | 52 | 53 | def make_one_step_tmle(prob_t, deps_default=0.001): 54 | "Make a function that computes the 1-step TMLE ala https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4912007/" 55 | 56 | def _perturb_q(q_t0, q_t1, g, t, deps=deps_default): 57 | h1 = t / prob_t - ((1 - t) * g) / (prob_t * (1 - g)) 58 | 59 | full_q = (1.0 - t) * q_t0 + t * q_t1 60 | perturbed_q = full_q - deps * h1 61 | # perturbed_q= expit(logit(full_q) - deps*h1) 62 | return perturbed_q 63 | 64 | def _perturb_g(q_t0, q_t1, g, deps=deps_default): 65 | h2 = (q_t1 - q_t0 - _psi(q_t0, q_t1, g)) / prob_t 66 | perturbed_g = expit(logit(g) - deps * h2) 67 | return perturbed_g 68 | 69 | def _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=deps_default): 70 | # get the values of Q_{eps+deps} and g_{eps+deps} by using the recursive formula 71 | 72 | perturbed_g = _perturb_g(q0_old, q1_old, g_old, deps=deps) 73 | 74 | perturbed_q = _perturb_q(q0_old, q1_old, perturbed_g, t, deps=deps) 75 | perturbed_q0 = _perturb_q(q0_old, q1_old, perturbed_g, np.zeros_like(t), deps=deps) 76 | perturbed_q1 = _perturb_q(q0_old, q1_old, perturbed_g, np.ones_like(t), deps=deps) 77 | 78 | return perturbed_q0, perturbed_q1, perturbed_q, perturbed_g 79 | 80 | def _loss(q, g, y, t): 81 | # compute the new loss 82 | q_loss = mse(y, q) 83 | g_loss = cross_entropy(t, g) 84 | return q_loss + g_loss 85 | 86 | def _psi(q0, q1, g): 87 | return np.mean(g*(q1 - q0)) / prob_t 88 | 89 | def tmle(q_t0, q_t1, g, t, y, truncate_level=0.05, deps=deps_default): 90 | """ 91 | Computes the tmle for the ATT (equivalently: direct effect) 92 | 93 | :param q_t0: 94 | :param q_t1: 95 | :param g: 96 | :param t: 97 | :param y: 98 | :param truncate_level: 99 | :param deps: 100 | :return: 101 | """ 102 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 103 | 104 | eps = 0.0 105 | 106 | q0_old = q_t0 107 | q1_old = q_t1 108 | g_old = g 109 | 110 | # determine whether epsilon should go up or down 111 | # translated blindly from line 299 of https://github.com/cran/tmle/blob/master/R/tmle.R 112 | h1 = t / prob_t - ((1 - t) * g) / (prob_t * (1 - g)) 113 | full_q = (1.0 - t) * q_t0 + t * q_t1 114 | deriv = np.mean(prob_t*h1*(y-full_q) + t*(q_t1 - q_t0 - _psi(q_t0, q_t1, g))) 115 | if deriv > 0: 116 | deps = -deps 117 | 118 | # run until loss starts going up 119 | # old_loss = np.inf # this is the thing used by Rose' implementation 120 | old_loss = _loss(full_q, g, y, t) 121 | 122 | while True: 123 | perturbed_q0, perturbed_q1, perturbed_q, perturbed_g = _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=deps) 124 | 125 | new_loss = _loss(perturbed_q, perturbed_g, y, t) 126 | 127 | # debugging 128 | # print("Psi: {}".format(_psi(q0_old, q1_old, g_old))) 129 | # print("new_loss is: ", new_loss, "old_loss is ", old_loss) 130 | 131 | # # if this is the first step, decide whether to go down or up from eps=0.0 132 | # if eps == 0.0: 133 | # _, _, perturbed_q_neg, perturbed_g_neg = _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=-deps) 134 | # neg_loss = _loss(perturbed_q_neg, perturbed_g_neg, y, t) 135 | # 136 | # if neg_loss < new_loss: 137 | # return tmle(q_t0, q_t1, g, t, y, deps=-1.0 * deps) 138 | 139 | # 
check if converged 140 | if new_loss > old_loss: 141 | if eps == 0.: 142 | print("Warning: no update occurred (is deps too big?)") 143 | return _psi(q0_old, q1_old, g_old) 144 | else: 145 | eps += deps 146 | 147 | q0_old = perturbed_q0 148 | q1_old = perturbed_q1 149 | g_old = perturbed_g 150 | 151 | old_loss = new_loss 152 | 153 | return tmle 154 | 155 | 156 | def psi_q_only(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 157 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 158 | 159 | ite_t = (q_t1 - q_t0)[t == 1] 160 | estimate = ite_t.mean() 161 | return estimate 162 | 163 | 164 | def psi_plugin(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 165 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 166 | 167 | ite_t = g*(q_t1 - q_t0)/prob_t 168 | estimate = ite_t.mean() 169 | return estimate 170 | 171 | 172 | def psi_aiptw(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 173 | # the robust ATT estimator described in eqn 3.9 of 174 | # https://www.econstor.eu/bitstream/10419/149795/1/869216953.pdf 175 | 176 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 177 | estimate = (t*(y-q_t0) - (1-t)*(g/(1-g))*(y-q_t0)).mean() / prob_t 178 | 179 | return estimate 180 | 181 | 182 | def psi_very_naive(t, y): 183 | return y[t == 1].mean() - y[t == 0].mean() 184 | 185 | 186 | def att_estimates(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05, deps=0.0001): 187 | 188 | one_step_tmle = make_one_step_tmle(prob_t, deps_default=deps) 189 | 190 | very_naive = psi_very_naive(t,y) 191 | q_only = psi_q_only(q_t0, q_t1, g, t, y, prob_t, truncate_level) 192 | plugin = psi_plugin(q_t0, q_t1, g, t, y, prob_t, truncate_level) 193 | aiptw = psi_aiptw(q_t0, q_t1, g, t, y, prob_t, truncate_level) 194 | one_step_tmle = one_step_tmle(q_t0, q_t1, g, t, y, truncate_level) # note different signature 195 | 196 | estimates = {'very_naive': very_naive, 'q_only': q_only, 'plugin': plugin, 'one_step_tmle': one_step_tmle, 'aiptw': aiptw} 197 | 198 | return estimates 199 | -------------------------------------------------------------------------------- /src/semi_parametric_estimation/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import logit 3 | 4 | import sklearn.linear_model as lm 5 | 6 | 7 | def calibrate_g(g, t): 8 | """ 9 | Improve calibation of propensity scores by fitting 1 parameter (temperature) logistic regression on heldout data 10 | 11 | :param g: raw propensity score estimates 12 | :param t: treatment assignments 13 | :return: 14 | """ 15 | 16 | logit_g = logit(g).reshape(-1,1) 17 | calibrator = lm.LogisticRegression(fit_intercept=False, C=1e6, solver='lbfgs') # no intercept or regularization 18 | calibrator.fit(logit_g, t) 19 | calibrated_g = calibrator.predict_proba(logit_g)[:,1] 20 | return calibrated_g 21 | 22 | 23 | def truncate_by_g(attribute, g, level=0.1): 24 | keep_these = np.logical_and(g >= level, g <= 1.-level) 25 | 26 | return attribute[keep_these] 27 | 28 | 29 | def truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level=0.05): 30 | """ 31 | Helper function to clean up nuisance parameter estimates. 
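Keeps only units whose propensity estimate g lies in [truncate_level, 1 - truncate_level]; all arrays are filtered with the same mask so they stay aligned.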
32 | 33 | """ 34 | 35 | orig_g = np.copy(g) 36 | 37 | q_t0 = truncate_by_g(np.copy(q_t0), orig_g, truncate_level) 38 | q_t1 = truncate_by_g(np.copy(q_t1), orig_g, truncate_level) 39 | g = truncate_by_g(np.copy(g), orig_g, truncate_level) 40 | t = truncate_by_g(np.copy(t), orig_g, truncate_level) 41 | y = truncate_by_g(np.copy(y), orig_g, truncate_level) 42 | 43 | return q_t0, q_t1, g, t, y 44 | 45 | 46 | 47 | def cross_entropy(y, p): 48 | return -np.mean((y*np.log(p) + (1.-y)*np.log(1.-p))) 49 | 50 | 51 | def mse(x, y): 52 | return np.mean(np.square(x-y)) 53 | -------------------------------------------------------------------------------- /src/supervised_lda/add_split_to_simulations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 8, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "base_sim_dir = '../../dat/sim/'\n", 21 | "datasets = ['reddit_subreddit_based/subreddits[13, 6, 8]', 'peerread_buzzytitle_based']\n", 22 | "mode = 'modesimple'\n", 23 | "\n", 24 | "for dataset in datasets:\n", 25 | " simdir = os.path.join(base_sim_dir, dataset, mode)\n", 26 | " for simfile in os.listdir(simdir):\n", 27 | " df = pd.read_csv(os.path.join(simdir, simfile), sep='\\t')\n", 28 | " df['split'] = np.random.randint(0, 10, size=df.shape[0])\n", 29 | " df.to_csv(os.path.join(simdir, simfile),sep='\\t')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.6.8" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /src/supervised_lda/compute_estimates.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | import numpy as np 3 | import os 4 | import argparse 5 | import pandas as pd 6 | 7 | def main(): 8 | outdir = os.path.join('..', 'out', args.data, args.experiment) 9 | for sim in os.listdir(outdir): 10 | mean_estimates = {'very_naive': [], 'q_only': [], 'plugin': [], 'one_step_tmle': [], 'aiptw': []} 11 | for split in os.listdir(os.path.join(outdir, sim)): 12 | if args.num_splits is not None: 13 | # print("ignoring split", split) 14 | if int(split) >= int(args.num_splits): 15 | continue 16 | array = np.load(os.path.join(outdir, sim, split, 'predictions.npz')) 17 | g = array['g'] 18 | q0 = array['q0'] 19 | q1 = array['q1'] 20 | y = array['y'] 21 | t = array['t'] 22 | estimates = att_estimates(q0, q1, g, t, y, t.mean(), truncate_level=0.03) 23 | for est, att in estimates.items(): 24 | mean_estimates[est].append(att) 25 | 26 | if args.data == 'reddit': 27 | sim = sim.replace('beta01.0.', '') 28 | options = sim.split('.0.') 29 | p2 = options[0].replace('beta1', '') 30 | p3 = 
options[1].replace('gamma', '') 31 | 32 | print("------ Simulation setting: Confounding strength =", p2, "; Variance:", p3, "------") 33 | print("True effect = 1.0") 34 | else: 35 | ground_truth_map = {'1.0':0.06, '5.0':0.06, '25.0':0.03} 36 | print("------ Simulation setting: Confounding strength =", sim) 37 | print("True effect = ", ground_truth_map[sim]) 38 | 39 | 40 | for est, atts in mean_estimates.items(): 41 | print('\t', est, np.round(np.mean(atts), 3), "+/-", np.round(np.std(atts),3)) 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--data", action="store", default="reddit") 47 | parser.add_argument("--experiment", action="store", default="base_model") 48 | parser.add_argument("--num-splits", action="store", default=None) 49 | args = parser.parse_args() 50 | 51 | main() -------------------------------------------------------------------------------- /src/supervised_lda/helpers.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | 9 | class LemmaTokenizer(object): 10 | def __init__(self): 11 | self.wnl = WordNetLemmatizer() 12 | def __call__(self, articles): 13 | stop = stopwords.words('english') 14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop] 15 | 16 | def filter_by_subreddit(reddit, subs=None): 17 | if not subs: 18 | return reddit.index.values 19 | else: 20 | return reddit[reddit.subreddit.isin(subs)].index.values 21 | 22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.0005): 23 | from nltk.corpus import stopwords 24 | ''' 25 | From a list of documents raw text build a matrix DxV 26 | D: number of docs 27 | V: size of the vocabulary, i.e. 
number of unique terms found in the whole set of docs 28 | ''' 29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0) 30 | corpus = count_vect.fit_transform(documents) 31 | vocabulary = count_vect.get_feature_names() 32 | 33 | return corpus,vocabulary,count_vect 34 | 35 | def assign_dev_split(num_docs, percentage=0.05): 36 | indices = np.arange(num_docs) 37 | np.random.shuffle(indices) 38 | size = int(indices.shape[0]*percentage) 39 | dev = indices[:size] 40 | return dev 41 | 42 | def learn_topics(X, X_dev, K=50): 43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1) 44 | print("Fitting", K, "topics...") 45 | lda.fit(X) 46 | score = lda.perplexity(X_dev) 47 | print("Log likelihood:", score) 48 | topics = lda.components_ 49 | return score, lda, topics 50 | 51 | def show_topics(vocab, topics, n_words=20): 52 | topic_keywords = [] 53 | for topic_weights in topics: 54 | top_keyword_locs = (-topic_weights).argsort()[:n_words] 55 | topic_keywords.append(vocab.take(top_keyword_locs)) 56 | 57 | df_topic_keywords = pd.DataFrame(topic_keywords) 58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])] 59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])] 60 | return df_topic_keywords 61 | 62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'): 63 | filtered_indices = filtered_df[on].values 64 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 65 | embeddings = doc_embeddings[doc_idx, :] 66 | return embeddings 67 | 68 | def filter_document_terms(filtered_df, counts, index_mapping, on='post_index'): 69 | filtered_indices = filtered_df[on].values 70 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 71 | filtered_counts = counts[doc_idx, :] 72 | return filtered_counts 73 | 74 | def make_index_mapping(df, on='post_index', convert_to_int=True): 75 | if on=='index': 76 | indices = df.index.values 77 | else: 78 | indices = df[on].values 79 | 80 | if convert_to_int: 81 | return {int(ind):i for (i,ind) in enumerate(indices)} 82 | 83 | return {ind:i for (i,ind) in enumerate(indices)} 84 | 85 | def assign_split(df, num_splits=10, col_to_add='split'): 86 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 87 | return df 88 | -------------------------------------------------------------------------------- /src/supervised_lda/peerread_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | from sklearn.metrics import mean_squared_error as mse 7 | import argparse 8 | import sys 9 | from supervised_lda.supervised_topic_model import SupervisedTopicModel 10 | from supervised_lda import run_supervised_tm 11 | from scipy import sparse 12 | from sklearn.linear_model import LogisticRegression, Ridge 13 | from scipy.special import logit 14 | 15 | def load_peerread(path='../dat/PeerRead/'): 16 | return pd.read_csv(path + 'proc_abstracts.csv') 17 | 18 | def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'): 19 | count_filename = path + 'term_counts' 20 | vocab_filename = path + 'vocab' 21 | 22 | if os.path.exists(count_filename + '.npz') and not force_redo: 23 | return 
sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy') 24 | 25 | post_docs = df[text_col].values 26 | counts, vocab, _ = tokenize_documents(post_docs) 27 | sparse.save_npz(count_filename, counts) 28 | np.save(vocab_filename, vocab) 29 | return counts.toarray(), np.array(vocab) 30 | 31 | def compute_ground_truth_treatment_effect(df): 32 | y1 = df['y1'] 33 | y0 = df['y0'] 34 | return y1.mean() - y0.mean() 35 | 36 | def load_simulated_data(): 37 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 38 | return sim_df 39 | 40 | def fit_model(doc_embeddings, labels, is_binary=False): 41 | if is_binary: 42 | model = LogisticRegression(solver='liblinear') 43 | else: 44 | model = Ridge() 45 | model.fit(doc_embeddings, labels) 46 | return model 47 | 48 | def main(): 49 | if dat_dir: 50 | peerread = load_peerread(path=dat_dir) 51 | counts,vocab = load_term_counts(peerread,path=dat_dir) 52 | else: 53 | peerread = load_peerread() 54 | counts,vocab = load_term_counts(peerread) 55 | 56 | indices = peerread['paper_id'].values 57 | index_mapping = make_index_mapping(peerread, on='index') 58 | 59 | sim_df = load_simulated_data() 60 | 61 | train_df = sim_df[sim_df.split != split] 62 | predict_df = sim_df[sim_df.split == split] 63 | tr_treatment_labels = train_df.treatment.values 64 | tr_outcomes = train_df.outcome.values 65 | predict_treatment = predict_df.treatment.values 66 | predict_outcomes = predict_df.outcome.values 67 | 68 | tr_counts = filter_document_terms(train_df, counts, index_mapping, on='id') 69 | predict_counts = filter_document_terms(predict_df, counts, index_mapping, on='id') 70 | 71 | num_documents = tr_counts.shape[0] 72 | vocab_size = tr_counts.shape[1] 73 | model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model) 74 | 75 | run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, dtype='binary', 76 | num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss) 77 | 78 | if use_supervised_loss: 79 | propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts, dtype='binary') 80 | else: 81 | tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts) 82 | treated = tr_treatment_labels == 1 83 | out_treat = tr_outcomes[treated] 84 | out_no_treat = tr_outcomes[~treated] 85 | q0_embeddings = tr_doc_embeddings[~treated,:] 86 | q1_embeddings = tr_doc_embeddings[treated,:] 87 | q0_model = fit_model(q0_embeddings, out_no_treat, is_binary=True) 88 | q1_model = fit_model(q1_embeddings, out_treat, is_binary=True) 89 | g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True) 90 | 91 | pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts) 92 | propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1] 93 | expected_outcome_no_treat = q0_model.predict_proba(pred_doc_embeddings)[:,1] 94 | expected_outcome_treat = q1_model.predict_proba(pred_doc_embeddings)[:,1] 95 | 96 | out = os.path.join(outdir, str(split)) 97 | os.makedirs(out, exist_ok=True) 98 | outfile = os.path.join(out, 'predictions') 99 | np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes) 100 | 101 | if __name__ == '__main__': 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument("--dat-dir", action="store", default=None) 104 | parser.add_argument("--outdir", action="store", 
default='../out/') 105 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 106 | parser.add_argument("--mode", action="store", default="simple") 107 | parser.add_argument("--params", action="store", default="1.0") 108 | parser.add_argument("--verbose", action='store_true') 109 | parser.add_argument("--split", action='store', default=0) 110 | parser.add_argument("--num-iters", action="store", default=3000) 111 | parser.add_argument("--num-topics", action='store', default=100) 112 | parser.add_argument("--linear-outcome-model", action='store', default="t") 113 | parser.add_argument("--use-recon-loss", action='store', default="t") 114 | parser.add_argument("--use-supervised-loss", action='store', default="t") 115 | args = parser.parse_args() 116 | 117 | sim_dir = args.sim_dir 118 | outdir = args.outdir 119 | dat_dir = args.dat_dir 120 | verbose = args.verbose 121 | params = args.params 122 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0' 123 | mode = args.mode 124 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv" 125 | num_topics = args.num_topics 126 | split = int(args.split) 127 | linear_outcome_model = True if args.linear_outcome_model == "t" else False 128 | use_supervised_loss = True if args.use_supervised_loss == "t" else False 129 | use_recon_loss = True if args.use_recon_loss == "t" else False 130 | num_iters = int(args.num_iters) 131 | print(use_supervised_loss, use_recon_loss, linear_outcome_model) 132 | 133 | main() -------------------------------------------------------------------------------- /src/supervised_lda/reddit_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed 3 | from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from supervised_lda.supervised_topic_model import SupervisedTopicModel 8 | from sklearn.linear_model import LogisticRegression, Ridge 9 | from supervised_lda import run_supervised_tm 10 | from sklearn.metrics import mean_squared_error as mse 11 | import argparse 12 | import sys 13 | from scipy.special import logit 14 | from scipy import sparse 15 | 16 | def load_term_counts(reddit, path='../dat/reddit/', force_redo=False): 17 | count_filename = path + 'term_counts' 18 | vocab_filename = path + 'vocab' 19 | 20 | if os.path.exists(count_filename + '.npz') and not force_redo: 21 | return sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy') 22 | 23 | post_docs = reddit['post_text'].values 24 | counts, vocab, _ = tokenize_documents(post_docs) 25 | sparse.save_npz(count_filename, counts) 26 | np.save(vocab_filename, vocab) 27 | return counts.toarray(), np.array(vocab) 28 | 29 | def load_simulated_data(): 30 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 31 | sim_df = sim_df.rename(columns={'index':'post_index'}) 32 | return sim_df 33 | 34 | def drop_empty_posts(counts): 35 | doc_terms = counts.sum(axis=1) 36 | return doc_terms >= 5 37 | 38 | def fit_model(doc_embeddings, labels, is_binary=False): 39 | if is_binary: 40 | model = LogisticRegression(solver='liblinear') 41 | else: 42 | model = Ridge() 43 | model.fit(doc_embeddings, labels) 44 | return model 45 | 46 | def main(): 47 | if dat_dir: 48 | reddit = load_reddit_processed(path=dat_dir) 49 | 
else: 50 | reddit = load_reddit_processed() 51 | 52 | if subs: 53 | reddit = reddit[reddit.subreddit.isin(subs)] 54 | reddit = reddit.dropna(subset=['post_text']) 55 | 56 | 57 | index_mapping = make_index_mapping(reddit, on='orig_index') 58 | if not dat_dir: 59 | counts, vocab = load_term_counts(reddit) 60 | else: 61 | counts, vocab = load_term_counts(reddit, path=dat_dir) 62 | 63 | sim_df = load_simulated_data() 64 | 65 | train_df = sim_df[sim_df.split != split] 66 | predict_df = sim_df[sim_df.split == split] 67 | 68 | tr_treatment_labels = train_df.treatment.values 69 | tr_outcomes = train_df.outcome.values 70 | predict_treatment = predict_df.treatment.values 71 | predict_outcomes = predict_df.outcome.values 72 | 73 | tr_counts = filter_document_terms(train_df, counts, index_mapping) 74 | predict_counts = filter_document_terms(predict_df, counts, index_mapping) 75 | tr_valid = drop_empty_posts(tr_counts) 76 | pred_valid = drop_empty_posts(predict_counts) 77 | tr_counts = tr_counts[tr_valid, :] 78 | predict_counts = predict_counts[pred_valid, :] 79 | 80 | tr_treatment_labels = tr_treatment_labels[tr_valid] 81 | tr_outcomes = tr_outcomes[tr_valid] 82 | predict_treatment = predict_treatment[pred_valid] 83 | predict_outcomes = predict_outcomes[pred_valid] 84 | 85 | num_documents = tr_counts.shape[0] 86 | vocab_size = tr_counts.shape[1] 87 | model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model) 88 | 89 | run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss) 90 | 91 | if use_supervised_loss: 92 | propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts) 93 | else: 94 | tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts) 95 | treated = tr_treatment_labels == 1 96 | out_treat = tr_outcomes[treated] 97 | out_no_treat = tr_outcomes[~treated] 98 | q0_embeddings = tr_doc_embeddings[~treated,:] 99 | q1_embeddings = tr_doc_embeddings[treated,:] 100 | q0_model = fit_model(q0_embeddings, out_no_treat) 101 | q1_model = fit_model(q1_embeddings, out_treat) 102 | g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True) 103 | 104 | pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts) 105 | propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1] 106 | expected_outcome_no_treat = q0_model.predict(pred_doc_embeddings) 107 | expected_outcome_treat = q1_model.predict(pred_doc_embeddings) 108 | 109 | out = os.path.join(outdir, str(split)) 110 | os.makedirs(out, exist_ok=True) 111 | outfile = os.path.join(out, 'predictions') 112 | np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes) 113 | 114 | 115 | if __name__ == '__main__': 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument("--dat-dir", action="store", default=None) 118 | parser.add_argument("--outdir", action="store", default='../out/') 119 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 120 | parser.add_argument("--subs", action="store", default='13,6,8') 121 | parser.add_argument("--mode", action="store", default="simple") 122 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0") 123 | parser.add_argument("--verbose", action='store_true') 124 | parser.add_argument("--num-topics", 
action='store', default=100) 125 | parser.add_argument("--split", action='store', default=0) 126 | parser.add_argument("--num-iters", action="store", default=4000) 127 | # parser.add_argument("--num_splits", action='store', default=10) 128 | parser.add_argument("--linear-outcome-model", action='store', default="t") 129 | parser.add_argument("--use-recon-loss", action='store', default="t") 130 | parser.add_argument("--use-supervised-loss", action='store', default="t") 131 | args = parser.parse_args() 132 | 133 | sim_dir = args.sim_dir 134 | dat_dir = args.dat_dir 135 | outdir = args.outdir 136 | subs = None 137 | if args.subs != '': 138 | subs = [int(s) for s in args.subs.split(',')] 139 | verbose = args.verbose 140 | params = args.params.split(',') 141 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2] 142 | subs_string = ', '.join(args.subs.split(',')) 143 | mode = args.mode 144 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv" 145 | num_iters = int(args.num_iters) 146 | num_topics = int(args.num_topics) 147 | split = int(args.split) 148 | # num_splits = args.num_splits 149 | linear_outcome_model = True if args.linear_outcome_model == "t" else False 150 | use_supervised_loss = True if args.use_supervised_loss == "t" else False 151 | use_recon_loss = True if args.use_recon_loss == "t" else False 152 | 153 | main() -------------------------------------------------------------------------------- /src/supervised_lda/run_supervised_tm.py: -------------------------------------------------------------------------------- 1 | from torch import nn, optim 2 | from torch.nn import functional as F 3 | import torch 4 | # from torch.utils.tensorboard import SummaryWriter 5 | import numpy as np 6 | import argparse 7 | from scipy.special import expit 8 | 9 | def visualize_topics(model, vocab, num_topics, num_words=10): 10 | model.eval() 11 | with torch.no_grad(): 12 | print('#'*100) 13 | print('Visualize topics...') 14 | betas = model.alphas.t() #model.get_beta() 15 | for k in range(num_topics): 16 | beta = betas[k].detach().numpy() 17 | top_words = beta.argsort()[-num_words:] 18 | topic_words = vocab[top_words] 19 | print('Topic {}: {}'.format(k, topic_words)) 20 | 21 | def get_representation(model, docs): 22 | normalized = docs/docs.sum(axis=-1)[:,np.newaxis] 23 | normalized_bow = torch.tensor(normalized, dtype=torch.float) 24 | num_documents = docs.shape[0] 25 | model.eval() 26 | with torch.no_grad(): 27 | doc_representation,_ = model.get_theta(normalized_bow) 28 | embeddings = doc_representation.detach().numpy() 29 | return embeddings 30 | 31 | 32 | def predict(model, docs, dtype='real'): 33 | normalized = docs/docs.sum(axis=-1)[:,np.newaxis] 34 | normalized_bow = torch.tensor(normalized, dtype=torch.float) 35 | num_documents = docs.shape[0] 36 | 37 | treatment_ones = torch.ones(num_documents) 38 | treatment_zeros = torch.zeros(num_documents) 39 | 40 | model.eval() 41 | with torch.no_grad(): 42 | doc_representation,_ = model.get_theta(normalized_bow) 43 | propensity_score = model.predict_treatment(doc_representation).squeeze().detach().numpy() 44 | propensity_score = expit(propensity_score) 45 | expected_outcome_treat = model.predict_outcome_st_treat(doc_representation, treatment_ones).squeeze().detach().numpy() 46 | expected_outcome_no_treat = model.predict_outcome_st_no_treat(doc_representation, treatment_zeros).squeeze().detach().numpy() 47 | 48 | if dtype == 'binary': 49 | expected_outcome_treat = 
expit(expected_outcome_treat) 50 | expected_outcome_no_treat = expit(expected_outcome_no_treat) 51 | 52 | return propensity_score, expected_outcome_treat, expected_outcome_no_treat 53 | 54 | def train(model, docs, treatment_labels, outcomes, dtype='real', num_epochs=20000, lr=0.005, wdecay=1.2e-5,batch_size=1000, use_recon_loss=True, use_sup_loss=True): 55 | optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wdecay) 56 | num_documents = docs.shape[0] 57 | indices = np.arange(num_documents) 58 | np.random.shuffle(indices) 59 | 60 | for e_idx in range(num_epochs): 61 | model.train() 62 | k = e_idx%(num_documents//batch_size) 63 | start_index = k*batch_size 64 | end_index = (k+1)*batch_size 65 | batch = indices[start_index:end_index] 66 | docs_batch = docs[batch,:] 67 | treatment_labels_batch = treatment_labels[batch] 68 | outcomes_batch = outcomes[batch] 69 | normalized_batch = docs_batch/docs_batch.sum(axis=1)[:,np.newaxis] 70 | 71 | outcome_labels = torch.tensor(outcomes_batch, dtype=torch.float) 72 | treat_labels = torch.tensor(treatment_labels_batch, dtype=torch.float) 73 | bow = torch.tensor(docs_batch, dtype=torch.float) 74 | normalized_bow = torch.tensor(normalized_batch, dtype=torch.float) 75 | 76 | optimizer.zero_grad() 77 | model.zero_grad() 78 | 79 | recon_loss, supervised_loss, kld_theta = model(bow, normalized_bow, treat_labels, outcome_labels,dtype=dtype, use_supervised_loss=use_sup_loss) 80 | acc_kl_theta_loss = torch.sum(kld_theta).item() 81 | acc_sup_loss = 0. 82 | acc_loss = 0. 83 | 84 | total_loss = kld_theta #+ recon_loss + supervised_loss 85 | if use_recon_loss: 86 | acc_loss = torch.sum(recon_loss).item() 87 | total_loss += 0.1*recon_loss 88 | if use_sup_loss: 89 | acc_sup_loss = torch.sum(supervised_loss).item() 90 | total_loss += supervised_loss 91 | 92 | total_loss.backward() 93 | optimizer.step() 94 | 95 | print("Acc. 
loss:", acc_loss, "KL loss.:", acc_kl_theta_loss, "Supervised loss:", acc_sup_loss) -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH -A sml 3 | #SBATCH -c 8 4 | #SBATCH --mail-user=dhanya.sridhar@columbia.edu 5 | #SBATCH --mail-type=ALL 6 | 7 | source activate py3.6 8 | 9 | python -m supervised_lda.peerread_output_att \ 10 | --dat-dir=${DIR} \ 11 | --mode=${MODE} \ 12 | --params=${BETA1} \ 13 | --sim-dir=${SIMDIR} \ 14 | --outdir=${OUT}/${BETA1} \ 15 | --split=${SPLIT} \ 16 | --linear-outcome-model=${LINOUTCOME} \ 17 | --use-recon-loss=${RECONLOSS} \ 18 | --use-supervised-loss=${SUPLOSS} \ -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_no_sup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=t 9 | export RECONLOSS=t 10 | export SUPLOSS=f 11 | 12 | declare -a BETA1S=(5.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/no_sup/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_no_unsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=t 9 | export RECONLOSS=f 10 | export SUPLOSS=t 11 | 12 | declare -a BETA1S=(1.0 5.0 25.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/no_unsup/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_nonlinear.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=f 9 | 
export RECONLOSS=t 10 | export SUPLOSS=t 11 | 12 | declare -a BETA1S=(1.0 5.0 25.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/non_linear/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_peerread_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=t 9 | export RECONLOSS=t 10 | export SUPLOSS=t 11 | 12 | declare -a BETA1S=(1.0 5.0 25.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/base_model/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH -A sml 3 | #SBATCH -c 8 4 | #SBATCH --mail-user=dhanya.sridhar@columbia.edu 5 | #SBATCH --mail-type=ALL 6 | 7 | source activate py3.6 8 | 9 | python -m supervised_lda.reddit_output_att \ 10 | --dat-dir=${DIR} \ 11 | --mode=${MODE} \ 12 | --subs=${SUBS} \ 13 | --params=${BETA0},${BETA1},${GAMMA} \ 14 | --sim-dir=${SIMDIR} \ 15 | --outdir=${OUT}/beta0${BETA0}.beta1${BETA1}.gamma${GAMMA} \ 16 | --split=${SPLIT} \ 17 | --linear-outcome-model=${LINOUTCOME} \ 18 | --use-recon-loss=${RECONLOSS} \ 19 | --use-supervised-loss=${SUPLOSS} \ 20 | 21 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_no_sup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=t 10 | export RECONLOSS=t 11 | export SUPLOSS=f 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(10.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/no_sup/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | 
supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_no_unsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=t 10 | export RECONLOSS=f 11 | export SUPLOSS=t 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0 10.0 100.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/no_unsup/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | 28 | done 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_nonlinear.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=f 10 | export RECONLOSS=t 11 | export SUPLOSS=t 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0 10.0 100.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/non_linear/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_reddit_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=t 10 | export RECONLOSS=t 11 | export SUPLOSS=t 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0 10.0 100.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export 
OUT=${BASE_OUT}/base_model/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_reddit_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=True 10 | export RECONLOSS=True 11 | export SUPLOSS=True 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0) 15 | declare -a GAMMAS=(1.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 1); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/base_model/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/supervised_topic_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import math 5 | 6 | from torch import nn 7 | 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | 10 | class SupervisedTopicModel(nn.Module): 11 | def __init__(self, num_topics, vocab_size, num_documents, t_hidden_size=800, theta_act='relu', enc_drop=0., outcome_linear_map=True): 12 | super(SupervisedTopicModel, self).__init__() 13 | 14 | ## define hyperparameters 15 | self.num_topics = num_topics 16 | self.vocab_size = vocab_size 17 | self.num_documents = num_documents 18 | self.t_hidden_size = t_hidden_size 19 | self.enc_drop = enc_drop 20 | self.t_drop = nn.Dropout(enc_drop) 21 | self.theta_act = self.get_activation(theta_act) 22 | self.outcome_linear_map = outcome_linear_map 23 | 24 | ## define the matrix containing the topic embeddings 25 | self.alphas = nn.Parameter(torch.randn(vocab_size, num_topics)) 26 | 27 | if self.outcome_linear_map: 28 | ## define linear regression weights for predicting expected outcomes for treated 29 | self.w_expected_outcome_treated = nn.Linear(num_topics, 1) 30 | 31 | ## define linear regression weights for predicting expected outcomes for untreated 32 | self.w_expected_outcome_untreated = nn.Linear(num_topics, 1) 33 | else: 34 | self.f_outcome_treated = nn.Sequential( 35 | nn.Linear(num_topics, t_hidden_size), 36 | self.theta_act, 37 | # nn.BatchNorm1d(t_hidden_size), 38 | nn.Linear(t_hidden_size, t_hidden_size), 39 | self.theta_act, 40 | # nn.BatchNorm1d(t_hidden_size), 41 | nn.Linear(t_hidden_size,1) 42 | ) 43 | self.f_outcome_untreated = nn.Sequential( 44 | nn.Linear(num_topics, t_hidden_size), 45 | self.theta_act, 46 | # nn.BatchNorm1d(t_hidden_size), 47 | nn.Linear(t_hidden_size, t_hidden_size), 48 | self.theta_act, 49 | # 
nn.BatchNorm1d(t_hidden_size), 50 | nn.Linear(t_hidden_size,1) 51 | ) 52 | ## define linear regression weights for predicting binary treatment label 53 | self.w_treatment = nn.Linear(num_topics,1) 54 | 55 | self.q_theta = nn.Sequential( 56 | nn.Linear(vocab_size, t_hidden_size), 57 | self.theta_act, 58 | nn.BatchNorm1d(t_hidden_size), 59 | nn.Linear(t_hidden_size, t_hidden_size), 60 | self.theta_act, 61 | nn.BatchNorm1d(t_hidden_size) 62 | ) 63 | self.mu_q_theta = nn.Linear(t_hidden_size, num_topics) 64 | self.logsigma_q_theta = nn.Linear(t_hidden_size, num_topics) 65 | 66 | def get_activation(self, act): 67 | if act == 'tanh': 68 | act = nn.Tanh() 69 | elif act == 'relu': 70 | act = nn.ReLU() 71 | elif act == 'softplus': 72 | act = nn.Softplus() 73 | elif act == 'rrelu': 74 | act = nn.RReLU() 75 | elif act == 'leakyrelu': 76 | act = nn.LeakyReLU() 77 | elif act == 'elu': 78 | act = nn.ELU() 79 | elif act == 'selu': 80 | act = nn.SELU() 81 | elif act == 'glu': 82 | act = nn.GLU() 83 | else: 84 | print('Defaulting to tanh activations...') 85 | act = nn.Tanh() 86 | return act 87 | 88 | def reparameterize(self, mu, logvar): 89 | """Returns a sample from a Gaussian distribution via reparameterization. 90 | """ 91 | if self.training: 92 | std = torch.exp(0.5 * logvar) 93 | eps = torch.randn_like(std) 94 | return eps.mul_(std).add_(mu) 95 | else: 96 | return mu 97 | 98 | def encode(self, bows): 99 | """Returns paramters of the variational distribution for \theta. 100 | 101 | input: bows 102 | batch of bag-of-words...tensor of shape bsz x V 103 | output: mu_theta, log_sigma_theta 104 | """ 105 | q_theta = self.q_theta(bows) 106 | if self.enc_drop > 0: 107 | q_theta = self.t_drop(q_theta) 108 | mu_theta = self.mu_q_theta(q_theta) 109 | logsigma_theta = self.logsigma_q_theta(q_theta) 110 | kl_theta = -0.5 * torch.sum(1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1).mean() 111 | return mu_theta, logsigma_theta, kl_theta 112 | 113 | def get_beta(self): 114 | beta = F.softmax(self.alphas, dim=0).transpose(1, 0) ## softmax over vocab dimension 115 | return beta 116 | 117 | def get_theta(self, normalized_bows): 118 | mu_theta, logsigma_theta, kld_theta = self.encode(normalized_bows) 119 | z = self.reparameterize(mu_theta, logsigma_theta) 120 | theta = F.softmax(z, dim=-1) 121 | return theta, kld_theta 122 | 123 | def decode(self, theta, beta): 124 | res = torch.mm(theta, beta) 125 | preds = torch.log(res+1e-6) 126 | return preds 127 | 128 | def predict_treatment(self, theta): 129 | logits = self.w_treatment(theta) 130 | return logits 131 | 132 | def predict_outcome_st_treat(self, theta, treatment_labels): 133 | treated_indices = [treatment_labels == 1] 134 | theta_treated = theta[treated_indices] 135 | 136 | if not self.outcome_linear_map: 137 | expected_outcome_treated = self.f_outcome_treated(theta_treated) 138 | else: 139 | expected_outcome_treated = self.w_expected_outcome_treated(theta_treated) 140 | 141 | return expected_outcome_treated 142 | 143 | def predict_outcome_st_no_treat(self, theta, treatment_labels): 144 | untreated_indices = [treatment_labels == 0] 145 | theta_untreated = theta[untreated_indices] 146 | 147 | if not self.outcome_linear_map: 148 | expected_outcome_untreated = self.f_outcome_untreated(theta_untreated) 149 | else: 150 | expected_outcome_untreated = self.w_expected_outcome_untreated(theta_untreated) 151 | 152 | return expected_outcome_untreated 153 | 154 | 155 | def forward(self, bows, normalized_bows, treatment_labels, outcomes, dtype='real', 
use_supervised_loss=True): 156 | ## get \theta 157 | theta, kld_theta = self.get_theta(normalized_bows) 158 | beta = self.get_beta() 159 | 160 | bce_loss = nn.BCEWithLogitsLoss() 161 | mse_loss = nn.MSELoss() 162 | 163 | ## get reconstruction loss 164 | preds = self.decode(theta, beta) 165 | recon_loss = -(preds * bows).sum(1) 166 | recon_loss = recon_loss.mean() 167 | 168 | supervised_loss=None 169 | if use_supervised_loss: 170 | 171 | #get treatment loss 172 | treatment_logits = self.predict_treatment(theta).squeeze() 173 | treatment_loss = bce_loss(treatment_logits, treatment_labels) 174 | 175 | #get expected outcome loss 176 | treated = [treatment_labels == 1] 177 | untreated = [treatment_labels == 0] 178 | outcomes_treated = outcomes[treated] 179 | outcomes_untreated = outcomes[untreated] 180 | expected_treated = self.predict_outcome_st_treat(theta, treatment_labels).squeeze() 181 | expected_untreated = self.predict_outcome_st_no_treat(theta, treatment_labels).squeeze() 182 | 183 | if dtype == 'real': 184 | outcome_loss_treated = mse_loss(expected_treated,outcomes_treated) 185 | outcome_loss_untreated = mse_loss(expected_untreated,outcomes_untreated) 186 | else: 187 | outcome_loss_treated = bce_loss(expected_treated,outcomes_treated) 188 | outcome_loss_untreated = bce_loss(expected_untreated,outcomes_untreated) 189 | 190 | supervised_loss = treatment_loss + outcome_loss_treated + outcome_loss_untreated 191 | 192 | return recon_loss, supervised_loss, kld_theta 193 | 194 | -------------------------------------------------------------------------------- /src/words_baseline/helpers.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | 9 | class LemmaTokenizer(object): 10 | def __init__(self): 11 | self.wnl = WordNetLemmatizer() 12 | def __call__(self, articles): 13 | stop = stopwords.words('english') 14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop] 15 | 16 | def filter_by_subreddit(reddit, subs=None): 17 | if not subs: 18 | return reddit.index.values 19 | else: 20 | return reddit[reddit.subreddit.isin(subs)].index.values 21 | 22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.001): 23 | from nltk.corpus import stopwords 24 | ''' 25 | From a list of documents raw text build a matrix DxV 26 | D: number of docs 27 | V: size of the vocabulary, i.e.
number of unique terms found in the whole set of docs 28 | ''' 29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0) 30 | corpus = count_vect.fit_transform(documents) 31 | vocabulary = count_vect.get_feature_names() 32 | 33 | return corpus,vocabulary,count_vect 34 | 35 | def assign_dev_split(num_docs, percentage=0.05): 36 | indices = np.arange(num_docs) 37 | np.random.shuffle(indices) 38 | size = int(indices.shape[0]*percentage) 39 | dev = indices[:size] 40 | return dev 41 | 42 | def learn_topics(X, X_dev, K=50): 43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1) 44 | print("Fitting", K, "topics...") 45 | lda.fit(X) 46 | score = lda.perplexity(X_dev) 47 | print("Log likelihood:", score) 48 | topics = lda.components_ 49 | return score, lda, topics 50 | 51 | def show_topics(vocab, topics, n_words=20): 52 | topic_keywords = [] 53 | for topic_weights in topics: 54 | top_keyword_locs = (-topic_weights).argsort()[:n_words] 55 | topic_keywords.append(vocab.take(top_keyword_locs)) 56 | 57 | df_topic_keywords = pd.DataFrame(topic_keywords) 58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])] 59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])] 60 | return df_topic_keywords 61 | 62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'): 63 | filtered_indices = filtered_df[on].values 64 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 65 | embeddings = doc_embeddings[doc_idx, :] 66 | return embeddings 67 | 68 | def make_index_mapping(df, on='post_index', convert_to_int=True): 69 | if on=='index': 70 | indices = df.index.values 71 | else: 72 | indices = df[on].values 73 | 74 | if convert_to_int: 75 | return {int(ind):i for (i,ind) in enumerate(indices)} 76 | 77 | return {ind:i for (i,ind) in enumerate(indices)} 78 | 79 | def assign_split(df, num_splits=10, col_to_add='split'): 80 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 81 | return df 82 | -------------------------------------------------------------------------------- /src/words_baseline/peerread_output_ate.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.ate import psi_q_only,psi_tmle_cont_outcome 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | from sklearn.linear_model import LogisticRegression, LinearRegression 6 | from sklearn.metrics import mean_squared_error as mse 7 | import argparse 8 | import sys 9 | from scipy.special import logit 10 | from scipy.sparse import load_npz 11 | 12 | def compute_ground_truth_treatment_effect(df): 13 | y1 = df['y1'] 14 | y0 = df['y0'] 15 | return y1.mean() - y0.mean() 16 | 17 | def get_log_outcomes(outcomes): 18 | #relu 19 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 20 | return np.log(outcomes) 21 | 22 | def predict_expected_outcomes(model, features): 23 | return model.predict_proba(features)[:,1] 24 | 25 | def fit_conditional_expected_outcomes(outcomes, features): 26 | model = LogisticRegression(solver='liblinear') 27 | model.fit(features, outcomes) 28 | if verbose: 29 | print("Training accuracy:", model.score(features, outcomes)) 30 | return model 31 | 32 | def predict_treatment_probability(labels, features): 33 | model = LogisticRegression(solver='liblinear') 34 | model.fit(features, labels) 35 | if verbose: 36 | print("Training accuracy:", model.score(features, 
labels)) 37 | treatment_probability = model.predict_proba(features)[:,1] 38 | return treatment_probability 39 | 40 | def load_simulated_data(): 41 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 42 | sim_df = sim_df.rename(columns={'index':'post_index'}) 43 | return sim_df 44 | 45 | def load_term_counts(path='../dat/reddit/'): 46 | return load_npz(path + 'term_counts.npz').toarray() 47 | 48 | def main(): 49 | if not dat_dir: 50 | term_counts = load_term_counts() 51 | else: 52 | term_counts = load_term_counts(path=dat_dir) 53 | 54 | sim_df = load_simulated_data() 55 | treatment_labels = sim_df.treatment.values 56 | indices = sim_df.post_index.values 57 | all_words = term_counts[indices, :] 58 | 59 | treated_sim = sim_df[sim_df.treatment==1] 60 | untreated_sim = sim_df[sim_df.treatment==0] 61 | treated_indices = treated_sim.post_index.values 62 | untreated_indices = untreated_sim.post_index.values 63 | 64 | all_outcomes = sim_df.outcome.values 65 | outcomes_st_treated = treated_sim.outcome.values 66 | outcomes_st_not_treated = untreated_sim.outcome.values 67 | 68 | words_st_treated = term_counts[treated_indices,:] 69 | words_st_not_treated = term_counts[untreated_indices,:] 70 | 71 | treatment_probability = predict_treatment_probability(treatment_labels, all_words) 72 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, words_st_treated) 73 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, words_st_not_treated) 74 | 75 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, all_words) 76 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, all_words) 77 | 78 | q_hat = psi_q_only(expected_outcome_st_not_treated, expected_outcome_st_treated, 79 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03) 80 | 81 | tmle = psi_tmle_cont_outcome(expected_outcome_st_not_treated, expected_outcome_st_treated, 82 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03)[0] 83 | 84 | print("Q hat:", q_hat) 85 | print("TMLE:", tmle) 86 | 87 | 88 | if __name__ == '__main__': 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("--dat-dir", action="store", default=None) 91 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 92 | parser.add_argument("--mode", action="store", default="simple") 93 | parser.add_argument("--params", action="store", default="1.0") 94 | parser.add_argument("--verbose", action='store_true') 95 | args = parser.parse_args() 96 | 97 | sim_dir = args.sim_dir 98 | dat_dir = args.dat_dir 99 | verbose = args.verbose 100 | params = args.params 101 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0' 102 | mode = args.mode 103 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv" 104 | 105 | main() -------------------------------------------------------------------------------- /src/words_baseline/reddit_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates, psi_plugin, psi_q_only 2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed 3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge 8 | from sklearn.metrics import 
mean_squared_error as mse 9 | import argparse 10 | import sys 11 | from scipy.special import logit 12 | from scipy.sparse import load_npz 13 | 14 | def get_log_outcomes(outcomes): 15 | #relu 16 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 17 | return np.log(outcomes) 18 | 19 | def predict_expected_outcomes(model, features): 20 | return model.predict(features) 21 | 22 | def fit_conditional_expected_outcomes(outcomes, features): 23 | model = Ridge() 24 | model.fit(features, outcomes) 25 | predict = model.predict(features) 26 | if verbose: 27 | print("Training MSE:", mse(outcomes, predict)) 28 | return model 29 | 30 | def predict_treatment_probability(labels, features): 31 | model = LogisticRegression(solver='liblinear') 32 | model.fit(features, labels) 33 | if verbose: 34 | print("Training accuracy:", model.score(features, labels)) 35 | treatment_probability = model.predict_proba(features)[:,1] 36 | return treatment_probability 37 | 38 | def load_simulated_data(): 39 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 40 | sim_df = sim_df.rename(columns={'index':'post_index'}) 41 | return sim_df 42 | 43 | def load_term_counts(path='../dat/reddit/'): 44 | return load_npz(path + 'term_counts.npz').toarray() 45 | 46 | def main(): 47 | 48 | if not dat_dir: 49 | term_counts = load_term_counts() 50 | else: 51 | term_counts = load_term_counts(path=dat_dir) 52 | 53 | sim_df = load_simulated_data() 54 | treatment_labels = sim_df.treatment.values 55 | indices = sim_df.post_index.values 56 | all_words = term_counts[indices, :] 57 | 58 | treated_sim = sim_df[sim_df.treatment==1] 59 | untreated_sim = sim_df[sim_df.treatment==0] 60 | treated_indices = treated_sim.post_index.values 61 | untreated_indices = untreated_sim.post_index.values 62 | 63 | all_outcomes = sim_df.outcome.values 64 | outcomes_st_treated = treated_sim.outcome.values 65 | outcomes_st_not_treated = untreated_sim.outcome.values 66 | 67 | words_st_treated = term_counts[treated_indices,:] 68 | words_st_not_treated = term_counts[untreated_indices,:] 69 | 70 | treatment_probability = predict_treatment_probability(treatment_labels, all_words) 71 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, words_st_treated) 72 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, words_st_not_treated) 73 | 74 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, all_words) 75 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, all_words) 76 | 77 | q_hat = psi_q_only(expected_outcome_st_not_treated, expected_outcome_st_treated, 78 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean()) 79 | 80 | tmle = psi_plugin(expected_outcome_st_not_treated, expected_outcome_st_treated, 81 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean()) 82 | 83 | print("Q hat:", q_hat) 84 | print("TMLE:", tmle) 85 | 86 | if __name__ == '__main__': 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("--dat-dir", action="store", default=None) 89 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 90 | parser.add_argument("--subs", action="store", default='13,6,8') 91 | parser.add_argument("--mode", action="store", default="simple") 92 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0") 93 | parser.add_argument("--verbose", 
action='store_true') 94 | args = parser.parse_args() 95 | 96 | sim_dir = args.sim_dir 97 | dat_dir = args.dat_dir 98 | subs = None 99 | if args.subs != '': 100 | subs = [int(s) for s in args.subs.split(',')] 101 | verbose = args.verbose 102 | params = args.params.split(',') 103 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2] 104 | subs_string = ', '.join(args.subs.split(',')) 105 | mode = args.mode 106 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv" 107 | 108 | main() -------------------------------------------------------------------------------- /src/words_baseline/scripts/sweep_over_sims.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #NUM_SEED=2 3 | #SEEDS=$(seq 0 $NUM_SEED) 4 | rm ../dat/reddit/sim/reddit_subreddit_based/two-stage-lda-estimates.out 5 | export SUBREDDITS=13,6,8 6 | export BETA0=1.0 7 | declare -a SIMMODES=('simple') 8 | declare -a BETA1S=(1.0 10.0 100.0) 9 | declare -a GAMMAS=(1.0 4.0) 10 | 11 | for SIMMODEj in "${SIMMODES[@]}"; do 12 | for BETA1j in "${BETA1S[@]}"; do 13 | for GAMMAj in "${GAMMAS[@]}"; do 14 | python -m lda_baseline.reddit_output_att \ 15 | --subs=${SUBREDDITS} \ 16 | --mode=${SIMMODEj} \ 17 | --params=${BETA0},${BETA1j},${GAMMAj} 18 | done 19 | done 20 | done --------------------------------------------------------------------------------
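
Note on usage: the ATT estimators in src/semi_parametric_estimation/att.py are consumed downstream by passing fitted nuisance estimates (q_t0, q_t1, g) together with the observed treatments and outcomes, exactly as in supervised_lda/compute_estimates.py. A minimal sketch of that call, assuming it is run from src/ so the package is importable (as the repository's own scripts do), and with synthetic oracle nuisance values standing in for the fitted models:

# sketch only: synthetic data with a known constant treatment effect of 1.0
import numpy as np
from scipy.special import expit
from semi_parametric_estimation.att import att_estimates

rng = np.random.RandomState(0)
n = 5000
x = rng.normal(size=n)                               # single synthetic confounder
g_true = expit(0.8 * x)                              # true propensity score
t = rng.binomial(1, g_true)                          # treatment assignment
y = x + 1.0 * t + rng.normal(scale=0.5, size=n)      # outcome with effect 1.0

# oracle nuisance values stand in for the fitted outcome and propensity models
q_t0 = x
q_t1 = x + 1.0

estimates = att_estimates(q_t0, q_t1, g_true, t, y, prob_t=t.mean(), truncate_level=0.03)
for name, att in estimates.items():
    print(name, np.round(att, 3))

This mirrors the call pattern in compute_estimates.py, which loads q0, q1, g, t, y from predictions.npz and passes t.mean() as prob_t.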
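
The supervised topic model pipeline follows the same pattern in both supervised_lda/*_output_att.py scripts: build a document-term count matrix, train SupervisedTopicModel with run_supervised_tm.train, then read off propensity and conditional-outcome estimates with run_supervised_tm.predict. A minimal sketch on made-up count data (sizes and data are hypothetical; batch_size is set below the number of documents because run_supervised_tm.train draws fixed-size batches), again assuming it is run from src/ with the package versions the repository targets:

# sketch only: synthetic counts in place of the PeerRead/Reddit term matrices
import numpy as np
from supervised_lda.supervised_topic_model import SupervisedTopicModel
from supervised_lda import run_supervised_tm

rng = np.random.RandomState(0)
n_docs, vocab_size, n_topics = 500, 100, 5
docs = rng.multinomial(30, np.ones(vocab_size) / vocab_size, size=n_docs)  # 30 tokens per doc, no empty rows
t = rng.binomial(1, 0.5, size=n_docs).astype(np.float32)                   # treatment labels
y = rng.normal(loc=t).astype(np.float32)                                   # real-valued outcome

model = SupervisedTopicModel(n_topics, vocab_size, n_docs, outcome_linear_map=True)
run_supervised_tm.train(model, docs, t, y, dtype='real', num_epochs=50,
                        batch_size=100, use_recon_loss=True, use_sup_loss=True)

g, q1, q0 = run_supervised_tm.predict(model, docs, dtype='real')
print("g:", g[:3], "q1:", q1[:3], "q0:", q0[:3])

These g, q0, q1 arrays are what the *_output_att.py scripts save to predictions.npz and what compute_estimates.py then feeds to att_estimates.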
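
The shared text helpers (tokenize_documents, learn_topics, show_topics) are used both to build the cached term_counts.npz matrices and by the LDA baselines. A small sketch on a toy corpus, assuming the NLTK punkt, wordnet and stopwords corpora are installed and a scikit-learn version that still provides CountVectorizer.get_feature_names (as the code above expects); the documents and parameter values here are made up:

# sketch only: toy corpus instead of PeerRead abstracts or Reddit posts
import numpy as np
from supervised_lda.helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics

docs = [
    "the cat sat on the mat while the other cat slept",
    "deep learning models require large amounts of training data",
    "the dog chased the cat around the garden",
    "neural networks learn useful representations from raw text data",
]

# keep every term for this toy corpus; the real pipeline uses the defaults
counts, vocab, _ = tokenize_documents(docs, max_df0=1.0, min_df0=0.0)
dev_idx = assign_dev_split(counts.shape[0], percentage=0.25)
score, lda, topics = learn_topics(counts, counts[dev_idx], K=2)
print(show_topics(np.array(vocab), topics, n_words=5))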