├── .gitignore
├── LICENSE
├── README.md
├── dat
│   ├── PeerRead
│   │   └── proc
│   │       └── arxiv-all.tf_record
│   └── reddit
│       └── README.md
└── src
    ├── .gitignore
    ├── .idea
    │   ├── encodings.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── src.iml
    │   └── vcs.xml
    ├── PeerRead
    │   ├── ScienceParse
    │   │   ├── Paper.py
    │   │   ├── README.md
    │   │   ├── Review.py
    │   │   ├── ScienceParse.py
    │   │   ├── ScienceParseReader.py
    │   │   └── __init__.py
    │   ├── __init__.py
    │   ├── data_cleaning
    │   │   ├── PeerRead_hand_features.py
    │   │   ├── __init__.py
    │   │   ├── clean_PeerRead.py
    │   │   ├── extra_vocab.py
    │   │   ├── process_PeerRead_abstracts.py
    │   │   └── scripts
    │   │       ├── clean_PeerRead.sh
    │   │       ├── clean_nips_prefix.sh
    │   │       └── merge_train_dev_test.sh
    │   ├── dataset
    │   │   ├── __init__.py
    │   │   ├── array_from_dataset.py
    │   │   ├── dataset.py
    │   │   └── sentence_masking.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── bert_multiclass.py
    │   │   ├── run_causal_bert.py
    │   │   └── run_multiclass.py
    │   └── submit_scripts
    │       ├── run_model.sh
    │       └── run_unsupervised.sh
    ├── __init__.py
    ├── bert
    │   ├── README
    │   ├── __init__.py
    │   ├── create_pretraining_data.py
    │   ├── modeling.py
    │   ├── optimization.py
    │   └── tokenization.py
    ├── causal_bert
    │   ├── __init__.py
    │   ├── bert_predictors.py
    │   ├── bert_unsupervised.py
    │   └── logging.py
    ├── data_cleaning
    │   └── reddit_posts.py
    ├── lda_baseline
    │   ├── helpers.py
    │   ├── peerread_fit_topics.py
    │   ├── peerread_get_abstracts.py
    │   ├── peerread_output_att.py
    │   ├── reddit_fit_topics.py
    │   ├── reddit_output_att.py
    │   └── scripts
    │       └── sweep_over_sims.sh
    ├── model_checking
    │   ├── plot_adjustment.py
    │   └── plot_treatment_model.ipynb
    ├── reddit
    │   ├── __init__.py
    │   ├── data_cleaning
    │   │   ├── BigQuery_get_data
    │   │   ├── __init__.py
    │   │   ├── process_reddit.py
    │   │   ├── reddit_gender_sentiment.ipynb
    │   │   └── reddit_posts.py
    │   ├── dataset
    │   │   ├── __init__.py
    │   │   ├── array_from_dataset.py
    │   │   ├── dataset.py
    │   │   └── sentence_masking.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── run_causal_bert.py
    │   │   ├── run_subreddit_classifier.py
    │   │   ├── run_unsupervised_pretraining.py
    │   │   └── subreddit_predictors.py
    │   └── submit_scripts
    │       ├── run_model.sh
    │       └── run_unsupervised.sh
    ├── result_processing
    │   ├── compute_ate.py
    │   ├── compute_att.py
    │   ├── helpers.py
    │   ├── process_predictions.py
    │   ├── prop_sim_plotting.py
    │   └── test_cond_indep.py
    ├── semi_parametric_estimation
    │   ├── __init__.py
    │   ├── ate.py
    │   ├── att.py
    │   └── helpers.py
    ├── supervised_lda
    │   ├── add_split_to_simulations.ipynb
    │   ├── compute_estimates.py
    │   ├── helpers.py
    │   ├── peerread_output_att.py
    │   ├── reddit_output_att.py
    │   ├── run_supervised_tm.py
    │   ├── submit_scripts
    │   │   ├── peerread-exps
    │   │   │   ├── run_peerread_simulation.sh
    │   │   │   ├── submit_no_sup.sh
    │   │   │   ├── submit_no_unsup.sh
    │   │   │   ├── submit_nonlinear.sh
    │   │   │   └── submit_peerread_simulation.sh
    │   │   └── reddit-exps
    │   │       ├── run_reddit_simulation.sh
    │   │       ├── submit_no_sup.sh
    │   │       ├── submit_no_unsup.sh
    │   │       ├── submit_nonlinear.sh
    │   │       ├── submit_reddit_simulation.sh
    │   │       └── submit_reddit_test.sh
    │   ├── supervised_topic_model.py
    │   └── test_slda.ipynb
    └── words_baseline
        ├── helpers.py
        ├── peerread_output_ate.py
        ├── reddit_output_att.py
        └── scripts
            └── sweep_over_sims.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | logdir/**
2 | **/tmp/**
3 | output/**
4 | dat/**
5 | dat/gender-text-corpus
6 | .DS_Store
7 | **/.DS_Store
8 | **/*.pyc
9 | **/*.pyo
10 | *checkpoint*
11 | *aux
12 | *log
13 | *.out
14 | *.synct*
15 | *__pycache__*
16 |
17 | #################################
18 | # Victor's standard gitignore
19 | # mostly python and tex
20 | #################################
21 |
22 | # Byte-compiled / optimized / DLL files
23 | __pycache__/
24 | *.py[cod]
25 | *$py.class
26 |
27 | # C extensions
28 | *.so
29 |
30 | # Distribution / packaging
31 | .Python
32 | build/
33 | develop-eggs/
34 | dist/
35 | downloads/
36 | eggs/
37 | .eggs/
38 | lib/
39 | lib64/
40 | parts/
41 | sdist/
42 | var/
43 | wheels/
44 | *.egg-info/
45 | .installed.cfg
46 | *.egg
47 | MANIFEST
48 |
49 | # PyInstaller
50 | # Usually these files are written by a python script from a template
51 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
52 | *.manifest
53 | *.spec
54 |
55 | # Installer logs
56 | pip-log.txt
57 | pip-delete-this-directory.txt
58 |
59 | # Unit test / coverage reports
60 | htmlcov/
61 | .tox/
62 | .coverage
63 | .coverage.*
64 | .cache
65 | nosetests.xml
66 | coverage.xml
67 | *.cover
68 | .hypothesis/
69 | .pytest_cache/
70 |
71 | # Translations
72 | *.mo
73 | *.pot
74 |
75 | # Django stuff:
76 | *.log
77 | local_settings.py
78 | db.sqlite3
79 |
80 | # Flask stuff:
81 | instance/
82 | .webassets-cache
83 |
84 | # Scrapy stuff:
85 | .scrapy
86 |
87 | # Sphinx documentation
88 | docs/_build/
89 |
90 | # PyBuilder
91 | target/
92 |
93 | # Jupyter Notebook
94 | .ipynb_checkpoints
95 |
96 | # pyenv
97 | .python-version
98 |
99 | # celery beat schedule file
100 | celerybeat-schedule
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 |
127 | # JetBrains (PyCharm) stuff
128 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
129 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
130 |
131 | # User-specific stuff
132 | .idea/**/workspace.xml
133 | .idea/**/tasks.xml
134 | .idea/**/usage.statistics.xml
135 | .idea/**/dictionaries
136 | .idea/**/shelf
137 |
138 | # Generated files
139 | .idea/**/contentModel.xml
140 |
141 | # Sensitive or high-churn files
142 | .idea/**/dataSources/
143 | .idea/**/dataSources.ids
144 | .idea/**/dataSources.local.xml
145 | .idea/**/sqlDataSources.xml
146 | .idea/**/dynamic.xml
147 | .idea/**/uiDesigner.xml
148 | .idea/**/dbnavigator.xml
149 |
150 | # Gradle
151 | .idea/**/gradle.xml
152 | .idea/**/libraries
153 |
154 | # Gradle and Maven with auto-import
155 | # When using Gradle or Maven with auto-import, you should exclude module files,
156 | # since they will be recreated, and may cause churn. Uncomment if using
157 | # auto-import.
158 | # .idea/modules.xml
159 | # .idea/*.iml
160 | # .idea/modules
161 |
162 | # CMake
163 | cmake-build-*/
164 |
165 | # Mongo Explorer plugin
166 | .idea/**/mongoSettings.xml
167 |
168 | # File-based project format
169 | *.iws
170 |
171 | # IntelliJ
172 | out/
173 |
174 | # mpeltonen/sbt-idea plugin
175 | .idea_modules/
176 |
177 | # JIRA plugin
178 | atlassian-ide-plugin.xml
179 |
180 | # Cursive Clojure plugin
181 | .idea/replstate.xml
182 |
183 | # Crashlytics plugin (for Android Studio and IntelliJ)
184 | com_crashlytics_export_strings.xml
185 | crashlytics.properties
186 | crashlytics-build.properties
187 | fabric.properties
188 |
189 | # Editor-based Rest Client
190 | .idea/httpRequests
191 |
192 | # Android studio 3.1+ serialized cache file
193 | .idea/caches/build_file_checksums.ser
194 |
195 | # text
196 | *.pdf
197 |
198 | # linux backup files
199 | *~
200 | *#
201 |
202 | ## Core latex/pdflatex auxiliary files:
203 | *.aux
204 | *.lof
205 | *.log
206 | *.lot
207 | *.fls
208 | *.out
209 | *.toc
210 | *.fmt
211 | *.fot
212 | *.cb
213 | *.cb2
214 |
215 | ## Intermediate documents:
216 | *.dvi
217 | *-converted-to.*
218 | # these rules might exclude image files for figures etc.
219 | # *.ps
220 | # *.eps
221 | # *.pdf
222 |
223 | ## Generated if empty string is given at "Please type another file name for output:"
224 | .pdf
225 |
226 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
227 | *.bbl
228 | *.bcf
229 | *.blg
230 | *-blx.aux
231 | *-blx.bib
232 | *.run.xml
233 |
234 | ## Build tool auxiliary files:
235 | *.fdb_latexmk
236 | *.synctex
237 | *.synctex(busy)
238 | *.synctex.gz
239 | *.synctex.gz(busy)
240 | *.pdfsync
241 |
242 | ## Auxiliary and intermediate files from other packages:
243 | # algorithms
244 | *.alg
245 | *.loa
246 |
247 | # achemso
248 | acs-*.bib
249 |
250 | # amsthm
251 | *.thm
252 |
253 | # beamer
254 | *.nav
255 | *.pre
256 | *.snm
257 | *.vrb
258 |
259 | # changes
260 | *.soc
261 |
262 | # cprotect
263 | *.cpt
264 |
265 | # elsarticle (documentclass of Elsevier journals)
266 | *.spl
267 |
268 | # endnotes
269 | *.ent
270 |
271 | # fixme
272 | *.lox
273 |
274 | # feynmf/feynmp
275 | *.mf
276 | *.mp
277 | *.t[1-9]
278 | *.t[1-9][0-9]
279 | *.tfm
280 |
281 | #(r)(e)ledmac/(r)(e)ledpar
282 | *.end
283 | *.?end
284 | *.[1-9]
285 | *.[1-9][0-9]
286 | *.[1-9][0-9][0-9]
287 | *.[1-9]R
288 | *.[1-9][0-9]R
289 | *.[1-9][0-9][0-9]R
290 | *.eledsec[1-9]
291 | *.eledsec[1-9]R
292 | *.eledsec[1-9][0-9]
293 | *.eledsec[1-9][0-9]R
294 | *.eledsec[1-9][0-9][0-9]
295 | *.eledsec[1-9][0-9][0-9]R
296 |
297 | # glossaries
298 | *.acn
299 | *.acr
300 | *.glg
301 | *.glo
302 | *.gls
303 | *.glsdefs
304 |
305 | # gnuplottex
306 | *-gnuplottex-*
307 |
308 | # gregoriotex
309 | *.gaux
310 | *.gtex
311 |
312 | # hyperref
313 | *.brf
314 |
315 | # knitr
316 | *-concordance.tex
317 | # TODO Comment the next line if you want to keep your tikz graphics files
318 | *.tikz
319 | *-tikzDictionary
320 |
321 | # listings
322 | *.lol
323 |
324 | # makeidx
325 | *.idx
326 | *.ilg
327 | *.ind
328 | *.ist
329 |
330 | # minitoc
331 | *.maf
332 | *.mlf
333 | *.mlt
334 | *.mtc[0-9]*
335 | *.slf[0-9]*
336 | *.slt[0-9]*
337 | *.stc[0-9]*
338 |
339 | # minted
340 | _minted*
341 | *.pyg
342 |
343 | # morewrites
344 | *.mw
345 |
346 | # nomencl
347 | *.nlo
348 |
349 | # pax
350 | *.pax
351 |
352 | # pdfpcnotes
353 | *.pdfpc
354 |
355 | # sagetex
356 | *.sagetex.sage
357 | *.sagetex.py
358 | *.sagetex.scmd
359 |
360 | # scrwfile
361 | *.wrt
362 |
363 | # sympy
364 | *.sout
365 | *.sympy
366 | sympy-plots-for-*.tex/
367 |
368 | # pdfcomment
369 | *.upa
370 | *.upb
371 |
372 | # pythontex
373 | *.pytxcode
374 | pythontex-files-*/
375 |
376 | # thmtools
377 | *.loe
378 |
379 | # TikZ & PGF
380 | *.dpth
381 | *.md5
382 | *.auxlock
383 |
384 | # todonotes
385 | *.tdo
386 |
387 | # easy-todo
388 | *.lod
389 |
390 | # xindy
391 | *.xdy
392 |
393 | # xypic precompiled matrices
394 | *.xyc
395 |
396 | # endfloat
397 | *.ttt
398 | *.fff
399 |
400 | # Latexian
401 | TSWLatexianTemp*
402 |
403 | ## Editors:
404 | # WinEdt
405 | *.bak
406 | *.sav
407 |
408 | # Texpad
409 | .texpadtmp
410 |
411 | # Kile
412 | *.backup
413 |
414 | # KBibTeX
415 | *~[0-9]*
416 |
417 | # auto folder when using emacs and auctex
418 | auto/*
419 |
420 | # auto folder when using emacs and auctex
421 | auto
422 |
423 | # expex forward references with \gathertags
424 | *-tags.tex
425 |
426 | # os x stuff
427 | .DS_Store
428 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Blei Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | This repository contains software and data for "Using Text Embeddings for Causal Inference" ([arxiv.org/abs/1905.12741](https://arxiv.org/abs/1905.12741)).
4 | The paper describes a method for causal inference with text documents. For example, does adding a
5 | theorem to a paper affect its chance of acceptance? The method adapts deep language models to address the causal problem.
6 |
7 | This software builds on:
8 | 1. BERT: [github.com/google-research/bert](https://github.com/google-research/bert), and
9 | 2. PeerRead: [github.com/allenai/PeerRead](https://github.com/allenai/PeerRead)
10 |
11 | We include pre-processed PeerRead arxiv data for convenience.
12 |
13 | There is also a [reference implementation in PyTorch](https://github.com/rpryzant/causal-bert-pytorch).
14 |
15 | # TensorFlow 2
16 | For new projects, we recommend building on the [reference TensorFlow 2 implementation](https://github.com/vveitch/causal-text-embeddings-tf2).
17 |
18 | # Requirements and setup
19 |
20 | 1. You'll need to download a pre-trained BERT model (following the BERT GitHub link above). We use `uncased_L-12_H-768_A-12`.
21 | 2. Install TensorFlow 1.12. A setup sketch follows below.
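For reference, here is a minimal setup sketch. The download URL is Google's standard release of `uncased_L-12_H-768_A-12`; the target directory is an assumption, so put the model wherever `BERT_BASE_DIR` will point on your machine.

```bash
# Setup sketch (paths are illustrative; adjust to your machine).
wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
mkdir -p bert/pre-trained
unzip uncased_L-12_H-768_A-12.zip -d bert/pre-trained/
pip install tensorflow-gpu==1.12.0   # or tensorflow==1.12.0 for CPU-only
```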
22 |
23 | # Data
24 |
25 | 1. We include a pre-processed copy of PeerRead data for convenience.
26 | This data is a collection of arXiv papers submitted to computer science conferences, the accept/reject decisions for these papers,
27 | and their abstracts.
28 | The raw PeerRead data contains significantly more information.
29 | You can get the raw data by following instructions at [github.com/allenai/PeerRead](https://github.com/allenai/PeerRead).
30 | Running the included pre-processing scripts in the PeerRead folder will recreate the included tfrecord file (see the sketch after this list).
31 |
32 | 2. The reddit data can be downloaded at [archive.org/details/reddit_posts_2018](https://archive.org/details/reddit_posts_2018).
33 | This data includes all top-level reddit comments where the gender of the poster was annotated in some fashion.
34 | Each post has meta information (score, date, username, etc.) and includes the text for the first reply.
35 | The processed data used in the paper can be recreated by running the pre-processing scripts in the `reddit` folder.
36 |
37 | You can also re-collect the data from Google BigQuery.
38 | The SQL command to do this is in `reddit/data_cleaning/BigQuery_get_data`.
39 | Modifying this script will allow you to change collection parameters (e.g., the year, or which responses are included).
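For the PeerRead data (item 1 above), here is a sketch of re-running the cleaning step. The flags are the ones defined in `PeerRead/data_cleaning/clean_PeerRead.py`; the vocab path is a placeholder, and the raw PeerRead data must already sit under `--datasets-dir`.

```bash
# Sketch: rebuild the per-venue .tf_record files under dat/PeerRead/proc/ (run from src/).
cd src
python -m PeerRead.data_cleaning.clean_PeerRead \
    --datasets-dir=../dat/PeerRead \
    --vocab-file=/path/to/uncased_L-12_H-768_A-12/vocab.txt
```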
40 |
41 |
42 | # Reproducing the PeerRead experiments
43 |
44 | The default settings for the code match the settings used in the paper.
45 | These match the default settings used by BERT, except
46 | 1. we reduce batch size to allow training on a Titan X, and
47 | 2. we adjust the learning rate to account for this.
48 |
49 | You'll run the code from `src` as
50 | `./PeerRead/submit_scripts/run_model.sh`.
51 | Before doing this, you'll need to edit `run_model.sh` to change
52 | `BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12`
53 | to
54 | `BERT_BASE_DIR=[path to BERT_pre-trained]/uncased_L-12_H-768_A-12`.
55 |
56 | The flag
57 | `--treatment=theorem_referenced`
58 | controls the experiment.
59 | The flag
60 | `--simulated=real`
61 | controls whether to use the real effect or one of the semi-synthetic modes.
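Putting this together, a run might look like the sketch below. The `sed` edit is just one way to set `BERT_BASE_DIR` (editing the script by hand works equally well), and the treatment/simulation flags are assumed to be set inside `run_model.sh`.

```bash
# Sketch: point the submit script at your BERT download, then run from src/.
cd src
sed -i 's|^BERT_BASE_DIR=.*|BERT_BASE_DIR=/path/to/uncased_L-12_H-768_A-12|' \
    PeerRead/submit_scripts/run_model.sh
./PeerRead/submit_scripts/run_model.sh
```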
62 |
63 | The effect estimates can be reproduced by running `python -m result_processing.compute_ate`.
64 | This takes in the predictions of the BERT model (in TSV format) and passes them into downstream estimators
65 | of the causal effect.
66 |
67 | To reproduce the baselines, you'll need to produce a TSV for each simulated dataset you want to test on. To do this, you can run `python -m PeerRead.dataset.array_from_dataset` from `src`; see the sketch below. The flag `--beta1=1.0` controls the strength of the confounding. (The other flags control simulation parameters not used in the paper.)
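For example, a sketch of the downstream estimation steps. The module paths and flags are the ones named above; where the TSVs and predictions land depends on your run configuration.

```bash
# Sketch: build simulated TSVs for the baselines, then compute effect estimates (run from src/).
cd src
python -m PeerRead.dataset.array_from_dataset --beta1=1.0   # simulated outcomes with confounding strength 1.0
python -m result_processing.compute_ate                     # feeds BERT predictions into downstream estimators
```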
68 |
69 | # Misc.
70 |
71 | The experiments in the paper use a version of BERT that was further pre-trained on the PeerRead corpus
72 | using an unsupervised objective.
73 | This can be replicated with `./PeerRead/submit_scripts/run_unsupervised.sh`.
74 | This takes about 24 hours on a single Titan Xp.
75 | To use a pre-trained BERT, uncomment the `INIT_DIR` options in `run_model.sh`.
76 |
77 | # Reproducing the Reddit experiment
78 |
79 | 1. First, get the data following the instructions above and save it as `dat/reddit/2018.json`.
80 | 2. Run data pre-processing with `python -m reddit.data_cleaning.process_reddit`.
81 | 3. Once the data is processed, the instructions for running the experiments are essentially the same as for PeerRead; a combined sketch follows below.
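A combined sketch of these steps (the data path is the one given above; how you fetch `2018.json` from archive.org is up to you):

```bash
# Sketch: Reddit pipeline.
mkdir -p dat/reddit
cp /path/to/downloaded/2018.json dat/reddit/2018.json   # assumption: data fetched manually
cd src
python -m reddit.data_cleaning.process_reddit            # pre-process the raw posts
./reddit/submit_scripts/run_model.sh                     # then run the model, as for PeerRead
```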
82 |
83 | # Maintainers
84 | [Dhanya Sridhar](https://github.com/dsridhar91) and [Victor Veitch](https://github.com/vveitch)
85 |
86 |
--------------------------------------------------------------------------------
/dat/PeerRead/proc/arxiv-all.tf_record:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/dat/PeerRead/proc/arxiv-all.tf_record
--------------------------------------------------------------------------------
/dat/reddit/README.md:
--------------------------------------------------------------------------------
1 | This folder is the expected location for the reddit data, "2018.json".
2 |
3 | Follow instructions in the top-level README to get this data and save it here.
4 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | logdir/**
2 | **/tmp/**
3 | output/**
4 | dat/**
5 | dat/gender-text-corpus
6 | .DS_Store
7 | **/.DS_Store
8 | **/*.pyc
9 | **/*.pyo
10 | *checkpoint*
11 | *aux
12 | *log
13 | *.out
14 | *.synct*
15 | *__pycache__*
16 |
17 | #################################
18 | # Victor's standard gitignore
19 | # mostly python and tex
20 | #################################
21 |
22 | # Byte-compiled / optimized / DLL files
23 | __pycache__/
24 | *.py[cod]
25 | *$py.class
26 |
27 | # C extensions
28 | *.so
29 |
30 | # Distribution / packaging
31 | .Python
32 | build/
33 | develop-eggs/
34 | dist/
35 | downloads/
36 | eggs/
37 | .eggs/
38 | lib/
39 | lib64/
40 | parts/
41 | sdist/
42 | var/
43 | wheels/
44 | *.egg-info/
45 | .installed.cfg
46 | *.egg
47 | MANIFEST
48 |
49 | # PyInstaller
50 | # Usually these files are written by a python script from a template
51 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
52 | *.manifest
53 | *.spec
54 |
55 | # Installer logs
56 | pip-log.txt
57 | pip-delete-this-directory.txt
58 |
59 | # Unit test / coverage reports
60 | htmlcov/
61 | .tox/
62 | .coverage
63 | .coverage.*
64 | .cache
65 | nosetests.xml
66 | coverage.xml
67 | *.cover
68 | .hypothesis/
69 | .pytest_cache/
70 |
71 | # Translations
72 | *.mo
73 | *.pot
74 |
75 | # Django stuff:
76 | *.log
77 | local_settings.py
78 | db.sqlite3
79 |
80 | # Flask stuff:
81 | instance/
82 | .webassets-cache
83 |
84 | # Scrapy stuff:
85 | .scrapy
86 |
87 | # Sphinx documentation
88 | docs/_build/
89 |
90 | # PyBuilder
91 | target/
92 |
93 | # Jupyter Notebook
94 | .ipynb_checkpoints
95 |
96 | # pyenv
97 | .python-version
98 |
99 | # celery beat schedule file
100 | celerybeat-schedule
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 |
127 | # JetBrains (PyCharm) stuff
128 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
129 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
130 |
131 | # User-specific stuff
132 | .idea/**/workspace.xml
133 | .idea/**/tasks.xml
134 | .idea/**/usage.statistics.xml
135 | .idea/**/dictionaries
136 | .idea/**/shelf
137 |
138 | # Generated files
139 | .idea/**/contentModel.xml
140 |
141 | # Sensitive or high-churn files
142 | .idea/**/dataSources/
143 | .idea/**/dataSources.ids
144 | .idea/**/dataSources.local.xml
145 | .idea/**/sqlDataSources.xml
146 | .idea/**/dynamic.xml
147 | .idea/**/uiDesigner.xml
148 | .idea/**/dbnavigator.xml
149 |
150 | # Gradle
151 | .idea/**/gradle.xml
152 | .idea/**/libraries
153 |
154 | # Gradle and Maven with auto-import
155 | # When using Gradle or Maven with auto-import, you should exclude module files,
156 | # since they will be recreated, and may cause churn. Uncomment if using
157 | # auto-import.
158 | # .idea/modules.xml
159 | # .idea/*.iml
160 | # .idea/modules
161 |
162 | # CMake
163 | cmake-build-*/
164 |
165 | # Mongo Explorer plugin
166 | .idea/**/mongoSettings.xml
167 |
168 | # File-based project format
169 | *.iws
170 |
171 | # IntelliJ
172 | out/
173 |
174 | # mpeltonen/sbt-idea plugin
175 | .idea_modules/
176 |
177 | # JIRA plugin
178 | atlassian-ide-plugin.xml
179 |
180 | # Cursive Clojure plugin
181 | .idea/replstate.xml
182 |
183 | # Crashlytics plugin (for Android Studio and IntelliJ)
184 | com_crashlytics_export_strings.xml
185 | crashlytics.properties
186 | crashlytics-build.properties
187 | fabric.properties
188 |
189 | # Editor-based Rest Client
190 | .idea/httpRequests
191 |
192 | # Android studio 3.1+ serialized cache file
193 | .idea/caches/build_file_checksums.ser
194 |
195 | # text
196 | *.pdf
197 |
198 | # linux backup files
199 | *~
200 | *#
201 |
202 | ## Core latex/pdflatex auxiliary files:
203 | *.aux
204 | *.lof
205 | *.log
206 | *.lot
207 | *.fls
208 | *.out
209 | *.toc
210 | *.fmt
211 | *.fot
212 | *.cb
213 | *.cb2
214 |
215 | ## Intermediate documents:
216 | *.dvi
217 | *-converted-to.*
218 | # these rules might exclude image files for figures etc.
219 | # *.ps
220 | # *.eps
221 | # *.pdf
222 |
223 | ## Generated if empty string is given at "Please type another file name for output:"
224 | .pdf
225 |
226 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
227 | *.bbl
228 | *.bcf
229 | *.blg
230 | *-blx.aux
231 | *-blx.bib
232 | *.run.xml
233 |
234 | ## Build tool auxiliary files:
235 | *.fdb_latexmk
236 | *.synctex
237 | *.synctex(busy)
238 | *.synctex.gz
239 | *.synctex.gz(busy)
240 | *.pdfsync
241 |
242 | ## Auxiliary and intermediate files from other packages:
243 | # algorithms
244 | *.alg
245 | *.loa
246 |
247 | # achemso
248 | acs-*.bib
249 |
250 | # amsthm
251 | *.thm
252 |
253 | # beamer
254 | *.nav
255 | *.pre
256 | *.snm
257 | *.vrb
258 |
259 | # changes
260 | *.soc
261 |
262 | # cprotect
263 | *.cpt
264 |
265 | # elsarticle (documentclass of Elsevier journals)
266 | *.spl
267 |
268 | # endnotes
269 | *.ent
270 |
271 | # fixme
272 | *.lox
273 |
274 | # feynmf/feynmp
275 | *.mf
276 | *.mp
277 | *.t[1-9]
278 | *.t[1-9][0-9]
279 | *.tfm
280 |
281 | #(r)(e)ledmac/(r)(e)ledpar
282 | *.end
283 | *.?end
284 | *.[1-9]
285 | *.[1-9][0-9]
286 | *.[1-9][0-9][0-9]
287 | *.[1-9]R
288 | *.[1-9][0-9]R
289 | *.[1-9][0-9][0-9]R
290 | *.eledsec[1-9]
291 | *.eledsec[1-9]R
292 | *.eledsec[1-9][0-9]
293 | *.eledsec[1-9][0-9]R
294 | *.eledsec[1-9][0-9][0-9]
295 | *.eledsec[1-9][0-9][0-9]R
296 |
297 | # glossaries
298 | *.acn
299 | *.acr
300 | *.glg
301 | *.glo
302 | *.gls
303 | *.glsdefs
304 |
305 | # gnuplottex
306 | *-gnuplottex-*
307 |
308 | # gregoriotex
309 | *.gaux
310 | *.gtex
311 |
312 | # hyperref
313 | *.brf
314 |
315 | # knitr
316 | *-concordance.tex
317 | # TODO Comment the next line if you want to keep your tikz graphics files
318 | *.tikz
319 | *-tikzDictionary
320 |
321 | # listings
322 | *.lol
323 |
324 | # makeidx
325 | *.idx
326 | *.ilg
327 | *.ind
328 | *.ist
329 |
330 | # minitoc
331 | *.maf
332 | *.mlf
333 | *.mlt
334 | *.mtc[0-9]*
335 | *.slf[0-9]*
336 | *.slt[0-9]*
337 | *.stc[0-9]*
338 |
339 | # minted
340 | _minted*
341 | *.pyg
342 |
343 | # morewrites
344 | *.mw
345 |
346 | # nomencl
347 | *.nlo
348 |
349 | # pax
350 | *.pax
351 |
352 | # pdfpcnotes
353 | *.pdfpc
354 |
355 | # sagetex
356 | *.sagetex.sage
357 | *.sagetex.py
358 | *.sagetex.scmd
359 |
360 | # scrwfile
361 | *.wrt
362 |
363 | # sympy
364 | *.sout
365 | *.sympy
366 | sympy-plots-for-*.tex/
367 |
368 | # pdfcomment
369 | *.upa
370 | *.upb
371 |
372 | # pythontex
373 | *.pytxcode
374 | pythontex-files-*/
375 |
376 | # thmtools
377 | *.loe
378 |
379 | # TikZ & PGF
380 | *.dpth
381 | *.md5
382 | *.auxlock
383 |
384 | # todonotes
385 | *.tdo
386 |
387 | # easy-todo
388 | *.lod
389 |
390 | # xindy
391 | *.xdy
392 |
393 | # xypic precompiled matrices
394 | *.xyc
395 |
396 | # endfloat
397 | *.ttt
398 | *.fff
399 |
400 | # Latexian
401 | TSWLatexianTemp*
402 |
403 | ## Editors:
404 | # WinEdt
405 | *.bak
406 | *.sav
407 |
408 | # Texpad
409 | .texpadtmp
410 |
411 | # Kile
412 | *.backup
413 |
414 | # KBibTeX
415 | *~[0-9]*
416 |
417 | # auto folder when using emacs and auctex
418 | auto/*
419 |
420 | # auto folder when using emacs and auctex
421 | auto
422 |
423 | # expex forward references with \gathertags
424 | *-tags.tex
425 |
426 | # os x stuff
427 | .DS_Store
428 |
--------------------------------------------------------------------------------
/src/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/src/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/src/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/src/.idea/src.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/src/PeerRead/ScienceParse/Paper.py:
--------------------------------------------------------------------------------
1 | import re,io,json,sys
2 | from .Review import Review
3 |
4 | class Paper:
5 | """A paper class, which contains relevant fields and a list of reviews"""
6 | def __init__(self, TITLE, ABSTRACT, ID, REVIEWS, AUTHORS=None, CONFERENCE=None, ACCEPTED=None, SCORE=None,
7 | PUBLICATION_TYPE=None, SCIENCEPARSE=None, KEYWORDS=None, AUTHOR_EMAILS=None, DATE_OF_SUBMISSION=None,
8 | SUBJECTS=None,COMMENTS=None,VERSION=None,HISTORIES=None):
9 | self.TITLE = TITLE
10 | self.ABSTRACT = re.sub("\\n", " ", ABSTRACT)
11 | self.ID = ID
12 | self.AUTHORS = AUTHORS
13 | self.REVIEWS = REVIEWS
14 | self.SCIENCEPARSE = SCIENCEPARSE
15 | self.CONFERENCE = CONFERENCE
16 | self.ACCEPTED = ACCEPTED
17 | self.SCORE = SCORE
18 | self.PUBLICATION_TYPE = PUBLICATION_TYPE
19 | self.KEYWORDS = KEYWORDS
20 | self.AUTHOR_EMAILS = AUTHOR_EMAILS
21 | self.DATE_OF_SUBMISSION = DATE_OF_SUBMISSION
22 |
23 | # additional properties for arxiv papers
24 | self.SUBJECTS = SUBJECTS
25 | self.COMMENTS = COMMENTS
26 | self.VERSION = VERSION
27 | self.HISTORIES = HISTORIES #[(version,date,link,comments),...]
28 |
29 | @staticmethod
30 | def from_softconf_dump(json_file, conference=None):
31 | with io.open(json_file, "r", encoding="utf8") as ifh:
32 | json_str = ifh.read()
33 |
34 | # print (json_str)
35 | json_data = json.loads(json_str)["submissions"]
36 |
37 | papers = []
38 | for i in range(len(json_data)):
39 | reviews = []
40 | for k in range(len(json_data[i]["reviews"])):
41 | # print(json_data[i]["reviews"][k])
42 | review_data = []
43 |
44 | review = Review.from_json_object(json_data[i]["reviews"][k], k==i==0)
45 | #review = None
46 |
47 | reviews.append(review)
48 |
49 | authors = json_data[i]["authors"] if "authors" in json_data[i] else None
50 | score = json_data[i]["score"] if "score" in json_data[i] else None
51 | accepted = json_data[i]["accepted"] if "accepted" in json_data[i] else None
52 | publication_type = json_data[i]["publication_type"] if "publication_type" in json_data[i] else None
53 | keywords = json_data[i]["KEYWORDS"] if "KEYWORDS" in json_data[i] else None
54 | author_emails = json_data[i]["AUTHOR_EMAILS"] if "AUTHOR_EMAILS" in json_data[i] else None
55 | date_of_submission = json_data[i]["DATE_OF_SUBMISSION"] if "DATE_OF_SUBMISSION" in json_data[i] else None
56 |
57 | paper = Paper(json_data[i]["title"], json_data[i]["abstract"], json_data[i]["id"], reviews, authors, \
58 | conference, accepted, score, publication_type, None, keywords, author_emails, \
59 | date_of_submission)
60 |
61 | papers.append(paper)
62 | # break
63 |
64 | return papers
65 |
66 | @staticmethod
67 | def from_json(json_filename, from_annotated = False):
68 | paper = Paper('', '', None, [])
69 |
70 | datas = []
71 | with io.open(json_filename, mode='rt', encoding='utf8') as json_file:
72 | for line in json_file:
73 | try:
74 | data = json.loads(line.strip())
75 | datas.append(data)
76 | except Exception as e:
77 | print(line)
78 | continue
79 | if len(datas)==0: return None
80 | data = datas[-1]
81 |
82 | # Read required fields.
83 | assert 'title' in data
84 | assert 'abstract' in data
85 | paper.TITLE = data['title']
86 | paper.ABSTRACT = data['abstract']
87 |
88 | if 'id' in data:
89 | if data['id'] == "":
90 | paper.ID = json_filename.split("/")[-1].split(".")[0]
91 | else:
92 | paper.ID = data['id']
93 | else:
94 | paper.ID = json_filename.split("/")[-1].split(".")[0]
95 |
96 | # Read optional fields.
97 | paper.AUTHORS = data['authors'] if 'authors' in data else None
98 | paper.CONFERENCE = data['conference'] if 'conference' in data else None
99 | paper.ACCEPTED = data['accepted'] if 'accepted' in data else None
100 | paper.SCORE = data['score'] if 'score' in data else None
101 | paper.PUBLICATION_TYPE = data['publication_type'] if 'publication_type' in data else None
102 | paper.SCIENCEPARSE = data['scienceparse'] if 'scienceparse' in data else None
103 | paper.KEYWORDS = data['keywords'] if 'keywords' in data else None
104 | paper.AUTHOR_EMAILS = data['author_emails'] if 'author_emails' in data else None
105 |
106 | paper.DATE_OF_SUBMISSION = data['DATE_OF_SUBMISSION'] if 'DATE_OF_SUBMISSION' in data else None
107 |
108 | paper.SUBJECTS = data['SUBJECTS'] if 'SUBJECTS' in data else None
109 | paper.COMMENTS = data['COMMENTS'] if 'COMMENTS' in data else None
110 | paper.VERSION = data['VERSION'] if 'VERSION' in data else None
111 | paper.HISTORIES = data['histories'] if 'histories' in data else None
112 |
113 | # Read reviews (mandatory).
114 | assert 'reviews' in data
115 | for review_data in data['reviews']:
116 | review = Review.from_json_object(review_data)
117 | paper.REVIEWS.append(review)
118 | return paper
119 |
120 |
121 |
122 | def to_json_object(self):
123 | data = dict()
124 |
125 | data["title"] = self.get_title()
126 | data["abstract"] = self.get_abstract()
127 | data["id"] = self.get_id()
128 |
129 | if self.AUTHORS is not None:
130 | data["authors"] = self.get_authors()
131 |
132 | if self.CONFERENCE is not None:
133 | data["conference"] = self.get_conference()
134 |
135 | if self.ACCEPTED is not None:
136 | data["accepted"] = self.get_accepted()
137 |
138 | if self.SCORE is not None:
139 | data["SCORE"] = self.get_score()
140 |
141 | if self.PUBLICATION_TYPE is not None:
142 | data["publication_type"] = self.get_publication_type()
143 |
144 | if self.SCIENCEPARSE is not None:
145 | data["SCIENCEPARSE"] = self.get_scienceparse()
146 |
147 | if self.AUTHOR_EMAILS is not None:
148 | data["AUTHOR_EMAILS"] = self.get_author_emails()
149 |
150 | if self.KEYWORDS is not None:
151 | data["KEYWORDS"] = self.get_keywords()
152 |
153 | if self.DATE_OF_SUBMISSION is not None:
154 | data["DATE_OF_SUBMISSION"] = self.get_date_of_submission()
155 |
156 | data["reviews"] = []
157 |
158 | for r in self.get_reviews():
159 | data["reviews"].append(r.to_json_object())
160 |
161 | # added for arxiv papers
162 |
163 | if self.SUBJECTS is not None:
164 | data["SUBJECTS"] = self.get_subjects()
165 |
166 | if self.COMMENTS is not None:
167 | data["COMMENTS"] = self.get_comments()
168 |
169 | if self.VERSION is not None:
170 | data["VERSION"] = self.get_version()
171 |
172 | data["histories"] = []
173 | if self.HISTORIES is not None:
174 | for h in self.get_histories():
175 | if h is not None:
176 | v,d,l,p = h
177 | data["histories"].append((v,d,l, p if p else None))
178 |
179 | return data
180 |
181 | def to_json(self, json_file, mode='a'):
182 |
183 | data = self.to_json_object()
184 |
185 | with open(json_file, mode) as ofh:
186 | json.dump(data, ofh)
187 | ofh.write("\n")
188 |
189 |
190 | def get_subjects(self):
191 | return self.SUBJECTS
192 | def get_comments(self):
193 | return self.COMMENTS
194 | def get_version(self):
195 | return self.VERSION
196 | def get_histories(self):
197 | return self.HISTORIES
198 |
199 |
200 | def get_title(self):
201 | return self.TITLE
202 |
203 | def get_abstract(self):
204 | return self.ABSTRACT
205 |
206 | def abstract_contains_a_term(self, term):
207 | return (term in self.ABSTRACT)
208 |
209 | def get_id(self):
210 | return self.ID
211 |
212 | def get_authors(self):
213 | return self.AUTHORS
214 |
215 | def get_reviews(self):
216 | return self.REVIEWS
217 |
218 | def get_scienceparse(self):
219 | return self.SCIENCEPARSE
220 |
221 | def get_title_len(self):
222 | return len(self.TITLE)
223 |
224 | def get_abstract_len(self):
225 | return len(self.ABSTRACT)
226 |
227 | def get_conference(self):
228 | return self.CONFERENCE
229 |
230 | def get_score(self):
231 | return self.SCORE
232 |
233 | def get_accepted(self):
234 | return self.ACCEPTED
235 |
236 | def get_publication_type(self):
237 | return self.PUBLICATION_TYPE
238 |
239 | def get_author_emails(self):
240 | return self.AUTHOR_EMAILS
241 |
242 | def get_keywords(self):
243 | return self.KEYWORDS
244 |
245 | def get_date_of_submission(self):
246 | return self.DATE_OF_SUBMISSION
247 |
248 | def main(args):
249 | papers = Paper.from_softconf_dump('../../data/conll16/reviews.json')
250 | for paper in papers:
251 | paper.to_json('../../data/conll16_new/{}.json'.format(paper.ID))
252 |
253 | if __name__ == "__main__":
254 | sys.exit(main(sys.argv))
255 |
--------------------------------------------------------------------------------
/src/PeerRead/ScienceParse/README.md:
--------------------------------------------------------------------------------
1 | Code from ScienceParse (via PeerRead)
2 |
3 | TODO: determine license and add it
--------------------------------------------------------------------------------
/src/PeerRead/ScienceParse/Review.py:
--------------------------------------------------------------------------------
1 |
2 | class Review:
3 |
4 | """A review class, contains all bunch of relevant fields"""
5 | def __init__(self, RECOMMENDATION, COMMENTS, REPLICABILITY=None, PRESENTATION_FORMAT=None, \
6 | CLARITY=None, MEANINGFUL_COMPARISON=None, SUBSTANCE=None, REVIEWER_CONFIDENCE=None, \
7 | SOUNDNESS_CORRECTNESS=None, APPROPRIATENESS=None, IMPACT=None, ORIGINALITY=None, OTHER_KEYS=None, \
8 | IS_META_REVIEW=False, TITLE=None, DATE=None, RECOMMENDATION_UNOFFICIAL=None, IS_ANNOTATED=False):
9 | self.RECOMMENDATION = RECOMMENDATION
10 | self.RECOMMENDATION_UNOFFICIAL = RECOMMENDATION_UNOFFICIAL #None # only for aspect prediction
11 | self.IS_ANNOTATED = IS_ANNOTATED
12 |
13 | self.COMMENTS = COMMENTS
14 | self.REPLICABILITY = REPLICABILITY
15 | self.PRESENTATION_FORMAT = PRESENTATION_FORMAT
16 | self.CLARITY = CLARITY
17 | self.MEANINGFUL_COMPARISON = MEANINGFUL_COMPARISON
18 | self.SUBSTANCE = SUBSTANCE
19 | self.REVIEWER_CONFIDENCE = REVIEWER_CONFIDENCE
20 | self.SOUNDNESS_CORRECTNESS = SOUNDNESS_CORRECTNESS
21 | self.APPROPRIATENESS = APPROPRIATENESS
22 | self.IMPACT = IMPACT
23 | self.ORIGINALITY = ORIGINALITY
24 | self.OTHER_KEYS = OTHER_KEYS
25 | self.IS_META_REVIEW = IS_META_REVIEW
26 | self.TITLE = TITLE
27 | self.DATE = DATE
28 |
29 | @staticmethod
30 | def get_json_string(json_object, string, missing_fields):
31 | if string in json_object:
32 | return json_object[string]
33 | elif missing_fields is not None:
34 | missing_fields.append(string)
35 |
36 | return None
37 |
38 | @staticmethod
39 | def from_json_object(json_object, print_missing_fields=False):
40 | assert "comments" in json_object
41 | comments = json_object["comments"]
42 |
43 | missing_fields = None
44 |
45 | if print_missing_fields:
46 | missing_fields = []
47 |
48 | recommendation = Review.get_json_string(json_object, "RECOMMENDATION", missing_fields)
49 |
50 |
51 | recommendation_unofficial = Review.get_json_string(json_object, "RECOMMENDATION_UNOFFICIAL", missing_fields)
52 |
53 | is_annotated = Review.get_json_string(json_object, "IS_ANNOTATED", missing_fields)
54 |
55 | replicability = Review.get_json_string(json_object, "REPLICABILITY", missing_fields)
56 | clarity = Review.get_json_string(json_object, "CLARITY", missing_fields)
57 | substance = Review.get_json_string(json_object, "SUBSTANCE", missing_fields)
58 | appropriateness = Review.get_json_string(json_object, "APPROPRIATENESS", missing_fields)
59 | originality = Review.get_json_string(json_object, "ORIGINALITY", missing_fields)
60 | presentation_format = Review.get_json_string(json_object, "PRESENTATION_FORMAT", missing_fields)
61 | meaningful_comparison = Review.get_json_string(json_object, "MEANINGFUL_COMPARISON", missing_fields)
62 | reviewer_confidence = Review.get_json_string(json_object, "REVIEWER_CONFIDENCE", missing_fields)
63 | soundness_correctness = Review.get_json_string(json_object, "SOUNDNESS_CORRECTNESS", missing_fields)
64 | impact = Review.get_json_string(json_object, "IMPACT", missing_fields)
65 | is_meta_review = Review.get_json_string(json_object, "IS_META_REVIEW", missing_fields)
66 | date = Review.get_json_string(json_object, "DATE", missing_fields)
67 | title = Review.get_json_string(json_object, "TITLE", missing_fields)
68 | other_keys = Review.get_json_string(json_object, "OTHER_KEYS", missing_fields)
69 |
70 | if print_missing_fields and len(missing_fields):
71 | print("The following fields are missing in json input file:",missing_fields)
72 | return Review(recommendation, comments, replicability, presentation_format, clarity, meaningful_comparison, \
73 | substance, reviewer_confidence, soundness_correctness, appropriateness, impact, originality, \
74 | other_keys, is_meta_review, title, date, recommendation_unofficial, is_annotated )
75 |
76 | def to_json_object(self):
77 | data = dict()
78 |
79 | data["comments"] = self.get_comments().decode('cp1252', errors='ignore').encode('utf-8')
80 |
81 | if self.RECOMMENDATION is not None:
82 | data["RECOMMENDATION"] = self.get_recommendation()
83 |
84 | if self.RECOMMENDATION_UNOFFICIAL is not None:
85 | data["RECOMMENDATION_UNOFFICIAL"] = self.get_recommendation_unofficial()
86 | if self.IS_ANNOTATED is not None:
87 | data["IS_ANNOTATED"] = self.get_is_annotated()
88 |
89 |
90 | if self.REPLICABILITY is not None:
91 | data["REPLICABILITY"] = self.get_replicability()
92 | if self.PRESENTATION_FORMAT is not None:
93 | data["PRESENTATION_FORMAT"] = self.get_presentation_format()
94 | if self.CLARITY is not None:
95 | data["CLARITY"] = self.get_clarity()
96 | if self.MEANINGFUL_COMPARISON is not None:
97 | data["MEANINGFUL_COMPARISON"] = self.get_meaningful_comparison()
98 | if self.SUBSTANCE is not None:
99 | data["SUBSTANCE"] = self.get_substance()
100 | if self.REVIEWER_CONFIDENCE is not None:
101 | data["REVIEWER_CONFIDENCE"] = self.get_reviewer_confidence()
102 | if self.SOUNDNESS_CORRECTNESS is not None:
103 | data["SOUNDNESS_CORRECTNESS"] = self.get_soundness_correctness()
104 | if self.APPROPRIATENESS is not None:
105 | data["APPROPRIATENESS"] = self.get_appropriateness()
106 | if self.IMPACT is not None:
107 | data["IMPACT"] = self.get_impact()
108 | if self.ORIGINALITY is not None:
109 | data["ORIGINALITY"] = self.get_originality()
110 | if self.OTHER_KEYS is not None:
111 | data["OTHER_KEYS"] = self.get_other_keys()
112 | if self.IS_META_REVIEW is not None:
113 | data["IS_META_REVIEW"] = self.is_meta_review()
114 | if self.TITLE is not None:
115 | data["TITLE"] = self.get_title()
116 | if self.DATE is not None:
117 | data["DATE"] = self.get_date()
118 |
119 |
120 | return data
121 |
122 | def get_recommendation(self):
123 | return self.RECOMMENDATION
124 |
125 | def get_recommendation_unofficial(self):
126 | return self.RECOMMENDATION_UNOFFICIAL
127 |
128 | def get_is_annotated(self):
129 | return self.IS_ANNOTATED
130 |
131 | def get_comments(self):
132 | return self.COMMENTS
133 |
134 | def get_replicability(self):
135 | return self.REPLICABILITY
136 |
137 | def get_presentation_format(self):
138 | return self.PRESENTATION_FORMAT
139 |
140 | def get_clarity(self):
141 | return self.CLARITY
142 |
143 | def get_meaningful_comparison(self):
144 | return self.MEANINGFUL_COMPARISON
145 |
146 | def get_substance(self):
147 | return self.SUBSTANCE
148 |
149 | def get_reviewer_confidence(self):
150 | return self.REVIEWER_CONFIDENCE
151 |
152 | def get_soundness_correctness(self):
153 | return self.SOUNDNESS_CORRECTNESS
154 |
155 | def get_appropriateness(self):
156 | return self.APPROPRIATENESS
157 |
158 | def get_impact(self):
159 | return self.IMPACT
160 |
161 | def get_originality(self):
162 | return self.ORIGINALITY
163 |
164 | def get_other_keys(self):
165 | return self.OTHER_KEYS
166 |
167 | def is_meta_review(self):
168 | return self.IS_META_REVIEW
169 |
170 | def get_title(self):
171 | return self.TITLE
172 |
173 | def get_date(self):
174 | return self.DATE
175 |
--------------------------------------------------------------------------------
/src/PeerRead/ScienceParse/ScienceParse.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import re
3 |
4 | class ScienceParse:
5 | """
6 | A data structure for paper fields extracted by ScienceParse
7 | """
8 | def __init__(self, title, abstract, sections, reference_titles, reference_venues, reference_years, reference_mention_contexts,
9 | reference_num_mentions, authors=None, emails = None, other_keys=None):
10 | self.title = title
11 | self.abstract = abstract
12 | self.sections = sections
13 | self.reference_titles = reference_titles
14 | self.reference_venues = reference_venues
15 | self.reference_years = reference_years
16 | self.reference_mention_contexts = reference_mention_contexts
17 | self.reference_num_mentions = reference_num_mentions
18 | self.authors = authors
19 | self.emails = emails
20 |
21 | def get_sections_dict(self):
22 | return self.sections
23 |
24 | def get_reference_title_dict(self):
25 | return self.reference_titles
26 |
27 | def get_reference_venues_dict(self):
28 | return self.reference_venues
29 |
30 | def get_reference_years_dict(self):
31 | return self.reference_years
32 |
33 | def get_reference_mention_contexts_dict(self):
34 | return self.reference_mention_contexts
35 |
36 | def get_reference_num_mentions_dict(self):
37 | return self.reference_num_mentions
38 |
39 | def get_num_references(self):
40 | return len(self.get_reference_years_dict())
41 |
42 | def get_num_refmentions(self):
43 | num_refmentions = 0
44 | for refid in self.reference_num_mentions:
45 | num_refmentions = num_refmentions + self.reference_num_mentions[refid]
46 | return num_refmentions
47 |
48 | def get_most_recent_reference_year(self):
49 | most_recent = 0
50 | for refid in self.reference_years:
51 | if self.reference_years[refid] > most_recent:
52 | most_recent = self.reference_years[refid]
53 | return most_recent
54 |
55 | def get_avg_length_reference_mention_contexts(self):
56 | sum_length = 0.0
57 | for refid in self.reference_mention_contexts:
58 | sum_length = sum_length + len(self.reference_mention_contexts[refid])
59 | avg_length = 0
60 | if len(self.reference_mention_contexts) > 0:
61 | avg_length = sum_length / len(self.reference_mention_contexts)
62 | return avg_length
63 |
64 | def get_paper_content(self):
65 | content = self.title + " " + self.abstract + " " + self.get_author_names_string() + " " + \
66 | self.get_domains_from_emails()
67 | for sect_id in sorted(self.sections):
68 | # print("###",str(sect_id))
69 | content = content + " " + self.sections[sect_id]
70 | content = re.sub("\n([0-9]*\n)+", "\n", content)
71 | return content
72 |
73 | def get_tagged_paper_content(self):
74 | content = self.get_paper_content()
75 |
76 | nlp = spacy.load('en', parser=False)
77 |
78 | doc = nlp(content)
79 |
80 | return " ".join([x.text+"_"+x.tag_ for x in doc])
81 |
82 | def get_frequent_words_proportion(self, hfws, most_frequent_words, least_frequent_words):
83 | content = self.get_paper_content().split()
84 |
85 | n = 0
86 | t = 0
87 | # print(str(most_frequent_words).encode('utf8'))
88 | for w in content:
89 | if w not in hfws and w not in least_frequent_words:
90 | t += 1
91 | n += w in most_frequent_words
92 |
93 | # print (n,len(content),1.*n/t)
94 |
95 | return 1.*n/t
96 |
97 | # #papers referred from -5 years from year of submission
98 | def get_num_recent_references(self, submission_year):
99 | num_recent_references = 0
100 | for refid in self.reference_years:
101 | if (submission_year - self.reference_years[refid] < 5):
102 | num_recent_references = num_recent_references + 1
103 | return num_recent_references
104 |
105 | # word offset of figure 1
106 | def get_word_offset_of_first_fig_reference(self):
107 | content_words = self.get_paper_content().split(" ")
108 | indices = [i for i, x in enumerate(content_words) if x == "Figure"]
109 | return indices[0]
110 |
111 | # num references to #figures
112 | def get_num_ref_to_figures(self):
113 | content_words = self.get_paper_content().split(" ")
114 | figure_indices = [i for i, x in enumerate(content_words) if x == "Figure"]
115 | return len(figure_indices)
116 |
117 | # num references to #tables
118 | def get_num_ref_to_tables(self):
119 | content_words = self.get_paper_content().split(" ")
120 | table_indices = [i for i, x in enumerate(content_words) if x == "Table"]
121 | return len(table_indices)
122 |
123 | # # of references to Section
124 | def get_num_ref_to_sections(self):
125 | content_words = self.get_paper_content().split(" ")
126 | section_indices = [i for i, x in enumerate(content_words) if x == "Section"]
127 | return len(section_indices)
128 |
129 | # related work at front/back
130 | # #unique words
131 | def get_num_uniq_words(self):
132 | return len(set(self.get_paper_content().split(" ")))
133 |
134 | # num of sections
135 | def get_num_sections(self):
136 | return len(self.sections)
137 |
138 | # avg length of sentences
139 | def get_avg_sentence_length(self):
140 | sentences = self.get_paper_content().split(". ")
141 | sentence_lengths = [len(s.split(" ")) for s in sentences]
142 | return (1.0 * sum(sentence_lengths))/len(sentence_lengths)
143 |
144 | # whether paper has appendix
145 | def get_contains_appendix(self):
146 | content_words = self.get_paper_content().split(" ")
147 | figure_indices = [i for i, x in enumerate(content_words) if x == "Appendix"]
148 | return int(len(figure_indices) > 0)
149 |
150 | # publishing a dataset / code
151 | def get_contains_appendix(self):
152 | content_words = self.get_paper_content().split(" ")
153 | figure_indices = [i for i, x in enumerate(content_words) if x == "Appendix"]
154 | return int(len(figure_indices) > 0)
155 |
156 | # #authors
157 | def get_num_authors(self):
158 | if self.authors == None:
159 | return 0
160 | return len(self.authors)
161 |
162 | # get author names as a string
163 | def get_author_names_string(self):
164 | if self.authors == None:
165 | return ""
166 | return str.join(' ', self.authors)
167 |
168 | # get domains from emails
169 | def get_domains_from_emails(self):
170 | domains = []
171 | for email in self.emails:
172 | domains.append(email.split('@')[1].replace(".", "_"))
173 | return str.join(' ', domains)
174 |
175 | # num references to equations
176 | def get_num_ref_to_equations(self):
177 | content_words = self.get_paper_content().split(" ")
178 | equation_indices = [i for i, x in enumerate(content_words) if x == "Equation"]
179 | return len(equation_indices)
180 |
181 | # num references to theorems
182 | def get_num_ref_to_theorems(self):
183 | content_words = self.get_paper_content().split(" ")
184 | theorem_indices = [i for i, x in enumerate(content_words) if x == "Theorem"]
185 | return len(theorem_indices)
186 |
--------------------------------------------------------------------------------
/src/PeerRead/ScienceParse/ScienceParseReader.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import io
3 | import os
4 | import json
5 | from PeerRead.ScienceParse.ScienceParse import ScienceParse
6 |
7 | class ScienceParseReader:
8 | """
9 | This class reads the output of the ScienceParse library and stores it in the ScienceParse class
10 | """
11 |
12 | @staticmethod
13 | def read_science_parse(paperid, title, abstract, scienceparse_dir):
14 | scienceparse_file = io.open(os.path.join(scienceparse_dir, '{0}.pdf.json'.format(paperid)))
15 | # scienceparse_file = io.open('%s%s.pdf.json'%(scienceparse_dir,paperid), "r", encoding="utf8")
16 | scienceparse_str = scienceparse_file.read()
17 | scienceparse_data = json.loads(scienceparse_str)
18 |
19 | #read scienceparse
20 | scienceparse_map = {}
21 |
22 | sections = {}
23 | reference_years = {}
24 | reference_titles = {}
25 | reference_venues = {}
26 | reference_mention_contexts = {}
27 | reference_num_mentions = {}
28 |
29 | name = scienceparse_data["name"]
30 | metadata = scienceparse_data["metadata"]
31 |
32 | if metadata["sections"] is not None:
33 | for sectid in range(len(metadata["sections"])):
34 | heading = metadata["sections"][sectid]["heading"]
35 | text = metadata["sections"][sectid]["text"]
36 | sections[str(heading)] = text
37 |
38 | for refid in range(len(metadata["references"])):
39 | reference_titles[refid] = metadata["references"][refid]["title"]
40 | reference_years[refid] = metadata["references"][refid]["year"]
41 | reference_venues[refid] = metadata["references"][refid]["venue"]
42 |
43 | for menid in range(len(metadata["referenceMentions"])):
44 | refid = metadata["referenceMentions"][menid]["referenceID"]
45 | context = metadata["referenceMentions"][menid]["context"]
46 | oldContext = reference_mention_contexts.get(refid, "")
47 | reference_mention_contexts[refid] = oldContext + "\t" + context
48 | count = reference_num_mentions.get(refid, 0)
49 | reference_num_mentions[refid] = count + 1
50 |
51 | authors = metadata["authors"]
52 | emails = metadata["emails"]
53 | #print(authors)
54 | #print(emails)
55 |
56 | science_parse = ScienceParse(title, abstract, sections, reference_titles, reference_venues, reference_years, reference_mention_contexts, reference_num_mentions, authors, emails)
57 | return science_parse
58 |
--------------------------------------------------------------------------------
/src/PeerRead/ScienceParse/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/ScienceParse/__init__.py
--------------------------------------------------------------------------------
/src/PeerRead/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/__init__.py
--------------------------------------------------------------------------------
/src/PeerRead/data_cleaning/PeerRead_hand_features.py:
--------------------------------------------------------------------------------
1 | """
2 | create (hand-authored and lexical) features for baseline classifiers and save them under the dataset folder in each split
3 | """
4 |
5 | import sys, os, random, glob
6 |
7 | sys.path.insert(1, os.path.join(sys.path[0], '..'))
8 | from PeerRead.ScienceParse.Paper import Paper
9 | from PeerRead.ScienceParse.ScienceParseReader import ScienceParseReader
10 |
11 |
12 | def get_PeerRead_hand_features(paper):
13 | sp = paper.get_scienceparse()
14 |
15 | hand_features = {}
16 |
17 | hand_features["accepted"] = paper.get_accepted()
18 |
19 | hand_features["most_recent_reference_year"] = sp.get_most_recent_reference_year() - 2000
20 | hand_features["num_recent_references"] = sp.get_num_recent_references(2017)
21 | hand_features["num_references"] = sp.get_num_references()
22 | hand_features["num_refmentions"] = sp.get_num_refmentions()
23 | hand_features["avg_length_reference_mention_contexts"] = sp.get_avg_length_reference_mention_contexts()
24 |
25 | hand_features["num_ref_to_figures"] = sp.get_num_ref_to_figures()
26 | hand_features["num_ref_to_tables"] = sp.get_num_ref_to_tables()
27 | hand_features["num_ref_to_sections"] = sp.get_num_ref_to_sections()
28 |
29 | hand_features["num_uniq_words"] = sp.get_num_uniq_words()
30 | hand_features["num_sections"] = sp.get_num_sections()
31 | hand_features["avg_sentence_length"] = sp.get_avg_sentence_length()
32 |
33 | hand_features["contains_appendix"] = sp.get_contains_appendix()
34 |
35 | hand_features["title_length"] = paper.get_title_len()
36 | hand_features["num_authors"] = sp.get_num_authors()
37 | hand_features["num_ref_to_equations"] = sp.get_num_ref_to_equations()
38 | hand_features["num_ref_to_theorems"] = sp.get_num_ref_to_theorems()
39 |
40 | abstract = str.lower(paper.ABSTRACT)
41 | hand_features["abstract_contains_deep"] = "deep" in abstract
42 | hand_features["abstract_contains_neural"] = "neural" in abstract
43 | hand_features["abstract_contains_embedding"] = "embedding" in abstract
44 | hand_features["abstract_contains_outperform"] = "outperform" in abstract
45 | hand_features["abstract_contains_novel"] = "novel" in abstract
46 | hand_features["abstract_contains_state-of-the-art"] = \
47 | "state-of-the-art" in abstract or "state of the art" in abstract
48 |
49 | title = str.lower(paper.TITLE)
50 | hand_features["title_contains_deep"] = "deep" in title
51 | hand_features["title_contains_neural"] = "neural" in title
52 | hand_features["title_contains_embedding"] = "embed" in title
53 | hand_features["title_contains_gan"] = ("gan" in title) or ("adversarial net" in title)
54 |
55 | return hand_features
56 |
57 |
58 | def main(args):
59 |
60 | paper_json_dir = args[1] # train/reviews
61 | scienceparse_dir = args[2] # train/parsed_pdfs
62 |
63 |
64 | ################################
65 | # read reviews
66 | ################################
67 | print('Reading reviews from...', paper_json_dir)
68 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(paper_json_dir)))
69 | papers = []
70 | for paper_json_filename in paper_json_filenames:
71 | paper = Paper.from_json(paper_json_filename)
72 | paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(paper.ID, paper.TITLE, paper.ABSTRACT,
73 | scienceparse_dir)
74 | papers.append(paper)
75 | random.shuffle(papers)
76 | print('Total number of reviews', len(papers))
77 |
78 | id = 1
79 | for p in papers:
80 | rec = int(p.get_accepted() == True)
81 |
82 | handy = get_PeerRead_hand_features(p)
83 |
84 | id += 1
85 |
86 |
87 | if __name__ == "__main__":
88 | main(sys.argv)
89 |
--------------------------------------------------------------------------------
/src/PeerRead/data_cleaning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/data_cleaning/__init__.py
--------------------------------------------------------------------------------
/src/PeerRead/data_cleaning/clean_PeerRead.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import bert.tokenization as tokenization
5 | from PeerRead.data_cleaning.process_PeerRead_abstracts import clean_PeerRead_dataset
6 |
7 | dataset_names = ['acl_2017',
8 | 'arxiv.cs.ai_2007-2017',
9 | 'arxiv.cs.cl_2007-2017',
10 | 'arxiv.cs.lg_2007-2017',
11 | 'conll_2016',
12 | 'iclr_2017',
13 | 'nips_2013',
14 | 'nips_2014',
15 | 'nips_2015',
16 | 'nips_2016',
17 | 'nips_2017'
18 | ]
19 |
20 | dataset_paths = ['acl_2017',
21 | 'arxiv.cs.ai_2007-2017',
22 | 'arxiv.cs.cl_2007-2017',
23 | 'arxiv.cs.lg_2007-2017',
24 | 'conll_2016',
25 | 'iclr_2017',
26 | 'nips_2013-2017/2013',
27 | 'nips_2013-2017/2014',
28 | 'nips_2013-2017/2015',
29 | 'nips_2013-2017/2016',
30 | 'nips_2013-2017/2017'
31 | ]
32 |
33 | dataset_paths = dict(zip(dataset_names, dataset_paths))
34 |
35 | dataset_years = {'acl_2017': 2017,
36 | 'conll_2016': 2016,
37 | 'iclr_2017': 2017,
38 | 'arxiv.cs.ai_2007-2017': None,
39 | 'arxiv.cs.cl_2007-2017': None,
40 | 'arxiv.cs.lg_2007-2017': None,
41 | 'nips_2013': 2013,
42 | 'nips_2014': 2014,
43 | 'nips_2015': 2015,
44 | 'nips_2016': 2016,
45 | 'nips_2017': 2017}
46 |
47 | # dataset_venues = {k: v for v,k in enumerate(dataset_names)}
48 |
49 | dataset_venues = {'acl_2017': 0,
50 | 'conll_2016': 1,
51 | 'iclr_2017': 2,
52 | 'nips_2013': 3,
53 | 'nips_2014': 3,
54 | 'nips_2015': 3,
55 | 'nips_2016': 3,
56 | 'nips_2017': 3,
57 | 'arxiv.cs.ai_2007-2017': 4,
58 | 'arxiv.cs.cl_2007-2017': 5,
59 | 'arxiv.cs.lg_2007-2017': 6,
60 | }
61 |
62 |
63 | def main():
64 | parser = argparse.ArgumentParser()
65 | parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead')
66 | parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
67 | args = parser.parse_args()
68 |
69 | datasets_dir = args.datasets_dir
70 | tokenizer = tokenization.FullTokenizer(
71 | vocab_file=args.vocab_file, do_lower_case=True)
72 |
73 | def proc_dataset(dataset):
74 | all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all')
75 | review_json_dir = os.path.join(all_dir, 'reviews')
76 | parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs')
77 |
78 | venue = dataset_venues[dataset]
79 | year = dataset_years[dataset]
80 |
81 | out_dir = os.path.join(datasets_dir, 'proc')
82 | out_file = dataset + '.tf_record'
83 | max_abs_len = 250
84 |
85 | clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir, venue, year, out_dir, out_file, max_abs_len,
86 | tokenizer)
87 |
88 | # pool = mp.Pool(4)
89 | # pool.map(proc_dataset, dataset_names)
90 |
91 | for dataset in dataset_names:
92 | proc_dataset(dataset)
93 |
94 |
95 | if __name__ == "__main__":
96 | main()
97 |
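
A quick illustration of how the three mappings above resolve a single dataset before clean_PeerRead_dataset is called (values taken directly from the dicts):

    dataset = 'nips_2016'
    print(dataset_paths[dataset])   # 'nips_2013-2017/2016' -> <datasets-dir>/nips_2013-2017/2016/all/{reviews, parsed_pdfs}
    print(dataset_years[dataset])   # 2016
    print(dataset_venues[dataset])  # 3 (all NIPS years share one venue id)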
--------------------------------------------------------------------------------
/src/PeerRead/data_cleaning/extra_vocab.py:
--------------------------------------------------------------------------------
1 | """
2 | vv: wrote this to inspect what bert's tokenizer does with vocabulary terms it doesn't know.
3 | The answer is: it splits them into word pieces where it has embeddings for each piece. Example:
4 |
5 | tokenizer.tokenize('embedding')
6 | ['em', '##bed', '##ding']
7 |
8 | tokenizer.convert_tokens_to_ids(['em', '##bed', '##ding'])
9 | [7861, 8270, 4667]
10 |
11 | Accordingly, the meaning of embedding can be learned so long as there's a suitably rich training corpus
12 | """
13 |
14 | import argparse
15 | import glob
16 | import random
17 |
18 | import io
19 | import json
20 |
21 | import bert.tokenization as tokenization
22 |
23 | rng = random.Random(0)
24 |
25 | def main():
26 |
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument('--review-json-dir', type=str, default=None)
29 | parser.add_argument('--vocab-file', type=str, default=None)
30 |
31 | args = parser.parse_args()
32 |
33 | tokenizer = tokenization.FullTokenizer(
34 | vocab_file=args.vocab_file, do_lower_case=True)
35 |
36 | review_json_dir = args.review_json_dir
37 |
38 | print('Reading reviews from...', review_json_dir)
39 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir)))
40 |
41 | paper_json_filename = paper_json_filenames[0]
42 | with io.open(paper_json_filename) as json_file:
43 | loaded = json.load(json_file)
44 | abstract = loaded['abstract']
45 | print(abstract)
46 | tokens = tokenizer.tokenize(abstract)
47 | print(tokens)
48 | print(tokenizer.convert_tokens_to_ids(tokens))
49 |
50 | # for idx, paper_json_filename in enumerate(paper_json_filenames):
51 | # with io.open(paper_json_filename) as json_file:
52 | # loaded = json.load(json_file)
53 | #
54 | # print(loaded['abstract'])
55 |
56 |
57 | if __name__ == "__main__":
58 | main()
59 |
--------------------------------------------------------------------------------
/src/PeerRead/data_cleaning/scripts/clean_PeerRead.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Process all PeerRead data into tf_record format to feed into BERT.
4 | # Note: the python command below is an unfinished template (no argument values); see data_cleaning/clean_PeerRead.py for a batch driver that processes every dataset.
5 | PeerDir=../dat/PeerRead/
6 |
7 | for dataset in $PeerDir*/; do
8 | echo $dataset
9 | # python -m data_cleaning.process_PeerRead_abstracts \
10 | # --review-json-dir \
11 | # --parsedpdf-json-dir \
12 | # --out-dir \
13 | # --out-file \
14 | # --vocab_file \
15 | # --max_abs_len
16 | done
--------------------------------------------------------------------------------
/src/PeerRead/data_cleaning/scripts/clean_nips_prefix.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | PeerDir=../dat/PeerRead/nips_2013-2017
4 | PARSE_DIR=$PeerDir/2017/all/parsed_pdfs
5 |
6 | for pdf in $PARSE_DIR/*; do
7 | # echo $pdf
8 | mv $pdf $PARSE_DIR/"${pdf#*/pdfs}"
9 | done
10 |
--------------------------------------------------------------------------------
/src/PeerRead/data_cleaning/scripts/merge_train_dev_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Each Peer read dataset is pre-divided into train/dev/test. Merge these into "all"
4 |
5 | #PeerDir=../dat/PeerRead
6 | PeerDir=../dat/PeerRead/nips_2013-2017
7 |
8 | for dir in $PeerDir*/; do
9 | for subdir in $dir*/; do
10 | echo $subdir;
11 | cp -RT $subdir/ $dir/all/
12 | done
13 | done
14 |
--------------------------------------------------------------------------------
/src/PeerRead/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/dataset/__init__.py
--------------------------------------------------------------------------------
/src/PeerRead/dataset/array_from_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | helpers to take samples from the dataset and turn them into numpy arrays
3 | (for ease of inspection and use with baselines)
4 | """
5 | import argparse
6 | import numpy as np
7 | import pandas as pd
8 | import tensorflow as tf
9 | import os
10 | try:
11 | import mkl_random as random
12 | except ImportError:
13 | import numpy.random as random
14 |
15 | import bert.tokenization as tokenization
16 | from PeerRead.dataset.dataset import make_input_fn_from_file, make_buzzy_based_simulated_labeler
17 |
18 |
19 | def dataset_fn_to_df(dataset_fn):
20 |
21 | params = {'batch_size': 1}
22 | dataset = dataset_fn(params)
23 |
24 | itr = dataset.make_one_shot_iterator()
25 |
26 | samples = []
27 |
28 | for i in range(25000):
29 | try:
30 | sample = itr.get_next()
31 | for k in sample:
32 | sample[k] = sample[k].numpy()[0]
33 | samples += [sample]
34 | # print("year: {}".format(sample['year']))
35 |         except (StopIteration, tf.errors.OutOfRangeError):  # dataset exhausted
36 |             print('stopped after {} samples'.format(i))
37 | break
38 |
39 | df = pd.DataFrame(samples)
40 |
41 | return df
42 |
43 | def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0,
44 | base_output_dir='../dat/sim/peerread_buzzytitle_based/'):
45 |
46 | labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed)
47 |
48 | num_splits = 10
49 | dev_splits = [0]
50 | test_splits = [0]
51 |
52 |     # data_file and vocab_file are module-level globals set from the CLI arguments in __main__ below
53 |     # (defaults point at ../dat/PeerRead/proc/arxiv-all.tf_record and the uncased BERT vocab)
54 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
55 |
56 | input_dataset_from_filenames = make_input_fn_from_file(data_file,
57 | 250,
58 | num_splits,
59 | dev_splits,
60 | test_splits,
61 | tokenizer,
62 | is_training=False,
63 | filter_test=False,
64 | shuffle_buffer_size=25000,
65 | seed=seed,
66 | labeler=labeler)
67 |
68 | output_df = dataset_fn_to_df(input_dataset_from_filenames)
69 | output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'})
70 |
71 | output_dir = os.path.join(base_output_dir, "mode{}".format(setting))
72 | os.makedirs(output_dir, exist_ok=True)
73 | output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level))
74 |
75 | output_df.to_csv(output_path, '\t')
76 |
77 |
78 | def main():
79 | tf.enable_eager_execution()
80 |
81 | buzzy_title_based_sim_dfs(treat_strength=beta0, con_strength=beta1, noise_level=gamma, setting=mode, seed=0,
82 | base_output_dir=base_output_dir)
83 |
84 | if __name__ == '__main__':
85 | parser = argparse.ArgumentParser()
86 | parser.add_argument("--data-file", action="store", default='../dat/PeerRead/proc/arxiv-all.tf_record')
87 | parser.add_argument("--vocab-file", action="store", default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
88 | parser.add_argument("--base-output-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/')
89 | parser.add_argument("--mode", action="store", default="simple")
90 | parser.add_argument("--beta0", action="store", default='1.0')
91 | parser.add_argument("--beta1", action="store", default='1.0')
92 | parser.add_argument("--gamma", action="store", default='1.0')
93 | args = parser.parse_args()
94 |
95 | data_file = args.data_file
96 | vocab_file = args.vocab_file
97 | base_output_dir = args.base_output_dir
98 | mode = args.mode
99 | beta0 = float(args.beta0)
100 | beta1 = float(args.beta1)
101 | gamma = float(args.gamma)
102 |
103 | main()
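
A minimal sketch of what dataset_fn_to_df expects (assumes TF 1.x with eager execution enabled, as in main() above): a callable that takes a params dict containing 'batch_size' and returns a tf.data.Dataset of dict-structured samples. The toy feature names below are placeholders.

    import tensorflow as tf
    tf.enable_eager_execution()

    def toy_dataset_fn(params):
        features = {'year': [2016, 2017, 2017], 'venue': [0, 1, 2]}
        return tf.data.Dataset.from_tensor_slices(features).batch(params['batch_size'])

    toy_df = dataset_fn_to_df(toy_dataset_fn)  # -> 3-row DataFrame with columns 'year' and 'venue'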
--------------------------------------------------------------------------------
/src/PeerRead/dataset/sentence_masking.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Create masked LM TF examples for BERT."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 |
23 | import tensorflow as tf
24 |
25 |
26 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
27 | ["index", "label"])
28 |
29 |
30 | def create_masked_lm_predictions(token_ids, masked_lm_prob, max_predictions_per_seq, vocab, seed):
31 | """Creates the predictions for the masked LM objective.
32 |
33 | This should be essentially equivalent to the bits that Bert loads from pre-processed tfrecords
34 |
35 |     Except: we always insert [MASK] at the selected positions, instead of (as in the original BERT recipe) sometimes keeping the original word or substituting a random token
36 | """
37 |
38 | basic_mask = tf.less(
39 | tf.random_uniform(token_ids.shape, minval=0, maxval=1, dtype=tf.float32, seed=seed),
40 | masked_lm_prob)
41 |
42 | # don't mask special characters or padding
43 | cand_indexes = tf.logical_and(tf.not_equal(token_ids, vocab["[CLS]"]),
44 | tf.not_equal(token_ids, vocab["[SEP]"]))
45 | cand_indexes = tf.logical_and(cand_indexes, tf.not_equal(token_ids, 0))
46 | mask = tf.logical_and(cand_indexes, basic_mask)
47 |
48 | # truncate to max predictions for ease of padding
49 | masked_lm_positions = tf.where(mask)
50 |     # rare edge case (observed in practice despite being very unlikely): fewer than two
51 |     # positions get selected for masking; in that case fall back to masking the first two
52 |     # candidate (non-special, non-padding) indices so the downstream shapes stay valid
53 | mlm_shape = tf.shape(masked_lm_positions)[0]
54 | masked_lm_positions = tf.cond(mlm_shape > 1,
55 | lambda: masked_lm_positions,
56 | lambda: tf.where(cand_indexes)[0:2])
57 |
58 | masked_lm_positions = tf.squeeze(masked_lm_positions)[0:max_predictions_per_seq]
59 | masked_lm_positions = tf.cast(masked_lm_positions, dtype=tf.int32)
60 | masked_lm_ids = tf.gather(token_ids, masked_lm_positions)
61 |
62 | mask = tf.cast(
63 | tf.scatter_nd(tf.expand_dims(masked_lm_positions, 1), tf.ones_like(masked_lm_positions), token_ids.shape),
64 | bool)
65 |
66 | output_ids = tf.where(mask, vocab["[MASK]"]*tf.ones_like(token_ids), token_ids)
67 |
68 | # pad out to max_predictions_per_seq
69 | masked_lm_weights = tf.ones_like(masked_lm_ids, dtype=tf.float32) # tracks padding
70 | add_pad = [[0, max_predictions_per_seq - tf.shape(masked_lm_positions)[0]]]
71 | masked_lm_weights = tf.pad(masked_lm_weights, add_pad, 'constant')
72 | masked_lm_positions = tf.pad(masked_lm_positions, add_pad, 'constant')
73 | masked_lm_ids = tf.pad(masked_lm_ids, add_pad, 'constant')
74 |
75 | return output_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights
76 |
77 |
78 | def main(_):
79 | pass
80 |
81 |
82 | if __name__ == "__main__":
83 | main()
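
A minimal eager-mode sketch of the masking function above (assumes TF 1.x, as the rest of the repo does; the toy vocabulary below is hypothetical, only the special-token keys matter):

    import tensorflow as tf
    tf.enable_eager_execution()

    toy_vocab = {"[PAD]": 0, "[CLS]": 101, "[SEP]": 102, "[MASK]": 103}
    # [CLS] <four ordinary tokens> [SEP] + padding, as token ids
    token_ids = tf.constant([101, 7592, 2088, 2003, 4965, 102, 0, 0], dtype=tf.int32)

    output_ids, mlm_positions, mlm_ids, mlm_weights = create_masked_lm_predictions(
        token_ids, masked_lm_prob=0.25, max_predictions_per_seq=4, vocab=toy_vocab, seed=0)

    # output_ids: token_ids with some of the ordinary (non-[CLS]/[SEP]/padding) tokens replaced by the [MASK] id
    # mlm_positions / mlm_ids: where the masks landed and the original ids there, zero-padded to max_predictions_per_seq
    # mlm_weights: 1.0 for real prediction slots, 0.0 for padding slots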
--------------------------------------------------------------------------------
/src/PeerRead/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/model/__init__.py
--------------------------------------------------------------------------------
/src/PeerRead/model/bert_multiclass.py:
--------------------------------------------------------------------------------
1 | """
2 | Helper to check which categorical attributes of PeerRead are predictable from the text
3 | """
4 |
5 | import tensorflow as tf
6 | import bert.modeling as modeling
7 | import bert.optimization as optimization
8 | from causal_bert.bert_unsupervised import get_masked_lm_output
9 | from causal_bert.logging import make_label_binary_prediction_summaries, binary_label_eval_metric_fn
10 |
11 |
12 | def _create_unsupervised_only_model(bert, bert_config, features):
13 | # PeerRead v. reddit inconsistency
14 | if "op_masked_lm_positions" in features:
15 | masked_lm_positions = features["op_masked_lm_positions"]
16 | masked_lm_ids = features["op_masked_lm_ids"]
17 | masked_lm_weights = features["op_masked_lm_weights"]
18 | else:
19 | masked_lm_positions = features["masked_lm_positions"]
20 | masked_lm_ids = features["masked_lm_ids"]
21 | masked_lm_weights = features["masked_lm_weights"]
22 |
23 | masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = get_masked_lm_output(
24 | bert_config, bert.get_sequence_output(), bert.get_embedding_table(),
25 | masked_lm_positions, masked_lm_ids, masked_lm_weights)
26 | return masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs
27 |
28 |
29 | def _make_feedforward_classifier(embedding, labels, num_labels, split, num_hidden_layers, extra_features=None,
30 | label_smoothing=0.01):
31 | regularizer = tf.contrib.layers.l2_regularizer(scale=1e-6)
32 | if extra_features is None:
33 | full_embedding = embedding
34 | else:
35 | full_embedding = tf.concat([embedding, extra_features], axis=1)
36 |
37 | if num_hidden_layers == 0:
38 | logits = tf.layers.dense(full_embedding, num_labels, activation=None,
39 | kernel_regularizer=regularizer, bias_regularizer=regularizer)
40 |
41 | else:
42 | layer = tf.layers.dense(full_embedding, 200, activation=tf.nn.elu)
43 | for _ in range(num_hidden_layers - 1):
44 | layer = tf.layers.dense(layer, 200, activation=tf.nn.elu,
45 | kernel_regularizer=regularizer, bias_regularizer=regularizer)
46 |
47 | if extra_features is None:
48 | final_embedding = layer
49 | else:
50 | final_embedding = tf.concat([layer, extra_features], axis=1)
51 |
52 | logits = tf.layers.dense(final_embedding, num_labels, activation=None,
53 | kernel_regularizer=regularizer, bias_regularizer=regularizer)
54 |
55 | with tf.name_scope("loss"):
56 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32,
57 | on_value=1. - label_smoothing, off_value=label_smoothing)
58 | log_probs = tf.nn.log_softmax(logits, axis=-1)
59 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
60 | censored_per_example_loss = split * per_example_loss
61 | loss = tf.reduce_sum(censored_per_example_loss)
62 |
63 |     probabilities = tf.nn.softmax(logits, axis=-1)[:, 1]  # probability of class 1 (binary-treatment convention; only meaningful when num_labels == 2)
64 |
65 | return loss, per_example_loss, logits, probabilities
66 |
67 |
68 | def _get_getter(ema):
69 | def ema_getter(getter, name, *args, **kwargs):
70 | var = getter(name, *args, **kwargs)
71 | ema_var = ema.average(var)
72 | return ema_var # if ema_var else var
73 |
74 | return ema_getter
75 |
76 |
77 | def multiclass_model_fn_builder(bert_config, init_checkpoint, learning_rate,
78 | num_train_steps, num_warmup_steps, use_tpu,
79 | use_one_hot_embeddings, label_pred=True, unsupervised=False,
80 | polyak=False, use_extra_features=False):
81 | """Returns `model_fn` closure for TPUEstimator."""
82 |
83 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
84 | """The `model_fn` for TPUEstimator."""
85 |
86 | tf.logging.info("*** Features ***")
87 | for name in sorted(features.keys()):
88 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
89 |
90 | target_name = params['target_name']
91 | num_labels = params['num_labels']
92 |
93 | labels = features[target_name]
94 |
95 | # because reddit and peerread use slightly different text and pre-training structure
96 | if "op_token_ids" in features:
97 | token_mask = features["op_token_mask"]
98 | maybe_masked_token_ids = features["op_maybe_masked_input_ids"]
99 | else:
100 | token_mask = features["token_mask"]
101 | maybe_masked_token_ids = features["maybe_masked_input_ids"]
102 |
103 | index = features['index']
104 | in_train = features['in_train']
105 | in_dev = features['in_dev']
106 | in_test = features['in_test']
107 |
108 | is_training = (mode == tf.estimator.ModeKeys.TRAIN)
109 |
110 | # Predictive Model
111 |
112 | bert = modeling.BertModel(
113 | config=bert_config,
114 | is_training=is_training,
115 | input_ids=maybe_masked_token_ids,
116 | input_mask=token_mask,
117 | token_type_ids=None,
118 | use_one_hot_embeddings=use_one_hot_embeddings)
119 |
120 | masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = \
121 | _create_unsupervised_only_model(bert, bert_config, features)
122 |
123 | bert_embedding = bert.get_pooled_output()
124 |
125 | label_loss, per_example_loss, logits, probabilities = \
126 | _make_feedforward_classifier(bert_embedding, labels, num_labels, in_train, num_hidden_layers=0,
127 | extra_features=None, label_smoothing=0.01)
128 |
129 | tf.losses.add_loss(masked_lm_loss)
130 | tf.losses.add_loss(0.1 * label_loss)
131 |
132 | tf.summary.scalar('masked_lm_loss', masked_lm_loss, family='loss')
133 | tf.summary.scalar('label_loss', label_loss, family='loss')
134 |
135 | total_loss = masked_lm_loss + 0.1 * label_loss
136 |
137 | # some logging
138 | make_label_binary_prediction_summaries(per_example_loss, logits, labels, in_train, "train")
139 | make_label_binary_prediction_summaries(per_example_loss, logits, labels, in_dev, "dev")
140 |
141 | # pre-trained model loading
142 | tvars = tf.trainable_variables()
143 | initialized_variable_names = {}
144 | scaffold_fn = None
145 | if init_checkpoint:
146 | (assignment_map, initialized_variable_names
147 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
148 | if use_tpu:
149 |
150 | def tpu_scaffold():
151 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
152 | return tf.train.Scaffold()
153 |
154 | scaffold_fn = tpu_scaffold
155 | else:
156 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
157 |
158 | tf.logging.info("**** Trainable Variables ****")
159 | for var in tvars:
160 | init_string = ""
161 | if var.name in initialized_variable_names:
162 | init_string = ", *INIT_FROM_CKPT*"
163 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
164 | init_string)
165 |
166 | output_spec = None
167 | if mode == tf.estimator.ModeKeys.TRAIN:
168 |
169 | # sgd_opt = tf.train.GradientDescentOptimizer(learning_rate)
170 | # train_op = sgd_opt.minimize(total_loss, global_step=tf.train.get_global_step())
171 |
172 | train_op = optimization.create_optimizer(
173 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
174 |
175 | output_spec = tf.contrib.tpu.TPUEstimatorSpec(
176 | mode=mode,
177 | loss=total_loss,
178 | train_op=train_op,
179 | scaffold_fn=scaffold_fn)
180 |
181 |         elif mode == tf.estimator.ModeKeys.EVAL:
182 |             pass  # EVAL is not implemented for this diagnostic helper
183 |
184 |         else:
185 |             pass  # PREDICT is not implemented either; output_spec remains None
186 |
187 | return output_spec
188 |
189 | return model_fn
190 |
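
A rough sketch (hypothetical; the repo's actual entry point is PeerRead/model/run_multiclass.py, not shown here) of how this model_fn closure plugs into a TPUEstimator. The 'venue'/7-label target is only an example; the feature keys in the final comment are the ones model_fn reads above.

    import tensorflow as tf
    import bert.modeling as modeling
    from PeerRead.model.bert_multiclass import multiclass_model_fn_builder

    bert_config = modeling.BertConfig(vocab_size=30522)  # or BertConfig.from_json_file(...)
    model_fn = multiclass_model_fn_builder(
        bert_config=bert_config, init_checkpoint=None, learning_rate=3e-5,
        num_train_steps=1000, num_warmup_steps=100,
        use_tpu=False, use_one_hot_embeddings=False)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=tf.contrib.tpu.RunConfig(model_dir='../output/multiclass_sketch'),
        params={'target_name': 'venue', 'num_labels': 7},  # keys the model_fn expects
        train_batch_size=16)

    # estimator.train(input_fn=..., max_steps=1000), where input_fn yields dicts with
    # 'maybe_masked_input_ids', 'token_mask', the masked-LM fields, 'index',
    # 'in_train'/'in_dev'/'in_test', and the target column named by 'target_name'.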
--------------------------------------------------------------------------------
/src/PeerRead/submit_scripts/run_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12
4 | DATA_FILE=../dat/PeerRead/proc/arxiv-all.tf_record
5 | OUTPUT_DIR=../output/PeerRead/local_test
6 | #INIT_DIR=../../output/unsupervised_PeerRead_embeddings/
7 | #INIT_FILE=$INIT_DIR/model.ckpt-175000
8 |
9 |
10 | #rm -rf $OUTPUT_DIR
11 |
12 | python -m PeerRead.model.run_causal_bert \
13 | --seed=0 \
14 | --do_train=true \
15 | --do_eval=false \
16 | --do_predict=true \
17 | --input_files_or_glob=$DATA_FILE \
18 | --vocab_file=$BERT_BASE_DIR/vocab.txt \
19 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \
20 | --max_seq_length=250 \
21 | --output_dir=$OUTPUT_DIR \
22 | --train_batch_size=16 \
23 | --learning_rate=3e-5 \
24 | --num_warmup_steps 200 \
25 | --num_train_steps=4500 \
26 |   --save_checkpoints_steps=3000 \
27 | --unsupervised=True \
28 | --label_pred=True \
29 | --num_splits=10 \
30 | --test_splits=0 \
31 | --dev_splits=0 \
32 | --simulated='real' \
33 | --treatment='buzzy_title'
34 | # --init_checkpoint=${INIT_FILE}
35 |
--------------------------------------------------------------------------------
/src/PeerRead/submit_scripts/run_unsupervised.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12
4 | DATA_FILE=../dat/PeerRead/proc/arxiv-all.tf_record
5 | OUTPUT_DIR=../../output/unsupervised_PeerRead_embeddings/
6 |
7 | #rm -rf $OUTPUT_DIR
8 | python -m PeerRead.model.run_causal_bert \
9 | --seed=0 \
10 | --do_train=true \
11 | --input_files_or_glob=${DATA_FILE} \
12 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \
13 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \
14 | --output_dir=${OUTPUT_DIR} \
15 | --max_seq_length=250 \
16 | --train_batch_size=16 \
17 | --learning_rate=3e-5 \
18 | --num_warmup_steps 200 \
19 | --num_train_steps=175000 \
20 | --save_checkpoints_steps 5000 \
21 | --keep_checkpoints 3 \
22 | --unsupervised=True
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/__init__.py
--------------------------------------------------------------------------------
/src/bert/README:
--------------------------------------------------------------------------------
1 | Chunks of Google's BERT code, https://github.com/google-research/bert
2 |
3 | The pre-trained BERT-Base (uncased) checkpoint is presumed to be in:
4 | '../../bert/pre-trained/uncased_L-12_H-768_A-12'
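
A minimal sketch for fetching and unpacking that checkpoint (assumes the standard Google release URL for BERT-Base uncased; adjust `target` so it matches the path above relative to wherever you launch the scripts, e.g. the src/ directory):

    import io, os, zipfile, urllib.request

    url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip'
    target = '../../bert/pre-trained'
    os.makedirs(target, exist_ok=True)
    with urllib.request.urlopen(url) as resp:
        zipfile.ZipFile(io.BytesIO(resp.read())).extractall(target)
    # yields ../../bert/pre-trained/uncased_L-12_H-768_A-12/{vocab.txt, bert_config.json, bert_model.ckpt.*}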
--------------------------------------------------------------------------------
/src/bert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/bert/__init__.py
--------------------------------------------------------------------------------
/src/bert/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | new_global_step = global_step + 1
80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
81 | return train_op
82 |
83 |
84 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
85 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
86 |
87 | def __init__(self,
88 | learning_rate,
89 | weight_decay_rate=0.0,
90 | beta_1=0.9,
91 | beta_2=0.999,
92 | epsilon=1e-6,
93 | exclude_from_weight_decay=None,
94 | name="AdamWeightDecayOptimizer"):
95 | """Constructs a AdamWeightDecayOptimizer."""
96 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
97 |
98 | self.learning_rate = learning_rate
99 | self.weight_decay_rate = weight_decay_rate
100 | self.beta_1 = beta_1
101 | self.beta_2 = beta_2
102 | self.epsilon = epsilon
103 | self.exclude_from_weight_decay = exclude_from_weight_decay
104 |
105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
106 | """See base class."""
107 | assignments = []
108 | for (grad, param) in grads_and_vars:
109 | if grad is None or param is None:
110 | continue
111 |
112 | param_name = self._get_variable_name(param.name)
113 |
114 | m = tf.get_variable(
115 | name=param_name + "/adam_m",
116 | shape=param.shape.as_list(),
117 | dtype=tf.float32,
118 | trainable=False,
119 | initializer=tf.zeros_initializer())
120 | v = tf.get_variable(
121 | name=param_name + "/adam_v",
122 | shape=param.shape.as_list(),
123 | dtype=tf.float32,
124 | trainable=False,
125 | initializer=tf.zeros_initializer())
126 |
127 | # Standard Adam update.
128 | next_m = (
129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
130 | next_v = (
131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
132 | tf.square(grad)))
133 |
134 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
135 |
136 | # Just adding the square of the weights to the loss function is *not*
137 | # the correct way of using L2 regularization/weight decay with Adam,
138 | # since that will interact with the m and v parameters in strange ways.
139 | #
140 |       # Instead we want to decay the weights in a manner that doesn't interact
141 | # with the m/v parameters. This is equivalent to adding the square
142 | # of the weights to the loss with plain (non-momentum) SGD.
143 | if self._do_use_weight_decay(param_name):
144 | update += self.weight_decay_rate * param
145 |
146 | update_with_lr = self.learning_rate * update
147 |
148 | next_param = param - update_with_lr
149 |
150 | assignments.extend(
151 | [param.assign(next_param),
152 | m.assign(next_m),
153 | v.assign(next_v)])
154 | return tf.group(*assignments, name=name)
155 |
156 | def _do_use_weight_decay(self, param_name):
157 | """Whether to use L2 weight decay for `param_name`."""
158 | if not self.weight_decay_rate:
159 | return False
160 | if self.exclude_from_weight_decay:
161 | for r in self.exclude_from_weight_decay:
162 | if re.search(r, param_name) is not None:
163 | return False
164 | return True
165 |
166 | def _get_variable_name(self, param_name):
167 | """Get the variable name from the tensor name."""
168 | m = re.match("^(.*):\\d+$", param_name)
169 | if m is not None:
170 | param_name = m.group(1)
171 | return param_name
172 |
--------------------------------------------------------------------------------
/src/causal_bert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/causal_bert/__init__.py
--------------------------------------------------------------------------------
/src/causal_bert/logging.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def batch_random_agreement(labels, predictions, weights, name=None):
5 | """ Computes the probability of random agreement between the
6 | labels and predictions assuming independence.
7 |
8 | Parameters
9 | ----------
10 | labels: a tensor of any shape taking values in {0, 1}.
11 | predictions: a tensor of the same shape as labels taking values in {0, 1}.
12 | weights: a tensor that can be broadcasted to labels.
13 | name: an optional name for the operation.
14 |
15 | Returns
16 | -------
17 | random_agreement: a scalar tensor representing the probability of random
18 | agreement.
19 | """
20 | with tf.name_scope(name, 'batch_random_agreement', [labels, predictions, weights]):
21 | weights_mean = tf.reduce_mean(weights)
22 | weights_mean = tf.where(tf.not_equal(weights_mean, 0), weights_mean, 1)
23 |
24 | labels = tf.to_float(labels)
25 | predictions = tf.to_float(predictions)
26 |
27 | p_labels = tf.metrics.mean(labels * weights / weights_mean)[1]
28 | p_predictions = tf.metrics.mean(predictions * weights / weights_mean)[1]
29 |
30 | random_agreement = tf.identity(
31 | p_labels * p_predictions + (1 - p_labels) * (1 - p_predictions),
32 | name='random_agreement')
33 |
34 |         # print(random_agreement.name)  # debug output (tensor name of the running-mean op)
35 |
36 | return random_agreement
37 |
38 |
39 | def batch_kappa(labels, predictions, weights, name=None):
40 | """ Computes Cohen's kappa on the given batch of predictions.
41 |
42 | Parameters
43 | ----------
44 | labels: a tensor of any shape taking values in {0, 1}.
45 | predictions: a tensor of the same shape as labels taking values in {0, 1}.
46 | weights: a tensor that can be broadcasted to labels.
47 | name: an optional name for the operation.
48 |
49 | Returns
50 | -------
51 | kappa: a scalar tensor representing the Kappa measure of agreement
52 | between labels and predictions.
53 | """
54 | with tf.name_scope(name, 'batch_kappa', [labels, predictions, weights]):
55 | accuracy = tf.metrics.accuracy(labels, predictions, weights=weights)[1]
56 | random_agreement = batch_random_agreement(labels, predictions, weights)
57 |
58 | # hack for small batch sizes
59 | random_agreement = tf.clip_by_value(random_agreement, 0.001, 0.999)
60 |
61 | kappa = tf.divide(
62 | accuracy - random_agreement, 1 - random_agreement,
63 | name='kappa')
64 |
65 | return kappa
66 |
67 |
68 | def make_label_binary_prediction_summaries(per_example_loss, logits, label_ids, split, family):
69 | with tf.name_scope("summary"+"/"+family):
70 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32, name='predictions')
71 |
72 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels')
73 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels')
74 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels')
75 | kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa')
76 |
77 | loss = tf.metrics.mean(per_example_loss, weights=split)
78 | # censored_per_example_loss = split * per_example_loss
79 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split)
80 |
81 | tf.summary.scalar('accuracy', accuracy[1], family=family)
82 | tf.summary.scalar('precision', precision[1], family=family)
83 | tf.summary.scalar('recall', recall[1], family=family)
84 | tf.summary.scalar('kappa', kappa, family=family)
85 | tf.summary.scalar('loss', loss[1], family=family)
86 |
87 |
88 | def make_label_multiclass_prediction_summaries(per_example_loss, logits, one_hot_label, split, family):
89 | with tf.name_scope("summary"+"/"+family):
90 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32, name='predictions')
91 | label_ids = tf.argmax(one_hot_label, axis=-1, output_type=tf.int32)
92 |
93 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels')
94 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels')
95 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels')
96 | kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa')
97 |
98 | loss = tf.metrics.mean(per_example_loss, weights=split)
99 | # censored_per_example_loss = split * per_example_loss
100 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split)
101 |
102 | tf.summary.scalar('accuracy', accuracy[1], family=family)
103 | tf.summary.scalar('precision', precision[1], family=family)
104 | tf.summary.scalar('recall', recall[1], family=family)
105 | tf.summary.scalar('kappa', kappa, family=family)
106 | tf.summary.scalar('loss', loss[1], family=family)
107 |
108 |
109 |
110 | def make_label_regression_prediction_summaries(per_example_loss, split, family):
111 | with tf.name_scope("summary"+"/"+family):
112 |
113 | loss = tf.metrics.mean(per_example_loss, weights=split)
114 | # censored_per_example_loss = split * per_example_loss
115 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split)
116 |
117 | tf.summary.scalar('loss', loss[1], family=family)
118 |
119 |
120 | def cont_label_eval_metric_fn(per_example_loss, outcome, split=None, family=''):
121 | loss = tf.metrics.mean(per_example_loss, weights=split)
122 |
123 | return {
124 | family+"/eval_loss": loss
125 | }
126 |
127 |
128 | def binary_label_eval_metric_fn(per_example_loss, label_ids, logits, split=None, family=''):
129 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
130 |
131 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split)
132 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels')
133 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels')
134 | # kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa')
135 | loss = tf.metrics.mean(per_example_loss, weights=split)
136 |
137 | return {
138 | family+"/eval_accuracy": accuracy,
139 | family+"/eval_precision": precision,
140 | family+"/eval_recall": recall,
141 | family+"/eval_loss": loss
142 | }
143 |
144 |
145 | def multiclass_label_eval_metric_fn(per_example_loss, logits, one_hot_label, split=None, family=''):
146 |
147 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
148 | label_ids = tf.argmax(one_hot_label, axis=-1, output_type=tf.int32)
149 |
150 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels')
151 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels')
152 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels')
153 | # kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa')
154 | loss = tf.metrics.mean(per_example_loss, weights=split)
155 |
156 | return {
157 | family+"/eval_accuracy": accuracy,
158 | family+"/eval_precision": precision,
159 | family+"/eval_recall": recall,
160 | family+"/eval_loss": loss
161 | }
162 |
163 |
164 | def unsupervised_eval_metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
165 | masked_lm_weights):
166 | """Computes the loss and accuracy of the model."""
167 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
168 | [-1, masked_lm_log_probs.shape[-1]])
169 | masked_lm_predictions = tf.argmax(
170 | masked_lm_log_probs, axis=-1, output_type=tf.int32)
171 | masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
172 | masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
173 | masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
174 | masked_lm_accuracy = tf.metrics.accuracy(
175 | labels=masked_lm_ids,
176 | predictions=masked_lm_predictions,
177 | weights=masked_lm_weights)
178 | masked_lm_mean_loss = tf.metrics.mean(
179 | values=masked_lm_example_loss, weights=masked_lm_weights)
180 |
181 | return {
182 | "masked_lm_accuracy": masked_lm_accuracy,
183 | "masked_lm_loss": masked_lm_mean_loss,
184 | }
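
For reference, a quick unweighted numpy check of the quantities the two batch metrics above track (the TF versions additionally apply the sample weights and running means): Cohen's kappa is (observed agreement - chance agreement) / (1 - chance agreement), with chance agreement p1*p2 + (1-p1)*(1-p2) under independence.

    import numpy as np

    labels      = np.array([1, 1, 0, 0, 1, 0, 1, 0])
    predictions = np.array([1, 0, 0, 0, 1, 1, 1, 0])

    observed = (labels == predictions).mean()        # 0.75
    p_l, p_p = labels.mean(), predictions.mean()     # 0.5, 0.5
    chance = p_l * p_p + (1 - p_l) * (1 - p_p)       # 0.5
    kappa = (observed - chance) / (1 - chance)       # 0.5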
--------------------------------------------------------------------------------
/src/lda_baseline/helpers.py:
--------------------------------------------------------------------------------
1 | from nltk.tokenize import word_tokenize
2 | from nltk.stem import WordNetLemmatizer
3 | from nltk.corpus import stopwords
4 | from sklearn.feature_extraction.text import CountVectorizer
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.decomposition import LatentDirichletAllocation
8 |
9 | class LemmaTokenizer(object):
10 | def __init__(self):
11 | self.wnl = WordNetLemmatizer()
12 | def __call__(self, articles):
13 | stop = stopwords.words('english')
14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop]
15 |
16 | def filter_by_subreddit(reddit, subs=None):
17 | if not subs:
18 | return reddit.index.values
19 | else:
20 | return reddit[reddit.subreddit.isin(subs)].index.values
21 |
22 | def tokenize_documents(documents, max_df0=0.9, min_df0=0.001):
23 |     '''
24 |     From the raw text of a list of documents, build a D x V term-count matrix, where
25 |     D: number of docs
26 |     V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs
27 |     Returns (count matrix, vocabulary, fitted CountVectorizer).
28 |     '''
29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
30 | corpus = count_vect.fit_transform(documents)
31 | vocabulary = count_vect.get_feature_names()
32 |
33 | return corpus,vocabulary,count_vect
34 |
35 | def assign_dev_split(num_docs, percentage=0.05):
36 | indices = np.arange(num_docs)
37 | np.random.shuffle(indices)
38 | size = int(indices.shape[0]*percentage)
39 | dev = indices[:size]
40 | return dev
41 |
42 | def learn_topics(X, X_dev, K=50):
43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1)
44 | print("Fitting", K, "topics...")
45 | lda.fit(X)
46 | score = lda.perplexity(X_dev)
47 | print("Log likelihood:", score)
48 | topics = lda.components_
49 | return score, lda, topics
50 |
51 | def show_topics(vocab, topics, n_words=20):
52 | topic_keywords = []
53 | for topic_weights in topics:
54 | top_keyword_locs = (-topic_weights).argsort()[:n_words]
55 | topic_keywords.append(vocab.take(top_keyword_locs))
56 |
57 | df_topic_keywords = pd.DataFrame(topic_keywords)
58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
60 | return df_topic_keywords
61 |
62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'):
63 | filtered_indices = filtered_df[on].values
64 | doc_idx = [index_mapping[idx] for idx in filtered_indices]
65 | embeddings = doc_embeddings[doc_idx, :]
66 | return embeddings
67 |
68 | def make_index_mapping(df, on='post_index', convert_to_int=True):
69 | if on=='index':
70 | indices = df.index.values
71 | else:
72 | indices = df[on].values
73 |
74 | if convert_to_int:
75 | return {int(ind):i for (i,ind) in enumerate(indices)}
76 |
77 | return {ind:i for (i,ind) in enumerate(indices)}
78 |
79 | def assign_split(df, num_splits=10, col_to_add='split'):
80 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0])
81 | return df
82 |
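
A minimal end-to-end sketch of the helpers above on a toy corpus (assumes the NLTK punkt, wordnet and stopwords resources are already downloaded; the documents are made up for illustration):

    import numpy as np

    docs = [
        "deep neural networks for paper review prediction",
        "topic models as a baseline for causal adjustment",
        "adversarial networks generate realistic abstracts",
    ] * 50  # repeat so the min_df/max_df thresholds behave sensibly

    counts, vocab, vectorizer = tokenize_documents(docs, max_df0=0.95, min_df0=0.0)
    dev_idx = assign_dev_split(counts.shape[0], percentage=0.1)
    train_idx = np.setdiff1d(np.arange(counts.shape[0]), dev_idx)
    score, lda, topics = learn_topics(counts[train_idx, :], counts[dev_idx, :], K=3)
    print(show_topics(np.array(vocab), topics, n_words=5))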
--------------------------------------------------------------------------------
/src/lda_baseline/peerread_fit_topics.py:
--------------------------------------------------------------------------------
1 | from .helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics, filter_by_subreddit
2 | import numpy as np
3 | import pandas as pd
4 | import os
5 | from scipy import sparse
6 | import argparse
7 | import sys
8 |
9 | def load_peerread(path='../dat/PeerRead/'):
10 | return pd.read_csv(path + 'proc_abstracts.csv')
11 |
12 |
13 | def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'):
14 | count_filename = path + 'term_counts'
15 | vocab_filename = path + 'vocab'
16 |
17 | if os.path.exists(count_filename + '.npz') and not force_redo:
18 | return sparse.load_npz(count_filename + '.npz'), np.load(vocab_filename + '.npy')
19 |
20 | post_docs = df[text_col].values
21 | counts, vocab, _ = tokenize_documents(post_docs)
22 | sparse.save_npz(count_filename, counts)
23 | np.save(vocab_filename, vocab)
24 | return counts, np.array(vocab)
25 |
26 | def main():
27 | if not os.path.exists(os.path.join(out_dir, 'topics.npy')) or redo_lda:
28 | if dat_dir:
29 | peerread = load_peerread(path=dat_dir)
30 | terms, vocab = load_term_counts(peerread, path=dat_dir, force_redo=redo_proc)
31 | else:
32 | peerread = load_peerread()
33 | terms, vocab = load_term_counts(peerread, force_redo=redo_proc)
34 |
35 | N = terms.shape[0]
36 | indices = np.arange(N)
37 | dev_idx = assign_dev_split(N)
38 | train_idx = np.setdiff1d(indices, dev_idx)
39 | X_tr = terms[train_idx, :]
40 | X_dev = terms[dev_idx, :]
41 | K_vals = [50]
42 | validation_scores = np.zeros(len(K_vals))
43 | all_topics = []
44 | models = []
45 | for i,k in enumerate(K_vals):
46 | score, lda_obj, topics = learn_topics(X_tr, X_dev, K=k)
47 | validation_scores[i] = score
48 | all_topics.append(topics)
49 | models.append(lda_obj)
50 | k_idx = np.argsort(validation_scores)[0]#[-1]
51 | best_k = K_vals[k_idx]
52 | best_topics = all_topics[k_idx]
53 | best_model = models[k_idx]
54 | best_doc_prop = best_model.transform(terms)
55 | np.save(os.path.join(out_dir, 'topics'), best_topics)
56 | np.save(os.path.join(out_dir, 'document_proportions'), best_doc_prop)
57 | else:
58 | best_topics = np.load(os.path.join(out_dir, 'topics.npy'))
59 | vocab = np.load(os.path.join(out_dir, 'vocab.npy'))
60 |
61 | print("Best topic")
62 | topics = show_topics(vocab, best_topics, n_words=10)
63 | print(topics)
64 |
65 | if __name__ == '__main__':
66 | parser = argparse.ArgumentParser()
67 | parser.add_argument("--dat-dir", action="store", default=None)
68 | parser.add_argument("--out-dir", action="store", default="../dat/PeerRead/")
69 | parser.add_argument("--redo-lda", action="store_true")
70 | parser.add_argument("--redo-proc", action="store_true")
71 | parser.add_argument("--test", action="store_true")
72 | args = parser.parse_args()
73 | out_dir = args.out_dir
74 | redo_lda = args.redo_lda
75 | redo_proc = args.redo_proc
76 | dat_dir = args.dat_dir
77 |
78 | main()
--------------------------------------------------------------------------------
/src/lda_baseline/peerread_get_abstracts.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple pre-processing for PeerRead papers.
3 | Takes in JSON formatted data from ScienceParse and writes the paper IDs and abstracts to a CSV (used by the LDA baseline)
4 |
5 |
6 | Reference example:
7 | https://github.com/tensorlayer/tensorlayer/blob/9528da50dfcaf9f0f81fba9453e488a1e6c8ee8f/examples/data_process/tutorial_tfrecord3.py
8 | """
9 |
10 | import argparse
11 | import glob
12 | import os
13 | import random
14 | import pandas as pd
15 | import io
16 | import json
17 | from dateutil.parser import parse as parse_date
18 | from PeerRead.ScienceParse.Paper import Paper
19 |
20 | rng = random.Random(0)
21 |
22 |
23 | def process_json_paper(paper_json_filename, scienceparse_dir, tokenizer):
24 | paper = Paper.from_json(paper_json_filename)
25 | return paper.ABSTRACT
26 |
27 |
28 | def output_PeerRead_text(review_json_dir, parsedpdf_json_dir,
29 | out_dir, out_file):
30 |
31 | if not os.path.exists(out_dir):
32 | os.makedirs(out_dir)
33 |
34 | paper_data = []
35 | print('Reading reviews from...', review_json_dir)
36 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir)))
37 | for idx, paper_json_filename in enumerate(paper_json_filenames):
38 | paper = Paper.from_json(paper_json_filename)
39 | paper_data.append([paper.ID, paper.ABSTRACT])
40 |
41 | df = pd.DataFrame(paper_data, columns=['paper_id', 'abstract_text'])
42 |     df.to_csv(out_dir + 'proc_abstracts.csv')  # out_file is currently unused; downstream loaders expect 'proc_abstracts.csv'
43 |
44 | def main():
45 | parser = argparse.ArgumentParser()
46 |
47 | parser.add_argument('--review-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/reviews')
48 | parser.add_argument('--parsedpdf-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/parsed_pdfs')
49 | parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/')
50 | parser.add_argument('--out-file', type=str, default='proc_text.csv')
51 |
52 | args = parser.parse_args()
53 |
54 | output_PeerRead_text(args.review_json_dir, args.parsedpdf_json_dir,
55 | args.out_dir, args.out_file)
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/src/lda_baseline/peerread_output_att.py:
--------------------------------------------------------------------------------
1 | from semi_parametric_estimation.ate import ate_estimates
2 | from .peerread_fit_topics import load_peerread
3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split
4 | import numpy as np
5 | import pandas as pd
6 | import os
7 | from sklearn.linear_model import LogisticRegression, LinearRegression
8 | from sklearn.metrics import mean_squared_error as mse
9 | import argparse
10 | import sys
11 | from scipy.special import logit
12 |
13 | def compute_ground_truth_treatment_effect(df):
14 | y1 = df['y1']
15 | y0 = df['y0']
16 | return y1.mean() - y0.mean()
17 |
18 | def get_log_outcomes(outcomes):
19 | #relu
20 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes])
21 | return np.log(outcomes)
22 |
23 | def predict_expected_outcomes(model, doc_embeddings):
24 | features = logit(doc_embeddings)
25 | return model.predict_proba(features)[:,1]
26 |
27 | def fit_conditional_expected_outcomes(outcomes, doc_embeddings):
28 | model = LogisticRegression(solver='liblinear')
29 | features = logit(doc_embeddings)
30 | model.fit(features, outcomes)
31 | if verbose:
32 | print("Training accuracy:", model.score(features, outcomes))
33 | return model
34 |
35 | def predict_treatment_probability(labels, doc_embeddings):
36 | model = LogisticRegression(solver='liblinear')
37 | features = logit(doc_embeddings)
38 | model.fit(features, labels)
39 | if verbose:
40 | print("Training accuracy:", model.score(features, labels))
41 | treatment_probability = model.predict_proba(features)[:,1]
42 | return treatment_probability
43 |
44 | def load_simulated_data():
45 | sim_df = pd.read_csv(simulation_file, delimiter='\t')
46 | return sim_df
47 |
48 | def load_document_proportions(path='../dat/PeerRead/'):
49 | return np.load(path + 'document_proportions.npy')
50 |
51 | def main():
52 | peerread = load_peerread()
53 | indices = peerread['paper_id'].values
54 | index_mapping = make_index_mapping(peerread, on='index')
55 |
56 | if not dat_dir:
57 | doc_embeddings = load_document_proportions()
58 | else:
59 | doc_embeddings = load_document_proportions(path=dat_dir)
60 |
61 | sim_df = load_simulated_data()
62 | num_reps = 10
63 | mean_estimates = {}
64 |
65 | for rep in range(num_reps):
66 | bootstrap_sim_df = assign_split(sim_df, num_splits=2)
67 | bootstrap_sim_df = bootstrap_sim_df[bootstrap_sim_df.split==0]
68 | treatment_labels = bootstrap_sim_df.treatment.values
69 | filtered_doc_embeddings = filter_document_embeddings(bootstrap_sim_df, doc_embeddings, index_mapping, on='id')
70 | treatment_probability = predict_treatment_probability(treatment_labels, filtered_doc_embeddings)
71 |
72 | treated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==1]
73 | untreated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==0]
74 |
75 | all_outcomes = bootstrap_sim_df.outcome.values
76 | outcomes_st_treated = treated_sim.outcome.values
77 | outcomes_st_not_treated = untreated_sim.outcome.values
78 |
79 | doc_embed_st_treated = filter_document_embeddings(treated_sim, doc_embeddings, index_mapping, on='id')
80 | doc_embed_st_not_treated = filter_document_embeddings(untreated_sim, doc_embeddings, index_mapping, on='id')
81 |
82 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, doc_embed_st_treated)
83 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, doc_embed_st_not_treated)
84 |
85 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, filtered_doc_embeddings)
86 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, filtered_doc_embeddings)
87 |
88 | estimates = ate_estimates(expected_outcome_st_not_treated, expected_outcome_st_treated,
89 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03)
90 |
91 | for est, ate in estimates.items():
92 | if est in mean_estimates:
93 | mean_estimates[est].append(ate)
94 | else:
95 | mean_estimates[est] = [ate]
96 |
97 | ground_truth_ate = compute_ground_truth_treatment_effect(sim_df)
98 | mean_estimates.update({'ground_truth_ate':ground_truth_ate})
99 | if verbose:
100 | for est, ates in mean_estimates.items():
101 | print(est, np.mean(ates), np.std(ates))
102 | else:
103 |         config = ';'.join([str(mode), str(params)])
104 | log_file = os.path.join(sim_dir, 'two-stage-lda-estimates.out')
105 | with open(log_file, 'a') as h:
106 | h.write(config + '\n')
107 | for est, ates in mean_estimates.items():
108 | h.write(est + ',' + str(np.mean(ates)) + ',' + str(np.std(ates)) + '\n')
109 |
110 |
111 | if __name__ == '__main__':
112 | parser = argparse.ArgumentParser()
113 | parser.add_argument("--dat-dir", action="store", default=None)
114 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/')
115 | parser.add_argument("--mode", action="store", default="simple")
116 | parser.add_argument("--params", action="store", default="1.0")
117 | parser.add_argument("--verbose", action='store_true')
118 | args = parser.parse_args()
119 |
120 | sim_dir = args.sim_dir
121 | dat_dir = args.dat_dir
122 | verbose = args.verbose
123 | params = args.params
124 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0'
125 | mode = args.mode
126 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv"
127 |
128 | main()
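
For reference, a plain-numpy sketch of the two standard estimators that ate_estimates (defined in semi_parametric_estimation/ate.py, not shown here) is expected to cover, expressed in terms of the quantities computed above: outcome-model predictions Q0, Q1, propensity scores g, treatments T and outcomes Y.

    import numpy as np

    def plugin_ate(Q0, Q1):
        # "plug-in" / outcome-regression estimate
        return np.mean(Q1 - Q0)

    def aiptw_ate(Q0, Q1, g, T, Y):
        # augmented inverse-probability-of-treatment weighting (doubly robust)
        return np.mean(Q1 - Q0 + T * (Y - Q1) / g - (1 - T) * (Y - Q0) / (1 - g))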
--------------------------------------------------------------------------------
/src/lda_baseline/reddit_fit_topics.py:
--------------------------------------------------------------------------------
1 | from reddit.data_cleaning.reddit_posts import load_reddit
2 | from .helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics, filter_by_subreddit
3 | import numpy as np
4 | import pandas as pd
5 | import os
6 | from scipy import sparse
7 | import argparse
8 | import sys
9 |
10 | def load_term_counts(reddit, path='../dat/reddit/', force_redo=False):
11 | count_filename = path + 'term_counts'
12 | vocab_filename = path + 'vocab'
13 |
14 | if os.path.exists(count_filename + '.npz') and not force_redo:
15 | return sparse.load_npz(count_filename + '.npz'), np.load(vocab_filename + '.npy')
16 |
17 | post_docs = reddit['post_text'].values
18 | counts, vocab, _ = tokenize_documents(post_docs)
19 | sparse.save_npz(count_filename, counts)
20 | np.save(vocab_filename, vocab)
21 | return counts, np.array(vocab)
22 |
23 | def main():
24 | if not os.path.exists(os.path.join(out_dir, 'topics.npy')) or redo_lda:
25 |
26 | subreddits = {'keto', 'OkCupid', 'childfree'}
27 | reddit = load_reddit()
28 | filtered_indices = filter_by_subreddit(reddit, subs=subreddits)
29 |
30 | if dat_dir:
31 | terms, vocab = load_term_counts(reddit, path=dat_dir, force_redo=redo_proc)
32 | else:
33 | terms, vocab = load_term_counts(reddit, force_redo=redo_proc)
34 |
35 | terms = terms[filtered_indices, :]
36 | N = terms.shape[0]
37 | indices = np.arange(N)
38 | dev_idx = assign_dev_split(N)
39 | train_idx = np.setdiff1d(indices, dev_idx)
40 | X_tr = terms[train_idx, :]
41 | X_dev = terms[dev_idx, :]
42 | print(dev_idx.shape)
43 |
44 | K_vals = [100]
45 | validation_scores = np.zeros(len(K_vals))
46 | all_topics = []
47 | models = []
48 | for i,k in enumerate(K_vals):
49 | score, lda_obj, topics = learn_topics(X_tr, X_dev, K=k)
50 | validation_scores[i] = score
51 | all_topics.append(topics)
52 | models.append(lda_obj)
53 | k_idx = np.argsort(validation_scores)[0]#[-1]
54 | best_k = K_vals[k_idx]
55 | best_topics = all_topics[k_idx]
56 | best_model = models[k_idx]
57 | best_doc_prop = best_model.transform(terms)
58 | np.save(os.path.join(out_dir, 'topics'), best_topics)
59 | np.save(os.path.join(out_dir, 'document_proportions'), best_doc_prop)
60 | else:
61 | best_topics = np.load(os.path.join(out_dir, 'topics.npy'))
62 | vocab = np.load(os.path.join(out_dir, 'vocab.npy'))
63 |
64 | # print("Best topic")
65 | # topics = show_topics(vocab, best_topics, n_words=10)
66 | # print(topics)
67 |
68 | if __name__ == '__main__':
69 | parser = argparse.ArgumentParser()
70 | parser.add_argument("--dat-dir", action="store", default=None)
71 | parser.add_argument("--out-dir", action="store", default="../dat/reddit/")
72 | parser.add_argument("--redo-lda", action="store_true")
73 | parser.add_argument("--redo-proc", action="store_true")
74 | parser.add_argument("--test", action="store_true")
75 | args = parser.parse_args()
76 | out_dir = args.out_dir
77 | redo_lda = args.redo_lda
78 | redo_proc = args.redo_proc
79 | dat_dir = args.dat_dir
80 | test = args.test
81 |
82 | main()
--------------------------------------------------------------------------------
/src/lda_baseline/reddit_output_att.py:
--------------------------------------------------------------------------------
1 | from semi_parametric_estimation.att import att_estimates
2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed
3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split
4 | import numpy as np
5 | import pandas as pd
6 | import os
7 | from sklearn.linear_model import LogisticRegression, LinearRegression
8 | from sklearn.metrics import mean_squared_error as mse
9 | import argparse
10 | import sys
11 | from scipy.special import logit
12 |
13 | def get_log_outcomes(outcomes):
14 | 	# relu + 1: clip negative outcomes and shift by one so the log below is well defined
15 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes])
16 | return np.log(outcomes)
17 |
18 | def predict_expected_outcomes(model, doc_embeddings):
19 | features = logit(doc_embeddings)
20 | return model.predict(features)
21 |
22 | def fit_conditional_expected_outcomes(outcomes, doc_embeddings):
23 | model = LinearRegression()
24 | features = logit(doc_embeddings)
25 | model.fit(features, outcomes)
26 | predict = model.predict(features)
27 | if verbose:
28 | print("Training MSE:", mse(outcomes, predict))
29 | return model
30 |
31 | def predict_treatment_probability(labels, doc_embeddings):
32 | model = LogisticRegression(solver='liblinear')
33 | features = logit(doc_embeddings)
34 | model.fit(features, labels)
35 | if verbose:
36 | print("Training accuracy:", model.score(features, labels))
37 | treatment_probability = model.predict_proba(features)[:,1]
38 | return treatment_probability
39 |
40 | def load_simulated_data():
41 | sim_df = pd.read_csv(simulation_file, delimiter='\t')
42 | sim_df = sim_df.rename(columns={'index':'post_index'})
43 | return sim_df
44 |
45 | def load_document_proportions(path='../dat/reddit/'):
46 | return np.load(path + 'document_proportions.npy')
47 |
48 | def main():
49 | reddit = load_reddit_processed()
50 | if subs:
51 | reddit = reddit[reddit.subreddit.isin(subs)]
52 |
53 | index_mapping = make_index_mapping(reddit, on='orig_index')
54 | if not dat_dir:
55 | doc_embeddings = load_document_proportions()
56 | else:
57 | doc_embeddings = load_document_proportions(path=dat_dir)
58 |
59 | sim_df = load_simulated_data()
60 | num_reps = 10
61 | mean_estimates = {}
62 |
63 | for rep in range(num_reps):
64 | bootstrap_sim_df = assign_split(sim_df, num_splits=2)
65 | bootstrap_sim_df = bootstrap_sim_df[bootstrap_sim_df.split==0]
66 | treatment_labels = bootstrap_sim_df.treatment.values
67 | filtered_doc_embeddings = filter_document_embeddings(bootstrap_sim_df, doc_embeddings, index_mapping)
68 | treatment_probability = predict_treatment_probability(treatment_labels, filtered_doc_embeddings)
69 |
70 | treated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==1]
71 | untreated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==0]
72 |
73 | all_outcomes = bootstrap_sim_df.outcome.values
74 | outcomes_st_treated = treated_sim.outcome.values
75 | outcomes_st_not_treated = untreated_sim.outcome.values
76 |
77 | doc_embed_st_treated = filter_document_embeddings(treated_sim, doc_embeddings, index_mapping)
78 | doc_embed_st_not_treated = filter_document_embeddings(untreated_sim, doc_embeddings, index_mapping)
79 |
80 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, doc_embed_st_treated)
81 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, doc_embed_st_not_treated)
82 |
83 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, filtered_doc_embeddings)
84 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, filtered_doc_embeddings)
85 |
86 | estimates = att_estimates(expected_outcome_st_not_treated, expected_outcome_st_treated,
87 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean())
88 |
89 | for est, ate in estimates.items():
90 | if est in mean_estimates:
91 | mean_estimates[est].append(ate)
92 | else:
93 | mean_estimates[est] = [ate]
94 | if verbose:
95 | for est, ates in mean_estimates.items():
96 | print(est, np.mean(ates), np.std(ates))
97 | else:
98 | config = ';'.join([str(mode)] + params)
99 | log_file = os.path.join(sim_dir, 'two-stage-lda-estimates.out')
100 | with open(log_file, 'a') as h:
101 | h.write(config + '\n')
102 | for est, ates in mean_estimates.items():
103 | h.write(est + ',' + str(np.mean(ates)) + ',' + str(np.std(ates)) + '\n')
104 |
105 |
106 | if __name__ == '__main__':
107 | parser = argparse.ArgumentParser()
108 | parser.add_argument("--dat-dir", action="store", default=None)
109 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/')
110 | parser.add_argument("--subs", action="store", default='13,6,8')
111 | parser.add_argument("--mode", action="store", default="simple")
112 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0")
113 | parser.add_argument("--verbose", action='store_true')
114 | args = parser.parse_args()
115 |
116 | sim_dir = args.sim_dir
117 | dat_dir = args.dat_dir
118 | subs = None
119 | if args.subs != '':
120 | subs = [int(s) for s in args.subs.split(',')]
121 | verbose = args.verbose
122 | params = args.params.split(',')
123 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2]
124 | subs_string = ', '.join(args.subs.split(','))
125 | mode = args.mode
126 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv"
127 |
128 | main()
--------------------------------------------------------------------------------
/src/lda_baseline/scripts/sweep_over_sims.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #NUM_SEED=2
3 | #SEEDS=$(seq 0 $NUM_SEED)
4 | rm ../dat/sim/reddit_subreddit_based/two-stage-lda-estimates.out
5 | export SUBREDDITS=13,6,8
6 | export BETA0=1.0
7 | declare -a SIMMODES=('simple')
8 | declare -a BETA1S=(1.0 10.0 100.0)
9 | declare -a GAMMAS=(1.0 4.0)
10 |
11 | for SIMMODEj in "${SIMMODES[@]}"; do
12 | for BETA1j in "${BETA1S[@]}"; do
13 | for GAMMAj in "${GAMMAS[@]}"; do
14 | python -m lda_baseline.reddit_output_att \
15 | --subs=${SUBREDDITS} \
16 | --mode=${SIMMODEj} \
17 | --params=${BETA0},${BETA1j},${GAMMAj}
18 | done
19 | done
20 | done
--------------------------------------------------------------------------------
/src/model_checking/plot_adjustment.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.linear_model import LogisticRegression, LinearRegression
6 | from scipy.special import logit
7 | from result_processing.helpers import convert_str_columns_to_float, assign_split, filter_imbalanced_terms
8 | from sklearn.metrics import mean_squared_error as mse
9 | from scipy.sparse import load_npz
10 | import matplotlib.pyplot as plt
11 | from scipy.stats import gaussian_kde
12 |
13 | def get_prediction_file():
14 | predict_df = pd.read_csv(log_file, delimiter='\t')
15 | predict_df = predict_df.rename(columns={'index':'post_index'})
16 | return predict_df
17 |
18 | def fit_treatment(features, labels, verbose=False, coeff_offset=1):
19 | model = LogisticRegression(solver='liblinear')
20 | model.fit(features, labels)
21 | coeffs = np.array(model.coef_).flatten()[coeff_offset:]
22 | if verbose:
23 | print("Model accuracy:", model.score(features, labels))
24 | print("Mean and std. of the word coeffs:", coeffs.mean(), coeffs.std())
25 | return coeffs
26 |
27 | def truncate(df, truncate_level=0.1):
28 | df = df[(df.treatment_probability >= truncate_level) & (df.treatment_probability <= 1.0-truncate_level)]
29 | return df
30 |
31 | def plot_density(unadjusted, adjusted, permuted):
32 | density = gaussian_kde(adjusted.mean(axis=0))
33 | permutation_density = gaussian_kde(permuted.mean(axis=0))
34 | missing_z_density = gaussian_kde(unadjusted.mean(axis=0))
35 | xs = np.linspace(-0.5,0.5,1000)
36 | plt.plot(xs,density(xs), label='Adjusted model (not permuted)')
37 | plt.plot(xs, permutation_density(xs), label='Permuted model')
38 | plt.plot(xs, missing_z_density(xs), label='Unadjusted model')
39 | plt.xlabel('Coefficient values for words')
40 | plt.legend()
41 |
42 | if not os.path.exists(out_dir):
43 | os.makedirs(out_dir)
44 | # plt.tight_layout()
45 | plt.savefig(out_dir + out_file, dpi=300)
46 |
47 | def load_terms(data):
48 | termfile = '../dat/' + data + '/term_counts.npz'
49 | if data == 'reddit':
50 | termfile = '../dat/' + data + '_term_counts.npz'
51 | term_counts = load_npz(termfile).toarray()
52 | if drop_terms:
53 | term_indices = np.arange(term_counts.shape[1])
54 | random_indices = np.random.choice(term_indices, 1000)
55 | term_counts = term_counts[:,random_indices]
56 | return term_counts
57 |
58 | def main():
59 | predict_df = get_prediction_file()
60 | term_counts = load_terms(dataset)
61 | print(predict_df.shape, term_counts.shape)
62 | if dataset == 'reddit':
63 | imbalanced_terms = filter_imbalanced_terms(predict_df, term_counts)
64 | term_counts = term_counts[:,imbalanced_terms]
65 | print(term_counts.shape)
66 |
67 | n_bootstraps = 10
68 | n_w = term_counts.shape[1]
69 |
70 | adjusted = np.zeros((n_bootstraps, n_w))
71 | permuted = np.zeros((n_bootstraps, n_w))
72 | unadjusted = np.zeros((n_bootstraps, n_w))
73 |
74 | for i in range(n_bootstraps):
75 | sample = assign_split(predict_df,num_splits=2)
76 | sample = sample[sample.split==0]
77 | indices = sample.post_index.values
78 | labels = sample.treatment.values
79 | words = term_counts[indices, :]
80 | propensity_score = logit(sample.treatment_probability.values)
81 | all_features = np.column_stack((propensity_score, words))
82 | unadjusted[i,:] = fit_treatment(words, labels, coeff_offset=0)
83 | adjusted[i,:] = fit_treatment(all_features, labels)
84 | np.random.shuffle(words)
85 | permuted_features = np.column_stack((propensity_score, words))
86 | permuted[i,:] = fit_treatment(permuted_features, labels)
87 |
88 | plot_density(unadjusted, adjusted, permuted)
89 |
90 | if __name__ == '__main__':
91 | parser = argparse.ArgumentParser()
92 | parser.add_argument("--out-dir", action="store", default='../figures/')
93 | parser.add_argument("--out-file", action="store", default='reddit.pdf')
94 | parser.add_argument("--log-file", action="store", default='../logdir/reddit/modesimple/beta01.0.beta110.0.gamma1.0/predict/test_results_all.tsv')
95 | parser.add_argument("--drop-terms", action="store_true")
96 | parser.add_argument("--dataset", action="store", default='reddit')
97 | args = parser.parse_args()
98 | log_file = args.log_file
99 | drop_terms = args.drop_terms
100 | dataset = args.dataset
101 | out_dir = args.out_dir
102 | out_file = args.out_file
103 | main()
--------------------------------------------------------------------------------
/src/reddit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/__init__.py
--------------------------------------------------------------------------------
/src/reddit/data_cleaning/BigQuery_get_data:
--------------------------------------------------------------------------------
1 | ```/*
2 | based on https://www.reddit.com/r/bigquery/comments/4f2yp7/best_way_to_look_at_conversation_chains_in_reddit/
3 | and on https://nbviewer.jupyter.org/github/bburky/subredditgenderratios/blob/master/Subreddit%20Gender%20Ratios.ipynb
4 |
5 | KNOWN LIMITATIONS:
6 | does not look for male/female (zodiac?) symbols.
7 | I didn't do a fresh search over subreddits w/ gender, so some may be missing
8 | up/downs was mostly null, so I omitted this field
9 | */
10 |
11 | WITH
12 | reddit_comments AS (
13 | SELECT
14 | body, author, author_flair_text, created_utc, link_id, parent_id, score, controversiality, gilded, id, subreddit, author_flair_css_class
15 | FROM
16 | `fh-bigquery.reddit_comments.2018*`
17 | -- `reddit-gender.comment_response_tuples.gendered_2018`
18 | ),
19 | replies AS (
20 | SELECT
21 | REGEXP_EXTRACT(parent_id, r'[a-zA-Z0-9]+$') as parent_id,
22 | -- MIN(subreddit) AS subreddit,
23 | ARRAY_AGG(STRUCT(body, author, created_utc, id) ORDER BY created_utc ASC) AS reply
24 | FROM
25 | reddit_comments
26 | WHERE
27 | --parent id starting w t1_ indicates not-top-level comment
28 | REGEXP_CONTAINS(parent_id, r'^(t1_)')
29 | GROUP BY
30 | parent_id
31 | ),
32 | ops AS (
33 | SELECT
34 | gender, body, author, author_flair_text, created_utc, link_id, score, controversiality, gilded, id, subreddit, author_flair_css_class
35 | FROM
36 | (
37 | -- male/female
38 | SELECT
39 | *,
40 | REGEXP_EXTRACT(
41 | LOWER(author_flair_css_class),
42 | '(?:fe)?male') AS gender
43 | FROM
44 | reddit_comments
45 | WHERE
46 | subreddit IN (
47 | 'AskMen',
48 | 'AskWomen',
49 | 'AskMenOver30',
50 | 'AskWomenOver30',
51 | 'sexover30')
52 | UNION ALL
53 | -- pink/blue
54 | SELECT
55 | *,
56 | CASE
57 | WHEN author_flair_css_class = 'blue' THEN 'male'
58 | WHEN author_flair_css_class = 'pink' THEN 'female'
59 | END AS gender
60 | FROM
61 | reddit_comments
62 | WHERE
63 | subreddit IN (
64 | 'tall',
65 | 'short')
66 | UNION ALL
67 | -- A/S/L
68 | SELECT
69 | -- need to do this one manually because of asl
70 | body, author, author_flair_text, created_utc, link_id, parent_id, score, controversiality, gilded, id, subreddit, author_flair_css_class,
71 | CASE
72 | WHEN asl = 'm' THEN 'male'
73 | WHEN asl = 'f' THEN 'female'
74 | END AS gender
75 | FROM (
76 | SELECT
77 | *,
78 | REGEXP_EXTRACT(
79 | LOWER(author_flair_text),
80 | "(?:^|[^\\p{L}0-9'\\.\\$])\\s*(?:\\d\\d)?\\s*(f|m)\\s*(?:\\d\\d)?\\s*(?:$|[^\\p{L}0-9'\\.])") AS asl
81 | FROM
82 | reddit_comments
83 | WHERE
84 | subreddit IN (
85 | 'OkCupid',
86 | 'keto',
87 | 'childfree',
88 | 'xxketo',
89 | 'LGBTeens',
90 | 'loseit',
91 | 'Tinder',
92 | 'proED',
93 | 'fatlogic',
94 | 'financialindependence',
95 | 'infj',
96 | 'infertility',
97 | '100DaysofKeto')) )
98 | WHERE
99 | gender IS NOT NULL AND
100 | --parent id starting w t3_ indicates top-level comment
101 | REGEXP_CONTAINS(parent_id, r'^(t3_)')
102 | )
103 |
104 | SELECT
105 | ops.*,
106 | replies.*
107 | FROM
108 | ops INNER JOIN replies ON ops.id = replies.parent_id```
--------------------------------------------------------------------------------
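The query above infers a binary gender label from flair in three ways: explicit male/female flair CSS classes, the pink/blue convention on r/tall and r/short, and an A/S/L-style pattern in free-text flair. Below is a minimal Python sketch of the same flair-parsing logic, useful for spot-checking the regular expressions locally; the `parse_gender` helper, the simplified character classes, and the sample flair strings are illustrative only and are not part of the repository.

import re

# Mirrors the BigQuery logic: flair css class 'male'/'female', 'blue'/'pink',
# or an A/S/L-style age/sex pattern in the flair text (e.g. "28 F", "m 34").
ASL_RE = re.compile(r"(?:^|[^\w'.$])\s*(?:\d\d)?\s*(f|m)\s*(?:\d\d)?\s*(?:$|[^\w'.])",
                    re.IGNORECASE)

def parse_gender(flair_css_class, flair_text):
    """Best-effort gender extraction from reddit flair; returns 'male', 'female', or None."""
    css = (flair_css_class or '').lower()
    if 'female' in css:          # check 'female' first: 'male' is a substring of it
        return 'female'
    if 'male' in css:
        return 'male'
    if css == 'pink':
        return 'female'
    if css == 'blue':
        return 'male'
    match = ASL_RE.search((flair_text or '').lower())
    if match:
        return 'female' if match.group(1) == 'f' else 'male'
    return None

print(parse_gender('female', None))      # female
print(parse_gender(None, "28F / 5'6"))   # female
print(parse_gender(None, 'no flair'))    # None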
/src/reddit/data_cleaning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/data_cleaning/__init__.py
--------------------------------------------------------------------------------
/src/reddit/data_cleaning/reddit_gender_sentiment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 22,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import json\n",
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 23,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "datafile = os.path.join('..', 'dat', '2018')\n",
22 | "\n",
23 | "with open(datafile, 'r') as f:\n",
24 | " record_dicts = []\n",
25 | " for line in f.readlines():\n",
26 | " record = json.loads(line)\n",
27 | " reply_list = record['reply']\n",
28 | " earliest_reply_text = None\n",
29 | " for reply_dict in sorted(reply_list, key=lambda x: x['created_utc']):\n",
30 | " if reply_dict['body'] != '[deleted]' and reply_dict['body'] != '[removed]':\n",
31 | " earliest_reply_text = reply_dict['body']\n",
32 | " if earliest_reply_text:\n",
33 | " break\n",
34 | " if earliest_reply_text:\n",
35 | " record.pop('reply')\n",
36 | " record['reply_text'] = earliest_reply_text\n",
37 | " record_dicts.append(record)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 24,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "reddit_df = pd.DataFrame(record_dicts)\n",
47 | "reddit_df = reddit_df[reddit_df.body != '[deleted]']\n",
48 | "reddit_df = reddit_df.astype({'score':np.int64, 'controversiality':np.int64, 'gilded':np.int64, 'created_utc':np.int64})"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 25,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "from google.cloud import language\n",
58 | "from google.cloud.language import enums\n",
59 | "from google.cloud.language import types\n",
60 | "client = language.LanguageServiceClient()"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 61,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "replies = reddit_df[['body','reply_text']].values\n",
70 | "indices = np.arange(len(replies))\n",
71 | "np.random.shuffle(indices)\n",
72 | "random_idx = indices[:10]"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 63,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "OP: How much our bridal party could drink when given the news they had an open bar tab. That was a couple thousand extra we didn’t expect to spend.\n",
85 | "\n",
86 | "\n",
87 | "Actual married life is exactly the same as before we were married since we lived together and shared finances well before marrying. \n",
88 | "Text: I wonder if it would be a good idea to just say it's a discounted bar or something and then at the end reveal it was an open bar.\n",
89 | "Sentiment: -0.30000001192092896, 0.30000001192092896\n",
90 | "****************************************\n",
91 | "OP: excuse me but this is a christian subreddit\n",
92 | "Text: But they said no homo\n",
93 | "Sentiment: 0.30000001192092896, 0.30000001192092896\n",
94 | "****************************************\n",
95 | "OP: I don't buy that the inches=pounds thing is real but if you want to add some scientific information to this, for me, one inch was equal to 6.1lbs when I took my starting weight and measurements.\n",
96 | "Text: Until I get an accurate scale, I think I'm going to try to do an average of the three numbers I've heard so far. That will at least give me a starting point so I can chart how far I've come. Thanks!\n",
97 | "Sentiment: 0.10000000149011612, 0.5\n",
98 | "****************************************\n",
99 | "OP: My SO referred to my mixed race roommate as \"half-caste\". He didn't realise that was considered offensive by some, it was what everyone at his school said.\n",
100 | "Text: Did you explain to him why it was offensive? I’ve noticed a lot of ppl say things that others say around them. \n",
101 | "Sentiment: -0.10000000149011612, 1.0\n",
102 | "****************************************\n",
103 | "OP: I get like that every shark week!! If you have to, up your calories to maintenance :)\n",
104 | "Text: How do I adjust my macros? I don't want to eat too much fat lol\n",
105 | "Sentiment: -0.10000000149011612, 0.20000000298023224\n",
106 | "****************************************\n",
107 | "OP: Only problem I have with it is the repetition/inconsistency of “ask(ed)”. Aside from that, seems like a real conversation I could see people having. Nice work :)\n",
108 | "Text: I'm trying to cut down on my repetition and more on letting the reader assume it was a question rather than having it say that instead. Thanks. \n",
109 | "Sentiment: 0.0, 0.5\n",
110 | "****************************************\n",
111 | "OP: This week I'm listening David Bowie, Nesrin Sipahi, Run DMC.\n",
112 | "Text: Run DMC :) YES! \n",
113 | "Sentiment: 0.30000001192092896, 0.6000000238418579\n",
114 | "****************************************\n",
115 | "OP: Pursuing the things you want to pursue, whether that’s love, fun, success, or anything else. Being willing to take risks in that pursuit.\n",
116 | "Text: Lots of ppl mentioning taking risks in this thread. What exactly do you mean by that?\n",
117 | "Sentiment: 0.0, 0.10000000149011612\n",
118 | "****************************************\n",
119 | "OP: It's a toss up between my diploma and my wedding ring.\n",
120 | "Text: I still have yet to pick up my diploma from my college and I graduated in 2012...\n",
121 | "Sentiment: 0.30000001192092896, 0.30000001192092896\n",
122 | "****************************************\n",
123 | "OP: Hate it. Partly to do with my other mental illnesses, but have trouble with hygiene in general.\n",
124 | "Text: How do you mean? If you don't mind sharing, that is. \n",
125 | "Sentiment: 0.0, 0.10000000149011612\n",
126 | "****************************************\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "for idx in random_idx:\n",
132 | " op = replies[idx][0]\n",
133 | " post = replies[idx][1]\n",
134 | " lines = post.split('\\n')\n",
135 | " for text in lines:\n",
136 | " if text == '':\n",
137 | " continue\n",
138 | " document = types.Document(\n",
139 | " content=text,\n",
140 | " type=enums.Document.Type.PLAIN_TEXT)\n",
141 | " sentiment = client.analyze_sentiment(document=document).document_sentiment\n",
142 | " print(\"OP:\", op)\n",
143 | " print(\"Text:\", text)\n",
144 | " print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))\n",
145 | " print(\"*\"*40)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": []
154 | }
155 | ],
156 | "metadata": {
157 | "kernelspec": {
158 | "display_name": "Python 3",
159 | "language": "python",
160 | "name": "python3"
161 | },
162 | "language_info": {
163 | "codemirror_mode": {
164 | "name": "ipython",
165 | "version": 3
166 | },
167 | "file_extension": ".py",
168 | "mimetype": "text/x-python",
169 | "name": "python",
170 | "nbconvert_exporter": "python",
171 | "pygments_lexer": "ipython3",
172 | "version": "3.6.6"
173 | }
174 | },
175 | "nbformat": 4,
176 | "nbformat_minor": 2
177 | }
178 |
--------------------------------------------------------------------------------
/src/reddit/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/dataset/__init__.py
--------------------------------------------------------------------------------
/src/reddit/dataset/array_from_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | helpers to take samples from the dataset and turn them into numpy arrays
3 | (for ease of inspection and use with baselines)
4 | """
5 | import argparse
6 | import os
7 | import numpy as np
8 | import pandas as pd
9 | import tensorflow as tf
10 | try:
11 | import mkl_random as random
12 | except ImportError:
13 | import numpy.random as random
14 |
15 | import bert.tokenization as tokenization
16 | from reddit.dataset.dataset import make_input_fn_from_file, make_subreddit_based_simulated_labeler
17 |
18 |
19 | def dataset_fn_to_df(dataset_fn):
20 |
21 | params = {'batch_size': 1}
22 | dataset = dataset_fn(params)
23 |
24 | itr = dataset.make_one_shot_iterator()
25 |
26 | samples = []
27 |
28 | for i in range(250000):
29 | try:
30 | sample = itr.get_next()
31 | for k in sample:
32 | sample[k] = sample[k].numpy()[0]
33 | samples += [sample]
34 | # print("year: {}".format(sample['year']))
35 |         except (tf.errors.OutOfRangeError, StopIteration):
36 |             print("dataset exhausted after {} samples".format(i))
37 |             break
38 |
39 | df = pd.DataFrame(samples)
40 |
41 | return df
42 |
43 |
44 | def subreddit_based_sim_dfs(subreddits, treat_strength, con_strength, noise_level, setting="simple", seed=0,
45 | base_output_dir='../dat/sim/reddit_subreddit_based/'):
46 |
47 | labeler = make_subreddit_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed)
48 |
49 | num_splits = 10
50 | dev_splits = [0]
51 | test_splits = [0]
52 |
53 | # data_file = '../dat/reddit/proc.tf_record'
54 | # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt"
55 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
56 |
57 | input_dataset_from_filenames = make_input_fn_from_file(data_file,
58 | 250,
59 | num_splits,
60 | dev_splits,
61 | test_splits,
62 | tokenizer,
63 | subreddits=subreddits,
64 | is_training=False,
65 | filter_test=False,
66 | shuffle_buffer_size=25000,
67 | seed=seed,
68 | labeler=labeler)
69 |
70 | all_data = dataset_fn_to_df(input_dataset_from_filenames)
71 | output_df = all_data[['index', 'gender','outcome', 'y0', 'y1']]
72 | output_df = output_df.rename(index=str, columns={'gender': 'treatment'})
73 |
74 | output_dir = os.path.join(base_output_dir, "subreddits{}".format(subreddits), "mode{}".format(setting))
75 | os.makedirs(output_dir, exist_ok=True)
76 | output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level))
77 |
78 | output_df.to_csv(output_path, '\t')
79 |
80 |
81 | def main():
82 | tf.enable_eager_execution()
83 |
84 |
85 | subreddit_based_sim_dfs(subreddits=subs, treat_strength=beta0, con_strength=beta1, noise_level=gamma, setting=mode, seed=0,
86 | base_output_dir=base_output_dir)
87 |
88 |
89 |
90 | # print(itr.get_next()["token_ids"].name)
91 | # for i in range(1000):
92 | # sample = itr.get_next()
93 |
94 | #
95 | # print(np.unique(df['year']))
96 | # print(df.groupby(['year'])['buzzy_title'].agg(np.mean))
97 | # print(df.groupby(['year'])['theorem_referenced'].agg(np.mean))
98 | # print(df.groupby(['year'])['accepted'].agg(np.mean))
99 |
100 |
101 |
102 | if __name__ == '__main__':
103 | parser = argparse.ArgumentParser()
104 | parser.add_argument("--data-file", action="store", default='../dat/reddit/proc.tf_record')
105 | parser.add_argument("--vocab-file", action="store", default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
106 | parser.add_argument("--base-output-dir", action="store", default='../dat/sim/reddit_subreddit_based/')
107 | parser.add_argument("--subs", action="store", default='13,8,6')
108 | parser.add_argument("--mode", action="store", default="simple")
109 | parser.add_argument("--beta0", action="store", default='1.0')
110 | parser.add_argument("--beta1", action="store", default='1.0')
111 | parser.add_argument("--gamma", action="store", default='1.0')
112 | args = parser.parse_args()
113 |
114 | data_file = args.data_file
115 | vocab_file = args.vocab_file
116 | base_output_dir = args.base_output_dir
117 | subs = [int(s) for s in args.subs.split(',')]
118 | mode = args.mode
119 | beta0 = float(args.beta0)
120 | beta1 = float(args.beta1)
121 | gamma = float(args.gamma)
122 |
123 | # pass
124 | main()
--------------------------------------------------------------------------------
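dataset_fn_to_df above materializes the tf.data pipeline one example at a time under eager execution, which makes it easy to inspect the simulated labels as a pandas DataFrame. The sketch below shows that call pattern, mirroring the argparse defaults and the arguments used in subreddit_based_sim_dfs; it assumes the processed tfrecord and the pre-trained BERT vocab exist at those paths and that the command is run from src/ (as in the submit scripts).

import tensorflow as tf
import bert.tokenization as tokenization
from reddit.dataset.dataset import make_input_fn_from_file, make_subreddit_based_simulated_labeler
from reddit.dataset.array_from_dataset import dataset_fn_to_df

tf.enable_eager_execution()

tokenizer = tokenization.FullTokenizer(
    vocab_file='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt', do_lower_case=True)
labeler = make_subreddit_based_simulated_labeler(1.0, 1.0, 1.0, setting='simple', seed=0)

# Same positional/keyword arguments as subreddit_based_sim_dfs above.
input_fn = make_input_fn_from_file('../dat/reddit/proc.tf_record', 250, 10, [0], [0], tokenizer,
                                   subreddits=[13, 6, 8], is_training=False, filter_test=False,
                                   shuffle_buffer_size=25000, seed=0, labeler=labeler)

df = dataset_fn_to_df(input_fn)
print(df.shape)
print(df[['index', 'gender', 'outcome']].head())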
/src/reddit/dataset/sentence_masking.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Create masked LM TF examples for BERT."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 |
23 | import tensorflow as tf
24 |
25 |
26 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
27 | ["index", "label"])
28 |
29 |
30 | def create_masked_lm_predictions(token_ids, masked_lm_prob, max_predictions_per_seq, vocab, seed):
31 | """Creates the predictions for the masked LM objective.
32 |
33 | This should be essentially equivalent to the bits that Bert loads from pre-processed tfrecords
34 |
35 | Except: we just include masks instead of randomly letting the words through or randomly replacing
36 | """
37 |
38 | basic_mask = tf.less(
39 | tf.random_uniform(token_ids.shape, minval=0, maxval=1, dtype=tf.float32, seed=seed),
40 | masked_lm_prob)
41 |
42 | # don't mask special characters or padding
43 | cand_indexes = tf.logical_and(tf.not_equal(token_ids, vocab["[CLS]"]),
44 | tf.not_equal(token_ids, vocab["[SEP]"]))
45 | cand_indexes = tf.logical_and(cand_indexes, tf.not_equal(token_ids, 0))
46 | mask = tf.logical_and(cand_indexes, basic_mask)
47 |
48 | # truncate to max predictions for ease of padding
49 | masked_lm_positions = tf.where(mask)
50 | # TODO: it should be essentially impossible for me to see this bug (very unlikely), but I do... symptom of :( ?
51 | # very rare event: nothing gets picked for mask, causing an irritating bug
52 |     # in this case, just mask the first two candidate indices (see the tf.cond below)
53 | mlm_shape = tf.shape(masked_lm_positions)[0]
54 | masked_lm_positions = tf.cond(mlm_shape > 1,
55 | lambda: masked_lm_positions,
56 | lambda: tf.where(cand_indexes)[0:2])
57 |
58 | masked_lm_positions = tf.squeeze(masked_lm_positions)[0:max_predictions_per_seq]
59 | masked_lm_positions = tf.cast(masked_lm_positions, dtype=tf.int32)
60 | masked_lm_ids = tf.gather(token_ids, masked_lm_positions)
61 |
62 | mask = tf.cast(
63 | tf.scatter_nd(tf.expand_dims(masked_lm_positions, 1), tf.ones_like(masked_lm_positions), token_ids.shape),
64 | bool)
65 |
66 | output_ids = tf.where(mask, vocab["[MASK]"]*tf.ones_like(token_ids), token_ids)
67 |
68 | # pad out to max_predictions_per_seq
69 | masked_lm_weights = tf.ones_like(masked_lm_ids, dtype=tf.float32) # tracks padding
70 | add_pad = [[0, max_predictions_per_seq - tf.shape(masked_lm_positions)[0]]]
71 | masked_lm_weights = tf.pad(masked_lm_weights, add_pad, 'constant')
72 | masked_lm_positions = tf.pad(masked_lm_positions, add_pad, 'constant')
73 | masked_lm_ids = tf.pad(masked_lm_ids, add_pad, 'constant')
74 |
75 | return output_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights
76 |
77 |
78 | def main(_):
79 | pass
80 |
81 |
82 | if __name__ == "__main__":
83 |     # NOTE: no flags module is imported or defined in this file, so the original
84 |     # mark_flag_as_required("input_file"/"output_file"/"vocab_file") calls would
85 |     # raise a NameError; main() is a no-op, so we only invoke the app runner below.
86 | tf.app.run()
87 |
--------------------------------------------------------------------------------
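create_masked_lm_predictions masks each non-special, non-padding token with probability masked_lm_prob, truncates the selection to max_predictions_per_seq, and pads the position/id/weight outputs to that fixed length. A small eager-mode sketch of calling it on a toy sequence follows; the toy vocab ids and the TF 1.x eager setup are assumptions for illustration, not values used by the repository.

import tensorflow as tf
from reddit.dataset.sentence_masking import create_masked_lm_predictions

tf.enable_eager_execution()

# Toy vocab containing the three special ids the function needs; other ids are arbitrary.
vocab = {'[PAD]': 0, '[CLS]': 101, '[SEP]': 102, '[MASK]': 103, 'hello': 7, 'world': 8}
token_ids = tf.constant([101, 7, 8, 7, 8, 102, 0, 0], dtype=tf.int32)

output_ids, positions, ids, weights = create_masked_lm_predictions(
    token_ids, masked_lm_prob=0.25, max_predictions_per_seq=4, vocab=vocab, seed=0)

print(output_ids.numpy())   # selected content tokens replaced by vocab['[MASK]'] = 103
print(positions.numpy())    # masked positions, zero-padded to length 4
print(ids.numpy())          # original token ids at those positions
print(weights.numpy())      # 1.0 for real predictions, 0.0 for padding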
/src/reddit/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/model/__init__.py
--------------------------------------------------------------------------------
/src/reddit/submit_scripts/run_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12
4 | export INIT_FILE=../dat/reddit/model.ckpt-400000
5 | export DATA_FILE=../dat/reddit/proc.tf_record
6 | export OUTPUT_DIR=../output/reddit_embeddings/
7 |
8 | #13,6,8 are keto, okcupid, childfree
9 | export SUBREDDITS=13,6,8
10 | export USE_SUB_FLAG=false
11 | export BETA0=1.0
12 | export BETA1=1.0
13 | export GAMMA=1.0
14 |
15 | python -m reddit.model.run_causal_bert \
16 | --seed=0 \
17 | --do_train=true \
18 | --do_eval=false \
19 | --do_predict=true \
20 | --label_pred=true \
21 | --unsupervised=true \
22 | --input_files_or_glob=${DATA_FILE} \
23 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \
24 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \
25 | --output_dir=${OUTPUT_DIR} \
26 | --dev_splits=0 \
27 | --test_splits=0 \
28 | --max_seq_length=128 \
29 | --train_batch_size=16 \
30 | --learning_rate=3e-5 \
31 | --num_warmup_steps 1000 \
32 | --num_train_steps=10000 \
33 | --save_checkpoints_steps=5000 \
34 | --keep_checkpoints=1 \
35 | --subreddits=${SUBREDDITS} \
36 | --beta0=${BETA0} \
37 | --beta1=${BETA1} \
38 | --gamma=${GAMMA}
39 | # --init_checkpoint=${INIT_FILE}
--------------------------------------------------------------------------------
/src/reddit/submit_scripts/run_unsupervised.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12
4 |
5 | export DATA_FILE=../dat/reddit/proc.tf_record
6 | export OUTPUT_DIR=../output/reddit_embeddings/
7 |
8 | #rm -rf $OUTPUT_DIR
9 | python -m reddit.model.run_unsupervised_pretraining \
10 | --seed=0 \
11 | --do_train=true \
12 | --input_file=${DATA_FILE} \
13 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \
14 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \
15 | --output_dir=${OUTPUT_DIR} \
16 | --max_seq_length=256 \
17 | --train_batch_size=16 \
18 | --learning_rate=3e-5 \
19 | --num_warmup_steps 200 \
20 | --num_train_steps=175000 \
21 | --save_checkpoints_steps 5000 \
22 | --keep_checkpoints 3
--------------------------------------------------------------------------------
/src/result_processing/helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.feature_extraction.text import CountVectorizer
3 | np.random.seed(0)
4 |
5 | def convert_str_columns_to_float(df):
6 | df['expected_outcome_st_treatment'] = df['expected_outcome_st_treatment'].str[1:-1]
7 | df['expected_outcome_st_treatment'] = df['expected_outcome_st_treatment'].astype(np.float64)
8 |
9 | df['expected_outcome_st_no_treatment'] = df['expected_outcome_st_no_treatment'].str[1:-1]
10 | df['expected_outcome_st_no_treatment'] = df['expected_outcome_st_no_treatment'].astype(np.float64)
11 | return df
12 |
13 |
14 | def tokenize_documents(documents, max_df0=0.8, min_df0=0.01, print_vocabulary=False, outfolder=None, output_vocabulary_fname='vocabulary.dat'):
15 |     '''
16 |     From a list of documents' raw text, build a D x V term-count matrix.
17 |     D: number of docs
18 |     V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs
19 |     '''
20 |     from nltk.corpus import stopwords
21 |     stop = stopwords.words('english')
22 |     count_vect = CountVectorizer(stop_words=stop, max_df=max_df0, min_df=min_df0)
23 |     corpus = count_vect.fit_transform(documents)
24 |     vocabulary = count_vect.get_feature_names()
25 |
26 |     return corpus, vocabulary, count_vect
27 |
28 |
29 | def assign_split(df, num_splits=10, col_to_add='split'):
30 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0])
31 | return df
32 |
33 |
34 | def filter_imbalanced_terms(df, term_counts, imbalance=0.1, key='post_index'):
35 | t_indices = []
36 | n_terms = term_counts.shape[1]
37 | for t in range(n_terms):
38 | ind_occur = np.nonzero(term_counts[:,t])[0]
39 | subset = df[df[key].isin(ind_occur)]
40 | if subset.shape[0] != 0:
41 | prop_men = subset[subset.treatment==1].shape[0]/subset.shape[0]
42 | prop_women = subset[subset.treatment==0].shape[0]/subset.shape[0]
43 | if abs(prop_women-prop_men)>=imbalance:
44 | t_indices.append(t)
45 | return t_indices
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
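tokenize_documents turns raw text into a document-term count matrix (with NLTK English stopwords removed), and filter_imbalanced_terms keeps only terms whose usage differs between treated and control posts by at least the given margin. A toy sketch of how these helpers fit together is below; the three example documents and the thresholds are made up, and it assumes the NLTK stopword corpus has been downloaded.

import numpy as np
import pandas as pd
from result_processing.helpers import tokenize_documents, assign_split, filter_imbalanced_terms

docs = ["the cat sat on the mat",
        "the dog ate my homework",
        "cats and dogs living together"]
counts, vocab, vectorizer = tokenize_documents(docs, max_df0=1.0, min_df0=1)
print(counts.shape, len(vocab))  # (3, V): one row per document, one column per retained term

# Term-count rows are indexed by post_index, matching the prediction dataframes.
df = pd.DataFrame({'post_index': np.arange(len(docs)), 'treatment': [1, 0, 1]})
df = assign_split(df, num_splits=2)
imbalanced = filter_imbalanced_terms(df, counts.toarray(), imbalance=0.1)
print(len(imbalanced), "terms with a treated/control usage gap of at least 10%")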
/src/result_processing/process_predictions.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | from reddit.data_cleaning import load_reddit, process_text_length
4 | import pandas as pd
5 | import numpy as np
6 | from random import sample
7 | import matplotlib.pyplot as plt
8 | from scipy.stats import pearsonr
9 |
10 | from .helpers import convert_str_columns_to_float
11 |
12 |
13 | def plot_covariate_proportions_per_stratum(treated, control, num_bins, covariate='subreddit'):
14 | cov_vals = treated[covariate].values
15 | n_groups = num_bins
16 |
17 |     for val in np.unique(cov_vals):  # covariate values repeat across strata; plot each one once
18 | # data to plot
19 | treat_props = treated.loc[treated[covariate] == val, 'count'].values
20 | control_props = control.loc[control[covariate] == val, 'count'].values
21 |
22 | # create plot
23 | fig, ax = plt.subplots()
24 | index = np.arange(n_groups)
25 | bar_width = 0.3
26 | opacity = 0.8
27 |
28 | rects1 = plt.bar(index, treat_props, bar_width,
29 | alpha=opacity,
30 | color='b',
31 | label='Treated Units')
32 |
33 | rects2 = plt.bar(index + bar_width, control_props, bar_width,
34 | alpha=opacity,
35 | color='g',
36 | label='Control Units')
37 |
38 | plt.ylim((0.0,1.0))
39 |     plt.xlabel('Strata')
40 | plt.ylabel('Proportions of posts in ' + covariate + ':' + val)
41 | plt.xticks(index + bar_width, tuple(range(1,num_bins+1)))
42 | plt.legend()
43 |
44 | plt.tight_layout()
45 | plt.savefig(os.path.join(log_dir, 'proportions_for_' + covariate + '_' + val + '.png'))
46 |
47 | def normalize(df, col):
48 | vals = df[col].values
49 | min_col = vals.min()
50 | max_col = vals.max()
51 | df[col] = (df[col] - min_col)/(max_col-min_col)
52 | return df
53 |
54 |
55 | def get_covariate_proportions(stratified_df, covariate='subreddit'):
56 | counts_df = stratified_df.groupby(['strata', covariate]).size().reset_index(name="count")
57 | total_by_strata = stratified_df.groupby("strata").size().reset_index(name="total")
58 | counts_df = counts_df.merge(total_by_strata, how='inner', on='strata')
59 | counts_df['count'] /= counts_df['total']
60 | return counts_df
61 |
62 |
63 | def get_text_results(reddit_df, result_df, sub=None):
64 | indices = result_df['index'].values
65 | result_df = reddit_df.loc[indices, ['subreddit', 'post_text', 'author']]
66 |
67 | if sub:
68 | result_df = result_df[result_df.subreddit.isin([sub])]
69 |
70 | return result_df
71 |
72 |
73 | def print_example_posts(sub_text_df, n=10):
74 | post_list = [tuple(val) for val in sub_text_df.values]
75 | random_posts = sample(post_list, n)
76 | print("*"*10 + "Examples" + "*"*10)
77 | for post in random_posts:
78 | print("Subreddit:", post[0])
79 | print("-"*40)
80 | print("Text:", post[1])
81 | print("-"*40)
82 | print("Author:", post[2])
83 | print("*"*40)
84 |
85 |
86 | def stratify_by_value(df, num_bins=10, sort_by='treatment_probability', col_to_add='strata'):
87 | values = df[sort_by].values
88 | min_val = values.min()
89 | max_val = values.max()
90 | interval = (max_val-min_val)/num_bins
91 | bins = np.arange(min_val, max_val, step=interval)
92 | bin_indices = np.digitize(values, bins)
93 | df[col_to_add] = bin_indices
94 | return df
95 |
96 |
97 | def main():
98 | num_examples_to_print=5
99 | num_bins = 5
100 |
101 | predictions_file = os.path.join(log_dir, 'predict', 'test_results_all.tsv')
102 | predict_df = pd.read_csv(predictions_file, delimiter='\t')
103 | predict_df = convert_str_columns_to_float(predict_df)
104 | predict_df = predict_df.rename(columns={'index':'post_index'})
105 | print(predict_df)
106 |
107 | treated = predict_df[predict_df.treatment == 1]
108 | control = predict_df[predict_df.treatment == 0]
109 |
110 | treated_stratified = stratify_by_value(treated, num_bins=num_bins)
111 | control_stratified = stratify_by_value(control, num_bins=num_bins)
112 |
113 | if res_type == 'subreddit':
114 | treated_cov_prop = get_covariate_proportions(treated_stratified)
115 | control_cov_prop = get_covariate_proportions(control_stratified)
116 |
117 | plot_covariate_proportions_per_stratum(treated_cov_prop, control_cov_prop, num_bins)
118 |
119 | for i in range(1,num_bins+1):
120 | print("*"*20, "Proportions for stratum:", i, "*"*20)
121 | print("-"*10, "Treated:", "-"*10)
122 | print(treated_cov_prop[treated_cov_prop.strata == i])
123 |
124 | print("-"*10, "Control:", "-"*10)
125 | print(control_cov_prop[control_cov_prop.strata == i])
126 |
127 | elif res_type == 'length':
128 | text = load_reddit()
129 | text = process_text_length(text)
130 | text = normalize(text, 'post_length')
131 |
132 | treated = treated.merge(text, left_on='post_index', right_index=True, how='inner')
133 | control = control.merge(text, left_on='post_index', right_index=True, how='inner')
134 |
135 | treated_corr = pearsonr(treated.post_length.values, treated.treatment_probability.values)
136 | control_corr = pearsonr(control.post_length.values, control.treatment_probability.values)
137 | print("Corr. between treated and post length", treated_corr)
138 | print("Corr. between control and post length", control_corr)
139 |
140 |
141 | # binned_post_length = stratify_by_value(text, num_bins=20, sort_by='post_length', col_to_add='length_bin')
142 |
143 | # columns_to_keep = treated_stratified.columns.tolist().extend('length_bin')
144 | # treated_text = treated_stratified.merge(binned_post_length, left_on='post_index', right_index=True, how='inner')# [columns_to_keep]
145 | # control_text = control_stratified.merge(binned_post_length, left_on='post_index', right_index=True, how='inner')#[columns_to_keep]
146 |
147 | # treated_cov_prop = get_covariate_proportions(treated_text, covariate='length_bin')
148 | # control_cov_prop = get_covariate_proportions(control_text, covariate='length_bin')
149 |
150 | # for i in range(1,num_bins+1):
151 | # print("*"*20, "Proportions for stratum:", i, "*"*20)
152 | # print("-"*10, "Treated:", "-"*10)
153 | # print(treated_cov_prop[treated_cov_prop.strata == i])
154 |
155 | # print("-"*10, "Control:", "-"*10)
156 | # print(control_cov_prop[control_cov_prop.strata == i])
157 |
158 |
159 |
160 | if __name__ == '__main__':
161 | parser = argparse.ArgumentParser()
162 | parser.add_argument("--log-dir", action="store", default="../logdir/simulated_training_1.0_1.0_1.0")
163 | parser.add_argument("--result-type", action="store", default="subreddit")
164 | args = parser.parse_args()
165 | log_dir = args.log_dir
166 | res_type = args.result_type
167 |
168 | main()
--------------------------------------------------------------------------------
/src/result_processing/prop_sim_plotting.py:
--------------------------------------------------------------------------------
1 | import os
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | import result_processing.compute_att as att
5 | import pandas as pd
6 |
7 |
8 | def make_reddit_prop_plt():
9 | sns.set()
10 | prop_expt = pd.DataFrame(att.process_propensity_experiment())
11 |
12 | prop_expt = prop_expt[['exog', 'plugin', 'one_step_tmle', 'very_naive']]
13 | prop_expt = prop_expt.rename(index=str, columns={'exog': 'Exogeneity',
14 | 'very_naive': 'Unadjusted',
15 | 'plugin': 'Plug-in',
16 | 'one_step_tmle': 'TMLE'})
17 | prop_expt = prop_expt.set_index('Exogeneity')
18 |
19 | plt.figure(figsize=(4.75, 3.00))
20 | # plt.figure(figsize=(2.37, 1.5))
21 | sns.scatterplot(data=prop_expt, legend='brief', s=75)
22 | plt.xlabel("Exogeneity", fontfamily='monospace')
23 | plt.ylabel("NDE Estimate", fontfamily='monospace')
24 | plt.tight_layout()
25 |
26 | fig_dir = '../output/figures'
27 | os.makedirs(fig_dir, exist_ok=True)
28 | plt.savefig(os.path.join(fig_dir,'reddit_propensity.pdf'))
29 |
30 |
31 | def main():
32 | make_reddit_prop_plt()
33 |
34 |
35 | if __name__ == '__main__':
36 | main()
--------------------------------------------------------------------------------
/src/semi_parametric_estimation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/semi_parametric_estimation/__init__.py
--------------------------------------------------------------------------------
/src/semi_parametric_estimation/ate.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.special import logit, expit
3 | from scipy.optimize import minimize
4 |
5 | from .helpers import truncate_by_g, mse, cross_entropy, truncate_all_by_g
6 | from .att import att_estimates
7 |
8 | """
9 | Note: the standard deviations reported by these methods are actually standard deviations conditioned on the nuisance
10 | parameters.
11 |
12 | That is, we do not account for variability in the estimation of Q and g.
13 | """
14 |
15 |
16 | def _perturbed_model_bin_outcome(q_t0, q_t1, g, t, eps):
17 | """
18 | Helper for psi_tmle_bin_outcome
19 |
20 | Returns q_\eps(t, x),
21 | i.e., the value of the perturbed predictor at (t, eps, x), where q_t0, q_t1, and g are all evaluated at x.
22 | """
23 | h = t * (1./g) - (1.-t) / (1. - g)
24 | full_lq = (1.-t)*logit(q_t0) + t*logit(q_t1) # logit predictions from unperturbed model
25 | logit_perturb = full_lq + eps * h
26 | return expit(logit_perturb)
27 |
28 |
29 | def psi_tmle_bin_outcome(q_t0, q_t1, g, t, y, truncate_level=0.05):
30 |     # TODO: make me usable
31 | # solve the perturbation problem
32 |
33 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
34 |
35 | eps_hat = minimize(lambda eps: cross_entropy(y, _perturbed_model_bin_outcome(q_t0, q_t1, g, t, eps))
36 | , 0., method='Nelder-Mead')
37 |
38 | eps_hat = eps_hat.x[0]
39 |
40 | def q1(t_cf):
41 | return _perturbed_model_bin_outcome(q_t0, q_t1, g, t_cf, eps_hat)
42 |
43 | ite = q1(np.ones_like(t)) - q1(np.zeros_like(t))
44 |
45 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0])
46 |
47 |
48 | def psi_tmle_cont_outcome(q_t0, q_t1, g, t, y, eps_hat=None, truncate_level=0.05):
49 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
50 |
51 | g_loss = mse(g, t)
52 | h = t * (1.0/g) - (1.0-t) / (1.0 - g)
53 | full_q = (1.0-t)*q_t0 + t*q_t1 # predictions from unperturbed model
54 |
55 | if eps_hat is None:
56 | eps_hat = np.sum(h*(y-full_q)) / np.sum(np.square(h))
57 |
58 | def q1(t_cf):
59 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g)
60 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model
61 | return full_q + eps_hat * h_cf
62 |
63 | ite = q1(np.ones_like(t)) - q1(np.zeros_like(t))
64 | psi_tmle = np.mean(ite)
65 |
66 | # standard deviation computation relies on asymptotic expansion of non-parametric estimator, see van der Laan and Rose p 96
67 | ic = h*(y-q1(t)) + ite - psi_tmle
68 | psi_tmle_std = np.std(ic) / np.sqrt(t.shape[0])
69 | initial_loss = np.mean(np.square(full_q-y))
70 | final_loss = np.mean(np.square(q1(t)-y))
71 |
72 | # print("tmle epsilon_hat: ", eps_hat)
73 | # print("initial risk: {}".format(initial_loss))
74 | # print("final risk: {}".format(final_loss))
75 |
76 | return psi_tmle, psi_tmle_std, eps_hat, initial_loss, final_loss, g_loss
77 |
78 |
79 | def psi_iptw(q_t0, q_t1, g, t, y, truncate_level=0.05):
80 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
81 |
82 | ite=(t / g - (1-t) / (1-g))*y
83 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0])
84 |
85 |
86 | def psi_aiptw(q_t0, q_t1, g, t, y, truncate_level=0.05):
87 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
88 |
89 | full_q = q_t0 * (1 - t) + q_t1 * t
90 | h = t * (1.0 / g) - (1.0 - t) / (1.0 - g)
91 | ite = h * (y - full_q) + q_t1 - q_t0
92 |
93 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0])
94 |
95 |
96 | def psi_q_only(q_t0, q_t1, g, t, y, truncate_level=0.):
97 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
98 | ite = (q_t1 - q_t0)
99 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0])
100 |
101 |
102 | def psi_very_naive(t, y):
103 | psi_hat = y[t == 1].mean() - y[t == 0].mean()
104 | psi_std = np.sqrt(np.var(y[t == 1]) / np.sum(t) + np.var(y[t == 0]) / np.sum(1-t))
105 |     return psi_hat, psi_std
106 |
107 |
108 | def ates_from_atts(q_t0, q_t1, g, t, y, truncate_level=0.05):
109 | """
110 |     Sanity check code: ATE = ATT_1*P(T=1) + ATT_0*P(T=0)
111 |
112 | :param q_t0:
113 | :param q_t1:
114 | :param g:
115 | :param t:
116 | :param y:
117 | :param truncate_level:
118 | :return:
119 | """
120 |
121 | prob_t = t.mean()
122 |
123 | att = att_estimates(q_t0, q_t1, g, t, y, prob_t, truncate_level=truncate_level)
124 | att_flip = att_estimates(q_t1, q_t0, 1.-g, 1-t, y, 1.-prob_t, truncate_level=truncate_level)
125 |
126 | ates = {}
127 | for k in att.keys():
128 | # note: minus because the flip computes E[Y^0 - Y^1 | T=0]
129 | ates[k] = att[k]*prob_t - att_flip[k]*(1.-prob_t)
130 | # ates[k] = att_flip[k]
131 |
132 | return ates
133 |
134 |
135 | def ate_estimates(q_t0, q_t1, g, t, y, truncate_level=0.05):
136 |
137 | very_naive = psi_very_naive(t,y)
138 | q_only = psi_q_only(q_t0, q_t1, g, t, y, truncate_level=truncate_level)
139 | iptw = psi_iptw(q_t0, q_t1, g, t, y, truncate_level=truncate_level)
140 | aiptw = psi_aiptw(q_t0, q_t1, g, t, y, truncate_level=truncate_level)
141 | tmle = psi_tmle_cont_outcome(q_t0, q_t1, g, t, y, truncate_level=truncate_level)[0:1]
142 | bin_tmle = psi_tmle_bin_outcome(q_t0, q_t1, g, t, y, truncate_level=truncate_level)
143 |
144 | estimates = {'very_naive': very_naive,
145 | 'q_only': q_only,
146 | 'iptw': iptw,
147 | 'tmle': tmle,
148 | 'bin-tmle': bin_tmle,
149 | 'aiptw': aiptw}
150 |
151 | return estimates
152 |
153 |
154 |
155 | def main():
156 | pass
157 |
158 |
159 | if __name__ == "__main__":
160 | main()
161 |
--------------------------------------------------------------------------------
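ate_estimates expects fitted nuisance values: conditional outcome predictions q_t0 and q_t1, propensity scores g, binary treatment t, and outcomes y, and returns a dictionary of ATE estimates (unadjusted, outcome-model-only, IPTW, AIPTW, and the two TMLE variants). The sketch below runs it on synthetic data, assuming it is launched from src/ so the package import resolves; the data-generating process and sample size are invented, and the true nuisance functions are plugged in instead of fitted models.

import numpy as np
from scipy.special import expit
from semi_parametric_estimation.ate import ate_estimates

rng = np.random.RandomState(0)
n = 5000
x = rng.normal(size=n)                  # one confounder
g = expit(0.5 * x)                      # true propensity score
t = rng.binomial(1, g)                  # treatment assignment
q_t0, q_t1 = expit(x), expit(x + 1.0)   # true outcome model under t=0 / t=1
y = rng.binomial(1, np.where(t == 1, q_t1, q_t0))

print("ground truth ATE:", np.mean(q_t1 - q_t0))
# Plug in the true nuisance values; in practice q_t0, q_t1 and g come from fitted models.
for name, est in ate_estimates(q_t0, q_t1, g, t, y, truncate_level=0.05).items():
    print(name, est)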
/src/semi_parametric_estimation/att.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.special import logit, expit
3 | from scipy.optimize import minimize
4 |
5 | from .helpers import truncate_all_by_g, cross_entropy, mse
6 |
7 |
8 | def _perturbed_model(q_t0, q_t1, g, t, q, eps):
9 | # helper function for psi_tmle
10 |
11 | h1 = t / q - ((1 - t) * g) / (q * (1 - g))
12 | full_q = (1.0 - t) * q_t0 + t * q_t1
13 | perturbed_q = full_q - eps * h1
14 |
15 | def q1(t_cf, epsilon):
16 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g)
17 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model
18 | return full_q - epsilon * h_cf
19 |
20 | psi_init = np.mean(t * (q1(np.ones_like(t), eps) - q1(np.zeros_like(t), eps))) / q
21 | h2 = (q_t1 - q_t0 - psi_init) / q
22 | perturbed_g = expit(logit(g) - eps * h2)
23 |
24 | return perturbed_q, perturbed_g
25 |
26 |
27 | def psi_tmle(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05):
28 | """
29 |     Near-canonical van der Laan TMLE, except that we use a single
30 |     1-dimensional epsilon shared between the Q and g update models.
31 |
32 | """
33 |
34 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
35 |
36 | def _perturbed_loss(eps):
37 | pert_q, pert_g = _perturbed_model(q_t0, q_t1, g, t, prob_t, eps)
38 | loss = (np.square(y - pert_q)).mean() + cross_entropy(t, pert_g)
39 | return loss
40 |
41 | eps_hat = minimize(_perturbed_loss, 0.)
42 | eps_hat = eps_hat.x[0]
43 |
44 | def q2(t_cf, epsilon):
45 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g)
46 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model
47 | return full_q - epsilon * h_cf
48 |
49 | psi_tmle = np.mean(t * (q2(np.ones_like(t), eps_hat) - q2(np.zeros_like(t), eps_hat))) / prob_t
50 | return psi_tmle
51 |
52 |
53 | def make_one_step_tmle(prob_t, deps_default=0.001):
54 |     "Make a function that computes the one-step TMLE, following https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4912007/"
55 |
56 | def _perturb_q(q_t0, q_t1, g, t, deps=deps_default):
57 | h1 = t / prob_t - ((1 - t) * g) / (prob_t * (1 - g))
58 |
59 | full_q = (1.0 - t) * q_t0 + t * q_t1
60 | perturbed_q = full_q - deps * h1
61 | # perturbed_q= expit(logit(full_q) - deps*h1)
62 | return perturbed_q
63 |
64 | def _perturb_g(q_t0, q_t1, g, deps=deps_default):
65 | h2 = (q_t1 - q_t0 - _psi(q_t0, q_t1, g)) / prob_t
66 | perturbed_g = expit(logit(g) - deps * h2)
67 | return perturbed_g
68 |
69 | def _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=deps_default):
70 | # get the values of Q_{eps+deps} and g_{eps+deps} by using the recursive formula
71 |
72 | perturbed_g = _perturb_g(q0_old, q1_old, g_old, deps=deps)
73 |
74 | perturbed_q = _perturb_q(q0_old, q1_old, perturbed_g, t, deps=deps)
75 | perturbed_q0 = _perturb_q(q0_old, q1_old, perturbed_g, np.zeros_like(t), deps=deps)
76 | perturbed_q1 = _perturb_q(q0_old, q1_old, perturbed_g, np.ones_like(t), deps=deps)
77 |
78 | return perturbed_q0, perturbed_q1, perturbed_q, perturbed_g
79 |
80 | def _loss(q, g, y, t):
81 | # compute the new loss
82 | q_loss = mse(y, q)
83 | g_loss = cross_entropy(t, g)
84 | return q_loss + g_loss
85 |
86 | def _psi(q0, q1, g):
87 | return np.mean(g*(q1 - q0)) / prob_t
88 |
89 | def tmle(q_t0, q_t1, g, t, y, truncate_level=0.05, deps=deps_default):
90 | """
91 | Computes the tmle for the ATT (equivalently: direct effect)
92 |
93 | :param q_t0:
94 | :param q_t1:
95 | :param g:
96 | :param t:
97 | :param y:
98 | :param truncate_level:
99 | :param deps:
100 | :return:
101 | """
102 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
103 |
104 | eps = 0.0
105 |
106 | q0_old = q_t0
107 | q1_old = q_t1
108 | g_old = g
109 |
110 | # determine whether epsilon should go up or down
111 | # translated blindly from line 299 of https://github.com/cran/tmle/blob/master/R/tmle.R
112 | h1 = t / prob_t - ((1 - t) * g) / (prob_t * (1 - g))
113 | full_q = (1.0 - t) * q_t0 + t * q_t1
114 | deriv = np.mean(prob_t*h1*(y-full_q) + t*(q_t1 - q_t0 - _psi(q_t0, q_t1, g)))
115 | if deriv > 0:
116 | deps = -deps
117 |
118 | # run until loss starts going up
119 | # old_loss = np.inf # this is the thing used by Rose' implementation
120 | old_loss = _loss(full_q, g, y, t)
121 |
122 | while True:
123 | perturbed_q0, perturbed_q1, perturbed_q, perturbed_g = _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=deps)
124 |
125 | new_loss = _loss(perturbed_q, perturbed_g, y, t)
126 |
127 | # debugging
128 | # print("Psi: {}".format(_psi(q0_old, q1_old, g_old)))
129 | # print("new_loss is: ", new_loss, "old_loss is ", old_loss)
130 |
131 | # # if this is the first step, decide whether to go down or up from eps=0.0
132 | # if eps == 0.0:
133 | # _, _, perturbed_q_neg, perturbed_g_neg = _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=-deps)
134 | # neg_loss = _loss(perturbed_q_neg, perturbed_g_neg, y, t)
135 | #
136 | # if neg_loss < new_loss:
137 | # return tmle(q_t0, q_t1, g, t, y, deps=-1.0 * deps)
138 |
139 | # check if converged
140 | if new_loss > old_loss:
141 | if eps == 0.:
142 | print("Warning: no update occurred (is deps too big?)")
143 | return _psi(q0_old, q1_old, g_old)
144 | else:
145 | eps += deps
146 |
147 | q0_old = perturbed_q0
148 | q1_old = perturbed_q1
149 | g_old = perturbed_g
150 |
151 | old_loss = new_loss
152 |
153 | return tmle
154 |
155 |
156 | def psi_q_only(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05):
157 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
158 |
159 | ite_t = (q_t1 - q_t0)[t == 1]
160 | estimate = ite_t.mean()
161 | return estimate
162 |
163 |
164 | def psi_plugin(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05):
165 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
166 |
167 | ite_t = g*(q_t1 - q_t0)/prob_t
168 | estimate = ite_t.mean()
169 | return estimate
170 |
171 |
172 | def psi_aiptw(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05):
173 | # the robust ATT estimator described in eqn 3.9 of
174 | # https://www.econstor.eu/bitstream/10419/149795/1/869216953.pdf
175 |
176 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level)
177 | estimate = (t*(y-q_t0) - (1-t)*(g/(1-g))*(y-q_t0)).mean() / prob_t
178 |
179 | return estimate
180 |
181 |
182 | def psi_very_naive(t, y):
183 | return y[t == 1].mean() - y[t == 0].mean()
184 |
185 |
186 | def att_estimates(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05, deps=0.0001):
187 |
188 | one_step_tmle = make_one_step_tmle(prob_t, deps_default=deps)
189 |
190 | very_naive = psi_very_naive(t,y)
191 | q_only = psi_q_only(q_t0, q_t1, g, t, y, prob_t, truncate_level)
192 | plugin = psi_plugin(q_t0, q_t1, g, t, y, prob_t, truncate_level)
193 | aiptw = psi_aiptw(q_t0, q_t1, g, t, y, prob_t, truncate_level)
194 | one_step_tmle = one_step_tmle(q_t0, q_t1, g, t, y, truncate_level) # note different signature
195 |
196 | estimates = {'very_naive': very_naive, 'q_only': q_only, 'plugin': plugin, 'one_step_tmle': one_step_tmle, 'aiptw': aiptw}
197 |
198 | return estimates
199 |
--------------------------------------------------------------------------------
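att_estimates mirrors ate_estimates but targets the average effect on the treated: P(T=1) is passed in explicitly as prob_t, and the one-step TMLE uses the step size deps. Below is the same kind of synthetic sketch as for ate_estimates above, again with invented data and true nuisance values plugged in, and assuming it is run from src/.

import numpy as np
from scipy.special import expit
from semi_parametric_estimation.att import att_estimates

rng = np.random.RandomState(1)
n = 5000
x = rng.normal(size=n)
g = expit(0.5 * x)
t = rng.binomial(1, g)
q_t0, q_t1 = expit(x), expit(x + 1.0)
y = rng.binomial(1, np.where(t == 1, q_t1, q_t0))

print("ground truth ATT:", np.mean((q_t1 - q_t0)[t == 1]))
estimates = att_estimates(q_t0, q_t1, g, t, y, prob_t=t.mean(), truncate_level=0.05, deps=0.0001)
for name, est in estimates.items():
    print(name, est)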
/src/semi_parametric_estimation/helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.special import logit
3 |
4 | import sklearn.linear_model as lm
5 |
6 |
7 | def calibrate_g(g, t):
8 | """
9 |     Improve calibration of propensity scores by fitting a one-parameter (temperature) logistic regression on held-out data.
10 |
11 | :param g: raw propensity score estimates
12 | :param t: treatment assignments
13 | :return:
14 | """
15 |
16 | logit_g = logit(g).reshape(-1,1)
17 | calibrator = lm.LogisticRegression(fit_intercept=False, C=1e6, solver='lbfgs') # no intercept or regularization
18 | calibrator.fit(logit_g, t)
19 | calibrated_g = calibrator.predict_proba(logit_g)[:,1]
20 | return calibrated_g
21 |
22 |
23 | def truncate_by_g(attribute, g, level=0.1):
24 | keep_these = np.logical_and(g >= level, g <= 1.-level)
25 |
26 | return attribute[keep_these]
27 |
28 |
29 | def truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level=0.05):
30 | """
31 | Helper function to clean up nuisance parameter estimates.
32 |
33 | """
34 |
35 | orig_g = np.copy(g)
36 |
37 | q_t0 = truncate_by_g(np.copy(q_t0), orig_g, truncate_level)
38 | q_t1 = truncate_by_g(np.copy(q_t1), orig_g, truncate_level)
39 | g = truncate_by_g(np.copy(g), orig_g, truncate_level)
40 | t = truncate_by_g(np.copy(t), orig_g, truncate_level)
41 | y = truncate_by_g(np.copy(y), orig_g, truncate_level)
42 |
43 | return q_t0, q_t1, g, t, y
44 |
45 |
46 |
47 | def cross_entropy(y, p):
48 | return -np.mean((y*np.log(p) + (1.-y)*np.log(1.-p)))
49 |
50 |
51 | def mse(x, y):
52 | return np.mean(np.square(x-y))
53 |
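54 | # Example usage (editor's sketch): recalibrate synthetic propensity scores and drop
55 | # units with extreme scores before downstream estimation.
56 | if __name__ == '__main__':
57 |     rng = np.random.RandomState(0)
58 |     n = 500
59 |     g_raw = rng.uniform(0.02, 0.98, size=n)
60 |     t = rng.binomial(1, g_raw)
61 |     g_cal = calibrate_g(g_raw, t)
62 |     q_t0, q_t1, y = rng.rand(n), rng.rand(n), rng.rand(n)
63 |     q_t0, q_t1, g_cal, t, y = truncate_all_by_g(q_t0, q_t1, g_cal, t, y, truncate_level=0.05)
64 |     print("kept", g_cal.shape[0], "of", n, "units; cross-entropy:", cross_entropy(t, g_cal))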
--------------------------------------------------------------------------------
/src/supervised_lda/add_split_to_simulations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 8,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "base_sim_dir = '../../dat/sim/'\n",
21 | "datasets = ['reddit_subreddit_based/subreddits[13, 6, 8]', 'peerread_buzzytitle_based']\n",
22 | "mode = 'modesimple'\n",
23 | "\n",
24 | "for dataset in datasets:\n",
25 | " simdir = os.path.join(base_sim_dir, dataset, mode)\n",
26 | " for simfile in os.listdir(simdir):\n",
27 | " df = pd.read_csv(os.path.join(simdir, simfile), sep='\\t')\n",
28 | " df['split'] = np.random.randint(0, 10, size=df.shape[0])\n",
29 | " df.to_csv(os.path.join(simdir, simfile),sep='\\t')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": []
38 | }
39 | ],
40 | "metadata": {
41 | "kernelspec": {
42 | "display_name": "Python 3",
43 | "language": "python",
44 | "name": "python3"
45 | },
46 | "language_info": {
47 | "codemirror_mode": {
48 | "name": "ipython",
49 | "version": 3
50 | },
51 | "file_extension": ".py",
52 | "mimetype": "text/x-python",
53 | "name": "python",
54 | "nbconvert_exporter": "python",
55 | "pygments_lexer": "ipython3",
56 | "version": "3.6.8"
57 | }
58 | },
59 | "nbformat": 4,
60 | "nbformat_minor": 2
61 | }
62 |
--------------------------------------------------------------------------------
/src/supervised_lda/compute_estimates.py:
--------------------------------------------------------------------------------
1 | from semi_parametric_estimation.att import att_estimates
2 | import numpy as np
3 | import os
4 | import argparse
5 | import pandas as pd
6 |
7 | def main():
8 | outdir = os.path.join('..', 'out', args.data, args.experiment)
9 | for sim in os.listdir(outdir):
10 | mean_estimates = {'very_naive': [], 'q_only': [], 'plugin': [], 'one_step_tmle': [], 'aiptw': []}
11 | for split in os.listdir(os.path.join(outdir, sim)):
12 | if args.num_splits is not None:
13 | # print("ignoring split", split)
14 | if int(split) >= int(args.num_splits):
15 | continue
16 | array = np.load(os.path.join(outdir, sim, split, 'predictions.npz'))
17 | g = array['g']
18 | q0 = array['q0']
19 | q1 = array['q1']
20 | y = array['y']
21 | t = array['t']
22 | estimates = att_estimates(q0, q1, g, t, y, t.mean(), truncate_level=0.03)
23 | for est, att in estimates.items():
24 | mean_estimates[est].append(att)
25 |
26 | if args.data == 'reddit':
27 | sim = sim.replace('beta01.0.', '')
28 | options = sim.split('.0.')
29 | p2 = options[0].replace('beta1', '')
30 | p3 = options[1].replace('gamma', '')
31 |
32 | print("------ Simulation setting: Confounding strength =", p2, "; Variance:", p3, "------")
33 | print("True effect = 1.0")
34 | else:
35 | ground_truth_map = {'1.0':0.06, '5.0':0.06, '25.0':0.03}
36 | print("------ Simulation setting: Confounding strength =", sim)
37 | print("True effect = ", ground_truth_map[sim])
38 |
39 |
40 | for est, atts in mean_estimates.items():
41 | print('\t', est, np.round(np.mean(atts), 3), "+/-", np.round(np.std(atts),3))
42 |
43 |
44 | if __name__ == '__main__':
45 | parser = argparse.ArgumentParser()
46 | parser.add_argument("--data", action="store", default="reddit")
47 | parser.add_argument("--experiment", action="store", default="base_model")
48 | parser.add_argument("--num-splits", action="store", default=None)
49 | args = parser.parse_args()
50 |
51 | main()
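52 | 
53 | # Example invocation (editor's note; the experiment layout below is illustrative):
54 | #   python -m supervised_lda.compute_estimates --data=reddit --experiment=base_model --num-splits=5
55 | # It expects prediction files of the form ../out/reddit/base_model/<sim>/<split>/predictions.npz.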
--------------------------------------------------------------------------------
/src/supervised_lda/helpers.py:
--------------------------------------------------------------------------------
1 | from nltk.tokenize import word_tokenize
2 | from nltk.stem import WordNetLemmatizer
3 | from nltk.corpus import stopwords
4 | from sklearn.feature_extraction.text import CountVectorizer
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.decomposition import LatentDirichletAllocation
8 |
9 | class LemmaTokenizer(object):
10 | def __init__(self):
11 | self.wnl = WordNetLemmatizer()
12 | def __call__(self, articles):
13 | stop = stopwords.words('english')
14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop]
15 |
16 | def filter_by_subreddit(reddit, subs=None):
17 | if not subs:
18 | return reddit.index.values
19 | else:
20 | return reddit[reddit.subreddit.isin(subs)].index.values
21 |
22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.0005):
23 | from nltk.corpus import stopwords
24 | '''
25 |     From a list of raw-text documents, build a DxV term-count matrix, where
26 |     D is the number of documents and
27 |     V is the size of the vocabulary, i.e. the number of unique terms found across all documents.
28 | '''
29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
30 | corpus = count_vect.fit_transform(documents)
31 | vocabulary = count_vect.get_feature_names()
32 |
33 | return corpus,vocabulary,count_vect
34 |
35 | def assign_dev_split(num_docs, percentage=0.05):
36 | indices = np.arange(num_docs)
37 | np.random.shuffle(indices)
38 | size = int(indices.shape[0]*percentage)
39 | dev = indices[:size]
40 | return dev
41 |
42 | def learn_topics(X, X_dev, K=50):
43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1)
44 | print("Fitting", K, "topics...")
45 | lda.fit(X)
46 | score = lda.perplexity(X_dev)
47 |     print("Perplexity:", score)
48 | topics = lda.components_
49 | return score, lda, topics
50 |
51 | def show_topics(vocab, topics, n_words=20):
52 | topic_keywords = []
53 | for topic_weights in topics:
54 | top_keyword_locs = (-topic_weights).argsort()[:n_words]
55 | topic_keywords.append(vocab.take(top_keyword_locs))
56 |
57 | df_topic_keywords = pd.DataFrame(topic_keywords)
58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
60 | return df_topic_keywords
61 |
62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'):
63 | filtered_indices = filtered_df[on].values
64 | doc_idx = [index_mapping[idx] for idx in filtered_indices]
65 | embeddings = doc_embeddings[doc_idx, :]
66 | return embeddings
67 |
68 | def filter_document_terms(filtered_df, counts, index_mapping, on='post_index'):
69 | filtered_indices = filtered_df[on].values
70 | doc_idx = [index_mapping[idx] for idx in filtered_indices]
71 | filtered_counts = counts[doc_idx, :]
72 | return filtered_counts
73 |
74 | def make_index_mapping(df, on='post_index', convert_to_int=True):
75 | if on=='index':
76 | indices = df.index.values
77 | else:
78 | indices = df[on].values
79 |
80 | if convert_to_int:
81 | return {int(ind):i for (i,ind) in enumerate(indices)}
82 |
83 | return {ind:i for (i,ind) in enumerate(indices)}
84 |
85 | def assign_split(df, num_splits=10, col_to_add='split'):
86 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0])
87 | return df
88 |
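89 | # Example usage (editor's sketch; requires the nltk corpora 'punkt', 'wordnet' and
90 | # 'stopwords' to be available locally):
91 | if __name__ == '__main__':
92 |     docs = ["the cat sat on the mat", "dogs are loyal animals",
93 |             "topic models factor word count matrices"] * 50
94 |     counts, vocab, _ = tokenize_documents(docs, max_df0=1.0, min_df0=1)
95 |     dev = assign_dev_split(counts.shape[0], percentage=0.1)
96 |     score, lda, topics = learn_topics(counts, counts[dev], K=2)
97 |     print(show_topics(np.array(vocab), topics, n_words=3))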
--------------------------------------------------------------------------------
/src/supervised_lda/peerread_output_att.py:
--------------------------------------------------------------------------------
1 | from semi_parametric_estimation.att import att_estimates
2 | from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents
3 | import numpy as np
4 | import pandas as pd
5 | import os
6 | from sklearn.metrics import mean_squared_error as mse
7 | import argparse
8 | import sys
9 | from supervised_lda.supervised_topic_model import SupervisedTopicModel
10 | from supervised_lda import run_supervised_tm
11 | from scipy import sparse
12 | from sklearn.linear_model import LogisticRegression, Ridge
13 | from scipy.special import logit
14 |
15 | def load_peerread(path='../dat/PeerRead/'):
16 | return pd.read_csv(path + 'proc_abstracts.csv')
17 |
18 | def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'):
19 | count_filename = path + 'term_counts'
20 | vocab_filename = path + 'vocab'
21 |
22 | if os.path.exists(count_filename + '.npz') and not force_redo:
23 | return sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy')
24 |
25 | post_docs = df[text_col].values
26 | counts, vocab, _ = tokenize_documents(post_docs)
27 | sparse.save_npz(count_filename, counts)
28 | np.save(vocab_filename, vocab)
29 | return counts.toarray(), np.array(vocab)
30 |
31 | def compute_ground_truth_treatment_effect(df):
32 | y1 = df['y1']
33 | y0 = df['y0']
34 | return y1.mean() - y0.mean()
35 |
36 | def load_simulated_data():
37 | sim_df = pd.read_csv(simulation_file, delimiter='\t')
38 | return sim_df
39 |
40 | def fit_model(doc_embeddings, labels, is_binary=False):
41 | if is_binary:
42 | model = LogisticRegression(solver='liblinear')
43 | else:
44 | model = Ridge()
45 | model.fit(doc_embeddings, labels)
46 | return model
47 |
48 | def main():
49 | if dat_dir:
50 | peerread = load_peerread(path=dat_dir)
51 | counts,vocab = load_term_counts(peerread,path=dat_dir)
52 | else:
53 | peerread = load_peerread()
54 | counts,vocab = load_term_counts(peerread)
55 |
56 | indices = peerread['paper_id'].values
57 | index_mapping = make_index_mapping(peerread, on='index')
58 |
59 | sim_df = load_simulated_data()
60 |
61 | train_df = sim_df[sim_df.split != split]
62 | predict_df = sim_df[sim_df.split == split]
63 | tr_treatment_labels = train_df.treatment.values
64 | tr_outcomes = train_df.outcome.values
65 | predict_treatment = predict_df.treatment.values
66 | predict_outcomes = predict_df.outcome.values
67 |
68 | tr_counts = filter_document_terms(train_df, counts, index_mapping, on='id')
69 | predict_counts = filter_document_terms(predict_df, counts, index_mapping, on='id')
70 |
71 | num_documents = tr_counts.shape[0]
72 | vocab_size = tr_counts.shape[1]
73 | model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model)
74 |
75 | run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, dtype='binary',
76 | num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss)
77 |
78 | if use_supervised_loss:
79 | propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts, dtype='binary')
80 | else:
81 | tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts)
82 | treated = tr_treatment_labels == 1
83 | out_treat = tr_outcomes[treated]
84 | out_no_treat = tr_outcomes[~treated]
85 | q0_embeddings = tr_doc_embeddings[~treated,:]
86 | q1_embeddings = tr_doc_embeddings[treated,:]
87 | q0_model = fit_model(q0_embeddings, out_no_treat, is_binary=True)
88 | q1_model = fit_model(q1_embeddings, out_treat, is_binary=True)
89 | g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True)
90 |
91 | pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts)
92 | propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1]
93 | expected_outcome_no_treat = q0_model.predict_proba(pred_doc_embeddings)[:,1]
94 | expected_outcome_treat = q1_model.predict_proba(pred_doc_embeddings)[:,1]
95 |
96 | out = os.path.join(outdir, str(split))
97 | os.makedirs(out, exist_ok=True)
98 | outfile = os.path.join(out, 'predictions')
99 | np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes)
100 |
101 | if __name__ == '__main__':
102 | parser = argparse.ArgumentParser()
103 | parser.add_argument("--dat-dir", action="store", default=None)
104 | parser.add_argument("--outdir", action="store", default='../out/')
105 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/')
106 | parser.add_argument("--mode", action="store", default="simple")
107 | parser.add_argument("--params", action="store", default="1.0")
108 | parser.add_argument("--verbose", action='store_true')
109 | parser.add_argument("--split", action='store', default=0)
110 | parser.add_argument("--num-iters", action="store", default=3000)
111 | parser.add_argument("--num-topics", action='store', default=100)
112 | parser.add_argument("--linear-outcome-model", action='store', default="t")
113 | parser.add_argument("--use-recon-loss", action='store', default="t")
114 | parser.add_argument("--use-supervised-loss", action='store', default="t")
115 | args = parser.parse_args()
116 |
117 | sim_dir = args.sim_dir
118 | outdir = args.outdir
119 | dat_dir = args.dat_dir
120 | verbose = args.verbose
121 | params = args.params
122 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0'
123 | mode = args.mode
124 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv"
125 |     num_topics = int(args.num_topics)
126 | split = int(args.split)
127 | linear_outcome_model = True if args.linear_outcome_model == "t" else False
128 | use_supervised_loss = True if args.use_supervised_loss == "t" else False
129 | use_recon_loss = True if args.use_recon_loss == "t" else False
130 | num_iters = int(args.num_iters)
131 | print(use_supervised_loss, use_recon_loss, linear_outcome_model)
132 |
133 | main()
--------------------------------------------------------------------------------
/src/supervised_lda/reddit_output_att.py:
--------------------------------------------------------------------------------
1 | from semi_parametric_estimation.att import att_estimates
2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed
3 | from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents
4 | import numpy as np
5 | import pandas as pd
6 | import os
7 | from supervised_lda.supervised_topic_model import SupervisedTopicModel
8 | from sklearn.linear_model import LogisticRegression, Ridge
9 | from supervised_lda import run_supervised_tm
10 | from sklearn.metrics import mean_squared_error as mse
11 | import argparse
12 | import sys
13 | from scipy.special import logit
14 | from scipy import sparse
15 |
16 | def load_term_counts(reddit, path='../dat/reddit/', force_redo=False):
17 | count_filename = path + 'term_counts'
18 | vocab_filename = path + 'vocab'
19 |
20 | if os.path.exists(count_filename + '.npz') and not force_redo:
21 | return sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy')
22 |
23 | post_docs = reddit['post_text'].values
24 | counts, vocab, _ = tokenize_documents(post_docs)
25 | sparse.save_npz(count_filename, counts)
26 | np.save(vocab_filename, vocab)
27 | return counts.toarray(), np.array(vocab)
28 |
29 | def load_simulated_data():
30 | sim_df = pd.read_csv(simulation_file, delimiter='\t')
31 | sim_df = sim_df.rename(columns={'index':'post_index'})
32 | return sim_df
33 |
34 | def drop_empty_posts(counts):
35 | doc_terms = counts.sum(axis=1)
36 | return doc_terms >= 5
37 |
38 | def fit_model(doc_embeddings, labels, is_binary=False):
39 | if is_binary:
40 | model = LogisticRegression(solver='liblinear')
41 | else:
42 | model = Ridge()
43 | model.fit(doc_embeddings, labels)
44 | return model
45 |
46 | def main():
47 | if dat_dir:
48 | reddit = load_reddit_processed(path=dat_dir)
49 | else:
50 | reddit = load_reddit_processed()
51 |
52 | if subs:
53 | reddit = reddit[reddit.subreddit.isin(subs)]
54 | reddit = reddit.dropna(subset=['post_text'])
55 |
56 |
57 | index_mapping = make_index_mapping(reddit, on='orig_index')
58 | if not dat_dir:
59 | counts, vocab = load_term_counts(reddit)
60 | else:
61 | counts, vocab = load_term_counts(reddit, path=dat_dir)
62 |
63 | sim_df = load_simulated_data()
64 |
65 | train_df = sim_df[sim_df.split != split]
66 | predict_df = sim_df[sim_df.split == split]
67 |
68 | tr_treatment_labels = train_df.treatment.values
69 | tr_outcomes = train_df.outcome.values
70 | predict_treatment = predict_df.treatment.values
71 | predict_outcomes = predict_df.outcome.values
72 |
73 | tr_counts = filter_document_terms(train_df, counts, index_mapping)
74 | predict_counts = filter_document_terms(predict_df, counts, index_mapping)
75 | tr_valid = drop_empty_posts(tr_counts)
76 | pred_valid = drop_empty_posts(predict_counts)
77 | tr_counts = tr_counts[tr_valid, :]
78 | predict_counts = predict_counts[pred_valid, :]
79 |
80 | tr_treatment_labels = tr_treatment_labels[tr_valid]
81 | tr_outcomes = tr_outcomes[tr_valid]
82 | predict_treatment = predict_treatment[pred_valid]
83 | predict_outcomes = predict_outcomes[pred_valid]
84 |
85 | num_documents = tr_counts.shape[0]
86 | vocab_size = tr_counts.shape[1]
87 | model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model)
88 |
89 | run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss)
90 |
91 | if use_supervised_loss:
92 | propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts)
93 | else:
94 | tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts)
95 | treated = tr_treatment_labels == 1
96 | out_treat = tr_outcomes[treated]
97 | out_no_treat = tr_outcomes[~treated]
98 | q0_embeddings = tr_doc_embeddings[~treated,:]
99 | q1_embeddings = tr_doc_embeddings[treated,:]
100 | q0_model = fit_model(q0_embeddings, out_no_treat)
101 | q1_model = fit_model(q1_embeddings, out_treat)
102 | g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True)
103 |
104 | pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts)
105 | propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1]
106 | expected_outcome_no_treat = q0_model.predict(pred_doc_embeddings)
107 | expected_outcome_treat = q1_model.predict(pred_doc_embeddings)
108 |
109 | out = os.path.join(outdir, str(split))
110 | os.makedirs(out, exist_ok=True)
111 | outfile = os.path.join(out, 'predictions')
112 | np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes)
113 |
114 |
115 | if __name__ == '__main__':
116 | parser = argparse.ArgumentParser()
117 | parser.add_argument("--dat-dir", action="store", default=None)
118 | parser.add_argument("--outdir", action="store", default='../out/')
119 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/')
120 | parser.add_argument("--subs", action="store", default='13,6,8')
121 | parser.add_argument("--mode", action="store", default="simple")
122 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0")
123 | parser.add_argument("--verbose", action='store_true')
124 | parser.add_argument("--num-topics", action='store', default=100)
125 | parser.add_argument("--split", action='store', default=0)
126 | parser.add_argument("--num-iters", action="store", default=4000)
127 | # parser.add_argument("--num_splits", action='store', default=10)
128 | parser.add_argument("--linear-outcome-model", action='store', default="t")
129 | parser.add_argument("--use-recon-loss", action='store', default="t")
130 | parser.add_argument("--use-supervised-loss", action='store', default="t")
131 | args = parser.parse_args()
132 |
133 | sim_dir = args.sim_dir
134 | dat_dir = args.dat_dir
135 | outdir = args.outdir
136 | subs = None
137 | if args.subs != '':
138 | subs = [int(s) for s in args.subs.split(',')]
139 | verbose = args.verbose
140 | params = args.params.split(',')
141 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2]
142 | subs_string = ', '.join(args.subs.split(','))
143 | mode = args.mode
144 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv"
145 | num_iters = int(args.num_iters)
146 | num_topics = int(args.num_topics)
147 | split = int(args.split)
148 | # num_splits = args.num_splits
149 | linear_outcome_model = True if args.linear_outcome_model == "t" else False
150 | use_supervised_loss = True if args.use_supervised_loss == "t" else False
151 | use_recon_loss = True if args.use_recon_loss == "t" else False
152 |
153 | main()
--------------------------------------------------------------------------------
/src/supervised_lda/run_supervised_tm.py:
--------------------------------------------------------------------------------
1 | from torch import nn, optim
2 | from torch.nn import functional as F
3 | import torch
4 | # from torch.utils.tensorboard import SummaryWriter
5 | import numpy as np
6 | import argparse
7 | from scipy.special import expit
8 |
9 | def visualize_topics(model, vocab, num_topics, num_words=10):
10 | model.eval()
11 | with torch.no_grad():
12 | print('#'*100)
13 | print('Visualize topics...')
14 | betas = model.alphas.t() #model.get_beta()
15 | for k in range(num_topics):
16 | beta = betas[k].detach().numpy()
17 | top_words = beta.argsort()[-num_words:]
18 | topic_words = vocab[top_words]
19 | print('Topic {}: {}'.format(k, topic_words))
20 |
21 | def get_representation(model, docs):
22 | normalized = docs/docs.sum(axis=-1)[:,np.newaxis]
23 | normalized_bow = torch.tensor(normalized, dtype=torch.float)
24 | num_documents = docs.shape[0]
25 | model.eval()
26 | with torch.no_grad():
27 | doc_representation,_ = model.get_theta(normalized_bow)
28 | embeddings = doc_representation.detach().numpy()
29 | return embeddings
30 |
31 |
32 | def predict(model, docs, dtype='real'):
33 | normalized = docs/docs.sum(axis=-1)[:,np.newaxis]
34 | normalized_bow = torch.tensor(normalized, dtype=torch.float)
35 | num_documents = docs.shape[0]
36 |
37 | treatment_ones = torch.ones(num_documents)
38 | treatment_zeros = torch.zeros(num_documents)
39 |
40 | model.eval()
41 | with torch.no_grad():
42 | doc_representation,_ = model.get_theta(normalized_bow)
43 | propensity_score = model.predict_treatment(doc_representation).squeeze().detach().numpy()
44 | propensity_score = expit(propensity_score)
45 | expected_outcome_treat = model.predict_outcome_st_treat(doc_representation, treatment_ones).squeeze().detach().numpy()
46 | expected_outcome_no_treat = model.predict_outcome_st_no_treat(doc_representation, treatment_zeros).squeeze().detach().numpy()
47 |
48 | if dtype == 'binary':
49 | expected_outcome_treat = expit(expected_outcome_treat)
50 | expected_outcome_no_treat = expit(expected_outcome_no_treat)
51 |
52 | return propensity_score, expected_outcome_treat, expected_outcome_no_treat
53 |
54 | def train(model, docs, treatment_labels, outcomes, dtype='real', num_epochs=20000, lr=0.005, wdecay=1.2e-5,batch_size=1000, use_recon_loss=True, use_sup_loss=True):
55 | optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wdecay)
56 | num_documents = docs.shape[0]
57 | indices = np.arange(num_documents)
58 | np.random.shuffle(indices)
59 |
60 | for e_idx in range(num_epochs):
61 | model.train()
62 | k = e_idx%(num_documents//batch_size)
63 | start_index = k*batch_size
64 | end_index = (k+1)*batch_size
65 | batch = indices[start_index:end_index]
66 | docs_batch = docs[batch,:]
67 | treatment_labels_batch = treatment_labels[batch]
68 | outcomes_batch = outcomes[batch]
69 | normalized_batch = docs_batch/docs_batch.sum(axis=1)[:,np.newaxis]
70 |
71 | outcome_labels = torch.tensor(outcomes_batch, dtype=torch.float)
72 | treat_labels = torch.tensor(treatment_labels_batch, dtype=torch.float)
73 | bow = torch.tensor(docs_batch, dtype=torch.float)
74 | normalized_bow = torch.tensor(normalized_batch, dtype=torch.float)
75 |
76 | optimizer.zero_grad()
77 | model.zero_grad()
78 |
79 | recon_loss, supervised_loss, kld_theta = model(bow, normalized_bow, treat_labels, outcome_labels,dtype=dtype, use_supervised_loss=use_sup_loss)
80 | acc_kl_theta_loss = torch.sum(kld_theta).item()
81 | acc_sup_loss = 0.
82 | acc_loss = 0.
83 |
84 | total_loss = kld_theta #+ recon_loss + supervised_loss
85 | if use_recon_loss:
86 | acc_loss = torch.sum(recon_loss).item()
87 | total_loss += 0.1*recon_loss
88 | if use_sup_loss:
89 | acc_sup_loss = torch.sum(supervised_loss).item()
90 | total_loss += supervised_loss
91 |
92 | total_loss.backward()
93 | optimizer.step()
94 |
95 | print("Acc. loss:", acc_loss, "KL loss.:", acc_kl_theta_loss, "Supervised loss:", acc_sup_loss)
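96 | 
97 | 
98 | # Minimal smoke test (editor's sketch): train on random count data and query predictions.
99 | # Assumes the package layout allows importing supervised_lda.supervised_topic_model.
100 | if __name__ == '__main__':
101 |     from supervised_lda.supervised_topic_model import SupervisedTopicModel
102 |     rng = np.random.RandomState(0)
103 |     n_docs, vocab_size = 200, 50
104 |     docs = rng.poisson(1.0, size=(n_docs, vocab_size)) + 1  # +1 avoids empty documents
105 |     treatments = rng.binomial(1, 0.5, size=n_docs).astype(float)
106 |     outcomes = rng.normal(size=n_docs)
107 |     model = SupervisedTopicModel(num_topics=5, vocab_size=vocab_size, num_documents=n_docs)
108 |     train(model, docs, treatments, outcomes, num_epochs=5, batch_size=100)
109 |     g, q1, q0 = predict(model, docs)
110 |     print(g.shape, q1.shape, q0.shape)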
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #SBATCH -A sml
3 | #SBATCH -c 8
4 | #SBATCH --mail-user=dhanya.sridhar@columbia.edu
5 | #SBATCH --mail-type=ALL
6 |
7 | source activate py3.6
8 |
9 | python -m supervised_lda.peerread_output_att \
10 | --dat-dir=${DIR} \
11 | --mode=${MODE} \
12 | --params=${BETA1} \
13 | --sim-dir=${SIMDIR} \
14 | --outdir=${OUT}/${BETA1} \
15 | --split=${SPLIT} \
16 | --linear-outcome-model=${LINOUTCOME} \
17 | --use-recon-loss=${RECONLOSS} \
18 |   --use-supervised-loss=${SUPLOSS}
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/peerread-exps/submit_no_sup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/
6 |
7 | export MODE=simple
8 | export LINOUTCOME=t
9 | export RECONLOSS=t
10 | export SUPLOSS=f
11 |
12 | declare -a BETA1S=(5.0)
13 |
14 | for BETA1j in "${BETA1S[@]}"; do
15 | for SPLITi in $(seq 0 9); do
16 | export BETA1=${BETA1j}
17 | export SPLIT=${SPLITi}
18 | export OUT=${BASE_OUT}/no_sup/
19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \
20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \
21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh
22 | done
23 | done
24 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/peerread-exps/submit_no_unsup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/
6 |
7 | export MODE=simple
8 | export LINOUTCOME=t
9 | export RECONLOSS=f
10 | export SUPLOSS=t
11 |
12 | declare -a BETA1S=(1.0 5.0 25.0)
13 |
14 | for BETA1j in "${BETA1S[@]}"; do
15 | for SPLITi in $(seq 0 9); do
16 | export BETA1=${BETA1j}
17 | export SPLIT=${SPLITi}
18 | export OUT=${BASE_OUT}/no_unsup/
19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \
20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \
21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh
22 | done
23 | done
24 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/peerread-exps/submit_nonlinear.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/
6 |
7 | export MODE=simple
8 | export LINOUTCOME=f
9 | export RECONLOSS=t
10 | export SUPLOSS=t
11 |
12 | declare -a BETA1S=(1.0 5.0 25.0)
13 |
14 | for BETA1j in "${BETA1S[@]}"; do
15 | for SPLITi in $(seq 0 9); do
16 | export BETA1=${BETA1j}
17 | export SPLIT=${SPLITi}
18 | export OUT=${BASE_OUT}/non_linear/
19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \
20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \
21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh
22 | done
23 | done
24 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/peerread-exps/submit_peerread_simulation.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/
6 |
7 | export MODE=simple
8 | export LINOUTCOME=t
9 | export RECONLOSS=t
10 | export SUPLOSS=t
11 |
12 | declare -a BETA1S=(1.0 5.0 25.0)
13 |
14 | for BETA1j in "${BETA1S[@]}"; do
15 | for SPLITi in $(seq 0 9); do
16 | export BETA1=${BETA1j}
17 | export SPLIT=${SPLITi}
18 | export OUT=${BASE_OUT}/base_model/
19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \
20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \
21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh
22 | done
23 | done
24 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #SBATCH -A sml
3 | #SBATCH -c 8
4 | #SBATCH --mail-user=dhanya.sridhar@columbia.edu
5 | #SBATCH --mail-type=ALL
6 |
7 | source activate py3.6
8 |
9 | python -m supervised_lda.reddit_output_att \
10 | --dat-dir=${DIR} \
11 | --mode=${MODE} \
12 | --subs=${SUBS} \
13 | --params=${BETA0},${BETA1},${GAMMA} \
14 | --sim-dir=${SIMDIR} \
15 | --outdir=${OUT}/beta0${BETA0}.beta1${BETA1}.gamma${GAMMA} \
16 | --split=${SPLIT} \
17 | --linear-outcome-model=${LINOUTCOME} \
18 | --use-recon-loss=${RECONLOSS} \
19 |   --use-supervised-loss=${SUPLOSS}
20 |
21 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/reddit-exps/submit_no_sup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/
6 |
7 | export MODE=simple
8 | export SUBS=13,6,8
9 | export LINOUTCOME=t
10 | export RECONLOSS=t
11 | export SUPLOSS=f
12 |
13 | export BETA0=1.0
14 | declare -a BETA1S=(10.0)
15 | declare -a GAMMAS=(1.0 4.0)
16 |
17 | for BETA1j in "${BETA1S[@]}"; do
18 | export BETA1=${BETA1j}
19 | for GAMMAj in "${GAMMAS[@]}"; do
20 | for SPLITi in $(seq 0 4); do
21 | export SPLIT=${SPLITi}
22 | export GAMMA=${GAMMAj}
23 | export OUT=${BASE_OUT}/no_sup/
24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \
25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \
26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh
27 | done
28 | done
29 | done
30 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/reddit-exps/submit_no_unsup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/
6 |
7 | export MODE=simple
8 | export SUBS=13,6,8
9 | export LINOUTCOME=t
10 | export RECONLOSS=f
11 | export SUPLOSS=t
12 |
13 | export BETA0=1.0
14 | declare -a BETA1S=(1.0 10.0 100.0)
15 | declare -a GAMMAS=(1.0 4.0)
16 |
17 | for BETA1j in "${BETA1S[@]}"; do
18 | export BETA1=${BETA1j}
19 | for GAMMAj in "${GAMMAS[@]}"; do
20 | for SPLITi in $(seq 0 4); do
21 | export SPLIT=${SPLITi}
22 | export GAMMA=${GAMMAj}
23 | export OUT=${BASE_OUT}/no_unsup/
24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \
25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \
26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh
27 |
28 | done
29 | done
30 | done
31 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/reddit-exps/submit_nonlinear.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/
6 |
7 | export MODE=simple
8 | export SUBS=13,6,8
9 | export LINOUTCOME=f
10 | export RECONLOSS=t
11 | export SUPLOSS=t
12 |
13 | export BETA0=1.0
14 | declare -a BETA1S=(1.0 10.0 100.0)
15 | declare -a GAMMAS=(1.0 4.0)
16 |
17 | for BETA1j in "${BETA1S[@]}"; do
18 | export BETA1=${BETA1j}
19 | for GAMMAj in "${GAMMAS[@]}"; do
20 | for SPLITi in $(seq 0 4); do
21 | export SPLIT=${SPLITi}
22 | export GAMMA=${GAMMAj}
23 | export OUT=${BASE_OUT}/non_linear/
24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \
25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \
26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh
27 | done
28 | done
29 | done
30 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/reddit-exps/submit_reddit_simulation.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/
6 |
7 | export MODE=simple
8 | export SUBS=13,6,8
9 | export LINOUTCOME=t
10 | export RECONLOSS=t
11 | export SUPLOSS=t
12 |
13 | export BETA0=1.0
14 | declare -a BETA1S=(1.0 10.0 100.0)
15 | declare -a GAMMAS=(1.0 4.0)
16 |
17 | for BETA1j in "${BETA1S[@]}"; do
18 | export BETA1=${BETA1j}
19 | for GAMMAj in "${GAMMAS[@]}"; do
20 | for SPLITi in $(seq 0 4); do
21 | export SPLIT=${SPLITi}
22 | export GAMMA=${GAMMAj}
23 | export OUT=${BASE_OUT}/base_model/
24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \
25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \
26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh
27 | done
28 | done
29 | done
30 |
--------------------------------------------------------------------------------
/src/supervised_lda/submit_scripts/reddit-exps/submit_reddit_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/
3 |
4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/
5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/
6 |
7 | export MODE=simple
8 | export SUBS=13,6,8
9 | export LINOUTCOME=t
10 | export RECONLOSS=t
11 | export SUPLOSS=t
12 |
13 | export BETA0=1.0
14 | declare -a BETA1S=(1.0)
15 | declare -a GAMMAS=(1.0)
16 |
17 | for BETA1j in "${BETA1S[@]}"; do
18 | export BETA1=${BETA1j}
19 | for GAMMAj in "${GAMMAS[@]}"; do
20 | for SPLITi in $(seq 0 1); do
21 | export SPLIT=${SPLITi}
22 | export GAMMA=${GAMMAj}
23 | export OUT=${BASE_OUT}/base_model/
24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \
25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \
26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh
27 | done
28 | done
29 | done
30 |
--------------------------------------------------------------------------------
/src/supervised_lda/supervised_topic_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | import numpy as np
4 | import math
5 |
6 | from torch import nn
7 |
8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9 |
10 | class SupervisedTopicModel(nn.Module):
11 | def __init__(self, num_topics, vocab_size, num_documents, t_hidden_size=800, theta_act='relu', enc_drop=0., outcome_linear_map=True):
12 | super(SupervisedTopicModel, self).__init__()
13 |
14 | ## define hyperparameters
15 | self.num_topics = num_topics
16 | self.vocab_size = vocab_size
17 | self.num_documents = num_documents
18 | self.t_hidden_size = t_hidden_size
19 | self.enc_drop = enc_drop
20 | self.t_drop = nn.Dropout(enc_drop)
21 | self.theta_act = self.get_activation(theta_act)
22 | self.outcome_linear_map = outcome_linear_map
23 |
24 | ## define the matrix containing the topic embeddings
25 | self.alphas = nn.Parameter(torch.randn(vocab_size, num_topics))
26 |
27 | if self.outcome_linear_map:
28 | ## define linear regression weights for predicting expected outcomes for treated
29 | self.w_expected_outcome_treated = nn.Linear(num_topics, 1)
30 |
31 | ## define linear regression weights for predicting expected outcomes for untreated
32 | self.w_expected_outcome_untreated = nn.Linear(num_topics, 1)
33 | else:
34 | self.f_outcome_treated = nn.Sequential(
35 | nn.Linear(num_topics, t_hidden_size),
36 | self.theta_act,
37 | # nn.BatchNorm1d(t_hidden_size),
38 | nn.Linear(t_hidden_size, t_hidden_size),
39 | self.theta_act,
40 | # nn.BatchNorm1d(t_hidden_size),
41 | nn.Linear(t_hidden_size,1)
42 | )
43 | self.f_outcome_untreated = nn.Sequential(
44 | nn.Linear(num_topics, t_hidden_size),
45 | self.theta_act,
46 | # nn.BatchNorm1d(t_hidden_size),
47 | nn.Linear(t_hidden_size, t_hidden_size),
48 | self.theta_act,
49 | # nn.BatchNorm1d(t_hidden_size),
50 | nn.Linear(t_hidden_size,1)
51 | )
52 | ## define linear regression weights for predicting binary treatment label
53 | self.w_treatment = nn.Linear(num_topics,1)
54 |
55 | self.q_theta = nn.Sequential(
56 | nn.Linear(vocab_size, t_hidden_size),
57 | self.theta_act,
58 | nn.BatchNorm1d(t_hidden_size),
59 | nn.Linear(t_hidden_size, t_hidden_size),
60 | self.theta_act,
61 | nn.BatchNorm1d(t_hidden_size)
62 | )
63 | self.mu_q_theta = nn.Linear(t_hidden_size, num_topics)
64 | self.logsigma_q_theta = nn.Linear(t_hidden_size, num_topics)
65 |
66 | def get_activation(self, act):
67 | if act == 'tanh':
68 | act = nn.Tanh()
69 | elif act == 'relu':
70 | act = nn.ReLU()
71 | elif act == 'softplus':
72 | act = nn.Softplus()
73 | elif act == 'rrelu':
74 | act = nn.RReLU()
75 | elif act == 'leakyrelu':
76 | act = nn.LeakyReLU()
77 | elif act == 'elu':
78 | act = nn.ELU()
79 | elif act == 'selu':
80 | act = nn.SELU()
81 | elif act == 'glu':
82 | act = nn.GLU()
83 | else:
84 | print('Defaulting to tanh activations...')
85 | act = nn.Tanh()
86 | return act
87 |
88 | def reparameterize(self, mu, logvar):
89 | """Returns a sample from a Gaussian distribution via reparameterization.
90 | """
91 | if self.training:
92 | std = torch.exp(0.5 * logvar)
93 | eps = torch.randn_like(std)
94 | return eps.mul_(std).add_(mu)
95 | else:
96 | return mu
97 |
98 | def encode(self, bows):
99 |         r"""Returns parameters of the variational distribution for \theta.
100 |
101 | input: bows
102 | batch of bag-of-words...tensor of shape bsz x V
103 | output: mu_theta, log_sigma_theta
104 | """
105 | q_theta = self.q_theta(bows)
106 | if self.enc_drop > 0:
107 | q_theta = self.t_drop(q_theta)
108 | mu_theta = self.mu_q_theta(q_theta)
109 | logsigma_theta = self.logsigma_q_theta(q_theta)
110 | kl_theta = -0.5 * torch.sum(1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1).mean()
111 | return mu_theta, logsigma_theta, kl_theta
112 |
113 | def get_beta(self):
114 | beta = F.softmax(self.alphas, dim=0).transpose(1, 0) ## softmax over vocab dimension
115 | return beta
116 |
117 | def get_theta(self, normalized_bows):
118 | mu_theta, logsigma_theta, kld_theta = self.encode(normalized_bows)
119 | z = self.reparameterize(mu_theta, logsigma_theta)
120 | theta = F.softmax(z, dim=-1)
121 | return theta, kld_theta
122 |
123 | def decode(self, theta, beta):
124 | res = torch.mm(theta, beta)
125 | preds = torch.log(res+1e-6)
126 | return preds
127 |
128 | def predict_treatment(self, theta):
129 | logits = self.w_treatment(theta)
130 | return logits
131 |
132 | def predict_outcome_st_treat(self, theta, treatment_labels):
133 |         treated_indices = treatment_labels == 1
134 | theta_treated = theta[treated_indices]
135 |
136 | if not self.outcome_linear_map:
137 | expected_outcome_treated = self.f_outcome_treated(theta_treated)
138 | else:
139 | expected_outcome_treated = self.w_expected_outcome_treated(theta_treated)
140 |
141 | return expected_outcome_treated
142 |
143 | def predict_outcome_st_no_treat(self, theta, treatment_labels):
144 |         untreated_indices = treatment_labels == 0
145 | theta_untreated = theta[untreated_indices]
146 |
147 | if not self.outcome_linear_map:
148 | expected_outcome_untreated = self.f_outcome_untreated(theta_untreated)
149 | else:
150 | expected_outcome_untreated = self.w_expected_outcome_untreated(theta_untreated)
151 |
152 | return expected_outcome_untreated
153 |
154 |
155 | def forward(self, bows, normalized_bows, treatment_labels, outcomes, dtype='real', use_supervised_loss=True):
156 | ## get \theta
157 | theta, kld_theta = self.get_theta(normalized_bows)
158 | beta = self.get_beta()
159 |
160 | bce_loss = nn.BCEWithLogitsLoss()
161 | mse_loss = nn.MSELoss()
162 |
163 | ## get reconstruction loss
164 | preds = self.decode(theta, beta)
165 | recon_loss = -(preds * bows).sum(1)
166 | recon_loss = recon_loss.mean()
167 |
168 | supervised_loss=None
169 | if use_supervised_loss:
170 |
171 | #get treatment loss
172 | treatment_logits = self.predict_treatment(theta).squeeze()
173 | treatment_loss = bce_loss(treatment_logits, treatment_labels)
174 |
175 | #get expected outcome loss
176 |             treated = treatment_labels == 1
177 |             untreated = treatment_labels == 0
178 | outcomes_treated = outcomes[treated]
179 | outcomes_untreated = outcomes[untreated]
180 | expected_treated = self.predict_outcome_st_treat(theta, treatment_labels).squeeze()
181 | expected_untreated = self.predict_outcome_st_no_treat(theta, treatment_labels).squeeze()
182 |
183 | if dtype == 'real':
184 | outcome_loss_treated = mse_loss(expected_treated,outcomes_treated)
185 |                 outcome_loss_untreated = mse_loss(expected_untreated, outcomes_untreated)
186 | else:
187 | outcome_loss_treated = bce_loss(expected_treated,outcomes_treated)
188 |                 outcome_loss_untreated = bce_loss(expected_untreated, outcomes_untreated)
189 |
190 | supervised_loss = treatment_loss + outcome_loss_treated + outcome_loss_untreated
191 |
192 | return recon_loss, supervised_loss, kld_theta
193 |
194 |
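195 | # Shape check (editor's sketch): one forward pass on random data to illustrate the three
196 | # loss components (reconstruction, supervised, KL). All inputs below are synthetic.
197 | if __name__ == '__main__':
198 |     torch.manual_seed(0)
199 |     n_docs, vocab_size, num_topics = 64, 30, 5
200 |     model = SupervisedTopicModel(num_topics, vocab_size, n_docs)
201 |     bows = torch.randint(1, 5, (n_docs, vocab_size)).float()
202 |     normalized = bows / bows.sum(dim=1, keepdim=True)
203 |     treatments = torch.bernoulli(torch.full((n_docs,), 0.5))
204 |     outcomes = torch.randn(n_docs)
205 |     recon, sup, kld = model(bows, normalized, treatments, outcomes, dtype='real')
206 |     print(recon.item(), sup.item(), kld.item())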
--------------------------------------------------------------------------------
/src/words_baseline/helpers.py:
--------------------------------------------------------------------------------
1 | from nltk.tokenize import word_tokenize
2 | from nltk.stem import WordNetLemmatizer
3 | from nltk.corpus import stopwords
4 | from sklearn.feature_extraction.text import CountVectorizer
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.decomposition import LatentDirichletAllocation
8 |
9 | class LemmaTokenizer(object):
10 | def __init__(self):
11 | self.wnl = WordNetLemmatizer()
12 | def __call__(self, articles):
13 | stop = stopwords.words('english')
14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop]
15 |
16 | def filter_by_subreddit(reddit, subs=None):
17 | if not subs:
18 | return reddit.index.values
19 | else:
20 | return reddit[reddit.subreddit.isin(subs)].index.values
21 |
22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.001):
23 | from nltk.corpus import stopwords
24 | '''
25 |     From a list of raw-text documents, build a DxV term-count matrix, where
26 |     D is the number of documents and
27 |     V is the size of the vocabulary, i.e. the number of unique terms found across all documents.
28 | '''
29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
30 | corpus = count_vect.fit_transform(documents)
31 | vocabulary = count_vect.get_feature_names()
32 |
33 | return corpus,vocabulary,count_vect
34 |
35 | def assign_dev_split(num_docs, percentage=0.05):
36 | indices = np.arange(num_docs)
37 | np.random.shuffle(indices)
38 | size = int(indices.shape[0]*percentage)
39 | dev = indices[:size]
40 | return dev
41 |
42 | def learn_topics(X, X_dev, K=50):
43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1)
44 | print("Fitting", K, "topics...")
45 | lda.fit(X)
46 | score = lda.perplexity(X_dev)
47 |     print("Perplexity:", score)
48 | topics = lda.components_
49 | return score, lda, topics
50 |
51 | def show_topics(vocab, topics, n_words=20):
52 | topic_keywords = []
53 | for topic_weights in topics:
54 | top_keyword_locs = (-topic_weights).argsort()[:n_words]
55 | topic_keywords.append(vocab.take(top_keyword_locs))
56 |
57 | df_topic_keywords = pd.DataFrame(topic_keywords)
58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
60 | return df_topic_keywords
61 |
62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'):
63 | filtered_indices = filtered_df[on].values
64 | doc_idx = [index_mapping[idx] for idx in filtered_indices]
65 | embeddings = doc_embeddings[doc_idx, :]
66 | return embeddings
67 |
68 | def make_index_mapping(df, on='post_index', convert_to_int=True):
69 | if on=='index':
70 | indices = df.index.values
71 | else:
72 | indices = df[on].values
73 |
74 | if convert_to_int:
75 | return {int(ind):i for (i,ind) in enumerate(indices)}
76 |
77 | return {ind:i for (i,ind) in enumerate(indices)}
78 |
79 | def assign_split(df, num_splits=10, col_to_add='split'):
80 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0])
81 | return df
82 |
--------------------------------------------------------------------------------
/src/words_baseline/peerread_output_ate.py:
--------------------------------------------------------------------------------
1 | from semi_parametric_estimation.ate import psi_q_only,psi_tmle_cont_outcome
2 | import numpy as np
3 | import pandas as pd
4 | import os
5 | from sklearn.linear_model import LogisticRegression, LinearRegression
6 | from sklearn.metrics import mean_squared_error as mse
7 | import argparse
8 | import sys
9 | from scipy.special import logit
10 | from scipy.sparse import load_npz
11 |
12 | def compute_ground_truth_treatment_effect(df):
13 | y1 = df['y1']
14 | y0 = df['y0']
15 | return y1.mean() - y0.mean()
16 |
17 | def get_log_outcomes(outcomes):
18 | #relu
19 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes])
20 | return np.log(outcomes)
21 |
22 | def predict_expected_outcomes(model, features):
23 | return model.predict_proba(features)[:,1]
24 |
25 | def fit_conditional_expected_outcomes(outcomes, features):
26 | model = LogisticRegression(solver='liblinear')
27 | model.fit(features, outcomes)
28 | if verbose:
29 | print("Training accuracy:", model.score(features, outcomes))
30 | return model
31 |
32 | def predict_treatment_probability(labels, features):
33 | model = LogisticRegression(solver='liblinear')
34 | model.fit(features, labels)
35 | if verbose:
36 | print("Training accuracy:", model.score(features, labels))
37 | treatment_probability = model.predict_proba(features)[:,1]
38 | return treatment_probability
39 |
40 | def load_simulated_data():
41 | sim_df = pd.read_csv(simulation_file, delimiter='\t')
42 | sim_df = sim_df.rename(columns={'index':'post_index'})
43 | return sim_df
44 |
45 | def load_term_counts(path='../dat/reddit/'):
46 | return load_npz(path + 'term_counts.npz').toarray()
47 |
48 | def main():
49 | if not dat_dir:
50 | term_counts = load_term_counts()
51 | else:
52 | term_counts = load_term_counts(path=dat_dir)
53 |
54 | sim_df = load_simulated_data()
55 | treatment_labels = sim_df.treatment.values
56 | indices = sim_df.post_index.values
57 | all_words = term_counts[indices, :]
58 |
59 | treated_sim = sim_df[sim_df.treatment==1]
60 | untreated_sim = sim_df[sim_df.treatment==0]
61 | treated_indices = treated_sim.post_index.values
62 | untreated_indices = untreated_sim.post_index.values
63 |
64 | all_outcomes = sim_df.outcome.values
65 | outcomes_st_treated = treated_sim.outcome.values
66 | outcomes_st_not_treated = untreated_sim.outcome.values
67 |
68 | words_st_treated = term_counts[treated_indices,:]
69 | words_st_not_treated = term_counts[untreated_indices,:]
70 |
71 | treatment_probability = predict_treatment_probability(treatment_labels, all_words)
72 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, words_st_treated)
73 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, words_st_not_treated)
74 |
75 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, all_words)
76 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, all_words)
77 |
78 | q_hat = psi_q_only(expected_outcome_st_not_treated, expected_outcome_st_treated,
79 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03)
80 |
81 | tmle = psi_tmle_cont_outcome(expected_outcome_st_not_treated, expected_outcome_st_treated,
82 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03)[0]
83 |
84 | print("Q hat:", q_hat)
85 | print("TMLE:", tmle)
86 |
87 |
88 | if __name__ == '__main__':
89 | parser = argparse.ArgumentParser()
90 | parser.add_argument("--dat-dir", action="store", default=None)
91 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/')
92 | parser.add_argument("--mode", action="store", default="simple")
93 | parser.add_argument("--params", action="store", default="1.0")
94 | parser.add_argument("--verbose", action='store_true')
95 | args = parser.parse_args()
96 |
97 | sim_dir = args.sim_dir
98 | dat_dir = args.dat_dir
99 | verbose = args.verbose
100 | params = args.params
101 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0'
102 | mode = args.mode
103 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv"
104 |
105 | main()
--------------------------------------------------------------------------------
/src/words_baseline/reddit_output_att.py:
--------------------------------------------------------------------------------
1 | from semi_parametric_estimation.att import att_estimates, psi_plugin, psi_q_only
2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed
3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split
4 | import numpy as np
5 | import pandas as pd
6 | import os
7 | from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
8 | from sklearn.metrics import mean_squared_error as mse
9 | import argparse
10 | import sys
11 | from scipy.special import logit
12 | from scipy.sparse import load_npz
13 |
14 | def get_log_outcomes(outcomes):
15 | #relu
16 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes])
17 | return np.log(outcomes)
18 |
19 | def predict_expected_outcomes(model, features):
20 | return model.predict(features)
21 |
22 | def fit_conditional_expected_outcomes(outcomes, features):
23 | model = Ridge()
24 | model.fit(features, outcomes)
25 | predict = model.predict(features)
26 | if verbose:
27 | print("Training MSE:", mse(outcomes, predict))
28 | return model
29 |
30 | def predict_treatment_probability(labels, features):
31 | model = LogisticRegression(solver='liblinear')
32 | model.fit(features, labels)
33 | if verbose:
34 | print("Training accuracy:", model.score(features, labels))
35 | treatment_probability = model.predict_proba(features)[:,1]
36 | return treatment_probability
37 |
38 | def load_simulated_data():
39 | sim_df = pd.read_csv(simulation_file, delimiter='\t')
40 | sim_df = sim_df.rename(columns={'index':'post_index'})
41 | return sim_df
42 |
43 | def load_term_counts(path='../dat/reddit/'):
44 | return load_npz(path + 'term_counts.npz').toarray()
45 |
46 | def main():
47 |
48 | if not dat_dir:
49 | term_counts = load_term_counts()
50 | else:
51 | term_counts = load_term_counts(path=dat_dir)
52 |
53 | sim_df = load_simulated_data()
54 | treatment_labels = sim_df.treatment.values
55 | indices = sim_df.post_index.values
56 | all_words = term_counts[indices, :]
57 |
58 | treated_sim = sim_df[sim_df.treatment==1]
59 | untreated_sim = sim_df[sim_df.treatment==0]
60 | treated_indices = treated_sim.post_index.values
61 | untreated_indices = untreated_sim.post_index.values
62 |
63 | all_outcomes = sim_df.outcome.values
64 | outcomes_st_treated = treated_sim.outcome.values
65 | outcomes_st_not_treated = untreated_sim.outcome.values
66 |
67 | words_st_treated = term_counts[treated_indices,:]
68 | words_st_not_treated = term_counts[untreated_indices,:]
69 |
70 | treatment_probability = predict_treatment_probability(treatment_labels, all_words)
71 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, words_st_treated)
72 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, words_st_not_treated)
73 |
74 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, all_words)
75 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, all_words)
76 |
77 | q_hat = psi_q_only(expected_outcome_st_not_treated, expected_outcome_st_treated,
78 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean())
79 |
80 |     plugin = psi_plugin(expected_outcome_st_not_treated, expected_outcome_st_treated,
81 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean())
82 |
83 | print("Q hat:", q_hat)
84 |     print("Plugin:", plugin)
85 |
86 | if __name__ == '__main__':
87 | parser = argparse.ArgumentParser()
88 | parser.add_argument("--dat-dir", action="store", default=None)
89 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/')
90 | parser.add_argument("--subs", action="store", default='13,6,8')
91 | parser.add_argument("--mode", action="store", default="simple")
92 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0")
93 | parser.add_argument("--verbose", action='store_true')
94 | args = parser.parse_args()
95 |
96 | sim_dir = args.sim_dir
97 | dat_dir = args.dat_dir
98 | subs = None
99 | if args.subs != '':
100 | subs = [int(s) for s in args.subs.split(',')]
101 | verbose = args.verbose
102 | params = args.params.split(',')
103 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2]
104 | subs_string = ', '.join(args.subs.split(','))
105 | mode = args.mode
106 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv"
107 |
108 | main()
--------------------------------------------------------------------------------
/src/words_baseline/scripts/sweep_over_sims.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #NUM_SEED=2
3 | #SEEDS=$(seq 0 $NUM_SEED)
4 | rm ../dat/reddit/sim/reddit_subreddit_based/two-stage-lda-estimates.out
5 | export SUBREDDITS=13,6,8
6 | export BETA0=1.0
7 | declare -a SIMMODES=('simple')
8 | declare -a BETA1S=(1.0 10.0 100.0)
9 | declare -a GAMMAS=(1.0 4.0)
10 |
11 | for SIMMODEj in "${SIMMODES[@]}"; do
12 | for BETA1j in "${BETA1S[@]}"; do
13 | for GAMMAj in "${GAMMAS[@]}"; do
14 |       python -m words_baseline.reddit_output_att \
15 | --subs=${SUBREDDITS} \
16 | --mode=${SIMMODEj} \
17 | --params=${BETA0},${BETA1j},${GAMMAj}
18 | done
19 | done
20 | done
--------------------------------------------------------------------------------