├── .gitignore ├── LICENSE ├── README.md ├── dat ├── PeerRead │ └── proc │ │ └── arxiv-all.tf_record └── reddit │ └── README.md └── src ├── .gitignore ├── .idea ├── encodings.xml ├── misc.xml ├── modules.xml ├── src.iml └── vcs.xml ├── PeerRead ├── ScienceParse │ ├── Paper.py │ ├── README.md │ ├── Review.py │ ├── ScienceParse.py │ ├── ScienceParseReader.py │ └── __init__.py ├── __init__.py ├── data_cleaning │ ├── PeerRead_hand_features.py │ ├── __init__.py │ ├── clean_PeerRead.py │ ├── extra_vocab.py │ ├── process_PeerRead_abstracts.py │ └── scripts │ │ ├── clean_PeerRead.sh │ │ ├── clean_nips_prefix.sh │ │ └── merge_train_dev_test.sh ├── dataset │ ├── __init__.py │ ├── array_from_dataset.py │ ├── dataset.py │ └── sentence_masking.py ├── model │ ├── __init__.py │ ├── bert_multiclass.py │ ├── run_causal_bert.py │ └── run_multiclass.py └── submit_scripts │ ├── run_model.sh │ └── run_unsupervised.sh ├── __init__.py ├── bert ├── README ├── __init__.py ├── create_pretraining_data.py ├── modeling.py ├── optimization.py └── tokenization.py ├── causal_bert ├── __init__.py ├── bert_predictors.py ├── bert_unsupervised.py └── logging.py ├── data_cleaning └── reddit_posts.py ├── lda_baseline ├── helpers.py ├── peerread_fit_topics.py ├── peerread_get_abstracts.py ├── peerread_output_att.py ├── reddit_fit_topics.py ├── reddit_output_att.py └── scripts │ └── sweep_over_sims.sh ├── model_checking └── plot_adjustment.py ├── plot_treatment_model.ipynb ├── reddit ├── __init__.py ├── data_cleaning │ ├── BigQuery_get_data │ ├── __init__.py │ ├── process_reddit.py │ ├── reddit_gender_sentiment.ipynb │ └── reddit_posts.py ├── dataset │ ├── __init__.py │ ├── array_from_dataset.py │ ├── dataset.py │ └── sentence_masking.py ├── model │ ├── __init__.py │ ├── run_causal_bert.py │ ├── run_subreddit_classifier.py │ ├── run_unsupervised_pretraining.py │ └── subreddit_predictors.py └── submit_scripts │ ├── run_model.sh │ └── run_unsupervised.sh ├── result_processing ├── compute_ate.py ├── compute_att.py ├── helpers.py ├── process_predictions.py ├── prop_sim_plotting.py └── test_cond_indep.py ├── semi_parametric_estimation ├── __init__.py ├── ate.py ├── att.py └── helpers.py ├── supervised_lda ├── add_split_to_simulations.ipynb ├── compute_estimates.py ├── helpers.py ├── peerread_output_att.py ├── reddit_output_att.py ├── run_supervised_tm.py ├── submit_scripts │ ├── peerread-exps │ │ ├── run_peerread_simulation.sh │ │ ├── submit_no_sup.sh │ │ ├── submit_no_unsup.sh │ │ ├── submit_nonlinear.sh │ │ └── submit_peerread_simulation.sh │ └── reddit-exps │ │ ├── run_reddit_simulation.sh │ │ ├── submit_no_sup.sh │ │ ├── submit_no_unsup.sh │ │ ├── submit_nonlinear.sh │ │ ├── submit_reddit_simulation.sh │ │ └── submit_reddit_test.sh ├── supervised_topic_model.py └── test_slda.ipynb └── words_baseline ├── helpers.py ├── peerread_output_ate.py ├── reddit_output_att.py └── scripts └── sweep_over_sims.sh /.gitignore: -------------------------------------------------------------------------------- 1 | logdir/** 2 | **/tmp/** 3 | output/** 4 | dat/** 5 | dat/gender-text-corpus 6 | .DS_Store 7 | **/.DS_Store 8 | **/*.pyc 9 | **/*.pyo 10 | *checkpoint* 11 | *aux 12 | *log 13 | *.out 14 | *.synct* 15 | *__pycache__* 16 | 17 | ################################# 18 | # Victor's standard gitignore 19 | # mostly python and tex 20 | ################################# 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | 
.Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | 127 | # JetBrains (PyCharm) stuff 128 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 129 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 130 | 131 | # User-specific stuff 132 | .idea/**/workspace.xml 133 | .idea/**/tasks.xml 134 | .idea/**/usage.statistics.xml 135 | .idea/**/dictionaries 136 | .idea/**/shelf 137 | 138 | # Generated files 139 | .idea/**/contentModel.xml 140 | 141 | # Sensitive or high-churn files 142 | .idea/**/dataSources/ 143 | .idea/**/dataSources.ids 144 | .idea/**/dataSources.local.xml 145 | .idea/**/sqlDataSources.xml 146 | .idea/**/dynamic.xml 147 | .idea/**/uiDesigner.xml 148 | .idea/**/dbnavigator.xml 149 | 150 | # Gradle 151 | .idea/**/gradle.xml 152 | .idea/**/libraries 153 | 154 | # Gradle and Maven with auto-import 155 | # When using Gradle or Maven with auto-import, you should exclude module files, 156 | # since they will be recreated, and may cause churn. Uncomment if using 157 | # auto-import. 
158 | # .idea/modules.xml 159 | # .idea/*.iml 160 | # .idea/modules 161 | 162 | # CMake 163 | cmake-build-*/ 164 | 165 | # Mongo Explorer plugin 166 | .idea/**/mongoSettings.xml 167 | 168 | # File-based project format 169 | *.iws 170 | 171 | # IntelliJ 172 | out/ 173 | 174 | # mpeltonen/sbt-idea plugin 175 | .idea_modules/ 176 | 177 | # JIRA plugin 178 | atlassian-ide-plugin.xml 179 | 180 | # Cursive Clojure plugin 181 | .idea/replstate.xml 182 | 183 | # Crashlytics plugin (for Android Studio and IntelliJ) 184 | com_crashlytics_export_strings.xml 185 | crashlytics.properties 186 | crashlytics-build.properties 187 | fabric.properties 188 | 189 | # Editor-based Rest Client 190 | .idea/httpRequests 191 | 192 | # Android studio 3.1+ serialized cache file 193 | .idea/caches/build_file_checksums.ser 194 | 195 | # text 196 | *.pdf 197 | 198 | # linux backup files 199 | *~ 200 | *# 201 | 202 | ## Core latex/pdflatex auxiliary files: 203 | *.aux 204 | *.lof 205 | *.log 206 | *.lot 207 | *.fls 208 | *.out 209 | *.toc 210 | *.fmt 211 | *.fot 212 | *.cb 213 | *.cb2 214 | 215 | ## Intermediate documents: 216 | *.dvi 217 | *-converted-to.* 218 | # these rules might exclude image files for figures etc. 219 | # *.ps 220 | # *.eps 221 | # *.pdf 222 | 223 | ## Generated if empty string is given at "Please type another file name for output:" 224 | .pdf 225 | 226 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 227 | *.bbl 228 | *.bcf 229 | *.blg 230 | *-blx.aux 231 | *-blx.bib 232 | *.run.xml 233 | 234 | ## Build tool auxiliary files: 235 | *.fdb_latexmk 236 | *.synctex 237 | *.synctex(busy) 238 | *.synctex.gz 239 | *.synctex.gz(busy) 240 | *.pdfsync 241 | 242 | ## Auxiliary and intermediate files from other packages: 243 | # algorithms 244 | *.alg 245 | *.loa 246 | 247 | # achemso 248 | acs-*.bib 249 | 250 | # amsthm 251 | *.thm 252 | 253 | # beamer 254 | *.nav 255 | *.pre 256 | *.snm 257 | *.vrb 258 | 259 | # changes 260 | *.soc 261 | 262 | # cprotect 263 | *.cpt 264 | 265 | # elsarticle (documentclass of Elsevier journals) 266 | *.spl 267 | 268 | # endnotes 269 | *.ent 270 | 271 | # fixme 272 | *.lox 273 | 274 | # feynmf/feynmp 275 | *.mf 276 | *.mp 277 | *.t[1-9] 278 | *.t[1-9][0-9] 279 | *.tfm 280 | 281 | #(r)(e)ledmac/(r)(e)ledpar 282 | *.end 283 | *.?end 284 | *.[1-9] 285 | *.[1-9][0-9] 286 | *.[1-9][0-9][0-9] 287 | *.[1-9]R 288 | *.[1-9][0-9]R 289 | *.[1-9][0-9][0-9]R 290 | *.eledsec[1-9] 291 | *.eledsec[1-9]R 292 | *.eledsec[1-9][0-9] 293 | *.eledsec[1-9][0-9]R 294 | *.eledsec[1-9][0-9][0-9] 295 | *.eledsec[1-9][0-9][0-9]R 296 | 297 | # glossaries 298 | *.acn 299 | *.acr 300 | *.glg 301 | *.glo 302 | *.gls 303 | *.glsdefs 304 | 305 | # gnuplottex 306 | *-gnuplottex-* 307 | 308 | # gregoriotex 309 | *.gaux 310 | *.gtex 311 | 312 | # hyperref 313 | *.brf 314 | 315 | # knitr 316 | *-concordance.tex 317 | # TODO Comment the next line if you want to keep your tikz graphics files 318 | *.tikz 319 | *-tikzDictionary 320 | 321 | # listings 322 | *.lol 323 | 324 | # makeidx 325 | *.idx 326 | *.ilg 327 | *.ind 328 | *.ist 329 | 330 | # minitoc 331 | *.maf 332 | *.mlf 333 | *.mlt 334 | *.mtc[0-9]* 335 | *.slf[0-9]* 336 | *.slt[0-9]* 337 | *.stc[0-9]* 338 | 339 | # minted 340 | _minted* 341 | *.pyg 342 | 343 | # morewrites 344 | *.mw 345 | 346 | # nomencl 347 | *.nlo 348 | 349 | # pax 350 | *.pax 351 | 352 | # pdfpcnotes 353 | *.pdfpc 354 | 355 | # sagetex 356 | *.sagetex.sage 357 | *.sagetex.py 358 | *.sagetex.scmd 359 | 360 | # scrwfile 361 | *.wrt 362 | 363 | # sympy 364 | *.sout 365 | *.sympy 366 
| sympy-plots-for-*.tex/ 367 | 368 | # pdfcomment 369 | *.upa 370 | *.upb 371 | 372 | # pythontex 373 | *.pytxcode 374 | pythontex-files-*/ 375 | 376 | # thmtools 377 | *.loe 378 | 379 | # TikZ & PGF 380 | *.dpth 381 | *.md5 382 | *.auxlock 383 | 384 | # todonotes 385 | *.tdo 386 | 387 | # easy-todo 388 | *.lod 389 | 390 | # xindy 391 | *.xdy 392 | 393 | # xypic precompiled matrices 394 | *.xyc 395 | 396 | # endfloat 397 | *.ttt 398 | *.fff 399 | 400 | # Latexian 401 | TSWLatexianTemp* 402 | 403 | ## Editors: 404 | # WinEdt 405 | *.bak 406 | *.sav 407 | 408 | # Texpad 409 | .texpadtmp 410 | 411 | # Kile 412 | *.backup 413 | 414 | # KBibTeX 415 | *~[0-9]* 416 | 417 | # auto folder when using emacs and auctex 418 | auto/* 419 | 420 | # auto folder when using emacs and auctex 421 | auto 422 | 423 | # expex forward references with \gathertags 424 | *-tags.tex 425 | 426 | # os x stuff 427 | .DS_Store 428 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Blei Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains software and data for "Using Text Embeddings for Causal Inference" ([arxiv.org/abs/1905.12741](https://arxiv.org/abs/1905.12741)). 4 | The paper describes a method for causal inference with text documents. For example, does adding a 5 | theorem to a paper affect its chance of acceptance? The method adapts deep language models to address the causal problem. 6 | 7 | This software builds on 8 | 1. Bert: [github.com/google-research/bert](https://github.com/google-research/bert), and on 9 | 2. PeerRead: [github.com/allenai/PeerRead](https://github.com/allenai/PeerRead) 10 | 11 | We include pre-processed PeerRead arxiv data for convenience. 12 | 13 | There is also a [reference implementation in pytorch.](https://github.com/rpryzant/causal-bert-pytorch) 14 | 15 | # Tensorflow 2 16 | For new projects, we recommend building on the [reference tensorflow 2 implementation](https://github.com/vveitch/causal-text-embeddings-tf2). 17 | 18 | # Requirements and setup 19 | 20 | 1. You'll need to download a pre-trained BERT model (following the above github link). 
We use `uncased_L-12_H-768_A-12`. 21 | 2. Install Tensorflow 1.12. 22 | 23 | # Data 24 | 25 | 1. We include a pre-processed copy of PeerRead data for convenience. 26 | This data is a collection of arXiv papers submitted to computer science conferences, the accept/reject decisions for these papers, 27 | and their abstracts. 28 | The raw PeerRead data contains significantly more information. 29 | You can get the raw data by following instructions at [github.com/allenai/PeerRead](https://github.com/allenai/PeerRead). 30 | Running the included pre-processing scripts in the PeerRead folder will recreate the included tfrecord file. 31 | 32 | 2. The reddit data can be downloaded at [archive.org/details/reddit_posts_2018](https://archive.org/details/reddit_posts_2018). 33 | This data includes all top-level reddit comments where the gender of the poster was annotated in some fashion. 34 | Each post has meta information (score, date, username, etc.) and includes the text for the first reply. 35 | The processed data used in the paper can be recreated by running the pre-processing scripts in the `reddit` folder. 36 | 37 | You can also re-collect the data from Google BigQuery. 38 | The SQL command to do this is in `reddit/data_cleaning/BigQuery_get_data`. 39 | Modifying this script will allow you to change collection parameters (e.g., the year, which responses are included). 40 | 41 | 42 | # Reproducing the PeerRead experiments 43 | 44 | The default settings for the code match the settings used in the paper. 45 | These match the default settings used by BERT, except 46 | 1. we reduce batch size to allow training on a Titan X, and 47 | 2. we adjust the learning rate to account for this. 48 | 49 | You'll run the code from `src` as 50 | `./PeerRead/submit_scripts/run_model.sh` 51 | Before doing this, you'll need to edit `run_model.sh` to change 52 | `BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12` 53 | to 54 | `BERT_BASE_DIR=[path to BERT_pre-trained]/uncased_L-12_H-768_A-12`. 55 | 56 | The flag 57 | `--treatment=theorem_referenced` 58 | controls the experiment. 59 | The flag 60 | `--simulated=real` 61 | controls whether to use the real effect or one of the semi-synthetic modes. 62 | 63 | The effect estimates can be reproduced by running `python -m result_processing.compute_ate`. 64 | This takes in the predictions of the BERT model (in tsv format) and passes them into downstream estimators 65 | of the causal effect. 66 | 67 | To reproduce the baselines, you'll need to produce a tsv for each simulated dataset you want to test on. To do this, you can run `python -m PeerRead.dataset.array_from_dataset` from `src`. The flag `--beta1=1.0` controls the strength of the confounding. (The other flags control other simulation parameters not used in the paper.) 68 | 69 | # Misc. 70 | 71 | The experiments in the paper use a version of BERT that was further pre-trained on the PeerRead corpus 72 | using an unsupervised objective. 73 | This can be replicated with `./PeerRead/submit_scripts/run_unsupervised.sh`. 74 | This takes about 24 hours on a single Titan Xp. 75 | To use a pre-trained BERT, uncomment the `INIT_DIR` options in `run_model.sh`. 76 | 77 | # Reproducing the Reddit experiment 78 | 79 | 1. First, get the data by following the instructions above and save it as `dat/reddit/2018.json`. 80 | 2. Run data pre-processing with `python -m reddit.data_cleaning.process_reddit`. 81 | 3. Once the data is processed, the instructions for running the experiments are essentially the same as for PeerRead; a consolidated command sketch follows below.
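The following is a minimal sketch that consolidates the commands already listed in this README into one PeerRead run from `src`. It only restates steps described above; which flags each submit script actually accepts, and whether they are set on the command line or by editing the script, should be checked against the scripts themselves, so treat the flag placement shown here as an assumption.

```bash
# Assumes uncased_L-12_H-768_A-12 has been downloaded and BERT_BASE_DIR in the
# submit scripts has been pointed at it (see "Requirements and setup" above).
# Run from src/.

# 1. (Optional) further pre-train BERT on the PeerRead corpus with the
#    unsupervised objective; roughly 24 hours on a single Titan Xp.
./PeerRead/submit_scripts/run_unsupervised.sh

# 2. Train the causal BERT model. The treatment and simulation settings
#    (--treatment=theorem_referenced, --simulated=real) control the experiment.
./PeerRead/submit_scripts/run_model.sh

# 3. Convert the model's tsv predictions into downstream effect estimates.
python -m result_processing.compute_ate

# 4. For the baselines, write out a tsv per simulated dataset;
#    --beta1 controls the strength of the confounding.
python -m PeerRead.dataset.array_from_dataset --beta1=1.0
```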
82 | 83 | # Maintainers 84 | [Dhanya Sridhar](https://github.com/dsridhar91) and [Victor Veitch](https://github.com/vveitch) 85 | 86 | -------------------------------------------------------------------------------- /dat/PeerRead/proc/arxiv-all.tf_record: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/dat/PeerRead/proc/arxiv-all.tf_record -------------------------------------------------------------------------------- /dat/reddit/README.md: -------------------------------------------------------------------------------- 1 | This folder is the expected location for the reddit data, "2018.json". 2 | 3 | Follow the instructions in the top-level README to get this data and save it here. 4 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | logdir/** 2 | **/tmp/** 3 | output/** 4 | dat/** 5 | dat/gender-text-corpus 6 | .DS_Store 7 | **/.DS_Store 8 | **/*.pyc 9 | **/*.pyo 10 | *checkpoint* 11 | *aux 12 | *log 13 | *.out 14 | *.synct* 15 | *__pycache__* 16 | 17 | ################################# 18 | # Victor's standard gitignore 19 | # mostly python and tex 20 | ################################# 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | 127 | # JetBrains (PyCharm) stuff 128 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 129 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 130 | 131 | # User-specific stuff 132 | .idea/**/workspace.xml 133 | .idea/**/tasks.xml 134 | .idea/**/usage.statistics.xml 135 | .idea/**/dictionaries 136 | .idea/**/shelf 137 | 138 | # Generated files 139 | .idea/**/contentModel.xml 140 | 141 | # Sensitive or high-churn files 142 | .idea/**/dataSources/ 143 | .idea/**/dataSources.ids 144 | .idea/**/dataSources.local.xml 145 | .idea/**/sqlDataSources.xml 146 | .idea/**/dynamic.xml 147 | .idea/**/uiDesigner.xml 148 | .idea/**/dbnavigator.xml 149 | 150 | # Gradle 151 | .idea/**/gradle.xml 152 | .idea/**/libraries 153 | 154 | # Gradle and Maven with auto-import 155 | # When using Gradle or Maven with auto-import, you should exclude module files, 156 | # since they will be recreated, and may cause churn. Uncomment if using 157 | # auto-import. 158 | # .idea/modules.xml 159 | # .idea/*.iml 160 | # .idea/modules 161 | 162 | # CMake 163 | cmake-build-*/ 164 | 165 | # Mongo Explorer plugin 166 | .idea/**/mongoSettings.xml 167 | 168 | # File-based project format 169 | *.iws 170 | 171 | # IntelliJ 172 | out/ 173 | 174 | # mpeltonen/sbt-idea plugin 175 | .idea_modules/ 176 | 177 | # JIRA plugin 178 | atlassian-ide-plugin.xml 179 | 180 | # Cursive Clojure plugin 181 | .idea/replstate.xml 182 | 183 | # Crashlytics plugin (for Android Studio and IntelliJ) 184 | com_crashlytics_export_strings.xml 185 | crashlytics.properties 186 | crashlytics-build.properties 187 | fabric.properties 188 | 189 | # Editor-based Rest Client 190 | .idea/httpRequests 191 | 192 | # Android studio 3.1+ serialized cache file 193 | .idea/caches/build_file_checksums.ser 194 | 195 | # text 196 | *.pdf 197 | 198 | # linux backup files 199 | *~ 200 | *# 201 | 202 | ## Core latex/pdflatex auxiliary files: 203 | *.aux 204 | *.lof 205 | *.log 206 | *.lot 207 | *.fls 208 | *.out 209 | *.toc 210 | *.fmt 211 | *.fot 212 | *.cb 213 | *.cb2 214 | 215 | ## Intermediate documents: 216 | *.dvi 217 | *-converted-to.* 218 | # these rules might exclude image files for figures etc. 
219 | # *.ps 220 | # *.eps 221 | # *.pdf 222 | 223 | ## Generated if empty string is given at "Please type another file name for output:" 224 | .pdf 225 | 226 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 227 | *.bbl 228 | *.bcf 229 | *.blg 230 | *-blx.aux 231 | *-blx.bib 232 | *.run.xml 233 | 234 | ## Build tool auxiliary files: 235 | *.fdb_latexmk 236 | *.synctex 237 | *.synctex(busy) 238 | *.synctex.gz 239 | *.synctex.gz(busy) 240 | *.pdfsync 241 | 242 | ## Auxiliary and intermediate files from other packages: 243 | # algorithms 244 | *.alg 245 | *.loa 246 | 247 | # achemso 248 | acs-*.bib 249 | 250 | # amsthm 251 | *.thm 252 | 253 | # beamer 254 | *.nav 255 | *.pre 256 | *.snm 257 | *.vrb 258 | 259 | # changes 260 | *.soc 261 | 262 | # cprotect 263 | *.cpt 264 | 265 | # elsarticle (documentclass of Elsevier journals) 266 | *.spl 267 | 268 | # endnotes 269 | *.ent 270 | 271 | # fixme 272 | *.lox 273 | 274 | # feynmf/feynmp 275 | *.mf 276 | *.mp 277 | *.t[1-9] 278 | *.t[1-9][0-9] 279 | *.tfm 280 | 281 | #(r)(e)ledmac/(r)(e)ledpar 282 | *.end 283 | *.?end 284 | *.[1-9] 285 | *.[1-9][0-9] 286 | *.[1-9][0-9][0-9] 287 | *.[1-9]R 288 | *.[1-9][0-9]R 289 | *.[1-9][0-9][0-9]R 290 | *.eledsec[1-9] 291 | *.eledsec[1-9]R 292 | *.eledsec[1-9][0-9] 293 | *.eledsec[1-9][0-9]R 294 | *.eledsec[1-9][0-9][0-9] 295 | *.eledsec[1-9][0-9][0-9]R 296 | 297 | # glossaries 298 | *.acn 299 | *.acr 300 | *.glg 301 | *.glo 302 | *.gls 303 | *.glsdefs 304 | 305 | # gnuplottex 306 | *-gnuplottex-* 307 | 308 | # gregoriotex 309 | *.gaux 310 | *.gtex 311 | 312 | # hyperref 313 | *.brf 314 | 315 | # knitr 316 | *-concordance.tex 317 | # TODO Comment the next line if you want to keep your tikz graphics files 318 | *.tikz 319 | *-tikzDictionary 320 | 321 | # listings 322 | *.lol 323 | 324 | # makeidx 325 | *.idx 326 | *.ilg 327 | *.ind 328 | *.ist 329 | 330 | # minitoc 331 | *.maf 332 | *.mlf 333 | *.mlt 334 | *.mtc[0-9]* 335 | *.slf[0-9]* 336 | *.slt[0-9]* 337 | *.stc[0-9]* 338 | 339 | # minted 340 | _minted* 341 | *.pyg 342 | 343 | # morewrites 344 | *.mw 345 | 346 | # nomencl 347 | *.nlo 348 | 349 | # pax 350 | *.pax 351 | 352 | # pdfpcnotes 353 | *.pdfpc 354 | 355 | # sagetex 356 | *.sagetex.sage 357 | *.sagetex.py 358 | *.sagetex.scmd 359 | 360 | # scrwfile 361 | *.wrt 362 | 363 | # sympy 364 | *.sout 365 | *.sympy 366 | sympy-plots-for-*.tex/ 367 | 368 | # pdfcomment 369 | *.upa 370 | *.upb 371 | 372 | # pythontex 373 | *.pytxcode 374 | pythontex-files-*/ 375 | 376 | # thmtools 377 | *.loe 378 | 379 | # TikZ & PGF 380 | *.dpth 381 | *.md5 382 | *.auxlock 383 | 384 | # todonotes 385 | *.tdo 386 | 387 | # easy-todo 388 | *.lod 389 | 390 | # xindy 391 | *.xdy 392 | 393 | # xypic precompiled matrices 394 | *.xyc 395 | 396 | # endfloat 397 | *.ttt 398 | *.fff 399 | 400 | # Latexian 401 | TSWLatexianTemp* 402 | 403 | ## Editors: 404 | # WinEdt 405 | *.bak 406 | *.sav 407 | 408 | # Texpad 409 | .texpadtmp 410 | 411 | # Kile 412 | *.backup 413 | 414 | # KBibTeX 415 | *~[0-9]* 416 | 417 | # auto folder when using emacs and auctex 418 | auto/* 419 | 420 | # auto folder when using emacs and auctex 421 | auto 422 | 423 | # expex forward references with \gathertags 424 | *-tags.tex 425 | 426 | # os x stuff 427 | .DS_Store 428 | -------------------------------------------------------------------------------- /src/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- 
/src/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | -------------------------------------------------------------------------------- /src/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/.idea/src.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /src/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/Paper.py: -------------------------------------------------------------------------------- 1 | import re,io,json,sys 2 | from .Review import Review 3 | 4 | class Paper: 5 | """A paper class, which contains relevant fields and a list of reviews""" 6 | def __init__(self, TITLE, ABSTRACT, ID, REVIEWS, AUTHORS=None, CONFERENCE=None, ACCEPTED=None, SCORE=None, 7 | PUBLICATION_TYPE=None, SCIENCEPARSE=None, KEYWORDS=None, AUTHOR_EMAILS=None, DATE_OF_SUBMISSION=None, 8 | SUBJECTS=None,COMMENTS=None,VERSION=None,HISTORIES=None): 9 | self.TITLE = TITLE 10 | self.ABSTRACT = re.sub("\\n", " ", ABSTRACT) 11 | self.ID = ID 12 | self.AUTHORS = AUTHORS 13 | self.REVIEWS = REVIEWS 14 | self.SCIENCEPARSE = SCIENCEPARSE 15 | self.CONFERENCE = CONFERENCE 16 | self.ACCEPTED = ACCEPTED 17 | self.SCORE = SCORE 18 | self.PUBLICATION_TYPE = PUBLICATION_TYPE 19 | self.KEYWORDS = KEYWORDS 20 | self.AUTHOR_EMAILS = AUTHOR_EMAILS 21 | self.DATE_OF_SUBMISSION = DATE_OF_SUBMISSION 22 | 23 | # additional properties for arxiv papers 24 | self.SUBJECTS = SUBJECTS 25 | self.COMMENTS = COMMENTS 26 | self.VERSION = VERSION 27 | self.HISTORIES = HISTORIES #[(version,date,link,comments),...] 
28 | 29 | @staticmethod 30 | def from_softconf_dump(json_file, conference=None): 31 | with io.open(json_file, "r", encoding="utf8") as ifh: 32 | json_str = ifh.read() 33 | 34 | # print (json_str) 35 | json_data = json.loads(json_str)["submissions"] 36 | 37 | papers = [] 38 | for i in range(len(json_data)): 39 | reviews = [] 40 | for k in range(len(json_data[i]["reviews"])): 41 | # print(json_data[i]["reviews"][k]) 42 | review_data = [] 43 | 44 | review = Review.from_json_object(json_data[i]["reviews"][k], k==i==0) 45 | #review = None 46 | 47 | reviews.append(review) 48 | 49 | authors = json_data[i]["authors"] if "authors" in json_data[i] else None 50 | score = json_data[i]["score"] if "score" in json_data[i] else None 51 | accepted = json_data[i]["accepted"] if "accepted" in json_data[i] else None 52 | publication_type = json_data[i]["publication_type"] if "publication_type" in json_data[i] else None 53 | keywords = json_data[i]["KEYWORDS"] if "KEYWORDS" in json_data[i] else None 54 | author_emails = json_data[i]["AUTHOR_EMAILS"] if "AUTHOR_EMAILS" in json_data[i] else None 55 | date_of_submission = json_data[i]["DATE_OF_SUBMISSION"] if "DATE_OF_SUBMISSION" in json_data[i] else None 56 | 57 | paper = Paper(json_data[i]["title"], json_data[i]["abstract"], json_data[i]["id"], reviews, authors, \ 58 | conference, accepted, score, publication_type, None, keywords, author_emails, \ 59 | date_of_submission) 60 | 61 | papers.append(paper) 62 | # break 63 | 64 | return papers 65 | 66 | @staticmethod 67 | def from_json(json_filename, from_annotated = False): 68 | paper = Paper('', '', None, []) 69 | 70 | datas = [] 71 | with io.open(json_filename, mode='rt', encoding='utf8') as json_file: 72 | for line in json_file: 73 | try: 74 | data = json.loads(line.strip()) 75 | datas.append(data) 76 | except Exception as e: 77 | print(line) 78 | continue 79 | if len(datas)==0: return None 80 | data = datas[-1] 81 | 82 | # Read required fields. 83 | assert 'title' in data 84 | assert 'abstract' in data 85 | paper.TITLE = data['title'] 86 | paper.ABSTRACT = data['abstract'] 87 | 88 | if 'id' in data: 89 | if data['id'] == "": 90 | paper.ID = json_filename.split("/")[-1].split(".")[0] 91 | else: 92 | paper.ID = data['id'] 93 | else: 94 | paper.ID = json_filename.split("/")[-1].split(".")[0] 95 | 96 | # Read optional fields. 97 | paper.AUTHORS = data['authors'] if 'authors' in data else None 98 | paper.CONFERENCE = data['conference'] if 'conference' in data else None 99 | paper.ACCEPTED = data['accepted'] if 'accepted' in data else None 100 | paper.SCORE = data['score'] if 'score' in data else None 101 | paper.PUBLICATION_TYPE = data['publication_type'] if 'publication_type' in data else None 102 | paper.SCIENCEPARSE = data['scienceparse'] if 'scienceparse' in data else None 103 | paper.KEYWORDS = data['keywords'] if 'keywords' in data else None 104 | paper.AUTHOR_EMAILS = data['author_emails'] if 'author_emails' in data else None 105 | 106 | paper.DATE_OF_SUBMISSION = data['DATE_OF_SUBMISSION'] if 'DATE_OF_SUBMISSION' in data else None 107 | 108 | paper.SUBJECTS = data['SUBJECTS'] if 'SUBJECTS' in data else None 109 | paper.COMMENTS = data['COMMENTS'] if 'COMMENTS' in data else None 110 | paper.VERSION = data['VERSION'] if 'VERSION' in data else None 111 | paper.HISTORIES = data['histories'] if 'histories' in data else None 112 | 113 | # Read reviews (mandatory). 
114 | assert 'reviews' in data 115 | for review_data in data['reviews']: 116 | review = Review.from_json_object(review_data) 117 | paper.REVIEWS.append(review) 118 | return paper 119 | 120 | 121 | 122 | def to_json_object(self): 123 | data = dict() 124 | 125 | data["title"] = self.get_title() 126 | data["abstract"] = self.get_abstract() 127 | data["id"] = self.get_id() 128 | 129 | if self.AUTHORS is not None: 130 | data["authors"] = self.get_authors() 131 | 132 | if self.CONFERENCE is not None: 133 | data["conference"] = self.get_conference() 134 | 135 | if self.ACCEPTED is not None: 136 | data["accepted"] = self.get_accepted() 137 | 138 | if self.SCORE is not None: 139 | data["SCORE"] = self.get_score() 140 | 141 | if self.PUBLICATION_TYPE is not None: 142 | data["publication_type"] = self.get_publication_type() 143 | 144 | if self.SCIENCEPARSE is not None: 145 | data["SCIENCEPARSE"] = self.get_scienceparse() 146 | 147 | if self.AUTHOR_EMAILS is not None: 148 | data["AUTHOR_EMAILS"] = self.get_author_emails() 149 | 150 | if self.KEYWORDS is not None: 151 | data["KEYWORDS"] = self.get_keywords() 152 | 153 | if self.DATE_OF_SUBMISSION is not None: 154 | data["DATE_OF_SUBMISSION"] = self.get_date_of_submission() 155 | 156 | data["reviews"] = [] 157 | 158 | for r in self.get_reviews(): 159 | data["reviews"].append(r.to_json_object()) 160 | 161 | # added for arxiv papers 162 | 163 | if self.SUBJECTS is not None: 164 | data["SUBJECTS"] = self.get_subjects() 165 | 166 | if self.COMMENTS is not None: 167 | data["COMMENTS"] = self.get_comments() 168 | 169 | if self.VERSION is not None: 170 | data["VERSION"] = self.get_version() 171 | 172 | data["histories"] = [] 173 | if self.HISTORIES is not None: 174 | for h in self.get_histories(): 175 | if h is not None: 176 | v,d,l,p = h 177 | data["histories"].append((v,d,l, p if p else None)) 178 | 179 | return data 180 | 181 | def to_json(self, json_file, mode='a'): 182 | 183 | data = self.to_json_object() 184 | 185 | with open(json_file, mode) as ofh: 186 | json.dump(data, ofh) 187 | ofh.write("\n") 188 | 189 | 190 | def get_subjects(self): 191 | return self.SUBJECTS 192 | def get_comments(self): 193 | return self.COMMENTS 194 | def get_version(self): 195 | return self.VERSION 196 | def get_histories(self): 197 | return self.HISTORIES 198 | 199 | 200 | def get_title(self): 201 | return self.TITLE 202 | 203 | def get_abstract(self): 204 | return self.ABSTRACT 205 | 206 | def abstract_contains_a_term(self, term): 207 | return (term in self.ABSTRACT) 208 | 209 | def get_id(self): 210 | return self.ID 211 | 212 | def get_authors(self): 213 | return self.AUTHORS 214 | 215 | def get_reviews(self): 216 | return self.REVIEWS 217 | 218 | def get_scienceparse(self): 219 | return self.SCIENCEPARSE 220 | 221 | def get_title_len(self): 222 | return len(self.TITLE) 223 | 224 | def get_abstract_len(self): 225 | return len(self.ABSTRACT) 226 | 227 | def get_conference(self): 228 | return self.CONFERENCE 229 | 230 | def get_score(self): 231 | return self.SCORE 232 | 233 | def get_accepted(self): 234 | return self.ACCEPTED 235 | 236 | def get_publication_type(self): 237 | return self.PUBLICATION_TYPE 238 | 239 | def get_author_emails(self): 240 | return self.AUTHOR_EMAILS 241 | 242 | def get_keywords(self): 243 | return self.KEYWORDS 244 | 245 | def get_date_of_submission(self): 246 | return self.DATE_OF_SUBMISSION 247 | 248 | def main(args): 249 | papers = Paper.from_softconf_dump('../../data/conll16/reviews.json') 250 | for paper in papers: 251 | 
paper.to_json('../../data/conll16_new/{}.json'.format(paper.ID)) 252 | 253 | if __name__ == "__main__": 254 | sys.exit(main(sys.argv)) 255 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/README.md: -------------------------------------------------------------------------------- 1 | Code from ScienceParse (via PeerRead) 2 | 3 | TODO: determine liscense and add it -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/Review.py: -------------------------------------------------------------------------------- 1 | 2 | class Review: 3 | 4 | """A review class, contains all bunch of relevant fields""" 5 | def __init__(self, RECOMMENDATION, COMMENTS, REPLICABILITY=None, PRESENTATION_FORMAT=None, \ 6 | CLARITY=None, MEANINGFUL_COMPARISON=None, SUBSTANCE=None, REVIEWER_CONFIDENCE=None, \ 7 | SOUNDNESS_CORRECTNESS=None, APPROPRIATENESS=None, IMPACT=None, ORIGINALITY=None, OTHER_KEYS=None, \ 8 | IS_META_REVIEW=False, TITLE=None, DATE=None, RECOMMENDATION_UNOFFICIAL=None, IS_ANNOTATED=False): 9 | self.RECOMMENDATION = RECOMMENDATION 10 | self.RECOMMENDATION_UNOFFICIAL = RECOMMENDATION_UNOFFICIAL #None # only for aspect prediction 11 | self.IS_ANNOTATED = IS_ANNOTATED 12 | 13 | self.COMMENTS = COMMENTS 14 | self.REPLICABILITY = REPLICABILITY 15 | self.PRESENTATION_FORMAT = PRESENTATION_FORMAT 16 | self.CLARITY = CLARITY 17 | self.MEANINGFUL_COMPARISON = MEANINGFUL_COMPARISON 18 | self.SUBSTANCE = SUBSTANCE 19 | self.REVIEWER_CONFIDENCE = REVIEWER_CONFIDENCE 20 | self.SOUNDNESS_CORRECTNESS = SOUNDNESS_CORRECTNESS 21 | self.APPROPRIATENESS = APPROPRIATENESS 22 | self.IMPACT = IMPACT 23 | self.ORIGINALITY = ORIGINALITY 24 | self.OTHER_KEYS = OTHER_KEYS 25 | self.IS_META_REVIEW = IS_META_REVIEW 26 | self.TITLE = TITLE 27 | self.DATE = DATE 28 | 29 | @staticmethod 30 | def get_json_string(json_object, string, missing_fields): 31 | if string in json_object: 32 | return json_object[string] 33 | elif missing_fields is not None: 34 | missing_fields.append(string) 35 | 36 | return None 37 | 38 | @staticmethod 39 | def from_json_object(json_object, print_missing_fields=False): 40 | assert "comments" in json_object 41 | comments = json_object["comments"] 42 | 43 | missing_fields = None 44 | 45 | if print_missing_fields: 46 | missing_fields = [] 47 | 48 | recommendation = Review.get_json_string(json_object, "RECOMMENDATION", missing_fields) 49 | 50 | 51 | recommendation_unofficial = Review.get_json_string(json_object, "RECOMMENDATION_UNOFFICIAL", missing_fields) 52 | 53 | is_annotated = Review.get_json_string(json_object, "IS_ANNOTATED", missing_fields) 54 | 55 | replicability = Review.get_json_string(json_object, "REPLICABILITY", missing_fields) 56 | clarity = Review.get_json_string(json_object, "CLARITY", missing_fields) 57 | substance = Review.get_json_string(json_object, "SUBSTANCE", missing_fields) 58 | appropriateness = Review.get_json_string(json_object, "APPROPRIATENESS", missing_fields) 59 | originality = Review.get_json_string(json_object, "ORIGINALITY", missing_fields) 60 | presentation_format = Review.get_json_string(json_object, "PRESENTATION_FORMAT", missing_fields) 61 | meaningful_comparison = Review.get_json_string(json_object, "MEANINGFUL_COMPARISON", missing_fields) 62 | reviewer_confidence = Review.get_json_string(json_object, "REVIEWER_CONFIDENCE", missing_fields) 63 | soundness_correctness = Review.get_json_string(json_object, "SOUNDNESS_CORRECTNESS", missing_fields) 64 | impact = 
Review.get_json_string(json_object, "IMPACT", missing_fields) 65 | is_meta_review = Review.get_json_string(json_object, "IS_META_REVIEW", missing_fields) 66 | date = Review.get_json_string(json_object, "DATE", missing_fields) 67 | title = Review.get_json_string(json_object, "TITLE", missing_fields) 68 | other_keys = Review.get_json_string(json_object, "OTHER_KEYS", missing_fields) 69 | 70 | if print_missing_fields and len(missing_fields): 71 | print("The following fields are missing in json input file:",missing_fields) 72 | return Review(recommendation, comments, replicability, presentation_format, clarity, meaningful_comparison, \ 73 | substance, reviewer_confidence, soundness_correctness, appropriateness, impact, originality, \ 74 | other_keys, is_meta_review, title, date, recommendation_unofficial, is_annotated ) 75 | 76 | def to_json_object(self): 77 | data = dict() 78 | 79 | data["comments"] = self.get_comments().decode('cp1252', errors='ignore').encode('utf-8') 80 | 81 | if self.RECOMMENDATION is not None: 82 | data["RECOMMENDATION"] = self.get_recommendation() 83 | 84 | if self.RECOMMENDATION_UNOFFICIAL is not None: 85 | data["RECOMMENDATION_UNOFFICIAL"] = self.get_recommendation_unofficial() 86 | if self.IS_ANNOTATED is not None: 87 | data["IS_ANNOTATED"] = self.get_is_annotated() 88 | 89 | 90 | if self.REPLICABILITY is not None: 91 | data["REPLICABILITY"] = self.get_replicability() 92 | if self.PRESENTATION_FORMAT is not None: 93 | data["PRESENTATION_FORMAT"] = self.get_presentation_format() 94 | if self.CLARITY is not None: 95 | data["CLARITY"] = self.get_clarity() 96 | if self.MEANINGFUL_COMPARISON is not None: 97 | data["MEANINGFUL_COMPARISON"] = self.get_meaningful_comparison() 98 | if self.SUBSTANCE is not None: 99 | data["SUBSTANCE"] = self.get_substance() 100 | if self.REVIEWER_CONFIDENCE is not None: 101 | data["REVIEWER_CONFIDENCE"] = self.get_reviewer_confidence() 102 | if self.SOUNDNESS_CORRECTNESS is not None: 103 | data["SOUNDNESS_CORRECTNESS"] = self.get_soundness_correctness() 104 | if self.APPROPRIATENESS is not None: 105 | data["APPROPRIATENESS"] = self.get_appropriateness() 106 | if self.IMPACT is not None: 107 | data["IMPACT"] = self.get_impact() 108 | if self.ORIGINALITY is not None: 109 | data["ORIGINALITY"] = self.get_originality() 110 | if self.OTHER_KEYS is not None: 111 | data["OTHER_KEYS"] = self.get_other_keys() 112 | if self.IS_META_REVIEW is not None: 113 | data["IS_META_REVIEW"] = self.is_meta_review() 114 | if self.TITLE is not None: 115 | data["TITLE"] = self.get_title() 116 | if self.DATE is not None: 117 | data["DATE"] = self.get_date() 118 | 119 | 120 | return data 121 | 122 | def get_recommendation(self): 123 | return self.RECOMMENDATION 124 | 125 | def get_recommendation_unofficial(self): 126 | return self.RECOMMENDATION_UNOFFICIAL 127 | 128 | def get_is_annotated(self): 129 | return self.IS_ANNOTATED 130 | 131 | def get_comments(self): 132 | return self.COMMENTS 133 | 134 | def get_replicability(self): 135 | return self.REPLICABILITY 136 | 137 | def get_presentation_format(self): 138 | return self.PRESENTATION_FORMAT 139 | 140 | def get_clarity(self): 141 | return self.CLARITY 142 | 143 | def get_meaningful_comparison(self): 144 | return self.MEANINGFUL_COMPARISON 145 | 146 | def get_substance(self): 147 | return self.SUBSTANCE 148 | 149 | def get_reviewer_confidence(self): 150 | return self.REVIEWER_CONFIDENCE 151 | 152 | def get_soundness_correctness(self): 153 | return self.SOUNDNESS_CORRECTNESS 154 | 155 | def get_appropriateness(self): 
156 | return self.APPROPRIATENESS 157 | 158 | def get_impact(self): 159 | return self.IMPACT 160 | 161 | def get_originality(self): 162 | return self.ORIGINALITY 163 | 164 | def get_other_keys(self): 165 | return self.OTHER_KEYS 166 | 167 | def is_meta_review(self): 168 | return self.IS_META_REVIEW 169 | 170 | def get_title(self): 171 | return self.TITLE 172 | 173 | def get_date(self): 174 | return self.DATE 175 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/ScienceParse.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import re 3 | 4 | class ScienceParse: 5 | """ 6 | A data structure for paper fields extracted by ScienceParse 7 | """ 8 | def __init__(self, title, abstract, sections, reference_titles, reference_venues, reference_years, reference_mention_contexts, 9 | reference_num_mentions, authors=None, emails = None, other_keys=None): 10 | self.title = title 11 | self.abstract = abstract 12 | self.sections = sections 13 | self.reference_titles = reference_titles 14 | self.reference_venues = reference_venues 15 | self.reference_years = reference_years 16 | self.reference_mention_contexts = reference_mention_contexts 17 | self.reference_num_mentions = reference_num_mentions 18 | self.authors = authors 19 | self.emails = emails 20 | 21 | def get_sections_dict(self): 22 | return self.sections 23 | 24 | def get_reference_title_dict(self): 25 | return self.reference_titles 26 | 27 | def get_reference_venues_dict(self): 28 | return self.reference_venues 29 | 30 | def get_reference_years_dict(self): 31 | return self.reference_years 32 | 33 | def get_reference_mention_contexts_dict(self): 34 | return self.reference_mention_contexts 35 | 36 | def get_reference_num_mentions_dict(self): 37 | return self.reference_num_mentions 38 | 39 | def get_num_references(self): 40 | return len(self.get_reference_years_dict()) 41 | 42 | def get_num_refmentions(self): 43 | num_refmentions = 0 44 | for refid in self.reference_num_mentions: 45 | num_refmentions = num_refmentions + self.reference_num_mentions[refid] 46 | return num_refmentions 47 | 48 | def get_most_recent_reference_year(self): 49 | most_recent = 0 50 | for refid in self.reference_years: 51 | if self.reference_years[refid] > most_recent: 52 | most_recent = self.reference_years[refid] 53 | return most_recent 54 | 55 | def get_avg_length_reference_mention_contexts(self): 56 | sum_length = 0.0 57 | for refid in self.reference_mention_contexts: 58 | sum_length = sum_length + len(self.reference_mention_contexts[refid]) 59 | avg_length = 0 60 | if len(self.reference_mention_contexts) > 0: 61 | avg_length = sum_length / len(self.reference_mention_contexts) 62 | return avg_length 63 | 64 | def get_paper_content(self): 65 | content = self.title + " " + self.abstract + " " + self.get_author_names_string() + " " + \ 66 | self.get_domains_from_emails() 67 | for sect_id in sorted(self.sections): 68 | # print("###",str(sect_id)) 69 | content = content + " " + self.sections[sect_id] 70 | content = re.sub("\n([0-9]*\n)+", "\n", content) 71 | return content 72 | 73 | def get_tagged_paper_content(self): 74 | content = self.get_paper_content() 75 | 76 | nlp = spacy.load('en', parser=False) 77 | 78 | doc = nlp(content) 79 | 80 | return " ".join([x.text+"_"+x.tag_ for x in doc]) 81 | 82 | def get_frequent_words_proportion(self, hfws, most_frequent_words, least_frequent_words): 83 | content = self.get_paper_content().split() 84 | 85 | n = 0 86 | t = 0 87 
| # print(str(most_frequent_words).encode('utf8')) 88 | for w in content: 89 | if w not in hfws and w not in least_frequent_words: 90 | t += 1 91 | n += w in most_frequent_words 92 | 93 | # print (n,len(content),1.*n/t) 94 | 95 | return 1.*n/t 96 | 97 | # #papers referred from -5 years from year of submission 98 | def get_num_recent_references(self, submission_year): 99 | num_recent_references = 0 100 | for refid in self.reference_years: 101 | if (submission_year - self.reference_years[refid] < 5): 102 | num_recent_references = num_recent_references + 1 103 | return num_recent_references 104 | 105 | # word offset of figure 1 106 | def get_word_offset_of_first_fig_reference(self): 107 | content_words = self.get_paper_content().split(" ") 108 | indices = [i for i, x in enumerate(content_words) if x == "Figure"] 109 | return indices[0] 110 | 111 | # num references to #figures 112 | def get_num_ref_to_figures(self): 113 | content_words = self.get_paper_content().split(" ") 114 | figure_indices = [i for i, x in enumerate(content_words) if x == "Figure"] 115 | return len(figure_indices) 116 | 117 | # num references to #tables 118 | def get_num_ref_to_tables(self): 119 | content_words = self.get_paper_content().split(" ") 120 | table_indices = [i for i, x in enumerate(content_words) if x == "Table"] 121 | return len(table_indices) 122 | 123 | # # of references to Section 124 | def get_num_ref_to_sections(self): 125 | content_words = self.get_paper_content().split(" ") 126 | section_indices = [i for i, x in enumerate(content_words) if x == "Section"] 127 | return len(section_indices) 128 | 129 | # related work at front/back 130 | # #unique words 131 | def get_num_uniq_words(self): 132 | return len(set(self.get_paper_content().split(" "))) 133 | 134 | # num of sections 135 | def get_num_sections(self): 136 | return len(self.sections) 137 | 138 | # avg length of sentences 139 | def get_avg_sentence_length(self): 140 | sentences = self.get_paper_content().split(". 
") 141 | sentence_lengths = [len(s.split(" ")) for s in sentences] 142 | return (1.0 * sum(sentence_lengths))/len(sentence_lengths) 143 | 144 | # whether paper has appendix 145 | def get_contains_appendix(self): 146 | content_words = self.get_paper_content().split(" ") 147 | figure_indices = [i for i, x in enumerate(content_words) if x == "Appendix"] 148 | return int(len(figure_indices) > 0) 149 | 150 | # publishing a dataset / code 151 | def get_contains_appendix(self): 152 | content_words = self.get_paper_content().split(" ") 153 | figure_indices = [i for i, x in enumerate(content_words) if x == "Appendix"] 154 | return int(len(figure_indices) > 0) 155 | 156 | # #authors 157 | def get_num_authors(self): 158 | if self.authors == None: 159 | return 0 160 | return len(self.authors) 161 | 162 | # get author names as a string 163 | def get_author_names_string(self): 164 | if self.authors == None: 165 | return "" 166 | return str.join(' ', self.authors) 167 | 168 | # get domains from emails 169 | def get_domains_from_emails(self): 170 | domains = [] 171 | for email in self.emails: 172 | domains.append(email.split('@')[1].replace(".", "_")) 173 | return str.join(' ', domains) 174 | 175 | # num references to equations 176 | def get_num_ref_to_equations(self): 177 | content_words = self.get_paper_content().split(" ") 178 | equation_indices = [i for i, x in enumerate(content_words) if x == "Equation"] 179 | return len(equation_indices) 180 | 181 | # num references to theorems 182 | def get_num_ref_to_theorems(self): 183 | content_words = self.get_paper_content().split(" ") 184 | theorem_indices = [i for i, x in enumerate(content_words) if x == "Theorem"] 185 | return len(theorem_indices) 186 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/ScienceParseReader.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import io 3 | import os 4 | import json 5 | from PeerRead.ScienceParse.ScienceParse import ScienceParse 6 | 7 | class ScienceParseReader: 8 | """ 9 | This class reads the output of the science parse library and stores it in theScienceParseclass 10 | """ 11 | 12 | @staticmethod 13 | def read_science_parse(paperid, title, abstract, scienceparse_dir): 14 | scienceparse_file = io.open(os.path.join(scienceparse_dir, '{0}.pdf.json'.format(paperid))) 15 | # scienceparse_file = io.open('%s%s.pdf.json'%(scienceparse_dir,paperid), "r", encoding="utf8") 16 | scienceparse_str = scienceparse_file.read() 17 | scienceparse_data = json.loads(scienceparse_str) 18 | 19 | #read scienceparse 20 | scienceparse_map = {} 21 | 22 | sections = {} 23 | reference_years = {} 24 | reference_titles = {} 25 | reference_venues = {} 26 | reference_mention_contexts = {} 27 | reference_num_mentions = {} 28 | 29 | name = scienceparse_data["name"] 30 | metadata = scienceparse_data["metadata"] 31 | 32 | if metadata["sections"] is not None: 33 | for sectid in range(len(metadata["sections"])): 34 | heading = metadata["sections"][sectid]["heading"] 35 | text = metadata["sections"][sectid]["text"] 36 | sections[str(heading)] = text 37 | 38 | for refid in range(len(metadata["references"])): 39 | reference_titles[refid] = metadata["references"][refid]["title"] 40 | reference_years[refid] = metadata["references"][refid]["year"] 41 | reference_venues[refid] = metadata["references"][refid]["venue"] 42 | 43 | for menid in range(len(metadata["referenceMentions"])): 44 | refid = 
metadata["referenceMentions"][menid]["referenceID"] 45 | context = metadata["referenceMentions"][menid]["context"] 46 | oldContext = reference_mention_contexts.get(refid, "") 47 | reference_mention_contexts[refid] = oldContext + "\t" + context 48 | count = reference_num_mentions.get(refid, 0) 49 | reference_num_mentions[refid] = count + 1 50 | 51 | authors = metadata["authors"] 52 | emails = metadata["emails"] 53 | #print(authors) 54 | #print(emails) 55 | 56 | science_parse = ScienceParse(title, abstract, sections, reference_titles, reference_venues, reference_years, reference_mention_contexts, reference_num_mentions, authors, emails) 57 | return science_parse 58 | -------------------------------------------------------------------------------- /src/PeerRead/ScienceParse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/ScienceParse/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/PeerRead_hand_features.py: -------------------------------------------------------------------------------- 1 | """ 2 | create (hand-authored and lexical) features for baselines classifiers and save to under dataset folder in each split 3 | """ 4 | 5 | import sys, os, random, glob 6 | 7 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 8 | from PeerRead.ScienceParse.Paper import Paper 9 | from PeerRead.ScienceParse.ScienceParseReader import ScienceParseReader 10 | 11 | 12 | def get_PeerRead_hand_features(paper): 13 | sp = paper.get_scienceparse() 14 | 15 | hand_features = {} 16 | 17 | hand_features["accepted"] = paper.get_accepted() 18 | 19 | hand_features["most_recent_reference_year"] = sp.get_most_recent_reference_year() - 2000 20 | hand_features["num_recent_references"] = sp.get_num_recent_references(2017) 21 | hand_features["num_references"] = sp.get_num_references() 22 | hand_features["num_refmentions"] = sp.get_num_refmentions() 23 | hand_features["avg_length_reference_mention_contexts"] = sp.get_avg_length_reference_mention_contexts() 24 | 25 | hand_features["num_ref_to_figures"] = sp.get_num_ref_to_figures() 26 | hand_features["num_ref_to_tables"] = sp.get_num_ref_to_tables() 27 | hand_features["num_ref_to_sections"] = sp.get_num_ref_to_sections() 28 | 29 | hand_features["num_uniq_words"] = sp.get_num_uniq_words() 30 | hand_features["num_sections"] = sp.get_num_sections() 31 | hand_features["avg_sentence_length"] = sp.get_avg_sentence_length() 32 | 33 | hand_features["contains_appendix"] = sp.get_contains_appendix() 34 | 35 | hand_features["title_length"] = paper.get_title_len() 36 | hand_features["num_authors"] = sp.get_num_authors() 37 | hand_features["num_ref_to_equations"] = sp.get_num_ref_to_equations() 38 | hand_features["num_ref_to_theorems"] = sp.get_num_ref_to_theorems() 39 | 40 | abstract = str.lower(paper.ABSTRACT) 41 | hand_features["abstract_contains_deep"] = "deep" in abstract 42 | hand_features["abstract_contains_neural"] = "neural" in abstract 43 | hand_features["abstract_contains_embedding"] = "embedding" in abstract 44 | 
hand_features["abstract_contains_outperform"] = "outperform" in abstract 45 | hand_features["abstract_contains_novel"] = "novel" in abstract 46 | hand_features["abstract_contains_state-of-the-art"] = \ 47 | "state-of-the-art" in abstract or "state of the art" in abstract 48 | 49 | title = str.lower(paper.TITLE) 50 | hand_features["title_contains_deep"] = "deep" in title 51 | hand_features["title_contains_neural"] = "neural" in title 52 | hand_features["title_contains_embedding"] = "embed" in title 53 | hand_features["title_contains_gan"] = ("gan" in title) or ("adversarial net" in title) 54 | 55 | return hand_features 56 | 57 | 58 | def main(args): 59 | 60 | paper_json_dir = args[1] # train/reviews 61 | scienceparse_dir = args[2] # train/parsed_pdfs 62 | 63 | 64 | ################################ 65 | # read reviews 66 | ################################ 67 | print('Reading reviews from...', paper_json_dir) 68 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(paper_json_dir))) 69 | papers = [] 70 | for paper_json_filename in paper_json_filenames: 71 | paper = Paper.from_json(paper_json_filename) 72 | paper.SCIENCEPARSE = ScienceParseReader.read_science_parse(paper.ID, paper.TITLE, paper.ABSTRACT, 73 | scienceparse_dir) 74 | papers.append(paper) 75 | random.shuffle(papers) 76 | print('Total number of reviews', len(papers)) 77 | 78 | id = 1 79 | for p in papers: 80 | rec = int(p.get_accepted() == True) 81 | 82 | handy = get_PeerRead_hand_features(p) 83 | 84 | id += 1 85 | 86 | 87 | if __name__ == "__main__": 88 | main(sys.argv) 89 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/data_cleaning/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/clean_PeerRead.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import bert.tokenization as tokenization 5 | from PeerRead.data_cleaning.process_PeerRead_abstracts import clean_PeerRead_dataset 6 | 7 | dataset_names = ['acl_2017', 8 | 'arxiv.cs.ai_2007-2017', 9 | 'arxiv.cs.cl_2007-2017', 10 | 'arxiv.cs.lg_2007-2017', 11 | 'conll_2016', 12 | 'iclr_2017', 13 | 'nips_2013', 14 | 'nips_2014', 15 | 'nips_2015', 16 | 'nips_2016', 17 | 'nips_2017' 18 | ] 19 | 20 | dataset_paths = ['acl_2017', 21 | 'arxiv.cs.ai_2007-2017', 22 | 'arxiv.cs.cl_2007-2017', 23 | 'arxiv.cs.lg_2007-2017', 24 | 'conll_2016', 25 | 'iclr_2017', 26 | 'nips_2013-2017/2013', 27 | 'nips_2013-2017/2014', 28 | 'nips_2013-2017/2015', 29 | 'nips_2013-2017/2016', 30 | 'nips_2013-2017/2017' 31 | ] 32 | 33 | dataset_paths = dict(zip(dataset_names, dataset_paths)) 34 | 35 | dataset_years = {'acl_2017': 2017, 36 | 'conll_2016': 2016, 37 | 'iclr_2017': 2017, 38 | 'arxiv.cs.ai_2007-2017': None, 39 | 'arxiv.cs.cl_2007-2017': None, 40 | 'arxiv.cs.lg_2007-2017': None, 41 | 'nips_2013': 2013, 42 | 'nips_2014': 2014, 43 | 'nips_2015': 2015, 44 | 'nips_2016': 2016, 45 | 'nips_2017': 2017} 46 | 47 | # dataset_venues = {k: v for v,k in enumerate(dataset_names)} 48 | 49 | dataset_venues = {'acl_2017': 0, 50 | 'conll_2016': 1, 51 | 'iclr_2017': 2, 52 | 'nips_2013': 3, 53 | 'nips_2014': 3, 54 | 'nips_2015': 3, 55 | 'nips_2016': 3, 56 | 'nips_2017': 3, 57 | 'arxiv.cs.ai_2007-2017': 4, 58 
| 'arxiv.cs.cl_2007-2017': 5, 59 | 'arxiv.cs.lg_2007-2017': 6, 60 | } 61 | 62 | 63 | def main(): 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead') 66 | parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt') 67 | args = parser.parse_args() 68 | 69 | datasets_dir = args.datasets_dir 70 | tokenizer = tokenization.FullTokenizer( 71 | vocab_file=args.vocab_file, do_lower_case=True) 72 | 73 | def proc_dataset(dataset): 74 | all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all') 75 | review_json_dir = os.path.join(all_dir, 'reviews') 76 | parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs') 77 | 78 | venue = dataset_venues[dataset] 79 | year = dataset_years[dataset] 80 | 81 | out_dir = os.path.join(datasets_dir, 'proc') 82 | out_file = dataset + '.tf_record' 83 | max_abs_len = 250 84 | 85 | clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir, venue, year, out_dir, out_file, max_abs_len, 86 | tokenizer) 87 | 88 | # pool = mp.Pool(4) 89 | # pool.map(proc_dataset, dataset_names) 90 | 91 | for dataset in dataset_names: 92 | proc_dataset(dataset) 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/extra_vocab.py: -------------------------------------------------------------------------------- 1 | """ 2 | vv: wrote this to inspect what bert's tokenizer does with vocabulary terms it doesn't know. 3 | The answer is: it splits them into word pieces where it has embeddings for each piece. Example: 4 | 5 | tokenizer.tokenize('embedding') 6 | ['em', '##bed', '##ding'] 7 | 8 | tokenizer.convert_tokens_to_ids(['em', '##bed', '##ding']) 9 | [7861, 8270, 4667] 10 | 11 | Accordingly, the meaning of embedding can be learned so long as there's a suitably rich training corpus 12 | """ 13 | 14 | import argparse 15 | import glob 16 | import random 17 | 18 | import io 19 | import json 20 | 21 | import bert.tokenization as tokenization 22 | 23 | rng = random.Random(0) 24 | 25 | def main(): 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--review-json-dir', type=str, default=None) 29 | parser.add_argument('--vocab-file', type=str, default=None) 30 | 31 | args = parser.parse_args() 32 | 33 | tokenizer = tokenization.FullTokenizer( 34 | vocab_file=args.vocab_file, do_lower_case=True) 35 | 36 | review_json_dir = args.review_json_dir 37 | 38 | print('Reading reviews from...', review_json_dir) 39 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir))) 40 | 41 | paper_json_filename = paper_json_filenames[0] 42 | with io.open(paper_json_filename) as json_file: 43 | loaded = json.load(json_file) 44 | abstract = loaded['abstract'] 45 | print(abstract) 46 | tokens = tokenizer.tokenize(abstract) 47 | print(tokens) 48 | print(tokenizer.convert_tokens_to_ids(tokens)) 49 | 50 | # for idx, paper_json_filename in enumerate(paper_json_filenames): 51 | # with io.open(paper_json_filename) as json_file: 52 | # loaded = json.load(json_file) 53 | # 54 | # print(loaded['abstract']) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/scripts/clean_PeerRead.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Process all PeerRead data into tf_record format to feed 
into Bert 4 | 5 | PeerDir=../dat/PeerRead/ 6 | 7 | for dataset in $PeerDir*/; do 8 | echo $dataset 9 | # python -m data_cleaning.process_PeerRead_abstracts \ 10 | # --review-json-dir \ 11 | # --parsedpdf-json-dir \ 12 | # --out-dir \ 13 | # --out-file \ 14 | # --vocab_file \ 15 | # --max_abs_len 16 | done -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/scripts/clean_nips_prefix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PeerDir=../dat/PeerRead/nips_2013-2017 4 | PARSE_DIR=$PeerDir/2017/all/parsed_pdfs 5 | 6 | for pdf in $PARSE_DIR/*; do 7 | # echo $pdf 8 | mv $pdf $PARSE_DIR/"${pdf#*/pdfs}" 9 | done 10 | -------------------------------------------------------------------------------- /src/PeerRead/data_cleaning/scripts/merge_train_dev_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Each Peer read dataset is pre-divided into train/dev/test. Merge these into "all" 4 | 5 | #PeerDir=../dat/PeerRead 6 | PeerDir=../dat/PeerRead/nips_2013-2017 7 | 8 | for dir in $PeerDir*/; do 9 | for subdir in $dir*/; do 10 | echo $subdir; 11 | cp -RT $subdir/ $dir/all/ 12 | done 13 | done 14 | -------------------------------------------------------------------------------- /src/PeerRead/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/dataset/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/dataset/array_from_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | helpers to take samples from the dataset and turn them into numpy arrays 3 | (for ease of inspection and use with baselines) 4 | """ 5 | import argparse 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | import os 10 | try: 11 | import mkl_random as random 12 | except ImportError: 13 | import numpy.random as random 14 | 15 | import bert.tokenization as tokenization 16 | from PeerRead.dataset.dataset import make_input_fn_from_file, make_buzzy_based_simulated_labeler 17 | 18 | 19 | def dataset_fn_to_df(dataset_fn): 20 | 21 | params = {'batch_size': 1} 22 | dataset = dataset_fn(params) 23 | 24 | itr = dataset.make_one_shot_iterator() 25 | 26 | samples = [] 27 | 28 | for i in range(25000): 29 | try: 30 | sample = itr.get_next() 31 | for k in sample: 32 | sample[k] = sample[k].numpy()[0] 33 | samples += [sample] 34 | # print("year: {}".format(sample['year'])) 35 | except: 36 | print(i) 37 | break 38 | 39 | df = pd.DataFrame(samples) 40 | 41 | return df 42 | 43 | def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0, 44 | base_output_dir='../dat/sim/peerread_buzzytitle_based/'): 45 | 46 | labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed) 47 | 48 | num_splits = 10 49 | dev_splits = [0] 50 | test_splits = [0] 51 | 52 | # data_file = '../dat/reddit/proc.tf_record' 53 | # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt" 54 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True) 55 | 56 | input_dataset_from_filenames = make_input_fn_from_file(data_file, 57 | 250, 58 | num_splits, 59 | dev_splits, 60 | 
test_splits, 61 | tokenizer, 62 | is_training=False, 63 | filter_test=False, 64 | shuffle_buffer_size=25000, 65 | seed=seed, 66 | labeler=labeler) 67 | 68 | output_df = dataset_fn_to_df(input_dataset_from_filenames) 69 | output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'}) 70 | 71 | output_dir = os.path.join(base_output_dir, "mode{}".format(setting)) 72 | os.makedirs(output_dir, exist_ok=True) 73 | output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level)) 74 | 75 | output_df.to_csv(output_path, '\t') 76 | 77 | 78 | def main(): 79 | tf.enable_eager_execution() 80 | 81 | buzzy_title_based_sim_dfs(treat_strength=beta0, con_strength=beta1, noise_level=gamma, setting=mode, seed=0, 82 | base_output_dir=base_output_dir) 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument("--data-file", action="store", default='../dat/PeerRead/proc/arxiv-all.tf_record') 87 | parser.add_argument("--vocab-file", action="store", default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt') 88 | parser.add_argument("--base-output-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 89 | parser.add_argument("--mode", action="store", default="simple") 90 | parser.add_argument("--beta0", action="store", default='1.0') 91 | parser.add_argument("--beta1", action="store", default='1.0') 92 | parser.add_argument("--gamma", action="store", default='1.0') 93 | args = parser.parse_args() 94 | 95 | data_file = args.data_file 96 | vocab_file = args.vocab_file 97 | base_output_dir = args.base_output_dir 98 | mode = args.mode 99 | beta0 = float(args.beta0) 100 | beta1 = float(args.beta1) 101 | gamma = float(args.gamma) 102 | 103 | main() -------------------------------------------------------------------------------- /src/PeerRead/dataset/sentence_masking.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific PeerRead governing permissions and 14 | # limitations under the License. 15 | """Create masked LM TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | 23 | import tensorflow as tf 24 | 25 | 26 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 27 | ["index", "label"]) 28 | 29 | 30 | def create_masked_lm_predictions(token_ids, masked_lm_prob, max_predictions_per_seq, vocab, seed): 31 | """Creates the predictions for the masked LM objective. 
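Illustrative behaviour on a hypothetical 7-token sequence, assuming the random draw
happens to select w2 and w3 and max_predictions_per_seq = 3:

    token_ids           = [CLS] w1  w2     w3     [SEP] 0 0
    output_ids          = [CLS] w1  [MASK] [MASK] [SEP] 0 0
    masked_lm_positions = [2, 3, 0]            # padded out to max_predictions_per_seq
    masked_lm_ids       = [id(w2), id(w3), 0]
    masked_lm_weights   = [1.0, 1.0, 0.0]      # 0.0 marks the padding entry

[CLS], [SEP] and padding (token id 0) are never candidates for masking.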
32 | 33 | This should be essentially equivalent to the bits that Bert loads from pre-processed tfrecords 34 | 35 | Except: we just include masks instead of randomly letting the words through or randomly replacing 36 | """ 37 | 38 | basic_mask = tf.less( 39 | tf.random_uniform(token_ids.shape, minval=0, maxval=1, dtype=tf.float32, seed=seed), 40 | masked_lm_prob) 41 | 42 | # don't mask special characters or padding 43 | cand_indexes = tf.logical_and(tf.not_equal(token_ids, vocab["[CLS]"]), 44 | tf.not_equal(token_ids, vocab["[SEP]"])) 45 | cand_indexes = tf.logical_and(cand_indexes, tf.not_equal(token_ids, 0)) 46 | mask = tf.logical_and(cand_indexes, basic_mask) 47 | 48 | # truncate to max predictions for ease of padding 49 | masked_lm_positions = tf.where(mask) 50 | # TODO: it should be essentially impossible for me to see this bug (very unlikely), but I do... symptom of :( ? 51 | # very rare event: nothing gets picked for mask, causing an irritating bug 52 | # in this case, just mask the first candidate index 53 | mlm_shape = tf.shape(masked_lm_positions)[0] 54 | masked_lm_positions = tf.cond(mlm_shape > 1, 55 | lambda: masked_lm_positions, 56 | lambda: tf.where(cand_indexes)[0:2]) 57 | 58 | masked_lm_positions = tf.squeeze(masked_lm_positions)[0:max_predictions_per_seq] 59 | masked_lm_positions = tf.cast(masked_lm_positions, dtype=tf.int32) 60 | masked_lm_ids = tf.gather(token_ids, masked_lm_positions) 61 | 62 | mask = tf.cast( 63 | tf.scatter_nd(tf.expand_dims(masked_lm_positions, 1), tf.ones_like(masked_lm_positions), token_ids.shape), 64 | bool) 65 | 66 | output_ids = tf.where(mask, vocab["[MASK]"]*tf.ones_like(token_ids), token_ids) 67 | 68 | # pad out to max_predictions_per_seq 69 | masked_lm_weights = tf.ones_like(masked_lm_ids, dtype=tf.float32) # tracks padding 70 | add_pad = [[0, max_predictions_per_seq - tf.shape(masked_lm_positions)[0]]] 71 | masked_lm_weights = tf.pad(masked_lm_weights, add_pad, 'constant') 72 | masked_lm_positions = tf.pad(masked_lm_positions, add_pad, 'constant') 73 | masked_lm_ids = tf.pad(masked_lm_ids, add_pad, 'constant') 74 | 75 | return output_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights 76 | 77 | 78 | def main(_): 79 | pass 80 | 81 | 82 | if __name__ == "__main__": 83 | main() -------------------------------------------------------------------------------- /src/PeerRead/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/PeerRead/model/__init__.py -------------------------------------------------------------------------------- /src/PeerRead/model/bert_multiclass.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper to check which categorical attributes of PeerRead are predictable from the text 3 | """ 4 | 5 | import tensorflow as tf 6 | import bert.modeling as modeling 7 | import bert.optimization as optimization 8 | from causal_bert.bert_unsupervised import get_masked_lm_output 9 | from causal_bert.logging import make_label_binary_prediction_summaries, binary_label_eval_metric_fn 10 | 11 | 12 | def _create_unsupervised_only_model(bert, bert_config, features): 13 | # PeerRead v. 
reddit inconsistency 14 | if "op_masked_lm_positions" in features: 15 | masked_lm_positions = features["op_masked_lm_positions"] 16 | masked_lm_ids = features["op_masked_lm_ids"] 17 | masked_lm_weights = features["op_masked_lm_weights"] 18 | else: 19 | masked_lm_positions = features["masked_lm_positions"] 20 | masked_lm_ids = features["masked_lm_ids"] 21 | masked_lm_weights = features["masked_lm_weights"] 22 | 23 | masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = get_masked_lm_output( 24 | bert_config, bert.get_sequence_output(), bert.get_embedding_table(), 25 | masked_lm_positions, masked_lm_ids, masked_lm_weights) 26 | return masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs 27 | 28 | 29 | def _make_feedforward_classifier(embedding, labels, num_labels, split, num_hidden_layers, extra_features=None, 30 | label_smoothing=0.01): 31 | regularizer = tf.contrib.layers.l2_regularizer(scale=1e-6) 32 | if extra_features is None: 33 | full_embedding = embedding 34 | else: 35 | full_embedding = tf.concat([embedding, extra_features], axis=1) 36 | 37 | if num_hidden_layers == 0: 38 | logits = tf.layers.dense(full_embedding, num_labels, activation=None, 39 | kernel_regularizer=regularizer, bias_regularizer=regularizer) 40 | 41 | else: 42 | layer = tf.layers.dense(full_embedding, 200, activation=tf.nn.elu) 43 | for _ in range(num_hidden_layers - 1): 44 | layer = tf.layers.dense(layer, 200, activation=tf.nn.elu, 45 | kernel_regularizer=regularizer, bias_regularizer=regularizer) 46 | 47 | if extra_features is None: 48 | final_embedding = layer 49 | else: 50 | final_embedding = tf.concat([layer, extra_features], axis=1) 51 | 52 | logits = tf.layers.dense(final_embedding, num_labels, activation=None, 53 | kernel_regularizer=regularizer, bias_regularizer=regularizer) 54 | 55 | with tf.name_scope("loss"): 56 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32, 57 | on_value=1. 
- label_smoothing, off_value=label_smoothing) 58 | log_probs = tf.nn.log_softmax(logits, axis=-1) 59 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 60 | censored_per_example_loss = split * per_example_loss 61 | loss = tf.reduce_sum(censored_per_example_loss) 62 | 63 | probabilities = tf.nn.softmax(logits, axis=-1)[:, 1] # P(T=1) 64 | 65 | return loss, per_example_loss, logits, probabilities 66 | 67 | 68 | def _get_getter(ema): 69 | def ema_getter(getter, name, *args, **kwargs): 70 | var = getter(name, *args, **kwargs) 71 | ema_var = ema.average(var) 72 | return ema_var # if ema_var else var 73 | 74 | return ema_getter 75 | 76 | 77 | def multiclass_model_fn_builder(bert_config, init_checkpoint, learning_rate, 78 | num_train_steps, num_warmup_steps, use_tpu, 79 | use_one_hot_embeddings, label_pred=True, unsupervised=False, 80 | polyak=False, use_extra_features=False): 81 | """Returns `model_fn` closure for TPUEstimator.""" 82 | 83 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 84 | """The `model_fn` for TPUEstimator.""" 85 | 86 | tf.logging.info("*** Features ***") 87 | for name in sorted(features.keys()): 88 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 89 | 90 | target_name = params['target_name'] 91 | num_labels = params['num_labels'] 92 | 93 | labels = features[target_name] 94 | 95 | # because reddit and peerread use slightly different text and pre-training structure 96 | if "op_token_ids" in features: 97 | token_mask = features["op_token_mask"] 98 | maybe_masked_token_ids = features["op_maybe_masked_input_ids"] 99 | else: 100 | token_mask = features["token_mask"] 101 | maybe_masked_token_ids = features["maybe_masked_input_ids"] 102 | 103 | index = features['index'] 104 | in_train = features['in_train'] 105 | in_dev = features['in_dev'] 106 | in_test = features['in_test'] 107 | 108 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 109 | 110 | # Predictive Model 111 | 112 | bert = modeling.BertModel( 113 | config=bert_config, 114 | is_training=is_training, 115 | input_ids=maybe_masked_token_ids, 116 | input_mask=token_mask, 117 | token_type_ids=None, 118 | use_one_hot_embeddings=use_one_hot_embeddings) 119 | 120 | masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = \ 121 | _create_unsupervised_only_model(bert, bert_config, features) 122 | 123 | bert_embedding = bert.get_pooled_output() 124 | 125 | label_loss, per_example_loss, logits, probabilities = \ 126 | _make_feedforward_classifier(bert_embedding, labels, num_labels, in_train, num_hidden_layers=0, 127 | extra_features=None, label_smoothing=0.01) 128 | 129 | tf.losses.add_loss(masked_lm_loss) 130 | tf.losses.add_loss(0.1 * label_loss) 131 | 132 | tf.summary.scalar('masked_lm_loss', masked_lm_loss, family='loss') 133 | tf.summary.scalar('label_loss', label_loss, family='loss') 134 | 135 | total_loss = masked_lm_loss + 0.1 * label_loss 136 | 137 | # some logging 138 | make_label_binary_prediction_summaries(per_example_loss, logits, labels, in_train, "train") 139 | make_label_binary_prediction_summaries(per_example_loss, logits, labels, in_dev, "dev") 140 | 141 | # pre-trained model loading 142 | tvars = tf.trainable_variables() 143 | initialized_variable_names = {} 144 | scaffold_fn = None 145 | if init_checkpoint: 146 | (assignment_map, initialized_variable_names 147 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 148 | if use_tpu: 149 | 150 | def tpu_scaffold(): 151 | 
tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 152 | return tf.train.Scaffold() 153 | 154 | scaffold_fn = tpu_scaffold 155 | else: 156 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 157 | 158 | tf.logging.info("**** Trainable Variables ****") 159 | for var in tvars: 160 | init_string = "" 161 | if var.name in initialized_variable_names: 162 | init_string = ", *INIT_FROM_CKPT*" 163 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 164 | init_string) 165 | 166 | output_spec = None 167 | if mode == tf.estimator.ModeKeys.TRAIN: 168 | 169 | # sgd_opt = tf.train.GradientDescentOptimizer(learning_rate) 170 | # train_op = sgd_opt.minimize(total_loss, global_step=tf.train.get_global_step()) 171 | 172 | train_op = optimization.create_optimizer( 173 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 174 | 175 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 176 | mode=mode, 177 | loss=total_loss, 178 | train_op=train_op, 179 | scaffold_fn=scaffold_fn) 180 | 181 | elif mode == tf.estimator.ModeKeys.EVAL: 182 | pass 183 | 184 | else: 185 | pass 186 | 187 | return output_spec 188 | 189 | return model_fn 190 | -------------------------------------------------------------------------------- /src/PeerRead/submit_scripts/run_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12 4 | DATA_FILE=../dat/PeerRead/proc/arxiv-all.tf_record 5 | OUTPUT_DIR=../output/PeerRead/local_test 6 | #INIT_DIR=../../output/unsupervised_PeerRead_embeddings/ 7 | #INIT_FILE=$INIT_DIR/model.ckpt-175000 8 | 9 | 10 | #rm -rf $OUTPUT_DIR 11 | 12 | python -m PeerRead.model.run_causal_bert \ 13 | --seed=0 \ 14 | --do_train=true \ 15 | --do_eval=false \ 16 | --do_predict=true \ 17 | --input_files_or_glob=$DATA_FILE \ 18 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 19 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 20 | --max_seq_length=250 \ 21 | --output_dir=$OUTPUT_DIR \ 22 | --train_batch_size=16 \ 23 | --learning_rate=3e-5 \ 24 | --num_warmup_steps 200 \ 25 | --num_train_steps=4500 \ 26 | --save_checkpoint_steps=3000 \ 27 | --unsupervised=True \ 28 | --label_pred=True \ 29 | --num_splits=10 \ 30 | --test_splits=0 \ 31 | --dev_splits=0 \ 32 | --simulated='real' \ 33 | --treatment='buzzy_title' 34 | # --init_checkpoint=${INIT_FILE} 35 | -------------------------------------------------------------------------------- /src/PeerRead/submit_scripts/run_unsupervised.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BERT_BASE_DIR=../../BERT_pre-trained/uncased_L-12_H-768_A-12 4 | DATA_FILE=../dat/PeerRead/proc/arxiv-all.tf_record 5 | OUTPUT_DIR=../../output/unsupervised_PeerRead_embeddings/ 6 | 7 | #rm -rf $OUTPUT_DIR 8 | python -m PeerRead.model.run_causal_bert \ 9 | --seed=0 \ 10 | --do_train=true \ 11 | --input_files_or_glob=${DATA_FILE} \ 12 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \ 13 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \ 14 | --output_dir=${OUTPUT_DIR} \ 15 | --max_seq_length=250 \ 16 | --train_batch_size=16 \ 17 | --learning_rate=3e-5 \ 18 | --num_warmup_steps 200 \ 19 | --num_train_steps=175000 \ 20 | --save_checkpoints_steps 5000 \ 21 | --keep_checkpoints 3 \ 22 | --unsupervised=True -------------------------------------------------------------------------------- /src/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/__init__.py -------------------------------------------------------------------------------- /src/bert/README: -------------------------------------------------------------------------------- 1 | Chunks of google's Bert code, https://github.com/google-research/bert 2 | 3 | pre-trained presumed to be in: 4 | '../../bert/pre-trained/uncased_L-12_H-768_A-12' -------------------------------------------------------------------------------- /src/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/bert/__init__.py -------------------------------------------------------------------------------- /src/bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 
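  # Worked example of the schedule above, assuming the values used in
  # PeerRead/submit_scripts/run_model.sh (init_lr=3e-5, num_warmup_steps=200,
  # num_train_steps=4500):
  #   step 100:    warmup, lr = 3e-5 * 100/200 = 1.5e-5
  #   step >= 200: linear decay, lr = 3e-5 * (1 - step/4500)
  #                e.g. step 2250 -> 1.5e-5, step 4500 -> 0.0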
59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 85 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 86 | 87 | def __init__(self, 88 | learning_rate, 89 | weight_decay_rate=0.0, 90 | beta_1=0.9, 91 | beta_2=0.999, 92 | epsilon=1e-6, 93 | exclude_from_weight_decay=None, 94 | name="AdamWeightDecayOptimizer"): 95 | """Constructs a AdamWeightDecayOptimizer.""" 96 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 97 | 98 | self.learning_rate = learning_rate 99 | self.weight_decay_rate = weight_decay_rate 100 | self.beta_1 = beta_1 101 | self.beta_2 = beta_2 102 | self.epsilon = epsilon 103 | self.exclude_from_weight_decay = exclude_from_weight_decay 104 | 105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 106 | """See base class.""" 107 | assignments = [] 108 | for (grad, param) in grads_and_vars: 109 | if grad is None or param is None: 110 | continue 111 | 112 | param_name = self._get_variable_name(param.name) 113 | 114 | m = tf.get_variable( 115 | name=param_name + "/adam_m", 116 | shape=param.shape.as_list(), 117 | dtype=tf.float32, 118 | trainable=False, 119 | initializer=tf.zeros_initializer()) 120 | v = tf.get_variable( 121 | name=param_name + "/adam_v", 122 | shape=param.shape.as_list(), 123 | dtype=tf.float32, 124 | trainable=False, 125 | initializer=tf.zeros_initializer()) 126 | 127 | # Standard Adam update. 128 | next_m = ( 129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 130 | next_v = ( 131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 132 | tf.square(grad))) 133 | 134 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 139 | # 140 | # Instead we want ot decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 
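      # Putting the update together (lr = self.learning_rate, wd = self.weight_decay_rate):
      #   update     = next_m / (sqrt(next_v) + epsilon)   # Adam step, no bias correction
      #   update    += wd * param                          # only for params not excluded below
      #   next_param = param - lr * update
      # The decay acts directly on the weights ("decoupled" weight decay, as in AdamW),
      # rather than being added to the loss as an L2 penalty.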
143 | if self._do_use_weight_decay(param_name): 144 | update += self.weight_decay_rate * param 145 | 146 | update_with_lr = self.learning_rate * update 147 | 148 | next_param = param - update_with_lr 149 | 150 | assignments.extend( 151 | [param.assign(next_param), 152 | m.assign(next_m), 153 | v.assign(next_v)]) 154 | return tf.group(*assignments, name=name) 155 | 156 | def _do_use_weight_decay(self, param_name): 157 | """Whether to use L2 weight decay for `param_name`.""" 158 | if not self.weight_decay_rate: 159 | return False 160 | if self.exclude_from_weight_decay: 161 | for r in self.exclude_from_weight_decay: 162 | if re.search(r, param_name) is not None: 163 | return False 164 | return True 165 | 166 | def _get_variable_name(self, param_name): 167 | """Get the variable name from the tensor name.""" 168 | m = re.match("^(.*):\\d+$", param_name) 169 | if m is not None: 170 | param_name = m.group(1) 171 | return param_name 172 | -------------------------------------------------------------------------------- /src/causal_bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/causal_bert/__init__.py -------------------------------------------------------------------------------- /src/causal_bert/logging.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def batch_random_agreement(labels, predictions, weights, name=None): 5 | """ Computes the probability of random agreement between the 6 | labels and predictions assuming independence. 7 | 8 | Parameters 9 | ---------- 10 | labels: a tensor of any shape taking values in {0, 1}. 11 | predictions: a tensor of the same shape as labels taking values in {0, 1}. 12 | weights: a tensor that can be broadcasted to labels. 13 | name: an optional name for the operation. 14 | 15 | Returns 16 | ------- 17 | random_agreement: a scalar tensor representing the probability of random 18 | agreement. 19 | """ 20 | with tf.name_scope(name, 'batch_random_agreement', [labels, predictions, weights]): 21 | weights_mean = tf.reduce_mean(weights) 22 | weights_mean = tf.where(tf.not_equal(weights_mean, 0), weights_mean, 1) 23 | 24 | labels = tf.to_float(labels) 25 | predictions = tf.to_float(predictions) 26 | 27 | p_labels = tf.metrics.mean(labels * weights / weights_mean)[1] 28 | p_predictions = tf.metrics.mean(predictions * weights / weights_mean)[1] 29 | 30 | random_agreement = tf.identity( 31 | p_labels * p_predictions + (1 - p_labels) * (1 - p_predictions), 32 | name='random_agreement') 33 | 34 | print(random_agreement.name) 35 | 36 | return random_agreement 37 | 38 | 39 | def batch_kappa(labels, predictions, weights, name=None): 40 | """ Computes Cohen's kappa on the given batch of predictions. 41 | 42 | Parameters 43 | ---------- 44 | labels: a tensor of any shape taking values in {0, 1}. 45 | predictions: a tensor of the same shape as labels taking values in {0, 1}. 46 | weights: a tensor that can be broadcasted to labels. 47 | name: an optional name for the operation. 48 | 49 | Returns 50 | ------- 51 | kappa: a scalar tensor representing the Kappa measure of agreement 52 | between labels and predictions. 
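    Concretely, this is computed as

        kappa = (accuracy - p_e) / (1 - p_e)

    where p_e = p_labels * p_predictions + (1 - p_labels) * (1 - p_predictions) is the
    chance-agreement probability from batch_random_agreement above. For example,
    accuracy = 0.8 with p_e = 0.5 gives kappa = 0.6.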
53 | """ 54 | with tf.name_scope(name, 'batch_kappa', [labels, predictions, weights]): 55 | accuracy = tf.metrics.accuracy(labels, predictions, weights=weights)[1] 56 | random_agreement = batch_random_agreement(labels, predictions, weights) 57 | 58 | # hack for small batch sizes 59 | random_agreement = tf.clip_by_value(random_agreement, 0.001, 0.999) 60 | 61 | kappa = tf.divide( 62 | accuracy - random_agreement, 1 - random_agreement, 63 | name='kappa') 64 | 65 | return kappa 66 | 67 | 68 | def make_label_binary_prediction_summaries(per_example_loss, logits, label_ids, split, family): 69 | with tf.name_scope("summary"+"/"+family): 70 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32, name='predictions') 71 | 72 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels') 73 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 74 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 75 | kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 76 | 77 | loss = tf.metrics.mean(per_example_loss, weights=split) 78 | # censored_per_example_loss = split * per_example_loss 79 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split) 80 | 81 | tf.summary.scalar('accuracy', accuracy[1], family=family) 82 | tf.summary.scalar('precision', precision[1], family=family) 83 | tf.summary.scalar('recall', recall[1], family=family) 84 | tf.summary.scalar('kappa', kappa, family=family) 85 | tf.summary.scalar('loss', loss[1], family=family) 86 | 87 | 88 | def make_label_multiclass_prediction_summaries(per_example_loss, logits, one_hot_label, split, family): 89 | with tf.name_scope("summary"+"/"+family): 90 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32, name='predictions') 91 | label_ids = tf.argmax(one_hot_label, axis=-1, output_type=tf.int32) 92 | 93 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels') 94 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 95 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 96 | kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 97 | 98 | loss = tf.metrics.mean(per_example_loss, weights=split) 99 | # censored_per_example_loss = split * per_example_loss 100 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split) 101 | 102 | tf.summary.scalar('accuracy', accuracy[1], family=family) 103 | tf.summary.scalar('precision', precision[1], family=family) 104 | tf.summary.scalar('recall', recall[1], family=family) 105 | tf.summary.scalar('kappa', kappa, family=family) 106 | tf.summary.scalar('loss', loss[1], family=family) 107 | 108 | 109 | 110 | def make_label_regression_prediction_summaries(per_example_loss, split, family): 111 | with tf.name_scope("summary"+"/"+family): 112 | 113 | loss = tf.metrics.mean(per_example_loss, weights=split) 114 | # censored_per_example_loss = split * per_example_loss 115 | # loss = tf.reduce_sum(censored_per_example_loss) / tf.reduce_sum(split) 116 | 117 | tf.summary.scalar('loss', loss[1], family=family) 118 | 119 | 120 | def cont_label_eval_metric_fn(per_example_loss, outcome, split=None, family=''): 121 | loss = tf.metrics.mean(per_example_loss, weights=split) 122 | 123 | return { 124 | family+"/eval_loss": loss 125 | } 126 | 127 | 128 | def 
binary_label_eval_metric_fn(per_example_loss, label_ids, logits, split=None, family=''): 129 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 130 | 131 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split) 132 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 133 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 134 | # kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 135 | loss = tf.metrics.mean(per_example_loss, weights=split) 136 | 137 | return { 138 | family+"/eval_accuracy": accuracy, 139 | family+"/eval_precision": precision, 140 | family+"/eval_recall": recall, 141 | family+"/eval_loss": loss 142 | } 143 | 144 | 145 | def multiclass_label_eval_metric_fn(per_example_loss, logits, one_hot_label, split=None, family=''): 146 | 147 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 148 | label_ids = tf.argmax(one_hot_label, axis=-1, output_type=tf.int32) 149 | 150 | accuracy = tf.metrics.accuracy(label_ids, predictions, weights=split, metrics_collections='labels') 151 | precision = tf.metrics.precision(label_ids, predictions, weights=split, metrics_collections='labels') 152 | recall = tf.metrics.recall(label_ids, predictions, weights=split, metrics_collections='labels') 153 | # kappa = batch_kappa(label_ids, predictions, weights=split, name='labels/kappa') 154 | loss = tf.metrics.mean(per_example_loss, weights=split) 155 | 156 | return { 157 | family+"/eval_accuracy": accuracy, 158 | family+"/eval_precision": precision, 159 | family+"/eval_recall": recall, 160 | family+"/eval_loss": loss 161 | } 162 | 163 | 164 | def unsupervised_eval_metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, 165 | masked_lm_weights): 166 | """Computes the loss and accuracy of the model.""" 167 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs, 168 | [-1, masked_lm_log_probs.shape[-1]]) 169 | masked_lm_predictions = tf.argmax( 170 | masked_lm_log_probs, axis=-1, output_type=tf.int32) 171 | masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) 172 | masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) 173 | masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) 174 | masked_lm_accuracy = tf.metrics.accuracy( 175 | labels=masked_lm_ids, 176 | predictions=masked_lm_predictions, 177 | weights=masked_lm_weights) 178 | masked_lm_mean_loss = tf.metrics.mean( 179 | values=masked_lm_example_loss, weights=masked_lm_weights) 180 | 181 | return { 182 | "masked_lm_accuracy": masked_lm_accuracy, 183 | "masked_lm_loss": masked_lm_mean_loss, 184 | } -------------------------------------------------------------------------------- /src/lda_baseline/helpers.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | 9 | class LemmaTokenizer(object): 10 | def __init__(self): 11 | self.wnl = WordNetLemmatizer() 12 | def __call__(self, articles): 13 | stop = stopwords.words('english') 14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop] 15 | 16 | def filter_by_subreddit(reddit, subs=None): 17 | if not subs: 18 | return reddit.index.values 19 | else: 20 
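        # keep only rows whose subreddit is in `subs`; the index values returned here
        # are later used (e.g. in reddit_fit_topics.main) to slice the term-count matrix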
| return reddit[reddit.subreddit.isin(subs)].index.values 21 | 22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.001): 23 | from nltk.corpus import stopwords 24 | ''' 25 | From a list of documents raw text build a matrix DxV 26 | D: number of docs 27 | V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs 28 | ''' 29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0) 30 | corpus = count_vect.fit_transform(documents) 31 | vocabulary = count_vect.get_feature_names() 32 | 33 | return corpus,vocabulary,count_vect 34 | 35 | def assign_dev_split(num_docs, percentage=0.05): 36 | indices = np.arange(num_docs) 37 | np.random.shuffle(indices) 38 | size = int(indices.shape[0]*percentage) 39 | dev = indices[:size] 40 | return dev 41 | 42 | def learn_topics(X, X_dev, K=50): 43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1) 44 | print("Fitting", K, "topics...") 45 | lda.fit(X) 46 | score = lda.perplexity(X_dev) 47 | print("Log likelihood:", score) 48 | topics = lda.components_ 49 | return score, lda, topics 50 | 51 | def show_topics(vocab, topics, n_words=20): 52 | topic_keywords = [] 53 | for topic_weights in topics: 54 | top_keyword_locs = (-topic_weights).argsort()[:n_words] 55 | topic_keywords.append(vocab.take(top_keyword_locs)) 56 | 57 | df_topic_keywords = pd.DataFrame(topic_keywords) 58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])] 59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])] 60 | return df_topic_keywords 61 | 62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'): 63 | filtered_indices = filtered_df[on].values 64 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 65 | embeddings = doc_embeddings[doc_idx, :] 66 | return embeddings 67 | 68 | def make_index_mapping(df, on='post_index', convert_to_int=True): 69 | if on=='index': 70 | indices = df.index.values 71 | else: 72 | indices = df[on].values 73 | 74 | if convert_to_int: 75 | return {int(ind):i for (i,ind) in enumerate(indices)} 76 | 77 | return {ind:i for (i,ind) in enumerate(indices)} 78 | 79 | def assign_split(df, num_splits=10, col_to_add='split'): 80 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 81 | return df 82 | -------------------------------------------------------------------------------- /src/lda_baseline/peerread_fit_topics.py: -------------------------------------------------------------------------------- 1 | from .helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics, filter_by_subreddit 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | from scipy import sparse 6 | import argparse 7 | import sys 8 | 9 | def load_peerread(path='../dat/PeerRead/'): 10 | return pd.read_csv(path + 'proc_abstracts.csv') 11 | 12 | 13 | def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'): 14 | count_filename = path + 'term_counts' 15 | vocab_filename = path + 'vocab' 16 | 17 | if os.path.exists(count_filename + '.npz') and not force_redo: 18 | return sparse.load_npz(count_filename + '.npz'), np.load(vocab_filename + '.npy') 19 | 20 | post_docs = df[text_col].values 21 | counts, vocab, _ = tokenize_documents(post_docs) 22 | sparse.save_npz(count_filename, counts) 23 | np.save(vocab_filename, vocab) 24 | return counts, np.array(vocab) 25 | 26 | def main(): 27 | if not 
os.path.exists(os.path.join(out_dir, 'topics.npy')) or redo_lda: 28 | if dat_dir: 29 | peerread = load_peerread(path=dat_dir) 30 | terms, vocab = load_term_counts(peerread, path=dat_dir, force_redo=redo_proc) 31 | else: 32 | peerread = load_peerread() 33 | terms, vocab = load_term_counts(peerread, force_redo=redo_proc) 34 | 35 | N = terms.shape[0] 36 | indices = np.arange(N) 37 | dev_idx = assign_dev_split(N) 38 | train_idx = np.setdiff1d(indices, dev_idx) 39 | X_tr = terms[train_idx, :] 40 | X_dev = terms[dev_idx, :] 41 | K_vals = [50] 42 | validation_scores = np.zeros(len(K_vals)) 43 | all_topics = [] 44 | models = [] 45 | for i,k in enumerate(K_vals): 46 | score, lda_obj, topics = learn_topics(X_tr, X_dev, K=k) 47 | validation_scores[i] = score 48 | all_topics.append(topics) 49 | models.append(lda_obj) 50 | k_idx = np.argsort(validation_scores)[0]#[-1] 51 | best_k = K_vals[k_idx] 52 | best_topics = all_topics[k_idx] 53 | best_model = models[k_idx] 54 | best_doc_prop = best_model.transform(terms) 55 | np.save(os.path.join(out_dir, 'topics'), best_topics) 56 | np.save(os.path.join(out_dir, 'document_proportions'), best_doc_prop) 57 | else: 58 | best_topics = np.load(os.path.join(out_dir, 'topics.npy')) 59 | vocab = np.load(os.path.join(out_dir, 'vocab.npy')) 60 | 61 | print("Best topic") 62 | topics = show_topics(vocab, best_topics, n_words=10) 63 | print(topics) 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--dat-dir", action="store", default=None) 68 | parser.add_argument("--out-dir", action="store", default="../dat/PeerRead/") 69 | parser.add_argument("--redo-lda", action="store_true") 70 | parser.add_argument("--redo-proc", action="store_true") 71 | parser.add_argument("--test", action="store_true") 72 | args = parser.parse_args() 73 | out_dir = args.out_dir 74 | redo_lda = args.redo_lda 75 | redo_proc = args.redo_proc 76 | dat_dir = args.dat_dir 77 | 78 | main() -------------------------------------------------------------------------------- /src/lda_baseline/peerread_get_abstracts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple pre-processing for PeerRead papers. 
3 | Takes in JSON formatted data from ScienceParse and outputs a tfrecord 4 | 5 | 6 | Reference example: 7 | https://github.com/tensorlayer/tensorlayer/blob/9528da50dfcaf9f0f81fba9453e488a1e6c8ee8f/examples/data_process/tutorial_tfrecord3.py 8 | """ 9 | 10 | import argparse 11 | import glob 12 | import os 13 | import random 14 | import pandas as pd 15 | import io 16 | import json 17 | from dateutil.parser import parse as parse_date 18 | from PeerRead.ScienceParse.Paper import Paper 19 | 20 | rng = random.Random(0) 21 | 22 | 23 | def process_json_paper(paper_json_filename, scienceparse_dir, tokenizer): 24 | paper = Paper.from_json(paper_json_filename) 25 | return paper.ABSTRACT 26 | 27 | 28 | def output_PeerRead_text(review_json_dir, parsedpdf_json_dir, 29 | out_dir, out_file): 30 | 31 | if not os.path.exists(out_dir): 32 | os.makedirs(out_dir) 33 | 34 | paper_data = [] 35 | print('Reading reviews from...', review_json_dir) 36 | paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir))) 37 | for idx, paper_json_filename in enumerate(paper_json_filenames): 38 | paper = Paper.from_json(paper_json_filename) 39 | paper_data.append([paper.ID, paper.ABSTRACT]) 40 | 41 | df = pd.DataFrame(paper_data, columns=['paper_id', 'abstract_text']) 42 | df.to_csv(out_dir + 'proc_abstracts.csv') 43 | 44 | def main(): 45 | parser = argparse.ArgumentParser() 46 | 47 | parser.add_argument('--review-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/reviews') 48 | parser.add_argument('--parsedpdf-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/parsed_pdfs') 49 | parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/') 50 | parser.add_argument('--out-file', type=str, default='proc_text.csv') 51 | 52 | args = parser.parse_args() 53 | 54 | output_PeerRead_text(args.review_json_dir, args.parsedpdf_json_dir, 55 | args.out_dir, args.out_file) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /src/lda_baseline/peerread_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.ate import ate_estimates 2 | from .peerread_fit_topics import load_peerread 3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from sklearn.linear_model import LogisticRegression, LinearRegression 8 | from sklearn.metrics import mean_squared_error as mse 9 | import argparse 10 | import sys 11 | from scipy.special import logit 12 | 13 | def compute_ground_truth_treatment_effect(df): 14 | y1 = df['y1'] 15 | y0 = df['y0'] 16 | return y1.mean() - y0.mean() 17 | 18 | def get_log_outcomes(outcomes): 19 | #relu 20 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 21 | return np.log(outcomes) 22 | 23 | def predict_expected_outcomes(model, doc_embeddings): 24 | features = logit(doc_embeddings) 25 | return model.predict_proba(features)[:,1] 26 | 27 | def fit_conditional_expected_outcomes(outcomes, doc_embeddings): 28 | model = LogisticRegression(solver='liblinear') 29 | features = logit(doc_embeddings) 30 | model.fit(features, outcomes) 31 | if verbose: 32 | print("Training accuracy:", model.score(features, outcomes)) 33 | return model 34 | 35 | def predict_treatment_probability(labels, doc_embeddings): 36 | model = LogisticRegression(solver='liblinear') 37 | features = logit(doc_embeddings) 38 | model.fit(features, labels) 39 | 
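    # The fitted logistic regression serves as the propensity-score model below:
    # predict_proba(features)[:, 1] estimates the probability of treatment given the
    # (logit-transformed) LDA document proportions.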
if verbose: 40 | print("Training accuracy:", model.score(features, labels)) 41 | treatment_probability = model.predict_proba(features)[:,1] 42 | return treatment_probability 43 | 44 | def load_simulated_data(): 45 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 46 | return sim_df 47 | 48 | def load_document_proportions(path='../dat/PeerRead/'): 49 | return np.load(path + 'document_proportions.npy') 50 | 51 | def main(): 52 | peerread = load_peerread() 53 | indices = peerread['paper_id'].values 54 | index_mapping = make_index_mapping(peerread, on='index') 55 | 56 | if not dat_dir: 57 | doc_embeddings = load_document_proportions() 58 | else: 59 | doc_embeddings = load_document_proportions(path=dat_dir) 60 | 61 | sim_df = load_simulated_data() 62 | num_reps = 10 63 | mean_estimates = {} 64 | 65 | for rep in range(num_reps): 66 | bootstrap_sim_df = assign_split(sim_df, num_splits=2) 67 | bootstrap_sim_df = bootstrap_sim_df[bootstrap_sim_df.split==0] 68 | treatment_labels = bootstrap_sim_df.treatment.values 69 | filtered_doc_embeddings = filter_document_embeddings(bootstrap_sim_df, doc_embeddings, index_mapping, on='id') 70 | treatment_probability = predict_treatment_probability(treatment_labels, filtered_doc_embeddings) 71 | 72 | treated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==1] 73 | untreated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==0] 74 | 75 | all_outcomes = bootstrap_sim_df.outcome.values 76 | outcomes_st_treated = treated_sim.outcome.values 77 | outcomes_st_not_treated = untreated_sim.outcome.values 78 | 79 | doc_embed_st_treated = filter_document_embeddings(treated_sim, doc_embeddings, index_mapping, on='id') 80 | doc_embed_st_not_treated = filter_document_embeddings(untreated_sim, doc_embeddings, index_mapping, on='id') 81 | 82 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, doc_embed_st_treated) 83 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, doc_embed_st_not_treated) 84 | 85 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, filtered_doc_embeddings) 86 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, filtered_doc_embeddings) 87 | 88 | estimates = ate_estimates(expected_outcome_st_not_treated, expected_outcome_st_treated, 89 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03) 90 | 91 | for est, ate in estimates.items(): 92 | if est in mean_estimates: 93 | mean_estimates[est].append(ate) 94 | else: 95 | mean_estimates[est] = [ate] 96 | 97 | ground_truth_ate = compute_ground_truth_treatment_effect(sim_df) 98 | mean_estimates.update({'ground_truth_ate':ground_truth_ate}) 99 | if verbose: 100 | for est, ates in mean_estimates.items(): 101 | print(est, np.mean(ates), np.std(ates)) 102 | else: 103 | config = ';'.join([str(mode)] + params) 104 | log_file = os.path.join(sim_dir, 'two-stage-lda-estimates.out') 105 | with open(log_file, 'a') as h: 106 | h.write(config + '\n') 107 | for est, ates in mean_estimates.items(): 108 | h.write(est + ',' + str(np.mean(ates)) + ',' + str(np.std(ates)) + '\n') 109 | 110 | 111 | if __name__ == '__main__': 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--dat-dir", action="store", default=None) 114 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 115 | parser.add_argument("--mode", action="store", default="simple") 116 | parser.add_argument("--params", action="store", 
default="1.0") 117 | parser.add_argument("--verbose", action='store_true') 118 | args = parser.parse_args() 119 | 120 | sim_dir = args.sim_dir 121 | dat_dir = args.dat_dir 122 | verbose = args.verbose 123 | params = args.params 124 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0' 125 | mode = args.mode 126 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv" 127 | 128 | main() -------------------------------------------------------------------------------- /src/lda_baseline/reddit_fit_topics.py: -------------------------------------------------------------------------------- 1 | from reddit.data_cleaning.reddit_posts import load_reddit 2 | from .helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics, filter_by_subreddit 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | from scipy import sparse 7 | import argparse 8 | import sys 9 | 10 | def load_term_counts(reddit, path='../dat/reddit/', force_redo=False): 11 | count_filename = path + 'term_counts' 12 | vocab_filename = path + 'vocab' 13 | 14 | if os.path.exists(count_filename + '.npz') and not force_redo: 15 | return sparse.load_npz(count_filename + '.npz'), np.load(vocab_filename + '.npy') 16 | 17 | post_docs = reddit['post_text'].values 18 | counts, vocab, _ = tokenize_documents(post_docs) 19 | sparse.save_npz(count_filename, counts) 20 | np.save(vocab_filename, vocab) 21 | return counts, np.array(vocab) 22 | 23 | def main(): 24 | if not os.path.exists(os.path.join(out_dir, 'topics.npy')) or redo_lda: 25 | 26 | subreddits = {'keto', 'OkCupid', 'childfree'} 27 | reddit = load_reddit() 28 | filtered_indices = filter_by_subreddit(reddit, subs=subreddits) 29 | 30 | if dat_dir: 31 | terms, vocab = load_term_counts(reddit, path=dat_dir, force_redo=redo_proc) 32 | else: 33 | terms, vocab = load_term_counts(reddit, force_redo=redo_proc) 34 | 35 | terms = terms[filtered_indices, :] 36 | N = terms.shape[0] 37 | indices = np.arange(N) 38 | dev_idx = assign_dev_split(N) 39 | train_idx = np.setdiff1d(indices, dev_idx) 40 | X_tr = terms[train_idx, :] 41 | X_dev = terms[dev_idx, :] 42 | print(dev_idx.shape) 43 | 44 | K_vals = [100] 45 | validation_scores = np.zeros(len(K_vals)) 46 | all_topics = [] 47 | models = [] 48 | for i,k in enumerate(K_vals): 49 | score, lda_obj, topics = learn_topics(X_tr, X_dev, K=k) 50 | validation_scores[i] = score 51 | all_topics.append(topics) 52 | models.append(lda_obj) 53 | k_idx = np.argsort(validation_scores)[0]#[-1] 54 | best_k = K_vals[k_idx] 55 | best_topics = all_topics[k_idx] 56 | best_model = models[k_idx] 57 | best_doc_prop = best_model.transform(terms) 58 | np.save(os.path.join(out_dir, 'topics'), best_topics) 59 | np.save(os.path.join(out_dir, 'document_proportions'), best_doc_prop) 60 | else: 61 | best_topics = np.load(os.path.join(out_dir, 'topics.npy')) 62 | vocab = np.load(os.path.join(out_dir, 'vocab.npy')) 63 | 64 | # print("Best topic") 65 | # topics = show_topics(vocab, best_topics, n_words=10) 66 | # print(topics) 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument("--dat-dir", action="store", default=None) 71 | parser.add_argument("--out-dir", action="store", default="../dat/reddit/") 72 | parser.add_argument("--redo-lda", action="store_true") 73 | parser.add_argument("--redo-proc", action="store_true") 74 | parser.add_argument("--test", action="store_true") 75 | args = parser.parse_args() 76 | out_dir = args.out_dir 77 | redo_lda = args.redo_lda 78 | redo_proc = args.redo_proc 
79 | dat_dir = args.dat_dir 80 | test = args.test 81 | 82 | main() -------------------------------------------------------------------------------- /src/lda_baseline/reddit_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed 3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from sklearn.linear_model import LogisticRegression, LinearRegression 8 | from sklearn.metrics import mean_squared_error as mse 9 | import argparse 10 | import sys 11 | from scipy.special import logit 12 | 13 | def get_log_outcomes(outcomes): 14 | #relu 15 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 16 | return np.log(outcomes) 17 | 18 | def predict_expected_outcomes(model, doc_embeddings): 19 | features = logit(doc_embeddings) 20 | return model.predict(features) 21 | 22 | def fit_conditional_expected_outcomes(outcomes, doc_embeddings): 23 | model = LinearRegression() 24 | features = logit(doc_embeddings) 25 | model.fit(features, outcomes) 26 | predict = model.predict(features) 27 | if verbose: 28 | print("Training MSE:", mse(outcomes, predict)) 29 | return model 30 | 31 | def predict_treatment_probability(labels, doc_embeddings): 32 | model = LogisticRegression(solver='liblinear') 33 | features = logit(doc_embeddings) 34 | model.fit(features, labels) 35 | if verbose: 36 | print("Training accuracy:", model.score(features, labels)) 37 | treatment_probability = model.predict_proba(features)[:,1] 38 | return treatment_probability 39 | 40 | def load_simulated_data(): 41 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 42 | sim_df = sim_df.rename(columns={'index':'post_index'}) 43 | return sim_df 44 | 45 | def load_document_proportions(path='../dat/reddit/'): 46 | return np.load(path + 'document_proportions.npy') 47 | 48 | def main(): 49 | reddit = load_reddit_processed() 50 | if subs: 51 | reddit = reddit[reddit.subreddit.isin(subs)] 52 | 53 | index_mapping = make_index_mapping(reddit, on='orig_index') 54 | if not dat_dir: 55 | doc_embeddings = load_document_proportions() 56 | else: 57 | doc_embeddings = load_document_proportions(path=dat_dir) 58 | 59 | sim_df = load_simulated_data() 60 | num_reps = 10 61 | mean_estimates = {} 62 | 63 | for rep in range(num_reps): 64 | bootstrap_sim_df = assign_split(sim_df, num_splits=2) 65 | bootstrap_sim_df = bootstrap_sim_df[bootstrap_sim_df.split==0] 66 | treatment_labels = bootstrap_sim_df.treatment.values 67 | filtered_doc_embeddings = filter_document_embeddings(bootstrap_sim_df, doc_embeddings, index_mapping) 68 | treatment_probability = predict_treatment_probability(treatment_labels, filtered_doc_embeddings) 69 | 70 | treated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==1] 71 | untreated_sim = bootstrap_sim_df[bootstrap_sim_df.treatment==0] 72 | 73 | all_outcomes = bootstrap_sim_df.outcome.values 74 | outcomes_st_treated = treated_sim.outcome.values 75 | outcomes_st_not_treated = untreated_sim.outcome.values 76 | 77 | doc_embed_st_treated = filter_document_embeddings(treated_sim, doc_embeddings, index_mapping) 78 | doc_embed_st_not_treated = filter_document_embeddings(untreated_sim, doc_embeddings, index_mapping) 79 | 80 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, doc_embed_st_treated) 81 | model_outcome_st_not_treated = 
fit_conditional_expected_outcomes(outcomes_st_not_treated, doc_embed_st_not_treated) 82 | 83 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, filtered_doc_embeddings) 84 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, filtered_doc_embeddings) 85 | 86 | estimates = att_estimates(expected_outcome_st_not_treated, expected_outcome_st_treated, 87 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean()) 88 | 89 | for est, ate in estimates.items(): 90 | if est in mean_estimates: 91 | mean_estimates[est].append(ate) 92 | else: 93 | mean_estimates[est] = [ate] 94 | if verbose: 95 | for est, ates in mean_estimates.items(): 96 | print(est, np.mean(ates), np.std(ates)) 97 | else: 98 | config = ';'.join([str(mode)] + params) 99 | log_file = os.path.join(sim_dir, 'two-stage-lda-estimates.out') 100 | with open(log_file, 'a') as h: 101 | h.write(config + '\n') 102 | for est, ates in mean_estimates.items(): 103 | h.write(est + ',' + str(np.mean(ates)) + ',' + str(np.std(ates)) + '\n') 104 | 105 | 106 | if __name__ == '__main__': 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument("--dat-dir", action="store", default=None) 109 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 110 | parser.add_argument("--subs", action="store", default='13,6,8') 111 | parser.add_argument("--mode", action="store", default="simple") 112 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0") 113 | parser.add_argument("--verbose", action='store_true') 114 | args = parser.parse_args() 115 | 116 | sim_dir = args.sim_dir 117 | dat_dir = args.dat_dir 118 | subs = None 119 | if args.subs != '': 120 | subs = [int(s) for s in args.subs.split(',')] 121 | verbose = args.verbose 122 | params = args.params.split(',') 123 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2] 124 | subs_string = ', '.join(args.subs.split(',')) 125 | mode = args.mode 126 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv" 127 | 128 | main() -------------------------------------------------------------------------------- /src/lda_baseline/scripts/sweep_over_sims.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #NUM_SEED=2 3 | #SEEDS=$(seq 0 $NUM_SEED) 4 | rm ../dat/sim/reddit_subreddit_based/two-stage-lda-estimates.out 5 | export SUBREDDITS=13,6,8 6 | export BETA0=1.0 7 | declare -a SIMMODES=('simple') 8 | declare -a BETA1S=(1.0 10.0 100.0) 9 | declare -a GAMMAS=(1.0 4.0) 10 | 11 | for SIMMODEj in "${SIMMODES[@]}"; do 12 | for BETA1j in "${BETA1S[@]}"; do 13 | for GAMMAj in "${GAMMAS[@]}"; do 14 | python -m lda_baseline.reddit_output_att \ 15 | --subs=${SUBREDDITS} \ 16 | --mode=${SIMMODEj} \ 17 | --params=${BETA0},${BETA1j},${GAMMAj} 18 | done 19 | done 20 | done -------------------------------------------------------------------------------- /src/model_checking/plot_adjustment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.linear_model import LogisticRegression, LinearRegression 6 | from scipy.special import logit 7 | from result_processing.helpers import convert_str_columns_to_float, assign_split, filter_imbalanced_terms 8 | from sklearn.metrics import mean_squared_error as 
mse 9 | from scipy.sparse import load_npz 10 | import matplotlib.pyplot as plt 11 | from scipy.stats import gaussian_kde 12 | 13 | def get_prediction_file(): 14 | predict_df = pd.read_csv(log_file, delimiter='\t') 15 | predict_df = predict_df.rename(columns={'index':'post_index'}) 16 | return predict_df 17 | 18 | def fit_treatment(features, labels, verbose=False, coeff_offset=1): 19 | model = LogisticRegression(solver='liblinear') 20 | model.fit(features, labels) 21 | coeffs = np.array(model.coef_).flatten()[coeff_offset:] 22 | if verbose: 23 | print("Model accuracy:", model.score(features, labels)) 24 | print("Mean and std. of the word coeffs:", coeffs.mean(), coeffs.std()) 25 | return coeffs 26 | 27 | def truncate(df, truncate_level=0.1): 28 | df = df[(df.treatment_probability >= truncate_level) & (df.treatment_probability <= 1.0-truncate_level)] 29 | return df 30 | 31 | def plot_density(unadjusted, adjusted, permuted): 32 | density = gaussian_kde(adjusted.mean(axis=0)) 33 | permutation_density = gaussian_kde(permuted.mean(axis=0)) 34 | missing_z_density = gaussian_kde(unadjusted.mean(axis=0)) 35 | xs = np.linspace(-0.5,0.5,1000) 36 | plt.plot(xs,density(xs), label='Adjusted model (not permuted)') 37 | plt.plot(xs, permutation_density(xs), label='Permuted model') 38 | plt.plot(xs, missing_z_density(xs), label='Unadjusted model') 39 | plt.xlabel('Coefficient values for words') 40 | plt.legend() 41 | 42 | if not os.path.exists(out_dir): 43 | os.makedirs(out_dir) 44 | # plt.tight_layout() 45 | plt.savefig(out_dir + out_file, dpi=300) 46 | 47 | def load_terms(data): 48 | termfile = '../dat/' + data + '/term_counts.npz' 49 | if data == 'reddit': 50 | termfile = '../dat/' + data + '_term_counts.npz' 51 | term_counts = load_npz(termfile).toarray() 52 | if drop_terms: 53 | term_indices = np.arange(term_counts.shape[1]) 54 | random_indices = np.random.choice(term_indices, 1000) 55 | term_counts = term_counts[:,random_indices] 56 | return term_counts 57 | 58 | def main(): 59 | predict_df = get_prediction_file() 60 | term_counts = load_terms(dataset) 61 | print(predict_df.shape, term_counts.shape) 62 | if dataset == 'reddit': 63 | imbalanced_terms = filter_imbalanced_terms(predict_df, term_counts) 64 | term_counts = term_counts[:,imbalanced_terms] 65 | print(term_counts.shape) 66 | 67 | n_bootstraps = 10 68 | n_w = term_counts.shape[1] 69 | 70 | adjusted = np.zeros((n_bootstraps, n_w)) 71 | permuted = np.zeros((n_bootstraps, n_w)) 72 | unadjusted = np.zeros((n_bootstraps, n_w)) 73 | 74 | for i in range(n_bootstraps): 75 | sample = assign_split(predict_df,num_splits=2) 76 | sample = sample[sample.split==0] 77 | indices = sample.post_index.values 78 | labels = sample.treatment.values 79 | words = term_counts[indices, :] 80 | propensity_score = logit(sample.treatment_probability.values) 81 | all_features = np.column_stack((propensity_score, words)) 82 | unadjusted[i,:] = fit_treatment(words, labels, coeff_offset=0) 83 | adjusted[i,:] = fit_treatment(all_features, labels) 84 | np.random.shuffle(words) 85 | permuted_features = np.column_stack((propensity_score, words)) 86 | permuted[i,:] = fit_treatment(permuted_features, labels) 87 | 88 | plot_density(unadjusted, adjusted, permuted) 89 | 90 | if __name__ == '__main__': 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--out-dir", action="store", default='../figures/') 93 | parser.add_argument("--out-file", action="store", default='reddit.pdf') 94 | parser.add_argument("--log-file", action="store", 
default='../logdir/reddit/modesimple/beta01.0.beta110.0.gamma1.0/predict/test_results_all.tsv') 95 | parser.add_argument("--drop-terms", action="store_true") 96 | parser.add_argument("--dataset", action="store", default='reddit') 97 | args = parser.parse_args() 98 | log_file = args.log_file 99 | drop_terms = args.drop_terms 100 | dataset = args.dataset 101 | out_dir = args.out_dir 102 | out_file = args.out_file 103 | main() -------------------------------------------------------------------------------- /src/reddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/__init__.py -------------------------------------------------------------------------------- /src/reddit/data_cleaning/BigQuery_get_data: -------------------------------------------------------------------------------- 1 | ```/* 2 | based on https://www.reddit.com/r/bigquery/comments/4f2yp7/best_way_to_look_at_conversation_chains_in_reddit/ 3 | and on https://nbviewer.jupyter.org/github/bburky/subredditgenderratios/blob/master/Subreddit%20Gender%20Ratios.ipynb 4 | 5 | KNOWN LIMITATIONS: 6 | does not look for male/female (zodiac?) symbols. 7 | I didn't do a fresh search over subreddits w/ gender, so some may be missing 8 | up/downs was mostly null, so I omitted this field 9 | */ 10 | 11 | WITH 12 | reddit_comments AS ( 13 | SELECT 14 | body, author, author_flair_text, created_utc, link_id, parent_id, score, controversiality, gilded, id, subreddit, author_flair_css_class 15 | FROM 16 | `fh-bigquery.reddit_comments.2018*` 17 | -- `reddit-gender.comment_response_tuples.gendered_2018` 18 | ), 19 | replies AS ( 20 | SELECT 21 | REGEXP_EXTRACT(parent_id, r'[a-zA-Z0-9]+$') as parent_id, 22 | -- MIN(subreddit) AS subreddit, 23 | ARRAY_AGG(STRUCT(body, author, created_utc, id) ORDER BY created_utc ASC) AS reply 24 | FROM 25 | reddit_comments 26 | WHERE 27 | --parent id starting w t1_ indicates not-top-level comment 28 | REGEXP_CONTAINS(parent_id, r'^(t1_)') 29 | GROUP BY 30 | parent_id 31 | ), 32 | ops AS ( 33 | SELECT 34 | gender, body, author, author_flair_text, created_utc, link_id, score, controversiality, gilded, id, subreddit, author_flair_css_class 35 | FROM 36 | ( 37 | -- male/female 38 | SELECT 39 | *, 40 | REGEXP_EXTRACT( 41 | LOWER(author_flair_css_class), 42 | '(?:fe)?male') AS gender 43 | FROM 44 | reddit_comments 45 | WHERE 46 | subreddit IN ( 47 | 'AskMen', 48 | 'AskWomen', 49 | 'AskMenOver30', 50 | 'AskWomenOver30', 51 | 'sexover30') 52 | UNION ALL 53 | -- pink/blue 54 | SELECT 55 | *, 56 | CASE 57 | WHEN author_flair_css_class = 'blue' THEN 'male' 58 | WHEN author_flair_css_class = 'pink' THEN 'female' 59 | END AS gender 60 | FROM 61 | reddit_comments 62 | WHERE 63 | subreddit IN ( 64 | 'tall', 65 | 'short') 66 | UNION ALL 67 | -- A/S/L 68 | SELECT 69 | -- need to do this one manually because of asl 70 | body, author, author_flair_text, created_utc, link_id, parent_id, score, controversiality, gilded, id, subreddit, author_flair_css_class, 71 | CASE 72 | WHEN asl = 'm' THEN 'male' 73 | WHEN asl = 'f' THEN 'female' 74 | END AS gender 75 | FROM ( 76 | SELECT 77 | *, 78 | REGEXP_EXTRACT( 79 | LOWER(author_flair_text), 80 | "(?:^|[^\\p{L}0-9'\\.\\$])\\s*(?:\\d\\d)?\\s*(f|m)\\s*(?:\\d\\d)?\\s*(?:$|[^\\p{L}0-9'\\.])") AS asl 81 | FROM 82 | reddit_comments 83 | WHERE 84 | subreddit IN ( 85 | 'OkCupid', 86 | 'keto', 87 | 'childfree', 88 | 'xxketo', 89 | 'LGBTeens', 90 | 
'loseit', 91 | 'Tinder', 92 | 'proED', 93 | 'fatlogic', 94 | 'financialindependence', 95 | 'infj', 96 | 'infertility', 97 | '100DaysofKeto')) ) 98 | WHERE 99 | gender IS NOT NULL AND 100 | --parent id starting w t3_ indicates top-level comment 101 | REGEXP_CONTAINS(parent_id, r'^(t3_)') 102 | ) 103 | 104 | SELECT 105 | ops.*, 106 | replies.* 107 | FROM 108 | ops INNER JOIN replies ON ops.id = replies.parent_id``` -------------------------------------------------------------------------------- /src/reddit/data_cleaning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/data_cleaning/__init__.py -------------------------------------------------------------------------------- /src/reddit/data_cleaning/reddit_gender_sentiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 22, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import json\n", 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 23, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "datafile = os.path.join('..', 'dat', '2018')\n", 22 | "\n", 23 | "with open(datafile, 'r') as f:\n", 24 | " record_dicts = []\n", 25 | " for line in f.readlines():\n", 26 | " record = json.loads(line)\n", 27 | " reply_list = record['reply']\n", 28 | " earliest_reply_text = None\n", 29 | " for reply_dict in sorted(reply_list, key=lambda x: x['created_utc']):\n", 30 | " if reply_dict['body'] != '[deleted]' and reply_dict['body'] != '[removed]':\n", 31 | " earliest_reply_text = reply_dict['body']\n", 32 | " if earliest_reply_text:\n", 33 | " break\n", 34 | " if earliest_reply_text:\n", 35 | " record.pop('reply')\n", 36 | " record['reply_text'] = earliest_reply_text\n", 37 | " record_dicts.append(record)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 24, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "reddit_df = pd.DataFrame(record_dicts)\n", 47 | "reddit_df = reddit_df[reddit_df.body != '[deleted]']\n", 48 | "reddit_df = reddit_df.astype({'score':np.int64, 'controversiality':np.int64, 'gilded':np.int64, 'created_utc':np.int64})" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 25, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from google.cloud import language\n", 58 | "from google.cloud.language import enums\n", 59 | "from google.cloud.language import types\n", 60 | "client = language.LanguageServiceClient()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 61, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "replies = reddit_df[['body','reply_text']].values\n", 70 | "indices = np.arange(len(replies))\n", 71 | "np.random.shuffle(indices)\n", 72 | "random_idx = indices[:10]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 63, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "OP: How much our bridal party could drink when given the news they had an open bar tab. 
That was a couple thousand extra we didn’t expect to spend.\n", 85 | "\n", 86 | "\n", 87 | "Actual married life is exactly the same as before we were married since we lived together and shared finances well before marrying. \n", 88 | "Text: I wonder if it would be a good idea to just say it's a discounted bar or something and then at the end reveal it was an open bar.\n", 89 | "Sentiment: -0.30000001192092896, 0.30000001192092896\n", 90 | "****************************************\n", 91 | "OP: excuse me but this is a christian subreddit\n", 92 | "Text: But they said no homo\n", 93 | "Sentiment: 0.30000001192092896, 0.30000001192092896\n", 94 | "****************************************\n", 95 | "OP: I don't buy that the inches=pounds thing is real but if you want to add some scientific information to this, for me, one inch was equal to 6.1lbs when I took my starting weight and measurements.\n", 96 | "Text: Until I get an accurate scale, I think I'm going to try to do an average of the three numbers I've heard so far. That will at least give me a starting point so I can chart how far I've come. Thanks!\n", 97 | "Sentiment: 0.10000000149011612, 0.5\n", 98 | "****************************************\n", 99 | "OP: My SO referred to my mixed race roommate as \"half-caste\". He didn't realise that was considered offensive by some, it was what everyone at his school said.\n", 100 | "Text: Did you explain to him why it was offensive? I’ve noticed a lot of ppl say things that others say around them. \n", 101 | "Sentiment: -0.10000000149011612, 1.0\n", 102 | "****************************************\n", 103 | "OP: I get like that every shark week!! If you have to, up your calories to maintenance :)\n", 104 | "Text: How do I adjust my macros? I don't want to eat too much fat lol\n", 105 | "Sentiment: -0.10000000149011612, 0.20000000298023224\n", 106 | "****************************************\n", 107 | "OP: Only problem I have with it is the repetition/inconsistency of “ask(ed)”. Aside from that, seems like a real conversation I could see people having. Nice work :)\n", 108 | "Text: I'm trying to cut down on my repetition and more on letting the reader assume it was a question rather than having it say that instead. Thanks. \n", 109 | "Sentiment: 0.0, 0.5\n", 110 | "****************************************\n", 111 | "OP: This week I'm listening David Bowie, Nesrin Sipahi, Run DMC.\n", 112 | "Text: Run DMC :) YES! \n", 113 | "Sentiment: 0.30000001192092896, 0.6000000238418579\n", 114 | "****************************************\n", 115 | "OP: Pursuing the things you want to pursue, whether that’s love, fun, success, or anything else. Being willing to take risks in that pursuit.\n", 116 | "Text: Lots of ppl mentioning taking risks in this thread. What exactly do you mean by that?\n", 117 | "Sentiment: 0.0, 0.10000000149011612\n", 118 | "****************************************\n", 119 | "OP: It's a toss up between my diploma and my wedding ring.\n", 120 | "Text: I still have yet to pick up my diploma from my college and I graduated in 2012...\n", 121 | "Sentiment: 0.30000001192092896, 0.30000001192092896\n", 122 | "****************************************\n", 123 | "OP: Hate it. Partly to do with my other mental illnesses, but have trouble with hygiene in general.\n", 124 | "Text: How do you mean? If you don't mind sharing, that is. 
\n", 125 | "Sentiment: 0.0, 0.10000000149011612\n", 126 | "****************************************\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "for idx in random_idx:\n", 132 | " op = replies[idx][0]\n", 133 | " post = replies[idx][1]\n", 134 | " lines = post.split('\\n')\n", 135 | " for text in lines:\n", 136 | " if text == '':\n", 137 | " continue\n", 138 | " document = types.Document(\n", 139 | " content=text,\n", 140 | " type=enums.Document.Type.PLAIN_TEXT)\n", 141 | " sentiment = client.analyze_sentiment(document=document).document_sentiment\n", 142 | " print(\"OP:\", op)\n", 143 | " print(\"Text:\", text)\n", 144 | " print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))\n", 145 | " print(\"*\"*40)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.6.6" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /src/reddit/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/dataset/__init__.py -------------------------------------------------------------------------------- /src/reddit/dataset/array_from_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | helpers to take samples from the dataset and turn them into numpy arrays 3 | (for ease of inspection and use with baselines) 4 | """ 5 | import argparse 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | try: 11 | import mkl_random as random 12 | except ImportError: 13 | import numpy.random as random 14 | 15 | import bert.tokenization as tokenization 16 | from reddit.dataset.dataset import make_input_fn_from_file, make_subreddit_based_simulated_labeler 17 | 18 | 19 | def dataset_fn_to_df(dataset_fn): 20 | 21 | params = {'batch_size': 1} 22 | dataset = dataset_fn(params) 23 | 24 | itr = dataset.make_one_shot_iterator() 25 | 26 | samples = [] 27 | 28 | for i in range(250000): 29 | try: 30 | sample = itr.get_next() 31 | for k in sample: 32 | sample[k] = sample[k].numpy()[0] 33 | samples += [sample] 34 | # print("year: {}".format(sample['year'])) 35 | except: 36 | print(i) 37 | break 38 | 39 | df = pd.DataFrame(samples) 40 | 41 | return df 42 | 43 | 44 | def subreddit_based_sim_dfs(subreddits, treat_strength, con_strength, noise_level, setting="simple", seed=0, 45 | base_output_dir='../dat/sim/reddit_subreddit_based/'): 46 | 47 | labeler = make_subreddit_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed) 48 | 49 | num_splits = 10 50 | dev_splits = [0] 51 | test_splits = [0] 52 | 53 | # data_file = '../dat/reddit/proc.tf_record' 54 | # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt" 55 | tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, 
do_lower_case=True) 56 | 57 | input_dataset_from_filenames = make_input_fn_from_file(data_file, 58 | 250, 59 | num_splits, 60 | dev_splits, 61 | test_splits, 62 | tokenizer, 63 | subreddits=subreddits, 64 | is_training=False, 65 | filter_test=False, 66 | shuffle_buffer_size=25000, 67 | seed=seed, 68 | labeler=labeler) 69 | 70 | all_data = dataset_fn_to_df(input_dataset_from_filenames) 71 | output_df = all_data[['index', 'gender','outcome', 'y0', 'y1']] 72 | output_df = output_df.rename(index=str, columns={'gender': 'treatment'}) 73 | 74 | output_dir = os.path.join(base_output_dir, "subreddits{}".format(subreddits), "mode{}".format(setting)) 75 | os.makedirs(output_dir, exist_ok=True) 76 | output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level)) 77 | 78 | output_df.to_csv(output_path, '\t') 79 | 80 | 81 | def main(): 82 | tf.enable_eager_execution() 83 | 84 | 85 | subreddit_based_sim_dfs(subreddits=subs, treat_strength=beta0, con_strength=beta1, noise_level=gamma, setting=mode, seed=0, 86 | base_output_dir=base_output_dir) 87 | 88 | 89 | 90 | # print(itr.get_next()["token_ids"].name) 91 | # for i in range(1000): 92 | # sample = itr.get_next() 93 | 94 | # 95 | # print(np.unique(df['year'])) 96 | # print(df.groupby(['year'])['buzzy_title'].agg(np.mean)) 97 | # print(df.groupby(['year'])['theorem_referenced'].agg(np.mean)) 98 | # print(df.groupby(['year'])['accepted'].agg(np.mean)) 99 | 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument("--data-file", action="store", default='../dat/reddit/proc.tf_record') 105 | parser.add_argument("--vocab-file", action="store", default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt') 106 | parser.add_argument("--base-output-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 107 | parser.add_argument("--subs", action="store", default='13,8,6') 108 | parser.add_argument("--mode", action="store", default="simple") 109 | parser.add_argument("--beta0", action="store", default='1.0') 110 | parser.add_argument("--beta1", action="store", default='1.0') 111 | parser.add_argument("--gamma", action="store", default='1.0') 112 | args = parser.parse_args() 113 | 114 | data_file = args.data_file 115 | vocab_file = args.vocab_file 116 | base_output_dir = args.base_output_dir 117 | subs = [int(s) for s in args.subs.split(',')] 118 | mode = args.mode 119 | beta0 = float(args.beta0) 120 | beta1 = float(args.beta1) 121 | gamma = float(args.gamma) 122 | 123 | # pass 124 | main() -------------------------------------------------------------------------------- /src/reddit/dataset/sentence_masking.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Create masked LM TF examples for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | 23 | import tensorflow as tf 24 | 25 | 26 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 27 | ["index", "label"]) 28 | 29 | 30 | def create_masked_lm_predictions(token_ids, masked_lm_prob, max_predictions_per_seq, vocab, seed): 31 | """Creates the predictions for the masked LM objective. 32 | 33 | This should be essentially equivalent to the bits that Bert loads from pre-processed tfrecords 34 | 35 | Except: we just include masks instead of randomly letting the words through or randomly replacing 36 | """ 37 | 38 | basic_mask = tf.less( 39 | tf.random_uniform(token_ids.shape, minval=0, maxval=1, dtype=tf.float32, seed=seed), 40 | masked_lm_prob) 41 | 42 | # don't mask special characters or padding 43 | cand_indexes = tf.logical_and(tf.not_equal(token_ids, vocab["[CLS]"]), 44 | tf.not_equal(token_ids, vocab["[SEP]"])) 45 | cand_indexes = tf.logical_and(cand_indexes, tf.not_equal(token_ids, 0)) 46 | mask = tf.logical_and(cand_indexes, basic_mask) 47 | 48 | # truncate to max predictions for ease of padding 49 | masked_lm_positions = tf.where(mask) 50 | # TODO: it should be essentially impossible for me to see this bug (very unlikely), but I do... symptom of :( ? 51 | # very rare event: nothing gets picked for mask, causing an irritating bug 52 | # in this case, just mask the first candidate index 53 | mlm_shape = tf.shape(masked_lm_positions)[0] 54 | masked_lm_positions = tf.cond(mlm_shape > 1, 55 | lambda: masked_lm_positions, 56 | lambda: tf.where(cand_indexes)[0:2]) 57 | 58 | masked_lm_positions = tf.squeeze(masked_lm_positions)[0:max_predictions_per_seq] 59 | masked_lm_positions = tf.cast(masked_lm_positions, dtype=tf.int32) 60 | masked_lm_ids = tf.gather(token_ids, masked_lm_positions) 61 | 62 | mask = tf.cast( 63 | tf.scatter_nd(tf.expand_dims(masked_lm_positions, 1), tf.ones_like(masked_lm_positions), token_ids.shape), 64 | bool) 65 | 66 | output_ids = tf.where(mask, vocab["[MASK]"]*tf.ones_like(token_ids), token_ids) 67 | 68 | # pad out to max_predictions_per_seq 69 | masked_lm_weights = tf.ones_like(masked_lm_ids, dtype=tf.float32) # tracks padding 70 | add_pad = [[0, max_predictions_per_seq - tf.shape(masked_lm_positions)[0]]] 71 | masked_lm_weights = tf.pad(masked_lm_weights, add_pad, 'constant') 72 | masked_lm_positions = tf.pad(masked_lm_positions, add_pad, 'constant') 73 | masked_lm_ids = tf.pad(masked_lm_ids, add_pad, 'constant') 74 | 75 | return output_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights 76 | 77 | 78 | def main(_): 79 | pass 80 | 81 | 82 | if __name__ == "__main__": 83 | flags.mark_flag_as_required("input_file") 84 | flags.mark_flag_as_required("output_file") 85 | flags.mark_flag_as_required("vocab_file") 86 | tf.app.run() 87 | -------------------------------------------------------------------------------- /src/reddit/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/reddit/model/__init__.py -------------------------------------------------------------------------------- /src/reddit/submit_scripts/run_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export 
BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12 4 | export INIT_FILE=../dat/reddit/model.ckpt-400000 5 | export DATA_FILE=../dat/reddit/proc.tf_record 6 | export OUTPUT_DIR=../output/reddit_embeddings/ 7 | 8 | #13,6,8 are keto, okcupid, childfree 9 | export SUBREDDITS=13,6,8 10 | export USE_SUB_FLAG=false 11 | export BETA0=1.0 12 | export BETA1=1.0 13 | export GAMMA=1.0 14 | 15 | python -m reddit.model.run_causal_bert \ 16 | --seed=0 \ 17 | --do_train=true \ 18 | --do_eval=false \ 19 | --do_predict=true \ 20 | --label_pred=true \ 21 | --unsupervised=true \ 22 | --input_files_or_glob=${DATA_FILE} \ 23 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \ 24 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \ 25 | --output_dir=${OUTPUT_DIR} \ 26 | --dev_splits=0 \ 27 | --test_splits=0 \ 28 | --max_seq_length=128 \ 29 | --train_batch_size=16 \ 30 | --learning_rate=3e-5 \ 31 | --num_warmup_steps 1000 \ 32 | --num_train_steps=10000 \ 33 | --save_checkpoints_steps=5000 \ 34 | --keep_checkpoints=1 \ 35 | --subreddits=${SUBREDDITS} \ 36 | --beta0=${BETA0} \ 37 | --beta1=${BETA1} \ 38 | --gamma=${GAMMA} 39 | # --init_checkpoint=${INIT_FILE} -------------------------------------------------------------------------------- /src/reddit/submit_scripts/run_unsupervised.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export BERT_BASE_DIR=../../bert/pre-trained/uncased_L-12_H-768_A-12 4 | 5 | export DATA_FILE=../dat/reddit/proc.tf_record 6 | export OUTPUT_DIR=../output/reddit_embeddings/ 7 | 8 | #rm -rf $OUTPUT_DIR 9 | python -m model.run_unsupervised_pretraining \ 10 | --seed=0 \ 11 | --do_train=true \ 12 | --input_file=${DATA_FILE} \ 13 | --vocab_file=${BERT_BASE_DIR}/vocab.txt \ 14 | --bert_config_file=${BERT_BASE_DIR}/bert_config.json \ 15 | --output_dir=${OUTPUT_DIR} \ 16 | --max_seq_length=256 \ 17 | --train_batch_size=16 \ 18 | --learning_rate=3e-5 \ 19 | --num_warmup_steps 200 \ 20 | --num_train_steps=175000 \ 21 | --save_checkpoints_steps 5000 \ 22 | --keep_checkpoints 3 -------------------------------------------------------------------------------- /src/result_processing/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | np.random.seed(0) 4 | 5 | def convert_str_columns_to_float(df): 6 | df['expected_outcome_st_treatment'] = df['expected_outcome_st_treatment'].str[1:-1] 7 | df['expected_outcome_st_treatment'] = df['expected_outcome_st_treatment'].astype(np.float64) 8 | 9 | df['expected_outcome_st_no_treatment'] = df['expected_outcome_st_no_treatment'].str[1:-1] 10 | df['expected_outcome_st_no_treatment'] = df['expected_outcome_st_no_treatment'].astype(np.float64) 11 | return df 12 | 13 | 14 | def tokenize_documents(documents,max_df0=0.8, min_df0=0.01,print_vocabulary=False,outfolder=None,output_vocabulary_fname='vocabulary.dat'): 15 | from nltk.corpus import stopwords 16 | ''' 17 | From a list of documents raw text build a matrix DxV 18 | D: number of docs 19 | V: size of the vocabulary, i.e. 
number of unique terms found in the whole set of docs 20 | ''' 21 | stop = stopwords.words('english') 22 | count_vect = CountVectorizer(stop_words=stop,max_df=max_df0, min_df=min_df0) 23 | corpus = count_vect.fit_transform(documents) 24 | vocabulary = count_vect.get_feature_names() 25 | 26 | return corpus,vocabulary,count_vect 27 | 28 | 29 | def assign_split(df, num_splits=10, col_to_add='split'): 30 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 31 | return df 32 | 33 | 34 | def filter_imbalanced_terms(df, term_counts, imbalance=0.1, key='post_index'): 35 | t_indices = [] 36 | n_terms = term_counts.shape[1] 37 | for t in range(n_terms): 38 | ind_occur = np.nonzero(term_counts[:,t])[0] 39 | subset = df[df[key].isin(ind_occur)] 40 | if subset.shape[0] != 0: 41 | prop_men = subset[subset.treatment==1].shape[0]/subset.shape[0] 42 | prop_women = subset[subset.treatment==0].shape[0]/subset.shape[0] 43 | if abs(prop_women-prop_men)>=imbalance: 44 | t_indices.append(t) 45 | return t_indices 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/result_processing/process_predictions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from reddit.data_cleaning import load_reddit, process_text_length 4 | import pandas as pd 5 | import numpy as np 6 | from random import sample 7 | import matplotlib.pyplot as plt 8 | from scipy.stats import pearsonr 9 | 10 | from .helpers import convert_str_columns_to_float 11 | 12 | 13 | def plot_covariate_proportions_per_stratum(treated, control, num_bins, covariate='subreddit'): 14 | cov_vals = treated[covariate].values 15 | n_groups = num_bins 16 | 17 | for val in cov_vals: 18 | # data to plot 19 | treat_props = treated.loc[treated[covariate] == val, 'count'].values 20 | control_props = control.loc[control[covariate] == val, 'count'].values 21 | 22 | # create plot 23 | fig, ax = plt.subplots() 24 | index = np.arange(n_groups) 25 | bar_width = 0.3 26 | opacity = 0.8 27 | 28 | rects1 = plt.bar(index, treat_props, bar_width, 29 | alpha=opacity, 30 | color='b', 31 | label='Treated Units') 32 | 33 | rects2 = plt.bar(index + bar_width, control_props, bar_width, 34 | alpha=opacity, 35 | color='g', 36 | label='Control Units') 37 | 38 | plt.ylim((0.0,1.0)) 39 | plt.xlabel('Stratas') 40 | plt.ylabel('Proportions of posts in ' + covariate + ':' + val) 41 | plt.xticks(index + bar_width, tuple(range(1,num_bins+1))) 42 | plt.legend() 43 | 44 | plt.tight_layout() 45 | plt.savefig(os.path.join(log_dir, 'proportions_for_' + covariate + '_' + val + '.png')) 46 | 47 | def normalize(df, col): 48 | vals = df[col].values 49 | min_col = vals.min() 50 | max_col = vals.max() 51 | df[col] = (df[col] - min_col)/(max_col-min_col) 52 | return df 53 | 54 | 55 | def get_covariate_proportions(stratified_df, covariate='subreddit'): 56 | counts_df = stratified_df.groupby(['strata', covariate]).size().reset_index(name="count") 57 | total_by_strata = stratified_df.groupby("strata").size().reset_index(name="total") 58 | counts_df = counts_df.merge(total_by_strata, how='inner', on='strata') 59 | counts_df['count'] /= counts_df['total'] 60 | return counts_df 61 | 62 | 63 | def get_text_results(reddit_df, result_df, sub=None): 64 | indices = result_df['index'].values 65 | result_df = reddit_df.loc[indices, ['subreddit', 'post_text', 'author']] 66 | 67 | if sub: 68 | result_df = result_df[result_df.subreddit.isin([sub])] 69 | 70 | return result_df 71 | 
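# print_example_posts (below) prints a small random sample of (subreddit, post text, author) tuples for qualitative spot checks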
72 | 73 | def print_example_posts(sub_text_df, n=10): 74 | post_list = [tuple(val) for val in sub_text_df.values] 75 | random_posts = sample(post_list, n) 76 | print("*"*10 + "Examples" + "*"*10) 77 | for post in random_posts: 78 | print("Subreddit:", post[0]) 79 | print("-"*40) 80 | print("Text:", post[1]) 81 | print("-"*40) 82 | print("Author:", post[2]) 83 | print("*"*40) 84 | 85 | 86 | def stratify_by_value(df, num_bins=10, sort_by='treatment_probability', col_to_add='strata'): 87 | values = df[sort_by].values 88 | min_val = values.min() 89 | max_val = values.max() 90 | interval = (max_val-min_val)/num_bins 91 | bins = np.arange(min_val, max_val, step=interval) 92 | bin_indices = np.digitize(values, bins) 93 | df[col_to_add] = bin_indices 94 | return df 95 | 96 | 97 | def main(): 98 | num_examples_to_print=5 99 | num_bins = 5 100 | 101 | predictions_file = os.path.join(log_dir, 'predict', 'test_results_all.tsv') 102 | predict_df = pd.read_csv(predictions_file, delimiter='\t') 103 | predict_df = convert_str_columns_to_float(predict_df) 104 | predict_df = predict_df.rename(columns={'index':'post_index'}) 105 | print(predict_df) 106 | 107 | treated = predict_df[predict_df.treatment == 1] 108 | control = predict_df[predict_df.treatment == 0] 109 | 110 | treated_stratified = stratify_by_value(treated, num_bins=num_bins) 111 | control_stratified = stratify_by_value(control, num_bins=num_bins) 112 | 113 | if res_type == 'subreddit': 114 | treated_cov_prop = get_covariate_proportions(treated_stratified) 115 | control_cov_prop = get_covariate_proportions(control_stratified) 116 | 117 | plot_covariate_proportions_per_stratum(treated_cov_prop, control_cov_prop, num_bins) 118 | 119 | for i in range(1,num_bins+1): 120 | print("*"*20, "Proportions for stratum:", i, "*"*20) 121 | print("-"*10, "Treated:", "-"*10) 122 | print(treated_cov_prop[treated_cov_prop.strata == i]) 123 | 124 | print("-"*10, "Control:", "-"*10) 125 | print(control_cov_prop[control_cov_prop.strata == i]) 126 | 127 | elif res_type == 'length': 128 | text = load_reddit() 129 | text = process_text_length(text) 130 | text = normalize(text, 'post_length') 131 | 132 | treated = treated.merge(text, left_on='post_index', right_index=True, how='inner') 133 | control = control.merge(text, left_on='post_index', right_index=True, how='inner') 134 | 135 | treated_corr = pearsonr(treated.post_length.values, treated.treatment_probability.values) 136 | control_corr = pearsonr(control.post_length.values, control.treatment_probability.values) 137 | print("Corr. between treated and post length", treated_corr) 138 | print("Corr. 
between control and post length", control_corr) 139 | 140 | 141 | # binned_post_length = stratify_by_value(text, num_bins=20, sort_by='post_length', col_to_add='length_bin') 142 | 143 | # columns_to_keep = treated_stratified.columns.tolist().extend('length_bin') 144 | # treated_text = treated_stratified.merge(binned_post_length, left_on='post_index', right_index=True, how='inner')# [columns_to_keep] 145 | # control_text = control_stratified.merge(binned_post_length, left_on='post_index', right_index=True, how='inner')#[columns_to_keep] 146 | 147 | # treated_cov_prop = get_covariate_proportions(treated_text, covariate='length_bin') 148 | # control_cov_prop = get_covariate_proportions(control_text, covariate='length_bin') 149 | 150 | # for i in range(1,num_bins+1): 151 | # print("*"*20, "Proportions for stratum:", i, "*"*20) 152 | # print("-"*10, "Treated:", "-"*10) 153 | # print(treated_cov_prop[treated_cov_prop.strata == i]) 154 | 155 | # print("-"*10, "Control:", "-"*10) 156 | # print(control_cov_prop[control_cov_prop.strata == i]) 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument("--log-dir", action="store", default="../logdir/simulated_training_1.0_1.0_1.0") 163 | parser.add_argument("--result-type", action="store", default="subreddit") 164 | args = parser.parse_args() 165 | log_dir = args.log_dir 166 | res_type = args.result_type 167 | 168 | main() -------------------------------------------------------------------------------- /src/result_processing/prop_sim_plotting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | import result_processing.compute_att as att 5 | import pandas as pd 6 | 7 | 8 | def make_reddit_prop_plt(): 9 | sns.set() 10 | prop_expt = pd.DataFrame(att.process_propensity_experiment()) 11 | 12 | prop_expt = prop_expt[['exog', 'plugin', 'one_step_tmle', 'very_naive']] 13 | prop_expt = prop_expt.rename(index=str, columns={'exog': 'Exogeneity', 14 | 'very_naive': 'Unadjusted', 15 | 'plugin': 'Plug-in', 16 | 'one_step_tmle': 'TMLE'}) 17 | prop_expt = prop_expt.set_index('Exogeneity') 18 | 19 | plt.figure(figsize=(4.75, 3.00)) 20 | # plt.figure(figsize=(2.37, 1.5)) 21 | sns.scatterplot(data=prop_expt, legend='brief', s=75) 22 | plt.xlabel("Exogeneity", fontfamily='monospace') 23 | plt.ylabel("NDE Estimate", fontfamily='monospace') 24 | plt.tight_layout() 25 | 26 | fig_dir = '../output/figures' 27 | os.makedirs(fig_dir, exist_ok=True) 28 | plt.savefig(os.path.join(fig_dir,'reddit_propensity.pdf')) 29 | 30 | 31 | def main(): 32 | make_reddit_prop_plt() 33 | 34 | 35 | if __name__ == '__main__': 36 | main() -------------------------------------------------------------------------------- /src/semi_parametric_estimation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/causal-text-embeddings/25488e798c3985ca7f7070290f4192b9dbc73ee0/src/semi_parametric_estimation/__init__.py -------------------------------------------------------------------------------- /src/semi_parametric_estimation/ate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import logit, expit 3 | from scipy.optimize import minimize 4 | 5 | from .helpers import truncate_by_g, mse, cross_entropy, truncate_all_by_g 6 | from .att import att_estimates 7 | 8 | """ 9 | Note: 
the standard deviations reported by this methods are actually standard deviations conditioned on the nuisance 10 | parameters. 11 | 12 | That is, we do not account for variability in the estimation of Q and g 13 | """ 14 | 15 | 16 | def _perturbed_model_bin_outcome(q_t0, q_t1, g, t, eps): 17 | """ 18 | Helper for psi_tmle_bin_outcome 19 | 20 | Returns q_\eps (t,x) 21 | (i.e., value of perturbed predictor at t, eps, x; where q_t0, q_t1, g are all evaluated at x 22 | """ 23 | h = t * (1./g) - (1.-t) / (1. - g) 24 | full_lq = (1.-t)*logit(q_t0) + t*logit(q_t1) # logit predictions from unperturbed model 25 | logit_perturb = full_lq + eps * h 26 | return expit(logit_perturb) 27 | 28 | 29 | def psi_tmle_bin_outcome(q_t0, q_t1, g, t, y, truncate_level=0.05): 30 | # TODO: make me useable 31 | # solve the perturbation problem 32 | 33 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 34 | 35 | eps_hat = minimize(lambda eps: cross_entropy(y, _perturbed_model_bin_outcome(q_t0, q_t1, g, t, eps)) 36 | , 0., method='Nelder-Mead') 37 | 38 | eps_hat = eps_hat.x[0] 39 | 40 | def q1(t_cf): 41 | return _perturbed_model_bin_outcome(q_t0, q_t1, g, t_cf, eps_hat) 42 | 43 | ite = q1(np.ones_like(t)) - q1(np.zeros_like(t)) 44 | 45 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 46 | 47 | 48 | def psi_tmle_cont_outcome(q_t0, q_t1, g, t, y, eps_hat=None, truncate_level=0.05): 49 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 50 | 51 | g_loss = mse(g, t) 52 | h = t * (1.0/g) - (1.0-t) / (1.0 - g) 53 | full_q = (1.0-t)*q_t0 + t*q_t1 # predictions from unperturbed model 54 | 55 | if eps_hat is None: 56 | eps_hat = np.sum(h*(y-full_q)) / np.sum(np.square(h)) 57 | 58 | def q1(t_cf): 59 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g) 60 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model 61 | return full_q + eps_hat * h_cf 62 | 63 | ite = q1(np.ones_like(t)) - q1(np.zeros_like(t)) 64 | psi_tmle = np.mean(ite) 65 | 66 | # standard deviation computation relies on asymptotic expansion of non-parametric estimator, see van der Laan and Rose p 96 67 | ic = h*(y-q1(t)) + ite - psi_tmle 68 | psi_tmle_std = np.std(ic) / np.sqrt(t.shape[0]) 69 | initial_loss = np.mean(np.square(full_q-y)) 70 | final_loss = np.mean(np.square(q1(t)-y)) 71 | 72 | # print("tmle epsilon_hat: ", eps_hat) 73 | # print("initial risk: {}".format(initial_loss)) 74 | # print("final risk: {}".format(final_loss)) 75 | 76 | return psi_tmle, psi_tmle_std, eps_hat, initial_loss, final_loss, g_loss 77 | 78 | 79 | def psi_iptw(q_t0, q_t1, g, t, y, truncate_level=0.05): 80 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 81 | 82 | ite=(t / g - (1-t) / (1-g))*y 83 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 84 | 85 | 86 | def psi_aiptw(q_t0, q_t1, g, t, y, truncate_level=0.05): 87 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 88 | 89 | full_q = q_t0 * (1 - t) + q_t1 * t 90 | h = t * (1.0 / g) - (1.0 - t) / (1.0 - g) 91 | ite = h * (y - full_q) + q_t1 - q_t0 92 | 93 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 94 | 95 | 96 | def psi_q_only(q_t0, q_t1, g, t, y, truncate_level=0.): 97 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 98 | ite = (q_t1 - q_t0) 99 | return np.mean(ite), np.std(ite) / np.sqrt(t.shape[0]) 100 | 101 | 102 | def psi_very_naive(t, y): 103 | psi_hat = y[t == 1].mean() - y[t == 0].mean() 104 | psi_std = np.sqrt(np.var(y[t == 1]) 
/ np.sum(t) + np.var(y[t == 0]) / np.sum(1-t)) 105 | return psi_hat, psi_std 106 | 107 | 108 | def ates_from_atts(q_t0, q_t1, g, t, y, truncate_level=0.05): 109 | """ 110 | Sanity check code: ATE = ATT_1*P(T=1) + ATT_0*P(T=0) 111 | 112 | :param q_t0: 113 | :param q_t1: 114 | :param g: 115 | :param t: 116 | :param y: 117 | :param truncate_level: 118 | :return: 119 | """ 120 | 121 | prob_t = t.mean() 122 | 123 | att = att_estimates(q_t0, q_t1, g, t, y, prob_t, truncate_level=truncate_level) 124 | att_flip = att_estimates(q_t1, q_t0, 1.-g, 1-t, y, 1.-prob_t, truncate_level=truncate_level) 125 | 126 | ates = {} 127 | for k in att.keys(): 128 | # note: minus because the flip computes E[Y^0 - Y^1 | T=0] 129 | ates[k] = att[k]*prob_t - att_flip[k]*(1.-prob_t) 130 | # ates[k] = att_flip[k] 131 | 132 | return ates 133 | 134 | 135 | def ate_estimates(q_t0, q_t1, g, t, y, truncate_level=0.05): 136 | 137 | very_naive = psi_very_naive(t,y) 138 | q_only = psi_q_only(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 139 | iptw = psi_iptw(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 140 | aiptw = psi_aiptw(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 141 | tmle = psi_tmle_cont_outcome(q_t0, q_t1, g, t, y, truncate_level=truncate_level)[0:1] 142 | bin_tmle = psi_tmle_bin_outcome(q_t0, q_t1, g, t, y, truncate_level=truncate_level) 143 | 144 | estimates = {'very_naive': very_naive, 145 | 'q_only': q_only, 146 | 'iptw': iptw, 147 | 'tmle': tmle, 148 | 'bin-tmle': bin_tmle, 149 | 'aiptw': aiptw} 150 | 151 | return estimates 152 | 153 | 154 | 155 | def main(): 156 | pass 157 | 158 | 159 | if __name__ == "__main__": 160 | main() 161 | -------------------------------------------------------------------------------- /src/semi_parametric_estimation/att.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import logit, expit 3 | from scipy.optimize import minimize 4 | 5 | from .helpers import truncate_all_by_g, cross_entropy, mse 6 | 7 | 8 | def _perturbed_model(q_t0, q_t1, g, t, q, eps): 9 | # helper function for psi_tmle 10 | 11 | h1 = t / q - ((1 - t) * g) / (q * (1 - g)) 12 | full_q = (1.0 - t) * q_t0 + t * q_t1 13 | perturbed_q = full_q - eps * h1 14 | 15 | def q1(t_cf, epsilon): 16 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g) 17 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model 18 | return full_q - epsilon * h_cf 19 | 20 | psi_init = np.mean(t * (q1(np.ones_like(t), eps) - q1(np.zeros_like(t), eps))) / q 21 | h2 = (q_t1 - q_t0 - psi_init) / q 22 | perturbed_g = expit(logit(g) - eps * h2) 23 | 24 | return perturbed_q, perturbed_g 25 | 26 | 27 | def psi_tmle(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 28 | """ 29 | Near canonical van der Laan TMLE, except we use a 30 | 1 dimension epsilon shared between the Q and g update models 31 | 32 | """ 33 | 34 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 35 | 36 | def _perturbed_loss(eps): 37 | pert_q, pert_g = _perturbed_model(q_t0, q_t1, g, t, prob_t, eps) 38 | loss = (np.square(y - pert_q)).mean() + cross_entropy(t, pert_g) 39 | return loss 40 | 41 | eps_hat = minimize(_perturbed_loss, 0.) 
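# scipy.optimize.minimize returns an OptimizeResult; the fitted shared epsilon is read off its .x array below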
42 | eps_hat = eps_hat.x[0] 43 | 44 | def q2(t_cf, epsilon): 45 | h_cf = t_cf * (1.0 / g) - (1.0 - t_cf) / (1.0 - g) 46 | full_q = (1.0 - t_cf) * q_t0 + t_cf * q_t1 # predictions from unperturbed model 47 | return full_q - epsilon * h_cf 48 | 49 | psi_tmle = np.mean(t * (q2(np.ones_like(t), eps_hat) - q2(np.zeros_like(t), eps_hat))) / prob_t 50 | return psi_tmle 51 | 52 | 53 | def make_one_step_tmle(prob_t, deps_default=0.001): 54 | "Make a function that computes the 1-step TMLE ala https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4912007/" 55 | 56 | def _perturb_q(q_t0, q_t1, g, t, deps=deps_default): 57 | h1 = t / prob_t - ((1 - t) * g) / (prob_t * (1 - g)) 58 | 59 | full_q = (1.0 - t) * q_t0 + t * q_t1 60 | perturbed_q = full_q - deps * h1 61 | # perturbed_q= expit(logit(full_q) - deps*h1) 62 | return perturbed_q 63 | 64 | def _perturb_g(q_t0, q_t1, g, deps=deps_default): 65 | h2 = (q_t1 - q_t0 - _psi(q_t0, q_t1, g)) / prob_t 66 | perturbed_g = expit(logit(g) - deps * h2) 67 | return perturbed_g 68 | 69 | def _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=deps_default): 70 | # get the values of Q_{eps+deps} and g_{eps+deps} by using the recursive formula 71 | 72 | perturbed_g = _perturb_g(q0_old, q1_old, g_old, deps=deps) 73 | 74 | perturbed_q = _perturb_q(q0_old, q1_old, perturbed_g, t, deps=deps) 75 | perturbed_q0 = _perturb_q(q0_old, q1_old, perturbed_g, np.zeros_like(t), deps=deps) 76 | perturbed_q1 = _perturb_q(q0_old, q1_old, perturbed_g, np.ones_like(t), deps=deps) 77 | 78 | return perturbed_q0, perturbed_q1, perturbed_q, perturbed_g 79 | 80 | def _loss(q, g, y, t): 81 | # compute the new loss 82 | q_loss = mse(y, q) 83 | g_loss = cross_entropy(t, g) 84 | return q_loss + g_loss 85 | 86 | def _psi(q0, q1, g): 87 | return np.mean(g*(q1 - q0)) / prob_t 88 | 89 | def tmle(q_t0, q_t1, g, t, y, truncate_level=0.05, deps=deps_default): 90 | """ 91 | Computes the tmle for the ATT (equivalently: direct effect) 92 | 93 | :param q_t0: 94 | :param q_t1: 95 | :param g: 96 | :param t: 97 | :param y: 98 | :param truncate_level: 99 | :param deps: 100 | :return: 101 | """ 102 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 103 | 104 | eps = 0.0 105 | 106 | q0_old = q_t0 107 | q1_old = q_t1 108 | g_old = g 109 | 110 | # determine whether epsilon should go up or down 111 | # translated blindly from line 299 of https://github.com/cran/tmle/blob/master/R/tmle.R 112 | h1 = t / prob_t - ((1 - t) * g) / (prob_t * (1 - g)) 113 | full_q = (1.0 - t) * q_t0 + t * q_t1 114 | deriv = np.mean(prob_t*h1*(y-full_q) + t*(q_t1 - q_t0 - _psi(q_t0, q_t1, g))) 115 | if deriv > 0: 116 | deps = -deps 117 | 118 | # run until loss starts going up 119 | # old_loss = np.inf # this is the thing used by Rose' implementation 120 | old_loss = _loss(full_q, g, y, t) 121 | 122 | while True: 123 | perturbed_q0, perturbed_q1, perturbed_q, perturbed_g = _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=deps) 124 | 125 | new_loss = _loss(perturbed_q, perturbed_g, y, t) 126 | 127 | # debugging 128 | # print("Psi: {}".format(_psi(q0_old, q1_old, g_old))) 129 | # print("new_loss is: ", new_loss, "old_loss is ", old_loss) 130 | 131 | # # if this is the first step, decide whether to go down or up from eps=0.0 132 | # if eps == 0.0: 133 | # _, _, perturbed_q_neg, perturbed_g_neg = _perturb_g_and_q(q0_old, q1_old, g_old, t, deps=-deps) 134 | # neg_loss = _loss(perturbed_q_neg, perturbed_g_neg, y, t) 135 | # 136 | # if neg_loss < new_loss: 137 | # return tmle(q_t0, q_t1, g, t, y, deps=-1.0 * deps) 138 | 139 | # 
check if converged 140 | if new_loss > old_loss: 141 | if eps == 0.: 142 | print("Warning: no update occurred (is deps too big?)") 143 | return _psi(q0_old, q1_old, g_old) 144 | else: 145 | eps += deps 146 | 147 | q0_old = perturbed_q0 148 | q1_old = perturbed_q1 149 | g_old = perturbed_g 150 | 151 | old_loss = new_loss 152 | 153 | return tmle 154 | 155 | 156 | def psi_q_only(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 157 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 158 | 159 | ite_t = (q_t1 - q_t0)[t == 1] 160 | estimate = ite_t.mean() 161 | return estimate 162 | 163 | 164 | def psi_plugin(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 165 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 166 | 167 | ite_t = g*(q_t1 - q_t0)/prob_t 168 | estimate = ite_t.mean() 169 | return estimate 170 | 171 | 172 | def psi_aiptw(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05): 173 | # the robust ATT estimator described in eqn 3.9 of 174 | # https://www.econstor.eu/bitstream/10419/149795/1/869216953.pdf 175 | 176 | q_t0, q_t1, g, t, y = truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level) 177 | estimate = (t*(y-q_t0) - (1-t)*(g/(1-g))*(y-q_t0)).mean() / prob_t 178 | 179 | return estimate 180 | 181 | 182 | def psi_very_naive(t, y): 183 | return y[t == 1].mean() - y[t == 0].mean() 184 | 185 | 186 | def att_estimates(q_t0, q_t1, g, t, y, prob_t, truncate_level=0.05, deps=0.0001): 187 | 188 | one_step_tmle = make_one_step_tmle(prob_t, deps_default=deps) 189 | 190 | very_naive = psi_very_naive(t,y) 191 | q_only = psi_q_only(q_t0, q_t1, g, t, y, prob_t, truncate_level) 192 | plugin = psi_plugin(q_t0, q_t1, g, t, y, prob_t, truncate_level) 193 | aiptw = psi_aiptw(q_t0, q_t1, g, t, y, prob_t, truncate_level) 194 | one_step_tmle = one_step_tmle(q_t0, q_t1, g, t, y, truncate_level) # note different signature 195 | 196 | estimates = {'very_naive': very_naive, 'q_only': q_only, 'plugin': plugin, 'one_step_tmle': one_step_tmle, 'aiptw': aiptw} 197 | 198 | return estimates 199 | -------------------------------------------------------------------------------- /src/semi_parametric_estimation/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import logit 3 | 4 | import sklearn.linear_model as lm 5 | 6 | 7 | def calibrate_g(g, t): 8 | """ 9 | Improve calibation of propensity scores by fitting 1 parameter (temperature) logistic regression on heldout data 10 | 11 | :param g: raw propensity score estimates 12 | :param t: treatment assignments 13 | :return: 14 | """ 15 | 16 | logit_g = logit(g).reshape(-1,1) 17 | calibrator = lm.LogisticRegression(fit_intercept=False, C=1e6, solver='lbfgs') # no intercept or regularization 18 | calibrator.fit(logit_g, t) 19 | calibrated_g = calibrator.predict_proba(logit_g)[:,1] 20 | return calibrated_g 21 | 22 | 23 | def truncate_by_g(attribute, g, level=0.1): 24 | keep_these = np.logical_and(g >= level, g <= 1.-level) 25 | 26 | return attribute[keep_these] 27 | 28 | 29 | def truncate_all_by_g(q_t0, q_t1, g, t, y, truncate_level=0.05): 30 | """ 31 | Helper function to clean up nuisance parameter estimates. 
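Keeps only units whose propensity estimate g lies in [truncate_level, 1 - truncate_level]; all arrays are filtered with the same mask so they stay aligned.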
32 | 33 | """ 34 | 35 | orig_g = np.copy(g) 36 | 37 | q_t0 = truncate_by_g(np.copy(q_t0), orig_g, truncate_level) 38 | q_t1 = truncate_by_g(np.copy(q_t1), orig_g, truncate_level) 39 | g = truncate_by_g(np.copy(g), orig_g, truncate_level) 40 | t = truncate_by_g(np.copy(t), orig_g, truncate_level) 41 | y = truncate_by_g(np.copy(y), orig_g, truncate_level) 42 | 43 | return q_t0, q_t1, g, t, y 44 | 45 | 46 | 47 | def cross_entropy(y, p): 48 | return -np.mean((y*np.log(p) + (1.-y)*np.log(1.-p))) 49 | 50 | 51 | def mse(x, y): 52 | return np.mean(np.square(x-y)) 53 | -------------------------------------------------------------------------------- /src/supervised_lda/add_split_to_simulations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 8, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "base_sim_dir = '../../dat/sim/'\n", 21 | "datasets = ['reddit_subreddit_based/subreddits[13, 6, 8]', 'peerread_buzzytitle_based']\n", 22 | "mode = 'modesimple'\n", 23 | "\n", 24 | "for dataset in datasets:\n", 25 | " simdir = os.path.join(base_sim_dir, dataset, mode)\n", 26 | " for simfile in os.listdir(simdir):\n", 27 | " df = pd.read_csv(os.path.join(simdir, simfile), sep='\\t')\n", 28 | " df['split'] = np.random.randint(0, 10, size=df.shape[0])\n", 29 | " df.to_csv(os.path.join(simdir, simfile),sep='\\t')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.6.8" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /src/supervised_lda/compute_estimates.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | import numpy as np 3 | import os 4 | import argparse 5 | import pandas as pd 6 | 7 | def main(): 8 | outdir = os.path.join('..', 'out', args.data, args.experiment) 9 | for sim in os.listdir(outdir): 10 | mean_estimates = {'very_naive': [], 'q_only': [], 'plugin': [], 'one_step_tmle': [], 'aiptw': []} 11 | for split in os.listdir(os.path.join(outdir, sim)): 12 | if args.num_splits is not None: 13 | # print("ignoring split", split) 14 | if int(split) >= int(args.num_splits): 15 | continue 16 | array = np.load(os.path.join(outdir, sim, split, 'predictions.npz')) 17 | g = array['g'] 18 | q0 = array['q0'] 19 | q1 = array['q1'] 20 | y = array['y'] 21 | t = array['t'] 22 | estimates = att_estimates(q0, q1, g, t, y, t.mean(), truncate_level=0.03) 23 | for est, att in estimates.items(): 24 | mean_estimates[est].append(att) 25 | 26 | if args.data == 'reddit': 27 | sim = sim.replace('beta01.0.', '') 28 | options = sim.split('.0.') 29 | p2 = options[0].replace('beta1', '') 30 | p3 = 
options[1].replace('gamma', '') 31 | 32 | print("------ Simulation setting: Confounding strength =", p2, "; Variance:", p3, "------") 33 | print("True effect = 1.0") 34 | else: 35 | ground_truth_map = {'1.0':0.06, '5.0':0.06, '25.0':0.03} 36 | print("------ Simulation setting: Confounding strength =", sim) 37 | print("True effect = ", ground_truth_map[sim]) 38 | 39 | 40 | for est, atts in mean_estimates.items(): 41 | print('\t', est, np.round(np.mean(atts), 3), "+/-", np.round(np.std(atts),3)) 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--data", action="store", default="reddit") 47 | parser.add_argument("--experiment", action="store", default="base_model") 48 | parser.add_argument("--num-splits", action="store", default=None) 49 | args = parser.parse_args() 50 | 51 | main() -------------------------------------------------------------------------------- /src/supervised_lda/helpers.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | 9 | class LemmaTokenizer(object): 10 | def __init__(self): 11 | self.wnl = WordNetLemmatizer() 12 | def __call__(self, articles): 13 | stop = stopwords.words('english') 14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop] 15 | 16 | def filter_by_subreddit(reddit, subs=None): 17 | if not subs: 18 | return reddit.index.values 19 | else: 20 | return reddit[reddit.subreddit.isin(subs)].index.values 21 | 22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.0005): 23 | from nltk.corpus import stopwords 24 | ''' 25 | From a list of documents raw text build a matrix DxV 26 | D: number of docs 27 | V: size of the vocabulary, i.e. 
number of unique terms found in the whole set of docs 28 | ''' 29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0) 30 | corpus = count_vect.fit_transform(documents) 31 | vocabulary = count_vect.get_feature_names() 32 | 33 | return corpus,vocabulary,count_vect 34 | 35 | def assign_dev_split(num_docs, percentage=0.05): 36 | indices = np.arange(num_docs) 37 | np.random.shuffle(indices) 38 | size = int(indices.shape[0]*percentage) 39 | dev = indices[:size] 40 | return dev 41 | 42 | def learn_topics(X, X_dev, K=50): 43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1) 44 | print("Fitting", K, "topics...") 45 | lda.fit(X) 46 | score = lda.perplexity(X_dev) 47 | print("Log likelihood:", score) 48 | topics = lda.components_ 49 | return score, lda, topics 50 | 51 | def show_topics(vocab, topics, n_words=20): 52 | topic_keywords = [] 53 | for topic_weights in topics: 54 | top_keyword_locs = (-topic_weights).argsort()[:n_words] 55 | topic_keywords.append(vocab.take(top_keyword_locs)) 56 | 57 | df_topic_keywords = pd.DataFrame(topic_keywords) 58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])] 59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])] 60 | return df_topic_keywords 61 | 62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'): 63 | filtered_indices = filtered_df[on].values 64 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 65 | embeddings = doc_embeddings[doc_idx, :] 66 | return embeddings 67 | 68 | def filter_document_terms(filtered_df, counts, index_mapping, on='post_index'): 69 | filtered_indices = filtered_df[on].values 70 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 71 | filtered_counts = counts[doc_idx, :] 72 | return filtered_counts 73 | 74 | def make_index_mapping(df, on='post_index', convert_to_int=True): 75 | if on=='index': 76 | indices = df.index.values 77 | else: 78 | indices = df[on].values 79 | 80 | if convert_to_int: 81 | return {int(ind):i for (i,ind) in enumerate(indices)} 82 | 83 | return {ind:i for (i,ind) in enumerate(indices)} 84 | 85 | def assign_split(df, num_splits=10, col_to_add='split'): 86 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 87 | return df 88 | -------------------------------------------------------------------------------- /src/supervised_lda/peerread_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | from sklearn.metrics import mean_squared_error as mse 7 | import argparse 8 | import sys 9 | from supervised_lda.supervised_topic_model import SupervisedTopicModel 10 | from supervised_lda import run_supervised_tm 11 | from scipy import sparse 12 | from sklearn.linear_model import LogisticRegression, Ridge 13 | from scipy.special import logit 14 | 15 | def load_peerread(path='../dat/PeerRead/'): 16 | return pd.read_csv(path + 'proc_abstracts.csv') 17 | 18 | def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'): 19 | count_filename = path + 'term_counts' 20 | vocab_filename = path + 'vocab' 21 | 22 | if os.path.exists(count_filename + '.npz') and not force_redo: 23 | return 
sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy') 24 | 25 | post_docs = df[text_col].values 26 | counts, vocab, _ = tokenize_documents(post_docs) 27 | sparse.save_npz(count_filename, counts) 28 | np.save(vocab_filename, vocab) 29 | return counts.toarray(), np.array(vocab) 30 | 31 | def compute_ground_truth_treatment_effect(df): 32 | y1 = df['y1'] 33 | y0 = df['y0'] 34 | return y1.mean() - y0.mean() 35 | 36 | def load_simulated_data(): 37 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 38 | return sim_df 39 | 40 | def fit_model(doc_embeddings, labels, is_binary=False): 41 | if is_binary: 42 | model = LogisticRegression(solver='liblinear') 43 | else: 44 | model = Ridge() 45 | model.fit(doc_embeddings, labels) 46 | return model 47 | 48 | def main(): 49 | if dat_dir: 50 | peerread = load_peerread(path=dat_dir) 51 | counts,vocab = load_term_counts(peerread,path=dat_dir) 52 | else: 53 | peerread = load_peerread() 54 | counts,vocab = load_term_counts(peerread) 55 | 56 | indices = peerread['paper_id'].values 57 | index_mapping = make_index_mapping(peerread, on='index') 58 | 59 | sim_df = load_simulated_data() 60 | 61 | train_df = sim_df[sim_df.split != split] 62 | predict_df = sim_df[sim_df.split == split] 63 | tr_treatment_labels = train_df.treatment.values 64 | tr_outcomes = train_df.outcome.values 65 | predict_treatment = predict_df.treatment.values 66 | predict_outcomes = predict_df.outcome.values 67 | 68 | tr_counts = filter_document_terms(train_df, counts, index_mapping, on='id') 69 | predict_counts = filter_document_terms(predict_df, counts, index_mapping, on='id') 70 | 71 | num_documents = tr_counts.shape[0] 72 | vocab_size = tr_counts.shape[1] 73 | model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model) 74 | 75 | run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, dtype='binary', 76 | num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss) 77 | 78 | if use_supervised_loss: 79 | propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts, dtype='binary') 80 | else: 81 | tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts) 82 | treated = tr_treatment_labels == 1 83 | out_treat = tr_outcomes[treated] 84 | out_no_treat = tr_outcomes[~treated] 85 | q0_embeddings = tr_doc_embeddings[~treated,:] 86 | q1_embeddings = tr_doc_embeddings[treated,:] 87 | q0_model = fit_model(q0_embeddings, out_no_treat, is_binary=True) 88 | q1_model = fit_model(q1_embeddings, out_treat, is_binary=True) 89 | g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True) 90 | 91 | pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts) 92 | propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1] 93 | expected_outcome_no_treat = q0_model.predict_proba(pred_doc_embeddings)[:,1] 94 | expected_outcome_treat = q1_model.predict_proba(pred_doc_embeddings)[:,1] 95 | 96 | out = os.path.join(outdir, str(split)) 97 | os.makedirs(out, exist_ok=True) 98 | outfile = os.path.join(out, 'predictions') 99 | np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes) 100 | 101 | if __name__ == '__main__': 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument("--dat-dir", action="store", default=None) 104 | parser.add_argument("--outdir", action="store", 
default='../out/') 105 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 106 | parser.add_argument("--mode", action="store", default="simple") 107 | parser.add_argument("--params", action="store", default="1.0") 108 | parser.add_argument("--verbose", action='store_true') 109 | parser.add_argument("--split", action='store', default=0) 110 | parser.add_argument("--num-iters", action="store", default=3000) 111 | parser.add_argument("--num-topics", action='store', default=100) 112 | parser.add_argument("--linear-outcome-model", action='store', default="t") 113 | parser.add_argument("--use-recon-loss", action='store', default="t") 114 | parser.add_argument("--use-supervised-loss", action='store', default="t") 115 | args = parser.parse_args() 116 | 117 | sim_dir = args.sim_dir 118 | outdir = args.outdir 119 | dat_dir = args.dat_dir 120 | verbose = args.verbose 121 | params = args.params 122 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0' 123 | mode = args.mode 124 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv" 125 | num_topics = args.num_topics 126 | split = int(args.split) 127 | linear_outcome_model = True if args.linear_outcome_model == "t" else False 128 | use_supervised_loss = True if args.use_supervised_loss == "t" else False 129 | use_recon_loss = True if args.use_recon_loss == "t" else False 130 | num_iters = int(args.num_iters) 131 | print(use_supervised_loss, use_recon_loss, linear_outcome_model) 132 | 133 | main() -------------------------------------------------------------------------------- /src/supervised_lda/reddit_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates 2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed 3 | from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from supervised_lda.supervised_topic_model import SupervisedTopicModel 8 | from sklearn.linear_model import LogisticRegression, Ridge 9 | from supervised_lda import run_supervised_tm 10 | from sklearn.metrics import mean_squared_error as mse 11 | import argparse 12 | import sys 13 | from scipy.special import logit 14 | from scipy import sparse 15 | 16 | def load_term_counts(reddit, path='../dat/reddit/', force_redo=False): 17 | count_filename = path + 'term_counts' 18 | vocab_filename = path + 'vocab' 19 | 20 | if os.path.exists(count_filename + '.npz') and not force_redo: 21 | return sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy') 22 | 23 | post_docs = reddit['post_text'].values 24 | counts, vocab, _ = tokenize_documents(post_docs) 25 | sparse.save_npz(count_filename, counts) 26 | np.save(vocab_filename, vocab) 27 | return counts.toarray(), np.array(vocab) 28 | 29 | def load_simulated_data(): 30 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 31 | sim_df = sim_df.rename(columns={'index':'post_index'}) 32 | return sim_df 33 | 34 | def drop_empty_posts(counts): 35 | doc_terms = counts.sum(axis=1) 36 | return doc_terms >= 5 37 | 38 | def fit_model(doc_embeddings, labels, is_binary=False): 39 | if is_binary: 40 | model = LogisticRegression(solver='liblinear') 41 | else: 42 | model = Ridge() 43 | model.fit(doc_embeddings, labels) 44 | return model 45 | 46 | def main(): 47 | if dat_dir: 48 | reddit = load_reddit_processed(path=dat_dir) 49 | 
else: 50 | reddit = load_reddit_processed() 51 | 52 | if subs: 53 | reddit = reddit[reddit.subreddit.isin(subs)] 54 | reddit = reddit.dropna(subset=['post_text']) 55 | 56 | 57 | index_mapping = make_index_mapping(reddit, on='orig_index') 58 | if not dat_dir: 59 | counts, vocab = load_term_counts(reddit) 60 | else: 61 | counts, vocab = load_term_counts(reddit, path=dat_dir) 62 | 63 | sim_df = load_simulated_data() 64 | 65 | train_df = sim_df[sim_df.split != split] 66 | predict_df = sim_df[sim_df.split == split] 67 | 68 | tr_treatment_labels = train_df.treatment.values 69 | tr_outcomes = train_df.outcome.values 70 | predict_treatment = predict_df.treatment.values 71 | predict_outcomes = predict_df.outcome.values 72 | 73 | tr_counts = filter_document_terms(train_df, counts, index_mapping) 74 | predict_counts = filter_document_terms(predict_df, counts, index_mapping) 75 | tr_valid = drop_empty_posts(tr_counts) 76 | pred_valid = drop_empty_posts(predict_counts) 77 | tr_counts = tr_counts[tr_valid, :] 78 | predict_counts = predict_counts[pred_valid, :] 79 | 80 | tr_treatment_labels = tr_treatment_labels[tr_valid] 81 | tr_outcomes = tr_outcomes[tr_valid] 82 | predict_treatment = predict_treatment[pred_valid] 83 | predict_outcomes = predict_outcomes[pred_valid] 84 | 85 | num_documents = tr_counts.shape[0] 86 | vocab_size = tr_counts.shape[1] 87 | model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model) 88 | 89 | run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss) 90 | 91 | if use_supervised_loss: 92 | propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts) 93 | else: 94 | tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts) 95 | treated = tr_treatment_labels == 1 96 | out_treat = tr_outcomes[treated] 97 | out_no_treat = tr_outcomes[~treated] 98 | q0_embeddings = tr_doc_embeddings[~treated,:] 99 | q1_embeddings = tr_doc_embeddings[treated,:] 100 | q0_model = fit_model(q0_embeddings, out_no_treat) 101 | q1_model = fit_model(q1_embeddings, out_treat) 102 | g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True) 103 | 104 | pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts) 105 | propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1] 106 | expected_outcome_no_treat = q0_model.predict(pred_doc_embeddings) 107 | expected_outcome_treat = q1_model.predict(pred_doc_embeddings) 108 | 109 | out = os.path.join(outdir, str(split)) 110 | os.makedirs(out, exist_ok=True) 111 | outfile = os.path.join(out, 'predictions') 112 | np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes) 113 | 114 | 115 | if __name__ == '__main__': 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument("--dat-dir", action="store", default=None) 118 | parser.add_argument("--outdir", action="store", default='../out/') 119 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 120 | parser.add_argument("--subs", action="store", default='13,6,8') 121 | parser.add_argument("--mode", action="store", default="simple") 122 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0") 123 | parser.add_argument("--verbose", action='store_true') 124 | parser.add_argument("--num-topics", 
action='store', default=100) 125 | parser.add_argument("--split", action='store', default=0) 126 | parser.add_argument("--num-iters", action="store", default=4000) 127 | # parser.add_argument("--num_splits", action='store', default=10) 128 | parser.add_argument("--linear-outcome-model", action='store', default="t") 129 | parser.add_argument("--use-recon-loss", action='store', default="t") 130 | parser.add_argument("--use-supervised-loss", action='store', default="t") 131 | args = parser.parse_args() 132 | 133 | sim_dir = args.sim_dir 134 | dat_dir = args.dat_dir 135 | outdir = args.outdir 136 | subs = None 137 | if args.subs != '': 138 | subs = [int(s) for s in args.subs.split(',')] 139 | verbose = args.verbose 140 | params = args.params.split(',') 141 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2] 142 | subs_string = ', '.join(args.subs.split(',')) 143 | mode = args.mode 144 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv" 145 | num_iters = int(args.num_iters) 146 | num_topics = int(args.num_topics) 147 | split = int(args.split) 148 | # num_splits = args.num_splits 149 | linear_outcome_model = True if args.linear_outcome_model == "t" else False 150 | use_supervised_loss = True if args.use_supervised_loss == "t" else False 151 | use_recon_loss = True if args.use_recon_loss == "t" else False 152 | 153 | main() -------------------------------------------------------------------------------- /src/supervised_lda/run_supervised_tm.py: -------------------------------------------------------------------------------- 1 | from torch import nn, optim 2 | from torch.nn import functional as F 3 | import torch 4 | # from torch.utils.tensorboard import SummaryWriter 5 | import numpy as np 6 | import argparse 7 | from scipy.special import expit 8 | 9 | def visualize_topics(model, vocab, num_topics, num_words=10): 10 | model.eval() 11 | with torch.no_grad(): 12 | print('#'*100) 13 | print('Visualize topics...') 14 | betas = model.alphas.t() #model.get_beta() 15 | for k in range(num_topics): 16 | beta = betas[k].detach().numpy() 17 | top_words = beta.argsort()[-num_words:] 18 | topic_words = vocab[top_words] 19 | print('Topic {}: {}'.format(k, topic_words)) 20 | 21 | def get_representation(model, docs): 22 | normalized = docs/docs.sum(axis=-1)[:,np.newaxis] 23 | normalized_bow = torch.tensor(normalized, dtype=torch.float) 24 | num_documents = docs.shape[0] 25 | model.eval() 26 | with torch.no_grad(): 27 | doc_representation,_ = model.get_theta(normalized_bow) 28 | embeddings = doc_representation.detach().numpy() 29 | return embeddings 30 | 31 | 32 | def predict(model, docs, dtype='real'): 33 | normalized = docs/docs.sum(axis=-1)[:,np.newaxis] 34 | normalized_bow = torch.tensor(normalized, dtype=torch.float) 35 | num_documents = docs.shape[0] 36 | 37 | treatment_ones = torch.ones(num_documents) 38 | treatment_zeros = torch.zeros(num_documents) 39 | 40 | model.eval() 41 | with torch.no_grad(): 42 | doc_representation,_ = model.get_theta(normalized_bow) 43 | propensity_score = model.predict_treatment(doc_representation).squeeze().detach().numpy() 44 | propensity_score = expit(propensity_score) 45 | expected_outcome_treat = model.predict_outcome_st_treat(doc_representation, treatment_ones).squeeze().detach().numpy() 46 | expected_outcome_no_treat = model.predict_outcome_st_no_treat(doc_representation, treatment_zeros).squeeze().detach().numpy() 47 | 48 | if dtype == 'binary': 49 | expected_outcome_treat = 
expit(expected_outcome_treat) 50 | expected_outcome_no_treat = expit(expected_outcome_no_treat) 51 | 52 | return propensity_score, expected_outcome_treat, expected_outcome_no_treat 53 | 54 | def train(model, docs, treatment_labels, outcomes, dtype='real', num_epochs=20000, lr=0.005, wdecay=1.2e-5,batch_size=1000, use_recon_loss=True, use_sup_loss=True): 55 | optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wdecay) 56 | num_documents = docs.shape[0] 57 | indices = np.arange(num_documents) 58 | np.random.shuffle(indices) 59 | 60 | for e_idx in range(num_epochs): 61 | model.train() 62 | k = e_idx%(num_documents//batch_size) 63 | start_index = k*batch_size 64 | end_index = (k+1)*batch_size 65 | batch = indices[start_index:end_index] 66 | docs_batch = docs[batch,:] 67 | treatment_labels_batch = treatment_labels[batch] 68 | outcomes_batch = outcomes[batch] 69 | normalized_batch = docs_batch/docs_batch.sum(axis=1)[:,np.newaxis] 70 | 71 | outcome_labels = torch.tensor(outcomes_batch, dtype=torch.float) 72 | treat_labels = torch.tensor(treatment_labels_batch, dtype=torch.float) 73 | bow = torch.tensor(docs_batch, dtype=torch.float) 74 | normalized_bow = torch.tensor(normalized_batch, dtype=torch.float) 75 | 76 | optimizer.zero_grad() 77 | model.zero_grad() 78 | 79 | recon_loss, supervised_loss, kld_theta = model(bow, normalized_bow, treat_labels, outcome_labels,dtype=dtype, use_supervised_loss=use_sup_loss) 80 | acc_kl_theta_loss = torch.sum(kld_theta).item() 81 | acc_sup_loss = 0. 82 | acc_loss = 0. 83 | 84 | total_loss = kld_theta #+ recon_loss + supervised_loss 85 | if use_recon_loss: 86 | acc_loss = torch.sum(recon_loss).item() 87 | total_loss += 0.1*recon_loss 88 | if use_sup_loss: 89 | acc_sup_loss = torch.sum(supervised_loss).item() 90 | total_loss += supervised_loss 91 | 92 | total_loss.backward() 93 | optimizer.step() 94 | 95 | print("Acc. 
loss:", acc_loss, "KL loss.:", acc_kl_theta_loss, "Supervised loss:", acc_sup_loss) -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH -A sml 3 | #SBATCH -c 8 4 | #SBATCH --mail-user=dhanya.sridhar@columbia.edu 5 | #SBATCH --mail-type=ALL 6 | 7 | source activate py3.6 8 | 9 | python -m supervised_lda.peerread_output_att \ 10 | --dat-dir=${DIR} \ 11 | --mode=${MODE} \ 12 | --params=${BETA1} \ 13 | --sim-dir=${SIMDIR} \ 14 | --outdir=${OUT}/${BETA1} \ 15 | --split=${SPLIT} \ 16 | --linear-outcome-model=${LINOUTCOME} \ 17 | --use-recon-loss=${RECONLOSS} \ 18 | --use-supervised-loss=${SUPLOSS} \ -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_no_sup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=t 9 | export RECONLOSS=t 10 | export SUPLOSS=f 11 | 12 | declare -a BETA1S=(5.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/no_sup/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_no_unsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=t 9 | export RECONLOSS=f 10 | export SUPLOSS=t 11 | 12 | declare -a BETA1S=(1.0 5.0 25.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/no_unsup/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_nonlinear.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=f 9 | 
export RECONLOSS=t 10 | export SUPLOSS=t 11 | 12 | declare -a BETA1S=(1.0 5.0 25.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/non_linear/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/peerread-exps/submit_peerread_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/PeerRead/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/peerread_buzzytitle_based/ 6 | 7 | export MODE=simple 8 | export LINOUTCOME=t 9 | export RECONLOSS=t 10 | export SUPLOSS=t 11 | 12 | declare -a BETA1S=(1.0 5.0 25.0) 13 | 14 | for BETA1j in "${BETA1S[@]}"; do 15 | for SPLITi in $(seq 0 9); do 16 | export BETA1=${BETA1j} 17 | export SPLIT=${SPLITi} 18 | export OUT=${BASE_OUT}/base_model/ 19 | sbatch --job-name=peerread_supervised_lda_sim_${BETA1j}_${SPLITi} \ 20 | --output=peerread_supervised_lda_sim_${BETA1j}_${SPLITi}.out \ 21 | supervised_lda/submit_scripts/peerread-exps/run_peerread_simulation.sh 22 | done 23 | done 24 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH -A sml 3 | #SBATCH -c 8 4 | #SBATCH --mail-user=dhanya.sridhar@columbia.edu 5 | #SBATCH --mail-type=ALL 6 | 7 | source activate py3.6 8 | 9 | python -m supervised_lda.reddit_output_att \ 10 | --dat-dir=${DIR} \ 11 | --mode=${MODE} \ 12 | --subs=${SUBS} \ 13 | --params=${BETA0},${BETA1},${GAMMA} \ 14 | --sim-dir=${SIMDIR} \ 15 | --outdir=${OUT}/beta0${BETA0}.beta1${BETA1}.gamma${GAMMA} \ 16 | --split=${SPLIT} \ 17 | --linear-outcome-model=${LINOUTCOME} \ 18 | --use-recon-loss=${RECONLOSS} \ 19 | --use-supervised-loss=${SUPLOSS} \ 20 | 21 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_no_sup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=t 10 | export RECONLOSS=t 11 | export SUPLOSS=f 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(10.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/no_sup/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | 
supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_no_unsup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=t 10 | export RECONLOSS=f 11 | export SUPLOSS=t 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0 10.0 100.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/no_unsup/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | 28 | done 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_nonlinear.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=f 10 | export RECONLOSS=t 11 | export SUPLOSS=t 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0 10.0 100.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/non_linear/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_reddit_simulation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=t 10 | export RECONLOSS=t 11 | export SUPLOSS=t 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0 10.0 100.0) 15 | declare -a GAMMAS=(1.0 4.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 4); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export 
OUT=${BASE_OUT}/base_model/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/submit_scripts/reddit-exps/submit_reddit_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BASE_OUT=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/out/ 3 | 4 | export DIR=/proj/sml_netapp/projects/causal-text/reddit/supervised_lda_baseline/proc/ 5 | export SIMDIR=/proj/sml_netapp/projects/causal-text/sim/reddit_subreddit_based/ 6 | 7 | export MODE=simple 8 | export SUBS=13,6,8 9 | export LINOUTCOME=True 10 | export RECONLOSS=True 11 | export SUPLOSS=True 12 | 13 | export BETA0=1.0 14 | declare -a BETA1S=(1.0) 15 | declare -a GAMMAS=(1.0) 16 | 17 | for BETA1j in "${BETA1S[@]}"; do 18 | export BETA1=${BETA1j} 19 | for GAMMAj in "${GAMMAS[@]}"; do 20 | for SPLITi in $(seq 0 1); do 21 | export SPLIT=${SPLITi} 22 | export GAMMA=${GAMMAj} 23 | export OUT=${BASE_OUT}/base_model/ 24 | sbatch --job-name=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi} \ 25 | --output=reddit_supervised_lda_sim_${BETA1j}_${GAMMAj}_${SPLITi}.out \ 26 | supervised_lda/submit_scripts/reddit-exps/run_reddit_simulation.sh 27 | done 28 | done 29 | done 30 | -------------------------------------------------------------------------------- /src/supervised_lda/supervised_topic_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import math 5 | 6 | from torch import nn 7 | 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | 10 | class SupervisedTopicModel(nn.Module): 11 | def __init__(self, num_topics, vocab_size, num_documents, t_hidden_size=800, theta_act='relu', enc_drop=0., outcome_linear_map=True): 12 | super(SupervisedTopicModel, self).__init__() 13 | 14 | ## define hyperparameters 15 | self.num_topics = num_topics 16 | self.vocab_size = vocab_size 17 | self.num_documents = num_documents 18 | self.t_hidden_size = t_hidden_size 19 | self.enc_drop = enc_drop 20 | self.t_drop = nn.Dropout(enc_drop) 21 | self.theta_act = self.get_activation(theta_act) 22 | self.outcome_linear_map = outcome_linear_map 23 | 24 | ## define the matrix containing the topic embeddings 25 | self.alphas = nn.Parameter(torch.randn(vocab_size, num_topics)) 26 | 27 | if self.outcome_linear_map: 28 | ## define linear regression weights for predicting expected outcomes for treated 29 | self.w_expected_outcome_treated = nn.Linear(num_topics, 1) 30 | 31 | ## define linear regression weights for predicting expected outcomes for untreated 32 | self.w_expected_outcome_untreated = nn.Linear(num_topics, 1) 33 | else: 34 | self.f_outcome_treated = nn.Sequential( 35 | nn.Linear(num_topics, t_hidden_size), 36 | self.theta_act, 37 | # nn.BatchNorm1d(t_hidden_size), 38 | nn.Linear(t_hidden_size, t_hidden_size), 39 | self.theta_act, 40 | # nn.BatchNorm1d(t_hidden_size), 41 | nn.Linear(t_hidden_size,1) 42 | ) 43 | self.f_outcome_untreated = nn.Sequential( 44 | nn.Linear(num_topics, t_hidden_size), 45 | self.theta_act, 46 | # nn.BatchNorm1d(t_hidden_size), 47 | nn.Linear(t_hidden_size, t_hidden_size), 48 | self.theta_act, 49 | # 
nn.BatchNorm1d(t_hidden_size), 50 | nn.Linear(t_hidden_size,1) 51 | ) 52 | ## define linear regression weights for predicting binary treatment label 53 | self.w_treatment = nn.Linear(num_topics,1) 54 | 55 | self.q_theta = nn.Sequential( 56 | nn.Linear(vocab_size, t_hidden_size), 57 | self.theta_act, 58 | nn.BatchNorm1d(t_hidden_size), 59 | nn.Linear(t_hidden_size, t_hidden_size), 60 | self.theta_act, 61 | nn.BatchNorm1d(t_hidden_size) 62 | ) 63 | self.mu_q_theta = nn.Linear(t_hidden_size, num_topics) 64 | self.logsigma_q_theta = nn.Linear(t_hidden_size, num_topics) 65 | 66 | def get_activation(self, act): 67 | if act == 'tanh': 68 | act = nn.Tanh() 69 | elif act == 'relu': 70 | act = nn.ReLU() 71 | elif act == 'softplus': 72 | act = nn.Softplus() 73 | elif act == 'rrelu': 74 | act = nn.RReLU() 75 | elif act == 'leakyrelu': 76 | act = nn.LeakyReLU() 77 | elif act == 'elu': 78 | act = nn.ELU() 79 | elif act == 'selu': 80 | act = nn.SELU() 81 | elif act == 'glu': 82 | act = nn.GLU() 83 | else: 84 | print('Defaulting to tanh activations...') 85 | act = nn.Tanh() 86 | return act 87 | 88 | def reparameterize(self, mu, logvar): 89 | """Returns a sample from a Gaussian distribution via reparameterization. 90 | """ 91 | if self.training: 92 | std = torch.exp(0.5 * logvar) 93 | eps = torch.randn_like(std) 94 | return eps.mul_(std).add_(mu) 95 | else: 96 | return mu 97 | 98 | def encode(self, bows): 99 | """Returns paramters of the variational distribution for \theta. 100 | 101 | input: bows 102 | batch of bag-of-words...tensor of shape bsz x V 103 | output: mu_theta, log_sigma_theta 104 | """ 105 | q_theta = self.q_theta(bows) 106 | if self.enc_drop > 0: 107 | q_theta = self.t_drop(q_theta) 108 | mu_theta = self.mu_q_theta(q_theta) 109 | logsigma_theta = self.logsigma_q_theta(q_theta) 110 | kl_theta = -0.5 * torch.sum(1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1).mean() 111 | return mu_theta, logsigma_theta, kl_theta 112 | 113 | def get_beta(self): 114 | beta = F.softmax(self.alphas, dim=0).transpose(1, 0) ## softmax over vocab dimension 115 | return beta 116 | 117 | def get_theta(self, normalized_bows): 118 | mu_theta, logsigma_theta, kld_theta = self.encode(normalized_bows) 119 | z = self.reparameterize(mu_theta, logsigma_theta) 120 | theta = F.softmax(z, dim=-1) 121 | return theta, kld_theta 122 | 123 | def decode(self, theta, beta): 124 | res = torch.mm(theta, beta) 125 | preds = torch.log(res+1e-6) 126 | return preds 127 | 128 | def predict_treatment(self, theta): 129 | logits = self.w_treatment(theta) 130 | return logits 131 | 132 | def predict_outcome_st_treat(self, theta, treatment_labels): 133 | treated_indices = [treatment_labels == 1] 134 | theta_treated = theta[treated_indices] 135 | 136 | if not self.outcome_linear_map: 137 | expected_outcome_treated = self.f_outcome_treated(theta_treated) 138 | else: 139 | expected_outcome_treated = self.w_expected_outcome_treated(theta_treated) 140 | 141 | return expected_outcome_treated 142 | 143 | def predict_outcome_st_no_treat(self, theta, treatment_labels): 144 | untreated_indices = [treatment_labels == 0] 145 | theta_untreated = theta[untreated_indices] 146 | 147 | if not self.outcome_linear_map: 148 | expected_outcome_untreated = self.f_outcome_untreated(theta_untreated) 149 | else: 150 | expected_outcome_untreated = self.w_expected_outcome_untreated(theta_untreated) 151 | 152 | return expected_outcome_untreated 153 | 154 | 155 | def forward(self, bows, normalized_bows, treatment_labels, outcomes, dtype='real', 
use_supervised_loss=True): 156 | ## get \theta 157 | theta, kld_theta = self.get_theta(normalized_bows) 158 | beta = self.get_beta() 159 | 160 | bce_loss = nn.BCEWithLogitsLoss() 161 | mse_loss = nn.MSELoss() 162 | 163 | ## get reconstruction loss 164 | preds = self.decode(theta, beta) 165 | recon_loss = -(preds * bows).sum(1) 166 | recon_loss = recon_loss.mean() 167 | 168 | supervised_loss=None 169 | if use_supervised_loss: 170 | 171 | #get treatment loss 172 | treatment_logits = self.predict_treatment(theta).squeeze() 173 | treatment_loss = bce_loss(treatment_logits, treatment_labels) 174 | 175 | #get expected outcome loss 176 | treated = [treatment_labels == 1] 177 | untreated = [treatment_labels == 0] 178 | outcomes_treated = outcomes[treated] 179 | outcomes_untreated = outcomes[untreated] 180 | expected_treated = self.predict_outcome_st_treat(theta, treatment_labels).squeeze() 181 | expected_untreated = self.predict_outcome_st_no_treat(theta, treatment_labels).squeeze() 182 | 183 | if dtype == 'real': 184 | outcome_loss_treated = mse_loss(expected_treated,outcomes_treated) 185 | outcome_loss_untreated = mse_loss(expected_untreated,outcomes_untreated) 186 | else: 187 | outcome_loss_treated = bce_loss(expected_treated,outcomes_treated) 188 | outcome_loss_untreated = bce_loss(expected_untreated,outcomes_untreated) 189 | 190 | supervised_loss = treatment_loss + outcome_loss_treated + outcome_loss_untreated 191 | 192 | return recon_loss, supervised_loss, kld_theta 193 | 194 | -------------------------------------------------------------------------------- /src/words_baseline/helpers.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import word_tokenize 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | 9 | class LemmaTokenizer(object): 10 | def __init__(self): 11 | self.wnl = WordNetLemmatizer() 12 | def __call__(self, articles): 13 | stop = stopwords.words('english') 14 | return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop] 15 | 16 | def filter_by_subreddit(reddit, subs=None): 17 | if not subs: 18 | return reddit.index.values 19 | else: 20 | return reddit[reddit.subreddit.isin(subs)].index.values 21 | 22 | def tokenize_documents(documents,max_df0=0.9, min_df0=0.001): 23 | from nltk.corpus import stopwords 24 | ''' 25 | From a list of documents raw text build a matrix DxV 26 | D: number of docs 27 | V: size of the vocabulary, i.e.
number of unique terms found in the whole set of docs 28 | ''' 29 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0) 30 | corpus = count_vect.fit_transform(documents) 31 | vocabulary = count_vect.get_feature_names() 32 | 33 | return corpus,vocabulary,count_vect 34 | 35 | def assign_dev_split(num_docs, percentage=0.05): 36 | indices = np.arange(num_docs) 37 | np.random.shuffle(indices) 38 | size = int(indices.shape[0]*percentage) 39 | dev = indices[:size] 40 | return dev 41 | 42 | def learn_topics(X, X_dev, K=50): 43 | lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1) 44 | print("Fitting", K, "topics...") 45 | lda.fit(X) 46 | score = lda.perplexity(X_dev) 47 | print("Log likelihood:", score) 48 | topics = lda.components_ 49 | return score, lda, topics 50 | 51 | def show_topics(vocab, topics, n_words=20): 52 | topic_keywords = [] 53 | for topic_weights in topics: 54 | top_keyword_locs = (-topic_weights).argsort()[:n_words] 55 | topic_keywords.append(vocab.take(top_keyword_locs)) 56 | 57 | df_topic_keywords = pd.DataFrame(topic_keywords) 58 | df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])] 59 | df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])] 60 | return df_topic_keywords 61 | 62 | def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'): 63 | filtered_indices = filtered_df[on].values 64 | doc_idx = [index_mapping[idx] for idx in filtered_indices] 65 | embeddings = doc_embeddings[doc_idx, :] 66 | return embeddings 67 | 68 | def make_index_mapping(df, on='post_index', convert_to_int=True): 69 | if on=='index': 70 | indices = df.index.values 71 | else: 72 | indices = df[on].values 73 | 74 | if convert_to_int: 75 | return {int(ind):i for (i,ind) in enumerate(indices)} 76 | 77 | return {ind:i for (i,ind) in enumerate(indices)} 78 | 79 | def assign_split(df, num_splits=10, col_to_add='split'): 80 | df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0]) 81 | return df 82 | -------------------------------------------------------------------------------- /src/words_baseline/peerread_output_ate.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.ate import psi_q_only,psi_tmle_cont_outcome 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | from sklearn.linear_model import LogisticRegression, LinearRegression 6 | from sklearn.metrics import mean_squared_error as mse 7 | import argparse 8 | import sys 9 | from scipy.special import logit 10 | from scipy.sparse import load_npz 11 | 12 | def compute_ground_truth_treatment_effect(df): 13 | y1 = df['y1'] 14 | y0 = df['y0'] 15 | return y1.mean() - y0.mean() 16 | 17 | def get_log_outcomes(outcomes): 18 | #relu 19 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 20 | return np.log(outcomes) 21 | 22 | def predict_expected_outcomes(model, features): 23 | return model.predict_proba(features)[:,1] 24 | 25 | def fit_conditional_expected_outcomes(outcomes, features): 26 | model = LogisticRegression(solver='liblinear') 27 | model.fit(features, outcomes) 28 | if verbose: 29 | print("Training accuracy:", model.score(features, outcomes)) 30 | return model 31 | 32 | def predict_treatment_probability(labels, features): 33 | model = LogisticRegression(solver='liblinear') 34 | model.fit(features, labels) 35 | if verbose: 36 | print("Training accuracy:", model.score(features, 
labels)) 37 | treatment_probability = model.predict_proba(features)[:,1] 38 | return treatment_probability 39 | 40 | def load_simulated_data(): 41 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 42 | sim_df = sim_df.rename(columns={'index':'post_index'}) 43 | return sim_df 44 | 45 | def load_term_counts(path='../dat/reddit/'): 46 | return load_npz(path + 'term_counts.npz').toarray() 47 | 48 | def main(): 49 | if not dat_dir: 50 | term_counts = load_term_counts() 51 | else: 52 | term_counts = load_term_counts(path=dat_dir) 53 | 54 | sim_df = load_simulated_data() 55 | treatment_labels = sim_df.treatment.values 56 | indices = sim_df.post_index.values 57 | all_words = term_counts[indices, :] 58 | 59 | treated_sim = sim_df[sim_df.treatment==1] 60 | untreated_sim = sim_df[sim_df.treatment==0] 61 | treated_indices = treated_sim.post_index.values 62 | untreated_indices = untreated_sim.post_index.values 63 | 64 | all_outcomes = sim_df.outcome.values 65 | outcomes_st_treated = treated_sim.outcome.values 66 | outcomes_st_not_treated = untreated_sim.outcome.values 67 | 68 | words_st_treated = term_counts[treated_indices,:] 69 | words_st_not_treated = term_counts[untreated_indices,:] 70 | 71 | treatment_probability = predict_treatment_probability(treatment_labels, all_words) 72 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, words_st_treated) 73 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, words_st_not_treated) 74 | 75 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, all_words) 76 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, all_words) 77 | 78 | q_hat = psi_q_only(expected_outcome_st_not_treated, expected_outcome_st_treated, 79 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03) 80 | 81 | tmle = psi_tmle_cont_outcome(expected_outcome_st_not_treated, expected_outcome_st_treated, 82 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03)[0] 83 | 84 | print("Q hat:", q_hat) 85 | print("TMLE:", tmle) 86 | 87 | 88 | if __name__ == '__main__': 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("--dat-dir", action="store", default=None) 91 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/') 92 | parser.add_argument("--mode", action="store", default="simple") 93 | parser.add_argument("--params", action="store", default="1.0") 94 | parser.add_argument("--verbose", action='store_true') 95 | args = parser.parse_args() 96 | 97 | sim_dir = args.sim_dir 98 | dat_dir = args.dat_dir 99 | verbose = args.verbose 100 | params = args.params 101 | sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0' 102 | mode = args.mode 103 | simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv" 104 | 105 | main() -------------------------------------------------------------------------------- /src/words_baseline/reddit_output_att.py: -------------------------------------------------------------------------------- 1 | from semi_parametric_estimation.att import att_estimates, psi_plugin, psi_q_only 2 | from reddit.data_cleaning.reddit_posts import load_reddit_processed 3 | from .helpers import filter_document_embeddings, make_index_mapping, assign_split 4 | import numpy as np 5 | import pandas as pd 6 | import os 7 | from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge 8 | from sklearn.metrics import 
mean_squared_error as mse 9 | import argparse 10 | import sys 11 | from scipy.special import logit 12 | from scipy.sparse import load_npz 13 | 14 | def get_log_outcomes(outcomes): 15 | #relu 16 | outcomes = np.array([max(0.0, out) + 1.0 for out in outcomes]) 17 | return np.log(outcomes) 18 | 19 | def predict_expected_outcomes(model, features): 20 | return model.predict(features) 21 | 22 | def fit_conditional_expected_outcomes(outcomes, features): 23 | model = Ridge() 24 | model.fit(features, outcomes) 25 | predict = model.predict(features) 26 | if verbose: 27 | print("Training MSE:", mse(outcomes, predict)) 28 | return model 29 | 30 | def predict_treatment_probability(labels, features): 31 | model = LogisticRegression(solver='liblinear') 32 | model.fit(features, labels) 33 | if verbose: 34 | print("Training accuracy:", model.score(features, labels)) 35 | treatment_probability = model.predict_proba(features)[:,1] 36 | return treatment_probability 37 | 38 | def load_simulated_data(): 39 | sim_df = pd.read_csv(simulation_file, delimiter='\t') 40 | sim_df = sim_df.rename(columns={'index':'post_index'}) 41 | return sim_df 42 | 43 | def load_term_counts(path='../dat/reddit/'): 44 | return load_npz(path + 'term_counts.npz').toarray() 45 | 46 | def main(): 47 | 48 | if not dat_dir: 49 | term_counts = load_term_counts() 50 | else: 51 | term_counts = load_term_counts(path=dat_dir) 52 | 53 | sim_df = load_simulated_data() 54 | treatment_labels = sim_df.treatment.values 55 | indices = sim_df.post_index.values 56 | all_words = term_counts[indices, :] 57 | 58 | treated_sim = sim_df[sim_df.treatment==1] 59 | untreated_sim = sim_df[sim_df.treatment==0] 60 | treated_indices = treated_sim.post_index.values 61 | untreated_indices = untreated_sim.post_index.values 62 | 63 | all_outcomes = sim_df.outcome.values 64 | outcomes_st_treated = treated_sim.outcome.values 65 | outcomes_st_not_treated = untreated_sim.outcome.values 66 | 67 | words_st_treated = term_counts[treated_indices,:] 68 | words_st_not_treated = term_counts[untreated_indices,:] 69 | 70 | treatment_probability = predict_treatment_probability(treatment_labels, all_words) 71 | model_outcome_st_treated = fit_conditional_expected_outcomes(outcomes_st_treated, words_st_treated) 72 | model_outcome_st_not_treated = fit_conditional_expected_outcomes(outcomes_st_not_treated, words_st_not_treated) 73 | 74 | expected_outcome_st_treated = predict_expected_outcomes(model_outcome_st_treated, all_words) 75 | expected_outcome_st_not_treated = predict_expected_outcomes(model_outcome_st_not_treated, all_words) 76 | 77 | q_hat = psi_q_only(expected_outcome_st_not_treated, expected_outcome_st_treated, 78 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean()) 79 | 80 | tmle = psi_plugin(expected_outcome_st_not_treated, expected_outcome_st_treated, 81 | treatment_probability, treatment_labels, all_outcomes, truncate_level=0.03, prob_t=treatment_labels.mean()) 82 | 83 | print("Q hat:", q_hat) 84 | print("TMLE:", tmle) 85 | 86 | if __name__ == '__main__': 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument("--dat-dir", action="store", default=None) 89 | parser.add_argument("--sim-dir", action="store", default='../dat/sim/reddit_subreddit_based/') 90 | parser.add_argument("--subs", action="store", default='13,6,8') 91 | parser.add_argument("--mode", action="store", default="simple") 92 | parser.add_argument("--params", action="store", default="1.0,1.0,1.0") 93 | parser.add_argument("--verbose", 
action='store_true') 94 | args = parser.parse_args() 95 | 96 | sim_dir = args.sim_dir 97 | dat_dir = args.dat_dir 98 | subs = None 99 | if args.subs != '': 100 | subs = [int(s) for s in args.subs.split(',')] 101 | verbose = args.verbose 102 | params = args.params.split(',') 103 | sim_setting = 'beta0' + params[0] + '.beta1' + params[1] + '.gamma' + params[2] 104 | subs_string = ', '.join(args.subs.split(',')) 105 | mode = args.mode 106 | simulation_file = sim_dir + 'subreddits['+ subs_string + ']/mode' + mode + '/' + sim_setting + ".tsv" 107 | 108 | main() -------------------------------------------------------------------------------- /src/words_baseline/scripts/sweep_over_sims.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #NUM_SEED=2 3 | #SEEDS=$(seq 0 $NUM_SEED) 4 | rm ../dat/reddit/sim/reddit_subreddit_based/two-stage-lda-estimates.out 5 | export SUBREDDITS=13,6,8 6 | export BETA0=1.0 7 | declare -a SIMMODES=('simple') 8 | declare -a BETA1S=(1.0 10.0 100.0) 9 | declare -a GAMMAS=(1.0 4.0) 10 | 11 | for SIMMODEj in "${SIMMODES[@]}"; do 12 | for BETA1j in "${BETA1S[@]}"; do 13 | for GAMMAj in "${GAMMAS[@]}"; do 14 | python -m lda_baseline.reddit_output_att \ 15 | --subs=${SUBREDDITS} \ 16 | --mode=${SIMMODEj} \ 17 | --params=${BETA0},${BETA1j},${GAMMAj} 18 | done 19 | done 20 | done --------------------------------------------------------------------------------
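
Note on usage: the ATT estimators in src/semi_parametric_estimation/att.py are consumed downstream by passing fitted nuisance estimates (q_t0, q_t1, g) together with the observed treatments and outcomes, exactly as in supervised_lda/compute_estimates.py. A minimal sketch of that call, assuming it is run from src/ so the package is importable (as the repository's own scripts do), and with synthetic oracle nuisance values standing in for the fitted models:

# sketch only: synthetic data with a known constant treatment effect of 1.0
import numpy as np
from scipy.special import expit
from semi_parametric_estimation.att import att_estimates

rng = np.random.RandomState(0)
n = 5000
x = rng.normal(size=n)                               # single synthetic confounder
g_true = expit(0.8 * x)                              # true propensity score
t = rng.binomial(1, g_true)                          # treatment assignment
y = x + 1.0 * t + rng.normal(scale=0.5, size=n)      # outcome with effect 1.0

# oracle nuisance values stand in for the fitted outcome and propensity models
q_t0 = x
q_t1 = x + 1.0

estimates = att_estimates(q_t0, q_t1, g_true, t, y, prob_t=t.mean(), truncate_level=0.03)
for name, att in estimates.items():
    print(name, np.round(att, 3))

This mirrors the call pattern in compute_estimates.py, which loads q0, q1, g, t, y from predictions.npz and passes t.mean() as prob_t.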
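
The supervised topic model pipeline follows the same pattern in both supervised_lda/*_output_att.py scripts: build a document-term count matrix, train SupervisedTopicModel with run_supervised_tm.train, then read off propensity and conditional-outcome estimates with run_supervised_tm.predict. A minimal sketch on made-up count data (sizes and data are hypothetical; batch_size is set below the number of documents because run_supervised_tm.train draws fixed-size batches), again assuming it is run from src/ with the package versions the repository targets:

# sketch only: synthetic counts in place of the PeerRead/Reddit term matrices
import numpy as np
from supervised_lda.supervised_topic_model import SupervisedTopicModel
from supervised_lda import run_supervised_tm

rng = np.random.RandomState(0)
n_docs, vocab_size, n_topics = 500, 100, 5
docs = rng.multinomial(30, np.ones(vocab_size) / vocab_size, size=n_docs)  # 30 tokens per doc, no empty rows
t = rng.binomial(1, 0.5, size=n_docs).astype(np.float32)                   # treatment labels
y = rng.normal(loc=t).astype(np.float32)                                   # real-valued outcome

model = SupervisedTopicModel(n_topics, vocab_size, n_docs, outcome_linear_map=True)
run_supervised_tm.train(model, docs, t, y, dtype='real', num_epochs=50,
                        batch_size=100, use_recon_loss=True, use_sup_loss=True)

g, q1, q0 = run_supervised_tm.predict(model, docs, dtype='real')
print("g:", g[:3], "q1:", q1[:3], "q0:", q0[:3])

These g, q0, q1 arrays are what the *_output_att.py scripts save to predictions.npz and what compute_estimates.py then feeds to att_estimates.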
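
The shared text helpers (tokenize_documents, learn_topics, show_topics) are used both to build the cached term_counts.npz matrices and by the LDA baselines. A small sketch on a toy corpus, assuming the NLTK punkt, wordnet and stopwords corpora are installed and a scikit-learn version that still provides CountVectorizer.get_feature_names (as the code above expects); the documents and parameter values here are made up:

# sketch only: toy corpus instead of PeerRead abstracts or Reddit posts
import numpy as np
from supervised_lda.helpers import tokenize_documents, assign_dev_split, learn_topics, show_topics

docs = [
    "the cat sat on the mat while the other cat slept",
    "deep learning models require large amounts of training data",
    "the dog chased the cat around the garden",
    "neural networks learn useful representations from raw text data",
]

# keep every term for this toy corpus; the real pipeline uses the defaults
counts, vocab, _ = tokenize_documents(docs, max_df0=1.0, min_df0=0.0)
dev_idx = assign_dev_split(counts.shape[0], percentage=0.25)
score, lda, topics = learn_topics(counts, counts[dev_idx], K=2)
print(show_topics(np.array(vocab), topics, n_words=5))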