├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── README.md ├── movie_reviews_pang_lee │ ├── README.md │ ├── X_colnames.txt │ ├── X_csr_test.npz │ ├── X_csr_train.npz │ ├── X_csr_valid.npz │ ├── Y_colnames.txt │ ├── Y_test.npy │ ├── Y_train.npy │ └── Y_valid.npy └── toy_bars_3x3 │ ├── README.md │ ├── X_colnames.txt │ ├── X_csr_test.npz │ ├── X_csr_train.npz │ ├── X_csr_valid.npz │ ├── Y_colnames.txt │ ├── Y_test.npy │ ├── Y_train.npy │ ├── Y_valid.npy │ ├── good_loss_label_rep_K4_param_dict.dump │ ├── good_loss_pc_K4_param_dict.dump │ ├── good_loss_x_K4_param_dict.dump │ ├── good_loss_y_K4_param_dict.dump │ └── src │ ├── Makefile │ ├── make_dataset.py │ ├── make_partial_labeled_dataset.py │ └── make_possible_solutions.py ├── pc_toolbox ├── __init__.py ├── algs_gradient_descent │ ├── __init__.py │ ├── grad_descent_minimizer.py │ └── scipy_lbfgs_minimizer.py ├── binary_classifiers │ ├── calc_roc_auc_via_bootstrap.py │ ├── eval_pretrained_sklearn_binary_classifier.py │ ├── eval_pretrained_sklearn_binary_classifier_avg_many_outcomes.py │ ├── train_and_eval_sklearn_binary_classifier.py │ └── utils_calibration.py ├── model_slda │ ├── __init__.py │ ├── est_local_params__many_doc_map │ │ ├── __init__.py │ │ ├── calc_nef_map_pi_DK.py │ │ └── utils_summarize_pi_DK_estimation.py │ ├── est_local_params__single_doc_map │ │ ├── README.md │ │ ├── __init__.py │ │ ├── calc_nef_map_pi_d_K.py │ │ ├── calc_nef_map_pi_d_K__autograd.py │ │ ├── calc_nef_map_pi_d_K__cython.pyx │ │ ├── calc_nef_map_pi_d_K__defaults.py │ │ ├── calc_nef_map_pi_d_K__numpy_linesearch.py │ │ └── calc_nef_map_pi_d_K__tensorflow.py │ ├── est_local_params__vb_qpiDir_qzCat │ │ ├── __init__.py │ │ ├── calc_N_d_K__vb_qpiDir_qzCat.py │ │ └── calc_elbo_for_many_docs__vb_qpiDir_qzCat.py │ ├── slda_estimator__w_given_pi.py │ ├── slda_loss__autograd.py │ ├── slda_loss__cython.py │ ├── slda_loss__tensorflow.py │ ├── slda_snapshot_perf_metrics.py │ ├── slda_utils__dataset_manager.py │ ├── slda_utils__diffable_param_manager__tensorflow.py │ ├── slda_utils__init_manager.py │ ├── slda_utils__param_io_manager.py │ └── slda_utils__param_manager.py ├── topic_quality_metrics │ ├── __init__.py │ └── calc_coherence_metrics.py ├── train_slda_model.py ├── utils_data │ ├── __init__.py │ ├── util_data_slicer.py │ └── util_stratified_subsample.py ├── utils_diffable_transforms │ ├── __init__.py │ ├── util_differentiable_transform__2D_rows_sum_to_one.py │ ├── util_differentiable_transform__log_unit_interval.py │ └── util_differentiable_transform__unit_interval.py ├── utils_io │ ├── __init__.py │ ├── pprint_logging.py │ ├── util_array.py │ ├── util_io_csr.py │ ├── util_io_training.py │ ├── util_io_txt.py │ ├── util_pprint_percentiles.py │ ├── util_setup.py │ ├── util_timing.py │ └── util_watermark.py ├── utils_snapshots │ ├── __init__.py │ ├── select_best_runs_and_snapshots.py │ ├── snapshot_perf_metrics__binary_outcomes.json │ └── utils_snapshots.py └── utils_vizhtml │ ├── __init__.py │ ├── make_html_collection_from_csv.py │ ├── make_html_collection_from_png_dir.py │ ├── template.html │ ├── utils_top_words_html.py │ └── utils_viz_topic_model.py ├── requirements.txt ├── scripts ├── install │ ├── create_conda_env.sh │ ├── install_tensorflow_linux.sh │ └── requirements.txt ├── launch_job_on_host_via_env.sh ├── launcher_tools │ ├── detect_grid_executable.py │ ├── make_launcher_script.py │ ├── print_lowercase_env_vars_as_keyword_args.py │ ├── template.lsf │ ├── template.sge │ └── template.slurm ├── movie_reviews │ ├── quicktest_topic_models │ │ ├── 
pcslda_ag_adam_fromscratch.sh │ │ └── pcslda_tf_adam_fromscratch.sh │ ├── train_base_classifiers │ │ └── train_baseline_classifiers.sh │ └── train_topic_models │ │ ├── make_html_viz_for_best_snapshots.sh │ │ ├── pcslda_ag_adam_fromscratch.sh │ │ ├── pcslda_tf_adam_fromscratch.sh │ │ └── select_best_snapshots.sh ├── product_reviews │ ├── train_base_classifiers │ │ ├── train_baseline_classifiers.sh │ │ └── train_baseline_rf.sh │ └── train_topic_models │ │ ├── make_html_viz_for_best_snapshots.sh │ │ ├── rsync_snapshot_perf_csv.sh │ │ └── select_best_snapshots.sh ├── rsync_tools │ ├── README.md │ ├── rsync_snapshot_perf_metrics.sh │ └── rsync_specific_snapshot.sh ├── setup_train_env.sh ├── toy_bars_3x3 │ ├── quicktest_topic_models │ │ ├── pcslda_ag_adam_fromgood.sh │ │ ├── pcslda_ag_adam_fromscratch.sh │ │ ├── pcslda_ag_lbfgs_fromscratch.sh │ │ └── pcslda_tf_adam_fromscratch.sh │ ├── train_base_classifiers │ │ └── train_baseline_classifiers.sh │ └── train_topic_models │ │ ├── pcslda_ag_adam_fromscratch.sh │ │ ├── pcslda_ag_lbfgs_fromgood.sh │ │ ├── pcslda_ag_lbfgs_fromscratch.sh │ │ └── pcslda_tf_adam_fromscratch.sh ├── train_clf.sh └── train_slda.sh ├── setup.cfg ├── setup.py └── version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # C code compiled from cython 2 | *cython.c 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 dtak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prediction-Constrained Topic Models 2 | 3 | Public repo containing code to train, visualize, and evaluate semi-supervised topic models. Also includes code for baseline classifiers/regressors to perform supervised prediction on bag-of-words datasets. 4 | 5 | # Overview 6 | 7 | This repo is based on the following academic publication: 8 | 9 | > "Prediction-constrained semi-supervised topic models" 10 | > M. C. Hughes, L. Weiner, G. Hope, T. H. McCoy, R. H. Perlis, E. B. Sudderth, and F. Doshi-Velez 11 | > Artificial Intelligence & Statistics (AISTATS), 2018. 12 | 13 | * Paper PDF: https://www.michaelchughes.com/papers/HughesEtAl_AISTATS_2018.pdf 14 | * Supplement PDF: https://www.michaelchughes.com/papers/HughesEtAl_AISTATS_2018_supplement.pdf 15 | 16 | ### Contents 17 | 18 | * [datasets/](https://github.com/dtak/prediction-constrained-topic-models/tree/master/datasets/) 19 | * * Provided example datasets for simple experiments. 
Overview in [datasets/README.md](https://github.com/dtak/prediction-constrained-topic-models/tree/master/datasets/README.md). 20 | 21 | * [pc_toolbox/](https://github.com/dtak/prediction-constrained-topic-models/tree/master/pc_toolbox/) 22 | * * Main python package, with code for training PC topic models and some baseline classifiers/regressors. 23 | 24 | * [scripts/](https://github.com/dtak/prediction-constrained-topic-models/tree/master/scripts/) 25 | * * Bash scripts to run experiments. Support SLURM/LSF/SGE clusters. 26 | 27 | 28 | # Examples 29 | 30 | ## Python script to train binary classifier from bag-of-words data 31 | 32 | The primary script is train_and_eval_sklearn_binary_classifier.py 33 | ``` 34 | python train_and_eval_sklearn_binary_classifier.py \ 35 | --dataset_path $PC_REPO_DIR/datasets/toy_bars_3x3/ \ 36 | --output_path /tmp/demo_results/ \ 37 | --seed 8675309 \ # random seed (for reproducibility) 38 | --feature_arr_names X \ 39 | --target_arr_name Y \ 40 | --classifier_name extra_trees \ 41 | ``` 42 | 43 | 44 | ## Python script to train topic models with PC objective 45 | 46 | The primary script is train_slda_model.py. For a quick exmaple, you might call this python script as follows: 47 | 48 | ``` 49 | python train_slda_model.py \ 50 | --dataset_path $PC_REPO_DIR/datasets/toy_bars_3x3/ \ 51 | --output_path /tmp/demo_results/ \ 52 | --seed 8675309 \ # random seed (for reproducibility) 53 | --alpha 1.1 \ # scalar hyperparameter for Dirichlet prior over doc-topic probas 54 | --tau 1.1 \ # scalar hyperparameter for Dirichlet prior over topic-word probas 55 | --weight_y 5.0 \ # aka "lambda" in AISTATS paper, the key hyperparameter to emphasize y|x 56 | --n_laps 10 \ # number of laps (aka epochs). this will complete 10 full passes thru training dataset. 57 | --n_batches 1 \ 58 | --alg_name grad_descent_minimizer \ 59 | ``` 60 | 61 | Mostly, we use wrapper bash scripts that call this function with many different hyperparameters (model, algorithm, initialization, etc) 62 | 63 | ## Quicktest: Train PC sLDA topic models on toy bars with autograd 64 | 65 | Test script to train with autograd (ag) as source of automatic gradients: 66 | ``` 67 | cd $PC_REPO_DIR/scripts/toy_bars_3x3/quicktest_topic_models/ 68 | export XHOST_RESULTS_DIR=/tmp/ 69 | XHOST=local bash pcslda_ag_adam_fromscratch.sh 70 | ``` 71 | Should finish in <1 minute, just demonstrate that training occurs without errors. 72 | 73 | ## Quicktest: Train PC sLDA topic models on toy bars with tensorflow 74 | 75 | Test script to train with tensorflow (tf) as source of automatic gradients: 76 | ``` 77 | cd $PC_REPO_DIR/scripts/toy_bars_3x3/quicktest_topic_models/ 78 | export XHOST_RESULTS_DIR=/tmp/ 79 | XHOST=local bash pcslda_tf_adam_fromscratch.sh 80 | ``` 81 | Should finish in <1 minute, just demonstrate that training occurs without errors. 82 | 83 | 84 | ## Train PC sLDA topic models extensively on movie_reviews dataset 85 | 86 | Script to train with tensorflow (tf) as source of automatic gradients: 87 | ``` 88 | cd $PC_REPO_DIR/scripts/movie_reviews_pang_lee/train_topic_models/ 89 | export XHOST_RESULTS_DIR=/tmp/ 90 | XHOST={local|grid} bash pcslda_tf_adam_fromscratch.sh 91 | ``` 92 | Use XHOST=local to run on local computer. 93 | Use XHOST=grid to launch jobs on a cluster (Sun Grid Engine, SLURM, IBM's LSF, etc). 94 | 95 | Should finish in a few hours. 
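## Python example: evaluate a provided parameter snapshot

Beyond the bash pipelines above, the toolbox can also be driven directly from Python. The sketch below is illustrative only and assumes the environment described under Installation/Configuration below, with `PC_REPO_DIR` exported and the repo on your `PYTHONPATH`. It loads the toy-bars training split plus one of the pre-baked parameter dumps shipped with that dataset, then evaluates the PC-sLDA loss via the same `calc_loss__slda` call used internally.

```
import os
from sklearn.externals import joblib

from pc_toolbox.model_slda import (
    slda_loss__autograd,
    slda_utils__dataset_manager)

dataset_path = os.path.join(
    os.environ['PC_REPO_DIR'], 'datasets', 'toy_bars_3x3')

# Load the train split as the standard in-memory dict (see datasets/README.md)
dataset = slda_utils__dataset_manager.load_dataset(dataset_path, 'train')

# Load provided "good" parameters: dict with topics_KV (K x V) and w_CK (C x K)
param_dict = joblib.load(
    os.path.join(dataset_path, 'good_loss_pc_K4_param_dict.dump'))

# Evaluate the PC-sLDA training loss at these fixed parameters
loss_dict = slda_loss__autograd.calc_loss__slda(
    dataset=dataset,
    topics_KV=param_dict['topics_KV'],
    w_CK=param_dict['w_CK'],
    weight_x=1.0,
    weight_y=1.0,          # aka "lambda" in the AISTATS paper
    pi_estimation_mode='missing_y',
    pi_estimation_weight_y=0.0,
    nef_alpha=1.1,
    tau=1.1,
    lambda_w=0.001,
    return_dict=True)
print(loss_dict['summary_msg'])
```

The `good_loss_*_param_dict.dump` files loaded here are produced by `datasets/toy_bars_3x3/src/make_possible_solutions.py`, which also shows the full set of keyword arguments accepted by `calc_loss__slda`.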
96 | 97 | 98 | # Installation 99 | 100 | * Step 1: Clone this repo 101 | 102 | git clone https://github.com/dtak/prediction-constrained-topic-models/ 103 | 104 | * Step 2: Setup a fresh conda enviroment with all required Python packages 105 | 106 | bash [`$PC_REPO_DIR/scripts/install/create_conda_env.sh`](https://github.com/dtak/prediction-constrained-topic-models/tree/master/scripts/install/create_conda_env.sh) 107 | 108 | * Step 3: Compile Cython code for per-document inference (makes things very fast) 109 | 110 | `cd $PC_REPO_DIR/` 111 | 112 | python [`setup.py`](https://github.com/dtak/prediction-constrained-topic-models/tree/master/setup.py) `build_ext --inplace` 113 | 114 | * Step 4: (Optional) Install tensorflow 115 | 116 | bash [`$PC_REPO_DIR/scripts/install/install_tensorflow_linux.sh`](https://github.com/dtak/prediction-constrained-topic-models/tree/master/scripts/install/install_tensorflow_linux.sh) 117 | 118 | # Configuration 119 | 120 | Set up your environment variables! 121 | 122 | First, make a shortcut variable to the location of this repo, so you can easily reference datasets, etc. 123 | 124 | $ export PC_REPO_DIR=/path/to/prediction_constrained_topic_models/ 125 | 126 | Second, add this repo to your python path, so you can do "import pc_toolbox" 127 | 128 | $ export PYTHONPATH=$PC_REPO_DIR:$PYTHONPATH 129 | 130 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | Quick Links: 2 | 3 | * [Example datasets](#example-datasets) 4 | * [In-memory format](#in-memory-format) 5 | * [On-disk format](#on-disk-format) 6 | 7 | # Background: Datasets for supervised bag-of-words tasks 8 | 9 | We consider supervised bag-of-words tasks, where we have as observed data many examples (aka 'documents'), indexed by 'd', which consist of pairs $x_d, y_d$, where: 10 | 11 | * x_d represents the input count data 12 | * y_d represents some target outcome labels of interest (binary movie rating, real-valued document score, etc) 13 | 14 | In Python, we could represent these values using Numpy arrays: 15 | ``` 16 | * x_d_V : 1D array, size V 17 | bag-of-words count vector 18 | x_d_V[v] is a non-negative integer in {0, 1, 2, ...} 19 | 20 | * y_d_C : 1D array, size C 21 | outcome vector 22 | y_d_C[c] is a scalar 23 | If all binary, this is a multivariate-outcome binary classification task 24 | If all real-valued, this is a multivariate-outcome regression task. 25 | ``` 26 | 27 | Dataset size variables and abbreviations: 28 | ``` 29 | * D : int 30 | n_docs 31 | number of documents in current data subset 32 | * V : int 33 | n_vocabs 34 | number of vocabulary words 35 | * C : int 36 | n_labels 37 | number of outcome 38 | * U : int 39 | n_unique_tokens 40 | number of non-zero (doc_id, vocab_id) pairs in sparse matrix 41 | ``` 42 | 43 | 44 | # Example datasets 45 | 46 | This repo comes with two example datasets, provided in our standard [on-disk format](#on-disk-format): 47 | 48 | * [toy_bars_3x3/](https://github.com/dtak/prediction-constrained-topic-models/tree/master/datasets/toy_bars_3x3/) 49 | 50 | > Small toy dataset of 9 vocab words arranged in 3x3 grid. Useful for visualing inspecting learned topic structure, which look like bars on the 3x3 grid. 
51 | 52 | * [movie_reviews_pang_lee/](https://github.com/dtak/prediction-constrained-topic-models/tree/master/datasets/movie_reviews_pang_lee/) 53 | 54 | > Dataset of movie reviews, where prediction task is take a careful bag-of-words representation of plain-text reviews from professional critics, and predict a binary label of movie quality (1 = movie received more than 2-out-of-4 stars, 0 = otherwise). Originally from Pang & Lee ACL 2005. 55 | 56 | 57 | # In-memory format 58 | 59 | For PC toolbox code, we represent one entire dataset (e.g. the train set or the test set) as one **Python dictionary** ('dict') object. 60 | 61 | This dictionary has at least the following key,value entries: 62 | ``` 63 | * x_csr_DV : 2D scipy.sparse.csr_matrix, shape D x V (n_docs x n_vocabs) 64 | Each row is sparse representation of x_d's count data. 65 | 66 | * y_DC : 2D numpy array, shape D x C (n_docs x n_labels) 67 | Each row gives outcomes for doc d 68 | 69 | * n_docs : int 70 | Total number of documents in this dataset 71 | 72 | * n_vocabs : int 73 | Total number of possible vocabulary words in this dataset. 74 | ``` 75 | 76 | ## Python code for saving/loading 77 | 78 | A dataset's dictionary representation can be loaded/saved to disk via some useful functions defined in [`$PC_REPO_DIR/pc_toolbox/model_slda/slda_utils__dataset_manager.py`](https://github.com/dtak/prediction-constrained-topic-models/tree/master/pc_toolbox/model_slda/slda_utils__dataset_manager.py) 79 | 80 | Example usage: 81 | ``` 82 | >>> from slda_utils__dataset_manager import load_dataset 83 | >>> tr_dataset = load_dataset("$PC_REPO_DIR/datasets/toy_bars_3x3/", split_name='train') 84 | 85 | # Show y labels for first 5 documents 86 | >>> tr_dataset['y_DC'][:5] 87 | 88 | # Show dense array repr of x data of first 5 documents 89 | >>> tr_dataset['x_csr_DV'][:5].toarray() 90 | 91 | ``` 92 | 93 | 94 | # On-disk format 95 | 96 | Each dataset is located in its own folder on disk, such as [datasets/movie_reviews_pang_lee/](https://github.com/dtak/prediction-constrained-topic-models/tree/master/datasets/movie_reviews_pang_lee) 97 | 98 | Inside the folder, the dataset is represented by several files contents that must match the following file names: 99 | 100 | ``` 101 | * X_colnames.txt : utf-8 formatted text file 102 | V lines (one line per vocab term) 103 | Each line contains the string name of its corresponding vocab term 104 | 105 | * Y_colnames.txt : utf-8 formatted text file 106 | C lines (one line per outcome) 107 | Each line contains the string name of its corresponding outcome 108 | 109 | * X_csr_train.npz : npz file for a scipy.sparse.csr_matrix 110 | * X_csr_valid.npz : npz file for a scipy.sparse.csr_matrix 111 | * X_csr_test.npz : npz file for a scipy.sparse.csr_matrix 112 | 113 | * Y_train.npy : npy file 114 | * Y_valid.npy : npy file 115 | * Y_test.npy : npy file 116 | ``` 117 | 118 | 119 | ## X_csr Disk Format : .npz file 120 | 121 | The [.npz file format](https://docs.scipy.org/doc/numpy/reference/generated/numpy.savez.html) is a standard numpy way to save/load multiple related arrays to/from a single file. 122 | 123 | To make on-disk storage compact, we store the csr_matrix formated X for a single dataset split (train/valid/test) as a single file named "X_csr_train.npz". 124 | 125 | ``` 126 | X_csr_$SPLIT.npz : .npz file contains 127 | shape : 1D array, shape 2 128 | Encodes shape of the array (n_docs, n_vocabs) 129 | data : 1D array, shape U 130 | Contains count values for *all* non-zero (doc_id, vocab_id) entries. 
131 | indices : 1D array, shape U 132 | Contains vocab ids for *all* non-zero (doc_id, vocab_id) entries. 133 | indptr : 1D array, shape D+1 134 | Defines (start,stop) slices for each document within data and indices 135 | ``` 136 | 137 | To obtain the relevant arrays for a given dense array, just do: 138 | ``` 139 | >>> import scipy.sparse 140 | >>> x_arr = np.eye(10) 141 | >>> x_csr = scipy.sparse.csr_matrix(x_arr) 142 | >>> npz_dict = dict( 143 | ... shape=x_csr.shape, 144 | ... data=x_csr.data, 145 | ... indices=x_csr.indices, 146 | ... indptr=x_csr.indptr) 147 | ``` 148 | ## Y_split.npy Disk Format : .npy file 149 | 150 | The [.npy file format](https://docs.scipy.org/doc/numpy-dev/neps/npy-format.html) is a standard provided by numpy for saving/loading single arrays. 151 | 152 | We save the y outcomes from each dataset split (train/valid/test) as a single .npy file. 153 | 154 | 155 | 156 | ## TODO describe how missing values work 157 | -------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/README.md: -------------------------------------------------------------------------------- 1 | # Movie Reviews dataset (Pang and Lee 2005) 2 | 3 | Raw text from movie reviews of four critics comes from scaledata v1.0 dataset released by Pang and Lee (http://www.cs.cornell.edu/people/pabo/movie-review-data/). 4 | 5 | ## Preprocessing 6 | 7 | Given plain text files of movie reviews, we tokenized and then stemmed using the Snowball stemmer from the nltk Python package, so that words with similar roots (e.g. film, films, filming) all become the same token. We removed all tokens in Mallet's list of common English stop words as well as any token included in the 1000 most common first names from the US census. We added this step after seeing too many common first names like Michael and Jennifer appear meaninglessly in many top-word lists for trained topics. We manually whitelisted "oscar" and "tony" due to their saliency to movie reviews sentiment. We then performed counts of all remaining tokens across the full raw corpus of 5006 documents, discarding any tokens that appear at least once in more than 20\% of all documents or less than 30 distinct documents. The final vocabulary list has 5375 terms. 8 | 9 | Each of the 5006 original documents was then reduced to this vocabulary set. We discarded any documents that were too short (less than 20 tokens), leaving 5005 documents. Each document has a binary label, where 0 indicates it has a negative review (below 0.6 in the original datasets' 0-1 scale) and 1 indicates positive review (>= 0.6). This 0.6 threshold matches a threshold previously used in the raw data's 4-category scale to separate 0 and 1 star reviews from 2 and 3 (of 3) star reviews. Data pairs ($x_d, y_d$) were then split into training, validation, test. Both validation and test used 10 \% of all documents, evenly balancing positive and negative labeled documents. The remaining documents were allocated to the training set. 
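The exact preprocessing scripts are not distributed with this repo. The snippet below is only an illustrative sketch of the token-level steps described above (Snowball stemming, stop-word and first-name removal with a small whitelist, and binarizing the 0-1 rating scale at 0.6). The `stop_words` set and `raw_reviews` list are tiny hypothetical stand-ins, and the real pipeline additionally discards tokens appearing in more than 20% of documents or in fewer than 30 distinct documents before building the final count matrices.

```
from nltk.stem.snowball import SnowballStemmer

# Hypothetical stand-ins for the real inputs
# (Mallet stop-word list + 1000 most common census first names, raw review text)
stop_words = set(['the', 'a', 'and', 'of', 'michael', 'jennifer'])
whitelist = set(['oscar', 'tony'])
raw_reviews = [
    ("films like this deserve an oscar", 0.8),
    ("the directing and the acting are lifeless", 0.4),
]

stemmer = SnowballStemmer('english')

def tokenize_stem_filter(text):
    tokens = [w for w in text.lower().split() if w.isalpha()]
    tokens = [w for w in tokens if (w in whitelist) or (w not in stop_words)]
    # e.g. film / films / filming all map to the same token 'film'
    return [stemmer.stem(w) for w in tokens]

for text, rating in raw_reviews:
    y_d = int(rating >= 0.6)   # binary label: 1 if rated >= 0.6 on the 0-1 scale
    print('%s | y=%d' % (' '.join(tokenize_stem_filter(text)), y_d))
```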
10 | 11 | 12 | ## Dataset Specs 13 | 14 | Specs computed via 15 | ``` 16 | python $PC_REPO_DIR/pc_toolbox/model_slda/slda_utils__dataset_manager.py \ 17 | --dataset_path $PC_REPO_DIR/datasets/movie_reviews_pang_lee/ \ 18 | --dataset_name movie_reviews 19 | ``` 20 | 21 | ### TRAIN set of movie_reviews 22 | 23 | ``` 24 | 4004 docs 25 | 5338 vocab words 26 | unique tokens per doc 0%: 29 1%: 69 10%: 103 50%: 151 90%: 205 99%: 295 100%: 438 27 | total tokens per doc 0%: 29 1%: 77 10%: 120 50%: 183 90%: 260 99%: 403 100%: 644 28 | 1 labels 29 | 1.000 (4004/4004) docs are labeled 30 | more_than_2_out_of_4_stars ( 1/1) frac positive 0.578 ( 2315/4004) 31 | ``` 32 | 33 | ### VALID set of movie_reviews 34 | ``` 35 | 500 docs 36 | 5338 vocab words 37 | unique tokens per doc 0%: 33 1%: 64 10%: 107 50%: 153 90%: 213 99%: 296 100%: 462 38 | total tokens per doc 0%: 46 1%: 76 10%: 125 50%: 189 90%: 272 99%: 416 100%: 780 39 | 1 labels 40 | 1.000 (500/500) docs are labeled 41 | more_than_2_out_of_4_stars ( 1/1) frac positive 0.498 ( 249/500) 42 | ``` 43 | 44 | ### TEST set of movie_reviews 45 | ``` 46 | 501 docs 47 | 5338 vocab words 48 | unique tokens per doc 0%: 37 1%: 74 10%: 101 50%: 146 90%: 206 99%: 300 100%: 405 49 | total tokens per doc 0%: 39 1%: 84 10%: 119 50%: 177 90%: 264 99%: 406 100%: 621 50 | 1 labels 51 | 1.000 (501/501) docs are labeled 52 | more_than_2_out_of_4_stars ( 1/1) frac positive 0.547 ( 274/501) 53 | ``` 54 | -------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/X_csr_test.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/movie_reviews_pang_lee/X_csr_test.npz -------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/X_csr_train.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/movie_reviews_pang_lee/X_csr_train.npz -------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/X_csr_valid.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/movie_reviews_pang_lee/X_csr_valid.npz -------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/Y_colnames.txt: -------------------------------------------------------------------------------- 1 | more_than_2_out_of_4_stars 2 | -------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/Y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/movie_reviews_pang_lee/Y_test.npy -------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/Y_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/movie_reviews_pang_lee/Y_train.npy 
-------------------------------------------------------------------------------- /datasets/movie_reviews_pang_lee/Y_valid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/movie_reviews_pang_lee/Y_valid.npy -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/README.md: -------------------------------------------------------------------------------- 1 | Toy Bars 3x3 Dataset 2 | 3 | # To rebuild the dataset 4 | 5 | We provide a Makefile and python scripts inside the src/ directory. 6 | 7 | ``` 8 | cd src/ 9 | make dataset 10 | make solutions 11 | ``` 12 | -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/X_colnames.txt: -------------------------------------------------------------------------------- 1 | needle 2 | finance 3 | tech 4 | river 5 | bank 6 | stream 7 | mineral 8 | gold 9 | silicon 10 | -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/X_csr_test.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/X_csr_test.npz -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/X_csr_train.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/X_csr_train.npz -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/X_csr_valid.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/X_csr_valid.npz -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/Y_colnames.txt: -------------------------------------------------------------------------------- 1 | has_needle 2 | -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/Y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/Y_test.npy -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/Y_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/Y_train.npy -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/Y_valid.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/Y_valid.npy -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/good_loss_label_rep_K4_param_dict.dump: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/good_loss_label_rep_K4_param_dict.dump -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/good_loss_pc_K4_param_dict.dump: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/good_loss_pc_K4_param_dict.dump -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/good_loss_x_K4_param_dict.dump: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/good_loss_x_K4_param_dict.dump -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/good_loss_y_K4_param_dict.dump: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/datasets/toy_bars_3x3/good_loss_y_K4_param_dict.dump -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/src/Makefile: -------------------------------------------------------------------------------- 1 | 2 | dataset: 3 | python make_dataset.py --dataset_path ../ 4 | 5 | solutions: 6 | python make_possible_solutions.py --dataset_path ../ 7 | -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/src/make_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import scipy.sparse 5 | from distutils.dir_util import mkpath 6 | from sklearn.externals import joblib 7 | 8 | vocab_list = np.asarray([ 9 | ['needle', 'finance', 'tech'], 10 | ['river', 'bank', 'stream'], 11 | ['mineral', 'gold', 'silicon'], 12 | ]).flatten().tolist() 13 | 14 | tA = np.asarray([ 15 | [.00, .00, .00], 16 | [.16, .16, .16], 17 | [.16, .16, .16], 18 | ]) 19 | tB = np.asarray([ 20 | [.00, .16, .16], 21 | [.00, .16, .16], 22 | [.00, .16, .16], 23 | ]) 24 | tC = np.asarray([ 25 | [.00, .00, .00], 26 | [.33, .33, .33], 27 | [.00, .00, .00], 28 | ]) 29 | tD = np.asarray([ 30 | [.00, .00, .00], 31 | [.00, .00, .00], 32 | [.33, .33, .33], 33 | ]) 34 | tE = np.asarray([ 35 | [.00, .33, .00], 36 | [.00, .33, .00], 37 | [.00, .33, .00], 38 | ]) 39 | tF = np.asarray([ 40 | [.00, .00, .33], 41 | [.00, .00, .33], 42 | [.00, .00, .33], 43 | ]) 44 | tG = np.asarray([ 45 | [.00, .33, .00], 46 | [.33, .33, .33], 47 | [.00, .33, .00], 48 | ]) 49 | tH = np.asarray([ 50 | [.00, .00, .33], 51 | [.00, .00, .33], 52 | [.33, .33, .33], 53 | ]) 54 | proba_list = [.38, .38, .08, .08, .02, .02, .02, .02] 55 | topic_list = [tA, tB, tC, tD, tE, tF, tG, tH] 56 | for t in topic_list: 57 | t /= t.sum() 58 | 59 | 60 | def draw_random_doc( 61 | topic_list, 62 | proba_list, 63 | min_n_words_per_doc=45, 64 | max_n_words_per_doc=60, 65 | do_return_square=True, 66 | proba_positive_label=0.2, 67 | d=0): 68 | prng = np.random.RandomState(d) 69 | V = topic_list[0].size 70 | 71 | # Pick which template 72 | k = 
prng.choice(len(proba_list), p=proba_list) 73 | n_words = prng.randint(low=min_n_words_per_doc, high=max_n_words_per_doc) 74 | words = prng.choice( 75 | V, 76 | p=topic_list[k].flatten(), 77 | replace=True, 78 | size=n_words) 79 | x_V = np.bincount(words, minlength=V) 80 | if prng.rand() < proba_positive_label: 81 | y_C = 1.0 82 | x_V[0] += 1 83 | else: 84 | y_C = 0.0 85 | return x_V, y_C 86 | 87 | 88 | def save_csr_matrix(filename, array): 89 | np.savez( 90 | filename, 91 | data=array.data, 92 | indices=array.indices, 93 | indptr=array.indptr, 94 | shape=array.shape) 95 | 96 | if __name__ == '__main__': 97 | 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument("--dataset_path", default=os.path.abspath('.'), type=str) 100 | parser.add_argument("--n_docs_train", default=500, type=int) 101 | parser.add_argument("--n_docs_test", default=500, type=int) 102 | parser.add_argument("--n_docs_valid", default=500, type=int) 103 | 104 | args = parser.parse_args() 105 | dataset_path = os.path.abspath(args.dataset_path) 106 | 107 | x_list = list() 108 | y_list = list() 109 | n_docs = args.n_docs_train + args.n_docs_valid + args.n_docs_test 110 | for d in range(n_docs): 111 | x_V, y_C = draw_random_doc( 112 | topic_list, 113 | proba_list, 114 | do_return_square=False, 115 | d=d, 116 | ) 117 | x_list.append(x_V) 118 | y_list.append(y_C) 119 | if (d+1) % 100 == 0 or (d == n_docs -1): 120 | print "generated doc %d/%d" % (d+1, n_docs) 121 | 122 | # stack into array format 123 | x_DV = np.vstack(x_list) 124 | x_csr_DV = scipy.sparse.csr_matrix(x_DV) 125 | y_DC = np.vstack(y_list) 126 | if y_DC.ndim == 1: 127 | y_DC = y_DC[:,np.newaxis] 128 | 129 | train_doc_ids = np.arange(args.n_docs_train) 130 | valid_doc_ids = np.arange( 131 | args.n_docs_train, 132 | args.n_docs_train + args.n_docs_valid) 133 | test_doc_ids = np.arange( 134 | args.n_docs_train + args.n_docs_valid, 135 | x_DV.shape[0]) 136 | 137 | np.save(os.path.join(dataset_path, "Y_train.npy"), y_DC[train_doc_ids]) 138 | np.save(os.path.join(dataset_path, "Y_valid.npy"), y_DC[valid_doc_ids]) 139 | np.save(os.path.join(dataset_path, "Y_test.npy"), y_DC[test_doc_ids]) 140 | 141 | save_csr_matrix(os.path.join(dataset_path, "X_csr_train.npz"), x_csr_DV[train_doc_ids]) 142 | save_csr_matrix(os.path.join(dataset_path, "X_csr_valid.npz"), x_csr_DV[valid_doc_ids]) 143 | save_csr_matrix(os.path.join(dataset_path, "X_csr_test.npz"), x_csr_DV[test_doc_ids]) 144 | 145 | 146 | # Write necessary txt files 147 | V = x_DV.shape[1] 148 | with open(os.path.join(dataset_path, 'X_colnames.txt'), 'w') as f: 149 | for vocab_term in vocab_list: 150 | f.write('%s\n' % vocab_term) 151 | with open(os.path.join(dataset_path, 'Y_colnames.txt'), 'w') as f: 152 | f.write('has_needle\n') 153 | -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/src/make_partial_labeled_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import scipy.sparse 5 | from distutils.dir_util import mkpath 6 | from sklearn.externals import joblib 7 | 8 | from sscape.utils_io import load_csr_matrix, save_csr_matrix 9 | import bow_dataset 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--dataset_path", default=".", type=str) 16 | parser.add_argument( 17 | "--output_path", default="./frac_labels=$frac_labels_train/", type=str) 18 | parser.add_argument("--frac_labels_train", 
default=0.2, type=float) 19 | args = parser.parse_args() 20 | locals().update(vars(args)) 21 | 22 | dataset_path = os.path.abspath(dataset_path) 23 | output_path = os.path.abspath(output_path.replace( 24 | '$frac_labels_train', '%.3f' % frac_labels_train)) 25 | for key in sorted(vars(args).keys()): 26 | print '--%s %s' % (key, locals()[key]) 27 | 28 | for split_name in ['train', 'valid', 'test']: 29 | dataset_info = bow_dataset.load_dataset( 30 | dataset_path, split_name, 31 | frac_labels_train=frac_labels_train) 32 | dataset = dataset_info['dataset'] 33 | 34 | if 'y_rowmask' in dataset: 35 | dataset['y_DC'][dataset['y_rowmask']==0] = np.nan 36 | 37 | print bow_dataset.describe_bow_dataset( 38 | dataset=dataset, 39 | dataset_name="haystack: %s set" % split_name, 40 | label_list=dataset_info.get('label_list', None)) 41 | bow_dataset.save_dataset( 42 | dataset=dataset, 43 | output_path=output_path, 44 | split_name=split_name) 45 | 46 | print dataset_info.keys() 47 | with open(os.path.join(output_path, 'X_colnames.txt'), 'w') as f: 48 | for xname in dataset_info['vocab_list']: 49 | f.write("%s\n" % xname) 50 | with open(os.path.join(output_path, 'Y_colnames.txt'), 'w') as f: 51 | for name in dataset_info['label_list']: 52 | f.write("%s\n" % name) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /datasets/toy_bars_3x3/src/make_possible_solutions.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sklearn.linear_model 4 | import sys 5 | import os 6 | 7 | from sklearn.externals import joblib 8 | 9 | from pc_toolbox.model_slda import ( 10 | slda_loss__autograd, 11 | slda_utils__dataset_manager) 12 | 13 | V = 9 14 | b1 = np.asarray([ 15 | [.00, .00, .00], 16 | [.33, .33, .33], 17 | [.00, .00, .00], 18 | ]) 19 | b2 = np.asarray([ 20 | [.00, .00, .00], 21 | [.00, .00, .00], 22 | [.33, .33, .33], 23 | ]) 24 | b3 = np.asarray([ 25 | [.00, .33, .00], 26 | [.00, .33, .00], 27 | [.00, .33, .00], 28 | ]) 29 | b4 = np.asarray([ 30 | [.00, .00, .33], 31 | [.00, .00, .33], 32 | [.00, .00, .33], 33 | ]) 34 | bY = np.asarray([ 35 | [.99, .00, .00], 36 | [.00, .00, .00], 37 | [.00, .00, .00], 38 | ]) 39 | 40 | def make_one_hot_topic(hot_word_id): 41 | bY = np.zeros((1,9)) 42 | bY[0, hot_word_id] = 0.99 43 | return bY 44 | 45 | # Reshape to 1 x V 46 | for arr_name in ['b1', 'b2', 'b3', 'b4', 'bY']: 47 | arr = locals()[arr_name] 48 | assert np.allclose(0.99, np.sum(arr)) 49 | locals()[arr_name] = np.reshape(arr, (1,9)) 50 | 51 | topics_KV_by_name = { 52 | 'good_loss_x_K4': 53 | np.vstack([b3, b4, b1, b2]), 54 | 'good_loss_pc_K4': 55 | np.vstack([bY, b3 + b4, b1, b2]), 56 | 'good_loss_label_rep_K4': 57 | np.vstack([b3 + b4, b3 + b4, b1 + b2, b1 + b2]), 58 | 'good_loss_y_K4': 59 | np.vstack([ 60 | make_one_hot_topic(0), 61 | make_one_hot_topic(1), 62 | make_one_hot_topic(3), 63 | make_one_hot_topic(8)]), 64 | } 65 | 66 | for arr in topics_KV_by_name.values(): 67 | # Start each topic with mass ~1.0 68 | arr /= arr.sum(axis=1)[:,np.newaxis] 69 | # Add small extra mass to each vocab term 70 | arr += .001 71 | # Normalize so sums to one 72 | arr /= arr.sum(axis=1)[:,np.newaxis] 73 | 74 | np.set_printoptions(linewidth=120, precision=4, suppress=1) 75 | for key in topics_KV_by_name: 76 | print key 77 | print topics_KV_by_name[key] 78 | 79 | 80 | w_CK_by_name = { 81 | 'good_loss_x_K4': 82 | np.asarray([[-01.0, -01.0, -01.0, -01.0]]), 83 | 'good_loss_pc_K4': 84 | np.asarray([[+40.0, 
-02.0, -02.0, -02.0]]), 85 | 'good_loss_label_rep_K4': 86 | np.asarray([[+10.0, -10.0, +10.0, -10.0]]), 87 | 'good_loss_y_K4': 88 | np.asarray([[+30.0, -3.0, -3.0, -3.0]]), 89 | } 90 | 91 | if __name__ == '__main__': 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument("--dataset_path", default=os.path.abspath('.'), type=str) 94 | args = parser.parse_args() 95 | dataset_path = os.path.abspath(args.dataset_path) 96 | 97 | 98 | dataset = slda_utils__dataset_manager.load_dataset(dataset_path, 'train') 99 | keys = w_CK_by_name.keys() 100 | Cs_grid = np.logspace(-5, 5, 11) 101 | best_pos = np.flatnonzero(1.0/Cs_grid == 0.001)[0] 102 | prior_Cs_grid = 0.01 * np.exp(-1.0 * (best_pos - np.arange(11))**2 / 50.0) 103 | 104 | print "" 105 | print "==== FINE TUNING WEIGHT VECTORS" 106 | pi_estimation_weight_y = 0.0 107 | pi_estimation_mode = 'missing_y' 108 | nef_alpha = 1.1 109 | tau = 1.1 110 | lambda_w = 0.001 111 | 112 | for key in keys: 113 | topics_KV = topics_KV_by_name[key] 114 | w_CK = w_CK_by_name[key] 115 | 116 | # Perform loss calculation (also delivers pi_DK) 117 | loss_dict = slda_loss__autograd.calc_loss__slda( 118 | dataset=dataset, 119 | topics_KV=topics_KV, 120 | w_CK=w_CK, 121 | weight_x=1.0, 122 | weight_y=1.0, 123 | pi_estimation_mode=pi_estimation_mode, 124 | pi_estimation_weight_y=pi_estimation_weight_y, 125 | nef_alpha=nef_alpha, 126 | tau=tau, 127 | lambda_w=lambda_w, 128 | return_dict=True) 129 | 130 | print "" 131 | print "======", key 132 | print loss_dict['summary_msg'] 133 | 134 | # Fit logistic regression model via cross-validation 135 | feat_DK = loss_dict['pi_DK'] 136 | y_D = dataset['y_DC'][:,0] 137 | cv_clf = sklearn.linear_model.LogisticRegressionCV( 138 | fit_intercept=False, 139 | Cs=Cs_grid, 140 | cv=3, # num folds 141 | random_state=np.random.RandomState(42)) 142 | cv_clf.fit(feat_DK, y_D) 143 | 144 | acc_per_Cval = ( 145 | np.median(cv_clf.scores_.values()[0], axis=0) 146 | + prior_Cs_grid) 147 | best_p = np.argmax(acc_per_Cval) 148 | best_C = Cs_grid[best_p] 149 | print "## best lambda_w" 150 | print 0.5 / best_C 151 | 152 | clf_with_best_C = sklearn.linear_model.LogisticRegression( 153 | fit_intercept=False, C=best_C) 154 | clf_with_best_C.fit(feat_DK, y_D) 155 | 156 | print "## best w_CK:" 157 | print clf_with_best_C.coef_ 158 | w_CK_by_name[key] = clf_with_best_C.coef_ 159 | 160 | 161 | print "" 162 | print "==== REPORTING RESULTS WITH FIXED TOPICS AND FINE-TUNED WEIGHTS" 163 | for pi_estimation_mode in ['missing_y']: #, 'observe_y']: 164 | print "" 165 | print '---- pi_estimation_mode =', pi_estimation_mode 166 | for key in keys: 167 | topics_KV = topics_KV_by_name[key] 168 | w_CK = w_CK_by_name[key] 169 | loss_dict = slda_loss__autograd.calc_loss__slda( 170 | dataset=dataset, 171 | topics_KV=topics_KV, 172 | w_CK=w_CK, 173 | weight_x=1.0, 174 | weight_y=1.0, 175 | pi_estimation_mode=pi_estimation_mode, 176 | pi_estimation_weight_y=pi_estimation_weight_y, 177 | nef_alpha=nef_alpha, 178 | tau=tau, 179 | lambda_w=lambda_w, 180 | return_dict=True) 181 | print "%-25s uloss_x__pertok %.4f\n%-25s uloss_y__perdoc %.4f\n" % ( 182 | key, loss_dict['uloss_x__pertok'], 183 | '', loss_dict['uloss_y__perdoc']) 184 | 185 | 186 | 187 | print "" 188 | print "==== SAVING PARAMS PACKAGED UP AS _param_dict.dump" 189 | for key in keys: 190 | fpath = os.path.join(dataset_path, '%s_param_dict.dump' % (key)) 191 | GP = dict( 192 | topics_KV=topics_KV_by_name[key], 193 | w_CK=w_CK_by_name[key], 194 | n_labels=1, 195 | n_states=4, 196 | n_vocabs=9) 197 | joblib.dump( 
198 | GP, 199 | fpath, 200 | compress=1) 201 | -------------------------------------------------------------------------------- /pc_toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import utils_io 4 | import utils_data 5 | import utils_snapshots 6 | 7 | import model_slda 8 | 9 | # TODO discard this line 10 | # calc_nef_map_pi_DK = model_slda.calc_nef_map_pi_DK 11 | 12 | PC_REPO_DIR = os.path.sep.join( 13 | os.path.abspath(__file__).split(os.path.sep)[:-2]) 14 | 15 | ## Create version attrib 16 | __version__ = None 17 | version_txt_path = os.path.join(PC_REPO_DIR, 'version.txt') 18 | if os.path.exists(version_txt_path): 19 | with open(version_txt_path, 'r') as f: 20 | __version__ = f.readline().strip() 21 | 22 | ## Create requirements attrib 23 | __requirements__ = None 24 | reqs_txt_path = os.path.join(PC_REPO_DIR, 'requirements.txt') 25 | if os.path.exists(reqs_txt_path): 26 | with open(reqs_txt_path, 'r') as f: 27 | __requirements__ = [] 28 | for line in f.readlines(): 29 | __requirements__.append(line.strip()) 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /pc_toolbox/algs_gradient_descent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/pc_toolbox/algs_gradient_descent/__init__.py -------------------------------------------------------------------------------- /pc_toolbox/algs_gradient_descent/scipy_lbfgs_minimizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import time 4 | 5 | import scipy.optimize 6 | 7 | from pc_toolbox.utils_io import ( 8 | pprint, 9 | do_print_now, 10 | do_save_now, 11 | default_settings_alg_io, 12 | init_alg_state_kwargs, 13 | update_alg_state_kwargs, 14 | make_status_string, 15 | save_status_to_txt_files, 16 | append_to_txtfile, 17 | update_alg_state_kwargs_after_print, 18 | update_alg_state_kwargs_after_save, 19 | calc_laps_when_snapshots_saved, 20 | ) 21 | 22 | from grad_descent_minimizer import calc_l2_norm_of_vector_per_entry 23 | 24 | def minimize( 25 | loss_func_wrt_paramvec_and_step=None, 26 | grad_func_wrt_paramvec_and_step=None, 27 | save_func_wrt_param_dict=None, 28 | callback_func_wrt_param_dict=None, 29 | callback_kwargs=None, 30 | param_tfm_manager=None, 31 | dim_P=None, 32 | init_param_dict=None, 33 | n_line_search_steps=10, 34 | n_terms_approx_hessian=10, 35 | **kwargs): 36 | """ Minimize provided loss function using L-BFGS algorithm 37 | 38 | Returns 39 | ------- 40 | param_dict : dict 41 | Contains estimated parameters that minimize the loss 42 | alg_state_dict : dict 43 | Contains algorithm information (num steps completed, etc.) 
44 | """ 45 | pprint('[scipy_lbfgs_minimizer] Begin training...') 46 | pprint('--n_line_search_steps %.3f' % n_line_search_steps) 47 | pprint('--n_terms_approx_hessian %.3f' % n_terms_approx_hessian) 48 | 49 | # Parse user input 50 | n_line_search_steps = int(n_line_search_steps) 51 | n_terms_approx_hessian = int(n_terms_approx_hessian) 52 | 53 | # Convert provided common param dict 54 | # to a flat 1D array with unconstrained values 55 | param_vec = param_tfm_manager.flatten_to_differentiable_param_vec( 56 | init_param_dict, 57 | **dim_P) 58 | 59 | # Warmup 60 | start_time_sec = time.time() 61 | init_loss_val = loss_func_wrt_paramvec_and_step(param_vec, step_id=0) 62 | loss_eval_time_sec = time.time() - start_time_sec 63 | pprint("Loss @ init: %8.3f sec | val %.6e" % ( 64 | loss_eval_time_sec, init_loss_val)) 65 | pprint("Params @ init: %8s | %5d params | l2 norm / entry %.4e" % ( 66 | ' ', 67 | param_vec.size, 68 | calc_l2_norm_of_vector_per_entry(param_vec))) 69 | start_time_sec = time.time() 70 | init_grad_vec = grad_func_wrt_paramvec_and_step(param_vec, step_id=0) 71 | elapsed_time_sec = time.time() - start_time_sec 72 | init_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(init_grad_vec) 73 | pprint("Gradient @ init: %8.3f sec | %5d params | l2 norm / entry %.4e" % ( 74 | elapsed_time_sec, init_grad_vec.size, init_grad_norm_per_entry)) 75 | 76 | # Create settings that track algorithm state 77 | # cur_step, cur_lap, n_laps, n_steps, etc. 78 | alg_state_kwargs = init_alg_state_kwargs( 79 | cur_step=0.0, 80 | **kwargs) 81 | n_steps = alg_state_kwargs['n_steps'] 82 | if 'output_path' in alg_state_kwargs: 83 | laps_to_save_str, steps_to_save_str = calc_laps_when_snapshots_saved( 84 | return_str=True, 85 | keep_first=5, 86 | keep_last=5, 87 | **alg_state_kwargs) 88 | pprint("Snapshots will be saved at intervals:") 89 | pprint(" laps: %s" % laps_to_save_str) 90 | pprint(" steps: %s" % steps_to_save_str) 91 | pprint("Snapshot saved to --output_path:\n%s" % ( 92 | alg_state_kwargs['output_path'])) 93 | 94 | # Translate settings into scipy's specific options format 95 | options_dict = dict( 96 | maxiter=n_steps, 97 | maxfun=n_line_search_steps * n_steps, 98 | maxcor=n_terms_approx_hessian, 99 | maxls=n_line_search_steps, 100 | ftol=0.0, 101 | gtol=0.0, 102 | ) 103 | alg_state_kwargs['cur_loss_val'] = init_loss_val 104 | 105 | ## Define special callback function 106 | # Which does things like print progress at relevant steps 107 | # Save snapshots to files at relevant steps, etc. 108 | def my_callback_func( 109 | cur_param_vec, 110 | is_init=False, 111 | alg_state_kwargs=alg_state_kwargs): 112 | # Update step counter, timer, etc. 
113 | if not is_init: 114 | alg_state_kwargs.update( 115 | update_alg_state_kwargs( 116 | **alg_state_kwargs)) 117 | if do_print_now(**alg_state_kwargs) or do_save_now(**alg_state_kwargs): 118 | cur_loss_val = loss_func_wrt_paramvec_and_step(cur_param_vec) 119 | alg_state_kwargs['cur_loss_val'] = cur_loss_val 120 | 121 | if do_print_now(**alg_state_kwargs): 122 | pprint(make_status_string( 123 | **alg_state_kwargs)) # assume cur_loss_val is inside 124 | save_status_to_txt_files( 125 | **alg_state_kwargs) 126 | alg_state_kwargs.update( 127 | update_alg_state_kwargs_after_print(**alg_state_kwargs)) 128 | 129 | if do_save_now(**alg_state_kwargs): 130 | param_dict = param_tfm_manager.unflatten_to_common_param_dict( 131 | cur_param_vec, **dim_P) 132 | if save_func_wrt_param_dict is not None: 133 | save_func_wrt_param_dict( 134 | param_dict=param_dict, 135 | **alg_state_kwargs) 136 | if callback_func_wrt_param_dict is not None: 137 | callback_func_wrt_param_dict( 138 | param_dict=param_dict, 139 | losstrain_ttl=alg_state_kwargs.get('cur_loss_val', init_loss_val), 140 | alg_state_kwargs=alg_state_kwargs, 141 | **callback_kwargs) 142 | alg_state_kwargs.update( 143 | update_alg_state_kwargs_after_save(**alg_state_kwargs)) 144 | 145 | ## Run training ... 146 | my_callback_func(param_vec, is_init=True) 147 | if n_steps > 0: 148 | opt_result_obj = scipy.optimize.minimize( 149 | loss_func_wrt_paramvec_and_step, 150 | param_vec, 151 | method='l-bfgs-b', 152 | jac=grad_func_wrt_paramvec_and_step, 153 | options=options_dict, 154 | callback=my_callback_func) 155 | pprint('[scipy_lbfgs_minimizer] msg %s' % opt_result_obj.message) 156 | param_vec = opt_result_obj.x 157 | # Relies on alg_state_kwargs already being defined in callback 158 | my_callback_func(param_vec) 159 | 160 | param_dict = param_tfm_manager.unflatten_to_common_param_dict( 161 | param_vec, **dim_P) 162 | pprint('[scipy_lbfgs_minimizer] Done with training.') 163 | return param_dict, alg_state_kwargs 164 | -------------------------------------------------------------------------------- /pc_toolbox/binary_classifiers/calc_roc_auc_via_bootstrap.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from sklearn.metrics import roc_auc_score 4 | 5 | def verify_min_examples_per_label(y_NC, min_examples_per_label): 6 | ''' 7 | 8 | Examples 9 | -------- 10 | >>> y_all_0 = np.zeros(10) 11 | >>> y_all_1 = np.ones(30) 12 | >>> verify_min_examples_per_label(y_all_0, 3) 13 | False 14 | >>> verify_min_examples_per_label(y_all_1, 2) 15 | False 16 | >>> verify_min_examples_per_label(np.hstack([y_all_0, y_all_1]), 10) 17 | True 18 | >>> verify_min_examples_per_label(np.eye(3), 2) 19 | False 20 | ''' 21 | if y_NC.ndim < 2: 22 | y_NC = np.atleast_2d(y_NC).T 23 | n_C = np.sum(np.isfinite(y_NC), axis=0) 24 | n_pos_C = n_C * np.nanmean(y_NC, axis=0) 25 | min_neg = np.max(n_C - n_pos_C) 26 | min_pos = np.min(n_pos_C) 27 | if min_pos < min_examples_per_label: 28 | return False 29 | elif min_neg < min_examples_per_label: 30 | return False 31 | return True 32 | 33 | def calc_binary_clf_metric_with_ci_via_bootstrap( 34 | y_pred=None, 35 | y_true=None, 36 | metric_func=roc_auc_score, 37 | seed=42, 38 | verbose=False, 39 | n_bootstraps=1000, 40 | stratify_pos_and_neg=True, 41 | min_examples_per_label=10, 42 | return_dict=False, 43 | ci_tuples=[(10,90)]): 44 | if not isinstance(ci_tuples, list): 45 | ci_tuples = [ci_tuples] 46 | for ci_tuple in ci_tuples: 47 | assert len(ci_tuple) == 2 48 | 49 | 
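    # Point estimate of the metric on the full sample.
    # The bootstrap loop below resamples examples with replacement
    # (optionally stratified to keep the original pos/neg counts, and
    # rejecting resamples with too few examples of either label),
    # then reports percentile confidence intervals over the resampled scores.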
roc_auc_value = metric_func(y_true, y_pred) 50 | if verbose: 51 | print( 52 | "Original score: {:0.3f}".format(roc_auc_value)) 53 | 54 | n_samples = y_true.shape[0] 55 | prng = np.random.RandomState(seed) 56 | 57 | bootstrapped_scores = np.zeros(n_bootstraps, dtype=np.float64) 58 | 59 | if stratify_pos_and_neg: 60 | assert y_true.ndim == 1 61 | pos_ids = np.flatnonzero(y_true == 1) 62 | neg_ids = np.flatnonzero(y_true == 0) 63 | min_ex = np.minimum(pos_ids.size, neg_ids.size) 64 | assert min_ex >= min_examples_per_label 65 | i = 0 66 | while i < n_bootstraps: 67 | 68 | # Sample from original population with replacement 69 | if stratify_pos_and_neg: 70 | # Preserving the original number of pos and neg examples 71 | sampled_pos_inds = prng.random_integers( 72 | 0, len(pos_ids) - 1, len(pos_ids)) 73 | sampled_neg_inds = prng.random_integers( 74 | 0, len(neg_ids) - 1, len(neg_ids)) 75 | sampled_ids = np.hstack([ 76 | neg_ids[sampled_neg_inds], 77 | pos_ids[sampled_pos_inds]]) 78 | else: 79 | # Don't care about pos and neg ratio at all 80 | sampled_ids = prng.choice( 81 | n_samples, size=n_samples, replace=True) 82 | 83 | sampled_y_true = y_true[sampled_ids] 84 | sampled_y_pred = y_pred[sampled_ids] 85 | is_good = verify_min_examples_per_label(sampled_y_true, min_examples_per_label) 86 | if not is_good: 87 | continue 88 | 89 | bootstrapped_scores[i] = metric_func(sampled_y_true, sampled_y_pred) 90 | i += 1 91 | 92 | if verbose: 93 | for perc in [05, 10, 25, 50, 75, 90, 95]: 94 | print "%02d percentile: %.3f" % ( 95 | perc, np.percentile(bootstrapped_scores, perc)) 96 | 97 | intervals = list() 98 | for ci_tuple in ci_tuples: 99 | ci_bound_low = int(ci_tuple[0]) 100 | ci_bound_high = int(ci_tuple[1]) 101 | interval = ( 102 | np.percentile(bootstrapped_scores, ci_bound_low), 103 | np.percentile(bootstrapped_scores, ci_bound_high), 104 | ) 105 | intervals.append(interval) 106 | if verbose: 107 | print "CI %2d-%2d: %.3f - %.3f" % ( 108 | ci_bound_low, ci_bound_high, 109 | interval[0], interval[1], 110 | ) 111 | 112 | if return_dict: 113 | info_dict = dict( 114 | ci_tuples=ci_tuples, 115 | bootstrapped_scores=bootstrapped_scores) 116 | return roc_auc_value, intervals, info_dict 117 | else: 118 | return roc_auc_value, intervals 119 | 120 | if __name__ == '__main__': 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--seed', type=int, default=42) 123 | parser.add_argument('--n_bootstraps', type=int, default=1000) 124 | parser.add_argument('--stratify_pos_and_neg', type=int, default=1) 125 | parser.add_argument('--verbose', type=int, default=1) 126 | arg_dict = vars(parser.parse_args()) 127 | 128 | y_pred = np.array([0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04]) 129 | y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0 ]) 130 | 131 | val, ci = calc_binary_clf_metric_with_ci_via_bootstrap( 132 | y_pred=y_pred, 133 | y_true=y_true, 134 | return_dict=False, 135 | **arg_dict) 136 | -------------------------------------------------------------------------------- /pc_toolbox/binary_classifiers/utils_calibration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import expit 3 | import matplotlib.gridspec as gridspec 4 | import matplotlib.pyplot as plt 5 | 6 | def plot_binary_clf_calibration_curve_and_histograms( 7 | info_per_bin=None, 8 | fig_kws=dict( 9 | figsize=(1.4*3, 1.4*4), 10 | tight_layout=True), 11 | ): 12 | fig_h = plt.figure(**fig_kws) 13 | ax_grid = gridspec.GridSpec( 14 | nrows=4, ncols=1, 15 | 
height_ratios=[1, 1, 4, 0.1], 16 | ) 17 | ax_cal = fig_h.add_subplot(ax_grid[2,0]) 18 | ax_TP = fig_h.add_subplot(ax_grid[0,0]) 19 | ax_TN = fig_h.add_subplot(ax_grid[1,0]) 20 | 21 | # Plot calibration curve 22 | # First, lay down idealized line from 0-1 23 | unit_grid = np.linspace(0, 1, 10) 24 | ax_cal.plot( 25 | unit_grid, unit_grid, 'k--', alpha=0.5) 26 | # Then, plot actual-vs-expected fractions on top 27 | ax_cal.plot( 28 | info_per_bin['xcenter_per_bin'], 29 | info_per_bin['fracTP_per_bin'], 30 | 'ks-') 31 | ax_cal.set_ylabel('frac. true positive') 32 | ax_cal.set_xlabel('predicted proba.') 33 | 34 | # Plot TP histogram 35 | ax_TP.bar( 36 | info_per_bin['xcenter_per_bin'], 37 | info_per_bin['countTP_per_bin'], 38 | width=0.9*info_per_bin['xwidth_per_bin'], 39 | color='b') 40 | 41 | # Plot TN histogram 42 | ax_TN.bar( 43 | info_per_bin['xcenter_per_bin'], 44 | info_per_bin['countTN_per_bin'], 45 | width=0.9*info_per_bin['xwidth_per_bin'], 46 | color='r') 47 | for ax in [ax_cal, ax_TP, ax_TN]: 48 | ax.set_xlim([0, 1]) 49 | ax_cal.set_ylim([0, 1]) 50 | 51 | def calc_binary_clf_calibration_per_bin( 52 | y_true, y_prob, 53 | bins=10): 54 | """ 55 | """ 56 | if y_prob.min() < 0 or y_prob.max() > 1: 57 | raise ValueError("y_prob has values outside [0, 1]") 58 | 59 | bins = np.asarray(bins) 60 | if bins.ndim == 1 and bins.size > 1: 61 | bin_edges = bins 62 | else: 63 | bin_edges = np.linspace(0, 1, int(bins) + 1) 64 | if bin_edges[-1] == 1.0: 65 | bin_edges[-1] += 1e-8 66 | assert bin_edges.ndim == 1 67 | assert bin_edges.size > 2 68 | nbins = bin_edges.size - 1 69 | # Assign each predicted probability into one bin 70 | # from 0, 1, ... nbins 71 | binids = np.digitize(y_prob, bin_edges) - 1 72 | assert binids.max() <= nbins 73 | assert binids.min() >= 0 74 | 75 | count_per_bin = np.bincount(binids, minlength=nbins) 76 | countTP_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 1) 77 | countTN_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 0) 78 | 79 | # This divide will (and should) yield nan 80 | # if any bin has no content 81 | fracTP_per_bin = countTP_per_bin / np.asarray(count_per_bin, dtype=np.float64) 82 | 83 | info_per_bin = dict( 84 | count_per_bin=count_per_bin, 85 | countTP_per_bin=countTP_per_bin, 86 | countTN_per_bin=countTN_per_bin, 87 | fracTP_per_bin=fracTP_per_bin, 88 | xcenter_per_bin=0.5 * (bin_edges[:-1] + bin_edges[1:]), 89 | xwidth_per_bin=(bin_edges[1:] - bin_edges[:-1]), 90 | bin_edges=bin_edges, 91 | ) 92 | return info_per_bin 93 | 94 | 95 | if __name__ == '__main__': 96 | prng = np.random.RandomState(0) 97 | thr_true = prng.rand(100000) 98 | u_true = 0.65 * prng.randn(100000) 99 | y_true = np.asarray(expit(u_true) >= thr_true, dtype=np.float32) 100 | y_prob = expit(u_true) 101 | 102 | bins = 20 103 | 104 | info_per_bin = calc_binary_clf_calibration_per_bin( 105 | y_true=y_true, 106 | y_prob=y_prob, 107 | bins=bins) 108 | bin_edges = info_per_bin['bin_edges'] 109 | for bb in range(bin_edges.size - 1): 110 | print "bin [%.2f, %.2f] count %5d fracTP %.3f" % ( 111 | bin_edges[bb], 112 | bin_edges[bb+1], 113 | info_per_bin['count_per_bin'][bb], 114 | info_per_bin['fracTP_per_bin'][bb], 115 | ) 116 | 117 | plot_binary_clf_calibration_curve_and_histograms( 118 | info_per_bin=info_per_bin) 119 | 120 | plt.show() -------------------------------------------------------------------------------- /pc_toolbox/model_slda/__init__.py: -------------------------------------------------------------------------------- 1 | from 
est_local_params__single_doc_map import ( 2 | calc_nef_map_pi_d_K, 3 | calc_nef_map_pi_d_K__autograd, 4 | calc_nef_map_pi_d_K__cython, 5 | DefaultDocTopicOptKwargs, 6 | ) 7 | 8 | from est_local_params__many_doc_map import ( 9 | calc_nef_map_pi_DK, 10 | ) 11 | 12 | from est_local_params__vb_qpiDir_qzCat import ( 13 | calc_elbo_for_many_docs, 14 | ) 15 | 16 | 17 | import slda_utils__dataset_manager 18 | import slda_utils__param_io_manager 19 | save_topic_model_param_dict = slda_utils__param_io_manager.save_topic_model_param_dict 20 | load_topic_model_param_dict = slda_utils__param_io_manager.load_topic_model_param_dict 21 | 22 | import slda_utils__param_manager 23 | import slda_utils__init_manager 24 | 25 | import slda_loss__autograd 26 | import slda_loss__cython 27 | 28 | try: 29 | import slda_loss__tensorflow 30 | HAS_TENSORFLOW = True 31 | except ImportError: 32 | HAS_TENSORFLOW = False 33 | slda_loss__tensorflow = None 34 | 35 | import slda_snapshot_perf_metrics 36 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__many_doc_map/__init__.py: -------------------------------------------------------------------------------- 1 | from calc_nef_map_pi_DK import ( 2 | calc_nef_map_pi_DK, 3 | make_readable_summary_for_pi_DK_estimation, 4 | ) -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__many_doc_map/calc_nef_map_pi_DK.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | from pc_toolbox.model_slda.est_local_params__single_doc_map import ( 5 | calc_nef_map_pi_d_K, 6 | DefaultDocTopicOptKwargs, 7 | ) 8 | 9 | from pc_toolbox.utils_io import ( 10 | pprint, 11 | make_percentile_str) 12 | 13 | from utils_summarize_pi_DK_estimation import ( 14 | make_readable_summary_for_pi_DK_estimation) 15 | 16 | def calc_nef_map_pi_DK( 17 | dataset=None, 18 | topics_KV=None, 19 | alpha=None, 20 | nef_alpha=None, 21 | init_pi_DK=None, 22 | n_seconds_between_print=-1, 23 | active_proba_thr=0.005, 24 | return_info=False, 25 | calc_pi_d_K=calc_nef_map_pi_d_K, 26 | **some_pi_estimation_kwargs): 27 | ''' Extract doc-topic probability features for every doc in dataset. 28 | 29 | Args 30 | ---- 31 | dataset : dict with array fields 32 | 'n_docs' : int, non-negative 33 | number of documents in dataset 34 | 'word_id_U' : 1D array, size U, dtype=int 35 | vocab ids for each doc-term pair in dataset 36 | 'word_ct_U' : 1D array, size U, dtype=float 37 | counts for each doc-term pair in dataset 38 | 'doc_indptr_Dp1' : 1D array, size D+1, type=int 39 | indptr / fenceposts delineating where individual docs begin/end 40 | topics_KV : 2D array, size K x V, rows sum to one 41 | probability of each word v appearing under each topic k 42 | alpha : float, positive value 43 | concentration parameter of Dirichlet prior on doc-topic probas 44 | 45 | Returns 46 | ------- 47 | pi_DK : 2D array, size D x K 48 | Each row has positive entries and sums to one. 
49 | info_dict : dict 50 | Only returned if called with return_info=True 51 | ''' 52 | # Parse pi estimation kwargs 53 | pi_estimation_kwargs = dict(**DefaultDocTopicOptKwargs) 54 | for key in pi_estimation_kwargs.keys(): 55 | if key in some_pi_estimation_kwargs: 56 | val = DefaultDocTopicOptKwargs[key] 57 | if isinstance(val, float): 58 | pi_estimation_kwargs[key] = float(some_pi_estimation_kwargs[key]) 59 | else: 60 | pi_estimation_kwargs[key] = int(some_pi_estimation_kwargs[key]) 61 | 62 | assert topics_KV is not None 63 | K = int(topics_KV.shape[0]) 64 | 65 | n_docs = dataset['n_docs'] 66 | doc_indptr_Dp1 = dataset['doc_indptr_Dp1'] 67 | word_id_U = dataset['word_id_U'] 68 | word_ct_U = dataset['word_ct_U'] 69 | 70 | pi_DK = np.zeros((n_docs, K)) 71 | n_docs_converged = 0 72 | n_docs_restarted = 0 73 | iters_per_doc = np.zeros(n_docs, dtype=np.int32) 74 | n_active_per_doc = np.zeros(n_docs, dtype=np.int32) 75 | restarts_per_doc = np.zeros(n_docs, dtype=np.int32) 76 | step_size_per_doc = np.zeros(n_docs, dtype=np.float32) 77 | dist_per_doc = np.zeros(n_docs, dtype=np.float32) 78 | loss_per_doc = np.zeros(n_docs, dtype=np.float32) 79 | 80 | is_time = False 81 | start_time_sec = time.time() 82 | last_print_sec = start_time_sec 83 | for d in xrange(n_docs): 84 | start_d = doc_indptr_Dp1[d] 85 | stop_d = doc_indptr_Dp1[d+1] 86 | 87 | if init_pi_DK is None: 88 | init_pi_d_K = None 89 | else: 90 | init_pi_d_K = init_pi_DK[d] 91 | 92 | # MCH: Cannot autograd when doing this kind of assignment 93 | pi_DK[d,:], info_dict = \ 94 | calc_pi_d_K( 95 | word_id_U[start_d:stop_d], 96 | word_ct_U[start_d:stop_d], 97 | topics_KV=topics_KV, 98 | alpha=alpha, 99 | nef_alpha=nef_alpha, 100 | init_pi_d_K=init_pi_d_K, 101 | **pi_estimation_kwargs) 102 | if return_info or n_seconds_between_print > 0: 103 | n_active_per_doc[d] = \ 104 | np.sum(pi_DK[d,:] > active_proba_thr) 105 | n_docs_restarted += info_dict['n_restarts'] > 0 106 | n_docs_converged += info_dict['did_converge'] 107 | iters_per_doc[d] = info_dict['n_iters'] 108 | step_size_per_doc[d] = info_dict['pi_step_size'] 109 | try: 110 | dist_per_doc[d] = info_dict['cur_L1_diff'] 111 | except KeyError: 112 | dist_per_doc = None 113 | try: 114 | restarts_per_doc[d] = info_dict['n_restarts'] 115 | except KeyError: 116 | restarts_per_doc = None 117 | try: 118 | loss_per_doc[d] = info_dict['loss'] 119 | except KeyError: 120 | pass 121 | 122 | cur_time_sec = time.time() 123 | if n_seconds_between_print > 0: 124 | is_time = cur_time_sec - last_print_sec > n_seconds_between_print 125 | is_last = (d + 1) == n_docs 126 | if is_last or is_time: 127 | msg = make_readable_summary_for_pi_DK_estimation( 128 | elapsed_time_sec=cur_time_sec - start_time_sec, 129 | n_docs=n_docs, 130 | n_docs_completed=d+1, 131 | n_docs_converged=n_docs_converged, 132 | n_docs_restarted=n_docs_restarted, 133 | iters_per_doc=iters_per_doc, 134 | n_active_per_doc=n_active_per_doc, 135 | dist_per_doc=dist_per_doc, 136 | restarts_per_doc=restarts_per_doc, 137 | step_size_per_doc=step_size_per_doc, 138 | loss_per_doc=loss_per_doc) 139 | 140 | last_print_sec = cur_time_sec 141 | if n_seconds_between_print > 0: 142 | pprint(msg) 143 | if return_info: 144 | agg_info_dict = dict( 145 | summary_msg=msg, 146 | iters_per_doc=iters_per_doc, 147 | n_active_per_doc=n_active_per_doc, 148 | dist_per_doc=dist_per_doc, 149 | restarts_per_doc=restarts_per_doc, 150 | step_size_per_doc=step_size_per_doc, 151 | loss_per_doc=loss_per_doc, 152 | loss=np.sum(loss_per_doc), 153 | alpha=alpha) 154 | return pi_DK, 
agg_info_dict 155 | else: 156 | return pi_DK 157 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__many_doc_map/utils_summarize_pi_DK_estimation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pc_toolbox.utils_io import make_percentile_str 4 | 5 | def make_readable_summary_for_pi_DK_estimation( 6 | n_docs=None, 7 | elapsed_time_sec=None, 8 | n_docs_completed=None, 9 | n_docs_converged=None, 10 | n_docs_restarted=None, 11 | iters_per_doc=None, 12 | dist_per_doc=None, 13 | loss_per_doc=None, 14 | step_size_per_doc=None, 15 | converged_per_doc=None, 16 | n_active_per_doc=None, 17 | restarts_per_doc=None, 18 | pi_converge_thr=None, 19 | **unused_kws): 20 | if n_docs_completed is None: 21 | n_docs_completed = n_docs 22 | msg = "completed %d/%d docs" % (n_docs_completed, n_docs) 23 | if elapsed_time_sec is not None: 24 | msg += " after %7.2f sec" % (elapsed_time_sec) 25 | if converged_per_doc is not None: 26 | n_docs_converged = np.sum(converged_per_doc[:n_docs_completed]) 27 | if n_docs_converged is not None: 28 | msg += " %6d not converged" % ( 29 | n_docs_completed - n_docs_converged) 30 | if pi_converge_thr is not None: 31 | msg += " %6.2g conv_thr" % pi_converge_thr 32 | if n_docs_restarted is not None: 33 | msg += " %6d restarted" % (n_docs_restarted) 34 | if iters_per_doc is not None: 35 | msg += "\n iters / doc: %s" % ( 36 | make_percentile_str( 37 | iters_per_doc[:n_docs_completed], 38 | fmt_str='%7d')) 39 | if dist_per_doc is not None: 40 | msg += "\n l1 dist / doc: %s" % ( 41 | make_percentile_str( 42 | dist_per_doc[:n_docs_completed], 43 | fmt_str='%7.2g')) 44 | if step_size_per_doc is not None: 45 | msg += "\n pi_step_size / doc: %s" % ( 46 | make_percentile_str( 47 | step_size_per_doc[:n_docs_completed], 48 | fmt_str='%7.2g')) 49 | if loss_per_doc is not None: 50 | msg += "\n loss / doc: %s" % ( 51 | make_percentile_str( 52 | loss_per_doc[:n_docs_completed], 53 | fmt_str='% 7.4g')) 54 | if restarts_per_doc is not None: 55 | msg += "\n restarts / doc: %s" % ( 56 | make_percentile_str( 57 | restarts_per_doc[:n_docs_completed], 58 | fmt_str='%7d')) 59 | if n_active_per_doc is not None: 60 | msg += "\n active topics / doc: %s" % ( 61 | make_percentile_str( 62 | n_active_per_doc[:n_docs_completed], 63 | fmt_str='%7d')) 64 | return msg 65 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__single_doc_map/README.md: -------------------------------------------------------------------------------- 1 | Estimation for single-document's topic probability vector via MAP estimation. 2 | 3 | The provided estimates will be probability vectors (sum to one), 4 | and (if converged properly) will be MAP estimates under the 5 | unconstrained natural exponential family (NEF) parameterization. 
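
Below is a minimal usage sketch. The two toy topics and the tiny document are invented purely for illustration, and it calls the autograd backend directly (calc_nef_map_pi_d_K__autograd.py in this folder) because that file lists every keyword argument and its default; run it from inside this folder under Python 2, matching the package's own import style.

    import numpy as np
    from calc_nef_map_pi_d_K__autograd import calc_nef_map_pi_d_K__autograd

    # Two topics over a 3-word vocabulary; each row sums to one.
    topics_KV = np.asarray([[0.7, 0.2, 0.1],
                            [0.1, 0.2, 0.7]])
    # One document, given by its distinct word ids and their counts.
    word_id_d_Ud = np.asarray([0, 2], dtype=np.int32)
    word_ct_d_Ud = np.asarray([5.0, 1.0])

    pi_d_K, info_dict = calc_nef_map_pi_d_K__autograd(
        word_id_d_Ud=word_id_d_Ud,
        word_ct_d_Ud=word_ct_d_Ud,
        topics_KV=topics_KV,
        convex_alpha_minus_1=0.0)  # alpha = 1.0, i.e. a flat Dirichlet prior

    # The estimate is a valid probability vector, weighted toward topic 0.
    assert np.allclose(np.sum(pi_d_K), 1.0)

The returned info_dict reports, among other things, n_iters, did_converge, n_restarts, and the final pi_step_size.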
6 | 7 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__single_doc_map/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from calc_nef_map_pi_d_K__tensorflow import ( 3 | calc_nef_map_pi_d_K__tensorflow, 4 | _calc_nef_map_pi_d_K__tensorflow_graph, 5 | ) 6 | HAS_TENSORFLOW = True 7 | except ImportError: 8 | HAS_TENSORFLOW = False 9 | _calc_nef_map_pi_d_K__tensorflow_graph = None 10 | calc_nef_map_pi_d_K__tensorflow = None 11 | 12 | from calc_nef_map_pi_d_K import ( 13 | calc_nef_map_pi_d_K, 14 | make_convex_alpha_minus_1, 15 | DefaultDocTopicOptKwargs, 16 | calc_nef_map_pi_d_K__autograd, 17 | calc_nef_map_pi_d_K__cython, 18 | ) 19 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__single_doc_map/calc_nef_map_pi_d_K__autograd.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | 3 | from calc_nef_map_pi_d_K__defaults import DefaultDocTopicOptKwargs 4 | 5 | def calc_nef_map_pi_d_K__autograd( 6 | word_id_d_Ud=None, 7 | word_ct_d_Ud=None, 8 | topics_KUd=None, 9 | topics_KV=None, 10 | convex_alpha_minus_1=None, 11 | init_pi_d_K=None, 12 | ct_topics_KUd=None, 13 | pi_max_iters=DefaultDocTopicOptKwargs['pi_max_iters'], 14 | pi_converge_thr=DefaultDocTopicOptKwargs['pi_converge_thr'], 15 | pi_step_size=DefaultDocTopicOptKwargs['pi_step_size'], 16 | pi_min_step_size=DefaultDocTopicOptKwargs['pi_min_step_size'], 17 | pi_step_decay_rate=DefaultDocTopicOptKwargs['pi_step_decay_rate'], 18 | pi_min_mass_preserved_to_trust_step=( 19 | DefaultDocTopicOptKwargs['pi_min_mass_preserved_to_trust_step']), 20 | **kwargs): 21 | ''' Find MAP estimate of the K-dim. proba vector for specific document. 22 | 23 | Uses Natural-parameter Exponential Family (NEF) formulation, 24 | so the optimization problem is always convex. 25 | 26 | Finds solution via iterative exponentiated gradient steps. 27 | 28 | Returns 29 | ------- 30 | pi_d_K : 1D array, size K 31 | Contains non-negative entries that sum to one. 
32 | info_dict : dict 33 | ''' 34 | pi_step_size = float(pi_step_size) 35 | pi_converge_thr = float(pi_converge_thr) 36 | 37 | if topics_KUd is None: 38 | topics_KUd = topics_KV[:, word_id_d_Ud] 39 | K = topics_KUd.shape[0] 40 | 41 | # Precompute some useful things 42 | if ct_topics_KUd is None: 43 | ct_topics_KUd = topics_KUd * word_ct_d_Ud[np.newaxis, :] 44 | 45 | # Parse convex_alpha_minus_1 46 | convex_alpha_minus_1 = float(convex_alpha_minus_1) 47 | assert convex_alpha_minus_1 < 1.0 48 | assert convex_alpha_minus_1 >= 0.0 49 | 50 | # Initialize as uniform vector over K simplex 51 | if init_pi_d_K is None: 52 | init_pi_d_K = np.ones(K) / float(K) 53 | else: 54 | init_pi_d_K = np.asarray(init_pi_d_K) 55 | assert init_pi_d_K.ndim == 1 56 | assert init_pi_d_K.size == K 57 | 58 | pi_d_K = 1.0 * init_pi_d_K 59 | best_pi_d_K = 1.0 * init_pi_d_K 60 | # Start loop over iterations 61 | did_converge = 0 62 | n_restarts = 0 63 | giter = 0 64 | cur_L1_diff = 1.0 65 | while giter < pi_max_iters: 66 | giter = giter + 1 67 | denom_Ud = 1.0 / np.dot(pi_d_K, topics_KUd) 68 | grad_K = pi_step_size * ( 69 | np.dot(ct_topics_KUd, denom_Ud) 70 | + convex_alpha_minus_1 / (1e-9 + pi_d_K) 71 | ) 72 | grad_K = grad_K - np.max(grad_K) 73 | new_pi_d_K = pi_d_K * np.exp(grad_K) 74 | new_pi_d_K_sum = np.sum(new_pi_d_K) 75 | if new_pi_d_K_sum <= pi_min_mass_preserved_to_trust_step: 76 | if pi_step_size > pi_min_step_size: 77 | # Undo the latest update to pi_d_K 78 | # and continue from previous pi_d_K with smaller step size 79 | giter = giter - 1 80 | n_restarts = n_restarts + 1 81 | pi_step_size = pi_step_size * pi_step_decay_rate 82 | pi_d_K = 1.0 * best_pi_d_K 83 | continue 84 | else: 85 | pi_d_K = 1.0 * best_pi_d_K 86 | break 87 | pi_d_K = new_pi_d_K / new_pi_d_K_sum 88 | # Check for convergence every few iters 89 | if giter % 5 == 0: 90 | cur_L1_diff = np.sum(np.abs(best_pi_d_K - pi_d_K)) 91 | if cur_L1_diff < pi_converge_thr: 92 | did_converge = 1 93 | break 94 | best_pi_d_K = 1.0 * pi_d_K 95 | 96 | return pi_d_K, dict( 97 | n_iters=giter, 98 | pi_max_iters=pi_max_iters, 99 | did_converge=did_converge, 100 | cur_L1_diff=cur_L1_diff, 101 | pi_converge_thr=pi_converge_thr, 102 | n_restarts=n_restarts, 103 | pi_step_size=pi_step_size, 104 | pi_min_step_size=pi_min_step_size, 105 | convex_alpha_minus_1=convex_alpha_minus_1) -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__single_doc_map/calc_nef_map_pi_d_K__cython.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False, wraparound=False, nonecheck=False, cdivision=True 2 | 3 | import numpy as np 4 | from libc.math cimport log, exp, abs 5 | 6 | def calc_nef_map_pi_d_K__cython( 7 | double[:] init_pi_d_K, 8 | double[:,:] topics_KUd, 9 | double[:,:] ct_topics_KUd, 10 | double convex_alpha_minus_1=0.0, 11 | int pi_max_iters=0, 12 | double pi_converge_thr=0.0, 13 | double pi_step_size=0.0, 14 | double pi_step_decay_rate=0.0, 15 | double pi_min_mass_preserved_to_trust_step=1.0, 16 | double pi_min_step_size=0.0, 17 | **kwargs): 18 | """ Find MAP estimate of the K-dim. proba vector for specific document. 19 | 20 | Uses Natural-parameter Exponential Family (NEF) formulation, 21 | so the optimization problem is always convex. 22 | 23 | Finds solution via iterative exponentiated gradient steps. 24 | 25 | Returns 26 | ------- 27 | pi_d_K : 1D array, size K 28 | Contains non-negative entries that sum to one. 
29 | info_dict : dict 30 | Contains info about the optimization 31 | """ 32 | 33 | cdef int K = topics_KUd.shape[0] 34 | cdef int Ud = topics_KUd.shape[1] 35 | 36 | cdef double[:] denom_Ud = np.zeros(Ud) 37 | cdef double[:] grad_K = np.zeros(K) 38 | cdef double[:] pi_d_K = np.asarray(init_pi_d_K).copy() 39 | 40 | cdef double cur_L1_diff = 1.0 41 | cdef double new_pi_sum = 0.0 42 | cdef double new_pi_k = 0.0 43 | cdef int giter = 0 44 | cdef int did_converge = 0 45 | cdef int n_restarts = 0 46 | cdef int k = 0 47 | cdef int u = 0 48 | cdef double max_val = -1e9 49 | while giter < pi_max_iters: 50 | for k in range(K): 51 | grad_K[k] = 0.0 52 | for u in xrange(Ud): 53 | denom_Ud[u] = 0.0 54 | for k in xrange(K): 55 | denom_Ud[u] += pi_d_K[k] * topics_KUd[k,u] 56 | denom_Ud[u] = 1.0 / denom_Ud[u] 57 | for k in range(K): 58 | grad_K[k] += denom_Ud[u] * ct_topics_KUd[k, u] 59 | #np.dot(pi_d_K, topics_KUd, out=denom_Ud) 60 | #np.divide(1.0, denom_Ud, out=denom_Ud) 61 | #np.dot(ct_topics_KUd, denom_Ud, out=grad_K) 62 | 63 | max_val = -1e9 64 | for k in range(K): 65 | grad_K[k] += convex_alpha_minus_1 / (1e-9 + pi_d_K[k]) 66 | grad_K[k] *= pi_step_size 67 | if (grad_K[k] > max_val): 68 | max_val = grad_K[k] 69 | 70 | # Let grad_K now contain the new pi vector 71 | new_pi_sum = 0.0 72 | for k in range(K): 73 | grad_K[k] = pi_d_K[k] * exp(grad_K[k] - max_val) 74 | new_pi_sum += grad_K[k] 75 | 76 | if new_pi_sum <= pi_min_mass_preserved_to_trust_step: 77 | if pi_step_size > pi_min_step_size: 78 | # Retry from previous pi_d_K with smaller step size 79 | n_restarts += 1 80 | pi_step_size *= pi_step_decay_rate 81 | continue 82 | else: 83 | # We've reached minimum step size. Abort. 84 | break 85 | 86 | giter += 1 87 | if giter % 5 == 0: 88 | cur_L1_diff = 0.0 89 | for k in range(K): 90 | new_pi_k = grad_K[k] / new_pi_sum 91 | cur_L1_diff += abs(pi_d_K[k] - new_pi_k) 92 | pi_d_K[k] = new_pi_k 93 | if cur_L1_diff < pi_converge_thr: 94 | did_converge = 1 95 | break 96 | else: 97 | for k in range(K): 98 | pi_d_K[k] = grad_K[k] / new_pi_sum 99 | 100 | return np.asarray(pi_d_K), dict( 101 | n_iters=giter, 102 | did_converge=did_converge, 103 | n_restarts=n_restarts, 104 | cur_L1_diff=cur_L1_diff, 105 | pi_max_iters=pi_max_iters, 106 | pi_converge_thr=pi_converge_thr, 107 | pi_step_size=pi_step_size, 108 | pi_min_step_size=pi_min_step_size, 109 | convex_alpha_minus_1=convex_alpha_minus_1) 110 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__single_doc_map/calc_nef_map_pi_d_K__defaults.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ## Create defaults on load 4 | ## Overriding some options if set in os.environ namespace 5 | def make_default_kwargs(): 6 | lstep_kwargs = dict( 7 | pi_max_iters=100, 8 | pi_min_iters=10, 9 | pi_converge_thr=0.0001, 10 | pi_step_size=0.005, 11 | pi_max_step_size=0.1, 12 | pi_min_step_size=1.0e-9, 13 | pi_step_decay_rate=0.75, 14 | pi_min_mass_preserved_to_trust_step=0.25) 15 | for key, val in lstep_kwargs.items(): 16 | if key in os.environ: 17 | if isinstance(val, float): 18 | lstep_kwargs[key] = float(os.environ[key]) 19 | else: 20 | lstep_kwargs[key] = int(os.environ[key]) 21 | print ">>> OVERRIDE DEFAULT LSTEP KW: %s = %s" % ( 22 | key, os.environ[key]) 23 | return lstep_kwargs 24 | DefaultDocTopicOptKwargs = make_default_kwargs() 25 | -------------------------------------------------------------------------------- 
/pc_toolbox/model_slda/est_local_params__single_doc_map/calc_nef_map_pi_d_K__numpy_linesearch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from calc_nef_map_pi_d_K__defaults import DefaultDocTopicOptKwargs 4 | 5 | def calc_nef_map_pi_d_K__numpy_linesearch( 6 | word_id_d_Ud=None, 7 | word_ct_d_Ud=None, 8 | topics_KUd=None, 9 | topics_KV=None, 10 | nef_alpha=None, 11 | init_pi_d_K=None, 12 | pi_max_iters=DefaultDocTopicOptKwargs['pi_max_iters'], 13 | pi_converge_thr=DefaultDocTopicOptKwargs['pi_converge_thr'], 14 | pi_step_size=DefaultDocTopicOptKwargs['pi_step_size'], 15 | pi_max_step_size=DefaultDocTopicOptKwargs['pi_max_step_size'], 16 | pi_min_step_size=DefaultDocTopicOptKwargs['pi_min_step_size'], 17 | pi_step_decay_rate=DefaultDocTopicOptKwargs['pi_step_decay_rate'], 18 | pi_min_mass_preserved_to_trust_step=\ 19 | DefaultDocTopicOptKwargs['pi_min_mass_preserved_to_trust_step'], 20 | verbose=False, 21 | verbose_pi=False, 22 | track_stuff=False, 23 | **kwargs): 24 | ''' Find MAP estimate of the K-dim. proba vector for specific document. 25 | 26 | Uses Natural-parameter Exponential Family (NEF) formulation. 27 | 28 | Returns 29 | ------- 30 | pi_d_K : 1D array, size K 31 | Contains non-negative entries that sum to one. 32 | info_dict : dict 33 | ''' 34 | raise ValueError("TODO: NEEDS CHECKING/FIXING") 35 | 36 | if topics_KUd is None: 37 | topics_KUd = topics_KV[:, word_id_d_Ud] 38 | 39 | # Precompute some useful things 40 | ct_topics_KUd = topics_KUd * word_ct_d_Ud[np.newaxis, :] 41 | K = topics_KUd.shape[0] 42 | 43 | # Parse nef_alpha 44 | nef_alpha = float(nef_alpha) 45 | assert nef_alpha >= 1.0 46 | convex_alpha_minus_1 = float(nef_alpha) - 1.0 47 | assert convex_alpha_minus_1 < 1.0 48 | assert convex_alpha_minus_1 >= 0.0 49 | 50 | # Initialize as uniform vector over K simplex 51 | if init_pi_d_K is None: 52 | init_pi_d_K = np.ones(K) / float(K) 53 | else: 54 | init_pi_d_K = np.asarray(init_pi_d_K) 55 | assert init_pi_d_K.ndim == 1 56 | assert init_pi_d_K.size == K 57 | 58 | best_pi_d_K = 1.0 * init_pi_d_K 59 | best_denom_Ud = np.dot(best_pi_d_K, topics_KUd) 60 | best_loss = -1.0 * np.inner(word_ct_d_Ud, np.log(best_denom_Ud)) 61 | 62 | if track_stuff: 63 | pi_list = list() 64 | loss_list = list() 65 | step_list = list() 66 | 67 | # Start loop over iterations 68 | did_converge = 0 69 | n_restarts = 0 70 | n_improve_in_a_row = 0 71 | giter = 0 72 | cur_step_size = pi_step_size * pi_step_decay_rate 73 | while giter < pi_max_iters: 74 | giter = giter + 1 75 | #denom_Ud = 1.0 / np.dot(pi_d_K, topics_KUd) 76 | grad_K = ( 77 | np.dot(ct_topics_KUd, 1.0 / best_denom_Ud) 78 | # Purposefully not using alpha here. 
79 | ) 80 | grad_K = grad_K - np.max(grad_K) 81 | 82 | if n_improve_in_a_row > 2: 83 | # Increase step size (since we seem to be improving regularly) 84 | cur_step_size = cur_step_size / pi_step_decay_rate 85 | 86 | # But scale it down slightly (between 0.9 and 1.0) 87 | # so that we avoid too much oscillation around optimum 88 | cur_step_size *= 1.0 - 0.1 * (float(giter)/float(pi_max_iters)) 89 | 90 | cur_step_size = np.minimum( 91 | max_pi_step_size, 92 | cur_step_size) 93 | did_improve = False 94 | while cur_step_size >= pi_min_step_size: 95 | new_pi_d_K = best_pi_d_K * np.exp(cur_step_size * grad_K) 96 | new_pi_d_K_sum = np.sum(new_pi_d_K) 97 | new_pi_d_K = new_pi_d_K / new_pi_d_K_sum 98 | 99 | new_denom_Ud = np.dot(new_pi_d_K, topics_KUd) 100 | new_loss = -1.0 * np.inner(word_ct_d_Ud, np.log(new_denom_Ud)) 101 | if new_loss > best_loss: 102 | # Try smaller stepsize 103 | cur_step_size = cur_step_size * pi_step_decay_rate 104 | n_restarts += 1 105 | n_improve_in_a_row = 0 106 | else: 107 | n_improve_in_a_row += 1 108 | did_improve = True 109 | break 110 | 111 | # Check for convergence 112 | delta_mass = np.sum(np.abs(best_pi_d_K - new_pi_d_K)) 113 | if delta_mass < pi_converge_thr: 114 | did_converge = 1 115 | if did_improve: 116 | if verbose: 117 | delta_loss = (best_loss - new_loss) / np.abs(best_loss) 118 | msg_str = \ 119 | "iter %4d step_size %.5f loss %.8e" \ 120 | + " delta_pi %.5f delta_loss %.9e <<< keep" 121 | msg_str = msg_str % ( 122 | giter, cur_step_size, new_loss, delta_mass, delta_loss) 123 | print msg_str 124 | if verbose_pi: 125 | print ' '.join(["%.5e" % a for a in new_pi_d_K]) 126 | best_pi_d_K = 1.0 * new_pi_d_K 127 | best_denom_Ud = new_denom_Ud 128 | best_loss = new_loss 129 | if track_stuff: 130 | pi_list.append(best_pi_d_K) 131 | step_list.append(cur_step_size) 132 | loss_list.append(best_loss) 133 | if did_converge or not did_improve: 134 | break 135 | 136 | info_dict = dict( 137 | did_converge=did_converge, 138 | n_iters=giter, 139 | n_iters_try=giter + n_restarts, 140 | pi_max_iters=pi_max_iters, 141 | n_restarts=n_restarts, 142 | pi_converge_thr=pi_converge_thr, 143 | pi_step_size=cur_step_size, 144 | pi_min_step_size=pi_min_step_size) 145 | if track_stuff: 146 | info_dict['pi_list'] = pi_list 147 | info_dict['step_list'] = step_list 148 | info_dict['loss_list'] = loss_list 149 | 150 | return best_pi_d_K, info_dict -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/__init__.py: -------------------------------------------------------------------------------- 1 | from calc_elbo_for_many_docs__vb_qpiDir_qzCat import ( 2 | calc_elbo_for_many_docs) 3 | 4 | from calc_N_d_K__vb_qpiDir_qzCat import ( 5 | calc_N_d_K__vb_coord_ascent__many_tries, 6 | calc_N_d_K__vb_coord_ascent) -------------------------------------------------------------------------------- /pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/calc_elbo_for_many_docs__vb_qpiDir_qzCat.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | from scipy.special import gammaln, digamma 5 | from scipy.misc import logsumexp 6 | 7 | from calc_N_d_K__vb_qpiDir_qzCat import ( 8 | calc_N_d_K__vb_coord_ascent__many_tries) 9 | 10 | def calc_elbo_for_many_docs( 11 | dataset=None, 12 | alpha=None, 13 | alpha_K=None, 14 | topics_KV=None, 15 | verbose=False, 16 | print_progress_every=-1, 17 | init_name_list=['prior_mean'], 18 | 
init_pi_DK=None, 19 | prng=None, 20 | seed=0, 21 | return_info=False, 22 | active_ct_thr=0.01, 23 | do_trace_elbo=False, 24 | **lstep_kwargs): 25 | 26 | assert dataset is not None 27 | assert topics_KV is not None 28 | 29 | K = topics_KV.shape[0] 30 | dtype = topics_KV.dtype 31 | word_ct_U = np.asarray(dataset['word_ct_U'], dtype=dtype) 32 | if alpha_K is None: 33 | alpha_K = float(alpha) * np.ones(K, dtype=dtype) 34 | else: 35 | alpha_K = np.asarray(alpha_K, dtype=dtype) 36 | 37 | if return_info: 38 | theta_DK = np.zeros((dataset['n_docs'], K)) 39 | 40 | if init_pi_DK is not None: 41 | assert init_pi_DK.shape[0] == dataset['n_docs'] 42 | assert init_pi_DK.shape[1] == K 43 | assert 'warm' in init_name_list 44 | else: 45 | init_P_d_K = None 46 | 47 | if prng is None: 48 | prng = np.random.RandomState(seed) 49 | 50 | ttl_lb_logpdf_x = 0.0 51 | ttl_n_tokens = 0 52 | ttl_n_docs = 0 53 | 54 | D = dataset['n_docs'] 55 | if print_progress_every > 0: 56 | converged_per_doc = np.zeros(D, dtype=np.int32) 57 | dist_per_doc = np.zeros(D, dtype=np.float64) 58 | iter_per_doc = np.zeros(D, dtype=np.int32) 59 | n_active_per_doc = np.zeros(D, dtype=np.float64) 60 | start_time_sec = time.time() 61 | for d in range(D): 62 | start = dataset['doc_indptr_Dp1'][d] 63 | stop = dataset['doc_indptr_Dp1'][d+1] 64 | Ud = stop - start 65 | word_ct_d_Ud = word_ct_U[start:stop] 66 | word_id_d_Ud = dataset['word_id_U'][start:stop] 67 | 68 | if init_pi_DK is not None: 69 | init_pi_d_K = init_pi_DK[d] 70 | 71 | N_d_K, info_dict = \ 72 | calc_N_d_K__vb_coord_ascent__many_tries( 73 | word_id_d_Ud=word_id_d_Ud, 74 | word_ct_d_Ud=word_ct_d_Ud, 75 | topics_KV=topics_KV, 76 | alpha_K=alpha_K, 77 | init_name_list=init_name_list, 78 | init_pi_d_K=init_pi_d_K, 79 | prng=prng, 80 | verbose=verbose, 81 | do_trace_elbo=do_trace_elbo, 82 | **lstep_kwargs) 83 | 84 | if return_info: 85 | theta_DK[d] = N_d_K + alpha_K 86 | 87 | # Norm constant per document 88 | h_x_d = gammaln(1.0 + np.sum(word_ct_d_Ud)) \ 89 | - np.sum(gammaln(1.0 + word_ct_d_Ud)) 90 | 91 | # Aggregate 92 | ttl_lb_logpdf_x += info_dict['ELBO'] + h_x_d 93 | ttl_n_tokens += np.sum(word_ct_d_Ud) 94 | ttl_n_docs += 1 95 | 96 | if print_progress_every > 0: 97 | dist_per_doc[d] = info_dict['converge_dist'] 98 | converged_per_doc[d] = info_dict['did_converge'] 99 | iter_per_doc[d] = info_dict['n_iters'] 100 | n_active_per_doc[d] = np.sum(N_d_K >= active_ct_thr) 101 | # Do the printing of the progress 102 | if print_progress_every > 0 and ( 103 | (d + 1) % print_progress_every == 0 104 | or (d + 1) == D 105 | ): 106 | msg = make_readable_summary_for_pi_DK_inference( 107 | n_docs_completed=ttl_n_docs, 108 | n_docs=D, 109 | dist_per_doc=dist_per_doc, 110 | iters_per_doc=iter_per_doc, 111 | converged_per_doc=converged_per_doc, 112 | n_active_per_doc=n_active_per_doc, 113 | elapsed_time_sec=time.time() - start_time_sec) 114 | msg += "\n neg_log_p(x) %.6e" % ( 115 | ttl_neg_log_p_x / ttl_n_tokens) 116 | pprint(msg) 117 | 118 | ttl_lb_logpdf_x_per_tok = ttl_lb_logpdf_x / ttl_n_tokens 119 | if return_info: 120 | info_dict = dict( 121 | theta_DK=theta_DK, 122 | dist_per_doc=dist_per_doc, 123 | iters_per_doc=iter_per_doc, 124 | converged_per_doc=converged_per_doc, 125 | n_active_per_doc=n_active_per_doc, 126 | ) 127 | return ttl_lb_logpdf_x, ttl_lb_logpdf_x_per_tok, info_dict 128 | else: 129 | return ttl_lb_logpdf_x, ttl_lb_logpdf_x_per_tok 130 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/slda_estimator__w_given_pi.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.linear_model import Ridge as RidgeRegression 5 | 6 | from pc_toolbox.utils_io import ( 7 | pprint, 8 | ) 9 | 10 | def estimate_w_CK__given_pi_DK( 11 | dataset=None, 12 | pi_DK=None, 13 | lambda_w=0.001, 14 | seed=42, 15 | prefix='', 16 | verbose=False, 17 | **kwargs): 18 | """ Estimate regression weights from provided probability features. 19 | 20 | Uses sklearn's regularized regressors under the hood. 21 | 22 | Returns 23 | ------- 24 | w_CK : 2D array, size C x K 25 | Regression weights 26 | """ 27 | 28 | K = pi_DK.shape[1] 29 | C = int(dataset['n_labels']) 30 | if verbose: 31 | pprint('%s Fitting %d regressions...' % ( 32 | prefix, C)) 33 | 34 | w_CK = np.zeros((C, K)) 35 | 36 | u_y_vals = np.unique(dataset['y_DC'].flatten()) 37 | if u_y_vals.size <= 2 and np.union1d([0.0, 1.0], u_y_vals).size == 2: 38 | output_data_type = 'binary' 39 | else: 40 | output_data_type = 'real' 41 | 42 | if 'y_rowmask' in dataset: 43 | y_DC = dataset['y_DC'][1 == dataset['y_rowmask']] 44 | pi_DK = pi_DK[1 == dataset['y_rowmask']] 45 | u_y_vals = np.unique(y_DC.sum(axis=1)) 46 | assert u_y_vals.size > 1 47 | else: 48 | y_DC = dataset['y_DC'] 49 | 50 | for c in xrange(C): 51 | # Do a quick regression to get initial weights! 52 | if output_data_type.count('binary') > 0: 53 | clf = LogisticRegression( 54 | fit_intercept=False, 55 | C=0.5/lambda_w, 56 | random_state=seed, 57 | ) 58 | else: 59 | clf = RidgeRegression( 60 | fit_intercept=False, 61 | alpha=lambda_w, 62 | random_state=seed, 63 | ) 64 | 65 | clf.fit(pi_DK, y_DC[:, c]) 66 | w_CK[c] = clf.coef_ 67 | if verbose: 68 | pprint(' w_CK[%d, :5]=' % c + ' '.join(['% .2f' % w for w in w_CK[c, :5]])) 69 | pprint(' label id %d / %d done with lambda_w = %.5f' % ( 70 | c+1, C, lambda_w)) 71 | return w_CK -------------------------------------------------------------------------------- /pc_toolbox/model_slda/slda_utils__diffable_param_manager__tensorflow.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from pc_toolbox.utils_diffable_transforms import tfm__2D_rows_sum_to_one 4 | 5 | def _unflatten_to_common_param_dict__tf_graph( 6 | param_vec, 7 | n_states=0, 8 | n_vocabs=0, 9 | n_labels=0, 10 | min_eps=tfm__2D_rows_sum_to_one.MIN_EPS, 11 | **unused_kwargs): 12 | K = int(n_states) 13 | V = int(n_vocabs) 14 | C = int(n_labels) 15 | F_topics = K * (V-1) 16 | log_topics_KVm1 = tf.reshape(param_vec[:F_topics], (K, V-1)) 17 | log_topics_KV = tf.concat([ 18 | log_topics_KVm1, 19 | tf.zeros([K, 1], dtype=tf.float64)], 20 | axis=1) 21 | topics_KV = min_eps + tf.exp( 22 | log_topics_KV 23 | - tf.reduce_logsumexp( 24 | log_topics_KV, 25 | reduction_indices=[1], 26 | keepdims=True) 27 | + tf.log1p(tf.cast(-V * min_eps, dtype=tf.float64))) 28 | w_CK = tf.reshape(param_vec[F_topics:], (C, K)) 29 | return dict(topics_KV=topics_KV, w_CK=w_CK) 30 | 31 | 32 | def unflatten_to_common_param_dict__tf( 33 | param_vec=None, 34 | n_states=1, 35 | n_labels=1, 36 | n_vocabs=1, 37 | **dim_kwargs): 38 | K = int(n_states) 39 | V = int(n_vocabs) 40 | C = int(n_labels) 41 | S = K * (V-1) + K * C 42 | _param_vec = tf.placeholder(shape=[S], dtype=tf.float64) 43 | _param_dict = _unflatten_to_common_param_dict__tf_graph( 44 | _param_vec, 45 | n_states=n_states, 46 | n_labels=n_labels, 47 | n_vocabs=n_vocabs, 48 | ) 49 | sess = tf.Session() 50 | 
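# Evaluate the graph once: feed the flat parameter vector and fetch plain numpy arrays for topics_KV and w_CK.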
param_dict = sess.run([_param_dict], feed_dict={_param_vec:param_vec})[0] 51 | return param_dict 52 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/slda_utils__param_io_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | import scipy.sparse 5 | from distutils.dir_util import mkpath 6 | from sklearn.externals import joblib 7 | 8 | from pc_toolbox.utils_io import update_symbolic_link 9 | 10 | def save_topic_model_param_dict( 11 | param_dict, 12 | output_path=None, 13 | param_output_fmt='dump', 14 | disable_output=False, 15 | **alg_state_kwargs): 16 | """ Save snapshot of topic model parameters to disk 17 | 18 | Returns 19 | ------- 20 | snapshot_path : path to where results were saved. 21 | """ 22 | snapshot_path = None 23 | if output_path is not None and (not disable_output): 24 | cur_lap = alg_state_kwargs['cur_lap'] 25 | if param_output_fmt.count('dump'): 26 | best_filepath = os.path.join( 27 | output_path, 'best_param_dict.dump') 28 | cur_filepath = os.path.join( 29 | output_path, 'lap%011.3f_param_dict.dump' % (cur_lap)) 30 | joblib.dump(param_dict, cur_filepath, compress=1) 31 | update_symbolic_link(cur_filepath, best_filepath) 32 | 33 | if param_output_fmt.count('topic_model_snapshot'): 34 | prefix = 'lap%011.3f' % cur_lap 35 | snapshot_path = save_topic_model_params_as_txt_files( 36 | output_path, 37 | prefix, 38 | **param_dict) 39 | best_path = snapshot_path.replace(prefix, 'best') 40 | if best_path.count('best') > 0: 41 | update_symbolic_link(snapshot_path, best_path) 42 | else: 43 | raise ValueError("Bad path: " + snapshot_path) 44 | return snapshot_path 45 | 46 | def load_topic_model_param_dict( 47 | snapshot_path=None, 48 | task_path=None, 49 | prefix='best', 50 | lap=None, 51 | w_txt_basename='w_CK.txt', 52 | add_bias_term_to_w_CK=0.0, 53 | **kwargs): 54 | ''' Load topic model parameters from disk. 
55 | 56 | Supports either dump file or folder of txt files 57 | 58 | Returns 59 | ------- 60 | param_dict : dict with fields 61 | * topics_KV : 2D array, K x V 62 | * w_CK : 2D array, C x K 63 | ''' 64 | if snapshot_path is None: 65 | if lap is not None: 66 | prefix = 'lap%011.3f' % float(lap) 67 | assert prefix is not None 68 | 69 | for pprefix in [prefix, prefix + "_param_dict.dump"]: 70 | try: 71 | dump_path = os.path.join(task_path, pprefix) 72 | param_dict = joblib.load(dump_path) 73 | return param_dict 74 | except IOError as e: 75 | pass 76 | snapshot_path = os.path.join( 77 | task_path, 78 | prefix + "_topic_model_snapshot") 79 | try: 80 | param_dict = joblib.load(snapshot_path) 81 | return param_dict 82 | except IOError: 83 | pass 84 | 85 | try: 86 | tau = float(np.loadtxt(os.path.join(snapshot_path, 'tau.txt'))) 87 | except IOError: 88 | if 'tau' in kwargs: 89 | tau = float(kwargs['tau']) 90 | else: 91 | tau = None 92 | try: 93 | alpha = float(np.loadtxt(os.path.join(snapshot_path, 'alpha.txt'))) 94 | except IOError: 95 | if 'alpha' in kwargs: 96 | alpha = float(kwargs['alpha']) 97 | else: 98 | alpha = None 99 | try: 100 | lambda_w = float(np.loadtxt(os.path.join(snapshot_path, 'lambda_w.txt'))) 101 | except IOError: 102 | if 'lambda_w' in kwargs: 103 | lambda_w = float(kwargs['lambda_w']) 104 | else: 105 | lambda_w = None 106 | 107 | try: 108 | topics_KV = np.loadtxt( 109 | os.path.join(snapshot_path, 'topics_KV.txt')) 110 | except IOError: 111 | csr_prefix = 'topic_word_count_csr' 112 | Q = dict() 113 | for suffix in ['data', 'indices', 'indptr', 'shape']: 114 | csr_fpath = '%s_%s.txt' % (csr_prefix, suffix) 115 | Q[suffix] = np.loadtxt(os.path.join(snapshot_path, csr_fpath)) 116 | topic_count_KV = scipy.sparse.csr_matrix( 117 | (Q['data'], Q['indices'], Q['indptr']), 118 | shape=Q['shape']) 119 | topics_KV = topic_count_KV.toarray().copy() 120 | del Q 121 | topics_KV += tau 122 | topics_KV /= topics_KV.sum(axis=1)[:,np.newaxis] 123 | 124 | try: 125 | w_txt_fpath = os.path.join(snapshot_path, w_txt_basename) 126 | if w_txt_basename != 'w_CK.txt': 127 | if os.path.exists(w_txt_fpath): 128 | print " USING w_txt_basename:", w_txt_basename 129 | else: 130 | print " FALLING BACK TO w_CK.txt" 131 | w_txt_fpath = os.path.join(snapshot_path, 'w_CK.txt') 132 | w_CK = np.loadtxt(w_txt_fpath) 133 | if w_CK.ndim == 1: 134 | w_CK = w_CK[np.newaxis,:].copy() 135 | 136 | if add_bias_term_to_w_CK != 0.0: 137 | K = w_CK.shape[1] 138 | w_CK = w_CK - add_bias_term_to_w_CK 139 | except IOError: 140 | w_CK = None 141 | return dict( 142 | topics_KV=topics_KV, 143 | w_CK=w_CK, 144 | tau=tau, 145 | alpha=alpha, 146 | lambda_w=lambda_w) 147 | 148 | def save_topic_model_params_as_txt_files( 149 | output_path=None, 150 | prefix='', 151 | topics_KV=None, 152 | w_CK=None, 153 | pi_DK=None, 154 | **kwargs): 155 | snapshot_path = os.path.join( 156 | output_path, 157 | prefix + "_topic_model_snapshot") 158 | mkpath(snapshot_path) 159 | np.savetxt( 160 | os.path.join(snapshot_path, 'topics_KV.txt'), 161 | topics_KV, 162 | fmt='%.11f', 163 | delimiter=' ') 164 | if w_CK is not None: 165 | np.savetxt( 166 | os.path.join(snapshot_path, 'w_CK.txt'), 167 | w_CK, 168 | fmt='%.9f', 169 | delimiter=' ') 170 | if pi_DK is not None: 171 | np.savetxt( 172 | os.path.join(snapshot_path, 'pi_DK.txt'), 173 | pi_DK, 174 | fmt='%.6f', 175 | delimiter=' ') 176 | for key in kwargs: 177 | if key.endswith('_param_dict'): 178 | fpath = os.path.join(snapshot_path, key + ".dump") 179 | joblib.dump(kwargs[key], fpath, compress=1) 180 | 
181 | return snapshot_path 182 | -------------------------------------------------------------------------------- /pc_toolbox/model_slda/slda_utils__param_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parameter management functions for sLDA 3 | 4 | Key functions: 5 | * flatten_to_differentiable_param_vec 6 | * unflatten_to_common_param_dict 7 | """ 8 | 9 | import autograd.numpy as np 10 | from pc_toolbox.utils_diffable_transforms import ( 11 | tfm__2D_rows_sum_to_one, 12 | ) 13 | 14 | def flatten_to_differentiable_param_vec( 15 | param_dict=None, 16 | topics_KV=None, 17 | w_CK=None, 18 | **unused_kwargs): 19 | """ Convert common parameters of sLDA into flat vector of reals. 20 | 21 | Examples 22 | -------- 23 | >>> K = 2; V = 3; C = 2; 24 | >>> topics_KV = np.asarray([[0.6, 0.3, 0.1], [0.2, 0.1, 0.7]]) 25 | >>> w_CK = np.asarray([[4.0, -4.0], [-1.0, 1.0]]) 26 | >>> param_vec = flatten_to_differentiable_param_vec( 27 | ... topics_KV=topics_KV, 28 | ... w_CK=w_CK) 29 | >>> param_dict = unflatten_to_common_param_dict( 30 | ... param_vec=param_vec, n_states=K, n_vocabs=V, n_labels=C) 31 | >>> print param_dict['w_CK'] 32 | [[ 4. -4.] 33 | [-1. 1.]] 34 | 35 | >>> print param_dict['topics_KV'] 36 | [[0.6 0.3 0.1] 37 | [0.2 0.1 0.7]] 38 | >>> np.allclose(param_dict['topics_KV'], topics_KV) 39 | True 40 | """ 41 | if isinstance(param_dict, dict): 42 | topics_KV = param_dict['topics_KV'] 43 | w_CK = param_dict['w_CK'] 44 | return np.hstack([ 45 | tfm__2D_rows_sum_to_one.to_diffable_arr(topics_KV).flatten(), 46 | w_CK.flatten()]) 47 | 48 | def unflatten_to_common_param_dict( 49 | param_vec=None, 50 | n_states=0, 51 | n_vocabs=0, 52 | n_labels=0, 53 | **unused_kwargs): 54 | K = int(n_states) 55 | V = int(n_vocabs) 56 | C = int(n_labels) 57 | F_topics = K * (V-1) 58 | topics_KV = tfm__2D_rows_sum_to_one.to_common_arr( 59 | param_vec[:F_topics].reshape(K, V-1)) 60 | w_CK = np.reshape(param_vec[F_topics:], (C, K)) 61 | return dict(topics_KV=topics_KV, w_CK=w_CK) -------------------------------------------------------------------------------- /pc_toolbox/topic_quality_metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/pc_toolbox/topic_quality_metrics/__init__.py -------------------------------------------------------------------------------- /pc_toolbox/topic_quality_metrics/calc_coherence_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse 3 | 4 | def calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic( 5 | top_vocab_ids=None, 6 | ndocs_csc_VV=None, 7 | dataset=None, 8 | pair_smooth_eps=0.1, 9 | marg_smooth_eps=None, 10 | ): 11 | """ Compute Coherence metric for given topic's top-ranked terms. 12 | 13 | Returns 14 | ------- 15 | coherence_score : float 16 | Larger values indicate more coherent topics. 
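        (Note: the function returns two values, the NPMI coherence and the unnormalized PMI coherence, each averaged over all pairs of the provided top-ranked terms, as the Examples below show.)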
17 | 18 | Examples 19 | -------- 20 | >>> x_DV = np.arange(6)[:,np.newaxis] * np.hstack([np.eye(6), np.zeros((6, 3))]) 21 | >>> x_DV[:3, :3] += 1 22 | >>> x_DV[4, 5] += 17 23 | >>> _, ndocs_csc_VV = calc_pairwise_cooccurance_counts(x_csr_DV=x_DV) 24 | 25 | # Compute coherence for a very related pair 26 | >>> calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic([2, 0], ndocs_csc_VV)[0] 27 | 0.86755478351365201 28 | 29 | # Compute coherence for a very unrelated pair 30 | >>> calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic([2, 5], ndocs_csc_VV)[0] 31 | -0.16789018869324493 32 | 33 | # Compute coherence for a pair where one word doesnt appear much 34 | >>> calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic([0, 8], ndocs_csc_VV)[0] 35 | -0.0093324001175008262 36 | 37 | # Try coherence for first 3 (should be large) 38 | >>> calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic([0,1,2], ndocs_csc_VV) 39 | (0.86755478351365201, 1.2954904783676406) 40 | 41 | # Try coherence for a bad set of 3 (should be small) 42 | >>> calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic([0,3,6], ndocs_csc_VV) 43 | (0.13222810917463279, 0.65152143821207875) 44 | """ 45 | top_vocab_ids = np.asarray(top_vocab_ids, dtype=np.int32) 46 | M = top_vocab_ids.size 47 | V = ndocs_csc_VV.shape[0] 48 | diag_ids = np.diag_indices(V) 49 | triu_ids = np.triu_indices(V, 1) 50 | P = len(triu_ids[0]) + len(diag_ids) 51 | 52 | ndocs_V = np.squeeze(np.asarray(ndocs_csc_VV.sum(axis=0))) 53 | ndocs_V -= np.squeeze(np.asarray(ndocs_csc_VV[diag_ids])) 54 | ndocs_V /= 2.0 55 | n_utoken_pairs = float(np.sum(ndocs_csc_VV[triu_ids])) 56 | assert np.allclose(n_utoken_pairs, ndocs_V.sum()) 57 | 58 | if marg_smooth_eps is None: 59 | marg_smooth_eps = float(pair_smooth_eps * P) / V 60 | assert np.allclose( 61 | pair_smooth_eps * P, 62 | marg_smooth_eps * V) 63 | 64 | n_top_pairs = 0.0 65 | npmi_coherence_score = 0.0 66 | pmi_coherence_score = 0.0 67 | for mm, v in enumerate(top_vocab_ids[:-1]): 68 | Mrem = M - mm - 1 69 | counts_v_Mrem = ndocs_csc_VV[v, top_vocab_ids[mm+1:]] 70 | try: 71 | counts_v_Mrem = counts_v_Mrem.toarray() 72 | except AttributeError: 73 | pass 74 | assert counts_v_Mrem.size == Mrem 75 | jointprob_v_Mrem = (counts_v_Mrem + pair_smooth_eps) / (n_utoken_pairs + pair_smooth_eps * P) 76 | margprob_v_Mrem = (ndocs_V[top_vocab_ids[mm+1:]] + marg_smooth_eps) / (ndocs_V.sum() + marg_smooth_eps * V) 77 | margprob_v = (ndocs_V[v] + marg_smooth_eps) / (ndocs_V.sum() + marg_smooth_eps * V) 78 | 79 | denom_Mrem = np.log(jointprob_v_Mrem) 80 | numer_Mrem = denom_Mrem - np.log(margprob_v_Mrem) - np.log(margprob_v) 81 | 82 | npmi_coherence_score_Mrem = numer_Mrem / (-1.0 * denom_Mrem) 83 | assert np.all(npmi_coherence_score_Mrem >= -(1.00001)) 84 | assert np.all(npmi_coherence_score_Mrem <= (1.00001)) 85 | 86 | pmi_coherence_score += np.sum(numer_Mrem) 87 | npmi_coherence_score += np.sum(npmi_coherence_score_Mrem) 88 | n_top_pairs += Mrem 89 | return ( 90 | npmi_coherence_score / (n_top_pairs + 1e-13), 91 | pmi_coherence_score / (n_top_pairs + 1e-13), 92 | ) 93 | 94 | 95 | def calc_umass_coherence_for_top_ranked_terms_in_topic( 96 | top_vocab_ids=None, 97 | ndocs_V=None, 98 | ndocs_csc_VV=None, 99 | topics_KV=None, 100 | k=None, 101 | dataset=None, 102 | pair_smooth_eps=0.1, 103 | marg_smooth_eps=1e-9, 104 | ): 105 | """ Compute Coherence metric for given topic's top-ranked terms. 106 | 107 | Returns 108 | ------- 109 | coherence_score : float 110 | Larger values indicate more coherent topics. 
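        (Note: the score is a sum of smoothed log co-occurrence ratios, log((ndocs(v,w) + pair_smooth_eps) / (ndocs(v) + marg_smooth_eps)), over ordered pairs of the top-ranked terms, so it is typically negative, with values nearer zero indicating stronger co-occurrence.)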
111 | 112 | Examples 113 | -------- 114 | >>> x_DV = np.arange(6)[:,np.newaxis] * np.hstack([np.eye(6), np.zeros((6, 3))]) 115 | >>> x_DV[:3, :3] += 1 116 | >>> x_DV[4, 5] += 17 117 | >>> ndocs_V, ndocs_csc_VV = calc_pairwise_cooccurance_counts(x_csr_DV=x_DV) 118 | >>> coh = calc_umass_coherence_for_top_ranked_terms_in_topic([0, 8], ndocs_V, ndocs_csc_VV) 119 | >>> coh2 = np.log(0.1 / 3.0) 120 | >>> np.allclose(coh, coh2) 121 | True 122 | >>> coh_good = calc_umass_coherence_for_top_ranked_terms_in_topic([0, 1, 2], ndocs_V, ndocs_csc_VV) 123 | >>> coh_bad = calc_umass_coherence_for_top_ranked_terms_in_topic([0, 4, 5], ndocs_V, ndocs_csc_VV) 124 | >>> coh_worst = calc_umass_coherence_for_top_ranked_terms_in_topic([0, 3, 7], ndocs_V, ndocs_csc_VV) 125 | >>> coh_good > coh_bad 126 | True 127 | >>> coh_bad > coh_worst 128 | True 129 | """ 130 | V = ndocs_V.size 131 | top_vocab_ids = np.asarray(top_vocab_ids, dtype=np.int32) 132 | M = top_vocab_ids.size 133 | coherence_score = 0.0 134 | for mm, v in enumerate(top_vocab_ids[:-1]): 135 | Mrem = M - mm - 1 136 | counts_Mrem = ndocs_csc_VV[v, top_vocab_ids[mm+1:]] 137 | try: 138 | counts_Mrem = counts_Mrem.toarray() 139 | except AttributeError: 140 | pass 141 | assert counts_Mrem.size == Mrem 142 | coherence_score += ( 143 | np.sum(np.log(counts_Mrem + pair_smooth_eps)) 144 | - Mrem * np.log(ndocs_V[v] + marg_smooth_eps) 145 | ) 146 | return coherence_score 147 | 148 | def calc_pairwise_cooccurance_counts( 149 | x_csr_DV=None, 150 | dataset=None, 151 | ): 152 | """ Calculate word cooccurances across a corpus of D documents 153 | 154 | Returns 155 | ------- 156 | ndocs_V : 1D array, size V 157 | entry v counts the number of documents that contain v at least once 158 | ndocs_csc_VV : 2D csc sparse matrix, V x V 159 | entry v,w counts the number of documents which contain 160 | the word pair (v, w) at least once 161 | 162 | Examples 163 | -------- 164 | >>> x_DV = np.arange(6)[:,np.newaxis] * np.hstack([np.eye(6), np.zeros((6, 3))]) 165 | >>> x_DV[:3, :3] += 1 166 | >>> x_DV[4, 5] += 17 167 | >>> ndocs_V, ndocs_csc_VV = calc_pairwise_cooccurance_counts(x_csr_DV=x_DV) 168 | >>> ndocs_V.astype(np.int32).tolist() 169 | [3, 3, 3, 1, 1, 2, 0, 0, 0] 170 | >>> ndocs_csc_VV.toarray()[:3, :3] 171 | array([[ 3., 3., 3.], 172 | [ 3., 3., 3.], 173 | [ 3., 3., 3.]]) 174 | """ 175 | if x_csr_DV is None: 176 | x_csr_DV = dataset['x_csr_DV'] 177 | x_csr_DV = scipy.sparse.csr_matrix(x_csr_DV, dtype=np.float64) 178 | 179 | binx_csr_DV = x_csr_DV.copy() 180 | binx_csr_DV.data[:] = 1.0 181 | 182 | ndocs_V = np.squeeze(np.asarray(binx_csr_DV.sum(axis=0))) 183 | 184 | ndocs_csc_VV = (binx_csr_DV.T * binx_csr_DV).tocsc() 185 | return ndocs_V, ndocs_csc_VV 186 | 187 | 188 | -------------------------------------------------------------------------------- /pc_toolbox/utils_data/__init__.py: -------------------------------------------------------------------------------- 1 | from util_data_slicer import make_slice_for_step 2 | from util_stratified_subsample import get_stratified_subsample_ids -------------------------------------------------------------------------------- /pc_toolbox/utils_data/util_data_slicer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def make_slice_for_step( 4 | step_id=0, 5 | n_total=0, 6 | n_batches=1, 7 | seed=42, 8 | **kwargs): 9 | ''' Compute slice for provided step 10 | 11 | If step_id < 0, always given the first slice 12 | Otherwise, give a random slice 13 | 14 | Returns 15 | 
------- 16 | cur_slice : slice object 17 | ''' 18 | if step_id >= 0: 19 | ## Seed the random generator with current lap number 20 | prng = np.random.RandomState(seed + (step_id // n_batches)) 21 | batch_order = prng.permutation(n_batches) 22 | batch_id = batch_order[step_id % n_batches] 23 | else: 24 | batch_id = 0 25 | batch_size = int(np.ceil(n_total / float(n_batches))) 26 | start = batch_id * batch_size 27 | stop = np.minimum(n_total, (batch_id + 1) * batch_size) 28 | return slice(start, stop) 29 | 30 | 31 | -------------------------------------------------------------------------------- /pc_toolbox/utils_data/util_stratified_subsample.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pc_toolbox.utils_io import pprint 4 | 5 | def get_stratified_subsample_ids( 6 | y_DC=None, 7 | n_subsamples=1000, 8 | min_per_label=5, 9 | seed=42, 10 | verbose=False): 11 | ''' Get row ids of examples to keep in subsample for initializing weights 12 | 13 | Returns 14 | ------- 15 | doc_ids : 1D array of ids 16 | 17 | Examples 18 | -------- 19 | >>> y_DC = np.zeros((1000, 3)) 20 | >>> y_DC[200:205, 0] = 1 21 | >>> y_DC[400:405, 1] = 1 22 | >>> y_DC[:995, 2] = 1 23 | >>> mask = get_stratified_subsample_ids(y_DC, 10, min_per_label=5) 24 | >>> mask.tolist() 25 | [200, 201, 202, 203, 204, 400, 401, 402, 403, 404, 995, 996, 997, 998, 999] 26 | >>> np.sum(y_DC[mask] == 0, axis=0).tolist() 27 | [10, 10, 10] 28 | >>> np.sum(y_DC[mask] == 1, axis=0).tolist() 29 | [5, 5, 5] 30 | ''' 31 | n_labels = y_DC.shape[1] 32 | n_examples = y_DC.shape[0] 33 | if n_subsamples >= n_examples: 34 | return np.arange(n_examples) 35 | # If here, we actually need to subsample 36 | 37 | # Make version of y_DC where 1 is the minority class in EVERY column 38 | sums_total = np.sum(y_DC, axis=0) 39 | need_flip = sums_total / n_examples > 0.5 40 | y_DC[:, need_flip] = 1.0 - y_DC[:, need_flip] 41 | sums_total[need_flip] = n_examples - sums_total[need_flip] 42 | 43 | 44 | keep_mask = np.zeros(y_DC.shape[0], dtype=np.bool) 45 | sums_subsample = np.sum(y_DC[keep_mask], axis=0) 46 | for c in xrange(n_labels): 47 | if sums_subsample[c] < min_per_label \ 48 | and sums_subsample[c] < sums_total[c]: 49 | n_more = np.minimum(min_per_label, sums_total[c]) 50 | on_ids = np.flatnonzero(y_DC[:, c])[:min_per_label] 51 | keep_mask[on_ids] = True 52 | size = np.sum(keep_mask) 53 | if size < n_subsamples: 54 | prng = np.random.RandomState(seed) 55 | eligible_ids = np.flatnonzero(keep_mask == 0) 56 | chosen_ids = prng.choice( 57 | eligible_ids, n_subsamples - size, replace=False) 58 | keep_mask[chosen_ids] = 1 59 | size = np.sum(keep_mask) 60 | assert size >= n_subsamples 61 | sums_subsample = np.sum(y_DC[keep_mask], axis=0) 62 | if verbose: 63 | pprint('Minority examples per label in dataset of size %d' % n_examples) 64 | pprint(' '.join(['%4d' % val for val in sums_total])) 65 | pprint('Minority examples per label in subsample of size %d:' % size) 66 | pprint(' '.join(['%4d' % val for val in sums_subsample])) 67 | return np.flatnonzero(keep_mask) 68 | -------------------------------------------------------------------------------- /pc_toolbox/utils_diffable_transforms/__init__.py: -------------------------------------------------------------------------------- 1 | import util_differentiable_transform__2D_rows_sum_to_one \ 2 | as tfm__2D_rows_sum_to_one 3 | import util_differentiable_transform__unit_interval \ 4 | as tfm__unit_interval 5 | import 
util_differentiable_transform__log_unit_interval \ 6 | as tfm__log_unit_interval 7 | 8 | # Make a few functions easily available 9 | logistic_sigmoid = tfm__unit_interval.logistic_sigmoid 10 | log_logistic_sigmoid = tfm__log_unit_interval.log_logistic_sigmoid 11 | -------------------------------------------------------------------------------- /pc_toolbox/utils_diffable_transforms/util_differentiable_transform__2D_rows_sum_to_one.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentiable transform for 2D array whose rows sum to one. 3 | 4 | Need to_common_arr to have no edge cases ... 5 | always deliver something that sums to 1.0 6 | with minimum value min_eps 7 | 8 | Need to_diffable_arr to be robust 9 | always cast its input to the proper domain 10 | and then take logs, etc. 11 | 12 | ''' 13 | 14 | import autograd.numpy as np 15 | from autograd.scipy.misc import logsumexp 16 | 17 | MIN_EPS = 1e-11 18 | 19 | def to_common_arr( 20 | log_topics_KVm1, 21 | min_eps=MIN_EPS, 22 | **kwargs): 23 | ''' Convert unconstrained topic weights to proper normalized topics 24 | 25 | Should handle any non-nan, non-inf input without numerical problems. 26 | 27 | Args 28 | ---- 29 | log_topics_KVm1 : 2D array, size K x V-1 30 | 31 | Returns 32 | ------- 33 | topics_KV : 2D array, size K x V 34 | minimum value of any entry will be min_eps 35 | each row will sum to 1.0 (+/- min_eps) 36 | ''' 37 | K, Vm1 = log_topics_KVm1.shape 38 | V = Vm1 + 1 39 | log_topics_KV = np.hstack([ 40 | log_topics_KVm1, 41 | np.zeros((K, 1))]) 42 | log_topics_KV -= logsumexp(log_topics_KV, axis=1, keepdims=1) 43 | log_topics_KV += np.log1p(-V * min_eps) 44 | topics_KV = np.exp(log_topics_KV) 45 | 46 | return min_eps + topics_KV 47 | 48 | def to_diffable_arr(topics_KV, min_eps=MIN_EPS, do_force_safe=False): 49 | ''' Transform normalized topics to unconstrained space. 50 | 51 | Args 52 | ---- 53 | topics_KV : 2D array, size K x V 54 | minimum value of any entry must be min_eps 55 | each row should sum to 1.0 56 | 57 | Returns 58 | ------- 59 | log_topics_vec : 2D array, size K x (V-1) 60 | unconstrained real values 61 | 62 | Examples 63 | -------- 64 | >>> topics_KV = np.eye(3) + np.ones((3,3)) 65 | >>> topics_KV /= topics_KV.sum(axis=1)[:,np.newaxis] 66 | >>> log_topics_vec = to_diffable_arr(topics_KV) 67 | >>> out_KV = to_common_arr(log_topics_vec) 68 | >>> np.allclose(out_KV, topics_KV) 69 | True 70 | ''' 71 | if do_force_safe: 72 | topics_KV = to_safe_common_arr(topics_KV, min_eps) 73 | K, V = topics_KV.shape 74 | log_topics_KV = np.log(topics_KV) 75 | log_topics_KVm1 = log_topics_KV[:, :-1] 76 | log_topics_KVm1 = log_topics_KVm1 - log_topics_KV[:, -1][:,np.newaxis] 77 | return log_topics_KVm1 + np.log1p(-V * min_eps) 78 | 79 | def to_safe_common_arr(topics_KV, min_eps=MIN_EPS): 80 | ''' Force provided topics_KV array to be numerically safe. 
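    Works on a copy of the input: two passes of row-normalization followed by clipping at min_eps, so every entry is at least min_eps and each row sums to one up to a tiny tolerance.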
81 | 82 | Returns 83 | ------- 84 | topics_KV : 2D array, size K x V 85 | minimum value of each row is min_eps 86 | each row will sum to 1.0 (+/- min_eps) 87 | ''' 88 | K, V = topics_KV.shape 89 | topics_KV = topics_KV.copy() 90 | for rep in range(2): 91 | topics_KV /= topics_KV.sum(axis=1)[:,np.newaxis] 92 | np.maximum(topics_KV, min_eps, out=topics_KV) 93 | return topics_KV 94 | 95 | 96 | if __name__ == '__main__': 97 | topics_KV = np.eye(3) + np.ones((3,3)) 98 | topics_KV /= topics_KV.sum(axis=1)[:,np.newaxis] 99 | 100 | print('------ before') 101 | print(topics_KV) 102 | print('------ after') 103 | print(to_common_arr(to_diffable_arr(topics_KV))) -------------------------------------------------------------------------------- /pc_toolbox/utils_diffable_transforms/util_differentiable_transform__log_unit_interval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Define invertible transform from unit interval to real line. 3 | 4 | Examples 5 | -------- 6 | >>> from autograd import elementwise_grad 7 | >>> g_auto = elementwise_grad( 8 | ... _log_logistic_sigmoid_not_vectorized) 9 | >>> g_manual = elementwise_grad(log_logistic_sigmoid) 10 | >>> vals = np.linspace(-5000., 5000., 100) 11 | >>> for x in vals: assert np.allclose(g_auto(x), g_manual(x)) 12 | 13 | # Can successfully call g_manual on array of values 14 | >>> np.all(np.isfinite(g_manual(vals))) 15 | True 16 | 17 | # Cannot do so with autograd 18 | >>> g_auto(vals) 19 | Traceback (most recent call last): 20 | ... 21 | ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() 22 | ''' 23 | 24 | import autograd.numpy as np 25 | from autograd.scipy.misc import logsumexp 26 | from autograd.core import primitive 27 | from autograd import elementwise_grad, grad 28 | try: 29 | from autograd.extend import primitive, defvjp # defvjp is now a function 30 | except ImportError: 31 | from autograd.core import primitive 32 | defvjp = None 33 | 34 | def to_common_arr(x): 35 | return log_logistic_sigmoid(x) 36 | 37 | def _log_logistic_sigmoid(x_real): 38 | ''' Compute log of logistic sigmoid transform from real line to unit interval. 39 | 40 | Numerically stable and fully vectorized. 
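    Evaluates log(1 / (1 + exp(-x))) as -log1p(exp(-x)) when x > 50 and as x - log1p(exp(x)) otherwise, so the exp never overflows.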
41 | 42 | Args 43 | ---- 44 | x_real : array-like, with values in (-infty, +infty) 45 | 46 | Returns 47 | ------- 48 | log_p_real : array-like, size of x_real, with values in <= 0 49 | ''' 50 | if not isinstance(x_real, float): 51 | out = np.zeros_like(x_real) 52 | mask1 = x_real > 50.0 53 | out[mask1] = - np.log1p(np.exp(-x_real[mask1])) 54 | mask0 = np.logical_not(mask1) 55 | out[mask0] = x_real[mask0] 56 | out[mask0] -= np.log1p(np.exp(x_real[mask0])) 57 | return out 58 | return _log_logistic_sigmoid_not_vectorized(x_real) 59 | 60 | def _log_logistic_sigmoid_not_vectorized(x_real): 61 | if x_real > 50.0: 62 | return - np.log1p(np.exp(-x_real)) 63 | else: 64 | return x_real - np.log1p(np.exp(x_real)) 65 | 66 | @primitive 67 | def log_logistic_sigmoid(x): 68 | return _log_logistic_sigmoid(x) 69 | 70 | # Definite gradient function via manual formula 71 | # Supporting different versions of autograd software 72 | if defvjp is not None: 73 | # Latest version of autograd 74 | def _vjp__log_logistic_sigmoid(ans, x): 75 | def _my_gradient(g, x=x, ans=ans): 76 | x = np.asarray(x) 77 | return np.full(x.shape, g) * (1 - np.exp(ans)) 78 | return _my_gradient 79 | defvjp( 80 | log_logistic_sigmoid, 81 | _vjp__log_logistic_sigmoid, 82 | ) 83 | elif hasattr(primitive, 'defvjp'): 84 | # Slightly older version of autograd 85 | def _vjp__log_logistic_sigmoid(g, ans, vs, gvs, x): 86 | x = np.asarray(x) 87 | return np.full(x.shape, g) * (1 - np.exp(ans)) 88 | log_logistic_sigmoid.defvjp(_vjp__log_logistic_sigmoid) 89 | else: 90 | # Older version of autograd 91 | def _make_grad_product(ans, x): 92 | x = np.asarray(x) 93 | def grad_product(g): 94 | return np.full(x.shape, g) * (1 - np.exp(ans)) 95 | return grad_product 96 | log_logistic_sigmoid.defgrad(_make_grad_product) 97 | -------------------------------------------------------------------------------- /pc_toolbox/utils_diffable_transforms/util_differentiable_transform__unit_interval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentiable transform for any array whose elements lie in (0,1). 3 | 4 | logistic_sigmoid(x) : 1 / (1 + exp(-x)) 5 | 6 | Examples 7 | -------- 8 | >>> from autograd import elementwise_grad 9 | >>> g_auto = elementwise_grad( 10 | ... _logistic_sigmoid_not_vectorized) 11 | >>> g_manual = elementwise_grad(logistic_sigmoid) 12 | 13 | # Create grid of possible inputs 14 | >>> vals = np.linspace(-5000., 5000., 100) 15 | 16 | # Verify two funcs compute the same answers for all grid elements 17 | >>> for x in vals: assert np.allclose(g_auto(x), g_manual(x)) 18 | 19 | # Can successfully call g_manual on array of values 20 | >>> np.all(np.isfinite(g_manual(vals))) 21 | True 22 | 23 | # Cannot do so with autograd using not-vectorized function 24 | >>> g_auto(vals) 25 | Traceback (most recent call last): 26 | ... 27 | ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() 28 | ''' 29 | 30 | import autograd.numpy as np 31 | import autograd.extend 32 | from autograd.scipy.misc import logsumexp 33 | from autograd import elementwise_grad, grad 34 | try: 35 | from autograd.extend import primitive, defvjp # defvjp is now a function 36 | except ImportError: 37 | from autograd.core import primitive 38 | defvjp = None 39 | 40 | MIN_VAL=1e-200 41 | MAX_VAL=1 - 1e-14 42 | 43 | @primitive 44 | def logistic_sigmoid(x_real): 45 | ''' Compute logistic sigmoid transform from real line to unit interval. 46 | 47 | Numerically stable and fully vectorized. 
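    Evaluates 1 / (1 + exp(-x)) directly when x > 50 and as exp(x) / (1 + exp(x)) otherwise, so the exp never overflows.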
48 | 49 | Args 50 | ---- 51 | x_real : array-like, with values in (-infty, +infty) 52 | 53 | Returns 54 | ------- 55 | p_real : array-like, size of x_real, with values in (0, 1) 56 | 57 | Examples 58 | -------- 59 | >>> logistic_sigmoid(-55555.) 60 | 0.0 61 | >>> logistic_sigmoid(0.0) 62 | 0.5 63 | >>> logistic_sigmoid(55555.) 64 | 1.0 65 | >>> logistic_sigmoid(np.asarray([-999999, 0, 999999.])) 66 | array([ 0. , 0.5, 1. ]) 67 | ''' 68 | if not isinstance(x_real, float): 69 | out = np.zeros_like(x_real) 70 | mask1 = x_real > 50.0 71 | out[mask1] = 1.0 / (1.0 + np.exp(-x_real[mask1])) 72 | mask0 = np.logical_not(mask1) 73 | out[mask0] = np.exp(x_real[mask0]) 74 | out[mask0] /= (1.0 + out[mask0]) 75 | return out 76 | if x_real > 50.0: 77 | pos_real = np.exp(-x_real) 78 | return 1.0 / (1.0 + pos_real) 79 | else: 80 | pos_real = np.exp(x_real) 81 | return pos_real / (1.0 + pos_real) 82 | 83 | def _logistic_sigmoid_not_vectorized(x_real): 84 | if x_real > 50.0: 85 | pos_real = np.exp(-x_real) 86 | return 1.0 / (1.0 + pos_real) 87 | else: 88 | pos_real = np.exp(x_real) 89 | return pos_real / (1.0 + pos_real) 90 | 91 | 92 | # Definite gradient function via manual formula 93 | # Supporting different versions of autograd software 94 | if defvjp is not None: 95 | # Latest version of autograd 96 | def _vjp__logistic_sigmoid(ans, x): 97 | def _my_gradient(g, x=x, ans=ans): 98 | x = np.asarray(x) 99 | return np.full(x.shape, g) * ans * (1.0 - ans) 100 | return _my_gradient 101 | defvjp( 102 | logistic_sigmoid, 103 | _vjp__logistic_sigmoid, 104 | ) 105 | elif hasattr(primitive, 'defvjp'): 106 | def _vjp__logistic_sigmoid(ans, g, vs, gvs, x): 107 | x = np.asarray(x) 108 | return np.full(x.shape, g) * ans * (1.0 - ans) 109 | logistic_sigmoid.defvjp(_vjp__logistic_sigmoid) 110 | else: 111 | def _make_grad_prod(ans,x): 112 | x = np.asarray(x) 113 | def gradient_product(g): 114 | return np.full(x.shape, g) * ans * (1-ans) 115 | return gradient_product 116 | logistic_sigmoid.defgrad(_make_grad_prod) 117 | 118 | 119 | def inv_logistic_sigmoid( 120 | p, do_force_safe=True): 121 | ''' Compute inverse logistic sigmoid from unit interval to reals. 122 | 123 | Numerically stable and fully vectorized. 124 | 125 | Args 126 | ---- 127 | p : array-like, with values in (0, 1) 128 | 129 | Returns 130 | ------- 131 | x : array-like, size of p, with values in (-infty, infty) 132 | 133 | Examples 134 | -------- 135 | >>> np.round(inv_logistic_sigmoid(0.11), 6) 136 | -2.090741 137 | >>> np.round(inv_logistic_sigmoid(0.5), 6) 138 | 0.0 139 | >>> np.round(inv_logistic_sigmoid(0.89), 6) 140 | 2.090741 141 | 142 | >>> p_vec = np.asarray([ 143 | ... 1e-100, 1e-10, 1e-5, 144 | ... 
0.25, 0.75, .9999, 1-1e-14]) 145 | >>> np.round(inv_logistic_sigmoid(p_vec), 2) 146 | array([-230.26, -23.03, -11.51, -1.1 , 1.1 , 9.21, 32.24]) 147 | ''' 148 | if do_force_safe: 149 | p = np.minimum(np.maximum(p, MIN_VAL), MAX_VAL) 150 | return np.log(p) - np.log1p(-p) 151 | 152 | def to_safe_common_arr(p): 153 | p = np.minimum(np.maximum(p, MIN_VAL), MAX_VAL) 154 | return p 155 | 156 | to_common_arr = logistic_sigmoid 157 | to_diffable_arr = inv_logistic_sigmoid -------------------------------------------------------------------------------- /pc_toolbox/utils_io/__init__.py: -------------------------------------------------------------------------------- 1 | from pprint_logging import pprint, config_pprint_logging 2 | from util_pprint_percentiles import make_percentile_str 3 | 4 | from util_watermark import make_string_of_reachable_modules_with_versions 5 | 6 | from util_timing import ( 7 | start_timer_segment, 8 | stop_timer_segment, 9 | pprint_timer_segments, 10 | ) 11 | 12 | from util_io_training import ( 13 | do_print_now, 14 | do_save_now, 15 | default_settings_alg_io, 16 | init_alg_state_kwargs, 17 | update_alg_state_kwargs, 18 | make_status_string, 19 | save_status_to_txt_files, 20 | append_to_txtfile, 21 | update_alg_state_kwargs_after_print, 22 | update_alg_state_kwargs_after_save, 23 | update_symbolic_link, 24 | calc_laps_when_snapshots_saved, 25 | ) 26 | 27 | from util_setup import ( 28 | setup_detect_taskid_and_insert_into_output_path, 29 | setup_random_seed, 30 | setup_output_path, 31 | write_user_provided_kwargs_to_txt, 32 | write_env_vars_to_txt, 33 | write_python_module_versions_to_txt, 34 | ) 35 | 36 | from util_io_csr import ( 37 | load_csr_matrix, 38 | save_csr_matrix, 39 | load_csr_matrix_from_ldac_txtfile, 40 | ) 41 | 42 | from util_io_txt import ( 43 | load_list_of_strings_from_txt, 44 | load_list_of_unicode_from_txt, 45 | ) 46 | 47 | from util_array import ( 48 | toCArray, 49 | as1D, 50 | as2D, 51 | as3D) 52 | -------------------------------------------------------------------------------- /pc_toolbox/utils_io/pprint_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | RootLog = None 6 | 7 | def pprint(msg_str='', level=logging.INFO): 8 | global RootLog 9 | if RootLog is None: 10 | print msg_str 11 | else: 12 | RootLog.log(level, msg_str) 13 | 14 | def config_pprint_logging( 15 | output_path='/tmp/', 16 | do_write_txtfile=True, 17 | do_write_stdout=True, 18 | txtfile='stdout.txt', 19 | ): 20 | global RootLog 21 | RootLog = logging.getLogger('pprint_logging') 22 | RootLog.handlers = [] 23 | RootLog.setLevel(logging.DEBUG) 24 | 25 | formatter = logging.Formatter('%(message)s') 26 | # Config logger to save transcript of log messages to plain-text file 27 | if do_write_txtfile: 28 | fh = logging.FileHandler(os.path.join(output_path, txtfile)) 29 | fh.setLevel(logging.DEBUG) 30 | fh.setFormatter(formatter) 31 | RootLog.addHandler(fh) 32 | # Config logger that can write to stdout 33 | if do_write_stdout: 34 | ch = logging.StreamHandler(sys.stdout) 35 | ch.setLevel(logging.DEBUG) 36 | ch.setFormatter(formatter) 37 | RootLog.addHandler(ch) 38 | 39 | # Config null logger, avoids error messages about no handler existing 40 | if not do_write_txtfile and not do_write_stdout: 41 | RootLog.addHandler(logging.NullHandler()) 42 | 43 | ''' 44 | # Prepare special logs if we are running on the Brown CS grid 45 | try: 46 | jobID = int(os.getenv('JOB_ID')) 47 | except TypeError: 48 | jobID = 
0 49 | if jobID > 0: 50 | Log.info('SGE Grid Job ID: %d' % (jobID)) 51 | 52 | if 'SGE_STDOUT_PATH' in os.environ: 53 | # Create symlinks to captured stdout, stderr in output directory 54 | os.symlink(os.getenv('SGE_STDOUT_PATH'), 55 | os.path.join(taskoutpath, 'stdout')) 56 | os.symlink(os.getenv('SGE_STDERR_PATH'), 57 | os.path.join(taskoutpath, 'stderr')) 58 | 59 | with open(os.path.join(taskoutpath, 'GridInfo.txt'), 'w') as f: 60 | f.write(str(jobID) + "\n") 61 | f.write(str(taskid) + "\n") 62 | f.write('stdout: ' + os.getenv('SGE_STDOUT_PATH') + "\n") 63 | f.write('stderr: ' + os.getenv('SGE_STDERR_PATH') + "\n") 64 | return jobID 65 | ''' 66 | -------------------------------------------------------------------------------- /pc_toolbox/utils_io/util_array.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def toCArray(X, dtype=np.float64): 4 | """ Convert input into numpy array of C-contiguous order. 5 | 6 | Ensures returned array is aligned and owns its own data, 7 | not a view of another array. 8 | 9 | Returns 10 | ------- 11 | X : ND array 12 | 13 | Examples 14 | ------- 15 | >>> Q = np.zeros(10, dtype=np.int32, order='F') 16 | >>> toCArray(Q).flags.c_contiguous 17 | True 18 | >>> toCArray(Q).dtype.byteorder 19 | '=' 20 | """ 21 | X = np.asarray_chkfinite(X, dtype=dtype, order='C') 22 | if X.dtype.byteorder != '=': 23 | X = X.newbyteorder('=').copy() 24 | if not X.flags.owndata or not X.flags.aligned: 25 | X = X.copy() 26 | assert X.flags.owndata 27 | assert X.flags.aligned 28 | return X 29 | 30 | def as1D(x): 31 | """ Convert input into a 1D numpy array. 32 | 33 | Returns 34 | ------- 35 | x : 1D array 36 | 37 | Examples 38 | ------- 39 | >>> as1D(5) 40 | array([5]) 41 | >>> as1D([1,2,3]) 42 | array([1, 2, 3]) 43 | >>> as1D([[3,4,5,6]]) 44 | array([3, 4, 5, 6]) 45 | """ 46 | if not isinstance(x, np.ndarray): 47 | x = np.asarray_chkfinite(x) 48 | if x.ndim < 1: 49 | x = np.asarray_chkfinite([x]) 50 | elif x.ndim > 1: 51 | x = np.squeeze(x) 52 | return x 53 | 54 | 55 | def as2D(x): 56 | """ Convert input into a 2D numpy array. 57 | 58 | 59 | Returns 60 | ------- 61 | x : 2D array 62 | 63 | Examples 64 | ------- 65 | >>> as2D(5) 66 | array([[5]]) 67 | >>> as2D([1,2,3]) 68 | array([[1, 2, 3]]) 69 | >>> as2D([[3,4,5,6]]) 70 | array([[3, 4, 5, 6]]) 71 | """ 72 | if not isinstance(x, np.ndarray): 73 | x = np.asarray_chkfinite(x) 74 | if x.ndim < 1: 75 | x = np.asarray_chkfinite([x]) 76 | while x.ndim < 2: 77 | x = x[np.newaxis, :] 78 | return x 79 | 80 | 81 | def as3D(x): 82 | """ Convert input into a 3D numpy array.
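Implementation note: singleton axes are prepended via x[np.newaxis, :] until the array has at least 3 dimensions; higher-dimensional inputs pass through unchanged.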
83 | 84 | Returns 85 | ------- 86 | x : 3D array 87 | 88 | Examples 89 | ------- 90 | >>> as3D(5) 91 | array([[[5]]]) 92 | >>> as3D([1,2,3]) 93 | array([[[1, 2, 3]]]) 94 | >>> as3D([[3,4,5,6]]) 95 | array([[[3, 4, 5, 6]]]) 96 | """ 97 | if not isinstance(x, np.ndarray): 98 | x = np.asarray_chkfinite(x) 99 | if x.ndim < 1: 100 | x = np.asarray_chkfinite([x]) 101 | while x.ndim < 3: 102 | x = x[np.newaxis, :] 103 | return x 104 | 105 | -------------------------------------------------------------------------------- /pc_toolbox/utils_io/util_io_csr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse 3 | import os 4 | 5 | def load_csr_matrix(filename): 6 | Q = np.load(filename) 7 | return scipy.sparse.csr_matrix( 8 | (Q['data'], Q['indices'], Q['indptr']), 9 | shape=Q['shape']) 10 | 11 | def save_csr_matrix(filename, array): 12 | np.savez( 13 | filename, 14 | data=array.data, 15 | indices=array.indices, 16 | indptr=array.indptr, 17 | shape=array.shape) 18 | 19 | def load_csr_matrix_from_ldac_txtfile( 20 | filepath=None, 21 | shape=None, 22 | n_vocabs=None, 23 | index_dtype=np.int32, 24 | data_dtype=np.float64, 25 | ): 26 | ''' Creates csr_matrix from a .ldac formatted plain-text file. 27 | 28 | Returns 29 | ------- 30 | x_DV : scipy.sparse.csr_matrix 31 | ''' 32 | assert n_vocabs is not None or shape is not None 33 | # Estimate num tokens in the file 34 | fileSize_bytes = os.path.getsize(filepath) 35 | nTokensPerByte = 1.0 / 5 36 | estimate_nUniqueTokens = int(nTokensPerByte * fileSize_bytes) 37 | 38 | # Preallocate space 39 | word_id = np.zeros(estimate_nUniqueTokens, dtype=index_dtype) 40 | word_ct = np.zeros(estimate_nUniqueTokens, dtype=data_dtype) 41 | nSeen = 0 42 | doc_sizes = [] 43 | with open(filepath, 'r') as f: 44 | # Simple case: read the whole file 45 | for line in f.readlines(): 46 | nUnique_d = -1 47 | while nUnique_d < 0: 48 | try: 49 | nUnique_d = process_ldac_line_into_preallocated_arrays( 50 | line, word_id, word_ct, nSeen) 51 | assert nUnique_d >= 0 52 | except IndexError as e: 53 | # Preallocated arrays not large enough 54 | # Double our preallocation, then try again 55 | extra_word_id = np.zeros(word_id.size, dtype=word_id.dtype) 56 | extra_word_ct = np.zeros(word_ct.size, dtype=word_ct.dtype) 57 | word_id = np.hstack([word_id, extra_word_id]) 58 | word_ct = np.hstack([word_ct, extra_word_ct]) 59 | 60 | doc_sizes.append(nUnique_d) 61 | nSeen += nUnique_d 62 | word_id = word_id[:nSeen] 63 | word_ct = word_ct[:nSeen] 64 | n_docs = len(doc_sizes) 65 | doc_range = np.asarray(np.hstack([0, np.cumsum(doc_sizes)]), dtype=index_dtype) 66 | 67 | if shape is None: 68 | assert n_vocabs is not None 69 | shape = (n_docs, n_vocabs) 70 | x_csr_DV = scipy.sparse.csr_matrix( 71 | (word_ct, word_id, doc_range), 72 | shape=shape) 73 | return x_csr_DV 74 | 75 | def process_ldac_line_into_preallocated_arrays(line, word_id, word_ct, start): 76 | """ 77 | 78 | Returns 79 | ------- 80 | n_unique_d : int, value of the leading count field on the line 81 | 82 | Examples 83 | -------- 84 | >>> word_id = np.zeros(5, dtype=np.int32) 85 | >>> word_ct = np.zeros(5, dtype=np.float64) 86 | >>> a = process_ldac_line_into_preallocated_arrays( 87 | ... '5 66:6 77:7 88:8', 88 | ... word_id, word_ct, 0) 89 | >>> a 90 | 5 91 | >>> word_id.tolist() 92 | [66, 77, 88, 0, 0] 93 | >>> word_ct.tolist() 94 | [6.0, 7.0, 8.0, 0.0, 0.0]
94 | """ 95 | line = line.replace(':', ' ') 96 | data = np.fromstring(line, sep=' ', dtype=np.int32) 97 | stop = start + (len(data) - 1) // 2 98 | if stop >= word_id.size: 99 | raise IndexError("Provided array not large enough") 100 | word_id[start:stop] = data[1::2] 101 | word_ct[start:stop] = data[2::2] 102 | return data[0] 103 | -------------------------------------------------------------------------------- /pc_toolbox/utils_io/util_io_txt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import codecs 3 | 4 | def load_list_of_unicode_from_txt(txt_path): 5 | ''' Loads list of unicode_strings from plain-txt file 6 | 7 | Returns 8 | ------- 9 | list_of_u : list of unicode strings 10 | ''' 11 | possible_fmts = ['utf-8', 'iso-8859-1', 'ascii'] 12 | for fid, fmt in enumerate(possible_fmts): 13 | try: 14 | list_of_u = list() 15 | with codecs.open(txt_path, 'rU', fmt) as f: 16 | for line in f.readlines(): 17 | u = line.strip() 18 | u = u.replace(u' ', u'_') 19 | u = u.replace(u',', u'+') 20 | list_of_u.append(u) 21 | return list_of_u 22 | except UnicodeDecodeError as e: 23 | pass 24 | raise e 25 | 26 | def load_list_of_strings_from_txt(txt_path): 27 | ''' Load list of strings from a plain-txt file 28 | 29 | Assumes each string is on separate line of the file. 30 | 31 | Returns 32 | ------- 33 | list_of_str : list of strings 34 | Will have any whitespace replaced by underscore 35 | ''' 36 | array_of_str = np.loadtxt(txt_path, dtype=str, delimiter='\n') 37 | if array_of_str.ndim < 1: 38 | array_of_str = np.expand_dims(array_of_str, axis=0) 39 | list_of_str = [ 40 | s.replace(' ', '_').replace(',','+') 41 | for s in array_of_str.tolist()] 42 | assert isinstance(list_of_str, list) 43 | return list_of_str 44 | -------------------------------------------------------------------------------- /pc_toolbox/utils_io/util_pprint_percentiles.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def make_percentile_str( 4 | arr, 5 | percentiles=[0, 1, 10, 50, 90, 99, 100], 6 | fmt_str="%4d", 7 | sep_str=' '): 8 | msg_list = list() 9 | for p in percentiles: 10 | cur_fmt = "%3d%%:" + fmt_str 11 | msg_list.append( 12 | cur_fmt % (p, np.percentile(arr, p))) 13 | return sep_str.join(msg_list) 14 | -------------------------------------------------------------------------------- /pc_toolbox/utils_io/util_timing.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | 4 | def start_timer_segment(etime_info_dict, label): 5 | etime_info_dict[label + "_start"] = time.time() 6 | return etime_info_dict 7 | 8 | def stop_timer_segment(etime_info_dict, label): 9 | try: 10 | etime_info_dict[label + "_elapsed"] = \ 11 | time.time() - etime_info_dict[label + "_start"] 12 | except KeyError as e: 13 | pass 14 | return etime_info_dict 15 | 16 | def pprint_timer_segments( 17 | etime_info_dict, total_key='total', prefix=''): 18 | line_list = list() 19 | total_elapsed_key= total_key + "_elapsed" 20 | try: 21 | total_elapsed = etime_info_dict[total_elapsed_key] 22 | except KeyError: 23 | # Find earliest start time and mark that as elapsed 24 | earliest_start_time = np.inf 25 | for key in etime_info_dict: 26 | if key.endswith('_start'): 27 | etime = etime_info_dict[key] 28 | if etime < earliest_start_time: 29 | earliest_start_time = etime 30 | total_elapsed = time.time() - earliest_start_time 31 | total_measured = 0.0 32 | for key in 
etime_info_dict: 33 | if key.endswith("_elapsed"): 34 | etime = etime_info_dict[key] 35 | total_measured += etime 36 | msg_line = "%-10s %5.1f%% %8.2f sec %s" % ( 37 | prefix, 38 | float(etime / total_elapsed) * 100, 39 | etime, 40 | key.replace('_elapsed', '')) 41 | line_list.append(msg_line) 42 | 43 | total_other = total_elapsed - total_measured 44 | msg_line = "%-10s %5.1f%% %8.2f sec %s" % ( 45 | prefix, 46 | float(total_other / total_elapsed) * 100, 47 | total_other, 48 | 'other_unmeasured') 49 | line_list.append(msg_line) 50 | 51 | return '\n'.join(line_list) + '\n' 52 | -------------------------------------------------------------------------------- /pc_toolbox/utils_io/util_watermark.py: -------------------------------------------------------------------------------- 1 | import pip 2 | 3 | def make_string_of_reachable_modules_with_versions(context_dict=None): 4 | if context_dict is None: 5 | context_dict = globals() 6 | reachable_modules = dict() 7 | for key, val in context_dict.items(): 8 | if key.startswith('_'): 9 | continue 10 | if str(type(val)).count('module'): 11 | # This trick will import parent package 12 | # e.g. scipy.stats becomes scipy 13 | if val.__package__ is None: 14 | mod_name = val.__name__ 15 | mod = val 16 | else: 17 | try: 18 | mod = __import__(val.__package__) 19 | except ImportError: 20 | continue 21 | mod_name = mod.__name__ 22 | reachable_modules[mod_name] = mod 23 | if hasattr(mod, '__requirements__'): 24 | for req_line in mod.__requirements__: 25 | if req_line.count("=="): 26 | mname = req_line.split("==")[0] 27 | elif req_line.count(">="): 28 | mname = req_line.split(">=")[0] 29 | reachable_modules[mname] = None 30 | 31 | ver_info_list = [val for val in pip.operations.freeze.freeze()] 32 | 33 | explained_reachables = [] 34 | ans_list = [] 35 | for vstr in ver_info_list: 36 | if vstr.count('=='): 37 | name, version = vstr.split("==") 38 | elif vstr.count('egg'): 39 | parts = vstr.split('#egg=') 40 | name = parts[1] 41 | version = parts[0].replace('-e ', '') 42 | if version.count('.git@'): 43 | # Only display first 10 chars of git hash 44 | version = version[:version.find('.git@') + 15] 45 | else: 46 | name = vstr 47 | for mod_name in reachable_modules.keys(): 48 | if vstr.count(mod_name): 49 | ans_list.append("%-40s %s" % (name, version)) 50 | explained_reachables.append(mod_name) 51 | for rname, rmod in reachable_modules.items(): 52 | if rname not in explained_reachables: 53 | if hasattr(rmod, '__version__'): 54 | version = rmod.__version__ 55 | ans_list.append("%-40s %s" % (rname, version)) 56 | # Sort and return a list 57 | ans_list = sorted([s for s in ans_list]) 58 | ans = "\n".join(ans_list) + "\n" 59 | return ans -------------------------------------------------------------------------------- /pc_toolbox/utils_snapshots/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/pc_toolbox/utils_snapshots/__init__.py -------------------------------------------------------------------------------- /pc_toolbox/utils_snapshots/snapshot_perf_metrics__binary_outcomes.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "name": "snapshot_perf_metrics_binary_outcomes", 4 | "resources": [ 5 | { 6 | "path": "snapshot_perf_metrics_$SPLIT_NAME.csv", 7 | "profile": "tabular-data-resource", 8 | "schema": { 9 | "fields": [ 10 | { 11 | "name": "JOB_PATH", 12 | "type": 
"string" 13 | }, 14 | { 15 | "name": "TASK_PATH", 16 | "type": "string" 17 | }, 18 | { 19 | "name": "TASKID", 20 | "type": "integer" 21 | }, 22 | { 23 | "name": "SPLIT_NAME", 24 | "type": "string" 25 | }, 26 | { 27 | "name": "N_STATES", 28 | "type": "integer" 29 | }, 30 | { 31 | "name": "WEIGHT_Y", 32 | "type": "number" 33 | }, 34 | { 35 | "name": "WEIGHT_X", 36 | "type": "number" 37 | }, 38 | { 39 | "name": "ALPHA", 40 | "type": "number" 41 | }, 42 | { 43 | "name": "LAMBDA_W", 44 | "type": "number" 45 | }, 46 | { 47 | "name": "TAU", 48 | "type": "number" 49 | }, 50 | { 51 | "name": "INIT_NAME", 52 | "type": "string" 53 | }, 54 | { 55 | "name": "ALG_NAME", 56 | "type": "string" 57 | }, 58 | { 59 | "name": "FRAC_LABELS", 60 | "type": "number" 61 | }, 62 | { 63 | "name": "N_BATCHES", 64 | "type": "integer" 65 | }, 66 | { 67 | "name": "STEP_SIZE", 68 | "type": "number" 69 | }, 70 | { 71 | "name": "DECAY_RATE", 72 | "type": "number" 73 | }, 74 | { 75 | "name": "LAP", 76 | "type": "number" 77 | }, 78 | { 79 | "name": "STEP", 80 | "type": "integer" 81 | }, 82 | { 83 | "name": "Y_ERROR_RATE", 84 | "type": "number" 85 | }, 86 | { 87 | "name": "Y_ROC_AUC", 88 | "type": "number" 89 | }, 90 | { 91 | "name": "LOGPDF_X_PERTOK", 92 | "type": "number" 93 | }, 94 | { 95 | "name": "LOGPDF_Y_PERDOC", 96 | "type": "number" 97 | }, 98 | { 99 | "name": "LOSSMAP_TTL_PERTOK", 100 | "type": "number" 101 | }, 102 | { 103 | "name": "LOSSMAP_X_PERTOK", 104 | "type": "number" 105 | }, 106 | { 107 | "name": "LOSSMAP_Y_PERTOK", 108 | "type": "number" 109 | }, 110 | { 111 | "name": "LOSSMAP_PI_PERTOK", 112 | "type": "number" 113 | }, 114 | { 115 | "name": "LOSSMAP_W_PERTOK", 116 | "type": "number" 117 | }, 118 | { 119 | "name": "LOSSMAP_TOPIC_PERTOK", 120 | "type": "number" 121 | } 122 | ] 123 | } 124 | } 125 | ] 126 | } -------------------------------------------------------------------------------- /pc_toolbox/utils_snapshots/utils_snapshots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | from collections import OrderedDict 6 | from pc_toolbox.model_slda.slda_utils__param_io_manager import load_topic_model_param_dict 7 | 8 | def load_param_dict_at_specific_snapshot( 9 | snapshot_path=None, 10 | task_path=None, 11 | lap=None, 12 | download_if_necessary=True, 13 | rsync_path=None, 14 | local_path=None, 15 | remote_path=None, 16 | snapshot_suffix='topic_model_snapshot/', 17 | w_txt_basename="w_CK.txt", 18 | add_bias_term_to_w_CK=0.0, 19 | **kwargs): 20 | if snapshot_path is None: 21 | snapshot_path = make_snapshot_path_for_lap( 22 | task_path, lap=lap, snapshot_suffix=snapshot_suffix) 23 | 24 | if not os.path.exists(snapshot_path) and download_if_necessary: 25 | download_snapshot(snapshot_path, rsync_path, local_path, remote_path) 26 | 27 | GP = load_topic_model_param_dict( 28 | snapshot_path=snapshot_path, 29 | w_txt_basename=w_txt_basename, 30 | add_bias_term_to_w_CK=add_bias_term_to_w_CK, 31 | ) 32 | return GP 33 | 34 | 35 | def download_snapshot( 36 | snapshot_path=None, 37 | rsync_path=None, 38 | local_path=None, 39 | remote_path=None, 40 | ): 41 | if local_path is None: 42 | try: 43 | local_path = os.environ['XHOST_LOCAL_PATH'] 44 | except KeyError: 45 | raise ValueError("Bad value for local_path: %s" % local_path) 46 | if remote_path is None: 47 | try: 48 | remote_path = os.environ['XHOST_REMOTE_PATH'] 49 | except KeyError: 50 | raise ValueError("Bad value for remote_path: %s" % remote_path) 51 | if rsync_path is None: 
52 | try: 53 | rsync_path = os.environ['XHOST_RSYNC_PATH'] 54 | except KeyError: 55 | rsync_path = os.path.expandvars("$PC_REPO_DIR/scripts/rsync_tools/") 56 | if not os.path.exists(rsync_path): 57 | raise ValueError("Bad value for rsync_path: %s" % rsync_path) 58 | 59 | local_path = os.path.abspath(local_path) 60 | remote_path = os.path.abspath(remote_path) 61 | if snapshot_path.count(local_path): 62 | local_snapshot_path = snapshot_path 63 | remote_snapshot_path = snapshot_path.replace(local_path, remote_path) 64 | elif snapshot_path.count(remote_path): 65 | remote_snapshot_path = snapshot_path 66 | local_snapshot_path = snapshot_path.replace(remote_path, local_path) 67 | else: 68 | raise ValueError("Bad snapshot_path:\n%s" % snapshot_path) 69 | 70 | old_path = os.getcwd() 71 | print("cd %s" % rsync_path) 72 | os.chdir(rsync_path) 73 | cmd_str = "bash rsync_specific_snapshot.sh %s" % remote_snapshot_path 74 | print("ATTEMPTING DOWNLOAD via CUSTOM RSYNC CMD:") 75 | print(cmd_str) 76 | ans = os.system(cmd_str) 77 | print("cd %s" % old_path) 78 | os.chdir(old_path) 79 | if int(str(ans)) != 0: 80 | raise ValueError("BAD DOWNLOAD: ANSWER CODE %s" % ans) 81 | return True 82 | 83 | def make_snapshot_path_for_lap( 84 | task_path=None, lap=None, snapshot_suffix='topic_model_snapshot/'): 85 | if isinstance(lap, float) or isinstance(lap, int): 86 | best_lap = float(lap) 87 | else: 88 | raise ValueError("Bad value for lap %s" % lap) 89 | snapshot_path = os.path.join( 90 | task_path, 91 | 'lap%011.3f_%s' % (best_lap, snapshot_suffix)) 92 | return snapshot_path 93 | -------------------------------------------------------------------------------- /pc_toolbox/utils_vizhtml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtak/prediction-constrained-topic-models/8a324c7048c95469df50314e376a1fbb3e764859/pc_toolbox/utils_vizhtml/__init__.py -------------------------------------------------------------------------------- /pc_toolbox/utils_vizhtml/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | $PAGE_TITLE 5 | 6 | 7 | 8 | 9 | 10 | 43 | 44 | 45 | 46 | 47 |
48 |
49 |
50 |
51 |
52 | $PAGE_CONTENT 53 | 54 |
55 |
56 |
57 |
58 |
59 | 60 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /pc_toolbox/utils_vizhtml/utils_top_words_html.py: -------------------------------------------------------------------------------- 1 | ''' 2 | utils_viz_top_words.py 3 | 4 | ''' 5 | import numpy as np 6 | import argparse 7 | import os 8 | import sys 9 | 10 | STYLE_HEADER_HTML_STR = """ 11 | 24 | """ 25 | 26 | 27 | def make_top_words_html_from_topics( 28 | topics_KV, 29 | vocab_list=None, 30 | order=None, 31 | uids_K=None, 32 | ncols=5, 33 | n_words_per_topic=10, 34 | max_topics_to_display=100, 35 | proba_fmt_str='%.4f', 36 | n_chars_per_word=30, 37 | show_longer_words_via_tooltip=0, 38 | xlabels=None, 39 | **kwargs): 40 | K, V = topics_KV.shape 41 | if order is None: 42 | order = np.arange(K) 43 | htmllines = list() 44 | htmllines.append(STYLE_HEADER_HTML_STR) 45 | htmllines.append('') 46 | for posID, k in enumerate(order[:max_topics_to_display]): 47 | if posID % ncols == 0: 48 | htmllines.append(' ') 49 | 50 | if uids_K is None: 51 | uid = k + 1 52 | else: 53 | uid = uids_K[k] 54 | #k = k[0] 55 | if xlabels is None: 56 | titleline = '

%4d/%d

' % ( 57 | uid, K) 58 | else: 59 | titleline = ( 60 | '

%4d/%d

' + 61 | '

%10s


') % ( 62 | uid, K, xlabels[k]) 63 | htmllines.append(' ') 85 | 86 | if posID % ncols == ncols - 1: 87 | htmllines.append(' ') 88 | htmllines.append('
' + titleline) 64 | htmllines.append(' ') 65 | 66 | # want to use fmtr like "%-20s" to force 20 chars of whitespace 67 | fixed_width_str__fmtr = "%" + "-" + str(n_chars_per_word) + "s" 68 | htmlPattern = \ 69 | '
' + proba_fmt_str + ' ' + \
70 |             '
' \
71 |             + fixed_width_str__fmtr + "
" 72 | topIDs = np.argsort(-1 * topics_KV[k])[:n_words_per_topic] 73 | for topID in topIDs: 74 | dataline = htmlPattern % ( 75 | topics_KV[k, topID], 76 | vocab_list[topID][:n_chars_per_word]) 77 | if show_longer_words_via_tooltip: 78 | if len(vocab_list[topID]) > n_chars_per_word: 79 | dataline = dataline.replace( 80 | '
',
81 |                         '
' % vocab_list[topID],                            
82 |                         )
83 |             htmllines.append(dataline + "
") 84 | htmllines.append('
') 89 | htmlstr = '\n'.join(htmllines) 90 | return htmlstr 91 | 92 | 93 | -------------------------------------------------------------------------------- /pc_toolbox/utils_vizhtml/utils_viz_topic_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from utils_top_words_html import make_top_words_html_from_topics 4 | 5 | def show_topics_and_weights( 6 | param_dict=None, 7 | topics_KV=None, 8 | w_CK=None, 9 | uids_K=None, 10 | sort_topics_by=None, 11 | vocab_list=None, 12 | max_topics_to_display=200, 13 | n_words_per_topic=10, 14 | n_chars_per_word=30, 15 | rank_words_by='proba_word_given_topic', 16 | y_ind=0, 17 | vmax=0.05, 18 | vmin=0.00, 19 | add_bias_term_to_w_CK=0.0, 20 | **viz_kwargs): 21 | """ Show topics and weights for specific sLDA param dict 22 | 23 | Returns 24 | ------- 25 | html_str : list of lines of html 26 | """ 27 | if param_dict is not None: 28 | topics_KV = param_dict['topics_KV'] 29 | w_CK = param_dict['w_CK'] 30 | assert topics_KV is not None 31 | assert w_CK is not None 32 | # Make local temp copies 33 | # so we can re-sort at will 34 | topics_KV = topics_KV.copy() 35 | w_c_K = w_CK[y_ind].copy() 36 | assert w_c_K.ndim == 1 37 | K = w_c_K.size 38 | 39 | if uids_K is None: 40 | uids_K = np.arange(K) 41 | 42 | if rank_words_by == 'proba_word_given_topic': 43 | topics_KV /= topics_KV.sum(axis=1)[:,np.newaxis] 44 | elif rank_words_by == 'proba_topic_given_word': 45 | topics_KV /= topics_KV.sum(axis=0) 46 | else: 47 | raise ValueError("Unrecognized rank_words_by: %s" % rank_words_by) 48 | 49 | ## Sort params if needed 50 | if sort_topics_by is not None: 51 | if sort_topics_by.count('w'): 52 | sort_ids = np.argsort(w_c_K) 53 | w_c_K = w_c_K[sort_ids] 54 | topics_KV = topics_KV[sort_ids] 55 | uids_K = uids_K[sort_ids] 56 | 57 | ## Prepare xlabels 58 | xlabels = ['% .1f' % a for a in w_c_K] 59 | 60 | ## Make plots 61 | if vocab_list is None: 62 | raise NotImplementedError("TODO make bars viz") 63 | else: 64 | return make_top_words_html_from_topics( 65 | topics_KV, 66 | vocab_list=vocab_list, 67 | xlabels=xlabels, 68 | uids_K=uids_K, 69 | n_words_per_topic=n_words_per_topic, 70 | n_chars_per_word=n_chars_per_word, 71 | max_topics_to_display=max_topics_to_display, 72 | **viz_kwargs) 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | psutil>=5.0 2 | Cython>=0.25 3 | numpy>=1.13 4 | pandas>=0.18 5 | scipy>=0.19 6 | scikit-learn>=0.18 7 | numexpr>=2.6 8 | autograd>=1.2 9 | dill>=0.2 10 | matplotlib>=2.0 11 | jupyter>=1.0 12 | -------------------------------------------------------------------------------- /scripts/install/create_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PC_ENV_NAME=pc_toolbox_env 4 | 5 | conda create --name $PC_ENV_NAME python=2.7 6 | 7 | source activate $PC_ENV_NAME 8 | 9 | # Read requirements and install one line at a time 10 | # For each req, we first try to do a conda install 11 | # Falling back on 'pip' if conda doesnt work 12 | while read requirement; do 13 | echo ">>> install $requirement START" 14 | conda install --yes $requirement || pip install $requirement; 15 | echo ">>> install $requirement DONE" 16 | done < requirements.txt 17 | 18 | 19 | -------------------------------------------------------------------------------- /scripts/install/install_tensorflow_linux.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install --ignore-installed --upgrade \ 4 | https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0-cp27-none-linux_x86_64.whl 5 | -------------------------------------------------------------------------------- /scripts/install/requirements.txt: -------------------------------------------------------------------------------- 1 | psutil>=5.0 2 | Cython>=0.25 3 | numpy>=1.13 4 | pandas>=0.18 5 | scipy>=0.19 6 | scikit-learn>=0.18 7 | numexpr>=2.6 8 | autograd>=1.2 9 | matplotlib>=2.0 10 | jupyter>=1.0 11 | -------------------------------------------------------------------------------- /scripts/launch_job_on_host_via_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Common launcher script for ALL experiments 4 | # 5 | # Each call to this script will launch separate "job" (locally or on grid) 6 | # Location of job depends on the value of XHOST environment variable 7 | 8 | if [[ -z $XHOST_RESULTS_DIR || ! -d $XHOST_RESULTS_DIR ]]; then 9 | echo "Env var XHOST_RESULTS_DIR must exist and point to valid directory; Try 'export XHOST_RESULTS_DIR=/tmp/'" 10 | exit 1; 11 | fi 12 | if [[ -z $PC_REPO_DIR || ! -d $PC_REPO_DIR ]]; then 13 | echo "Env var PC_REPO_DIR must exist and point to the directory where you've cloned the repo; Try 'export PC_REPO_DIR=/path/to/repo/'" 14 | exit 1; 15 | fi; 16 | 17 | 18 | 19 | if [[ $XHOST == 'list' || $XHOST == 'dry' ]]; then 20 | if [[ -z $target_names ]]; then 21 | echo $output_path 22 | else 23 | echo $target_names $output_path 24 | fi 25 | elif [[ $XHOST == 'grid' ]]; then 26 | # Launch each job on grid computing system (LSF/SLURM/SGE) 27 | launcher_exe=`python $PC_REPO_DIR/scripts/launcher_tools/detect_grid_executable.py` 28 | tmp_script_path=`python $PC_REPO_DIR/scripts/launcher_tools/make_launcher_script.py` 29 | CMD="$launcher_exe < $tmp_script_path" 30 | eval $CMD 31 | elif [[ $XHOST == 'local' ]]; then 32 | # Launch each job on local cpu (same process that called launch_job.sh) 33 | echo $output_path 34 | bash $XHOST_BASH_EXE 35 | exit 1 36 | elif [[ $XHOST == 'local_alltasks' ]]; then 37 | # Launch each job on local cpu (same process that called launch_job.sh) 38 | echo $output_path 39 | for XHOST_TASK_ID in `seq $XHOST_FIRSTTASK $XHOST_NTASKS` 40 | do 41 | echo ">>> task $XHOST_TASK_ID" 42 | export XHOST_TASK_ID=$XHOST_TASK_ID 43 | bash $XHOST_BASH_EXE 44 | done 45 | unset XHOST_TASK_ID 46 | else 47 | if [[ -z $XHOST ]]; then 48 | echo "ERROR: User did not define env variable: XHOST" 49 | else 50 | echo "ERROR: Unrecognized value for XHOST: $XHOST" 51 | fi 52 | echo "SUPPORTED OPTIONS:" 53 | echo "XHOST=list : list output_path for all tasks, then exit" 54 | echo "XHOST=local : run first task on current local machine" 55 | echo "XHOST=local_alltasks : run all tasks serially on current local machine" 56 | echo "XHOST=grid : run all tasks on available grid engine (SLURM/SGE/LSF)" 57 | exit 1 58 | fi 59 | -------------------------------------------------------------------------------- /scripts/launcher_tools/detect_grid_executable.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | 4 | import make_launcher_script as mls 5 | 6 | if __name__ == '__main__': 7 | ext_str = mls.detect_template_ext_for_current_system() 8 | 9 | if ext_str == 'sge': 10 | print('qsub') 11 | elif ext_str == 
'lsf': 12 | print('bsub') 13 | elif ext_str == 'slurm': 14 | print('sbatch') 15 | else: 16 | raise ValueError("Unrecognized extension: %s" % ext_str) 17 | -------------------------------------------------------------------------------- /scripts/launcher_tools/make_launcher_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | make_launcher_script.py 4 | 5 | User-Executable script that creates temp launcher script file 6 | 7 | Usage 8 | ----- 9 | $ python make_launcher_script.py 10 | 11 | """ 12 | import os 13 | import distutils.spawn 14 | import tempfile 15 | 16 | DEFAULT_KEYS = [ 17 | 'XHOST_JOB_NAME', 18 | 'XHOST_MACHINE_NAME', 19 | 'XHOST_LOG_DIR', 20 | 'XHOST_FIRSTTASK', 21 | 'XHOST_NTASKS', 22 | 'XHOST_MEM_MB', 23 | 'XHOST_SWP_MB', 24 | 'XHOST_TIME_HR', 25 | ] 26 | 27 | def set_default_environment(): 28 | if 'XHOST_BASH_EXE' not in os.environ: 29 | raise ValueError("Need to define env var: XHOST_BASH_EXE") 30 | assert os.path.exists(os.environ['XHOST_BASH_EXE']) 31 | 32 | if 'XHOST_LOG_DIR' not in os.environ: 33 | raise ValueError("Need to define env var: XHOST_LOG_DIR") 34 | assert os.path.exists(os.environ['XHOST_LOG_DIR']) 35 | 36 | if 'XHOST_NTASKS' not in os.environ: 37 | os.environ['XHOST_NTASKS'] = '1' 38 | if 'XHOST_FIRSTTASK' not in os.environ: 39 | os.environ['XHOST_FIRSTTASK'] = '1' 40 | if 'XHOST_MEM_MB' not in os.environ: 41 | os.environ['XHOST_MEM_MB'] = '5000' 42 | if 'XHOST_SWP_MB' not in os.environ: 43 | os.environ['XHOST_SWP_MB'] = '5000' 44 | if 'XHOST_MACHINE_NAME' not in os.environ: 45 | os.environ['XHOST_MACHINE_NAME'] = 'liv' 46 | if 'XHOST_JOB_NAME' not in os.environ: 47 | os.environ['XHOST_JOB_NAME'] = 'my_job' 48 | if 'XHOST_TIME_HR' not in os.environ: 49 | os.environ['XHOST_TIME_HR'] = '24' 50 | 51 | def detect_template_ext_for_current_system(): 52 | if distutils.spawn.find_executable("sacct"): 53 | return "slurm" 54 | elif distutils.spawn.find_executable("bjobs"): 55 | return "lsf" 56 | elif distutils.spawn.find_executable("qstat"): 57 | return "sge" 58 | raise ValueError("Unknown grid system") 59 | 60 | def make_launcher_script_file(): 61 | """ Create temporary file for launching job on grid system 62 | 63 | Post Condition 64 | -------------- 65 | Temporary file written to /tmp/ or similar via tempfile module 66 | 67 | Returns 68 | ------- 69 | fpath : string 70 | Valid path to temporary file 71 | """ 72 | 73 | ext_str = detect_template_ext_for_current_system() 74 | template_fpath = os.path.join( 75 | os.path.expandvars("$PC_REPO_DIR/scripts/launcher_tools/"), 76 | "template.%s" % ext_str) 77 | with open(template_fpath, "r") as f: 78 | template_lines = f.readlines() 79 | 80 | launcher_f = tempfile.NamedTemporaryFile( 81 | mode="w", 82 | prefix="launcher_for_%s_" % os.environ['USER'], 83 | suffix="." 
+ ext_str, 84 | delete=False) 85 | for line in template_lines: 86 | for key in DEFAULT_KEYS: 87 | line = line.replace("$" + key, os.environ[key]) 88 | line = line.replace( 89 | '$XHOST_BASH_EXE', 90 | os.path.abspath(os.environ['XHOST_BASH_EXE'])) 91 | launcher_f.write(line) 92 | launcher_f.close() 93 | return os.path.abspath(launcher_f.name) 94 | 95 | 96 | if __name__ == "__main__": 97 | set_default_environment() 98 | print(make_launcher_script_file()) 99 | -------------------------------------------------------------------------------- /scripts/launcher_tools/print_lowercase_env_vars_as_keyword_args.py: -------------------------------------------------------------------------------- 1 | """ 2 | Write all lower-case environment variables to stdout 3 | 4 | Examples 5 | -------- 6 | >>> os.environ['abc'] = '123' 7 | >>> os.environ['DO_NOT_PRINT'] = '456' 8 | >>> print_lowercase_env_vars_as_keyword_args() 9 | --abc 123 10 | 11 | """ 12 | 13 | from __future__ import print_function 14 | 15 | import os 16 | 17 | def print_lowercase_env_vars_as_keyword_args(): 18 | try: 19 | XHOST_PREFIXES_TO_SKIP = os.environ['XHOST_PREFIXES_TO_SKIP'].split(',') 20 | except KeyError: 21 | XHOST_PREFIXES_TO_SKIP = list() 22 | XHOST_PREFIXES_TO_SKIP = set(XHOST_PREFIXES_TO_SKIP) 23 | 24 | for key in sorted(os.environ.keys()): 25 | if key[0].islower(): 26 | val = os.environ[key] 27 | # Manually remove some unnecessary env vars 28 | if key in XHOST_PREFIXES_TO_SKIP: 29 | continue 30 | 31 | if key == 'extra_kwargs_str': 32 | # Handle args like "name1,val1,name2,val2" 33 | kkeys = val.split(",")[::2] 34 | kvals = val.split(",")[1::2] 35 | for k, v in zip(kkeys,kvals): 36 | if not k.startswith("--"): 37 | k = "--" + k 38 | print("%s %s" % (k, v)) 39 | continue 40 | 41 | # handle negative nums as values 42 | if val.startswith("-") and val[1].isdigit(): 43 | val = '" %s"' % (val) 44 | # handle paths with ( or ) in them 45 | if val.count("("): 46 | # wrap parentheses with double quotes 47 | val = '"%s"' % (val) 48 | if val.count(" ") and not (val.startswith('"') and val.endswith('"')): 49 | val = '"%s"' % (val) 50 | 51 | #else: 52 | # assert val.count(" ") == 0 53 | 54 | assert key.count(" ") == 0 55 | print("--%s %s" % (key, val)) 56 | 57 | if __name__ == '__main__': 58 | print_lowercase_env_vars_as_keyword_args() 59 | -------------------------------------------------------------------------------- /scripts/launcher_tools/template.lsf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #BSUB -J $XHOST_JOB_NAME[$XHOST_FIRSTTASK-$XHOST_NTASKS] 3 | #BSUB -o $XHOST_LOG_DIR/%J.%I.out 4 | #BSUB -e $XHOST_LOG_DIR/%J.%I.err 5 | #BSUB -R "rusage[mem=$XHOST_MEM_MB,swp=$XHOST_SWP_MB]" 6 | #BSUB -q $XHOST_MACHINE_NAME 7 | 8 | bash $XHOST_BASH_EXE 9 | 10 | -------------------------------------------------------------------------------- /scripts/launcher_tools/template.sge: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #$ -cwd 3 | #$ -S /bin/bash 4 | #$ -o $XHOST_LOG_DIR/$JOB_ID.$TASK_ID.out 5 | #$ -e $XHOST_LOG_DIR/$JOB_ID.$TASK_ID.err 6 | #$ -V 7 | #$ -t $XHOST_FIRSTTASK-$XHOST_NTASKS 8 | #$ -q '*@@$XHOST_MACHINE_NAME' 9 | #$ -l vf=$XHOST_MEM_MBM 10 | #$ -l h_rt=$XHOST_TIME_HR:00:00 11 | 12 | bash $XHOST_BASH_EXE 13 | -------------------------------------------------------------------------------- /scripts/launcher_tools/template.slurm: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH -n 1 # Number of cores 3 | #SBATCH -t 0-$XHOST_TIME_HR:00 # Runtime in D-HH:MM 4 | #SBATCH -p $XHOST_MACHINE_NAME # Partition to submit to 5 | #SBATCH --mem-per-cpu $XHOST_MEM_MB # Memory (in MB) per cpu 6 | #SBATCH -o $XHOST_LOG_DIR/%A.%a.out 7 | #SBATCH -e $XHOST_LOG_DIR/%A.%a.err 8 | #SBATCH --array=$XHOST_FIRSTTASK-$XHOST_NTASKS 9 | #SBATCH --export=ALL 10 | 11 | bash $XHOST_BASH_EXE 12 | -------------------------------------------------------------------------------- /scripts/movie_reviews/quicktest_topic_models/pcslda_ag_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=quicktest 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=movie_reviews_pang_lee 9 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=5338 11 | export n_train_docs=4004 12 | 13 | export n_batches=5 14 | 15 | # =============================== OUTPUT SETTINGS 16 | export param_output_fmt="topic_model_snapshot" 17 | export n_steps_between_save=10 18 | export n_steps_between_print=10 19 | export n_seconds_between_save=-1 20 | export n_seconds_between_print=-1 21 | export n_steps_to_print_early=2 22 | export n_steps_to_save_early=2 23 | export laps_to_save_custom='0,1,2,4,6,8,10' 24 | 25 | # =============================== ALGO SETTINGS 26 | export n_laps=2 27 | 28 | ## Overall training: ADAM 29 | export alg_name="grad_descent_minimizer" 30 | export step_direction='adam' 31 | export decay_staircase=0 32 | export decay_interval=1 33 | export decay_rate=0.997 34 | for step_size in 0.0333 #0.1000 0.3333 35 | do 36 | export step_size=$step_size 37 | 38 | 39 | ## Per-doc inference settings 40 | # Quicktest goal is just to make sure it all runs 41 | # so do very few per-doc exponentiated gradient iterations 42 | export pi_max_iters=10 43 | # Set step-size of the exponentiated gradient algorithm 44 | export pi_step_size=0.05 45 | # Try to make early iterations faster by doing less per-doc work 46 | # Will gradually ramp up from ___ to 10 per-doc iterations 47 | for pi_max_iters_first_train_lap in 4 10 48 | do 49 | export pi_max_iters_first_train_lap=$pi_max_iters_first_train_lap 50 | 51 | 52 | # =============================== INIT SETTINGS 53 | export init_model_path=none 54 | for init_name in rand_smooth 55 | do 56 | export init_name=$init_name 57 | 58 | # =============================== MODEL HYPERS 59 | export alpha=1.100 60 | export tau=1.100 61 | export lambda_w=0.001 62 | 63 | export weight_x=1.0 64 | 65 | ## Loop over weights to place on log p(y|x) 66 | for weight_y in 10.0 02.0 01.0 67 | do 68 | export weight_y=$weight_y 69 | 70 | for n_states in 004 71 | do 72 | export n_states=$n_states 73 | 74 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-init_name=$init_name-alg_name=$alg_name-weight_x=$weight_x-weight_y=$weight_y-step_size=$step_size-pi_iters_first_lap=$pi_max_iters_first_train_lap/1/" 75 | 76 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 77 | 78 | done 79 | done 80 | done 81 | done 82 | done 83 | 
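A minimal sketch of how one of these quicktest scripts can be launched on a local machine, following the XHOST conventions checked by scripts/launch_job_on_host_via_env.sh above; the two directory paths below are placeholders for your own clone and scratch space, not paths shipped with the repo.

# Hypothetical local run of the autograd quicktest; adjust the placeholder paths first.
export PC_REPO_DIR=/path/to/prediction-constrained-topic-models   # local clone of this repo
export XHOST_RESULTS_DIR=/tmp/results/                            # must already exist
export XHOST=local                                                # run just the first task on this machine
bash $PC_REPO_DIR/scripts/movie_reviews/quicktest_topic_models/pcslda_ag_adam_fromscratch.sh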
-------------------------------------------------------------------------------- /scripts/movie_reviews/quicktest_topic_models/pcslda_tf_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=quicktest 4 | 5 | export lossandgrad_mod_name="slda_loss__tensorflow" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=movie_reviews_pang_lee 9 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=5338 11 | export n_train_docs=4004 12 | 13 | export n_batches=5 14 | 15 | # =============================== OUTPUT SETTINGS 16 | export param_output_fmt="topic_model_snapshot" 17 | export n_steps_between_save=10 18 | export n_steps_between_print=10 19 | export n_seconds_between_save=-1 20 | export n_seconds_between_print=-1 21 | export n_steps_to_print_early=2 22 | export n_steps_to_save_early=2 23 | export laps_to_save_custom='0,1,2,4,6,8,10' 24 | 25 | # =============================== ALGO SETTINGS 26 | export n_laps=2 27 | 28 | ## Overall training: ADAM 29 | export alg_name="grad_descent_minimizer" 30 | export step_direction='adam' 31 | export decay_staircase=0 32 | export decay_interval=1 33 | export decay_rate=0.997 34 | for step_size in 0.0333 #0.1000 0.3333 35 | do 36 | export step_size=$step_size 37 | 38 | ## Per-doc inference settings 39 | # Quicktest goal is just to make sure it all runs 40 | # so do very few per-doc exponentiated gradient iterations 41 | export pi_max_iters=10 42 | # Set step-size of the exponentiated gradient algorithm 43 | export pi_step_size=0.05 44 | # Try to make early iterations faster by doing less per-doc work 45 | # Will gradually ramp up from ___ to 10 per-doc iterations 46 | for pi_max_iters_first_train_lap in 4 10 47 | do 48 | export pi_max_iters_first_train_lap=$pi_max_iters_first_train_lap 49 | 50 | # =============================== INIT SETTINGS 51 | export init_model_path=none 52 | for init_name in rand_smooth 53 | do 54 | export init_name=$init_name 55 | 56 | # =============================== MODEL HYPERS 57 | export alpha=1.100 58 | export tau=1.100 59 | export lambda_w=0.001 60 | 61 | export weight_x=1.0 62 | 63 | ## Loop over weights to place on log p(y|x) 64 | for weight_y in 10.0 02.0 01.0 65 | do 66 | export weight_y=$weight_y 67 | 68 | for n_states in 004 69 | do 70 | export n_states=$n_states 71 | 72 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-init_name=$init_name-alg_name=$alg_name-weight_x=$weight_x-weight_y=$weight_y-step_size=$step_size-pi_iters_first_lap=$pi_max_iters_first_train_lap/1/" 73 | 74 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 75 | 76 | done 77 | done 78 | done 79 | done 80 | done 81 | -------------------------------------------------------------------------------- /scripts/movie_reviews/train_base_classifiers/train_baseline_classifiers.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_clf.sh 3 | nickname="20180301" 4 | 5 | # =============================== DATA SETTINGS 6 | export dataset_name=movie_reviews_pang_lee 7 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 8 | 9 | export feature_arr_names=X 10 | export target_arr_name=Y 11 | 12 | 
all_target_names=`cat "$dataset_path/Y_colnames.txt"` 13 | 14 | for classifier_name in logistic_regression extra_trees 15 | do 16 | 17 | for target_name in $all_target_names 18 | do 19 | export target_names=$target_name 20 | export classifier_name=$classifier_name 21 | export class_weight_opts='none' 22 | export preproc_X='none' 23 | export c_logspace_arg_str=" -6,6,13" 24 | export max_grid_search_steps=13 25 | 26 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-classifier_name=$classifier_name/1/" 27 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 28 | 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /scripts/movie_reviews/train_topic_models/make_html_viz_for_best_snapshots.sh: -------------------------------------------------------------------------------- 1 | 2 | pushd $PC_REPO_DIR/pc_toolbox/utils_vizhtml/ 3 | 4 | template_html='
TRAIN AUC=$TRAIN_Y_ROC_AUC
VALID AUC=$VALID_Y_ROC_AUC
TEST AUC=$TEST_Y_ROC_AUC
' 5 | 6 | for rank_words_by in 'proba_word_given_topic' 'proba_topic_given_word' 7 | do 8 | python make_html_collection_from_csv.py \ 9 | --snapshot_csv_path $XHOST_LOCAL_PATH/best_runs_20180301_pcslda_tensorflow/best_snapshots_PC_sLDA.csv \ 10 | --html_output_path /tmp/movie_reviews_html/rank_words_by="$rank_words_by"/ \ 11 | --field_order LEGEND_NAME,LABEL_NAME,N_STATES,WEIGHT_Y \ 12 | --ncols 4 \ 13 | --n_chars_per_word 20 \ 14 | --n_words_per_topic 15 \ 15 | --rank_words_by $rank_words_by \ 16 | --show_longer_words_via_tooltip 1 \ 17 | --metrics_template_html "$template_html" \ 18 | 19 | done 20 | 21 | 22 | popd -------------------------------------------------------------------------------- /scripts/movie_reviews/train_topic_models/pcslda_ag_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=3 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=20180301 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=movie_reviews_pang_lee 9 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=5338 11 | export n_train_docs=4004 12 | export frac_labels_train=1.000 13 | 14 | for n_batches in 1 5 15 | do 16 | export n_batches=$n_batches 17 | 18 | # =============================== OUTPUT SETTINGS 19 | export param_output_fmt="topic_model_snapshot" 20 | n_laps_btw=25 21 | n_steps_btw=`python -c "print $n_batches * $n_laps_btw"` 22 | export n_steps_between_save=$n_steps_btw 23 | export n_steps_between_print=$n_steps_btw 24 | export n_steps_to_print_early=2 25 | export n_steps_to_save_early=2 26 | export laps_to_save_custom='0,1,2,5,10,15,20' 27 | 28 | # =============================== ALGO SETTINGS 29 | export n_laps=500 30 | 31 | ## Overall training: ADAM 32 | export alg_name="grad_descent_minimizer" 33 | export step_direction='adam' 34 | export decay_staircase=0 35 | export decay_interval=1 36 | export decay_rate=0.997 37 | for step_size in 0.0333 0.3333 38 | do 39 | export step_size=$step_size 40 | 41 | ## Per-doc inference settings 42 | export pi_max_iters=100 43 | export pi_step_size=0.05 44 | # Try to make early iterations faster by doing less per-doc work 45 | # Will gradually ramp up from ___ to pi_max_iters 46 | for pi_max_iters_first_train_lap in 010 100 47 | do 48 | export pi_max_iters_first_train_lap=$pi_max_iters_first_train_lap 49 | 50 | # =============================== INIT SETTINGS 51 | export init_model_path=none 52 | for init_name in rand_smooth 53 | do 54 | export init_name=$init_name 55 | 56 | # =============================== MODEL HYPERS 57 | export alpha=1.100 58 | export tau=1.100 59 | export lambda_w=0.001 60 | 61 | export weight_x=1.0 62 | 63 | ## Loop over weights to place on log p(y|x) 64 | for weight_y in 100.0 010.0 001.0 #000.0 65 | do 66 | export weight_y=$weight_y 67 | 68 | ## Loop over number of topics K 69 | for n_states in 010 025 050 70 | do 71 | export n_states=$n_states 72 | 73 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-weight_x=$weight_x-weight_y=$weight_y-init_name=$init_name-alg_name=$step_direction-step_size=$step_size-pi_iters_first_lap=$pi_max_iters_first_train_lap/1/" 74 | 75 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 76 | 77 | done 78 | done 79 | done 80 | done 81 | done 82 | 
-------------------------------------------------------------------------------- /scripts/movie_reviews/train_topic_models/pcslda_tf_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=3 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=20180301 4 | 5 | export lossandgrad_mod_name="slda_loss__tensorflow" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=movie_reviews_pang_lee 9 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=5338 11 | export n_train_docs=4004 12 | export frac_labels_train=1.000 13 | 14 | for n_batches in 1 5 15 | do 16 | export n_batches=$n_batches 17 | 18 | # =============================== OUTPUT SETTINGS 19 | export param_output_fmt="topic_model_snapshot" 20 | n_laps_btw=25 21 | n_steps_btw=`python -c "print $n_batches * $n_laps_btw"` 22 | export n_steps_between_save=$n_steps_btw 23 | export n_steps_between_print=$n_steps_btw 24 | export n_steps_to_print_early=2 25 | export n_steps_to_save_early=2 26 | export laps_to_save_custom='0,1,2,5,10,15,20' 27 | 28 | # =============================== ALGO SETTINGS 29 | export n_laps=500 30 | 31 | ## Overall training: ADAM 32 | export alg_name="grad_descent_minimizer" 33 | export step_direction='adam' 34 | export decay_staircase=0 35 | export decay_interval=1 36 | export decay_rate=0.997 37 | for step_size in 0.0333 0.3333 38 | do 39 | export step_size=$step_size 40 | 41 | ## Per-doc inference settings 42 | export pi_max_iters=100 43 | export pi_step_size=0.05 44 | # Try to make early iterations faster by doing less per-doc work 45 | # Will gradually ramp up from ___ to pi_max_iters 46 | for pi_max_iters_first_train_lap in 010 100 47 | do 48 | export pi_max_iters_first_train_lap=$pi_max_iters_first_train_lap 49 | 50 | # =============================== INIT SETTINGS 51 | export init_model_path=none 52 | for init_name in rand_smooth 53 | do 54 | export init_name=$init_name 55 | 56 | # =============================== MODEL HYPERS 57 | export alpha=1.100 58 | export tau=1.100 59 | export lambda_w=0.001 60 | 61 | export weight_x=1.0 62 | 63 | ## Loop over weights to place on log p(y|x) 64 | for weight_y in 100.0 010.0 001.0 #000.0 65 | do 66 | export weight_y=$weight_y 67 | 68 | ## Loop over number of topics K 69 | for n_states in 025 050 70 | do 71 | export n_states=$n_states 72 | 73 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-weight_x=$weight_x-weight_y=$weight_y-init_name=$init_name-alg_name=$step_direction-step_size=$step_size-pi_iters_first_lap=$pi_max_iters_first_train_lap/1/" 74 | 75 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 76 | 77 | done 78 | done 79 | done 80 | done 81 | done 82 | done 83 | -------------------------------------------------------------------------------- /scripts/movie_reviews/train_topic_models/select_best_snapshots.sh: -------------------------------------------------------------------------------- 1 | dataset_subpath="movie_reviews_pang_lee/" 2 | dataset_path="$PC_REPO_DIR/datasets/$dataset_subpath" 3 | export XHOST_SSH_ADDR=mchughes@browncs 4 | export XHOST_REMOTE_PATH="/nbu/liv/mhughes/slda_results/$dataset_subpath/" 5 | export XHOST_LOCAL_PATH="/results/$dataset_subpath/" 6 | 7 | results_path_pattern_01="$XHOST_LOCAL_PATH/20180301*tensorflow*" 8 | 9 | 
output_path="$XHOST_LOCAL_PATH/best_runs_20180301_pcslda_tensorflow/" 10 | 11 | python $PC_REPO_DIR/pc_toolbox/utils_snapshots/select_best_runs_and_snapshots.py \ 12 | --output_path $output_path \ 13 | --legend_name PC_sLDA \ 14 | --results_path_patterns "$results_path_pattern_01" \ 15 | --txt_src_path $dataset_path \ 16 | --target_y_name more_than_2_out_of_4_stars \ 17 | --all_y_names more_than_2_out_of_4_stars \ 18 | --selection_score_colname Y_ROC_AUC \ 19 | --selection_score_ranking_func argmax \ 20 | --col_names_to_use_at_selection N_STATES,WEIGHT_Y \ 21 | --col_names_to_keep_per_split \ 22 | Y_ROC_AUC,Y_ERROR_RATE,LOGPDF_X_PERTOK,LOGPDF_Y_PERDOC \ 23 | --col_names_to_keep \ 24 | ALPHA,TAU,LAMBDA_W \ -------------------------------------------------------------------------------- /scripts/product_reviews/train_base_classifiers/train_baseline_classifiers.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_clf.sh 3 | nickname="20180301" 4 | 5 | # =============================== DATA SETTINGS 6 | export dataset_name=multi_domain_product_reviews_dataset/clean_data_v20180403/ 7 | export dataset_path="$HOME/git/$dataset_name/" 8 | 9 | export feature_arr_names=X 10 | export target_arr_name=Y 11 | 12 | all_target_names=`cat "$dataset_path/Y_colnames.txt"` 13 | 14 | for classifier_name in logistic_regression extra_trees 15 | do 16 | 17 | for target_name in $all_target_names 18 | do 19 | export target_names=$target_name 20 | export classifier_name=$classifier_name 21 | export class_weight_opts='none' 22 | export preproc_X='none' 23 | export c_logspace_arg_str=" -6,6,13" 24 | export max_grid_search_steps=13 25 | 26 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-classifier_name=$classifier_name/1/" 27 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 28 | 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /scripts/product_reviews/train_base_classifiers/train_baseline_rf.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_clf.sh 3 | nickname="20180301" 4 | 5 | # =============================== DATA SETTINGS 6 | export dataset_name=multi_domain_product_reviews_dataset/clean_data_v20180403/ 7 | export dataset_path="$HOME/git/$dataset_name/" 8 | 9 | export feature_arr_names=X 10 | export target_arr_name=Y 11 | 12 | all_target_names=`cat "$dataset_path/Y_colnames.txt"` 13 | 14 | for classifier_name in extra_trees 15 | do 16 | 17 | for target_name in $all_target_names 18 | do 19 | export target_names=$target_name 20 | export classifier_name=$classifier_name 21 | export class_weight_opts='none' 22 | export preproc_X='none' 23 | export max_grid_search_steps=5 24 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-classifier_name=$classifier_name/1/" 25 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 26 | 27 | done 28 | done 29 | -------------------------------------------------------------------------------- /scripts/product_reviews/train_topic_models/make_html_viz_for_best_snapshots.sh: -------------------------------------------------------------------------------- 1 | dataset_subpath="multi_domain_product_reviews_dataset/clean_data_v20180403/" 2 | dataset_path="$HOME/git/$dataset_subpath/" 3 | echo "dataset_path:" 4 | echo "$dataset_path" 5 | 6 | 
export XHOST_SSH_ADDR=mchughes@browncs 7 | export XHOST_REMOTE_PATH="/nbu/liv/mhughes/slda_results/$dataset_subpath/" 8 | export XHOST_LOCAL_PATH="/results/$dataset_subpath/" 9 | 10 | pushd $PC_REPO_DIR/pc_toolbox/utils_vizhtml/ 11 | 12 | template_html='
TRAIN AUC=$TRAIN_Y_ROC_AUC
VALID AUC=$VALID_Y_ROC_AUC
TEST AUC=$TEST_Y_ROC_AUC
' 13 | 14 | for rank_words_by in 'proba_word_given_topic' 'proba_topic_given_word' 15 | do 16 | python make_html_collection_from_csv.py \ 17 | --snapshot_csv_path $XHOST_LOCAL_PATH/best_runs_20180301_pcslda_tensorflow/best_snapshots_PC_sLDA.csv \ 18 | --html_output_path /tmp/product_reviews_html/rank_words_by="$rank_words_by"/ \ 19 | --field_order LEGEND_NAME,LABEL_NAME,N_STATES,WEIGHT_Y \ 20 | --ncols 4 \ 21 | --n_chars_per_word 20 \ 22 | --n_words_per_topic 15 \ 23 | --rank_words_by $rank_words_by \ 24 | --show_longer_words_via_tooltip 1 \ 25 | --metrics_template_html "$template_html" \ 26 | 27 | done 28 | 29 | 30 | popd 31 | -------------------------------------------------------------------------------- /scripts/product_reviews/train_topic_models/rsync_snapshot_perf_csv.sh: -------------------------------------------------------------------------------- 1 | dataset_subpath="multi_domain_product_reviews_dataset/clean_data_v20180403/" 2 | dataset_path="$HOME/git/$dataset_subpath/" 3 | echo "dataset_path:" 4 | echo "$dataset_path" 5 | 6 | export XHOST_SSH_ADDR=mchughes@browncs 7 | export XHOST_REMOTE_PATH="/nbu/liv/mhughes/slda_results/$dataset_subpath/" 8 | export XHOST_LOCAL_PATH="/results/$dataset_subpath/" 9 | 10 | bash $PC_REPO_DIR/scripts/rsync_tools/rsync_snapshot_perf_metrics.sh 11 | 12 | -------------------------------------------------------------------------------- /scripts/product_reviews/train_topic_models/select_best_snapshots.sh: -------------------------------------------------------------------------------- 1 | dataset_subpath="multi_domain_product_reviews_dataset/clean_data_v20180403/" 2 | dataset_path="$HOME/git/$dataset_subpath/" 3 | echo "dataset_path:" 4 | echo "$dataset_path" 5 | 6 | export XHOST_SSH_ADDR=mchughes@browncs 7 | export XHOST_REMOTE_PATH="/nbu/liv/mhughes/slda_results/$dataset_subpath/" 8 | export XHOST_LOCAL_PATH="/results/$dataset_subpath/" 9 | 10 | y_colnames=`cat $dataset_path/Y_colnames.txt` 11 | echo "target_y_name: $y_colnames" 12 | 13 | results_path_pattern_01="$XHOST_LOCAL_PATH/20180301*tensorflow*" 14 | output_path="$XHOST_LOCAL_PATH/best_runs_20180301_pcslda_tensorflow/" 15 | 16 | python $PC_REPO_DIR/pc_toolbox/utils_snapshots/select_best_runs_and_snapshots.py \ 17 | --output_path $output_path \ 18 | --legend_name PC_sLDA \ 19 | --results_path_patterns "$results_path_pattern_01" \ 20 | --txt_src_path $dataset_path \ 21 | --target_y_name $y_colnames \ 22 | --all_y_names $y_colnames \ 23 | --selection_score_colname Y_ROC_AUC \ 24 | --selection_score_ranking_func argmax \ 25 | --col_names_to_use_at_selection N_STATES,WEIGHT_Y \ 26 | --col_names_to_keep_per_split \ 27 | Y_ROC_AUC,Y_ERROR_RATE,LOGPDF_X_PERTOK,LOGPDF_Y_PERDOC \ 28 | --col_names_to_keep \ 29 | ALPHA,TAU,LAMBDA_W,N_BATCHES \ 30 | -------------------------------------------------------------------------------- /scripts/rsync_tools/README.md: -------------------------------------------------------------------------------- 1 | # Steps to download training results to local computer and make plots 2 | 3 | ## 1) On remote machines, do intensive training 4 | 5 | Typically, you'll run some pc_toolbox training algos on big datasets for several hours or days. 6 | 7 | These will dump results onto disk on the remote file system, in a path like 8 | 9 | XHOST_REMOTE_PATH/20180301-/1/snapshot_perf_metrics_train.csv 10 | 11 | We'll use `XHOST_REMOTE_PATH` to denote the value of XHOST_RESULTS_DIR on the remote system. 
12 | 13 | ## 2) On local machine, run "rsync_snapshot_perf_metrics.sh" to grab the results 14 | 15 | ``` 16 | $ cd $PC_REPO_DIR/scripts/rsync_tools/ 17 | 18 | # SET UP YOUR SSH INFO 19 | $ export XHOST_SSH_ADDR=<username>@<hostname> 20 | 21 | # SET REMOTE PATH TO DIRECTORY THAT CONTAINS FILES FOR DATASET OF INTEREST 22 | $ export XHOST_REMOTE_PATH=/remote_path/to/results/ 23 | 24 | # SET LOCAL PATH ON YOUR MACHINE 25 | $ export XHOST_LOCAL_PATH=/local_path/to/results/ 26 | 27 | # RUN RSYNC SCRIPT 28 | $ bash rsync_snapshot_perf_metrics.sh 29 | ``` 30 | 31 | #### Expected output: 32 | 33 | ``` 34 | $ bash rsync_snapshot_perf_metrics.sh 35 | receiving file list ... 36 | 24 files to consider 37 | 38 | ``` 39 | 40 | ## 3) On local machine, run jupyter notebook to plot results 41 | 42 | #### **in bash** 43 | $ cd /path/to/notebooks/ 44 | $ jupyter notebook 45 | 46 | #### **in jupyter notebook** 47 | 48 | 49 | -------------------------------------------------------------------------------- /scripts/rsync_tools/rsync_snapshot_perf_metrics.sh: -------------------------------------------------------------------------------- 1 | if [[ -z $XHOST_SSH_ADDR ]]; then 2 | echo "ERROR: Define \$XHOST_SSH_ADDR=<username>@<hostname>" 3 | exit; 4 | fi 5 | 6 | if [[ -z $XHOST_REMOTE_PATH ]]; then 7 | XHOST_REMOTE_PATH=/nbu/liv/mhughes/public_results/toy_bars_3x3/ 8 | fi 9 | 10 | if [[ -z $XHOST_LOCAL_PATH ]]; then 11 | XHOST_LOCAL_PATH=/tmp/toy_bars_3x3/ 12 | fi 13 | mkdir -p $XHOST_LOCAL_PATH 14 | 15 | # Ask rsync to copy all files from remote to local 16 | # which match the specific .csv file template 17 | 18 | rsync -armPKL \ 19 | --include="/*/*/snapshot_perf_metrics_*.csv" \ 20 | --exclude="/*/*/*" \ 21 | $XHOST_SSH_ADDR:$XHOST_REMOTE_PATH \ 22 | $XHOST_LOCAL_PATH/ 23 | 24 | -------------------------------------------------------------------------------- /scripts/rsync_tools/rsync_specific_snapshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # rsync_specific_snapshot.sh : Remote sync model snapshots. 4 | # 5 | # Usage 6 | # ----- 7 | # $ bash rsync_specific_snapshot.sh <remote_snapshot_path> 8 | # 9 | # Will rsync all content of that remote path to provided XHOST_LOCAL_PATH.
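#
# Hypothetical example (illustrative paths only; assumes XHOST_REMOTE_PATH and
# XHOST_LOCAL_PATH are exported as in the select_best_snapshots.sh scripts):
#   $ bash rsync_specific_snapshot.sh /nbu/liv/mhughes/slda_results/toy_bars_3x3/20180301-example_run/1/
# which would copy that run's snapshot files into the matching folder under XHOST_LOCAL_PATH.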
10 | 11 | # Parse REMOTE_SNAPSHOT_PATH 12 | if [[ -z $1 ]]; then 13 | echo "ERROR: Missing remote snapshot path arg;" 14 | exit; 15 | fi 16 | REMOTE_SNAPSHOT_PATH=$1 17 | 18 | if [[ -z $XHOST_SSH_ADDR ]]; then 19 | XHOST_SSH_ADDR=mhughes@ssh.cs.brown.edu 20 | fi 21 | 22 | if [[ -z $XHOST_REMOTE_PATH ]]; then 23 | XHOST_REMOTE_PATH=/nbu/liv/mhughes/public_results/bow_toy_letters/ 24 | fi 25 | XHOST_REMOTE_PATH=`python -c "import os; print '$XHOST_REMOTE_PATH'.rstrip(os.path.sep) + '/'"` 26 | 27 | if [[ -z $XHOST_LOCAL_PATH ]]; then 28 | XHOST_LOCAL_PATH=/tmp/bow_toy_letters/ 29 | fi 30 | XHOST_LOCAL_PATH=`python -c "import os; print '$XHOST_LOCAL_PATH'.rstrip(os.path.sep) + '/'"` 31 | 32 | # Force REMOTE_SNAPSHOT_PATH to look like directory, if desired 33 | REMOTE_SNAPSHOT_PATH=`python -c "print '$REMOTE_SNAPSHOT_PATH'.replace('$XHOST_LOCAL_PATH', '$XHOST_REMOTE_PATH')"` 34 | IS_DIR=`python -c "print '$REMOTE_SNAPSHOT_PATH'.endswith('/')"` 35 | if [[ $IS_DIR == 'True' ]]; then 36 | REMOTE_SNAPSHOT_PATH=`python -c "import os; print '$REMOTE_SNAPSHOT_PATH'.rstrip(os.path.sep) + '/'"` 37 | fi 38 | 39 | echo "START rsync_specific_snapshot.sh" 40 | echo ">>> IS_DIR=$IS_DIR" 41 | echo ">>> XHOST_SSH_ADDR=$XHOST_SSH_ADDR" 42 | echo ">>> XHOST_REMOTE_PATH=$XHOST_REMOTE_PATH" 43 | echo ">>> XHOST_LOCAL_PATH=$XHOST_LOCAL_PATH" 44 | 45 | if [[ -z $2 ]]; then 46 | LOCAL_SNAPSHOT_PATH=`python -c "print '$REMOTE_SNAPSHOT_PATH'.replace('$XHOST_REMOTE_PATH', '$XHOST_LOCAL_PATH')"` 47 | else 48 | LOCAL_SNAPSHOT_PATH=$2; 49 | fi 50 | 51 | # Copy any files in the provided snapshot folder 52 | # to the local snapshot folder 53 | if [[ $IS_DIR == 'True' ]]; then 54 | # This branch needs the trailing / 55 | echo ">>> REMOTE_SNAPSHOT_PATH=$REMOTE_SNAPSHOT_PATH" 56 | echo ">>> LOCAL_SNAPSHOT_PATH=$LOCAL_SNAPSHOT_PATH" 57 | 58 | # Avoid "yes/no" question to "are you sure you trust...?" 59 | # -e "ssh -o StrictHostKeyChecking=no" \ 60 | rsync -armPKL \ 61 | $XHOST_SSH_ADDR:$REMOTE_SNAPSHOT_PATH/ \ 62 | $LOCAL_SNAPSHOT_PATH/ 63 | 64 | else 65 | BASENAME=`python -c "import os; print os.path.split('$REMOTE_SNAPSHOT_PATH')[-1]"` 66 | REMOTE_SNAPSHOT_PATH=`python -c "import os; print os.path.split('$REMOTE_SNAPSHOT_PATH')[0]"` 67 | 68 | LOCAL_SNAPSHOT_PATH=`python -c "print '$REMOTE_SNAPSHOT_PATH'.replace('$XHOST_REMOTE_PATH', '$XHOST_LOCAL_PATH')"` 69 | 70 | echo $REMOTE_SNAPSHOT_PATH/ 71 | echo $LOCAL_SNAPSHOT_PATH/ 72 | echo $BASENAME 73 | 74 | scp $XHOST_SSH_ADDR:$REMOTE_SNAPSHOT_PATH/$BASENAME $LOCAL_SNAPSHOT_PATH/ 75 | #rsync -armPKL \ 76 | # $XHOST_SSH_ADDR:$REMOTE_SNAPSHOT_PATH/ \ 77 | # $LOCAL_SNAPSHOT_PATH/ \ 78 | # --include='$BASENAME' \ 79 | # --exclude='*' \ 80 | 81 | fi 82 | 83 | 84 | -------------------------------------------------------------------------------- /scripts/setup_train_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "---------- START $0" 4 | 5 | if [[ -z $PC_REPO_DIR ]]; then 6 | echo "Error: Need to define PC_REPO_DIR." 1>&2; 7 | exit; 8 | fi 9 | echo "PC_REPO_DIR=$PC_REPO_DIR" 10 | 11 | if [[ -z $output_path ]]; then 12 | echo "Error: Need to define output_path." 
1>&2; 13 | fi 14 | echo "output_path:" 15 | echo $output_path 16 | 17 | export PYTHONPATH="$PC_REPO_DIR:$PYTHONPATH" 18 | echo "PYTHONPATH=$PYTHONPATH" 19 | 20 | # Set default: single threaded 21 | if [[ -z $OMP_NUM_THREADS ]]; then 22 | export OMP_NUM_THREADS=1 23 | fi 24 | if [[ -z $MKL_NUM_THREADS ]]; then 25 | export MKL_NUM_THREADS=1 26 | fi 27 | 28 | # Set default: which python executable to use 29 | if [[ -z $XHOST_PYTHON_EXE ]]; then 30 | export XHOST_PYTHON_EXE=`which python` 31 | fi 32 | echo "Python executable:" 33 | echo $XHOST_PYTHON_EXE 34 | 35 | # Verify place to save results exists 36 | if [[ -z $XHOST_RESULTS_DIR ]]; then 37 | echo "Error: Need to define XHOST_RESULTS_DIR." 1>&2; 38 | exit; 39 | fi 40 | echo "XHOST_RESULTS_DIR=$XHOST_RESULTS_DIR" 41 | 42 | 43 | 44 | # If user desired to run on grid computing... 45 | if [[ $XHOST == 'grid' ]]; then 46 | # Verify place to write logs exists 47 | if [[ -z $XHOST_LOG_DIR ]]; then 48 | echo "Error: Need to define XHOST_LOG_DIR." 1>&2; 49 | exit; 50 | fi 51 | echo "XHOST_LOG_DIR=$XHOST_LOG_DIR" 52 | 53 | 54 | # Avoid race conditions on NFS file access 55 | # by sleeping a little while (60 sec or less) 56 | sleep $[ ( $RANDOM % 60 ) + 1 ]s 57 | fi 58 | 59 | echo "---------- STOP $0" 60 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromgood.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=quicktest 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=toy_bars_3x3 9 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=9 11 | export n_outputs=2 12 | export n_train_docs=500 13 | 14 | export n_batches=1 15 | 16 | # =============================== OUTPUT SETTINGS 17 | export param_output_fmt="topic_model_snapshot" 18 | export n_steps_between_save=10 19 | export n_steps_between_print=10 20 | export n_steps_to_print_early=2 21 | export n_steps_to_save_early=2 22 | export laps_to_save_custom='0,1,2,4,6,8,10' 23 | 24 | # =============================== ALGO SETTINGS 25 | export n_laps=3 26 | 27 | ## Overall training: ADAM 28 | export alg_name="grad_descent_minimizer" 29 | export step_direction='adam' 30 | export decay_staircase=0 31 | export decay_interval=1 32 | export decay_rate=0.997 33 | for step_size in 0.0333 #0.1000 0.3333 34 | do 35 | export step_size=$step_size 36 | 37 | # =============================== PER-DOC INFER SETTINGS 38 | ## Per-doc inference settings at training 39 | export pi_max_iters=5 40 | export pi_step_size=0.05 41 | export pi_max_iters_first_train_lap=3 42 | 43 | ## Per-doc inference settings at perf-metric (eval step) 44 | export perf_metrics_pi_max_iters=50 45 | 46 | 47 | # =============================== INIT SETTINGS 48 | for init_name in good_loss_pc_K4 good_loss_x_K4 good_loss_pc_K4 49 | do 50 | 51 | export init_model_path=$dataset_path"/"$init_name"_param_dict.dump" 52 | export init_name=$init_name 53 | export n_states=004 54 | 55 | # =============================== MODEL HYPERS 56 | export alpha=1.100 57 | export tau=1.100 58 | export lambda_w=0.001 59 | 60 | export weight_x=1.0 61 | 62 | ## Loop over weights to place on log p(y|x) 63 | for weight_y in 10.0 02.0 01.0 64 | do 65 | export weight_y=$weight_y 66 | 67 | export 
output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-init_name=$init_name-alg_name=$alg_name-weight_x=$weight_x-weight_y=$weight_y-step_size=$step_size/1/" 68 | 69 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 70 | 71 | done 72 | done 73 | done 74 | done 75 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=quicktest 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=toy_bars_3x3 9 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=9 11 | export n_outputs=2 12 | export n_train_docs=500 13 | 14 | export n_batches=1 15 | 16 | # =============================== OUTPUT SETTINGS 17 | export param_output_fmt="topic_model_snapshot" 18 | export n_steps_between_save=10 19 | export n_steps_between_print=10 20 | export n_steps_to_print_early=2 21 | export n_steps_to_save_early=2 22 | export laps_to_save_custom='0,1,2,4,6,8,10' 23 | 24 | # =============================== ALGO SETTINGS 25 | export n_laps=3 26 | 27 | ## Overall training: ADAM 28 | export alg_name="grad_descent_minimizer" 29 | export step_direction='adam' 30 | export decay_staircase=0 31 | export decay_interval=1 32 | export decay_rate=0.997 33 | for step_size in 0.0333 34 | do 35 | export step_size=$step_size 36 | 37 | # =============================== PER-DOC INFER SETTINGS 38 | ## Per-doc inference settings at training 39 | export pi_max_iters=5 40 | export pi_step_size=0.05 41 | export pi_max_iters_first_train_lap=3 42 | 43 | ## Per-doc inference settings at perf-metric (eval step) 44 | export perf_metrics_pi_max_iters=50 45 | 46 | 47 | # =============================== INIT SETTINGS 48 | export init_model_path=none 49 | for init_name in rand_smooth 50 | do 51 | export init_name=$init_name 52 | 53 | # =============================== MODEL HYPERS 54 | export alpha=1.100 55 | export tau=1.100 56 | export lambda_w=0.001 57 | 58 | export weight_x=1.0 59 | 60 | ## Loop over weights to place on log p(y|x) 61 | for weight_y in 10.0 01.0 62 | do 63 | export weight_y=$weight_y 64 | 65 | ## Loop over number of topics K 66 | for n_states in 004 67 | do 68 | export n_states=$n_states 69 | 70 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-init_name=$init_name-alg_name=$alg_name-weight_x=$weight_x-weight_y=$weight_y-step_size=$step_size/1/" 71 | 72 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 73 | 74 | done 75 | done 76 | done 77 | done 78 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_lbfgs_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=quicktest 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=toy_bars_3x3 9 | 
export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=9 11 | export n_outputs=2 12 | export n_train_docs=500 13 | 14 | export n_batches=1 15 | 16 | # =============================== OUTPUT SETTINGS 17 | export param_output_fmt="topic_model_snapshot" 18 | export n_steps_between_save=10 19 | export n_steps_between_print=10 20 | export n_steps_to_print_early=2 21 | export n_steps_to_save_early=2 22 | export laps_to_save_custom='0,1,2,4,6,8,10' 23 | 24 | # =============================== ALGO SETTINGS 25 | export n_laps=3 26 | 27 | ## Overall training: L-BFGS 28 | export alg_name="scipy_lbfgs_minimizer" 29 | 30 | 31 | # =============================== PER-DOC INFER SETTINGS 32 | ## Per-doc inference settings at training 33 | export pi_max_iters=5 34 | export pi_step_size=0.05 35 | export pi_max_iters_first_train_lap=3 36 | 37 | ## Per-doc inference settings at perf-metric (eval step) 38 | export perf_metrics_pi_max_iters=50 39 | 40 | 41 | ## Per-doc inference settings during training 42 | export pi_max_iters_first_train_lap=3 43 | 44 | ## Per-doc inference settings at perf-metric (eval step) 45 | export perf_metrics_pi_max_iters=50 46 | 47 | # =============================== INIT SETTINGS 48 | export init_model_path=none 49 | for init_name in rand_smooth 50 | do 51 | export init_name=$init_name 52 | 53 | # =============================== MODEL HYPERS 54 | export alpha=1.100 55 | export tau=1.100 56 | export lambda_w=0.001 57 | 58 | export weight_x=1.0 59 | 60 | ## Loop over weights to place on log p(y|x) 61 | for weight_y in 10.0 02.0 01.0 62 | do 63 | export weight_y=$weight_y 64 | 65 | for n_states in 004 66 | do 67 | export n_states=$n_states 68 | 69 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-init_name=$init_name-alg_name=$alg_name-weight_x=$weight_x-weight_y=$weight_y-step_size=$step_size/1/" 70 | 71 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 72 | 73 | done 74 | done 75 | done 76 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/quicktest_topic_models/pcslda_tf_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=quicktest 4 | 5 | export lossandgrad_mod_name="slda_loss__tensorflow" 6 | 7 | # =============================== DATA SETTINGS 8 | export dataset_name=toy_bars_3x3 9 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 10 | export n_vocabs=9 11 | export n_outputs=2 12 | export n_train_docs=500 13 | 14 | export n_batches=1 15 | 16 | # =============================== OUTPUT SETTINGS 17 | export param_output_fmt="topic_model_snapshot" 18 | export n_steps_between_save=10 19 | export n_steps_between_print=10 20 | export n_steps_to_print_early=2 21 | export n_steps_to_save_early=2 22 | export laps_to_save_custom='0,1,2,4,6,8,10' 23 | 24 | # =============================== ALGO SETTINGS 25 | export n_laps=3 26 | 27 | ## Overall training: ADAM 28 | export alg_name="grad_descent_minimizer" 29 | export step_direction='adam' 30 | export decay_staircase=0 31 | export decay_interval=1 32 | export decay_rate=0.997 33 | for step_size in 0.0333 34 | do 35 | export step_size=$step_size 36 | 37 | # =============================== PER-DOC INFER SETTINGS 38 | ## Per-doc inference settings at training 
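## NOTE (assumption based on module names under pc_toolbox/model_slda/est_local_params__single_doc_map:
##  pi_max_iters and pi_step_size appear to control the iterative per-document MAP estimate of
##  doc-topic proportions, and pi_max_iters_first_train_lap sets a smaller budget for the first lap
##  before ramping up to pi_max_iters)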
39 | export pi_max_iters=5 40 | export pi_step_size=0.05 41 | export pi_max_iters_first_train_lap=3 42 | 43 | ## Per-doc inference settings at perf-metric (eval step) 44 | export perf_metrics_pi_max_iters=50 45 | 46 | 47 | # =============================== INIT SETTINGS 48 | export init_model_path=none 49 | for init_name in rand_smooth 50 | do 51 | export init_name=$init_name 52 | 53 | # =============================== MODEL HYPERS 54 | export alpha=1.100 55 | export tau=1.100 56 | export lambda_w=0.001 57 | 58 | export weight_x=1.0 59 | 60 | ## Loop over weights to place on log p(y|x) 61 | for weight_y in 10.0 01.0 62 | do 63 | export weight_y=$weight_y 64 | 65 | ## Loop over number of topics K 66 | for n_states in 004 67 | do 68 | export n_states=$n_states 69 | 70 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-init_name=$init_name-alg_name=$alg_name-weight_x=$weight_x-weight_y=$weight_y-step_size=$step_size/1/" 71 | 72 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 73 | 74 | done 75 | done 76 | done 77 | done 78 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/train_base_classifiers/train_baseline_classifiers.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_clf.sh 3 | nickname="20180301" 4 | 5 | # =============================== DATA SETTINGS 6 | export dataset_name=toy_bars_3x3 7 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 8 | 9 | export feature_arr_names=X 10 | export target_arr_name=Y 11 | 12 | all_target_names=`cat "$dataset_path/Y_colnames.txt"` 13 | 14 | for classifier_name in logistic_regression extra_trees 15 | do 16 | 17 | for target_name in $all_target_names 18 | do 19 | export target_names=$target_name 20 | export classifier_name=$classifier_name 21 | export class_weight_opts='none' 22 | export preproc_X='none' 23 | 24 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-classifier_name=$classifier_name/1/" 25 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 26 | 27 | done 28 | done 29 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/train_topic_models/pcslda_ag_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=3 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=20180301 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | 8 | # =============================== DATA SETTINGS 9 | export dataset_name=toy_bars_3x3 10 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 11 | export n_vocabs=9 12 | export n_outputs=2 13 | export n_train_docs=500 14 | 15 | for n_batches in 01 05 16 | do 17 | export n_batches=$n_batches 18 | 19 | 20 | # =============================== OUTPUT SETTINGS 21 | export param_output_fmt="topic_model_snapshot" 22 | export n_steps_between_save=10 23 | export n_steps_between_print=10 24 | export n_steps_to_print_early=2 25 | export n_steps_to_save_early=2 26 | export laps_to_save_custom='0,1,2,4,6,8,10' 27 | 28 | 29 | # =============================== ALGO SETTINGS 30 | export n_laps=200 31 | 32 | ## Overall training: ADAM 33 | export alg_name="grad_descent_minimizer" 34 | export step_direction='adam' 35 | 
export decay_staircase=0 36 | export decay_interval=1 37 | export decay_rate=0.997 38 | for step_size in 0.0333 0.3333 39 | do 40 | export step_size=$step_size 41 | 42 | 43 | # =============================== PER-DOC INFER SETTINGS 44 | ## Per-doc inference settings 45 | export pi_max_iters=100 46 | export pi_step_size=0.05 47 | export pi_max_iters_first_train_lap=10 48 | 49 | ## Per-doc inference settings at perf-metric (eval step) 50 | export perf_metrics_pi_max_iters=100 51 | 52 | 53 | # =============================== MODEL HYPERS 54 | export alpha=1.100 55 | export tau=1.100 56 | export lambda_w=0.001 57 | 58 | export weight_x=1.0 59 | 60 | ## Loop over weights to place on log p(y|x) 61 | for weight_y in 100.0 010.0 001.0 62 | do 63 | export weight_y=$weight_y 64 | 65 | 66 | # =============================== INIT SETTINGS 67 | export init_model_path=none 68 | for init_name in rand_smooth 69 | do 70 | export init_name=$init_name 71 | 72 | ## Loop over number of topics K 73 | for n_states in 004 74 | do 75 | export n_states=$n_states 76 | 77 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-weight_x=$weight_x-weight_y=$weight_y-init_name=$init_name-alg_name=$step_direction-step_size=$step_size/1/" 78 | 79 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 80 | 81 | done 82 | done 83 | done 84 | done 85 | done 86 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromgood.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=1 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=20180301 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | 8 | # =============================== DATA SETTINGS 9 | export dataset_name=toy_bars_3x3 10 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 11 | export n_vocabs=9 12 | export n_outputs=2 13 | export n_train_docs=500 14 | 15 | for n_batches in 01 16 | do 17 | export n_batches=$n_batches 18 | 19 | 20 | # =============================== OUTPUT SETTINGS 21 | export param_output_fmt="topic_model_snapshot" 22 | export n_steps_between_save=10 23 | export n_steps_between_print=10 24 | export n_steps_to_print_early=2 25 | export n_steps_to_save_early=2 26 | export laps_to_save_custom='0,1,2,4,6,8,10' 27 | 28 | 29 | # =============================== ALGO SETTINGS 30 | export n_laps=200 31 | 32 | ## Overall training: L-BFGS 33 | export alg_name="scipy_lbfgs_minimizer" 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | # =============================== PER-DOC INFER SETTINGS 44 | ## Per-doc inference settings 45 | export pi_max_iters=100 46 | export pi_step_size=0.05 47 | export pi_max_iters_first_train_lap=10 48 | 49 | ## Per-doc inference settings at perf-metric (eval step) 50 | export perf_metrics_pi_max_iters=100 51 | 52 | 53 | # =============================== MODEL HYPERS 54 | export alpha=1.100 55 | export tau=1.100 56 | export lambda_w=0.001 57 | 58 | export weight_x=1.0 59 | 60 | ## Loop over weights to place on log p(y|x) 61 | for weight_y in 100.0 010.0 001.0 62 | do 63 | export weight_y=$weight_y 64 | 65 | 66 | # =============================== INIT SETTINGS 67 | for init_name in good_loss_x_K4 good_loss_pc_K4 68 | do 69 | 70 | export init_model_path=$dataset_path"/"$init_name"_param_dict.dump" 71 | export 
init_name=$init_name 72 | export n_states=004 73 | 74 | 75 | 76 | 77 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-weight_x=$weight_x-weight_y=$weight_y-init_name=$init_name-alg_name=$alg_name/1/" 78 | 79 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 80 | 81 | done 82 | done 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=3 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=20180301 4 | 5 | export lossandgrad_mod_name="slda_loss__autograd" 6 | 7 | 8 | # =============================== DATA SETTINGS 9 | export dataset_name=toy_bars_3x3 10 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 11 | export n_vocabs=9 12 | export n_outputs=2 13 | export n_train_docs=500 14 | 15 | for n_batches in 01 05 16 | do 17 | export n_batches=$n_batches 18 | 19 | 20 | # =============================== OUTPUT SETTINGS 21 | export param_output_fmt="topic_model_snapshot" 22 | export n_steps_between_save=10 23 | export n_steps_between_print=10 24 | export n_steps_to_print_early=2 25 | export n_steps_to_save_early=2 26 | export laps_to_save_custom='0,1,2,4,6,8,10' 27 | 28 | 29 | # =============================== ALGO SETTINGS 30 | export n_laps=100 31 | 32 | ## Overall training: L-BFGS 33 | export alg_name="scipy_lbfgs_minimizer" 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | # =============================== PER-DOC INFER SETTINGS 44 | ## Per-doc inference settings 45 | export pi_max_iters=100 46 | export pi_step_size=0.05 47 | export pi_max_iters_first_train_lap=10 48 | 49 | ## Per-doc inference settings at perf-metric (eval step) 50 | export perf_metrics_pi_max_iters=100 51 | 52 | 53 | # =============================== MODEL HYPERS 54 | export alpha=1.100 55 | export tau=1.100 56 | export lambda_w=0.001 57 | 58 | export weight_x=1.0 59 | 60 | ## Loop over weights to place on log p(y|x) 61 | for weight_y in 100.0 010.0 001.0 62 | do 63 | export weight_y=$weight_y 64 | 65 | 66 | # =============================== INIT SETTINGS 67 | export init_model_path=none 68 | for init_name in rand_smooth 69 | do 70 | export init_name=$init_name 71 | 72 | ## Loop over number of topics K 73 | for n_states in 004 74 | do 75 | export n_states=$n_states 76 | 77 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-weight_x=$weight_x-weight_y=$weight_y-init_name=$init_name-alg_name=$alg_name/1/" 78 | 79 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 80 | 81 | done 82 | done 83 | done 84 | done 85 | -------------------------------------------------------------------------------- /scripts/toy_bars_3x3/train_topic_models/pcslda_tf_adam_fromscratch.sh: -------------------------------------------------------------------------------- 1 | export XHOST_NTASKS=3 2 | export XHOST_BASH_EXE=$PC_REPO_DIR/scripts/train_slda.sh 3 | nickname=20180301 4 | 5 | export lossandgrad_mod_name="slda_loss__tensorflow" 6 | 7 | 8 | # =============================== DATA SETTINGS 9 | export dataset_name=toy_bars_3x3 10 | export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" 11 
| export n_vocabs=9 12 | export n_outputs=2 13 | export n_train_docs=500 14 | 15 | for n_batches in 01 05 16 | do 17 | export n_batches=$n_batches 18 | 19 | 20 | # =============================== OUTPUT SETTINGS 21 | export param_output_fmt="topic_model_snapshot" 22 | export n_steps_between_save=10 23 | export n_steps_between_print=10 24 | export n_steps_to_print_early=2 25 | export n_steps_to_save_early=2 26 | export laps_to_save_custom='0,1,2,4,6,8,10' 27 | 28 | 29 | # =============================== ALGO SETTINGS 30 | export n_laps=200 31 | 32 | ## Overall training: ADAM 33 | export alg_name="grad_descent_minimizer" 34 | export step_direction='adam' 35 | export decay_staircase=0 36 | export decay_interval=1 37 | export decay_rate=0.997 38 | for step_size in 0.0333 0.3333 39 | do 40 | export step_size=$step_size 41 | 42 | 43 | # =============================== PER-DOC INFER SETTINGS 44 | ## Per-doc inference settings 45 | export pi_max_iters=100 46 | export pi_step_size=0.05 47 | export pi_max_iters_first_train_lap=10 48 | 49 | ## Per-doc inference settings at perf-metric (eval step) 50 | export perf_metrics_pi_max_iters=100 51 | 52 | 53 | # =============================== MODEL HYPERS 54 | export alpha=1.100 55 | export tau=1.100 56 | export lambda_w=0.001 57 | 58 | export weight_x=1.0 59 | 60 | ## Loop over weights to place on log p(y|x) 61 | for weight_y in 100.0 010.0 001.0 62 | do 63 | export weight_y=$weight_y 64 | 65 | 66 | # =============================== INIT SETTINGS 67 | export init_model_path=none 68 | for init_name in rand_smooth 69 | do 70 | export init_name=$init_name 71 | 72 | ## Loop over number of topics K 73 | for n_states in 004 74 | do 75 | export n_states=$n_states 76 | 77 | export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-weight_x=$weight_x-weight_y=$weight_y-init_name=$init_name-alg_name=$step_direction-step_size=$step_size/1/" 78 | 79 | bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } 80 | 81 | done 82 | done 83 | done 84 | done 85 | done 86 | -------------------------------------------------------------------------------- /scripts/train_clf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "---------- START $0" 4 | 5 | # Setup env vars for this training run 6 | . $PC_REPO_DIR/scripts/setup_train_env.sh 7 | XHOST_SCRIPT=$PC_REPO_DIR/pc_toolbox/binary_classifiers/train_and_eval_sklearn_binary_classifier.py 8 | 9 | # Parse keyword args from env 10 | keyword_args=`python $PC_REPO_DIR/scripts/launcher_tools/print_lowercase_env_vars_as_keyword_args.py` 11 | echo "SCRIPT BASENAME:" 12 | echo `basename $XHOST_SCRIPT` 13 | echo "SCRIPT PATH:" 14 | echo $XHOST_SCRIPT 15 | echo "SCRIPT KWARGS:" 16 | echo $keyword_args 17 | 18 | # Run desired script 19 | eval $XHOST_PYTHON_EXE -u $XHOST_SCRIPT $keyword_args 20 | 21 | echo "---------- STOP $0" 22 | -------------------------------------------------------------------------------- /scripts/train_slda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "---------- START $0" 4 | 5 | # Setup env vars for this training run 6 | . 
$PC_REPO_DIR/scripts/setup_train_env.sh 7 | XHOST_SCRIPT=$PC_REPO_DIR/pc_toolbox/train_slda_model.py 8 | 9 | # Parse keyword args from env 10 | keyword_args=`python $PC_REPO_DIR/scripts/launcher_tools/print_lowercase_env_vars_as_keyword_args.py` 11 | 12 | # If any keyword arg processing raised an error, just stop here 13 | if [ $? -ne 0 ]; then 14 | exit $? 15 | fi 16 | 17 | echo "SCRIPT BASENAME:" 18 | echo `basename $XHOST_SCRIPT` 19 | echo "SCRIPT PATH:" 20 | echo $XHOST_SCRIPT 21 | echo "SCRIPT KWARGS:" 22 | echo $keyword_args 23 | 24 | # Run desired script 25 | eval $XHOST_PYTHON_EXE -u $XHOST_SCRIPT $keyword_args 26 | 27 | echo "---------- STOP $0" 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_ext] 2 | inplace=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | 4 | try: 5 | from Cython.Distutils import build_ext 6 | HAS_CYTHON = True 7 | except ImportError: 8 | from distutils.command.build_ext import build_ext 9 | HAS_CYTHON = False 10 | 11 | def make_cython_extension__nef_map_pi_d_K(): 12 | ext = Extension( 13 | "pc_toolbox.model_slda.est_local_params__single_doc_map.calc_nef_map_pi_d_K__cython", 14 | ["pc_toolbox/model_slda/est_local_params__single_doc_map/calc_nef_map_pi_d_K__cython.pyx"], 15 | libraries=["m"], 16 | extra_compile_args = ["-O3", "-ffast-math"]) 17 | return add_directives_to_cython_ext(ext) 18 | 19 | def make_extensions(): 20 | ''' Assemble C++/Cython extension objects for compilation. 21 | 22 | Warns user if required prerequisites are not specified. 23 | 24 | Returns 25 | ------- 26 | ext_list : list of extension objects 27 | ''' 28 | ext_list = list() 29 | if HAS_CYTHON: 30 | ext_list.append(make_cython_extension__nef_map_pi_d_K()) 31 | return ext_list 32 | 33 | def add_directives_to_cython_ext(ext): 34 | ''' Improve speed of cython code extensions 35 | 36 | References 37 | ---------- 38 | http://docs.cython.org/src/reference/compilation.html#compiler-directives 39 | ''' 40 | ext.cython_directives = { 41 | 'embedsignature':True, 42 | 'boundscheck':False, 43 | 'nonecheck':False, 44 | 'wraparound':False, 45 | 'cdivision':True} 46 | return ext 47 | 48 | def read_version(txtpath): 49 | with open(txtpath, 'r') as f: 50 | version = f.readline().strip() 51 | return version 52 | 53 | setup( 54 | name='pc_toolbox', 55 | version=read_version('version.txt'), 56 | description='Prediction-constrained training for supervised topic models', 57 | long_description='Support code for Hughes et al AISTATS 2018', 58 | classifiers=[ 59 | 'Development Status :: 3 - Alpha', 60 | 'License :: OSI Approved :: MIT License', 61 | 'Programming Language :: Python :: 2.7', 62 | ], 63 | url='https://github.com/dtak/prediction-constrained-topic-models', 64 | author='Michael C. 
Hughes', 65 | author_email='mike@michaelchughes.com', 66 | license='MIT', 67 | setup_requires=["Cython>=0.25"], 68 | cmdclass = {"build_ext": build_ext}, 69 | ext_modules = make_extensions(), 70 | ) 71 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.1.20180712 2 | 3 | --------------------------------------------------------------------------------