├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── datasets └── download-data.sh ├── environment.yml ├── eval_linear.sh ├── eval_transformer.sh ├── pretrained_models └── download-models.sh ├── results_linear ├── Eurlex-4K.v0.txt ├── Eurlex-4K.v1.txt ├── Wiki10-31K.v0.txt └── Wiki10-31K.v1.txt ├── results_transformer └── Eurlex-4K.final.txt ├── run_preprocess_feat.sh ├── run_preprocess_label.sh ├── run_transformer_predict.sh ├── run_transformer_train.sh ├── setup.py └── xbert ├── __init__.py ├── corelib ├── COPYRIGHT ├── COPYRIGHT.liblinear ├── COPYRIGHT.trmf ├── Makefile ├── rf_linear.cpp └── rf_matrix.h ├── evaluator.py ├── indexer.py ├── modeling.py ├── preprocess.py ├── ranker.py ├── rf_linear.py ├── rf_util.py └── transformer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # others 2 | *.xz 3 | *.bz2 4 | *.swp 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.o 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020 The X-Transformer Project. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. 
Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	make -C xbert/corelib/
3 | clean:
4 | 	make -C xbert/corelib/ clean
5 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Taming Pretrained Transformers for XMC problems
2 | 
3 | This is the README for the experimental code of the following paper:
4 | >[Taming Pretrained Transformers for eXtreme Multi-label Text Classification](https://arxiv.org/abs/1905.02331)
5 | 
6 | >Wei-Cheng Chang, Hsiang-Fu Yu, Kai Zhong, Yiming Yang, Inderjit Dhillon
7 | 
8 | >KDD 2020
9 | 
10 | ## Updates (2021-04-27)
11 | The latest implementation of X-Transformer (faster training with stronger performance) is available at [PECOS](https://github.com/amzn/pecos); feel free to try it out!
12 | 
13 | 
14 | ## Installation
15 | 
16 | ### Dependencies via Conda Environment
17 | 
18 | > conda env create -f environment.yml
19 | > source activate pt1.2_xmlc_transformer
20 | > (pt1.2_xmlc_transformer) pip install -e .
21 | > (pt1.2_xmlc_transformer) python setup.py install --force
22 | 
23 | 
24 | **Notice**: the following examples are executed under the ```(pt1.2_xmlc_transformer)``` conda virtual environment.
25 | 
26 | 
27 | ## Reproduce Evaluation Results in the Paper
28 | We demonstrate how to reproduce the evaluation results in our paper
29 | by downloading the raw datasets and pretrained models.
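Before moving on to the data download, it can help to verify that the conda environment resolved correctly. The snippet below is an editorial sketch (not part of the original repo); the version numbers it expects come from ```environment.yml```.

```python
# Editorial sanity check: confirm the pinned dependencies import and CUDA is visible.
import scipy
import torch
import transformers

print("torch         :", torch.__version__)         # environment.yml pins 1.2.0
print("transformers  :", transformers.__version__)  # environment.yml pins 2.2.0
print("scipy         :", scipy.__version__)
print("CUDA available:", torch.cuda.is_available())
```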
30 | 
31 | ### Download Dataset (Eurlex-4K, Wiki10-31K, AmazonCat-13K, Wiki-500K)
32 | Change directory into the ./datasets folder, then download and unzip each dataset:
33 | 
34 | ```bash
35 | cd ./datasets
36 | bash download-data.sh Eurlex-4K
37 | bash download-data.sh Wiki10-31K
38 | bash download-data.sh AmazonCat-13K
39 | bash download-data.sh Wiki-500K
40 | cd ../
41 | ```
42 | 
43 | Each dataset contains the following files:
44 | - ```label_map.txt```: each line is the raw text of a label
45 | - ```train_raw_text.txt, test_raw_text.txt```: each line is the raw text of an instance
46 | - ```X.trn.npz, X.tst.npz```: instance embedding matrices (either sparse TF-IDF or fine-tuned dense embeddings)
47 | - ```Y.trn.npz, Y.tst.npz```: instance-to-label assignment matrices
48 | 
49 | ### Download Pretrained Models (processed data, indexing codes, fine-tuned Transformer models)
50 | Change directory into the ./pretrained_models folder, then download and unzip the models for each dataset:
51 | 
52 | ```bash
53 | cd ./pretrained_models
54 | bash download-models.sh Eurlex-4K
55 | bash download-models.sh Wiki10-31K
56 | bash download-models.sh AmazonCat-13K
57 | bash download-models.sh Wiki-500K
58 | cd ../
59 | ```
60 | Each folder has the following structure:
61 | - ```proc_data```: a sub-folder containing X.{trn|tst}.{model}.128.pkl, C.{label-emb}.npz, L.{label-emb}.npz
62 | - ```pifa-tfidf-s0```: a sub-folder containing indexer and matcher
63 | - ```pifa-neural-s0```: a sub-folder containing indexer and matcher
64 | - ```text-emb-s0```: a sub-folder containing indexer and matcher
65 | 
66 | 
67 | ### Evaluate Linear Models
68 | Given the provided indexing codes (label-to-cluster assignments), train and predict with the linear models, and evaluate with Precision/Recall@k:
69 | 
70 | ```bash
71 | bash eval_linear.sh ${DATASET} ${VERSION}
72 | ```
73 | 
74 | - ```DATASET```: the dataset name, such as Eurlex-4K, Wiki10-31K, AmazonCat-13K, or Wiki-500K.
75 | - ```VERSION```: v0=sparse TF-IDF features. v1=sparse TF-IDF features concatenated with dense fine-tuned XLNet embeddings.
76 | 
77 | The evaluation results should be located at
78 | ``` ./results_linear/${DATASET}.${VERSION}.txt ```
79 | 
80 | 
81 | ### Evaluate Fine-tuned X-Transformer Models
82 | Given the provided indexing codes (label-to-cluster assignments) and the fine-tuned Transformer models, train and predict with the rankers of the X-Transformer framework, and evaluate with Precision/Recall@k:
83 | 
84 | ```bash
85 | bash eval_transformer.sh ${DATASET}
86 | ```
87 | 
88 | - ```DATASET```: the dataset name, such as Eurlex-4K, Wiki10-31K, AmazonCat-13K, or Wiki-500K.
89 | 
90 | The evaluation results should be located at
91 | ``` ./results_transformer/${DATASET}.final.txt ```
92 | 
93 | 
94 | ## Running X-Transformer on customized datasets
95 | The X-Transformer framework consists of 9 configurations (3 label embeddings times 3 model types).
96 | For simplicity, we show 1 of the 9 here, using ```LABEL_EMB=pifa-tfidf``` and ```MODEL_TYPE=bert```.
97 | 
98 | We will use Eurlex-4K as an example. In the ./datasets/Eurlex-4K folder, we assume the following files are provided:
99 | 
100 | - ```X.trn.npz```: the instance TF-IDF feature matrix for the train set. The data type is scipy.sparse.csr_matrix of size (N_trn, D_tfidf), where N_trn is the number of train instances and D_tfidf is the number of features.
101 | - ```X.tst.npz```: the instance TF-IDF feature matrix for the test set. The data type is scipy.sparse.csr_matrix of size (N_tst, D_tfidf), where N_tst is the number of test instances and D_tfidf is the number of features.
102 | - ```Y.trn.npz```: the instance-to-label matrix for the train set. The data type is scipy.sparse.csr_matrix of size (N_trn, L), where N_trn is the number of train instances and L is the number of labels.
103 | - ```Y.tst.npz```: the instance-to-label matrix for the test set. The data type is scipy.sparse.csr_matrix of size (N_tst, L), where N_tst is the number of test instances and L is the number of labels.
104 | - ```train_raw_texts.txt```: the raw text of the train set.
105 | - ```test_raw_texts.txt```: the raw text of the test set.
106 | - ```label_map.txt```: the label's text description.
107 | 
108 | Given those input files (a quick way to sanity-check them is sketched below), the pipeline can be divided into three stages: Indexer, Matcher, and Ranker.
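The following is an editorial sketch (not part of the original scripts) of how these inputs can be loaded and cross-checked with scipy; the directory path and the shape conventions are assumed from the description above.

```python
# Editorial sketch: load the customized-dataset inputs and check that their shapes agree.
import scipy.sparse as smat

data_dir = "./datasets/Eurlex-4K"  # assumed example dataset, as above

X_trn = smat.load_npz(f"{data_dir}/X.trn.npz")  # (N_trn, D_tfidf) CSR matrix
Y_trn = smat.load_npz(f"{data_dir}/Y.trn.npz")  # (N_trn, L) CSR matrix
with open(f"{data_dir}/train_raw_texts.txt", encoding="utf-8") as fin:
    trn_texts = [line.rstrip("\n") for line in fin]
with open(f"{data_dir}/label_map.txt", encoding="utf-8") as fin:
    label_texts = [line.rstrip("\n") for line in fin]

assert X_trn.shape[0] == Y_trn.shape[0] == len(trn_texts), "instance counts must match"
assert Y_trn.shape[1] == len(label_texts), "label count must match label_map.txt"
print(f"{X_trn.shape[0]} train instances, {X_trn.shape[1]} features, {Y_trn.shape[1]} labels")
```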
109 | 
110 | ### Indexer
111 | In stage 1, we will do the following:
112 | - (1) construct label embeddings
113 | - (2) perform hierarchical 2-means and output the instance-to-cluster assignment matrix
114 | - (3) preprocess the input and output for training Transformer models.
115 | 
116 | **TLDR**: we combine and summarize (1), (2), (3) into two scripts: ```run_preprocess_label.sh``` and ```run_preprocess_feat.sh```. See the more detailed explanations in the following.
117 | 
118 | 
119 | (1) To construct the label embeddings,
120 | ```bash
121 | OUTPUT_DIR=save_models/${DATASET}
122 | PROC_DATA_DIR=${OUTPUT_DIR}/proc_data
123 | mkdir -p ${PROC_DATA_DIR}
124 | python -m xbert.preprocess \
125 |     --do_label_embedding \
126 |     -i ${DATA_DIR} \
127 |     -o ${PROC_DATA_DIR} \
128 |     -l ${LABEL_EMB} \
129 |     -x ${LABEL_EMB_INST_PATH}
130 | ```
131 | 
132 | - ```DATA_DIR```: ./datasets/Eurlex-4K
133 | - ```PROC_DATA_DIR```: ./save_models/Eurlex-4K/proc_data
134 | - ```LABEL_EMB```: pifa-tfidf (you can also try text-emb, or pifa-neural if you have fine-tuned instance embeddings)
135 | - ```LABEL_EMB_INST_PATH```: ./datasets/Eurlex-4K/X.trn.npz
136 | 
137 | This should yield ```L.${LABEL_EMB}.npz``` in the ```PROC_DATA_DIR```.
138 | 
139 | (2) To perform hierarchical 2-means,
140 | ```bash
141 | SEED_LIST=( 0 1 2 )
142 | for SEED in "${SEED_LIST[@]}"; do
143 |     LABEL_EMB_NAME=${LABEL_EMB}-s${SEED}
144 |     INDEXER_DIR=${OUTPUT_DIR}/${LABEL_EMB_NAME}/indexer
145 |     python -u -m xbert.indexer \
146 |         -i ${PROC_DATA_DIR}/L.${LABEL_EMB}.npz \
147 |         -o ${INDEXER_DIR} --seed ${SEED}
148 | done
149 | ```
150 | This should yield ```code.npz``` in each seed's ```INDEXER_DIR```.
151 | 
152 | (3) To preprocess the input and output for training Transformer models,
153 | ```bash
154 | SEED=0
155 | LABEL_EMB_NAME=${LABEL_EMB}-s${SEED}
156 | INDEXER_DIR=${OUTPUT_DIR}/${LABEL_EMB_NAME}/indexer
157 | python -u -m xbert.preprocess \
158 |     --do_proc_label \
159 |     -i ${DATA_DIR} \
160 |     -o ${PROC_DATA_DIR} \
161 |     -l ${LABEL_EMB_NAME} \
162 |     -c ${INDEXER_DIR}/code.npz
163 | ```
164 | This should yield the instance-to-cluster matrices ```C.trn.npz``` and ```C.tst.npz``` in the ```PROC_DATA_DIR```.
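As a quick check of the indexing output, the saved ```.npz``` files can be opened as scipy sparse matrices. This is an editorial sketch, assuming the default paths above; the row/column orientation noted in the comments is an assumption based on the descriptions in this README, not a guarantee of the on-disk format.

```python
# Editorial sketch: peek at the label-to-cluster and instance-to-cluster assignments.
import scipy.sparse as smat

indexer_dir = "save_models/Eurlex-4K/pifa-tfidf-s0/indexer"   # assumed path, seed 0
proc_data_dir = "save_models/Eurlex-4K/proc_data"             # assumed path

code = smat.load_npz(f"{indexer_dir}/code.npz")                    # assumed shape: (num_labels, num_clusters)
C_trn = smat.load_npz(f"{proc_data_dir}/C.trn.pifa-tfidf-s0.npz")  # assumed shape: (N_trn, num_clusters)

print("code.npz shape:", code.shape)
print("C.trn    shape:", C_trn.shape)
print("avg clusters per train instance:", C_trn.getnnz(axis=1).mean())
```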
165 | 
166 | Similarly, to preprocess the instance text into token features for the Transformer models,
167 | ```bash
168 | OUTPUT_DIR=save_models/${DATASET}
169 | PROC_DATA_DIR=${OUTPUT_DIR}/proc_data
170 | python -u -m xbert.preprocess \
171 |     --do_proc_feat \
172 |     -i ${DATA_DIR} \
173 |     -o ${PROC_DATA_DIR} \
174 |     -m ${MODEL_TYPE} \
175 |     -n ${MODEL_NAME} \
176 |     --max_xseq_len ${MAX_XSEQ_LEN} \
177 |     |& tee ${PROC_DATA_DIR}/log.${MODEL_TYPE}.${MAX_XSEQ_LEN}.txt
178 | ```
179 | - ```MODEL_TYPE```: bert (or roberta, xlnet)
180 | - ```MODEL_NAME```: bert-large-cased-whole-word-masking (or roberta-large, xlnet-large-cased)
181 | - ```MAX_XSEQ_LEN```: maximum number of tokens; we set it to 128
182 | 
183 | This should yield ```X.trn.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl``` and ```X.tst.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl``` in the ```PROC_DATA_DIR```.
184 | 
185 | 
186 | ### Matcher
187 | In stage 2, we will do the following:
188 | - (1) train deep Transformer models to map instances to the induced clusters
189 | - (2) output the predicted cluster scores and the fine-tuned instance embeddings
190 | 
191 | **TLDR**: ```run_transformer_train.sh```. See the more detailed explanations in the following.
192 | 
193 | 
194 | (1) Assume we have 8 Nvidia V100 GPUs. To train the models,
195 | ```bash
196 | MODEL_DIR=${OUTPUT_DIR}/${INDEXER_NAME}/matcher/${MODEL_NAME}
197 | mkdir -p ${MODEL_DIR}
198 | ```
199 | ```bash
200 | python -m torch.distributed.launch \
201 |     --nproc_per_node 8 xbert/transformer.py \
202 |     -m ${MODEL_TYPE} -n ${MODEL_NAME} --do_train \
203 |     -x_trn ${PROC_DATA_DIR}/X.trn.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl \
204 |     -c_trn ${PROC_DATA_DIR}/C.trn.${INDEXER_NAME}.npz \
205 |     -o ${MODEL_DIR} --overwrite_output_dir \
206 |     --per_device_train_batch_size ${PER_DEVICE_TRN_BSZ} \
207 |     --gradient_accumulation_steps ${GRAD_ACCU_STEPS} \
208 |     --max_steps ${MAX_STEPS} \
209 |     --warmup_steps ${WARMUP_STEPS} \
210 |     --learning_rate ${LEARNING_RATE} \
211 |     --logging_steps ${LOGGING_STEPS} \
212 |     |& tee ${MODEL_DIR}/log.txt
213 | ```
214 | - ```MODEL_TYPE```: bert (or roberta, xlnet)
215 | - ```MODEL_NAME```: bert-large-cased-whole-word-masking (or roberta-large, xlnet-large-cased)
216 | - ```PER_DEVICE_TRN_BSZ```: 16 if using Nvidia V100 (or set to 8 if using Nvidia 2080Ti)
217 | - ```GRAD_ACCU_STEPS```: 2 if using Nvidia V100 (or set to 4 if using Nvidia 2080Ti)
218 | - ```MAX_STEPS```: set to 1000 for Eurlex-4K; adjust depending on your dataset
219 | - ```WARMUP_STEPS```: set to 100 for Eurlex-4K; adjust depending on your dataset
220 | - ```LEARNING_RATE```: set to 5e-5 for Eurlex-4K; adjust depending on your dataset
221 | - ```LOGGING_STEPS```: set to 100
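As a sanity check on these settings, the number of instances contributing to each optimizer step is the product of the GPU count, the per-device batch size, and the gradient accumulation steps. A small illustrative computation (editorial, using the V100 numbers above):

```python
# Editorial sketch: effective batch size implied by the distributed training flags above.
n_gpus = 8                 # --nproc_per_node
per_device_trn_bsz = 16    # PER_DEVICE_TRN_BSZ on V100
grad_accu_steps = 2        # GRAD_ACCU_STEPS on V100

effective_bsz = n_gpus * per_device_trn_bsz * grad_accu_steps
print(effective_bsz)       # 256 instances per optimizer step
```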
222 | 
223 | 
224 | (2) To generate predictions and instance embeddings,
225 | ```bash
226 | GPID=0,1,2,3,4,5,6,7
227 | PER_DEVICE_VAL_BSZ=32
228 | ```
229 | ```bash
230 | CUDA_VISIBLE_DEVICES=${GPID} python -u xbert/transformer.py \
231 |     -m ${MODEL_TYPE} -n ${MODEL_NAME} \
232 |     --do_eval -o ${MODEL_DIR} \
233 |     -x_trn ${PROC_DATA_DIR}/X.trn.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl \
234 |     -c_trn ${PROC_DATA_DIR}/C.trn.${INDEXER_NAME}.npz \
235 |     -x_tst ${PROC_DATA_DIR}/X.tst.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl \
236 |     -c_tst ${PROC_DATA_DIR}/C.tst.${INDEXER_NAME}.npz \
237 |     --per_device_eval_batch_size ${PER_DEVICE_VAL_BSZ}
238 | ```
239 | This should yield the following output in the ```MODEL_DIR```:
240 | - ```C_trn_pred.npz``` and ```C_tst_pred.npz```: model-predicted cluster scores
241 | - ```trn_embeddings.npy``` and ```tst_embeddings.npy```: fine-tuned instance embeddings
242 | 
243 | ### Ranker
244 | In stage 3, we will do the following:
245 | - (1) train linear rankers to map instances and predicted cluster scores to label scores
246 | - (2) output the top-k predicted labels
247 | 
248 | **TLDR**: ```run_transformer_predict.sh```. See the more detailed explanations in the following.
249 | 
250 | (1) To train the linear rankers,
251 | ```bash
252 | LABEL_NAME=pifa-tfidf-s0
253 | MODEL_NAME=bert-large-cased-whole-word-masking
254 | OUTPUT_DIR=save_models/${DATASET}/${LABEL_NAME}
255 | INDEXER_DIR=${OUTPUT_DIR}/indexer
256 | MATCHER_DIR=${OUTPUT_DIR}/matcher/${MODEL_NAME}
257 | RANKER_DIR=${OUTPUT_DIR}/ranker/${MODEL_NAME}
258 | mkdir -p ${RANKER_DIR}
259 | ```
260 | ```bash
261 | python -m xbert.ranker train \
262 |     -x1 ${DATA_DIR}/X.trn.npz \
263 |     -x2 ${MATCHER_DIR}/trn_embeddings.npy \
264 |     -y ${DATA_DIR}/Y.trn.npz \
265 |     -z ${MATCHER_DIR}/C_trn_pred.npz \
266 |     -c ${INDEXER_DIR}/code.npz \
267 |     -o ${RANKER_DIR} -t 0.01 \
268 |     -f 0 --mode ranker
269 | ```
270 | 
271 | (2) To predict the final top-k labels,
272 | ```bash
273 | PRED_NPZ_PATH=${RANKER_DIR}/tst.pred.npz
274 | python -m xbert.ranker predict \
275 |     -m ${RANKER_DIR} -o ${PRED_NPZ_PATH} \
276 |     -x1 ${DATA_DIR}/X.tst.npz \
277 |     -x2 ${MATCHER_DIR}/tst_embeddings.npy \
278 |     -y ${DATA_DIR}/Y.tst.npz \
279 |     -z ${MATCHER_DIR}/C_tst_pred.npz \
280 |     -f 0 -t noop
281 | ```
282 | 
283 | This should yield the predicted top-k labels in ```tst.pred.npz```, at the path specified by ```PRED_NPZ_PATH```.
284 | 
285 | 
286 | 
287 | ## Acknowledgements
288 | 
289 | Some portions of this repo are borrowed from the following repos:
290 | - [transformers(v2.2.0)](https://github.com/huggingface/transformers)
291 | - [liblinear](https://github.com/cjlin1/liblinear)
292 | - [TRMF](https://github.com/rofuyu/exp-trmf-nips16)
293 | 
--------------------------------------------------------------------------------
/datasets/download-data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | dataset=$1
4 | 
5 | function gdrive-get() {
6 |     fileid=$1
7 |     filename=$2
8 |     if [[ "${fileid}" == "" || "${filename}" == "" ]]; then
9 |         echo "gdrive-curl gdrive-url|gdrive-fileid filename"
10 |         return 1
11 |     else
12 |         if [[ ${fileid} = http* ]]; then
13 |             fileid=$(echo ${fileid} | sed "s/http.*drive.google.com.*id=\([^&]*\).*/\1/")
14 |         fi
15 |         echo "Download ${filename} from google drive with id ${fileid}..."
16 | cookie="/tmp/cookies.txt" 17 | curl -c ${cookie} -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 18 | confirmid=$(awk '/download/ {print $NF}' ${cookie}) 19 | curl -Lb ${cookie} "https://drive.google.com/uc?export=download&confirm=${confirmid}&id=${fileid}" -o ${filename} 20 | rm -rf ${cookie} 21 | return 0 22 | fi 23 | } 24 | 25 | if [ ${dataset} == 'Eurlex-4K' ]; then 26 | gdrive-get 1A_sL_mzpkmnr6g0DSZ0_xJTr4GN-rIfi ${dataset}.tar.gz 27 | elif [ ${dataset} == 'Wiki10-31K' ]; then 28 | gdrive-get 1V22zUzzoXjb-nHqZAJNcKtjNDAph81jt ${dataset}.tar.gz 29 | elif [ ${dataset} == 'AmazonCat-13K' ]; then 30 | gdrive-get 1oxNwL9o9zGEhnBT8i0g5tN7ZBIggLk85 ${dataset}.tar.gz 31 | elif [ ${dataset} == 'Wiki-500K' ]; then 32 | gdrive-get 1WMyCqVstoZdjFU0-WBaK2cRV09IzUwav ${dataset}.tar.gz 33 | else 34 | echo "unknown dataset [ Eurlex-4K | Wiki10-31K | AmazonCat-13K | Wiki-500K ]" 35 | exit 36 | fi 37 | 38 | tar -xzvf ${dataset}.tar.gz 39 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pt1.2_xmlc_transformer 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - backcall=0.1.0=py37_0 8 | - blas=1.0=mkl 9 | - ca-certificates=2020.1.1=0 10 | - certifi=2019.11.28=py37_0 11 | - cffi=1.13.2=py37h2e261b9_0 12 | - cudatoolkit=10.0.130=0 13 | - decorator=4.4.1=py_0 14 | - freetype=2.9.1=h8a8886c_1 15 | - intel-openmp=2019.4=243 16 | - ipython=7.11.1=py37h39e3cac_0 17 | - ipython_genutils=0.2.0=py37_0 18 | - jedi=0.15.2=py37_0 19 | - joblib=0.14.1=py_0 20 | - jpeg=9b=h024ee3a_2 21 | - ld_impl_linux-64=2.33.1=h53a641e_7 22 | - libedit=3.1.20181209=hc058e9b_0 23 | - libffi=3.2.1=hd88cf55_4 24 | - libgcc-ng=9.1.0=hdf63c60_0 25 | - libgfortran-ng=7.3.0=hdf63c60_0 26 | - libpng=1.6.37=hbc83047_0 27 | - libstdcxx-ng=9.1.0=hdf63c60_0 28 | - libtiff=4.1.0=h2733197_0 29 | - mkl=2019.4=243 30 | - mkl-service=2.3.0=py37he904b0f_0 31 | - mkl_fft=1.0.15=py37ha843d7b_0 32 | - mkl_random=1.1.0=py37hd6b4f25_0 33 | - ncurses=6.1=he6710b0_1 34 | - ninja=1.9.0=py37hfd86e86_0 35 | - numpy=1.18.1=py37h4f9e942_0 36 | - numpy-base=1.18.1=py37hde5b4d6_0 37 | - olefile=0.46=py_0 38 | - openssl=1.1.1d=h7b6447c_4 39 | - parso=0.5.2=py_0 40 | - pexpect=4.7.0=py37_0 41 | - pickleshare=0.7.5=py37_0 42 | - pillow=7.0.0=py37hb39fc2d_0 43 | - pip=20.0.2=py37_1 44 | - prompt_toolkit=3.0.2=py_0 45 | - ptyprocess=0.6.0=py37_0 46 | - pycparser=2.19=py_0 47 | - pygments=2.5.2=py_0 48 | - python=3.7.6=h0371630_2 49 | - readline=7.0=h7b6447c_5 50 | - scikit-learn=0.22.1=py37hd81dba3_0 51 | - scipy=1.3.2=py37h7c811a0_0 52 | - setuptools=44.0.0=py37_0 53 | - six=1.13.0=py37_0 54 | - sqlite=3.30.1=h7b6447c_0 55 | - tk=8.6.8=hbc83047_0 56 | - traitlets=4.3.3=py37_0 57 | - wcwidth=0.1.7=py37_0 58 | - wheel=0.33.6=py37_0 59 | - xz=5.2.4=h14c3975_4 60 | - zlib=1.2.11=h7b6447c_3 61 | - zstd=1.3.7=h0b5b093_0 62 | - pytorch=1.2.0=py3.7_cuda10.0.130_cudnn7.6.2_0 63 | - torchvision=0.4.0=py37_cu100 64 | - pip: 65 | - appdirs==1.4.3 66 | - attrs==19.3.0 67 | - black==19.10b0 68 | - boto3==1.11.5 69 | - botocore==1.14.5 70 | - chardet==3.0.4 71 | - click==7.0 72 | - docutils==0.15.2 73 | - idna==2.8 74 | - jmespath==0.9.4 75 | - pathspec==0.7.0 76 | - python-dateutil==2.8.1 77 | - regex==2020.1.8 78 | - requests==2.22.0 79 | - s3transfer==0.3.1 80 | - sacremoses==0.0.38 81 | - sentencepiece==0.1.85 82 | - toml==0.10.0 83 | - torch==1.2.0 84 | - 
tqdm==4.41.1 85 | - transformers==2.2.0 86 | - typed-ast==1.4.1 87 | - urllib3==1.25.7 88 | prefix: /home/ubuntu/anaconda3/envs/pt1.2_xmlc_transformer 89 | 90 | -------------------------------------------------------------------------------- /eval_linear.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASET=$1 4 | VERSION=$2 5 | LABEL_EMB=pifa-tfidf 6 | DATA_DIR=./datasets/${DATASET} 7 | 8 | PRED_NPZ_PATHS="" 9 | SEED_LIST=( 0 1 2 ) 10 | for SEED in "${SEED_LIST[@]}"; do 11 | # indexer (for reproducibility, use clusters from pretrained_dir) 12 | OUTPUT_DIR=pretrained_models/${DATASET}/${LABEL_EMB}-s${SEED} 13 | INDEXER_DIR=${OUTPUT_DIR}/indexer 14 | RANKER_DIR=${OUTPUT_DIR}/ranker/linear-${VERSION} 15 | mkdir -p ${RANKER_DIR} 16 | 17 | # ranker train and predict 18 | PRED_NPZ_PATH=${RANKER_DIR}/tst.pred.npz 19 | 20 | # x_emb=TF-IDF, model=Parabel 21 | if [ ${VERSION} == 'v0' ]; then 22 | python -m xbert.ranker train \ 23 | -x ${DATA_DIR}/X.trn.npz \ 24 | -y ${DATA_DIR}/Y.trn.npz \ 25 | -c ${INDEXER_DIR}/code.npz \ 26 | -o ${RANKER_DIR} -t 0.01 27 | 28 | python -m xbert.ranker predict \ 29 | -m ${RANKER_DIR} -o ${PRED_NPZ_PATH} \ 30 | -x ${DATA_DIR}/X.tst.npz \ 31 | -y ${DATA_DIR}/Y.tst.npz 32 | 33 | # x_emb=xlnet_finetuned+TF-IDF, model=Parabel 34 | elif [ ${VERSION} == 'v1' ]; then 35 | python -m xbert.ranker train \ 36 | -x ${DATA_DIR}/X.trn.npz \ 37 | -x2 ${DATA_DIR}/X.trn.finetune.xlnet.npy \ 38 | -y ${DATA_DIR}/Y.trn.npz \ 39 | -c ${INDEXER_DIR}/code.npz \ 40 | -o ${RANKER_DIR} -t 0.01 -f 0 41 | 42 | python -m xbert.ranker predict \ 43 | -m ${RANKER_DIR} -o ${PRED_NPZ_PATH} \ 44 | -x ${DATA_DIR}/X.tst.npz \ 45 | -x2 ${DATA_DIR}/X.tst.finetune.xlnet.npy \ 46 | -y ${DATA_DIR}/Y.tst.npz -f 0 47 | 48 | else 49 | echo 'unknown linear version' 50 | exit 51 | fi 52 | 53 | # append all prediction path 54 | PRED_NPZ_PATHS="${PRED_NPZ_PATHS} ${PRED_NPZ_PATH}" 55 | done 56 | 57 | # final eval 58 | EVAL_DIR=results_linear 59 | mkdir -p ${EVAL_DIR} 60 | python -u -m xbert.evaluator \ 61 | -y datasets/${DATASET}/Y.tst.npz \ 62 | -e -p ${PRED_NPZ_PATHS} \ 63 | |& tee ${EVAL_DIR}/${DATASET}.${VERSION}.txt 64 | 65 | -------------------------------------------------------------------------------- /eval_transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASET=$1 4 | DATA_DIR=./datasets/${DATASET} 5 | 6 | LABEL_NAME_ARR=( pifa-tfidf-s0 pifa-neural-s0 text-emb-s0 ) 7 | MODEL_NAME_ARR=( bert-large-cased-whole-word-masking roberta-large xlnet-large-cased ) 8 | EXP_NAME=${DATASET}.final 9 | 10 | PRED_NPZ_PATHS="" 11 | for LABEL_NAME in "${LABEL_NAME_ARR[@]}"; do 12 | OUTPUT_DIR=pretrained_models/${DATASET}/${LABEL_NAME} 13 | INDEXER_DIR=${OUTPUT_DIR}/indexer 14 | for MODEL_NAME in "${MODEL_NAME_ARR[@]}"; do 15 | MATCHER_DIR=${OUTPUT_DIR}/matcher/${MODEL_NAME} 16 | RANKER_DIR=${OUTPUT_DIR}/ranker/${MODEL_NAME} 17 | mkdir -p ${RANKER_DIR} 18 | 19 | # train linear ranker 20 | python -m xbert.ranker train \ 21 | -x1 ${DATA_DIR}/X.trn.npz \ 22 | -x2 ${MATCHER_DIR}/trn_embeddings.npy \ 23 | -y datasets/${DATASET}/Y.trn.npz \ 24 | -z ${MATCHER_DIR}/C_trn_pred.npz \ 25 | -c ${OUTPUT_DIR}/indexer/code.npz \ 26 | -o ${RANKER_DIR} -t 0.01 \ 27 | -f 0 -ns 0 --mode ranker \ 28 | 29 | # predict final label ranking, using transformer's predicted cluster scores 30 | PRED_NPZ_PATH=${RANKER_DIR}/tst.pred.npz 31 | python -m xbert.ranker predict \ 32 | -m ${RANKER_DIR} -o ${PRED_NPZ_PATH} \ 33 | 
-x1 datasets/${DATASET}/X.tst.npz \ 34 | -x2 ${MATCHER_DIR}/tst_embeddings.npy \ 35 | -y datasets/${DATASET}/Y.tst.npz \ 36 | -z ${MATCHER_DIR}/C_tst_pred.npz \ 37 | -f 0 -t noop 38 | 39 | # append all prediction path 40 | PRED_NPZ_PATHS="${PRED_NPZ_PATHS} ${PRED_NPZ_PATH}" 41 | done 42 | done 43 | 44 | # final eval 45 | EVAL_DIR=results_transformer 46 | mkdir -p ${EVAL_DIR} 47 | python -u -m xbert.evaluator \ 48 | -y datasets/${DATASET}/Y.tst.npz \ 49 | -e -p ${PRED_NPZ_PATHS} |& tee ${EVAL_DIR}/${EXP_NAME}.txt 50 | 51 | -------------------------------------------------------------------------------- /pretrained_models/download-models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset=$1 4 | 5 | function gdrive-get() { 6 | fileid=$1 7 | filename=$2 8 | if [[ "${fileid}" == "" || "${filename}" == "" ]]; then 9 | echo "gdrive-curl gdrive-url|gdrive-fileid filename" 10 | return 1 11 | else 12 | if [[ ${fileid} = http* ]]; then 13 | fileid=$(echo ${fileid} | sed "s/http.*drive.google.com.*id=\([^&]*\).*/\1/") 14 | fi 15 | echo "Download ${filename} from google drive with id ${fileid}..." 16 | cookie="/tmp/cookies.txt" 17 | curl -c ${cookie} -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 18 | confirmid=$(awk '/download/ {print $NF}' ${cookie}) 19 | curl -Lb ${cookie} "https://drive.google.com/uc?export=download&confirm=${confirmid}&id=${fileid}" -o ${filename} 20 | rm -rf ${cookie} 21 | return 0 22 | fi 23 | } 24 | 25 | mkdir -p ${dataset} 26 | cd ${dataset} 27 | 28 | 29 | 30 | if [ ${dataset} == 'Eurlex-4K' ]; then 31 | gdrive-get 1ZplgIYqJavtAJNaqnyvaE77IZZbCrXJw indexer.tar.gz 32 | gdrive-get 1Kv8_62RU1gfiAULCPvkMY5kk1ttTxQzc proc_data.tar.gz 33 | gdrive-get 1sxiwozg5hCDc28BuNyYt8R3Vm_TbIJXI pifa-tfidf-s0.bert.tar.gz 34 | gdrive-get 19SO8koS8mCapG0wM3dF_ffVZeMpKMuCJ pifa-tfidf-s0.roberta.tar.gz 35 | gdrive-get 1R8gXkvG8lwFawJYHthNZn3kX9XyzC82g pifa-tfidf-s0.xlnet.tar.gz 36 | gdrive-get 1cZ4JtRuCTucvrjwLBjFADfosc2ugxrAG pifa-neural-s0.bert.tar.gz 37 | gdrive-get 1gTJoNtJc6VdY9B2pbyLANr3y3QWrn_4J pifa-neural-s0.roberta.tar.gz 38 | gdrive-get 181G_Oi51Cqp9-aGKhtcp6r6VaHpltNSM pifa-neural-s0.xlnet.tar.gz 39 | gdrive-get 1TD3fvN0YLuJwl4t2Tgl1Ms7ViJaU1Faj text-emb-s0.bert.tar.gz 40 | gdrive-get 1i-gyrDglBwjgpbigITz-0C7uEcO1EPQ9 text-emb-s0.roberta.tar.gz 41 | gdrive-get 14OrxieNLivGFK-rFEmubsb8JEh5kzQjY text-emb-s0.xlnet.tar.gz 42 | 43 | elif [ ${dataset} == 'Wiki10-31K' ]; then 44 | gdrive-get 1cQvX5ayDGwgGc4hCM9XHAVRISWaSknFC indexer.tar.gz 45 | gdrive-get 1A87V4CaY-PwuiqrB61z5_2WWpFWFt_db proc_data.tar.gz 46 | gdrive-get 1wujzxiUePpEo0hoZYc1QqczFzDngR22y pifa-tfidf-s0.bert.tar.gz 47 | gdrive-get 1uYHml0FlvXfWy8wAy7fiZQGPlPQkPThE pifa-tfidf-s0.roberta.tar.gz 48 | gdrive-get 1-GBDHry_ThSWN_FYJe4swSvuBOmFBYaJ pifa-tfidf-s0.xlnet.tar.gz 49 | gdrive-get 1_MnCXY3DiS2Pi8OCgPAxt98LyEpUGsMR pifa-neural-s0.bert.tar.gz 50 | gdrive-get 1TXEg37XNtNFa8DbB6YwRy9qgfW9Od1x5 pifa-neural-s0.roberta.tar.gz 51 | gdrive-get 1_pZVYYT_8H2sf1N0PEd5kxgzskMX43U9 pifa-neural-s0.xlnet.tar.gz 52 | gdrive-get 1GecTrAlwAvFZl5cUIxy388x49hTO8WAL text-emb-s0.bert.tar.gz 53 | gdrive-get 1_L3AMw5uGadSTVyAbWAD25SuXM9F3AgZ text-emb-s0.roberta.tar.gz 54 | gdrive-get 1ZbXP-wTsEhlcl67XsRgCTXgg5bVuBJI6 text-emb-s0.xlnet.tar.gz 55 | 56 | elif [ ${dataset} == 'AmazonCat-13K' ]; then 57 | gdrive-get 15h1l05M3zxyjQQQQLz7mNoll5SmTl9B_ indexer.tar.gz 58 | gdrive-get 1VN39DazmTb3GvdO5qDNp15BEhd40c4HY proc_data.tar.gz 59 | gdrive-get 
1cUfCRcgcoeu-DW6r1D22D88sUx_5MNU6 pifa-tfidf-s0.bert.tar.gz 60 | gdrive-get 1hNcG-URVZ6Dx3z5LNwOJ1xq-Pgx-4_Cs pifa-tfidf-s0.roberta.tar.gz 61 | gdrive-get 1wK0wTrrazAAaYguI0QKPhfDGBap-bO9q pifa-tfidf-s0.xlnet.tar.gz 62 | gdrive-get 123NxkH9Sw0IDpEKdHgjnfWbniTEE96n7 pifa-neural-s0.bert.tar.gz 63 | gdrive-get 1MWcIPlLlPeIPQ7IYS64-b0G56OW3hxFk pifa-neural-s0.roberta.tar.gz 64 | gdrive-get 1zkRWN-IVUeF2wImrroH0s1dPPOlnIqNa pifa-neural-s0.xlnet.tar.gz 65 | gdrive-get 1fbgGIOlYF4lWLqr86SC72eLcAvYb9sMw text-emb-s0.bert.tar.gz 66 | gdrive-get 1VnFGL15WbyyqAEOgHbPlBRWAcmCz-L50 text-emb-s0.roberta.tar.gz 67 | gdrive-get 1vnbL1wUGiYeLfZ-y5w1q9nljn3sGUlxD text-emb-s0.xlnet.tar.gz 68 | 69 | elif [ ${dataset} == 'Wiki-500K' ]; then 70 | gdrive-get 18KhHUCijtGb71Kx7vyPjwPjpdGeRcbpV indexer.tar.gz 71 | gdrive-get 1cR4yHaeVaGNK4HVhxb4h09XxlI2EU_Y- proc_data.tar.gz 72 | gdrive-get 1uAyd-Mp1IG8SNeJveVJ0tqir8dIhqU6D pifa-tfidf-s0.bert.tar.gz 73 | gdrive-get 1na32fqXzVk2sXNc1D7ZwgshZr45vUYyv pifa-tfidf-s0.roberta.tar.gz 74 | gdrive-get 1AtHrq4nEkGIjvTZRenOkPhuBSPRo6FOG pifa-tfidf-s0.xlnet.tar.gz 75 | gdrive-get 1NRM_Uy-83xSz5feeXLJhUqm7cp4LOptH pifa-neural-s0.bert.tar.gz 76 | gdrive-get 1Rd-bN6Q0grgv_bCVLtmhDaKQ-pEW9unw pifa-neural-s0.roberta.tar.gz 77 | gdrive-get 1fDgY0ejFBcS6EWTvJ8M_m0ugLO59SsKq pifa-neural-s0.xlnet.tar.gz 78 | gdrive-get 1MxHxYfT5WJUA8Nf_KckpQrN0MDFDHDo- text-emb-s0.bert.tar.gz 79 | gdrive-get 1cepVBs0hdNlTYXvvTuYHrqEGGwYsmzAm text-emb-s0.roberta.tar.gz 80 | gdrive-get 1HbI1qTVYewXsSXE8Yl21h8NGDgLkNn3a text-emb-s0.xlnet.tar.gz 81 | 82 | else 83 | echo "unknown dataset [ Eurlex-4K | Wiki10-31K | AmazonCat-13K | Wiki-500K ]" 84 | exit 85 | fi 86 | 87 | 88 | tar -xzvf indexer.tar.gz 89 | tar -xzvf proc_data.tar.gz 90 | 91 | label_emb_arr=( pifa-tfidf pifa-neural text-emb ) 92 | model_type_arr=( bert roberta xlnet ) 93 | for label_emb in "${label_emb_arr[@]}"; do 94 | for model_type in "${model_type_arr[@]}"; do 95 | tar -xzvf ${label_emb}-s0.${model_type}.tar.gz 96 | done 97 | done 98 | 99 | -------------------------------------------------------------------------------- /results_linear/Eurlex-4K.v0.txt: -------------------------------------------------------------------------------- 1 | Namespace(ensemble=True, input_inst_label='datasets/Eurlex-4K/Y.tst.npz', pred_path=['pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/linear-v0/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-tfidf-s1/ranker/linear-v0/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-tfidf-s2/ranker/linear-v0/tst.pred.npz']) 2 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/linear-v0/tst.pred.npz 3 | prec = 81.14 74.70 68.54 62.81 57.37 52.36 47.77 43.62 40.00 37.07 4 | recall = 16.46 29.88 40.71 49.28 55.81 60.77 64.40 67.06 69.02 70.93 5 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s1/ranker/linear-v0/tst.pred.npz 6 | prec = 81.14 74.28 68.18 62.55 57.22 52.07 47.49 43.54 40.13 37.25 7 | recall = 16.45 29.76 40.58 49.12 55.64 60.36 63.95 66.85 69.17 71.24 8 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s2/ranker/linear-v0/tst.pred.npz 9 | prec = 80.93 74.70 68.15 62.65 57.21 52.12 47.49 43.42 39.94 37.10 10 | recall = 16.36 29.90 40.50 49.17 55.65 60.42 64.01 66.70 68.83 70.94 11 | ==== Evaluations of Ensembles of All Predictions ==== 12 | ens: average 13 | prec = 82.02 75.51 69.22 63.40 57.86 52.80 48.16 44.22 40.67 37.57 14 | recall = 16.62 30.24 41.16 49.75 56.29 61.27 64.94 67.96 70.15 71.85 15 | ens: rank_average 16 | prec = 81.71 75.33 69.15 63.67 58.11 52.98 48.25 
44.15 40.59 37.56 17 | recall = 16.54 30.20 41.12 49.95 56.55 61.46 65.05 67.84 69.99 71.81 18 | ens: round_robin 19 | prec = 81.14 74.41 68.28 62.79 57.51 52.47 47.92 43.91 40.42 37.38 20 | recall = 16.46 29.75 40.61 49.30 55.95 60.90 64.57 67.46 69.71 71.52 21 | -------------------------------------------------------------------------------- /results_linear/Eurlex-4K.v1.txt: -------------------------------------------------------------------------------- 1 | Namespace(ensemble=True, input_inst_label='datasets/Eurlex-4K/Y.tst.npz', pred_path=['pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/linear-v1/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-tfidf-s1/ranker/linear-v1/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-tfidf-s2/ranker/linear-v1/tst.pred.npz']) 2 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/linear-v1/tst.pred.npz 3 | prec = 83.65 77.31 71.25 65.53 59.63 54.13 49.26 44.97 41.20 37.99 4 | recall = 16.94 31.05 42.39 51.48 58.01 62.78 66.35 69.09 71.07 72.68 5 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s1/ranker/linear-v1/tst.pred.npz 6 | prec = 83.05 77.37 70.71 65.07 59.38 53.89 49.15 44.87 41.18 37.98 7 | recall = 16.80 30.98 42.09 51.08 57.72 62.45 66.18 68.81 70.96 72.60 8 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s2/ranker/linear-v1/tst.pred.npz 9 | prec = 82.87 76.53 70.89 65.27 59.40 54.14 49.19 44.91 41.23 38.06 10 | recall = 16.77 30.64 42.14 51.22 57.76 62.76 66.28 68.94 71.08 72.80 11 | ==== Evaluations of Ensembles of All Predictions ==== 12 | ens: average 13 | prec = 84.06 77.81 71.57 65.80 59.79 54.45 49.65 45.43 41.73 38.53 14 | recall = 17.02 31.22 42.58 51.65 58.15 63.13 66.86 69.69 71.89 73.64 15 | ens: rank_average 16 | prec = 84.09 77.90 71.50 65.98 60.12 54.76 49.90 45.55 41.71 38.45 17 | recall = 17.02 31.24 42.59 51.81 58.46 63.49 67.20 69.89 71.84 73.51 18 | ens: round_robin 19 | prec = 83.65 77.32 71.19 65.45 59.70 54.40 49.59 45.28 41.49 38.35 20 | recall = 16.94 31.00 42.33 51.42 58.07 63.05 66.78 69.47 71.52 73.34 21 | -------------------------------------------------------------------------------- /results_linear/Wiki10-31K.v0.txt: -------------------------------------------------------------------------------- 1 | Namespace(ensemble=True, input_inst_label='datasets/Wiki10-31K/Y.tst.npz', pred_path=['pretrained_models/Wiki10-31K/pifa-tfidf-s0/ranker/linear-v0/tst.pred.npz', 'pretrained_models/Wiki10-31K/pifa-tfidf-s1/ranker/linear-v0/tst.pred.npz', 'pretrained_models/Wiki10-31K/pifa-tfidf-s2/ranker/linear-v0/tst.pred.npz']) 2 | ==== Evaluation on pretrained_models/Wiki10-31K/pifa-tfidf-s0/ranker/linear-v0/tst.pred.npz 3 | prec = 83.96 78.01 72.21 67.14 62.78 59.18 55.78 52.88 50.22 47.79 4 | recall = 4.96 9.12 12.54 15.42 17.90 20.16 22.05 23.80 25.32 26.68 5 | ==== Evaluation on pretrained_models/Wiki10-31K/pifa-tfidf-s1/ranker/linear-v0/tst.pred.npz 6 | prec = 84.01 78.69 72.78 67.87 63.42 59.59 56.26 53.00 50.22 47.65 7 | recall = 4.96 9.23 12.65 15.61 18.13 20.31 22.24 23.84 25.30 26.58 8 | ==== Evaluation on pretrained_models/Wiki10-31K/pifa-tfidf-s2/ranker/linear-v0/tst.pred.npz 9 | prec = 84.22 78.02 72.08 67.16 62.83 59.15 56.08 53.14 50.44 47.94 10 | recall = 4.96 9.14 12.53 15.43 17.94 20.17 22.18 23.92 25.45 26.77 11 | ==== Evaluations of Ensembles of All Predictions ==== 12 | ens: average 13 | prec = 84.55 79.04 73.13 67.78 63.34 59.61 56.38 53.66 51.17 48.91 14 | recall = 4.99 9.27 12.72 15.59 18.13 20.36 22.33 24.15 25.78 27.27 15 | ens: rank_average 16 | prec = 84.25 
79.06 73.20 67.88 63.66 59.96 56.70 53.84 51.30 48.93 17 | recall = 4.97 9.27 12.73 15.61 18.21 20.45 22.44 24.23 25.85 27.28 18 | ens: round_robin 19 | prec = 83.96 78.29 72.56 67.46 63.24 59.53 56.37 53.38 50.92 48.72 20 | recall = 4.96 9.16 12.61 15.49 18.04 20.25 22.29 24.00 25.63 27.16 21 | -------------------------------------------------------------------------------- /results_linear/Wiki10-31K.v1.txt: -------------------------------------------------------------------------------- 1 | Namespace(ensemble=True, input_inst_label='datasets/Wiki10-31K/Y.tst.npz', pred_path=['pretrained_models/Wiki10-31K/pifa-tfidf-s0/ranker/linear-v1/tst.pred.npz', 'pretrained_models/Wiki10-31K/pifa-tfidf-s1/ranker/linear-v1/tst.pred.npz', 'pretrained_models/Wiki10-31K/pifa-tfidf-s2/ranker/linear-v1/tst.pred.npz']) 2 | ==== Evaluation on pretrained_models/Wiki10-31K/pifa-tfidf-s0/ranker/linear-v1/tst.pred.npz 3 | prec = 87.39 83.03 77.67 72.59 67.78 63.62 60.28 57.20 54.36 51.78 4 | recall = 5.21 9.78 13.59 16.78 19.42 21.71 23.85 25.74 27.41 28.90 5 | ==== Evaluation on pretrained_models/Wiki10-31K/pifa-tfidf-s1/ranker/linear-v1/tst.pred.npz 6 | prec = 86.94 82.82 77.69 72.94 68.19 63.94 60.30 57.24 54.30 51.61 7 | recall = 5.18 9.76 13.57 16.86 19.56 21.84 23.86 25.74 27.35 28.80 8 | ==== Evaluation on pretrained_models/Wiki10-31K/pifa-tfidf-s2/ranker/linear-v1/tst.pred.npz 9 | prec = 87.11 82.72 77.45 72.16 67.81 63.90 60.44 57.38 54.59 51.95 10 | recall = 5.20 9.76 13.57 16.68 19.46 21.87 23.98 25.90 27.61 29.09 11 | ==== Evaluations of Ensembles of All Predictions ==== 12 | ens: average 13 | prec = 87.48 83.25 78.22 73.02 68.36 64.20 60.79 57.82 55.18 52.67 14 | recall = 5.22 9.81 13.70 16.88 19.63 21.95 24.10 26.05 27.84 29.39 15 | ens: rank_average 16 | prec = 87.35 83.15 78.24 73.06 68.62 64.57 61.00 58.04 55.22 52.63 17 | recall = 5.22 9.80 13.70 16.88 19.68 22.07 24.18 26.17 27.86 29.37 18 | ens: round_robin 19 | prec = 87.39 83.00 77.89 72.69 68.17 64.03 60.61 57.56 54.85 52.40 20 | recall = 5.21 9.77 13.63 16.79 19.55 21.86 24.00 25.90 27.64 29.24 21 | -------------------------------------------------------------------------------- /results_transformer/Eurlex-4K.final.txt: -------------------------------------------------------------------------------- 1 | Namespace(ensemble=True, input_inst_label='datasets/Eurlex-4K/Y.tst.npz', pred_path=['pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/bert-large-cased-whole-word-masking/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/roberta-large/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/xlnet-large-cased/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-neural-s0/ranker/bert-large-cased-whole-word-masking/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-neural-s0/ranker/roberta-large/tst.pred.npz', 'pretrained_models/Eurlex-4K/pifa-neural-s0/ranker/xlnet-large-cased/tst.pred.npz', 'pretrained_models/Eurlex-4K/text-emb-s0/ranker/bert-large-cased-whole-word-masking/tst.pred.npz', 'pretrained_models/Eurlex-4K/text-emb-s0/ranker/roberta-large/tst.pred.npz', 'pretrained_models/Eurlex-4K/text-emb-s0/ranker/xlnet-large-cased/tst.pred.npz']) 2 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/bert-large-cased-whole-word-masking/tst.pred.npz 3 | prec = 85.46 79.17 72.87 66.68 60.78 55.28 50.08 45.66 41.84 38.54 4 | recall = 17.31 31.82 43.42 52.39 59.18 64.12 67.48 70.07 72.07 73.67 5 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/roberta-large/tst.pred.npz 6 | prec = 85.36 79.08 
72.85 66.73 60.84 55.38 50.26 45.79 41.97 38.73 7 | recall = 17.33 31.78 43.39 52.46 59.21 64.23 67.72 70.31 72.36 74.03 8 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-tfidf-s0/ranker/xlnet-large-cased/tst.pred.npz 9 | prec = 84.99 78.99 72.82 66.71 60.76 55.27 50.06 45.54 41.77 38.43 10 | recall = 17.23 31.71 43.33 52.37 59.06 64.04 67.41 69.89 71.97 73.42 11 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-neural-s0/ranker/bert-large-cased-whole-word-masking/tst.pred.npz 12 | prec = 84.92 78.58 72.46 66.64 60.80 55.06 49.98 45.43 41.64 38.29 13 | recall = 17.19 31.50 43.13 52.29 59.12 63.81 67.31 69.72 71.77 73.19 14 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-neural-s0/ranker/roberta-large/tst.pred.npz 15 | prec = 84.79 78.94 72.67 66.84 60.83 55.34 50.28 45.72 41.85 38.57 16 | recall = 17.19 31.69 43.28 52.51 59.21 64.21 67.71 70.17 72.11 73.68 17 | ==== Evaluation on pretrained_models/Eurlex-4K/pifa-neural-s0/ranker/xlnet-large-cased/tst.pred.npz 18 | prec = 84.86 78.55 72.43 66.30 60.36 54.80 49.63 45.31 41.51 38.23 19 | recall = 17.20 31.47 43.01 52.01 58.67 63.52 66.81 69.45 71.43 72.98 20 | ==== Evaluation on pretrained_models/Eurlex-4K/text-emb-s0/ranker/bert-large-cased-whole-word-masking/tst.pred.npz 21 | prec = 85.43 78.87 72.89 66.93 60.36 54.50 49.14 44.43 40.53 37.17 22 | recall = 17.34 31.67 43.41 52.53 58.71 63.18 66.18 68.18 69.84 71.07 23 | ==== Evaluation on pretrained_models/Eurlex-4K/text-emb-s0/ranker/roberta-large/tst.pred.npz 24 | prec = 85.49 79.30 73.26 67.25 61.08 55.12 49.67 44.99 41.01 37.59 25 | recall = 17.34 31.79 43.61 52.79 59.41 63.93 66.95 69.12 70.75 71.93 26 | ==== Evaluation on pretrained_models/Eurlex-4K/text-emb-s0/ranker/xlnet-large-cased/tst.pred.npz 27 | prec = 85.30 78.68 72.77 66.59 60.28 54.33 49.03 44.38 40.41 37.03 28 | recall = 17.30 31.58 43.27 52.24 58.62 63.00 66.05 68.16 69.70 70.86 29 | ==== Evaluations of Ensembles of All Predictions ==== 30 | ens: average 31 | prec = 86.96 81.23 75.20 69.15 62.89 57.05 51.91 47.37 43.40 40.01 32 | recall = 17.66 32.67 44.81 54.31 61.15 66.12 69.87 72.66 74.73 76.40 33 | ens: rank_average 34 | prec = 87.22 81.24 75.12 68.87 62.90 57.05 51.94 47.38 43.47 40.04 35 | recall = 17.69 32.64 44.73 54.09 61.17 66.12 69.94 72.70 74.85 76.45 36 | ens: round_robin 37 | prec = 85.46 78.98 72.90 67.01 61.02 55.76 50.82 46.47 42.71 39.46 38 | recall = 17.31 31.73 43.40 52.68 59.43 64.73 68.52 71.38 73.60 75.40 39 | -------------------------------------------------------------------------------- /run_preprocess_feat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASET=$1 4 | MODEL_TYPE=$2 5 | MAX_XSEQ_LEN=$3 6 | 7 | # HuggingFace pretrained model preprocess 8 | if [ $MODEL_TYPE == "bert" ]; then 9 | MODEL_NAME="bert-large-cased-whole-word-masking" 10 | elif [ $MODEL_TYPE == "roberta" ]; then 11 | MODEL_NAME="roberta-large" 12 | elif [ $MODEL_TYPE == 'xlnet' ]; then 13 | MODEL_NAME="xlnet-large-cased" 14 | else 15 | echo "Unknown MODEL_NAME!" 
16 | exit 17 | fi 18 | 19 | 20 | OUTPUT_DIR=save_models/${DATASET} 21 | PROC_DATA_DIR=${OUTPUT_DIR}/proc_data 22 | mkdir -p ${PROC_DATA_DIR} 23 | python -u -m xbert.preprocess \ 24 | --do_proc_feat \ 25 | -i ./datasets/${DATASET} \ 26 | -o ${PROC_DATA_DIR} \ 27 | -m ${MODEL_TYPE} \ 28 | -n ${MODEL_NAME} \ 29 | --max_xseq_len ${MAX_XSEQ_LEN} \ 30 | |& tee ${PROC_DATA_DIR}/log.${MODEL_TYPE}.${MAX_XSEQ_LEN}.txt 31 | -------------------------------------------------------------------------------- /run_preprocess_label.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASET=$1 4 | LABEL_EMB=$2 # pifa-tfidf | pifa-neural | text-emb 5 | 6 | 7 | # setup label embedding feature path 8 | # overwrite it if necessary 9 | DATA_DIR=datasets 10 | if [ ${LABEL_EMB} == 'pifa-tfidf' ]; then 11 | label_emb_inst_path=${DATA_DIR}/${DATASET}/X.trn.npz 12 | elif [ ${LABEL_EMB} == 'pifa-neural' ]; then 13 | label_emb_inst_path=${DATA_DIR}/${DATASET}/X.trn.finetune.xlnet.npy 14 | elif [ ${LABEL_EMB} == 'text-emb' ]; then 15 | label_emb_inst_path=${DATA_DIR}/${DATASET}/X.trn.npz 16 | fi 17 | 18 | 19 | # construct label embedding 20 | OUTPUT_DIR=save_models/${DATASET} 21 | PROC_DATA_DIR=${OUTPUT_DIR}/proc_data 22 | mkdir -p ${PROC_DATA_DIR} 23 | : " 24 | python -u -m xbert.preprocess \ 25 | --do_label_embedding \ 26 | -i ${DATA_DIR}/${DATASET} \ 27 | -o ${PROC_DATA_DIR} \ 28 | -l ${LABEL_EMB} \ 29 | -x ${label_emb_inst_path} 30 | 31 | 32 | # semantic label indexing 33 | SEED_LIST=( 0 1 2 ) 34 | for SEED in "${SEED_LIST[@]}"; do 35 | LABEL_EMB_NAME=${LABEL_EMB}-s${SEED} 36 | INDEXER_DIR=${OUTPUT_DIR}/${LABEL_EMB_NAME}/indexer 37 | python -u -m xbert.indexer \ 38 | -i ${PROC_DATA_DIR}/L.${LABEL_EMB}.npz \ 39 | -o ${INDEXER_DIR} --seed ${SEED} 40 | done 41 | " 42 | 43 | # construct C.[trn|tst].[label-emb].npz for training matcher 44 | SEED=0 45 | LABEL_EMB_NAME=${LABEL_EMB}-s${SEED} 46 | INDEXER_DIR=${OUTPUT_DIR}/${LABEL_EMB_NAME}/indexer 47 | python -u -m xbert.preprocess \ 48 | --do_proc_label \ 49 | -i ${DATA_DIR}/${DATASET} \ 50 | -o ${PROC_DATA_DIR} \ 51 | -l ${LABEL_EMB_NAME} \ 52 | -c ${INDEXER_DIR}/code.npz 53 | 54 | #### end #### 55 | 56 | -------------------------------------------------------------------------------- /run_transformer_predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASET=$1 4 | DATA_DIR=./datasets/${DATASET} 5 | 6 | LABEL_NAME_ARR=( pifa-tfidf-s0 pifa-neural-s0 text-emb-s0 ) 7 | MODEL_NAME_ARR=( bert-large-cased-whole-word-masking roberta-large xlnet-large-cased ) 8 | EXP_NAME=${DATASET}.final 9 | 10 | PRED_NPZ_PATHS="" 11 | for LABEL_NAME in "${LABEL_NAME_ARR[@]}"; do 12 | OUTPUT_DIR=save_models/${DATASET}/${LABEL_NAME} 13 | INDEXER_DIR=${OUTPUT_DIR}/indexer 14 | for MODEL_NAME in "${MODEL_NAME_ARR[@]}"; do 15 | MATCHER_DIR=${OUTPUT_DIR}/matcher/${MODEL_NAME} 16 | RANKER_DIR=${OUTPUT_DIR}/ranker/${MODEL_NAME} 17 | mkdir -p ${RANKER_DIR} 18 | 19 | # train linear ranker 20 | python -m xbert.ranker train \ 21 | -x1 ${DATA_DIR}/X.trn.npz \ 22 | -x2 ${MATCHER_DIR}/trn_embeddings.npy \ 23 | -y ${DATA_DIR}/Y.trn.npz \ 24 | -z ${MATCHER_DIR}/C_trn_pred.npz \ 25 | -c ${INDEXER_DIR}/code.npz \ 26 | -o ${RANKER_DIR} -t 0.01 \ 27 | -f 0 -ns 2 --mode ranker 28 | 29 | # predict final label ranking 30 | PRED_NPZ_PATH=${RANKER_DIR}/tst.pred.npz 31 | python -m xbert.ranker predict \ 32 | -m ${RANKER_DIR} -o ${PRED_NPZ_PATH} \ 33 | -x1 ${DATA_DIR}/X.tst.npz \ 34 | -x2 
${MATCHER_DIR}/tst_embeddings.npy \ 35 | -y ${DATA_DIR}/Y.tst.npz \ 36 | -z ${MATCHER_DIR}/C_tst_pred.npz \ 37 | -f 0 -t noop 38 | 39 | # append all prediction path 40 | PRED_NPZ_PATHS="${PRED_NPZ_PATHS} ${PRED_NPZ_PATH}" 41 | done 42 | done 43 | 44 | # final eval 45 | EVAL_DIR=results_transformer-large 46 | mkdir -p ${EVAL_DIR} 47 | python -u -m xbert.evaluator \ 48 | -y ${DATA_DIR}/Y.tst.npz \ 49 | -e -p ${PRED_NPZ_PATHS} \ 50 | |& tee ${EVAL_DIR}/${EXP_NAME}.txt 51 | 52 | -------------------------------------------------------------------------------- /run_transformer_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPID=$1 4 | DATASET=$2 5 | MODEL_TYPE=$3 6 | INDEXER_NAME=$4 # pifa-tfidf-s0 ||| pifa-neural-s0 ||| text-emb-s0 7 | if [ ${MODEL_TYPE} == "bert" ]; then 8 | MODEL_NAME=bert-large-cased-whole-word-masking 9 | elif [ ${MODEL_TYPE} == "roberta" ]; then 10 | MODEL_NAME=roberta-large 11 | elif [ ${MODEL_TYPE} == "xlnet" ]; then 12 | MODEL_NAME=xlnet-large-cased 13 | else 14 | echo "unknown MODEL_TYPE! [ bert | robeta | xlnet ]" 15 | exit 16 | fi 17 | OUTPUT_DIR=save_models/${DATASET} 18 | PROC_DATA_DIR=${OUTPUT_DIR}/proc_data 19 | MAX_XSEQ_LEN=128 20 | 21 | # Nvidia 2080Ti (11Gb), fp32 22 | PER_DEVICE_TRN_BSZ=8 23 | PER_DEVICE_VAL_BSZ=16 24 | GRAD_ACCU_STEPS=4 25 | 26 | # Nvidia V100 (16Gb), fp32 27 | PER_DEVICE_TRN_BSZ=16 28 | PER_DEVICE_VAL_BSZ=32 29 | GRAD_ACCU_STEPS=2 30 | 31 | # set hyper-params by dataset 32 | if [ ${DATASET} == "Eurlex-4K" ]; then 33 | MAX_STEPS=1000 34 | WARMUP_STEPS=100 35 | LOGGING_STEPS=50 36 | LEARNING_RATE=5e-5 37 | elif [ ${DATASET} == "Wiki10-31K" ]; then 38 | MAX_STEPS=1400 39 | WARMUP_STEPS=100 40 | LOGGING_STEPS=50 41 | LEARNING_RATE=5e-5 42 | elif [ ${DATASET} == "AmazonCat-13K" ]; then 43 | MAX_STEPS=20000 44 | WARMUP_STEPS=2000 45 | LOGGING_STEPS=100 46 | LEARNING_RATE=8e-5 47 | elif [ ${DATASET} == "Wiki-500K" ]; then 48 | MAX_STEPS=80000 49 | WARMUP_STEPS=1000 50 | LOGGING_STEPS=100 51 | LEARNING_RATE=6e-5 # users may need to tune this LEARNING_RATE={2e-5,4e-5,6e-5,8e-5} depending on their CUDA/Pytorch environments 52 | else 53 | echo "dataset not support [ Eurlex-4K | Wiki10-31K | AmazonCat-13K | Wiki-500K ]" 54 | exit 55 | fi 56 | 57 | MODEL_DIR=${OUTPUT_DIR}/${INDEXER_NAME}/matcher/${MODEL_NAME} 58 | mkdir -p ${MODEL_DIR} 59 | 60 | 61 | # train 62 | CUDA_VISIBLE_DEVICES=${GPID} python -m torch.distributed.launch \ 63 | --nproc_per_node 8 xbert/transformer.py \ 64 | -m ${MODEL_TYPE} -n ${MODEL_NAME} --do_train \ 65 | -x_trn ${PROC_DATA_DIR}/X.trn.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl \ 66 | -c_trn ${PROC_DATA_DIR}/C.trn.${INDEXER_NAME}.npz \ 67 | -o ${MODEL_DIR} --overwrite_output_dir \ 68 | --per_device_train_batch_size ${PER_DEVICE_TRN_BSZ} \ 69 | --gradient_accumulation_steps ${GRAD_ACCU_STEPS} \ 70 | --max_steps ${MAX_STEPS} \ 71 | --warmup_steps ${WARMUP_STEPS} \ 72 | --learning_rate ${LEARNING_RATE} \ 73 | --logging_steps ${LOGGING_STEPS} \ 74 | |& tee ${MODEL_DIR}/log.txt 75 | 76 | 77 | # predict 78 | CUDA_VISIBLE_DEVICES=${GPID} python -u xbert/transformer.py \ 79 | -m ${MODEL_TYPE} -n ${MODEL_NAME} \ 80 | --do_eval -o ${MODEL_DIR} \ 81 | -x_trn ${PROC_DATA_DIR}/X.trn.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl \ 82 | -c_trn ${PROC_DATA_DIR}/C.trn.${INDEXER_NAME}.npz \ 83 | -x_tst ${PROC_DATA_DIR}/X.tst.${MODEL_TYPE}.${MAX_XSEQ_LEN}.pkl \ 84 | -c_tst ${PROC_DATA_DIR}/C.tst.${INDEXER_NAME}.npz \ 85 | --per_device_eval_batch_size ${PER_DEVICE_VAL_BSZ} 86 | 87 | #### end #### 88 | 
89 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os 4 | from os import path 5 | from shutil import copytree, rmtree, ignore_patterns 6 | 7 | try : 8 | from setuptools import setup, Extension 9 | except : 10 | from distutils.core import setup, Extension 11 | 12 | # a technique to build a shared library on windows 13 | from distutils.command.build_ext import build_ext 14 | build_ext.get_export_symbols = lambda x,y: [] 15 | 16 | def get_blas_link_args(blas='lapack_opt'): 17 | import numpy.distutils.system_info as info 18 | dirs = info.get_info(blas)['library_dirs'] 19 | libs = info.get_info(blas)['libraries'] 20 | libs_cmd = ['-l{}'.format(x) for x in libs] 21 | dirs_cmd = ['-L{}'.format(x) for x in dirs] 22 | rpath_cmd = ['-Wl,-rpath,{}'.format(':'.join(dirs))] 23 | blas_link_args = ['-fopenmp', '-Wl,--as-needed'] + rpath_cmd + libs_cmd + dirs_cmd + ['-liomp5'] 24 | if sys.platform.lower() == 'darwin': 25 | blas_link_args = rpath_cmd + ['-framework Accelerate', '-liomp5'] 26 | return blas_link_args 27 | 28 | source_codes = ["xbert/corelib/rf_linear.cpp"] 29 | headers = ["xbert/corelib/rf_matrix.h"] 30 | include_dirs = ["trmf/corelib"] 31 | libname = "xbert.corelib.rf_linear" 32 | blas_link_args = get_blas_link_args() 33 | 34 | if sys.platform == "win32": 35 | print('Not supported in Windows') 36 | sys.exit(-1) 37 | dynamic_lib = Extension('liblinear.liblinear_dynamic', source_codes, 38 | depends=headers, 39 | include_dirs=["src/"], 40 | define_macros=[("_WIN64",""), ("_CRT_SECURE_NO_DEPRECATE","")], 41 | language="c++", 42 | extra_link_args=["-DEF:src\linear.def"]) 43 | else : 44 | dynamic_lib_float32 = Extension('{}_float32'.format(libname), 45 | source_codes, 46 | depends=headers, 47 | include_dirs=include_dirs, 48 | define_macros=[("ValueType","float")], 49 | extra_compile_args=["-fopenmp", "-march=native", "-O3", "-std=c++11"], 50 | extra_link_args=blas_link_args, 51 | language="c++") 52 | 53 | dynamic_lib_float64 = Extension('{}_float64'.format(libname), 54 | source_codes, 55 | depends=headers, 56 | include_dirs=include_dirs, 57 | define_macros=[("ValueType","double")], 58 | extra_compile_args=["-fopenmp", "-march=native", "-O3", "-std=c++11"], 59 | extra_link_args=blas_link_args, 60 | language="c++") 61 | setup( 62 | name='xbert', 63 | packages=["xbert"], 64 | version='0.1', 65 | description='Experimental Codes for X-BERT paper', 66 | author='Wei-Cheng Chang', 67 | author_email='peter78789@gmail.com', 68 | ext_modules=[dynamic_lib_float32, dynamic_lib_float64], 69 | package_data={"xbert":["corelib/*.cpp", "corelib/*.h"]}, 70 | setup_requires=["mkl", "scipy", "numpy"], 71 | install_requires=["mkl", "scipy", "numpy"] 72 | ) 73 | -------------------------------------------------------------------------------- /xbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OctoberChang/X-Transformer/2902d59c38f7f60b49b328adc55cf20d77499dc3/xbert/__init__.py -------------------------------------------------------------------------------- /xbert/corelib/COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 The X-BERT Project. 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | 3. Neither name of copyright holders nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | ========================================================================= 32 | * The linear solver implementations in rf_linear.cpp are based on LIBLINEAR project 33 | * The implemetnations in rf_matrix.h are based on TRMF project 34 | -------------------------------------------------------------------------------- /xbert/corelib/COPYRIGHT.liblinear: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2007-2019 The LIBLINEAR Project. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | 3. Neither name of copyright holders nor the names of its contributors 17 | may be used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR 25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /xbert/corelib/COPYRIGHT.trmf: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-2019 The TRMF Project. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | 3. Neither name of copyright holders nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /xbert/corelib/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | #CXXFLAGS=-fopenmp -static -O3 3 | CXXFLAGS=-fopenmp -ffast-math -pipe -g -fPIC 4 | CXXFLAGS=-fopenmp -march=native -ffast-math -fPIC -g -std=c++11 5 | #CXXFLAGS=-fopenmp -march=native -ffast-math -O3 -std=c++11 6 | CXXFLAGS=-fopenmp -march=native -ffast-math -std=c++11 -O3 7 | CXXFLAGS=-fopenmp -march=native -O3 -std=c++11 -fPIC 8 | CXXFLAGS_DEBUG=-fopenmp -march=native -g -std=c++11 -fPIC -pg 9 | VERSION=0.1 10 | 11 | #CXXFLAGS+= -D_USE_FLOAT_ 12 | 13 | SCIPY_ROOT=$(shell python -c "from scipy import linalg as lib;print(lib.__file__.rsplit('/', 1)[0])") 14 | SCIPY_BLAS=$(shell python -c "from scipy.linalg import cython_blas as lib;print(lib.__file__)") 15 | SCIPY_LAPACK=$(shell python -c "from scipy.linalg import cython_lapack as lib;print(lib.__file__)") 16 | 17 | SCIPY_BLAS=$(shell python -W ignore -c "import numpy.distutils.system_info as C;print(' '.join('-l{}'.format(x) for x in C.get_info('lapack_opt')['libraries']))" | tail -n 1) 18 | SCIPY_BLAS_DIR=$(shell python -W ignore -c "import numpy.distutils.system_info as C;print(' '.join('-L{}'.format(x) for x in C.get_info('lapack_opt')['library_dirs']))" | tail -n 1) 19 | SCIPY_BLAS_RPATH=$(shell python -W ignore -c "import numpy.distutils.system_info as C;print('-Wl,-rpath,{}'.format(':'.join(C.get_info('lapack_opt')['library_dirs'])))" | tail -n 1) 20 | 21 | BLASFLAGS=-llapack_atlas -lf77blas -lcblas -latlas -lgfortran -L/u/rofuyu/.local/lib #GCC in UTCS 22 | BLASFLAGS=-llapack -lptf77blas -lptcblas -latlas -L/home/01845/rofuyu/.local/lib # GCC in TACC 23 | BLASFLAGS=-lblas -llapack 24 | BLASFLAGS=-lmklml_intel 25 | LIBSFLAGS=-L/home/rofuyu/anaconda3/lib -lmkl_intel_ilp64 -lmkl_gnu_thread -lmkl_core -lgomp 26 | #LIBSFLAGS=-L$(SCIPY_ROOT) $(SCIPY_BLAS) $(SCIPY_LAPACK) 27 | BLASFLAGS=${LIBSFLAGS} 28 | LIBSFLAGS=${SCIPY_BLAS_RPATH} ${SCIPY_BLAS_DIR} ${SCIPY_BLAS} 29 | 30 | OS=$(shell uname) 31 | 32 | all: lib go-linear 33 | 34 | rf_linear.o: rf_linear.cpp rf_matrix.h 35 | ${CXX} ${CXXFLAGS} -Wall -c -o rf_linear.o rf_linear.cpp 36 | 37 | go-linear: rf_linear.o 38 | ${CXX} ${CXXFLAGS_DEBUG} -DValueType=double -Wall -g -o go-linear rf_linear.o ${LIBSFLAGS} 39 | 40 | lib: rf_linear_so 41 | 42 | rf_linear_so: rf_linear.cpp rf_matrix.h 43 | SO=rf_linear; \ 44 | if [ "$(OS)" = "Darwin" ]; then \ 45 | SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,$${SO}"; \ 46 | BLAS_LIB_FLAG="-framework Accelerate";\ 47 | else \ 48 | SHARED_LIB_FLAG="-shared -Wl,-soname,$${SO}"; \ 49 | BLAS_LIB_FLAG=" -Wl,--as-needed ${SCIPY_BLAS_RPATH} ${SCIPY_BLAS_DIR} ${SCIPY_BLAS} -liomp5";\ 50 | fi; \ 51 | echo CXXFLAGs=${CXXFLAGS}; \ 52 | echo BLAS_LIB_FLAG=$${BLAS_LIB_FLAG}; \ 53 | ${CXX} ${CXXFLAGS} -D ValueType=float $${SHARED_LIB_FLAG} -o $${SO}_float32.so rf_linear.cpp $${BLAS_LIB_FLAG} ; \ 54 | ${CXX} ${CXXFLAGS} -D ValueType=double $${SHARED_LIB_FLAG} -o $${SO}_float64.so rf_linear.cpp $${BLAS_LIB_FLAG}; \ 55 | 56 | clean: 57 | rm -rf go-linear rf_linear*.so *.o 58 | 59 | -------------------------------------------------------------------------------- /xbert/evaluator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import argparse 5 | import os 6 | import pickle 7 | import scipy as sp 8 | import scipy.sparse as smat 9 | import xbert.rf_linear as rf_linear 10 | 11 | 
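# Usage sketch (illustrative invocation and file paths; the flags are the ones defined in
# the argparse block at the bottom of this script):
#
#   python -m xbert.evaluator \
#       -y datasets/Eurlex-4K/Y.tst.npz \
#       -p save_models/Eurlex-4K/ranker/tst.pred.npz save_models/Eurlex-4K/ranker2/tst.pred.npz \
#       -e
#
# Each prediction matrix is loaded from .npz, its scores are passed through a sigmoid, and
# the metrics from rf_linear.Metrics.generate are printed; with -e and more than one
# prediction file, the average / rank-average / round-robin ensembles are evaluated as well.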
12 | def print_ens(Y_true, Y_pred_list): 13 | for ens in [ 14 | rf_linear.CsrEnsembler.average, 15 | rf_linear.CsrEnsembler.rank_average, 16 | rf_linear.CsrEnsembler.round_robin, 17 | ]: 18 | print("ens: {}".format(ens.__name__)) 19 | print(rf_linear.Metrics.generate(Y_true, ens(*Y_pred_list))) 20 | 21 | 22 | def main(args): 23 | # loading test set 24 | Y_true = smat.load_npz(args.input_inst_label) 25 | Y_pred_list = [] 26 | for pred_path in args.pred_path: 27 | if not os.path.exists(pred_path): 28 | raise Warning("pred_path does not exists: {}".format(pred_path)) 29 | else: 30 | Y_pred = smat.load_npz(pred_path) 31 | Y_pred.data = rf_linear.Transform.sigmoid(Y_pred.data) 32 | 33 | Y_pred_list += [Y_pred] 34 | print("==== Evaluation on {}".format(pred_path)) 35 | print(rf_linear.Metrics.generate(Y_true, Y_pred)) 36 | if args.ensemble and len(Y_pred_list) > 1: 37 | print("==== Evaluations of Ensembles of All Predictions ====") 38 | for ens in [ 39 | rf_linear.CsrEnsembler.average, 40 | rf_linear.CsrEnsembler.rank_average, 41 | rf_linear.CsrEnsembler.round_robin, 42 | ]: 43 | print("ens: {}".format(ens.__name__)) 44 | print(rf_linear.Metrics.generate(Y_true, ens(*Y_pred_list))) 45 | 46 | 47 | if __name__ == "__main__": 48 | # parse argument 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( 51 | "-y", 52 | "--input-inst-label", 53 | type=str, 54 | required=True, 55 | help="path to the npz file of the truth label matrix (CSR) for computing metrics", 56 | ) 57 | parser.add_argument( 58 | "-e", "--ensemble", action="store_true", help="whether to perform ensemble evaluations as well", 59 | ) 60 | parser.add_argument( 61 | "-p", "--pred_path", nargs="+", help="path to the npz file of the sorted prediction (CSR)", 62 | ) 63 | args = parser.parse_args() 64 | print(args) 65 | main(args) 66 | -------------------------------------------------------------------------------- /xbert/indexer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import argparse 5 | import json 6 | import os 7 | from os import path 8 | import numpy as np 9 | import scipy as sp 10 | import scipy.sparse as smat 11 | import ctypes 12 | from ctypes import * 13 | from sklearn.preprocessing import normalize as sk_normalize 14 | 15 | from xbert.rf_util import PyMatrix, fillprototype, load_dynamic_library 16 | 17 | class RandomProject(object): 18 | 19 | """Encode and decode a label into a K-way D-dimensional code. 
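    For example, with K=2 and D=6 each label is mapped to one of 2**6 = 64 possible codes.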
20 | 21 | feat_mat: L by P matrix 22 | L: number of label 23 | P: label feature dimension 24 | K: range of the code = {0, 1, 2, ..., K-1} 25 | D: length of the codes for each label (number of hashing functions) 26 | """ 27 | 28 | def __init__(self, feat_mat, kdim, depth, algo, seed): 29 | self.feat_mat = feat_mat 30 | self.code_delim = " | " 31 | self.K, self.D = kdim, depth 32 | self.L, self.P = feat_mat.shape 33 | self.algo = algo 34 | self.random_matrix = np.random.randn(self.P, self.D) 35 | 36 | def get_codes(self): 37 | if self.algo == 2: # ordinal 38 | Z_quant = self.ordinal_quantization(self.feat_mat) 39 | elif self.algo == 3: # uniform 40 | Z_quant = self.uniform_quantization(self.feat_mat) 41 | else: 42 | raise NotImplementedError("unknown algo {}".format(self.algo)) 43 | self.hash_func = np.array([self.K ** d for d in range(0, self.D)]) 44 | return Z_quant.dot(self.hash_func) 45 | 46 | # Z: L by D projected label embedding 47 | # Z_quant: L by D quantized projected label embeddings 48 | # quantize it by ordinal ranking 49 | def ordinal_quantization(self, label_embedding): 50 | Z = label_embedding.dot(self.random_matrix) 51 | Z_argsort = np.argsort(Z, axis=0) 52 | bin_size = math.ceil(self.L * 1.0 / self.K) 53 | Z_quant = [] 54 | for d in range(self.D): 55 | rank = np.zeros(self.L, dtype=np.int64) 56 | rank[Z_argsort[:, d]] = np.arange(self.L) 57 | quantized_rows = (rank // bin_size).tolist() 58 | Z_quant.append(quantized_rows) 59 | Z_quant = np.array(Z_quant).T 60 | return Z_quant 61 | 62 | # quantize it by min/max of each cols into K bins 63 | def uniform_quantization(self, label_embedding): 64 | Z = label_embedding.dot(self.random_matrix) 65 | Z_quant = [] 66 | for d in range(self.D): 67 | bins = np.linspace(min(Z[:, d]), max(Z[:, d]), self.K) 68 | quantized_rows = np.digitize(Z[:, d], bins) - 1 # bits = {0, 1, ..., K-1} 69 | Z_quant.append(quantized_rows) 70 | Z_quant = np.array(Z_quant).T 71 | return Z_quant 72 | 73 | def prepare_coding(self, Z_quant): 74 | # L dimensional array, 75 | # each entry is the hash code (row idx) 76 | self.hash_code_arr = Z_quant.dot(self.hash_func) 77 | rows, cols, vals = [], [], [] 78 | for l in range(self.L): 79 | rows.append(self.hash_code_arr[l]) 80 | cols.append(l) 81 | vals.append(1) 82 | 83 | m = self.K ** self.D 84 | n = self.L 85 | M = sp.csr_matrix((vals, (rows, cols)), shape=(m, n)) 86 | self.code2label_mat = M 87 | 88 | code2label_set = {} 89 | for code in np.nonzero(M.indptr[1:] - M.indptr[:-1])[0]: 90 | code2label_set[code] = set(M.indices[M.indptr[code] : M.indptr[code + 1]]) 91 | self.code2label_set = code2label_set 92 | 93 | 94 | class corelib(object): 95 | def __init__(self, dirname, soname, forced_rebuild=False): 96 | self.clib_float32 = load_dynamic_library(dirname, soname + "_float32", forced_rebuild=forced_rebuild) 97 | self.clib_float64 = load_dynamic_library(dirname, soname + "_float64", forced_rebuild=forced_rebuild) 98 | arg_list = [ 99 | POINTER(PyMatrix), 100 | c_uint32, 101 | c_uint32, 102 | c_int32, 103 | c_uint32, 104 | c_int32, 105 | POINTER(c_uint32), 106 | ] 107 | fillprototype(self.clib_float32.get_codes, None, arg_list) 108 | fillprototype(self.clib_float64.get_codes, None, arg_list) 109 | 110 | def get_codes(self, py_feat_mat, depth, algo, seed, codes, verbose=0, max_iter=10, threads=-1): 111 | clib = self.clib_float32 112 | if py_feat_mat.dtype == sp.float64: 113 | clib = self.clib_float64 114 | if verbose != 0: 115 | print("perform float64 computation") 116 | else: 117 | clib = self.clib_float32 118 | if 
verbose != 0: 119 | print("perform float32 computation") 120 | clib.get_codes( 121 | byref(py_feat_mat), depth, algo, seed, max_iter, threads, codes.ctypes.data_as(POINTER(c_uint32)), 122 | ) 123 | 124 | 125 | forced_rebuild = False 126 | corelib_path = path.join(path.dirname(path.abspath(__file__)), "corelib/") 127 | soname = "rf_linear" 128 | clib = corelib(corelib_path, soname, forced_rebuild) 129 | 130 | # SEmatic-aware Code 131 | class SeC(object): 132 | def __init__(self, kdim, depth, algo, seed, codes): 133 | assert kdim == 2 134 | self.kdim = kdim 135 | self.depth = depth 136 | self.algo = algo 137 | self.seed = seed 138 | self.codes = codes 139 | self.indptr = sp.cumsum(sp.bincount(codes + 1, minlength=(self.nr_codes + 1)), dtype=sp.uint64) 140 | self.indices = sp.argsort(codes * sp.float64(self.nr_elements) + sp.arange(self.nr_elements)) 141 | 142 | @property 143 | def nr_elements(self): 144 | return len(self.codes) 145 | 146 | @property 147 | def nr_codes(self): 148 | return 1 << self.depth 149 | 150 | def __len__(self): 151 | return len(self.codes) 152 | 153 | def get_code_for_element(self, eid): 154 | assert 0 <= eid and eid < self.nr_elements 155 | return self.codes[eid] 156 | 157 | def get_elements_with_code(self, code): 158 | assert 0 <= code and code < self.nr_codes 159 | begin, end = self.indptr[code], self.indptr[code + 1] 160 | return self.indices[begin:end] 161 | 162 | def get_csc_matrix(self): 163 | return smat.csc_matrix( 164 | (sp.ones_like(self.indices, dtype=sp.float64), self.indices, self.indptr), shape=(self.nr_elements, self.nr_codes), 165 | ) 166 | 167 | def print(self): 168 | print("nr_codes: {}".format(self.nr_codes)) 169 | print("nr_elements: {}".format(self.nr_elements)) 170 | print("algo: {}".format(Indexer.algos[self.algo])) 171 | for nid in range(self.nr_codes): 172 | labels = " ".join(map(str, self.get_elements_with_code(nid))) 173 | print("code({nid}): {labels}".format(nid=nid, labels=labels)) 174 | 175 | 176 | class Indexer(object): 177 | KMEANS = 0 178 | KDTREE = 1 # KDTREE with Roound-Robin feature splits 179 | ORDINAL = 2 # random projection with ordinal quantization 180 | UNIFORM = 3 # random projection with uniform quantization 181 | BALANCED_ORDINAL = 4 # random projection with balaced ordinal quantization 182 | SKMEANS = 5 # Spherical KMEANS 183 | KDTREE_CYCLIC = 11 # KDTREE with cyclic feature splits( 0,...,0, 1,...,1, 2,...,2) 184 | PURE_RANDOM = 12 # Random assign cluster for each element 185 | 186 | algos = {v: k for k, v in vars().items() if isinstance(v, int)} 187 | 188 | def __init__(self, feat_mat): 189 | self.py_feat_mat = PyMatrix.init_from(feat_mat) 190 | 191 | @property 192 | def feat_mat(self): 193 | return self.py_feat_mat.buf 194 | 195 | @property 196 | def nr_labels(self): 197 | return self.feat_mat.shape[0] 198 | 199 | @staticmethod 200 | def load_indexed_code(code_path, label_feat): 201 | C = None 202 | mapping = { 203 | "none": Indexer.SKMEANS, 204 | "skmeans": Indexer.SKMEANS, 205 | "kmeans": Indexer.KMEANS, 206 | "kdtree": Indexer.KDTREE, 207 | "random": Indexer.PURE_RANDOM, 208 | "ordinal": Indexer.BALANCED_ORDINAL, 209 | "uniform": Indexer.UNIFORM, 210 | } 211 | if code_path is None: 212 | code_path = "none" 213 | 214 | if code_path.lower() in mapping: 215 | if label_feat is not None: 216 | algo = mapping[code_path.lower()] 217 | if algo == Indexer.SKMEANS: 218 | label_feat = sk_normalize(label_feat, axis=1, norm="l2", copy=False) 219 | indexer = Indexer(label_feat) 220 | code = indexer.gen( 221 | kdim=2, 222 | 
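                    # depth is chosen so that each of the 2**depth leaf clusters holds
                    # roughly 100 labels on average; see estimate_depth_with_cluster_size below.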
depth=indexer.estimate_depth_with_cluster_size(100), 223 | algo=algo, 224 | seed=0, 225 | max_iter=20, 226 | threads=1, 227 | ) 228 | C = code.get_csc_matrix() 229 | else: 230 | if code_path.endswith(".npz") and path.exists(code_path): 231 | C = smat.load_npz(code_path) 232 | elif path.isdir(code_path) and path.exists(path.join(code_path, "code.npz")): 233 | C = smat.load_npz(path.join(code_path, "code.npz")) 234 | else: 235 | assert False, f"'{code_path}' does not exist. Valid ones {mapping.keys()}" 236 | return C 237 | 238 | def estimate_depth_with_nr_clusters(self, nr_clusters): 239 | depth = int(sp.log2(nr_clusters)) 240 | return depth 241 | 242 | def estimate_depth_with_cluster_size(self, cluster_size): 243 | return self.estimate_depth_with_nr_clusters(self.nr_labels // cluster_size + 1) 244 | 245 | def ordinal_gen(self, kdim, depth, seed): 246 | sp.random.seed(seed) 247 | random_matrix = sp.randn(self.feat_mat.shape[1], depth) 248 | X = self.feat_mat.dot(random_matrix) 249 | m = self.feat_mat.shape[0] // kdim + [1, 0][self.feat_mat.shape[0] % kdim != 0] 250 | X = sp.argsort(sp.argsort(X, axis=0), axis=0) // m 251 | print(X) 252 | codes = sp.array((X * (kdim ** sp.arange(depth)).reshape(1, -1)).sum(axis=1), dtype=sp.uint32) 253 | return codes 254 | 255 | def balaced_ordinal_gen(self, kdim, depth, seed, threads=1): 256 | assert int(2 ** sp.log2(kdim)) == kdim 257 | sp.random.seed(seed) 258 | random_matrix = sp.randn(self.feat_mat.shape[1], depth) 259 | X = PyMatrix(self.feat_mat.dot(random_matrix)) 260 | codes = sp.zeros(X.rows, dtype=sp.uint32) 261 | new_depth = depth * int(sp.log2(kdim)) 262 | clib.get_codes(X, new_depth, Indexer.KDTREE_CYCLIC, seed, codes, threads=threads) 263 | return codes 264 | 265 | def gen(self, kdim, depth, algo, seed, max_iter=10, threads=1): 266 | assert algo in [ 267 | Indexer.KMEANS, 268 | Indexer.KDTREE, 269 | Indexer.ORDINAL, 270 | Indexer.UNIFORM, 271 | Indexer.BALANCED_ORDINAL, 272 | Indexer.KDTREE_CYCLIC, 273 | Indexer.SKMEANS, 274 | Indexer.PURE_RANDOM, 275 | ] 276 | if algo in [ 277 | Indexer.KMEANS, 278 | Indexer.KDTREE, 279 | Indexer.KDTREE_CYCLIC, 280 | Indexer.SKMEANS, 281 | ]: 282 | feat_mat = self.py_feat_mat 283 | codes = sp.zeros(feat_mat.rows, dtype=sp.uint32) 284 | clib.get_codes(feat_mat, depth, algo, seed, codes, max_iter=max_iter, threads=threads) 285 | elif algo in [Indexer.ORDINAL, Indexer.UNIFORM]: 286 | rp_clf = RandomProject(self.feat_mat, kdim, depth, algo, seed) 287 | codes = rp_clf.get_codes() 288 | elif algo in [Indexer.BALANCED_ORDINAL]: 289 | assert int(2 ** sp.log2(kdim)) == kdim 290 | codes = self.balaced_ordinal_gen(kdim, depth, seed, threads=threads) 291 | elif algo in [Indexer.PURE_RANDOM]: 292 | feat_mat = self.py_feat_mat 293 | codes = sp.zeros(feat_mat.rows, dtype=sp.uint32) 294 | cluster_size = kdim ** depth 295 | for idx in range(feat_mat.rows): 296 | codes[idx] = np.random.randint(0, cluster_size) 297 | else: 298 | raise NotImplementedError("unknown algo {}".format(algo)) 299 | return SeC(kdim, depth, algo, seed, codes) 300 | 301 | 302 | def run_test(data_folder="./datasets/Eurlex-4K"): 303 | import xbert.rf_linear as rf_linear 304 | 305 | data = rf_linear.Data.load(data_folder, label_emb=None) 306 | L = smat.load_npz(data_folder + "/L.pifa.npz") 307 | code = Indexer(L).gen(kdim=2, depth=6, algo=0, seed=5, max_iter=20, threads=1) 308 | code.print() 309 | code = Indexer(L).gen(kdim=2, depth=6, algo=5, seed=5, max_iter=20, threads=1) 310 | code.print() 311 | 312 | def load_feature_matrix(src, dtype=sp.float32): 313 | 
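    # Accepts either a scipy sparse matrix stored as *.npz or a dense numpy array stored
    # as *.npy; both are returned as a CSR matrix of the requested dtype.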
if src.endswith(".npz"): 314 | return smat.load_npz(src).tocsr().astype(dtype) 315 | elif src.endswith(".npy"): 316 | return smat.csr_matrix(sp.ascontiguousarray(sp.load(src), dtype=dtype)) 317 | else: 318 | raise ValueError("src must end with .npz or .npy") 319 | 320 | def main(args): 321 | # set hyper-parameters 322 | input_feat_path = args.input_feat_path 323 | kdim = args.kdim 324 | algo = args.algo 325 | seed = args.seed 326 | max_iter = args.max_iter 327 | threads = args.threads 328 | output_code_dir = args.output_code_dir 329 | 330 | # load label feature matrix (nr_labels * nr_features) 331 | if path.exists(input_feat_path): 332 | feat_mat = load_feature_matrix(input_feat_path) 333 | else: 334 | raise ValueError("label embedding path does not exist {}".format(input_feat_path)) 335 | 336 | if not path.exists(output_code_dir): 337 | os.makedirs(output_code_dir, exist_ok=True) 338 | 339 | # Indexing algorithm 340 | # C: nr_labels x nr_codes, stored in csr sparse matrix 341 | indexer = Indexer(feat_mat) 342 | if algo == indexer.SKMEANS: 343 | feat_mat = sk_normalize(feat_mat, axis=1, norm="l2", copy=False) 344 | depth = indexer.estimate_depth_with_cluster_size(100)+1 345 | code = indexer.gen(kdim=kdim, depth=depth, algo=algo, seed=seed, max_iter=max_iter, threads=threads) 346 | C = code.get_csc_matrix() 347 | print("INDEXER DEPTH", depth, "C", C.shape) 348 | 349 | # save code and args 350 | output_code_path = path.join(output_code_dir, "code.npz") 351 | smat.save_npz("{}".format(output_code_path), C, compressed=False) 352 | output_config_path = path.join(output_code_dir, "config.json") 353 | with open(output_config_path, "w") as fout: 354 | fout.write(json.dumps(vars(args), indent=True)) 355 | 356 | 357 | if __name__ == "__main__": 358 | parser = argparse.ArgumentParser() 359 | ## Required parameters 360 | parser.add_argument( 361 | "-i", 362 | "-L", 363 | "--input-feat-path", 364 | type=str, 365 | required=True, 366 | default="./datasets/Eurlex-4K/L.pifa.npz", 367 | help="path to the npz file of input label feature matrix (nr_labels * nr_features, CSR)", 368 | ) 369 | parser.add_argument( 370 | "-o", 371 | "-c", 372 | "--output-code-dir", 373 | type=str, 374 | required=True, 375 | default="./save_models/Eurlex-4K/indexer/code.npz", 376 | help="path to the output npz file of indexing codes (nr_labels * nr_codes, CSR)", 377 | ) 378 | # optional 379 | parser.add_argument("--algo", type=int, default=5, help="0 for KMEANS 5 for SKMEANS (default 5)") 380 | parser.add_argument("--seed", type=int, default=0, help="random seed (default 0)") 381 | parser.add_argument("--kdim", type=int, default=2) 382 | parser.add_argument("--threads", type=int, default=1) 383 | parser.add_argument("--max-iter", type=int, default=20) 384 | 385 | args = parser.parse_args() 386 | print(args) 387 | main(args) 388 | -------------------------------------------------------------------------------- /xbert/modeling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from transformers.file_utils import add_start_docstrings 9 | from transformers.configuration_bert import BertConfig 10 | from transformers.configuration_roberta import RobertaConfig 11 | from transformers.configuration_xlnet import XLNetConfig 12 | from transformers.modeling_utils import SequenceSummary 13 | from transformers.modeling_bert import ( 14 | BERT_START_DOCSTRING, 15 | BERT_INPUTS_DOCSTRING, 16 
| BERT_PRETRAINED_MODEL_ARCHIVE_MAP, 17 | ) 18 | from transformers.modeling_bert import BertModel, BertPreTrainedModel 19 | from transformers.modeling_roberta import ( 20 | ROBERTA_START_DOCSTRING, 21 | ROBERTA_INPUTS_DOCSTRING, 22 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, 23 | ) 24 | from transformers.modeling_roberta import RobertaModel, RobertaClassificationHead 25 | from transformers.modeling_xlnet import ( 26 | XLNET_START_DOCSTRING, 27 | XLNET_INPUTS_DOCSTRING, 28 | XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, 29 | ) 30 | from transformers.modeling_xlnet import XLNetPreTrainedModel, XLNetModel 31 | 32 | 33 | def repack_output(output_ids, output_mask, num_labels): 34 | batch_size = output_ids.size(0) 35 | idx_arr = torch.nonzero(output_mask) 36 | rows = idx_arr[:, 0] 37 | cols = output_ids[idx_arr[:, 0], idx_arr[:, 1]] 38 | c_true = torch.zeros((batch_size, num_labels), dtype=torch.float, device=output_ids.device) 39 | c_true[rows, cols] = 1.0 40 | return c_true 41 | 42 | 43 | 44 | @add_start_docstrings( 45 | """Bert Model transformer with a sequence classification head on top (a linear layer on top of 46 | the pooled output) e.g. for eXtreme Multi-label Classification (XMLC). """, 47 | BERT_START_DOCSTRING, 48 | BERT_INPUTS_DOCSTRING, 49 | ) 50 | class BertForXMLC(BertPreTrainedModel): 51 | r""" 52 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 53 | Labels for computing the sequence classification/regression loss. 54 | Indices should be in ``[0, ..., config.num_labels - 1]``. 55 | If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), 56 | If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 57 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 58 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 59 | Classification (or regression if config.num_labels==1) loss. 60 | **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` 61 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 62 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 63 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 64 | of shape ``(batch_size, sequence_length, hidden_size)``: 65 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 66 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 67 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 68 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
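        Note: in this XMLC variant ``forward`` does not take a ``labels`` argument and never
        returns a loss; it returns only the logits (plus the optional hidden states /
        attentions), and the multi-label training objective is applied outside the model.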
69 | Examples:: 70 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 71 | model = BertForXMLC.from_pretrained('bert-base-uncased') 72 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 73 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 74 | outputs = model(input_ids, labels=labels) 75 | loss, logits = outputs[:2] 76 | """ 77 | 78 | def __init__(self, config): 79 | super(BertForXMLC, self).__init__(config) 80 | self.num_labels = config.num_labels 81 | self.bert = BertModel(config) 82 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 83 | self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) 84 | self.init_weights() 85 | 86 | def forward( 87 | self, 88 | input_ids=None, 89 | attention_mask=None, 90 | token_type_ids=None, 91 | position_ids=None, 92 | head_mask=None, 93 | inputs_embeds=None, 94 | ): 95 | outputs = self.bert( 96 | input_ids, 97 | attention_mask=attention_mask, 98 | token_type_ids=token_type_ids, 99 | position_ids=position_ids, 100 | head_mask=head_mask, 101 | inputs_embeds=inputs_embeds, 102 | ) 103 | # get [cls] hidden states 104 | pooled_output = outputs[1] 105 | pooled_output = self.dropout(pooled_output) 106 | logits = self.classifier(pooled_output) 107 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 108 | return outputs # logits, (hidden_states), (attentions) 109 | 110 | 111 | @add_start_docstrings( 112 | """RoBERTa Model transformer with a sequence classification head on top (a linear layer 113 | on top of the pooled output) e.g. for eXtreme Multi-label Classification (XMLC). """, 114 | ROBERTA_START_DOCSTRING, 115 | ROBERTA_INPUTS_DOCSTRING, 116 | ) 117 | class RobertaForXMLC(BertPreTrainedModel): 118 | r""" 119 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 120 | Labels for computing the sequence classification/regression loss. 121 | Indices should be in ``[0, ..., config.num_labels]``. 122 | If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), 123 | If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 124 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 125 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 126 | Classification (or regression if config.num_labels==1) loss. 127 | **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` 128 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 129 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 130 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 131 | of shape ``(batch_size, sequence_length, hidden_size)``: 132 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 133 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 134 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 135 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
136 | Examples:: 137 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 138 | model = RobertaForSequenceClassification.from_pretrained('roberta-base') 139 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 140 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 141 | outputs = model(input_ids, labels=labels) 142 | loss, logits = outputs[:2] 143 | """ 144 | config_class = RobertaConfig 145 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 146 | base_model_prefix = "roberta" 147 | 148 | def __init__(self, config): 149 | super(RobertaForXMLC, self).__init__(config) 150 | self.num_labels = config.num_labels 151 | self.roberta = RobertaModel(config) 152 | self.classifier = RobertaClassificationHead(config) 153 | 154 | def forward( 155 | self, 156 | input_ids=None, 157 | attention_mask=None, 158 | token_type_ids=None, 159 | position_ids=None, 160 | head_mask=None, 161 | inputs_embeds=None, 162 | ): 163 | outputs = self.roberta( 164 | input_ids, 165 | attention_mask=attention_mask, 166 | token_type_ids=token_type_ids, 167 | position_ids=position_ids, 168 | head_mask=head_mask, 169 | inputs_embeds=inputs_embeds, 170 | ) 171 | sequence_output = outputs[0] 172 | logits = self.classifier(sequence_output) 173 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 174 | return outputs # logits, (hidden_states), (attentions) 175 | 176 | 177 | @add_start_docstrings( 178 | """XLNet Model with a sequence classification head on top (a linear layer on top of 179 | the pooled output) for eXtreme Multi-label Classification (XMLC)""", 180 | XLNET_START_DOCSTRING, 181 | XLNET_INPUTS_DOCSTRING, 182 | ) 183 | class XLNetForXMLC(XLNetPreTrainedModel): 184 | r""" 185 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 186 | Labels for computing the sequence classification/regression loss. 187 | Indices should be in ``[0, ..., config.num_labels - 1]``. 188 | If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), 189 | If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 190 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 191 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 192 | Classification (or regression if config.num_labels==1) loss. 193 | **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` 194 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 195 | **mems**: (`optional`, returned when ``config.mem_len > 0``) 196 | list of ``torch.FloatTensor`` (one for each layer): 197 | that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model 198 | if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. 199 | See details in the docstring of the `mems` input above. 200 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 201 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 202 | of shape ``(batch_size, sequence_length, hidden_size)``: 203 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
204 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 205 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 206 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 207 | When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``. 208 | Examples:: 209 | tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') 210 | model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased') 211 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 212 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 213 | outputs = model(input_ids, labels=labels) 214 | loss, logits = outputs[:2] 215 | """ 216 | 217 | def __init__(self, config): 218 | super(XLNetForXMLC, self).__init__(config) 219 | self.num_labels = config.num_labels 220 | self.transformer = XLNetModel(config) 221 | self.sequence_summary = SequenceSummary(config) 222 | self.logits_proj = nn.Linear(config.d_model, config.num_labels) 223 | self.init_weights() 224 | 225 | def forward( 226 | self, 227 | input_ids=None, 228 | attention_mask=None, 229 | mems=None, 230 | perm_mask=None, 231 | target_mapping=None, 232 | token_type_ids=None, 233 | input_mask=None, 234 | head_mask=None, 235 | inputs_embeds=None, 236 | ): 237 | transformer_outputs = self.transformer( 238 | input_ids, 239 | attention_mask=attention_mask, 240 | mems=mems, 241 | perm_mask=perm_mask, 242 | target_mapping=target_mapping, 243 | token_type_ids=token_type_ids, 244 | input_mask=input_mask, 245 | head_mask=head_mask, 246 | inputs_embeds=inputs_embeds, 247 | ) 248 | output = transformer_outputs[0] 249 | output = self.sequence_summary(output) 250 | logits = self.logits_proj(output) 251 | outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it 252 | return outputs # return logits, (mems), (hidden states), (attentions) 253 | 254 | -------------------------------------------------------------------------------- /xbert/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import argparse 5 | from collections import Counter 6 | import itertools 7 | import json 8 | import os 9 | from os import path 10 | import logging 11 | import numpy as np 12 | import pickle 13 | import time 14 | from tqdm import tqdm 15 | import scipy as sp 16 | import scipy.sparse as smat 17 | from sklearn.preprocessing import normalize 18 | import pandas as pd 19 | 20 | from transformers import ( 21 | WEIGHTS_NAME, 22 | BertConfig, 23 | BertForSequenceClassification, 24 | BertTokenizer, 25 | RobertaConfig, 26 | RobertaForSequenceClassification, 27 | RobertaTokenizer, 28 | XLMConfig, 29 | XLMForSequenceClassification, 30 | XLMTokenizer, 31 | XLNetConfig, 32 | XLNetForSequenceClassification, 33 | XLNetTokenizer, 34 | DistilBertConfig, 35 | DistilBertForSequenceClassification, 36 | DistilBertTokenizer, 37 | AlbertConfig, 38 | AlbertForSequenceClassification, 39 | AlbertTokenizer, 40 | ) 41 | 42 | ALL_MODELS = sum( 43 | (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig,)), 44 | (), 45 | ) 46 | 47 | MODEL_CLASSES = { 48 | "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), 49 | "xlnet": 
(XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 50 | "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 51 | "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 52 | "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,), 53 | "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), 54 | } 55 | 56 | 57 | logging.basicConfig( 58 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, 59 | ) 60 | logger = logging.getLogger(__name__) 61 | 62 | 63 | def run_label_embedding(args): 64 | label_map_path = "{}/label_map.txt".format(args.input_data_dir) 65 | id2label = [line.strip() for line in open(label_map_path, 'r', encoding='ISO-8859-1')] 66 | n_label = len(id2label) 67 | 68 | if args.label_emb_name.startswith('pifa'): 69 | if args.label_emb_name.startswith('pifa-tfidf'): 70 | assert args.inst_embedding.endswith(".npz") 71 | X = smat.load_npz("{}/X.trn.npz".format(args.input_data_dir)) 72 | elif args.label_emb_name.startswith('pifa-neural'): 73 | assert args.inst_embedding.endswith(".npy") 74 | X = np.load(args.inst_embedding) 75 | else: 76 | raise ValueError("only support .npz or .npy object!") 77 | Y = smat.load_npz("{}/Y.trn.npz".format(args.input_data_dir)) 78 | logger.info("X {} {} Y {} {}".format(type(X), X.shape, type(Y), Y.shape)) 79 | 80 | # create label embedding 81 | Y_avg = normalize(Y, axis=1, norm="l2") 82 | label_embedding = smat.csr_matrix(Y_avg.T.dot(X)) 83 | label_embedding = normalize(label_embedding, axis=1, norm="l2") 84 | 85 | elif args.label_emb_name == "text-emb": 86 | # xlnet-large-cased tokenizer 87 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 88 | tokenizer = RobertaTokenizer.from_pretrained("roberta-large") 89 | model = RobertaModel.from_pretrained("roberta-large") 90 | model = model.to(device) 91 | model.eval() 92 | 93 | # get label embedding 94 | label_embedding = [] 95 | for idx in tqdm(range(n_label)): 96 | inputs = torch.tensor([tokenizer.encode(id2label[idx])]) 97 | inputs = inputs.to(device) 98 | with torch.no_grad(): 99 | last_hidden_states = model(inputs)[0] # [1, seq_len, hidden_dim] 100 | seq_embedding = last_hidden_states.mean(dim=1) 101 | label_embedding.append(seq_embedding) 102 | label_embedding = torch.cat(label_embedding, dim=0) 103 | label_embedding = label_embedding.cpu().numpy() 104 | label_embedding = smat.csr_matrix(label_embedding) 105 | label_embedding = normalize(label_embedding, axis=1, norm="l2") 106 | 107 | else: 108 | raise NotImplementedError("unknown embed_type {}".format(args.embed_type)) 109 | 110 | # save label embedding 111 | logger.info("label_embedding {} {}".format(type(label_embedding), label_embedding.shape)) 112 | label_embedding_path = "{}/L.{}.npz".format(args.output_data_dir, args.label_emb_name) 113 | smat.save_npz(label_embedding_path, label_embedding) 114 | 115 | 116 | def load_feat_data(text_path): 117 | xseqs = pd.read_csv(text_path, header=None, sep='\t').replace( 118 | r'\n', ' ', regex=True)[0] # we replaced any newline characters within each "line" here. 119 | #Note that this is potentially redundant due to the to_list method. 
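    # Each line of the raw-text file is treated as one instance; since read_csv is called with
    # sep='\t' and only column 0 is kept, any text following a tab character on a line is dropped.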
120 | xseqs = xseqs.apply(lambda x: x.strip()) 121 | xseq_list = xseqs.to_list() 122 | logger.info(f'Created X_seq list of size {len(xseq_list)}') 123 | return xseq_list 124 | 125 | 126 | def proc_feat( 127 | args, 128 | input_text_path, 129 | tokenizer, 130 | pad_on_left=False, 131 | pad_token=0, 132 | pad_token_segment_id=0, 133 | mask_padding_with_zero=True): 134 | 135 | # load raw text feat data 136 | xseq_list = load_feat_data(input_text_path) 137 | 138 | # convert raw text into tokens, and convert tokens into tok_ids 139 | # features: List[Dict(key,val)], where key=['inst_idx', 'input_ids', 'attention_mask', 'token_type_ids'] 140 | features, xseq_lens = [], [] 141 | for (inst_idx, xseq) in enumerate(xseq_list): 142 | if inst_idx % 1000 == 0: 143 | logger.info("Writing example %d" % (inst_idx)) 144 | 145 | # truncate long text by 4096 chars as they will exceed max_seq_len anyway 146 | inputs = tokenizer.encode_plus( 147 | text=xseq[:args.max_trunc_char], 148 | text_pair=None, 149 | add_special_tokens=True, 150 | max_length=args.max_xseq_len, 151 | ) 152 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 153 | xseq_lens.append(len(input_ids)) 154 | 155 | # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to. 156 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 157 | 158 | # Zero-pad up to the sequence length. 159 | padding_length = args.max_xseq_len - len(input_ids) 160 | if pad_on_left: 161 | input_ids = ([pad_token] * padding_length) + input_ids 162 | attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask 163 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids 164 | else: 165 | input_ids = input_ids + ([pad_token] * padding_length) 166 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 167 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 168 | 169 | # sanity check and logging 170 | assert len(input_ids) == args.max_xseq_len, "Error with input length {} vs {}".format(len(input_ids), args.max_xseq_len) 171 | assert len(attention_mask) == args.max_xseq_len, "Error with input length {} vs {}".format(len(attention_mask), args.max_xseq_len) 172 | assert len(token_type_ids) ==args.max_xseq_len, "Error with input length {} vs {}".format(len(token_type_ids), args.max_xseq_len) 173 | if inst_idx < 5: 174 | logger.info("*** Example ***") 175 | logger.info("inst_idx: %s" % (inst_idx)) 176 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 177 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 178 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 179 | 180 | cur_inst_dict = { 181 | 'inst_idx': inst_idx, 182 | 'input_ids': input_ids, 183 | 'attention_mask': attention_mask, 184 | 'token_type_ids': token_type_ids 185 | } 186 | features.append(cur_inst_dict) 187 | # end for loop 188 | return features, xseq_lens 189 | 190 | 191 | def main(args): 192 | 193 | if args.do_label_embedding: 194 | run_label_embedding(args) 195 | 196 | elif args.do_proc_feat: 197 | # load pretrained model tokenizers 198 | args.model_type = args.model_type.lower() 199 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 200 | tokenizer = tokenizer_class.from_pretrained( 201 | args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 202 | do_lower_case=args.do_lower_case, 203 | 
cache_dir=args.cache_dir if args.cache_dir else None, 204 | ) 205 | 206 | # process train features 207 | inp_trn_feat_path = os.path.join(args.input_data_dir, 'train_raw_texts.txt') 208 | logger.info("processing train features {}".format(inp_trn_feat_path)) 209 | trn_features, trn_xseq_lens = proc_feat( 210 | args, inp_trn_feat_path, tokenizer, 211 | pad_on_left=bool(args.model_type in ["xlnet"]), 212 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], 213 | pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, 214 | ) 215 | logger.info( 216 | "trn_xseq: min={} max={} mean={} median={}".format( 217 | np.min(trn_xseq_lens), np.max(trn_xseq_lens), 218 | np.mean(trn_xseq_lens), np.median(trn_xseq_lens),) 219 | ) 220 | 221 | # save trn features 222 | os.makedirs(args.output_data_dir, exist_ok=True) 223 | out_trn_feat_path = path.join(args.output_data_dir, "X.trn.{}.{}.pkl".format(args.model_type, args.max_xseq_len)) 224 | with open(out_trn_feat_path, "wb") as fout: 225 | pickle.dump(trn_features, fout, protocol=pickle.HIGHEST_PROTOCOL) 226 | 227 | # process test features 228 | inp_tst_feat_path = os.path.join(args.input_data_dir, 'test_raw_texts.txt') 229 | logger.info("processing test features {}".format(inp_tst_feat_path)) 230 | tst_features, tst_xseq_lens = proc_feat( 231 | args, inp_tst_feat_path, tokenizer, 232 | pad_on_left=bool(args.model_type in ["xlnet"]), 233 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], 234 | pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, 235 | ) 236 | logger.info( 237 | "tst_xseq: min={} max={} mean={} median={}".format( 238 | np.min(tst_xseq_lens), np.max(tst_xseq_lens), 239 | np.mean(tst_xseq_lens), np.median(tst_xseq_lens),) 240 | ) 241 | 242 | # save tst features 243 | out_tst_feat_path = path.join(args.output_data_dir, "X.tst.{}.{}.pkl".format(args.model_type, args.max_xseq_len)) 244 | with open(out_tst_feat_path, "wb") as fout: 245 | pickle.dump(tst_features, fout, protocol=pickle.HIGHEST_PROTOCOL) 246 | 247 | elif args.do_proc_label: 248 | # load existing code 249 | label2cluster_csr = smat.load_npz(args.input_code_path) 250 | csr_codes = label2cluster_csr.nonzero()[1] 251 | 252 | # load trn label matrix 253 | inp_trn_label_path = os.path.join(args.input_data_dir, "Y.trn.npz") 254 | inp_tst_label_path = os.path.join(args.input_data_dir, "Y.tst.npz") 255 | Y_trn = smat.load_npz(inp_trn_label_path) 256 | Y_tst = smat.load_npz(inp_tst_label_path) 257 | assert Y_trn.shape[1] == label2cluster_csr.shape[0] 258 | 259 | # save C_trn and C_tst 260 | C_trn = Y_trn.dot(label2cluster_csr) 261 | C_tst = Y_tst.dot(label2cluster_csr) 262 | logger.info("NUM_LABELS: {}".format(label2cluster_csr.shape[0])) 263 | logger.info("NUM_CLUSTERS: {}".format(label2cluster_csr.shape[1])) 264 | logger.info("C_trn: {}".format(C_trn.shape)) 265 | logger.info("C_tst: {}".format(C_tst.shape)) 266 | 267 | out_trn_label_path = os.path.join(args.output_data_dir, "C.trn.{}.npz".format(args.label_emb_name)) 268 | out_tst_label_path = os.path.join(args.output_data_dir, "C.tst.{}.npz".format(args.label_emb_name)) 269 | smat.save_npz(out_trn_label_path, C_trn) 270 | smat.save_npz(out_tst_label_path, C_tst) 271 | 272 | else: 273 | raise ValueError("one of --do_label_embedding or --do_proc_feat or --do_proc_label must be set!") 274 | 275 | 276 | if __name__ == "__main__": 277 | parser = argparse.ArgumentParser() 278 | ## Required parameters 279 | parser.add_argument( 280 | "-i", 281 | "--input-data-dir", 282 | type=str, 283 | 
required=True, 284 | metavar="DIR", 285 | default="./datasets/Eurlex-4K", 286 | help="path to the dataset directory containing train_texts.txt and test_texts.txt", 287 | ) 288 | parser.add_argument( 289 | "-o", 290 | "--output-data-dir", 291 | type=str, 292 | required=True, 293 | metavar="DIR", 294 | default="./save_models/Eurlex-4K/proc_data", 295 | help="directory for storing X.[trn|tst].[model-type].[xseq-len].pkl and C.[trn|tst].npz", 296 | ) 297 | parser.add_argument( 298 | "--do_proc_feat", action="store_true", help="Set this flag if you are processing features.", 299 | ) 300 | parser.add_argument( 301 | "--do_proc_label", action="store_true", help="Set this flag if you are processing cluster labels.", 302 | ) 303 | parser.add_argument( 304 | "--do_label_embedding", action="store_true", help="Set this flag if you are constructing label embeddings.", 305 | ) 306 | # tokenizers 307 | parser.add_argument( 308 | "-m", "--model-type", type=str, default="bert", help="preprocess for model-type [bert | xlnet | xlm | roberta]", 309 | ) 310 | parser.add_argument( 311 | "-n", 312 | "--model_name_or_path", 313 | type=str, 314 | default="bert-large-cased-whole-word-masking", 315 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), 316 | ) 317 | parser.add_argument( 318 | "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", 319 | ) 320 | parser.add_argument( 321 | "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", 322 | ) 323 | parser.add_argument( 324 | "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3", 325 | ) 326 | parser.add_argument( 327 | "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", 328 | ) 329 | parser.add_argument( 330 | "--max_xseq_len", 331 | default=128, 332 | type=int, 333 | help="The maximum total input sequence length after WordPiece tokenization. \n" 334 | "Sequences longer than this will be truncated, and sequences shorter \n" 335 | "than this will be padded.", 336 | ) 337 | parser.add_argument( 338 | "--max_trunc_char", 339 | default=4096, 340 | type=int, 341 | help="The maximum total number of character extracted from input raw text for fast processing.\n" 342 | "Should set it to larger than max_xseq_len*avg_char_per_word." 
343 | ) 344 | # label embedding 345 | parser.add_argument( 346 | "-l", 347 | "--label-emb-name", 348 | type=str, 349 | default="pifa-tfidf-a5-s0", 350 | help="pifa-tfidf-a5-s0 | pifa-neural-a5-s0 | text-emb-a5-s0", 351 | ) 352 | parser.add_argument( 353 | "-c", 354 | "--input-code-path", 355 | type=str, 356 | metavar="PATH", 357 | default="./save_models/Eurlex-4K/pifa-tfidf-a5-s0/indexer/code.npz", 358 | help="path to the npz file of the indexing codes (CSR, nr_labels * nr_codes)", 359 | ) 360 | parser.add_argument( 361 | "-x", "--inst_embedding", 362 | type=str, 363 | default=None, 364 | help="instance embedding for PIFA", 365 | ) 366 | # parse argument 367 | args = parser.parse_args() 368 | print(args) 369 | main(args) 370 | -------------------------------------------------------------------------------- /xbert/ranker.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import argparse 4 | import sys 5 | from os import path 6 | import numpy as np 7 | import scipy as sp 8 | import scipy.sparse as smat 9 | from sklearn.preprocessing import normalize as sk_normalize 10 | from xbert.rf_linear import MLProblem, Metrics, HierarchicalMLModel, PostProcessor, LabelEmbeddingFactory 11 | from xbert.indexer import Indexer 12 | 13 | # solver_type 14 | solver_dict = { 15 | #'L2R_LR':0, 16 | "L2R_L2LOSS_SVC_DUAL": 1, 17 | #'L2R_L2LOSS_SVC':2, 18 | "L2R_L1LOSS_SVC_DUAL": 3, 19 | #'MCSVM_CS':4, 20 | "L1R_L2LOSS_SVC": 5, 21 | #'L1R_LR':6, 22 | "L2R_LR_DUAL": 7, 23 | } 24 | 25 | 26 | class LinearModel(object): 27 | def __init__(self, model=None): 28 | self.model = model 29 | 30 | def __getitem__(self, key): 31 | return LinearModel(self.model[key]) 32 | 33 | def __add__(self, other): 34 | return LinearModel(self.model + other.model, self.bias) 35 | 36 | def save(self, model_folder): 37 | self.model.save(model_folder) 38 | 39 | @classmethod 40 | def load(cls, model_folder): 41 | return cls(HierarchicalMLModel.load(model_folder)) 42 | 43 | @classmethod 44 | def train( 45 | cls, 46 | X, 47 | Y, 48 | C, 49 | mode="full-model", 50 | shallow=False, 51 | solver_type=solver_dict["L2R_L2LOSS_SVC_DUAL"], 52 | Cp=1.0, 53 | Cn=1.0, 54 | threshold=0.1, 55 | max_iter=100, 56 | threads=-1, 57 | bias=-1.0, 58 | Z_pred=None, 59 | negative_sampling_scheme=None, 60 | ): 61 | if mode in ["full-model", "matcher"]: 62 | if mode == "full-model": 63 | prob = MLProblem(X, Y, C, Z_pred=Z_pred, negative_sampling_scheme=negative_sampling_scheme,) 64 | elif mode == "matcher": 65 | assert C is not None 66 | Y = Y.dot(C) 67 | prob = MLProblem(X, Y, C=None) 68 | 69 | hierarchical = True 70 | min_labels = 2 71 | if shallow: 72 | if prob.C is None: 73 | min_labels = prob.Y.shape[1] 74 | else: 75 | min_labels = prob.C.shape[1] 76 | elif mode == "ranker": 77 | assert C is not None 78 | prob = MLProblem(X, Y, C, Z_pred=Z_pred, negative_sampling_scheme=negative_sampling_scheme,) 79 | hierarchical = False 80 | min_labels = 2 81 | 82 | model = HierarchicalMLModel.train( 83 | prob, 84 | hierarchical=hierarchical, 85 | min_labels=min_labels, 86 | solver_type=solver_type, 87 | Cp=Cp, 88 | Cn=Cn, 89 | threshold=threshold, 90 | threads=threads, 91 | bias=bias, 92 | max_iter=max_iter, 93 | ) 94 | return cls(model) 95 | 96 | def predict(self, X, csr_codes=None, beam_size=10, only_topk=10, cond_prob=True): 97 | pred_csr = self.model.predict(X, only_topk=only_topk, csr_codes=csr_codes, beam_size=beam_size, cond_prob=cond_prob,) 98 | return pred_csr 99 | 100 | 101 | class SubCommand(object): 102 | 
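    # Minimal base class for the "train" / "predict" subcommands below: each subclass registers
    # its own argparse sub-parser via add_parser()/add_arguments() and exposes a run(args) entry
    # point that get_parser() wires up through parser.set_defaults(run=cls.run).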
def __init__(self): 103 | pass 104 | 105 | @classmethod 106 | def add_parser(cls, super_parser): 107 | pass 108 | 109 | @staticmethod 110 | def add_arguments(parser): 111 | pass 112 | 113 | 114 | def load_feature_matrix(args): 115 | if args.feature_format % 3 == 0: 116 | X1 = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat1) 117 | X2 = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat2) 118 | X = smat.hstack([sk_normalize(X1, axis=1), sk_normalize(X2, axis=1)]).tocsr() 119 | elif args.feature_format % 3 == 1 and args.input_inst_feat1: 120 | X = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat1) 121 | elif args.feature_format % 3 == 2 and args.input_inst_feat2: 122 | X = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat2) 123 | else: 124 | raise NotImplementedError(f"args.feature_format = {args.feature_format} is not supported.") 125 | if args.feature_format // 3 == 0: 126 | X = sk_normalize(X, axis=1, copy=False) 127 | return X 128 | 129 | class LinearTrainCommand(SubCommand): 130 | @staticmethod 131 | def run(args): 132 | X = load_feature_matrix(args) 133 | Y = smat.load_npz(args.input_inst_label) 134 | label_feat = LabelEmbeddingFactory.create(Y, X, method=args.input_label_feat, dtype=X.dtype) 135 | C = Indexer.load_indexed_code(args.input_code_path, label_feat) 136 | if args.pred_inst_codes is not None: 137 | Z_pred = smat.load_npz(args.pred_inst_codes) 138 | else: 139 | Z_pred = None 140 | model = LinearModel.train( 141 | X, 142 | Y, 143 | C, 144 | mode=args.mode, 145 | shallow=args.shallow, 146 | solver_type=solver_dict[args.solver_type], 147 | Cp=args.Cp, 148 | Cn=args.Cn, 149 | threshold=args.threshold, 150 | threads=args.threads, 151 | bias=args.bias, 152 | Z_pred=Z_pred, 153 | negative_sampling_scheme=args.negative_sampling_scheme, 154 | ) 155 | model.save(args.output_ranker_folder) 156 | 157 | @classmethod 158 | def add_parser(cls, super_parser): 159 | parser = super_parser.add_parser("train", aliases=[], help="Train a linear ranker with codes") 160 | cls.add_arguments(parser) 161 | parser.set_defaults(run=cls.run) 162 | 163 | @staticmethod 164 | def add_arguments(parser): 165 | parser.add_argument( 166 | "-x", 167 | "-x1", 168 | "--input-inst-feat1", 169 | metavar="PATH", 170 | type=str, 171 | required=True, 172 | help="path to the npz file of the feature matrix (CSR)", 173 | ) 174 | 175 | parser.add_argument( 176 | "-x2", 177 | "--input-inst-feat2", 178 | type=str, 179 | default=None, 180 | metavar="PATH", 181 | help="path to the npz file of the feature matrix (CSR, nr_insts * nr_feats)", 182 | ) 183 | 184 | parser.add_argument( 185 | "-f", 186 | "--feature-format", 187 | dest="feature_format", 188 | type=int, 189 | default=1, 190 | metavar="INT", 191 | help="feature format: 0=> normalized [x1, x2], 1=> normalized x1, 2=> normalized x2; 3=> [x1 x2], 4=> x1, 5=> x2\n", 192 | ) 193 | 194 | parser.add_argument( 195 | "-y", 196 | "--input-inst-label", 197 | type=str, 198 | required=True, 199 | metavar="PATH", 200 | help="path to the npz file of the label matrix (CSR, nr_insts * nr_labels)", 201 | ) 202 | 203 | parser.add_argument( 204 | "-z", 205 | "--pred-inst-codes", 206 | type=str, 207 | metavar="PATH", 208 | help="path to the npz file of the predicted inst-cluster matrix (CSR, nr_insts * nr_codes)", 209 | ) 210 | 211 | parser.add_argument( 212 | "-c", 213 | "--input-code-path", 214 | type=str, 215 | required=True, 216 | metavar="PATH", 217 | help="path to the npz file of the indexing codes (CSR, nr_labels * nr_codes)", 218 | ) 219 
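    # Usage sketch for the "train" subcommand (illustrative file paths; only flags defined in
    # this method are used):
    #
    #   python -m xbert.ranker train \
    #       -x datasets/Eurlex-4K/X.trn.npz \
    #       -y datasets/Eurlex-4K/Y.trn.npz \
    #       -c save_models/Eurlex-4K/indexer/code.npz \
    #       -o save_models/Eurlex-4K/ranker \
    #       --mode full-model -s L2R_L2LOSS_SVC_DUAL
    #
    # -x/-y are the instance feature and label matrices, -c is the label-to-cluster code matrix
    # produced by xbert.indexer, and -o is the folder where the trained hierarchical linear
    # model is saved.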
| 220 | parser.add_argument( 221 | "-L", 222 | "--input-label-feat", 223 | type=str, 224 | default=None, 225 | metavar="PATH", 226 | help="path to the npz file of the feature matrix (CSR)", 227 | ) 228 | 229 | parser.add_argument( 230 | "-ns", 231 | "--negative-sampling-scheme", 232 | type=int, 233 | default=1, # for backward compatibiilty 234 | metavar="INT", 235 | help="0: negative from both indexer and matcher, 1: negative from indexer, 2: negative from matcher", 236 | ) 237 | 238 | parser.add_argument( 239 | "-m", 240 | "-o", # for backward compatibility 241 | "--output-ranker-folder", 242 | type=str, 243 | required=True, 244 | metavar="DIR", 245 | help="directory for storing linear ranker", 246 | ) 247 | 248 | parser.add_argument( 249 | "--mode", type=str, default="full-model", metavar="STR", help="mode: [full-model|ranker] (default full-model)", 250 | ) 251 | 252 | parser.add_argument( 253 | "-S", "--shallow", action="store_true", help="perform shallow linear modeling instead of hierarchical linear modeling", 254 | ) 255 | 256 | parser.add_argument( 257 | "-s", 258 | "--solver-type", 259 | type=str, 260 | default="L2R_L2LOSS_SVC_DUAL", 261 | metavar="SOLVER_STR", 262 | help="{} (default L2R_L2LOSS_SVC_DUAL)".format(" | ".join(solver_dict.keys())), 263 | ) 264 | 265 | parser.add_argument( 266 | "--Cp", type=float, default=1.0, metavar="VAL", help="coefficient for positive class in the loss function (default 1.0)", 267 | ) 268 | 269 | parser.add_argument( 270 | "--Cn", type=float, default=1.0, metavar="VAL", help="coefficient for negative class in the loss function (default 1.0)", 271 | ) 272 | 273 | parser.add_argument( 274 | "-B", 275 | "--bias", 276 | type=float, 277 | default=1.0, 278 | metavar="bias", 279 | help="if bias > 0, instance x becomes [x; bias]; if <= 0, no bias term added (default 1.0)", 280 | ) 281 | 282 | parser.add_argument( 283 | "-t", "--threshold", type=float, default=0.1, metavar="VAL", help="threshold to sparsity the model weights (default 0.1)", 284 | ) 285 | 286 | parser.add_argument( 287 | "-n", "--threads", type=int, default=-1, metavar="INT", help="number of threads to use (default -1 to denote all the CPUs)", 288 | ) 289 | 290 | 291 | class LinearPredictCommand(SubCommand): 292 | @staticmethod 293 | def run(args): 294 | Xt = load_feature_matrix(args) 295 | model = LinearModel.load(args.input_ranker_folder) 296 | # get only ranker part if predicted_csr_code from a matcher is provided 297 | if args.predicted_csr_code is not None and path.exists(args.predicted_csr_code): 298 | csr_codes = smat.load_npz(args.predicted_csr_code) 299 | model = model[-1] 300 | else: 301 | csr_codes = None 302 | 303 | cond_prob = PostProcessor.get(args.transform) 304 | Yt_pred = model.predict(Xt, csr_codes=csr_codes, beam_size=args.beam_size, only_topk=args.only_topk, cond_prob=cond_prob,) 305 | if args.input_inst_label is not None and path.exists(args.input_inst_label): 306 | Yt = smat.load_npz(args.input_inst_label) if args.input_inst_label else None 307 | metric = Metrics.generate(Yt, Yt_pred, topk=10) 308 | print("==== tst_set evaluation ====") 309 | print(metric) 310 | 311 | smat.save_npz(args.output_path, Yt_pred) 312 | 313 | @classmethod 314 | def add_parser(cls, super_parser): 315 | parser = super_parser.add_parser("predict", aliases=[], help="Generate predictions based on the given ranker") 316 | cls.add_arguments(parser) 317 | parser.set_defaults(run=cls.run) 318 | 319 | @staticmethod 320 | def add_arguments(parser): 321 | parser.add_argument( 322 | "-m", 
"--input-ranker-folder", type=str, required=True, help="path to the ranker folder", 323 | ) 324 | 325 | parser.add_argument( 326 | "-x", "-x1", "--input-inst-feat1", type=str, required=True, help="path to the npz file of the feature matrix (CSR)", 327 | ) 328 | parser.add_argument( 329 | "-x2", 330 | "--input-inst-feat2", 331 | type=str, 332 | default=None, 333 | metavar="PATH", 334 | help="path to the npz file of the feature matrix (CSR, nr_insts * nr_feats)", 335 | ) 336 | 337 | parser.add_argument( 338 | "-f", 339 | "--feature-format", 340 | dest="feature_format", 341 | type=int, 342 | default=1, 343 | metavar="INT", 344 | help="feature format: 0=> normalized [x1, x2], 1=> normalized x1, 2=> normalized x2; 3=> [x1 x2], 4=> x1, 5=> x2\n", 345 | ) 346 | 347 | parser.add_argument( 348 | "-y", 349 | "--input-inst-label", 350 | type=str, 351 | required=False, 352 | help="path to the npz file of the label matrix (CSR) for computing metrics", 353 | ) 354 | 355 | parser.add_argument( 356 | "-o", "--output-path", type=str, required=True, help="path to the npz file of output prediction (CSR)", 357 | ) 358 | 359 | parser.add_argument( 360 | "-c", 361 | "-z", 362 | "--predicted-csr-code", 363 | type=str, 364 | required=False, 365 | help="path to the npz file of the csr codes generated by the matcher", 366 | ) 367 | 368 | parser.add_argument( 369 | "-t", 370 | "--transform", 371 | type=str, 372 | default="l3-hinge", 373 | help="transform of the ranker prediction to be multiplied by the input csr codes sigmoid | l1-hinge | l2-hinge | l3-hinge (default l2-hinge)", 374 | ) 375 | 376 | parser.add_argument( 377 | "-k", "--only-topk", type=int, default=10, help="number of top labels in the prediction", 378 | ) 379 | 380 | parser.add_argument( 381 | "-b", "--beam-size", type=int, default=10, help="size of beam search in the prediction", 382 | ) 383 | 384 | 385 | def get_parser(): 386 | parser = argparse.ArgumentParser() 387 | subparsers = parser.add_subparsers(help="subcommands", metavar="SUBCOMMAND") 388 | subparsers.required = True 389 | LinearTrainCommand.add_parser(subparsers) 390 | LinearPredictCommand.add_parser(subparsers) 391 | return parser 392 | 393 | 394 | if __name__ == "__main__": 395 | 396 | parser = get_parser() 397 | args = parser.parse_args() 398 | args.run(args) 399 | -------------------------------------------------------------------------------- /xbert/rf_linear.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from os import path, system 4 | import time 5 | import collections 6 | import itertools 7 | import pickle 8 | import json 9 | import glob 10 | from glob import glob 11 | import ctypes 12 | from ctypes import * 13 | 14 | import scipy as sp 15 | import scipy.sparse as smat 16 | from sklearn.preprocessing import normalize as sk_normalize 17 | 18 | import xbert.indexer as indexer 19 | from xbert.rf_util import ( 20 | PyMatrix, 21 | fillprototype, 22 | load_dynamic_library, 23 | COOAllocator, 24 | PredAllocator, 25 | smat_util, 26 | ) 27 | 28 | 29 | # solver_type 30 | L2R_LR = 0 31 | L2R_L2LOSS_SVC_DUAL = 1 32 | L2R_L2LOSS_SVC = 2 33 | L2R_L1LOSS_SVC_DUAL = 3 34 | MCSVM_CS = 4 35 | L1R_L2LOSS_SVC = 5 36 | L1R_LR = 6 37 | L2R_LR_DUAL = 7 38 | 39 | 40 | class corelib(object): 41 | def __init__(self, dirname, soname, forced_rebuild=False): 42 | self.clib_float32 = load_dynamic_library(dirname, soname + "_float32", forced_rebuild=forced_rebuild) 43 | self.clib_float64 = load_dynamic_library(dirname, soname + "_float64", 
forced_rebuild=forced_rebuild) 44 | arg_list = [ 45 | POINTER(PyMatrix), # PyMatrix X 46 | POINTER(PyMatrix), # PyMatrix Y 47 | POINTER(PyMatrix), # PyMatrix C 48 | POINTER(PyMatrix), # PyMatrix Z 49 | COOAllocator.CFUNCTYPE, # py_coo_allocator 50 | c_double, # threshold 51 | c_int, # solver_type 52 | c_double, # Cp 53 | c_double, # Cn 54 | c_uint64, # max_iter 55 | c_double, # eps 56 | c_double, # bias 57 | c_int, # threads 58 | ] 59 | fillprototype(self.clib_float32.c_multilabel_train_with_codes, None, arg_list) 60 | fillprototype(self.clib_float64.c_multilabel_train_with_codes, None, arg_list) 61 | 62 | arg_list = [ 63 | POINTER(PyMatrix), 64 | POINTER(PyMatrix), 65 | POINTER(PyMatrix), 66 | POINTER(PyMatrix), 67 | PredAllocator.CFUNCTYPE, 68 | c_int, 69 | ] 70 | fillprototype(self.clib_float32.c_multilabel_predict_with_codes, None, arg_list) 71 | fillprototype(self.clib_float64.c_multilabel_predict_with_codes, None, arg_list) 72 | 73 | arg_list = [ 74 | POINTER(PyMatrix), 75 | POINTER(PyMatrix), 76 | c_uint64, 77 | POINTER(c_uint32), 78 | POINTER(c_uint32), 79 | c_void_p, 80 | c_int, 81 | ] 82 | fillprototype(self.clib_float32.c_sparse_inner_products, None, arg_list) 83 | fillprototype(self.clib_float64.c_sparse_inner_products, None, arg_list) 84 | 85 | def sparse_inner_products(self, pX, pM, X_row_idx, M_col_idx, pred_values=None, threads=-1, verbose=0): 86 | clib = self.clib_float32 87 | if pX.dtype == sp.float64: 88 | clib = self.clib_float64 89 | assert pM.dtype == sp.float64 90 | if verbose != 0: 91 | print("perform float64 computation") 92 | else: 93 | clib = self.clib_float32 94 | assert pM.dtype == sp.float32 95 | if verbose != 0: 96 | print("perform float32 computation") 97 | 98 | nnz = len(X_row_idx) 99 | if pred_values is None or pred_values.dtype != pM.dtype or len(pred_values) != nnz: 100 | pred_values = sp.zeros(nnz, dtype=pM.dtype) 101 | clib.c_sparse_inner_products( 102 | byref(pX), 103 | byref(pM), 104 | nnz, 105 | X_row_idx.ctypes.data_as(POINTER(c_uint32)), 106 | M_col_idx.ctypes.data_as(POINTER(c_uint32)), 107 | pred_values.ctypes.data_as(c_void_p), 108 | threads, 109 | ) 110 | return pred_values 111 | 112 | def multilabel_predict_with_codes(self, pX, pW, pC, pZ, threads=-1, verbose=0): 113 | clib = self.clib_float32 114 | if pX.dtype == sp.float64: 115 | clib = self.clib_float64 116 | if verbose != 0: 117 | print("perform float64 computation") 118 | else: 119 | clib = self.clib_float32 120 | if verbose != 0: 121 | print("perform float32 computation") 122 | pred_alloc = PredAllocator(dtype=pX.dtype) 123 | clib.c_multilabel_predict_with_codes(byref(pX), byref(pW), byref(pC), byref(pZ), pred_alloc.cfunc, threads) 124 | return pred_alloc.get_pred() 125 | 126 | def multilabel_train_with_codes( 127 | self, 128 | pX, 129 | pY, 130 | pC, 131 | pZ, 132 | threshold=0, 133 | solver_type=L2R_L2LOSS_SVC_DUAL, 134 | Cp=1.0, 135 | Cn=1.0, 136 | max_iter=1000, 137 | eps=0.1, 138 | bias=1.0, 139 | threads=-1, 140 | verbose=0, 141 | ): 142 | clib = self.clib_float32 143 | if pX.dtype == sp.float64: 144 | clib = self.clib_float64 145 | if verbose != 0: 146 | print("perform float64 computation") 147 | else: 148 | clib = self.clib_float32 149 | if verbose != 0: 150 | print("perform float32 computation") 151 | coo_alloc = COOAllocator(dtype=pX.dtype) 152 | clib.c_multilabel_train_with_codes( 153 | byref(pX), 154 | byref(pY), 155 | byref(pC) if pC is not None else None, 156 | byref(pZ) if pZ is not None else None, 157 | coo_alloc.cfunc, 158 | threshold, 159 | solver_type, 160 | Cp, 161 | 
Cn, 162 | max_iter, 163 | eps, 164 | bias, 165 | threads, 166 | ) 167 | return coo_alloc.tocsc() 168 | 169 | 170 | forced_rebuild = False 171 | corelib_path = path.join(path.dirname(path.abspath(__file__)), "corelib/") 172 | soname = "rf_linear" 173 | clib = corelib(corelib_path, soname, forced_rebuild) 174 | 175 | 176 | class WallTimer(object): 177 | def __init__(self): 178 | self.last_time = 0 179 | 180 | def now(self): 181 | return time.time() 182 | 183 | def tic(self): 184 | self.last_time = self.now() 185 | 186 | def toc(self): 187 | return (self.now() - self.last_time) * 1e3 188 | 189 | 190 | class Metrics(collections.namedtuple("Metrics", ["prec", "recall"])): 191 | __slots__ = () 192 | 193 | def __str__(self): 194 | fmt = lambda key: " ".join("{:4.2f}".format(100 * v) for v in getattr(self, key)[:]) 195 | return "\n".join("{:7}= {}".format(key, fmt(key)) for key in self._fields) 196 | 197 | @classmethod 198 | def default(cls): 199 | return cls(prec=[], recall=[]) 200 | 201 | @classmethod 202 | def generate(cls, tY, pY, topk=10): 203 | assert isinstance(tY, smat.csr_matrix), type(tY) 204 | assert isinstance(pY, smat.csr_matrix), type(pY) 205 | assert tY.shape == pY.shape, "tY.shape = {}, pY.shape = {}".format(tY.shape, pY.shape) 206 | pY = smat_util.sorted_csr(pY) 207 | total_matched = sp.zeros(topk, dtype=sp.uint64) 208 | recall = sp.zeros(topk, dtype=sp.float64) 209 | for i in range(tY.shape[0]): 210 | truth = tY.indices[tY.indptr[i] : tY.indptr[i + 1]] 211 | matched = sp.isin(pY.indices[pY.indptr[i] : pY.indptr[i + 1]][:topk], truth) 212 | cum_matched = sp.cumsum(matched, dtype=sp.uint64) 213 | total_matched[: len(cum_matched)] += cum_matched 214 | recall[: len(cum_matched)] += cum_matched / len(truth) 215 | if len(cum_matched) != 0: 216 | total_matched[len(cum_matched) :] += cum_matched[-1] 217 | recall[len(cum_matched) :] += cum_matched[-1] / len(truth) 218 | prec = total_matched / tY.shape[0] / sp.arange(1, topk + 1) 219 | recall = recall / tY.shape[0] 220 | return cls(prec=prec, recall=recall) 221 | 222 | 223 | class Transform(object): 224 | @staticmethod 225 | def identity(v, inplace=False): 226 | return v 227 | 228 | @staticmethod 229 | def log_lpsvm(p, v, inplace=False): 230 | if inplace: 231 | out = v 232 | else: 233 | out = sp.zeros_like(v) 234 | out[:] = -(sp.maximum(1.0 - v, 0) ** p) 235 | return out 236 | 237 | @staticmethod 238 | def lpsvm(p, v, inplace=False): 239 | out = Transform.log_lpsvm(p, v, inplace) 240 | sp.exp(out, out=out) 241 | return out 242 | 243 | @staticmethod 244 | def get_log_lpsvm(p): 245 | def f(v, inplace=False): 246 | return Transform.log_lpsvm(p, v, inplace) 247 | 248 | return f 249 | 250 | @staticmethod 251 | def get_lpsvm(p): 252 | def f(v, inplace=False): 253 | return Transform.lpsvm(p, v, inplace) 254 | 255 | return f 256 | 257 | @staticmethod 258 | def sigmoid(v, inplace=False): 259 | if inplace: 260 | out = v 261 | else: 262 | out = sp.zeros_like(v) 263 | out[:] = 1.0 / (1.0 + sp.exp(-v)) 264 | return out 265 | 266 | @staticmethod 267 | def log_sigmoid(v, inplace=False): 268 | out = Transform.sigmoid(v, inplace) 269 | out[:] = sp.log(out) 270 | return out 271 | 272 | 273 | class Combiner(object): 274 | @staticmethod 275 | def noop(x, y): 276 | return x 277 | 278 | @staticmethod 279 | def add(x, y): 280 | x[:] += y[:] 281 | return x 282 | 283 | @staticmethod 284 | def mul(x, y): 285 | x[:] *= y[:] 286 | return x 287 | 288 | @staticmethod 289 | def max(x, y): 290 | x[:] = sp.maximum(x[:], y[:]) 291 | return x 292 | 293 | @staticmethod 294 | 
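# Probabilistic-OR combiner: treats x and y as independent success probabilities and
# combines them in place as x <- 1 - (1 - x) * (1 - y); e.g. 0.6 and 0.5 combine to 0.8.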
def noisyor(x, y): 295 | x[:] = 1.0 - (1.0 - x[:]) * (1.0 - y[:]) 296 | return x 297 | 298 | 299 | class PostProcessor(object): 300 | def __init__(self, transform, combiner): 301 | self.transform = transform 302 | self.combiner = combiner 303 | 304 | @classmethod 305 | def get(cls, name): 306 | mapping = { 307 | "sigmoid": PostProcessor.sigmoid(), 308 | "log-sigmoid": cls(Transform.log_sigmoid, Combiner.add), 309 | "noop": cls(Transform.identity, Combiner.noop), 310 | } 311 | for p in [1, 2, 3, 4, 5, 6]: 312 | mapping["l{}-hinge".format(p)] = cls(Transform.get_lpsvm(p), Combiner.mul) 313 | mapping["log-l{}-hinge".format(p)] = cls(Transform.get_log_lpsvm(p), Combiner.add) 314 | mapping["l{}-hinge-noisyor".format(p)] = cls(Transform.get_lpsvm(p), Combiner.noisyor) 315 | return mapping[name] 316 | 317 | @classmethod 318 | def sigmoid(cls): 319 | return cls(Transform.sigmoid, Combiner.mul) 320 | 321 | @classmethod 322 | def l2svm(cls): 323 | return cls(Transform.l2svm, Combiner.mul) 324 | 325 | @classmethod 326 | def noisyor_l2svm(cls): 327 | return cls(Transform.l2svm, Combiner.noisyor) 328 | 329 | @classmethod 330 | def noisyor_sigmoid(cls): 331 | return cls(Transform.sigmoid, Combiner.noisyor) 332 | 333 | class LabelEmbeddingFactory(object): 334 | 335 | @staticmethod 336 | def create(Y, X, method="pifa", dtype=sp.float32): 337 | mapping = { 338 | "pifa": LabelEmbeddingFactory.pifa, 339 | "homer": LabelEmbeddingFactory.homer, 340 | "spectral": LabelEmbeddingFactory.spectral, 341 | "none": lambda Y, X, dtype: None, 342 | } 343 | if method is None: 344 | method = "none" 345 | if method.lower() in mapping: 346 | return mapping[method.lower()](Y, X, dtype) 347 | elif (method.endswith(".npz") or method.endswith(".npy")) and path.exists(method): 348 | label_embedding = HierarchicalMLModel.load_feature_matrix(method, dtype=dtype) 349 | assert label_embedding.shape[0] == Y.shape[1], f"{label_embedding.shape[0]} != {Y.shape[1]}" 350 | return label_embedding 351 | else: 352 | assert False, f"Something wrong with this label embedding '{method}'.
valid ones {mapping.keys()}" 353 | 354 | @staticmethod 355 | def pifa(Y, X, dtype=sp.float32): 356 | Y_avg = sk_normalize(Y, axis=1, norm="l2") 357 | label_embedding = smat.csr_matrix(Y_avg.T.dot(X), dtype=dtype) 358 | return label_embedding 359 | 360 | @staticmethod 361 | def homer(Y, X, dtype=sp.float32): 362 | label_embedding = smat.csr_matrix(Y.T, dtype=dtype) 363 | return label_embedding 364 | 365 | @staticmethod 366 | def spectral(Y, X, dtype=sp.float32): 367 | from sklearn.cluster import SpectralCoclustering 368 | def scale_normalize(X): 369 | " from https://github.com/scikit-learn/scikit-learn/blob/b194674c4/sklearn/cluster/_bicluster.py#L108" 370 | row_diag = sp.asarray(sp.sqrt(X.sum(axis=1))).squeeze() 371 | col_diag = sp.asarray(sp.sqrt(X.sum(axis=0))).squeeze() 372 | row_diag[row_diag == 0] = 1.0; 373 | col_diag[col_diag == 0] = 1.0; 374 | row_diag= 1.0 / row_diag 375 | col_diag= 1.0 / col_diag 376 | if smat.issparse(X): 377 | n_rows, n_cols = X.shape 378 | r = smat.dia_matrix((row_diag, [0]), shape=(n_rows, n_rows)) 379 | c = smat.dia_matrix((col_diag, [0]), shape=(n_cols, n_cols)) 380 | an = r * X * c 381 | else: 382 | an = row_diag[:, sp.newaxis] * X * col_diag 383 | return an, row_diag, col_diag 384 | 385 | coclustering = SpectralCoclustering(n_clusters=16384, random_state=1) 386 | normalized_data, row_diag, col_diag = scale_normalize(Y.T) 387 | n_sv = 1 + int(sp.ceil(sp.log2(coclustering.n_clusters))) 388 | u, v = coclustering._svd(normalized_data, n_sv, n_discard=1) 389 | label_embedding = smat.csr_matrix(u, dtype=dtype) 390 | return label_embedding 391 | 392 | class MLProblem(object): 393 | def __init__(self, X, Y, C=None, dtype=None, Z_pred=None, negative_sampling_scheme=None): 394 | if dtype is None: 395 | dtype = X.dtype 396 | self.pX = PyMatrix.init_from(X, dtype) 397 | self.pY = PyMatrix.init_from(Y, dtype) 398 | self.pC = PyMatrix.init_from(C, dtype) 399 | Z = None if C is None else smat.csr_matrix(self.Y.dot(self.C)) 400 | if negative_sampling_scheme is None or negative_sampling_scheme == 1: 401 | Z = Z 402 | elif negative_sampling_scheme is not None: 403 | if negative_sampling_scheme == 0: 404 | Z = (Z + Z_pred).tocsr() 405 | elif negative_sampling_scheme == 1: 406 | Z = Z 407 | elif negative_sampling_scheme == 2 and Z_pred is not None: 408 | Z = Z_pred 409 | self.pZ = PyMatrix.init_from(Z, dtype) # Z = Y * C 410 | self.dtype = dtype 411 | 412 | @property 413 | def X(self): 414 | return None if self.pX is None else self.pX.buf 415 | 416 | @property 417 | def Y(self): 418 | return None if self.pY is None else self.pY.buf 419 | 420 | @property 421 | def C(self): 422 | return None if self.pC is None else self.pC.buf 423 | 424 | @property 425 | def Z(self): 426 | return None if self.pZ is None else self.pZ.buf 427 | 428 | @property 429 | def nr_labels(self): 430 | return None if self.pY is None else self.Y.shape[1] 431 | 432 | 433 | class MLModel(object): 434 | def __init__(self, W, C=None, dtype=None): 435 | if C is not None: 436 | if isinstance(C, PyMatrix): 437 | assert C.buf.shape[0] == W.shape[1] 438 | else: 439 | assert C.shape[0] == W.shape[1], "C:{} W:{}".format(C.shape, W.shape) 440 | if dtype is None: 441 | dtype = W.dtype 442 | self.pC = PyMatrix.init_from(C, dtype) 443 | self.pW = PyMatrix.init_from(W, dtype) 444 | 445 | @property 446 | def C(self): 447 | return None if self.pC is None else self.pC.buf 448 | 449 | @property 450 | def W(self): 451 | return None if self.pW is None else self.pW.buf 452 | 453 | @property 454 | def nr_labels(self): 455 | 
return self.W.shape[1] 456 | 457 | @property 458 | def nr_codes(self): 459 | return 0 if self.C is None else self.C.shape[1] 460 | 461 | @property 462 | def nr_features(self): 463 | return self.W.shape[0] 464 | 465 | @property 466 | def dtype(self): 467 | return self.pW.dtype 468 | 469 | def astype(self, dtype): 470 | if dtype == self.pW.dtype: 471 | return self 472 | else: 473 | return MLModel(self.W, self.C, dtype) 474 | 475 | @classmethod 476 | def load(cls, folder, dtype=None): 477 | param = json.loads(open("{}/param.json".format(folder), "r").read()) 478 | assert param["model"] == cls.__name__ 479 | W = smat.load_npz("{}/W.npz".format(folder)).sorted_indices() 480 | if path.exists("{}/C.npz".format(folder)): 481 | C = smat.load_npz("{}/C.npz".format(folder)).sorted_indices() 482 | else: 483 | C = None 484 | return cls(W, C, dtype=dtype) 485 | 486 | def save(self, folder): 487 | if not path.exists(folder): 488 | os.makedirs(folder) 489 | param = { 490 | "model": self.__class__.__name__, 491 | "nr_labels": self.nr_labels, 492 | "nr_features": self.nr_features, 493 | "nr_codes": self.nr_codes, 494 | } 495 | open("{}/param.json".format(folder), "w").write(json.dumps(param, indent=True)) 496 | smat.save_npz("{}/W.npz".format(folder), self.W, compressed=False) 497 | if self.C is not None: 498 | smat.save_npz("{}/C.npz".format(folder), self.C, compressed=False) 499 | 500 | @classmethod 501 | def train( 502 | cls, 503 | prob, 504 | threshold=0.0, 505 | solver_type=L2R_L2LOSS_SVC_DUAL, 506 | Cp=1.0, 507 | Cn=1.0, 508 | max_iter=100, 509 | eps=0.1, 510 | bias=1.0, 511 | threads=-1, 512 | verbose=0, 513 | **arg_kw, 514 | ): 515 | model = clib.multilabel_train_with_codes( 516 | prob.pX, 517 | prob.pY, 518 | prob.pC, 519 | prob.pZ, 520 | threshold=threshold, 521 | solver_type=solver_type, 522 | Cp=Cp, 523 | Cn=Cn, 524 | max_iter=max_iter, 525 | eps=eps, 526 | bias=bias, 527 | threads=threads, 528 | verbose=verbose, 529 | ) 530 | return cls(model, prob.pC) 531 | 532 | def predict( 533 | self, X, only_topk=None, csr_codes=None, cond_prob=None, normalized=False, threads=-1, 534 | ): 535 | assert X.shape[1] == self.nr_features 536 | if csr_codes is None: 537 | dense = X.dot(self.W).toarray() 538 | if cond_prob: 539 | dense = cond_prob.transform(dense, inplace=True) 540 | coo = smat_util.dense_to_coo(dense) 541 | pred_csr = smat_util.sorted_csr_from_coo(coo.shape, coo.row, coo.col, coo.data, only_topk=only_topk) 542 | else: # csr_codes is given 543 | assert self.C is not None, "This model does not have C" 544 | assert X.shape[1] == self.nr_features 545 | assert csr_codes.shape[0] == X.shape[0] 546 | assert csr_codes.shape[1] == self.nr_codes 547 | if (csr_codes.data == 0).sum() != 0: 548 | # this is a trick to avoid zero entries explicit removal from the smat_dot_smat 549 | offset = sp.absolute(csr_codes.data).max() + 1 550 | csr_codes = smat.csr_matrix((csr_codes.data + offset, csr_codes.indices, csr_codes.indptr), shape=csr_codes.shape,) 551 | csr_labels = (csr_codes.dot(self.C.T)).tocsr() 552 | csr_labels.data -= offset 553 | else: 554 | csr_labels = (csr_codes.dot(self.C.T)).tocsr() 555 | nnz_of_insts = csr_labels.indptr[1:] - csr_labels.indptr[:-1] 556 | inst_idx = sp.repeat(sp.arange(X.shape[0], dtype=sp.uint32), nnz_of_insts) 557 | label_idx = csr_labels.indices.astype(sp.uint32) 558 | val = self.predict_values(X, inst_idx, label_idx, threads=threads) 559 | if cond_prob: 560 | val = cond_prob.transform(val, inplace=True) 561 | val = cond_prob.combiner(val, csr_labels.data) 562 | 563 | pred_csr = 
smat_util.sorted_csr_from_coo(csr_labels.shape, inst_idx, label_idx, val, only_topk=only_topk) 564 | 565 | if normalized: 566 | pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1") 567 | return pred_csr 568 | 569 | def predict_new( 570 | self, X, only_topk=None, csr_codes=None, cond_prob=None, normalized=False, threads=-1, 571 | ): 572 | assert X.shape[1] == self.nr_features 573 | if csr_codes is None: 574 | dense = X.dot(self.W).toarray() 575 | if cond_prob: 576 | dense = cond_prob.transform(dense, inplace=True) 577 | coo = smat_util.dense_to_coo(dense) 578 | pred_csr = smat_util.sorted_csr_from_coo(coo.shape, coo.row, coo.col, coo.data, only_topk=only_topk) 579 | else: # csr_codes is given 580 | assert self.C is not None, "This model does not have C" 581 | assert X.shape[1] == self.nr_features 582 | assert csr_codes.shape[0] == X.shape[0] 583 | assert csr_codes.shape[1] == self.nr_codes 584 | if not csr_codes.has_sorted_indices: 585 | csr_codes = csr_codes.sorted_indices() 586 | if (csr_codes.data == 0).sum() != 0: 587 | # this is a trick to avoid zero entries explicit removal from the smat_dot_smat 588 | offset = sp.absolute(csr_codes.data).max() + 1 589 | csr_codes = smat.csr_matrix((csr_codes.data + offset, csr_codes.indices, csr_codes.indptr), shape=csr_codes.shape,) 590 | pZ = PyMatrix.init_from(csr_codes, self.dtype) 591 | csr_labels, pred_csr = clib.multilabel_predict_with_codes(X, self.pW, self.pC, pZ, threads=threads) 592 | csr_labels.data -= offset 593 | else: 594 | pZ = PyMatrix.init_from(csr_codes.sorted_indices(), self.dtype) 595 | csr_labels, pred_csr = clib.multilabel_predict_with_codes(X, self.pW, self.pC, pZ, threads=threads) 596 | val = pred_csr.data 597 | if cond_prob: 598 | val = cond_prob.transform(val, inplace=True) 599 | val = cond_prob.combiner(val, csr_labels.data) 600 | 601 | pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk) 602 | 603 | if normalized: 604 | pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1") 605 | return pred_csr 606 | 607 | def predict_values(self, X, inst_idx, label_idx, out=None, threads=-1): 608 | assert X.shape[1] == self.nr_features 609 | if out is None: 610 | out = sp.zeros(inst_idx.shape, dtype=self.pW.dtype) 611 | pX = PyMatrix.init_from(X, dtype=self.pW.dtype) 612 | out = clib.sparse_inner_products(pX, self.pW, inst_idx.astype(sp.uint32), label_idx.astype(sp.uint32), out, threads=threads,) 613 | return out 614 | 615 | def predict_with_coo_labels(self, X, inst_idx, label_idx, only_topk=None): 616 | val = self.predict_values(X, inst_idx, label_idx) 617 | shape = (X.shape[0], self.nr_labels) 618 | pred_csr = smat_util.sorted_csr_from_coo(shape, inst_idx, label_idx, val, only_topk=only_topk) 619 | return pred_csr 620 | 621 | def predict_with_csr_labels(self, X, csr_labels, only_topk=None): 622 | assert X.shape[1] == self.nr_features 623 | assert csr_labels.shape[0] == X.shape[0] 624 | assert csr_labels.shape[1] == self.nr_labels 625 | nz_of_rows = csr_labels.indptr[1:] - csr_labels.indptr[:-1] 626 | inst_idx = sp.repeat(sp.arange(X.shape[0]), nz_of_rows).astype(sp.uint32) 627 | label_idx = csr_labels.indices 628 | return self.predict_with_coo_labels(X, inst_idx, label_idx, only_topk) 629 | 630 | def predict_with_coo_codes(self, X, inst_idx, code_idx, only_topk=None): 631 | assert self.C != None, "This Model does not have codes" 632 | shape = (X.shape[0], self.nr_codes) 633 | tmp_ones = sp.ones_like(code_idx) 634 | csr_codes = smat.csr_matrix((tmp_ones, (inst_idx, code_idx)), shape=shape, 
dtype=sp.float32) 635 | coo_labels = (csr_codes.dot(self.C.T)).tocoo() 636 | return self.predict_with_coo_labels(X, coo_labels.row, coo_labels.col, only_topk) 637 | 638 | def predict_with_csr_codes(self, X, csr_codes, only_topk=None): 639 | assert self.C != None, "This Model does not have codes" 640 | assert X.shape[1] == self.nr_features 641 | assert csr_codes.shape[0] == X.shape[0] 642 | assert csr_codes.shape[1] == self.nr_codes 643 | coo_labels = (csr_codes.dot(self.C.T)).tocoo() 644 | return self.predict_with_coo_labels(X, coo_labels.row, coo_labels.col, only_topk) 645 | 646 | 647 | class HierarchicalMLModel(object): 648 | """A hierachical linear multilable model""" 649 | 650 | def __init__(self, model_chain, bias=-1): 651 | if isinstance(model_chain, (list, tuple)): 652 | self.model_chain = model_chain 653 | else: 654 | self.model_chain = [model_chain] 655 | self.bias = bias 656 | 657 | @staticmethod 658 | def load_feature_matrix(src, dtype=sp.float32): 659 | if src.endswith(".npz"): 660 | return smat.load_npz(src).tocsr().astype(dtype) 661 | elif src.endswith(".npy"): 662 | return smat.csr_matrix(sp.ascontiguousarray(sp.load(src), dtype=dtype)) 663 | else: 664 | raise ValueError("src must end with .npz or .npy") 665 | 666 | @property 667 | def depth(self): 668 | return len(self.model_chain) 669 | 670 | @property 671 | def nr_features(self): 672 | return self.model_chain[0].nr_features - (1 if self.bias > 0 else 0) 673 | 674 | @property 675 | def nr_codes(self): 676 | return self.model_chain[-1].nr_codes 677 | 678 | @property 679 | def nr_labels(self): 680 | return self.model_chain[-1].nr_labels 681 | 682 | def __add__(self, other): 683 | if not isinstance(other, HierarchicalMLModel): 684 | other = HierarchicalMLModel(other) 685 | assert self.model_chain[-1].nr_labels == other.model_chain[0].nr_codes 686 | return HierarchicalMLModel(self.model_chain + other.model_chain, self.bias) 687 | 688 | def __getitem__(self, key): 689 | return HierarchicalMLModel(self.model_chain[key], self.bias) 690 | 691 | def astype(self, dtype): 692 | if dtype == self.model_chain[0].dtype: 693 | return self 694 | else: 695 | return HierarchicalMLModel([m.astype(dtype) for m in self.model_chain]) 696 | 697 | @classmethod 698 | def load(cls, folder, dtype=None): 699 | param = json.loads(open("{}/param.json".format(folder), "r").read()) 700 | assert param["model"] == cls.__name__ 701 | depth = int(param.get("depth", len(glob("{}/*.model".format(folder))))) 702 | 703 | bias = float(param.get("bias", -1.0)) # backward compatibility in case bias term is not listed in param.json 704 | return cls([load_model("{}/{}.model".format(folder, d), dtype=dtype) for d in range(depth)], bias,) 705 | 706 | def save(self, folder): 707 | if not path.exists(folder): 708 | os.makedirs(folder) 709 | depth = self.depth 710 | param = { 711 | "model": self.__class__.__name__, 712 | "depth": self.depth, 713 | "nr_features": self.nr_features, 714 | "nr_codes": self.nr_codes, 715 | "nr_labels": self.nr_labels, 716 | "bias": self.bias, 717 | } 718 | open("{}/param.json".format(folder), "w").write(json.dumps(param, indent=True)) 719 | for d in range(depth): 720 | local_folder = "{}/{}.model".format(folder, d) 721 | self.model_chain[d].save(local_folder) 722 | 723 | @classmethod 724 | def train(cls, prob, hierarchical=None, min_labels=2, nr_splits=2, **arg_kw): 725 | if hierarchical is None or hierarchical == False: 726 | return HierarchicalMLModel([MLModel.train(prob, **arg_kw)], arg_kw.get("bias", 1.0)) 727 | 728 | model_chain = [] 729 | 
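# Build the label tree bottom-up: while the current problem has no code matrix C and
# still has more than min_labels labels, group every nr_splits consecutive labels into
# one coarser code (new_codes = cur_codes // nr_splits), train an MLModel for that
# level, then move up by projecting Y onto the codes; the chain is reversed at the end
# so prediction runs coarse-to-fine. As a small illustration (values assumed, not from
# the source), with nr_labels = 4 and nr_splits = 2 the indicator matrix newC groups
# labels {0, 1} under code 0 and labels {2, 3} under code 1.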
cur_prob = prob 730 | if min_labels <= 1: 731 | min_labels = prob.C.shape[1] 732 | while True: 733 | if cur_prob.C is None and cur_prob.nr_labels > min_labels: 734 | cur_codes = sp.arange(cur_prob.nr_labels) 735 | new_codes = cur_codes // nr_splits 736 | shape = (len(cur_codes), new_codes.max() + 1) 737 | newC = smat.csr_matrix((sp.ones_like(cur_codes), (cur_codes, new_codes)), shape=shape) 738 | cur_prob = MLProblem(cur_prob.pX, cur_prob.pY, newC) 739 | cur_model = MLModel.train(cur_prob, **arg_kw) 740 | model_chain += [cur_model] 741 | if cur_model.C is None: 742 | break 743 | else: 744 | newY = cur_prob.Y.dot(cur_prob.C) 745 | cur_prob = MLProblem(cur_prob.pX, newY) 746 | model_chain = model_chain[::-1] 747 | return cls(model_chain, arg_kw.get("bias", 1.0)) 748 | 749 | def predict( 750 | self, X, only_topk=None, csr_codes=None, beam_size=2, max_depth=None, cond_prob=True, normalized=False, threads=-1, 751 | ): 752 | if max_depth is None: 753 | max_depth = self.depth 754 | if cond_prob is None or cond_prob == False: 755 | cond_prob = PostProcessor(Transform.identity, Combiner.noop) 756 | if cond_prob == True: 757 | cond_prob = PostProcessor(Transform.get_lpsvm(3), Combiner.mul) 758 | assert isinstance(cond_prob, PostProcessor), type(cond_prob) 759 | 760 | assert X.shape[1] == self.nr_features, f"{X.shape[1]} != {self.nr_features}" 761 | if self.bias > 0: 762 | X = smat_util.append_column(X, self.bias) 763 | if not X.has_sorted_indices: 764 | X = X.sorted_indices() 765 | pX = PyMatrix.init_from(X, dtype=self.model_chain[0].pW.dtype) 766 | max_depth = min(self.depth, max_depth) 767 | pred_csr = csr_codes 768 | for d in range(max_depth): 769 | cur_model = self.model_chain[d] 770 | local_only_topk = only_topk if d == (max_depth - 1) else beam_size 771 | pred_csr = cur_model.predict(pX, only_topk=local_only_topk, csr_codes=pred_csr, cond_prob=cond_prob, threads=threads,) 772 | if normalized: 773 | pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1") 774 | return pred_csr 775 | 776 | def predict_new( 777 | self, X, only_topk=None, csr_codes=None, beam_size=2, max_depth=None, cond_prob=True, normalized=False, threads=-1, 778 | ): 779 | if max_depth is None: 780 | max_depth = self.depth 781 | if cond_prob is None or cond_prob == False: 782 | cond_prob = PostProcessor(Transform.identity, Combiner.noop) 783 | if cond_prob == True: 784 | cond_prob = PostProcessor(Transform.get_lpsvm(3), Combiner.mul) 785 | assert isinstance(cond_prob, PostProcessor), type(cond_prob) 786 | 787 | assert X.shape[1] == self.nr_features 788 | if self.bias > 0: 789 | X = smat_util.append_column(X, self.bias) 790 | pX = PyMatrix.init_from(X, dtype=self.model_chain[0].pW.dtype) 791 | max_depth = min(self.depth, max_depth) 792 | pred_csr = csr_codes 793 | for d in range(max_depth): 794 | cur_model = self.model_chain[d] 795 | local_only_topk = only_topk if d == (max_depth - 1) else beam_size 796 | pred_csr = cur_model.predict_new(pX, only_topk=local_only_topk, csr_codes=pred_csr, cond_prob=cond_prob, threads=threads,) 797 | if normalized: 798 | pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1") 799 | return pred_csr 800 | 801 | 802 | class Parabel(object): 803 | """A utility class to load models/predictions from the Parabel package""" 804 | 805 | @staticmethod 806 | def load_tree(path_to_file, path_to_param=None, bias=None): 807 | if path_to_param is not None: 808 | with open(path_to_param, "r") as fin: 809 | real_nr_features = int(fin.readline()) 810 | for i in range(6): # bypassing unnecessary fields
811 | fin.readline() 812 | bias = float(fin.readline()) 813 | if bias is None: 814 | bias = 1.0 # the bias term defaults to 1.0 in the Parabel package 815 | """Load a single tree model obtained from Parabel Package""" 816 | with open(path_to_file, "r") as fin: 817 | nr_features = int(fin.readline()) - (1 if bias > 0 else 0) # remove the bias term if present 818 | nr_labels = int(fin.readline()) 819 | nr_nodes = int(fin.readline()) 820 | max_depth = int(sp.log2(nr_nodes + 1)) 821 | Clist, Wlist = [], [] 822 | for depth in range(max_depth): 823 | nr_nodes_with_depth = 2 ** depth 824 | if depth != max_depth - 1: 825 | C = smat_util.coo_appender((2 ** (depth + 1), 2 ** depth)) 826 | W = smat_util.coo_appender((nr_features, 2 ** (depth + 1))) 827 | else: 828 | C = smat_util.coo_appender((nr_labels, 2 ** depth)) 829 | W = smat_util.coo_appender((nr_features, nr_labels)) 830 | 831 | child_offset = 2 ** (depth + 1) - 1 832 | for nid in range(nr_nodes_with_depth): 833 | is_leaf = int(fin.readline().strip()) 834 | left, right = [int(x) - child_offset for x in fin.readline().strip().split()] 835 | cur_depth = int(fin.readline().strip()) 836 | assert cur_depth == depth 837 | tmp = fin.readline().strip().split() 838 | labels = [int(y) for y in tmp[1:]] 839 | nr_childs = int(fin.readline().strip().split()[0]) 840 | if is_leaf != 1: 841 | labels = [left, right] 842 | for y in labels: 843 | C.append(y, nid, 1.0) 844 | for iv in fin.readline().strip().split(): 845 | iv = iv.split(":") 846 | col = y 847 | row = int(iv[0]) 848 | if row >= nr_features: 849 | continue 850 | v = float(iv[1]) 851 | W.append(row, col, v) 852 | Clist += [C.tocsr()] 853 | Wlist += [W.tocsr()] 854 | return HierarchicalMLModel([MLModel(w, c) for w, c in zip(Wlist, Clist)], bias) 855 | 856 | @staticmethod 857 | def load_prediction(path_to_file, only_topk=None): 858 | with open(path_to_file, "r") as fin: 859 | nr_insts, nr_labels = [int(x) for x in fin.readline().strip().split()] 860 | coo = smat_util.coo_appender((nr_insts, nr_labels)) 861 | for i in range(nr_insts): 862 | for iv in fin.readline().strip().split(): 863 | iv = iv.split(":") 864 | j = int(iv[0]) 865 | v = float(iv[1]) 866 | coo.append(i, j, v) 867 | return smat_util.sorted_csr(coo.tocsr(), only_topk=only_topk) 868 | 869 | 870 | class CountModel(object): 871 | def __init__(self, code_to_label): 872 | assert isinstance(code_to_label, smat.spmatrix) 873 | code_to_label = code_to_label.tocsr() 874 | self.code_to_label = sk_normalize(code_to_label, axis=1, copy=False, norm="l1") 875 | 876 | @property 877 | def nr_labels(self): 878 | return self.code_to_label.shape[1] 879 | 880 | @property 881 | def nr_codes(self): 882 | return self.code_to_label.shape[0] 883 | 884 | @classmethod 885 | def train(cls, prob, *arg_kw): 886 | assert prob.C is not None, "prob.C must be provided in CountModel.train()" 887 | return cls(prob.Z.T.dot(prob.Y)) 888 | 889 | def predict( 890 | self, X, csr_codes=None, only_topk=None, cond_prob=True, normalize=False, **arg_kw, 891 | ): 892 | assert csr_codes is not None, "csr_codes must be provided for CountModel.predict()" 893 | assert csr_codes.shape[0] == X.shape[0] 894 | assert csr_codes.shape[1] == self.nr_codes 895 | if cond_prob: 896 | pred_csr = csr_codes.dot(self.code_to_label).tocsr() 897 | else: 898 | tmp = csr_codes.data 899 | tmp2 = sp.ones_like(tmp) 900 | csr_codes.data = tmp2 901 | pred_csr = csr_codes.dot(self.code_to_label).tocsr() 902 | csr_codes.data = tmp 903 | 904 | pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk) 905 | if normalize:
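# L1-normalize each row so that the retained scores of every instance sum to one.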
906 | pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1") 907 | return pred_csr 908 | 909 | 910 | class CsrEnsembler(object): 911 | """A class implementing serveal ensembler for a list sorted CSR predictions""" 912 | 913 | @staticmethod 914 | def check_validlity(*args): 915 | for x in args: 916 | assert isinstance(x, smat.csr_matrix), type(x) 917 | assert all(x.shape == args[0].shape for x in args) 918 | 919 | @staticmethod 920 | def average(*args): 921 | CsrEnsembler.check_validlity(*args) 922 | ret = sum(args) 923 | ret = smat_util.sorted_csr(ret) 924 | ret.data /= len(args) 925 | return ret 926 | 927 | @staticmethod 928 | def rank_average(*args): 929 | CsrEnsembler.check_validlity(*args) 930 | mm = max((x.indptr[1:] - x.indptr[:-1]).max() for x in args) 931 | ret = sum(smat_util.get_relevance_csr(csr, mm) for csr in args) 932 | ret = smat_util.sorted_csr(ret) 933 | ret.data /= len(args) 934 | return ret 935 | 936 | @staticmethod 937 | def round_robin(*args): 938 | CsrEnsembler.check_validlity(*args) 939 | base = 1.0 / (len(args) + 1.0) 940 | mm = max((x.indptr[1:] - x.indptr[:-1]).max() for x in args) 941 | ret = smat_util.get_relevance_csr(args[0], mm) 942 | ret.data[:] += len(args) * base 943 | for i, x in enumerate(args[1:], 1): 944 | tmp = smat_util.get_relevance_csr(x, mm) 945 | tmp.data[:] += (len(args) - i) * base 946 | ret = ret.maximum(tmp) 947 | ret = smat_util.sorted_csr(ret) 948 | ret.data /= len(args) 949 | return ret 950 | 951 | @staticmethod 952 | def print_ens(Ytrue, pred_set, param_set): 953 | for param, pred in zip(param_set, pred_set): 954 | print("param: {}".format(param)) 955 | print(Metrics.generate(Ytrue, pred)) 956 | for ens in [ 957 | CsrEnsembler.average, 958 | CsrEnsembler.rank_average, 959 | CsrEnsembler.round_robin, 960 | ]: 961 | print("ens: {}".format(ens.__name__)) 962 | print(Metrics.generate(Ytrue, ens(*pred_set))) 963 | 964 | 965 | def ml_train(X, Y, C=None, bias=None, hierarchical=None, min_labels=2, nr_splits=2, **arg_kw): 966 | """An interface function for HierarchicalMLModel.train""" 967 | prob = MLProblem(X, Y, C) 968 | return HierarchicalMLModel.train(prob, hierarchical, min_labels, nr_splits, bias=bias, **arg_kw) 969 | 970 | 971 | def load_model(folder, dtype=None): 972 | if dtype is None: 973 | dtype = sp.float32 974 | param = json.loads(open("{}/param.json".format(folder), "r").read()) 975 | cls = getattr(sys.modules[__name__], param["model"]) 976 | return cls.load(folder, dtype=dtype) 977 | 978 | 979 | def get_optimal_codes(Y, C, only_topk=None): 980 | csr_codes = smat_util.sorted_csr(Y.dot(C).tocsr(), only_topk=only_topk) 981 | csr_codes = sk_normalize(csr_codes, axis=1, copy=False, norm="l1") 982 | return csr_codes 983 | 984 | 985 | # ============= Section for Ad-hoc Testing Code ============== 986 | class Data(object): 987 | def __init__(self, X, Y, L, C, code, Xt=None, Yt=None, Xv=None, Yv=None, dataset=None): 988 | self.X = X # feature matrix: nr_insts * nr_features 989 | self.Y = Y # label matrix: nr_insts * nr_labels 990 | self.L = L # label embedding: nr_labels * nr_label_features 991 | self.C = C # label codes: nr_labels * nr_codes 992 | self.code = code 993 | self.Xt = Xt 994 | self.Yt = Yt 995 | self.Xv = Xv 996 | self.Yv = Yv 997 | self.data_folder = "./datasets/{}".format(dataset) 998 | self.save_folder = "./save_models/{}".format(dataset) 999 | 1000 | def update_codes( 1001 | self, label_emb="elmo", kdim=2, depth=6, algo=indexer.Indexer.KMEANS, seed=0, max_iter=20, threads=-1, **arg_kw, 1002 | ): 1003 | 1004 | # 
print('depth {} kdim {} label_emb {} algo {}'.format(depth, kdim, label_emb, algo)) 1005 | param = { 1006 | "label_emb": label_emb, 1007 | "depth": depth, 1008 | "algo": algo, 1009 | "seed": seed, 1010 | "max_iter": max_iter, 1011 | } 1012 | code_name = "#".join(["{}:{}".format(k, v) for k, v in sorted(param.items())]) 1013 | code_npz = "{}/indexer/codes.{}.npz".format(self.save_folder, code_name) 1014 | if path.exists(code_npz): 1015 | self.C = smat.load_npz(code_npz) 1016 | else: 1017 | self.L = smat.load_npz("{}/L.{}.npz".format(self.data_folder, label_emb)) 1018 | code = indexer.Indexer(self.L).gen(kdim=kdim, depth=depth, algo=algo, seed=seed, max_iter=max_iter, threads=threads,) 1019 | self.C = code.get_csc_matrix() 1020 | smat.save_npz(code_npz, self.C, compressed=False) 1021 | 1022 | @classmethod 1023 | def load( 1024 | cls, 1025 | dataset=None, 1026 | label_emb="elmo", 1027 | kdim=2, 1028 | depth=6, 1029 | algo=indexer.Indexer.KMEANS, 1030 | seed=0, 1031 | max_iter=10, 1032 | threads=-1, 1033 | dtype=None, 1034 | **arg_kw, 1035 | ): 1036 | if dtype is None: 1037 | dtype = sp.float32 1038 | data_folder = "./datasets" 1039 | X = smat.load_npz("{}/{}/X.trn.npz".format(data_folder, dataset)) 1040 | Y = smat.load_npz("{}/{}/Y.trn.npz".format(data_folder, dataset)) 1041 | try: 1042 | Xt = smat.load_npz("{}/{}/X.tst.npz".format(data_folder, dataset)) 1043 | Yt = smat.load_npz("{}/{}/Y.tst.npz".format(data_folder, dataset)) 1044 | Xv = smat.load_npz("{}/{}/X.val.npz".format(data_folder, dataset)) 1045 | Yv = smat.load_npz("{}/{}/Y.val.npz".format(data_folder, dataset)) 1046 | except: 1047 | Xt = None 1048 | Yt = None 1049 | Xv = None 1050 | Yv = None 1051 | L, code, C = None, None, None 1052 | ret = cls(X, Y, L, C, code, Xt, Yt, Xv, Yv, dataset) 1053 | if label_emb is not None: 1054 | ret.update_codes( 1055 | label_emb=label_emb, kdim=kdim, depth=depth, seed=seed, max_iter=max_iter, threads=threads, 1056 | ) 1057 | return ret 1058 | 1059 | 1060 | def grid_search(data, grid_params, **kw_args): 1061 | params = [] 1062 | results = [] 1063 | keys = list(grid_params.keys()) 1064 | for values in itertools.product(*[grid_params[k] for k in keys]): 1065 | new_kw_args = kw_args.copy() 1066 | new_kw_args.update(dict(zip(keys, values))) 1067 | data.update_codes(**new_kw_args) 1068 | prob = MLProblem(data.X, data.Y, data.C) 1069 | model = ml_train(X=data.X, Y=data.Y, C=data.C, hierarchical=True, threshold=0.01, **new_kw_args,) 1070 | pred_csr = model.predict(data.Xt, only_topk=20, beam_size=10, normalized=False) 1071 | # print(Metrics.generate(data.Yt, pred_csr)) 1072 | results += [pred_csr] 1073 | params += [dict(zip(keys, values))] 1074 | return results, params 1075 | 1076 | 1077 | def test_speed(datafolder="dataset/Eurlex-4K", depth=3): 1078 | data = Data.load(datafolder, depth=depth) 1079 | X = data.X 1080 | Y = data.Y 1081 | C = data.C 1082 | only_topk = 20 1083 | topk = 10 1084 | Cp = 1 1085 | Cn = 1 1086 | threshold = 0.01 1087 | # solver_type = L2R_LR_DUAL 1088 | solver_type = L2R_L2LOSS_SVC_DUAL 1089 | # test multi-label with codes 1090 | prob = MLProblem(X, Y, C) 1091 | m = MLModel(smat.rand(data.X.shape[1], data.Y.shape[1], 0.1)) 1092 | rows = sp.arange(data.Yt.shape[0], dtype=sp.uint32) 1093 | cols = sp.arange(data.Yt.shape[1], dtype=sp.uint32) 1094 | inst_idx = sp.repeat(rows, sp.ones_like(rows, dtype=rows.dtype) * data.Yt.shape[1]).astype(sp.uint32) 1095 | label_idx = sp.ones((len(rows), 1), dtype=sp.uint32).dot(cols.reshape(1, -1))[:] 1096 | yy = m.predict_values(data.Xt, inst_idx, 
label_idx).reshape(data.Yt.shape[0], -1) 1097 | 1098 | 1099 | def test_svm(datafolder="dataset/Eurlex-4K", depth=3): 1100 | data = Data.load(datafolder, depth=depth) 1101 | X = PyMatrix(data.X, dtype=data.X.dtype) 1102 | # X = data.X 1103 | Y = data.Y 1104 | C = data.C 1105 | only_topk = 20 1106 | topk = 10 1107 | Cp = 1 1108 | Cn = 1 1109 | threshold = 0.01 1110 | # solver_type = L2R_LR_DUAL 1111 | solver_type = L2R_L2LOSS_SVC_DUAL 1112 | 1113 | # test multi-label with codes 1114 | prob = MLProblem(X, Y, C) 1115 | m = MLModel.train(prob, threshold=threshold, solver_type=solver_type, Cp=Cp, Cn=Cn) 1116 | pred_Y = m.predict(X, only_topk=only_topk) 1117 | print("sparse W with top {}".format(topk)) 1118 | metric = Metrics.generate(Y, pred_Y, topk) 1119 | print(metric) 1120 | """ 1121 | print('|W|^2 = {}'.format((m.W.toarray() * m.W.toarray()).sum())) 1122 | coo = smat_util.dense_to_coo(sp.ones(pred_Y.shape)) 1123 | YY = smat_util.sorted_csr(smat.csr_matrix(m.predict_values(X, coo.row, coo.col).reshape(pred_Y.shape))) 1124 | metric = Metrics.generate(Y, YY, topk) 1125 | print(metric) 1126 | YY = smat_util.sorted_csr(smat.csr_matrix(X.dot(m.W))) 1127 | metric = Metrics.generate(Y, YY, topk) 1128 | print(metric) 1129 | """ 1130 | 1131 | # test hierarchical multi-label 1132 | print("Hierarchical-Multilabel") 1133 | beam_size = 4 1134 | min_labels = 2 1135 | nr_splits = 2 1136 | m = ml_train(prob, hierarchical=True, min_labels=min_labels, threshold=threshold, solver_type=solver_type, Cp=Cp, Cn=Cn,) 1137 | print("m.depth = {}".format(m.depth)) 1138 | pred_Y = m.predict(X, beam_size=beam_size, only_topk=only_topk) 1139 | print(pred_Y.shape) 1140 | print("sparse W with top {}".format(topk)) 1141 | metric = Metrics.generate(Y, pred_Y, topk) 1142 | print(metric) 1143 | """ 1144 | max_depth = 2 1145 | print('Predict up to depth = {}'.format(max_depth)) 1146 | pred_Y = m.predict(X, only_topk=only_topk, max_depth=max_depth) 1147 | trueY = Y.copy() 1148 | for d in range(m.depth - 1, max_depth - 1, -1): 1149 | trueY = trueY.dot(m.model_chain[d].C) 1150 | metric = Metrics.generate(trueY, pred_Y, topk) 1151 | print(metric) 1152 | #print('|W|^2 = {}'.format((m.W.toarray() * m.W.toarray()).sum())) 1153 | """ 1154 | 1155 | # test pure multi-label 1156 | print("pure one-vs-rest Multi-label") 1157 | prob = MLProblem(X, Y) 1158 | m = MLModel.train(prob, threshold=threshold, solver_type=solver_type, Cp=Cp, Cn=Cn) 1159 | pred_Y = m.predict(X, only_topk=only_topk) 1160 | metric = Metrics.generate(Y, pred_Y, topk) 1161 | print(metric) 1162 | print("|W|^2 = {}".format((m.W.toarray() * m.W.toarray()).sum())) 1163 | 1164 | 1165 | if __name__ == "__main__": 1166 | test_svm(datafolder="./datasets/Eurlex-4K", depth=6) 1167 | test_speed(datafolder="./datasets/Eurlex-4K", depth=6) 1168 | -------------------------------------------------------------------------------- /xbert/rf_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | from os import path, system 5 | from glob import glob 6 | import scipy as sp 7 | import scipy.sparse as smat 8 | from scipy.sparse import identity as speye 9 | import ctypes 10 | from ctypes import * 11 | 12 | 13 | def genFields(names, types): 14 | return list(zip(names, types)) 15 | 16 | 17 | def fillprototype(f, restype, argtypes): 18 | f.restype = restype 19 | f.argtypes = argtypes 20 | 21 | 22 | def load_dynamic_library(dirname, soname, forced_rebuild=False): 23 | try: 24 | if forced_rebuild: 25 | system("make 
-C {} clean lib".format(dirname)) 26 | path_to_so = glob(path.join(dirname, soname) + "*.so")[0] 27 | _c_lib = CDLL(path_to_so) 28 | except: 29 | try: 30 | system("make -C {} clean lib".format(dirname)) 31 | path_to_so = glob(path.join(dirname, soname) + "*.so")[0] 32 | _c_lib = CDLL(path_to_so) 33 | except: 34 | raise Exception("{soname} library cannot be found and built.".format(soname=soname)) 35 | return _c_lib 36 | 37 | 38 | # Wrapper for Scipy/Numpy Matrix 39 | class PyMatrix(ctypes.Structure): 40 | DENSE_ROWMAJOR = 1 41 | DENSE_COLMAJOR = 2 42 | SPARSE = 3 43 | EYE = 4 44 | 45 | _fields_ = [ 46 | ("rows", c_uint64), 47 | ("cols", c_uint64), 48 | ("nnz", c_uint64), 49 | ("row_ptr", POINTER(c_uint64)), 50 | ("col_ptr", POINTER(c_uint64)), 51 | ("row_idx", POINTER(c_uint32)), 52 | ("col_idx", POINTER(c_uint32)), 53 | ("val", c_void_p), 54 | ("val_t", c_void_p), 55 | ("type", c_int32), 56 | ] 57 | 58 | def check_identiy(self, A): 59 | rows, cols = A.shape 60 | if rows != cols: 61 | return False 62 | if isinstance(A, sp.ndarray) and (sp.diag(A) == 1).all() != True: 63 | return False 64 | if isinstance(A, smat.spmatrix): 65 | return smat.csr_matrix(A) - speye(rows).nnz == 0 66 | 67 | return True 68 | 69 | @classmethod 70 | def identity(cls, size, dtype=sp.float32): 71 | eye = cls(A=None, dtype=dtype) 72 | eye.rows = c_uint64(size) 73 | eye.cols = c_uint64(size) 74 | eye.nnz = c_uint64(size) 75 | eye.dtype = dtype 76 | eye.type = PyMatrix.EYE 77 | name2type = dict(PyMatrix._fields_) 78 | for name in ["row_ptr", "col_ptr", "row_idx", "col_idx", "val", "val_t"]: 79 | setattr(eye, name, None) 80 | return eye 81 | 82 | def __init__(self, A, dtype=None): 83 | if A is None: 84 | return 85 | 86 | if dtype is None: 87 | dtype = sp.float32 88 | 89 | self.rows = c_uint64(A.shape[0]) 90 | self.cols = c_uint64(A.shape[1]) 91 | self.py_buf = {} 92 | self.dtype = dtype 93 | py_buf = self.py_buf 94 | 95 | if isinstance(A, (smat.csc_matrix, smat.csr_matrix)): 96 | Acsr = smat.csr_matrix(A) 97 | Acsc = smat.csc_matrix(A) 98 | self.type = PyMatrix.SPARSE 99 | self.nnz = c_uint64(Acsr.indptr[-1]) 100 | py_buf["row_ptr"] = Acsr.indptr.astype(sp.uint64) 101 | py_buf["col_idx"] = Acsr.indices.astype(sp.uint32) 102 | py_buf["val_t"] = Acsr.data.astype(dtype) 103 | py_buf["col_ptr"] = Acsc.indptr.astype(sp.uint64) 104 | py_buf["row_idx"] = Acsc.indices.astype(sp.uint32) 105 | py_buf["val"] = Acsc.data.astype(dtype) 106 | 107 | elif isinstance(A, smat.coo_matrix): 108 | 109 | def coo_to_csr(coo): 110 | nr_rows, nr_cols, nnz, row, col, val = ( 111 | coo.shape[0], 112 | coo.shape[1], 113 | coo.data.shape[0], 114 | coo.row, 115 | coo.col, 116 | coo.data, 117 | ) 118 | indptr = sp.cumsum(sp.bincount(row + 1, minlength=(nr_rows + 1)), dtype=sp.uint64) 119 | indices = sp.zeros(nnz, dtype=sp.uint32) 120 | data = sp.zeros(nnz, dtype=dtype) 121 | sorted_idx = sp.argsort(row * sp.float64(nr_cols) + col) 122 | indices[:] = col[sorted_idx] 123 | data[:] = val[sorted_idx] 124 | return indptr, indices, data 125 | 126 | def coo_to_csc(coo): 127 | return coo_to_csr(smat.coo_matrix((coo.data, (coo.col, coo.row)), shape=[coo.shape[1], coo.shape[0]],)) 128 | 129 | coo = A.tocoo() 130 | self.type = PyMatrix.SPARSE 131 | self.nnz = c_uint64(coo.data.shape[0]) 132 | py_buf["row_ptr"], py_buf["col_idx"], py_buf["val_t"] = coo_to_csr(coo) 133 | py_buf["col_ptr"], py_buf["row_idx"], py_buf["val"] = coo_to_csc(coo) 134 | 135 | elif isinstance(A, sp.ndarray): 136 | py_buf["val"] = A.astype(dtype) 137 | if py_buf["val"].flags.f_contiguous: 
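# Fortran-ordered (column-major) dense arrays are tagged DENSE_COLMAJOR so the C core
# can consume them without a transpose; the default C-ordered numpy layout falls
# through to DENSE_ROWMAJOR below.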
138 | self.type = PyMatrix.DENSE_COLMAJOR 139 | else: 140 | self.type = PyMatrix.DENSE_ROWMAJOR 141 | self.nnz = c_uint64(A.shape[0] * A.shape[1]) 142 | name2type = dict(PyMatrix._fields_) 143 | for name in py_buf: 144 | setattr(self, name, py_buf[name].ctypes.data_as(name2type[name])) 145 | self.buf = A 146 | 147 | @property 148 | def shape(self): 149 | return self.buf.shape 150 | 151 | def dot(self, other): 152 | return self.buf.dot(other) 153 | 154 | @classmethod 155 | def init_from(cls, A, dtype=None): 156 | if A is None: 157 | return None 158 | elif isinstance(A, PyMatrix): 159 | if dtype is None or A.dtype == dtype: 160 | return A 161 | else: 162 | return cls(A.buf, dtype) 163 | else: 164 | return cls(A, dtype) 165 | 166 | 167 | class PredAllocator(object): 168 | CFUNCTYPE = CFUNCTYPE(None, c_uint64, c_uint64, c_uint64, c_void_p, c_void_p, c_void_p, c_void_p) 169 | 170 | def __init__(self, rows=0, cols=0, dtype=sp.float64): 171 | self.rows = rows 172 | self.cols = cols 173 | self.indptr = None 174 | self.indices = None 175 | self.data1 = None 176 | self.data2 = None 177 | self.dtype = dtype 178 | assert dtype == sp.float32 or dtype == sp.float64 179 | 180 | def __call__(self, rows, cols, nnz, indptr_ptr, indices_ptr, data1_ptr, data2_ptr): 181 | self.rows = rows 182 | self.cols = cols 183 | self.indptr = sp.zeros(self.cols + 1, dtype=sp.uint64) 184 | self.indices = sp.zeros(nnz, dtype=sp.uint64) 185 | self.data1 = sp.zeros(nnz, dtype=self.dtype) 186 | self.data2 = sp.zeros(nnz, dtype=self.dtype) 187 | 188 | cast(indptr_ptr, POINTER(c_uint64)).contents.value = self.indptr.ctypes.data_as(c_void_p).value 189 | cast(indices_ptr, POINTER(c_uint64)).contents.value = self.indices.ctypes.data_as(c_void_p).value 190 | cast(data1_ptr, POINTER(c_uint64)).contents.value = self.data1.ctypes.data_as(c_void_p).value 191 | cast(data2_ptr, POINTER(c_uint64)).contents.value = self.data2.ctypes.data_as(c_void_p).value 192 | 193 | def get_pred(self): 194 | csr_labels = smat.csc_matrix((self.data1, self.indices, self.indptr), shape=(self.rows, self.cols)).tocsr() 195 | pred_csr = smat.csc_matrix((self.data2, self.indices, self.indptr), shape=(self.rows, self.cols)).tocsr() 196 | return csr_labels, pred_csr 197 | 198 | @property 199 | def cfunc(self): 200 | return self.CFUNCTYPE(self) 201 | 202 | 203 | class COOAllocator(object): 204 | CFUNCTYPE = CFUNCTYPE(None, c_uint64, c_uint64, c_uint64, c_void_p, c_void_p, c_void_p) 205 | 206 | def __init__(self, rows=0, cols=0, dtype=sp.float64): 207 | self.rows = rows 208 | self.cols = cols 209 | self.row_idx = None 210 | self.col_idx = None 211 | self.data = None 212 | self.dtype = dtype 213 | assert dtype == sp.float32 or dtype == sp.float64 214 | 215 | def __call__(self, rows, cols, nnz, row_ptr, col_ptr, val_ptr): 216 | self.rows = rows 217 | self.cols = cols 218 | self.row_idx = sp.zeros(nnz, dtype=sp.uint64) 219 | self.col_idx = sp.zeros(nnz, dtype=sp.uint64) 220 | self.data = sp.zeros(nnz, dtype=self.dtype) 221 | cast(row_ptr, POINTER(c_uint64)).contents.value = self.row_idx.ctypes.data_as(c_void_p).value 222 | cast(col_ptr, POINTER(c_uint64)).contents.value = self.col_idx.ctypes.data_as(c_void_p).value 223 | cast(val_ptr, POINTER(c_uint64)).contents.value = self.data.ctypes.data_as(c_void_p).value 224 | 225 | def tocoo(self): 226 | return smat.coo_matrix((self.data, (self.row_idx, self.col_idx)), shape=(self.rows, self.cols)) 227 | 228 | def tocsr(self): 229 | return smat.csr_matrix((self.data, (self.row_idx, self.col_idx)), shape=(self.rows, self.cols)) 
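# Note: scipy sums duplicate (row, col) entries when building CSR/CSC matrices from
# COO-style data, so any repeated coordinates would be accumulated rather than overwritten
# in these conversions.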
230 | 231 | def tocsc(self): 232 | return smat.csc_matrix((self.data, (self.row_idx, self.col_idx)), shape=(self.rows, self.cols)) 233 | 234 | @property 235 | def cfunc(self): 236 | return self.CFUNCTYPE(self) 237 | 238 | 239 | class PyAllocator: 240 | CFUNCTYPE = CFUNCTYPE(c_long, c_int, POINTER(c_int), c_char) 241 | 242 | def __init__(self): 243 | self.allocated_arrays = [] 244 | 245 | def __call__(self, dims, shape, dtype): 246 | x = sp.zeros(shape[:dims], sp.dtype(dtype)) 247 | self.allocated_arrays.append(x) 248 | return x.ctypes.data_as(c_void_p).value 249 | 250 | def getcfunc(self): 251 | return self.CFUNCTYPE(self) 252 | 253 | cfunc = property(getcfunc) 254 | 255 | 256 | class smat_util(object): 257 | class coo_appender(object): 258 | def __init__(self, shape): 259 | self.shape = shape 260 | self.row = [] 261 | self.col = [] 262 | self.val = [] 263 | 264 | def append(self, i, j, v): 265 | self.row += [i] 266 | self.col += [j] 267 | self.val += [v] 268 | 269 | def tocoo(self): 270 | row = sp.array(self.row) 271 | col = sp.array(self.col) 272 | val = sp.array(self.val) 273 | return smat.coo_matrix((val, (row, col)), shape=self.shape) 274 | 275 | def tocsc(self): 276 | return self.tocoo().tocsc() 277 | 278 | def tocsr(self): 279 | return self.tocoo().tocsr() 280 | 281 | #""" 282 | @staticmethod 283 | def sorted_csr_from_coo(shape, row_idx, col_idx, val, only_topk=None): 284 | m = (sp.absolute(val.astype(sp.float64)).sum() + 1.0) * 3 285 | sorted_idx = sp.argsort(row_idx * m - val) 286 | row_idx[:] = row_idx[sorted_idx] 287 | col_idx[:] = col_idx[sorted_idx] 288 | val[:] = val[sorted_idx] 289 | indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1))) 290 | if only_topk is not None and isinstance(only_topk, int): 291 | only_topk = max(min(1, only_topk), only_topk) 292 | selected_idx = (sp.arange(len(val)) - indptr[row_idx]) < only_topk 293 | row_idx = row_idx[selected_idx] 294 | col_idx = col_idx[selected_idx] 295 | val = val[selected_idx] 296 | indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1))) 297 | return smat.csr_matrix((val, col_idx, indptr), shape=shape, dtype=val.dtype) 298 | #""" 299 | """ 300 | @staticmethod 301 | def sorted_csr_from_coo(shape, row_idx, col_idx, val, only_topk=None): 302 | csr = smat.csr_matrix((val, (row_idx, col_idx)), shape=shape) 303 | csr.sort_indices() 304 | for i in range(shape[0]): 305 | rng = slice(csr.indptr[i], csr.indptr[i + 1]) 306 | sorted_idx = sp.argsort(-csr.data[rng], kind="mergesort") 307 | csr.indices[rng] = csr.indices[rng][sorted_idx] 308 | csr.data[rng] = csr.data[rng][sorted_idx] 309 | if only_topk is not None: 310 | assert isinstance(only_topk, int), f"Wrong type: type(only_topk) = {type(only_topk)}" 311 | only_topk = max(min(1, only_topk), only_topk) 312 | nnz_of_insts = csr.indptr[1:] - csr.indptr[:-1] 313 | row_idx = sp.repeat(sp.arange(shape[0], dtype=sp.uint32), nnz_of_insts) 314 | selected_idx = (sp.arange(len(csr.data)) - csr.indptr[row_idx]) < only_topk 315 | row_idx = row_idx[selected_idx] 316 | col_idx = csr.indices[selected_idx] 317 | val = csr.data[selected_idx] 318 | indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1))) 319 | csr = smat.csr_matrix((val, col_idx, indptr), shape=shape, dtype=val.dtype) 320 | return csr 321 | """ 322 | @staticmethod 323 | def sorted_csc_from_coo(shape, row_idx, col_idx, val, only_topk=None): 324 | csr = smat_util.sorted_csr_from_coo(shape[::-1], col_idx, row_idx, val, only_topk=None) 325 | return smat.csc_matrix((csr.data, csr.indices, 
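# sorted_csr below reorders every row of a CSR matrix so its entries appear in
# descending score order, optionally truncated to the top-k per row; it relies on the
# argsort trick in sorted_csr_from_coo above, where m is chosen larger than any |value|
# so that the key row_idx * m - val sorts by row first and by decreasing value within a row.
# A minimal usage sketch (hypothetical variable names):
#   top5 = smat_util.sorted_csr(pred_csr, only_topk=5)  # keep the 5 highest-scoring labels per row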
csr.indptr), shape, dtype=val.dtype) 326 | 327 | @staticmethod 328 | def sorted_csr(csr, only_topk=None): 329 | assert isinstance(csr, smat.csr_matrix) 330 | row_idx = sp.repeat(sp.arange(csr.shape[0], dtype=sp.uint32), csr.indptr[1:] - csr.indptr[:-1]) 331 | return smat_util.sorted_csr_from_coo(csr.shape, row_idx, csr.indices, csr.data, only_topk) 332 | 333 | @staticmethod 334 | def sorted_csc(csc, only_topk=None): 335 | assert isinstance(csc, smat.csc_matrix) 336 | return smat_util.sorted_csr(csc.T).T 337 | 338 | @staticmethod 339 | def append_column(X, value=1.0, fast=True): 340 | assert len(X.shape) == 2 341 | new_column = value * sp.ones((X.shape[0], 1), dtype=X.dtype) 342 | if isinstance(X, smat.csc_matrix): 343 | if fast: # around 5x to 10x faster than smat.hstack 344 | data = sp.concatenate((X.data, new_column.ravel())) 345 | indices = sp.concatenate((X.indices, sp.arange(X.shape[0], dtype=X.indices.dtype))) 346 | indptr = sp.concatenate((X.indptr, sp.array([X.indptr[-1] + X.shape[0]], dtype=X.indptr.dtype),)) 347 | X = smat.csc_matrix((data, indices, indptr), shape=(X.shape[0], X.shape[1] + 1)) 348 | else: 349 | X = smat.hstack([X, new_column]).tocsc() 350 | elif isinstance(X, smat.csr_matrix): 351 | if fast: # around 5x to 10x faster than smat.hstack 352 | indptr = X.indptr + sp.arange(X.shape[0] + 1, dtype=X.indptr.dtype) 353 | indices = sp.zeros(len(X.indices) + X.shape[0], dtype=X.indices.dtype) 354 | data = sp.zeros(len(X.data) + X.shape[0], dtype=X.data.dtype) 355 | mask_loc = indptr[1:] - 1 356 | inv_mask = sp.ones_like(indices, dtype=sp.bool8) 357 | inv_mask[mask_loc] = False 358 | indices[mask_loc] = X.shape[1] 359 | data[mask_loc] = value 360 | indices[inv_mask] = X.indices 361 | data[inv_mask] = X.data 362 | X = smat.csr_matrix((data, indices, indptr), shape=(X.shape[0], X.shape[1] + 1)) 363 | else: 364 | X = smat.hstack([X, new_column]).tocsr() 365 | elif isinstance(X, sp.ndarray): 366 | X = sp.hstack([X, new_column]) 367 | return X 368 | 369 | @staticmethod 370 | def dense_to_coo(dense): 371 | rows = sp.arange(dense.shape[0], dtype=sp.uint32) 372 | cols = sp.arange(dense.shape[1], dtype=sp.uint32) 373 | row_idx = sp.repeat(rows, sp.ones_like(rows) * len(cols)).astype(sp.uint32) 374 | col_idx = sp.ones((len(rows), 1), dtype=sp.uint32).dot(cols.reshape(1, -1)).ravel() 375 | return smat.coo_matrix((dense.ravel(), (row_idx, col_idx)), shape=dense.shape) 376 | 377 | @staticmethod 378 | def get_relevance_csr(csr, mm=None, dtype=sp.float64): 379 | if mm is None: 380 | mm = (csr.indptr[1:] - csr.indptr[:-1]).max() 381 | nnz = len(csr.data) 382 | nnz_of_rows = csr.indptr[1:] - csr.indptr[:-1] 383 | row_idx = sp.repeat(sp.arange(csr.shape[0]), nnz_of_rows) 384 | rel = sp.array(mm - (sp.arange(nnz) - csr.indptr[row_idx]), dtype=dtype) # adding 1 to avoiding zero entries 385 | return smat.csr_matrix((rel, csr.indices, csr.indptr), csr.shape) 386 | 387 | 388 | def svm_read_problem(data_file_name, return_scipy=True): 389 | """ 390 | svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary 391 | svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix 392 | 393 | Read LIBSVM-format data from data_file_name and return labels y 394 | and data instances x. 
395 | """ 396 | scipy = sp 397 | prob_y = [] 398 | prob_x = [] 399 | row_ptr = [0] 400 | col_idx = [] 401 | for i, line in enumerate(open(data_file_name)): 402 | line = line.split(None, 1) 403 | # In case an instance with all zero features 404 | if len(line) == 1: 405 | line += [""] 406 | label, features = line 407 | prob_y += [float(label)] 408 | if scipy != None and return_scipy: 409 | nz = 0 410 | for e in features.split(): 411 | ind, val = e.split(":") 412 | val = float(val) 413 | if val != 0: 414 | col_idx += [int(ind) - 1] 415 | prob_x += [val] 416 | nz += 1 417 | row_ptr += [row_ptr[-1] + nz] 418 | else: 419 | xi = {} 420 | for e in features.split(): 421 | ind, val = e.split(":") 422 | xi[int(ind)] = float(val) 423 | prob_x += [xi] 424 | if scipy != None and return_scipy: 425 | prob_y = scipy.array(prob_y) 426 | prob_x = scipy.array(prob_x) 427 | col_idx = scipy.array(col_idx) 428 | row_ptr = scipy.array(row_ptr) 429 | prob_x = smat.csr_matrix((prob_x, col_idx, row_ptr)) 430 | return (prob_y, prob_x) 431 | -------------------------------------------------------------------------------- /xbert/transformer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import argparse 21 | import copy 22 | import csv 23 | import glob 24 | import json 25 | import logging 26 | import math 27 | import numpy as np 28 | import os 29 | import random 30 | import re 31 | import pickle 32 | import shutil 33 | import tarfile 34 | import tempfile 35 | import scipy as sp 36 | import scipy.sparse as smat 37 | import sys 38 | 39 | import time 40 | from os import path 41 | from io import open 42 | 43 | import torch 44 | import torch.nn as nn 45 | import torch.nn.functional as F 46 | 47 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset 48 | from torch.utils.data.distributed import DistributedSampler 49 | 50 | # from torch.utils.tensorboard import SummaryWriter 51 | 52 | from tqdm import tqdm, trange 53 | 54 | import xbert.rf_linear as rf_linear 55 | import xbert.rf_util as rf_util 56 | from xbert.modeling import BertForXMLC, RobertaForXMLC, XLNetForXMLC 57 | 58 | from transformers import ( 59 | WEIGHTS_NAME, 60 | BertConfig, 61 | BertTokenizer, 62 | RobertaConfig, 63 | RobertaTokenizer, 64 | XLNetConfig, 65 | XLNetTokenizer, 66 | ) 67 | 68 | from transformers import AdamW, get_linear_schedule_with_warmup 69 | 70 | 71 | # global variable within the module 72 | 73 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, XLNetConfig)), (),) 74 | 75 | MODEL_CLASSES = { 76 | "bert": (BertConfig, BertForXMLC, BertTokenizer), 77 | "roberta": (RobertaConfig, RobertaForXMLC, RobertaTokenizer), 78 | "xlnet": (XLNetConfig, XLNetForXMLC, XLNetTokenizer), 79 | } 80 | 81 | logger = None 82 | 83 | 84 | def set_seed(args): 85 | random.seed(args.seed) 86 | np.random.seed(args.seed) 87 | torch.manual_seed(args.seed) 88 | if args.n_gpu > 0: 89 | torch.cuda.manual_seed_all(args.seed) 90 | 91 | 92 | # transform model prediction optimized under margin-loss 93 | # into smoother curve for ranker 94 | def transform_prediction(csr_codes, transform="lpsvm-l2"): 95 | if transform == "sigmoid": 96 | csr_codes.data[:] = rf_linear.Transform.sigmoid(csr_codes.data[:]) 97 | elif transform == "lpsvm-l2": 98 | csr_codes.data[:] = rf_linear.Transform.lpsvm(2, csr_codes.data[:]) 99 | elif transform == "lpsvm-l3": 100 | csr_codes.data[:] = rf_linear.Transform.lpsvm(3, csr_codes.data[:]) 101 | else: 102 | raise NotImplementedError("unknown transform {}".format(transform)) 103 | return csr_codes 104 | 105 | class HingeLoss(nn.Module): 106 | """criterion for loss function 107 | y: 0/1 ground truth matrix of size: batch_size x output_size 108 | f: real number pred matrix of size: batch_size x output_size 109 | """ 110 | 111 | def __init__(self, margin=1.0, squared=True): 112 | super(HingeLoss, self).__init__() 113 | self.margin = margin 114 | self.squared = squared 115 | 116 | def forward(self, f, y, C_pos=1.0, C_neg=1.0): 117 | # convert y into {-1,1} 118 | y_new = 2.0 * y - 1.0 119 | tmp = y_new * f 120 | 121 | # Hinge loss 122 | loss = F.relu(self.margin - tmp) 123 | if self.squared: 124 | loss = loss ** 2 125 | loss = loss * (C_pos * y + C_neg * (1.0 - y)) 126 | return loss.mean() 127 | 128 | 129 | class TransformerMatcher(object): 130 | """ TODO Doc""" 131 | 132 | def __init__(self, model=None, num_clusters=None): 133 | self.model = model 134 | self.num_clusters = num_clusters 135 | self.loss_fn = HingeLoss(margin=1.0, squared=True) 136 | 137 | @staticmethod 138 | def 
get_args_and_set_logger(): 139 | global logger 140 | logging.basicConfig( 141 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, 142 | ) 143 | logger = logging.getLogger(__name__) 144 | parser = argparse.ArgumentParser(description="") 145 | 146 | ## Required parameters 147 | parser.add_argument( 148 | "-m", "--model-type", type=str, required=True, default="bert", help="preprocess for model-type [bert | xlnet | xlm | roberta]", 149 | ) 150 | parser.add_argument( 151 | "-n", 152 | "--model_name_or_path", 153 | type=str, 154 | required=True, 155 | default="bert-base-uncased", 156 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), 157 | ) 158 | parser.add_argument( 159 | "-x_trn", 160 | "--trn_feat_path", 161 | default="./save_models/Eurlex-4K/proc_data/X.trn.bert.128.pkl", 162 | type=str, 163 | ) 164 | parser.add_argument( 165 | "-x_tst", 166 | "--tst_feat_path", 167 | default="./save_models/Eurlex-4K/proc_data/X.tst.bert.128.pkl", 168 | type=str, 169 | ) 170 | parser.add_argument( 171 | "-c_trn", 172 | "--trn_label_path", 173 | default="./save_models/Eurlex-4K/proc_data/C.trn.pifa-tfidf-s0.npz", 174 | type=str, 175 | ) 176 | parser.add_argument( 177 | "-c_tst", 178 | "--tst_label_path", 179 | default="./save_models/Eurlex-4K/proc_data/C.tst.pifa-tfidf-s0.npz", 180 | type=str, 181 | ) 182 | parser.add_argument( 183 | "-o", 184 | "--output_dir", 185 | default="./tmp", 186 | type=str, 187 | help="The output directory where the model predictions and checkpoints will be written.", 188 | ) 189 | ## Other parameters 190 | parser.add_argument( 191 | "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", 192 | ) 193 | parser.add_argument( 194 | "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3", 195 | ) 196 | parser.add_argument("--do_train", action="store_true", help="Whether to run training.") 197 | parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") 198 | parser.add_argument( 199 | "--hidden_dropout_prob", default=0.1, type=float, help="hidden dropout prob in deep transformer models.", 200 | ) 201 | parser.add_argument( 202 | "--per_device_train_batch_size", default=8, type=int, help="Batch size per GPU for training.", 203 | ) 204 | parser.add_argument( 205 | "--per_device_eval_batch_size", default=8, type=int, help="Batch size per GPU for evaluation.", 206 | ) 207 | parser.add_argument( 208 | "--gradient_accumulation_steps", 209 | type=int, 210 | default=1, 211 | help="Number of updates steps to accumulate before performing a backward/update pass.", 212 | ) 213 | parser.add_argument( 214 | "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.", 215 | ) 216 | parser.add_argument( 217 | "--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.", 218 | ) 219 | parser.add_argument( 220 | "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.", 221 | ) 222 | parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") 223 | parser.add_argument( 224 | "--num_train_epochs", default=5.0, type=float, help="Total number of training epochs to perform.", 225 | ) 226 | parser.add_argument( 227 | "--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", 228 | ) 229 | parser.add_argument( 230 | "--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.", 231 | ) 232 | parser.add_argument( 233 | "--logging_steps", default=100, type=int, help="Log every X updates steps.", 234 | ) 235 | parser.add_argument( 236 | "--loss_func", default="l2-hinge", type=str, help="loss function: bce | l1-hinge | l2-hinge", 237 | ) 238 | parser.add_argument("--margin", default=1.0, type=float, help="margin in hinge loss") 239 | parser.add_argument( 240 | "--only_topk", default=10, type=int, help="store topk prediction for matching stage", 241 | ) 242 | 243 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 244 | parser.add_argument( 245 | "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", 246 | ) 247 | parser.add_argument( 248 | "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", 249 | ) 250 | parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") 251 | 252 | parser.add_argument( 253 | "--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", 254 | ) 255 | parser.add_argument( 256 | "--fp16_opt_level", 257 | type=str, 258 | default="O1", 259 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 260 | "See details at https://nvidia.github.io/apex/amp.html", 261 | ) 262 | parser.add_argument( 263 | "--local_rank", type=int, default=-1, help="For distributed training: local_rank", 264 | ) 265 | 266 | args = parser.parse_args() 267 | return {"parser": parser, "logger": logger, "args": args} 268 | 269 | 270 | @staticmethod 271 | def set_device(args): 272 | """ set device for multi-gpu training, and fix random seed, and exp logging. """ 273 | 274 | # Setup CUDA, GPU & distributed training 275 | if args.no_cuda: 276 | device = torch.device("cpu") 277 | args.n_gpu = 0 278 | elif args.local_rank == -1: 279 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 280 | args.n_gpu = torch.cuda.device_count() 281 | else: 282 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 283 | torch.distributed.init_process_group(backend="nccl") 284 | device = torch.device("cuda", args.local_rank) 285 | args.n_gpu = 1 286 | if device.type == "cuda": 287 | torch.cuda.set_device(device) 288 | args.device = device 289 | 290 | logger.info( 291 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 292 | args.local_rank, 293 | args.device, 294 | args.n_gpu, 295 | bool(args.local_rank != -1), 296 | args.fp16, 297 | ) 298 | # Set seed 299 | set_seed(args) 300 | 301 | def prepare_model(self, args): 302 | """ Load a pretrained model for sequence classification. 
""" 303 | if args.local_rank not in [-1, 0]: 304 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 305 | 306 | args.model_type = args.model_type.lower() 307 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 308 | config = config_class.from_pretrained( 309 | args.config_name if args.config_name else args.model_name_or_path, 310 | hidden_dropout_prob=args.hidden_dropout_prob, 311 | num_labels=self.num_clusters, 312 | finetuning_task=None, 313 | cache_dir=args.cache_dir if args.cache_dir else None, 314 | ) 315 | model = model_class.from_pretrained( 316 | args.model_name_or_path, 317 | from_tf=bool(".ckpt" in args.model_name_or_path), 318 | config=config, 319 | cache_dir=args.cache_dir if args.cache_dir else None, 320 | ) 321 | 322 | if args.local_rank == 0: 323 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 324 | model.to(args.device) 325 | 326 | # overwrite 327 | self.config = config 328 | self.model = model 329 | 330 | def save_model(self, args): 331 | # Save model checkpoint 332 | if not os.path.exists(args.output_dir): 333 | os.makedirs(args.output_dir) 334 | model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Take care of distributed/parallel training 335 | model_to_save.save_pretrained(args.output_dir) 336 | torch.save(args, os.path.join(args.output_dir, "training_args.bin")) 337 | 338 | def predict(self, args, X_eval, C_eval_true, topk=10, get_hidden=False): 339 | """Prediction interface""" 340 | args.eval_batch_size = args.per_device_eval_batch_size * max(1, args.n_gpu) 341 | all_inst_idx = torch.tensor([f["inst_idx"] for f in X_eval], dtype=torch.long) 342 | all_input_ids = torch.tensor([f["input_ids"] for f in X_eval], dtype=torch.long) 343 | all_attention_mask = torch.tensor([f["attention_mask"] for f in X_eval], dtype=torch.long) 344 | all_token_type_ids = torch.tensor([f["token_type_ids"] for f in X_eval], dtype=torch.long) 345 | eval_data = TensorDataset(all_inst_idx, all_input_ids, all_attention_mask, all_token_type_ids) 346 | 347 | eval_sampler = SequentialSampler(eval_data) 348 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=4,) 349 | 350 | # multi-gpu eval 351 | if args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): 352 | self.model = torch.nn.DataParallel(self.model) 353 | 354 | if args.local_rank in [-1, 0]: 355 | logger.info("***** Running evaluation *****") 356 | logger.info(" Num examples = %d", len(X_eval)) 357 | logger.info(" Batch size = %d", args.eval_batch_size) 358 | 359 | total_loss = 0.0 360 | total_example = 0.0 361 | rows, cols, vals = [], [], [] 362 | all_pooled_output = [] 363 | self.model.eval() 364 | for batch in eval_dataloader: 365 | with torch.no_grad(): 366 | inst_idx = batch[0] 367 | inputs = { 368 | "input_ids": batch[1].to(args.device), 369 | "attention_mask": batch[2].to(args.device), 370 | } 371 | if args.model_type != "distilbert": 372 | inputs["token_type_ids"] = ( 373 | batch[3].to(args.device) if args.model_type in ["bert", "xlnet"] else None 374 | ) # XLM, DistilBERT and RoBERTa don't use segment_ids 375 | cur_batch_size = inputs["input_ids"].shape[0] 376 | 377 | # forward 378 | outputs = self.model( 379 | input_ids=inputs["input_ids"], 380 | attention_mask=inputs["attention_mask"], 381 | token_type_ids=inputs["token_type_ids"], 382 | ) 383 | if get_hidden and 
self.config.output_hidden_states: 384 | c_pred, hidden_states = outputs[0], outputs[1] 385 | else: 386 | c_pred = outputs[0] 387 | 388 | # compute loss 389 | c_eval = np.array(C_eval_true[inst_idx].toarray()) 390 | c_eval = torch.tensor(c_eval, dtype=torch.float).to(args.device) 391 | loss = self.loss_fn(c_pred, c_eval) 392 | 393 | if args.n_gpu > 1: 394 | loss = loss.mean() # mean() to average on multi-gpu parallel training 395 | total_loss += cur_batch_size * loss 396 | 397 | # get pooled_output, which is the [CLS] embedding for the document 398 | if get_hidden: 399 | if args.model_type == "bert": 400 | if args.n_gpu > 1: 401 | # assume self.model hasattr module because torch.nn.DataParallel. Else, just pull model.bert. in single gpu case 402 | pooled_output = self.model.module.bert.pooler(hidden_states[-1]) 403 | pooled_output = self.model.module.dropout(pooled_output) 404 | else: #single-gpu 405 | pooled_output = self.model.bert.pooler(hidden_states[-1]) 406 | pooled_output = self.model.dropout(pooled_output) 407 | # logits = self.model.classifier(pooled_output) 408 | elif args.model_type == "roberta": 409 | if args.n_gpu > 1: 410 | pooled_output = self.model.module.classifier.dropout(hidden_states[-1][:, 0, :]) 411 | pooled_output = self.model.module.classifier.dense(pooled_output) 412 | pooled_output = torch.tanh(pooled_output) 413 | pooled_output = self.model.module.classifier.dropout(pooled_output) 414 | # logits = self.model.classifier.out_proj(pooled_output) 415 | else: 416 | pooled_output = self.model.classifier.dropout(hidden_states[-1][:, 0, :]) 417 | pooled_output = self.model.classifier.dense(pooled_output) 418 | pooled_output = torch.tanh(pooled_output) 419 | pooled_output = self.model.classifier.dropout(pooled_output) 420 | elif args.model_type == "xlnet": 421 | if args.n_gpu > 1: 422 | pooled_output = self.model.module.sequence_summary(hidden_states[-1]) 423 | else: 424 | pooled_output = self.model.sequence_summary(hidden_states[-1]) 425 | 426 | # logits = self.model.logits_proj(pooled_output) 427 | else: 428 | raise NotImplementedError("unknown args.model_type {}".format(args.model_type)) 429 | all_pooled_output.append(pooled_output.cpu().numpy()) 430 | 431 | # get topk prediction rows,cols,vals 432 | cpred_topk_vals, cpred_topk_cols = c_pred.topk(topk, dim=1) 433 | cpred_topk_rows = total_example + torch.arange(cur_batch_size) 434 | cpred_topk_rows = cpred_topk_rows.view(cur_batch_size, 1).expand_as(cpred_topk_cols) 435 | total_example += cur_batch_size 436 | 437 | # append 438 | rows += cpred_topk_rows.numpy().flatten().tolist() 439 | cols += cpred_topk_cols.cpu().numpy().flatten().tolist() 440 | vals += cpred_topk_vals.cpu().numpy().flatten().tolist() 441 | 442 | eval_loss = total_loss / total_example 443 | m = int(total_example) 444 | n = self.num_clusters 445 | pred_csr_codes = smat.csr_matrix((vals, (rows, cols)), shape=(m, n)) 446 | pred_csr_codes = rf_util.smat_util.sorted_csr(pred_csr_codes, only_topk=args.only_topk) 447 | C_eval_pred = pred_csr_codes 448 | 449 | # evaluation 450 | eval_metrics = rf_linear.Metrics.generate(C_eval_true, C_eval_pred, topk=args.only_topk) 451 | if get_hidden: 452 | eval_embeddings = np.concatenate(all_pooled_output, axis=0) 453 | else: 454 | eval_embeddings = None 455 | return eval_loss, eval_metrics, C_eval_pred, eval_embeddings 456 | 457 | def train(self, args, X_trn, C_trn): 458 | """ Train the model """ 459 | args.train_batch_size = args.per_device_train_batch_size * max(1, args.n_gpu) 460 | all_inst_idx = 
torch.tensor([f["inst_idx"] for f in X_trn], dtype=torch.long) 461 | all_input_ids = torch.tensor([f["input_ids"] for f in X_trn], dtype=torch.long) 462 | all_attention_mask = torch.tensor([f["attention_mask"] for f in X_trn], dtype=torch.long) 463 | all_token_type_ids = torch.tensor([f["token_type_ids"] for f in X_trn], dtype=torch.long) 464 | train_data = TensorDataset(all_inst_idx, all_input_ids, all_attention_mask, all_token_type_ids) 465 | train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data) 466 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=4,) 467 | 468 | if args.max_steps > 0: 469 | t_total = args.max_steps 470 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 471 | else: 472 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 473 | 474 | # Prepare optimizer 475 | no_decay = ["bias", "LayerNorm.weight"] 476 | optimizer_grouped_parameters = [ 477 | { 478 | "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 479 | "weight_decay": args.weight_decay, 480 | }, 481 | {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0,}, 482 | ] 483 | 484 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 485 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) 486 | if args.fp16: 487 | try: 488 | from apex import amp 489 | except ImportError: 490 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 491 | self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=args.fp16_opt_level) 492 | 493 | # multi-gpu training (should be after apex fp16 initialization) 494 | if args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): 495 | self.model = torch.nn.DataParallel(self.model) 496 | 497 | # Distributed training (should be after apex fp16 initialization) 498 | if args.local_rank != -1: 499 | self.model = torch.nn.parallel.DistributedDataParallel( 500 | self.model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, 501 | ) 502 | 503 | # Start Batch Training 504 | if args.local_rank in [-1, 0]: 505 | logger.info("***** Running training *****") 506 | logger.info(" Num examples = %d", len(X_trn)) 507 | logger.info(" Num Epochs = %d", args.num_train_epochs) 508 | logger.info(" Instantaneous batch size per GPU = %d", args.per_device_train_batch_size) 509 | logger.info( 510 | " Total train batch size (w. 
parallel, distributed & accumulation) = %d", 511 | args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), 512 | ) 513 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 514 | logger.info(" Total optimization steps = %d", t_total) 515 | 516 | global_step = 0 517 | tr_loss, logging_loss = 0.0, 0.0 518 | total_run_time = 0.0 519 | best_matcher_prec = -1 520 | 521 | self.model.zero_grad() 522 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 523 | for epoch in range(1, int(args.num_train_epochs) + 1): 524 | for step, batch in enumerate(train_dataloader): 525 | self.model.train() 526 | start_time = time.time() 527 | inst_idx = batch[0] 528 | inputs = { 529 | "input_ids": batch[1].to(args.device), 530 | "attention_mask": batch[2].to(args.device), 531 | } 532 | if args.model_type != "distilbert": 533 | inputs["token_type_ids"] = ( 534 | batch[3].to(args.device) if args.model_type in ["bert", "xlnet"] else None 535 | ) # XLM, DistilBERT and RoBERTa don't use segment_ids 536 | 537 | outputs = self.model( 538 | input_ids=inputs["input_ids"], 539 | attention_mask=inputs["attention_mask"], 540 | token_type_ids=inputs["token_type_ids"], 541 | ) 542 | logits = outputs[0] # model outputs are always tuple in transformers (see doc) 543 | 544 | # compute loss, average across multi-gpu 545 | labels = np.array(C_trn[inst_idx].toarray()) 546 | labels = torch.tensor(labels, dtype=torch.float).to(args.device) 547 | loss = self.loss_fn(logits, labels) 548 | 549 | if args.n_gpu > 1: 550 | loss = loss.mean() # mean() to average on multi-gpu parallel training 551 | if args.gradient_accumulation_steps > 1: 552 | loss = loss / args.gradient_accumulation_steps 553 | 554 | if args.fp16: 555 | with amp.scale_loss(loss, optimizer) as scaled_loss: 556 | scaled_loss.backward() 557 | else: 558 | loss.backward() 559 | 560 | tr_loss += loss.item() 561 | total_run_time += time.time() - start_time 562 | if (step + 1) % args.gradient_accumulation_steps == 0: 563 | if args.fp16: 564 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) 565 | else: 566 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.max_grad_norm) 567 | 568 | optimizer.step() 569 | scheduler.step() # Update learning rate schedule 570 | optimizer.zero_grad() 571 | global_step += 1 572 | 573 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 574 | # print training log 575 | elapsed = time.time() - start_time 576 | cur_loss = (tr_loss - logging_loss) / args.logging_steps 577 | cur_lr = scheduler.get_lr()[0] 578 | logger.info( 579 | "| [{:4d}/{:4d}][{:6d}/{:6d}] | {:4d}/{:4d} batches | ms/batch {:5.4f} | train_loss {:6e} | lr {:.6e}".format( 580 | int(epoch), 581 | int(args.num_train_epochs), 582 | int(global_step), 583 | int(t_total), 584 | int(step), 585 | len(train_dataloader), 586 | elapsed * 1000.0 / args.logging_steps, 587 | cur_loss, 588 | cur_lr, 589 | ) 590 | ) 591 | logging_loss = tr_loss 592 | 593 | if args.max_steps > 0 and global_step > args.max_steps: 594 | break 595 | if args.max_steps > 0 and global_step > args.max_steps: 596 | break 597 | 598 | 599 | def main(): 600 | # get args 601 | args = TransformerMatcher.get_args_and_set_logger()["args"] 602 | 603 | # do_train and save model 604 | if args.do_train: 605 | # setup output_dir 606 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and 
not args.overwrite_output_dir: 607 | raise ValueError( 608 | "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir) 609 | ) 610 | if not os.path.exists(args.output_dir): 611 | os.makedirs(args.output_dir) 612 | 613 | # load data 614 | with open(args.trn_feat_path, "rb") as fin: 615 | X_trn = pickle.load(fin) 616 | C_trn = smat.load_npz(args.trn_label_path) 617 | 618 | # prepare transformer pretrained models 619 | TransformerMatcher.set_device(args) 620 | matcher = TransformerMatcher(num_clusters=C_trn.shape[1]) 621 | matcher.prepare_model(args) 622 | 623 | # train 624 | matcher.train(args, X_trn, C_trn) 625 | if args.local_rank in [-1, 0]: 626 | matcher.save_model(args) 627 | 628 | # do_eval on test set and save prediction output 629 | if args.do_eval: 630 | # we only support multigpu mode but not distributed mode 631 | assert args.local_rank == -1 632 | 633 | # load data 634 | with open(args.trn_feat_path, "rb") as fin: 635 | X_trn = pickle.load(fin) 636 | with open(args.tst_feat_path, "rb") as fin: 637 | X_tst = pickle.load(fin) 638 | C_trn = smat.load_npz(args.trn_label_path) 639 | C_tst = smat.load_npz(args.tst_label_path) 640 | 641 | # load fine-tuned model in the args.output_dir 642 | TransformerMatcher.set_device(args) 643 | matcher = TransformerMatcher(num_clusters=C_trn.shape[1]) 644 | args.model_type = args.model_type.lower() 645 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 646 | matcher.config = config_class.from_pretrained(args.output_dir) 647 | matcher.config.output_hidden_states = True 648 | model = model_class.from_pretrained(args.output_dir, config=matcher.config) 649 | model.to(args.device) 650 | matcher.model = model 651 | 652 | # predict 653 | trn_loss, trn_metrics, C_trn_pred, trn_embeddings = matcher.predict(args, X_trn, C_trn, topk=args.only_topk, get_hidden=True) 654 | tst_loss, tst_metrics, C_tst_pred, tst_embeddings = matcher.predict(args, X_tst, C_tst, topk=args.only_topk, get_hidden=True) 655 | logger.info("| matcher_trn_prec {}".format(" ".join("{:4.2f}".format(100 * v) for v in trn_metrics.prec))) 656 | logger.info("| matcher_trn_recl {}".format(" ".join("{:4.2f}".format(100 * v) for v in trn_metrics.recall))) 657 | logger.info("| matcher_tst_prec {}".format(" ".join("{:4.2f}".format(100 * v) for v in tst_metrics.prec))) 658 | logger.info("| matcher_tst_recl {}".format(" ".join("{:4.2f}".format(100 * v) for v in tst_metrics.recall))) 659 | 660 | # save C_trn_pred.npz and trn_embedding.npy 661 | trn_csr_codes = rf_util.smat_util.sorted_csr(C_trn_pred, only_topk=args.only_topk) 662 | trn_csr_codes = transform_prediction(trn_csr_codes, transform="lpsvm-l2") 663 | csr_codes_path = os.path.join(args.output_dir, "C_trn_pred.npz") 664 | smat.save_npz(csr_codes_path, trn_csr_codes) 665 | embedding_path = os.path.join(args.output_dir, "trn_embeddings.npy") 666 | np.save(embedding_path, trn_embeddings) 667 | 668 | # save C_eval_pred.npz and tst_embedding.npy 669 | tst_csr_codes = rf_util.smat_util.sorted_csr(C_tst_pred, only_topk=args.only_topk) 670 | tst_csr_codes = transform_prediction(tst_csr_codes, transform="lpsvm-l2") 671 | csr_codes_path = os.path.join(args.output_dir, "C_tst_pred.npz") 672 | smat.save_npz(csr_codes_path, tst_csr_codes) 673 | embedding_path = os.path.join(args.output_dir, "tst_embeddings.npy") 674 | np.save(embedding_path, tst_embeddings) 675 | 676 | 677 | if __name__ == "__main__": 678 | main() 679 | 
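
As a sanity check on the training objective, the snippet below works through the `HingeLoss` criterion defined near the top of this file on a toy 2x2 batch. The tensor values are made up purely for illustration, and `C_pos`/`C_neg` are left at their defaults, so the criterion reduces to a plain squared hinge averaged over all entries.

```python
# Toy example (made-up values) of the squared hinge criterion used by TransformerMatcher.
import torch
from xbert.transformer import HingeLoss

loss_fn = HingeLoss(margin=1.0, squared=True)
f = torch.tensor([[ 1.5, -0.2],     # raw matcher scores, batch_size x output_size
                  [-1.0,  0.3]])
y = torch.tensor([[ 1.0,  0.0],     # 0/1 cluster ground truth
                  [ 0.0,  1.0]])
# y is mapped to {-1, +1}; each entry contributes max(0, margin - y_pm * f)^2,
# so only the two entries whose signed margin falls below 1.0 are penalized here.
print(loss_fn(f, y))                # tensor(0.2825) = (0.8**2 + 0.7**2) / 4
```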
--------------------------------------------------------------------------------
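
The sparse helpers in `xbert/rf_util.py` above are easiest to see on tiny matrices. The sketch below uses toy values; `from xbert.rf_util import smat_util` mirrors how `transformer.py` reaches these helpers through `rf_util.smat_util`. It shows `append_column` adding a constant bias feature to a CSR matrix and `sorted_csr` pruning each row to its highest-scoring entries, which is how `predict()` truncates matcher outputs to `only_topk`.

```python
# Small sketch (toy values) of the smat_util helpers defined in xbert/rf_util.py.
import numpy as np
import scipy.sparse as smat
from xbert.rf_util import smat_util

X = smat.csr_matrix(np.array([[0.0, 2.0, 0.0],
                              [1.0, 0.0, 3.0]]))
X_bias = smat_util.append_column(X, value=1.0)   # shape (2, 4); the new last column is all ones
print(X_bias.toarray())

scores = smat.csr_matrix(np.array([[0.1, 0.9, 0.4],
                                   [0.7, 0.0, 0.2]]))
top1 = smat_util.sorted_csr(scores, only_topk=1) # keep only the best-scoring column per row
print(top1.toarray())
```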