├── LICENSE ├── README.md ├── __init__.py ├── analysis ├── data │ └── README.md ├── graphs │ ├── Eurlex4k-complex.html │ └── Eurlex4k.html ├── lib │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── mathops.cpython-36.pyc │ ├── data.py │ ├── embeddings.py │ ├── mathops.py │ ├── metrics.py │ ├── metrics_old.py │ ├── model.py │ ├── model_v1.py │ ├── model_v2.py │ ├── plots.py │ └── utils.py └── random_labels.ipynb ├── attention-xml ├── README.md ├── configure │ ├── datasets │ │ ├── Amazon-3M.yaml │ │ ├── Amazon-670K-spn.yaml │ │ ├── Amazon-670K.yaml │ │ ├── AmazonCat-13K-spn.yaml │ │ ├── AmazonCat-13K.yaml │ │ ├── EUR-Lex-spn.yaml │ │ ├── EUR-Lex.yaml │ │ ├── Wiki-500K-spn.yaml │ │ ├── Wiki-500K.yaml │ │ ├── Wiki10-31K-spn.yaml │ │ └── Wiki10-31K.yaml │ └── models │ │ ├── AttentionXML-Amazon-670K.yaml │ │ ├── AttentionXML-AmazonCat-13K.yaml │ │ ├── AttentionXML-EUR-Lex.yaml │ │ ├── AttentionXML-Wiki10-31K.yaml │ │ ├── FastAttentionXML-Amazon-3M.yaml │ │ ├── FastAttentionXML-Amazon-670K.yaml │ │ └── FastAttentionXML-Wiki-500K.yaml ├── data │ └── README.md ├── deepxml │ ├── __init__.py │ ├── cluster.py │ ├── data_utils.py │ ├── dataset.py │ ├── evaluation.py │ ├── lib │ ├── models.py │ ├── modules.py │ ├── networks.py │ ├── optimizers.py │ └── tree.py ├── ensemble.py ├── evaluation.py ├── experiments.sh ├── main.py ├── preprocess.py ├── requirements.txt ├── scripts │ ├── run_amazon.sh │ ├── run_amazon3m.sh │ ├── run_amazoncat.sh │ ├── run_eurlex.sh │ ├── run_preprocess.sh │ ├── run_wiki.sh │ ├── run_wiki10.sh │ └── run_xml.sh └── train.slurm.sh ├── build_label_vectors.py ├── combine_results.sh ├── data └── README.md ├── experiments.sh ├── hrr-example-representation.png ├── hrr-example.png ├── lib ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── mathops.cpython-36.pyc ├── data.py ├── embeddings.py ├── mathops.py ├── metrics.py ├── metrics_old.py ├── model.py ├── plots.py └── utils.py ├── output └── README.md ├── requirements.txt ├── run_classifier.py ├── train.slurm.sh └── xml-cnn ├── README.md ├── code ├── __init__.py ├── cnn_test.py ├── cnn_train.py ├── experiments.sh ├── header.py ├── lib ├── main.py ├── models │ ├── classifier.py │ ├── cnn_encoder.py │ ├── embedding_layer.py │ ├── header.py │ └── xmlCNN.py ├── precision_k.py ├── run.sh ├── test_manik.m └── train.slurm.sh ├── data └── README.md ├── embedding_weights └── .gitignore └── utils ├── data_dive.py ├── data_helpers.py ├── fiddle_clusters.py ├── futils.py ├── futils_old.py ├── loss.py ├── process_eurlex.py └── w2v.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Federico Bianchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/__init__.py -------------------------------------------------------------------------------- /analysis/data/README.md: -------------------------------------------------------------------------------- 1 | # AttentionXML 2 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) 3 | 4 | ## Requirements 5 | 6 | * python==3.7.4 7 | * click==7.0 8 | * ruamel.yaml==0.16.5 9 | * numpy==1.16.2 10 | * scipy==1.3.1 11 | * scikit-learn==0.21.2 12 | * gensim==3.4.0 13 | * torch==1.0.1 14 | * nltk==3.4 15 | * tqdm==4.31.1 16 | * joblib==0.13.2 17 | * logzero==1.5.0 18 | 19 | ## Datasets 20 | 21 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 22 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 23 | * [AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 24 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 25 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 26 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 27 | 28 | Download the GloVe embedding (840B,300d) and convert it to gensim format (which can be loaded by **gensim.models.KeyedVectors.load**). 29 | 30 | We also provide a converted GloVe embedding at [here](https://drive.google.com/file/d/10w_HuLklGc8GA_FtUSdnHT8Yo1mxYziP/view?usp=sharing). 
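If you prefer to do the conversion yourself, a minimal sketch with gensim is shown below; the input/output paths are illustrative, with the output name chosen to match the `--w2v-model` argument used in the preprocessing commands below.

```python
# Sketch: convert raw GloVe vectors into a gensim KeyedVectors file.
# Assumes glove.840B.300d.txt has already been downloaded; paths are illustrative.
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove2word2vec("data/glove.840B.300d.txt", "data/glove.840B.300d.w2v.txt")  # prepend word2vec header
vectors = KeyedVectors.load_word2vec_format("data/glove.840B.300d.w2v.txt", binary=False)
vectors.save("data/glove.840B.300d.gensim")  # later loadable with gensim.models.KeyedVectors.load
```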
31 | 32 | ## XML Experiments 33 | 34 | XML experiments in paper can be run directly such as: 35 | ```bash 36 | ./scripts/run_eurlex.sh 37 | ``` 38 | ## Preprocess 39 | 40 | Run preprocess.py for train and test datasets with tokenized texts as follows: 41 | ```bash 42 | python preprocess.py \ 43 | --text-path data/EUR-Lex/train_texts.txt \ 44 | --label-path data/EUR-Lex/train_labels.txt \ 45 | --vocab-path data/EUR-Lex/vocab.npy \ 46 | --emb-path data/EUR-Lex/emb_init.npy \ 47 | --w2v-model data/glove.840B.300d.gensim 48 | 49 | python preprocess.py \ 50 | --text-path data/EUR-Lex/test_texts.txt \ 51 | --label-path data/EUR-Lex/test_labels.txt \ 52 | --vocab-path data/EUR-Lex/vocab.npy 53 | ``` 54 | 55 | Or run preprocss.py including tokenizing the raw texts by NLTK as follows: 56 | ```bash 57 | python preprocess.py \ 58 | --text-path data/Wiki10-31K/train_raw_texts.txt \ 59 | --tokenized-path data/Wiki10-31K/train_texts.txt \ 60 | --label-path data/Wiki10-31K/train_labels.txt \ 61 | --vocab-path data/Wiki10-31K/vocab.npy \ 62 | --emb-path data/Wiki10-31K/emb_init.npy \ 63 | --w2v-model data/glove.840B.300d.gensim 64 | 65 | python preprocess.py \ 66 | --text-path data/Wiki10-31K/test_raw_texts.txt \ 67 | --tokenized-path data/Wiki10-31K/test_texts.txt \ 68 | --label-path data/Wiki10-31K/test_labels.txt \ 69 | --vocab-path data/Wiki10-31K/vocab.npy 70 | ``` 71 | 72 | 73 | ## Train and Predict 74 | 75 | Train and predict as follows: 76 | ```bash 77 | python main.py --data-cnf configure/datasets/EUR-Lex.yaml --model-cnf configure/models/AttentionXML-EUR-Lex.yaml 78 | ``` 79 | 80 | Or do prediction only with option "--mode eval". 81 | 82 | ## Ensemble 83 | 84 | Train and predict with an ensemble: 85 | ```bash 86 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 0 87 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 1 88 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 2 89 | python ensemble.py -p results/FastAttentionXML-Wiki-500K -t 3 90 | ``` 91 | 92 | ## Evaluation 93 | 94 | ```bash 95 | python evaluation.py --results results/AttentionXML-EUR-Lex-labels.npy --targets data/EUR-Lex/test_labels.npy 96 | ``` 97 | Or get propensity scored metrics together: 98 | 99 | ```bash 100 | python evaluation.py \ 101 | --results results/FastAttentionXML-Amazon-670K-labels.npy \ 102 | --targets data/Amazon-670K/test_labels.npy \ 103 | --train-labels data/Amazon-670K/train_labels.npy \ 104 | -a 0.6 \ 105 | -b 2.6 106 | 107 | ``` 108 | 109 | ## Reference 110 | You et al., [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727), NeurIPS 2019 111 | 112 | ## Declaration 113 | It is free for non-commercial use. For commercial use, please contact Mr. Ronghi You and Prof. Shanfeng Zhu (zhusf@fudan.edu.cn). 
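The `analysis/lib` modules that follow (`embeddings.py`, `mathops.py`) implement the circular-convolution operations this repository uses for HRR label representations. As a quick orientation, here is a self-contained NumPy sketch of binding two label vectors to a shared role vector and retrieving them with the approximate inverse; every name and the dimension are illustrative, and the helpers simply mirror `cc`, `np_appx_inv`, and `npcomplexMagProj` from `analysis/lib/mathops.py`.

```python
# Minimal HRR binding/unbinding sketch (illustrative; mirrors analysis/lib/mathops.py).
import numpy as np

def mag_proj(x):
    # Unit-magnitude projection in the frequency domain (cf. npcomplexMagProj).
    c = np.fft.rfft(x)
    return np.fft.irfft(c / np.abs(c), n=x.shape[-1])

def bind(a, b):
    # Circular convolution via FFT (cf. cc).
    return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b), n=a.shape[-1])

def approx_inv(a):
    # Approximate (involution) inverse (cf. np_appx_inv).
    return np.roll(np.flip(a, axis=-1), 1, -1)

d = 400
rng = np.random.default_rng(0)
role, label1, label2, label3 = (mag_proj(rng.normal(0, 1.0 / d, size=d)) for _ in range(4))

s = bind(role, label1) + bind(role, label2)   # superpose two bound labels
query = bind(s, approx_inv(role))             # unbind with the role vector

cos = lambda u, v: u @ v / (np.linalg.norm(u) * np.linalg.norm(v))
print(cos(query, label1), cos(query, label2), cos(query, label3))
# labels 1 and 2 score clearly higher than label 3, which stays near zero
```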
-------------------------------------------------------------------------------- /analysis/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/analysis/lib/__init__.py -------------------------------------------------------------------------------- /analysis/lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/analysis/lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /analysis/lib/__pycache__/mathops.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/analysis/lib/__pycache__/mathops.cpython-36.pyc -------------------------------------------------------------------------------- /analysis/lib/embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Operations to generate embeddings. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | from gensim.models import KeyedVectors 11 | 12 | from .mathops import complex_multiplication, complex_division, circular_conv 13 | from .mathops import get_appx_inv, get_inv, complexMagProj, normalize 14 | from .mathops import npcomplexMagProj 15 | 16 | """ 17 | Load Pretrained Label Embeddings. 18 | """ 19 | def load_embeddings(save_loc, vocab_size): 20 | fname = save_loc + "-complex.bin" 21 | model = KeyedVectors.load_word2vec_format(fname, binary=True) 22 | rand_vec_cnt = 0 23 | vectors = [] # positions in vector space. 24 | for i in range(0, vocab_size): 25 | if str(i) in model.wv.vocab: 26 | vectors.append(model.wv[str(i)]) 27 | else: 28 | # NOTE: When a label is not present in training then we generate a 29 | # default vector and add it to the label vector matrix. 30 | # As SPN select the label based on the index it remains consistent while training. 31 | rand_vec_cnt += 1 32 | vectors.append(gen_rand_vec(model.vector_size)) 33 | 34 | # Add Padding idx. 35 | print("Vocabulary Size: {}".format(vocab_size)) 36 | print("Number of Random vectors generated: {}".format(rand_vec_cnt)) 37 | vectors.append(gen_rand_vec(model.vector_size)) 38 | vectors = torch.from_numpy(np.array(vectors, dtype=np.float32)) 39 | return vectors 40 | 41 | """ 42 | NumPY operations for embeddings. 43 | """ 44 | def generate_vectors(num_vectors, dims): 45 | """ 46 | Generate n vectors of size dims that are orthogonal to each other. 47 | """ 48 | if num_vectors > dims: 49 | raise ValueError("num_vectors cannot be greater than dims!") 50 | 51 | # Intializing class vectors. 52 | vecs = torch.randn(dims, num_vectors, dtype=torch.float) 53 | 54 | # Using QR decomposition to get orthogonal vectors. 55 | vecs, _ = torch.qr(vecs) 56 | vecs = vecs.t() 57 | vecs = vecs / torch.norm(vecs, dim=-1, keepdim=True) 58 | return vecs 59 | 60 | 61 | def gen_rand_vec(dims): 62 | """ 63 | Generate a random vector of size dims. 64 | """ 65 | return npcomplexMagProj(np.random.normal(0, 1. 
/ dims, size=(dims))) 66 | 67 | 68 | """ 69 | Torch functions. 70 | """ 71 | def get_vectors(num_vectors, dims, ortho=False): 72 | if ortho: 73 | vectors = generate_vectors(num_vectors, dims) 74 | return complexMagProj(vectors) 75 | else: 76 | vectors = [gen_rand_vec(dims) for i in range(num_vectors)] 77 | return torch.from_numpy(np.array(vectors, dtype=np.float32)) 78 | 79 | def get_static_embedding(seeds, dims): 80 | vec = [] 81 | for s in seeds: 82 | torch.manual_seed(s) 83 | vec.append(torch.randn((1, dims), dtype=torch.float)) 84 | 85 | return torch.cat(vec, dim=0) 86 | -------------------------------------------------------------------------------- /analysis/lib/mathops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to perform circular convolution operations. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | """ 14 | Pytorch functions. 15 | """ 16 | def complex_multiplication(left, right): 17 | """ 18 | Multiply two vectors in complex domain. 19 | """ 20 | left_real, left_complex = left[..., 0], left[..., 1] 21 | right_real, right_complex = right[..., 0], right[..., 1] 22 | 23 | output_real = left_real * right_real - left_complex * right_complex 24 | output_complex = left_real * right_complex + left_complex * right_real 25 | return torch.stack([output_real, output_complex], dim=-1) 26 | 27 | def complex_division(left, right): 28 | """ 29 | Divide two vectors in complex domain. 30 | """ 31 | left_real, left_complex = left[..., 0], left[..., 1] 32 | right_real, right_complex = right[..., 0], right[..., 1] 33 | 34 | output_real = torch.div((left_real * right_real + left_complex * right_complex),(right_real**2 + right_complex**2)) 35 | output_complex = torch.div((left_complex * right_real - left_real * right_complex ),(right_real**2 + right_complex**2)) 36 | return torch.stack([output_real, output_complex], dim=-1) 37 | 38 | def circular_conv(a, b): 39 | """ Defines the circular convolution operation 40 | a: tensor of shape (batch, D) 41 | b: tensor of shape (batch, D) 42 | """ 43 | left = torch.rfft(a, 1, onesided=False) 44 | right = torch.rfft(b, 1, onesided=False) 45 | output = complex_multiplication(left, right) 46 | output = torch.irfft(output, 1, signal_sizes=a.shape[-1:], onesided=False) 47 | return output 48 | 49 | def get_appx_inv(a): 50 | """ 51 | Compute approximate inverse of vector a. 52 | """ 53 | return torch.roll(torch.flip(a, dims=[-1]), 1,-1) 54 | 55 | def get_inv(a, typ=torch.DoubleTensor): 56 | """ 57 | Compute exact inverse of vector a. 58 | """ 59 | left = torch.rfft(a, 1, onesided=False) 60 | complex_1 = np.zeros(left.shape) 61 | complex_1[...,0] = 1 62 | op = complex_division(typ(complex_1),left) 63 | return torch.irfft(op,1,onesided=False) 64 | 65 | def complexMagProj(x): 66 | """ 67 | Normalize a vector x in complex domain. 68 | """ 69 | c = torch.rfft(x, 1, onesided=False) 70 | c_ish=c/torch.norm(c, dim=-1,keepdim=True) 71 | output = torch.irfft(c_ish, 1, signal_sizes=x.shape[1:], onesided=False) 72 | return output 73 | 74 | def normalize(x): 75 | return x/torch.norm(x) 76 | 77 | """ 78 | Numpy Functions. 
79 | """ 80 | # Make them work with batch dimensions 81 | def cc(a, b): 82 | return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b)) 83 | 84 | def np_inv(a): 85 | return np.fft.irfft((1.0/np.fft.rfft(a)),n=a.shape[-1]) 86 | 87 | def np_appx_inv(a): 88 | #Faster implementation 89 | return np.roll(np.flip(a, axis=-1), 1,-1) 90 | 91 | def npcomplexMagProj(x): 92 | """ 93 | Normalize a vector x in complex domain. 94 | """ 95 | c = np.fft.rfft(x) 96 | 97 | # Look at real and image as if they were real 98 | c_ish = np.vstack([c.real, c.imag]) 99 | 100 | # Normalize magnitude of each complex/real pair 101 | c_ish=c_ish/np.linalg.norm(c_ish, axis=0) 102 | c_proj = c_ish[0,:] + 1j * c_ish[1,:] 103 | return np.fft.irfft(c_proj,n=x.shape[-1]) 104 | 105 | def nrm(a): 106 | return a / np.linalg.norm(a) 107 | -------------------------------------------------------------------------------- /analysis/lib/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal paramaters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 
35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Theshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikpedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | 104 | # Print the final results. 105 | # This provides the results for agg metrics when threshold for inference 106 | # is optimized and metrics are then computed. 107 | def display_agg_results(args, te_loss, pr, rec, f1): 108 | print("----------Tests with Threshold Inference------------") 109 | print("Inference Threshold: {:.3f}".format(args.th)) 110 | print("Test Loss: {:.3f}".format(te_loss)) 111 | print("Test Precision: {:.3f}".format(pr * 100)) 112 | print("Test Recall: {:.3f}".format(rec * 100)) 113 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 114 | 115 | 116 | def display_metrics(metrics, k=5): 117 | # Merge batchwise metrics. 118 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 119 | for idx, metric in enumerate(metrics): 120 | for i in range(0, 4): 121 | for j in range(0, k): 122 | final_metrics[i][j] += metric[i][j] 123 | 124 | # Dataset metrics. 
125 | print("----------Tests with Ordered Retrieval------------") 126 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 127 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 128 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 129 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 130 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 131 | floatfmt=".3f")) 132 | -------------------------------------------------------------------------------- /analysis/lib/metrics_old.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal paramaters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 
35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Theshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikpedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | # Print the final results. 104 | # This provides the results for agg metrics when threshold for inference 105 | # is optimized and metrics are then computed. 106 | def display_agg_results(args, te_loss, pr, rec, f1): 107 | print("----------Tests with Threshold Inference------------") 108 | print("Inference Threshold: {:.3f}".format(args.th)) 109 | print("Test Loss: {:.3f}".format(te_loss)) 110 | print("Test Precision: {:.3f}".format(pr * 100)) 111 | print("Test Recall: {:.3f}".format(rec * 100)) 112 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 113 | 114 | 115 | def display_metrics(metrics, k=5): 116 | # Merge batchwise metrics. 117 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 118 | for idx, metric in enumerate(metrics): 119 | for i in range(0, 4): 120 | for j in range(0, k): 121 | final_metrics[i][j] += metric[i][j] 122 | 123 | # Dataset metrics. 
124 | print("----------Tests with Ordered Retrieval------------") 125 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 126 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 127 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 128 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 129 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 130 | floatfmt=".3f")) 131 | -------------------------------------------------------------------------------- /analysis/lib/plots.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manage plots. 3 | AUTHOR: Ashwinkumar Ganesan. 4 | """ 5 | 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import csv 9 | import pandas as pd 10 | 11 | """ 12 | Plot training and testing curves. 13 | The graph includes: 14 | 1. Training loss per epoch. 15 | 2. Test loss per epoch. 16 | 3. Precision per epoch. 17 | 4. Recall per epoch. 18 | 5. F1 score per epoch. 19 | """ 20 | def plot_stats(tr_stats): 21 | fig, ax = plt.subplots(2) 22 | 23 | ep = [i for i in range(0, epochs)] 24 | tr_loss = tr_stats['Training Loss'] 25 | te_loss = tr_stats['Test Loss'] 26 | pr = tr_stats['Precision'] 27 | re = tr_stats['Recall'] 28 | f1 = tr_stats['F1 Score'] 29 | 30 | # Loss Curve. 31 | ax[0].plot(ep, tr_loss) 32 | ax[0].plot(ep, te_loss) 33 | ax[0].set_title("Training & Testing Loss Per Epoch") 34 | 35 | 36 | ax[1].plot(ep, pr) 37 | ax[1].plot(ep, re) 38 | ax[1].plot(ep, f1) 39 | -------------------------------------------------------------------------------- /analysis/lib/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | """ 4 | 5 | from prettytable import PrettyTable 6 | import pandas as pd 7 | from time import time 8 | import torch 9 | 10 | GB_DIV = 1024 * 1024 * 1024 11 | 12 | 13 | def print_memory_profile(): 14 | """ 15 | Get basic memory information. 16 | """ 17 | device = torch.cuda.current_device() 18 | print("Allocated: {:.4f}".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 19 | print("Reserved: {:.4f}\n".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 20 | 21 | # https://stackoverflow.com/questions/9535954/printing-lists-as-tabular-data 22 | def print_command_arguments(args): 23 | table = PrettyTable(['Parameter', 'Value']) 24 | table.title = 'Experimental Setup' 25 | for arg in vars(args): 26 | table.add_row([arg, getattr(args, arg)]) 27 | print(table) 28 | 29 | class Measure(object): 30 | """ 31 | Manage runtimes for a specific code block. 32 | """ 33 | def __init__(self, name): 34 | self._measure = name 35 | self._is_measuring = False 36 | self._elapsed_time = 0 37 | 38 | def is_measuring(self): 39 | return self._is_measuring 40 | 41 | def start(self): 42 | self._stime = time() 43 | self._is_measuring = True 44 | 45 | def end(self): 46 | self._etime = time() 47 | self._elapsed_time += self._etime - self._stime 48 | self._is_measuring = False 49 | 50 | def get_elapsed_time(self): 51 | return self._elapsed_time 52 | 53 | def get_name(self): 54 | return self._measure 55 | 56 | 57 | class ExperimentTime(object): 58 | """ 59 | Manage time for different parts in an experiment. 
60 | """ 61 | def __init__(self): 62 | self._table = pd.DataFrame(columns=['Measurement', 'Elapsed Time']) 63 | self._pos = 0 64 | self.measure = {} 65 | 66 | def _append(self, name): 67 | self._table.loc[self._pos] = [name, self.measure[name].get_elapsed_time()] 68 | self._pos += 1 69 | 70 | def register(self, name): 71 | if name in self.measure: 72 | print("Measurement with same name previously added.") 73 | else: 74 | self.measure[name] = Measure(name) 75 | 76 | def measure_time(self, name): 77 | if self.measure[name].is_measuring(): 78 | self.measure[name].end() 79 | # Add time to the dataframe. 80 | self._append(name) 81 | else: 82 | self.measure[name].start() 83 | 84 | def get_measurements(self): 85 | return self._table 86 | -------------------------------------------------------------------------------- /attention-xml/README.md: -------------------------------------------------------------------------------- 1 | # HRR-AttentionXML 2 | This is a modified implementation of [AttentionalXML architecture](https://arxiv.org/abs/1811.01727) to train with HRR label representations and perform inference. 3 | 4 | ## List of changes to the Codebase. 5 | The XML-CNN codebase has been modified to with the following list of changes: 6 | 1. Retooled to use semantic pointers. The architecture can use HRRs to learn and infer labels. 7 | 2. The dataset and model YAML files have additional arguments for HRR label representations. 8 | 9 | ## NOTES 10 | 1. For details about datasets and how to setup the repository, please look at instructions [here](https://github.com/yourh/AttentionXML). 11 | 2. AttentionXML is NOT configured for tree-based inference and HRR is applied only to a standard inference with a softmax layer. 12 | 3. The codebase also contains two scripts, i.e., ```experiments.sh``` and ```train.slurm.sh``` for execution of training and evaluation jobs on a SLURM enabled cluster. 13 | 14 | ## Datasets Locations 15 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 16 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 17 | * [AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 18 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 19 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 20 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 21 | 22 | ## Preprocess 23 | Run ```preprocss.py``` including tokenizing the raw texts by NLTK as follows: 24 | ```bash 25 | python preprocess.py \ 26 | --text-path data/Wiki10-31K/train_raw_texts.txt \ 27 | --tokenized-path data/Wiki10-31K/train_texts.txt \ 28 | --label-path data/Wiki10-31K/train_labels.txt \ 29 | --vocab-path data/Wiki10-31K/vocab.npy \ 30 | --emb-path data/Wiki10-31K/emb_init.npy \ 31 | --w2v-model data/glove.840B.300d.gensim 32 | 33 | python preprocess.py \ 34 | --text-path data/Wiki10-31K/test_raw_texts.txt \ 35 | --tokenized-path data/Wiki10-31K/test_texts.txt \ 36 | --label-path data/Wiki10-31K/test_labels.txt \ 37 | --vocab-path data/Wiki10-31K/vocab.npy 38 | ``` 39 | 40 | ## XML Experiments 41 | In this example let us consider the dataset: ```Wiki10-31K```. 
42 | 43 | To execute the baseline model: 44 | ```bash 45 | python main.py --data-cnf configure/datasets/Wiki10-31K.yaml --model-cnf configure/models/AttentionXML-Wiki10-31K.yaml 46 | ``` 47 | 48 | To execute the same model with HRR labels: 49 | ```bash 50 | python main.py --data-cnf configure/datasets/Wiki10-31K-spn.yaml --model-cnf configure/models/AttentionXML-Wiki10-31K.yaml 51 | ``` 52 | 53 | To evaluate the model: 54 | ```bash 55 | LABEL_NAME=AttentionXML-400-Wiki10-31K-spn-400 # For baseline the LABEL_NAME is AttentionXML-0-Wiki10-31K-spn-baseline-0. 56 | NAME=Wiki10-31K 57 | python evaluation.py --results results/${LABEL_NAME}-labels.npy \ 58 | --targets data/${NAME}/test_labels.npy --train-labels data/${NAME}/train_labels.npy 59 | ``` 60 | where ```${LABEL_NAME}``` is the name of the file containing labels for the above experiment run. ```${NAME}``` is the name of the dataset. 61 | 62 | References 63 | ---------- 64 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Amazon-3M.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon-3M 2 | 3 | train: 4 | sparse: data/Amazon-3M/train_v1.txt 5 | texts: data/Amazon-3M/train_texts.npy 6 | labels: data/Amazon-3M/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Amazon-3M/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Amazon-3M/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Amazon-3M/labels_binarizer 21 | 22 | model: 23 | emb_size: 300 24 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Amazon-670K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon-670K-spn 2 | 3 | train: 4 | sparse: data/Amazon-670K/train_v1.txt 5 | texts: data/Amazon-670K/train_texts.npy 6 | labels: data/Amazon-670K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Amazon-670K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Amazon-670K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Amazon-670K/labels_binarizer 21 | 22 | use_spn: True 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Amazon-670K.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon-670K-baseline 2 | 3 | train: 4 | sparse: data/Amazon-670K/train_v1.txt 5 | texts: data/Amazon-670K/train_texts.npy 6 | labels: data/Amazon-670K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Amazon-670K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Amazon-670K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Amazon-670K/labels_binarizer 21 | 22 | use_spn: False 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/AmazonCat-13K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: AmazonCat-13K-spn 2 | 3 | train: 4 | texts: data/AmazonCat-13K/train_texts.npy 5 | labels: data/AmazonCat-13K/train_labels.npy 6 | 7 | valid: 8 | 
size: 4000 9 | 10 | test: 11 | texts: data/AmazonCat-13K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/AmazonCat-13K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/AmazonCat-13K/labels_binarizer 20 | 21 | use_spn: True 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/AmazonCat-13K.yaml: -------------------------------------------------------------------------------- 1 | name: AmazonCat-13K-baseline 2 | 3 | train: 4 | texts: data/AmazonCat-13K/train_texts.npy 5 | labels: data/AmazonCat-13K/train_labels.npy 6 | 7 | valid: 8 | size: 4000 9 | 10 | test: 11 | texts: data/AmazonCat-13K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/AmazonCat-13K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/AmazonCat-13K/labels_binarizer 20 | 21 | use_spn: False 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/EUR-Lex-spn.yaml: -------------------------------------------------------------------------------- 1 | name: EUR-Lex-spn 2 | 3 | train: 4 | texts: data/EUR-Lex/train_texts.npy 5 | labels: data/EUR-Lex/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/EUR-Lex/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/EUR-Lex/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/EUR-Lex/labels_binarizer 20 | 21 | use_spn: True 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/EUR-Lex.yaml: -------------------------------------------------------------------------------- 1 | name: EUR-Lex-baseline 2 | 3 | train: 4 | texts: data/EUR-Lex/train_texts.npy 5 | labels: data/EUR-Lex/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/EUR-Lex/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/EUR-Lex/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/EUR-Lex/labels_binarizer 20 | 21 | use_spn: False 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki-500K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki-500K-spn 2 | 3 | train: 4 | sparse: data/Wiki-500K/train.txt 5 | texts: data/Wiki-500K/train_texts.npy 6 | labels: data/Wiki-500K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Wiki-500K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Wiki-500K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Wiki-500K/labels_binarizer 21 | 22 | use_spn: True 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki-500K.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki-500K-baseline 2 | 3 | train: 4 | sparse: data/Wiki-500K/train.txt 5 | texts: data/Wiki-500K/train_texts.npy 6 | labels: data/Wiki-500K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Wiki-500K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Wiki-500K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: 
data/Wiki-500K/labels_binarizer 21 | 22 | use_spn: False 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki10-31K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki10-31K-spn 2 | 3 | train: 4 | texts: data/Wiki10-31K/train_texts.npy 5 | labels: data/Wiki10-31K/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/Wiki10-31K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/Wiki10-31K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/Wiki10-31K/labels_binarizer 20 | 21 | use_spn: True 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki10-31K.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki10-31K-baseline 2 | 3 | train: 4 | texts: data/Wiki10-31K/train_texts.npy 5 | labels: data/Wiki10-31K/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/Wiki10-31K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/Wiki10-31K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/Wiki10-31K/labels_binarizer 20 | 21 | use_spn: False 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-Amazon-670K.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 256 5 | layers_num: 1 6 | linear_size: [256, 256] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 8 15 | nb_epoch: 30 16 | swa_warmup: 4 17 | 18 | valid: 19 | batch_size: 8 20 | 21 | predict: 22 | batch_size: 8 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-AmazonCat-13K.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 512 5 | layers_num: 1 6 | linear_size: [512, 512] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 32 15 | nb_epoch: 10 16 | swa_warmup: 2 17 | 18 | valid: 19 | batch_size: 32 20 | 21 | predict: 22 | batch_size: 32 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-EUR-Lex.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 512 5 | layers_num: 1 6 | linear_size: [1024, 1024] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 40 15 | nb_epoch: 30 16 | swa_warmup: 10 17 | 18 | valid: 19 | batch_size: 40 20 | 21 | predict: 22 | batch_size: 40 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-Wiki10-31K.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 512 5 
| layers_num: 1 6 | linear_size: [1024, 1024] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 16 15 | nb_epoch: 30 16 | swa_warmup: 4 17 | 18 | valid: 19 | batch_size: 32 20 | 21 | predict: 22 | batch_size: 40 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/FastAttentionXML-Amazon-3M.yaml: -------------------------------------------------------------------------------- 1 | name: FastAttentionXML 2 | 3 | level: 4 4 | k: 8 5 | top: 160 6 | 7 | model: 8 | hidden_size: 512 9 | layers_num: 1 10 | linear_size: [512, 256] 11 | dropout: 0.5 12 | 13 | cluster: 14 | max_leaf: 8 15 | eps: 1e-4 16 | levels: [13, 16, 19] 17 | 18 | 19 | train: 20 | [{batch_size: 200, nb_epoch: 5, swa_warmup: 2}, 21 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}, 22 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}, 23 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}] 24 | 25 | valid: 26 | batch_size: 200 27 | 28 | predict: 29 | batch_size: 200 30 | 31 | path: models 32 | -------------------------------------------------------------------------------- /attention-xml/configure/models/FastAttentionXML-Amazon-670K.yaml: -------------------------------------------------------------------------------- 1 | name: FastAttentionXML 2 | 3 | level: 2 4 | k: 8 5 | top: 160 6 | 7 | model: 8 | hidden_size: 512 9 | layers_num: 1 10 | linear_size: [512, 256] 11 | dropout: 0.5 12 | 13 | cluster: 14 | max_leaf: 8 15 | eps: 1e-4 16 | levels: [11, 14, 17] 17 | 18 | train: 19 | [{batch_size: 128, nb_epoch: 10, swa_warmup: 6}, 20 | {batch_size: 128, nb_epoch: 10, swa_warmup: 2}, 21 | {batch_size: 128, nb_epoch: 10, swa_warmup: 2}, 22 | {batch_size: 128, nb_epoch: 10, swa_warmup: 2}] 23 | 24 | valid: 25 | batch_size: 200 26 | 27 | predict: 28 | batch_size: 200 29 | 30 | path: models 31 | -------------------------------------------------------------------------------- /attention-xml/configure/models/FastAttentionXML-Wiki-500K.yaml: -------------------------------------------------------------------------------- 1 | name: FastAttentionXML 2 | 3 | level: 2 4 | k: 64 5 | top: 15 6 | 7 | model: 8 | hidden_size: 512 9 | layers_num: 1 10 | linear_size: [512, 256] 11 | dropout: 0.5 12 | 13 | cluster: 14 | max_leaf: 64 15 | eps: 1e-4 16 | levels: [13] 17 | 18 | train: 19 | [{batch_size: 200, nb_epoch: 5, swa_warmup: 2}, 20 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}] 21 | 22 | valid: 23 | batch_size: 200 24 | 25 | predict: 26 | batch_size: 200 27 | 28 | path: models 29 | -------------------------------------------------------------------------------- /attention-xml/data/README.md: -------------------------------------------------------------------------------- 1 | # AttentionXML 2 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) 3 | 4 | ## Requirements 5 | 6 | * python==3.7.4 7 | * click==7.0 8 | * ruamel.yaml==0.16.5 9 | * numpy==1.16.2 10 | * scipy==1.3.1 11 | * scikit-learn==0.21.2 12 | * gensim==3.4.0 13 | * torch==1.0.1 14 | * nltk==3.4 15 | * tqdm==4.31.1 16 | * joblib==0.13.2 17 | * logzero==1.5.0 18 | 19 | ## Datasets 20 | 21 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 22 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 23 | * 
[AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 24 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 25 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 26 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 27 | 28 | Download the GloVe embedding (840B,300d) and convert it to gensim format (which can be loaded by **gensim.models.KeyedVectors.load**). 29 | 30 | We also provide a converted GloVe embedding at [here](https://drive.google.com/file/d/10w_HuLklGc8GA_FtUSdnHT8Yo1mxYziP/view?usp=sharing). 31 | 32 | ## XML Experiments 33 | 34 | XML experiments in paper can be run directly such as: 35 | ```bash 36 | ./scripts/run_eurlex.sh 37 | ``` 38 | ## Preprocess 39 | 40 | Run preprocess.py for train and test datasets with tokenized texts as follows: 41 | ```bash 42 | python preprocess.py \ 43 | --text-path data/EUR-Lex/train_texts.txt \ 44 | --label-path data/EUR-Lex/train_labels.txt \ 45 | --vocab-path data/EUR-Lex/vocab.npy \ 46 | --emb-path data/EUR-Lex/emb_init.npy \ 47 | --w2v-model data/glove.840B.300d.gensim 48 | 49 | python preprocess.py \ 50 | --text-path data/EUR-Lex/test_texts.txt \ 51 | --label-path data/EUR-Lex/test_labels.txt \ 52 | --vocab-path data/EUR-Lex/vocab.npy 53 | ``` 54 | 55 | Or run preprocss.py including tokenizing the raw texts by NLTK as follows: 56 | ```bash 57 | python preprocess.py \ 58 | --text-path data/Wiki10-31K/train_raw_texts.txt \ 59 | --tokenized-path data/Wiki10-31K/train_texts.txt \ 60 | --label-path data/Wiki10-31K/train_labels.txt \ 61 | --vocab-path data/Wiki10-31K/vocab.npy \ 62 | --emb-path data/Wiki10-31K/emb_init.npy \ 63 | --w2v-model data/glove.840B.300d.gensim 64 | 65 | python preprocess.py \ 66 | --text-path data/Wiki10-31K/test_raw_texts.txt \ 67 | --tokenized-path data/Wiki10-31K/test_texts.txt \ 68 | --label-path data/Wiki10-31K/test_labels.txt \ 69 | --vocab-path data/Wiki10-31K/vocab.npy 70 | ``` 71 | 72 | 73 | ## Train and Predict 74 | 75 | Train and predict as follows: 76 | ```bash 77 | python main.py --data-cnf configure/datasets/EUR-Lex.yaml --model-cnf configure/models/AttentionXML-EUR-Lex.yaml 78 | ``` 79 | 80 | Or do prediction only with option "--mode eval". 
81 | 82 | ## Ensemble 83 | 84 | Train and predict with an ensemble: 85 | ```bash 86 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 0 87 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 1 88 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 2 89 | python ensemble.py -p results/FastAttentionXML-Wiki-500K -t 3 90 | ``` 91 | 92 | ## Evaluation 93 | 94 | ```bash 95 | python evaluation.py --results results/AttentionXML-EUR-Lex-labels.npy --targets data/EUR-Lex/test_labels.npy 96 | ``` 97 | Or get propensity scored metrics together: 98 | 99 | ```bash 100 | python evaluation.py \ 101 | --results results/FastAttentionXML-Amazon-670K-labels.npy \ 102 | --targets data/Amazon-670K/test_labels.npy \ 103 | --train-labels data/Amazon-670K/train_labels.npy \ 104 | -a 0.6 \ 105 | -b 2.6 106 | 107 | ``` 108 | 109 | ## Reference 110 | You et al., [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727), NeurIPS 2019 111 | 112 | ## Declaration 113 | It is free for non-commercial use. For commercial use, please contact Mr. Ronghi You and Prof. Shanfeng Zhu (zhusf@fudan.edu.cn). -------------------------------------------------------------------------------- /attention-xml/deepxml/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/10/17 5 | @author yrh 6 | 7 | """ -------------------------------------------------------------------------------- /attention-xml/deepxml/cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/24 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | from scipy.sparse import csr_matrix, csc_matrix 12 | from sklearn.preprocessing import normalize 13 | from logzero import logger 14 | 15 | from deepxml.data_utils import get_sparse_feature 16 | 17 | 18 | __all__ = ['build_tree_by_level'] 19 | 20 | 21 | def build_tree_by_level(sparse_data_x, sparse_data_y, mlb, eps: float, max_leaf: int, levels: list, groups_path): 22 | os.makedirs(os.path.split(groups_path)[0], exist_ok=True) 23 | logger.info('Clustering') 24 | sparse_x, sparse_labels = get_sparse_feature(sparse_data_x, sparse_data_y) 25 | sparse_y = mlb.transform(sparse_labels) 26 | logger.info('Getting Labels Feature') 27 | labels_f = normalize(csr_matrix(sparse_y.T) @ csc_matrix(sparse_x)) 28 | logger.info(F'Start Clustering {levels}') 29 | levels, q = [2**x for x in levels], None 30 | for i in range(len(levels)-1, -1, -1): 31 | if os.path.exists(F'{groups_path}-Level-{i}.npy'): 32 | labels_list = np.load(F'{groups_path}-Level-{i}.npy', allow_pickle=True) 33 | q = [(labels_i, labels_f[labels_i]) for labels_i in labels_list] 34 | break 35 | if q is None: 36 | q = [(np.arange(labels_f.shape[0]), labels_f)] 37 | while q: 38 | labels_list = np.asarray([x[0] for x in q]) 39 | assert sum(len(labels) for labels in labels_list) == labels_f.shape[0] 40 | if len(labels_list) in levels: 41 | level = levels.index(len(labels_list)) 42 | logger.info(F'Finish Clustering Level-{level}') 43 | np.save(F'{groups_path}-Level-{level}.npy', np.asarray(labels_list)) 44 | else: 
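# Intermediate cluster count (not one of the requested 2**level sizes): just log progress; splitting continues below.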
45 | logger.info(F'Finish Clustering {len(labels_list)}') 46 | next_q = [] 47 | for node_i, node_f in q: 48 | if len(node_i) > max_leaf: 49 | next_q += list(split_node(node_i, node_f, eps)) 50 | q = next_q 51 | logger.info('Finish Clustering') 52 | 53 | 54 | def split_node(labels_i: np.ndarray, labels_f: csr_matrix, eps: float): 55 | n = len(labels_i) 56 | c1, c2 = np.random.choice(np.arange(n), 2, replace=False) 57 | centers, old_dis, new_dis = labels_f[[c1, c2]].toarray(), -10000.0, -1.0 58 | l_labels_i, r_labels_i = None, None 59 | while new_dis - old_dis >= eps: 60 | dis = labels_f @ centers.T # N, 2 61 | partition = np.argsort(dis[:, 1] - dis[:, 0]) 62 | l_labels_i, r_labels_i = partition[:n//2], partition[n//2:] 63 | old_dis, new_dis = new_dis, (dis[l_labels_i, 0].sum() + dis[r_labels_i, 1].sum()) / n 64 | centers = normalize(np.asarray([np.squeeze(np.asarray(labels_f[l_labels_i].sum(axis=0))), 65 | np.squeeze(np.asarray(labels_f[r_labels_i].sum(axis=0)))])) 66 | return (labels_i[l_labels_i], labels_f[l_labels_i]), (labels_i[r_labels_i], labels_f[r_labels_i]) 67 | -------------------------------------------------------------------------------- /attention-xml/deepxml/data_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | import joblib 12 | from collections import Counter 13 | from scipy import sparse 14 | from sklearn.preprocessing import MultiLabelBinarizer, normalize 15 | from sklearn.datasets import load_svmlight_file 16 | from gensim.models import KeyedVectors 17 | from tqdm import tqdm 18 | from typing import Union, Iterable 19 | 20 | 21 | __all__ = ['build_vocab', 'get_data', 'convert_to_binary', 'truncate_text', 'get_word_emb', 'get_mlb', 22 | 'get_sparse_feature', 'output_res'] 23 | 24 | 25 | def build_vocab(texts: Iterable, w2v_model: Union[KeyedVectors, str], vocab_size=500000, 26 | pad='', unknown='', sep='/SEP/', max_times=1, freq_times=1): 27 | if isinstance(w2v_model, str): 28 | # w2v_model = KeyedVectors.load(w2v_model, binary=True) 29 | w2v_model = KeyedVectors.load_word2vec_format(w2v_model, binary=True) 30 | emb_size = w2v_model.vector_size 31 | vocab, emb_init = [pad, unknown], [np.zeros(emb_size), np.random.uniform(-1.0, 1.0, emb_size)] 32 | counter = Counter(token for t in texts for token in set(t.split())) 33 | for word, freq in sorted(counter.items(), key=lambda x: (x[1], x[0] in w2v_model), reverse=True): 34 | if word in w2v_model or freq >= freq_times: 35 | vocab.append(word) 36 | # We used embedding of '.' as embedding of '/SEP/' symbol. 37 | word = '.' 
if word == sep else word 38 | emb_init.append(w2v_model[word] if word in w2v_model else np.random.uniform(-1.0, 1.0, emb_size)) 39 | if freq < max_times or vocab_size == len(vocab): 40 | break 41 | 42 | return np.asarray(vocab), np.asarray(emb_init) 43 | 44 | 45 | def get_word_emb(vec_path, vocab_path=None): 46 | if vocab_path is not None: 47 | with open(vocab_path) as fp: 48 | vocab = {word: idx for idx, word in enumerate(fp)} 49 | return np.load(vec_path, allow_pickle=True), vocab 50 | else: 51 | return np.load(vec_path, allow_pickle=True) 52 | 53 | 54 | def get_data(text_file, label_file=None): 55 | return np.load(text_file, allow_pickle=True), np.load(label_file, allow_pickle=True) if label_file is not None else None 56 | 57 | 58 | def convert_to_binary(text_file, label_file=None, max_len=None, vocab=None, pad='', unknown=''): 59 | with open(text_file) as fp: 60 | texts = np.asarray([[vocab.get(word, vocab[unknown]) for word in line.split()] 61 | for line in tqdm(fp, desc='Converting token to id', leave=False)]) 62 | labels = None 63 | if label_file is not None: 64 | with open(label_file) as fp: 65 | labels = np.asarray([[label for label in line.split()] 66 | for line in tqdm(fp, desc='Converting labels', leave=False)]) 67 | return truncate_text(texts, max_len, vocab[pad], vocab[unknown]), labels 68 | 69 | 70 | def truncate_text(texts, max_len=500, padding_idx=0, unknown_idx=1): 71 | if max_len is None: 72 | return texts 73 | texts = np.asarray([list(x[:max_len]) + [padding_idx] * (max_len - len(x)) for x in texts]) 74 | texts[(texts == padding_idx).all(axis=1), 0] = unknown_idx 75 | return texts 76 | 77 | def build_spn_labels(labels, split_idx=None, label_idx=None, max_len=0): 78 | if label_idx is None: 79 | label_idx = {} 80 | 81 | idx = 0 82 | spn_labels = [] 83 | for label_row in labels: 84 | spn_label_row = [] 85 | for l in label_row: 86 | if l not in label_idx: 87 | label_idx[l] = idx 88 | idx += 1 89 | 90 | spn_label_row.append(label_idx[l]) 91 | 92 | spn_labels.append(spn_label_row) 93 | if len(spn_label_row) > max_len: 94 | max_len = len(spn_label_row) 95 | 96 | # Add a END of set label for SPN. 97 | label_idx["SPN"] = idx 98 | idx += 1 99 | 100 | # Create a constant size matrix. 101 | # NOTE: This is for SPN label set. 102 | for i, row in enumerate(spn_labels): 103 | row = row + [label_idx["SPN"] for i in range(0, max_len - len(row))] 104 | spn_labels[i] = row 105 | 106 | spn_labels = sparse.csr_matrix(np.asarray(spn_labels)) 107 | return label_idx, spn_labels[0: split_idx], spn_labels[split_idx: ], idx 108 | 109 | 110 | def convert_to_spn(labels): 111 | sum = labels.sum(axis=1) 112 | max_len = sum.max() 113 | 114 | spn_labels = [[] for i in range(labels.shape[0])] 115 | row, col = labels.nonzero() 116 | for i in range(row.shape[0]): 117 | spn_labels[row[i]].append(col[i]) 118 | 119 | # Create a constant size matrix. 120 | # NOTE: This is for SPN label set. 121 | for i, row in enumerate(spn_labels): 122 | row = row + [labels.shape[1] for i in range(0, max_len - len(row))] 123 | spn_labels[i] = row 124 | 125 | 126 | spn_labels = sparse.csr_matrix(np.asarray(spn_labels)) 127 | return spn_labels 128 | 129 | def get_mlb(mlb_path, labels=None) -> MultiLabelBinarizer: 130 | if os.path.exists(mlb_path): 131 | return joblib.load(mlb_path) 132 | mlb = MultiLabelBinarizer(sparse_output=True) # Create a binarizer if one has not been created before. 
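# Fit on the training label sets and cache to disk so later runs (and the test split) reuse the same label ordering.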
133 | mlb.fit(labels) 134 | joblib.dump(mlb, mlb_path) 135 | return mlb 136 | 137 | 138 | def get_sparse_feature(feature_file, label_file): 139 | sparse_x, _ = load_svmlight_file(feature_file, multilabel=True) 140 | return normalize(sparse_x), np.load(label_file, allow_pickle=True) if label_file is not None else None 141 | 142 | 143 | def output_res(output_path, name, scores, labels): 144 | os.makedirs(output_path, exist_ok=True) 145 | np.save(os.path.join(output_path, F'{name}-scores'), scores) 146 | np.save(os.path.join(output_path, F'{name}-labels'), labels) 147 | -------------------------------------------------------------------------------- /attention-xml/deepxml/dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/10 5 | @author yrh 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | from torch.utils.data import Dataset 12 | from scipy.sparse import csr_matrix 13 | from tqdm import tqdm 14 | from typing import Sequence, Optional, Union 15 | 16 | 17 | __all__ = ['MultiLabelDataset', 'XMLDataset'] 18 | 19 | TDataX = Sequence[Sequence] 20 | TDataY = Optional[csr_matrix] 21 | TDataZ = Optional[csr_matrix] 22 | TCandidate = TGroup = Optional[np.ndarray] 23 | TGroupLabel = TGroupScore = Optional[Union[np.ndarray, torch.Tensor]] 24 | 25 | 26 | class MultiLabelDataset(Dataset): 27 | """ 28 | 29 | """ 30 | def __init__(self, data_x: TDataX, data_y: TDataY = None, 31 | spn_data_y: TDataZ = None, training=True): 32 | self.data_x, self.data_y, self.training = data_x, data_y, training 33 | self.spn_data_y = spn_data_y 34 | 35 | def __getitem__(self, item): 36 | data_x = self.data_x[item] 37 | if self.training and self.data_y is not None: 38 | data_y = self.data_y[item].toarray().squeeze(0).astype(np.float32) 39 | if self.spn_data_y is not None: 40 | spn_data_y = self.spn_data_y[item].toarray().squeeze(0).astype(np.long) 41 | return data_x, data_y, spn_data_y 42 | else: 43 | return data_x, data_y 44 | else: 45 | return data_x 46 | 47 | def __len__(self): 48 | return len(self.data_x) 49 | 50 | 51 | class XMLDataset(MultiLabelDataset): 52 | """ 53 | 54 | """ 55 | def __init__(self, data_x: TDataX, data_y: TDataY = None, training=True, 56 | labels_num=None, candidates: TCandidate = None, candidates_num=None, 57 | groups: TGroup = None, group_labels: TGroupLabel = None, group_scores: TGroupScore = None): 58 | super(XMLDataset, self).__init__(data_x, data_y, training) 59 | self.labels_num, self.candidates, self.candidates_num = labels_num, candidates, candidates_num 60 | self.groups, self.group_labels, self.group_scores = groups, group_labels, group_scores 61 | if self.candidates is None: 62 | self.candidates = [np.concatenate([self.groups[g] for g in group_labels]) 63 | for group_labels in tqdm(self.group_labels, leave=False, desc='Candidates')] 64 | if self.group_scores is not None: 65 | self.candidates_scores = [np.concatenate([[s] * len(self.groups[g]) 66 | for g, s in zip(group_labels, group_scores)]) 67 | for group_labels, group_scores in zip(self.group_labels, self.group_scores)] 68 | else: 69 | self.candidates_scores = [np.ones_like(candidates) for candidates in self.candidates] 70 | if self.candidates_num is None: 71 | self.candidates_num = self.group_labels.shape[1] * max(len(g) for g in groups) 72 | 73 | def __getitem__(self, item): 74 | data_x, candidates = self.data_x[item], np.asarray(self.candidates[item], dtype=np.int) 75 | if self.training and self.data_y 
is not None: 76 | if len(candidates) < self.candidates_num: 77 | sample = np.random.randint(self.labels_num, size=self.candidates_num - len(candidates)) 78 | candidates = np.concatenate([candidates, sample]) 79 | elif len(candidates) > self.candidates_num: 80 | candidates = np.random.choice(candidates, self.candidates_num, replace=False) 81 | data_y = self.data_y[item, candidates].toarray().squeeze(0).astype(np.float32) 82 | return (data_x, candidates), data_y 83 | else: 84 | scores = self.candidates_scores[item] 85 | if len(candidates) < self.candidates_num: 86 | scores = np.concatenate([scores, [-np.inf] * (self.candidates_num - len(candidates))]) 87 | candidates = np.concatenate([candidates, [self.labels_num] * (self.candidates_num - len(candidates))]) 88 | scores = np.asarray(scores, dtype=np.float32) 89 | return data_x, candidates, scores 90 | 91 | def __len__(self): 92 | return len(self.data_x) 93 | -------------------------------------------------------------------------------- /attention-xml/deepxml/evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import numpy as np 10 | from functools import partial 11 | from scipy.sparse import csr_matrix 12 | from sklearn.preprocessing import MultiLabelBinarizer 13 | from typing import Union, Optional, List, Iterable, Hashable 14 | 15 | 16 | __all__ = ['get_precision', 'get_p_1', 'get_p_3', 'get_p_5', 'get_p_10', 17 | 'get_ndcg', 'get_n_1', 'get_n_3', 'get_n_5', 'get_n_10', 18 | 'get_inv_propensity', 'get_psp', 19 | 'get_psp_1', 'get_psp_3', 'get_psp_5', 'get_psp_10', 20 | 'get_psndcg_1', 'get_psndcg_3', 'get_psndcg_5', 'get_psndcg_10'] 21 | 22 | TPredict = np.ndarray 23 | TTarget = Union[Iterable[Iterable[Hashable]], csr_matrix] 24 | TMlb = Optional[MultiLabelBinarizer] 25 | TClass = Optional[List[Hashable]] 26 | 27 | 28 | def get_mlb(classes: TClass = None, mlb: TMlb = None, targets: TTarget = None): 29 | if classes is not None: 30 | mlb = MultiLabelBinarizer(classes, sparse_output=True) 31 | if mlb is None and targets is not None: 32 | if isinstance(targets, csr_matrix): 33 | mlb = MultiLabelBinarizer(range(targets.shape[1]), sparse_output=True) 34 | mlb.fit(None) 35 | else: 36 | mlb = MultiLabelBinarizer(sparse_output=True) 37 | mlb.fit(targets) 38 | return mlb 39 | 40 | 41 | def get_precision(prediction: TPredict, targets: TTarget, mlb: TMlb = None, classes: TClass = None, top=5): 42 | mlb = get_mlb(classes, mlb, targets) 43 | if not isinstance(targets, csr_matrix): 44 | targets = mlb.transform(targets) 45 | prediction = mlb.transform(prediction[:, :top]) 46 | return prediction.multiply(targets).sum() / (top * targets.shape[0]) 47 | 48 | 49 | get_p_1 = partial(get_precision, top=1) 50 | get_p_3 = partial(get_precision, top=3) 51 | get_p_5 = partial(get_precision, top=5) 52 | get_p_10 = partial(get_precision, top=10) 53 | 54 | 55 | def get_ndcg(prediction: TPredict, targets: TTarget, mlb: TMlb = None, classes: TClass = None, top=5): 56 | mlb = get_mlb(classes, mlb, targets) 57 | log = 1.0 / np.log2(np.arange(top) + 2) 58 | dcg = np.zeros((targets.shape[0], 1)) 59 | if not isinstance(targets, csr_matrix): 60 | targets = mlb.transform(targets) 61 | for i in range(top): 62 | p = mlb.transform(prediction[:, i: i+1]) 63 | dcg += p.multiply(targets).sum(axis=-1) * log[i] 64 | return np.average(dcg / log.cumsum()[np.minimum(targets.sum(axis=-1), top) - 1]) 65 | 66 | 67 | get_n_1 = 
partial(get_ndcg, top=1) 68 | get_n_3 = partial(get_ndcg, top=3) 69 | get_n_5 = partial(get_ndcg, top=5) 70 | get_n_10 = partial(get_ndcg, top=10) 71 | 72 | 73 | def get_inv_propensity(train_y: csr_matrix, a=0.55, b=1.5): 74 | n, number = train_y.shape[0], np.asarray(train_y.sum(axis=0)).squeeze() 75 | c = (np.log(n) - 1) * ((b + 1) ** a) 76 | return 1.0 + c * (number + b) ** (-a) 77 | 78 | 79 | def get_psp(prediction: TPredict, targets: TTarget, inv_w: np.ndarray, mlb: TMlb = None, 80 | classes: TClass = None, top=5): 81 | mlb = get_mlb(classes, mlb) 82 | if not isinstance(targets, csr_matrix): 83 | targets = mlb.transform(targets) 84 | prediction = mlb.transform(prediction[:, :top]).multiply(inv_w) 85 | num = prediction.multiply(targets).sum() 86 | t, den = csr_matrix(targets.multiply(inv_w)), 0 87 | for i in range(t.shape[0]): 88 | den += np.sum(np.sort(t.getrow(i).data)[-top:]) 89 | return num / den 90 | 91 | 92 | get_psp_1 = partial(get_psp, top=1) 93 | get_psp_3 = partial(get_psp, top=3) 94 | get_psp_5 = partial(get_psp, top=5) 95 | get_psp_10 = partial(get_psp, top=10) 96 | 97 | 98 | def get_psndcg(prediction: TPredict, targets: TTarget, inv_w: np.ndarray, mlb: TMlb = None, 99 | classes: TClass = None, top=5): 100 | mlb = get_mlb(classes, mlb) 101 | log = 1.0 / np.log2(np.arange(top) + 2) 102 | psdcg = 0.0 103 | if not isinstance(targets, csr_matrix): 104 | targets = mlb.transform(targets) 105 | for i in range(top): 106 | p = mlb.transform(prediction[:, i: i+1]).multiply(inv_w) 107 | psdcg += p.multiply(targets).sum() * log[i] 108 | t, den = csr_matrix(targets.multiply(inv_w)), 0.0 109 | for i in range(t.shape[0]): 110 | num = min(top, len(t.getrow(i).data)) 111 | den += -np.sum(np.sort(-t.getrow(i).data)[:num] * log[:num]) 112 | return psdcg / den 113 | 114 | 115 | get_psndcg_1 = partial(get_psndcg, top=1) 116 | get_psndcg_3 = partial(get_psndcg, top=3) 117 | get_psndcg_5 = partial(get_psndcg, top=5) 118 | get_psndcg_10 = partial(get_psndcg, top=10) 119 | -------------------------------------------------------------------------------- /attention-xml/deepxml/lib: -------------------------------------------------------------------------------- 1 | ../../lib -------------------------------------------------------------------------------- /attention-xml/deepxml/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | from collections import deque 14 | from torch.utils.data import DataLoader 15 | from tqdm import tqdm 16 | from logzero import logger 17 | from typing import Optional, Mapping, Tuple 18 | 19 | import torch.optim as optim 20 | from deepxml.evaluation import get_p_5, get_n_5 21 | from deepxml.modules import * 22 | from deepxml.optimizers import * 23 | 24 | 25 | __all__ = ['Model', 'XMLModel'] 26 | 27 | 28 | class Model(object): 29 | """ 30 | 31 | """ 32 | def __init__(self, network, model_path, gradient_clip_value=5.0, device_ids=None, **kwargs): 33 | self.model = nn.DataParallel(network(**kwargs).cuda(), device_ids=device_ids) 34 | if self.model.module.use_spn: 35 | self.loss_fn = self.model.module.spp_loss 36 | else: 37 | self.loss_fn = nn.BCEWithLogitsLoss() 38 | 39 | self.model_path, self.state = model_path, {} 40 | os.makedirs(os.path.split(self.model_path)[0], exist_ok=True) 41 | self.gradient_clip_value, self.gradient_norm_queue = 
gradient_clip_value, deque([np.inf], maxlen=5) 42 | self.optimizer = None 43 | 44 | def train_step(self, train_x: torch.Tensor, train_y: torch.Tensor): 45 | self.optimizer.zero_grad() 46 | self.model.train() 47 | scores = self.model(train_x) 48 | loss = self.loss_fn(scores, train_y) 49 | loss.backward() 50 | if self.model.module.use_spn == False: 51 | self.clip_gradient() 52 | 53 | self.optimizer.step(closure=None) 54 | return loss.item() 55 | 56 | def predict_step(self, data_x: torch.Tensor, k: int): 57 | self.model.eval() 58 | with torch.no_grad(): 59 | if self.model.module.use_spn: 60 | s = self.model(data_x) 61 | y = self.model.module.inference(s, s.shape[0]) 62 | weights = self.model.module.class_vec.weight.t() 63 | all_scores = torch.abs(torch.mm(y, weights)[:,:-1]) 64 | 65 | scores, labels = torch.topk(all_scores, k) 66 | return scores.cpu(), labels.cpu() 67 | else: 68 | scores, labels = torch.topk(self.model(data_x), k) 69 | return torch.sigmoid(scores).cpu(), labels.cpu() 70 | 71 | def get_optimizer(self, **kwargs): 72 | self.optimizer = DenseSparseAdam(self.model.parameters(), **kwargs) 73 | 74 | def train(self, train_loader: DataLoader, valid_loader: DataLoader, opt_params: Optional[Mapping] = None, 75 | nb_epoch=100, step=100, k=5, early=50, verbose=True, swa_warmup=None, **kwargs): 76 | self.get_optimizer(**({} if opt_params is None else opt_params)) 77 | global_step, best_n5, e = 0, 0.0, 0 78 | for epoch_idx in range(nb_epoch): 79 | if epoch_idx == swa_warmup: 80 | self.swa_init() 81 | 82 | for i, data in enumerate(train_loader, 1): 83 | train_x, train_y = data[0], data[1] 84 | global_step += 1 85 | if self.model.module.use_spn: 86 | spn_train_y = data[2] 87 | loss = self.train_step(train_x, spn_train_y.cuda().long()) 88 | else: 89 | loss = self.train_step(train_x, train_y.cuda()) 90 | if global_step % step == 0: 91 | self.swa_step() 92 | self.swap_swa_params() 93 | targets = valid_loader.dataset.data_y 94 | labels = np.concatenate([self.predict_step(valid_x, k)[1] for valid_x in valid_loader]) 95 | 96 | p5, n5 = get_p_5(labels, targets), get_n_5(labels, targets) 97 | if n5 > best_n5: 98 | self.save_model() 99 | best_n5, e = n5, 0 100 | else: 101 | e += 1 102 | if early is not None and e > early: 103 | return 104 | self.swap_swa_params() 105 | if verbose: 106 | logger.info(F'{epoch_idx} {i * train_loader.batch_size} train loss: {round(loss, 5)} ' 107 | F'P@5: {round(p5, 5)} nDCG@5: {round(n5, 5)} early stop: {e}') 108 | 109 | def predict(self, data_loader: DataLoader, k=100, desc='Predict', **kwargs): 110 | self.load_model() 111 | scores_list, labels_list = zip(*(self.predict_step(data_x, k) 112 | for data_x in tqdm(data_loader, desc=desc, leave=False))) 113 | return np.concatenate(scores_list), np.concatenate(labels_list) 114 | 115 | def save_model(self): 116 | torch.save(self.model.module.state_dict(), self.model_path) 117 | 118 | def load_model(self): 119 | self.model.module.load_state_dict(torch.load(self.model_path)) 120 | 121 | def clip_gradient(self): 122 | if self.gradient_clip_value is not None: 123 | max_norm = max(self.gradient_norm_queue) 124 | total_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm * self.gradient_clip_value) 125 | self.gradient_norm_queue.append(min(total_norm, max_norm * 2.0, 1.0)) 126 | if total_norm > max_norm * self.gradient_clip_value: 127 | logger.warn(F'Clipping gradients with total norm {round(total_norm, 5)} ' 128 | F'and max norm {round(max_norm, 5)}') 129 | 130 | def swa_init(self): 131 | if 'swa' not in 
self.state: 132 | logger.info('SWA Initializing') 133 | swa_state = self.state['swa'] = {'models_num': 1} 134 | for n, p in self.model.named_parameters(): 135 | swa_state[n] = p.data.clone().detach() 136 | 137 | def swa_step(self): 138 | if 'swa' in self.state: 139 | swa_state = self.state['swa'] 140 | swa_state['models_num'] += 1 141 | beta = 1.0 / swa_state['models_num'] 142 | with torch.no_grad(): 143 | for n, p in self.model.named_parameters(): 144 | swa_state[n].mul_(1.0 - beta).add_(beta, p.data) 145 | 146 | def swap_swa_params(self): 147 | if 'swa' in self.state: 148 | swa_state = self.state['swa'] 149 | for n, p in self.model.named_parameters(): 150 | p.data, swa_state[n] = swa_state[n], p.data 151 | 152 | def disable_swa(self): 153 | if 'swa' in self.state: 154 | del self.state['swa'] 155 | 156 | 157 | class XMLModel(Model): 158 | """ 159 | 160 | """ 161 | def __init__(self, labels_num, hidden_size, device_ids=None, attn_device_ids=None, 162 | most_labels_parallel_attn=80000, **kwargs): 163 | parallel_attn = labels_num <= most_labels_parallel_attn 164 | super(XMLModel, self).__init__(hidden_size=hidden_size, device_ids=device_ids, labels_num=labels_num, 165 | parallel_attn=parallel_attn, **kwargs) 166 | self.network, self.attn_weights = self.model, nn.Sequential() 167 | if not parallel_attn: 168 | self.attn_weights = AttentionWeights(labels_num, hidden_size*2, attn_device_ids) 169 | self.model = nn.ModuleDict({'Network': self.network.module, 'AttentionWeights': self.attn_weights}) 170 | self.state['best'] = {} 171 | 172 | def train_step(self, train_x: Tuple[torch.Tensor, torch.Tensor], train_y: torch.Tensor): 173 | self.optimizer.zero_grad() 174 | train_x, candidates = train_x 175 | self.model.train() 176 | scores = self.network(train_x, candidates=candidates, attn_weights=self.attn_weights) 177 | loss = self.loss_fn(scores, train_y) 178 | loss.backward() 179 | # self.clip_gradient() 180 | self.optimizer.step(closure=None) 181 | return loss.item() 182 | 183 | def predict_step(self, data_x: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], k): 184 | data_x, candidates, group_scores = data_x 185 | self.model.eval() 186 | with torch.no_grad(): 187 | scores = torch.sigmoid(self.network(data_x, candidates=candidates, attn_weights=self.attn_weights)) 188 | scores, labels = torch.topk(scores * group_scores.cuda(), k) 189 | return scores.cpu(), candidates[np.arange(len(data_x)).reshape(-1, 1), labels.cpu()] 190 | 191 | def train(self, *args, **kwargs): 192 | super(XMLModel, self).train(*args, **kwargs) 193 | self.save_model_to_disk() 194 | 195 | def save_model(self): 196 | model_dict = self.model.state_dict() 197 | for key in model_dict: 198 | self.state['best'][key] = model_dict[key].cpu().detach() 199 | 200 | def save_model_to_disk(self): 201 | model_dict = self.model.state_dict() 202 | for key in model_dict: 203 | model_dict[key][:] = self.state['best'][key] 204 | torch.save(self.model.state_dict(), self.model_path) 205 | 206 | def load_model(self): 207 | self.model.load_state_dict(torch.load(self.model_path)) 208 | -------------------------------------------------------------------------------- /attention-xml/deepxml/modules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/29 5 | @author yrh 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | 15 | __all__ = ['Embedding', 'LSTMEncoder', 
'MLAttention', 'AttentionWeights', 'FastMLAttention', 'MLLinear'] 16 | 17 | 18 | class Embedding(nn.Module): 19 | """ 20 | 21 | """ 22 | def __init__(self, vocab_size=None, emb_size=None, emb_init=None, emb_trainable=True, padding_idx=0, dropout=0.2): 23 | super(Embedding, self).__init__() 24 | if emb_init is not None: 25 | if vocab_size is not None: 26 | assert vocab_size == emb_init.shape[0] 27 | if emb_size is not None: 28 | assert emb_size == emb_init.shape[1] 29 | vocab_size, emb_size = emb_init.shape 30 | self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx, sparse=True, 31 | _weight=torch.from_numpy(emb_init).float() if emb_init is not None else None) 32 | self.emb.weight.requires_grad = emb_trainable 33 | self.dropout = nn.Dropout(dropout) 34 | self.padding_idx = padding_idx 35 | 36 | def forward(self, inputs): 37 | emb_out = self.dropout(self.emb(inputs)) 38 | lengths, masks = (inputs != self.padding_idx).sum(dim=-1), inputs != self.padding_idx 39 | return emb_out[:, :lengths.max()], lengths, masks[:, :lengths.max()] 40 | 41 | 42 | class LSTMEncoder(nn.Module): 43 | """ 44 | 45 | """ 46 | def __init__(self, input_size, hidden_size, layers_num, dropout): 47 | super(LSTMEncoder, self).__init__() 48 | self.lstm = nn.LSTM(input_size, hidden_size, layers_num, batch_first=True, bidirectional=True) 49 | self.init_state = nn.Parameter(torch.zeros(2*2*layers_num, 1, hidden_size)) 50 | self.dropout = nn.Dropout(dropout) 51 | 52 | def forward(self, inputs, lengths, **kwargs): 53 | self.lstm.flatten_parameters() 54 | init_state = self.init_state.repeat([1, inputs.size(0), 1]) 55 | cell_init, hidden_init = init_state[:init_state.size(0)//2], init_state[init_state.size(0)//2:] 56 | idx = torch.argsort(lengths, descending=True) 57 | packed_inputs = nn.utils.rnn.pack_padded_sequence(inputs[idx], lengths[idx], batch_first=True) 58 | outputs, _ = nn.utils.rnn.pad_packed_sequence( 59 | self.lstm(packed_inputs, (hidden_init, cell_init))[0], batch_first=True) 60 | return self.dropout(outputs[torch.argsort(idx)]) 61 | 62 | 63 | class MLAttention(nn.Module): 64 | """ 65 | 66 | """ 67 | def __init__(self, labels_num, hidden_size): 68 | super(MLAttention, self).__init__() 69 | self.attention = nn.Linear(hidden_size, labels_num, bias=False) 70 | nn.init.xavier_uniform_(self.attention.weight) 71 | 72 | def forward(self, inputs, masks): 73 | masks = torch.unsqueeze(masks, 1) # N, 1, L 74 | attention = self.attention(inputs).transpose(1, 2).masked_fill(~masks, -np.inf) # N, labels_num, L 75 | attention = F.softmax(attention, -1) 76 | output = attention @ inputs 77 | output = output.mean(dim=1) # Take a mean of the vectors across axis=1. 
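        # NOTE: after the mean over dim=1 the tensor has shape (N, hidden_size); the
        # (N, labels_num, hidden_size) comment on the return below describes the pre-mean
        # attention output of the original AttentionXML head.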
78 | return output # N, labels_num, hidden_size 79 | 80 | 81 | class AttentionWeights(nn.Module): 82 | """ 83 | 84 | """ 85 | def __init__(self, labels_num, hidden_size, device_ids=None): 86 | super(AttentionWeights, self).__init__() 87 | if device_ids is None: 88 | device_ids = list(range(1, torch.cuda.device_count())) 89 | assert labels_num >= len(device_ids) 90 | group_size, plus_num = labels_num // len(device_ids), labels_num % len(device_ids) 91 | self.group = [group_size + 1] * plus_num + [group_size] * (len(device_ids) - plus_num) 92 | assert sum(self.group) == labels_num 93 | self.emb = nn.ModuleList(nn.Embedding(size, hidden_size, sparse=True).cuda(device_ids[i]) 94 | for i, size in enumerate(self.group)) 95 | std = (6.0 / (labels_num + hidden_size)) ** 0.5 96 | with torch.no_grad(): 97 | for emb in self.emb: 98 | emb.weight.data.uniform_(-std, std) 99 | self.group_offset, self.hidden_size = np.cumsum([0] + self.group), hidden_size 100 | 101 | def forward(self, inputs: torch.Tensor): 102 | outputs = torch.zeros(*inputs.size(), self.hidden_size, device=inputs.device) 103 | for left, right, emb in zip(self.group_offset[:-1], self.group_offset[1:], self.emb): 104 | index = (left <= inputs) & (inputs < right) 105 | group_inputs = (inputs[index] - left).to(emb.weight.device) 106 | outputs[index] = emb(group_inputs).to(inputs.device) 107 | return outputs 108 | 109 | 110 | class FastMLAttention(nn.Module): 111 | """ 112 | 113 | """ 114 | def __init__(self, labels_num, hidden_size, parallel_attn=False): 115 | super(FastMLAttention, self).__init__() 116 | if parallel_attn: 117 | self.attention = nn.Embedding(labels_num + 1, hidden_size, sparse=True) 118 | nn.init.xavier_uniform_(self.attention.weight) 119 | 120 | def forward(self, inputs, masks, candidates, attn_weights: nn.Module): 121 | masks = torch.unsqueeze(masks, 1) # N, 1, L 122 | attn_inputs = inputs.transpose(1, 2) # N, hidden, L 123 | attn_weights = self.attention(candidates) if hasattr(self, 'attention') else attn_weights(candidates) 124 | # attention = (attn_weights @ attn_inputs).masked_fill(1.0 - masks, -np.inf) # N, sampled_size, L 125 | attention = (attn_weights @ attn_inputs).masked_fill(~masks, -np.inf) # N, sampled_size, L 126 | attention = F.softmax(attention, -1) # N, sampled_size, L 127 | return attention @ inputs # N, sampled_size, hidden_size 128 | 129 | 130 | class MLLinear(nn.Module): 131 | """ 132 | """ 133 | def __init__(self, linear_size, output_size): 134 | super(MLLinear, self).__init__() 135 | self.linear = nn.ModuleList(nn.Linear(in_s, out_s) 136 | for in_s, out_s in zip(linear_size[:-1], 137 | linear_size[1:])) 138 | # Initialize each layer. 139 | for linear in self.linear: 140 | nn.init.xavier_uniform_(linear.weight) 141 | 142 | # NOTE: In the original architecture: 143 | # Output size is 1. 144 | # Giving an output of x, 1 tensor. 145 | 146 | # New Architecture: output size is label size. 
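        # NOTE: concretely, AttentionRNN (networks.py) builds
        #   MLLinear([hidden_size * 2] + linear_size, labels_num)   # baseline head
        #   MLLinear([hidden_size * 2] + linear_size, spn_dim)      # HRR/SPN head
        # while FastAttentionRNN keeps the original per-candidate output size of 1.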
147 | self.output = nn.Linear(linear_size[-1], output_size) 148 | nn.init.xavier_uniform_(self.output.weight) 149 | 150 | def forward(self, inputs): 151 | linear_out = inputs 152 | for linear in self.linear: 153 | linear_out = F.relu(linear(linear_out)) 154 | 155 | o = self.output(linear_out) 156 | return o 157 | -------------------------------------------------------------------------------- /attention-xml/deepxml/networks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | from deepxml.modules import * 13 | from deepxml.lib.embeddings import get_vectors 14 | from deepxml.lib.mathops import get_appx_inv, circular_conv, complexMagProj 15 | 16 | 17 | __all__ = ['AttentionRNN', 'FastAttentionRNN'] 18 | 19 | 20 | class Network(nn.Module): 21 | """ 22 | 23 | """ 24 | def __init__(self, emb_size, vocab_size=None, emb_init=None, 25 | emb_trainable=True, padding_idx=0, emb_dropout=0.2, 26 | **kwargs): 27 | super(Network, self).__init__() 28 | self.emb = Embedding(vocab_size, emb_size, emb_init, emb_trainable, padding_idx, emb_dropout) 29 | 30 | def forward(self, *args, **kwargs): 31 | raise NotImplementedError 32 | 33 | 34 | class AttentionRNN(Network): 35 | """ 36 | 37 | """ 38 | def __init__(self, labels_num, emb_size, hidden_size, layers_num, 39 | linear_size, dropout, use_spn, spn_dim, no_grad, 40 | without_negative, **kwargs): 41 | super(AttentionRNN, self).__init__(emb_size, **kwargs) 42 | self.use_spn = use_spn 43 | if self.use_spn: 44 | self.label_size = spn_dim 45 | self.no_grad = no_grad 46 | self.without_negative = without_negative 47 | else: 48 | self.label_size = labels_num 49 | 50 | self.num_labels = labels_num 51 | self.lstm = LSTMEncoder(emb_size, hidden_size, layers_num, dropout) 52 | self.attention = MLAttention(self.label_size, hidden_size * 2) 53 | self.linear = MLLinear([hidden_size * 2] + linear_size, self.label_size) 54 | 55 | if self.use_spn: 56 | self.create_label_embedding() # Create the labels. 57 | 58 | def create_label_embedding(self): 59 | # Class labels. # +1 for the END of LIST Label. 60 | self._class_vectors = get_vectors(self.num_labels + 1, self.label_size) 61 | 62 | # Initialize embedding layer. 63 | self.class_vec = nn.Embedding(self.num_labels + 1, self.label_size) 64 | self.class_vec.load_state_dict({'weight': self._class_vectors}) 65 | self.class_vec.weight.requires_grad = False 66 | 67 | # Initialize weights vector. 68 | weights = torch.ones((self.num_labels + 1, 1), dtype=torch.int8) 69 | weights[self.num_labels] = 0 # Padding vector is made 0. 70 | self.class_weights = nn.Embedding(self.num_labels + 1, 1) 71 | self.class_weights.load_state_dict({'weight': weights}) 72 | self.class_weights.weight.requires_grad = False 73 | 74 | # P & N vectors. 
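        # NOTE: p and n act as the HRR "positive" / "negative" role vectors. A minimal
        # decoding sketch, mirroring inference() and Model.predict_step in models.py
        # (not extra functionality):
        #   y_hat  = circular_conv(get_appx_inv(self.p), s)   # unbind the positive role
        #   scores = y_hat @ self.class_vec.weight.t()        # compare with label vectors
        # get_vectors, get_appx_inv and circular_conv are the deepxml.lib helpers imported
        # at the top of this file.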
75 | p_n_vec = get_vectors(2, self.label_size, ortho=True) 76 | if self.no_grad: 77 | print("P & N vectors WILL NOT be updated while training...") 78 | self.p = nn.Parameter(p_n_vec[0], requires_grad=False) 79 | self.n = nn.Parameter(p_n_vec[1], requires_grad=False) 80 | else: 81 | print("P & N vectors WILL be updated while training...") 82 | self.p = nn.Parameter(p_n_vec[0], requires_grad=True) 83 | self.n = nn.Parameter(p_n_vec[1], requires_grad=True) 84 | 85 | 86 | def inference(self, s, batch_size, positive=True): 87 | #(batch, dims) 88 | if positive: 89 | vec = self.p.unsqueeze(0).expand(batch_size, self.label_size) 90 | else: 91 | vec = self.n.unsqueeze(0).expand(batch_size, self.label_size) 92 | 93 | # vec = complexMagProj(vec) 94 | inv_vec = get_appx_inv(vec) 95 | y = circular_conv(inv_vec, s) #(batch, dims) 96 | y = y / (torch.norm(y, dim=-1, keepdim=True) + 1e-8) 97 | return y 98 | 99 | def spp_loss(self, s, target): 100 | """ 101 | Train with SPP. 102 | """ 103 | pos_classes = self.class_vec(target) #(batch, no_label, dims) 104 | pos_classes = pos_classes * self.class_weights(target) # exit(0) 105 | 106 | # Normalize the class vectors. 107 | # tgt_shape = pos_classes.shape 108 | # pos_classes = torch.reshape(pos_classes, (tgt_shape[0] * tgt_shape[1], 109 | # tgt_shape[2])) 110 | # pos_classes = torch.reshape(complexMagProj(pos_classes), (tgt_shape[0], tgt_shape[1], 111 | # tgt_shape[2])) 112 | 113 | # Remove the padding idx vectors. 114 | # pos_classes = pos_classes.to(device) 115 | 116 | # Positive prediction loss 117 | convolve = self.inference(s, target.size(0)) 118 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 119 | J_p = torch.mean(torch.sum(1 - torch.abs(cosine), dim=-1)) 120 | 121 | # Negative prediction loss. 122 | J_n = 0.0 123 | if self.without_negative is False: 124 | convolve = self.inference(s, target.size(0), positive=False) 125 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 126 | J_n = torch.mean(torch.sum(torch.abs(cosine), dim=-1)) 127 | 128 | # Total Loss. 
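        # NOTE: in symbols, J_p penalises 1 - |cos(c_i, inv(p) (*) s)| for each bound class
        # vector c_i, and J_n penalises |cos(c_i, inv(n) (*) s)|, where (*) is circular
        # convolution; J_n is skipped when self.without_negative is True.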
129 | loss = J_n + J_p 130 | return loss 131 | 132 | 133 | def forward(self, inputs, **kwargs): 134 | emb_out, lengths, masks = self.emb(inputs, **kwargs) 135 | rnn_out = self.lstm(emb_out, lengths) # N, L, hidden_size * 2 (Bidirectional RNN) 136 | attn_out = self.attention(rnn_out, masks) # N, labels_num, hidden_size * 2 137 | return self.linear(attn_out) 138 | 139 | class FastAttentionRNN(Network): 140 | """ 141 | 142 | """ 143 | def __init__(self, labels_num, emb_size, hidden_size, layers_num, linear_size, dropout, parallel_attn, **kwargs): 144 | super(FastAttentionRNN, self).__init__(emb_size, **kwargs) 145 | self.lstm = LSTMEncoder(emb_size, hidden_size, layers_num, dropout) 146 | self.attention = FastMLAttention(labels_num, hidden_size * 2, parallel_attn) 147 | self.linear = MLLinear([hidden_size * 2] + linear_size, 1) 148 | 149 | def forward(self, inputs, candidates, attn_weights: nn.Module, **kwargs): 150 | emb_out, lengths, masks = self.emb(inputs, **kwargs) 151 | rnn_out = self.lstm(emb_out, lengths) # N, L, hidden_size * 2 152 | attn_out = self.attention(rnn_out, masks, candidates, attn_weights) # N, sampled_size, hidden_size * 2 153 | return self.linear(attn_out) 154 | -------------------------------------------------------------------------------- /attention-xml/deepxml/optimizers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/3/7 5 | @author yrh 6 | 7 | """ 8 | 9 | import math 10 | import torch 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | __all__ = ['DenseSparseAdam'] 15 | 16 | 17 | class DenseSparseAdam(Optimizer): 18 | """ 19 | 20 | """ 21 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0): 22 | if not 0.0 <= lr: 23 | raise ValueError("Invalid learning rate: {}".format(lr)) 24 | if not 0.0 <= eps: 25 | raise ValueError("Invalid epsilon value: {}".format(eps)) 26 | if not 0.0 <= betas[0] < 1.0: 27 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 28 | if not 0.0 <= betas[1] < 1.0: 29 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 30 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 31 | super(DenseSparseAdam, self).__init__(params, defaults) 32 | 33 | def step(self, closure=None): 34 | """ 35 | Performs a single optimization step. 36 | 37 | Parameters 38 | ---------- 39 | closure : ``callable``, optional. 40 | A closure that reevaluates the model and returns the loss. 
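
        Returns
        -------
        loss : the value returned by ``closure`` when one is supplied, otherwise ``None``.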
41 | """ 42 | loss = None 43 | if closure is not None: 44 | loss = closure() 45 | 46 | for group in self.param_groups: 47 | for p in group['params']: 48 | if p.grad is None: 49 | continue 50 | grad = p.grad.data 51 | 52 | state = self.state[p] 53 | 54 | # State initialization 55 | if 'step' not in state: 56 | state['step'] = 0 57 | if 'exp_avg' not in state: 58 | # Exponential moving average of gradient values 59 | state['exp_avg'] = torch.zeros_like(p.data) 60 | if 'exp_avg_sq' not in state: 61 | # Exponential moving average of squared gradient values 62 | state['exp_avg_sq'] = torch.zeros_like(p.data) 63 | 64 | state['step'] += 1 65 | 66 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 67 | beta1, beta2 = group['betas'] 68 | 69 | weight_decay = group['weight_decay'] 70 | 71 | if grad.is_sparse: 72 | grad = grad.coalesce() # the update is non-linear so indices must be unique 73 | grad_indices = grad._indices() 74 | grad_values = grad._values() 75 | size = grad.size() 76 | 77 | def make_sparse(values): 78 | constructor = grad.new 79 | if grad_indices.dim() == 0 or values.dim() == 0: 80 | return constructor().resize_as_(grad) 81 | return constructor(grad_indices, values, size) 82 | 83 | # Decay the first and second moment running average coefficient 84 | # old <- b * old + (1 - b) * new 85 | # <==> old += (1 - b) * (new - old) 86 | old_exp_avg_values = exp_avg.sparse_mask(grad)._values() 87 | exp_avg_update_values = grad_values.sub(old_exp_avg_values).mul_(1 - beta1) 88 | exp_avg.add_(make_sparse(exp_avg_update_values)) 89 | old_exp_avg_sq_values = exp_avg_sq.sparse_mask(grad)._values() 90 | exp_avg_sq_update_values = grad_values.pow(2).sub_(old_exp_avg_sq_values).mul_(1 - beta2) 91 | exp_avg_sq.add_(make_sparse(exp_avg_sq_update_values)) 92 | 93 | # Dense addition again is intended, avoiding another sparse_mask 94 | numer = exp_avg_update_values.add_(old_exp_avg_values) 95 | exp_avg_sq_update_values.add_(old_exp_avg_sq_values) 96 | denom = exp_avg_sq_update_values.sqrt_().add_(group['eps']) 97 | del exp_avg_update_values, exp_avg_sq_update_values 98 | 99 | bias_correction1 = 1 - beta1 ** state['step'] 100 | bias_correction2 = 1 - beta2 ** state['step'] 101 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 102 | 103 | p.data.add_(make_sparse(-step_size * numer.div_(denom))) 104 | if weight_decay > 0.0: 105 | p.data.add_(-group['lr'] * weight_decay, p.data.sparse_mask(grad)) 106 | else: 107 | # Decay the first and second moment running average coefficient 108 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 109 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 110 | denom = exp_avg_sq.sqrt().add_(group['eps']) 111 | 112 | bias_correction1 = 1 - beta1 ** state['step'] 113 | bias_correction2 = 1 - beta2 ** state['step'] 114 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 115 | 116 | p.data.addcdiv_(-step_size, exp_avg, denom) 117 | if weight_decay > 0.0: 118 | p.data.add_(-group['lr'] * weight_decay, p.data) 119 | 120 | return loss 121 | -------------------------------------------------------------------------------- /attention-xml/ensemble.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/6/11 5 | @author yrh 6 | 7 | """ 8 | 9 | import click 10 | import numpy as np 11 | from collections import defaultdict 12 | from tqdm import tqdm 13 | 14 | 15 | @click.command() 16 | @click.option('-p', '--prefix', help='Prefix of 
results.') 17 | @click.option('-t', '--trees', type=click.INT, help='The number of results using for ensemble.') 18 | def main(prefix, trees): 19 | labels, scores = [], [] 20 | for i in range(trees): 21 | labels.append(np.load(F'{prefix}-Tree-{i}-labels.npy', allow_pickle=True)) 22 | scores.append(np.load(F'{prefix}-Tree-{i}-scores.npy', allow_pickle=True)) 23 | ensemble_labels, ensemble_scores = [], [] 24 | for i in tqdm(range(len(labels[0]))): 25 | s = defaultdict(float) 26 | for j in range(len(labels[0][i])): 27 | for k in range(trees): 28 | s[labels[k][i][j]] += scores[k][i][j] 29 | s = sorted(s.items(), key=lambda x: x[1], reverse=True) 30 | ensemble_labels.append([x[0] for x in s[:len(labels[0][i])]]) 31 | ensemble_scores.append([x[1] for x in s[:len(labels[0][i])]]) 32 | np.save(F'{prefix}-Ensemble-labels', np.asarray(ensemble_labels)) 33 | np.save(F'{prefix}-Ensemble-scores', np.asarray(ensemble_scores)) 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /attention-xml/evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/8/21 5 | @author yrh 6 | """ 7 | 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | import click 12 | import numpy as np 13 | from sklearn.preprocessing import MultiLabelBinarizer 14 | 15 | from deepxml.evaluation import * 16 | 17 | 18 | @click.command() 19 | @click.option('-r', '--results', type=click.Path(exists=True), help='Path of results.') 20 | @click.option('-t', '--targets', type=click.Path(exists=True), help='Path of targets.') 21 | @click.option('--train-labels', type=click.Path(exists=True), default=None, help='Path of labels for training set.') 22 | @click.option('-a', type=click.FLOAT, default=0.55, help='Parameter A for propensity score.') 23 | @click.option('-b', type=click.FLOAT, default=1.5, help='Parameter B for propensity score.') 24 | def main(results, targets, train_labels, a, b): 25 | res, targets = np.load(results, allow_pickle=True), np.load(targets, allow_pickle=True) 26 | mlb = MultiLabelBinarizer(sparse_output=True) 27 | targets = mlb.fit_transform(targets) 28 | print('Precision@1,3,5:', get_p_1(res, targets, mlb), get_p_3(res, targets, mlb), get_p_5(res, targets, mlb)) 29 | print('nDCG@1,3,5:', get_n_1(res, targets, mlb), get_n_3(res, targets, mlb), get_n_5(res, targets, mlb)) 30 | if train_labels is not None: 31 | train_labels = np.load(train_labels, allow_pickle=True) 32 | inv_w = get_inv_propensity(mlb.transform(train_labels), a, b) 33 | print('PSPrecision@1,3,5:', get_psp_1(res, targets, inv_w, mlb), get_psp_3(res, targets, inv_w, mlb), 34 | get_psp_5(res, targets, inv_w, mlb)) 35 | print('PSnDCG@1,3,5:', get_psndcg_1(res, targets, inv_w, mlb), get_psndcg_3(res, targets, inv_w, mlb), 36 | get_psndcg_5(res, targets, inv_w, mlb)) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /attention-xml/experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # set -vx 3 | 4 | # AUTHOR: Ashwinkumar Ganesan. 5 | 6 | # NOTE: Usage:- 7 | # 1. ./experiments.sh all (for running experiments on all datasets). 8 | # 2. ./experiments.sh (for running experiments on a specific dataset). 9 | # 3. ./experiments.sh gather (for gathering the precision only). 10 | 11 | # Config. 
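# Positional arguments, in order: NAME EXP_NAME MODEL_TYPE DIMS WITH_GRAD WITHOUT_NEGATIVE
# PROP_A PROP_B. An illustrative invocation (values are examples only):
#   ./experiments.sh EUR-Lex test hrr 400 grad with-negative 0.55 1.5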
12 | NAME=${1:-"all"} 13 | MEM=256000 14 | SAVE_LOC="results" 15 | EXP_NAME=${2:-"test"} 16 | MODEL_TYPE=${3:-"all"} 17 | DIMS=${4:-400} 18 | WITH_GRAD=${5:-"grad"} # no-grad for training with no gradient to p & n vectors. 19 | WITHOUT_NEGATIVE=${6:-"with-negative"} # without-negative for training. 20 | PROP_A=${7:-"0.55"} # Propensity value A. 21 | PROP_B=${8:-"1.5"} # Propensity value B. 22 | 23 | create_job () { 24 | echo "Location to save model: $SAVE_LOC/$1 ..." 25 | if [[ ( "$MODEL_TYPE" == "all" ) ]]; then 26 | echo "Creating jobs for both models..." 27 | sbatch --job-name=$1-${DIMS}-all --mem=$MEM --array=0-1 --exclude=node[17-32] train.slurm.sh \ 28 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 29 | elif [[ ( "$MODEL_TYPE" == "baseline" ) ]]; then 30 | echo "Creating jobs for baseline model..." 31 | sbatch --job-name=$1-${DIMS}-base --mem=$MEM --array=0 --exclude=node[17-32] train.slurm.sh \ 32 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 33 | elif [[ ( "$MODEL_TYPE" == "hrr" ) ]]; then 34 | echo "Creating jobs for HRR model..." 35 | sbatch --job-name=$1-${DIMS}-hrr --mem=$MEM --array=1 --exclude=node[17-32] train.slurm.sh \ 36 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 37 | fi 38 | } 39 | 40 | # NOTE: Individual jobs for each dataset are easier to track. 41 | # This keeps the SLURM files simple. 42 | 43 | # Eurlex dataset. 44 | if [[ ( "$NAME" == "EUR-Lex" ) || ( "$NAME" == "all" ) ]] 45 | then 46 | create_job EUR-Lex $WITH_GRAD $WITHOUT_NEGATIVE 47 | fi 48 | 49 | # Wiki30k dataset. 50 | if [[ ( "$NAME" == "Wiki10-31K" ) || ( "$NAME" == "all" ) ]] 51 | then 52 | create_job Wiki10-31K $WITH_GRAD $WITHOUT_NEGATIVE 53 | fi 54 | 55 | # AmazonCat-13K dataset. 56 | if [[ ( "$NAME" == "AmazonCat-13K" ) || ( "$NAME" == "all" ) ]] 57 | then 58 | create_job AmazonCat-13K $WITH_GRAD $WITHOUT_NEGATIVE 59 | fi 60 | 61 | # Amazon-670K dataset. 
62 | if [[ ( "$NAME" == "Amazon-670K" ) || ( "$NAME" == "all" ) ]] 63 | then 64 | create_job Amazon-670K $WITH_GRAD $WITHOUT_NEGATIVE 65 | fi 66 | -------------------------------------------------------------------------------- /attention-xml/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import click 11 | import numpy as np 12 | from pathlib import Path 13 | from ruamel.yaml import YAML 14 | from sklearn.model_selection import train_test_split 15 | from torch.utils.data import DataLoader 16 | from logzero import logger 17 | 18 | from torch.nn import DataParallel 19 | from pytorch_model_summary import summary 20 | from deepxml.dataset import MultiLabelDataset 21 | from deepxml.data_utils import get_data, get_mlb, get_word_emb, output_res, build_spn_labels, convert_to_spn 22 | from deepxml.models import Model 23 | from deepxml.tree import FastAttentionXML 24 | from deepxml.networks import AttentionRNN 25 | 26 | 27 | @click.command() 28 | @click.option('-d', '--data-cnf', type=click.Path(exists=True), help='Path of dataset configure yaml.') 29 | @click.option('-m', '--model-cnf', type=click.Path(exists=True), help='Path of model configure yaml.') 30 | @click.option('--mode', type=click.Choice(['train', 'eval']), default=None) 31 | @click.option('-t', '--tree-id', type=click.INT, default=None) 32 | def main(data_cnf, model_cnf, mode, tree_id): 33 | tree_id = F'-Tree-{tree_id}' if tree_id is not None else '' 34 | yaml = YAML(typ='safe') 35 | data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf)) 36 | model, model_name, data_name = None, model_cnf['name'], data_cnf['name'] 37 | dim_size = model_cnf['model']['spn_dim'] if model_cnf['model']['spn_dim'] is not False else 0 38 | model_path = os.path.join(model_cnf['path'], F'{model_name}-{data_name}{tree_id}-{dim_size}') 39 | emb_init = get_word_emb(data_cnf['embedding']['emb_init']) 40 | logger.info(F'Model Name: {model_name}') 41 | 42 | # NOTE: The training and validation labels are a list of textual labels/ row. 
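    # For example (label ids are illustrative), train_labels looks roughly like
    #   np.array([['386', '4127'], ['29'], ...], dtype=object)
    # i.e. one variable-length list of label strings per document; the MultiLabelBinarizer
    # below turns these lists into sparse indicator matrices.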
43 | if mode is None or mode == 'train': 44 | logger.info('Loading Training and Validation Set') 45 | train_x, train_labels = get_data(data_cnf['train']['texts'], data_cnf['train']['labels']) 46 | if 'size' in data_cnf['valid']: 47 | random_state = data_cnf['valid'].get('random_state', 1240) 48 | train_x, valid_x, train_labels, valid_labels = train_test_split(train_x, train_labels, 49 | test_size=data_cnf['valid']['size'], 50 | random_state=random_state) 51 | else: 52 | valid_x, valid_labels = get_data(data_cnf['valid']['texts'], data_cnf['valid']['labels']) 53 | mlb = get_mlb(data_cnf['labels_binarizer'], np.hstack((train_labels, valid_labels))) 54 | train_y, valid_y = mlb.transform(train_labels), mlb.transform(valid_labels) 55 | labels_num = len(mlb.classes_) 56 | logger.info(F'Number of Labels: {labels_num}') 57 | logger.info(F'Size of Training Set: {len(train_x)}') 58 | logger.info(F'Size of Validation Set: {len(valid_x)}') 59 | 60 | if data_cnf['use_spn']: 61 | logger.info(F'Processing SPN Labels...') 62 | spn_train_labels = convert_to_spn(train_y) 63 | spn_valid_labels = convert_to_spn(valid_y) 64 | 65 | logger.info(F'Number of SPN Labels: {labels_num + 1}') 66 | logger.info(F'Maximum label in single row: {spn_train_labels.shape[1]}') 67 | logger.info(F'Training labels: {spn_train_labels.shape}') 68 | logger.info(F'Validation labels: {spn_valid_labels.shape}') 69 | 70 | logger.info('Training') 71 | if 'cluster' not in model_cnf: 72 | if data_cnf['use_spn']: 73 | train_loader = DataLoader(MultiLabelDataset(train_x, train_y, spn_train_labels), 74 | model_cnf['train']['batch_size'], shuffle=True, num_workers=4) 75 | valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, spn_valid_labels, training=False), 76 | model_cnf['valid']['batch_size'], num_workers=4) 77 | model = Model(network=AttentionRNN, labels_num=labels_num, model_path=model_path, 78 | emb_init=emb_init, use_spn=data_cnf['use_spn'], 79 | **data_cnf['model'], **model_cnf['model']) 80 | else: 81 | train_loader = DataLoader(MultiLabelDataset(train_x, train_y), 82 | model_cnf['train']['batch_size'], shuffle=True, num_workers=4) 83 | valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, training=False), 84 | model_cnf['valid']['batch_size'], num_workers=4) 85 | model = Model(network=AttentionRNN, labels_num=labels_num, model_path=model_path, 86 | emb_init=emb_init, use_spn=data_cnf['use_spn'], 87 | **data_cnf['model'], **model_cnf['model']) 88 | 89 | # Print Summary. 
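            # NOTE: `summary` from pytorch_model_summary is imported at the top of this file
            # but is not called here; the comment above appears to be a placeholder.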
90 | model.train(train_loader, valid_loader, **model_cnf['train']) 91 | else: 92 | model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id) 93 | model.train(train_x, train_y, valid_x, valid_y, mlb) 94 | logger.info('Finish Training') 95 | 96 | if mode is None or mode == 'eval': 97 | logger.info('Loading Test Set') 98 | mlb = get_mlb(data_cnf['labels_binarizer']) 99 | labels_num = len(mlb.classes_) 100 | test_x, _ = get_data(data_cnf['test']['texts'], None) 101 | logger.info(F'Size of Test Set: {len(test_x)}') 102 | 103 | logger.info('Predicting') 104 | if 'cluster' not in model_cnf: 105 | test_loader = DataLoader(MultiLabelDataset(test_x), model_cnf['predict']['batch_size'], 106 | num_workers=4) 107 | if model is None: 108 | model = DataParallel(Model(network=AttentionRNN, labels_num=labels_num, model_path=model_path, emb_init=emb_init, 109 | **data_cnf['model'], **model_cnf['model'])) 110 | scores, labels = model.predict(test_loader, k=model_cnf['predict'].get('k', 100)) 111 | else: 112 | if model is None: 113 | model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id) 114 | scores, labels = model.predict(test_x) 115 | logger.info('Finish Predicting') 116 | labels = mlb.classes_[labels] 117 | output_res(data_cnf['output']['res'], F'{model_name}-{dim_size}-{data_name}{tree_id}', scores, labels) 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /attention-xml/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/1/20 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import re 11 | import click 12 | import numpy as np 13 | from nltk.tokenize import word_tokenize 14 | from tqdm import tqdm 15 | from logzero import logger 16 | 17 | from deepxml.data_utils import * 18 | 19 | 20 | def tokenize(sentence: str, sep='/SEP/'): 21 | # We added a /SEP/ symbol between titles and descriptions such as Amazon datasets. 22 | return [token.lower() if token != sep else token for token in word_tokenize(sentence) 23 | if len(re.sub(r'[^\w]', '', token)) > 0] 24 | 25 | 26 | @click.command() 27 | @click.option('--text-path', type=click.Path(exists=True), help='Path of text.') 28 | @click.option('--tokenized-path', type=click.Path(), default=None, help='Path of tokenized text.') 29 | @click.option('--label-path', type=click.Path(exists=True), default=None, help='Path of labels.') 30 | @click.option('--vocab-path', type=click.Path(), default=None, 31 | help='Path of vocab, if it doesn\'t exit, build one and save it.') 32 | @click.option('--emb-path', type=click.Path(), default=None, help='Path of word embedding.') 33 | @click.option('--w2v-model', type=click.Path(), default=None, help='Path of Gensim Word2Vec Model.') 34 | @click.option('--vocab-size', type=click.INT, default=500000, help='Size of vocab.') 35 | @click.option('--max-len', type=click.INT, default=500, help='Truncated length.') 36 | @click.option('--add-spn', type=click.BOOL, default=False, help='Add SPN labels to the dataset.') 37 | def main(text_path, tokenized_path, label_path, vocab_path, emb_path, w2v_model, 38 | vocab_size, max_len, add_spn): 39 | if tokenized_path is not None: 40 | logger.info(F'Tokenizing Text. 
{text_path}') 41 | with open(text_path) as fp, open(tokenized_path, 'w') as fout: 42 | for line in tqdm(fp, desc='Tokenizing'): 43 | print(*tokenize(line), file=fout) 44 | text_path = tokenized_path 45 | 46 | if not os.path.exists(vocab_path): 47 | logger.info(F'Building Vocab. {text_path}') 48 | logger.info(F'Embedding Path. {w2v_model}') 49 | with open(text_path) as fp: 50 | vocab, emb_init = build_vocab(fp, w2v_model, vocab_size=vocab_size) 51 | np.save(vocab_path, vocab) 52 | np.save(emb_path, emb_init) 53 | 54 | vocab = {word: i for i, word in enumerate(np.load(vocab_path))} 55 | logger.info(F'Vocab Size: {len(vocab)}') 56 | 57 | logger.info(F'Getting Dataset: {text_path} Max Length: {max_len}') 58 | texts, labels = convert_to_binary(text_path, label_path, max_len, vocab) 59 | logger.info(F'Size of Samples: {len(texts)}') 60 | np.save(os.path.splitext(text_path)[0], texts) 61 | if labels is not None: 62 | assert len(texts) == len(labels) 63 | np.save(os.path.splitext(label_path)[0], labels) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /attention-xml/requirements.txt: -------------------------------------------------------------------------------- 1 | click==7.0 2 | ruamel.yaml==0.16.5 3 | numpy==1.16.2 4 | scipy==1.3.1 5 | scikit-learn==0.21.2 6 | gensim==3.4.0 7 | torch==1.0.1 8 | nltk==3.4 9 | tqdm==4.31.1 10 | joblib==0.13.2 11 | logzero==1.5.0 12 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_amazon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Amazon-670K 4 | MODEL=FastAttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy \ 13 | -a 0.6 \ 14 | -b 2.6 15 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_amazon3m.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Amazon-3M 4 | MODEL=FastAttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy \ 13 | -a 0.6 \ 14 | -b 2.6 15 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_amazoncat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=AmazonCat-13K 4 | MODEL=AttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy 13 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_eurlex.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=EUR-Lex 4 | MODEL=AttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results 
results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy 13 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $1 == "EUR-Lex" ]; then 4 | TRAIN_TEXT="--text-path data/$1/train_texts.txt" 5 | TEST_TEXT="--text-path data/$1/test_texts.txt" 6 | else 7 | TRAIN_TEXT="--text-path data/$1/train_raw_texts.txt --tokenized-path data/$1/train_texts.txt" 8 | TEST_TEXT="--text-path data/$1/test_raw_texts.txt --tokenized-path data/$1/test_texts.txt" 9 | fi 10 | 11 | if [ ! -f data/$1/train_texts.npy ]; then 12 | python preprocess.py $TRAIN_TEXT --label-path data/$1/train_labels.txt --vocab-path data/$1/vocab.npy --emb-path data/$1/emb_init.npy --w2v-model data/embeddings_weights/glove.6B.300d.bin 13 | fi 14 | 15 | if [ ! -f data/$1/test_texts.npy ]; then 16 | python preprocess.py $TEST_TEXT --label-path data/$1/test_labels.txt --vocab-path data/$1/vocab.npy 17 | fi 18 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_wiki.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Wiki-500K 4 | MODEL=FastAttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy \ 13 | -a 0.5 \ 14 | -b 0.4 15 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_wiki10.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Wiki10-31K 4 | MODEL=AttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy 13 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_xml.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python main.py --data-cnf configure/datasets/$1.yaml --model-cnf configure/models/$2-$1.yaml -t 0 4 | python main.py --data-cnf configure/datasets/$1.yaml --model-cnf configure/models/$2-$1.yaml -t 1 5 | python main.py --data-cnf configure/datasets/$1.yaml --model-cnf configure/models/$2-$1.yaml -t 2 6 | python ensemble.py -p results/$2-$1 -t 3 7 | -------------------------------------------------------------------------------- /attention-xml/train.slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is for GPU allocation is available. #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --output=output/slurm-%x-%a.out 7 | #SBATCH --error=output/slurm-%x-%a.err 8 | 9 | # Set the environment. 10 | # source deactivate # Remove previous environments. 11 | source ~/anaconda3/etc/profile.d/conda.sh 12 | conda activate spp # Environment name. 13 | 14 | # Execute the code. 
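# This script is normally submitted via experiments.sh, for example (illustrative values):
#   sbatch --job-name=EUR-Lex-400-all --mem=256000 --array=0-1 train.slurm.sh \
#       EUR-Lex results/EUR-Lex test 400 grad with-negative 0.55 1.5
# SLURM_ARRAY_TASK_ID 0 selects the "baseline" model and 1 the "hrr" model (see MODEL below).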
15 | set -o xtrace 16 | TASK_ID=$((SLURM_ARRAY_TASK_ID)) 17 | NAME=$1 18 | SAVE_MODEL=$2 19 | EXP_NAME=$3 20 | DIMS=$4 21 | WITH_GRAD=${5} 22 | WITHOUT_NEGATIVE=${6} 23 | PROP_A=${7:-"0.55"} # Propensity value A. For Amazon-670K it is 0.6 24 | PROP_B=${8:-"1.5"} # Propensity value B. For Amazon-670K it is 2.6 25 | 26 | # Model information. 27 | MODEL=("baseline" "hrr") 28 | MODEL_NETWORK="AttentionXML" 29 | 30 | # Select the model. 31 | MODEL_TYPE=${MODEL[${TASK_ID}]} 32 | FIN_EXP_NAME=${NAME}-${EXP_NAME}-${MODEL_TYPE}-${DIMS}-${WITH_GRAD}-${WITHOUT_NEGATIVE} 33 | echo "Parameters: $NAME $SAVE_MODEL" 34 | echo " $MODEL_TYPE $EXP_NAME $DIMS" 35 | echo " ${WITH_GRAD} ${WITHOUT_NEGATIVE}" 36 | 37 | # Construct list of options. 38 | OPTIONS="" 39 | if [ "$MODEL_TYPE" == "hrr" ] 40 | then 41 | DATA_YAML=${NAME}-spn 42 | MODEL_YAML=${MODEL_NETWORK}-${NAME}-spn-${DIMS} 43 | LABEL_NAME=${MODEL_NETWORK}-${DIMS}-${NAME}-spn-${DIMS} 44 | else 45 | DATA_YAML=${NAME} 46 | MODEL_YAML=${MODEL_NETWORK}-${NAME} 47 | LABEL_NAME=${MODEL_NETWORK}-0-${NAME}-baseline-0 48 | fi 49 | 50 | if [ "$WITH_GRAD" == "no-grad" ] 51 | then 52 | OPTIONS="${OPTIONS} --no-grad" 53 | fi 54 | 55 | if [ "${WITHOUT_NEGATIVE}" == "without-negative" ] 56 | then 57 | OPTIONS="${OPTIONS} --without-negative" 58 | fi 59 | 60 | # Train the the models. 61 | echo $DATA_YAML, $MODEL_YAML 62 | echo "OPTIONS: $OPTIONS" 63 | python main.py --data-cnf configure/datasets/${DATA_YAML}.yaml --model-cnf configure/models/${MODEL_YAML}.yaml > results/${FIN_EXP_NAME}.results 64 | 65 | # Evaluation. 66 | echo "Test Results..." 67 | python evaluation.py --results results/${LABEL_NAME}-labels.npy \ 68 | --targets data/${NAME}/test_labels.npy --train-labels data/${NAME}/train_labels.npy >> results/${FIN_EXP_NAME}.results 69 | -------------------------------------------------------------------------------- /combine_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # set -vx 3 | 4 | # Script to combine results from different experiments. 5 | # AUTHOR: Ashwinkumar Ganesan. 6 | 7 | # Config. 8 | NAME=${1:-"all"} 9 | MEM=256000 10 | SAVE_LOC="data/model+results" 11 | EXP_NAME=${2:-"temp-exp"} 12 | MODEL_TYPE=${3:-"all"} 13 | DIMS=${4:-400} 14 | THRESHOLD=${5:-0.3} 15 | SAVE_FILE_NAME="$EXP_NAME.results" 16 | 17 | get_results () { 18 | if [[ ( "$NAME" == "$1" ) || ( "$NAME" == "all" ) || ( "$NAME" == "gather" ) ]] 19 | then 20 | SAVE_FILE=$SAVE_LOC/$SAVE_FILE_NAME 21 | echo -e "\n" >> $SAVE_FILE 22 | echo "Dataset: $1" >> $SAVE_FILE 23 | 24 | if [[ ( "$MODEL_TYPE" == "baseline" ) || ( "$MODEL_TYPE" == "all" ) ]]; then 25 | echo "Baseline..." >> $SAVE_FILE 26 | tail -7 $SAVE_LOC/$1/$1_baseline_$EXP_NAME.results >> $SAVE_FILE 27 | fi 28 | 29 | if [[ ( "$MODEL_TYPE" == "spn" ) || ( "$MODEL_TYPE" == "all" ) ]]; then 30 | echo -e "\nSPN..." >> $SAVE_FILE 31 | tail -7 $SAVE_LOC/$1/$1_spn_$EXP_NAME.results >> $SAVE_FILE 32 | fi 33 | fi 34 | } 35 | 36 | echo "Delete old results..." 37 | rm $SAVE_LOC/$SAVE_FILE_NAME 38 | 39 | # Combine results. 40 | get_results Bibtex 41 | get_results Delicious 42 | get_results Mediamill 43 | get_results Eurlex4k 44 | get_results Wiki10 45 | get_results AmazonCat13K 46 | get_results Amazon670K 47 | get_results DeliciousLarge 48 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ### Location to store all datasets. 
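
The experiment scripts expect each dataset in its own sub-directory here, e.g.
`data/Eurlex4k/eurlex_train.txt` and `data/Eurlex4k/eurlex_test.txt` for EUR-Lex
(paths as referenced in `experiments.sh`).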
-------------------------------------------------------------------------------- /experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # set -vx 3 | 4 | # Script to execute experiments with different datasets. 5 | # AUTHOR: Ashwinkumar Ganesan. 6 | 7 | # NOTE: Usage:- 8 | # 1. ./experiments.sh all (for running experiments on all datasets). 9 | # 2. ./experiments.sh (for running experiments on a specific dataset). 10 | # 3. ./experiments.sh gather (for gathering the precision only). 11 | 12 | # Config. 13 | NAME=${1:-"all"} 14 | MEM=256000 15 | SAVE_LOC="data/model+results" 16 | EXP_NAME=${2:-"temp-exp"} 17 | MODEL_TYPE=${3:-"all"} 18 | DIMS=${4:-400} 19 | THRESHOLD=${5:-0.3} 20 | WITH_GRAD=${6:-"grad"} # no-grad for training with no gradient to p & n vectors. 21 | WITHOUT_NEGATIVE=${7:-"with-negative"} # without-negative for training. 22 | SAVE_FILE_NAME="$EXP_NAME.results" 23 | 24 | create_job () { 25 | echo "Location to save model: $SAVE_LOC/$1 ..." 26 | if [[ ( "$MODEL_TYPE" == "all" ) ]]; then 27 | echo "Creating jobs for both models..." 28 | sbatch --job-name=$5 --mem=$MEM --array=0-1 --exclude=node[17-32] train.slurm.sh \ 29 | $1 $2 $3 $4 $SAVE_LOC/$1 $THRESHOLD $EXP_NAME $DIMS $6 $7 $8 $9 30 | elif [[ ( "$MODEL_TYPE" == "baseline" ) ]]; then 31 | echo "Creating jobs for baseline model..." 32 | sbatch --job-name=$5 --mem=$MEM --array=0 --exclude=node[17-32] train.slurm.sh \ 33 | $1 $2 $3 $4 $SAVE_LOC/$1 $THRESHOLD $EXP_NAME $DIMS $6 $7 $8 $9 34 | elif [[ ( "$MODEL_TYPE" == "spn" ) ]]; then 35 | echo "Creating jobs for SPN model..." 36 | sbatch --job-name=$5 --mem=$MEM --array=1 --exclude=node[17-32] train.slurm.sh \ 37 | $1 $2 $3 $4 $SAVE_LOC/$1 $THRESHOLD $EXP_NAME $DIMS $6 $7 $8 $9 38 | fi 39 | } 40 | 41 | # NOTE: Individual jobs for each dataset are easier to track. 42 | # This keeps the SLURM files simple. 43 | 44 | # Bibtex dataset. 45 | if [[ ( "$NAME" == "Bibtex" ) || ( "$NAME" == "all" ) ]] 46 | then 47 | create_job Bibtex data/Bibtex/Bibtex_data.txt data/Bibtex/bibtex_trSplit.txt \ 48 | data/Bibtex/bibtex_tstSplit.txt bibtex 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 49 | fi 50 | 51 | # Delicious dataset. 52 | if [[ ( "$NAME" == "Delicious" ) || ( "$NAME" == "all" ) ]] 53 | then 54 | create_job Delicious data/Delicious/Delicious_data.txt data/Delicious/delicious_trSplit.txt \ 55 | data/Delicious/delicious_tstSplit.txt delic 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 56 | fi 57 | 58 | # Mediamill dataset. 59 | if [[ ( "$NAME" == "Mediamill" ) || ( "$NAME" == "all" ) ]] 60 | then 61 | create_job Mediamill data/Mediamill/Mediamill_data.txt data/Mediamill/mediamill_trSplit.txt \ 62 | data/Mediamill/mediamill_tstSplit.txt mediam 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 63 | fi 64 | 65 | # Eurlex-4K dataset. 66 | if [[ ( "$NAME" == "Eurlex4k" ) || ( "$NAME" == "all" ) ]] 67 | then 68 | create_job Eurlex4k None data/Eurlex4k/eurlex_train.txt data/Eurlex4k/eurlex_test.txt eurlex 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 69 | fi 70 | 71 | # Wiki10 dataset. 72 | if [[ ( "$NAME" == "Wiki10" ) || ( "$NAME" == "all" ) ]] 73 | then 74 | create_job Wiki10 None data/Wiki10/train.txt data/Wiki10/test.txt wiki10 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 75 | fi 76 | 77 | # AmazonCat13K dataset. 78 | if [[ ( "$NAME" == "AmazonCat13K" ) || ( "$NAME" == "all" ) ]] 79 | then 80 | create_job AmazonCat13K None data/AmazonCat13K/train.txt data/AmazonCat13K/test.txt ama13k 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 81 | fi 82 | 83 | # Amazon670K dataset. 
84 | if [[ ( "$NAME" == "Amazon670K" ) || ( "$NAME" == "all" ) ]] 85 | then 86 | create_job Amazon670K None data/Amazon670K/train.txt data/Amazon670K/test.txt ama670 16 16 $WITH_GRAD $WITHOUT_NEGATIVE 87 | fi 88 | 89 | # DeliciousLarge dataset. 90 | if [[ ( "$NAME" == "DeliciousLarge" ) || ( "$NAME" == "all" ) ]] 91 | then 92 | create_job DeliciousLarge None data/DeliciousLarge/deliciousLarge_train.txt \ 93 | data/DeliciousLarge/deliciousLarge_test.txt dlarge 8 8 $WITH_GRAD $WITHOUT_NEGATIVE 94 | fi 95 | -------------------------------------------------------------------------------- /hrr-example-representation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/hrr-example-representation.png -------------------------------------------------------------------------------- /hrr-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/hrr-example.png -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/lib/__init__.py -------------------------------------------------------------------------------- /lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /lib/__pycache__/mathops.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/lib/__pycache__/mathops.cpython-36.pyc -------------------------------------------------------------------------------- /lib/embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Operations to generate embeddings. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | from gensim.models import KeyedVectors 11 | 12 | from .mathops import complex_multiplication, complex_division, circular_conv 13 | from .mathops import get_appx_inv, get_inv, complexMagProj, normalize 14 | from .mathops import npcomplexMagProj 15 | 16 | """ 17 | Load Pretrained Label Embeddings. 18 | """ 19 | def load_embeddings(save_loc, vocab_size): 20 | fname = save_loc + "-complex.bin" 21 | model = KeyedVectors.load_word2vec_format(fname, binary=True) 22 | rand_vec_cnt = 0 23 | vectors = [] # positions in vector space. 24 | for i in range(0, vocab_size): 25 | if str(i) in model.wv.vocab: 26 | vectors.append(model.wv[str(i)]) 27 | else: 28 | # NOTE: When a label is not present in training then we generate a 29 | # default vector and add it to the label vector matrix. 
30 | # As SPN select the label based on the index it remains consistent while training. 31 | rand_vec_cnt += 1 32 | vectors.append(gen_rand_vec(model.vector_size)) 33 | 34 | # Add Padding idx. 35 | print("Vocabulary Size: {}".format(vocab_size)) 36 | print("Number of Random vectors generated: {}".format(rand_vec_cnt)) 37 | vectors.append(gen_rand_vec(model.vector_size)) 38 | vectors = torch.from_numpy(np.array(vectors, dtype=np.float32)) 39 | return vectors 40 | 41 | """ 42 | NumPY operations for embeddings. 43 | """ 44 | def generate_vectors(num_vectors, dims): 45 | """ 46 | Generate n vectors of size dims that are orthogonal to each other. 47 | """ 48 | if num_vectors > dims: 49 | raise ValueError("num_vectors cannot be greater than dims!") 50 | 51 | # Intializing class vectors. 52 | vecs = torch.randn(dims, num_vectors, dtype=torch.float) 53 | 54 | # Using QR decomposition to get orthogonal vectors. 55 | vecs, _ = torch.qr(vecs) 56 | vecs = vecs.t() 57 | vecs = vecs / torch.norm(vecs, dim=-1, keepdim=True) 58 | return vecs 59 | 60 | 61 | def gen_rand_vec(dims): 62 | """ 63 | Generate a random vector of size dims. 64 | """ 65 | return npcomplexMagProj(np.random.normal(0, 1. / dims, size=(dims))) 66 | 67 | 68 | """ 69 | Torch functions. 70 | """ 71 | def get_vectors(num_vectors, dims, ortho=False): 72 | if ortho: 73 | vectors = generate_vectors(num_vectors, dims) 74 | return complexMagProj(vectors) 75 | else: 76 | vectors = [gen_rand_vec(dims) for i in range(num_vectors)] 77 | return torch.from_numpy(np.array(vectors, dtype=np.float32)) 78 | 79 | def get_static_embedding(seeds, dims): 80 | vec = [] 81 | for s in seeds: 82 | torch.manual_seed(s) 83 | vec.append(torch.randn((1, dims), dtype=torch.float)) 84 | 85 | return torch.cat(vec, dim=0) 86 | -------------------------------------------------------------------------------- /lib/mathops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to perform circular convolution operations. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan, Sunil Gandhi, Hang Gao" 6 | __email__ = "gashwin1@umbc.edu,sunilga1@umbc.edu,hanggao@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | """ 14 | Pytorch functions. 15 | """ 16 | def complex_multiplication(left, right): 17 | """ 18 | Multiply two vectors in complex domain. 19 | """ 20 | left_real, left_complex = left[..., 0], left[..., 1] 21 | right_real, right_complex = right[..., 0], right[..., 1] 22 | 23 | output_real = left_real * right_real - left_complex * right_complex 24 | output_complex = left_real * right_complex + left_complex * right_real 25 | return torch.stack([output_real, output_complex], dim=-1) 26 | 27 | def complex_division(left, right): 28 | """ 29 | Divide two vectors in complex domain. 
30 | """ 31 | left_real, left_complex = left[..., 0], left[..., 1] 32 | right_real, right_complex = right[..., 0], right[..., 1] 33 | 34 | output_real = torch.div((left_real * right_real + left_complex * right_complex),(right_real**2 + right_complex**2)) 35 | output_complex = torch.div((left_complex * right_real - left_real * right_complex ),(right_real**2 + right_complex**2)) 36 | return torch.stack([output_real, output_complex], dim=-1) 37 | 38 | def circular_conv(a, b): 39 | """ Defines the circular convolution operation 40 | a: tensor of shape (batch, D) 41 | b: tensor of shape (batch, D) 42 | """ 43 | left = torch.rfft(a, 1, onesided=False) 44 | right = torch.rfft(b, 1, onesided=False) 45 | output = complex_multiplication(left, right) 46 | output = torch.irfft(output, 1, signal_sizes=a.shape[-1:], onesided=False) 47 | return output 48 | 49 | def get_appx_inv(a): 50 | """ 51 | Compute approximate inverse of vector a. 52 | """ 53 | return torch.roll(torch.flip(a, dims=[-1]), 1,-1) 54 | 55 | def get_inv(a, typ=torch.DoubleTensor): 56 | """ 57 | Compute exact inverse of vector a. 58 | """ 59 | left = torch.rfft(a, 1, onesided=False) 60 | complex_1 = np.zeros(left.shape) 61 | complex_1[...,0] = 1 62 | op = complex_division(typ(complex_1),left) 63 | return torch.irfft(op,1,onesided=False) 64 | 65 | def complexMagProj(x): 66 | """ 67 | Normalize a vector x in complex domain. 68 | """ 69 | c = torch.rfft(x, 1, onesided=False) 70 | c_ish=c/torch.norm(c, dim=-1,keepdim=True) 71 | output = torch.irfft(c_ish, 1, signal_sizes=x.shape[1:], onesided=False) 72 | return output 73 | 74 | def normalize(x): 75 | return x/torch.norm(x) 76 | 77 | """ 78 | Numpy Functions. 79 | """ 80 | # Make them work with batch dimensions 81 | def cc(a, b): 82 | return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b)) 83 | 84 | def np_inv(a): 85 | return np.fft.irfft((1.0/np.fft.rfft(a)),n=a.shape[-1]) 86 | 87 | def np_appx_inv(a): 88 | #Faster implementation 89 | return np.roll(np.flip(a, axis=-1), 1,-1) 90 | 91 | def npcomplexMagProj(x): 92 | """ 93 | Normalize a vector x in complex domain. 94 | """ 95 | c = np.fft.rfft(x) 96 | 97 | # Look at real and image as if they were real 98 | c_ish = np.vstack([c.real, c.imag]) 99 | 100 | # Normalize magnitude of each complex/real pair 101 | c_ish=c_ish/np.linalg.norm(c_ish, axis=0) 102 | c_proj = c_ish[0,:] + 1j * c_ish[1,:] 103 | return np.fft.irfft(c_proj,n=x.shape[-1]) 104 | 105 | def nrm(a): 106 | return a / np.linalg.norm(a) 107 | -------------------------------------------------------------------------------- /lib/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 
22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal parameters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Threshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikipedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | 104 | # Print the final results. 105 | # This provides the results for agg metrics when threshold for inference 106 | # is optimized and metrics are then computed.
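A hedged usage sketch of ```compute_inv_propensity``` and ```compute_prop_metrics``` defined above, mirroring how ```cnn_test.py``` wraps dense 0/1 matrices in SciPy CSR form before scoring. The toy matrices and ```topk``` value are made up, and in the repository the propensities come from the training labels rather than the test ground truth.

```python
# Toy usage of the helpers above; matrices are made up. The returned object is
# whatever xclib's Metrics.eval(...) yields (P@k, nDCG@k, PSP@k, PSnDCG@k),
# which display_metrics below knows how to aggregate.
import numpy as np
from scipy import sparse
from lib.metrics import compute_inv_propensity, compute_prop_metrics

y_true = np.array([[1, 0, 1, 0],
                   [0, 1, 0, 0]], dtype=np.int32)  # ground-truth rows (one-hot)
y_score = np.array([[0.9, 0.1, 0.7, 0.0],
                    [0.2, 0.8, 0.1, 0.3]])         # model scores per label

inv_prop = compute_inv_propensity(y_true, A=0.55, B=1.5)
metrics = compute_prop_metrics(sparse.csr_matrix(y_true),
                               sparse.csr_matrix(y_score),
                               inv_prop, topk=2)
print(metrics)
```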
107 | def display_agg_results(args, te_loss, pr, rec, f1): 108 | print("----------Tests with Threshold Inference------------") 109 | print("Inference Threshold: {:.3f}".format(args.th)) 110 | print("Test Loss: {:.3f}".format(te_loss)) 111 | print("Test Precision: {:.3f}".format(pr * 100)) 112 | print("Test Recall: {:.3f}".format(rec * 100)) 113 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 114 | 115 | 116 | def display_metrics(metrics, k=5): 117 | # Merge batchwise metrics. 118 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 119 | for idx, metric in enumerate(metrics): 120 | for i in range(0, 4): 121 | for j in range(0, k): 122 | final_metrics[i][j] += metric[i][j] 123 | 124 | # Dataset metrics. 125 | print("----------Tests with Ordered Retrieval------------") 126 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 127 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 128 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 129 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 130 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 131 | floatfmt=".3f")) 132 | -------------------------------------------------------------------------------- /lib/metrics_old.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal paramaters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 
35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Theshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikpedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | # Print the final results. 104 | # This provides the results for agg metrics when threshold for inference 105 | # is optimized and metrics are then computed. 106 | def display_agg_results(args, te_loss, pr, rec, f1): 107 | print("----------Tests with Threshold Inference------------") 108 | print("Inference Threshold: {:.3f}".format(args.th)) 109 | print("Test Loss: {:.3f}".format(te_loss)) 110 | print("Test Precision: {:.3f}".format(pr * 100)) 111 | print("Test Recall: {:.3f}".format(rec * 100)) 112 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 113 | 114 | 115 | def display_metrics(metrics, k=5): 116 | # Merge batchwise metrics. 117 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 118 | for idx, metric in enumerate(metrics): 119 | for i in range(0, 4): 120 | for j in range(0, k): 121 | final_metrics[i][j] += metric[i][j] 122 | 123 | # Dataset metrics. 
124 | print("----------Tests with Ordered Retrieval------------") 125 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 126 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 127 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 128 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 129 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 130 | floatfmt=".3f")) 131 | -------------------------------------------------------------------------------- /lib/plots.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manage plots. 3 | AUTHOR: Ashwinkumar Ganesan. 4 | """ 5 | 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import csv 9 | import pandas as pd 10 | 11 | """ 12 | Plot training and testing curves. 13 | The graph includes: 14 | 1. Training loss per epoch. 15 | 2. Test loss per epoch. 16 | 3. Precision per epoch. 17 | 4. Recall per epoch. 18 | 5. F1 score per epoch. 19 | """ 20 | def plot_stats(tr_stats): 21 | fig, ax = plt.subplots(2) 22 | 23 | ep = [i for i in range(0, epochs)] 24 | tr_loss = tr_stats['Training Loss'] 25 | te_loss = tr_stats['Test Loss'] 26 | pr = tr_stats['Precision'] 27 | re = tr_stats['Recall'] 28 | f1 = tr_stats['F1 Score'] 29 | 30 | # Loss Curve. 31 | ax[0].plot(ep, tr_loss) 32 | ax[0].plot(ep, te_loss) 33 | ax[0].set_title("Training & Testing Loss Per Epoch") 34 | 35 | 36 | ax[1].plot(ep, pr) 37 | ax[1].plot(ep, re) 38 | ax[1].plot(ep, f1) 39 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | """ 4 | 5 | from prettytable import PrettyTable 6 | import pandas as pd 7 | from time import time 8 | import torch 9 | 10 | GB_DIV = 1024 * 1024 * 1024 11 | 12 | 13 | def print_memory_profile(): 14 | """ 15 | Get basic memory information. 16 | """ 17 | device = torch.cuda.current_device() 18 | print("Allocated: {:.4f}".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 19 | print("Reserved: {:.4f}\n".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 20 | 21 | # https://stackoverflow.com/questions/9535954/printing-lists-as-tabular-data 22 | def print_command_arguments(args): 23 | table = PrettyTable(['Parameter', 'Value']) 24 | table.title = 'Experimental Setup' 25 | for arg in vars(args): 26 | table.add_row([arg, getattr(args, arg)]) 27 | print(table) 28 | 29 | class Measure(object): 30 | """ 31 | Manage runtimes for a specific code block. 32 | """ 33 | def __init__(self, name): 34 | self._measure = name 35 | self._is_measuring = False 36 | self._elapsed_time = 0 37 | 38 | def is_measuring(self): 39 | return self._is_measuring 40 | 41 | def start(self): 42 | self._stime = time() 43 | self._is_measuring = True 44 | 45 | def end(self): 46 | self._etime = time() 47 | self._elapsed_time += self._etime - self._stime 48 | self._is_measuring = False 49 | 50 | def get_elapsed_time(self): 51 | return self._elapsed_time 52 | 53 | def get_name(self): 54 | return self._measure 55 | 56 | 57 | class ExperimentTime(object): 58 | """ 59 | Manage time for different parts in an experiment. 
60 | """ 61 | def __init__(self): 62 | self._table = pd.DataFrame(columns=['Measurement', 'Elapsed Time']) 63 | self._pos = 0 64 | self.measure = {} 65 | 66 | def _append(self, name): 67 | self._table.loc[self._pos] = [name, self.measure[name].get_elapsed_time()] 68 | self._pos += 1 69 | 70 | def register(self, name): 71 | if name in self.measure: 72 | print("Measurement with same name previously added.") 73 | else: 74 | self.measure[name] = Measure(name) 75 | 76 | def measure_time(self, name): 77 | if self.measure[name].is_measuring(): 78 | self.measure[name].end() 79 | # Add time to the dataframe. 80 | self._append(name) 81 | else: 82 | self.measure[name].start() 83 | 84 | def get_measurements(self): 85 | return self._table 86 | -------------------------------------------------------------------------------- /output/README.md: -------------------------------------------------------------------------------- 1 | ### Output 2 | This folder contains all output from stdout & stderr when ```run_classifier.py``` is executed. When a cluster (like SLURM) is utilized, this folder contains SLURM outputs. 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argon2-cffi @ file:///tmp/build/80754af9/argon2-cffi_1596828496740/work 2 | attrs @ file:///tmp/build/80754af9/attrs_1598374659300/work 3 | backcall==0.2.0 4 | bleach==3.1.5 5 | boto==2.49.0 6 | boto3==1.14.52 7 | botocore==1.17.52 8 | brotlipy==0.7.0 9 | certifi==2020.6.20 10 | cffi==1.14.0 11 | chardet==3.0.4 12 | click==7.1.2 13 | colorlover==0.3.0 14 | cryptography @ file:///tmp/build/80754af9/cryptography_1598892041992/work 15 | cufflinks==0.17.3 16 | cycler==0.10.0 17 | Cython==0.29.17 18 | decorator==4.4.2 19 | defusedxml==0.6.0 20 | docutils==0.15.2 21 | entrypoints==0.3 22 | faiss==1.6.3 23 | filelock==3.0.12 24 | future==0.18.2 25 | gdown==3.12.2 26 | gensim==3.8.3 27 | idna @ file:///tmp/build/80754af9/idna_1593446292537/work 28 | importlib-metadata @ file:///tmp/build/80754af9/importlib-metadata_1593446433964/work 29 | ipykernel @ file:///tmp/build/80754af9/ipykernel_1596206602906/work/dist/ipykernel-5.3.4-py3-none-any.whl 30 | ipython @ file:///tmp/build/80754af9/ipython_1593447367857/work 31 | ipython-genutils==0.2.0 32 | ipywidgets==7.5.1 33 | jedi @ file:///tmp/build/80754af9/jedi_1598371618777/work 34 | Jinja2==2.11.2 35 | jmespath==0.10.0 36 | joblib==0.17.0 37 | json5==0.9.5 38 | jsonpatch==1.26 39 | jsonpointer==2.0 40 | jsonschema==3.2.0 41 | jupyter==1.0.0 42 | jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1594826976318/work 43 | jupyter-console @ file:///tmp/build/80754af9/jupyter_console_1598884538475/work 44 | jupyter-contrib-core==0.3.3 45 | jupyter-core==4.6.3 46 | jupyter-nbextensions-configurator==0.4.1 47 | jupyterlab==2.2.6 48 | jupyterlab-server @ file:///tmp/build/80754af9/jupyterlab_server_1594164409481/work 49 | kiwisolver==1.2.0 50 | llvmlite==0.33.0+1.g022ab0f 51 | logzero==1.5.0 52 | MarkupSafe==1.1.1 53 | matplotlib==3.1.3 54 | mistune==0.8.4 55 | mkl-fft==1.1.0 56 | mkl-random==1.1.1 57 | mkl-service==2.3.0 58 | nbconvert==5.6.1 59 | nbformat==5.0.7 60 | nltk @ file:///tmp/build/80754af9/nltk_1592496090529/work 61 | nmslib==2.0.6 62 | notebook @ file:///tmp/build/80754af9/notebook_1596838602091/work 63 | numba==0.50.1 64 | numpy==1.19.2 65 | packaging==20.4 66 | pandas==1.0.3 67 | pandocfilters==1.4.2 68 | parso==0.7.0 69 | pexpect==4.8.0 70 | 
pickleshare==0.7.5 71 | Pillow==7.2.0 72 | plotly==4.11.0 73 | prettytable==0.7.2 74 | prometheus-client==0.8.0 75 | prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1598885458782/work 76 | psutil==5.7.0 77 | ptyprocess==0.6.0 78 | pybind11==2.5.0 79 | pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work 80 | Pygments==2.6.1 81 | pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1594392929924/work 82 | pyparsing==2.4.7 83 | pyrsistent==0.16.0 84 | PySocks==1.7.1 85 | python-dateutil==2.8.1 86 | pytorch-model-summary==0.1.2 87 | pytz==2020.1 88 | PyYAML==5.3.1 89 | pyzmq==19.0.1 90 | qtconsole @ file:///tmp/build/80754af9/qtconsole_1598374667791/work 91 | QtPy==1.9.0 92 | regex @ file:///tmp/build/80754af9/regex_1596829710510/work 93 | requests @ file:///tmp/build/80754af9/requests_1592841827918/work 94 | retrying==1.3.3 95 | ruamel.yaml==0.16.12 96 | ruamel.yaml.clib==0.2.2 97 | s3transfer==0.3.3 98 | scikit-learn==0.22.1 99 | scipy==1.4.1 100 | Send2Trash==1.5.0 101 | six==1.15.0 102 | sklearn==0.0 103 | smart-open==2.1.1 104 | tabulate==0.8.7 105 | terminado==0.8.3 106 | testpath==0.4.4 107 | threadpoolctl==2.1.0 108 | torch==1.4.0 109 | torchfile==0.1.0 110 | torchsummary==1.5.1 111 | torchvision==0.5.0 112 | tornado==6.0.4 113 | tqdm @ file:///tmp/build/80754af9/tqdm_1596810128862/work 114 | traitlets==4.3.3 115 | urllib3 @ file:///tmp/build/80754af9/urllib3_1597086586889/work 116 | visdom==0.1.8.9 117 | wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work 118 | webencodings==0.5.1 119 | websocket-client==0.57.0 120 | widgetsnbextension==3.5.1 121 | xclib==0.96 122 | zipp==3.1.0 123 | -------------------------------------------------------------------------------- /train.slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is for GPU allocation is available. # SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --output=output/slurm-%A-%a.out 7 | #SBATCH --error=output/slurm-%A-%a.err 8 | 9 | # Set the environment. 10 | # source deactivate # Remove previous environments. 11 | source ~/anaconda3/etc/profile.d/conda.sh 12 | conda activate spp # Environment name. 13 | 14 | # Execute the code. 15 | set -o xtrace 16 | TASK_ID=$((SLURM_ARRAY_TASK_ID)) 17 | NAME=$1 18 | DATA_FILE=$2 19 | TR_SPLIT=$3 20 | TE_SPLIT=$4 21 | SAVE_MODEL=$5 22 | THRESHOLD=$6 23 | EXP_NAME=$7 24 | DIMS=$8 25 | BATCH_SIZE=$9 26 | TEST_BATCH_SIZE=${10} 27 | WITH_GRAD=${11} 28 | WITHOUT_NEGATIVE=${12} 29 | 30 | MODEL=("baseline" "spn") 31 | 32 | # Select the model. 33 | MODEL_TYPE=${MODEL[${TASK_ID}]} 34 | echo "Parameters: $NAME $DATA_FILE $TR_SPLIT $TE_SPLIT $SAVE_MODEL $THRESHOLD" 35 | echo " $MODEL_TYPE $EXP_NAME $DIMS $BATCH_SIZE $TEST_BATCH_SIZE" 36 | echo " ${WITH_GRAD} ${WITHOUT_NEGATIVE}" 37 | 38 | # Construct list of options. 
39 | OPTIONS="--th $THRESHOLD --debug" 40 | if [ "$MODEL_TYPE" == "baseline" ] 41 | then 42 | OPTIONS="${OPTIONS} --baseline" 43 | fi 44 | 45 | if [ "$WITH_GRAD" == "no-grad" ] 46 | then 47 | OPTIONS="${OPTIONS} --no-grad" 48 | fi 49 | 50 | if [ "${WITHOUT_NEGATIVE}" == "without-negative" ] 51 | then 52 | OPTIONS="${OPTIONS} --without-negative" 53 | fi 54 | 55 | python run_classifier.py --data-file $DATA_FILE \ 56 | --tr-split $TR_SPLIT \ 57 | --te-split $TE_SPLIT --spn-dim $DIMS \ 58 | --save $SAVE_MODEL --name ${NAME}_${MODEL_TYPE}_${EXP_NAME} \ 59 | --batch-size $BATCH_SIZE --test-batch-size $TEST_BATCH_SIZE \ 60 | $OPTIONS > $SAVE_MODEL/${NAME}_${MODEL_TYPE}_${EXP_NAME}.results -------------------------------------------------------------------------------- /xml-cnn/README.md: -------------------------------------------------------------------------------- 1 | # HRR-CNN 2 | This is a modified implementation of [XML-CNN](https://github.com/siddsax/XML-CNN) from this [repository](https://github.com/siddsax/XML-CNN) that uses HRR for labal representation and inference. The Pytorch implementation is of the paper [Deep Learning for Extreme Multi-label Text Classification](http://nyc.lti.cs.cmu.edu/yiming/Publications/jliu-sigir17.pdf) with dynamic pooling. 3 | 4 | ## List of changes to the Codebase. 5 | The XML-CNN codebase has been modified to with the following list of changes: 6 | 1. Retooled to use semantic pointers. The architecture can use HRRs to learn and infer labels. 7 | 2. Modifications to operate seamlessly with large datasets and models using a Pytorch dataset object. 8 | 3. The codebase also contains two scripts, i.e., ```experiments.sh``` and ```train.slurm.sh``` for execution of training and evaluation jobs on a SLURM enabled cluster. 9 | 10 | ### NOTE: Before running experiments, perform preprocessing as discussed [here](https://github.com/siddsax/XML-CNN). 11 | 12 | Example Execution with RCV Dataset 13 | ---------------------------------- 14 | To train the model with HRR. 
15 | ```bash 16 | EXP_NAME="test" 17 | PROP_A=0.55 18 | PROP_B=1.5 19 | python main.py --ds rcv1 --mn rcv1-${EXP_NAME}-hrr -a ${PROP_A} -b ${PROP_B} --model_type glove-bin --hrr_labels 20 | ``` 21 | 22 | To evaluate the model: 23 | ```bash 24 | python main.py --ds $NAME -a ${PROP_A} -b ${PROP_B} --model_type glove-bin --tr 0 --lm ../saved_models/rcv1-${EXP_NAME}-hrr/model_best_test --hrr_labels 25 | ``` 26 | 27 | References 28 | ---------- 29 | [Deep Learning for Extreme Multi-label Text Classification](http://nyc.lti.cs.cmu.edu/yiming/Publications/jliu-sigir17.pdf) -------------------------------------------------------------------------------- /xml-cnn/code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/xml-cnn/code/__init__.py -------------------------------------------------------------------------------- /xml-cnn/code/cnn_test.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from collections import OrderedDict 3 | from sklearn.metrics import log_loss 4 | from lib.metrics import compute_prop_metrics, display_metrics 5 | 6 | def test_class(test_loader, params, device, model=None, embedding_weights=None, 7 | verbose=True, propensity=None, topk=5): 8 | if(model == None): 9 | if(embedding_weights is None): 10 | print("Error: Embedding weights needed!") 11 | exit() 12 | else: 13 | model = xmlCNN(params, embedding_weights) 14 | model = load_model(model, params.load_model) 15 | 16 | if(torch.cuda.is_available()): 17 | params.dtype_f = torch.cuda.FloatTensor 18 | params.dtype_i = torch.cuda.LongTensor 19 | model = model.cuda() 20 | else: 21 | params.dtype_f = torch.FloatTensor 22 | params.dtype_i = torch.LongTensor 23 | 24 | # Testing data. 25 | loss = 0.0; prec = 0.0; num_batch = 0.0; all_acc = [] 26 | for i, (batch_x, batch_y) in enumerate(test_loader): 27 | # Load Data. 28 | batch_x = batch_x.type(torch.LongTensor).to(device) 29 | batch_y = batch_y.type(torch.FloatTensor).to(device) 30 | 31 | model.time['test_forward_pass'].start() 32 | e_emb = model.embedding_layer.forward(batch_x) 33 | s = model.classifier(e_emb) 34 | model.time['test_forward_pass'].end() 35 | 36 | model.time['inference'].start() 37 | if params.hrr_labels: 38 | batch_size = batch_y.size()[0] 39 | combined_y = model.classifier.inference(s, batch_size) 40 | y_pred = torch.abs(torch.mm(combined_y, model.classifier.class_vec.weight.t())).cpu().data[:, :batch_y.shape[1]].numpy() 41 | else: 42 | y_pred = s.cpu().data.numpy() 43 | model.time['inference'].end() # Measure forward pass during inference. 44 | 45 | # Measure. 
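The test loop above scores labels by unbinding the network output ```s``` with the positive role vector and comparing against every class vector. The toy sketch below replays that retrieval step outside the model; the class count, dimension, and label indices are made up, and it relies on the repository's ```lib``` helpers under the pinned ```torch==1.4``` FFT API (```torch.rfft```).

```python
# Toy replay of the retrieval step above (not part of cnn_test.py). Assumes the
# repo's lib/ is importable and the pinned torch==1.4 FFT API (torch.rfft).
import torch
from lib.mathops import circular_conv, get_appx_inv
from lib.embeddings import get_vectors

n_classes, hrr_dim, topk = 10, 256, 3        # made-up sizes
class_vec = get_vectors(n_classes, hrr_dim)  # (n_classes, hrr_dim)
p = get_vectors(1, hrr_dim)                  # positive role vector

# Pretend the network emitted s = p (*) (sum of the true label vectors).
true_labels = [2, 7]
s = circular_conv(p, class_vec[true_labels].sum(dim=0, keepdim=True))

y_hat = circular_conv(get_appx_inv(p), s)            # unbind, as inference() does
scores = torch.abs(torch.mm(y_hat, class_vec.t()))   # (1, n_classes)
values, indices = scores.topk(topk, dim=-1)
print(indices)  # the true labels 2 and 7 rank at the top
```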
46 | y_cpu = batch_y.cpu().data.numpy() 47 | loss += log_loss(y_cpu, y_pred) 48 | acc = compute_prop_metrics(sparse.csr_matrix(y_cpu), 49 | sparse.csr_matrix(y_pred), propensity, 50 | topk=topk) 51 | all_acc.append(acc) 52 | num_batch += 1 53 | 54 | loss /= num_batch 55 | print('Test Loss; Cross Entropy {};'.format(loss)) 56 | display_metrics(all_acc) 57 | return loss 58 | -------------------------------------------------------------------------------- /xml-cnn/code/cnn_train.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from cnn_test import * 3 | 4 | # --------------------------------------------------------------------------------- 5 | 6 | def train(train_loader, test_loader, embedding_weights, params, device, 7 | propensity=None): 8 | loss_best = float('Inf') 9 | bestTotalLoss = float('Inf') 10 | best_test_loss = float("inf") 11 | max_grad = 0 12 | num_mb = np.ceil(params.N/params.mb_size) 13 | model = xmlCNN(params, embedding_weights) 14 | if(torch.cuda.is_available()): 15 | print("--------------- Using GPU! ---------") 16 | model.params.dtype_f = torch.cuda.FloatTensor 17 | model.params.dtype_i = torch.cuda.LongTensor 18 | model = model.to(device) 19 | else: 20 | model.params.dtype_f = torch.FloatTensor 21 | model.params.dtype_i = torch.LongTensor 22 | print("=============== Using CPU =========") 23 | 24 | optimizer = optim.Adam(filter(lambda p: p.requires_grad,model.parameters()), lr=params.lr) 25 | print(model);print("%"*100) 26 | 27 | if params.dataparallel: 28 | model = nn.DataParallel(model) 29 | 30 | if(len(params.load_model)): 31 | params.model_name = params.load_model 32 | print(params.load_model) 33 | model, optimizer, init = load_model(model, params.load_model, optimizer=optimizer) 34 | else: 35 | init = 0 36 | 37 | # =============================== TRAINING ================================= 38 | for epoch in range(init, params.num_epochs): 39 | totalLoss = 0.0 40 | model.time['train'].start() 41 | model.time['data_load'].start() 42 | for i, (batch_x, batch_y) in enumerate(train_loader): 43 | model.time['data_load'].end() 44 | model.train() 45 | optimizer.zero_grad() 46 | 47 | # Load data to GPU. 48 | batch_x = batch_x.type(torch.LongTensor).to(device) 49 | if params.hrr_labels: 50 | batch_y = batch_y.type(torch.LongTensor).to(device) 51 | else: 52 | batch_y = batch_y.type(torch.FloatTensor).to(device) 53 | 54 | # Model forward. 55 | loss, output = model.forward(batch_x, batch_y) 56 | 57 | # ------------------------------------------------------------------ 58 | loss = loss.mean().squeeze() 59 | totalLoss += loss.data 60 | 61 | # NOTE: This block is not part of training. 62 | model.time['train'].end() 63 | if i % int(num_mb/12) == 0: 64 | print('Iter-{}; Loss: {:.4}; best_loss: {:.4}; max_grad: {}:'.format(i, loss.data, loss_best, max_grad)) 65 | if not os.path.exists('../saved_models/' + params.model_name ): 66 | os.makedirs('../saved_models/' + params.model_name) 67 | save_model(model, optimizer, epoch, params.model_name + "/model_best_batch") 68 | if(loss (for running experiments on a specific dataset). 7 | # 3. ./experiments.sh gather (for gathering the precision only). 8 | 9 | # Config. 10 | NAME=${1:-"all"} 11 | MEM=256000 12 | SAVE_LOC="results" 13 | EXP_NAME=${2:-"test"} 14 | MODEL_TYPE=${3:-"all"} 15 | DIMS=${4:-400} 16 | WITH_GRAD=${5:-"grad"} # no-grad for training with no gradient to p & n vectors. 17 | WITHOUT_NEGATIVE=${6:-"with-negative"} # without-negative for training. 
18 | PROP_A=${7:-"0.55"} # Propensity value A. 19 | PROP_B=${8:-"1.5"} # Propensity value B. 20 | 21 | create_job () { 22 | echo "Location to save model: $SAVE_LOC/$1 ..." 23 | if [[ ( "$MODEL_TYPE" == "all" ) ]]; then 24 | echo "Creating jobs for both models..." 25 | sbatch --job-name=$1-all --mem=$MEM --array=0-1 --exclude=node[17-32] train.slurm.sh \ 26 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 27 | elif [[ ( "$MODEL_TYPE" == "baseline" ) ]]; then 28 | echo "Creating jobs for baseline model..." 29 | sbatch --job-name=$1-base --mem=$MEM --array=0 --exclude=node[17-32] train.slurm.sh \ 30 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 31 | elif [[ ( "$MODEL_TYPE" == "hrr" ) ]]; then 32 | echo "Creating jobs for HRR model..." 33 | sbatch --job-name=$1-hrr --mem=$MEM --array=1 --exclude=node[17-32] train.slurm.sh \ 34 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 35 | fi 36 | } 37 | 38 | # NOTE: Individual jobs for each dataset are easier to track. 39 | # This keeps the SLURM files simple. 40 | 41 | # RCV1 dataset. 42 | if [[ ( "$NAME" == "rcv1" ) || ( "$NAME" == "all" ) ]] 43 | then 44 | create_job rcv1 $WITH_GRAD $WITHOUT_NEGATIVE 45 | fi 46 | 47 | # Eurlex dataset. 48 | if [[ ( "$NAME" == "eurlex" ) || ( "$NAME" == "all" ) ]] 49 | then 50 | create_job eurlex $WITH_GRAD $WITHOUT_NEGATIVE 51 | fi 52 | 53 | # Wiki30k dataset. 54 | if [[ ( "$NAME" == "wiki30k" ) || ( "$NAME" == "all" ) ]] 55 | then 56 | create_job wiki30k $WITH_GRAD $WITHOUT_NEGATIVE 57 | fi 58 | 59 | # Amazon12k dataset. 60 | if [[ ( "$NAME" == "amazon12k" ) || ( "$NAME" == "all" ) ]] 61 | then 62 | create_job amazon12k $WITH_GRAD $WITHOUT_NEGATIVE 63 | fi 64 | 65 | # Amazon12k dataset. 66 | if [[ ( "$NAME" == "amazon670K" ) || ( "$NAME" == "all" ) ]] 67 | then 68 | create_job amazon670K $WITH_GRAD $WITHOUT_NEGATIVE 69 | fi 70 | -------------------------------------------------------------------------------- /xml-cnn/code/header.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.autograd as autograd 4 | import torch.optim as optim 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import matplotlib.gridspec as gridspec 8 | import os 9 | from torch.autograd import Variable 10 | import sys 11 | import numpy as np 12 | sys.path.append('../utils/') 13 | sys.path.append('models') 14 | import data_helpers 15 | 16 | from w2v import * 17 | from embedding_layer import embedding_layer 18 | from cnn_encoder import cnn_encoder 19 | from sklearn import preprocessing 20 | from sklearn.decomposition import PCA 21 | import scipy.io as sio 22 | from scipy import sparse 23 | import argparse 24 | from visdom import Visdom 25 | from sklearn.externals import joblib 26 | from futils import * 27 | from loss import loss 28 | from xmlCNN import xmlCNN 29 | import timeit 30 | from precision_k import precision_k -------------------------------------------------------------------------------- /xml-cnn/code/lib: -------------------------------------------------------------------------------- 1 | ../../lib -------------------------------------------------------------------------------- /xml-cnn/code/main.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from cnn_train import * 3 | from cnn_test import * 4 | import pdb 5 | from lib.metrics import compute_inv_propensity 6 | from lib.utils import print_command_arguments 7 | 8 | # 
------------------------ Params ------------------------------------------------------------------------------- 9 | parser = argparse.ArgumentParser(description='Process some integers.') 10 | 11 | parser.add_argument('--zd', dest='Z_dim', type=int, default=100, help='Latent layer dimension') 12 | parser.add_argument('--mb', dest='mb_size', type=int, default=20, help='Size of minibatch, changing might result in latent layer variance overflow') 13 | parser.add_argument('--lr', dest='lr', type=float, default=0.001, help='Learning Rate') 14 | parser.add_argument('--p', dest='plot_flg', type=int, default=0, help='1 to plot, 0 to not plot') 15 | # parser.add_argument('--e', dest='num_epochs', type=int, default=50, help='step for displaying loss') 16 | parser.add_argument('--e', dest='num_epochs', type=int, default=2, help='step for displaying loss') 17 | parser.add_argument('--seed', type=int, default=100, metavar='S', help='random seed (default: 100)') 18 | parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') 19 | # parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') 20 | parser.add_argument('--batch-size', type=int, default=16, metavar='N', help='input batch size for training (default: 64)') 21 | parser.add_argument('-a', type=float, default=0.55, 22 | help='Inverse propensity value A (Default: 0.55).') 23 | parser.add_argument('-b', type=float, default=1.5, 24 | help='Inverse propensity value A (Default: 1.5).') 25 | 26 | parser.add_argument('--d', dest='disp_flg', type=int, default=0, help='display graphs') 27 | parser.add_argument('--sve', dest='save', type=int, default=1, help='save models or not') 28 | parser.add_argument('--ss', dest='save_step', type=int, default=10, help='gap between model saves') 29 | parser.add_argument('--mn', dest='model_name', type=str, default='', help='model name') 30 | parser.add_argument('--tr', dest='training', type=int, default=1, help='model name') 31 | parser.add_argument('--lm', dest='load_model', type=str, default="", help='model name') 32 | parser.add_argument('--ds', dest='data_set', type=str, default="rcv", help='dataset name') 33 | 34 | parser.add_argument('--pp', dest='pp_flg', type=int, default=0, help='1 is for min-max pp, 2 is for gaussian pp, 0 for none') 35 | parser.add_argument('--loss', dest='loss_type', type=str, default="BCELoss", help='Loss') 36 | 37 | parser.add_argument('--hidden_dims', type=int, default=512, help='hidden layer dimension') 38 | # parser.add_argument('--hidden_dims', type=int, default=1024, help='hidden layer dimension') # Amazon670K 39 | parser.add_argument('--sequence_length',help='max sequence length of a document', type=int,default=500) 40 | parser.add_argument('--embedding_dim', help='dimension of word embedding representation', type=int, default=300) 41 | parser.add_argument('--model_variation', help='model variation: CNN-rand or CNN-pretrain', type=str, default='pretrain') 42 | parser.add_argument('--pretrain_type', help='pretrain model: GoogleNews or glove', type=str, default='glove') 43 | parser.add_argument('--vocab_size', help='size of vocabulary keeping the most frequent words', type=int, default=30000) 44 | parser.add_argument('--drop_prob', help='Dropout probability', type=int, default=.3) 45 | parser.add_argument('--load_data', help='Load Data or not', type=int, default=0) 46 | parser.add_argument('--mg', dest='multi_gpu', type=int, default=0, help='1 for 2 gpus and 0 for normal') 47 | 
parser.add_argument('--filter_sizes', help='number of filter sizes (could be a list of integer)', type=int, default=[2, 4, 8], nargs='+') 48 | 49 | # Large Datasets. 50 | parser.add_argument('--num_filters', help='number of filters (i.e. kernels) in CNN model', type=int, default=32) 51 | parser.add_argument('--pooling_units', help='number of pooling units in 1D pooling layer', type=int, default=32) 52 | 53 | # Small Datasets. 54 | # parser.add_argument('--num_filters', help='number of filters (i.e. kernels) in CNN model', type=int, default=128) 55 | # parser.add_argument('--pooling_units', help='number of pooling units in 1D pooling layer', type=int, default=128) 56 | 57 | parser.add_argument('--pooling_type', help='max or average', type=str, default='max') 58 | parser.add_argument('--model_type', help='glove or GoogleNews', type=str, default='glove') 59 | parser.add_argument('--num_features', help='50, 100, 200, 300', type=int, default=300) 60 | parser.add_argument('--dropouts', help='0 for not using, 1 for using', type=int, default=0) 61 | parser.add_argument('--clip', help='gradient clipping', type=float, default=1000) 62 | # parser.add_argument('--clip', help='gradient clipping', type=float, default=2.0) 63 | parser.add_argument('--dataset_gpu', help='load dataset in full to gpu', type=int, default=1) 64 | parser.add_argument('--dp', dest='dataparallel', help='to train on multiple GPUs or not', type=bool, default=False) 65 | 66 | # HRR specific arguments. 67 | parser.add_argument('--hrr_labels', action='store_true', default=False, help='Use HRR Labels.') 68 | parser.add_argument('--hrr_dim', type=int, default=400, help='HRR Label Dimension.') 69 | parser.add_argument('--no-grad', action='store_true', default=False, 70 | help='Update Label vectors.') 71 | parser.add_argument('--without-negative', action='store_true', default=False, 72 | help='disable negative loss.') 73 | 74 | params = parser.parse_args() 75 | print_command_arguments(params) 76 | 77 | if(len(params.model_name)==0): 78 | params.model_name = "Gen_data_CNN_Z_dim-{}_mb_size-{}_hidden_dims-{}_preproc-{}_loss-{}_sequence_length-{}_embedding_dim-{}_params.vocab_size={}".format(params.Z_dim, params.mb_size, params.hidden_dims, params.pp_flg, params.loss_type, params.sequence_length, params.embedding_dim, params.vocab_size) 79 | 80 | print('Saving Model to: ' + params.model_name) 81 | 82 | # Begin. 83 | torch.backends.cudnn.deterministic = True 84 | torch.backends.cudnn.benchmark = False 85 | use_cuda = not params.no_cuda and torch.cuda.is_available() 86 | torch.manual_seed(params.seed) 87 | np.random.seed(params.seed) 88 | device = torch.device("cuda" if use_cuda else "cpu") 89 | 90 | if use_cuda: 91 | kwargs = {'num_workers': 16, 'pin_memory': True, 'drop_last': True, 92 | 'batch_size': params.batch_size, 'shuffle': True} 93 | else: 94 | kwargs = {'drop_last': True, 'num_workers': 8, 95 | 'batch_size': params.batch_size, 'shuffle': True} 96 | 97 | # ------------------ data ---------------------------------------------- 98 | params.data_path = '../datasets/' + params.data_set 99 | 100 | # Create training and test data loaders. 101 | train_dataset = XMLDataset(params) 102 | print("-----------Training Dataset Statistics-----------") 103 | print("Features: {}".format(train_dataset.features.shape)) 104 | print("Labels: {}".format(train_dataset.labels.shape)) 105 | 106 | # Compute Propensity Scores. 107 | inv_propen = compute_inv_propensity(train_dataset.labels, A=params.a, B=params.b) 108 | 109 | # Create dataloader. 
110 | train_loader = torch.utils.data.DataLoader(train_dataset, **kwargs) 111 | 112 | test_dataset = XMLDataset(params, train=False) 113 | print("-----------Testing Dataset Statistics------------") 114 | print("Features: {}".format(test_dataset.features.shape)) 115 | print("Labels: {}".format(test_dataset.labels.shape)) 116 | test_loader = torch.utils.data.DataLoader(test_dataset, **kwargs) 117 | 118 | params = update_params(params) 119 | # ----------------------- Loss ------------------------------------------------ 120 | if not params.hrr_labels: 121 | params.loss_fn = torch.nn.BCELoss(size_average=False) 122 | 123 | # -------------------------- Params -------------------------------------------- 124 | if params.model_variation == 'pretrain': 125 | embedding_weights = load_word2vec(params) 126 | else: 127 | embedding_weights = None 128 | 129 | if torch.cuda.is_available(): 130 | params.dtype = torch.cuda.FloatTensor 131 | else: 132 | params.dtype = torch.FloatTensor 133 | 134 | 135 | if(params.training): 136 | train(train_loader, test_loader, embedding_weights, params, device, 137 | propensity=inv_propen) 138 | else: 139 | test_class(test_loader, params, model=model, device=device, verbose=False, 140 | propensity=inv_propen) 141 | -------------------------------------------------------------------------------- /xml-cnn/code/models/classifier.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | class classifier(nn.Module): 3 | def __init__(self, params): 4 | super(classifier, self).__init__() 5 | self.params = params 6 | if(self.params.dropouts): 7 | self.drp = nn.Dropout(.5) 8 | self.l1 = nn.Linear(params.h_dim, params.H_dim) 9 | self.l2 = nn.Linear(params.H_dim, params.y_dim) 10 | self.relu = nn.ReLU() 11 | self.sigmoid = nn.Sigmoid() 12 | torch.nn.init.xavier_uniform_(self.l1.weight) 13 | 14 | def forward(self, H): 15 | H = self.l1(H) 16 | H = self.relu(H) 17 | H = self.l2(H) 18 | H = self.sigmoid(H) 19 | return H -------------------------------------------------------------------------------- /xml-cnn/code/models/cnn_encoder.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from lib.embeddings import get_vectors 3 | from lib.mathops import get_appx_inv, circular_conv, complexMagProj 4 | 5 | def out_size(l_in, kernel_size, padding=0, dilation=1, stride=1): 6 | a = l_in + 2*padding - dilation*(kernel_size - 1) - 1 7 | b = int(a/stride) 8 | return b + 1 9 | 10 | class cnn_encoder(torch.nn.Module): 11 | 12 | def __init__(self, params): 13 | super(cnn_encoder, self).__init__() 14 | self.params = params 15 | self.conv_layers = nn.ModuleList() 16 | self.pool_layers = nn.ModuleList() 17 | fin_l_out_size = 0 18 | 19 | if(params.dropouts): 20 | self.drp = nn.Dropout(p=.25) 21 | self.drp5 = nn.Dropout(p=.5) 22 | 23 | for fsz in params.filter_sizes: 24 | l_out_size = out_size(params.sequence_length, fsz, stride=2) 25 | pool_size = l_out_size // params.pooling_units 26 | l_conv = nn.Conv1d(params.embedding_dim, params.num_filters, fsz, stride=2) 27 | torch.nn.init.xavier_uniform_(l_conv.weight) 28 | if params.pooling_type == 'average': 29 | l_pool = nn.AvgPool1d(pool_size, stride=None, count_include_pad=True) 30 | pool_out_size = (int((l_out_size - pool_size)/pool_size) + 1)*params.num_filters 31 | elif params.pooling_type == 'max': 32 | l_pool = nn.MaxPool1d(2, stride=1) 33 | pool_out_size = (int(l_out_size*params.num_filters - 2) + 1) 34 | fin_l_out_size += pool_out_size 
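# Worked example with the defaults in main.py (sequence_length=500, stride=2,
# 'max' pooling, num_filters=32 -- illustrative numbers only): for fsz=2,
# l_out_size = out_size(500, 2, stride=2) = 250 and
# pool_out_size = int(250 * 32 - 2) + 1 = 7999, which is what gets accumulated
# into fin_l_out_size and later sizes fc_layer_1.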
35 | 36 | self.conv_layers.append(l_conv) 37 | self.pool_layers.append(l_pool) 38 | 39 | self.fc_layer_1 = nn.Linear(fin_l_out_size, params.hidden_dims) 40 | torch.nn.init.xavier_uniform_(self.fc_layer_1.weight) 41 | 42 | # NOTE: Comment out fc2 and fc3 for Amazon670K 43 | self.fc_layer_2 = nn.Linear(params.hidden_dims, params.hidden_dims) 44 | torch.nn.init.xavier_uniform_(self.fc_layer_2.weight) 45 | 46 | self.fc_layer_3 = nn.Linear(params.hidden_dims, params.hidden_dims) 47 | torch.nn.init.xavier_uniform_(self.fc_layer_3.weight) 48 | ### 49 | 50 | if params.hrr_labels: 51 | self.out_layer = nn.Linear(params.hidden_dims, params.hrr_dim) 52 | self.create_label_embedding() # Create the labels. 53 | else: 54 | self.out_layer = nn.Linear(params.hidden_dims, params.y_dim) 55 | 56 | torch.nn.init.xavier_uniform_(self.out_layer.weight) 57 | 58 | 59 | def create_label_embedding(self): 60 | # Class labels. # +1 for the END of LIST Label. 61 | self._class_vectors = get_vectors(self.params.y_dim + 1, self.params.hrr_dim) 62 | 63 | # Initialize embedding layer. 64 | self.class_vec = nn.Embedding(self.params.y_dim + 1, self.params.hrr_dim) 65 | self.class_vec.load_state_dict({'weight': self._class_vectors}) 66 | self.class_vec.weight.requires_grad = False 67 | 68 | # Initialize weights vector. 69 | weights = torch.ones((self.params.y_dim + 1, 1), dtype=torch.int8) 70 | weights[self.params.y_dim] = 0 # Padding vector is made 0. 71 | self.class_weights = nn.Embedding(self.params.y_dim + 1, 1) 72 | self.class_weights.load_state_dict({'weight': weights}) 73 | self.class_weights.weight.requires_grad = False 74 | 75 | # P & N vectors. 76 | p_n_vec = get_vectors(2, self.params.hrr_dim, ortho=True) 77 | if self.params.no_grad: 78 | print("P & N vectors WILL NOT be updated while training...") 79 | self.p = nn.Parameter(p_n_vec[0], requires_grad=False) 80 | self.n = nn.Parameter(p_n_vec[1], requires_grad=False) 81 | else: 82 | print("P & N vectors WILL be updated while training...") 83 | self.p = nn.Parameter(p_n_vec[0], requires_grad=True) 84 | self.n = nn.Parameter(p_n_vec[1], requires_grad=True) 85 | 86 | 87 | def inference(self, s, batch_size, positive=True): 88 | #(batch, dims) 89 | if positive: 90 | vec = self.p.unsqueeze(0).expand(batch_size, self.params.hrr_dim) 91 | else: 92 | vec = self.n.unsqueeze(0).expand(batch_size, self.params.hrr_dim) 93 | 94 | # vec = complexMagProj(vec) 95 | inv_vec = get_appx_inv(vec) 96 | y = circular_conv(inv_vec, s) #(batch, dims) 97 | y = y / (torch.norm(y, dim=-1, keepdim=True) + 1e-8) 98 | return y 99 | 100 | def spp_loss(self, s, target): 101 | """ 102 | Train with SPP. 103 | """ 104 | pos_classes = self.class_vec(target) #(batch, no_label, dims) 105 | pos_classes = pos_classes * self.class_weights(target) 106 | 107 | # Normalize the class vectors. 108 | # tgt_shape = pos_classes.shape 109 | # pos_classes = torch.reshape(pos_classes, (tgt_shape[0] * tgt_shape[1], 110 | # tgt_shape[2])) 111 | # pos_classes = torch.reshape(complexMagProj(pos_classes), (tgt_shape[0], tgt_shape[1], 112 | # tgt_shape[2])) 113 | 114 | # Remove the padding idx vectors. 115 | # pos_classes = pos_classes.to(device) 116 | 117 | # Positive prediction loss 118 | convolve = self.inference(s, target.size(0)) 119 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 120 | J_p = torch.mean(torch.sum(1 - torch.abs(cosine), dim=-1)) 121 | 122 | # Negative prediction loss. 
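# (Note: the positive term above drives |cos| between each true label vector
# and the p-unbound output toward 1, while the term below drives |cos| between
# the same label vectors and the n-unbound output toward 0, i.e. unbinding s
# with n should not recover the positive labels.)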
123 | J_n = 0.0 124 | if self.params.without_negative is False: 125 | convolve = self.inference(s, target.size(0), positive=False) 126 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 127 | J_n = torch.mean(torch.sum(torch.abs(cosine), dim=-1)) 128 | 129 | # Total Loss. 130 | loss = J_n + J_p 131 | return loss 132 | 133 | 134 | def forward(self, inputs): 135 | #o0 = self.drp(self.bn_1(inputs)).permute(0,2,1) 136 | o0 = inputs.permute(0,2,1)# self.bn_1(inputs.permute(0,2,1)) 137 | if(self.params.dropouts): 138 | o0 = self.drp(o0) 139 | conv_out = [] 140 | 141 | for i in range(len(self.params.filter_sizes)): 142 | o = self.conv_layers[i](o0) 143 | o = o.view(o.shape[0], 1, o.shape[1] * o.shape[2]) 144 | o = self.pool_layers[i](o) 145 | o = nn.functional.relu(o) 146 | o = o.view(o.shape[0],-1) 147 | conv_out.append(o) 148 | del o 149 | if len(self.params.filter_sizes)>1: 150 | o = torch.cat(conv_out,1) 151 | else: 152 | o = conv_out[0] 153 | 154 | # Additional fully connected layers added to the model. 155 | o = self.fc_layer_1(o) 156 | o = nn.functional.relu(o) 157 | 158 | # NOTE: Comment out fc2 and fc3 for Amazon670K 159 | o = self.fc_layer_2(o) 160 | o = nn.functional.relu(o) 161 | 162 | o = self.fc_layer_3(o) 163 | o = nn.functional.relu(o) 164 | ### 165 | 166 | if(self.params.dropouts): 167 | o = self.drp5(o) 168 | o = self.out_layer(o) 169 | 170 | if not self.params.hrr_labels: 171 | o = torch.sigmoid(o) 172 | 173 | return o 174 | -------------------------------------------------------------------------------- /xml-cnn/code/models/embedding_layer.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | 3 | class embedding_layer(torch.nn.Module): 4 | 5 | def __init__(self, params, embedding_weights): 6 | super(embedding_layer, self).__init__() 7 | self.l = nn.Embedding(params.vocab_size, params.embedding_dim) 8 | if params.model_variation == 'pretrain': 9 | self.l.weight.data.copy_(torch.from_numpy(embedding_weights)) 10 | self.l.weight.requires_grad=False 11 | 12 | def forward(self, inputs): 13 | o = self.l(inputs) 14 | return o 15 | -------------------------------------------------------------------------------- /xml-cnn/code/models/header.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.autograd as autograd 4 | import torch.optim as optim 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import matplotlib.gridspec as gridspec 8 | import os 9 | from torch.autograd import Variable 10 | import sys 11 | import numpy as np 12 | sys.path.append('../../utils/') 13 | sys.path.append('models/') 14 | import data_helpers 15 | 16 | from w2v import * 17 | from embedding_layer import embedding_layer 18 | from sklearn import preprocessing 19 | from sklearn.decomposition import PCA 20 | import scipy.io as sio 21 | from scipy import sparse 22 | import argparse 23 | from visdom import Visdom 24 | from sklearn.externals import joblib 25 | from futils import * 26 | from loss import loss 27 | -------------------------------------------------------------------------------- /xml-cnn/code/models/xmlCNN.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from cnn_encoder import cnn_encoder 3 | from lib.utils import Measure 4 | 5 | class xmlCNN(nn.Module): 6 | def __init__(self, params, embedding_weights): 7 | super(xmlCNN, self).__init__() 8 | 
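# xmlCNN is a thin wrapper: token ids -> embedding_layer -> cnn_encoder.
# With --hrr_labels the objective is the encoder's spp_loss over HRR label
# vectors; otherwise params.loss_fn (binary cross-entropy over the label
# indicator matrix) is used.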
self.params = params
9 | self.embedding_layer = embedding_layer(params, embedding_weights)
10 | self.classifier = cnn_encoder(params)
11 |
12 | if params.hrr_labels:
13 | self.loss = self.classifier.spp_loss
14 | else:
15 | self.loss = self.params.loss_fn
16 |
17 | # Create measurements.
18 | self.time = {
19 | 'train': Measure("Train"),
20 | 'train_forward_pass': Measure("Train Forward Pass"),
21 | 'train_loss': Measure("Train Loss"),
22 | 'optimization': Measure("Optimization"),
23 | 'test_forward_pass': Measure("Test Forward Pass"),
24 | 'inference': Measure("Inference"),
25 | 'data_load': Measure("Data Loader"),
26 | }
27 |
28 | def forward(self, batch_x, batch_y):
29 | # ----------- Encode (X, Y) --------------------------------------------
30 | self.time['train_forward_pass'].start()
31 | e_emb = self.embedding_layer.forward(batch_x)
32 | Y = self.classifier.forward(e_emb)
33 | self.time['train_forward_pass'].end()
34 |
35 | # Compute time for loss.
36 | self.time['train_loss'].start()
37 | loss = self.loss(Y, batch_y)
38 | self.time['train_loss'].end()
39 |
40 | if(loss < 0):
41 | print(loss)
42 | print(Y[0:100])
43 | print(batch_y[0:100])
44 | sys.exit()
45 |
46 | return loss.view(-1,1), Y
47 |
--------------------------------------------------------------------------------
/xml-cnn/code/precision_k.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.io as sio
3 | def precision_k(true_mat, score_mat, k):
4 | p = np.zeros((k, 1))
5 | rank_mat = np.argsort(score_mat)
6 | backup = np.copy(score_mat)
7 | for k in range(k):
8 | score_mat = np.copy(backup)
9 | for i in range(rank_mat.shape[0]):
10 | score_mat[i][rank_mat[i, :-(k+1)]] = 0
11 |
12 | score_mat = np.ceil(score_mat)
13 | kk = np.argwhere(score_mat > 0)
14 | mat = np.multiply(score_mat, true_mat)
15 | num = np.sum(mat, axis=1)
16 | p[k] = np.mean(num/(k+1))
17 |
18 | return np.around(p, decimals=4)
19 |
--------------------------------------------------------------------------------
/xml-cnn/code/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # set -vx
3 |
4 | DATASET=${1:-"eurlex"}
5 | USE_HRR=${2:-"False"}
6 | EXP_NAME=${3:-"test"}
7 | PARALLELIZE=${4:-"False"}
8 |
9 | OPTIONS=""
10 | if [[ "$PARALLELIZE" == "True" ]]
11 | then
12 | echo "Train WITH data parallelism..."
13 | OPTIONS="$OPTIONS --dp 1"
14 | fi
15 |
16 | # Build name for HRR and basic baseline models.
17 | if [[ "$USE_HRR" == "False" ]]
18 | then
19 | echo "Train WITHOUT HRR representations.."
20 | elif [[ "$USE_HRR" == "True" ]]
21 | then
22 | echo "Train WITH HRR representations.."
23 | OPTIONS="$OPTIONS --hrr_labels"
24 | fi
25 |
26 | echo "OPTIONS: $OPTIONS"
27 | python main.py --ds $DATASET --mn $DATASET --model_type glove-bin $OPTIONS > ../results/${DATASET}.results
28 |
29 | # Test the model.
30 | echo "Test Results..."
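# Evaluate the checkpoint written by the training run above; --tr 0 presumably
# switches main.py into test-only mode and --lm supplies the saved model path.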
31 | python main.py --ds $DATASET --model_type glove-bin --tr 0 --lm ../saved_models/$DATASET/model_best_test >> ../results/${DATASET}.results 32 | -------------------------------------------------------------------------------- /xml-cnn/code/test_manik.m: -------------------------------------------------------------------------------- 1 | addpath('/scratch/work/saxenas2/fastxml/manik/Tools/matlab/') 2 | addpath('/scratch/work/saxenas2/fastxml/manik/tools/') 3 | addpath('/scratch/work/saxenas2/fastxml/manik/Tools/metrics/') 4 | addpath('/scratch/work/saxenas2/fastxml/manik/FastXML/') 5 | 6 | A = .55; 7 | B = 1.5; 8 | 9 | load score_matrix.mat 10 | [I, J, S] = find(score_matrix); 11 | [sorted_I, idx] = sort(I); 12 | J = J(idx); 13 | S = S(idx); 14 | score_matrix = sparse(J, sorted_I, S); 15 | 16 | load ty.mat 17 | [I, J, S] = find(ty); 18 | [sorted_I, idx] = sort(I); 19 | J = J(idx); 20 | S = S(idx); 21 | ty = sparse(J, sorted_I, S); 22 | ip = inv_propensity(ty,A,B); 23 | 24 | [metrics] = get_all_metrics(score_matrix , ty, ip) 25 | disp(metrics) 26 | 27 | % -------- For RCV1 His neural net-------- 28 | 29 | % prec 96.58 89.82 79.66 65.28 55.15 30 | % nDCG 96.58 92.51 90.96 91.01 91.46 31 | % prec_wt 86.22 86.25 87.38 87.70 88.48 32 | % nDCG_wt 86.22 86.24 87.00 87.21 87.65 33 | 34 | % ----------------------------------------- 35 | 36 | % prec 93.26 86.08 75.64 62.28 52.79 37 | % nDCG 93.26 88.84 86.81 87.18 87.84 38 | % prec_wt 73.04 76.45 78.40 80.02 81.59 39 | % nDCG_wt 73.04 75.62 77.04 78.06 78.96 40 | 41 | % prec 95.50 87.29 76.72 63.20 53.59 42 | % nDCG 95.50 90.29 88.17 88.53 89.18 43 | % prec_wt 72.24 76.67 79.44 81.27 82.96 44 | % nDCG_wt 72.24 75.59 77.59 78.76 79.73 45 | 46 | 47 | % ---------- Initialized weights with Dropouts ------------- 48 | % Best for test ------------------- 49 | % prec 94.06 84.04 73.35 60.90 51.89 50 | % nDCG 94.06 87.45 84.92 85.63 86.51 51 | % prec_wt 70.89 73.01 74.81 77.17 79.28 52 | % nDCG_wt 70.89 72.50 73.76 75.21 76.40 53 | 54 | % Best for train ------------------- 55 | % prec 93.62 84.88 74.66 61.41 52.02 56 | % nDCG 93.62 88.00 86.00 86.34 86.98 57 | % prec_wt 71.90 75.07 77.10 78.54 80.01 58 | % nDCG_wt 71.90 74.30 75.76 76.67 77.52 59 | 60 | 61 | % ---------------- base_model_with_test_saving_after_each_run ------ 62 | % model_best_batch 63 | % prec 94.49 86.20 75.71 62.40 52.84 64 | % nDCG 94.49 89.23 87.11 87.53 88.16 65 | % prec_wt 72.40 76.11 78.32 80.02 81.60 66 | % nDCG_wt 72.40 75.21 76.81 77.88 78.79 67 | 68 | % model_best_for_test 69 | % prec 94.98 86.05 75.65 62.45 53.06 70 | % nDCG 94.98 89.21 87.08 87.54 88.29 71 | % prec_wt 71.91 75.42 77.85 79.69 81.58 72 | % nDCG_wt 71.91 74.57 76.30 77.47 78.54 73 | 74 | 75 | % --------------- L1 loss ---------------------- 76 | model_best_for_test 77 | bad!!! 78 | 79 | model_best_batch 80 | bad!!! 81 | 82 | % ------------------ Ablation -------------- 83 | % prec 94.59 87.66 77.32 63.61 53.84 84 | % nDCG 94.59 90.38 88.51 88.81 89.37 85 | % prec_wt 74.26 77.99 80.16 81.68 83.20 86 | % nDCG_wt 74.26 77.08 78.66 79.63 80.50 -------------------------------------------------------------------------------- /xml-cnn/code/train.slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is for GPU allocation is available. #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --output=output/slurm-%x-%a.out 7 | #SBATCH --error=output/slurm-%x-%a.err 8 | 9 | # Set the environment. 
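# NOTE: "spp" below is a site-specific conda environment name; adjust it to your
# setup. The script assumes it is submitted as a two-element array job (e.g.
# `sbatch --array=0-1 ...`) so that SLURM_ARRAY_TASK_ID can pick the "baseline"
# or "hrr" configuration further down.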
10 | # source deactivate # Remove previous environments. 11 | source ~/anaconda3/etc/profile.d/conda.sh 12 | conda activate spp # Environment name. 13 | 14 | # Execute the code. 15 | set -o xtrace 16 | TASK_ID=$((SLURM_ARRAY_TASK_ID)) 17 | NAME=$1 18 | SAVE_MODEL=$2 19 | EXP_NAME=$3 20 | DIMS=$4 21 | WITH_GRAD=${5} 22 | WITHOUT_NEGATIVE=${6} 23 | PROP_A=${7:-"0.55"} # Propensity value A. For Amazon-670K it is 0.6 24 | PROP_B=${8:-"1.5"} # Propensity value B. For Amazon-670K it is 2.6 25 | MODEL=("baseline" "hrr") 26 | 27 | # Select the model. 28 | MODEL_TYPE=${MODEL[${TASK_ID}]} 29 | # FIN_EXP_NAME=${NAME}-${EXP_NAME}-${MODEL_TYPE}-${DIMS}-${WITH_GRAD}-${WITHOUT_NEGATIVE} 30 | FIN_EXP_NAME=${NAME}-${EXP_NAME}-${MODEL_TYPE} 31 | echo "Parameters: $NAME $SAVE_MODEL" 32 | echo " $MODEL_TYPE $EXP_NAME $DIMS" 33 | echo " ${WITH_GRAD} ${WITHOUT_NEGATIVE}" 34 | 35 | # Construct list of options. 36 | OPTIONS="" 37 | if [ "$MODEL_TYPE" == "hrr" ] 38 | then 39 | OPTIONS="${OPTIONS} --hrr_labels" 40 | NAME="${NAME}_hrr" 41 | fi 42 | 43 | if [ "$WITH_GRAD" == "no-grad" ] 44 | then 45 | OPTIONS="${OPTIONS} --no-grad" 46 | fi 47 | 48 | if [ "${WITHOUT_NEGATIVE}" == "without-negative" ] 49 | then 50 | OPTIONS="${OPTIONS} --without-negative" 51 | fi 52 | 53 | # Train the the models. 54 | # --dp 1 for data parallel option. 55 | echo "OPTIONS: $OPTIONS" 56 | python main.py --ds $NAME --mn $FIN_EXP_NAME -a ${PROP_A} -b ${PROP_B} --model_type glove-bin $OPTIONS > ../results/${FIN_EXP_NAME}.results 57 | 58 | # Test the model. 59 | echo "Test Results..." 60 | python main.py --ds $NAME -a ${PROP_A} -b ${PROP_B} --model_type glove-bin --tr 0 --lm ../saved_models/$FIN_EXP_NAME/model_best_test $OPTIONS >> ../results/${FIN_EXP_NAME}.results 61 | -------------------------------------------------------------------------------- /xml-cnn/data/README.md: -------------------------------------------------------------------------------- 1 | # AttentionXML 2 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) 3 | 4 | ## Requirements 5 | 6 | * python==3.7.4 7 | * click==7.0 8 | * ruamel.yaml==0.16.5 9 | * numpy==1.16.2 10 | * scipy==1.3.1 11 | * scikit-learn==0.21.2 12 | * gensim==3.4.0 13 | * torch==1.0.1 14 | * nltk==3.4 15 | * tqdm==4.31.1 16 | * joblib==0.13.2 17 | * logzero==1.5.0 18 | 19 | ## Datasets 20 | 21 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 22 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 23 | * [AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 24 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 25 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 26 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 27 | 28 | Download the GloVe embedding (840B,300d) and convert it to gensim format (which can be loaded by **gensim.models.KeyedVectors.load**). 29 | 30 | We also provide a converted GloVe embedding at [here](https://drive.google.com/file/d/10w_HuLklGc8GA_FtUSdnHT8Yo1mxYziP/view?usp=sharing). 
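If you prefer to do the conversion yourself, a minimal sketch with gensim is shown below (file names are placeholders; assumes gensim 3.x, whose `glove2word2vec` script ships with the library):

```python
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Rewrite the raw GloVe text file into word2vec format (adds the header line).
glove2word2vec('glove.840B.300d.txt', 'glove.840B.300d.w2v.txt')

# Load the vectors and save them in gensim's native format, which
# gensim.models.KeyedVectors.load can read back directly.
kv = KeyedVectors.load_word2vec_format('glove.840B.300d.w2v.txt')
kv.save('glove.840B.300d.gensim')
```

The resulting `glove.840B.300d.gensim` file is what the `--w2v-model` flag in the preprocessing commands below expects.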
31 |
32 | ## XML Experiments
33 |
34 | The XML experiments from the paper can be run directly, for example:
35 | ```bash
36 | ./scripts/run_eurlex.sh
37 | ```
38 | ## Preprocess
39 |
40 | Run preprocess.py on the train and test datasets with tokenized texts as follows:
41 | ```bash
42 | python preprocess.py \
43 | --text-path data/EUR-Lex/train_texts.txt \
44 | --label-path data/EUR-Lex/train_labels.txt \
45 | --vocab-path data/EUR-Lex/vocab.npy \
46 | --emb-path data/EUR-Lex/emb_init.npy \
47 | --w2v-model data/glove.840B.300d.gensim
48 |
49 | python preprocess.py \
50 | --text-path data/EUR-Lex/test_texts.txt \
51 | --label-path data/EUR-Lex/test_labels.txt \
52 | --vocab-path data/EUR-Lex/vocab.npy
53 | ```
54 |
55 | Or run preprocess.py, tokenizing the raw texts with NLTK first, as follows:
56 | ```bash
57 | python preprocess.py \
58 | --text-path data/Wiki10-31K/train_raw_texts.txt \
59 | --tokenized-path data/Wiki10-31K/train_texts.txt \
60 | --label-path data/Wiki10-31K/train_labels.txt \
61 | --vocab-path data/Wiki10-31K/vocab.npy \
62 | --emb-path data/Wiki10-31K/emb_init.npy \
63 | --w2v-model data/glove.840B.300d.gensim
64 |
65 | python preprocess.py \
66 | --text-path data/Wiki10-31K/test_raw_texts.txt \
67 | --tokenized-path data/Wiki10-31K/test_texts.txt \
68 | --label-path data/Wiki10-31K/test_labels.txt \
69 | --vocab-path data/Wiki10-31K/vocab.npy
70 | ```
71 |
72 |
73 | ## Train and Predict
74 |
75 | Train and predict as follows:
76 | ```bash
77 | python main.py --data-cnf configure/datasets/EUR-Lex.yaml --model-cnf configure/models/AttentionXML-EUR-Lex.yaml
78 | ```
79 |
80 | Or do prediction only with the option `--mode eval`.
81 |
82 | ## Ensemble
83 |
84 | Train and predict with an ensemble:
85 | ```bash
86 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 0
87 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 1
88 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 2
89 | python ensemble.py -p results/FastAttentionXML-Wiki-500K -t 3
90 | ```
91 |
92 | ## Evaluation
93 |
94 | ```bash
95 | python evaluation.py --results results/AttentionXML-EUR-Lex-labels.npy --targets data/EUR-Lex/test_labels.npy
96 | ```
97 | Or get propensity-scored metrics as well:
98 |
99 | ```bash
100 | python evaluation.py \
101 | --results results/FastAttentionXML-Amazon-670K-labels.npy \
102 | --targets data/Amazon-670K/test_labels.npy \
103 | --train-labels data/Amazon-670K/train_labels.npy \
104 | -a 0.6 \
105 | -b 2.6
106 |
107 | ```
108 |
109 | ## Reference
110 | You et al., [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727), NeurIPS 2019
111 |
112 | ## Declaration
113 | It is free for non-commercial use. For commercial use, please contact Mr. Ronghui You and Prof. Shanfeng Zhu (zhusf@fudan.edu.cn).
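## Propensity Scoring (Illustrative)

The propensity model behind the `-a`/`-b` flags in the Evaluation section above follows Jain et al. (2016). A rough numpy sketch of the per-label inverse propensities (illustrative only, not the actual evaluation.py implementation):

```python
import numpy as np
from scipy.sparse import csr_matrix

def inverse_propensity(train_labels: csr_matrix, a: float = 0.55, b: float = 1.5) -> np.ndarray:
    """Per-label inverse propensities 1/p_l, with p_l = 1 / (1 + C * (N_l + B)^-A)."""
    n = train_labels.shape[0]                            # number of training points
    freq = np.asarray(train_labels.sum(axis=0)).ravel()  # label frequencies N_l
    c = (np.log(n) - 1.0) * (b + 1.0) ** a
    return 1.0 + c * (freq + b) ** (-a)
```

Propensity-scored precision@k weights every correctly predicted label by its inverse propensity, so rare (tail) labels count for more; the label frequencies N_l come from the training set, which is why `--train-labels` must be passed in addition to the predictions and targets.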
-------------------------------------------------------------------------------- /xml-cnn/embedding_weights/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /xml-cnn/utils/data_dive.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # import torch 4 | import timeit 5 | import argparse 6 | import numpy as np 7 | import time 8 | # import torch.nn as nn 9 | # import torch.optim as optim 10 | import matplotlib.pyplot as plt 11 | # import torch.autograd as autograd 12 | from sklearn import preprocessing 13 | # from torch.autograd import Variable 14 | from sklearn.decomposition import PCA 15 | import matplotlib.gridspec as gridspec 16 | 17 | # this file is to explore the generated data and the data that already exist to see how much similarity do they share. 18 | # It prits some stats and qualitative results 19 | 20 | new_data_x_file = "../datasets/Gen_data_Z_dim-200_mb_size-100_h_dim-600_preproc-1_beta-1.01_final_ly-Sigmoid_loss-BCELoss/new_x.npy" 21 | new_data_y_file = "../datasets/Gen_data_Z_dim-200_mb_size-100_h_dim-600_preproc-1_beta-1.01_final_ly-Sigmoid_loss-BCELoss/new_y.npy" 22 | actual_data_x_file = "../datasets/Eurlex/eurlex_docs/x_tr.npy" 23 | actual_data_y_file = "../datasets/Eurlex/eurlex_docs/y_tr.npy" 24 | indx2word_file = "../datasets/Eurlex/eurlex_docs/feature_names.txt" 25 | indx2label = "../datasets/Eurlex/eurlex_docs/label_set.txt" 26 | K = 10 27 | # ---------------------------------------------------------------------------- 28 | 29 | new_data_x = np.load(new_data_x_file) 30 | new_data_y = np.load(new_data_y_file) 31 | actual_data_x = np.load(actual_data_x_file) 32 | actual_data_y = np.load(actual_data_y_file) 33 | f = open(indx2label, 'r') 34 | temp = f.read().splitlines() 35 | labels = [] 36 | for i in temp: 37 | labels.append(i.split(":")[1]) 38 | f = open(indx2word_file, 'r') 39 | temp = f.read().splitlines() 40 | words = [] 41 | for i in temp: 42 | words.append(i.split(":")[1]) 43 | 44 | print("Shapes: new_x: {}; new_y: {}; original_x: {}; original_y: {};".format(new_data_x.shape, \ 45 | new_data_y.shape, actual_data_x.shape, actual_data_y.shape)) 46 | print("Num Words: {}; Num Labels: {};".format(len(labels), len(words))) 47 | 48 | for data_pt_num in range(K): 49 | data_pt_labels = np.argwhere(new_data_y[data_pt_num]==1) 50 | label_names = [] 51 | for label in data_pt_labels.tolist(): 52 | # print(label) 53 | label_names.append(labels[label[0]]) 54 | print("Labels in the data point : {}".format(label_names)) 55 | 56 | data_pt_words = np.argsort(new_data_x[data_pt_num])[-10:] 57 | word_names = [] 58 | for word in data_pt_words.tolist(): 59 | word_names.append(words[word]) 60 | print("Top 10 words in the data point : {}".format(word_names)) 61 | 62 | # Nearest Data point in actual data 63 | indx = -1 64 | closest = 1e10 65 | # print(actual_data_y) 66 | for i in range(len(actual_data_y)): 67 | dist = -len(np.intersect1d(np.argwhere(actual_data_y[i]==1), np.argwhere(new_data_y[data_pt_num]==1))) 68 | # print(np.argwhere(actual_data_y[i]==1)) 69 | # print(np.argwhere(new_data_y[data_pt_num]==1)) 70 | if(dist max_label_per_row: 64 | max_label_per_row = len(l_list) 65 | 66 | for pos, y in enumerate(l_list): 67 | row_idx.append(i) 68 | col_idx.append(y) 69 | val_idx.append(1) 70 | 71 | if 
hrr_labels: 72 | Y_hrr.append(l_list) 73 | 74 | m = max(row_idx) + 1 75 | n = max(col_idx) + 1 76 | print("Number of Labels: {}".format(n)) 77 | 78 | # NOTE: n + 1 represents the number of labels. For HRR it is the last 79 | # label. 80 | if(M and N): 81 | if(N > n): 82 | #y_te = y_te.resize((np.shape(y_te)[0], np.shape(y_tr)[1])) 83 | Y = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, N)) 84 | elif(N <= n): 85 | Y = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n)) 86 | Y = Y[:, :N] # This eliminates labels not present in the training by default. 87 | else: 88 | Y = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n)) 89 | 90 | return [x_text, Y, m, n, Y_hrr, max_label_per_row] 91 | 92 | # max_labels are the number of labels per row. 93 | # num_labels are the total number of labels in the dataset. 94 | def build_hrr_labels(Y_tr_hrr, Y_te_hrr, max_tr_labels, max_te_labels, num_labels): 95 | max_labels = max_tr_labels if max_tr_labels > max_te_labels else max_te_labels 96 | for i in range(0, len(Y_tr_hrr)): 97 | diff = max_labels - len(Y_tr_hrr[i]) 98 | Y_tr_hrr[i] = Y_tr_hrr[i] + [num_labels for i in range(0, diff)] if diff > 0 else Y_tr_hrr[i][: max_labels] 99 | 100 | Y_tr_hrr = sp.csr_matrix(np.array(Y_tr_hrr)) 101 | 102 | for i in range(0, len(Y_te_hrr)): 103 | diff = max_labels - len(Y_te_hrr[i]) 104 | Y_te_hrr[i] = Y_te_hrr[i] + [num_labels for i in range(0, diff)] if diff > 0 else Y_te_hrr[i][: max_labels] 105 | 106 | Y_te_hrr = sp.csr_matrix(np.array(Y_te_hrr)) 107 | return Y_tr_hrr, Y_te_hrr 108 | 109 | 110 | def build_vocab(sentences, params, vocab_size=50000): 111 | word_counts = Counter(itertools.chain(*sentences)) 112 | vocabulary_inv = [x[0] for x in word_counts.most_common(vocab_size)] 113 | vocabulary = {x: i for i, x in enumerate(vocabulary_inv)} 114 | # append symbol to the vocabulary 115 | vocabulary[''] = len(vocabulary) 116 | vocabulary_inv.append('') 117 | vocabulary[params.go_token] = len(vocabulary) 118 | vocabulary_inv.append(params.go_token) 119 | vocabulary[params.end_token] = len(vocabulary) 120 | vocabulary_inv.append(params.end_token) 121 | 122 | return [vocabulary, vocabulary_inv] 123 | 124 | 125 | def build_input_data(sentences, vocabulary): 126 | x = np.array([[vocabulary[word] if word in vocabulary else vocabulary[''] for word in sentence] for sentence in sentences]) 127 | #x = np.array([[vocabulary[word] if word in vocabulary else len(vocabulary) for word in sentence] for sentence in sentences]) 128 | return x 129 | 130 | 131 | def load_data(params, max_length=500, vocab_size=50000, hrr_labels=False): 132 | # Load and preprocess data 133 | with open(os.path.join(params.data_path), 'rb') as fin: 134 | [train, test, vocab, catgy] = pickle.load(fin, encoding="latin1") 135 | 136 | # dirty trick to prevent errors happen when test is empty 137 | if len(test) == 0: 138 | test[:5] = train[:5] 139 | 140 | trn_sents, Y_trn, m, n_tr, Y_tr_hrr, max_tr_labels = load_data_and_labels(train, hrr_labels=hrr_labels) 141 | tst_sents, Y_tst, m, n, Y_te_hrr, max_te_labels = load_data_and_labels(test, M=m, N=n_tr, hrr_labels=hrr_labels, max_labels=max_tr_labels) 142 | 143 | if hrr_labels: 144 | Y_tr_hrr, Y_te_hrr = build_hrr_labels(Y_tr_hrr, Y_te_hrr, max_tr_labels, max_te_labels, num_labels=n_tr) 145 | 146 | sents_padded_sets, params.sequence_length = pad_sentences([trn_sents, tst_sents] , padding_word=params.pad_token, max_length=max_length) 147 | # tst_sents_padded = pad_sentences(tst_sents, padding_word=params.pad_token, max_length=max_length) 148 | 
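# When hrr_labels is set, build_hrr_labels above has just padded every example's
# label-index list to a common length using index n_tr (one past the last real
# label); cnn_encoder.create_label_embedding gives that padding index a zero
# weight so it drops out of the SPP loss.
# The vocabulary below is built over the padded train + test sentences, capped
# at the vocab_size most frequent tokens; everything else maps to ''.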
vocabulary, vocabulary_inv = build_vocab(sents_padded_sets[0] + sents_padded_sets[1], params, vocab_size=vocab_size) 149 | X_trn = build_input_data(sents_padded_sets[0], vocabulary) 150 | X_tst = build_input_data(sents_padded_sets[1], vocabulary) 151 | 152 | """ 153 | Dataset Information. 154 | """ 155 | print("Train X: {}, Train Y: {}".format(X_trn.shape, Y_trn.shape)) 156 | print("Test X: {}, Test Y: {}".format(X_tst.shape, Y_tst.shape)) 157 | if hrr_labels: 158 | print("Max Tr Labels: {}, Max Te Labels: {}".format(max_tr_labels, max_te_labels)) 159 | print("Train Y HRR: {}, Test Y HRR: {}".format(Y_tr_hrr.shape, Y_te_hrr.shape)) 160 | 161 | return X_trn, Y_trn, X_tst, Y_tst, vocabulary, vocabulary_inv, params, Y_tr_hrr, Y_te_hrr 162 | # return X_trn, Y_trn, vocabulary, vocabulary_inv 163 | 164 | 165 | def batch_iter(data, batch_size, num_epochs): 166 | """ 167 | Generates a batch iterator for a dataset. 168 | """ 169 | data = np.array(data) 170 | data_size = len(data) 171 | num_batches_per_epoch = int(len(data)/batch_size) + 1 172 | for epoch in range(num_epochs): 173 | # Shuffle the data at each epoch 174 | shuffle_indices = np.random.permutation(np.arange(data_size)) 175 | shuffled_data = data[shuffle_indices] 176 | for batch_num in range(num_batches_per_epoch): 177 | start_index = batch_num * batch_size 178 | end_index = min((batch_num + 1) * batch_size, data_size) 179 | yield shuffled_data[start_index:end_index] 180 | -------------------------------------------------------------------------------- /xml-cnn/utils/fiddle_clusters.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('utils/') 3 | sys.path.append('models/') 4 | import numpy as np 5 | import os 6 | from sklearn import preprocessing 7 | from sklearn.decomposition import PCA 8 | import argparse 9 | from sklearn.cluster import KMeans 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import matplotlib.gridspec as gridspec 13 | import cPickle 14 | from sklearn.metrics import silhouette_score 15 | from dpmeans import * 16 | from sklearn.decomposition import PCA 17 | from sklearn.decomposition import TruncatedSVD 18 | import scipy.io as sio 19 | x_tr = np.load('datasets/Eurlex/eurlex_docs/x_tr.npy') 20 | y_tr = np.load('datasets/Eurlex/eurlex_docs/y_tr.npy') 21 | x_te = np.load('datasets/Eurlex/eurlex_docs/x_te.npy') 22 | y_te = np.load('datasets/Eurlex/eurlex_docs/y_te.npy') 23 | 24 | n = np.shape(x_tr)[0] 25 | m = np.shape(y_tr)[1] 26 | 27 | 28 | 29 | # ------ Making Adjacency ------------------ 30 | dct = {} 31 | for i in range(m): 32 | dct[i] = np.argwhere(y_tr[:,i]==1) 33 | 34 | adjacency_mat = np.zeros((m,m)) 35 | check_mat = np.zeros((m,m)) 36 | for i in range(m): 37 | for j in range(m): 38 | adjacency_mat[i,j] = len(np.intersect1d(dct[i],dct[j])) 39 | adjacency_mat[j, i] = adjacency_mat[i,j] 40 | check_mat[i,j] = check_mat[j,i] = 1 41 | # adjacency_mat[i, i] = len(dct[i]) 42 | # check_mat[i,i] = 1 43 | 44 | print(i) 45 | np.save('adjacency_mat', adjacency_mat) 46 | adjacency_mat = sparse.csr_matrix(adjacency_mat) 47 | sio.savemat('adjacency_mat', adjacency_mat) 48 | print((check_mat==0).any()) 49 | print(adjacency_mat[:100,:100]) 50 | # ----------------------------------------- 51 | 52 | # ------------- PP --------------------------------------- 53 | adjacency_mat = np.load('/scratch/work/saxenas2/CVAE_XML/adjacency_mat.npy') 54 | pp = preprocessing.MinMaxScaler() 55 | scaler = pp.fit(adjacency_mat) 56 | adjacency_mat = 
scaler.transform(adjacency_mat) 57 | # ------------------------------------------------------- 58 | 59 | # ----------------------- cluster + score --------------- 60 | clusters = [2, 4, 6, 8, 10, 12, 15, 18, 21, 24, 27, 30] 61 | scores = [] 62 | scores_silhoette = [] 63 | for cluster_no in clusters: 64 | print(cluster_no) 65 | kmeans = KMeans(n_clusters=cluster_no, random_state=0).fit(adjacency_mat) 66 | scores.append(kmeans.score(adjacency_mat)) 67 | label = kmeans.labels_ 68 | scores_silhoette.append(silhouette_score(adjacency_mat, label, metric='euclidean')) 69 | with open('classifier_' + str(cluster_no) + '.pkl', 'wb') as fid: 70 | cPickle.dump(kmeans, fid) 71 | # --------------------------------------------------------- 72 | 73 | # scores = [] 74 | # for cluster_no in clusters: 75 | # with open('classifier_'+ str(cluster_no) + '.pkl', 'rb') as fid: 76 | # kmeans = cPickle.load(fid) 77 | # label = kmeans.labels_ 78 | # scores.append(silhouette_score(adjacency_mat, label, metric='euclidean')) 79 | 80 | matplotlib.pyplot.plot(clusters, scores) 81 | plt.show() 82 | 83 | # ---------------------- Explore Clusters ------------------------- 84 | cluster_no = 30 85 | # with open('clusterings/classifier_'+ str(cluster_no) + '.pkl', 'rb') as fid: 86 | with open('classifier_'+ str(cluster_no) + '.pkl', 'rb') as fid: 87 | kmeans = cPickle.load(fid) 88 | 89 | y_pred = kmeans.predict(adjacency_mat) 90 | clusters = {} 91 | y_of_cluster = {} 92 | for i in range(cluster_no): 93 | clusters[i] = np.argwhere(y_pred==i) 94 | y_of_cluster[i] = y_tr[:, clusters[i]] 95 | # y_of_cluster[i] = np.array(y_of_cluster[i][:,0]) 96 | x = np.sum(y_tr, 0) 97 | y = np.sum(y_of_cluster[i], 0) 98 | mean_labels = np.mean(np.sum(y_of_cluster[i], 0)) 99 | top5_labels = np.argsort(y)[-10:] 100 | top5_label_counts = np.sort(y)[-10:] 101 | num_tail_labels_1 = len(np.argwhere(x[clusters[i]]<=1)) 102 | num_tail_labels_2 = len(np.argwhere(x[clusters[i]]<=2)) 103 | num_tail_labels_5 = len(np.argwhere(x[clusters[i]]<=5)) 104 | 105 | print("No. of Labels {6}; Mean No. 
of Labels {0}; top 5 labels {1}, top 5 label counts {2}; num tail labels(1) \ 106 | {3}; num tail labels(2) {4}; num tail labels(5) {5}".format(mean_labels, top5_labels, top5_label_counts, 107 | num_tail_labels_1, num_tail_labels_2, num_tail_labels_5, len(clusters[i]))) 108 | # ---------------------- Explore Clusters ------------------------- 109 | -------------------------------------------------------------------------------- /xml-cnn/utils/loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import timeit 5 | import argparse 6 | import numpy as np 7 | import time 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import matplotlib.pyplot as plt 11 | import torch.autograd as autograd 12 | from sklearn import preprocessing 13 | from torch.autograd import Variable 14 | from sklearn.decomposition import PCA 15 | import matplotlib.gridspec as gridspec 16 | import pdb 17 | 18 | def isnan(x): 19 | return x != x 20 | 21 | class loss: 22 | 23 | def MSLoss(self, X_sample, X): 24 | t = torch.mean(torch.norm((X_sample - X),1),dim=0) 25 | return t 26 | 27 | def BCELoss(self, y_pred, y, eps = 1e-25): 28 | t = torch.nn.functional.binary_cross_entropy(y_pred, y)*y.shape[-1] 29 | return t 30 | 31 | def L1Loss(self, X_sample, X): 32 | t = torch.sum(torch.mean(torch.abs(X_sample - X),dim=0)) 33 | return t 34 | -------------------------------------------------------------------------------- /xml-cnn/utils/process_eurlex.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import numpy as np 3 | from numpy import genfromtxt 4 | 5 | #bashCommand = "java -cp ~/Downloads/weka-3-8-2/weka.jar weka.core.converters.CSVSaver -i eurlex_nA-5k_CV1-10_train.arff > eurlex_nA-5k_CV1-10_train.csv" 6 | #process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) 7 | #output, error = process.communicate() 8 | 9 | with open('eurlex_nA-5k_CV1-10_train.csv') as f: 10 | lines = f.read().splitlines()[0] 11 | 12 | a = genfromtxt('eurlex_nA-5k_CV1-10_train.csv', delimiter=',') 13 | words = lines.split(',')[1:] 14 | doc_id = {} 15 | doc_id_inv = {} 16 | 17 | words_dict = {} 18 | for i, w in enumerate(words): 19 | words_dict[i] = w 20 | 21 | with open('feature_names.txt', 'w') as f: 22 | for key, value in words_dict.items(): 23 | f.write('%s:%s\n' % (key, value)) 24 | 25 | for i in range(1, len(a[:,0])): 26 | doc_id_inv[a[i,0]] = i-1 27 | doc_id[i-1] = a[i,0] 28 | # doc_id_list = doc_id. 
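# The loop above builds doc_id <-> row-index lookups from the first CSV column;
# the slice below drops that id column and the header row, leaving the raw
# bag-of-words feature matrix.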
29 | x_tr = a[1:,1:] 30 | np.save('words',words) 31 | np.save('doc_id',doc_id) # dictionary 32 | np.save('doc_id_inv',doc_id_inv) # dictionary 33 | np.save('x_tr',x_tr) 34 | 35 | 36 | labels_data_pt = genfromtxt('/u/79/wa.saxenas2/unix/Downloads/eurlex_id2class/id2class_eurlex_eurovoc.qrels', delimiter=' ')[:,1] 37 | with open('/u/79/wa.saxenas2/unix/Downloads/eurlex_id2class/id2class_eurlex_eurovoc.qrels') as f: 38 | lines = f.read().splitlines() 39 | 40 | label_names = [] 41 | for line in lines: 42 | label_names.append(line.split(' ')[0]) 43 | 44 | 45 | label_set = {} 46 | label_set_inv = {} 47 | count = 0 48 | # data_map = {} 49 | # data_count = 0 50 | for i in range(np.shape(labels_data_pt)[0]): 51 | if label_names[i] not in label_set.keys(): 52 | label_set[label_names[i]] = count 53 | label_set_inv[count] = label_names[i] 54 | count+=1 55 | print(count) 56 | # if labels[i] not in data_map.keys() and labels[i] in doc_id_list: 57 | # data_map[labels[i]] = data_count 58 | # data_count+=1 59 | 60 | np.save('label_set', label_set) # dictionary 61 | np.save('label_set_inv', label_set_inv) # dictionary 62 | 63 | with open('label_set.txt', 'w') as f: 64 | for key, value in label_set_inv.items(): 65 | f.write('%s:%s\n' % (key, value)) 66 | 67 | y_tr = np.zeros((np.shape(x_tr)[0], count)) 68 | y_tr_named = {} 69 | for i in range(np.shape(labels_data_pt)[0]): 70 | if labels_data_pt[i] in doc_id_inv.keys(): 71 | y_tr[doc_id_inv[labels_data_pt[i]], label_set[label_names[i]]] = 1 72 | if doc_id_inv[labels_data_pt[i]] not in y_tr_named.keys(): 73 | y_tr_named[doc_id_inv[labels_data_pt[i]]] = [] 74 | y_tr_named[doc_id_inv[labels_data_pt[i]]].append(label_names[i]) 75 | np.save('y_tr', y_tr) 76 | 77 | with open('y_tr_named.txt', 'w') as f: 78 | for key, value in y_tr_named.items(): 79 | f.write('%s:%s\n' % (key, value)) 80 | -------------------------------------------------------------------------------- /xml-cnn/utils/w2v.py: -------------------------------------------------------------------------------- 1 | from gensim.models import KeyedVectors 2 | from os.path import join, exists, split 3 | import os 4 | import numpy as np 5 | 6 | def train_word2vec(sentence_matrix, vocabulary_inv, 7 | num_features=300, min_word_count=1, context=10): 8 | """ 9 | Trains, saves, loads Word2Vec model 10 | Returns initial weights for embedding layer. 
11 | 12 | inputs: 13 | sentence_matrix # int matrix: num_sentences x max_sentence_len 14 | vocabulary_inv # dict {str:int} 15 | num_features # Word vector dimensionality 16 | min_word_count # Minimum word count 17 | context # Context window size 18 | """ 19 | model_dir = '../embedding_weights' 20 | model_name = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context) 21 | model_name = join(model_dir, model_name) 22 | if exists(model_name): 23 | embedding_model = word2vec.Word2Vec.load(model_name) 24 | #print 'Loading existing Word2Vec model \'%s\'' % split(model_name)[-1] 25 | else: 26 | # Set values for various parameters 27 | num_workers = 2 # Number of threads to run in parallel 28 | downsampling = 1e-3 # Downsample setting for frequent words 29 | 30 | # Initialize and train the model 31 | print( "Training Word2Vec model...") 32 | sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix] 33 | embedding_model = word2vec.Word2Vec(sentences, workers=num_workers, \ 34 | size=num_features, min_count = min_word_count, \ 35 | window = context, sample = downsampling) 36 | 37 | # If we don't plan to train the model any further, calling 38 | # init_sims will make the model much more memory-efficient. 39 | embedding_model.init_sims(replace=True) 40 | 41 | # Saving the model for later use. You can load it later using Word2Vec.load() 42 | if not exists(model_dir): 43 | os.mkdir(model_dir) 44 | print ('Saving Word2Vec model' + str(split(model_name)[-1])) 45 | embedding_model.save(model_name) 46 | 47 | # add unknown words 48 | embedding_weights = [np.array([embedding_model[w] if w in embedding_model\ 49 | else np.random.uniform(-0.25,0.25,embedding_model.vector_size)\ 50 | for w in vocabulary_inv])] 51 | return embedding_weights 52 | 53 | 54 | def load_word2vec(params): 55 | """ 56 | loads Word2Vec model 57 | Returns initial weights for embedding layer. 58 | 59 | inputs: 60 | model_type # GoogleNews / glove 61 | vocabulary_inv # dict {str:int} 62 | num_features # Word vector dimensionality 63 | """ 64 | 65 | model_dir = '../embedding_weights' 66 | 67 | if params.model_type == 'GoogleNews': 68 | model_name = join(model_dir, 'GoogleNews-vectors-negative300.bin.gz') 69 | assert(params.num_features == 300) 70 | assert(exists(model_name)) 71 | print('Loading existing Word2Vec model (GoogleNews-300)') 72 | embedding_model = KeyedVectors.load_word2vec_format(model_name, binary=True) 73 | 74 | elif params.model_type == 'glove-bin': 75 | model_name = join(model_dir, 'glove.6B.%dd.bin' % (params.num_features)) 76 | assert(params.num_features == 300) 77 | assert(exists(model_name)) 78 | print('Loading existing Glove Binary model...') 79 | embedding_model = KeyedVectors.load_word2vec_format(model_name, binary=True) 80 | 81 | elif params.model_type == 'glove': 82 | model_name = join(model_dir, 'glove.6B.%dd.txt' % (params.num_features)) 83 | print(model_name) 84 | assert(exists(model_name)) 85 | print('Loading existing Word2Vec model (Glove.6B.%dd)' % (params.num_features)) 86 | 87 | # dictionary, where key is word, value is word vectors 88 | embedding_model = {} 89 | for line in open(model_name, 'r'): 90 | tmp = line.strip().split() 91 | word, vec = tmp[0], map(float, tmp[1:]) 92 | assert(len(vec) == params.num_features) 93 | if word not in embedding_model: 94 | embedding_model[word] = vec 95 | assert(len(embedding_model) == 400000) 96 | 97 | else: 98 | raise ValueError('Unknown pretrain model type: %s!' 
% (params.model_type)) 99 | 100 | embedding_weights = [embedding_model[w] if w in embedding_model 101 | else np.random.uniform(-0.25, 0.25, params.num_features) 102 | for w in params.vocabulary_inv] 103 | embedding_weights = np.array(embedding_weights).astype('float32') 104 | 105 | return embedding_weights 106 | 107 | 108 | if __name__=='__main__': 109 | import data_helpers 110 | print("Loading data...") 111 | x, _, _, params.vocabulary_inv = data_helpers.load_data() 112 | w = train_word2vec(x, params.vocabulary_inv) 113 | --------------------------------------------------------------------------------
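The pieces above (`get_vectors`, `circular_conv`, `get_appx_inv`, the P/N binding in `cnn_encoder`) all rest on the same HRR arithmetic. The following self-contained numpy sketch, which uses plain Gaussian vectors and numpy's FFT rather than the repo's `lib.mathops` helpers, illustrates why the unbind-then-cosine readout used in `spp_loss`/`inference` works:

```python
import numpy as np

rng = np.random.default_rng(0)
d = 1024                                    # HRR dimensionality (illustrative)

def cconv(a, b):
    # Circular convolution via FFT: binds two d-dim vectors into one d-dim vector.
    return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b), n=d)

def approx_inv(a):
    # Involution (a[0], a[d-1], ..., a[1]): the approximate inverse used for unbinding.
    return np.concatenate(([a[0]], a[:0:-1]))

def random_vectors(n):
    # i.i.d. N(0, 1/d) components keep norms and bindings roughly unit length.
    return rng.normal(0.0, 1.0 / np.sqrt(d), size=(n, d))

p = random_vectors(1)[0]                    # "positive" role vector
classes = random_vectors(5)                 # five random class vectors

s = cconv(p, classes[1] + classes[3])       # bind the positive label bundle to p
retrieved = cconv(approx_inv(p), s)         # unbind with the approximate inverse of p
retrieved /= np.linalg.norm(retrieved)

cosines = classes @ retrieved / np.linalg.norm(classes, axis=1)
print(np.round(cosines, 2))                 # indices 1 and 3 stand out from the rest
```

Labels that were bound into the bundle come back with clearly higher cosine scores than the others, which is exactly the signal the SPP loss trains the network output to carry.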