├── LICENSE ├── README.md ├── __init__.py ├── analysis ├── data │ └── README.md ├── graphs │ ├── Eurlex4k-complex.html │ └── Eurlex4k.html ├── lib │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── mathops.cpython-36.pyc │ ├── data.py │ ├── embeddings.py │ ├── mathops.py │ ├── metrics.py │ ├── metrics_old.py │ ├── model.py │ ├── model_v1.py │ ├── model_v2.py │ ├── plots.py │ └── utils.py └── random_labels.ipynb ├── attention-xml ├── README.md ├── configure │ ├── datasets │ │ ├── Amazon-3M.yaml │ │ ├── Amazon-670K-spn.yaml │ │ ├── Amazon-670K.yaml │ │ ├── AmazonCat-13K-spn.yaml │ │ ├── AmazonCat-13K.yaml │ │ ├── EUR-Lex-spn.yaml │ │ ├── EUR-Lex.yaml │ │ ├── Wiki-500K-spn.yaml │ │ ├── Wiki-500K.yaml │ │ ├── Wiki10-31K-spn.yaml │ │ └── Wiki10-31K.yaml │ └── models │ │ ├── AttentionXML-Amazon-670K.yaml │ │ ├── AttentionXML-AmazonCat-13K.yaml │ │ ├── AttentionXML-EUR-Lex.yaml │ │ ├── AttentionXML-Wiki10-31K.yaml │ │ ├── FastAttentionXML-Amazon-3M.yaml │ │ ├── FastAttentionXML-Amazon-670K.yaml │ │ └── FastAttentionXML-Wiki-500K.yaml ├── data │ └── README.md ├── deepxml │ ├── __init__.py │ ├── cluster.py │ ├── data_utils.py │ ├── dataset.py │ ├── evaluation.py │ ├── lib │ ├── models.py │ ├── modules.py │ ├── networks.py │ ├── optimizers.py │ └── tree.py ├── ensemble.py ├── evaluation.py ├── experiments.sh ├── main.py ├── preprocess.py ├── requirements.txt ├── scripts │ ├── run_amazon.sh │ ├── run_amazon3m.sh │ ├── run_amazoncat.sh │ ├── run_eurlex.sh │ ├── run_preprocess.sh │ ├── run_wiki.sh │ ├── run_wiki10.sh │ └── run_xml.sh └── train.slurm.sh ├── build_label_vectors.py ├── combine_results.sh ├── data └── README.md ├── experiments.sh ├── hrr-example-representation.png ├── hrr-example.png ├── lib ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── mathops.cpython-36.pyc ├── data.py ├── embeddings.py ├── mathops.py ├── metrics.py ├── metrics_old.py ├── model.py ├── plots.py └── utils.py ├── output └── README.md ├── requirements.txt ├── run_classifier.py ├── train.slurm.sh └── xml-cnn ├── README.md ├── code ├── __init__.py ├── cnn_test.py ├── cnn_train.py ├── experiments.sh ├── header.py ├── lib ├── main.py ├── models │ ├── classifier.py │ ├── cnn_encoder.py │ ├── embedding_layer.py │ ├── header.py │ └── xmlCNN.py ├── precision_k.py ├── run.sh ├── test_manik.m └── train.slurm.sh ├── data └── README.md ├── embedding_weights └── .gitignore └── utils ├── data_dive.py ├── data_helpers.py ├── fiddle_clusters.py ├── futils.py ├── futils_old.py ├── loss.py ├── process_eurlex.py └── w2v.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Federico Bianchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/__init__.py -------------------------------------------------------------------------------- /analysis/data/README.md: -------------------------------------------------------------------------------- 1 | # AttentionXML 2 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) 3 | 4 | ## Requirements 5 | 6 | * python==3.7.4 7 | * click==7.0 8 | * ruamel.yaml==0.16.5 9 | * numpy==1.16.2 10 | * scipy==1.3.1 11 | * scikit-learn==0.21.2 12 | * gensim==3.4.0 13 | * torch==1.0.1 14 | * nltk==3.4 15 | * tqdm==4.31.1 16 | * joblib==0.13.2 17 | * logzero==1.5.0 18 | 19 | ## Datasets 20 | 21 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 22 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 23 | * [AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 24 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 25 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 26 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 27 | 28 | Download the GloVe embedding (840B,300d) and convert it to gensim format (which can be loaded by **gensim.models.KeyedVectors.load**). 29 | 30 | We also provide a converted GloVe embedding at [here](https://drive.google.com/file/d/10w_HuLklGc8GA_FtUSdnHT8Yo1mxYziP/view?usp=sharing). 
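If you prefer to do the conversion yourself, a minimal sketch with gensim is shown below; the input/output paths are illustrative, with the output name chosen to match the `--w2v-model` argument used in the preprocessing commands below.

```python
# Sketch: convert raw GloVe vectors into a gensim KeyedVectors file.
# Assumes glove.840B.300d.txt has already been downloaded; paths are illustrative.
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove2word2vec("data/glove.840B.300d.txt", "data/glove.840B.300d.w2v.txt")  # prepend word2vec header
vectors = KeyedVectors.load_word2vec_format("data/glove.840B.300d.w2v.txt", binary=False)
vectors.save("data/glove.840B.300d.gensim")  # later loadable with gensim.models.KeyedVectors.load
```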
31 | 32 | ## XML Experiments 33 | 34 | XML experiments in paper can be run directly such as: 35 | ```bash 36 | ./scripts/run_eurlex.sh 37 | ``` 38 | ## Preprocess 39 | 40 | Run preprocess.py for train and test datasets with tokenized texts as follows: 41 | ```bash 42 | python preprocess.py \ 43 | --text-path data/EUR-Lex/train_texts.txt \ 44 | --label-path data/EUR-Lex/train_labels.txt \ 45 | --vocab-path data/EUR-Lex/vocab.npy \ 46 | --emb-path data/EUR-Lex/emb_init.npy \ 47 | --w2v-model data/glove.840B.300d.gensim 48 | 49 | python preprocess.py \ 50 | --text-path data/EUR-Lex/test_texts.txt \ 51 | --label-path data/EUR-Lex/test_labels.txt \ 52 | --vocab-path data/EUR-Lex/vocab.npy 53 | ``` 54 | 55 | Or run preprocss.py including tokenizing the raw texts by NLTK as follows: 56 | ```bash 57 | python preprocess.py \ 58 | --text-path data/Wiki10-31K/train_raw_texts.txt \ 59 | --tokenized-path data/Wiki10-31K/train_texts.txt \ 60 | --label-path data/Wiki10-31K/train_labels.txt \ 61 | --vocab-path data/Wiki10-31K/vocab.npy \ 62 | --emb-path data/Wiki10-31K/emb_init.npy \ 63 | --w2v-model data/glove.840B.300d.gensim 64 | 65 | python preprocess.py \ 66 | --text-path data/Wiki10-31K/test_raw_texts.txt \ 67 | --tokenized-path data/Wiki10-31K/test_texts.txt \ 68 | --label-path data/Wiki10-31K/test_labels.txt \ 69 | --vocab-path data/Wiki10-31K/vocab.npy 70 | ``` 71 | 72 | 73 | ## Train and Predict 74 | 75 | Train and predict as follows: 76 | ```bash 77 | python main.py --data-cnf configure/datasets/EUR-Lex.yaml --model-cnf configure/models/AttentionXML-EUR-Lex.yaml 78 | ``` 79 | 80 | Or do prediction only with option "--mode eval". 81 | 82 | ## Ensemble 83 | 84 | Train and predict with an ensemble: 85 | ```bash 86 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 0 87 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 1 88 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 2 89 | python ensemble.py -p results/FastAttentionXML-Wiki-500K -t 3 90 | ``` 91 | 92 | ## Evaluation 93 | 94 | ```bash 95 | python evaluation.py --results results/AttentionXML-EUR-Lex-labels.npy --targets data/EUR-Lex/test_labels.npy 96 | ``` 97 | Or get propensity scored metrics together: 98 | 99 | ```bash 100 | python evaluation.py \ 101 | --results results/FastAttentionXML-Amazon-670K-labels.npy \ 102 | --targets data/Amazon-670K/test_labels.npy \ 103 | --train-labels data/Amazon-670K/train_labels.npy \ 104 | -a 0.6 \ 105 | -b 2.6 106 | 107 | ``` 108 | 109 | ## Reference 110 | You et al., [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727), NeurIPS 2019 111 | 112 | ## Declaration 113 | It is free for non-commercial use. For commercial use, please contact Mr. Ronghi You and Prof. Shanfeng Zhu (zhusf@fudan.edu.cn). 
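The `analysis/lib` modules that follow (`embeddings.py`, `mathops.py`) implement the circular-convolution operations this repository uses for HRR label representations. As a quick orientation, here is a self-contained NumPy sketch of binding two label vectors to a shared role vector and retrieving them with the approximate inverse; every name and the dimension are illustrative, and the helpers simply mirror `cc`, `np_appx_inv`, and `npcomplexMagProj` from `analysis/lib/mathops.py`.

```python
# Minimal HRR binding/unbinding sketch (illustrative; mirrors analysis/lib/mathops.py).
import numpy as np

def mag_proj(x):
    # Unit-magnitude projection in the frequency domain (cf. npcomplexMagProj).
    c = np.fft.rfft(x)
    return np.fft.irfft(c / np.abs(c), n=x.shape[-1])

def bind(a, b):
    # Circular convolution via FFT (cf. cc).
    return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b), n=a.shape[-1])

def approx_inv(a):
    # Approximate (involution) inverse (cf. np_appx_inv).
    return np.roll(np.flip(a, axis=-1), 1, -1)

d = 400
rng = np.random.default_rng(0)
role, label1, label2, label3 = (mag_proj(rng.normal(0, 1.0 / d, size=d)) for _ in range(4))

s = bind(role, label1) + bind(role, label2)   # superpose two bound labels
query = bind(s, approx_inv(role))             # unbind with the role vector

cos = lambda u, v: u @ v / (np.linalg.norm(u) * np.linalg.norm(v))
print(cos(query, label1), cos(query, label2), cos(query, label3))
# labels 1 and 2 score clearly higher than label 3, which stays near zero
```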
-------------------------------------------------------------------------------- /analysis/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/analysis/lib/__init__.py -------------------------------------------------------------------------------- /analysis/lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/analysis/lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /analysis/lib/__pycache__/mathops.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/analysis/lib/__pycache__/mathops.cpython-36.pyc -------------------------------------------------------------------------------- /analysis/lib/embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Operations to generate embeddings. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | from gensim.models import KeyedVectors 11 | 12 | from .mathops import complex_multiplication, complex_division, circular_conv 13 | from .mathops import get_appx_inv, get_inv, complexMagProj, normalize 14 | from .mathops import npcomplexMagProj 15 | 16 | """ 17 | Load Pretrained Label Embeddings. 18 | """ 19 | def load_embeddings(save_loc, vocab_size): 20 | fname = save_loc + "-complex.bin" 21 | model = KeyedVectors.load_word2vec_format(fname, binary=True) 22 | rand_vec_cnt = 0 23 | vectors = [] # positions in vector space. 24 | for i in range(0, vocab_size): 25 | if str(i) in model.wv.vocab: 26 | vectors.append(model.wv[str(i)]) 27 | else: 28 | # NOTE: When a label is not present in training then we generate a 29 | # default vector and add it to the label vector matrix. 30 | # As SPN select the label based on the index it remains consistent while training. 31 | rand_vec_cnt += 1 32 | vectors.append(gen_rand_vec(model.vector_size)) 33 | 34 | # Add Padding idx. 35 | print("Vocabulary Size: {}".format(vocab_size)) 36 | print("Number of Random vectors generated: {}".format(rand_vec_cnt)) 37 | vectors.append(gen_rand_vec(model.vector_size)) 38 | vectors = torch.from_numpy(np.array(vectors, dtype=np.float32)) 39 | return vectors 40 | 41 | """ 42 | NumPY operations for embeddings. 43 | """ 44 | def generate_vectors(num_vectors, dims): 45 | """ 46 | Generate n vectors of size dims that are orthogonal to each other. 47 | """ 48 | if num_vectors > dims: 49 | raise ValueError("num_vectors cannot be greater than dims!") 50 | 51 | # Intializing class vectors. 52 | vecs = torch.randn(dims, num_vectors, dtype=torch.float) 53 | 54 | # Using QR decomposition to get orthogonal vectors. 55 | vecs, _ = torch.qr(vecs) 56 | vecs = vecs.t() 57 | vecs = vecs / torch.norm(vecs, dim=-1, keepdim=True) 58 | return vecs 59 | 60 | 61 | def gen_rand_vec(dims): 62 | """ 63 | Generate a random vector of size dims. 64 | """ 65 | return npcomplexMagProj(np.random.normal(0, 1. 
/ dims, size=(dims))) 66 | 67 | 68 | """ 69 | Torch functions. 70 | """ 71 | def get_vectors(num_vectors, dims, ortho=False): 72 | if ortho: 73 | vectors = generate_vectors(num_vectors, dims) 74 | return complexMagProj(vectors) 75 | else: 76 | vectors = [gen_rand_vec(dims) for i in range(num_vectors)] 77 | return torch.from_numpy(np.array(vectors, dtype=np.float32)) 78 | 79 | def get_static_embedding(seeds, dims): 80 | vec = [] 81 | for s in seeds: 82 | torch.manual_seed(s) 83 | vec.append(torch.randn((1, dims), dtype=torch.float)) 84 | 85 | return torch.cat(vec, dim=0) 86 | -------------------------------------------------------------------------------- /analysis/lib/mathops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to perform circular convolution operations. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | """ 14 | Pytorch functions. 15 | """ 16 | def complex_multiplication(left, right): 17 | """ 18 | Multiply two vectors in complex domain. 19 | """ 20 | left_real, left_complex = left[..., 0], left[..., 1] 21 | right_real, right_complex = right[..., 0], right[..., 1] 22 | 23 | output_real = left_real * right_real - left_complex * right_complex 24 | output_complex = left_real * right_complex + left_complex * right_real 25 | return torch.stack([output_real, output_complex], dim=-1) 26 | 27 | def complex_division(left, right): 28 | """ 29 | Divide two vectors in complex domain. 30 | """ 31 | left_real, left_complex = left[..., 0], left[..., 1] 32 | right_real, right_complex = right[..., 0], right[..., 1] 33 | 34 | output_real = torch.div((left_real * right_real + left_complex * right_complex),(right_real**2 + right_complex**2)) 35 | output_complex = torch.div((left_complex * right_real - left_real * right_complex ),(right_real**2 + right_complex**2)) 36 | return torch.stack([output_real, output_complex], dim=-1) 37 | 38 | def circular_conv(a, b): 39 | """ Defines the circular convolution operation 40 | a: tensor of shape (batch, D) 41 | b: tensor of shape (batch, D) 42 | """ 43 | left = torch.rfft(a, 1, onesided=False) 44 | right = torch.rfft(b, 1, onesided=False) 45 | output = complex_multiplication(left, right) 46 | output = torch.irfft(output, 1, signal_sizes=a.shape[-1:], onesided=False) 47 | return output 48 | 49 | def get_appx_inv(a): 50 | """ 51 | Compute approximate inverse of vector a. 52 | """ 53 | return torch.roll(torch.flip(a, dims=[-1]), 1,-1) 54 | 55 | def get_inv(a, typ=torch.DoubleTensor): 56 | """ 57 | Compute exact inverse of vector a. 58 | """ 59 | left = torch.rfft(a, 1, onesided=False) 60 | complex_1 = np.zeros(left.shape) 61 | complex_1[...,0] = 1 62 | op = complex_division(typ(complex_1),left) 63 | return torch.irfft(op,1,onesided=False) 64 | 65 | def complexMagProj(x): 66 | """ 67 | Normalize a vector x in complex domain. 68 | """ 69 | c = torch.rfft(x, 1, onesided=False) 70 | c_ish=c/torch.norm(c, dim=-1,keepdim=True) 71 | output = torch.irfft(c_ish, 1, signal_sizes=x.shape[1:], onesided=False) 72 | return output 73 | 74 | def normalize(x): 75 | return x/torch.norm(x) 76 | 77 | """ 78 | Numpy Functions. 
79 | """ 80 | # Make them work with batch dimensions 81 | def cc(a, b): 82 | return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b)) 83 | 84 | def np_inv(a): 85 | return np.fft.irfft((1.0/np.fft.rfft(a)),n=a.shape[-1]) 86 | 87 | def np_appx_inv(a): 88 | #Faster implementation 89 | return np.roll(np.flip(a, axis=-1), 1,-1) 90 | 91 | def npcomplexMagProj(x): 92 | """ 93 | Normalize a vector x in complex domain. 94 | """ 95 | c = np.fft.rfft(x) 96 | 97 | # Look at real and image as if they were real 98 | c_ish = np.vstack([c.real, c.imag]) 99 | 100 | # Normalize magnitude of each complex/real pair 101 | c_ish=c_ish/np.linalg.norm(c_ish, axis=0) 102 | c_proj = c_ish[0,:] + 1j * c_ish[1,:] 103 | return np.fft.irfft(c_proj,n=x.shape[-1]) 104 | 105 | def nrm(a): 106 | return a / np.linalg.norm(a) 107 | -------------------------------------------------------------------------------- /analysis/lib/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal paramaters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 
35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Theshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikpedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | 104 | # Print the final results. 105 | # This provides the results for agg metrics when threshold for inference 106 | # is optimized and metrics are then computed. 107 | def display_agg_results(args, te_loss, pr, rec, f1): 108 | print("----------Tests with Threshold Inference------------") 109 | print("Inference Threshold: {:.3f}".format(args.th)) 110 | print("Test Loss: {:.3f}".format(te_loss)) 111 | print("Test Precision: {:.3f}".format(pr * 100)) 112 | print("Test Recall: {:.3f}".format(rec * 100)) 113 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 114 | 115 | 116 | def display_metrics(metrics, k=5): 117 | # Merge batchwise metrics. 118 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 119 | for idx, metric in enumerate(metrics): 120 | for i in range(0, 4): 121 | for j in range(0, k): 122 | final_metrics[i][j] += metric[i][j] 123 | 124 | # Dataset metrics. 
125 | print("----------Tests with Ordered Retrieval------------") 126 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 127 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 128 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 129 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 130 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 131 | floatfmt=".3f")) 132 | -------------------------------------------------------------------------------- /analysis/lib/metrics_old.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal paramaters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 
35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Theshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikpedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | # Print the final results. 104 | # This provides the results for agg metrics when threshold for inference 105 | # is optimized and metrics are then computed. 106 | def display_agg_results(args, te_loss, pr, rec, f1): 107 | print("----------Tests with Threshold Inference------------") 108 | print("Inference Threshold: {:.3f}".format(args.th)) 109 | print("Test Loss: {:.3f}".format(te_loss)) 110 | print("Test Precision: {:.3f}".format(pr * 100)) 111 | print("Test Recall: {:.3f}".format(rec * 100)) 112 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 113 | 114 | 115 | def display_metrics(metrics, k=5): 116 | # Merge batchwise metrics. 117 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 118 | for idx, metric in enumerate(metrics): 119 | for i in range(0, 4): 120 | for j in range(0, k): 121 | final_metrics[i][j] += metric[i][j] 122 | 123 | # Dataset metrics. 
124 | print("----------Tests with Ordered Retrieval------------") 125 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 126 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 127 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 128 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 129 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 130 | floatfmt=".3f")) 131 | -------------------------------------------------------------------------------- /analysis/lib/plots.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manage plots. 3 | AUTHOR: Ashwinkumar Ganesan. 4 | """ 5 | 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import csv 9 | import pandas as pd 10 | 11 | """ 12 | Plot training and testing curves. 13 | The graph includes: 14 | 1. Training loss per epoch. 15 | 2. Test loss per epoch. 16 | 3. Precision per epoch. 17 | 4. Recall per epoch. 18 | 5. F1 score per epoch. 19 | """ 20 | def plot_stats(tr_stats): 21 | fig, ax = plt.subplots(2) 22 | 23 | ep = [i for i in range(0, epochs)] 24 | tr_loss = tr_stats['Training Loss'] 25 | te_loss = tr_stats['Test Loss'] 26 | pr = tr_stats['Precision'] 27 | re = tr_stats['Recall'] 28 | f1 = tr_stats['F1 Score'] 29 | 30 | # Loss Curve. 31 | ax[0].plot(ep, tr_loss) 32 | ax[0].plot(ep, te_loss) 33 | ax[0].set_title("Training & Testing Loss Per Epoch") 34 | 35 | 36 | ax[1].plot(ep, pr) 37 | ax[1].plot(ep, re) 38 | ax[1].plot(ep, f1) 39 | -------------------------------------------------------------------------------- /analysis/lib/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | """ 4 | 5 | from prettytable import PrettyTable 6 | import pandas as pd 7 | from time import time 8 | import torch 9 | 10 | GB_DIV = 1024 * 1024 * 1024 11 | 12 | 13 | def print_memory_profile(): 14 | """ 15 | Get basic memory information. 16 | """ 17 | device = torch.cuda.current_device() 18 | print("Allocated: {:.4f}".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 19 | print("Reserved: {:.4f}\n".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 20 | 21 | # https://stackoverflow.com/questions/9535954/printing-lists-as-tabular-data 22 | def print_command_arguments(args): 23 | table = PrettyTable(['Parameter', 'Value']) 24 | table.title = 'Experimental Setup' 25 | for arg in vars(args): 26 | table.add_row([arg, getattr(args, arg)]) 27 | print(table) 28 | 29 | class Measure(object): 30 | """ 31 | Manage runtimes for a specific code block. 32 | """ 33 | def __init__(self, name): 34 | self._measure = name 35 | self._is_measuring = False 36 | self._elapsed_time = 0 37 | 38 | def is_measuring(self): 39 | return self._is_measuring 40 | 41 | def start(self): 42 | self._stime = time() 43 | self._is_measuring = True 44 | 45 | def end(self): 46 | self._etime = time() 47 | self._elapsed_time += self._etime - self._stime 48 | self._is_measuring = False 49 | 50 | def get_elapsed_time(self): 51 | return self._elapsed_time 52 | 53 | def get_name(self): 54 | return self._measure 55 | 56 | 57 | class ExperimentTime(object): 58 | """ 59 | Manage time for different parts in an experiment. 
60 | """ 61 | def __init__(self): 62 | self._table = pd.DataFrame(columns=['Measurement', 'Elapsed Time']) 63 | self._pos = 0 64 | self.measure = {} 65 | 66 | def _append(self, name): 67 | self._table.loc[self._pos] = [name, self.measure[name].get_elapsed_time()] 68 | self._pos += 1 69 | 70 | def register(self, name): 71 | if name in self.measure: 72 | print("Measurement with same name previously added.") 73 | else: 74 | self.measure[name] = Measure(name) 75 | 76 | def measure_time(self, name): 77 | if self.measure[name].is_measuring(): 78 | self.measure[name].end() 79 | # Add time to the dataframe. 80 | self._append(name) 81 | else: 82 | self.measure[name].start() 83 | 84 | def get_measurements(self): 85 | return self._table 86 | -------------------------------------------------------------------------------- /attention-xml/README.md: -------------------------------------------------------------------------------- 1 | # HRR-AttentionXML 2 | This is a modified implementation of [AttentionalXML architecture](https://arxiv.org/abs/1811.01727) to train with HRR label representations and perform inference. 3 | 4 | ## List of changes to the Codebase. 5 | The XML-CNN codebase has been modified to with the following list of changes: 6 | 1. Retooled to use semantic pointers. The architecture can use HRRs to learn and infer labels. 7 | 2. The dataset and model YAML files have additional arguments for HRR label representations. 8 | 9 | ## NOTES 10 | 1. For details about datasets and how to setup the repository, please look at instructions [here](https://github.com/yourh/AttentionXML). 11 | 2. AttentionXML is NOT configured for tree-based inference and HRR is applied only to a standard inference with a softmax layer. 12 | 3. The codebase also contains two scripts, i.e., ```experiments.sh``` and ```train.slurm.sh``` for execution of training and evaluation jobs on a SLURM enabled cluster. 13 | 14 | ## Datasets Locations 15 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 16 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 17 | * [AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 18 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 19 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 20 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 21 | 22 | ## Preprocess 23 | Run ```preprocss.py``` including tokenizing the raw texts by NLTK as follows: 24 | ```bash 25 | python preprocess.py \ 26 | --text-path data/Wiki10-31K/train_raw_texts.txt \ 27 | --tokenized-path data/Wiki10-31K/train_texts.txt \ 28 | --label-path data/Wiki10-31K/train_labels.txt \ 29 | --vocab-path data/Wiki10-31K/vocab.npy \ 30 | --emb-path data/Wiki10-31K/emb_init.npy \ 31 | --w2v-model data/glove.840B.300d.gensim 32 | 33 | python preprocess.py \ 34 | --text-path data/Wiki10-31K/test_raw_texts.txt \ 35 | --tokenized-path data/Wiki10-31K/test_texts.txt \ 36 | --label-path data/Wiki10-31K/test_labels.txt \ 37 | --vocab-path data/Wiki10-31K/vocab.npy 38 | ``` 39 | 40 | ## XML Experiments 41 | In this example let us consider the dataset: ```Wiki10-31K```. 
42 | 43 | To execute the baseline model: 44 | ```bash 45 | python main.py --data-cnf configure/datasets/Wiki10-31K.yaml --model-cnf configure/models/AttentionXML-Wiki10-31K.yaml 46 | ``` 47 | 48 | To execute the same model with HRR labels: 49 | ```bash 50 | python main.py --data-cnf configure/datasets/Wiki10-31K-spn.yaml --model-cnf configure/models/AttentionXML-Wiki10-31K.yaml 51 | ``` 52 | 53 | To evaluate the model: 54 | ```bash 55 | LABEL_NAME=AttentionXML-400-Wiki10-31K-spn-400 # For baseline the LABEL_NAME is AttentionXML-0-Wiki10-31K-spn-baseline-0. 56 | NAME=Wiki10-31K 57 | python evaluation.py --results results/${LABEL_NAME}-labels.npy \ 58 | --targets data/${NAME}/test_labels.npy --train-labels data/${NAME}/train_labels.npy 59 | ``` 60 | where ```${LABEL_NAME}``` is the name of the file containing labels for the above experiment run. ```${NAME}``` is the name of the dataset. 61 | 62 | References 63 | ---------- 64 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Amazon-3M.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon-3M 2 | 3 | train: 4 | sparse: data/Amazon-3M/train_v1.txt 5 | texts: data/Amazon-3M/train_texts.npy 6 | labels: data/Amazon-3M/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Amazon-3M/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Amazon-3M/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Amazon-3M/labels_binarizer 21 | 22 | model: 23 | emb_size: 300 24 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Amazon-670K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon-670K-spn 2 | 3 | train: 4 | sparse: data/Amazon-670K/train_v1.txt 5 | texts: data/Amazon-670K/train_texts.npy 6 | labels: data/Amazon-670K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Amazon-670K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Amazon-670K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Amazon-670K/labels_binarizer 21 | 22 | use_spn: True 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Amazon-670K.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon-670K-baseline 2 | 3 | train: 4 | sparse: data/Amazon-670K/train_v1.txt 5 | texts: data/Amazon-670K/train_texts.npy 6 | labels: data/Amazon-670K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Amazon-670K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Amazon-670K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Amazon-670K/labels_binarizer 21 | 22 | use_spn: False 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/AmazonCat-13K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: AmazonCat-13K-spn 2 | 3 | train: 4 | texts: data/AmazonCat-13K/train_texts.npy 5 | labels: data/AmazonCat-13K/train_labels.npy 6 | 7 | valid: 8 | 
size: 4000 9 | 10 | test: 11 | texts: data/AmazonCat-13K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/AmazonCat-13K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/AmazonCat-13K/labels_binarizer 20 | 21 | use_spn: True 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/AmazonCat-13K.yaml: -------------------------------------------------------------------------------- 1 | name: AmazonCat-13K-baseline 2 | 3 | train: 4 | texts: data/AmazonCat-13K/train_texts.npy 5 | labels: data/AmazonCat-13K/train_labels.npy 6 | 7 | valid: 8 | size: 4000 9 | 10 | test: 11 | texts: data/AmazonCat-13K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/AmazonCat-13K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/AmazonCat-13K/labels_binarizer 20 | 21 | use_spn: False 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/EUR-Lex-spn.yaml: -------------------------------------------------------------------------------- 1 | name: EUR-Lex-spn 2 | 3 | train: 4 | texts: data/EUR-Lex/train_texts.npy 5 | labels: data/EUR-Lex/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/EUR-Lex/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/EUR-Lex/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/EUR-Lex/labels_binarizer 20 | 21 | use_spn: True 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/EUR-Lex.yaml: -------------------------------------------------------------------------------- 1 | name: EUR-Lex-baseline 2 | 3 | train: 4 | texts: data/EUR-Lex/train_texts.npy 5 | labels: data/EUR-Lex/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/EUR-Lex/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/EUR-Lex/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/EUR-Lex/labels_binarizer 20 | 21 | use_spn: False 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki-500K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki-500K-spn 2 | 3 | train: 4 | sparse: data/Wiki-500K/train.txt 5 | texts: data/Wiki-500K/train_texts.npy 6 | labels: data/Wiki-500K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Wiki-500K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Wiki-500K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: data/Wiki-500K/labels_binarizer 21 | 22 | use_spn: True 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki-500K.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki-500K-baseline 2 | 3 | train: 4 | sparse: data/Wiki-500K/train.txt 5 | texts: data/Wiki-500K/train_texts.npy 6 | labels: data/Wiki-500K/train_labels.npy 7 | 8 | valid: 9 | size: 4000 10 | 11 | test: 12 | texts: data/Wiki-500K/test_texts.npy 13 | 14 | embedding: 15 | emb_init: data/Wiki-500K/emb_init.npy 16 | 17 | output: 18 | res: results 19 | 20 | labels_binarizer: 
data/Wiki-500K/labels_binarizer 21 | 22 | use_spn: False 23 | 24 | model: 25 | emb_size: 300 26 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki10-31K-spn.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki10-31K-spn 2 | 3 | train: 4 | texts: data/Wiki10-31K/train_texts.npy 5 | labels: data/Wiki10-31K/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/Wiki10-31K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/Wiki10-31K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/Wiki10-31K/labels_binarizer 20 | 21 | use_spn: True 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/datasets/Wiki10-31K.yaml: -------------------------------------------------------------------------------- 1 | name: Wiki10-31K-baseline 2 | 3 | train: 4 | texts: data/Wiki10-31K/train_texts.npy 5 | labels: data/Wiki10-31K/train_labels.npy 6 | 7 | valid: 8 | size: 200 9 | 10 | test: 11 | texts: data/Wiki10-31K/test_texts.npy 12 | 13 | embedding: 14 | emb_init: data/Wiki10-31K/emb_init.npy 15 | 16 | output: 17 | res: results 18 | 19 | labels_binarizer: data/Wiki10-31K/labels_binarizer 20 | 21 | use_spn: False 22 | 23 | model: 24 | emb_size: 300 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-Amazon-670K.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 256 5 | layers_num: 1 6 | linear_size: [256, 256] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 8 15 | nb_epoch: 30 16 | swa_warmup: 4 17 | 18 | valid: 19 | batch_size: 8 20 | 21 | predict: 22 | batch_size: 8 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-AmazonCat-13K.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 512 5 | layers_num: 1 6 | linear_size: [512, 512] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 32 15 | nb_epoch: 10 16 | swa_warmup: 2 17 | 18 | valid: 19 | batch_size: 32 20 | 21 | predict: 22 | batch_size: 32 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-EUR-Lex.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 512 5 | layers_num: 1 6 | linear_size: [1024, 1024] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 40 15 | nb_epoch: 30 16 | swa_warmup: 10 17 | 18 | valid: 19 | batch_size: 40 20 | 21 | predict: 22 | batch_size: 40 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/AttentionXML-Wiki10-31K.yaml: -------------------------------------------------------------------------------- 1 | name: AttentionXML 2 | 3 | model: 4 | hidden_size: 512 5 
| layers_num: 1 6 | linear_size: [1024, 1024] 7 | dropout: 0.5 8 | emb_trainable: False 9 | spn_dim: False 10 | no_grad: False 11 | without_negative: False 12 | 13 | train: 14 | batch_size: 16 15 | nb_epoch: 30 16 | swa_warmup: 4 17 | 18 | valid: 19 | batch_size: 32 20 | 21 | predict: 22 | batch_size: 40 23 | 24 | path: models 25 | -------------------------------------------------------------------------------- /attention-xml/configure/models/FastAttentionXML-Amazon-3M.yaml: -------------------------------------------------------------------------------- 1 | name: FastAttentionXML 2 | 3 | level: 4 4 | k: 8 5 | top: 160 6 | 7 | model: 8 | hidden_size: 512 9 | layers_num: 1 10 | linear_size: [512, 256] 11 | dropout: 0.5 12 | 13 | cluster: 14 | max_leaf: 8 15 | eps: 1e-4 16 | levels: [13, 16, 19] 17 | 18 | 19 | train: 20 | [{batch_size: 200, nb_epoch: 5, swa_warmup: 2}, 21 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}, 22 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}, 23 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}] 24 | 25 | valid: 26 | batch_size: 200 27 | 28 | predict: 29 | batch_size: 200 30 | 31 | path: models 32 | -------------------------------------------------------------------------------- /attention-xml/configure/models/FastAttentionXML-Amazon-670K.yaml: -------------------------------------------------------------------------------- 1 | name: FastAttentionXML 2 | 3 | level: 2 4 | k: 8 5 | top: 160 6 | 7 | model: 8 | hidden_size: 512 9 | layers_num: 1 10 | linear_size: [512, 256] 11 | dropout: 0.5 12 | 13 | cluster: 14 | max_leaf: 8 15 | eps: 1e-4 16 | levels: [11, 14, 17] 17 | 18 | train: 19 | [{batch_size: 128, nb_epoch: 10, swa_warmup: 6}, 20 | {batch_size: 128, nb_epoch: 10, swa_warmup: 2}, 21 | {batch_size: 128, nb_epoch: 10, swa_warmup: 2}, 22 | {batch_size: 128, nb_epoch: 10, swa_warmup: 2}] 23 | 24 | valid: 25 | batch_size: 200 26 | 27 | predict: 28 | batch_size: 200 29 | 30 | path: models 31 | -------------------------------------------------------------------------------- /attention-xml/configure/models/FastAttentionXML-Wiki-500K.yaml: -------------------------------------------------------------------------------- 1 | name: FastAttentionXML 2 | 3 | level: 2 4 | k: 64 5 | top: 15 6 | 7 | model: 8 | hidden_size: 512 9 | layers_num: 1 10 | linear_size: [512, 256] 11 | dropout: 0.5 12 | 13 | cluster: 14 | max_leaf: 64 15 | eps: 1e-4 16 | levels: [13] 17 | 18 | train: 19 | [{batch_size: 200, nb_epoch: 5, swa_warmup: 2}, 20 | {batch_size: 200, nb_epoch: 5, swa_warmup: 1}] 21 | 22 | valid: 23 | batch_size: 200 24 | 25 | predict: 26 | batch_size: 200 27 | 28 | path: models 29 | -------------------------------------------------------------------------------- /attention-xml/data/README.md: -------------------------------------------------------------------------------- 1 | # AttentionXML 2 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) 3 | 4 | ## Requirements 5 | 6 | * python==3.7.4 7 | * click==7.0 8 | * ruamel.yaml==0.16.5 9 | * numpy==1.16.2 10 | * scipy==1.3.1 11 | * scikit-learn==0.21.2 12 | * gensim==3.4.0 13 | * torch==1.0.1 14 | * nltk==3.4 15 | * tqdm==4.31.1 16 | * joblib==0.13.2 17 | * logzero==1.5.0 18 | 19 | ## Datasets 20 | 21 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 22 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 23 | * 
[AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 24 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 25 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 26 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 27 | 28 | Download the GloVe embedding (840B,300d) and convert it to gensim format (which can be loaded by **gensim.models.KeyedVectors.load**). 29 | 30 | We also provide a converted GloVe embedding at [here](https://drive.google.com/file/d/10w_HuLklGc8GA_FtUSdnHT8Yo1mxYziP/view?usp=sharing). 31 | 32 | ## XML Experiments 33 | 34 | XML experiments in paper can be run directly such as: 35 | ```bash 36 | ./scripts/run_eurlex.sh 37 | ``` 38 | ## Preprocess 39 | 40 | Run preprocess.py for train and test datasets with tokenized texts as follows: 41 | ```bash 42 | python preprocess.py \ 43 | --text-path data/EUR-Lex/train_texts.txt \ 44 | --label-path data/EUR-Lex/train_labels.txt \ 45 | --vocab-path data/EUR-Lex/vocab.npy \ 46 | --emb-path data/EUR-Lex/emb_init.npy \ 47 | --w2v-model data/glove.840B.300d.gensim 48 | 49 | python preprocess.py \ 50 | --text-path data/EUR-Lex/test_texts.txt \ 51 | --label-path data/EUR-Lex/test_labels.txt \ 52 | --vocab-path data/EUR-Lex/vocab.npy 53 | ``` 54 | 55 | Or run preprocss.py including tokenizing the raw texts by NLTK as follows: 56 | ```bash 57 | python preprocess.py \ 58 | --text-path data/Wiki10-31K/train_raw_texts.txt \ 59 | --tokenized-path data/Wiki10-31K/train_texts.txt \ 60 | --label-path data/Wiki10-31K/train_labels.txt \ 61 | --vocab-path data/Wiki10-31K/vocab.npy \ 62 | --emb-path data/Wiki10-31K/emb_init.npy \ 63 | --w2v-model data/glove.840B.300d.gensim 64 | 65 | python preprocess.py \ 66 | --text-path data/Wiki10-31K/test_raw_texts.txt \ 67 | --tokenized-path data/Wiki10-31K/test_texts.txt \ 68 | --label-path data/Wiki10-31K/test_labels.txt \ 69 | --vocab-path data/Wiki10-31K/vocab.npy 70 | ``` 71 | 72 | 73 | ## Train and Predict 74 | 75 | Train and predict as follows: 76 | ```bash 77 | python main.py --data-cnf configure/datasets/EUR-Lex.yaml --model-cnf configure/models/AttentionXML-EUR-Lex.yaml 78 | ``` 79 | 80 | Or do prediction only with option "--mode eval". 
81 | 82 | ## Ensemble 83 | 84 | Train and predict with an ensemble: 85 | ```bash 86 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 0 87 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 1 88 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 2 89 | python ensemble.py -p results/FastAttentionXML-Wiki-500K -t 3 90 | ``` 91 | 92 | ## Evaluation 93 | 94 | ```bash 95 | python evaluation.py --results results/AttentionXML-EUR-Lex-labels.npy --targets data/EUR-Lex/test_labels.npy 96 | ``` 97 | Or get propensity scored metrics together: 98 | 99 | ```bash 100 | python evaluation.py \ 101 | --results results/FastAttentionXML-Amazon-670K-labels.npy \ 102 | --targets data/Amazon-670K/test_labels.npy \ 103 | --train-labels data/Amazon-670K/train_labels.npy \ 104 | -a 0.6 \ 105 | -b 2.6 106 | 107 | ``` 108 | 109 | ## Reference 110 | You et al., [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727), NeurIPS 2019 111 | 112 | ## Declaration 113 | It is free for non-commercial use. For commercial use, please contact Mr. Ronghi You and Prof. Shanfeng Zhu (zhusf@fudan.edu.cn). -------------------------------------------------------------------------------- /attention-xml/deepxml/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/10/17 5 | @author yrh 6 | 7 | """ -------------------------------------------------------------------------------- /attention-xml/deepxml/cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/24 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | from scipy.sparse import csr_matrix, csc_matrix 12 | from sklearn.preprocessing import normalize 13 | from logzero import logger 14 | 15 | from deepxml.data_utils import get_sparse_feature 16 | 17 | 18 | __all__ = ['build_tree_by_level'] 19 | 20 | 21 | def build_tree_by_level(sparse_data_x, sparse_data_y, mlb, eps: float, max_leaf: int, levels: list, groups_path): 22 | os.makedirs(os.path.split(groups_path)[0], exist_ok=True) 23 | logger.info('Clustering') 24 | sparse_x, sparse_labels = get_sparse_feature(sparse_data_x, sparse_data_y) 25 | sparse_y = mlb.transform(sparse_labels) 26 | logger.info('Getting Labels Feature') 27 | labels_f = normalize(csr_matrix(sparse_y.T) @ csc_matrix(sparse_x)) 28 | logger.info(F'Start Clustering {levels}') 29 | levels, q = [2**x for x in levels], None 30 | for i in range(len(levels)-1, -1, -1): 31 | if os.path.exists(F'{groups_path}-Level-{i}.npy'): 32 | labels_list = np.load(F'{groups_path}-Level-{i}.npy', allow_pickle=True) 33 | q = [(labels_i, labels_f[labels_i]) for labels_i in labels_list] 34 | break 35 | if q is None: 36 | q = [(np.arange(labels_f.shape[0]), labels_f)] 37 | while q: 38 | labels_list = np.asarray([x[0] for x in q]) 39 | assert sum(len(labels) for labels in labels_list) == labels_f.shape[0] 40 | if len(labels_list) in levels: 41 | level = levels.index(len(labels_list)) 42 | logger.info(F'Finish Clustering Level-{level}') 43 | np.save(F'{groups_path}-Level-{level}.npy', np.asarray(labels_list)) 44 | else: 
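# Intermediate cluster count (not one of the requested 2**level sizes): just log progress; splitting continues below.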
45 | logger.info(F'Finish Clustering {len(labels_list)}') 46 | next_q = [] 47 | for node_i, node_f in q: 48 | if len(node_i) > max_leaf: 49 | next_q += list(split_node(node_i, node_f, eps)) 50 | q = next_q 51 | logger.info('Finish Clustering') 52 | 53 | 54 | def split_node(labels_i: np.ndarray, labels_f: csr_matrix, eps: float): 55 | n = len(labels_i) 56 | c1, c2 = np.random.choice(np.arange(n), 2, replace=False) 57 | centers, old_dis, new_dis = labels_f[[c1, c2]].toarray(), -10000.0, -1.0 58 | l_labels_i, r_labels_i = None, None 59 | while new_dis - old_dis >= eps: 60 | dis = labels_f @ centers.T # N, 2 61 | partition = np.argsort(dis[:, 1] - dis[:, 0]) 62 | l_labels_i, r_labels_i = partition[:n//2], partition[n//2:] 63 | old_dis, new_dis = new_dis, (dis[l_labels_i, 0].sum() + dis[r_labels_i, 1].sum()) / n 64 | centers = normalize(np.asarray([np.squeeze(np.asarray(labels_f[l_labels_i].sum(axis=0))), 65 | np.squeeze(np.asarray(labels_f[r_labels_i].sum(axis=0)))])) 66 | return (labels_i[l_labels_i], labels_f[l_labels_i]), (labels_i[r_labels_i], labels_f[r_labels_i]) 67 | -------------------------------------------------------------------------------- /attention-xml/deepxml/data_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | import joblib 12 | from collections import Counter 13 | from scipy import sparse 14 | from sklearn.preprocessing import MultiLabelBinarizer, normalize 15 | from sklearn.datasets import load_svmlight_file 16 | from gensim.models import KeyedVectors 17 | from tqdm import tqdm 18 | from typing import Union, Iterable 19 | 20 | 21 | __all__ = ['build_vocab', 'get_data', 'convert_to_binary', 'truncate_text', 'get_word_emb', 'get_mlb', 22 | 'get_sparse_feature', 'output_res'] 23 | 24 | 25 | def build_vocab(texts: Iterable, w2v_model: Union[KeyedVectors, str], vocab_size=500000, 26 | pad='', unknown='', sep='/SEP/', max_times=1, freq_times=1): 27 | if isinstance(w2v_model, str): 28 | # w2v_model = KeyedVectors.load(w2v_model, binary=True) 29 | w2v_model = KeyedVectors.load_word2vec_format(w2v_model, binary=True) 30 | emb_size = w2v_model.vector_size 31 | vocab, emb_init = [pad, unknown], [np.zeros(emb_size), np.random.uniform(-1.0, 1.0, emb_size)] 32 | counter = Counter(token for t in texts for token in set(t.split())) 33 | for word, freq in sorted(counter.items(), key=lambda x: (x[1], x[0] in w2v_model), reverse=True): 34 | if word in w2v_model or freq >= freq_times: 35 | vocab.append(word) 36 | # We used embedding of '.' as embedding of '/SEP/' symbol. 37 | word = '.' 
if word == sep else word 38 | emb_init.append(w2v_model[word] if word in w2v_model else np.random.uniform(-1.0, 1.0, emb_size)) 39 | if freq < max_times or vocab_size == len(vocab): 40 | break 41 | 42 | return np.asarray(vocab), np.asarray(emb_init) 43 | 44 | 45 | def get_word_emb(vec_path, vocab_path=None): 46 | if vocab_path is not None: 47 | with open(vocab_path) as fp: 48 | vocab = {word: idx for idx, word in enumerate(fp)} 49 | return np.load(vec_path, allow_pickle=True), vocab 50 | else: 51 | return np.load(vec_path, allow_pickle=True) 52 | 53 | 54 | def get_data(text_file, label_file=None): 55 | return np.load(text_file, allow_pickle=True), np.load(label_file, allow_pickle=True) if label_file is not None else None 56 | 57 | 58 | def convert_to_binary(text_file, label_file=None, max_len=None, vocab=None, pad='', unknown=''): 59 | with open(text_file) as fp: 60 | texts = np.asarray([[vocab.get(word, vocab[unknown]) for word in line.split()] 61 | for line in tqdm(fp, desc='Converting token to id', leave=False)]) 62 | labels = None 63 | if label_file is not None: 64 | with open(label_file) as fp: 65 | labels = np.asarray([[label for label in line.split()] 66 | for line in tqdm(fp, desc='Converting labels', leave=False)]) 67 | return truncate_text(texts, max_len, vocab[pad], vocab[unknown]), labels 68 | 69 | 70 | def truncate_text(texts, max_len=500, padding_idx=0, unknown_idx=1): 71 | if max_len is None: 72 | return texts 73 | texts = np.asarray([list(x[:max_len]) + [padding_idx] * (max_len - len(x)) for x in texts]) 74 | texts[(texts == padding_idx).all(axis=1), 0] = unknown_idx 75 | return texts 76 | 77 | def build_spn_labels(labels, split_idx=None, label_idx=None, max_len=0): 78 | if label_idx is None: 79 | label_idx = {} 80 | 81 | idx = 0 82 | spn_labels = [] 83 | for label_row in labels: 84 | spn_label_row = [] 85 | for l in label_row: 86 | if l not in label_idx: 87 | label_idx[l] = idx 88 | idx += 1 89 | 90 | spn_label_row.append(label_idx[l]) 91 | 92 | spn_labels.append(spn_label_row) 93 | if len(spn_label_row) > max_len: 94 | max_len = len(spn_label_row) 95 | 96 | # Add a END of set label for SPN. 97 | label_idx["SPN"] = idx 98 | idx += 1 99 | 100 | # Create a constant size matrix. 101 | # NOTE: This is for SPN label set. 102 | for i, row in enumerate(spn_labels): 103 | row = row + [label_idx["SPN"] for i in range(0, max_len - len(row))] 104 | spn_labels[i] = row 105 | 106 | spn_labels = sparse.csr_matrix(np.asarray(spn_labels)) 107 | return label_idx, spn_labels[0: split_idx], spn_labels[split_idx: ], idx 108 | 109 | 110 | def convert_to_spn(labels): 111 | sum = labels.sum(axis=1) 112 | max_len = sum.max() 113 | 114 | spn_labels = [[] for i in range(labels.shape[0])] 115 | row, col = labels.nonzero() 116 | for i in range(row.shape[0]): 117 | spn_labels[row[i]].append(col[i]) 118 | 119 | # Create a constant size matrix. 120 | # NOTE: This is for SPN label set. 121 | for i, row in enumerate(spn_labels): 122 | row = row + [labels.shape[1] for i in range(0, max_len - len(row))] 123 | spn_labels[i] = row 124 | 125 | 126 | spn_labels = sparse.csr_matrix(np.asarray(spn_labels)) 127 | return spn_labels 128 | 129 | def get_mlb(mlb_path, labels=None) -> MultiLabelBinarizer: 130 | if os.path.exists(mlb_path): 131 | return joblib.load(mlb_path) 132 | mlb = MultiLabelBinarizer(sparse_output=True) # Create a binarizer if one has not been created before. 
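# Fit on the training label sets and cache to disk so later runs (and the test split) reuse the same label ordering.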
133 | mlb.fit(labels) 134 | joblib.dump(mlb, mlb_path) 135 | return mlb 136 | 137 | 138 | def get_sparse_feature(feature_file, label_file): 139 | sparse_x, _ = load_svmlight_file(feature_file, multilabel=True) 140 | return normalize(sparse_x), np.load(label_file, allow_pickle=True) if label_file is not None else None 141 | 142 | 143 | def output_res(output_path, name, scores, labels): 144 | os.makedirs(output_path, exist_ok=True) 145 | np.save(os.path.join(output_path, F'{name}-scores'), scores) 146 | np.save(os.path.join(output_path, F'{name}-labels'), labels) 147 | -------------------------------------------------------------------------------- /attention-xml/deepxml/dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/10 5 | @author yrh 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | from torch.utils.data import Dataset 12 | from scipy.sparse import csr_matrix 13 | from tqdm import tqdm 14 | from typing import Sequence, Optional, Union 15 | 16 | 17 | __all__ = ['MultiLabelDataset', 'XMLDataset'] 18 | 19 | TDataX = Sequence[Sequence] 20 | TDataY = Optional[csr_matrix] 21 | TDataZ = Optional[csr_matrix] 22 | TCandidate = TGroup = Optional[np.ndarray] 23 | TGroupLabel = TGroupScore = Optional[Union[np.ndarray, torch.Tensor]] 24 | 25 | 26 | class MultiLabelDataset(Dataset): 27 | """ 28 | 29 | """ 30 | def __init__(self, data_x: TDataX, data_y: TDataY = None, 31 | spn_data_y: TDataZ = None, training=True): 32 | self.data_x, self.data_y, self.training = data_x, data_y, training 33 | self.spn_data_y = spn_data_y 34 | 35 | def __getitem__(self, item): 36 | data_x = self.data_x[item] 37 | if self.training and self.data_y is not None: 38 | data_y = self.data_y[item].toarray().squeeze(0).astype(np.float32) 39 | if self.spn_data_y is not None: 40 | spn_data_y = self.spn_data_y[item].toarray().squeeze(0).astype(np.long) 41 | return data_x, data_y, spn_data_y 42 | else: 43 | return data_x, data_y 44 | else: 45 | return data_x 46 | 47 | def __len__(self): 48 | return len(self.data_x) 49 | 50 | 51 | class XMLDataset(MultiLabelDataset): 52 | """ 53 | 54 | """ 55 | def __init__(self, data_x: TDataX, data_y: TDataY = None, training=True, 56 | labels_num=None, candidates: TCandidate = None, candidates_num=None, 57 | groups: TGroup = None, group_labels: TGroupLabel = None, group_scores: TGroupScore = None): 58 | super(XMLDataset, self).__init__(data_x, data_y, training) 59 | self.labels_num, self.candidates, self.candidates_num = labels_num, candidates, candidates_num 60 | self.groups, self.group_labels, self.group_scores = groups, group_labels, group_scores 61 | if self.candidates is None: 62 | self.candidates = [np.concatenate([self.groups[g] for g in group_labels]) 63 | for group_labels in tqdm(self.group_labels, leave=False, desc='Candidates')] 64 | if self.group_scores is not None: 65 | self.candidates_scores = [np.concatenate([[s] * len(self.groups[g]) 66 | for g, s in zip(group_labels, group_scores)]) 67 | for group_labels, group_scores in zip(self.group_labels, self.group_scores)] 68 | else: 69 | self.candidates_scores = [np.ones_like(candidates) for candidates in self.candidates] 70 | if self.candidates_num is None: 71 | self.candidates_num = self.group_labels.shape[1] * max(len(g) for g in groups) 72 | 73 | def __getitem__(self, item): 74 | data_x, candidates = self.data_x[item], np.asarray(self.candidates[item], dtype=np.int) 75 | if self.training and self.data_y 
is not None: 76 | if len(candidates) < self.candidates_num: 77 | sample = np.random.randint(self.labels_num, size=self.candidates_num - len(candidates)) 78 | candidates = np.concatenate([candidates, sample]) 79 | elif len(candidates) > self.candidates_num: 80 | candidates = np.random.choice(candidates, self.candidates_num, replace=False) 81 | data_y = self.data_y[item, candidates].toarray().squeeze(0).astype(np.float32) 82 | return (data_x, candidates), data_y 83 | else: 84 | scores = self.candidates_scores[item] 85 | if len(candidates) < self.candidates_num: 86 | scores = np.concatenate([scores, [-np.inf] * (self.candidates_num - len(candidates))]) 87 | candidates = np.concatenate([candidates, [self.labels_num] * (self.candidates_num - len(candidates))]) 88 | scores = np.asarray(scores, dtype=np.float32) 89 | return data_x, candidates, scores 90 | 91 | def __len__(self): 92 | return len(self.data_x) 93 | -------------------------------------------------------------------------------- /attention-xml/deepxml/evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import numpy as np 10 | from functools import partial 11 | from scipy.sparse import csr_matrix 12 | from sklearn.preprocessing import MultiLabelBinarizer 13 | from typing import Union, Optional, List, Iterable, Hashable 14 | 15 | 16 | __all__ = ['get_precision', 'get_p_1', 'get_p_3', 'get_p_5', 'get_p_10', 17 | 'get_ndcg', 'get_n_1', 'get_n_3', 'get_n_5', 'get_n_10', 18 | 'get_inv_propensity', 'get_psp', 19 | 'get_psp_1', 'get_psp_3', 'get_psp_5', 'get_psp_10', 20 | 'get_psndcg_1', 'get_psndcg_3', 'get_psndcg_5', 'get_psndcg_10'] 21 | 22 | TPredict = np.ndarray 23 | TTarget = Union[Iterable[Iterable[Hashable]], csr_matrix] 24 | TMlb = Optional[MultiLabelBinarizer] 25 | TClass = Optional[List[Hashable]] 26 | 27 | 28 | def get_mlb(classes: TClass = None, mlb: TMlb = None, targets: TTarget = None): 29 | if classes is not None: 30 | mlb = MultiLabelBinarizer(classes, sparse_output=True) 31 | if mlb is None and targets is not None: 32 | if isinstance(targets, csr_matrix): 33 | mlb = MultiLabelBinarizer(range(targets.shape[1]), sparse_output=True) 34 | mlb.fit(None) 35 | else: 36 | mlb = MultiLabelBinarizer(sparse_output=True) 37 | mlb.fit(targets) 38 | return mlb 39 | 40 | 41 | def get_precision(prediction: TPredict, targets: TTarget, mlb: TMlb = None, classes: TClass = None, top=5): 42 | mlb = get_mlb(classes, mlb, targets) 43 | if not isinstance(targets, csr_matrix): 44 | targets = mlb.transform(targets) 45 | prediction = mlb.transform(prediction[:, :top]) 46 | return prediction.multiply(targets).sum() / (top * targets.shape[0]) 47 | 48 | 49 | get_p_1 = partial(get_precision, top=1) 50 | get_p_3 = partial(get_precision, top=3) 51 | get_p_5 = partial(get_precision, top=5) 52 | get_p_10 = partial(get_precision, top=10) 53 | 54 | 55 | def get_ndcg(prediction: TPredict, targets: TTarget, mlb: TMlb = None, classes: TClass = None, top=5): 56 | mlb = get_mlb(classes, mlb, targets) 57 | log = 1.0 / np.log2(np.arange(top) + 2) 58 | dcg = np.zeros((targets.shape[0], 1)) 59 | if not isinstance(targets, csr_matrix): 60 | targets = mlb.transform(targets) 61 | for i in range(top): 62 | p = mlb.transform(prediction[:, i: i+1]) 63 | dcg += p.multiply(targets).sum(axis=-1) * log[i] 64 | return np.average(dcg / log.cumsum()[np.minimum(targets.sum(axis=-1), top) - 1]) 65 | 66 | 67 | get_n_1 = 
partial(get_ndcg, top=1) 68 | get_n_3 = partial(get_ndcg, top=3) 69 | get_n_5 = partial(get_ndcg, top=5) 70 | get_n_10 = partial(get_ndcg, top=10) 71 | 72 | 73 | def get_inv_propensity(train_y: csr_matrix, a=0.55, b=1.5): 74 | n, number = train_y.shape[0], np.asarray(train_y.sum(axis=0)).squeeze() 75 | c = (np.log(n) - 1) * ((b + 1) ** a) 76 | return 1.0 + c * (number + b) ** (-a) 77 | 78 | 79 | def get_psp(prediction: TPredict, targets: TTarget, inv_w: np.ndarray, mlb: TMlb = None, 80 | classes: TClass = None, top=5): 81 | mlb = get_mlb(classes, mlb) 82 | if not isinstance(targets, csr_matrix): 83 | targets = mlb.transform(targets) 84 | prediction = mlb.transform(prediction[:, :top]).multiply(inv_w) 85 | num = prediction.multiply(targets).sum() 86 | t, den = csr_matrix(targets.multiply(inv_w)), 0 87 | for i in range(t.shape[0]): 88 | den += np.sum(np.sort(t.getrow(i).data)[-top:]) 89 | return num / den 90 | 91 | 92 | get_psp_1 = partial(get_psp, top=1) 93 | get_psp_3 = partial(get_psp, top=3) 94 | get_psp_5 = partial(get_psp, top=5) 95 | get_psp_10 = partial(get_psp, top=10) 96 | 97 | 98 | def get_psndcg(prediction: TPredict, targets: TTarget, inv_w: np.ndarray, mlb: TMlb = None, 99 | classes: TClass = None, top=5): 100 | mlb = get_mlb(classes, mlb) 101 | log = 1.0 / np.log2(np.arange(top) + 2) 102 | psdcg = 0.0 103 | if not isinstance(targets, csr_matrix): 104 | targets = mlb.transform(targets) 105 | for i in range(top): 106 | p = mlb.transform(prediction[:, i: i+1]).multiply(inv_w) 107 | psdcg += p.multiply(targets).sum() * log[i] 108 | t, den = csr_matrix(targets.multiply(inv_w)), 0.0 109 | for i in range(t.shape[0]): 110 | num = min(top, len(t.getrow(i).data)) 111 | den += -np.sum(np.sort(-t.getrow(i).data)[:num] * log[:num]) 112 | return psdcg / den 113 | 114 | 115 | get_psndcg_1 = partial(get_psndcg, top=1) 116 | get_psndcg_3 = partial(get_psndcg, top=3) 117 | get_psndcg_5 = partial(get_psndcg, top=5) 118 | get_psndcg_10 = partial(get_psndcg, top=10) 119 | -------------------------------------------------------------------------------- /attention-xml/deepxml/lib: -------------------------------------------------------------------------------- 1 | ../../lib -------------------------------------------------------------------------------- /attention-xml/deepxml/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | from collections import deque 14 | from torch.utils.data import DataLoader 15 | from tqdm import tqdm 16 | from logzero import logger 17 | from typing import Optional, Mapping, Tuple 18 | 19 | import torch.optim as optim 20 | from deepxml.evaluation import get_p_5, get_n_5 21 | from deepxml.modules import * 22 | from deepxml.optimizers import * 23 | 24 | 25 | __all__ = ['Model', 'XMLModel'] 26 | 27 | 28 | class Model(object): 29 | """ 30 | 31 | """ 32 | def __init__(self, network, model_path, gradient_clip_value=5.0, device_ids=None, **kwargs): 33 | self.model = nn.DataParallel(network(**kwargs).cuda(), device_ids=device_ids) 34 | if self.model.module.use_spn: 35 | self.loss_fn = self.model.module.spp_loss 36 | else: 37 | self.loss_fn = nn.BCEWithLogitsLoss() 38 | 39 | self.model_path, self.state = model_path, {} 40 | os.makedirs(os.path.split(self.model_path)[0], exist_ok=True) 41 | self.gradient_clip_value, self.gradient_norm_queue = 
gradient_clip_value, deque([np.inf], maxlen=5) 42 | self.optimizer = None 43 | 44 | def train_step(self, train_x: torch.Tensor, train_y: torch.Tensor): 45 | self.optimizer.zero_grad() 46 | self.model.train() 47 | scores = self.model(train_x) 48 | loss = self.loss_fn(scores, train_y) 49 | loss.backward() 50 | if self.model.module.use_spn == False: 51 | self.clip_gradient() 52 | 53 | self.optimizer.step(closure=None) 54 | return loss.item() 55 | 56 | def predict_step(self, data_x: torch.Tensor, k: int): 57 | self.model.eval() 58 | with torch.no_grad(): 59 | if self.model.module.use_spn: 60 | s = self.model(data_x) 61 | y = self.model.module.inference(s, s.shape[0]) 62 | weights = self.model.module.class_vec.weight.t() 63 | all_scores = torch.abs(torch.mm(y, weights)[:,:-1]) 64 | 65 | scores, labels = torch.topk(all_scores, k) 66 | return scores.cpu(), labels.cpu() 67 | else: 68 | scores, labels = torch.topk(self.model(data_x), k) 69 | return torch.sigmoid(scores).cpu(), labels.cpu() 70 | 71 | def get_optimizer(self, **kwargs): 72 | self.optimizer = DenseSparseAdam(self.model.parameters(), **kwargs) 73 | 74 | def train(self, train_loader: DataLoader, valid_loader: DataLoader, opt_params: Optional[Mapping] = None, 75 | nb_epoch=100, step=100, k=5, early=50, verbose=True, swa_warmup=None, **kwargs): 76 | self.get_optimizer(**({} if opt_params is None else opt_params)) 77 | global_step, best_n5, e = 0, 0.0, 0 78 | for epoch_idx in range(nb_epoch): 79 | if epoch_idx == swa_warmup: 80 | self.swa_init() 81 | 82 | for i, data in enumerate(train_loader, 1): 83 | train_x, train_y = data[0], data[1] 84 | global_step += 1 85 | if self.model.module.use_spn: 86 | spn_train_y = data[2] 87 | loss = self.train_step(train_x, spn_train_y.cuda().long()) 88 | else: 89 | loss = self.train_step(train_x, train_y.cuda()) 90 | if global_step % step == 0: 91 | self.swa_step() 92 | self.swap_swa_params() 93 | targets = valid_loader.dataset.data_y 94 | labels = np.concatenate([self.predict_step(valid_x, k)[1] for valid_x in valid_loader]) 95 | 96 | p5, n5 = get_p_5(labels, targets), get_n_5(labels, targets) 97 | if n5 > best_n5: 98 | self.save_model() 99 | best_n5, e = n5, 0 100 | else: 101 | e += 1 102 | if early is not None and e > early: 103 | return 104 | self.swap_swa_params() 105 | if verbose: 106 | logger.info(F'{epoch_idx} {i * train_loader.batch_size} train loss: {round(loss, 5)} ' 107 | F'P@5: {round(p5, 5)} nDCG@5: {round(n5, 5)} early stop: {e}') 108 | 109 | def predict(self, data_loader: DataLoader, k=100, desc='Predict', **kwargs): 110 | self.load_model() 111 | scores_list, labels_list = zip(*(self.predict_step(data_x, k) 112 | for data_x in tqdm(data_loader, desc=desc, leave=False))) 113 | return np.concatenate(scores_list), np.concatenate(labels_list) 114 | 115 | def save_model(self): 116 | torch.save(self.model.module.state_dict(), self.model_path) 117 | 118 | def load_model(self): 119 | self.model.module.load_state_dict(torch.load(self.model_path)) 120 | 121 | def clip_gradient(self): 122 | if self.gradient_clip_value is not None: 123 | max_norm = max(self.gradient_norm_queue) 124 | total_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm * self.gradient_clip_value) 125 | self.gradient_norm_queue.append(min(total_norm, max_norm * 2.0, 1.0)) 126 | if total_norm > max_norm * self.gradient_clip_value: 127 | logger.warn(F'Clipping gradients with total norm {round(total_norm, 5)} ' 128 | F'and max norm {round(max_norm, 5)}') 129 | 130 | def swa_init(self): 131 | if 'swa' not in 
self.state: 132 | logger.info('SWA Initializing') 133 | swa_state = self.state['swa'] = {'models_num': 1} 134 | for n, p in self.model.named_parameters(): 135 | swa_state[n] = p.data.clone().detach() 136 | 137 | def swa_step(self): 138 | if 'swa' in self.state: 139 | swa_state = self.state['swa'] 140 | swa_state['models_num'] += 1 141 | beta = 1.0 / swa_state['models_num'] 142 | with torch.no_grad(): 143 | for n, p in self.model.named_parameters(): 144 | swa_state[n].mul_(1.0 - beta).add_(beta, p.data) 145 | 146 | def swap_swa_params(self): 147 | if 'swa' in self.state: 148 | swa_state = self.state['swa'] 149 | for n, p in self.model.named_parameters(): 150 | p.data, swa_state[n] = swa_state[n], p.data 151 | 152 | def disable_swa(self): 153 | if 'swa' in self.state: 154 | del self.state['swa'] 155 | 156 | 157 | class XMLModel(Model): 158 | """ 159 | 160 | """ 161 | def __init__(self, labels_num, hidden_size, device_ids=None, attn_device_ids=None, 162 | most_labels_parallel_attn=80000, **kwargs): 163 | parallel_attn = labels_num <= most_labels_parallel_attn 164 | super(XMLModel, self).__init__(hidden_size=hidden_size, device_ids=device_ids, labels_num=labels_num, 165 | parallel_attn=parallel_attn, **kwargs) 166 | self.network, self.attn_weights = self.model, nn.Sequential() 167 | if not parallel_attn: 168 | self.attn_weights = AttentionWeights(labels_num, hidden_size*2, attn_device_ids) 169 | self.model = nn.ModuleDict({'Network': self.network.module, 'AttentionWeights': self.attn_weights}) 170 | self.state['best'] = {} 171 | 172 | def train_step(self, train_x: Tuple[torch.Tensor, torch.Tensor], train_y: torch.Tensor): 173 | self.optimizer.zero_grad() 174 | train_x, candidates = train_x 175 | self.model.train() 176 | scores = self.network(train_x, candidates=candidates, attn_weights=self.attn_weights) 177 | loss = self.loss_fn(scores, train_y) 178 | loss.backward() 179 | # self.clip_gradient() 180 | self.optimizer.step(closure=None) 181 | return loss.item() 182 | 183 | def predict_step(self, data_x: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], k): 184 | data_x, candidates, group_scores = data_x 185 | self.model.eval() 186 | with torch.no_grad(): 187 | scores = torch.sigmoid(self.network(data_x, candidates=candidates, attn_weights=self.attn_weights)) 188 | scores, labels = torch.topk(scores * group_scores.cuda(), k) 189 | return scores.cpu(), candidates[np.arange(len(data_x)).reshape(-1, 1), labels.cpu()] 190 | 191 | def train(self, *args, **kwargs): 192 | super(XMLModel, self).train(*args, **kwargs) 193 | self.save_model_to_disk() 194 | 195 | def save_model(self): 196 | model_dict = self.model.state_dict() 197 | for key in model_dict: 198 | self.state['best'][key] = model_dict[key].cpu().detach() 199 | 200 | def save_model_to_disk(self): 201 | model_dict = self.model.state_dict() 202 | for key in model_dict: 203 | model_dict[key][:] = self.state['best'][key] 204 | torch.save(self.model.state_dict(), self.model_path) 205 | 206 | def load_model(self): 207 | self.model.load_state_dict(torch.load(self.model_path)) 208 | -------------------------------------------------------------------------------- /attention-xml/deepxml/modules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/29 5 | @author yrh 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | 15 | __all__ = ['Embedding', 'LSTMEncoder', 
'MLAttention', 'AttentionWeights', 'FastMLAttention', 'MLLinear'] 16 | 17 | 18 | class Embedding(nn.Module): 19 | """ 20 | 21 | """ 22 | def __init__(self, vocab_size=None, emb_size=None, emb_init=None, emb_trainable=True, padding_idx=0, dropout=0.2): 23 | super(Embedding, self).__init__() 24 | if emb_init is not None: 25 | if vocab_size is not None: 26 | assert vocab_size == emb_init.shape[0] 27 | if emb_size is not None: 28 | assert emb_size == emb_init.shape[1] 29 | vocab_size, emb_size = emb_init.shape 30 | self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx, sparse=True, 31 | _weight=torch.from_numpy(emb_init).float() if emb_init is not None else None) 32 | self.emb.weight.requires_grad = emb_trainable 33 | self.dropout = nn.Dropout(dropout) 34 | self.padding_idx = padding_idx 35 | 36 | def forward(self, inputs): 37 | emb_out = self.dropout(self.emb(inputs)) 38 | lengths, masks = (inputs != self.padding_idx).sum(dim=-1), inputs != self.padding_idx 39 | return emb_out[:, :lengths.max()], lengths, masks[:, :lengths.max()] 40 | 41 | 42 | class LSTMEncoder(nn.Module): 43 | """ 44 | 45 | """ 46 | def __init__(self, input_size, hidden_size, layers_num, dropout): 47 | super(LSTMEncoder, self).__init__() 48 | self.lstm = nn.LSTM(input_size, hidden_size, layers_num, batch_first=True, bidirectional=True) 49 | self.init_state = nn.Parameter(torch.zeros(2*2*layers_num, 1, hidden_size)) 50 | self.dropout = nn.Dropout(dropout) 51 | 52 | def forward(self, inputs, lengths, **kwargs): 53 | self.lstm.flatten_parameters() 54 | init_state = self.init_state.repeat([1, inputs.size(0), 1]) 55 | cell_init, hidden_init = init_state[:init_state.size(0)//2], init_state[init_state.size(0)//2:] 56 | idx = torch.argsort(lengths, descending=True) 57 | packed_inputs = nn.utils.rnn.pack_padded_sequence(inputs[idx], lengths[idx], batch_first=True) 58 | outputs, _ = nn.utils.rnn.pad_packed_sequence( 59 | self.lstm(packed_inputs, (hidden_init, cell_init))[0], batch_first=True) 60 | return self.dropout(outputs[torch.argsort(idx)]) 61 | 62 | 63 | class MLAttention(nn.Module): 64 | """ 65 | 66 | """ 67 | def __init__(self, labels_num, hidden_size): 68 | super(MLAttention, self).__init__() 69 | self.attention = nn.Linear(hidden_size, labels_num, bias=False) 70 | nn.init.xavier_uniform_(self.attention.weight) 71 | 72 | def forward(self, inputs, masks): 73 | masks = torch.unsqueeze(masks, 1) # N, 1, L 74 | attention = self.attention(inputs).transpose(1, 2).masked_fill(~masks, -np.inf) # N, labels_num, L 75 | attention = F.softmax(attention, -1) 76 | output = attention @ inputs 77 | output = output.mean(dim=1) # Take a mean of the vectors across axis=1. 
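        # NOTE: after the mean over dim=1 the tensor has shape (N, hidden_size); the
        # (N, labels_num, hidden_size) comment on the return below describes the pre-mean
        # attention output of the original AttentionXML head.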
78 | return output # N, labels_num, hidden_size 79 | 80 | 81 | class AttentionWeights(nn.Module): 82 | """ 83 | 84 | """ 85 | def __init__(self, labels_num, hidden_size, device_ids=None): 86 | super(AttentionWeights, self).__init__() 87 | if device_ids is None: 88 | device_ids = list(range(1, torch.cuda.device_count())) 89 | assert labels_num >= len(device_ids) 90 | group_size, plus_num = labels_num // len(device_ids), labels_num % len(device_ids) 91 | self.group = [group_size + 1] * plus_num + [group_size] * (len(device_ids) - plus_num) 92 | assert sum(self.group) == labels_num 93 | self.emb = nn.ModuleList(nn.Embedding(size, hidden_size, sparse=True).cuda(device_ids[i]) 94 | for i, size in enumerate(self.group)) 95 | std = (6.0 / (labels_num + hidden_size)) ** 0.5 96 | with torch.no_grad(): 97 | for emb in self.emb: 98 | emb.weight.data.uniform_(-std, std) 99 | self.group_offset, self.hidden_size = np.cumsum([0] + self.group), hidden_size 100 | 101 | def forward(self, inputs: torch.Tensor): 102 | outputs = torch.zeros(*inputs.size(), self.hidden_size, device=inputs.device) 103 | for left, right, emb in zip(self.group_offset[:-1], self.group_offset[1:], self.emb): 104 | index = (left <= inputs) & (inputs < right) 105 | group_inputs = (inputs[index] - left).to(emb.weight.device) 106 | outputs[index] = emb(group_inputs).to(inputs.device) 107 | return outputs 108 | 109 | 110 | class FastMLAttention(nn.Module): 111 | """ 112 | 113 | """ 114 | def __init__(self, labels_num, hidden_size, parallel_attn=False): 115 | super(FastMLAttention, self).__init__() 116 | if parallel_attn: 117 | self.attention = nn.Embedding(labels_num + 1, hidden_size, sparse=True) 118 | nn.init.xavier_uniform_(self.attention.weight) 119 | 120 | def forward(self, inputs, masks, candidates, attn_weights: nn.Module): 121 | masks = torch.unsqueeze(masks, 1) # N, 1, L 122 | attn_inputs = inputs.transpose(1, 2) # N, hidden, L 123 | attn_weights = self.attention(candidates) if hasattr(self, 'attention') else attn_weights(candidates) 124 | # attention = (attn_weights @ attn_inputs).masked_fill(1.0 - masks, -np.inf) # N, sampled_size, L 125 | attention = (attn_weights @ attn_inputs).masked_fill(~masks, -np.inf) # N, sampled_size, L 126 | attention = F.softmax(attention, -1) # N, sampled_size, L 127 | return attention @ inputs # N, sampled_size, hidden_size 128 | 129 | 130 | class MLLinear(nn.Module): 131 | """ 132 | """ 133 | def __init__(self, linear_size, output_size): 134 | super(MLLinear, self).__init__() 135 | self.linear = nn.ModuleList(nn.Linear(in_s, out_s) 136 | for in_s, out_s in zip(linear_size[:-1], 137 | linear_size[1:])) 138 | # Initialize each layer. 139 | for linear in self.linear: 140 | nn.init.xavier_uniform_(linear.weight) 141 | 142 | # NOTE: In the original architecture: 143 | # Output size is 1. 144 | # Giving an output of x, 1 tensor. 145 | 146 | # New Architecture: output size is label size. 
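        # NOTE: concretely, AttentionRNN (networks.py) builds
        #   MLLinear([hidden_size * 2] + linear_size, labels_num)   # baseline head
        #   MLLinear([hidden_size * 2] + linear_size, spn_dim)      # HRR/SPN head
        # while FastAttentionRNN keeps the original per-candidate output size of 1.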
147 | self.output = nn.Linear(linear_size[-1], output_size) 148 | nn.init.xavier_uniform_(self.output.weight) 149 | 150 | def forward(self, inputs): 151 | linear_out = inputs 152 | for linear in self.linear: 153 | linear_out = F.relu(linear(linear_out)) 154 | 155 | o = self.output(linear_out) 156 | return o 157 | -------------------------------------------------------------------------------- /attention-xml/deepxml/networks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | from deepxml.modules import * 13 | from deepxml.lib.embeddings import get_vectors 14 | from deepxml.lib.mathops import get_appx_inv, circular_conv, complexMagProj 15 | 16 | 17 | __all__ = ['AttentionRNN', 'FastAttentionRNN'] 18 | 19 | 20 | class Network(nn.Module): 21 | """ 22 | 23 | """ 24 | def __init__(self, emb_size, vocab_size=None, emb_init=None, 25 | emb_trainable=True, padding_idx=0, emb_dropout=0.2, 26 | **kwargs): 27 | super(Network, self).__init__() 28 | self.emb = Embedding(vocab_size, emb_size, emb_init, emb_trainable, padding_idx, emb_dropout) 29 | 30 | def forward(self, *args, **kwargs): 31 | raise NotImplementedError 32 | 33 | 34 | class AttentionRNN(Network): 35 | """ 36 | 37 | """ 38 | def __init__(self, labels_num, emb_size, hidden_size, layers_num, 39 | linear_size, dropout, use_spn, spn_dim, no_grad, 40 | without_negative, **kwargs): 41 | super(AttentionRNN, self).__init__(emb_size, **kwargs) 42 | self.use_spn = use_spn 43 | if self.use_spn: 44 | self.label_size = spn_dim 45 | self.no_grad = no_grad 46 | self.without_negative = without_negative 47 | else: 48 | self.label_size = labels_num 49 | 50 | self.num_labels = labels_num 51 | self.lstm = LSTMEncoder(emb_size, hidden_size, layers_num, dropout) 52 | self.attention = MLAttention(self.label_size, hidden_size * 2) 53 | self.linear = MLLinear([hidden_size * 2] + linear_size, self.label_size) 54 | 55 | if self.use_spn: 56 | self.create_label_embedding() # Create the labels. 57 | 58 | def create_label_embedding(self): 59 | # Class labels. # +1 for the END of LIST Label. 60 | self._class_vectors = get_vectors(self.num_labels + 1, self.label_size) 61 | 62 | # Initialize embedding layer. 63 | self.class_vec = nn.Embedding(self.num_labels + 1, self.label_size) 64 | self.class_vec.load_state_dict({'weight': self._class_vectors}) 65 | self.class_vec.weight.requires_grad = False 66 | 67 | # Initialize weights vector. 68 | weights = torch.ones((self.num_labels + 1, 1), dtype=torch.int8) 69 | weights[self.num_labels] = 0 # Padding vector is made 0. 70 | self.class_weights = nn.Embedding(self.num_labels + 1, 1) 71 | self.class_weights.load_state_dict({'weight': weights}) 72 | self.class_weights.weight.requires_grad = False 73 | 74 | # P & N vectors. 
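        # NOTE: p and n act as the HRR "positive" / "negative" role vectors. A minimal
        # decoding sketch, mirroring inference() and Model.predict_step in models.py
        # (not extra functionality):
        #   y_hat  = circular_conv(get_appx_inv(self.p), s)   # unbind the positive role
        #   scores = y_hat @ self.class_vec.weight.t()        # compare with label vectors
        # get_vectors, get_appx_inv and circular_conv are the deepxml.lib helpers imported
        # at the top of this file.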
75 | p_n_vec = get_vectors(2, self.label_size, ortho=True) 76 | if self.no_grad: 77 | print("P & N vectors WILL NOT be updated while training...") 78 | self.p = nn.Parameter(p_n_vec[0], requires_grad=False) 79 | self.n = nn.Parameter(p_n_vec[1], requires_grad=False) 80 | else: 81 | print("P & N vectors WILL be updated while training...") 82 | self.p = nn.Parameter(p_n_vec[0], requires_grad=True) 83 | self.n = nn.Parameter(p_n_vec[1], requires_grad=True) 84 | 85 | 86 | def inference(self, s, batch_size, positive=True): 87 | #(batch, dims) 88 | if positive: 89 | vec = self.p.unsqueeze(0).expand(batch_size, self.label_size) 90 | else: 91 | vec = self.n.unsqueeze(0).expand(batch_size, self.label_size) 92 | 93 | # vec = complexMagProj(vec) 94 | inv_vec = get_appx_inv(vec) 95 | y = circular_conv(inv_vec, s) #(batch, dims) 96 | y = y / (torch.norm(y, dim=-1, keepdim=True) + 1e-8) 97 | return y 98 | 99 | def spp_loss(self, s, target): 100 | """ 101 | Train with SPP. 102 | """ 103 | pos_classes = self.class_vec(target) #(batch, no_label, dims) 104 | pos_classes = pos_classes * self.class_weights(target) # exit(0) 105 | 106 | # Normalize the class vectors. 107 | # tgt_shape = pos_classes.shape 108 | # pos_classes = torch.reshape(pos_classes, (tgt_shape[0] * tgt_shape[1], 109 | # tgt_shape[2])) 110 | # pos_classes = torch.reshape(complexMagProj(pos_classes), (tgt_shape[0], tgt_shape[1], 111 | # tgt_shape[2])) 112 | 113 | # Remove the padding idx vectors. 114 | # pos_classes = pos_classes.to(device) 115 | 116 | # Positive prediction loss 117 | convolve = self.inference(s, target.size(0)) 118 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 119 | J_p = torch.mean(torch.sum(1 - torch.abs(cosine), dim=-1)) 120 | 121 | # Negative prediction loss. 122 | J_n = 0.0 123 | if self.without_negative is False: 124 | convolve = self.inference(s, target.size(0), positive=False) 125 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 126 | J_n = torch.mean(torch.sum(torch.abs(cosine), dim=-1)) 127 | 128 | # Total Loss. 
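        # NOTE: in symbols, J_p penalises 1 - |cos(c_i, inv(p) (*) s)| for each bound class
        # vector c_i, and J_n penalises |cos(c_i, inv(n) (*) s)|, where (*) is circular
        # convolution; J_n is skipped when self.without_negative is True.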
129 | loss = J_n + J_p 130 | return loss 131 | 132 | 133 | def forward(self, inputs, **kwargs): 134 | emb_out, lengths, masks = self.emb(inputs, **kwargs) 135 | rnn_out = self.lstm(emb_out, lengths) # N, L, hidden_size * 2 (Bidirectional RNN) 136 | attn_out = self.attention(rnn_out, masks) # N, labels_num, hidden_size * 2 137 | return self.linear(attn_out) 138 | 139 | class FastAttentionRNN(Network): 140 | """ 141 | 142 | """ 143 | def __init__(self, labels_num, emb_size, hidden_size, layers_num, linear_size, dropout, parallel_attn, **kwargs): 144 | super(FastAttentionRNN, self).__init__(emb_size, **kwargs) 145 | self.lstm = LSTMEncoder(emb_size, hidden_size, layers_num, dropout) 146 | self.attention = FastMLAttention(labels_num, hidden_size * 2, parallel_attn) 147 | self.linear = MLLinear([hidden_size * 2] + linear_size, 1) 148 | 149 | def forward(self, inputs, candidates, attn_weights: nn.Module, **kwargs): 150 | emb_out, lengths, masks = self.emb(inputs, **kwargs) 151 | rnn_out = self.lstm(emb_out, lengths) # N, L, hidden_size * 2 152 | attn_out = self.attention(rnn_out, masks, candidates, attn_weights) # N, sampled_size, hidden_size * 2 153 | return self.linear(attn_out) 154 | -------------------------------------------------------------------------------- /attention-xml/deepxml/optimizers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/3/7 5 | @author yrh 6 | 7 | """ 8 | 9 | import math 10 | import torch 11 | from torch.optim.optimizer import Optimizer 12 | 13 | 14 | __all__ = ['DenseSparseAdam'] 15 | 16 | 17 | class DenseSparseAdam(Optimizer): 18 | """ 19 | 20 | """ 21 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0): 22 | if not 0.0 <= lr: 23 | raise ValueError("Invalid learning rate: {}".format(lr)) 24 | if not 0.0 <= eps: 25 | raise ValueError("Invalid epsilon value: {}".format(eps)) 26 | if not 0.0 <= betas[0] < 1.0: 27 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 28 | if not 0.0 <= betas[1] < 1.0: 29 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 30 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 31 | super(DenseSparseAdam, self).__init__(params, defaults) 32 | 33 | def step(self, closure=None): 34 | """ 35 | Performs a single optimization step. 36 | 37 | Parameters 38 | ---------- 39 | closure : ``callable``, optional. 40 | A closure that reevaluates the model and returns the loss. 
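
        Returns
        -------
        loss : the value returned by ``closure`` when one is supplied, otherwise ``None``.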
41 | """ 42 | loss = None 43 | if closure is not None: 44 | loss = closure() 45 | 46 | for group in self.param_groups: 47 | for p in group['params']: 48 | if p.grad is None: 49 | continue 50 | grad = p.grad.data 51 | 52 | state = self.state[p] 53 | 54 | # State initialization 55 | if 'step' not in state: 56 | state['step'] = 0 57 | if 'exp_avg' not in state: 58 | # Exponential moving average of gradient values 59 | state['exp_avg'] = torch.zeros_like(p.data) 60 | if 'exp_avg_sq' not in state: 61 | # Exponential moving average of squared gradient values 62 | state['exp_avg_sq'] = torch.zeros_like(p.data) 63 | 64 | state['step'] += 1 65 | 66 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 67 | beta1, beta2 = group['betas'] 68 | 69 | weight_decay = group['weight_decay'] 70 | 71 | if grad.is_sparse: 72 | grad = grad.coalesce() # the update is non-linear so indices must be unique 73 | grad_indices = grad._indices() 74 | grad_values = grad._values() 75 | size = grad.size() 76 | 77 | def make_sparse(values): 78 | constructor = grad.new 79 | if grad_indices.dim() == 0 or values.dim() == 0: 80 | return constructor().resize_as_(grad) 81 | return constructor(grad_indices, values, size) 82 | 83 | # Decay the first and second moment running average coefficient 84 | # old <- b * old + (1 - b) * new 85 | # <==> old += (1 - b) * (new - old) 86 | old_exp_avg_values = exp_avg.sparse_mask(grad)._values() 87 | exp_avg_update_values = grad_values.sub(old_exp_avg_values).mul_(1 - beta1) 88 | exp_avg.add_(make_sparse(exp_avg_update_values)) 89 | old_exp_avg_sq_values = exp_avg_sq.sparse_mask(grad)._values() 90 | exp_avg_sq_update_values = grad_values.pow(2).sub_(old_exp_avg_sq_values).mul_(1 - beta2) 91 | exp_avg_sq.add_(make_sparse(exp_avg_sq_update_values)) 92 | 93 | # Dense addition again is intended, avoiding another sparse_mask 94 | numer = exp_avg_update_values.add_(old_exp_avg_values) 95 | exp_avg_sq_update_values.add_(old_exp_avg_sq_values) 96 | denom = exp_avg_sq_update_values.sqrt_().add_(group['eps']) 97 | del exp_avg_update_values, exp_avg_sq_update_values 98 | 99 | bias_correction1 = 1 - beta1 ** state['step'] 100 | bias_correction2 = 1 - beta2 ** state['step'] 101 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 102 | 103 | p.data.add_(make_sparse(-step_size * numer.div_(denom))) 104 | if weight_decay > 0.0: 105 | p.data.add_(-group['lr'] * weight_decay, p.data.sparse_mask(grad)) 106 | else: 107 | # Decay the first and second moment running average coefficient 108 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 109 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 110 | denom = exp_avg_sq.sqrt().add_(group['eps']) 111 | 112 | bias_correction1 = 1 - beta1 ** state['step'] 113 | bias_correction2 = 1 - beta2 ** state['step'] 114 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 115 | 116 | p.data.addcdiv_(-step_size, exp_avg, denom) 117 | if weight_decay > 0.0: 118 | p.data.add_(-group['lr'] * weight_decay, p.data) 119 | 120 | return loss 121 | -------------------------------------------------------------------------------- /attention-xml/ensemble.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/6/11 5 | @author yrh 6 | 7 | """ 8 | 9 | import click 10 | import numpy as np 11 | from collections import defaultdict 12 | from tqdm import tqdm 13 | 14 | 15 | @click.command() 16 | @click.option('-p', '--prefix', help='Prefix of 
results.') 17 | @click.option('-t', '--trees', type=click.INT, help='The number of results using for ensemble.') 18 | def main(prefix, trees): 19 | labels, scores = [], [] 20 | for i in range(trees): 21 | labels.append(np.load(F'{prefix}-Tree-{i}-labels.npy', allow_pickle=True)) 22 | scores.append(np.load(F'{prefix}-Tree-{i}-scores.npy', allow_pickle=True)) 23 | ensemble_labels, ensemble_scores = [], [] 24 | for i in tqdm(range(len(labels[0]))): 25 | s = defaultdict(float) 26 | for j in range(len(labels[0][i])): 27 | for k in range(trees): 28 | s[labels[k][i][j]] += scores[k][i][j] 29 | s = sorted(s.items(), key=lambda x: x[1], reverse=True) 30 | ensemble_labels.append([x[0] for x in s[:len(labels[0][i])]]) 31 | ensemble_scores.append([x[1] for x in s[:len(labels[0][i])]]) 32 | np.save(F'{prefix}-Ensemble-labels', np.asarray(ensemble_labels)) 33 | np.save(F'{prefix}-Ensemble-scores', np.asarray(ensemble_scores)) 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /attention-xml/evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/8/21 5 | @author yrh 6 | """ 7 | 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | import click 12 | import numpy as np 13 | from sklearn.preprocessing import MultiLabelBinarizer 14 | 15 | from deepxml.evaluation import * 16 | 17 | 18 | @click.command() 19 | @click.option('-r', '--results', type=click.Path(exists=True), help='Path of results.') 20 | @click.option('-t', '--targets', type=click.Path(exists=True), help='Path of targets.') 21 | @click.option('--train-labels', type=click.Path(exists=True), default=None, help='Path of labels for training set.') 22 | @click.option('-a', type=click.FLOAT, default=0.55, help='Parameter A for propensity score.') 23 | @click.option('-b', type=click.FLOAT, default=1.5, help='Parameter B for propensity score.') 24 | def main(results, targets, train_labels, a, b): 25 | res, targets = np.load(results, allow_pickle=True), np.load(targets, allow_pickle=True) 26 | mlb = MultiLabelBinarizer(sparse_output=True) 27 | targets = mlb.fit_transform(targets) 28 | print('Precision@1,3,5:', get_p_1(res, targets, mlb), get_p_3(res, targets, mlb), get_p_5(res, targets, mlb)) 29 | print('nDCG@1,3,5:', get_n_1(res, targets, mlb), get_n_3(res, targets, mlb), get_n_5(res, targets, mlb)) 30 | if train_labels is not None: 31 | train_labels = np.load(train_labels, allow_pickle=True) 32 | inv_w = get_inv_propensity(mlb.transform(train_labels), a, b) 33 | print('PSPrecision@1,3,5:', get_psp_1(res, targets, inv_w, mlb), get_psp_3(res, targets, inv_w, mlb), 34 | get_psp_5(res, targets, inv_w, mlb)) 35 | print('PSnDCG@1,3,5:', get_psndcg_1(res, targets, inv_w, mlb), get_psndcg_3(res, targets, inv_w, mlb), 36 | get_psndcg_5(res, targets, inv_w, mlb)) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /attention-xml/experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # set -vx 3 | 4 | # AUTHOR: Ashwinkumar Ganesan. 5 | 6 | # NOTE: Usage:- 7 | # 1. ./experiments.sh all (for running experiments on all datasets). 8 | # 2. ./experiments.sh (for running experiments on a specific dataset). 9 | # 3. ./experiments.sh gather (for gathering the precision only). 10 | 11 | # Config. 
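# Positional arguments, in order: NAME EXP_NAME MODEL_TYPE DIMS WITH_GRAD WITHOUT_NEGATIVE
# PROP_A PROP_B. An illustrative invocation (values are examples only):
#   ./experiments.sh EUR-Lex test hrr 400 grad with-negative 0.55 1.5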
12 | NAME=${1:-"all"} 13 | MEM=256000 14 | SAVE_LOC="results" 15 | EXP_NAME=${2:-"test"} 16 | MODEL_TYPE=${3:-"all"} 17 | DIMS=${4:-400} 18 | WITH_GRAD=${5:-"grad"} # no-grad for training with no gradient to p & n vectors. 19 | WITHOUT_NEGATIVE=${6:-"with-negative"} # without-negative for training. 20 | PROP_A=${7:-"0.55"} # Propensity value A. 21 | PROP_B=${8:-"1.5"} # Propensity value B. 22 | 23 | create_job () { 24 | echo "Location to save model: $SAVE_LOC/$1 ..." 25 | if [[ ( "$MODEL_TYPE" == "all" ) ]]; then 26 | echo "Creating jobs for both models..." 27 | sbatch --job-name=$1-${DIMS}-all --mem=$MEM --array=0-1 --exclude=node[17-32] train.slurm.sh \ 28 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 29 | elif [[ ( "$MODEL_TYPE" == "baseline" ) ]]; then 30 | echo "Creating jobs for baseline model..." 31 | sbatch --job-name=$1-${DIMS}-base --mem=$MEM --array=0 --exclude=node[17-32] train.slurm.sh \ 32 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 33 | elif [[ ( "$MODEL_TYPE" == "hrr" ) ]]; then 34 | echo "Creating jobs for HRR model..." 35 | sbatch --job-name=$1-${DIMS}-hrr --mem=$MEM --array=1 --exclude=node[17-32] train.slurm.sh \ 36 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 37 | fi 38 | } 39 | 40 | # NOTE: Individual jobs for each dataset are easier to track. 41 | # This keeps the SLURM files simple. 42 | 43 | # Eurlex dataset. 44 | if [[ ( "$NAME" == "EUR-Lex" ) || ( "$NAME" == "all" ) ]] 45 | then 46 | create_job EUR-Lex $WITH_GRAD $WITHOUT_NEGATIVE 47 | fi 48 | 49 | # Wiki30k dataset. 50 | if [[ ( "$NAME" == "Wiki10-31K" ) || ( "$NAME" == "all" ) ]] 51 | then 52 | create_job Wiki10-31K $WITH_GRAD $WITHOUT_NEGATIVE 53 | fi 54 | 55 | # AmazonCat-13K dataset. 56 | if [[ ( "$NAME" == "AmazonCat-13K" ) || ( "$NAME" == "all" ) ]] 57 | then 58 | create_job AmazonCat-13K $WITH_GRAD $WITHOUT_NEGATIVE 59 | fi 60 | 61 | # Amazon-670K dataset. 
62 | if [[ ( "$NAME" == "Amazon-670K" ) || ( "$NAME" == "all" ) ]] 63 | then 64 | create_job Amazon-670K $WITH_GRAD $WITHOUT_NEGATIVE 65 | fi 66 | -------------------------------------------------------------------------------- /attention-xml/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2018/12/9 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import click 11 | import numpy as np 12 | from pathlib import Path 13 | from ruamel.yaml import YAML 14 | from sklearn.model_selection import train_test_split 15 | from torch.utils.data import DataLoader 16 | from logzero import logger 17 | 18 | from torch.nn import DataParallel 19 | from pytorch_model_summary import summary 20 | from deepxml.dataset import MultiLabelDataset 21 | from deepxml.data_utils import get_data, get_mlb, get_word_emb, output_res, build_spn_labels, convert_to_spn 22 | from deepxml.models import Model 23 | from deepxml.tree import FastAttentionXML 24 | from deepxml.networks import AttentionRNN 25 | 26 | 27 | @click.command() 28 | @click.option('-d', '--data-cnf', type=click.Path(exists=True), help='Path of dataset configure yaml.') 29 | @click.option('-m', '--model-cnf', type=click.Path(exists=True), help='Path of model configure yaml.') 30 | @click.option('--mode', type=click.Choice(['train', 'eval']), default=None) 31 | @click.option('-t', '--tree-id', type=click.INT, default=None) 32 | def main(data_cnf, model_cnf, mode, tree_id): 33 | tree_id = F'-Tree-{tree_id}' if tree_id is not None else '' 34 | yaml = YAML(typ='safe') 35 | data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf)) 36 | model, model_name, data_name = None, model_cnf['name'], data_cnf['name'] 37 | dim_size = model_cnf['model']['spn_dim'] if model_cnf['model']['spn_dim'] is not False else 0 38 | model_path = os.path.join(model_cnf['path'], F'{model_name}-{data_name}{tree_id}-{dim_size}') 39 | emb_init = get_word_emb(data_cnf['embedding']['emb_init']) 40 | logger.info(F'Model Name: {model_name}') 41 | 42 | # NOTE: The training and validation labels are a list of textual labels/ row. 
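    # For example (label ids are illustrative), train_labels looks roughly like
    #   np.array([['386', '4127'], ['29'], ...], dtype=object)
    # i.e. one variable-length list of label strings per document; the MultiLabelBinarizer
    # below turns these lists into sparse indicator matrices.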
43 | if mode is None or mode == 'train': 44 | logger.info('Loading Training and Validation Set') 45 | train_x, train_labels = get_data(data_cnf['train']['texts'], data_cnf['train']['labels']) 46 | if 'size' in data_cnf['valid']: 47 | random_state = data_cnf['valid'].get('random_state', 1240) 48 | train_x, valid_x, train_labels, valid_labels = train_test_split(train_x, train_labels, 49 | test_size=data_cnf['valid']['size'], 50 | random_state=random_state) 51 | else: 52 | valid_x, valid_labels = get_data(data_cnf['valid']['texts'], data_cnf['valid']['labels']) 53 | mlb = get_mlb(data_cnf['labels_binarizer'], np.hstack((train_labels, valid_labels))) 54 | train_y, valid_y = mlb.transform(train_labels), mlb.transform(valid_labels) 55 | labels_num = len(mlb.classes_) 56 | logger.info(F'Number of Labels: {labels_num}') 57 | logger.info(F'Size of Training Set: {len(train_x)}') 58 | logger.info(F'Size of Validation Set: {len(valid_x)}') 59 | 60 | if data_cnf['use_spn']: 61 | logger.info(F'Processing SPN Labels...') 62 | spn_train_labels = convert_to_spn(train_y) 63 | spn_valid_labels = convert_to_spn(valid_y) 64 | 65 | logger.info(F'Number of SPN Labels: {labels_num + 1}') 66 | logger.info(F'Maximum label in single row: {spn_train_labels.shape[1]}') 67 | logger.info(F'Training labels: {spn_train_labels.shape}') 68 | logger.info(F'Validation labels: {spn_valid_labels.shape}') 69 | 70 | logger.info('Training') 71 | if 'cluster' not in model_cnf: 72 | if data_cnf['use_spn']: 73 | train_loader = DataLoader(MultiLabelDataset(train_x, train_y, spn_train_labels), 74 | model_cnf['train']['batch_size'], shuffle=True, num_workers=4) 75 | valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, spn_valid_labels, training=False), 76 | model_cnf['valid']['batch_size'], num_workers=4) 77 | model = Model(network=AttentionRNN, labels_num=labels_num, model_path=model_path, 78 | emb_init=emb_init, use_spn=data_cnf['use_spn'], 79 | **data_cnf['model'], **model_cnf['model']) 80 | else: 81 | train_loader = DataLoader(MultiLabelDataset(train_x, train_y), 82 | model_cnf['train']['batch_size'], shuffle=True, num_workers=4) 83 | valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, training=False), 84 | model_cnf['valid']['batch_size'], num_workers=4) 85 | model = Model(network=AttentionRNN, labels_num=labels_num, model_path=model_path, 86 | emb_init=emb_init, use_spn=data_cnf['use_spn'], 87 | **data_cnf['model'], **model_cnf['model']) 88 | 89 | # Print Summary. 
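            # NOTE: `summary` from pytorch_model_summary is imported at the top of this file
            # but is not called here; the comment above appears to be a placeholder.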
90 | model.train(train_loader, valid_loader, **model_cnf['train']) 91 | else: 92 | model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id) 93 | model.train(train_x, train_y, valid_x, valid_y, mlb) 94 | logger.info('Finish Training') 95 | 96 | if mode is None or mode == 'eval': 97 | logger.info('Loading Test Set') 98 | mlb = get_mlb(data_cnf['labels_binarizer']) 99 | labels_num = len(mlb.classes_) 100 | test_x, _ = get_data(data_cnf['test']['texts'], None) 101 | logger.info(F'Size of Test Set: {len(test_x)}') 102 | 103 | logger.info('Predicting') 104 | if 'cluster' not in model_cnf: 105 | test_loader = DataLoader(MultiLabelDataset(test_x), model_cnf['predict']['batch_size'], 106 | num_workers=4) 107 | if model is None: 108 | model = DataParallel(Model(network=AttentionRNN, labels_num=labels_num, model_path=model_path, emb_init=emb_init, 109 | **data_cnf['model'], **model_cnf['model'])) 110 | scores, labels = model.predict(test_loader, k=model_cnf['predict'].get('k', 100)) 111 | else: 112 | if model is None: 113 | model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id) 114 | scores, labels = model.predict(test_x) 115 | logger.info('Finish Predicting') 116 | labels = mlb.classes_[labels] 117 | output_res(data_cnf['output']['res'], F'{model_name}-{dim_size}-{data_name}{tree_id}', scores, labels) 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /attention-xml/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 3 | """ 4 | Created on 2019/1/20 5 | @author yrh 6 | 7 | """ 8 | 9 | import os 10 | import re 11 | import click 12 | import numpy as np 13 | from nltk.tokenize import word_tokenize 14 | from tqdm import tqdm 15 | from logzero import logger 16 | 17 | from deepxml.data_utils import * 18 | 19 | 20 | def tokenize(sentence: str, sep='/SEP/'): 21 | # We added a /SEP/ symbol between titles and descriptions such as Amazon datasets. 22 | return [token.lower() if token != sep else token for token in word_tokenize(sentence) 23 | if len(re.sub(r'[^\w]', '', token)) > 0] 24 | 25 | 26 | @click.command() 27 | @click.option('--text-path', type=click.Path(exists=True), help='Path of text.') 28 | @click.option('--tokenized-path', type=click.Path(), default=None, help='Path of tokenized text.') 29 | @click.option('--label-path', type=click.Path(exists=True), default=None, help='Path of labels.') 30 | @click.option('--vocab-path', type=click.Path(), default=None, 31 | help='Path of vocab, if it doesn\'t exit, build one and save it.') 32 | @click.option('--emb-path', type=click.Path(), default=None, help='Path of word embedding.') 33 | @click.option('--w2v-model', type=click.Path(), default=None, help='Path of Gensim Word2Vec Model.') 34 | @click.option('--vocab-size', type=click.INT, default=500000, help='Size of vocab.') 35 | @click.option('--max-len', type=click.INT, default=500, help='Truncated length.') 36 | @click.option('--add-spn', type=click.BOOL, default=False, help='Add SPN labels to the dataset.') 37 | def main(text_path, tokenized_path, label_path, vocab_path, emb_path, w2v_model, 38 | vocab_size, max_len, add_spn): 39 | if tokenized_path is not None: 40 | logger.info(F'Tokenizing Text. 
{text_path}') 41 | with open(text_path) as fp, open(tokenized_path, 'w') as fout: 42 | for line in tqdm(fp, desc='Tokenizing'): 43 | print(*tokenize(line), file=fout) 44 | text_path = tokenized_path 45 | 46 | if not os.path.exists(vocab_path): 47 | logger.info(F'Building Vocab. {text_path}') 48 | logger.info(F'Embedding Path. {w2v_model}') 49 | with open(text_path) as fp: 50 | vocab, emb_init = build_vocab(fp, w2v_model, vocab_size=vocab_size) 51 | np.save(vocab_path, vocab) 52 | np.save(emb_path, emb_init) 53 | 54 | vocab = {word: i for i, word in enumerate(np.load(vocab_path))} 55 | logger.info(F'Vocab Size: {len(vocab)}') 56 | 57 | logger.info(F'Getting Dataset: {text_path} Max Length: {max_len}') 58 | texts, labels = convert_to_binary(text_path, label_path, max_len, vocab) 59 | logger.info(F'Size of Samples: {len(texts)}') 60 | np.save(os.path.splitext(text_path)[0], texts) 61 | if labels is not None: 62 | assert len(texts) == len(labels) 63 | np.save(os.path.splitext(label_path)[0], labels) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /attention-xml/requirements.txt: -------------------------------------------------------------------------------- 1 | click==7.0 2 | ruamel.yaml==0.16.5 3 | numpy==1.16.2 4 | scipy==1.3.1 5 | scikit-learn==0.21.2 6 | gensim==3.4.0 7 | torch==1.0.1 8 | nltk==3.4 9 | tqdm==4.31.1 10 | joblib==0.13.2 11 | logzero==1.5.0 12 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_amazon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Amazon-670K 4 | MODEL=FastAttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy \ 13 | -a 0.6 \ 14 | -b 2.6 15 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_amazon3m.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Amazon-3M 4 | MODEL=FastAttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy \ 13 | -a 0.6 \ 14 | -b 2.6 15 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_amazoncat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=AmazonCat-13K 4 | MODEL=AttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy 13 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_eurlex.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=EUR-Lex 4 | MODEL=AttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results 
results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy 13 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $1 == "EUR-Lex" ]; then 4 | TRAIN_TEXT="--text-path data/$1/train_texts.txt" 5 | TEST_TEXT="--text-path data/$1/test_texts.txt" 6 | else 7 | TRAIN_TEXT="--text-path data/$1/train_raw_texts.txt --tokenized-path data/$1/train_texts.txt" 8 | TEST_TEXT="--text-path data/$1/test_raw_texts.txt --tokenized-path data/$1/test_texts.txt" 9 | fi 10 | 11 | if [ ! -f data/$1/train_texts.npy ]; then 12 | python preprocess.py $TRAIN_TEXT --label-path data/$1/train_labels.txt --vocab-path data/$1/vocab.npy --emb-path data/$1/emb_init.npy --w2v-model data/embeddings_weights/glove.6B.300d.bin 13 | fi 14 | 15 | if [ ! -f data/$1/test_texts.npy ]; then 16 | python preprocess.py $TEST_TEXT --label-path data/$1/test_labels.txt --vocab-path data/$1/vocab.npy 17 | fi 18 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_wiki.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Wiki-500K 4 | MODEL=FastAttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy \ 13 | -a 0.5 \ 14 | -b 0.4 15 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_wiki10.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA=Wiki10-31K 4 | MODEL=AttentionXML 5 | 6 | ./scripts/run_preprocess.sh $DATA 7 | ./scripts/run_xml.sh $DATA $MODEL 8 | 9 | python evaluation.py \ 10 | --results results/$MODEL-$DATA-Ensemble-labels.npy \ 11 | --targets data/$DATA/test_labels.npy \ 12 | --train-labels data/$DATA/train_labels.npy 13 | -------------------------------------------------------------------------------- /attention-xml/scripts/run_xml.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python main.py --data-cnf configure/datasets/$1.yaml --model-cnf configure/models/$2-$1.yaml -t 0 4 | python main.py --data-cnf configure/datasets/$1.yaml --model-cnf configure/models/$2-$1.yaml -t 1 5 | python main.py --data-cnf configure/datasets/$1.yaml --model-cnf configure/models/$2-$1.yaml -t 2 6 | python ensemble.py -p results/$2-$1 -t 3 7 | -------------------------------------------------------------------------------- /attention-xml/train.slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is for GPU allocation is available. #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --output=output/slurm-%x-%a.out 7 | #SBATCH --error=output/slurm-%x-%a.err 8 | 9 | # Set the environment. 10 | # source deactivate # Remove previous environments. 11 | source ~/anaconda3/etc/profile.d/conda.sh 12 | conda activate spp # Environment name. 13 | 14 | # Execute the code. 
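# This script is normally submitted via experiments.sh, for example (illustrative values):
#   sbatch --job-name=EUR-Lex-400-all --mem=256000 --array=0-1 train.slurm.sh \
#       EUR-Lex results/EUR-Lex test 400 grad with-negative 0.55 1.5
# SLURM_ARRAY_TASK_ID 0 selects the "baseline" model and 1 the "hrr" model (see MODEL below).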
15 | set -o xtrace 16 | TASK_ID=$((SLURM_ARRAY_TASK_ID)) 17 | NAME=$1 18 | SAVE_MODEL=$2 19 | EXP_NAME=$3 20 | DIMS=$4 21 | WITH_GRAD=${5} 22 | WITHOUT_NEGATIVE=${6} 23 | PROP_A=${7:-"0.55"} # Propensity value A. For Amazon-670K it is 0.6 24 | PROP_B=${8:-"1.5"} # Propensity value B. For Amazon-670K it is 2.6 25 | 26 | # Model information. 27 | MODEL=("baseline" "hrr") 28 | MODEL_NETWORK="AttentionXML" 29 | 30 | # Select the model. 31 | MODEL_TYPE=${MODEL[${TASK_ID}]} 32 | FIN_EXP_NAME=${NAME}-${EXP_NAME}-${MODEL_TYPE}-${DIMS}-${WITH_GRAD}-${WITHOUT_NEGATIVE} 33 | echo "Parameters: $NAME $SAVE_MODEL" 34 | echo " $MODEL_TYPE $EXP_NAME $DIMS" 35 | echo " ${WITH_GRAD} ${WITHOUT_NEGATIVE}" 36 | 37 | # Construct list of options. 38 | OPTIONS="" 39 | if [ "$MODEL_TYPE" == "hrr" ] 40 | then 41 | DATA_YAML=${NAME}-spn 42 | MODEL_YAML=${MODEL_NETWORK}-${NAME}-spn-${DIMS} 43 | LABEL_NAME=${MODEL_NETWORK}-${DIMS}-${NAME}-spn-${DIMS} 44 | else 45 | DATA_YAML=${NAME} 46 | MODEL_YAML=${MODEL_NETWORK}-${NAME} 47 | LABEL_NAME=${MODEL_NETWORK}-0-${NAME}-baseline-0 48 | fi 49 | 50 | if [ "$WITH_GRAD" == "no-grad" ] 51 | then 52 | OPTIONS="${OPTIONS} --no-grad" 53 | fi 54 | 55 | if [ "${WITHOUT_NEGATIVE}" == "without-negative" ] 56 | then 57 | OPTIONS="${OPTIONS} --without-negative" 58 | fi 59 | 60 | # Train the the models. 61 | echo $DATA_YAML, $MODEL_YAML 62 | echo "OPTIONS: $OPTIONS" 63 | python main.py --data-cnf configure/datasets/${DATA_YAML}.yaml --model-cnf configure/models/${MODEL_YAML}.yaml > results/${FIN_EXP_NAME}.results 64 | 65 | # Evaluation. 66 | echo "Test Results..." 67 | python evaluation.py --results results/${LABEL_NAME}-labels.npy \ 68 | --targets data/${NAME}/test_labels.npy --train-labels data/${NAME}/train_labels.npy >> results/${FIN_EXP_NAME}.results 69 | -------------------------------------------------------------------------------- /combine_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # set -vx 3 | 4 | # Script to combine results from different experiments. 5 | # AUTHOR: Ashwinkumar Ganesan. 6 | 7 | # Config. 8 | NAME=${1:-"all"} 9 | MEM=256000 10 | SAVE_LOC="data/model+results" 11 | EXP_NAME=${2:-"temp-exp"} 12 | MODEL_TYPE=${3:-"all"} 13 | DIMS=${4:-400} 14 | THRESHOLD=${5:-0.3} 15 | SAVE_FILE_NAME="$EXP_NAME.results" 16 | 17 | get_results () { 18 | if [[ ( "$NAME" == "$1" ) || ( "$NAME" == "all" ) || ( "$NAME" == "gather" ) ]] 19 | then 20 | SAVE_FILE=$SAVE_LOC/$SAVE_FILE_NAME 21 | echo -e "\n" >> $SAVE_FILE 22 | echo "Dataset: $1" >> $SAVE_FILE 23 | 24 | if [[ ( "$MODEL_TYPE" == "baseline" ) || ( "$MODEL_TYPE" == "all" ) ]]; then 25 | echo "Baseline..." >> $SAVE_FILE 26 | tail -7 $SAVE_LOC/$1/$1_baseline_$EXP_NAME.results >> $SAVE_FILE 27 | fi 28 | 29 | if [[ ( "$MODEL_TYPE" == "spn" ) || ( "$MODEL_TYPE" == "all" ) ]]; then 30 | echo -e "\nSPN..." >> $SAVE_FILE 31 | tail -7 $SAVE_LOC/$1/$1_spn_$EXP_NAME.results >> $SAVE_FILE 32 | fi 33 | fi 34 | } 35 | 36 | echo "Delete old results..." 37 | rm $SAVE_LOC/$SAVE_FILE_NAME 38 | 39 | # Combine results. 40 | get_results Bibtex 41 | get_results Delicious 42 | get_results Mediamill 43 | get_results Eurlex4k 44 | get_results Wiki10 45 | get_results AmazonCat13K 46 | get_results Amazon670K 47 | get_results DeliciousLarge 48 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ### Location to store all datasets. 
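
The experiment scripts expect each dataset in its own sub-directory here, e.g.
`data/Eurlex4k/eurlex_train.txt` and `data/Eurlex4k/eurlex_test.txt` for EUR-Lex
(paths as referenced in `experiments.sh`).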
-------------------------------------------------------------------------------- /experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # set -vx 3 | 4 | # Script to execute experiments with different datasets. 5 | # AUTHOR: Ashwinkumar Ganesan. 6 | 7 | # NOTE: Usage:- 8 | # 1. ./experiments.sh all (for running experiments on all datasets). 9 | # 2. ./experiments.sh (for running experiments on a specific dataset). 10 | # 3. ./experiments.sh gather (for gathering the precision only). 11 | 12 | # Config. 13 | NAME=${1:-"all"} 14 | MEM=256000 15 | SAVE_LOC="data/model+results" 16 | EXP_NAME=${2:-"temp-exp"} 17 | MODEL_TYPE=${3:-"all"} 18 | DIMS=${4:-400} 19 | THRESHOLD=${5:-0.3} 20 | WITH_GRAD=${6:-"grad"} # no-grad for training with no gradient to p & n vectors. 21 | WITHOUT_NEGATIVE=${7:-"with-negative"} # without-negative for training. 22 | SAVE_FILE_NAME="$EXP_NAME.results" 23 | 24 | create_job () { 25 | echo "Location to save model: $SAVE_LOC/$1 ..." 26 | if [[ ( "$MODEL_TYPE" == "all" ) ]]; then 27 | echo "Creating jobs for both models..." 28 | sbatch --job-name=$5 --mem=$MEM --array=0-1 --exclude=node[17-32] train.slurm.sh \ 29 | $1 $2 $3 $4 $SAVE_LOC/$1 $THRESHOLD $EXP_NAME $DIMS $6 $7 $8 $9 30 | elif [[ ( "$MODEL_TYPE" == "baseline" ) ]]; then 31 | echo "Creating jobs for baseline model..." 32 | sbatch --job-name=$5 --mem=$MEM --array=0 --exclude=node[17-32] train.slurm.sh \ 33 | $1 $2 $3 $4 $SAVE_LOC/$1 $THRESHOLD $EXP_NAME $DIMS $6 $7 $8 $9 34 | elif [[ ( "$MODEL_TYPE" == "spn" ) ]]; then 35 | echo "Creating jobs for SPN model..." 36 | sbatch --job-name=$5 --mem=$MEM --array=1 --exclude=node[17-32] train.slurm.sh \ 37 | $1 $2 $3 $4 $SAVE_LOC/$1 $THRESHOLD $EXP_NAME $DIMS $6 $7 $8 $9 38 | fi 39 | } 40 | 41 | # NOTE: Individual jobs for each dataset are easier to track. 42 | # This keeps the SLURM files simple. 43 | 44 | # Bibtex dataset. 45 | if [[ ( "$NAME" == "Bibtex" ) || ( "$NAME" == "all" ) ]] 46 | then 47 | create_job Bibtex data/Bibtex/Bibtex_data.txt data/Bibtex/bibtex_trSplit.txt \ 48 | data/Bibtex/bibtex_tstSplit.txt bibtex 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 49 | fi 50 | 51 | # Delicious dataset. 52 | if [[ ( "$NAME" == "Delicious" ) || ( "$NAME" == "all" ) ]] 53 | then 54 | create_job Delicious data/Delicious/Delicious_data.txt data/Delicious/delicious_trSplit.txt \ 55 | data/Delicious/delicious_tstSplit.txt delic 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 56 | fi 57 | 58 | # Mediamill dataset. 59 | if [[ ( "$NAME" == "Mediamill" ) || ( "$NAME" == "all" ) ]] 60 | then 61 | create_job Mediamill data/Mediamill/Mediamill_data.txt data/Mediamill/mediamill_trSplit.txt \ 62 | data/Mediamill/mediamill_tstSplit.txt mediam 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 63 | fi 64 | 65 | # Eurlex-4K dataset. 66 | if [[ ( "$NAME" == "Eurlex4k" ) || ( "$NAME" == "all" ) ]] 67 | then 68 | create_job Eurlex4k None data/Eurlex4k/eurlex_train.txt data/Eurlex4k/eurlex_test.txt eurlex 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 69 | fi 70 | 71 | # Wiki10 dataset. 72 | if [[ ( "$NAME" == "Wiki10" ) || ( "$NAME" == "all" ) ]] 73 | then 74 | create_job Wiki10 None data/Wiki10/train.txt data/Wiki10/test.txt wiki10 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 75 | fi 76 | 77 | # AmazonCat13K dataset. 78 | if [[ ( "$NAME" == "AmazonCat13K" ) || ( "$NAME" == "all" ) ]] 79 | then 80 | create_job AmazonCat13K None data/AmazonCat13K/train.txt data/AmazonCat13K/test.txt ama13k 64 64 $WITH_GRAD $WITHOUT_NEGATIVE 81 | fi 82 | 83 | # Amazon670K dataset. 
84 | if [[ ( "$NAME" == "Amazon670K" ) || ( "$NAME" == "all" ) ]] 85 | then 86 | create_job Amazon670K None data/Amazon670K/train.txt data/Amazon670K/test.txt ama670 16 16 $WITH_GRAD $WITHOUT_NEGATIVE 87 | fi 88 | 89 | # DeliciousLarge dataset. 90 | if [[ ( "$NAME" == "DeliciousLarge" ) || ( "$NAME" == "all" ) ]] 91 | then 92 | create_job DeliciousLarge None data/DeliciousLarge/deliciousLarge_train.txt \ 93 | data/DeliciousLarge/deliciousLarge_test.txt dlarge 8 8 $WITH_GRAD $WITHOUT_NEGATIVE 94 | fi 95 | -------------------------------------------------------------------------------- /hrr-example-representation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/hrr-example-representation.png -------------------------------------------------------------------------------- /hrr-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/hrr-example.png -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/lib/__init__.py -------------------------------------------------------------------------------- /lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /lib/__pycache__/mathops.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/lib/__pycache__/mathops.cpython-36.pyc -------------------------------------------------------------------------------- /lib/embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Operations to generate embeddings. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | from gensim.models import KeyedVectors 11 | 12 | from .mathops import complex_multiplication, complex_division, circular_conv 13 | from .mathops import get_appx_inv, get_inv, complexMagProj, normalize 14 | from .mathops import npcomplexMagProj 15 | 16 | """ 17 | Load Pretrained Label Embeddings. 18 | """ 19 | def load_embeddings(save_loc, vocab_size): 20 | fname = save_loc + "-complex.bin" 21 | model = KeyedVectors.load_word2vec_format(fname, binary=True) 22 | rand_vec_cnt = 0 23 | vectors = [] # positions in vector space. 24 | for i in range(0, vocab_size): 25 | if str(i) in model.wv.vocab: 26 | vectors.append(model.wv[str(i)]) 27 | else: 28 | # NOTE: When a label is not present in training then we generate a 29 | # default vector and add it to the label vector matrix. 
30 | # As SPN select the label based on the index it remains consistent while training. 31 | rand_vec_cnt += 1 32 | vectors.append(gen_rand_vec(model.vector_size)) 33 | 34 | # Add Padding idx. 35 | print("Vocabulary Size: {}".format(vocab_size)) 36 | print("Number of Random vectors generated: {}".format(rand_vec_cnt)) 37 | vectors.append(gen_rand_vec(model.vector_size)) 38 | vectors = torch.from_numpy(np.array(vectors, dtype=np.float32)) 39 | return vectors 40 | 41 | """ 42 | NumPY operations for embeddings. 43 | """ 44 | def generate_vectors(num_vectors, dims): 45 | """ 46 | Generate n vectors of size dims that are orthogonal to each other. 47 | """ 48 | if num_vectors > dims: 49 | raise ValueError("num_vectors cannot be greater than dims!") 50 | 51 | # Intializing class vectors. 52 | vecs = torch.randn(dims, num_vectors, dtype=torch.float) 53 | 54 | # Using QR decomposition to get orthogonal vectors. 55 | vecs, _ = torch.qr(vecs) 56 | vecs = vecs.t() 57 | vecs = vecs / torch.norm(vecs, dim=-1, keepdim=True) 58 | return vecs 59 | 60 | 61 | def gen_rand_vec(dims): 62 | """ 63 | Generate a random vector of size dims. 64 | """ 65 | return npcomplexMagProj(np.random.normal(0, 1. / dims, size=(dims))) 66 | 67 | 68 | """ 69 | Torch functions. 70 | """ 71 | def get_vectors(num_vectors, dims, ortho=False): 72 | if ortho: 73 | vectors = generate_vectors(num_vectors, dims) 74 | return complexMagProj(vectors) 75 | else: 76 | vectors = [gen_rand_vec(dims) for i in range(num_vectors)] 77 | return torch.from_numpy(np.array(vectors, dtype=np.float32)) 78 | 79 | def get_static_embedding(seeds, dims): 80 | vec = [] 81 | for s in seeds: 82 | torch.manual_seed(s) 83 | vec.append(torch.randn((1, dims), dtype=torch.float)) 84 | 85 | return torch.cat(vec, dim=0) 86 | -------------------------------------------------------------------------------- /lib/mathops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to perform circular convolution operations. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan, Sunil Gandhi, Hang Gao" 6 | __email__ = "gashwin1@umbc.edu,sunilga1@umbc.edu,hanggao@umbc.edu" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | """ 14 | Pytorch functions. 15 | """ 16 | def complex_multiplication(left, right): 17 | """ 18 | Multiply two vectors in complex domain. 19 | """ 20 | left_real, left_complex = left[..., 0], left[..., 1] 21 | right_real, right_complex = right[..., 0], right[..., 1] 22 | 23 | output_real = left_real * right_real - left_complex * right_complex 24 | output_complex = left_real * right_complex + left_complex * right_real 25 | return torch.stack([output_real, output_complex], dim=-1) 26 | 27 | def complex_division(left, right): 28 | """ 29 | Divide two vectors in complex domain. 
30 | """ 31 | left_real, left_complex = left[..., 0], left[..., 1] 32 | right_real, right_complex = right[..., 0], right[..., 1] 33 | 34 | output_real = torch.div((left_real * right_real + left_complex * right_complex),(right_real**2 + right_complex**2)) 35 | output_complex = torch.div((left_complex * right_real - left_real * right_complex ),(right_real**2 + right_complex**2)) 36 | return torch.stack([output_real, output_complex], dim=-1) 37 | 38 | def circular_conv(a, b): 39 | """ Defines the circular convolution operation 40 | a: tensor of shape (batch, D) 41 | b: tensor of shape (batch, D) 42 | """ 43 | left = torch.rfft(a, 1, onesided=False) 44 | right = torch.rfft(b, 1, onesided=False) 45 | output = complex_multiplication(left, right) 46 | output = torch.irfft(output, 1, signal_sizes=a.shape[-1:], onesided=False) 47 | return output 48 | 49 | def get_appx_inv(a): 50 | """ 51 | Compute approximate inverse of vector a. 52 | """ 53 | return torch.roll(torch.flip(a, dims=[-1]), 1,-1) 54 | 55 | def get_inv(a, typ=torch.DoubleTensor): 56 | """ 57 | Compute exact inverse of vector a. 58 | """ 59 | left = torch.rfft(a, 1, onesided=False) 60 | complex_1 = np.zeros(left.shape) 61 | complex_1[...,0] = 1 62 | op = complex_division(typ(complex_1),left) 63 | return torch.irfft(op,1,onesided=False) 64 | 65 | def complexMagProj(x): 66 | """ 67 | Normalize a vector x in complex domain. 68 | """ 69 | c = torch.rfft(x, 1, onesided=False) 70 | c_ish=c/torch.norm(c, dim=-1,keepdim=True) 71 | output = torch.irfft(c_ish, 1, signal_sizes=x.shape[1:], onesided=False) 72 | return output 73 | 74 | def normalize(x): 75 | return x/torch.norm(x) 76 | 77 | """ 78 | Numpy Functions. 79 | """ 80 | # Make them work with batch dimensions 81 | def cc(a, b): 82 | return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b)) 83 | 84 | def np_inv(a): 85 | return np.fft.irfft((1.0/np.fft.rfft(a)),n=a.shape[-1]) 86 | 87 | def np_appx_inv(a): 88 | #Faster implementation 89 | return np.roll(np.flip(a, axis=-1), 1,-1) 90 | 91 | def npcomplexMagProj(x): 92 | """ 93 | Normalize a vector x in complex domain. 94 | """ 95 | c = np.fft.rfft(x) 96 | 97 | # Look at real and image as if they were real 98 | c_ish = np.vstack([c.real, c.imag]) 99 | 100 | # Normalize magnitude of each complex/real pair 101 | c_ish=c_ish/np.linalg.norm(c_ish, axis=0) 102 | c_proj = c_ish[0,:] + 1j * c_ish[1,:] 103 | return np.fft.irfft(c_proj,n=x.shape[-1]) 104 | 105 | def nrm(a): 106 | return a / np.linalg.norm(a) 107 | -------------------------------------------------------------------------------- /lib/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 
22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal parameters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Threshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikipedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | 104 | # Print the final results. 105 | # This provides the results for agg metrics when threshold for inference 106 | # is optimized and metrics are then computed.
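A hedged usage sketch of ```compute_inv_propensity``` and ```compute_prop_metrics``` defined above, mirroring how ```cnn_test.py``` wraps dense 0/1 matrices in SciPy CSR form before scoring. The toy matrices and ```topk``` value are made up, and in the repository the propensities come from the training labels rather than the test ground truth.

```python
# Toy usage of the helpers above; matrices are made up. The returned object is
# whatever xclib's Metrics.eval(...) yields (P@k, nDCG@k, PSP@k, PSnDCG@k),
# which display_metrics below knows how to aggregate.
import numpy as np
from scipy import sparse
from lib.metrics import compute_inv_propensity, compute_prop_metrics

y_true = np.array([[1, 0, 1, 0],
                   [0, 1, 0, 0]], dtype=np.int32)  # ground-truth rows (one-hot)
y_score = np.array([[0.9, 0.1, 0.7, 0.0],
                    [0.2, 0.8, 0.1, 0.3]])         # model scores per label

inv_prop = compute_inv_propensity(y_true, A=0.55, B=1.5)
metrics = compute_prop_metrics(sparse.csr_matrix(y_true),
                               sparse.csr_matrix(y_score),
                               inv_prop, topk=2)
print(metrics)
```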
107 | def display_agg_results(args, te_loss, pr, rec, f1): 108 | print("----------Tests with Threshold Inference------------") 109 | print("Inference Threshold: {:.3f}".format(args.th)) 110 | print("Test Loss: {:.3f}".format(te_loss)) 111 | print("Test Precision: {:.3f}".format(pr * 100)) 112 | print("Test Recall: {:.3f}".format(rec * 100)) 113 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 114 | 115 | 116 | def display_metrics(metrics, k=5): 117 | # Merge batchwise metrics. 118 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 119 | for idx, metric in enumerate(metrics): 120 | for i in range(0, 4): 121 | for j in range(0, k): 122 | final_metrics[i][j] += metric[i][j] 123 | 124 | # Dataset metrics. 125 | print("----------Tests with Ordered Retrieval------------") 126 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 127 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 128 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 129 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 130 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 131 | floatfmt=".3f")) 132 | -------------------------------------------------------------------------------- /lib/metrics_old.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library functions to compute different metrics for tasks. 3 | """ 4 | 5 | __author__ = "Ashwinkumar Ganesan" 6 | __email__ = "gashwin1@umbc.edu" 7 | 8 | from tabulate import tabulate 9 | import math 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import torch 13 | import xclib.evaluation.xc_metrics as xc_metrics 14 | 15 | # Compute the precision score for multi-label binary classification task. 16 | def mbprecision(y_true, y_pred): 17 | correct_pred = torch.sum(y_pred & y_true, axis=1).float() 18 | print(correct_pred.dtype) 19 | return torch.mean(correct_pred / torch.sum(y_true, axis=1)) 20 | 21 | # Compute the recall score for multi-label binary classification task. 22 | def mbrecall(y_true, y_pred): 23 | return torch.mean(torch.sum(y_pred & y_true, axis=1) / torch.sum(y_true, axis=1)) 24 | 25 | 26 | def plot_tr_stats(tr_stats, th_stats, spoch, sth, filename): 27 | """ 28 | Plot stats about the experiment. 29 | tr_stats: Training statistics (includes loss, precision, recall and F1) 30 | th_stats: Grid search statistics for configuring threshold. 31 | epochs: Number of epochs that the model is trained for. 32 | spoch: epoch that has optimal paramaters. 33 | sth: optimal threshold. 34 | filename: location to store plots. 
35 | """ 36 | fig, ax = plt.subplots(3, figsize=(10, 10)) 37 | 38 | ep = tr_stats['Epoch'] 39 | tr_loss = tr_stats['Training Loss'] 40 | val_loss = tr_stats['Val Loss'] 41 | pr = tr_stats['Precision'] 42 | re = tr_stats['Recall'] 43 | f1 = tr_stats['F1 Score'] 44 | th = th_stats['Threshold'] 45 | 46 | ax[0].plot(ep, tr_loss) 47 | ax[0].plot(ep, val_loss) 48 | ax[0].set_title("Training & Validation Loss Per Epoch", size=16) 49 | ax[0].set_xlabel("Epoch", size=14) 50 | ax[0].set_ylabel("Loss", size=14) 51 | ax[0].legend(["Training Loss", "Validation Loss"], fontsize="large") 52 | ax[0].axvline(x=spoch, linestyle='dashed') 53 | 54 | ax[1].plot(ep, pr) 55 | ax[1].plot(ep, re) 56 | ax[1].plot(ep, f1) 57 | ax[1].set_title("Validation Precision, Recall & F-1 Score \n (Threshold = 0.25)", size=16) 58 | ax[1].set_xlabel("Epoch", size=14) 59 | ax[1].set_ylabel("Score", size=14) 60 | ax[1].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 61 | ax[1].axvline(x=spoch, linestyle='dashed') 62 | 63 | ax[2].plot(th, th_stats['Precision']) 64 | ax[2].plot(th, th_stats['Recall']) 65 | ax[2].plot(th, th_stats['F1 Score']) 66 | ax[2].set_title("Validation Precision, Recall & F-1 Score \n Optimize Threshold", size=16) 67 | ax[2].set_xlabel("Theshold", size=14) 68 | ax[2].set_ylabel("Score", size=14) 69 | ax[2].legend(["Validation Precision", "Validation Recall", "Validation F1 Score"], fontsize="large") 70 | ax[2].axvline(x=sth, linestyle='dashed') 71 | 72 | fig.tight_layout() 73 | plt.savefig(filename + ".png") 74 | 75 | # Adapted from: https://github.com/kunaldahiya/pyxclib 76 | def compute_inv_propensity(train_labels, A=0.55, B=1.5): 77 | """ 78 | Compute Inverse propensity values 79 | Values for A/B: 80 | Wikpedia-500K: 0.5/0.4 81 | Amazon-670K, Amazon-3M: 0.6/2.6 82 | Others: 0.55/1.5 83 | 84 | Arguments: 85 | train_labels : numpy ndarray 86 | """ 87 | inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B) 88 | return inv_propen 89 | 90 | # Compute metrics with propensity. 91 | def compute_prop_metrics(true_labels, predicted_labels, inv_prop_scores, topk=5): 92 | """Compute propensity weighted precision@k and DCG@k. 93 | Arguments: 94 | true_labels : numpy ndarray 95 | Ground truth labels from the dataset (one-hot vector). 96 | predicted_labels : numpy ndarray 97 | Predicted labels (one-hot vector of labels) 98 | """ 99 | acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_prop_scores, 100 | remove_invalid=False) 101 | return acc.eval(predicted_labels, topk) 102 | 103 | # Print the final results. 104 | # This provides the results for agg metrics when threshold for inference 105 | # is optimized and metrics are then computed. 106 | def display_agg_results(args, te_loss, pr, rec, f1): 107 | print("----------Tests with Threshold Inference------------") 108 | print("Inference Threshold: {:.3f}".format(args.th)) 109 | print("Test Loss: {:.3f}".format(te_loss)) 110 | print("Test Precision: {:.3f}".format(pr * 100)) 111 | print("Test Recall: {:.3f}".format(rec * 100)) 112 | print("Test F1-Score: {:.3f}\n".format(f1 * 100)) 113 | 114 | 115 | def display_metrics(metrics, k=5): 116 | # Merge batchwise metrics. 117 | final_metrics = [[0.0] * k,[0.0] * k,[0.0] * k,[0.0] * k] 118 | for idx, metric in enumerate(metrics): 119 | for i in range(0, 4): 120 | for j in range(0, k): 121 | final_metrics[i][j] += metric[i][j] 122 | 123 | # Dataset metrics. 
124 | print("----------Tests with Ordered Retrieval------------") 125 | table = [['Precision@k'] + [i * 100 / (idx + 1) for i in final_metrics[0]]] 126 | table.append(['nDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[1]]) 127 | table.append(['PSprec@k'] + [i * 100 / (idx + 1) for i in final_metrics[2]]) 128 | table.append(['PSnDCG@k'] + [i * 100 / (idx + 1) for i in final_metrics[3]]) 129 | print(tabulate(table, headers=[i+1 for i in range(0, k)], 130 | floatfmt=".3f")) 131 | -------------------------------------------------------------------------------- /lib/plots.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manage plots. 3 | AUTHOR: Ashwinkumar Ganesan. 4 | """ 5 | 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import csv 9 | import pandas as pd 10 | 11 | """ 12 | Plot training and testing curves. 13 | The graph includes: 14 | 1. Training loss per epoch. 15 | 2. Test loss per epoch. 16 | 3. Precision per epoch. 17 | 4. Recall per epoch. 18 | 5. F1 score per epoch. 19 | """ 20 | def plot_stats(tr_stats): 21 | fig, ax = plt.subplots(2) 22 | 23 | ep = [i for i in range(0, epochs)] 24 | tr_loss = tr_stats['Training Loss'] 25 | te_loss = tr_stats['Test Loss'] 26 | pr = tr_stats['Precision'] 27 | re = tr_stats['Recall'] 28 | f1 = tr_stats['F1 Score'] 29 | 30 | # Loss Curve. 31 | ax[0].plot(ep, tr_loss) 32 | ax[0].plot(ep, te_loss) 33 | ax[0].set_title("Training & Testing Loss Per Epoch") 34 | 35 | 36 | ax[1].plot(ep, pr) 37 | ax[1].plot(ep, re) 38 | ax[1].plot(ep, f1) 39 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | """ 4 | 5 | from prettytable import PrettyTable 6 | import pandas as pd 7 | from time import time 8 | import torch 9 | 10 | GB_DIV = 1024 * 1024 * 1024 11 | 12 | 13 | def print_memory_profile(): 14 | """ 15 | Get basic memory information. 16 | """ 17 | device = torch.cuda.current_device() 18 | print("Allocated: {:.4f}".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 19 | print("Reserved: {:.4f}\n".format(int(torch.cuda.memory_allocated()) / GB_DIV)) 20 | 21 | # https://stackoverflow.com/questions/9535954/printing-lists-as-tabular-data 22 | def print_command_arguments(args): 23 | table = PrettyTable(['Parameter', 'Value']) 24 | table.title = 'Experimental Setup' 25 | for arg in vars(args): 26 | table.add_row([arg, getattr(args, arg)]) 27 | print(table) 28 | 29 | class Measure(object): 30 | """ 31 | Manage runtimes for a specific code block. 32 | """ 33 | def __init__(self, name): 34 | self._measure = name 35 | self._is_measuring = False 36 | self._elapsed_time = 0 37 | 38 | def is_measuring(self): 39 | return self._is_measuring 40 | 41 | def start(self): 42 | self._stime = time() 43 | self._is_measuring = True 44 | 45 | def end(self): 46 | self._etime = time() 47 | self._elapsed_time += self._etime - self._stime 48 | self._is_measuring = False 49 | 50 | def get_elapsed_time(self): 51 | return self._elapsed_time 52 | 53 | def get_name(self): 54 | return self._measure 55 | 56 | 57 | class ExperimentTime(object): 58 | """ 59 | Manage time for different parts in an experiment. 
60 | """ 61 | def __init__(self): 62 | self._table = pd.DataFrame(columns=['Measurement', 'Elapsed Time']) 63 | self._pos = 0 64 | self.measure = {} 65 | 66 | def _append(self, name): 67 | self._table.loc[self._pos] = [name, self.measure[name].get_elapsed_time()] 68 | self._pos += 1 69 | 70 | def register(self, name): 71 | if name in self.measure: 72 | print("Measurement with same name previously added.") 73 | else: 74 | self.measure[name] = Measure(name) 75 | 76 | def measure_time(self, name): 77 | if self.measure[name].is_measuring(): 78 | self.measure[name].end() 79 | # Add time to the dataframe. 80 | self._append(name) 81 | else: 82 | self.measure[name].start() 83 | 84 | def get_measurements(self): 85 | return self._table 86 | -------------------------------------------------------------------------------- /output/README.md: -------------------------------------------------------------------------------- 1 | ### Output 2 | This folder contains all output from stdout & stderr when ```run_classifier.py``` is executed. When a cluster (like SLURM) is utilized, this folder contains SLURM outputs. 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argon2-cffi @ file:///tmp/build/80754af9/argon2-cffi_1596828496740/work 2 | attrs @ file:///tmp/build/80754af9/attrs_1598374659300/work 3 | backcall==0.2.0 4 | bleach==3.1.5 5 | boto==2.49.0 6 | boto3==1.14.52 7 | botocore==1.17.52 8 | brotlipy==0.7.0 9 | certifi==2020.6.20 10 | cffi==1.14.0 11 | chardet==3.0.4 12 | click==7.1.2 13 | colorlover==0.3.0 14 | cryptography @ file:///tmp/build/80754af9/cryptography_1598892041992/work 15 | cufflinks==0.17.3 16 | cycler==0.10.0 17 | Cython==0.29.17 18 | decorator==4.4.2 19 | defusedxml==0.6.0 20 | docutils==0.15.2 21 | entrypoints==0.3 22 | faiss==1.6.3 23 | filelock==3.0.12 24 | future==0.18.2 25 | gdown==3.12.2 26 | gensim==3.8.3 27 | idna @ file:///tmp/build/80754af9/idna_1593446292537/work 28 | importlib-metadata @ file:///tmp/build/80754af9/importlib-metadata_1593446433964/work 29 | ipykernel @ file:///tmp/build/80754af9/ipykernel_1596206602906/work/dist/ipykernel-5.3.4-py3-none-any.whl 30 | ipython @ file:///tmp/build/80754af9/ipython_1593447367857/work 31 | ipython-genutils==0.2.0 32 | ipywidgets==7.5.1 33 | jedi @ file:///tmp/build/80754af9/jedi_1598371618777/work 34 | Jinja2==2.11.2 35 | jmespath==0.10.0 36 | joblib==0.17.0 37 | json5==0.9.5 38 | jsonpatch==1.26 39 | jsonpointer==2.0 40 | jsonschema==3.2.0 41 | jupyter==1.0.0 42 | jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1594826976318/work 43 | jupyter-console @ file:///tmp/build/80754af9/jupyter_console_1598884538475/work 44 | jupyter-contrib-core==0.3.3 45 | jupyter-core==4.6.3 46 | jupyter-nbextensions-configurator==0.4.1 47 | jupyterlab==2.2.6 48 | jupyterlab-server @ file:///tmp/build/80754af9/jupyterlab_server_1594164409481/work 49 | kiwisolver==1.2.0 50 | llvmlite==0.33.0+1.g022ab0f 51 | logzero==1.5.0 52 | MarkupSafe==1.1.1 53 | matplotlib==3.1.3 54 | mistune==0.8.4 55 | mkl-fft==1.1.0 56 | mkl-random==1.1.1 57 | mkl-service==2.3.0 58 | nbconvert==5.6.1 59 | nbformat==5.0.7 60 | nltk @ file:///tmp/build/80754af9/nltk_1592496090529/work 61 | nmslib==2.0.6 62 | notebook @ file:///tmp/build/80754af9/notebook_1596838602091/work 63 | numba==0.50.1 64 | numpy==1.19.2 65 | packaging==20.4 66 | pandas==1.0.3 67 | pandocfilters==1.4.2 68 | parso==0.7.0 69 | pexpect==4.8.0 70 | 
pickleshare==0.7.5 71 | Pillow==7.2.0 72 | plotly==4.11.0 73 | prettytable==0.7.2 74 | prometheus-client==0.8.0 75 | prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1598885458782/work 76 | psutil==5.7.0 77 | ptyprocess==0.6.0 78 | pybind11==2.5.0 79 | pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work 80 | Pygments==2.6.1 81 | pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1594392929924/work 82 | pyparsing==2.4.7 83 | pyrsistent==0.16.0 84 | PySocks==1.7.1 85 | python-dateutil==2.8.1 86 | pytorch-model-summary==0.1.2 87 | pytz==2020.1 88 | PyYAML==5.3.1 89 | pyzmq==19.0.1 90 | qtconsole @ file:///tmp/build/80754af9/qtconsole_1598374667791/work 91 | QtPy==1.9.0 92 | regex @ file:///tmp/build/80754af9/regex_1596829710510/work 93 | requests @ file:///tmp/build/80754af9/requests_1592841827918/work 94 | retrying==1.3.3 95 | ruamel.yaml==0.16.12 96 | ruamel.yaml.clib==0.2.2 97 | s3transfer==0.3.3 98 | scikit-learn==0.22.1 99 | scipy==1.4.1 100 | Send2Trash==1.5.0 101 | six==1.15.0 102 | sklearn==0.0 103 | smart-open==2.1.1 104 | tabulate==0.8.7 105 | terminado==0.8.3 106 | testpath==0.4.4 107 | threadpoolctl==2.1.0 108 | torch==1.4.0 109 | torchfile==0.1.0 110 | torchsummary==1.5.1 111 | torchvision==0.5.0 112 | tornado==6.0.4 113 | tqdm @ file:///tmp/build/80754af9/tqdm_1596810128862/work 114 | traitlets==4.3.3 115 | urllib3 @ file:///tmp/build/80754af9/urllib3_1597086586889/work 116 | visdom==0.1.8.9 117 | wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work 118 | webencodings==0.5.1 119 | websocket-client==0.57.0 120 | widgetsnbextension==3.5.1 121 | xclib==0.96 122 | zipp==3.1.0 123 | -------------------------------------------------------------------------------- /train.slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is for GPU allocation is available. # SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --output=output/slurm-%A-%a.out 7 | #SBATCH --error=output/slurm-%A-%a.err 8 | 9 | # Set the environment. 10 | # source deactivate # Remove previous environments. 11 | source ~/anaconda3/etc/profile.d/conda.sh 12 | conda activate spp # Environment name. 13 | 14 | # Execute the code. 15 | set -o xtrace 16 | TASK_ID=$((SLURM_ARRAY_TASK_ID)) 17 | NAME=$1 18 | DATA_FILE=$2 19 | TR_SPLIT=$3 20 | TE_SPLIT=$4 21 | SAVE_MODEL=$5 22 | THRESHOLD=$6 23 | EXP_NAME=$7 24 | DIMS=$8 25 | BATCH_SIZE=$9 26 | TEST_BATCH_SIZE=${10} 27 | WITH_GRAD=${11} 28 | WITHOUT_NEGATIVE=${12} 29 | 30 | MODEL=("baseline" "spn") 31 | 32 | # Select the model. 33 | MODEL_TYPE=${MODEL[${TASK_ID}]} 34 | echo "Parameters: $NAME $DATA_FILE $TR_SPLIT $TE_SPLIT $SAVE_MODEL $THRESHOLD" 35 | echo " $MODEL_TYPE $EXP_NAME $DIMS $BATCH_SIZE $TEST_BATCH_SIZE" 36 | echo " ${WITH_GRAD} ${WITHOUT_NEGATIVE}" 37 | 38 | # Construct list of options. 
39 | OPTIONS="--th $THRESHOLD --debug" 40 | if [ "$MODEL_TYPE" == "baseline" ] 41 | then 42 | OPTIONS="${OPTIONS} --baseline" 43 | fi 44 | 45 | if [ "$WITH_GRAD" == "no-grad" ] 46 | then 47 | OPTIONS="${OPTIONS} --no-grad" 48 | fi 49 | 50 | if [ "${WITHOUT_NEGATIVE}" == "without-negative" ] 51 | then 52 | OPTIONS="${OPTIONS} --without-negative" 53 | fi 54 | 55 | python run_classifier.py --data-file $DATA_FILE \ 56 | --tr-split $TR_SPLIT \ 57 | --te-split $TE_SPLIT --spn-dim $DIMS \ 58 | --save $SAVE_MODEL --name ${NAME}_${MODEL_TYPE}_${EXP_NAME} \ 59 | --batch-size $BATCH_SIZE --test-batch-size $TEST_BATCH_SIZE \ 60 | $OPTIONS > $SAVE_MODEL/${NAME}_${MODEL_TYPE}_${EXP_NAME}.results -------------------------------------------------------------------------------- /xml-cnn/README.md: -------------------------------------------------------------------------------- 1 | # HRR-CNN 2 | This is a modified implementation of [XML-CNN](https://github.com/siddsax/XML-CNN) from this [repository](https://github.com/siddsax/XML-CNN) that uses HRR for labal representation and inference. The Pytorch implementation is of the paper [Deep Learning for Extreme Multi-label Text Classification](http://nyc.lti.cs.cmu.edu/yiming/Publications/jliu-sigir17.pdf) with dynamic pooling. 3 | 4 | ## List of changes to the Codebase. 5 | The XML-CNN codebase has been modified to with the following list of changes: 6 | 1. Retooled to use semantic pointers. The architecture can use HRRs to learn and infer labels. 7 | 2. Modifications to operate seamlessly with large datasets and models using a Pytorch dataset object. 8 | 3. The codebase also contains two scripts, i.e., ```experiments.sh``` and ```train.slurm.sh``` for execution of training and evaluation jobs on a SLURM enabled cluster. 9 | 10 | ### NOTE: Before running experiments, perform preprocessing as discussed [here](https://github.com/siddsax/XML-CNN). 11 | 12 | Example Execution with RCV Dataset 13 | ---------------------------------- 14 | To train the model with HRR. 
15 | ```bash 16 | EXP_NAME="test" 17 | PROP_A=0.55 18 | PROP_B=1.5 19 | python main.py --ds rcv1 --mn rcv1-${EXP_NAME}-hrr -a ${PROP_A} -b ${PROP_B} --model_type glove-bin --hrr_labels 20 | ``` 21 | 22 | To evaluate the model: 23 | ```bash 24 | python main.py --ds $NAME -a ${PROP_A} -b ${PROP_B} --model_type glove-bin --tr 0 --lm ../saved_models/rcv1-${EXP_NAME}-hrr/model_best_test --hrr_labels 25 | ``` 26 | 27 | References 28 | ---------- 29 | [Deep Learning for Extreme Multi-label Text Classification](http://nyc.lti.cs.cmu.edu/yiming/Publications/jliu-sigir17.pdf) -------------------------------------------------------------------------------- /xml-cnn/code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/Learning-with-Holographic-Reduced-Representations/ce3cb8fc4c63b16f41fc41d8788503ebe146c73c/xml-cnn/code/__init__.py -------------------------------------------------------------------------------- /xml-cnn/code/cnn_test.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from collections import OrderedDict 3 | from sklearn.metrics import log_loss 4 | from lib.metrics import compute_prop_metrics, display_metrics 5 | 6 | def test_class(test_loader, params, device, model=None, embedding_weights=None, 7 | verbose=True, propensity=None, topk=5): 8 | if(model == None): 9 | if(embedding_weights is None): 10 | print("Error: Embedding weights needed!") 11 | exit() 12 | else: 13 | model = xmlCNN(params, embedding_weights) 14 | model = load_model(model, params.load_model) 15 | 16 | if(torch.cuda.is_available()): 17 | params.dtype_f = torch.cuda.FloatTensor 18 | params.dtype_i = torch.cuda.LongTensor 19 | model = model.cuda() 20 | else: 21 | params.dtype_f = torch.FloatTensor 22 | params.dtype_i = torch.LongTensor 23 | 24 | # Testing data. 25 | loss = 0.0; prec = 0.0; num_batch = 0.0; all_acc = [] 26 | for i, (batch_x, batch_y) in enumerate(test_loader): 27 | # Load Data. 28 | batch_x = batch_x.type(torch.LongTensor).to(device) 29 | batch_y = batch_y.type(torch.FloatTensor).to(device) 30 | 31 | model.time['test_forward_pass'].start() 32 | e_emb = model.embedding_layer.forward(batch_x) 33 | s = model.classifier(e_emb) 34 | model.time['test_forward_pass'].end() 35 | 36 | model.time['inference'].start() 37 | if params.hrr_labels: 38 | batch_size = batch_y.size()[0] 39 | combined_y = model.classifier.inference(s, batch_size) 40 | y_pred = torch.abs(torch.mm(combined_y, model.classifier.class_vec.weight.t())).cpu().data[:, :batch_y.shape[1]].numpy() 41 | else: 42 | y_pred = s.cpu().data.numpy() 43 | model.time['inference'].end() # Measure forward pass during inference. 44 | 45 | # Measure. 
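The test loop above scores labels by unbinding the network output ```s``` with the positive role vector and comparing against every class vector. The toy sketch below replays that retrieval step outside the model; the class count, dimension, and label indices are made up, and it relies on the repository's ```lib``` helpers under the pinned ```torch==1.4``` FFT API (```torch.rfft```).

```python
# Toy replay of the retrieval step above (not part of cnn_test.py). Assumes the
# repo's lib/ is importable and the pinned torch==1.4 FFT API (torch.rfft).
import torch
from lib.mathops import circular_conv, get_appx_inv
from lib.embeddings import get_vectors

n_classes, hrr_dim, topk = 10, 256, 3        # made-up sizes
class_vec = get_vectors(n_classes, hrr_dim)  # (n_classes, hrr_dim)
p = get_vectors(1, hrr_dim)                  # positive role vector

# Pretend the network emitted s = p (*) (sum of the true label vectors).
true_labels = [2, 7]
s = circular_conv(p, class_vec[true_labels].sum(dim=0, keepdim=True))

y_hat = circular_conv(get_appx_inv(p), s)            # unbind, as inference() does
scores = torch.abs(torch.mm(y_hat, class_vec.t()))   # (1, n_classes)
values, indices = scores.topk(topk, dim=-1)
print(indices)  # the true labels 2 and 7 rank at the top
```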
46 | y_cpu = batch_y.cpu().data.numpy() 47 | loss += log_loss(y_cpu, y_pred) 48 | acc = compute_prop_metrics(sparse.csr_matrix(y_cpu), 49 | sparse.csr_matrix(y_pred), propensity, 50 | topk=topk) 51 | all_acc.append(acc) 52 | num_batch += 1 53 | 54 | loss /= num_batch 55 | print('Test Loss; Cross Entropy {};'.format(loss)) 56 | display_metrics(all_acc) 57 | return loss 58 | -------------------------------------------------------------------------------- /xml-cnn/code/cnn_train.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from cnn_test import * 3 | 4 | # --------------------------------------------------------------------------------- 5 | 6 | def train(train_loader, test_loader, embedding_weights, params, device, 7 | propensity=None): 8 | loss_best = float('Inf') 9 | bestTotalLoss = float('Inf') 10 | best_test_loss = float("inf") 11 | max_grad = 0 12 | num_mb = np.ceil(params.N/params.mb_size) 13 | model = xmlCNN(params, embedding_weights) 14 | if(torch.cuda.is_available()): 15 | print("--------------- Using GPU! ---------") 16 | model.params.dtype_f = torch.cuda.FloatTensor 17 | model.params.dtype_i = torch.cuda.LongTensor 18 | model = model.to(device) 19 | else: 20 | model.params.dtype_f = torch.FloatTensor 21 | model.params.dtype_i = torch.LongTensor 22 | print("=============== Using CPU =========") 23 | 24 | optimizer = optim.Adam(filter(lambda p: p.requires_grad,model.parameters()), lr=params.lr) 25 | print(model);print("%"*100) 26 | 27 | if params.dataparallel: 28 | model = nn.DataParallel(model) 29 | 30 | if(len(params.load_model)): 31 | params.model_name = params.load_model 32 | print(params.load_model) 33 | model, optimizer, init = load_model(model, params.load_model, optimizer=optimizer) 34 | else: 35 | init = 0 36 | 37 | # =============================== TRAINING ================================= 38 | for epoch in range(init, params.num_epochs): 39 | totalLoss = 0.0 40 | model.time['train'].start() 41 | model.time['data_load'].start() 42 | for i, (batch_x, batch_y) in enumerate(train_loader): 43 | model.time['data_load'].end() 44 | model.train() 45 | optimizer.zero_grad() 46 | 47 | # Load data to GPU. 48 | batch_x = batch_x.type(torch.LongTensor).to(device) 49 | if params.hrr_labels: 50 | batch_y = batch_y.type(torch.LongTensor).to(device) 51 | else: 52 | batch_y = batch_y.type(torch.FloatTensor).to(device) 53 | 54 | # Model forward. 55 | loss, output = model.forward(batch_x, batch_y) 56 | 57 | # ------------------------------------------------------------------ 58 | loss = loss.mean().squeeze() 59 | totalLoss += loss.data 60 | 61 | # NOTE: This block is not part of training. 62 | model.time['train'].end() 63 | if i % int(num_mb/12) == 0: 64 | print('Iter-{}; Loss: {:.4}; best_loss: {:.4}; max_grad: {}:'.format(i, loss.data, loss_best, max_grad)) 65 | if not os.path.exists('../saved_models/' + params.model_name ): 66 | os.makedirs('../saved_models/' + params.model_name) 67 | save_model(model, optimizer, epoch, params.model_name + "/model_best_batch") 68 | if(loss (for running experiments on a specific dataset). 7 | # 3. ./experiments.sh gather (for gathering the precision only). 8 | 9 | # Config. 10 | NAME=${1:-"all"} 11 | MEM=256000 12 | SAVE_LOC="results" 13 | EXP_NAME=${2:-"test"} 14 | MODEL_TYPE=${3:-"all"} 15 | DIMS=${4:-400} 16 | WITH_GRAD=${5:-"grad"} # no-grad for training with no gradient to p & n vectors. 17 | WITHOUT_NEGATIVE=${6:-"with-negative"} # without-negative for training. 
18 | PROP_A=${7:-"0.55"} # Propensity value A. 19 | PROP_B=${8:-"1.5"} # Propensity value B. 20 | 21 | create_job () { 22 | echo "Location to save model: $SAVE_LOC/$1 ..." 23 | if [[ ( "$MODEL_TYPE" == "all" ) ]]; then 24 | echo "Creating jobs for both models..." 25 | sbatch --job-name=$1-all --mem=$MEM --array=0-1 --exclude=node[17-32] train.slurm.sh \ 26 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 27 | elif [[ ( "$MODEL_TYPE" == "baseline" ) ]]; then 28 | echo "Creating jobs for baseline model..." 29 | sbatch --job-name=$1-base --mem=$MEM --array=0 --exclude=node[17-32] train.slurm.sh \ 30 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 31 | elif [[ ( "$MODEL_TYPE" == "hrr" ) ]]; then 32 | echo "Creating jobs for HRR model..." 33 | sbatch --job-name=$1-hrr --mem=$MEM --array=1 --exclude=node[17-32] train.slurm.sh \ 34 | $1 $SAVE_LOC/$1 $EXP_NAME $DIMS $2 $3 ${PROP_A} ${PROP_B} 35 | fi 36 | } 37 | 38 | # NOTE: Individual jobs for each dataset are easier to track. 39 | # This keeps the SLURM files simple. 40 | 41 | # RCV1 dataset. 42 | if [[ ( "$NAME" == "rcv1" ) || ( "$NAME" == "all" ) ]] 43 | then 44 | create_job rcv1 $WITH_GRAD $WITHOUT_NEGATIVE 45 | fi 46 | 47 | # Eurlex dataset. 48 | if [[ ( "$NAME" == "eurlex" ) || ( "$NAME" == "all" ) ]] 49 | then 50 | create_job eurlex $WITH_GRAD $WITHOUT_NEGATIVE 51 | fi 52 | 53 | # Wiki30k dataset. 54 | if [[ ( "$NAME" == "wiki30k" ) || ( "$NAME" == "all" ) ]] 55 | then 56 | create_job wiki30k $WITH_GRAD $WITHOUT_NEGATIVE 57 | fi 58 | 59 | # Amazon12k dataset. 60 | if [[ ( "$NAME" == "amazon12k" ) || ( "$NAME" == "all" ) ]] 61 | then 62 | create_job amazon12k $WITH_GRAD $WITHOUT_NEGATIVE 63 | fi 64 | 65 | # Amazon12k dataset. 66 | if [[ ( "$NAME" == "amazon670K" ) || ( "$NAME" == "all" ) ]] 67 | then 68 | create_job amazon670K $WITH_GRAD $WITHOUT_NEGATIVE 69 | fi 70 | -------------------------------------------------------------------------------- /xml-cnn/code/header.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.autograd as autograd 4 | import torch.optim as optim 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import matplotlib.gridspec as gridspec 8 | import os 9 | from torch.autograd import Variable 10 | import sys 11 | import numpy as np 12 | sys.path.append('../utils/') 13 | sys.path.append('models') 14 | import data_helpers 15 | 16 | from w2v import * 17 | from embedding_layer import embedding_layer 18 | from cnn_encoder import cnn_encoder 19 | from sklearn import preprocessing 20 | from sklearn.decomposition import PCA 21 | import scipy.io as sio 22 | from scipy import sparse 23 | import argparse 24 | from visdom import Visdom 25 | from sklearn.externals import joblib 26 | from futils import * 27 | from loss import loss 28 | from xmlCNN import xmlCNN 29 | import timeit 30 | from precision_k import precision_k -------------------------------------------------------------------------------- /xml-cnn/code/lib: -------------------------------------------------------------------------------- 1 | ../../lib -------------------------------------------------------------------------------- /xml-cnn/code/main.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from cnn_train import * 3 | from cnn_test import * 4 | import pdb 5 | from lib.metrics import compute_inv_propensity 6 | from lib.utils import print_command_arguments 7 | 8 | # 
------------------------ Params ------------------------------------------------------------------------------- 9 | parser = argparse.ArgumentParser(description='Process some integers.') 10 | 11 | parser.add_argument('--zd', dest='Z_dim', type=int, default=100, help='Latent layer dimension') 12 | parser.add_argument('--mb', dest='mb_size', type=int, default=20, help='Size of minibatch, changing might result in latent layer variance overflow') 13 | parser.add_argument('--lr', dest='lr', type=float, default=0.001, help='Learning Rate') 14 | parser.add_argument('--p', dest='plot_flg', type=int, default=0, help='1 to plot, 0 to not plot') 15 | # parser.add_argument('--e', dest='num_epochs', type=int, default=50, help='step for displaying loss') 16 | parser.add_argument('--e', dest='num_epochs', type=int, default=2, help='step for displaying loss') 17 | parser.add_argument('--seed', type=int, default=100, metavar='S', help='random seed (default: 100)') 18 | parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') 19 | # parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') 20 | parser.add_argument('--batch-size', type=int, default=16, metavar='N', help='input batch size for training (default: 64)') 21 | parser.add_argument('-a', type=float, default=0.55, 22 | help='Inverse propensity value A (Default: 0.55).') 23 | parser.add_argument('-b', type=float, default=1.5, 24 | help='Inverse propensity value A (Default: 1.5).') 25 | 26 | parser.add_argument('--d', dest='disp_flg', type=int, default=0, help='display graphs') 27 | parser.add_argument('--sve', dest='save', type=int, default=1, help='save models or not') 28 | parser.add_argument('--ss', dest='save_step', type=int, default=10, help='gap between model saves') 29 | parser.add_argument('--mn', dest='model_name', type=str, default='', help='model name') 30 | parser.add_argument('--tr', dest='training', type=int, default=1, help='model name') 31 | parser.add_argument('--lm', dest='load_model', type=str, default="", help='model name') 32 | parser.add_argument('--ds', dest='data_set', type=str, default="rcv", help='dataset name') 33 | 34 | parser.add_argument('--pp', dest='pp_flg', type=int, default=0, help='1 is for min-max pp, 2 is for gaussian pp, 0 for none') 35 | parser.add_argument('--loss', dest='loss_type', type=str, default="BCELoss", help='Loss') 36 | 37 | parser.add_argument('--hidden_dims', type=int, default=512, help='hidden layer dimension') 38 | # parser.add_argument('--hidden_dims', type=int, default=1024, help='hidden layer dimension') # Amazon670K 39 | parser.add_argument('--sequence_length',help='max sequence length of a document', type=int,default=500) 40 | parser.add_argument('--embedding_dim', help='dimension of word embedding representation', type=int, default=300) 41 | parser.add_argument('--model_variation', help='model variation: CNN-rand or CNN-pretrain', type=str, default='pretrain') 42 | parser.add_argument('--pretrain_type', help='pretrain model: GoogleNews or glove', type=str, default='glove') 43 | parser.add_argument('--vocab_size', help='size of vocabulary keeping the most frequent words', type=int, default=30000) 44 | parser.add_argument('--drop_prob', help='Dropout probability', type=int, default=.3) 45 | parser.add_argument('--load_data', help='Load Data or not', type=int, default=0) 46 | parser.add_argument('--mg', dest='multi_gpu', type=int, default=0, help='1 for 2 gpus and 0 for normal') 47 | 
parser.add_argument('--filter_sizes', help='number of filter sizes (could be a list of integer)', type=int, default=[2, 4, 8], nargs='+') 48 | 49 | # Large Datasets. 50 | parser.add_argument('--num_filters', help='number of filters (i.e. kernels) in CNN model', type=int, default=32) 51 | parser.add_argument('--pooling_units', help='number of pooling units in 1D pooling layer', type=int, default=32) 52 | 53 | # Small Datasets. 54 | # parser.add_argument('--num_filters', help='number of filters (i.e. kernels) in CNN model', type=int, default=128) 55 | # parser.add_argument('--pooling_units', help='number of pooling units in 1D pooling layer', type=int, default=128) 56 | 57 | parser.add_argument('--pooling_type', help='max or average', type=str, default='max') 58 | parser.add_argument('--model_type', help='glove or GoogleNews', type=str, default='glove') 59 | parser.add_argument('--num_features', help='50, 100, 200, 300', type=int, default=300) 60 | parser.add_argument('--dropouts', help='0 for not using, 1 for using', type=int, default=0) 61 | parser.add_argument('--clip', help='gradient clipping', type=float, default=1000) 62 | # parser.add_argument('--clip', help='gradient clipping', type=float, default=2.0) 63 | parser.add_argument('--dataset_gpu', help='load dataset in full to gpu', type=int, default=1) 64 | parser.add_argument('--dp', dest='dataparallel', help='to train on multiple GPUs or not', type=bool, default=False) 65 | 66 | # HRR specific arguments. 67 | parser.add_argument('--hrr_labels', action='store_true', default=False, help='Use HRR Labels.') 68 | parser.add_argument('--hrr_dim', type=int, default=400, help='HRR Label Dimension.') 69 | parser.add_argument('--no-grad', action='store_true', default=False, 70 | help='Update Label vectors.') 71 | parser.add_argument('--without-negative', action='store_true', default=False, 72 | help='disable negative loss.') 73 | 74 | params = parser.parse_args() 75 | print_command_arguments(params) 76 | 77 | if(len(params.model_name)==0): 78 | params.model_name = "Gen_data_CNN_Z_dim-{}_mb_size-{}_hidden_dims-{}_preproc-{}_loss-{}_sequence_length-{}_embedding_dim-{}_params.vocab_size={}".format(params.Z_dim, params.mb_size, params.hidden_dims, params.pp_flg, params.loss_type, params.sequence_length, params.embedding_dim, params.vocab_size) 79 | 80 | print('Saving Model to: ' + params.model_name) 81 | 82 | # Begin. 83 | torch.backends.cudnn.deterministic = True 84 | torch.backends.cudnn.benchmark = False 85 | use_cuda = not params.no_cuda and torch.cuda.is_available() 86 | torch.manual_seed(params.seed) 87 | np.random.seed(params.seed) 88 | device = torch.device("cuda" if use_cuda else "cpu") 89 | 90 | if use_cuda: 91 | kwargs = {'num_workers': 16, 'pin_memory': True, 'drop_last': True, 92 | 'batch_size': params.batch_size, 'shuffle': True} 93 | else: 94 | kwargs = {'drop_last': True, 'num_workers': 8, 95 | 'batch_size': params.batch_size, 'shuffle': True} 96 | 97 | # ------------------ data ---------------------------------------------- 98 | params.data_path = '../datasets/' + params.data_set 99 | 100 | # Create training and test data loaders. 101 | train_dataset = XMLDataset(params) 102 | print("-----------Training Dataset Statistics-----------") 103 | print("Features: {}".format(train_dataset.features.shape)) 104 | print("Labels: {}".format(train_dataset.labels.shape)) 105 | 106 | # Compute Propensity Scores. 107 | inv_propen = compute_inv_propensity(train_dataset.labels, A=params.a, B=params.b) 108 | 109 | # Create dataloader. 
110 | train_loader = torch.utils.data.DataLoader(train_dataset, **kwargs) 111 | 112 | test_dataset = XMLDataset(params, train=False) 113 | print("-----------Testing Dataset Statistics------------") 114 | print("Features: {}".format(test_dataset.features.shape)) 115 | print("Labels: {}".format(test_dataset.labels.shape)) 116 | test_loader = torch.utils.data.DataLoader(test_dataset, **kwargs) 117 | 118 | params = update_params(params) 119 | # ----------------------- Loss ------------------------------------------------ 120 | if not params.hrr_labels: 121 | params.loss_fn = torch.nn.BCELoss(size_average=False) 122 | 123 | # -------------------------- Params -------------------------------------------- 124 | if params.model_variation == 'pretrain': 125 | embedding_weights = load_word2vec(params) 126 | else: 127 | embedding_weights = None 128 | 129 | if torch.cuda.is_available(): 130 | params.dtype = torch.cuda.FloatTensor 131 | else: 132 | params.dtype = torch.FloatTensor 133 | 134 | 135 | if(params.training): 136 | train(train_loader, test_loader, embedding_weights, params, device, 137 | propensity=inv_propen) 138 | else: 139 | test_class(test_loader, params, model=model, device=device, verbose=False, 140 | propensity=inv_propen) 141 | -------------------------------------------------------------------------------- /xml-cnn/code/models/classifier.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | class classifier(nn.Module): 3 | def __init__(self, params): 4 | super(classifier, self).__init__() 5 | self.params = params 6 | if(self.params.dropouts): 7 | self.drp = nn.Dropout(.5) 8 | self.l1 = nn.Linear(params.h_dim, params.H_dim) 9 | self.l2 = nn.Linear(params.H_dim, params.y_dim) 10 | self.relu = nn.ReLU() 11 | self.sigmoid = nn.Sigmoid() 12 | torch.nn.init.xavier_uniform_(self.l1.weight) 13 | 14 | def forward(self, H): 15 | H = self.l1(H) 16 | H = self.relu(H) 17 | H = self.l2(H) 18 | H = self.sigmoid(H) 19 | return H -------------------------------------------------------------------------------- /xml-cnn/code/models/cnn_encoder.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from lib.embeddings import get_vectors 3 | from lib.mathops import get_appx_inv, circular_conv, complexMagProj 4 | 5 | def out_size(l_in, kernel_size, padding=0, dilation=1, stride=1): 6 | a = l_in + 2*padding - dilation*(kernel_size - 1) - 1 7 | b = int(a/stride) 8 | return b + 1 9 | 10 | class cnn_encoder(torch.nn.Module): 11 | 12 | def __init__(self, params): 13 | super(cnn_encoder, self).__init__() 14 | self.params = params 15 | self.conv_layers = nn.ModuleList() 16 | self.pool_layers = nn.ModuleList() 17 | fin_l_out_size = 0 18 | 19 | if(params.dropouts): 20 | self.drp = nn.Dropout(p=.25) 21 | self.drp5 = nn.Dropout(p=.5) 22 | 23 | for fsz in params.filter_sizes: 24 | l_out_size = out_size(params.sequence_length, fsz, stride=2) 25 | pool_size = l_out_size // params.pooling_units 26 | l_conv = nn.Conv1d(params.embedding_dim, params.num_filters, fsz, stride=2) 27 | torch.nn.init.xavier_uniform_(l_conv.weight) 28 | if params.pooling_type == 'average': 29 | l_pool = nn.AvgPool1d(pool_size, stride=None, count_include_pad=True) 30 | pool_out_size = (int((l_out_size - pool_size)/pool_size) + 1)*params.num_filters 31 | elif params.pooling_type == 'max': 32 | l_pool = nn.MaxPool1d(2, stride=1) 33 | pool_out_size = (int(l_out_size*params.num_filters - 2) + 1) 34 | fin_l_out_size += pool_out_size 
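# Worked example with the defaults in main.py (sequence_length=500, stride=2,
# 'max' pooling, num_filters=32 -- illustrative numbers only): for fsz=2,
# l_out_size = out_size(500, 2, stride=2) = 250 and
# pool_out_size = int(250 * 32 - 2) + 1 = 7999, which is what gets accumulated
# into fin_l_out_size and later sizes fc_layer_1.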
35 | 36 | self.conv_layers.append(l_conv) 37 | self.pool_layers.append(l_pool) 38 | 39 | self.fc_layer_1 = nn.Linear(fin_l_out_size, params.hidden_dims) 40 | torch.nn.init.xavier_uniform_(self.fc_layer_1.weight) 41 | 42 | # NOTE: Comment out fc2 and fc3 for Amazon670K 43 | self.fc_layer_2 = nn.Linear(params.hidden_dims, params.hidden_dims) 44 | torch.nn.init.xavier_uniform_(self.fc_layer_2.weight) 45 | 46 | self.fc_layer_3 = nn.Linear(params.hidden_dims, params.hidden_dims) 47 | torch.nn.init.xavier_uniform_(self.fc_layer_3.weight) 48 | ### 49 | 50 | if params.hrr_labels: 51 | self.out_layer = nn.Linear(params.hidden_dims, params.hrr_dim) 52 | self.create_label_embedding() # Create the labels. 53 | else: 54 | self.out_layer = nn.Linear(params.hidden_dims, params.y_dim) 55 | 56 | torch.nn.init.xavier_uniform_(self.out_layer.weight) 57 | 58 | 59 | def create_label_embedding(self): 60 | # Class labels. # +1 for the END of LIST Label. 61 | self._class_vectors = get_vectors(self.params.y_dim + 1, self.params.hrr_dim) 62 | 63 | # Initialize embedding layer. 64 | self.class_vec = nn.Embedding(self.params.y_dim + 1, self.params.hrr_dim) 65 | self.class_vec.load_state_dict({'weight': self._class_vectors}) 66 | self.class_vec.weight.requires_grad = False 67 | 68 | # Initialize weights vector. 69 | weights = torch.ones((self.params.y_dim + 1, 1), dtype=torch.int8) 70 | weights[self.params.y_dim] = 0 # Padding vector is made 0. 71 | self.class_weights = nn.Embedding(self.params.y_dim + 1, 1) 72 | self.class_weights.load_state_dict({'weight': weights}) 73 | self.class_weights.weight.requires_grad = False 74 | 75 | # P & N vectors. 76 | p_n_vec = get_vectors(2, self.params.hrr_dim, ortho=True) 77 | if self.params.no_grad: 78 | print("P & N vectors WILL NOT be updated while training...") 79 | self.p = nn.Parameter(p_n_vec[0], requires_grad=False) 80 | self.n = nn.Parameter(p_n_vec[1], requires_grad=False) 81 | else: 82 | print("P & N vectors WILL be updated while training...") 83 | self.p = nn.Parameter(p_n_vec[0], requires_grad=True) 84 | self.n = nn.Parameter(p_n_vec[1], requires_grad=True) 85 | 86 | 87 | def inference(self, s, batch_size, positive=True): 88 | #(batch, dims) 89 | if positive: 90 | vec = self.p.unsqueeze(0).expand(batch_size, self.params.hrr_dim) 91 | else: 92 | vec = self.n.unsqueeze(0).expand(batch_size, self.params.hrr_dim) 93 | 94 | # vec = complexMagProj(vec) 95 | inv_vec = get_appx_inv(vec) 96 | y = circular_conv(inv_vec, s) #(batch, dims) 97 | y = y / (torch.norm(y, dim=-1, keepdim=True) + 1e-8) 98 | return y 99 | 100 | def spp_loss(self, s, target): 101 | """ 102 | Train with SPP. 103 | """ 104 | pos_classes = self.class_vec(target) #(batch, no_label, dims) 105 | pos_classes = pos_classes * self.class_weights(target) 106 | 107 | # Normalize the class vectors. 108 | # tgt_shape = pos_classes.shape 109 | # pos_classes = torch.reshape(pos_classes, (tgt_shape[0] * tgt_shape[1], 110 | # tgt_shape[2])) 111 | # pos_classes = torch.reshape(complexMagProj(pos_classes), (tgt_shape[0], tgt_shape[1], 112 | # tgt_shape[2])) 113 | 114 | # Remove the padding idx vectors. 115 | # pos_classes = pos_classes.to(device) 116 | 117 | # Positive prediction loss 118 | convolve = self.inference(s, target.size(0)) 119 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 120 | J_p = torch.mean(torch.sum(1 - torch.abs(cosine), dim=-1)) 121 | 122 | # Negative prediction loss. 
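# (Note: the positive term above drives |cos| between each true label vector
# and the p-unbound output toward 1, while the term below drives |cos| between
# the same label vectors and the n-unbound output toward 0, i.e. unbinding s
# with n should not recover the positive labels.)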
123 | J_n = 0.0 124 | if self.params.without_negative is False: 125 | convolve = self.inference(s, target.size(0), positive=False) 126 | cosine = torch.matmul(pos_classes, convolve.unsqueeze(1).transpose(-1, -2)).squeeze(-1) 127 | J_n = torch.mean(torch.sum(torch.abs(cosine), dim=-1)) 128 | 129 | # Total Loss. 130 | loss = J_n + J_p 131 | return loss 132 | 133 | 134 | def forward(self, inputs): 135 | #o0 = self.drp(self.bn_1(inputs)).permute(0,2,1) 136 | o0 = inputs.permute(0,2,1)# self.bn_1(inputs.permute(0,2,1)) 137 | if(self.params.dropouts): 138 | o0 = self.drp(o0) 139 | conv_out = [] 140 | 141 | for i in range(len(self.params.filter_sizes)): 142 | o = self.conv_layers[i](o0) 143 | o = o.view(o.shape[0], 1, o.shape[1] * o.shape[2]) 144 | o = self.pool_layers[i](o) 145 | o = nn.functional.relu(o) 146 | o = o.view(o.shape[0],-1) 147 | conv_out.append(o) 148 | del o 149 | if len(self.params.filter_sizes)>1: 150 | o = torch.cat(conv_out,1) 151 | else: 152 | o = conv_out[0] 153 | 154 | # Additional fully connected layers added to the model. 155 | o = self.fc_layer_1(o) 156 | o = nn.functional.relu(o) 157 | 158 | # NOTE: Comment out fc2 and fc3 for Amazon670K 159 | o = self.fc_layer_2(o) 160 | o = nn.functional.relu(o) 161 | 162 | o = self.fc_layer_3(o) 163 | o = nn.functional.relu(o) 164 | ### 165 | 166 | if(self.params.dropouts): 167 | o = self.drp5(o) 168 | o = self.out_layer(o) 169 | 170 | if not self.params.hrr_labels: 171 | o = torch.sigmoid(o) 172 | 173 | return o 174 | -------------------------------------------------------------------------------- /xml-cnn/code/models/embedding_layer.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | 3 | class embedding_layer(torch.nn.Module): 4 | 5 | def __init__(self, params, embedding_weights): 6 | super(embedding_layer, self).__init__() 7 | self.l = nn.Embedding(params.vocab_size, params.embedding_dim) 8 | if params.model_variation == 'pretrain': 9 | self.l.weight.data.copy_(torch.from_numpy(embedding_weights)) 10 | self.l.weight.requires_grad=False 11 | 12 | def forward(self, inputs): 13 | o = self.l(inputs) 14 | return o 15 | -------------------------------------------------------------------------------- /xml-cnn/code/models/header.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.autograd as autograd 4 | import torch.optim as optim 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import matplotlib.gridspec as gridspec 8 | import os 9 | from torch.autograd import Variable 10 | import sys 11 | import numpy as np 12 | sys.path.append('../../utils/') 13 | sys.path.append('models/') 14 | import data_helpers 15 | 16 | from w2v import * 17 | from embedding_layer import embedding_layer 18 | from sklearn import preprocessing 19 | from sklearn.decomposition import PCA 20 | import scipy.io as sio 21 | from scipy import sparse 22 | import argparse 23 | from visdom import Visdom 24 | from sklearn.externals import joblib 25 | from futils import * 26 | from loss import loss 27 | -------------------------------------------------------------------------------- /xml-cnn/code/models/xmlCNN.py: -------------------------------------------------------------------------------- 1 | from header import * 2 | from cnn_encoder import cnn_encoder 3 | from lib.utils import Measure 4 | 5 | class xmlCNN(nn.Module): 6 | def __init__(self, params, embedding_weights): 7 | super(xmlCNN, self).__init__() 8 | 
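# xmlCNN is a thin wrapper: token ids -> embedding_layer -> cnn_encoder.
# With --hrr_labels the objective is the encoder's spp_loss over HRR label
# vectors; otherwise params.loss_fn (binary cross-entropy over the label
# indicator matrix) is used.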
self.params = params
9 | self.embedding_layer = embedding_layer(params, embedding_weights)
10 | self.classifier = cnn_encoder(params)
11 |
12 | if params.hrr_labels:
13 | self.loss = self.classifier.spp_loss
14 | else:
15 | self.loss = self.params.loss_fn
16 |
17 | # Create measurements.
18 | self.time = {
19 | 'train': Measure("Train"),
20 | 'train_forward_pass': Measure("Train Forward Pass"),
21 | 'train_loss': Measure("Train Loss"),
22 | 'optimization': Measure("Optimization"),
23 | 'test_forward_pass': Measure("Test Forward Pass"),
24 | 'inference': Measure("Inference"),
25 | 'data_load': Measure("Data Loader"),
26 | }
27 |
28 | def forward(self, batch_x, batch_y):
29 | # ----------- Encode (X, Y) --------------------------------------------
30 | self.time['train_forward_pass'].start()
31 | e_emb = self.embedding_layer.forward(batch_x)
32 | Y = self.classifier.forward(e_emb)
33 | self.time['train_forward_pass'].end()
34 |
35 | # Compute time for loss.
36 | self.time['train_loss'].start()
37 | loss = self.loss(Y, batch_y)
38 | self.time['train_loss'].end()
39 |
40 | if(loss < 0):
41 | print(loss)
42 | print(Y[0:100])
43 | print(batch_y[0:100])
44 | sys.exit()
45 |
46 | return loss.view(-1,1), Y
47 |
--------------------------------------------------------------------------------
/xml-cnn/code/precision_k.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.io as sio
3 | def precision_k(true_mat, score_mat, k):
4 | p = np.zeros((k, 1))
5 | rank_mat = np.argsort(score_mat)
6 | backup = np.copy(score_mat)
7 | for k in range(k):
8 | score_mat = np.copy(backup)
9 | for i in range(rank_mat.shape[0]):
10 | score_mat[i][rank_mat[i, :-(k+1)]] = 0
11 |
12 | score_mat = np.ceil(score_mat)
13 | kk = np.argwhere(score_mat > 0)
14 | mat = np.multiply(score_mat, true_mat)
15 | num = np.sum(mat, axis=1)
16 | p[k] = np.mean(num/(k+1))
17 |
18 | return np.around(p, decimals=4)
19 |
--------------------------------------------------------------------------------
/xml-cnn/code/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # set -vx
3 |
4 | DATASET=${1:-"eurlex"}
5 | USE_HRR=${2:-"False"}
6 | EXP_NAME=${3:-"test"}
7 | PARALLELIZE=${4:-"False"}
8 |
9 | OPTIONS=""
10 | if [[ "$PARALLELIZE" == "True" ]]
11 | then
12 | echo "Train WITH data parallelism..."
13 | OPTIONS="$OPTIONS --dp 1"
14 | fi
15 |
16 | # Build name for HRR and basic baseline models.
17 | if [[ "$USE_HRR" == "False" ]]
18 | then
19 | echo "Train WITHOUT HRR representations.."
20 | elif [[ "$USE_HRR" == "True" ]]
21 | then
22 | echo "Train WITH HRR representations.."
23 | OPTIONS="$OPTIONS --hrr_labels"
24 | fi
25 |
26 | echo "OPTIONS: $OPTIONS"
27 | python main.py --ds $DATASET --mn $DATASET --model_type glove-bin $OPTIONS > ../results/${DATASET}.results
28 |
29 | # Test the model.
30 | echo "Test Results..."
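# Evaluate the checkpoint written by the training run above; --tr 0 presumably
# switches main.py into test-only mode and --lm supplies the saved model path.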
31 | python main.py --ds $DATASET --model_type glove-bin --tr 0 --lm ../saved_models/$DATASET/model_best_test >> ../results/${DATASET}.results 32 | -------------------------------------------------------------------------------- /xml-cnn/code/test_manik.m: -------------------------------------------------------------------------------- 1 | addpath('/scratch/work/saxenas2/fastxml/manik/Tools/matlab/') 2 | addpath('/scratch/work/saxenas2/fastxml/manik/tools/') 3 | addpath('/scratch/work/saxenas2/fastxml/manik/Tools/metrics/') 4 | addpath('/scratch/work/saxenas2/fastxml/manik/FastXML/') 5 | 6 | A = .55; 7 | B = 1.5; 8 | 9 | load score_matrix.mat 10 | [I, J, S] = find(score_matrix); 11 | [sorted_I, idx] = sort(I); 12 | J = J(idx); 13 | S = S(idx); 14 | score_matrix = sparse(J, sorted_I, S); 15 | 16 | load ty.mat 17 | [I, J, S] = find(ty); 18 | [sorted_I, idx] = sort(I); 19 | J = J(idx); 20 | S = S(idx); 21 | ty = sparse(J, sorted_I, S); 22 | ip = inv_propensity(ty,A,B); 23 | 24 | [metrics] = get_all_metrics(score_matrix , ty, ip) 25 | disp(metrics) 26 | 27 | % -------- For RCV1 His neural net-------- 28 | 29 | % prec 96.58 89.82 79.66 65.28 55.15 30 | % nDCG 96.58 92.51 90.96 91.01 91.46 31 | % prec_wt 86.22 86.25 87.38 87.70 88.48 32 | % nDCG_wt 86.22 86.24 87.00 87.21 87.65 33 | 34 | % ----------------------------------------- 35 | 36 | % prec 93.26 86.08 75.64 62.28 52.79 37 | % nDCG 93.26 88.84 86.81 87.18 87.84 38 | % prec_wt 73.04 76.45 78.40 80.02 81.59 39 | % nDCG_wt 73.04 75.62 77.04 78.06 78.96 40 | 41 | % prec 95.50 87.29 76.72 63.20 53.59 42 | % nDCG 95.50 90.29 88.17 88.53 89.18 43 | % prec_wt 72.24 76.67 79.44 81.27 82.96 44 | % nDCG_wt 72.24 75.59 77.59 78.76 79.73 45 | 46 | 47 | % ---------- Initialized weights with Dropouts ------------- 48 | % Best for test ------------------- 49 | % prec 94.06 84.04 73.35 60.90 51.89 50 | % nDCG 94.06 87.45 84.92 85.63 86.51 51 | % prec_wt 70.89 73.01 74.81 77.17 79.28 52 | % nDCG_wt 70.89 72.50 73.76 75.21 76.40 53 | 54 | % Best for train ------------------- 55 | % prec 93.62 84.88 74.66 61.41 52.02 56 | % nDCG 93.62 88.00 86.00 86.34 86.98 57 | % prec_wt 71.90 75.07 77.10 78.54 80.01 58 | % nDCG_wt 71.90 74.30 75.76 76.67 77.52 59 | 60 | 61 | % ---------------- base_model_with_test_saving_after_each_run ------ 62 | % model_best_batch 63 | % prec 94.49 86.20 75.71 62.40 52.84 64 | % nDCG 94.49 89.23 87.11 87.53 88.16 65 | % prec_wt 72.40 76.11 78.32 80.02 81.60 66 | % nDCG_wt 72.40 75.21 76.81 77.88 78.79 67 | 68 | % model_best_for_test 69 | % prec 94.98 86.05 75.65 62.45 53.06 70 | % nDCG 94.98 89.21 87.08 87.54 88.29 71 | % prec_wt 71.91 75.42 77.85 79.69 81.58 72 | % nDCG_wt 71.91 74.57 76.30 77.47 78.54 73 | 74 | 75 | % --------------- L1 loss ---------------------- 76 | model_best_for_test 77 | bad!!! 78 | 79 | model_best_batch 80 | bad!!! 81 | 82 | % ------------------ Ablation -------------- 83 | % prec 94.59 87.66 77.32 63.61 53.84 84 | % nDCG 94.59 90.38 88.51 88.81 89.37 85 | % prec_wt 74.26 77.99 80.16 81.68 83.20 86 | % nDCG_wt 74.26 77.08 78.66 79.63 80.50 -------------------------------------------------------------------------------- /xml-cnn/code/train.slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is for GPU allocation is available. #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=8 6 | #SBATCH --output=output/slurm-%x-%a.out 7 | #SBATCH --error=output/slurm-%x-%a.err 8 | 9 | # Set the environment. 
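# NOTE: "spp" below is a site-specific conda environment name; adjust it to your
# setup. The script assumes it is submitted as a two-element array job (e.g.
# `sbatch --array=0-1 ...`) so that SLURM_ARRAY_TASK_ID can pick the "baseline"
# or "hrr" configuration further down.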
10 | # source deactivate # Remove previous environments. 11 | source ~/anaconda3/etc/profile.d/conda.sh 12 | conda activate spp # Environment name. 13 | 14 | # Execute the code. 15 | set -o xtrace 16 | TASK_ID=$((SLURM_ARRAY_TASK_ID)) 17 | NAME=$1 18 | SAVE_MODEL=$2 19 | EXP_NAME=$3 20 | DIMS=$4 21 | WITH_GRAD=${5} 22 | WITHOUT_NEGATIVE=${6} 23 | PROP_A=${7:-"0.55"} # Propensity value A. For Amazon-670K it is 0.6 24 | PROP_B=${8:-"1.5"} # Propensity value B. For Amazon-670K it is 2.6 25 | MODEL=("baseline" "hrr") 26 | 27 | # Select the model. 28 | MODEL_TYPE=${MODEL[${TASK_ID}]} 29 | # FIN_EXP_NAME=${NAME}-${EXP_NAME}-${MODEL_TYPE}-${DIMS}-${WITH_GRAD}-${WITHOUT_NEGATIVE} 30 | FIN_EXP_NAME=${NAME}-${EXP_NAME}-${MODEL_TYPE} 31 | echo "Parameters: $NAME $SAVE_MODEL" 32 | echo " $MODEL_TYPE $EXP_NAME $DIMS" 33 | echo " ${WITH_GRAD} ${WITHOUT_NEGATIVE}" 34 | 35 | # Construct list of options. 36 | OPTIONS="" 37 | if [ "$MODEL_TYPE" == "hrr" ] 38 | then 39 | OPTIONS="${OPTIONS} --hrr_labels" 40 | NAME="${NAME}_hrr" 41 | fi 42 | 43 | if [ "$WITH_GRAD" == "no-grad" ] 44 | then 45 | OPTIONS="${OPTIONS} --no-grad" 46 | fi 47 | 48 | if [ "${WITHOUT_NEGATIVE}" == "without-negative" ] 49 | then 50 | OPTIONS="${OPTIONS} --without-negative" 51 | fi 52 | 53 | # Train the the models. 54 | # --dp 1 for data parallel option. 55 | echo "OPTIONS: $OPTIONS" 56 | python main.py --ds $NAME --mn $FIN_EXP_NAME -a ${PROP_A} -b ${PROP_B} --model_type glove-bin $OPTIONS > ../results/${FIN_EXP_NAME}.results 57 | 58 | # Test the model. 59 | echo "Test Results..." 60 | python main.py --ds $NAME -a ${PROP_A} -b ${PROP_B} --model_type glove-bin --tr 0 --lm ../saved_models/$FIN_EXP_NAME/model_best_test $OPTIONS >> ../results/${FIN_EXP_NAME}.results 61 | -------------------------------------------------------------------------------- /xml-cnn/data/README.md: -------------------------------------------------------------------------------- 1 | # AttentionXML 2 | [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727) 3 | 4 | ## Requirements 5 | 6 | * python==3.7.4 7 | * click==7.0 8 | * ruamel.yaml==0.16.5 9 | * numpy==1.16.2 10 | * scipy==1.3.1 11 | * scikit-learn==0.21.2 12 | * gensim==3.4.0 13 | * torch==1.0.1 14 | * nltk==3.4 15 | * tqdm==4.31.1 16 | * joblib==0.13.2 17 | * logzero==1.5.0 18 | 19 | ## Datasets 20 | 21 | * [EUR-Lex](https://drive.google.com/open?id=1iPGbr5-z2LogtMFG1rwwekV_aTubvAb2) 22 | * [Wiki10-31K](https://drive.google.com/open?id=1Tv4MHQzDWTUC9hRFihRhG8_jt1h0VhnR) 23 | * [AmazonCat-13K](https://drive.google.com/open?id=1VwHAbri6y6oh8lkpZ6sSY_b1FRNnCLFL) 24 | * [Amazon-670K](https://drive.google.com/open?id=1Xd4BPFy1RPmE7MEXMu77E2_xWOhR1pHW) 25 | * [Wiki-500K](https://drive.google.com/open?id=1bGEcCagh8zaDV0ZNGsgF0QtwjcAm0Afk) 26 | * [Amazon-3M](https://drive.google.com/open?id=187vt5vAkGI2mS2WOMZ2Qv48YKSjNbQv4) 27 | 28 | Download the GloVe embedding (840B,300d) and convert it to gensim format (which can be loaded by **gensim.models.KeyedVectors.load**). 29 | 30 | We also provide a converted GloVe embedding at [here](https://drive.google.com/file/d/10w_HuLklGc8GA_FtUSdnHT8Yo1mxYziP/view?usp=sharing). 
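If you prefer to do the conversion yourself, a minimal sketch with gensim is shown below (file names are placeholders; assumes gensim 3.x, whose `glove2word2vec` script ships with the library):

```python
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Rewrite the raw GloVe text file into word2vec format (adds the header line).
glove2word2vec('glove.840B.300d.txt', 'glove.840B.300d.w2v.txt')

# Load the vectors and save them in gensim's native format, which
# gensim.models.KeyedVectors.load can read back directly.
kv = KeyedVectors.load_word2vec_format('glove.840B.300d.w2v.txt')
kv.save('glove.840B.300d.gensim')
```

The resulting `glove.840B.300d.gensim` file is what the `--w2v-model` flag in the preprocessing commands below expects.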
31 |
32 | ## XML Experiments
33 |
34 | The XML experiments from the paper can be run directly, for example:
35 | ```bash
36 | ./scripts/run_eurlex.sh
37 | ```
38 | ## Preprocess
39 |
40 | Run preprocess.py on the train and test datasets with tokenized texts as follows:
41 | ```bash
42 | python preprocess.py \
43 | --text-path data/EUR-Lex/train_texts.txt \
44 | --label-path data/EUR-Lex/train_labels.txt \
45 | --vocab-path data/EUR-Lex/vocab.npy \
46 | --emb-path data/EUR-Lex/emb_init.npy \
47 | --w2v-model data/glove.840B.300d.gensim
48 |
49 | python preprocess.py \
50 | --text-path data/EUR-Lex/test_texts.txt \
51 | --label-path data/EUR-Lex/test_labels.txt \
52 | --vocab-path data/EUR-Lex/vocab.npy
53 | ```
54 |
55 | Or run preprocess.py, tokenizing the raw texts with NLTK first, as follows:
56 | ```bash
57 | python preprocess.py \
58 | --text-path data/Wiki10-31K/train_raw_texts.txt \
59 | --tokenized-path data/Wiki10-31K/train_texts.txt \
60 | --label-path data/Wiki10-31K/train_labels.txt \
61 | --vocab-path data/Wiki10-31K/vocab.npy \
62 | --emb-path data/Wiki10-31K/emb_init.npy \
63 | --w2v-model data/glove.840B.300d.gensim
64 |
65 | python preprocess.py \
66 | --text-path data/Wiki10-31K/test_raw_texts.txt \
67 | --tokenized-path data/Wiki10-31K/test_texts.txt \
68 | --label-path data/Wiki10-31K/test_labels.txt \
69 | --vocab-path data/Wiki10-31K/vocab.npy
70 | ```
71 |
72 |
73 | ## Train and Predict
74 |
75 | Train and predict as follows:
76 | ```bash
77 | python main.py --data-cnf configure/datasets/EUR-Lex.yaml --model-cnf configure/models/AttentionXML-EUR-Lex.yaml
78 | ```
79 |
80 | Or do prediction only with the option `--mode eval`.
81 |
82 | ## Ensemble
83 |
84 | Train and predict with an ensemble:
85 | ```bash
86 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 0
87 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 1
88 | python main.py --data-cnf configure/datasets/Wiki-500K.yaml --model-cnf configure/models/FastAttentionXML-Wiki-500K.yaml -t 2
89 | python ensemble.py -p results/FastAttentionXML-Wiki-500K -t 3
90 | ```
91 |
92 | ## Evaluation
93 |
94 | ```bash
95 | python evaluation.py --results results/AttentionXML-EUR-Lex-labels.npy --targets data/EUR-Lex/test_labels.npy
96 | ```
97 | Or get propensity-scored metrics as well:
98 |
99 | ```bash
100 | python evaluation.py \
101 | --results results/FastAttentionXML-Amazon-670K-labels.npy \
102 | --targets data/Amazon-670K/test_labels.npy \
103 | --train-labels data/Amazon-670K/train_labels.npy \
104 | -a 0.6 \
105 | -b 2.6
106 |
107 | ```
108 |
109 | ## Reference
110 | You et al., [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727), NeurIPS 2019
111 |
112 | ## Declaration
113 | It is free for non-commercial use. For commercial use, please contact Mr. Ronghui You and Prof. Shanfeng Zhu (zhusf@fudan.edu.cn).
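## Propensity Scoring (Illustrative)

The propensity model behind the `-a`/`-b` flags in the Evaluation section above follows Jain et al. (2016). A rough numpy sketch of the per-label inverse propensities (illustrative only, not the actual evaluation.py implementation):

```python
import numpy as np
from scipy.sparse import csr_matrix

def inverse_propensity(train_labels: csr_matrix, a: float = 0.55, b: float = 1.5) -> np.ndarray:
    """Per-label inverse propensities 1/p_l, with p_l = 1 / (1 + C * (N_l + B)^-A)."""
    n = train_labels.shape[0]                            # number of training points
    freq = np.asarray(train_labels.sum(axis=0)).ravel()  # label frequencies N_l
    c = (np.log(n) - 1.0) * (b + 1.0) ** a
    return 1.0 + c * (freq + b) ** (-a)
```

Propensity-scored precision@k weights every correctly predicted label by its inverse propensity, so rare (tail) labels count for more; the label frequencies N_l come from the training set, which is why `--train-labels` must be passed in addition to the predictions and targets.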
-------------------------------------------------------------------------------- /xml-cnn/embedding_weights/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /xml-cnn/utils/data_dive.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # import torch 4 | import timeit 5 | import argparse 6 | import numpy as np 7 | import time 8 | # import torch.nn as nn 9 | # import torch.optim as optim 10 | import matplotlib.pyplot as plt 11 | # import torch.autograd as autograd 12 | from sklearn import preprocessing 13 | # from torch.autograd import Variable 14 | from sklearn.decomposition import PCA 15 | import matplotlib.gridspec as gridspec 16 | 17 | # this file is to explore the generated data and the data that already exist to see how much similarity do they share. 18 | # It prits some stats and qualitative results 19 | 20 | new_data_x_file = "../datasets/Gen_data_Z_dim-200_mb_size-100_h_dim-600_preproc-1_beta-1.01_final_ly-Sigmoid_loss-BCELoss/new_x.npy" 21 | new_data_y_file = "../datasets/Gen_data_Z_dim-200_mb_size-100_h_dim-600_preproc-1_beta-1.01_final_ly-Sigmoid_loss-BCELoss/new_y.npy" 22 | actual_data_x_file = "../datasets/Eurlex/eurlex_docs/x_tr.npy" 23 | actual_data_y_file = "../datasets/Eurlex/eurlex_docs/y_tr.npy" 24 | indx2word_file = "../datasets/Eurlex/eurlex_docs/feature_names.txt" 25 | indx2label = "../datasets/Eurlex/eurlex_docs/label_set.txt" 26 | K = 10 27 | # ---------------------------------------------------------------------------- 28 | 29 | new_data_x = np.load(new_data_x_file) 30 | new_data_y = np.load(new_data_y_file) 31 | actual_data_x = np.load(actual_data_x_file) 32 | actual_data_y = np.load(actual_data_y_file) 33 | f = open(indx2label, 'r') 34 | temp = f.read().splitlines() 35 | labels = [] 36 | for i in temp: 37 | labels.append(i.split(":")[1]) 38 | f = open(indx2word_file, 'r') 39 | temp = f.read().splitlines() 40 | words = [] 41 | for i in temp: 42 | words.append(i.split(":")[1]) 43 | 44 | print("Shapes: new_x: {}; new_y: {}; original_x: {}; original_y: {};".format(new_data_x.shape, \ 45 | new_data_y.shape, actual_data_x.shape, actual_data_y.shape)) 46 | print("Num Words: {}; Num Labels: {};".format(len(labels), len(words))) 47 | 48 | for data_pt_num in range(K): 49 | data_pt_labels = np.argwhere(new_data_y[data_pt_num]==1) 50 | label_names = [] 51 | for label in data_pt_labels.tolist(): 52 | # print(label) 53 | label_names.append(labels[label[0]]) 54 | print("Labels in the data point : {}".format(label_names)) 55 | 56 | data_pt_words = np.argsort(new_data_x[data_pt_num])[-10:] 57 | word_names = [] 58 | for word in data_pt_words.tolist(): 59 | word_names.append(words[word]) 60 | print("Top 10 words in the data point : {}".format(word_names)) 61 | 62 | # Nearest Data point in actual data 63 | indx = -1 64 | closest = 1e10 65 | # print(actual_data_y) 66 | for i in range(len(actual_data_y)): 67 | dist = -len(np.intersect1d(np.argwhere(actual_data_y[i]==1), np.argwhere(new_data_y[data_pt_num]==1))) 68 | # print(np.argwhere(actual_data_y[i]==1)) 69 | # print(np.argwhere(new_data_y[data_pt_num]==1)) 70 | if(dist max_label_per_row: 64 | max_label_per_row = len(l_list) 65 | 66 | for pos, y in enumerate(l_list): 67 | row_idx.append(i) 68 | col_idx.append(y) 69 | val_idx.append(1) 70 | 71 | if 
hrr_labels: 72 | Y_hrr.append(l_list) 73 | 74 | m = max(row_idx) + 1 75 | n = max(col_idx) + 1 76 | print("Number of Labels: {}".format(n)) 77 | 78 | # NOTE: n + 1 represents the number of labels. For HRR it is the last 79 | # label. 80 | if(M and N): 81 | if(N > n): 82 | #y_te = y_te.resize((np.shape(y_te)[0], np.shape(y_tr)[1])) 83 | Y = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, N)) 84 | elif(N <= n): 85 | Y = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n)) 86 | Y = Y[:, :N] # This eliminates labels not present in the training by default. 87 | else: 88 | Y = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n)) 89 | 90 | return [x_text, Y, m, n, Y_hrr, max_label_per_row] 91 | 92 | # max_labels are the number of labels per row. 93 | # num_labels are the total number of labels in the dataset. 94 | def build_hrr_labels(Y_tr_hrr, Y_te_hrr, max_tr_labels, max_te_labels, num_labels): 95 | max_labels = max_tr_labels if max_tr_labels > max_te_labels else max_te_labels 96 | for i in range(0, len(Y_tr_hrr)): 97 | diff = max_labels - len(Y_tr_hrr[i]) 98 | Y_tr_hrr[i] = Y_tr_hrr[i] + [num_labels for i in range(0, diff)] if diff > 0 else Y_tr_hrr[i][: max_labels] 99 | 100 | Y_tr_hrr = sp.csr_matrix(np.array(Y_tr_hrr)) 101 | 102 | for i in range(0, len(Y_te_hrr)): 103 | diff = max_labels - len(Y_te_hrr[i]) 104 | Y_te_hrr[i] = Y_te_hrr[i] + [num_labels for i in range(0, diff)] if diff > 0 else Y_te_hrr[i][: max_labels] 105 | 106 | Y_te_hrr = sp.csr_matrix(np.array(Y_te_hrr)) 107 | return Y_tr_hrr, Y_te_hrr 108 | 109 | 110 | def build_vocab(sentences, params, vocab_size=50000): 111 | word_counts = Counter(itertools.chain(*sentences)) 112 | vocabulary_inv = [x[0] for x in word_counts.most_common(vocab_size)] 113 | vocabulary = {x: i for i, x in enumerate(vocabulary_inv)} 114 | # append symbol to the vocabulary 115 | vocabulary[''] = len(vocabulary) 116 | vocabulary_inv.append('') 117 | vocabulary[params.go_token] = len(vocabulary) 118 | vocabulary_inv.append(params.go_token) 119 | vocabulary[params.end_token] = len(vocabulary) 120 | vocabulary_inv.append(params.end_token) 121 | 122 | return [vocabulary, vocabulary_inv] 123 | 124 | 125 | def build_input_data(sentences, vocabulary): 126 | x = np.array([[vocabulary[word] if word in vocabulary else vocabulary[''] for word in sentence] for sentence in sentences]) 127 | #x = np.array([[vocabulary[word] if word in vocabulary else len(vocabulary) for word in sentence] for sentence in sentences]) 128 | return x 129 | 130 | 131 | def load_data(params, max_length=500, vocab_size=50000, hrr_labels=False): 132 | # Load and preprocess data 133 | with open(os.path.join(params.data_path), 'rb') as fin: 134 | [train, test, vocab, catgy] = pickle.load(fin, encoding="latin1") 135 | 136 | # dirty trick to prevent errors happen when test is empty 137 | if len(test) == 0: 138 | test[:5] = train[:5] 139 | 140 | trn_sents, Y_trn, m, n_tr, Y_tr_hrr, max_tr_labels = load_data_and_labels(train, hrr_labels=hrr_labels) 141 | tst_sents, Y_tst, m, n, Y_te_hrr, max_te_labels = load_data_and_labels(test, M=m, N=n_tr, hrr_labels=hrr_labels, max_labels=max_tr_labels) 142 | 143 | if hrr_labels: 144 | Y_tr_hrr, Y_te_hrr = build_hrr_labels(Y_tr_hrr, Y_te_hrr, max_tr_labels, max_te_labels, num_labels=n_tr) 145 | 146 | sents_padded_sets, params.sequence_length = pad_sentences([trn_sents, tst_sents] , padding_word=params.pad_token, max_length=max_length) 147 | # tst_sents_padded = pad_sentences(tst_sents, padding_word=params.pad_token, max_length=max_length) 148 | 
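# When hrr_labels is set, build_hrr_labels above has just padded every example's
# label-index list to a common length using index n_tr (one past the last real
# label); cnn_encoder.create_label_embedding gives that padding index a zero
# weight so it drops out of the SPP loss.
# The vocabulary below is built over the padded train + test sentences, capped
# at the vocab_size most frequent tokens; everything else maps to ''.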
vocabulary, vocabulary_inv = build_vocab(sents_padded_sets[0] + sents_padded_sets[1], params, vocab_size=vocab_size) 149 | X_trn = build_input_data(sents_padded_sets[0], vocabulary) 150 | X_tst = build_input_data(sents_padded_sets[1], vocabulary) 151 | 152 | """ 153 | Dataset Information. 154 | """ 155 | print("Train X: {}, Train Y: {}".format(X_trn.shape, Y_trn.shape)) 156 | print("Test X: {}, Test Y: {}".format(X_tst.shape, Y_tst.shape)) 157 | if hrr_labels: 158 | print("Max Tr Labels: {}, Max Te Labels: {}".format(max_tr_labels, max_te_labels)) 159 | print("Train Y HRR: {}, Test Y HRR: {}".format(Y_tr_hrr.shape, Y_te_hrr.shape)) 160 | 161 | return X_trn, Y_trn, X_tst, Y_tst, vocabulary, vocabulary_inv, params, Y_tr_hrr, Y_te_hrr 162 | # return X_trn, Y_trn, vocabulary, vocabulary_inv 163 | 164 | 165 | def batch_iter(data, batch_size, num_epochs): 166 | """ 167 | Generates a batch iterator for a dataset. 168 | """ 169 | data = np.array(data) 170 | data_size = len(data) 171 | num_batches_per_epoch = int(len(data)/batch_size) + 1 172 | for epoch in range(num_epochs): 173 | # Shuffle the data at each epoch 174 | shuffle_indices = np.random.permutation(np.arange(data_size)) 175 | shuffled_data = data[shuffle_indices] 176 | for batch_num in range(num_batches_per_epoch): 177 | start_index = batch_num * batch_size 178 | end_index = min((batch_num + 1) * batch_size, data_size) 179 | yield shuffled_data[start_index:end_index] 180 | -------------------------------------------------------------------------------- /xml-cnn/utils/fiddle_clusters.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('utils/') 3 | sys.path.append('models/') 4 | import numpy as np 5 | import os 6 | from sklearn import preprocessing 7 | from sklearn.decomposition import PCA 8 | import argparse 9 | from sklearn.cluster import KMeans 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import matplotlib.gridspec as gridspec 13 | import cPickle 14 | from sklearn.metrics import silhouette_score 15 | from dpmeans import * 16 | from sklearn.decomposition import PCA 17 | from sklearn.decomposition import TruncatedSVD 18 | import scipy.io as sio 19 | x_tr = np.load('datasets/Eurlex/eurlex_docs/x_tr.npy') 20 | y_tr = np.load('datasets/Eurlex/eurlex_docs/y_tr.npy') 21 | x_te = np.load('datasets/Eurlex/eurlex_docs/x_te.npy') 22 | y_te = np.load('datasets/Eurlex/eurlex_docs/y_te.npy') 23 | 24 | n = np.shape(x_tr)[0] 25 | m = np.shape(y_tr)[1] 26 | 27 | 28 | 29 | # ------ Making Adjacency ------------------ 30 | dct = {} 31 | for i in range(m): 32 | dct[i] = np.argwhere(y_tr[:,i]==1) 33 | 34 | adjacency_mat = np.zeros((m,m)) 35 | check_mat = np.zeros((m,m)) 36 | for i in range(m): 37 | for j in range(m): 38 | adjacency_mat[i,j] = len(np.intersect1d(dct[i],dct[j])) 39 | adjacency_mat[j, i] = adjacency_mat[i,j] 40 | check_mat[i,j] = check_mat[j,i] = 1 41 | # adjacency_mat[i, i] = len(dct[i]) 42 | # check_mat[i,i] = 1 43 | 44 | print(i) 45 | np.save('adjacency_mat', adjacency_mat) 46 | adjacency_mat = sparse.csr_matrix(adjacency_mat) 47 | sio.savemat('adjacency_mat', adjacency_mat) 48 | print((check_mat==0).any()) 49 | print(adjacency_mat[:100,:100]) 50 | # ----------------------------------------- 51 | 52 | # ------------- PP --------------------------------------- 53 | adjacency_mat = np.load('/scratch/work/saxenas2/CVAE_XML/adjacency_mat.npy') 54 | pp = preprocessing.MinMaxScaler() 55 | scaler = pp.fit(adjacency_mat) 56 | adjacency_mat = 
scaler.transform(adjacency_mat) 57 | # ------------------------------------------------------- 58 | 59 | # ----------------------- cluster + score --------------- 60 | clusters = [2, 4, 6, 8, 10, 12, 15, 18, 21, 24, 27, 30] 61 | scores = [] 62 | scores_silhoette = [] 63 | for cluster_no in clusters: 64 | print(cluster_no) 65 | kmeans = KMeans(n_clusters=cluster_no, random_state=0).fit(adjacency_mat) 66 | scores.append(kmeans.score(adjacency_mat)) 67 | label = kmeans.labels_ 68 | scores_silhoette.append(silhouette_score(adjacency_mat, label, metric='euclidean')) 69 | with open('classifier_' + str(cluster_no) + '.pkl', 'wb') as fid: 70 | cPickle.dump(kmeans, fid) 71 | # --------------------------------------------------------- 72 | 73 | # scores = [] 74 | # for cluster_no in clusters: 75 | # with open('classifier_'+ str(cluster_no) + '.pkl', 'rb') as fid: 76 | # kmeans = cPickle.load(fid) 77 | # label = kmeans.labels_ 78 | # scores.append(silhouette_score(adjacency_mat, label, metric='euclidean')) 79 | 80 | matplotlib.pyplot.plot(clusters, scores) 81 | plt.show() 82 | 83 | # ---------------------- Explore Clusters ------------------------- 84 | cluster_no = 30 85 | # with open('clusterings/classifier_'+ str(cluster_no) + '.pkl', 'rb') as fid: 86 | with open('classifier_'+ str(cluster_no) + '.pkl', 'rb') as fid: 87 | kmeans = cPickle.load(fid) 88 | 89 | y_pred = kmeans.predict(adjacency_mat) 90 | clusters = {} 91 | y_of_cluster = {} 92 | for i in range(cluster_no): 93 | clusters[i] = np.argwhere(y_pred==i) 94 | y_of_cluster[i] = y_tr[:, clusters[i]] 95 | # y_of_cluster[i] = np.array(y_of_cluster[i][:,0]) 96 | x = np.sum(y_tr, 0) 97 | y = np.sum(y_of_cluster[i], 0) 98 | mean_labels = np.mean(np.sum(y_of_cluster[i], 0)) 99 | top5_labels = np.argsort(y)[-10:] 100 | top5_label_counts = np.sort(y)[-10:] 101 | num_tail_labels_1 = len(np.argwhere(x[clusters[i]]<=1)) 102 | num_tail_labels_2 = len(np.argwhere(x[clusters[i]]<=2)) 103 | num_tail_labels_5 = len(np.argwhere(x[clusters[i]]<=5)) 104 | 105 | print("No. of Labels {6}; Mean No. 
of Labels {0}; top 5 labels {1}, top 5 label counts {2}; num tail labels(1) \ 106 | {3}; num tail labels(2) {4}; num tail labels(5) {5}".format(mean_labels, top5_labels, top5_label_counts, 107 | num_tail_labels_1, num_tail_labels_2, num_tail_labels_5, len(clusters[i]))) 108 | # ---------------------- Explore Clusters ------------------------- 109 | -------------------------------------------------------------------------------- /xml-cnn/utils/loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import timeit 5 | import argparse 6 | import numpy as np 7 | import time 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import matplotlib.pyplot as plt 11 | import torch.autograd as autograd 12 | from sklearn import preprocessing 13 | from torch.autograd import Variable 14 | from sklearn.decomposition import PCA 15 | import matplotlib.gridspec as gridspec 16 | import pdb 17 | 18 | def isnan(x): 19 | return x != x 20 | 21 | class loss: 22 | 23 | def MSLoss(self, X_sample, X): 24 | t = torch.mean(torch.norm((X_sample - X),1),dim=0) 25 | return t 26 | 27 | def BCELoss(self, y_pred, y, eps = 1e-25): 28 | t = torch.nn.functional.binary_cross_entropy(y_pred, y)*y.shape[-1] 29 | return t 30 | 31 | def L1Loss(self, X_sample, X): 32 | t = torch.sum(torch.mean(torch.abs(X_sample - X),dim=0)) 33 | return t 34 | -------------------------------------------------------------------------------- /xml-cnn/utils/process_eurlex.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import numpy as np 3 | from numpy import genfromtxt 4 | 5 | #bashCommand = "java -cp ~/Downloads/weka-3-8-2/weka.jar weka.core.converters.CSVSaver -i eurlex_nA-5k_CV1-10_train.arff > eurlex_nA-5k_CV1-10_train.csv" 6 | #process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) 7 | #output, error = process.communicate() 8 | 9 | with open('eurlex_nA-5k_CV1-10_train.csv') as f: 10 | lines = f.read().splitlines()[0] 11 | 12 | a = genfromtxt('eurlex_nA-5k_CV1-10_train.csv', delimiter=',') 13 | words = lines.split(',')[1:] 14 | doc_id = {} 15 | doc_id_inv = {} 16 | 17 | words_dict = {} 18 | for i, w in enumerate(words): 19 | words_dict[i] = w 20 | 21 | with open('feature_names.txt', 'w') as f: 22 | for key, value in words_dict.items(): 23 | f.write('%s:%s\n' % (key, value)) 24 | 25 | for i in range(1, len(a[:,0])): 26 | doc_id_inv[a[i,0]] = i-1 27 | doc_id[i-1] = a[i,0] 28 | # doc_id_list = doc_id. 
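# The loop above builds doc_id <-> row-index lookups from the first CSV column;
# the slice below drops that id column and the header row, leaving the raw
# bag-of-words feature matrix.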
29 | x_tr = a[1:,1:] 30 | np.save('words',words) 31 | np.save('doc_id',doc_id) # dictionary 32 | np.save('doc_id_inv',doc_id_inv) # dictionary 33 | np.save('x_tr',x_tr) 34 | 35 | 36 | labels_data_pt = genfromtxt('/u/79/wa.saxenas2/unix/Downloads/eurlex_id2class/id2class_eurlex_eurovoc.qrels', delimiter=' ')[:,1] 37 | with open('/u/79/wa.saxenas2/unix/Downloads/eurlex_id2class/id2class_eurlex_eurovoc.qrels') as f: 38 | lines = f.read().splitlines() 39 | 40 | label_names = [] 41 | for line in lines: 42 | label_names.append(line.split(' ')[0]) 43 | 44 | 45 | label_set = {} 46 | label_set_inv = {} 47 | count = 0 48 | # data_map = {} 49 | # data_count = 0 50 | for i in range(np.shape(labels_data_pt)[0]): 51 | if label_names[i] not in label_set.keys(): 52 | label_set[label_names[i]] = count 53 | label_set_inv[count] = label_names[i] 54 | count+=1 55 | print(count) 56 | # if labels[i] not in data_map.keys() and labels[i] in doc_id_list: 57 | # data_map[labels[i]] = data_count 58 | # data_count+=1 59 | 60 | np.save('label_set', label_set) # dictionary 61 | np.save('label_set_inv', label_set_inv) # dictionary 62 | 63 | with open('label_set.txt', 'w') as f: 64 | for key, value in label_set_inv.items(): 65 | f.write('%s:%s\n' % (key, value)) 66 | 67 | y_tr = np.zeros((np.shape(x_tr)[0], count)) 68 | y_tr_named = {} 69 | for i in range(np.shape(labels_data_pt)[0]): 70 | if labels_data_pt[i] in doc_id_inv.keys(): 71 | y_tr[doc_id_inv[labels_data_pt[i]], label_set[label_names[i]]] = 1 72 | if doc_id_inv[labels_data_pt[i]] not in y_tr_named.keys(): 73 | y_tr_named[doc_id_inv[labels_data_pt[i]]] = [] 74 | y_tr_named[doc_id_inv[labels_data_pt[i]]].append(label_names[i]) 75 | np.save('y_tr', y_tr) 76 | 77 | with open('y_tr_named.txt', 'w') as f: 78 | for key, value in y_tr_named.items(): 79 | f.write('%s:%s\n' % (key, value)) 80 | -------------------------------------------------------------------------------- /xml-cnn/utils/w2v.py: -------------------------------------------------------------------------------- 1 | from gensim.models import KeyedVectors 2 | from os.path import join, exists, split 3 | import os 4 | import numpy as np 5 | 6 | def train_word2vec(sentence_matrix, vocabulary_inv, 7 | num_features=300, min_word_count=1, context=10): 8 | """ 9 | Trains, saves, loads Word2Vec model 10 | Returns initial weights for embedding layer. 
11 | 12 | inputs: 13 | sentence_matrix # int matrix: num_sentences x max_sentence_len 14 | vocabulary_inv # dict {str:int} 15 | num_features # Word vector dimensionality 16 | min_word_count # Minimum word count 17 | context # Context window size 18 | """ 19 | model_dir = '../embedding_weights' 20 | model_name = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context) 21 | model_name = join(model_dir, model_name) 22 | if exists(model_name): 23 | embedding_model = word2vec.Word2Vec.load(model_name) 24 | #print 'Loading existing Word2Vec model \'%s\'' % split(model_name)[-1] 25 | else: 26 | # Set values for various parameters 27 | num_workers = 2 # Number of threads to run in parallel 28 | downsampling = 1e-3 # Downsample setting for frequent words 29 | 30 | # Initialize and train the model 31 | print( "Training Word2Vec model...") 32 | sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix] 33 | embedding_model = word2vec.Word2Vec(sentences, workers=num_workers, \ 34 | size=num_features, min_count = min_word_count, \ 35 | window = context, sample = downsampling) 36 | 37 | # If we don't plan to train the model any further, calling 38 | # init_sims will make the model much more memory-efficient. 39 | embedding_model.init_sims(replace=True) 40 | 41 | # Saving the model for later use. You can load it later using Word2Vec.load() 42 | if not exists(model_dir): 43 | os.mkdir(model_dir) 44 | print ('Saving Word2Vec model' + str(split(model_name)[-1])) 45 | embedding_model.save(model_name) 46 | 47 | # add unknown words 48 | embedding_weights = [np.array([embedding_model[w] if w in embedding_model\ 49 | else np.random.uniform(-0.25,0.25,embedding_model.vector_size)\ 50 | for w in vocabulary_inv])] 51 | return embedding_weights 52 | 53 | 54 | def load_word2vec(params): 55 | """ 56 | loads Word2Vec model 57 | Returns initial weights for embedding layer. 58 | 59 | inputs: 60 | model_type # GoogleNews / glove 61 | vocabulary_inv # dict {str:int} 62 | num_features # Word vector dimensionality 63 | """ 64 | 65 | model_dir = '../embedding_weights' 66 | 67 | if params.model_type == 'GoogleNews': 68 | model_name = join(model_dir, 'GoogleNews-vectors-negative300.bin.gz') 69 | assert(params.num_features == 300) 70 | assert(exists(model_name)) 71 | print('Loading existing Word2Vec model (GoogleNews-300)') 72 | embedding_model = KeyedVectors.load_word2vec_format(model_name, binary=True) 73 | 74 | elif params.model_type == 'glove-bin': 75 | model_name = join(model_dir, 'glove.6B.%dd.bin' % (params.num_features)) 76 | assert(params.num_features == 300) 77 | assert(exists(model_name)) 78 | print('Loading existing Glove Binary model...') 79 | embedding_model = KeyedVectors.load_word2vec_format(model_name, binary=True) 80 | 81 | elif params.model_type == 'glove': 82 | model_name = join(model_dir, 'glove.6B.%dd.txt' % (params.num_features)) 83 | print(model_name) 84 | assert(exists(model_name)) 85 | print('Loading existing Word2Vec model (Glove.6B.%dd)' % (params.num_features)) 86 | 87 | # dictionary, where key is word, value is word vectors 88 | embedding_model = {} 89 | for line in open(model_name, 'r'): 90 | tmp = line.strip().split() 91 | word, vec = tmp[0], map(float, tmp[1:]) 92 | assert(len(vec) == params.num_features) 93 | if word not in embedding_model: 94 | embedding_model[word] = vec 95 | assert(len(embedding_model) == 400000) 96 | 97 | else: 98 | raise ValueError('Unknown pretrain model type: %s!' 
% (params.model_type)) 99 | 100 | embedding_weights = [embedding_model[w] if w in embedding_model 101 | else np.random.uniform(-0.25, 0.25, params.num_features) 102 | for w in params.vocabulary_inv] 103 | embedding_weights = np.array(embedding_weights).astype('float32') 104 | 105 | return embedding_weights 106 | 107 | 108 | if __name__=='__main__': 109 | import data_helpers 110 | print("Loading data...") 111 | x, _, _, params.vocabulary_inv = data_helpers.load_data() 112 | w = train_word2vec(x, params.vocabulary_inv) 113 | --------------------------------------------------------------------------------
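The pieces above (`get_vectors`, `circular_conv`, `get_appx_inv`, the P/N binding in `cnn_encoder`) all rest on the same HRR arithmetic. The following self-contained numpy sketch, which uses plain Gaussian vectors and numpy's FFT rather than the repo's `lib.mathops` helpers, illustrates why the unbind-then-cosine readout used in `spp_loss`/`inference` works:

```python
import numpy as np

rng = np.random.default_rng(0)
d = 1024                                    # HRR dimensionality (illustrative)

def cconv(a, b):
    # Circular convolution via FFT: binds two d-dim vectors into one d-dim vector.
    return np.fft.irfft(np.fft.rfft(a) * np.fft.rfft(b), n=d)

def approx_inv(a):
    # Involution (a[0], a[d-1], ..., a[1]): the approximate inverse used for unbinding.
    return np.concatenate(([a[0]], a[:0:-1]))

def random_vectors(n):
    # i.i.d. N(0, 1/d) components keep norms and bindings roughly unit length.
    return rng.normal(0.0, 1.0 / np.sqrt(d), size=(n, d))

p = random_vectors(1)[0]                    # "positive" role vector
classes = random_vectors(5)                 # five random class vectors

s = cconv(p, classes[1] + classes[3])       # bind the positive label bundle to p
retrieved = cconv(approx_inv(p), s)         # unbind with the approximate inverse of p
retrieved /= np.linalg.norm(retrieved)

cosines = classes @ retrieved / np.linalg.norm(classes, axis=1)
print(np.round(cosines, 2))                 # indices 1 and 3 stand out from the rest
```

Labels that were bound into the bundle come back with clearly higher cosine scores than the others, which is exactly the signal the SPP loss trains the network output to carry.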