├── .gitignore ├── images ├── cosine-annealing.png ├── implementation1-learning-curve.png ├── implementation1-precision-recall.png ├── implementation2-precision-recall.png ├── implementation4-precision-recall.png ├── implementation4b-precision-recall.png ├── implementation5-learning-curve.png ├── implementation5-precision-recall.png ├── implementation5b-precision-recall.png ├── implementation6-learning-curve.png ├── implementation7-learning-curve.png └── implementation7-precision-recall.png ├── readme.md ├── requirements.txt ├── run.sh └── src ├── __init__.py ├── config.py ├── ml ├── __init__.py ├── data_loader.py ├── data_loader_edges.py ├── data_loader_with_meta.py ├── mf.py ├── mf_bias.py ├── mf_bias_continuous.py ├── mf_continuous.py ├── skipgram.py ├── skipgram_with_meta.py ├── skipgram_with_meta_weighted.py ├── train_gensim_embedding.py ├── train_node2vec_embeddings.py ├── train_torch_embedding.py ├── train_torch_embedding_with_meta.py ├── train_torch_mf.py ├── train_torch_mf_bias.py ├── train_torch_mf_bias_continuous_edges.py ├── train_torch_mf_bias_edges.py ├── train_torch_mf_bias_edges_parallel.py ├── train_torch_mf_continuous_edges.py └── train_torch_mf_edges.py ├── parse ├── __init__.py └── parse_json.py ├── prep ├── __init__.py ├── prep_edges.py ├── prep_graph_samples.py ├── prep_meta.py ├── prep_node_relationship.py └── train_val_split.py ├── utils ├── __init__.py ├── io_utils.py └── logger.py └── viz ├── __init__.py ├── plot_results.py └── prep_results.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 
93 | #Pipfile.lock 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | # Data 129 | data/ 130 | model/ 131 | results/ 132 | *.zip 133 | 134 | # Mac 135 | .DS_Store 136 | .idea/ 137 | notebooks/ 138 | -------------------------------------------------------------------------------- /images/cosine-annealing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/cosine-annealing.png -------------------------------------------------------------------------------- /images/implementation1-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation1-learning-curve.png -------------------------------------------------------------------------------- /images/implementation1-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation1-precision-recall.png -------------------------------------------------------------------------------- /images/implementation2-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation2-precision-recall.png -------------------------------------------------------------------------------- /images/implementation4-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation4-precision-recall.png -------------------------------------------------------------------------------- /images/implementation4b-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation4b-precision-recall.png -------------------------------------------------------------------------------- /images/implementation5-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation5-learning-curve.png -------------------------------------------------------------------------------- /images/implementation5-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation5-precision-recall.png 
-------------------------------------------------------------------------------- /images/implementation5b-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation5b-precision-recall.png -------------------------------------------------------------------------------- /images/implementation6-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation6-learning-curve.png -------------------------------------------------------------------------------- /images/implementation7-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation7-learning-curve.png -------------------------------------------------------------------------------- /images/implementation7-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation7-precision-recall.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # recsys-nlp-graph 2 | 3 | **Undocumented** code for personal project on simple recsys via matrix factorization (part 1), and NLP and graph techniques (part 2). Sharing as part of a meet-up follow-along. 4 | 5 | Associated articles: 6 | - Part 1: [Building a Strong Baseline Recommender in PyTorch](https://eugeneyan.com/writing/recommender-systems-baseline-pytorch/) 7 | - Part 2: [Beating the Baseline Recommender with Graph & NLP in Pytorch](https://eugeneyan.com/writing/recommender-systems-graph-and-nlp-pytorch/) 8 | 9 | Talk and Slides: 10 | - [DataScience SG Meetup - RecSys, Beyond the Baseline](https://eugeneyan.com/speaking/recommender-systems-beyond-the-baseline-talk/) 11 | - [Slideshare](https://www.slideshare.net/eugeneyan/recommender-systems-beyond-the-useritem-matrix) 12 | 13 | ## Data 14 | 15 | Electronics and books data from the [Amazon dataset (May 1996 – July 2014)](http://jmcauley.ucsd.edu/data/amazon/) was used. Here's what an example JSON entry looks like. 16 | 17 | ``` 18 | { 19 | "asin": "0000031852", 20 | "title": "Girls Ballet Tutu Zebra Hot Pink", 21 | "price": 3.17, 22 | "imUrl": "http://ecx.images-amazon.com/images/I/51fAmVkTbyL._SY300_.jpg", 23 | "related": 24 | { "also_bought":[ 25 | "B00JHONN1S", 26 | "B002BZX8Z6", 27 | "B00D2K1M3O", 28 | ... 29 | "B007R2RM8W" 30 | ], 31 | "also_viewed":[ 32 | "B002BZX8Z6", 33 | "B00JHONN1S", 34 | "B008F0SU0Y", 35 | ...
36 | "B00BFXLZ8M" 37 | ], 38 | "bought_together":[ 39 | "B002BZX8Z6" 40 | ] 41 | }, 42 | "salesRank": 43 | { 44 | "Toys & Games":211836 45 | }, 46 | "brand": "Coxlures", 47 | "categories":[ 48 | [ "Sports & Outdoors", 49 | "Other Sports", 50 | "Dance" 51 | ] 52 | ] 53 | } 54 | ``` 55 | 56 | ## Comparing Matrix Factorization to Skip-gram (Node2Vec) 57 | 58 | ### Overall results for Electronics dataset 59 | 60 | | Model | AUC-ROC (All Products) | AUC-ROC (Seen Products Only) | Runtime (min) | 61 | |--------------------------------------------- |-------------- |-------------------- |--------------- | 62 | | PyTorch Matrix Factorization | 0.7951 | - | 45 | 63 | | Node2Vec | NA | NA | NA | 64 | | Gensim Word2Vec | 0.9082 | 0.9735 | 2.58 | 65 | | PyTorch Word2Vec | 0.9554 | 0.9855 | 23.63 | 66 | | PyTorch Word2Vec with Side Info | NA | NA | NA | 67 | | PyTorch Matrix Factorization With Sequences | 0.9320 | - | 70.39 | 68 | | Alibaba Paper* | 0.9327 | - | - | 69 | 70 | ### Overall results for Books dataset 71 | 72 | | Model | AUC-ROC (All Products) | AUC-ROC (Seen Products Only) | Runtime (min) | 73 | |--------------------------------------------- |-------------- |-------------------- |--------------- | 74 | | PyTorch Matrix Factorization | 0.4996 | - | 1353.12 | 75 | | Gensim Word2Vec | 0.9701 | 0.9892 | 16.24 | 76 | | PyTorch Word2Vec | 0.9775 | - | 122.66 | 77 | | PyTorch Word2Vec with Side Info | NA | NA | NA | 78 | | PyTorch Matrix Factorization With Sequences | 0.7196 | - | 1393.08 | 79 | 80 | 81 | 82 | *[Billion-scale Commodity Embedding for E-commerce Recommendation in Alibaba](https://arxiv.org/abs/1803.02349) 83 | 84 | ### 1. Matrix Factorization (iteratively pair by pair) 85 | 86 | At a high level, for each pair: 87 | 88 | - Get the embedding for each product 89 | - Multiply embeddings and sum the resulting vector (this is the prediction) 90 | - Reduce the difference between predicted score and actual score (via gradient descent and a loss function like mean squared error or BCE) 91 | 92 | Here's some pseudo-code showing how it would work. 93 | 94 | ``` 95 | for product_pair, label in train_set: 96 | # Get embedding for each product 97 | product1_emb = embedding(product1) 98 | product2_emb = embedding(product2) 99 | 100 | # Predict product-pair score (interaction term and sum) 101 | prediction = sig(sum(product1_emb * product2_emb, dim=1)) 102 | l2_reg = lambda * sum(embedding.weight ** 2) 103 | 104 | # Minimize loss 105 | loss = BinaryCrossEntropyLoss(prediction, label) 106 | loss += l2_reg 107 | 108 | loss.backward() 109 | optimizer.step() 110 | ``` 111 | 112 | For the training schedule, we run it over 5 epochs with cosine annealing. For each epoch, the learning rate starts high (0.01) and drops rapidly to a minimum value near zero, before being reset for the next epoch (see the scheduler sketch at the end of this section). 113 | 114 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/cosine-annealing.png) 115 | 116 | One epoch seems sufficient to achieve close to optimal ROC-AUC. 117 | 118 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation1-precision-recall.png) 119 | 120 | However, if we look at the precision-recall curves below, we see that at a threshold of around 0.5 we hit the “cliff of death”. If we estimate the threshold slightly too low, precision drops from close to 1.0 to 0.5; slightly too high and recall is poor. 121 | 122 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation1-learning-curve.png) 123 |
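To make the training schedule above concrete, here's a minimal, self-contained sketch of a matrix factorization model trained with a cosine-annealed learning rate that restarts every epoch. It is illustrative only, not the repo's actual training script (see `src/ml/train_torch_mf.py` for that); the toy data, model class, and hyperparameters below are assumptions.

```
# Minimal sketch (not the repo's code): MF trained with a cosine-annealing LR
# that restarts at the top of every epoch, mirroring the plot above.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


class ToyMF(nn.Module):
    """Bare-bones matrix factorization: sigmoid of the dot product of two product embeddings."""

    def __init__(self, n_products, emb_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(n_products, emb_dim)

    def forward(self, product1, product2):
        interaction = (self.embedding(product1) * self.embedding(product2)).sum(dim=1)
        return torch.sigmoid(interaction)


# Toy (product1, product2, label) triples; the real pairs come from the dataloaders in src/ml
n_products = 1_000
product1 = torch.randint(0, n_products, (4096,))
product2 = torch.randint(0, n_products, (4096,))
labels = torch.randint(0, 2, (4096,)).float()
loader = DataLoader(TensorDataset(product1, product2, labels), batch_size=256, shuffle=True)

model = ToyMF(n_products)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# T_0 = batches per epoch, so the LR decays from 0.01 towards ~0 within an epoch, then resets
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=len(loader), eta_min=1e-6)

for epoch in range(5):
    for p1, p2, label in loader:
        optimizer.zero_grad()
        loss = criterion(model(p1, p2), label)
        loss.backward()
        optimizer.step()
        scheduler.step()  # step per batch so the annealing happens within the epoch
```

Using `CosineAnnealingWarmRestarts` with `T_0` equal to the number of batches per epoch reproduces the "high at the start of each epoch, near zero at the end" pattern shown in the cosine-annealing figure; the repo's own scripts may wire the schedule up differently.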
124 | ### 2. Matrix Factorization with Bias 125 | 126 | Adding bias reduces the steepness of the curves where they intersect, making it more production-friendly. (Though AUC-ROC decreases slightly, this implementation is preferable.) 127 | 128 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation2-precision-recall.png) 129 | 130 | ### 3. `Node2Vec` 131 | 132 | I tried using the implementation of `Node2Vec` [here](https://github.com/aditya-grover/node2vec) but it was too memory-intensive and slow. It didn't run to completion, even on a 64 GB instance. 133 | 134 | Digging deeper, I found that its approach to generating sequences involved traversing the graph. If you allowed `networkx` to use multiple threads, it would spawn multiple processes to create sequences and cache them temporarily in memory. In short, it was very memory-hungry. Overall, this didn’t work for the datasets I had. 135 | 136 | ### 4. `gensim.word2vec` 137 | 138 | Gensim has an implementation of w2v that takes in a list of sequences and can be multi-threaded. It was very easy to use and the fastest to complete five epochs. 139 | 140 | But the precision-recall curve shows a sharp cliff around threshold == 0.73. This is due to out-of-vocabulary products in our validation datasets (which don't have embeddings). 141 | 142 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation4-precision-recall.png) 143 | 144 | If we _only_ evaluate in-vocabulary items, performance improves significantly. 145 | 146 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation4b-precision-recall.png) 147 | 148 | ### 5. `PyTorch` word2vec 149 | 150 | We implement Skip-gram in PyTorch. Here's some simplified code showing how it looks. 151 | 152 | ``` 153 | class SkipGram(nn.Module): 154 | def __init__(self, emb_size, emb_dim): 155 | self.center_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 156 | self.context_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 157 | 158 | def forward(self, center, context, neg_context): 159 | emb_center, emb_context, emb_neg_context = self.get_embeddings() 160 | 161 | # Get score for positive pairs 162 | score = torch.sum(emb_center * emb_context, dim=1) 163 | score = -F.logsigmoid(score) 164 | 165 | # Get score for negative pairs 166 | neg_score = torch.bmm(emb_neg_context, emb_center.unsqueeze(2)).squeeze() 167 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 168 | 169 | # Return combined score 170 | return torch.mean(score + neg_score) 171 | ``` 172 | 173 | It performed better than `gensim` when considering all products. 174 | 175 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation5-precision-recall.png) 176 | 177 | If considering _only_ seen products, it's still an improvement, but less dramatic. 178 | 179 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation5b-precision-recall.png) 180 | 181 | When examining the learning curves, it seems that a single epoch is sufficient. In contrast to the learning curves from matrix factorization (implementation 1), the AUC-ROC doesn't drop drastically with each learning rate reset. 182 | 183 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation5-learning-curve.png) 184 | 185 | ### 6. `PyTorch` word2vec with side info 186 | 187 | Why did we build the skip-gram model from scratch?
Because we wanted to extend it with side information (e.g., brand, category, price). 188 | 189 | ``` 190 | B001T9NUFS -> B003AVEU6G -> B007ZN5Y56 ... -> B007ZN5Y56 191 | Television Sound bar Lamp Standing Fan 192 | Sony Sony Phillips Dyson 193 | 500 – 600 200 – 300 50 – 75 300 - 400 194 | ``` 195 | 196 | Perhaps by learning on these we can create better embeddings? 197 | 198 | Unfortunately, it didn't work out. Here's how the learning curve looks. 199 | 200 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation6-learning-curve.png) 201 | 202 | One possible reason for this non-result is the sparsity of the meta data. Out of 418,749 electronic products, we only had metadata for 162,023 (39%). Of these, brand was 51% empty. 203 | 204 | ### 7. Sequences + Matrix Factorization 205 | 206 | Why did the w2v approach do so much better than matrix factorization? Was it due to the skipgram model, or due to the training data format (i.e., sequences)? 207 | 208 | To understand this better, I tried the previous matrix factorization with bias implementation (AUC-ROC = 0.7951) with the new sequences and dataloader. It worked very well. 209 | 210 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation7-precision-recall.png) 211 | 212 | Oddly though, the matrix factorization approach still exhibits the effect of “forgetting” as learning rate resets with each epoch (Fig 9.), though not as pronounced as Figure 3 in the previous post. 213 | 214 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation7-learning-curve.png) 215 | 216 | _I wonder if this is due to using the same embeddings for both center and context._ 217 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | backcall==0.1.0 3 | boto==2.49.0 4 | boto3==1.10.26 5 | botocore==1.13.26 6 | certifi==2024.7.4 7 | chardet==3.0.4 8 | cPython==0.0.5 9 | decorator==4.4.1 10 | docutils==0.15.2 11 | gensim==3.8.1 12 | idna==2.8 13 | ipykernel==5.1.3 14 | ipython==8.10.0 15 | ipython-genutils==0.2.0 16 | jedi==0.15.1 17 | jmespath==0.9.4 18 | joblib==1.2.0 19 | jupyter-client==5.3.4 20 | jupyter-core==4.11.2 21 | networkx==2.4 22 | node2vec==0.3.1 23 | numpy==1.22.0 24 | pandas==0.25.3 25 | parso==0.5.1 26 | pexpect==4.7.0 27 | pickleshare==0.7.5 28 | prompt-toolkit==2.0.10 29 | ptyprocess==0.6.0 30 | Pygments==2.15.0 31 | pymongo==3.9.0 32 | python-dateutil==2.8.0 33 | pytz==2019.3 34 | pyzmq==18.1.0 35 | requests==2.31.0 36 | s3transfer==0.2.1 37 | scikit-learn==0.21.3 38 | scipy==1.10.0 39 | six==1.13.0 40 | smart-open==1.9.0 41 | tornado==6.3.3 42 | tqdm==4.39.0 43 | traitlets==4.3.3 44 | urllib3==1.26.5 45 | wcwidth==0.1.7 46 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | ### Workflow for books 2 | python -m src.parse.parse_json data/meta_Books.json.gz data/books.csv 3 | 4 | python -m src.prep.prep_node_relationship data/books.csv data/books_relationships.csv 5 | python -m src.prep.prep_meta data/books.csv data/books_meta.csv 6 | 7 | python -m src.prep.prep_edges data/books_relationships.csv data/books_edges.csv 8 | python -m src.prep.train_val_split data/books_edges.csv 0.33 9 | 10 | python -m src.prep.prep_graph_samples data/books_edges_train.edgelist 
data/books_sequences.npy books 11 | 12 | # Slow and requires a lot of ram 13 | python -m src.ml.train_node2vec_embeddings data/books_edges_train.edgelist data/books_embeddings.kv 14 | 15 | # Works fine with multiprocess 16 | python -m src.ml.train_gensim_embedding data/books_sequences_sample.npy 8 17 | 18 | # PyTorch 19 | # For dev testing 20 | python -m src.ml.train_torch_embedding data/books_sequences_sample.npy data/books_edges_val_samp.csv data/books_edges_train_samp.csv 32 4 21 | # For training 22 | python -m src.ml.train_torch_embedding data/books_sequences.npy data/books_edges_val.csv data/books_edges_val_samp.csv 128 10 # Best params? 23 | 24 | # ========================================================================================================================================== 25 | ### Workflow for electronics 26 | python -m src.parse.parse_json data/meta_Electronics.json.gz data/electronics.csv 27 | 28 | python -m src.prep.prep_node_relationship data/electronics.csv data/electronics_relationships.csv 29 | python -m src.prep.prep_meta data/electronics.csv data/electronics_meta.csv 30 | 31 | python -m src.prep.prep_edges data/electronics_relationships.csv data/electronics_edges.csv 32 | python -m src.prep.train_val_split data/electronics_edges.csv 0.33 33 | 34 | python -m src.prep.prep_graph_samples data/electronics_edges_train.edgelist data/electronics_sequences.npy electronics 35 | 36 | # Slow and requires a lot of ram 37 | python -m src.ml.train_node2vec_embeddings data/electronics_edges_train.edgelist data/electronics_embeddings.kv 38 | 39 | # Works fine with multiprocess 40 | python -m src.ml.train_gensim_embedding data/electronics_sequences_sample.npy 6 41 | 42 | # PyTorch 43 | # For dev testing 44 | python -m src.ml.train_torch_embedding data/electronics_sequences_samp.npy data/electronics_edges_val_samp.csv data/electronics_edges_train_samp.csv 32 4 45 | python -m src.ml.train_torch_embedding_with_meta data/electronics_sequences_samp.npy data/electronics_edges_val_samp.csv data/electronics_meta.csv data/electronics_edges_train_samp.csv 32 4 46 | # For training 47 | python -m src.ml.train_torch_embedding data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 # Best params? 48 | python -m src.ml.train_torch_embedding_with_meta data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_meta.csv data/electronics_edges_val_samp.csv 128 10 # Best params? 49 | 50 | # MF Dev 51 | python -m src.ml.train_torch_mf data/electronics_sequences_samp.npy data/electronics_edges_val_samp.csv data/electronics_edges_val_samp.csv 32 4 52 | python -m src.ml.train_torch_mf data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 53 | python -m src.ml.train_torch_mf_bias data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 
54 | 55 | # Edges model 56 | python -m src.ml.train_torch_mf_edges data/electronics_edges_train_samp.csv data/electronics_edges_val_samp.csv data/electronics_edges_val_samp.csv 32 4 57 | 58 | # ========================================================================================================================================== 59 | ### Running for results 60 | python -m src.ml.train_gensim_embedding data/electronics_sequences.npy 8 61 | python -m src.ml.train_torch_embedding data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 62 | python -m src.ml.train_torch_embedding_with_meta data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_meta.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 63 | python -m src.ml.train_torch_mf data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 64 | python -m src.ml.train_torch_mf_bias data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 # Best params? 65 | python -m src.ml.train_torch_mf_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 66 | python -m src.ml.train_torch_mf_bias_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 67 | python -m src.ml.train_torch_mf_continuous_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 68 | python -m src.ml.train_torch_mf_bias_continuous_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 69 | 70 | 71 | python -m src.ml.train_gensim_embedding data/books_sequences.npy 8 72 | python -m src.ml.train_torch_embedding data/books_sequences.npy data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 # Best params? 73 | python -m src.ml.train_torch_mf_bias data/books_sequences.npy data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 # Best params? 
74 | python -m src.ml.train_torch_mf_bias_edges data/books_edges_train.csv data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 75 | python -m src.ml.train_torch_mf_bias_continuous_edges data/books_edges_train.csv data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/__init__.py -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = 'data' 2 | MODEL_PATH = 'model' 3 | -------------------------------------------------------------------------------- /src/ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/ml/__init__.py -------------------------------------------------------------------------------- /src/ml/data_loader.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import Counter 3 | from typing import Dict, List, Tuple 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.config import MODEL_PATH 11 | from src.utils.io_utils import save_model 12 | from src.utils.logger import logger 13 | 14 | 15 | class Sequences: 16 | NEGATIVE_SAMPLE_TABLE_SIZE = 1e7 17 | WINDOW = 5 18 | 19 | def __init__(self, sequence_path: str, val_path: str, subsample: float = 0.001, power: float = 0.75): 20 | """ 21 | Initializes a Sequences object for use in a Dataset. 22 | 23 | Args: 24 | sequence_path: Path to numpy array of sequences, where each row is a sequence 25 | subsample: Subsampling parameter; suggested range (0, 1e-5) 26 | power: Negative sampling parameter; suggested 0.75 27 | """ 28 | self.negative_idx = 0 29 | self.n_unique_tokens = 0 30 | 31 | self.sequences = np.load(sequence_path).tolist() 32 | self.n_sequences = len(self.sequences) 33 | logger.info('Sequences loaded (length = {:,})'.format(self.n_sequences)) 34 | 35 | self.val = pd.read_csv(val_path) 36 | logger.info('Validation set loaded: {}'.format(self.val.shape)) 37 | 38 | self.word_freq = self.get_word_freq() 39 | logger.info('Word frequency calculated') 40 | 41 | self.word2id, self.id2word = self.get_mapping_dicts() 42 | self.add_val_product_to_mapping_dicts() 43 | self.n_unique_tokens = len(self.word2id) 44 | logger.info('No. 
of unique tokens: {}'.format(self.n_unique_tokens)) 45 | save_model(self.word2id, '{}/word2id'.format(MODEL_PATH)) 46 | save_model(self.id2word, '{}/id2word'.format(MODEL_PATH)) 47 | logger.info('Word2Id and Id2Word created and saved') 48 | 49 | self.sequences = self.convert_sequence_to_id() 50 | self.word_freq = self.convert_word_freq_to_id() 51 | logger.info('Convert sequence and wordfreq to ID') 52 | 53 | self.discard_probs = self.get_discard_probs(sample=subsample) 54 | logger.info('Discard probability calculated') 55 | 56 | self.neg_table = self.get_negative_sample_table(power=power) 57 | logger.info('Negative sample table created') 58 | 59 | # Used to preload all center context pairs (very memory heavy) 60 | # self.pairs = self.get_all_center_context_pairs(window=window) 61 | # self.n_pairs = len(self.pairs) 62 | # logger.info('Center Context pairs created') 63 | 64 | def get_word_freq(self) -> Counter: 65 | """ 66 | Returns a dictionary of word frequencies. 67 | 68 | Returns: 69 | 70 | """ 71 | # Flatten list 72 | seq_flat = list(itertools.chain.from_iterable(self.sequences)) 73 | 74 | # Get word frequency 75 | word_freq = Counter(seq_flat) 76 | 77 | return word_freq 78 | 79 | def get_mapping_dicts(self): 80 | word2id = dict() 81 | id2word = dict() 82 | 83 | wid = 0 84 | for w, c in self.word_freq.items(): 85 | word2id[w] = wid 86 | id2word[wid] = w 87 | wid += 1 88 | 89 | return word2id, id2word 90 | 91 | def add_val_product_to_mapping_dicts(self): 92 | val_product_set = set(self.val['product1'].values).union(set(self.val['product2'].values)) 93 | 94 | logger.info('Adding val products to word2id, original size: {}'.format(len(self.word2id))) 95 | wid = max(self.word2id.values()) + 1 96 | for w in val_product_set: 97 | if w not in self.word2id: 98 | self.word2id[w] = wid 99 | self.id2word[wid] = w 100 | wid += 1 101 | 102 | self.val = None # Release memory 103 | logger.info('Added val products to word2id, updated size: {}'.format(len(self.word2id))) 104 | 105 | def convert_sequence_to_id(self): 106 | return np.vectorize(self.word2id.get)(self.sequences) 107 | 108 | def get_product_id(self, x): 109 | return self.word2id.get(x, -1) 110 | 111 | def convert_word_freq_to_id(self): 112 | return {self.word2id[k]: v for k, v in self.word_freq.items()} 113 | 114 | def get_discard_probs(self, sample=0.001) -> Dict[int, float]: 115 | """ 116 | Returns a dictionary of words and their associated discard probability, where the word should be discarded 117 | if np.random.rand() < probability. 118 | 119 | Args: 120 | sample: 121 | 122 | Returns: 123 | 124 | """ 125 | # Convert to array 126 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 127 | 128 | # Convert to probabilities 129 | word_freq[:, 1] = word_freq[:, 1] / word_freq[:, 1].sum() 130 | 131 | # Perform subsampling 132 | # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/ 133 | word_freq[:, 1] = (np.sqrt(word_freq[:, 1] / sample) + 1) * (sample / word_freq[:, 1]) 134 | 135 | # Get dict 136 | discard_probs = {int(k): v for k, v in word_freq.tolist()} 137 | 138 | return discard_probs 139 | 140 | def get_negative_sample_table(self, power=0.75) -> np.array: 141 | """ 142 | Returns a table (size = NEGATIVE_SAMPLE_TABLE_SIZE) of negative samples which can be selected via indexing. 
143 | 144 | Args: 145 | power: 146 | 147 | Returns: 148 | 149 | """ 150 | # Convert to array 151 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 152 | 153 | # Adjust by power 154 | word_freq[:, 1] = word_freq[:, 1] ** power 155 | 156 | # Get probabilities 157 | word_freq_sum = word_freq[:, 1].sum() 158 | word_freq[:, 1] = word_freq[:, 1] / word_freq_sum 159 | 160 | # Multiply probabilities by sample table size 161 | word_freq[:, 1] = np.round(word_freq[:, 1] * self.NEGATIVE_SAMPLE_TABLE_SIZE) 162 | 163 | # Convert to int 164 | word_freq = word_freq.astype(int).tolist() 165 | 166 | # Create sample table 167 | sample_table = [[tup[0]] * tup[1] for tup in word_freq] 168 | sample_table = np.array(list(itertools.chain.from_iterable(sample_table))) 169 | np.random.shuffle(sample_table) 170 | 171 | return sample_table 172 | 173 | # Works on per sequence 174 | def get_pairs(self, idx, window=5): 175 | pairs = [] 176 | sequence = self.sequences[idx] 177 | 178 | for center_idx, node in enumerate(sequence): 179 | for i in range(-window, window + 1): 180 | context_idx = center_idx + i 181 | if context_idx >= 0 and context_idx < len(sequence) and node != sequence[ 182 | context_idx] and np.random.rand() < self.discard_probs[sequence[context_idx]]: 183 | pairs.append((node, sequence[context_idx])) 184 | 185 | return pairs 186 | 187 | def get_all_center_context_pairs(self, window=5) -> List[Tuple[int, int]]: 188 | """ 189 | Returns a list of tuples (center, context). 190 | 191 | Args: 192 | window: 193 | 194 | Returns: 195 | 196 | """ 197 | 198 | pairs = [] 199 | 200 | for sequence in self.sequences: 201 | for center_idx, node in enumerate(sequence): 202 | for i in range(-window, window + 1): 203 | context_idx = center_idx + i 204 | if (0 <= context_idx < len(sequence)) \ 205 | and node != sequence[context_idx] \ 206 | and np.random.rand() < self.discard_probs[sequence[context_idx]]: 207 | pairs.append((node, sequence[context_idx])) 208 | 209 | return pairs 210 | 211 | def get_negative_samples(self, context, sample_size=5) -> np.array: 212 | """ 213 | Returns a list of negative samples, where len = sample_size. 
214 | 215 | Args: 216 | sample_size: 217 | 218 | Returns: 219 | 220 | """ 221 | while True: 222 | # Get a batch from the shuffled table 223 | neg_sample = self.neg_table[self.negative_idx:self.negative_idx + sample_size] 224 | 225 | # Update negative index 226 | self.negative_idx = (self.negative_idx + sample_size) % len(self.neg_table) 227 | 228 | # Check if batch insufficient 229 | if len(neg_sample) != sample_size: 230 | neg_sample = np.concatenate((neg_sample, self.neg_table[:self.negative_idx])) 231 | 232 | # Check if context in negative sample 233 | if not context in neg_sample: 234 | return neg_sample 235 | 236 | 237 | class SequencesDataset(Dataset): 238 | def __init__(self, sequences: Sequences, neg_sample_size=5): 239 | self.sequences = sequences 240 | self.neg_sample_size = neg_sample_size 241 | 242 | def __len__(self): 243 | return self.sequences.n_sequences 244 | 245 | def __getitem__(self, idx): 246 | pairs = self.sequences.get_pairs(idx) 247 | neg_samples = [] 248 | for center, context in pairs: 249 | neg_samples.append(self.sequences.get_negative_samples(context)) 250 | 251 | return pairs, neg_samples 252 | 253 | @staticmethod 254 | def collate(batches): 255 | # logger.info('Batches: {}'.format(batches)) 256 | pairs_batch = [batch[0] for batch in batches] 257 | neg_contexts_batch = [batch[1] for batch in batches] 258 | 259 | pairs_batch = list(itertools.chain.from_iterable(pairs_batch)) 260 | neg_contexts = list(itertools.chain.from_iterable(neg_contexts_batch)) 261 | 262 | centers = [center for center, _ in pairs_batch] 263 | contexts = [context for _, context in pairs_batch] 264 | 265 | return torch.LongTensor(centers), torch.LongTensor(contexts), torch.LongTensor(neg_contexts) 266 | 267 | @staticmethod 268 | def collate_for_mf(batches): 269 | batch_list = [] 270 | 271 | for batch in batches: 272 | pairs = np.array(batch[0]) 273 | negs = np.array(batch[1]) 274 | negs = np.vstack((pairs[:, 0].repeat(negs.shape[1]), negs.ravel())).T 275 | 276 | pairs_arr = np.ones((pairs.shape[0], pairs.shape[1] + 1), dtype=int) 277 | pairs_arr[:, :-1] = pairs 278 | 279 | negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int) 280 | negs_arr[:, :-1] = negs 281 | 282 | all_arr = np.vstack((pairs_arr, negs_arr)) 283 | batch_list.append(all_arr) 284 | 285 | batch_array = np.vstack(batch_list) 286 | # np.random.shuffle(batch_array) 287 | 288 | # Return item1, item2, label 289 | return (torch.LongTensor(batch_array[:, 0]), torch.LongTensor(batch_array[:, 1]), 290 | torch.FloatTensor(batch_array[:, 2])) -------------------------------------------------------------------------------- /src/ml/data_loader_edges.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import Counter 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from torch.utils.data import Dataset 8 | 9 | from src.config import MODEL_PATH 10 | from src.utils.io_utils import save_model 11 | from src.utils.logger import logger 12 | 13 | 14 | class Edges: 15 | NEGATIVE_SAMPLE_TABLE_SIZE = 1e7 16 | 17 | def __init__(self, edge_path: str, val_path: str, power: float = 0.75): 18 | """ 19 | Initializes an Edges object for use in a Dataset. 
20 | 21 | Args: 22 | edge_path: Path to numpy array of sequences, where each row is a sequence 23 | power: Negative sampling parameter; suggested 0.75 24 | """ 25 | self.power = power 26 | self.negative_idx = 0 27 | self.n_unique_tokens = 0 28 | 29 | self.edges = pd.read_csv(edge_path) 30 | self.n_edges = len(self.edges) 31 | logger.info('Edges loaded (length = {:,})'.format(self.n_edges)) 32 | 33 | self.val = pd.read_csv(val_path) 34 | logger.info('Validation set loaded: {}'.format(self.val.shape)) 35 | 36 | self.product_set = self.get_product_set() 37 | self.word2id, self.id2word = self.get_mapping_dicts() 38 | self.get_product_id_func = np.vectorize(self.get_product_id) 39 | self.n_unique_tokens = len(self.word2id) 40 | logger.info('No. of unique tokens: {}'.format(self.n_unique_tokens)) 41 | save_model(self.word2id, '{}/word2id_edge'.format(MODEL_PATH)) 42 | save_model(self.id2word, '{}/id2word_edge'.format(MODEL_PATH)) 43 | logger.info('Word2Id and Id2Word created and saved') 44 | 45 | # Convert product ID strings to integers 46 | self.edges = self.prep_edges() 47 | logger.info('Edges prepared') 48 | 49 | # Prepare negative sampling table 50 | self.word_freq = self.get_word_freq(self.edges[:, :2]) 51 | self.neg_table = self.get_negative_sample_table(self.power) 52 | 53 | def get_product_set(self): 54 | product_set = set(self.edges['product1'].tolist() + self.edges['product2'].tolist() + 55 | self.val['product1'].tolist() + self.val['product2'].tolist()) 56 | 57 | return product_set 58 | 59 | def get_mapping_dicts(self): 60 | word2id = dict() 61 | id2word = dict() 62 | 63 | wid = 0 64 | for w in self.product_set: 65 | word2id[w] = wid 66 | id2word[wid] = w 67 | wid += 1 68 | 69 | return word2id, id2word 70 | 71 | def get_product_id(self, x): 72 | return self.word2id.get(x, -1) 73 | 74 | def prep_edges(self): 75 | self.edges['product1_id'] = self.get_product_id_func(self.edges['product1']).astype(int) 76 | self.edges['product2_id'] = self.get_product_id_func(self.edges['product2']).astype(int) 77 | edges = self.edges[['product1_id', 'product2_id', 'weight']].copy().values 78 | 79 | return edges 80 | 81 | def get_word_freq(self, edges): 82 | product_counts = list(itertools.chain.from_iterable(edges)) 83 | word_freq = Counter(product_counts) 84 | return word_freq 85 | 86 | def get_negative_sample_table(self, power=0.75) -> np.array: 87 | """ 88 | Returns a table (size = NEGATIVE_SAMPLE_TABLE_SIZE) of negative samples which can be selected via indexing. 89 | 90 | Args: 91 | power: 92 | 93 | Returns: 94 | 95 | """ 96 | # Convert to array 97 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 98 | 99 | # Adjust by power 100 | word_freq[:, 1] = word_freq[:, 1] ** power 101 | 102 | # Get probabilities 103 | word_freq_sum = word_freq[:, 1].sum() 104 | word_freq[:, 1] = word_freq[:, 1] / word_freq_sum 105 | 106 | # Multiply probabilities by sample table size 107 | word_freq[:, 1] = np.round(word_freq[:, 1] * self.NEGATIVE_SAMPLE_TABLE_SIZE) 108 | 109 | # Convert to int 110 | word_freq = word_freq.astype(int).tolist() 111 | 112 | # Create sample table 113 | sample_table = [[tup[0]] * tup[1] for tup in word_freq] 114 | sample_table = np.array(list(itertools.chain.from_iterable(sample_table))) 115 | np.random.shuffle(sample_table) 116 | 117 | return sample_table 118 | 119 | def get_negative_samples(self, context, sample_size=5) -> np.array: 120 | """ 121 | Returns a list of negative samples, where len = sample_size. 
122 | 123 | Args: 124 | sample_size: 125 | 126 | Returns: 127 | 128 | """ 129 | while True: 130 | # Get a batch from the shuffled table 131 | neg_sample = self.neg_table[self.negative_idx:self.negative_idx + sample_size] 132 | 133 | # Update negative index 134 | self.negative_idx = (self.negative_idx + sample_size) % len(self.neg_table) 135 | 136 | # Check if batch insufficient 137 | if len(neg_sample) != sample_size: 138 | neg_sample = np.concatenate((neg_sample, self.neg_table[:self.negative_idx])) 139 | 140 | # Check if context in negative sample 141 | if not context in neg_sample: 142 | return neg_sample 143 | 144 | 145 | class EdgesDataset(Dataset): 146 | def __init__(self, edges, neg_sample_size=5): 147 | self.edges = edges 148 | self.neg_sample_size = neg_sample_size 149 | 150 | def __len__(self): 151 | return self.edges.n_edges 152 | 153 | def __getitem__(self, idx): 154 | pair = self.edges.edges[idx] 155 | neg_samples = self.edges.get_negative_samples(context=pair[1]) 156 | 157 | return pair, neg_samples 158 | 159 | @staticmethod 160 | def collate(batches): 161 | logger.debug('Batches: {}'.format(batches)) 162 | batch_list = [] 163 | 164 | for batch in batches: 165 | pair = np.array(batch[0]) 166 | negs = np.array(batch[1]) 167 | negs = np.vstack((pair[0].repeat(negs.shape[0]), negs)).T 168 | 169 | # Create arrays 170 | pair_arr = np.ones((pair.shape[0]), dtype=int) # This sets label to 1 # TODO: Leave label as continuous 171 | pair_arr[:-1] = pair[:-1] 172 | negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int) 173 | negs_arr[:, :-1] = negs 174 | all_arr = np.vstack((pair_arr, negs_arr)) 175 | batch_list.append(all_arr) 176 | 177 | batch_array = np.vstack(batch_list) 178 | 179 | # Return item1, item2, label 180 | return (torch.LongTensor(batch_array[:, 0]), torch.LongTensor(batch_array[:, 1]), 181 | torch.FloatTensor(batch_array[:, 2])) 182 | 183 | @staticmethod 184 | def collate_continuous(batches): 185 | logger.debug('Batches: {}'.format(batches)) 186 | batch_list = [] 187 | 188 | for batch in batches: 189 | pair = np.array(batch[0]) 190 | negs = np.array(batch[1]) 191 | negs = np.vstack((pair[0].repeat(negs.shape[0]), negs)).T 192 | 193 | # Create arrays 194 | pair_arr = pair 195 | negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int) 196 | negs_arr[:, :-1] = negs 197 | all_arr = np.vstack((pair_arr, negs_arr)) 198 | batch_list.append(all_arr) 199 | 200 | batch_array = np.vstack(batch_list) 201 | 202 | # Return item1, item2, label 203 | return (torch.LongTensor(batch_array[:, 0]), torch.LongTensor(batch_array[:, 1]), 204 | torch.FloatTensor(batch_array[:, 2])) 205 | -------------------------------------------------------------------------------- /src/ml/data_loader_with_meta.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import Counter, OrderedDict 3 | from typing import Dict, List, Tuple 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from category_encoders import OrdinalEncoder 9 | from torch.utils.data import Dataset 10 | 11 | from src.config import MODEL_PATH 12 | from src.utils.io_utils import save_model 13 | from src.utils.logger import logger 14 | 15 | # META_COLS = ['asin', 'price', 'category_lvl_2', 'category_lvl_3', 'category_lvl_4', 'brand'] 16 | 17 | 18 | def round_up(num, divisor=5): 19 | return ((num + divisor - 1) // divisor) * divisor 20 | 21 | 22 | def bin_price(price): 23 | if price < 25: 24 | return price 25 | elif 25 < price < 
50: 26 | return round_up(price, divisor=5) 27 | elif 50 < price < 500: 28 | return round_up(price, divisor=10) 29 | else: 30 | return 500 31 | 32 | 33 | def prep_price(price_col): 34 | price = np.round(price_col) 35 | price.fillna(-1, inplace=True) 36 | price = price.astype(int) 37 | price = price.apply(bin_price) 38 | price = price + 1 39 | 40 | return price 41 | 42 | 43 | def prep_categorical(cat_col, min_threshold=100): 44 | counts = cat_col.value_counts() 45 | category_set = counts[counts > min_threshold].index 46 | 47 | return np.where(cat_col.isin(category_set), cat_col, 'MISC') 48 | 49 | 50 | def get_dict_values(meta, META_COLS): 51 | return [meta[col] for col in META_COLS] 52 | 53 | 54 | class Sequences: 55 | NEGATIVE_SAMPLE_TABLE_SIZE = 1e7 56 | WINDOW = 5 57 | 58 | def __init__(self, sequence_path: str, val_path: str, meta_path: str, subsample: float = 0.001, 59 | power: float = 0.75): 60 | """ 61 | Initializes a Sequence object for use in a Dataset. 62 | 63 | Args: 64 | sequence_path: Path to numpy array of sequences, where each row is a sequence 65 | subsample: Subsampling parameter; suggested range (0, 1e-5) 66 | power: Negative sampling parameter; suggested 0.75 67 | """ 68 | self.negative_idx = 0 69 | self.n_unique_tokens = 0 70 | # META_COLS = ['asin', 'price', 'category_lvl_2', 'category_lvl_3', 'category_lvl_4', 'brand'] 71 | self.META_COLS = ['category_lvl_3', 'brand'] # Add meta columns here 72 | 73 | self.sequences = np.load(sequence_path).tolist() 74 | self.n_sequences = len(self.sequences) 75 | logger.info('Sequences loaded (length = {:,})'.format(self.n_sequences)) 76 | 77 | self.val = pd.read_csv(val_path) 78 | logger.info('Validation set loaded: {}'.format(self.val.shape)) 79 | 80 | self.word_freq = self.get_word_freq() 81 | logger.info('Word frequency calculated') 82 | 83 | self.word2id, self.id2word = self.get_mapping_dicts() 84 | self.add_val_product_to_mapping_dicts() 85 | self.n_unique_tokens = len(self.word2id) 86 | logger.info('No. of unique tokens: {}'.format(self.n_unique_tokens)) 87 | save_model(self.word2id, '{}/word2id'.format(MODEL_PATH)) 88 | save_model(self.id2word, '{}/id2word'.format(MODEL_PATH)) 89 | logger.info('Word2Id and Id2Word created and saved') 90 | 91 | self.meta = pd.read_csv(meta_path, dtype={'asin': 'object'}) 92 | self.meta.drop_duplicates(subset='asin', inplace=True) 93 | self.meta['productid'] = self.meta['asin'].copy() 94 | self.meta = self.prep_meta() 95 | self.meta_dict, self.emb_sizes = self.convert_meta_to_dict() 96 | self.emb_sizes['product'] = len(self.word2id) 97 | logger.info('Embedding dimensions: {}'.format(self.emb_sizes)) 98 | save_model(self.meta_dict, '{}/meta_dict'.format(MODEL_PATH)) 99 | self.meta = None 100 | 101 | self.sequences = self.convert_sequence_to_id() 102 | self.word_freq = self.convert_word_freq_to_id() 103 | logger.info('Convert sequence and wordfreq to ID') 104 | 105 | self.discard_probs = self.get_discard_probs(sample=subsample) 106 | logger.info('Discard probability calculated') 107 | 108 | self.neg_table = self.get_negative_sample_table(power=power) 109 | logger.info('Negative sample table created') 110 | 111 | # Used to preload all center context pairs (very memory heavy) 112 | # self.pairs = self.get_all_center_context_pairs(window=window) 113 | # self.n_pairs = len(self.pairs) 114 | # logger.info('Center Context pairs created') 115 | 116 | def get_word_freq(self) -> Counter: 117 | """ 118 | Returns a dictionary of word frequencies. 
119 | 120 | Returns: 121 | 122 | """ 123 | # Flatten list 124 | seq_flat = list(itertools.chain.from_iterable(self.sequences)) 125 | 126 | # Get word frequency 127 | word_freq = Counter(seq_flat) 128 | 129 | return word_freq 130 | 131 | def get_mapping_dicts(self): 132 | word2id = dict() 133 | id2word = dict() 134 | 135 | wid = 0 136 | for w, c in self.word_freq.items(): 137 | word2id[w] = wid 138 | id2word[wid] = w 139 | wid += 1 140 | 141 | return word2id, id2word 142 | 143 | def add_val_product_to_mapping_dicts(self): 144 | val_product_set = set(self.val['product1'].values).union(set(self.val['product2'].values)) 145 | 146 | logger.info('Adding val products to word2id, original size: {}'.format(len(self.word2id))) 147 | wid = max(self.word2id.values()) + 1 148 | for w in val_product_set: 149 | if w not in self.word2id: 150 | self.word2id[w] = wid 151 | self.id2word[wid] = w 152 | wid += 1 153 | 154 | self.val = None # Release memory 155 | logger.info('Added val products to word2id, updated size: {}'.format(len(self.word2id))) 156 | 157 | def convert_sequence_to_id(self): 158 | return np.vectorize(self.word2id.get)(self.sequences) 159 | 160 | def get_product_id(self, x): 161 | return self.word2id.get(x, -1) 162 | 163 | def convert_word_freq_to_id(self): 164 | return {self.word2id[k]: v for k, v in self.word_freq.items()} 165 | 166 | def prep_meta(self): 167 | logger.info('No. of rows in meta before filter by word2id: {}'.format(self.meta.shape[0])) 168 | meta = self.meta[self.meta['asin'].isin(self.word2id.keys())].copy() 169 | logger.info('No. of rows in meta after filter by word2id: {}'.format(meta.shape[0])) 170 | 171 | meta['price'] = prep_price(meta['price']) 172 | meta['category_lvl_2'] = prep_categorical(meta['category_lvl_2']) 173 | meta['category_lvl_3'] = prep_categorical(meta['category_lvl_3']) 174 | meta['category_lvl_4'] = prep_categorical(meta['category_lvl_4']) 175 | meta['brand'] = prep_categorical(meta['brand']) 176 | 177 | return meta 178 | 179 | def convert_meta_to_dict(self): 180 | meta = self.meta[['productid'] + self.META_COLS].copy() 181 | 182 | # Encode to int 183 | encoder = OrdinalEncoder(cols=self.META_COLS) 184 | meta = encoder.fit_transform(meta) 185 | save_model(encoder, '{}/encoder'.format(MODEL_PATH)) 186 | 187 | meta['values'] = meta.apply(get_dict_values, args=(self.META_COLS,), axis=1) 188 | meta_dict = meta.set_index('productid')['values'].to_dict() 189 | meta_dict = {self.word2id[k]: v for k, v in meta_dict.items()} 190 | 191 | meta_counts_dict = (meta[self.META_COLS].max() + 1).to_dict() # Need to +1 to account for index starting from zero 192 | # Without +1 the embedding size will be insufficient by 1 193 | ordered_meta_counts_dict = OrderedDict() 194 | for col in ['product'] + self.META_COLS: 195 | ordered_meta_counts_dict[col] = meta_counts_dict.get(col, 0) 196 | 197 | return meta_dict, ordered_meta_counts_dict 198 | 199 | def get_discard_probs(self, sample=0.001) -> Dict[int, float]: 200 | """ 201 | Returns a dictionary of words and their associated discard probability, where the word should be discarded 202 | if np.random.rand() < probability. 
203 | 204 | Args: 205 | sample: 206 | 207 | Returns: 208 | 209 | """ 210 | # Convert to array 211 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 212 | 213 | # Convert to probabilities 214 | word_freq[:, 1] = word_freq[:, 1] / word_freq[:, 1].sum() 215 | 216 | # Perform subsampling 217 | # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/ 218 | word_freq[:, 1] = (np.sqrt(word_freq[:, 1] / sample) + 1) * (sample / word_freq[:, 1]) 219 | 220 | # Get dict 221 | discard_probs = {int(k): v for k, v in word_freq.tolist()} 222 | 223 | return discard_probs 224 | 225 | def get_negative_sample_table(self, power=0.75) -> np.array: 226 | """ 227 | Returns a table (size = NEGATIVE_SAMPLE_TABLE_SIZE) of negative samples which can be selected via indexing. 228 | 229 | Args: 230 | power: 231 | 232 | Returns: 233 | 234 | """ 235 | # Convert to array 236 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 237 | 238 | # Adjust by power 239 | word_freq[:, 1] = word_freq[:, 1] ** power 240 | 241 | # Get probabilities 242 | word_freq_sum = word_freq[:, 1].sum() 243 | word_freq[:, 1] = word_freq[:, 1] / word_freq_sum 244 | 245 | # Multiply probabilities by sample table size 246 | word_freq[:, 1] = np.round(word_freq[:, 1] * self.NEGATIVE_SAMPLE_TABLE_SIZE) 247 | 248 | # Convert to int 249 | word_freq = word_freq.astype(int).tolist() 250 | 251 | # Create sample table 252 | sample_table = [[tup[0]] * tup[1] for tup in word_freq] 253 | sample_table = np.array(list(itertools.chain.from_iterable(sample_table))) 254 | np.random.shuffle(sample_table) 255 | 256 | return sample_table 257 | 258 | def get_meta(self, idx): 259 | return self.meta_dict.get(idx, [0] * len(self.META_COLS)) 260 | 261 | # Works on per sequence 262 | def get_pairs(self, idx, window=5): 263 | pairs = [] 264 | sequence = self.sequences[idx] 265 | 266 | for center_idx, center in enumerate(sequence): 267 | for i in range(-window, window + 1): 268 | context_idx = center_idx + i 269 | if context_idx >= 0 and context_idx < len(sequence) and center != sequence[ 270 | context_idx] and np.random.rand() < self.discard_probs[sequence[context_idx]]: 271 | context = sequence[context_idx] 272 | center_meta = self.get_meta(center) 273 | context_meta = self.get_meta(center) 274 | pairs.append(([center] + center_meta, [context] + context_meta)) 275 | 276 | return pairs 277 | 278 | def get_all_center_context_pairs(self, window=5) -> List[Tuple[int, int]]: 279 | """ 280 | Returns a list of tuples (center, context). 281 | 282 | Args: 283 | window: 284 | 285 | Returns: 286 | 287 | """ 288 | 289 | pairs = [] 290 | 291 | for sequence in self.sequences: 292 | for center_idx, node in enumerate(sequence): 293 | for i in range(-window, window + 1): 294 | context_idx = center_idx + i 295 | if (0 <= context_idx < len(sequence)) \ 296 | and node != sequence[context_idx] \ 297 | and np.random.rand() < self.discard_probs[sequence[context_idx]]: 298 | pairs.append((node, sequence[context_idx])) 299 | 300 | return pairs 301 | 302 | def get_negative_samples(self, context, sample_size=5) -> np.array: 303 | """ 304 | Returns a list of negative samples, where len = sample_size. 
305 | 306 | Args: 307 | sample_size: 308 | 309 | Returns: 310 | 311 | """ 312 | while True: 313 | # Get a batch from the shuffled table 314 | neg_sample = self.neg_table[self.negative_idx:self.negative_idx + sample_size] 315 | 316 | # Update negative index 317 | self.negative_idx = (self.negative_idx + sample_size) % len(self.neg_table) 318 | 319 | # Check if batch insufficient 320 | if len(neg_sample) != sample_size: 321 | neg_sample = np.concatenate((neg_sample, self.neg_table[:self.negative_idx])) 322 | 323 | # Check if context in negative sample 324 | if not context in neg_sample: 325 | return [[samp] + self.get_meta(samp) for samp in neg_sample] 326 | 327 | 328 | class EdgesDataset(Dataset): 329 | def __init__(self, sequences: Sequences, neg_sample_size=5): 330 | self.sequences = sequences 331 | self.neg_sample_size = neg_sample_size 332 | 333 | def __len__(self): 334 | return self.sequences.n_sequences 335 | 336 | def __getitem__(self, idx): 337 | pairs = self.sequences.get_pairs(idx) 338 | neg_samples = [] 339 | for center, context in pairs: 340 | neg_samples.append(self.sequences.get_negative_samples(context)) 341 | 342 | return pairs, neg_samples 343 | 344 | @staticmethod 345 | def collate(batches): 346 | # logger.info('Batches: {}'.format(batches)) 347 | pairs_batch = [batch[0] for batch in batches] 348 | neg_contexts_batch = [batch[1] for batch in batches] 349 | 350 | pairs_batch = list(itertools.chain.from_iterable(pairs_batch)) 351 | neg_contexts = list(itertools.chain.from_iterable(neg_contexts_batch)) 352 | 353 | centers = [center for center, _ in pairs_batch] 354 | contexts = [context for _, context in pairs_batch] 355 | 356 | return torch.LongTensor(centers), torch.LongTensor(contexts), torch.LongTensor(neg_contexts) 357 | -------------------------------------------------------------------------------- /src/ml/mf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MF(nn.Module): 15 | def __init__(self, emb_size, emb_dim, c_vector=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | 21 | # Layers 22 | self.embedding = nn.Embedding(emb_size, emb_dim) 23 | self.sig = nn.Sigmoid() 24 | 25 | # Loss 26 | self.bce = nn.BCELoss() 27 | 28 | logger.info('Model initialized: {}'.format(self)) 29 | 30 | def forward(self, product1, product2): 31 | emb_product1 = self.embedding(product1) 32 | emb_product2 = self.embedding(product2) 33 | interaction = self.sig(torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float)) 34 | 35 | return interaction 36 | 37 | def loss(self, pred, label): 38 | mf_loss = self.bce(pred, label) 39 | 40 | # L2 regularization 41 | product_prior = regularize_l2(self.embedding.weight) * self.c_vector 42 | 43 | loss_total = mf_loss + product_prior 44 | 45 | return loss_total 46 | -------------------------------------------------------------------------------- /src/ml/mf_bias.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MFBias(nn.Module): 15 | def __init__(self, 
emb_size, emb_dim, c_vector=1e-6, c_bias=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | self.c_bias = c_bias 21 | 22 | # Layers 23 | self.product_embedding = nn.Embedding(emb_size, emb_dim) 24 | self.sig = nn.Sigmoid() 25 | 26 | # Bias 27 | self.product_bias = nn.Embedding(emb_size, 1) 28 | self.bias = nn.Parameter(torch.ones(1)) 29 | 30 | # Loss 31 | self.bce = nn.BCELoss() 32 | 33 | logger.info('Model initialized: {}'.format(self)) 34 | 35 | def forward(self, product1, product2): 36 | emb_product1 = self.product_embedding(product1) 37 | emb_product2 = self.product_embedding(product2) 38 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 39 | 40 | bias_product1 = self.product_bias(product1).squeeze() 41 | bias_product2 = self.product_bias(product2).squeeze() 42 | biases = self.bias + bias_product1 + bias_product2 43 | 44 | prediction = self.sig((interaction + biases)) 45 | 46 | return prediction 47 | 48 | def loss(self, pred, label): 49 | mf_loss = self.bce(pred, label) 50 | 51 | # L2 regularization 52 | product_prior = regularize_l2(self.product_embedding.weight) * self.c_vector 53 | product_bias_prior = regularize_l2(self.product_bias.weight) * self.c_bias 54 | 55 | loss_total = mf_loss + product_prior + product_bias_prior 56 | 57 | return loss_total 58 | -------------------------------------------------------------------------------- /src/ml/mf_bias_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MFBiasContinuous(nn.Module): 15 | def __init__(self, emb_size, emb_dim, c_vector=1e-6, c_bias=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | self.c_bias = c_bias 21 | 22 | # Layers 23 | self.product_embedding = nn.Embedding(emb_size, emb_dim) 24 | self.sig = nn.Sigmoid() 25 | 26 | # Bias 27 | self.product_bias = nn.Embedding(emb_size, 1) 28 | self.bias = nn.Parameter(torch.ones(1)) 29 | 30 | # Loss 31 | self.mse = nn.MSELoss() 32 | 33 | logger.info('Model initialized: {}'.format(self)) 34 | 35 | def forward(self, product1, product2): 36 | emb_product1 = self.product_embedding(product1) 37 | emb_product2 = self.product_embedding(product2) 38 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 39 | 40 | bias_product1 = self.product_bias(product1).squeeze() 41 | bias_product2 = self.product_bias(product2).squeeze() 42 | biases = self.bias + bias_product1 + bias_product2 43 | 44 | prediction = (interaction + biases) 45 | 46 | return prediction 47 | 48 | def predict(self, product1, product2): 49 | emb_product1 = self.product_embedding(product1) 50 | emb_product2 = self.product_embedding(product2) 51 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 52 | 53 | bias_product1 = self.product_bias(product1).squeeze() 54 | bias_product2 = self.product_bias(product2).squeeze() 55 | biases = self.bias + bias_product1 + bias_product2 56 | 57 | prediction = self.sig((interaction + biases)) 58 | 59 | return prediction 60 | 61 | def loss(self, pred, label): 62 | mf_loss = self.mse(pred, label) 63 | 64 | # L2 regularization 65 | product_prior = regularize_l2(self.product_embedding.weight) * 
self.c_vector 66 | product_bias_prior = regularize_l2(self.product_bias.weight) * self.c_bias 67 | 68 | loss_total = mf_loss + product_prior + product_bias_prior 69 | 70 | return loss_total 71 | -------------------------------------------------------------------------------- /src/ml/mf_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MFContinuous(nn.Module): 15 | def __init__(self, emb_size, emb_dim, c_vector=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | 21 | # Layers 22 | self.embedding = nn.Embedding(emb_size, emb_dim) 23 | self.sig = nn.Sigmoid() 24 | 25 | # Loss 26 | self.mse = nn.MSELoss() 27 | 28 | logger.info('Model initialized: {}'.format(self)) 29 | 30 | def forward(self, product1, product2): 31 | emb_product1 = self.embedding(product1) 32 | emb_product2 = self.embedding(product2) 33 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 34 | 35 | return interaction 36 | 37 | def predict(self, product1, product2): 38 | emb_product1 = self.embedding(product1) 39 | emb_product2 = self.embedding(product2) 40 | interaction = self.sig(torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float)) # Add sigmoid 41 | 42 | return interaction 43 | 44 | def loss(self, pred, label): 45 | mf_loss = self.mse(pred, label) 46 | 47 | # L2 regularization 48 | product_prior = regularize_l2(self.embedding.weight) * self.c_vector 49 | 50 | loss_total = mf_loss + product_prior 51 | 52 | return loss_total 53 | -------------------------------------------------------------------------------- /src/ml/skipgram.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | torch.manual_seed(1368) 8 | 9 | 10 | class SkipGram(nn.Module): 11 | 12 | def __init__(self, emb_size, emb_dim): 13 | super().__init__() 14 | self.emb_size = emb_size 15 | self.emb_dim = emb_dim 16 | self.center_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 17 | self.context_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 18 | self.init_emb() 19 | 20 | def init_emb(self): 21 | """ 22 | Init embeddings like word2vec 23 | 24 | Center embeddings have uniform distribution in [-0.5/emb_dim , 0.5/emb_dim]. 25 | Context embeddings are initialized with 0s. 
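Zero-initialising the context embeddings follows the reference word2vec implementation: every center-context dot product starts at 0 (sigmoid = 0.5), so early gradients are driven by the uniformly-initialised center embeddings rather than by random noise in the output vectors.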
26 | 27 | Returns: 28 | 29 | """ 30 | emb_range = 0.5 / self.emb_dim 31 | 32 | # Initializing embeddings: 33 | # https://stackoverflow.com/questions/55276504/different-methods-for-initializing-embedding-layer-weights-in-pytorch 34 | self.center_embeddings.weight.data.uniform_(-emb_range, emb_range) 35 | self.context_embeddings.weight.data.uniform_(0, 0) 36 | 37 | def forward(self, center, context, neg_context): 38 | """ 39 | 40 | Args: 41 | center: List of center words 42 | context: List of context words 43 | neg_context: List of list of negative context words 44 | 45 | Returns: 46 | 47 | """ 48 | # Calculate positive score 49 | emb_center = self.center_embeddings(center) # Get embeddings for center word 50 | emb_context = self.context_embeddings(context) # Get embeddings for context word 51 | emb_neg_context = self.context_embeddings(neg_context) # Get embeddings for negative context words 52 | 53 | # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch 54 | score = torch.mul(emb_center, emb_context) # Get dot product (part 1) 55 | score = torch.sum(score, dim=1) # Get dot product (part2) 56 | score = torch.clamp(score, max=10, min=-10) 57 | score = -F.logsigmoid(score) # Get score for the positive pairs 58 | 59 | # Calculate negative score (for negative samples) 60 | neg_score = torch.bmm(emb_neg_context, emb_center.unsqueeze(2)).squeeze() # Get dot product 61 | neg_score = torch.clamp(neg_score, max=10, min=-10) 62 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 63 | 64 | # Return combined score 65 | return torch.mean(score + neg_score) 66 | 67 | def get_center_emb(self, center): 68 | return self.center_embeddings(center) 69 | 70 | def save_embeddings(self, file_name): 71 | embedding = self.center_embeddings.weight.cpu().data.numpy() 72 | np.save(file_name, embedding) 73 | -------------------------------------------------------------------------------- /src/ml/skipgram_with_meta.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from src.utils.logger import logger 7 | 8 | torch.manual_seed(1368) 9 | 10 | 11 | class SkipGram(nn.Module): 12 | 13 | def __init__(self, emb_sizes, emb_dim): 14 | super().__init__() 15 | self.emb_sizes = emb_sizes 16 | self.emb_dim = emb_dim 17 | 18 | # Create embedding layers 19 | self.center_embeddings = nn.ModuleList() 20 | for k, v in self.emb_sizes.items(): 21 | self.center_embeddings.append(nn.Embedding(v, emb_dim, sparse=True)) 22 | 23 | self.context_embeddings = nn.ModuleList() 24 | for k, v in self.emb_sizes.items(): 25 | self.context_embeddings.append(nn.Embedding(v, emb_dim, sparse=True)) 26 | 27 | self.init_emb() 28 | 29 | def init_emb(self): 30 | """ 31 | Init embeddings like word2vec 32 | 33 | Center embeddings have uniform distribution in [-0.5/emb_dim , 0.5/emb_dim]. 34 | Context embeddings are initialized with 0s. 
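The same scheme is applied per feature: every entry in emb_sizes (product ID plus each metadata field) gets its own center and context table, and forward() averages the per-feature embeddings before computing the skip-gram scores.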
35 | 36 | Returns: 37 | 38 | """ 39 | emb_range = 0.5 / self.emb_dim 40 | 41 | # Initializing embeddings: 42 | # https://stackoverflow.com/questions/55276504/different-methods-for-initializing-embedding-layer-weights-in-pytorch 43 | for emb in self.center_embeddings: 44 | emb.weight.data.uniform_(-emb_range, emb_range) 45 | 46 | for emb in self.context_embeddings: 47 | emb.weight.data.uniform_(0, 0) 48 | 49 | def forward(self, centers, contexts, neg_contexts): 50 | """ 51 | 52 | Args: 53 | center: List of center words 54 | context: List of context words 55 | neg_context: List of list of negative context words 56 | 57 | Returns: 58 | 59 | """ 60 | # Calculate positive score 61 | emb_centers = [] 62 | for i in range(centers.shape[1]): 63 | logger.debug('center i: {}'.format(i)) 64 | emb_centers.append(self.center_embeddings[i](centers[:, i])) 65 | emb_center = torch.mean(torch.stack(emb_centers), axis=0) 66 | 67 | emb_contexts = [] 68 | for i in range(contexts.shape[1]): 69 | logger.debug('context i: {}'.format(i)) 70 | emb_contexts.append(self.context_embeddings[i](contexts[:, i])) 71 | emb_context = torch.mean(torch.stack(emb_contexts), axis=0) 72 | 73 | emb_neg_contexts = [] 74 | neg_contexts = neg_contexts.view(-1, len(self.context_embeddings)) 75 | for i in range(neg_contexts.shape[1]): 76 | logger.debug('neg context i: {}, {}'.format(i, neg_contexts[:, i])) 77 | emb_neg_contexts.append(self.context_embeddings[i](neg_contexts[:, i])) 78 | emb_neg_context = torch.mean(torch.stack(emb_neg_contexts), axis=0) 79 | 80 | # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch 81 | score = torch.mul(emb_center, emb_context) # Get dot product (part 1) 82 | score = torch.sum(score, dim=1) # Get dot product (part2) 83 | score = torch.clamp(score, max=10, min=-10) 84 | score = -F.logsigmoid(score) # Get score for the positive pairs 85 | 86 | # Calculate negative score (for negative samples) 87 | neg_score = torch.bmm(emb_neg_context.view(emb_center.shape[0], -1, emb_center.shape[1]), 88 | emb_center.unsqueeze(2)).squeeze() # Get dot product 89 | neg_score = torch.clamp(neg_score, max=10, min=-10) 90 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 91 | 92 | # Return combined score 93 | return torch.mean(score + neg_score) 94 | 95 | def get_center_emb(self, centers): 96 | emb_centers = [] 97 | for row_idx, center in enumerate(centers): 98 | emb_center = [] 99 | for col_idx, center_ in enumerate(center): 100 | emb_center.append(self.center_embeddings[col_idx](center_)) 101 | 102 | emb_centers.append(torch.mean(torch.stack(emb_center), axis=0)) 103 | 104 | return torch.stack(emb_centers) 105 | 106 | def save_embeddings(self, file_name): 107 | embedding = self.center_embeddings.weight.cpu().data.numpy() 108 | np.save(file_name, embedding) 109 | -------------------------------------------------------------------------------- /src/ml/skipgram_with_meta_weighted.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from src.utils.logger import logger 7 | 8 | torch.manual_seed(1368) 9 | 10 | 11 | class SkipGram(nn.Module): 12 | 13 | def __init__(self, emb_sizes, emb_dim): 14 | super().__init__() 15 | self.emb_sizes = emb_sizes 16 | self.emb_dim = emb_dim 17 | 18 | # Create embedding layers 19 | self.center_embeddings = nn.ModuleList() 20 | for k, v in self.emb_sizes.items(): 21 | self.center_embeddings.append(nn.Embedding(v, emb_dim, 
sparse=True)) 22 | 23 | self.context_embeddings = nn.ModuleList() 24 | for k, v in self.emb_sizes.items(): 25 | self.context_embeddings.append(nn.Embedding(v, emb_dim, sparse=True)) 26 | 27 | # Create embedding weighting layer 28 | self.emb_weights = nn.Embedding(emb_sizes['product'], len(emb_sizes), 29 | sparse=True) # emb_sizes['product'] is total number of products 30 | self.emb_weights_softmax = nn.Softmax(dim=1) 31 | 32 | self.init_emb() 33 | 34 | logger.info('Model initialized: {}'.format(self)) 35 | 36 | def init_emb(self): 37 | """ 38 | Init embeddings like word2vec 39 | 40 | Center embeddings have uniform distribution in [-0.5/emb_dim , 0.5/emb_dim]. 41 | Context embeddings are initialized with 0s. 42 | 43 | Returns: 44 | 45 | """ 46 | emb_range = 0.5 / self.emb_dim 47 | 48 | # Initializing embeddings: 49 | # https://stackoverflow.com/questions/55276504/different-methods-for-initializing-embedding-layer-weights-in-pytorch 50 | for emb in self.center_embeddings: 51 | emb.weight.data.uniform_(-emb_range, emb_range) 52 | 53 | for emb in self.context_embeddings: 54 | emb.weight.data.uniform_(0, 0) 55 | 56 | emb_weights_init = 1 / len(self.emb_sizes) 57 | self.emb_weights.weight.data.uniform_(emb_weights_init) 58 | 59 | def get_embedding(self, nodes): 60 | embs = [] 61 | emb_weight = self.emb_weights(nodes[:, 0]) 62 | emb_weight_norm = self.emb_weights_softmax(emb_weight) 63 | 64 | for i in range(nodes.shape[1]): 65 | logger.debug('center i: {}'.format(i)) 66 | embs.append(self.center_embeddings[i](nodes[:, i])) 67 | emb_stack = torch.stack(embs) 68 | embs_weighted = emb_stack * emb_weight_norm.T.unsqueeze(2).expand_as(emb_stack) 69 | emb = torch.sum(embs_weighted, axis=0) 70 | 71 | return emb 72 | 73 | def forward(self, centers, contexts, neg_contexts): 74 | """ 75 | 76 | Args: 77 | center: List of center words 78 | context: List of context words 79 | neg_context: List of list of negative context words 80 | 81 | Returns: 82 | 83 | """ 84 | emb_center = self.get_embedding(centers) 85 | emb_context = self.get_embedding(contexts) 86 | 87 | neg_contexts = neg_contexts.view(-1, len(self.context_embeddings)) # Need to expand this first 88 | emb_neg_context = self.get_embedding(neg_contexts) 89 | 90 | # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch 91 | score = torch.mul(emb_center, emb_context) # Get dot product (part 1) 92 | score = torch.sum(score, dim=1) # Get dot product (part2) 93 | score = torch.clamp(score, max=10, min=-10) 94 | score = -F.logsigmoid(score) # Get score for the positive pairs 95 | 96 | # Calculate negative score (for negative samples) 97 | neg_score = torch.bmm(emb_neg_context.view(emb_center.shape[0], -1, emb_center.shape[1]), 98 | emb_center.unsqueeze(2)).squeeze() # Get dot product 99 | neg_score = torch.clamp(neg_score, max=10, min=-10) 100 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 101 | 102 | # Return combined score 103 | return torch.mean(score + neg_score) 104 | 105 | def get_center_emb(self, centers): 106 | emb_centers = [] 107 | for row_idx, center in enumerate(centers): 108 | emb_center = [] 109 | for col_idx, center_ in enumerate(center): 110 | emb_center.append(self.center_embeddings[col_idx](center_)) 111 | 112 | emb_centers.append(torch.mean(torch.stack(emb_center), axis=0)) 113 | 114 | return torch.stack(emb_centers) 115 | 116 | def save_embeddings(self, file_name): 117 | embedding = self.center_embeddings.weight.cpu().data.numpy() 118 | np.save(file_name, embedding) 119 | 
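The weighted variant above replaces the plain average of feature embeddings with a per-product softmax weighting (see get_embedding): a small emb_weights table, indexed by product ID, produces one weight per feature, and the stacked feature embeddings are summed with those weights. Below is a minimal, self-contained sketch of just that combination step, using toy sizes and made-up inputs (the names tables, feature_weights and nodes are illustrative, not from this repository):

import torch
import torch.nn as nn

# Toy sizes: 4 products, 3 feature tables (product ID, category, brand), 8-dim embeddings
n_products, n_features, emb_dim = 4, 3, 8

tables = nn.ModuleList([nn.Embedding(n_products, emb_dim) for _ in range(n_features)])
feature_weights = nn.Embedding(n_products, n_features)  # one weight per feature, looked up by product ID
softmax = nn.Softmax(dim=1)

nodes = torch.tensor([[0, 1, 2], [3, 0, 1]])  # (batch, n_features); column 0 is the product ID

w = softmax(feature_weights(nodes[:, 0]))                                   # (batch, n_features), rows sum to 1
stacked = torch.stack([tables[i](nodes[:, i]) for i in range(n_features)])  # (n_features, batch, emb_dim)
emb = (stacked * w.T.unsqueeze(2)).sum(dim=0)                               # (batch, emb_dim), weighted sum over features
print(emb.shape)  # torch.Size([2, 8])

Broadcasting the (n_features, batch, 1) weights against the (n_features, batch, emb_dim) stack gives the same result as the expand_as call in the model above.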
-------------------------------------------------------------------------------- /src/ml/train_gensim_embedding.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | from gensim.models import Word2Vec 6 | 7 | from src.config import MODEL_PATH 8 | from src.utils.logger import logger 9 | 10 | 11 | def load_sequences(sequence_path): 12 | """ 13 | Expects a numpy array at sequence_path 14 | 15 | Args: 16 | sequence_path: 17 | 18 | Returns: 19 | 20 | """ 21 | sequences = np.load(sequence_path) 22 | logger.info('Sequences shape: {}'.format(sequences.shape)) 23 | 24 | # Convert sequences to string and list of list 25 | sequences = sequences.astype(str).tolist() 26 | 27 | return sequences 28 | 29 | 30 | def train_embeddings(sequences, workers, dimension=128, window=5, min_count=1, negative=5, epochs=3, seed=42): 31 | # Logging specific to gensim training 32 | import logging 33 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 34 | 35 | # Initialize model 36 | model = Word2Vec(sequences, workers=workers, 37 | size=dimension, window=window, min_count=min_count, negative=negative, seed=seed) 38 | logger.info('Model initialized') 39 | 40 | # Train model (No need to retrain model as initialization includes training) 41 | # model.train(sequences, total_examples=len(sequences), epochs=epochs) 42 | # logger.info('Model trained!') 43 | 44 | return model 45 | 46 | 47 | def save_model(model): 48 | # Save model and keyedvectors 49 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 50 | model.save('{}/gensim-w2v-{}.model'.format(MODEL_PATH, current_datetime)) 51 | model.wv.save('{}/gensim-w2v-{}.kv'.format(MODEL_PATH, current_datetime)) 52 | 53 | 54 | if __name__ == '__main__': 55 | parser = argparse.ArgumentParser(description='Create embeddings using gensim package') 56 | parser.add_argument('read_path', type=str, help='Path to input sequences') 57 | parser.add_argument('n_workers', type=int, help='Number of workers') 58 | args = parser.parse_args() 59 | 60 | sequences = load_sequences(args.read_path) 61 | 62 | start_time = datetime.datetime.now() 63 | model = train_embeddings(sequences, workers=args.n_workers) 64 | end_time = datetime.datetime.now() 65 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 66 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 67 | save_model(model) 68 | -------------------------------------------------------------------------------- /src/ml/train_node2vec_embeddings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import networkx as nx 4 | from node2vec import Node2Vec 5 | 6 | from src.config import DATA_PATH 7 | from src.utils.logger import logger 8 | 9 | 10 | def train_embeddings(edgelist_path, embedding_path): 11 | # Create path 12 | graph = nx.read_weighted_edgelist(edgelist_path) 13 | logger.info('Graph created!') 14 | assert graph.get_edge_data('0000013714', '0005064295')['weight'] == 3.2, 'Expected edge weight of 3.2' 15 | 16 | # Precomput probabilities and generate walks 17 | node2vec = Node2Vec(graph, dimensions=128, walk_length=30, num_walks=10, workers=10, temp_folder=DATA_PATH) 18 | logger.info('Computed probabilities and generated walks') 19 | graph = None # We don't need graph anymore since probabilities have been precomputed 20 | 21 | # Embed nodes 22 | model = node2vec.fit(window=5, min_count=1, 
batch_words=128) 23 | logger.info('Nodes embedded') 24 | 25 | # Save embeddings for later use 26 | model.wv.save_word2vec_format(embedding_path) 27 | logger.info('Embedding saved') 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser(description='Create embeddings using node2vec package') 32 | parser.add_argument('read_path', type=str, help='Path to input (train) graph edgelist') 33 | parser.add_argument('write_path', type=str, help='Path to output embeddings') 34 | args = parser.parse_args() 35 | 36 | train_embeddings(args.read_path, args.write_path) 37 | -------------------------------------------------------------------------------- /src/ml/train_torch_embedding.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | import torch.nn.functional as F 8 | from sklearn.metrics import roc_auc_score 9 | from torch import optim 10 | from torch.utils.data import DataLoader 11 | 12 | from src.config import MODEL_PATH 13 | from src.ml.data_loader import Sequences, SequencesDataset 14 | from src.ml.skipgram import SkipGram 15 | from src.utils.logger import logger 16 | 17 | shuffle = True 18 | emb_dim = 128 19 | epochs = 5 20 | initial_lr = 0.025 21 | 22 | # Torch parameters 23 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 24 | torch.cuda.set_device(1) # Set to use 2nd GPU 25 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 26 | 27 | if __name__ == '__main__': 28 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 29 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 30 | parser.add_argument('val_path', type=str, help='Path to val.csv') 31 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 32 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 33 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 34 | args = parser.parse_args() 35 | 36 | # Initialize dataset 37 | sequences = Sequences(args.read_path, args.val_path) 38 | dataset = SequencesDataset(sequences) 39 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 40 | collate_fn=dataset.collate) 41 | 42 | # Initialize validation set 43 | val_samp = pd.read_csv(args.val_samp_path) 44 | 45 | # Get product ID 46 | word2id_func = np.vectorize(sequences.get_product_id) 47 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 48 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 49 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 50 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 51 | 52 | product1_id = val_samp['product1_id'].values 53 | product2_id = val_samp['product2_id'].values 54 | 55 | # Initialize model 56 | skipgram = SkipGram(sequences.n_unique_tokens, emb_dim).to(device) 57 | 58 | # Train loop 59 | optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr) 60 | 61 | results = [] 62 | start_time = datetime.datetime.now() 63 | for epoch in range(epochs): 64 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 65 | running_loss = 0 66 | 67 | # Training loop 68 | for i, batches in enumerate(dataloader): 69 | 70 | centers = batches[0].to(device) 71 | contexts = batches[1].to(device) 72 | neg_contexts = batches[2].to(device) 73 | 74 | optimizer.zero_grad() 75 | loss = skipgram.forward(centers, contexts, neg_contexts) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | product1_emb = skipgram.get_center_emb(torch.LongTensor(product1_id).to(device)) 86 | product2_emb = skipgram.get_center_emb(torch.LongTensor(product2_id).to(device)) 87 | cos_sim = F.cosine_similarity(product1_emb, product2_emb) 88 | score = roc_auc_score(val_samp['edge'], cos_sim.detach().cpu().numpy()) 89 | 90 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 91 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 92 | score, optimizer.param_groups[0]['lr'])) 93 | results.append([epoch, i, running_loss, score]) 94 | running_loss = 0 95 | 96 | # save model 97 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 98 | state_dict_path = '{}/skipgram_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 99 | torch.save(skipgram.state_dict(), state_dict_path) 100 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 101 | 102 | end_time = datetime.datetime.now() 103 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 104 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 105 | 106 | # Save results 107 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 108 | results_df.to_csv('{}/model_metrics_w2v.csv'.format(MODEL_PATH), index=False) 109 | -------------------------------------------------------------------------------- /src/ml/train_torch_embedding_with_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Does not do well as fill rate for meta data is low. Just considering product IDs, only 40% of the data is present. 3 | - Embedding dimensions: OrderedDict([('product', 418749), ('asin', 162024)]) 4 | 5 | This number is much lower when we consider category level 2 - 3, and brand. 
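(162,024 of the 418,749 products, roughly 40%, have an asin entry; products without metadata fall back to the default index 0 returned by get_meta.)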
6 | """ 7 | import argparse 8 | import datetime 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | import torch.nn.functional as F 14 | from sklearn.metrics import roc_auc_score 15 | from torch import optim 16 | from torch.utils.data import DataLoader 17 | 18 | from src.config import MODEL_PATH 19 | from src.ml.data_loader_with_meta import Sequences, SequencesDataset 20 | from src.ml.skipgram_with_meta_weighted import SkipGram 21 | from src.utils.logger import logger 22 | 23 | shuffle = True 24 | emb_dim = 128 25 | epochs = 5 26 | initial_lr = 0.025 27 | 28 | # Torch parameters 29 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 30 | torch.cuda.set_device(1) # Set to use 2nd GPU 31 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 32 | 33 | if __name__ == '__main__': 34 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 35 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 36 | parser.add_argument('val_path', type=str, help='Path to val.csv') 37 | parser.add_argument('meta_path', type=str, help='Path to meta.csv') 38 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 39 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 40 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 41 | args = parser.parse_args() 42 | 43 | # Initialize dataset 44 | sequences = Sequences(args.read_path, args.val_path, args.meta_path) 45 | dataset = SequencesDataset(sequences) 46 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 47 | collate_fn=dataset.collate) 48 | 49 | # Initialize validation set 50 | val_samp = pd.read_csv(args.val_samp_path) 51 | 52 | # Get product ID 53 | word2id_func = np.vectorize(sequences.get_product_id) 54 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 55 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 56 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 57 | 58 | 59 | def get_id_and_meta(product_id): 60 | return [product_id] + sequences.get_meta(product_id) 61 | 62 | 63 | val_product1 = val_samp['product1_id'].apply(get_id_and_meta) 64 | val_product2 = val_samp['product2_id'].apply(get_id_and_meta) 65 | 66 | # Initialize model 67 | skipgram = SkipGram(sequences.emb_sizes, emb_dim).to(device) 68 | 69 | # Train loop 70 | optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr) 71 | 72 | results = [] 73 | start_time = datetime.datetime.now() 74 | for epoch in range(epochs): 75 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 76 | running_loss = 0 77 | 78 | # Training loop 79 | for i, batches in enumerate(dataloader): 80 | 81 | centers = batches[0].to(device) 82 | contexts = batches[1].to(device) 83 | neg_contexts = batches[2].to(device) 84 | 85 | optimizer.zero_grad() 86 | loss = skipgram.forward(centers, contexts, neg_contexts) 87 | loss.backward() 88 | optimizer.step() 89 | 90 | scheduler.step() 91 | running_loss = running_loss * 0.9 + loss.item() * 0.1 92 | 93 | if i > 0 and i % 1000 == 0: 94 | # Validation Check 95 | with torch.no_grad(): 96 | product1_emb = skipgram.get_center_emb(torch.LongTensor(val_product1).to(device)) 97 | product2_emb = skipgram.get_center_emb(torch.LongTensor(val_product2).to(device)) 98 | cos_sim = F.cosine_similarity(product1_emb, product2_emb) 99 | score = roc_auc_score(val_samp['edge'], cos_sim.detach().cpu().numpy()) 100 | 101 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 102 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 103 | score, optimizer.param_groups[0]['lr'])) 104 | results.append([epoch, i, running_loss, score]) 105 | running_loss = 0 106 | 107 | # save model 108 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 109 | state_dict_path = '{}/skipgram_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 110 | torch.save(skipgram.state_dict(), state_dict_path) 111 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 112 | 113 | end_time = datetime.datetime.now() 114 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 115 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 116 | 117 | # Save results 118 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 119 | results_df.to_csv('{}/model_metrics_w2v_meta.csv'.format(MODEL_PATH), index=False) 120 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader import Sequences, SequencesDataset 13 | from src.ml.mf import MF 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | torch.cuda.set_device(1) # Set to use 2nd GPU 24 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 25 | 26 | if __name__ == '__main__': 27 | parser = 
argparse.ArgumentParser(description='Training embeddings on torch') 28 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 29 | parser.add_argument('val_path', type=str, help='Path to val.csv') 30 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 31 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 32 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 33 | args = parser.parse_args() 34 | 35 | # Initialize dataset 36 | sequences = Sequences(args.read_path, args.val_path) 37 | dataset = SequencesDataset(sequences) 38 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 39 | collate_fn=dataset.collate_for_mf) 40 | 41 | # Initialize validation set 42 | val_samp = pd.read_csv(args.val_samp_path) 43 | 44 | # Get product ID 45 | word2id_func = np.vectorize(sequences.get_product_id) 46 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 47 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 48 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 49 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 50 | 51 | product1_id = val_samp['product1_id'].values 52 | product2_id = val_samp['product2_id'].values 53 | 54 | # Initialize model 55 | mf = MF(sequences.n_unique_tokens, emb_dim).to(device) 56 | 57 | # Train loop 58 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 59 | 60 | results = [] 61 | start_time = datetime.datetime.now() 62 | for epoch in range(epochs): 63 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 64 | running_loss = 0 65 | 66 | # Training loop 67 | for i, batches in enumerate(dataloader): 68 | 69 | product1 = batches[0].to(device) 70 | product2 = batches[1].to(device) 71 | label = batches[2].to(device) 72 | 73 | optimizer.zero_grad() 74 | 75 | pred = mf.forward(product1, product2) 76 | loss = mf.loss(pred, label) 77 | loss.backward() 78 | optimizer.step() 79 | 80 | scheduler.step() 81 | running_loss = running_loss * 0.9 + loss.item() * 0.1 82 | 83 | if i > 0 and i % 1000 == 0: 84 | # Validation Check 85 | with torch.no_grad(): 86 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 87 | torch.LongTensor(val_samp['product2_id']).to(device)) 88 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 89 | 90 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 91 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 92 | score, optimizer.param_groups[0]['lr'])) 93 | results.append([epoch, i, running_loss, score]) 94 | running_loss = 0 95 | 96 | # save model 97 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 98 | state_dict_path = '{}/mf_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 99 | torch.save(mf.state_dict(), state_dict_path) 100 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 101 | 102 | end_time = datetime.datetime.now() 103 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 104 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 105 | 106 | # Save results 107 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 108 | results_df.to_csv('{}/model_metrics_mf.csv'.format(MODEL_PATH), index=False) 109 | 
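The torch training scripts in this repository all share the same loop scaffolding: a CosineAnnealingLR scheduler created with T_max = len(dataloader) and stepped once per batch, a running loss smoothed as 0.9 * previous + 0.1 * current, and an AUC-ROC check on the validation pairs every 1,000 batches (see images/cosine-annealing.png for the resulting learning-rate curve). A minimal sketch of that scaffolding with a toy model and made-up sizes, validation step omitted:

import torch
from torch import optim

model = torch.nn.Linear(8, 1)  # stand-in for the MF / skip-gram models used above
optimizer = optim.Adam(model.parameters(), lr=0.01)
n_batches = 10  # plays the role of len(dataloader)

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, n_batches)  # anneal over one epoch of batches
running_loss = 0
for i in range(n_batches):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 8)).pow(2).mean()  # dummy squared-output loss
    loss.backward()
    optimizer.step()
    scheduler.step()  # stepped per batch, as in the scripts
    running_loss = running_loss * 0.9 + loss.item() * 0.1  # exponentially smoothed loss
    print(i, round(running_loss, 4), optimizer.param_groups[0]['lr'])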
-------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader import Sequences, SequencesDataset 13 | from src.ml.mf_bias import MFBias 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | torch.cuda.set_device(0) # Set to use 2nd GPU 24 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 28 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 29 | parser.add_argument('val_path', type=str, help='Path to val.csv') 30 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 31 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 32 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 33 | args = parser.parse_args() 34 | 35 | # Initialize dataset 36 | sequences = Sequences(args.read_path, args.val_path) 37 | dataset = SequencesDataset(sequences) 38 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 39 | collate_fn=dataset.collate_for_mf) 40 | 41 | # Initialize validation set 42 | val_samp = pd.read_csv(args.val_samp_path) 43 | 44 | # Get product ID 45 | word2id_func = np.vectorize(sequences.get_product_id) 46 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 47 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 48 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 49 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 50 | 51 | product1_id = val_samp['product1_id'].values 52 | product2_id = val_samp['product2_id'].values 53 | 54 | # Initialize model 55 | mf = MFBias(sequences.n_unique_tokens, emb_dim).to(device) 56 | 57 | # Train loop 58 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 59 | 60 | results = [] 61 | start_time = datetime.datetime.now() 62 | for epoch in range(epochs): 63 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 64 | running_loss = 0 65 | 66 | # Training loop 67 | for i, batches in enumerate(dataloader): 68 | 69 | product1 = batches[0].to(device) 70 | product2 = batches[1].to(device) 71 | label = batches[2].to(device) 72 | 73 | optimizer.zero_grad() 74 | 75 | pred = mf.forward(product1, product2) 76 | loss = mf.loss(pred, label) 77 | loss.backward() 78 | optimizer.step() 79 | 80 | scheduler.step() 81 | running_loss = running_loss * 0.9 + loss.item() * 0.1 82 | 83 | if i > 0 and i % 1000 == 0: 84 | # Validation Check 85 | with torch.no_grad(): 86 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 87 | torch.LongTensor(val_samp['product2_id']).to(device)) 88 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 89 | 90 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 91 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 92 | score, optimizer.param_groups[0]['lr'])) 93 | results.append([epoch, i, running_loss, score]) 94 | running_loss = 0 95 | 96 | # save model 97 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 98 | state_dict_path = '{}/mf_bias_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 99 | torch.save(mf.state_dict(), state_dict_path) 100 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 101 | 102 | end_time = datetime.datetime.now() 103 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 104 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 105 | 106 | # Save results 107 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 108 | results_df.to_csv('{}/model_metrics_mf_bias.csv'.format(MODEL_PATH), index=False) 109 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias_continuous_edges.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_bias_continuous import MFBiasContinuous 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | 
parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate_continuous) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFBiasContinuous(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.predict(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_bias_continuous_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_bias_continuous_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias_edges.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import 
roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_bias import MFBias 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFBias(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_bias_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_bias_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias_edges_parallel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_bias import MFBias 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | 
parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFBias(edges.n_unique_tokens, emb_dim) 55 | if torch.cuda.device_count() > 1: 56 | logger.info('Detected {} GPUs, using them all'.format(torch.cuda.device_count())) 57 | mf = torch.nn.DataParallel(mf) 58 | mf.to(device) 59 | 60 | 61 | # Train loop 62 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 63 | 64 | results = [] 65 | start_time = datetime.datetime.now() 66 | for epoch in range(epochs): 67 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 68 | running_loss = 0 69 | 70 | # Training loop 71 | for i, batches in enumerate(dataloader): 72 | 73 | product1 = batches[0].to(device) 74 | product2 = batches[1].to(device) 75 | label = batches[2].to(device) 76 | 77 | optimizer.zero_grad() 78 | 79 | pred = mf.forward(product1, product2) 80 | # loss = mf.loss(pred, label) 81 | loss = mf.module.loss(pred, label) 82 | loss.backward() 83 | optimizer.step() 84 | 85 | scheduler.step() 86 | running_loss = running_loss * 0.9 + loss.item() * 0.1 87 | 88 | if i > 0 and i % 1000 == 0: 89 | # Validation Check 90 | with torch.no_grad(): 91 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 92 | torch.LongTensor(val_samp['product2_id']).to(device)) 93 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 94 | 95 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 96 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 97 | score, optimizer.param_groups[0]['lr'])) 98 | results.append([epoch, i, running_loss, score]) 99 | running_loss = 0 100 | 101 | # save model 102 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 103 | state_dict_path = '{}/mf_bias_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 104 | torch.save(mf.state_dict(), state_dict_path) 105 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 106 | 107 | end_time = datetime.datetime.now() 108 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 109 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 110 | 111 | # Save results 112 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 113 | results_df.to_csv('{}/model_metrics_mf_bias_edges.csv'.format(MODEL_PATH), index=False) 114 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_continuous_edges.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_continuous import MFContinuous 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate_continuous) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFContinuous(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.predict(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_continuous_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_continuous_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_edges.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf import MF 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | 
parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MF(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/parse/__init__.py -------------------------------------------------------------------------------- 
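Note on the model interface assumed above: the MF training scripts (e.g., src/ml/train_torch_mf_edges.py) only rely on the model exposing a constructor taking (n_unique_tokens, emb_dim), a forward(product1, product2) that returns one score per product pair, a loss(pred, label), and, for the continuous variant, a predict() used at validation time. The actual implementations live in src/ml/mf.py and src/ml/mf_continuous.py; the sketch below is purely illustrative of that assumed interface (dot product of two product embeddings as the edge score, binary cross-entropy as the loss) and is not the repo's code.

import torch
from torch import nn


class MFSketch(nn.Module):
    """Illustrative only -- a minimal stand-in for the interface used by the training scripts."""

    def __init__(self, n_tokens: int, emb_dim: int):
        super().__init__()
        self.embedding = nn.Embedding(n_tokens, emb_dim)
        self.bce = nn.BCEWithLogitsLoss()

    def forward(self, product1, product2):
        # Edge score = dot product of the two product embeddings
        return (self.embedding(product1) * self.embedding(product2)).sum(dim=-1)

    def loss(self, pred, label):
        # Binary edge label; a continuous variant would instead use a
        # regression loss (e.g., MSE) against the edge weight
        return self.bce(pred, label.float())

    def predict(self, product1, product2):
        # Probability-like score for validation metrics such as AUC-ROC
        with torch.no_grad():
            return torch.sigmoid(self.forward(product1, product2))

--------------------------------------------------------------------------------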
/src/parse/parse_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the raw json data into a csv file for faster loading into pd.DataFrame. 3 | """ 4 | import argparse 5 | import csv 6 | import gzip 7 | from typing import List 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from pandas.api.types import is_object_dtype 12 | 13 | from src.utils.logger import logger 14 | 15 | 16 | def parse(path: str): 17 | g = gzip.open(path, 'rb') 18 | for l in g: 19 | yield eval(l) 20 | 21 | 22 | def parse_json_to_df(path: str) -> pd.DataFrame: 23 | i = 0 24 | df_dict = {} 25 | for d in parse(path): 26 | df_dict[i] = d 27 | i += 1 28 | if i % 10000 == 0: 29 | logger.info('Rows processed: {:,}'.format(i)) 30 | 31 | df = pd.DataFrame.from_dict(df_dict, orient='index') 32 | 33 | # Lowercase 34 | df['related'] = df['related'].astype(str) 35 | df['categories'] = df['categories'].astype(str) 36 | df['salesRank'] = df['salesRank'].astype(str) 37 | df = lowercase_df(df) 38 | 39 | return df 40 | 41 | 42 | # Lowercase Functions 43 | def lowercase_df(df: pd.DataFrame) -> pd.DataFrame: 44 | """ 45 | Lowercase characters from all columns in a dataframe. 46 | 47 | Args: 48 | df: Pandas dataframe 49 | 50 | Returns: 51 | Lowercased dataframe 52 | """ 53 | df = df.copy() 54 | for col in df.columns: 55 | if is_object_dtype(df[col]): 56 | df = lowercase_cols(df, [col]) 57 | return df 58 | 59 | 60 | def lowercase_cols(df: pd.DataFrame, colnames: List[str]) -> pd.DataFrame: 61 | """ 62 | Lowercase characters from specified columns in a dataframe 63 | 64 | Args: 65 | df: Pandas dataframe 66 | colnames (List): Names of columns to be lowercased 67 | 68 | Returns: Lowercased dataframe 69 | 70 | """ 71 | df = df.copy() 72 | for col in colnames: 73 | assert df[col].dtype != np.float64 and df[col].dtype != np.int64, \ 74 | 'Trying to lowercase a non-string column: {}'.format(col) 75 | df[col] = df[col].str.lower() 76 | return df 77 | 78 | 79 | def parse_json_to_csv(read_path: str, write_path: str) -> None: 80 | """ 81 | Note: This assumes that the first json in the path has all the keys, which could be WRONG 82 | 83 | Args: 84 | read_path: 85 | write_path: 86 | 87 | Returns: 88 | 89 | """ 90 | csv_writer = csv.writer(open(write_path, 'w')) 91 | i = 0 92 | for d in parse(read_path): 93 | if i == 0: 94 | header = d.keys() 95 | csv_writer.writerow(header) 96 | 97 | csv_writer.writerow([str(v).lower() for v in d.values()])  # Lowercase each value (dict_values has no .lower()) 98 | i += 1 99 | if i % 10000 == 0: 100 | logger.info('Rows processed: {:,}'.format(i)) 101 | 102 | logger.info('Csv saved to {}'.format(write_path)) 103 | 104 | 105 | if __name__ == '__main__': 106 | parser = argparse.ArgumentParser(description='Parsing json (gzipped) to csv') 107 | parser.add_argument('read_path', type=str, help='Path to input gzipped json') 108 | parser.add_argument('write_path', type=str, help='Path to output csv') 109 | args = parser.parse_args() 110 | 111 | df = parse_json_to_df(args.read_path) 112 | df.to_csv(args.write_path, index=False) 113 | logger.info('Csv saved to {}'.format(args.write_path)) 114 | -------------------------------------------------------------------------------- /src/prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/prep/__init__.py -------------------------------------------------------------------------------- /src/prep/prep_edges.py:
-------------------------------------------------------------------------------- 1 | """ 2 | Converts edge relationships (e.g., bought together, also bought) to numeric weights between two nodes. 3 | """ 4 | import argparse 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from src.utils.logger import logger 10 | 11 | relationship_weights = {'bought_together': 1.2, 12 | 'also_bought': 1.0, 13 | 'also_viewed': 0.5} 14 | 15 | 16 | def create_product_pair(df, col_list): 17 | pairs = df[col_list].values 18 | pairs.sort(axis=1) 19 | df['product_pair'] = ['|'.join(arr) for arr in pairs] 20 | 21 | return df 22 | 23 | 24 | def split_product_pair(product_pair): 25 | result = product_pair.split('|') 26 | return result[0], result[1] 27 | 28 | 29 | def get_relationship_weights(df, relationship_weights): 30 | df['weight'] = 0 31 | for relationship, weight in relationship_weights.items(): 32 | df.loc[df['relationship'] == relationship, 'weight'] += weight 33 | 34 | return df 35 | 36 | 37 | def get_edges(df): 38 | """ 39 | Returns a dataframe of products and the weights of the edges between them. 40 | 41 | Args: 42 | df: 43 | 44 | Returns: 45 | 46 | """ 47 | logger.info('Relationship distribution: \n{}'.format(df['relationship'].value_counts())) 48 | 49 | df = create_product_pair(df, col_list=['asin', 'related']) 50 | logger.info('Product pairs created') 51 | 52 | df = get_relationship_weights(df, relationship_weights) 53 | logger.info('Relationship weights updated') 54 | 55 | # Aggregate to remove duplicates 56 | logger.info('Original no. of edges: {:,}'.format(df.shape[0])) 57 | df = df.groupby('product_pair').agg({'weight': 'sum'}).reset_index() 58 | logger.info('Deduplicated no. of edges: {:,}'.format(df.shape[0])) 59 | 60 | # Save edge list 61 | df['product1'], df['product2'] = zip(*df['product_pair'].apply(split_product_pair)) 62 | 63 | df = df[['product1', 'product2', 'weight', 'product_pair']] 64 | return df 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = argparse.ArgumentParser(description='Preparing edges and associated weights') 69 | parser.add_argument('read_path', type=str, help='Path to input csv (of node relationships)') 70 | parser.add_argument('write_path', type=str, help='Path to output edges') 71 | parser.add_argument('--sample_size', type=int, help='Sample size (default: no sampling)', 72 | default=None) 73 | args = parser.parse_args() 74 | 75 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 76 | dtype={'asin': 'str', 'related': 'str'}) 77 | logger.info('DF shape: {}'.format(df.shape)) 78 | 79 | # Sample for development efficiency 80 | if args.sample_size: 81 | sample_idx = np.random.choice(df.shape[0], size=args.sample_size, replace=False) 82 | df = df.iloc[sample_idx] 83 | 84 | df = get_edges(df) 85 | 86 | df.to_csv(args.write_path, index=False) 87 | logger.info('Csv saved to {}'.format(args.write_path)) 88 | -------------------------------------------------------------------------------- /src/prep/prep_graph_samples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Builds a graph from the edges (training set) and performs random walk sampling from the graph 3 | - Currently returns 10 samples of sequence length 10 for each node (this is a parameter in create_random_walk_samples) 4 | """ 5 | import argparse 6 | import random 7 | 8 | import networkx 9 | import numpy as np 10 | import scipy as sp 11 | 12 | from src.config import DATA_PATH 13 | from src.utils.io_utils import save_model 14 | 
from src.utils.logger import logger 15 | 16 | 17 | def load_network(edgelist_path): 18 | graph = networkx.read_weighted_edgelist(edgelist_path) 19 | logger.info('No of nodes ({:,}) and edges ({:,})'.format(graph.number_of_nodes(), graph.number_of_edges())) 20 | 21 | # Get dictionary mapping of integer to nodes 22 | node_dict = {i: key for i, key in enumerate(graph.nodes.keys())} 23 | 24 | return graph, node_dict 25 | 26 | 27 | def create_transition_matrix(graph): 28 | """ 29 | https://stackoverflow.com/questions/37311651/get-node-list-from-random-walk-in-networkx 30 | https://stackoverflow.com/questions/15330380/probability-to-visit-nodes-in-a-random-walk-on-graph 31 | 32 | Args: 33 | graph: 34 | 35 | Returns: 36 | 37 | """ 38 | adjacency_mat = networkx.adj_matrix(graph) 39 | logger.info('Adjacency matrix shape: {}'.format(adjacency_mat.shape)) 40 | graph = None 41 | 42 | degree_vector = sp.sparse.csr_matrix(1 / np.sum(adjacency_mat, axis=0)) 43 | 44 | transition_matrix = adjacency_mat.multiply(degree_vector).T # Need to transpose so each row probability sum to 1 45 | logger.info('Transition matrix shape: {}'.format(transition_matrix.shape)) 46 | 47 | return transition_matrix 48 | 49 | 50 | def create_transition_dict(transition_matrix): 51 | transition_dict = {} 52 | rows, cols = transition_matrix.nonzero() 53 | 54 | # Create dictionary of transition product and probabilities for each product 55 | prev_row = -1 56 | for row, col in zip(rows, cols): 57 | if row != prev_row: 58 | transition_dict.setdefault(row, {}) 59 | transition_dict[row].setdefault('product', []) 60 | transition_dict[row].setdefault('probability', []) 61 | 62 | transition_dict[row]['product'].append(col) 63 | transition_dict[row]['probability'].append(transition_matrix[row, col]) 64 | prev_row = row 65 | 66 | return transition_dict 67 | 68 | 69 | def create_random_walk_samples(node_dict, transition_dict, samples_per_node=10, sequence_len=10): 70 | random.seed(42) 71 | n_nodes = len(node_dict) 72 | 73 | sample_array = np.zeros((n_nodes * samples_per_node, sequence_len), dtype=int) 74 | logger.info('Sample array shape: {}'.format(sample_array.shape)) 75 | 76 | # For each node 77 | for node_idx in range(n_nodes): 78 | 79 | if node_idx % 100000 == 0: 80 | logger.info('Getting samples for node: {:,}/{:,}'.format(node_idx, n_nodes)) 81 | 82 | # For each sample 83 | for sample_idx in range(samples_per_node): 84 | node = node_idx 85 | 86 | # For each event in sequence 87 | for seq_idx in range(sequence_len): 88 | sample_array[node_idx * samples_per_node + sample_idx, seq_idx] = node 89 | node = random.choices(population=transition_dict[node]['product'], 90 | weights=transition_dict[node]['probability'], k=1)[0] 91 | 92 | return sample_array 93 | 94 | 95 | def get_samples(edgelist_path): 96 | graph, node_dict = load_network(edgelist_path) 97 | logger.info('Network loaded') 98 | 99 | transition_matrix = create_transition_matrix(graph) 100 | logger.info('Transition matrix created') 101 | graph = None 102 | 103 | transition_dict = create_transition_dict(transition_matrix) 104 | logger.info('Transition dict created') 105 | transition_matrix = None 106 | 107 | sample_array = create_random_walk_samples(node_dict, transition_dict) 108 | logger.info('Random walk samples created') 109 | 110 | # Convert array of nodeIDs back to product IDs 111 | sample_array = np.vectorize(node_dict.get)(sample_array) 112 | logger.info('Converted back to product IDs') 113 | 114 | return sample_array, node_dict, transition_dict 115 | 116 | 117 | if __name__ 
== '__main__': 118 | parser = argparse.ArgumentParser(description='Preparing graph samples via random walk') 119 | parser.add_argument('read_path', type=str, help='Path to input graph edgelist') 120 | parser.add_argument('write_path', type=str, help='Path to output samples (.npy format)') 121 | parser.add_argument('graph_name', type=str, help='Name for node dict and transition dict') 122 | args = parser.parse_args() 123 | 124 | sample_array, node_dict, transition_dict = get_samples(args.read_path) 125 | 126 | np.save(args.write_path, sample_array) 127 | logger.info('Sample array saved to {}'.format(args.write_path)) 128 | sample_array = None 129 | 130 | save_model(node_dict, '{}/{}_node_dict.tar.gz'.format(DATA_PATH, args.graph_name)) 131 | node_dict = None 132 | 133 | save_model(transition_dict, '{}/{}_transition_dict.tar.gz'.format(DATA_PATH, args.graph_name)) 134 | transition_dict = None 135 | -------------------------------------------------------------------------------- /src/prep/prep_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses out the metadata from the original csv. 3 | """ 4 | import argparse 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from src.utils.logger import logger 10 | 11 | 12 | def get_category_lvl(category_list: list, lvl=0) -> str: 13 | try: 14 | return category_list[lvl] 15 | except IndexError: 16 | return 'NA_VALUE' 17 | 18 | 19 | def get_categories(df: pd.DataFrame) -> pd.DataFrame: 20 | df['category_lvl_1'] = df['categories'].apply(get_category_lvl, args=(0,)) 21 | df['category_lvl_2'] = df['categories'].apply(get_category_lvl, args=(1,)) 22 | df['category_lvl_3'] = df['categories'].apply(get_category_lvl, args=(2,)) 23 | df['category_lvl_4'] = df['categories'].apply(get_category_lvl, args=(3,)) 24 | logger.info('Categories lvl 1 - 4 prepared') 25 | 26 | return df 27 | 28 | 29 | def get_meta(df: pd.DataFrame) -> pd.DataFrame: 30 | # Update to reflect if relationship exist 31 | df['related'] = np.where(df['related'].isnull(), 0, 1) 32 | 33 | # Prep categories 34 | df['categories'] = df['categories'].apply(eval) 35 | df['categories'] = df['categories'].apply(lambda x: x[0]) # Get first category only 36 | df = get_categories(df) 37 | 38 | # Prep title and description 39 | # TODO: Add cleaning of title and description 40 | 41 | return df 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='Preparing item metadata') 46 | parser.add_argument('read_path', type=str, help='Path to input csv') 47 | parser.add_argument('write_path', type=str, help='Path to output csv (of metadata') 48 | args = parser.parse_args() 49 | 50 | META_COLS = ['asin', 'categories', 'title', 'description', 'price', 'brand', 'related'] 51 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 52 | dtype={'asin': 'str', 'title': 'str', 'brand': 'str'}, 53 | usecols=META_COLS) 54 | logger.info('DF shape: {}'.format(df.shape)) 55 | 56 | meta_df = get_meta(df) 57 | 58 | meta_df.to_csv(args.write_path, index=False) 59 | logger.info('Csv saved to {}'.format(args.write_path)) 60 | -------------------------------------------------------------------------------- /src/prep/prep_node_relationship.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses item to item relationships in 'related' field and explodes it such that each relationship is a single row. 
3 | """ 4 | import argparse 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from src.utils.logger import logger 10 | 11 | 12 | def get_also_bought_count(related): 13 | try: 14 | return len(related['also_bought']) 15 | except KeyError: 16 | return -1 17 | 18 | 19 | def explode_on_related(df: pd.DataFrame, relationship: str) -> pd.DataFrame: 20 | # Filter on relationship 21 | df = df[df['related'].apply(lambda x: relationship in x.keys())].copy() 22 | 23 | # Get value (list) from relationship dict 24 | df['related'] = df['related'].apply(lambda x: x[relationship]) 25 | 26 | # Explode efficiently using numpy 27 | vals = df['related'].values.tolist() 28 | lens = [len(val_list) for val_list in vals] 29 | vals_array = np.repeat(df['asin'], lens) 30 | exploded_df = pd.DataFrame(np.column_stack((vals_array, np.concatenate(vals))), columns=df.columns) 31 | 32 | # Add relationship 33 | exploded_df['relationship'] = relationship 34 | logger.info('Exploded for relationship: {}'.format(relationship)) 35 | 36 | return exploded_df 37 | 38 | 39 | def get_node_relationship(df: pd.DataFrame) -> pd.DataFrame: 40 | """ 41 | Returns a dataframe of products and their relationships (e.g., bought together, also bought, also viewed) 42 | 43 | Args: 44 | df: 45 | 46 | Returns: 47 | 48 | """ 49 | # Keep only rows with related data 50 | df = df[~df['related'].isnull()].copy() 51 | logger.info('DF shape after dropping empty related: {}'.format(df.shape)) 52 | 53 | df = df[~df['title'].isnull()].copy() 54 | logger.info('DF shape after dropping empty title: {}'.format(df.shape)) 55 | df = df[['asin', 'related']].copy() 56 | 57 | # Evaluate related str into dict 58 | df['related'] = df['related'].apply(eval) 59 | logger.info('Completed eval on "related" string') 60 | 61 | # Exclude products where also bought relationships less than 2 62 | df['also_bought_count'] = df['related'].apply(get_also_bought_count) 63 | df = df[df['also_bought_count'] >= 2].copy() 64 | logger.info('DF shape after dropping products with <2 edges: {}'.format(df.shape)) 65 | df.drop(columns='also_bought_count', inplace=True) 66 | 67 | # Explode columns 68 | bought_together_df = explode_on_related(df, relationship='bought_together') 69 | also_bought_df = explode_on_related(df, relationship='also_bought') 70 | also_viewed_df = explode_on_related(df, relationship='also_viewed') 71 | 72 | # Concatenate df 73 | combined_df = pd.concat([bought_together_df, also_bought_df, also_viewed_df], axis=0) 74 | logger.info('Distribution of relationships: \n{}'.format(combined_df['relationship'].value_counts())) 75 | 76 | return combined_df 77 | 78 | 79 | if __name__ == '__main__': 80 | parser = argparse.ArgumentParser(description='Preparing node relationships') 81 | parser.add_argument('read_path', type=str, help='Path to input csv') 82 | parser.add_argument('write_path', type=str, help='Path to output csv (of nodes relationships)') 83 | args = parser.parse_args() 84 | 85 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 86 | dtype={'asin': 'str', 'title': 'str', 'brand': 'str'}) 87 | logger.info('DF shape: {}'.format(df.shape)) 88 | 89 | exploded_df = get_node_relationship(df) 90 | 91 | exploded_df.to_csv(args.write_path, index=False) 92 | logger.info('Csv saved to {}'.format(args.write_path)) 93 | -------------------------------------------------------------------------------- /src/prep/train_val_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Splits all ground truth 
edges into train and validation set, with some constraints 3 | - The validation set should only contain edges where both products are in the train set 4 | 5 | For the validation set, negative samples are created by randomly selecting a pair of nodes and creating a negative edge. 6 | - From these samples, we exclude valid edges from either the train or validation set. 7 | """ 8 | import argparse 9 | from pathlib import Path 10 | from typing import Tuple 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.model_selection import train_test_split 15 | 16 | from src.config import DATA_PATH 17 | from src.prep.prep_edges import create_product_pair 18 | from src.utils.logger import logger 19 | 20 | 21 | def train_val_split(df, n_val_samples: int, filter_out_unseen: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]: 22 | if filter_out_unseen: 23 | # First split to get some test samples 24 | train, val = train_test_split(df, test_size=int(1.1 * n_val_samples), random_state=42) # Need slightly more 25 | logger.info('Train shape: {}, val shape: {}'.format(train.shape, val.shape)) 26 | 27 | # Get set of products in train 28 | train_product_set = set(train['product1']).union(set(train['product2'])) 29 | logger.info('No. of unique products in train: {:,}'.format(len(train_product_set))) 30 | 31 | # Only keep val where both products are in train product set 32 | val = val[(val['product1'].isin(train_product_set)) & (val['product2'].isin(train_product_set))] 33 | logger.info('Updated val shape: {}'.format(val.shape)) 34 | 35 | # Split again to only get n_val_samples 36 | val = val.iloc[:n_val_samples].copy() 37 | logger.info('Final val shape: {}'.format(val.shape)) 38 | 39 | # Get train set 40 | train = df[~df.index.isin(set(val.index))].copy() 41 | logger.info('Final train shape: {}'.format(train.shape)) 42 | 43 | else: 44 | # First split to get some test samples 45 | train, val = train_test_split(df, test_size=int(n_val_samples), random_state=42) 46 | logger.info('Train shape: {}, val shape: {}'.format(train.shape, val.shape)) 47 | 48 | return train, val 49 | 50 | 51 | def get_sample(item_array, n_iter=None, sample_size=2): 52 | np.random.seed(42) 53 | n = len(item_array) 54 | 55 | # find the index we last sampled from 56 | start_idx = (n_iter * sample_size) % n 57 | if (start_idx + sample_size >= n) or (start_idx <= sample_size): 58 | # shuffle array if we have reached the end and repeat again 59 | np.random.shuffle(item_array) 60 | 61 | return item_array[start_idx:start_idx + sample_size] 62 | 63 | 64 | def collect_samples(item_array, sample_size, n_samples): 65 | samples = [] 66 | 67 | for i in range(0, n_samples): 68 | if i % 1000000 == 0: 69 | logger.info('Neg sample: {:,}'.format(i)) 70 | 71 | sample = get_sample(item_array, n_iter=i, sample_size=sample_size) 72 | samples.append(sample) 73 | 74 | return samples 75 | 76 | 77 | def create_negative_edges(df, val, n_val_samples): 78 | # Get set of valid product edges (across both train and val) 79 | valid_product_pairs = set(df['product_pair']) 80 | logger.info('No. of valid product pairs: {:,}'.format(len(valid_product_pairs))) 81 | 82 | # Get set of products in val (to generate edges) 83 | val_product_arr = np.array(list(set(val['product1']).union(set(val['product2'])))) 84 | logger.info('No. 
of unique products in val: {:,}'.format(len(val_product_arr))) 85 | 86 | # Create negative samples 87 | neg_samples = collect_samples(val_product_arr, sample_size=2, n_samples=int(1.1 * n_val_samples)) 88 | neg_samples_df = pd.DataFrame(neg_samples, columns=['product1', 'product2']) 89 | neg_samples_df.dropna(inplace=True) 90 | neg_samples_df = create_product_pair(neg_samples_df, col_list=['product1', 'product2']) 91 | logger.info('No. of negative samples: {:,}'.format(neg_samples_df.shape[0])) 92 | 93 | # Exclude neg samples that are valid pairs 94 | neg_samples_df = neg_samples_df[~neg_samples_df['product_pair'].isin(valid_product_pairs)].copy() 95 | logger.info('Updated no. of negative samples: {:,}'.format(neg_samples_df.shape[0])) 96 | 97 | # Only keep no. of val samples required 98 | neg_samples_df = neg_samples_df.iloc[:n_val_samples].copy() 99 | logger.info('Final no. of negative samples: {:,}'.format(neg_samples_df.shape[0])) 100 | 101 | return neg_samples_df 102 | 103 | 104 | def combine_val_and_neg_edges(val, neg_samples): 105 | neg_samples['edge'] = 0 106 | val['edge'] = 1 107 | 108 | VAL_COLS = ['product1', 'product2', 'edge'] 109 | neg = neg_samples[VAL_COLS].copy() 110 | val = val[VAL_COLS].copy() 111 | logger.info('Val shape: {}, Neg edges shape: {}, Ratio: {}'.format(val.shape, neg.shape, 112 | val.shape[0] / (val.shape[0] + neg.shape[0]))) 113 | 114 | val = pd.concat([val, neg]) 115 | logger.info('Final val shape: {}'.format(val.shape)) 116 | 117 | return val 118 | 119 | 120 | def get_train_and_val(df, val_prop: float): 121 | """ 122 | Splits into training and validation set, where validation set has 50% negative edges 123 | 124 | Args: 125 | df: 126 | val_prop: 127 | 128 | Returns: 129 | 130 | """ 131 | n_val_samples = int(val_prop * df.shape[0]) 132 | logger.info('Eventual required val samples (proportion: {}): {:,}'.format(val_prop, n_val_samples)) 133 | 134 | train, val = train_val_split(df, n_val_samples) 135 | logger.info('Ratio of train to val: {:,}:{:,} ({:.2f})'.format(train.shape[0], val.shape[0], 136 | val.shape[0] / (train.shape[0] + val.shape[0]))) 137 | 138 | neg_samples = create_negative_edges(df, val, n_val_samples) 139 | 140 | val = combine_val_and_neg_edges(val, neg_samples) 141 | train = train[['product1', 'product2', 'weight']].copy() 142 | 143 | return train, val 144 | 145 | 146 | if __name__ == '__main__': 147 | parser = argparse.ArgumentParser(description='Splitting into train and val set') 148 | parser.add_argument('read_path', type=str, help='Path to input csv of edges') 149 | parser.add_argument('val_prop', type=float, help='Proportion of validation set (e.g., 0.33)') 150 | args = parser.parse_args() 151 | 152 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 153 | dtype={'product1': 'str', 'product2': 'str'}) 154 | logger.info('DF shape: {}'.format(df.shape)) 155 | 156 | train, val = get_train_and_val(df, val_prop=args.val_prop) 157 | 158 | # Save to train, val, and train edgelist 159 | input_filename = Path(args.read_path).resolve().stem 160 | train.to_csv('{}/{}_train.csv'.format(DATA_PATH, input_filename), index=False) 161 | logger.info('Train saved as: {}/{}_train.csv'.format(DATA_PATH, input_filename)) 162 | val.to_csv('{}/{}_val.csv'.format(DATA_PATH, input_filename), index=False) 163 | logger.info('Val saved as: {}/{}_val.csv'.format(DATA_PATH, input_filename)) 164 | 165 | train.to_csv('{}/{}_train.edgelist'.format(DATA_PATH, input_filename), sep=' ', index=False, header=False) 166 | logger.info('Train 
edgelist saved as: {}/{}_train.edgelist'.format(DATA_PATH, input_filename)) 167 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import pickle 3 | from typing import Any 4 | 5 | from src.utils.logger import logger 6 | 7 | 8 | def save_model(model: Any, model_path: str) -> None: 9 | """ 10 | Saves model in gzip format 11 | 12 | Args: 13 | model: Model to be saved 14 | model_path: Path to save model to 15 | 16 | Returns: 17 | (None) 18 | """ 19 | with gzip.open(model_path, "wb") as f: 20 | pickle.dump(model, f) 21 | 22 | logger.info('Model saved to {}'.format(model_path)) 23 | 24 | 25 | def load_model(model_path: str) -> Any: 26 | """ 27 | Loads model from gzip format 28 | 29 | Args: 30 | model_path: Path to load model from 31 | 32 | Returns: 33 | 34 | """ 35 | with gzip.open(model_path, 'rb') as f: 36 | model = pickle.load(f) 37 | 38 | logger.info('Model loaded from: {}'.format(model_path)) 39 | return model 40 | -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | logger.setLevel(logging.INFO) 5 | formatter = logging.Formatter('%(asctime)s - %(message)s') 6 | 7 | # create console handler and set level to info 8 | ch = logging.StreamHandler() 9 | ch.setFormatter(formatter) 10 | ch.setLevel(logging.INFO) 11 | 12 | # add ch to logger 13 | logger.addHandler(ch) 14 | -------------------------------------------------------------------------------- /src/viz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/viz/__init__.py -------------------------------------------------------------------------------- /src/viz/plot_results.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import precision_recall_curve, roc_curve 5 | 6 | 7 | def plot_auc(label, score, title): 8 | precision, recall, thresholds = precision_recall_curve(label, score) 9 | plt.figure(figsize=(15, 5)) 10 | plt.grid() 11 | plt.plot(thresholds, precision[1:], color='r', label='Precision') 12 | plt.plot(thresholds, recall[1:], color='b', label='Recall') 13 | plt.gca().invert_xaxis() 14 | plt.legend(loc='lower right') 15 | 16 | plt.xlabel('Threshold (0.00 - 1.00)') 17 | plt.ylabel('Precision / Recall') 18 | _ = plt.title(title) 19 | 20 | 21 | def plot_roc(label, score, title): 22 | fpr, tpr, roc_thresholds = roc_curve(label, score) 23 | plt.figure(figsize=(5, 5)) 24 | plt.grid() 25 | plt.plot(fpr, tpr, color='b') 26 | 27 | plt.xlabel('False Positive Rate') 28 | plt.ylabel('True Positive Rate') 29 | _ = plt.title(title) 30 | 31 | 32 | def plot_tradeoff(label, score, title): 33 | precision, recall, thresholds = precision_recall_curve(label, score) 34 | plt.figure(figsize=(5, 5)) 35 | plt.grid() 36 | 
plt.step(recall, precision, color='b', label='Precision-Recall Trade-off') 37 | plt.fill_between(recall, precision, alpha=0.1, color='b') 38 | 39 | plt.xlabel('Recall') 40 | plt.ylabel('Precision') 41 | _ = plt.title(title) 42 | 43 | 44 | def plot_metrics(df, ylim=None): 45 | plt.figure(figsize=(15, 5)) 46 | plt.grid() 47 | plt.plot(df.index, df['auc'], label='AUC-ROC', color='black') 48 | 49 | # Plot learning rate resets 50 | lr_reset_batch = df[df['batches'] == df['batches'].max()] 51 | for idx in lr_reset_batch.index: 52 | plt.vlines(idx, df['auc'].min(), 1, label='LR reset (per epoch)', 53 | linestyles='--', colors='grey') 54 | 55 | # Plot legend 56 | handles, labels = plt.gca().get_legend_handles_labels() 57 | by_label = OrderedDict(zip(labels, handles)) 58 | _ = plt.legend(by_label.values(), by_label.keys(), loc='lower right') 59 | 60 | # Tidy axis 61 | if ylim: 62 | plt.ylim(ylim) 63 | else: 64 | plt.ylim(df['auc'].min() * 1.2, 0.96) 65 | plt.xlim(0, df.index.max()) 66 | plt.ylabel('AUC-ROC', size=12) 67 | plt.xlabel('Batches (over 5 epochs)', size=12) 68 | _ = plt.title('AUC-ROC on sample val set over 5 epochs', size=15) 69 | -------------------------------------------------------------------------------- /src/viz/prep_results.py: -------------------------------------------------------------------------------- 1 | def get_product_id(mapping): 2 | def func(x): 3 | return mapping.get(x, -1) 4 | return func --------------------------------------------------------------------------------
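Usage sketch for the plotting helpers above, assuming a completed run of src/ml/train_torch_mf_edges.py and the default MODEL_PATH from src/config.py: each training script writes a metrics csv with columns epoch, batches, loss, and auc, which is the layout plot_metrics expects.

import pandas as pd
import matplotlib.pyplot as plt

from src.config import MODEL_PATH
from src.viz.plot_results import plot_metrics

# Written by src/ml/train_torch_mf_edges.py at the end of training
results_df = pd.read_csv('{}/model_metrics_mf_edges.csv'.format(MODEL_PATH))

# AUC-ROC on the sampled validation set over batches, with per-epoch LR resets marked
plot_metrics(results_df)
plt.show()

--------------------------------------------------------------------------------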