├── .gitignore ├── images ├── cosine-annealing.png ├── implementation1-learning-curve.png ├── implementation1-precision-recall.png ├── implementation2-precision-recall.png ├── implementation4-precision-recall.png ├── implementation4b-precision-recall.png ├── implementation5-learning-curve.png ├── implementation5-precision-recall.png ├── implementation5b-precision-recall.png ├── implementation6-learning-curve.png ├── implementation7-learning-curve.png └── implementation7-precision-recall.png ├── readme.md ├── requirements.txt ├── run.sh └── src ├── __init__.py ├── config.py ├── ml ├── __init__.py ├── data_loader.py ├── data_loader_edges.py ├── data_loader_with_meta.py ├── mf.py ├── mf_bias.py ├── mf_bias_continuous.py ├── mf_continuous.py ├── skipgram.py ├── skipgram_with_meta.py ├── skipgram_with_meta_weighted.py ├── train_gensim_embedding.py ├── train_node2vec_embeddings.py ├── train_torch_embedding.py ├── train_torch_embedding_with_meta.py ├── train_torch_mf.py ├── train_torch_mf_bias.py ├── train_torch_mf_bias_continuous_edges.py ├── train_torch_mf_bias_edges.py ├── train_torch_mf_bias_edges_parallel.py ├── train_torch_mf_continuous_edges.py └── train_torch_mf_edges.py ├── parse ├── __init__.py └── parse_json.py ├── prep ├── __init__.py ├── prep_edges.py ├── prep_graph_samples.py ├── prep_meta.py ├── prep_node_relationship.py └── train_val_split.py ├── utils ├── __init__.py ├── io_utils.py └── logger.py └── viz ├── __init__.py ├── plot_results.py └── prep_results.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 
93 | #Pipfile.lock 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | # Data 129 | data/ 130 | model/ 131 | results/ 132 | *.zip 133 | 134 | # Mac 135 | .DS_Store 136 | .idea/ 137 | notebooks/ 138 | -------------------------------------------------------------------------------- /images/cosine-annealing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/cosine-annealing.png -------------------------------------------------------------------------------- /images/implementation1-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation1-learning-curve.png -------------------------------------------------------------------------------- /images/implementation1-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation1-precision-recall.png -------------------------------------------------------------------------------- /images/implementation2-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation2-precision-recall.png -------------------------------------------------------------------------------- /images/implementation4-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation4-precision-recall.png -------------------------------------------------------------------------------- /images/implementation4b-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation4b-precision-recall.png -------------------------------------------------------------------------------- /images/implementation5-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation5-learning-curve.png -------------------------------------------------------------------------------- /images/implementation5-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation5-precision-recall.png 
-------------------------------------------------------------------------------- /images/implementation5b-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation5b-precision-recall.png -------------------------------------------------------------------------------- /images/implementation6-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation6-learning-curve.png -------------------------------------------------------------------------------- /images/implementation7-learning-curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation7-learning-curve.png -------------------------------------------------------------------------------- /images/implementation7-precision-recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/images/implementation7-precision-recall.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # recsys-nlp-graph 2 | 3 | **Undocumented** code for personal project on simple recsys via matrix factorization (part 1), and NLP and graph techniques (part 2). Sharing as part of a meet-up follow-along. 4 | 5 | Associated articles: 6 | - Part 1: [Building a Strong Baseline Recommender in PyTorch](https://eugeneyan.com/writing/recommender-systems-baseline-pytorch/) 7 | - Part 2: [Beating the Baseline Recommender with Graph & NLP in Pytorch](https://eugeneyan.com/writing/recommender-systems-graph-and-nlp-pytorch/) 8 | 9 | Talk and Slides: 10 | - [DataScience SG Meetup - RecSys, Beyond the Baseline](https://eugeneyan.com/speaking/recommender-systems-beyond-the-baseline-talk/) 11 | - [Slideshare](https://www.slideshare.net/eugeneyan/recommender-systems-beyond-the-useritem-matrix) 12 | 13 | ## Data 14 | 15 | Electronics and books data from the [Amazon dataset (May 1996 – July 2014)](http://jmcauley.ucsd.edu/data/amazon/) was used. Here's what an example JSON entry looks like. 16 | 17 | ``` 18 | { 19 | "asin": "0000031852", 20 | "title": "Girls Ballet Tutu Zebra Hot Pink", 21 | "price": 3.17, 22 | "imUrl": "http://ecx.images-amazon.com/images/I/51fAmVkTbyL._SY300_.jpg", 23 | "related": 24 | { "also_bought":[ 25 | "B00JHONN1S", 26 | "B002BZX8Z6", 27 | "B00D2K1M3O", 28 | ... 29 | "B007R2RM8W" 30 | ], 31 | "also_viewed":[ 32 | "B002BZX8Z6", 33 | "B00JHONN1S", 34 | "B008F0SU0Y", 35 | ...
36 | "B00BFXLZ8M" 37 | ], 38 | "bought_together":[ 39 | "B002BZX8Z6" 40 | ] 41 | }, 42 | "salesRank": 43 | { 44 | "Toys & Games":211836 45 | }, 46 | "brand": "Coxlures", 47 | "categories":[ 48 | [ "Sports & Outdoors", 49 | "Other Sports", 50 | "Dance" 51 | ] 52 | ] 53 | } 54 | ``` 55 | 56 | ## Comparing Matrix Factorization to Skip-gram (Node2Vec) 57 | 58 | ### Overall results for Electronics dataset 59 | 60 | | Model | AUC-ROC (All Products) | AUC-ROC (Seen Products Only) | Runtime (min) | 61 | |--------------------------------------------- |-------------- |-------------------- |--------------- | 62 | | PyTorch Matrix Factorization | 0.7951 | - | 45 | 63 | | Node2Vec | NA | NA | NA | 64 | | Gensim Word2Vec | 0.9082 | 0.9735 | 2.58 | 65 | | PyTorch Word2Vec | 0.9554 | 0.9855 | 23.63 | 66 | | PyTorch Word2Vec with Side Info | NA | NA | NA | 67 | | PyTorch Matrix Factorization With Sequences | 0.9320 | - | 70.39 | 68 | | Alibaba Paper* | 0.9327 | - | - | 69 | 70 | ### Overall results for Books dataset 71 | 72 | | Model | AUC-ROC (All Products) | AUC-ROC (Seen Products Only) | Runtime (min) | 73 | |--------------------------------------------- |-------------- |-------------------- |--------------- | 74 | | PyTorch Matrix Factorization | 0.4996 | - | 1353.12 | 75 | | Gensim Word2Vec | 0.9701 | 0.9892 | 16.24 | 76 | | PyTorch Word2Vec | 0.9775 | - | 122.66 | 77 | | PyTorch Word2Vec with Side Info | NA | NA | NA | 78 | | PyTorch Matrix Factorization With Sequences | 0.7196 | - | 1393.08 | 79 | 80 | 81 | 82 | *[Billion-scale Commodity Embedding for E-commerce Recommendation in Alibaba](https://arxiv.org/abs/1803.02349) 83 | 84 | ### 1. Matrix Factorization (iteratively pair by pair) 85 | 86 | At a high level, for each pair: 87 | 88 | - Get the embedding for each product 89 | - Multiply embeddings and sum the resulting vector (this is the prediction) 90 | - Reduce the difference between predicted score and actual score (via gradient descent and a loss function like mean squared error or BCE) 91 | 92 | Here's some pseudo-code showing how it would work. 93 | 94 | ``` 95 | for product_pair, label in train_set: 96 | # Get embedding for each product 97 | product1_emb = embedding(product1) 98 | product2_emb = embedding(product2) 99 | 100 | # Predict product-pair score (interaction term and sum) 101 | prediction = sig(sum(product1_emb * product2_emb, dim=1)) 102 | l2_reg = lambda * sum(embedding.weight ** 2) 103 | 104 | # Minimize loss 105 | loss = BinaryCrossEntropyLoss(prediction, label) 106 | loss += l2_reg 107 | 108 | loss.backward() 109 | optimizer.step() 110 | ``` 111 | 112 | For the training schedule, we run it over 5 epochs with cosine annealing. For each epoch, the learning rate starts high (0.01) and drops rapidly to a minimum value near zero, before being reset for the next epoch (see the scheduler sketch at the end of this section). 113 | 114 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/cosine-annealing.png) 115 | 116 | One epoch seems sufficient to achieve close to optimal ROC-AUC. 117 | 118 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation1-precision-recall.png) 119 | 120 | However, if we look at the precision-recall curves below, we see that at a threshold of around 0.5 we hit the “cliff of death”. If we estimate the threshold slightly too low, precision drops from close to 1.0 to 0.5; slightly too high and recall is poor. 121 | 122 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation1-learning-curve.png) 123 |
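To make the training schedule above concrete, here's a minimal, self-contained sketch of a matrix factorization model trained with a cosine-annealed learning rate that restarts every epoch. It is illustrative only, not the repo's actual training script (see `src/ml/train_torch_mf.py` for that); the toy data, model class, and hyperparameters below are assumptions.

```
# Minimal sketch (not the repo's code): MF trained with a cosine-annealing LR
# that restarts at the top of every epoch, mirroring the plot above.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


class ToyMF(nn.Module):
    """Bare-bones matrix factorization: sigmoid of the dot product of two product embeddings."""

    def __init__(self, n_products, emb_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(n_products, emb_dim)

    def forward(self, product1, product2):
        interaction = (self.embedding(product1) * self.embedding(product2)).sum(dim=1)
        return torch.sigmoid(interaction)


# Toy (product1, product2, label) triples; the real pairs come from the dataloaders in src/ml
n_products = 1_000
product1 = torch.randint(0, n_products, (4096,))
product2 = torch.randint(0, n_products, (4096,))
labels = torch.randint(0, 2, (4096,)).float()
loader = DataLoader(TensorDataset(product1, product2, labels), batch_size=256, shuffle=True)

model = ToyMF(n_products)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# T_0 = batches per epoch, so the LR decays from 0.01 towards ~0 within an epoch, then resets
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=len(loader), eta_min=1e-6)

for epoch in range(5):
    for p1, p2, label in loader:
        optimizer.zero_grad()
        loss = criterion(model(p1, p2), label)
        loss.backward()
        optimizer.step()
        scheduler.step()  # step per batch so the annealing happens within the epoch
```

Using `CosineAnnealingWarmRestarts` with `T_0` equal to the number of batches per epoch reproduces the "high at the start of each epoch, near zero at the end" pattern shown in the cosine-annealing figure; the repo's own scripts may wire the schedule up differently.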
124 | ### 2. Matrix Factorization with Bias 125 | 126 | Adding bias reduces the steepness of the curves where they intersect, making it more production-friendly. (Though AUC-ROC decreases slightly, this implementation is preferable.) 127 | 128 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation2-precision-recall.png) 129 | 130 | ### 3. `Node2Vec` 131 | 132 | I tried using the implementation of `Node2Vec` [here](https://github.com/aditya-grover/node2vec) but it was too memory-intensive and slow. It didn't run to completion, even on a 64 GB instance. 133 | 134 | Digging deeper, I found that its approach to generating sequences involved traversing the graph. If you allowed `networkx` to use multiple threads, it would spawn multiple processes to create sequences and cache them temporarily in memory. In short, it was very memory-hungry. Overall, this didn’t work for the datasets I had. 135 | 136 | ### 4. `gensim.word2vec` 137 | 138 | Gensim has an implementation of w2v that takes in a list of sequences and can be multi-threaded. It was very easy to use and the fastest to complete five epochs. 139 | 140 | But the precision-recall curve shows a sharp cliff around threshold == 0.73. This is due to out-of-vocabulary products in our validation datasets (which don't have embeddings). 141 | 142 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation4-precision-recall.png) 143 | 144 | If we _only_ evaluate in-vocabulary items, performance improves significantly. 145 | 146 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation4b-precision-recall.png) 147 | 148 | ### 5. `PyTorch` word2vec 149 | 150 | We implement Skip-gram in PyTorch. Here's some simplified code showing how it looks. 151 | 152 | ``` 153 | class SkipGram(nn.Module): 154 | def __init__(self, emb_size, emb_dim): 155 | self.center_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 156 | self.context_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 157 | 158 | def forward(self, center, context, neg_context): 159 | emb_center, emb_context, emb_neg_context = self.get_embeddings() 160 | 161 | # Get score for positive pairs 162 | score = torch.sum(emb_center * emb_context, dim=1) 163 | score = -F.logsigmoid(score) 164 | 165 | # Get score for negative pairs 166 | neg_score = torch.bmm(emb_neg_context, emb_center.unsqueeze(2)).squeeze() 167 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 168 | 169 | # Return combined score 170 | return torch.mean(score + neg_score) 171 | ``` 172 | 173 | It performed better than `gensim` when considering all products. 174 | 175 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation5-precision-recall.png) 176 | 177 | If considering _only_ seen products, it's still an improvement, but less dramatic. 178 | 179 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation5b-precision-recall.png) 180 | 181 | When examining the learning curves, it seems that a single epoch is sufficient. In contrast to the learning curves from matrix factorization (implementation 1), the AUC-ROC doesn't drop drastically with each learning rate reset. 182 | 183 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation5-learning-curve.png) 184 | 185 | ### 6. `PyTorch` word2vec with side info 186 | 187 | Why did we build the skip-gram model from scratch?
Because we wanted to extend it with side information (e.g., brand, category, price). 188 | 189 | ``` 190 | B001T9NUFS -> B003AVEU6G -> B007ZN5Y56 ... -> B007ZN5Y56 191 | Television Sound bar Lamp Standing Fan 192 | Sony Sony Phillips Dyson 193 | 500 – 600 200 – 300 50 – 75 300 - 400 194 | ``` 195 | 196 | Perhaps by learning on these we can create better embeddings? 197 | 198 | Unfortunately, it didn't work out. Here's how the learning curve looks. 199 | 200 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation6-learning-curve.png) 201 | 202 | One possible reason for this non-result is the sparsity of the meta data. Out of 418,749 electronic products, we only had metadata for 162,023 (39%). Of these, brand was 51% empty. 203 | 204 | ### 7. Sequences + Matrix Factorization 205 | 206 | Why did the w2v approach do so much better than matrix factorization? Was it due to the skipgram model, or due to the training data format (i.e., sequences)? 207 | 208 | To understand this better, I tried the previous matrix factorization with bias implementation (AUC-ROC = 0.7951) with the new sequences and dataloader. It worked very well. 209 | 210 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation7-precision-recall.png) 211 | 212 | Oddly though, the matrix factorization approach still exhibits the effect of “forgetting” as learning rate resets with each epoch (Fig 9.), though not as pronounced as Figure 3 in the previous post. 213 | 214 | ![](https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/master/images/implementation7-learning-curve.png) 215 | 216 | _I wonder if this is due to using the same embeddings for both center and context._ 217 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | backcall==0.1.0 3 | boto==2.49.0 4 | boto3==1.10.26 5 | botocore==1.13.26 6 | certifi==2024.7.4 7 | chardet==3.0.4 8 | cPython==0.0.5 9 | decorator==4.4.1 10 | docutils==0.15.2 11 | gensim==3.8.1 12 | idna==2.8 13 | ipykernel==5.1.3 14 | ipython==8.10.0 15 | ipython-genutils==0.2.0 16 | jedi==0.15.1 17 | jmespath==0.9.4 18 | joblib==1.2.0 19 | jupyter-client==5.3.4 20 | jupyter-core==4.11.2 21 | networkx==2.4 22 | node2vec==0.3.1 23 | numpy==1.22.0 24 | pandas==0.25.3 25 | parso==0.5.1 26 | pexpect==4.7.0 27 | pickleshare==0.7.5 28 | prompt-toolkit==2.0.10 29 | ptyprocess==0.6.0 30 | Pygments==2.15.0 31 | pymongo==3.9.0 32 | python-dateutil==2.8.0 33 | pytz==2019.3 34 | pyzmq==18.1.0 35 | requests==2.31.0 36 | s3transfer==0.2.1 37 | scikit-learn==0.21.3 38 | scipy==1.10.0 39 | six==1.13.0 40 | smart-open==1.9.0 41 | tornado==6.3.3 42 | tqdm==4.39.0 43 | traitlets==4.3.3 44 | urllib3==1.26.5 45 | wcwidth==0.1.7 46 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | ### Workflow for books 2 | python -m src.parse.parse_json data/meta_Books.json.gz data/books.csv 3 | 4 | python -m src.prep.prep_node_relationship data/books.csv data/books_relationships.csv 5 | python -m src.prep.prep_meta data/books.csv data/books_meta.csv 6 | 7 | python -m src.prep.prep_edges data/books_relationships.csv data/books_edges.csv 8 | python -m src.prep.train_val_split data/books_edges.csv 0.33 9 | 10 | python -m src.prep.prep_graph_samples data/books_edges_train.edgelist 
data/books_sequences.npy books 11 | 12 | # Slow and requires a lot of ram 13 | python -m src.ml.train_node2vec_embeddings data/books_edges_train.edgelist data/books_embeddings.kv 14 | 15 | # Works fine with multiprocess 16 | python -m src.ml.train_gensim_embedding data/books_sequences_sample.npy 8 17 | 18 | # PyTorch 19 | # For dev testing 20 | python -m src.ml.train_torch_embedding data/books_sequences_sample.npy data/books_edges_val_samp.csv data/books_edges_train_samp.csv 32 4 21 | # For training 22 | python -m src.ml.train_torch_embedding data/books_sequences.npy data/books_edges_val.csv data/books_edges_val_samp.csv 128 10 # Best params? 23 | 24 | # ========================================================================================================================================== 25 | ### Workflow for electronics 26 | python -m src.parse.parse_json data/meta_Electronics.json.gz data/electronics.csv 27 | 28 | python -m src.prep.prep_node_relationship data/electronics.csv data/electronics_relationships.csv 29 | python -m src.prep.prep_meta data/electronics.csv data/electronics_meta.csv 30 | 31 | python -m src.prep.prep_edges data/electronics_relationships.csv data/electronics_edges.csv 32 | python -m src.prep.train_val_split data/electronics_edges.csv 0.33 33 | 34 | python -m src.prep.prep_graph_samples data/electronics_edges_train.edgelist data/electronics_sequences.npy electronics 35 | 36 | # Slow and requires a lot of ram 37 | python -m src.ml.train_node2vec_embeddings data/electronics_edges_train.edgelist data/electronics_embeddings.kv 38 | 39 | # Works fine with multiprocess 40 | python -m src.ml.train_gensim_embedding data/electronics_sequences_sample.npy 6 41 | 42 | # PyTorch 43 | # For dev testing 44 | python -m src.ml.train_torch_embedding data/electronics_sequences_samp.npy data/electronics_edges_val_samp.csv data/electronics_edges_train_samp.csv 32 4 45 | python -m src.ml.train_torch_embedding_with_meta data/electronics_sequences_samp.npy data/electronics_edges_val_samp.csv data/electronics_meta.csv data/electronics_edges_train_samp.csv 32 4 46 | # For training 47 | python -m src.ml.train_torch_embedding data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 # Best params? 48 | python -m src.ml.train_torch_embedding_with_meta data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_meta.csv data/electronics_edges_val_samp.csv 128 10 # Best params? 49 | 50 | # MF Dev 51 | python -m src.ml.train_torch_mf data/electronics_sequences_samp.npy data/electronics_edges_val_samp.csv data/electronics_edges_val_samp.csv 32 4 52 | python -m src.ml.train_torch_mf data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 53 | python -m src.ml.train_torch_mf_bias data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 
54 | 55 | # Edges model 56 | python -m src.ml.train_torch_mf_edges data/electronics_edges_train_samp.csv data/electronics_edges_val_samp.csv data/electronics_edges_val_samp.csv 32 4 57 | 58 | # ========================================================================================================================================== 59 | ### Running for results 60 | python -m src.ml.train_gensim_embedding data/electronics_sequences.npy 8 61 | python -m src.ml.train_torch_embedding data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 62 | python -m src.ml.train_torch_embedding_with_meta data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_meta.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 63 | python -m src.ml.train_torch_mf data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 # Best params? 64 | python -m src.ml.train_torch_mf_bias data/electronics_sequences.npy data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 # Best params? 65 | python -m src.ml.train_torch_mf_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 66 | python -m src.ml.train_torch_mf_bias_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 67 | python -m src.ml.train_torch_mf_continuous_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 4 68 | python -m src.ml.train_torch_mf_bias_continuous_edges data/electronics_edges_train.csv data/electronics_edges_val.csv data/electronics_edges_val_samp.csv 128 8 69 | 70 | 71 | python -m src.ml.train_gensim_embedding data/books_sequences.npy 8 72 | python -m src.ml.train_torch_embedding data/books_sequences.npy data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 # Best params? 73 | python -m src.ml.train_torch_mf_bias data/books_sequences.npy data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 # Best params? 
74 | python -m src.ml.train_torch_mf_bias_edges data/books_edges_train.csv data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 75 | python -m src.ml.train_torch_mf_bias_continuous_edges data/books_edges_train.csv data/books_edges_val.csv data/books_edges_val_samp.csv 128 8 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/__init__.py -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = 'data' 2 | MODEL_PATH = 'model' 3 | -------------------------------------------------------------------------------- /src/ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/ml/__init__.py -------------------------------------------------------------------------------- /src/ml/data_loader.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import Counter 3 | from typing import Dict, List, Tuple 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.config import MODEL_PATH 11 | from src.utils.io_utils import save_model 12 | from src.utils.logger import logger 13 | 14 | 15 | class Sequences: 16 | NEGATIVE_SAMPLE_TABLE_SIZE = 1e7 17 | WINDOW = 5 18 | 19 | def __init__(self, sequence_path: str, val_path: str, subsample: float = 0.001, power: float = 0.75): 20 | """ 21 | Initializes a Sequences object for use in a Dataset. 22 | 23 | Args: 24 | sequence_path: Path to numpy array of sequences, where each row is a sequence 25 | subsample: Subsampling parameter; suggested range (0, 1e-5) 26 | power: Negative sampling parameter; suggested 0.75 27 | """ 28 | self.negative_idx = 0 29 | self.n_unique_tokens = 0 30 | 31 | self.sequences = np.load(sequence_path).tolist() 32 | self.n_sequences = len(self.sequences) 33 | logger.info('Sequences loaded (length = {:,})'.format(self.n_sequences)) 34 | 35 | self.val = pd.read_csv(val_path) 36 | logger.info('Validation set loaded: {}'.format(self.val.shape)) 37 | 38 | self.word_freq = self.get_word_freq() 39 | logger.info('Word frequency calculated') 40 | 41 | self.word2id, self.id2word = self.get_mapping_dicts() 42 | self.add_val_product_to_mapping_dicts() 43 | self.n_unique_tokens = len(self.word2id) 44 | logger.info('No. 
of unique tokens: {}'.format(self.n_unique_tokens)) 45 | save_model(self.word2id, '{}/word2id'.format(MODEL_PATH)) 46 | save_model(self.id2word, '{}/id2word'.format(MODEL_PATH)) 47 | logger.info('Word2Id and Id2Word created and saved') 48 | 49 | self.sequences = self.convert_sequence_to_id() 50 | self.word_freq = self.convert_word_freq_to_id() 51 | logger.info('Convert sequence and wordfreq to ID') 52 | 53 | self.discard_probs = self.get_discard_probs(sample=subsample) 54 | logger.info('Discard probability calculated') 55 | 56 | self.neg_table = self.get_negative_sample_table(power=power) 57 | logger.info('Negative sample table created') 58 | 59 | # Used to preload all center context pairs (very memory heavy) 60 | # self.pairs = self.get_all_center_context_pairs(window=window) 61 | # self.n_pairs = len(self.pairs) 62 | # logger.info('Center Context pairs created') 63 | 64 | def get_word_freq(self) -> Counter: 65 | """ 66 | Returns a dictionary of word frequencies. 67 | 68 | Returns: 69 | 70 | """ 71 | # Flatten list 72 | seq_flat = list(itertools.chain.from_iterable(self.sequences)) 73 | 74 | # Get word frequency 75 | word_freq = Counter(seq_flat) 76 | 77 | return word_freq 78 | 79 | def get_mapping_dicts(self): 80 | word2id = dict() 81 | id2word = dict() 82 | 83 | wid = 0 84 | for w, c in self.word_freq.items(): 85 | word2id[w] = wid 86 | id2word[wid] = w 87 | wid += 1 88 | 89 | return word2id, id2word 90 | 91 | def add_val_product_to_mapping_dicts(self): 92 | val_product_set = set(self.val['product1'].values).union(set(self.val['product2'].values)) 93 | 94 | logger.info('Adding val products to word2id, original size: {}'.format(len(self.word2id))) 95 | wid = max(self.word2id.values()) + 1 96 | for w in val_product_set: 97 | if w not in self.word2id: 98 | self.word2id[w] = wid 99 | self.id2word[wid] = w 100 | wid += 1 101 | 102 | self.val = None # Release memory 103 | logger.info('Added val products to word2id, updated size: {}'.format(len(self.word2id))) 104 | 105 | def convert_sequence_to_id(self): 106 | return np.vectorize(self.word2id.get)(self.sequences) 107 | 108 | def get_product_id(self, x): 109 | return self.word2id.get(x, -1) 110 | 111 | def convert_word_freq_to_id(self): 112 | return {self.word2id[k]: v for k, v in self.word_freq.items()} 113 | 114 | def get_discard_probs(self, sample=0.001) -> Dict[int, float]: 115 | """ 116 | Returns a dictionary of words and their associated discard probability, where the word should be discarded 117 | if np.random.rand() < probability. 118 | 119 | Args: 120 | sample: 121 | 122 | Returns: 123 | 124 | """ 125 | # Convert to array 126 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 127 | 128 | # Convert to probabilities 129 | word_freq[:, 1] = word_freq[:, 1] / word_freq[:, 1].sum() 130 | 131 | # Perform subsampling 132 | # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/ 133 | word_freq[:, 1] = (np.sqrt(word_freq[:, 1] / sample) + 1) * (sample / word_freq[:, 1]) 134 | 135 | # Get dict 136 | discard_probs = {int(k): v for k, v in word_freq.tolist()} 137 | 138 | return discard_probs 139 | 140 | def get_negative_sample_table(self, power=0.75) -> np.array: 141 | """ 142 | Returns a table (size = NEGATIVE_SAMPLE_TABLE_SIZE) of negative samples which can be selected via indexing. 
143 | 144 | Args: 145 | power: 146 | 147 | Returns: 148 | 149 | """ 150 | # Convert to array 151 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 152 | 153 | # Adjust by power 154 | word_freq[:, 1] = word_freq[:, 1] ** power 155 | 156 | # Get probabilities 157 | word_freq_sum = word_freq[:, 1].sum() 158 | word_freq[:, 1] = word_freq[:, 1] / word_freq_sum 159 | 160 | # Multiply probabilities by sample table size 161 | word_freq[:, 1] = np.round(word_freq[:, 1] * self.NEGATIVE_SAMPLE_TABLE_SIZE) 162 | 163 | # Convert to int 164 | word_freq = word_freq.astype(int).tolist() 165 | 166 | # Create sample table 167 | sample_table = [[tup[0]] * tup[1] for tup in word_freq] 168 | sample_table = np.array(list(itertools.chain.from_iterable(sample_table))) 169 | np.random.shuffle(sample_table) 170 | 171 | return sample_table 172 | 173 | # Works on per sequence 174 | def get_pairs(self, idx, window=5): 175 | pairs = [] 176 | sequence = self.sequences[idx] 177 | 178 | for center_idx, node in enumerate(sequence): 179 | for i in range(-window, window + 1): 180 | context_idx = center_idx + i 181 | if context_idx >= 0 and context_idx < len(sequence) and node != sequence[ 182 | context_idx] and np.random.rand() < self.discard_probs[sequence[context_idx]]: 183 | pairs.append((node, sequence[context_idx])) 184 | 185 | return pairs 186 | 187 | def get_all_center_context_pairs(self, window=5) -> List[Tuple[int, int]]: 188 | """ 189 | Returns a list of tuples (center, context). 190 | 191 | Args: 192 | window: 193 | 194 | Returns: 195 | 196 | """ 197 | 198 | pairs = [] 199 | 200 | for sequence in self.sequences: 201 | for center_idx, node in enumerate(sequence): 202 | for i in range(-window, window + 1): 203 | context_idx = center_idx + i 204 | if (0 <= context_idx < len(sequence)) \ 205 | and node != sequence[context_idx] \ 206 | and np.random.rand() < self.discard_probs[sequence[context_idx]]: 207 | pairs.append((node, sequence[context_idx])) 208 | 209 | return pairs 210 | 211 | def get_negative_samples(self, context, sample_size=5) -> np.array: 212 | """ 213 | Returns a list of negative samples, where len = sample_size. 
214 | 215 | Args: 216 | sample_size: 217 | 218 | Returns: 219 | 220 | """ 221 | while True: 222 | # Get a batch from the shuffled table 223 | neg_sample = self.neg_table[self.negative_idx:self.negative_idx + sample_size] 224 | 225 | # Update negative index 226 | self.negative_idx = (self.negative_idx + sample_size) % len(self.neg_table) 227 | 228 | # Check if batch insufficient 229 | if len(neg_sample) != sample_size: 230 | neg_sample = np.concatenate((neg_sample, self.neg_table[:self.negative_idx])) 231 | 232 | # Check if context in negative sample 233 | if not context in neg_sample: 234 | return neg_sample 235 | 236 | 237 | class SequencesDataset(Dataset): 238 | def __init__(self, sequences: Sequences, neg_sample_size=5): 239 | self.sequences = sequences 240 | self.neg_sample_size = neg_sample_size 241 | 242 | def __len__(self): 243 | return self.sequences.n_sequences 244 | 245 | def __getitem__(self, idx): 246 | pairs = self.sequences.get_pairs(idx) 247 | neg_samples = [] 248 | for center, context in pairs: 249 | neg_samples.append(self.sequences.get_negative_samples(context)) 250 | 251 | return pairs, neg_samples 252 | 253 | @staticmethod 254 | def collate(batches): 255 | # logger.info('Batches: {}'.format(batches)) 256 | pairs_batch = [batch[0] for batch in batches] 257 | neg_contexts_batch = [batch[1] for batch in batches] 258 | 259 | pairs_batch = list(itertools.chain.from_iterable(pairs_batch)) 260 | neg_contexts = list(itertools.chain.from_iterable(neg_contexts_batch)) 261 | 262 | centers = [center for center, _ in pairs_batch] 263 | contexts = [context for _, context in pairs_batch] 264 | 265 | return torch.LongTensor(centers), torch.LongTensor(contexts), torch.LongTensor(neg_contexts) 266 | 267 | @staticmethod 268 | def collate_for_mf(batches): 269 | batch_list = [] 270 | 271 | for batch in batches: 272 | pairs = np.array(batch[0]) 273 | negs = np.array(batch[1]) 274 | negs = np.vstack((pairs[:, 0].repeat(negs.shape[1]), negs.ravel())).T 275 | 276 | pairs_arr = np.ones((pairs.shape[0], pairs.shape[1] + 1), dtype=int) 277 | pairs_arr[:, :-1] = pairs 278 | 279 | negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int) 280 | negs_arr[:, :-1] = negs 281 | 282 | all_arr = np.vstack((pairs_arr, negs_arr)) 283 | batch_list.append(all_arr) 284 | 285 | batch_array = np.vstack(batch_list) 286 | # np.random.shuffle(batch_array) 287 | 288 | # Return item1, item2, label 289 | return (torch.LongTensor(batch_array[:, 0]), torch.LongTensor(batch_array[:, 1]), 290 | torch.FloatTensor(batch_array[:, 2])) -------------------------------------------------------------------------------- /src/ml/data_loader_edges.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import Counter 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from torch.utils.data import Dataset 8 | 9 | from src.config import MODEL_PATH 10 | from src.utils.io_utils import save_model 11 | from src.utils.logger import logger 12 | 13 | 14 | class Edges: 15 | NEGATIVE_SAMPLE_TABLE_SIZE = 1e7 16 | 17 | def __init__(self, edge_path: str, val_path: str, power: float = 0.75): 18 | """ 19 | Initializes an Edges object for use in a Dataset. 
20 | 21 | Args: 22 | edge_path: Path to numpy array of sequences, where each row is a sequence 23 | power: Negative sampling parameter; suggested 0.75 24 | """ 25 | self.power = power 26 | self.negative_idx = 0 27 | self.n_unique_tokens = 0 28 | 29 | self.edges = pd.read_csv(edge_path) 30 | self.n_edges = len(self.edges) 31 | logger.info('Edges loaded (length = {:,})'.format(self.n_edges)) 32 | 33 | self.val = pd.read_csv(val_path) 34 | logger.info('Validation set loaded: {}'.format(self.val.shape)) 35 | 36 | self.product_set = self.get_product_set() 37 | self.word2id, self.id2word = self.get_mapping_dicts() 38 | self.get_product_id_func = np.vectorize(self.get_product_id) 39 | self.n_unique_tokens = len(self.word2id) 40 | logger.info('No. of unique tokens: {}'.format(self.n_unique_tokens)) 41 | save_model(self.word2id, '{}/word2id_edge'.format(MODEL_PATH)) 42 | save_model(self.id2word, '{}/id2word_edge'.format(MODEL_PATH)) 43 | logger.info('Word2Id and Id2Word created and saved') 44 | 45 | # Convert product ID strings to integers 46 | self.edges = self.prep_edges() 47 | logger.info('Edges prepared') 48 | 49 | # Prepare negative sampling table 50 | self.word_freq = self.get_word_freq(self.edges[:, :2]) 51 | self.neg_table = self.get_negative_sample_table(self.power) 52 | 53 | def get_product_set(self): 54 | product_set = set(self.edges['product1'].tolist() + self.edges['product2'].tolist() + 55 | self.val['product1'].tolist() + self.val['product2'].tolist()) 56 | 57 | return product_set 58 | 59 | def get_mapping_dicts(self): 60 | word2id = dict() 61 | id2word = dict() 62 | 63 | wid = 0 64 | for w in self.product_set: 65 | word2id[w] = wid 66 | id2word[wid] = w 67 | wid += 1 68 | 69 | return word2id, id2word 70 | 71 | def get_product_id(self, x): 72 | return self.word2id.get(x, -1) 73 | 74 | def prep_edges(self): 75 | self.edges['product1_id'] = self.get_product_id_func(self.edges['product1']).astype(int) 76 | self.edges['product2_id'] = self.get_product_id_func(self.edges['product2']).astype(int) 77 | edges = self.edges[['product1_id', 'product2_id', 'weight']].copy().values 78 | 79 | return edges 80 | 81 | def get_word_freq(self, edges): 82 | product_counts = list(itertools.chain.from_iterable(edges)) 83 | word_freq = Counter(product_counts) 84 | return word_freq 85 | 86 | def get_negative_sample_table(self, power=0.75) -> np.array: 87 | """ 88 | Returns a table (size = NEGATIVE_SAMPLE_TABLE_SIZE) of negative samples which can be selected via indexing. 89 | 90 | Args: 91 | power: 92 | 93 | Returns: 94 | 95 | """ 96 | # Convert to array 97 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 98 | 99 | # Adjust by power 100 | word_freq[:, 1] = word_freq[:, 1] ** power 101 | 102 | # Get probabilities 103 | word_freq_sum = word_freq[:, 1].sum() 104 | word_freq[:, 1] = word_freq[:, 1] / word_freq_sum 105 | 106 | # Multiply probabilities by sample table size 107 | word_freq[:, 1] = np.round(word_freq[:, 1] * self.NEGATIVE_SAMPLE_TABLE_SIZE) 108 | 109 | # Convert to int 110 | word_freq = word_freq.astype(int).tolist() 111 | 112 | # Create sample table 113 | sample_table = [[tup[0]] * tup[1] for tup in word_freq] 114 | sample_table = np.array(list(itertools.chain.from_iterable(sample_table))) 115 | np.random.shuffle(sample_table) 116 | 117 | return sample_table 118 | 119 | def get_negative_samples(self, context, sample_size=5) -> np.array: 120 | """ 121 | Returns a list of negative samples, where len = sample_size. 
122 | 123 | Args: 124 | sample_size: 125 | 126 | Returns: 127 | 128 | """ 129 | while True: 130 | # Get a batch from the shuffled table 131 | neg_sample = self.neg_table[self.negative_idx:self.negative_idx + sample_size] 132 | 133 | # Update negative index 134 | self.negative_idx = (self.negative_idx + sample_size) % len(self.neg_table) 135 | 136 | # Check if batch insufficient 137 | if len(neg_sample) != sample_size: 138 | neg_sample = np.concatenate((neg_sample, self.neg_table[:self.negative_idx])) 139 | 140 | # Check if context in negative sample 141 | if not context in neg_sample: 142 | return neg_sample 143 | 144 | 145 | class EdgesDataset(Dataset): 146 | def __init__(self, edges, neg_sample_size=5): 147 | self.edges = edges 148 | self.neg_sample_size = neg_sample_size 149 | 150 | def __len__(self): 151 | return self.edges.n_edges 152 | 153 | def __getitem__(self, idx): 154 | pair = self.edges.edges[idx] 155 | neg_samples = self.edges.get_negative_samples(context=pair[1]) 156 | 157 | return pair, neg_samples 158 | 159 | @staticmethod 160 | def collate(batches): 161 | logger.debug('Batches: {}'.format(batches)) 162 | batch_list = [] 163 | 164 | for batch in batches: 165 | pair = np.array(batch[0]) 166 | negs = np.array(batch[1]) 167 | negs = np.vstack((pair[0].repeat(negs.shape[0]), negs)).T 168 | 169 | # Create arrays 170 | pair_arr = np.ones((pair.shape[0]), dtype=int) # This sets label to 1 # TODO: Leave label as continuous 171 | pair_arr[:-1] = pair[:-1] 172 | negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int) 173 | negs_arr[:, :-1] = negs 174 | all_arr = np.vstack((pair_arr, negs_arr)) 175 | batch_list.append(all_arr) 176 | 177 | batch_array = np.vstack(batch_list) 178 | 179 | # Return item1, item2, label 180 | return (torch.LongTensor(batch_array[:, 0]), torch.LongTensor(batch_array[:, 1]), 181 | torch.FloatTensor(batch_array[:, 2])) 182 | 183 | @staticmethod 184 | def collate_continuous(batches): 185 | logger.debug('Batches: {}'.format(batches)) 186 | batch_list = [] 187 | 188 | for batch in batches: 189 | pair = np.array(batch[0]) 190 | negs = np.array(batch[1]) 191 | negs = np.vstack((pair[0].repeat(negs.shape[0]), negs)).T 192 | 193 | # Create arrays 194 | pair_arr = pair 195 | negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int) 196 | negs_arr[:, :-1] = negs 197 | all_arr = np.vstack((pair_arr, negs_arr)) 198 | batch_list.append(all_arr) 199 | 200 | batch_array = np.vstack(batch_list) 201 | 202 | # Return item1, item2, label 203 | return (torch.LongTensor(batch_array[:, 0]), torch.LongTensor(batch_array[:, 1]), 204 | torch.FloatTensor(batch_array[:, 2])) 205 | -------------------------------------------------------------------------------- /src/ml/data_loader_with_meta.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import Counter, OrderedDict 3 | from typing import Dict, List, Tuple 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from category_encoders import OrdinalEncoder 9 | from torch.utils.data import Dataset 10 | 11 | from src.config import MODEL_PATH 12 | from src.utils.io_utils import save_model 13 | from src.utils.logger import logger 14 | 15 | # META_COLS = ['asin', 'price', 'category_lvl_2', 'category_lvl_3', 'category_lvl_4', 'brand'] 16 | 17 | 18 | def round_up(num, divisor=5): 19 | return ((num + divisor - 1) // divisor) * divisor 20 | 21 | 22 | def bin_price(price): 23 | if price < 25: 24 | return price 25 | elif 25 < price < 
50: 26 | return round_up(price, divisor=5) 27 | elif 50 < price < 500: 28 | return round_up(price, divisor=10) 29 | else: 30 | return 500 31 | 32 | 33 | def prep_price(price_col): 34 | price = np.round(price_col) 35 | price.fillna(-1, inplace=True) 36 | price = price.astype(int) 37 | price = price.apply(bin_price) 38 | price = price + 1 39 | 40 | return price 41 | 42 | 43 | def prep_categorical(cat_col, min_threshold=100): 44 | counts = cat_col.value_counts() 45 | category_set = counts[counts > min_threshold].index 46 | 47 | return np.where(cat_col.isin(category_set), cat_col, 'MISC') 48 | 49 | 50 | def get_dict_values(meta, META_COLS): 51 | return [meta[col] for col in META_COLS] 52 | 53 | 54 | class Sequences: 55 | NEGATIVE_SAMPLE_TABLE_SIZE = 1e7 56 | WINDOW = 5 57 | 58 | def __init__(self, sequence_path: str, val_path: str, meta_path: str, subsample: float = 0.001, 59 | power: float = 0.75): 60 | """ 61 | Initializes a Sequence object for use in a Dataset. 62 | 63 | Args: 64 | sequence_path: Path to numpy array of sequences, where each row is a sequence 65 | subsample: Subsampling parameter; suggested range (0, 1e-5) 66 | power: Negative sampling parameter; suggested 0.75 67 | """ 68 | self.negative_idx = 0 69 | self.n_unique_tokens = 0 70 | # META_COLS = ['asin', 'price', 'category_lvl_2', 'category_lvl_3', 'category_lvl_4', 'brand'] 71 | self.META_COLS = ['category_lvl_3', 'brand'] # Add meta columns here 72 | 73 | self.sequences = np.load(sequence_path).tolist() 74 | self.n_sequences = len(self.sequences) 75 | logger.info('Sequences loaded (length = {:,})'.format(self.n_sequences)) 76 | 77 | self.val = pd.read_csv(val_path) 78 | logger.info('Validation set loaded: {}'.format(self.val.shape)) 79 | 80 | self.word_freq = self.get_word_freq() 81 | logger.info('Word frequency calculated') 82 | 83 | self.word2id, self.id2word = self.get_mapping_dicts() 84 | self.add_val_product_to_mapping_dicts() 85 | self.n_unique_tokens = len(self.word2id) 86 | logger.info('No. of unique tokens: {}'.format(self.n_unique_tokens)) 87 | save_model(self.word2id, '{}/word2id'.format(MODEL_PATH)) 88 | save_model(self.id2word, '{}/id2word'.format(MODEL_PATH)) 89 | logger.info('Word2Id and Id2Word created and saved') 90 | 91 | self.meta = pd.read_csv(meta_path, dtype={'asin': 'object'}) 92 | self.meta.drop_duplicates(subset='asin', inplace=True) 93 | self.meta['productid'] = self.meta['asin'].copy() 94 | self.meta = self.prep_meta() 95 | self.meta_dict, self.emb_sizes = self.convert_meta_to_dict() 96 | self.emb_sizes['product'] = len(self.word2id) 97 | logger.info('Embedding dimensions: {}'.format(self.emb_sizes)) 98 | save_model(self.meta_dict, '{}/meta_dict'.format(MODEL_PATH)) 99 | self.meta = None 100 | 101 | self.sequences = self.convert_sequence_to_id() 102 | self.word_freq = self.convert_word_freq_to_id() 103 | logger.info('Convert sequence and wordfreq to ID') 104 | 105 | self.discard_probs = self.get_discard_probs(sample=subsample) 106 | logger.info('Discard probability calculated') 107 | 108 | self.neg_table = self.get_negative_sample_table(power=power) 109 | logger.info('Negative sample table created') 110 | 111 | # Used to preload all center context pairs (very memory heavy) 112 | # self.pairs = self.get_all_center_context_pairs(window=window) 113 | # self.n_pairs = len(self.pairs) 114 | # logger.info('Center Context pairs created') 115 | 116 | def get_word_freq(self) -> Counter: 117 | """ 118 | Returns a dictionary of word frequencies. 
119 | 120 | Returns: 121 | 122 | """ 123 | # Flatten list 124 | seq_flat = list(itertools.chain.from_iterable(self.sequences)) 125 | 126 | # Get word frequency 127 | word_freq = Counter(seq_flat) 128 | 129 | return word_freq 130 | 131 | def get_mapping_dicts(self): 132 | word2id = dict() 133 | id2word = dict() 134 | 135 | wid = 0 136 | for w, c in self.word_freq.items(): 137 | word2id[w] = wid 138 | id2word[wid] = w 139 | wid += 1 140 | 141 | return word2id, id2word 142 | 143 | def add_val_product_to_mapping_dicts(self): 144 | val_product_set = set(self.val['product1'].values).union(set(self.val['product2'].values)) 145 | 146 | logger.info('Adding val products to word2id, original size: {}'.format(len(self.word2id))) 147 | wid = max(self.word2id.values()) + 1 148 | for w in val_product_set: 149 | if w not in self.word2id: 150 | self.word2id[w] = wid 151 | self.id2word[wid] = w 152 | wid += 1 153 | 154 | self.val = None # Release memory 155 | logger.info('Added val products to word2id, updated size: {}'.format(len(self.word2id))) 156 | 157 | def convert_sequence_to_id(self): 158 | return np.vectorize(self.word2id.get)(self.sequences) 159 | 160 | def get_product_id(self, x): 161 | return self.word2id.get(x, -1) 162 | 163 | def convert_word_freq_to_id(self): 164 | return {self.word2id[k]: v for k, v in self.word_freq.items()} 165 | 166 | def prep_meta(self): 167 | logger.info('No. of rows in meta before filter by word2id: {}'.format(self.meta.shape[0])) 168 | meta = self.meta[self.meta['asin'].isin(self.word2id.keys())].copy() 169 | logger.info('No. of rows in meta after filter by word2id: {}'.format(meta.shape[0])) 170 | 171 | meta['price'] = prep_price(meta['price']) 172 | meta['category_lvl_2'] = prep_categorical(meta['category_lvl_2']) 173 | meta['category_lvl_3'] = prep_categorical(meta['category_lvl_3']) 174 | meta['category_lvl_4'] = prep_categorical(meta['category_lvl_4']) 175 | meta['brand'] = prep_categorical(meta['brand']) 176 | 177 | return meta 178 | 179 | def convert_meta_to_dict(self): 180 | meta = self.meta[['productid'] + self.META_COLS].copy() 181 | 182 | # Encode to int 183 | encoder = OrdinalEncoder(cols=self.META_COLS) 184 | meta = encoder.fit_transform(meta) 185 | save_model(encoder, '{}/encoder'.format(MODEL_PATH)) 186 | 187 | meta['values'] = meta.apply(get_dict_values, args=(self.META_COLS,), axis=1) 188 | meta_dict = meta.set_index('productid')['values'].to_dict() 189 | meta_dict = {self.word2id[k]: v for k, v in meta_dict.items()} 190 | 191 | meta_counts_dict = (meta[self.META_COLS].max() + 1).to_dict() # Need to +1 to account for index starting from zero 192 | # Without +1 the embedding size will be insufficient by 1 193 | ordered_meta_counts_dict = OrderedDict() 194 | for col in ['product'] + self.META_COLS: 195 | ordered_meta_counts_dict[col] = meta_counts_dict.get(col, 0) 196 | 197 | return meta_dict, ordered_meta_counts_dict 198 | 199 | def get_discard_probs(self, sample=0.001) -> Dict[int, float]: 200 | """ 201 | Returns a dictionary of words and their associated discard probability, where the word should be discarded 202 | if np.random.rand() < probability. 
203 | 204 | Args: 205 | sample: 206 | 207 | Returns: 208 | 209 | """ 210 | # Convert to array 211 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 212 | 213 | # Convert to probabilities 214 | word_freq[:, 1] = word_freq[:, 1] / word_freq[:, 1].sum() 215 | 216 | # Perform subsampling 217 | # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/ 218 | word_freq[:, 1] = (np.sqrt(word_freq[:, 1] / sample) + 1) * (sample / word_freq[:, 1]) 219 | 220 | # Get dict 221 | discard_probs = {int(k): v for k, v in word_freq.tolist()} 222 | 223 | return discard_probs 224 | 225 | def get_negative_sample_table(self, power=0.75) -> np.array: 226 | """ 227 | Returns a table (size = NEGATIVE_SAMPLE_TABLE_SIZE) of negative samples which can be selected via indexing. 228 | 229 | Args: 230 | power: 231 | 232 | Returns: 233 | 234 | """ 235 | # Convert to array 236 | word_freq = np.array(list(self.word_freq.items()), dtype=np.float64) 237 | 238 | # Adjust by power 239 | word_freq[:, 1] = word_freq[:, 1] ** power 240 | 241 | # Get probabilities 242 | word_freq_sum = word_freq[:, 1].sum() 243 | word_freq[:, 1] = word_freq[:, 1] / word_freq_sum 244 | 245 | # Multiply probabilities by sample table size 246 | word_freq[:, 1] = np.round(word_freq[:, 1] * self.NEGATIVE_SAMPLE_TABLE_SIZE) 247 | 248 | # Convert to int 249 | word_freq = word_freq.astype(int).tolist() 250 | 251 | # Create sample table 252 | sample_table = [[tup[0]] * tup[1] for tup in word_freq] 253 | sample_table = np.array(list(itertools.chain.from_iterable(sample_table))) 254 | np.random.shuffle(sample_table) 255 | 256 | return sample_table 257 | 258 | def get_meta(self, idx): 259 | return self.meta_dict.get(idx, [0] * len(self.META_COLS)) 260 | 261 | # Works on per sequence 262 | def get_pairs(self, idx, window=5): 263 | pairs = [] 264 | sequence = self.sequences[idx] 265 | 266 | for center_idx, center in enumerate(sequence): 267 | for i in range(-window, window + 1): 268 | context_idx = center_idx + i 269 | if context_idx >= 0 and context_idx < len(sequence) and center != sequence[ 270 | context_idx] and np.random.rand() < self.discard_probs[sequence[context_idx]]: 271 | context = sequence[context_idx] 272 | center_meta = self.get_meta(center) 273 | context_meta = self.get_meta(center) 274 | pairs.append(([center] + center_meta, [context] + context_meta)) 275 | 276 | return pairs 277 | 278 | def get_all_center_context_pairs(self, window=5) -> List[Tuple[int, int]]: 279 | """ 280 | Returns a list of tuples (center, context). 281 | 282 | Args: 283 | window: 284 | 285 | Returns: 286 | 287 | """ 288 | 289 | pairs = [] 290 | 291 | for sequence in self.sequences: 292 | for center_idx, node in enumerate(sequence): 293 | for i in range(-window, window + 1): 294 | context_idx = center_idx + i 295 | if (0 <= context_idx < len(sequence)) \ 296 | and node != sequence[context_idx] \ 297 | and np.random.rand() < self.discard_probs[sequence[context_idx]]: 298 | pairs.append((node, sequence[context_idx])) 299 | 300 | return pairs 301 | 302 | def get_negative_samples(self, context, sample_size=5) -> np.array: 303 | """ 304 | Returns a list of negative samples, where len = sample_size. 
305 | 306 | Args: 307 | sample_size: 308 | 309 | Returns: 310 | 311 | """ 312 | while True: 313 | # Get a batch from the shuffled table 314 | neg_sample = self.neg_table[self.negative_idx:self.negative_idx + sample_size] 315 | 316 | # Update negative index 317 | self.negative_idx = (self.negative_idx + sample_size) % len(self.neg_table) 318 | 319 | # Check if batch insufficient 320 | if len(neg_sample) != sample_size: 321 | neg_sample = np.concatenate((neg_sample, self.neg_table[:self.negative_idx])) 322 | 323 | # Check if context in negative sample 324 | if not context in neg_sample: 325 | return [[samp] + self.get_meta(samp) for samp in neg_sample] 326 | 327 | 328 | class EdgesDataset(Dataset): 329 | def __init__(self, sequences: Sequences, neg_sample_size=5): 330 | self.sequences = sequences 331 | self.neg_sample_size = neg_sample_size 332 | 333 | def __len__(self): 334 | return self.sequences.n_sequences 335 | 336 | def __getitem__(self, idx): 337 | pairs = self.sequences.get_pairs(idx) 338 | neg_samples = [] 339 | for center, context in pairs: 340 | neg_samples.append(self.sequences.get_negative_samples(context)) 341 | 342 | return pairs, neg_samples 343 | 344 | @staticmethod 345 | def collate(batches): 346 | # logger.info('Batches: {}'.format(batches)) 347 | pairs_batch = [batch[0] for batch in batches] 348 | neg_contexts_batch = [batch[1] for batch in batches] 349 | 350 | pairs_batch = list(itertools.chain.from_iterable(pairs_batch)) 351 | neg_contexts = list(itertools.chain.from_iterable(neg_contexts_batch)) 352 | 353 | centers = [center for center, _ in pairs_batch] 354 | contexts = [context for _, context in pairs_batch] 355 | 356 | return torch.LongTensor(centers), torch.LongTensor(contexts), torch.LongTensor(neg_contexts) 357 | -------------------------------------------------------------------------------- /src/ml/mf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MF(nn.Module): 15 | def __init__(self, emb_size, emb_dim, c_vector=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | 21 | # Layers 22 | self.embedding = nn.Embedding(emb_size, emb_dim) 23 | self.sig = nn.Sigmoid() 24 | 25 | # Loss 26 | self.bce = nn.BCELoss() 27 | 28 | logger.info('Model initialized: {}'.format(self)) 29 | 30 | def forward(self, product1, product2): 31 | emb_product1 = self.embedding(product1) 32 | emb_product2 = self.embedding(product2) 33 | interaction = self.sig(torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float)) 34 | 35 | return interaction 36 | 37 | def loss(self, pred, label): 38 | mf_loss = self.bce(pred, label) 39 | 40 | # L2 regularization 41 | product_prior = regularize_l2(self.embedding.weight) * self.c_vector 42 | 43 | loss_total = mf_loss + product_prior 44 | 45 | return loss_total 46 | -------------------------------------------------------------------------------- /src/ml/mf_bias.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MFBias(nn.Module): 15 | def __init__(self, 
emb_size, emb_dim, c_vector=1e-6, c_bias=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | self.c_bias = c_bias 21 | 22 | # Layers 23 | self.product_embedding = nn.Embedding(emb_size, emb_dim) 24 | self.sig = nn.Sigmoid() 25 | 26 | # Bias 27 | self.product_bias = nn.Embedding(emb_size, 1) 28 | self.bias = nn.Parameter(torch.ones(1)) 29 | 30 | # Loss 31 | self.bce = nn.BCELoss() 32 | 33 | logger.info('Model initialized: {}'.format(self)) 34 | 35 | def forward(self, product1, product2): 36 | emb_product1 = self.product_embedding(product1) 37 | emb_product2 = self.product_embedding(product2) 38 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 39 | 40 | bias_product1 = self.product_bias(product1).squeeze() 41 | bias_product2 = self.product_bias(product2).squeeze() 42 | biases = self.bias + bias_product1 + bias_product2 43 | 44 | prediction = self.sig((interaction + biases)) 45 | 46 | return prediction 47 | 48 | def loss(self, pred, label): 49 | mf_loss = self.bce(pred, label) 50 | 51 | # L2 regularization 52 | product_prior = regularize_l2(self.product_embedding.weight) * self.c_vector 53 | product_bias_prior = regularize_l2(self.product_bias.weight) * self.c_bias 54 | 55 | loss_total = mf_loss + product_prior + product_bias_prior 56 | 57 | return loss_total 58 | -------------------------------------------------------------------------------- /src/ml/mf_bias_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MFBiasContinuous(nn.Module): 15 | def __init__(self, emb_size, emb_dim, c_vector=1e-6, c_bias=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | self.c_bias = c_bias 21 | 22 | # Layers 23 | self.product_embedding = nn.Embedding(emb_size, emb_dim) 24 | self.sig = nn.Sigmoid() 25 | 26 | # Bias 27 | self.product_bias = nn.Embedding(emb_size, 1) 28 | self.bias = nn.Parameter(torch.ones(1)) 29 | 30 | # Loss 31 | self.mse = nn.MSELoss() 32 | 33 | logger.info('Model initialized: {}'.format(self)) 34 | 35 | def forward(self, product1, product2): 36 | emb_product1 = self.product_embedding(product1) 37 | emb_product2 = self.product_embedding(product2) 38 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 39 | 40 | bias_product1 = self.product_bias(product1).squeeze() 41 | bias_product2 = self.product_bias(product2).squeeze() 42 | biases = self.bias + bias_product1 + bias_product2 43 | 44 | prediction = (interaction + biases) 45 | 46 | return prediction 47 | 48 | def predict(self, product1, product2): 49 | emb_product1 = self.product_embedding(product1) 50 | emb_product2 = self.product_embedding(product2) 51 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 52 | 53 | bias_product1 = self.product_bias(product1).squeeze() 54 | bias_product2 = self.product_bias(product2).squeeze() 55 | biases = self.bias + bias_product1 + bias_product2 56 | 57 | prediction = self.sig((interaction + biases)) 58 | 59 | return prediction 60 | 61 | def loss(self, pred, label): 62 | mf_loss = self.mse(pred, label) 63 | 64 | # L2 regularization 65 | product_prior = regularize_l2(self.product_embedding.weight) * 
self.c_vector 66 | product_bias_prior = regularize_l2(self.product_bias.weight) * self.c_bias 67 | 68 | loss_total = mf_loss + product_prior + product_bias_prior 69 | 70 | return loss_total 71 | -------------------------------------------------------------------------------- /src/ml/mf_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from src.utils.logger import logger 5 | 6 | torch.manual_seed(1368) 7 | 8 | 9 | def regularize_l2(array): 10 | loss = torch.sum(array ** 2.0) 11 | return loss 12 | 13 | 14 | class MFContinuous(nn.Module): 15 | def __init__(self, emb_size, emb_dim, c_vector=1e-6): 16 | super().__init__() 17 | self.emb_size = emb_size 18 | self.emb_dim = emb_dim 19 | self.c_vector = c_vector 20 | 21 | # Layers 22 | self.embedding = nn.Embedding(emb_size, emb_dim) 23 | self.sig = nn.Sigmoid() 24 | 25 | # Loss 26 | self.mse = nn.MSELoss() 27 | 28 | logger.info('Model initialized: {}'.format(self)) 29 | 30 | def forward(self, product1, product2): 31 | emb_product1 = self.embedding(product1) 32 | emb_product2 = self.embedding(product2) 33 | interaction = torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float) 34 | 35 | return interaction 36 | 37 | def predict(self, product1, product2): 38 | emb_product1 = self.embedding(product1) 39 | emb_product2 = self.embedding(product2) 40 | interaction = self.sig(torch.sum(emb_product1 * emb_product2, dim=1, dtype=torch.float)) # Add sigmoid 41 | 42 | return interaction 43 | 44 | def loss(self, pred, label): 45 | mf_loss = self.mse(pred, label) 46 | 47 | # L2 regularization 48 | product_prior = regularize_l2(self.embedding.weight) * self.c_vector 49 | 50 | loss_total = mf_loss + product_prior 51 | 52 | return loss_total 53 | -------------------------------------------------------------------------------- /src/ml/skipgram.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | torch.manual_seed(1368) 8 | 9 | 10 | class SkipGram(nn.Module): 11 | 12 | def __init__(self, emb_size, emb_dim): 13 | super().__init__() 14 | self.emb_size = emb_size 15 | self.emb_dim = emb_dim 16 | self.center_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 17 | self.context_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True) 18 | self.init_emb() 19 | 20 | def init_emb(self): 21 | """ 22 | Init embeddings like word2vec 23 | 24 | Center embeddings have uniform distribution in [-0.5/emb_dim , 0.5/emb_dim]. 25 | Context embeddings are initialized with 0s. 
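Zero-initialising the context embeddings follows the reference word2vec implementation: every center-context dot product starts at 0 (sigmoid = 0.5), so early gradients are driven by the uniformly-initialised center embeddings rather than by random noise in the output vectors.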
26 | 27 | Returns: 28 | 29 | """ 30 | emb_range = 0.5 / self.emb_dim 31 | 32 | # Initializing embeddings: 33 | # https://stackoverflow.com/questions/55276504/different-methods-for-initializing-embedding-layer-weights-in-pytorch 34 | self.center_embeddings.weight.data.uniform_(-emb_range, emb_range) 35 | self.context_embeddings.weight.data.uniform_(0, 0) 36 | 37 | def forward(self, center, context, neg_context): 38 | """ 39 | 40 | Args: 41 | center: List of center words 42 | context: List of context words 43 | neg_context: List of list of negative context words 44 | 45 | Returns: 46 | 47 | """ 48 | # Calculate positive score 49 | emb_center = self.center_embeddings(center) # Get embeddings for center word 50 | emb_context = self.context_embeddings(context) # Get embeddings for context word 51 | emb_neg_context = self.context_embeddings(neg_context) # Get embeddings for negative context words 52 | 53 | # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch 54 | score = torch.mul(emb_center, emb_context) # Get dot product (part 1) 55 | score = torch.sum(score, dim=1) # Get dot product (part2) 56 | score = torch.clamp(score, max=10, min=-10) 57 | score = -F.logsigmoid(score) # Get score for the positive pairs 58 | 59 | # Calculate negative score (for negative samples) 60 | neg_score = torch.bmm(emb_neg_context, emb_center.unsqueeze(2)).squeeze() # Get dot product 61 | neg_score = torch.clamp(neg_score, max=10, min=-10) 62 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 63 | 64 | # Return combined score 65 | return torch.mean(score + neg_score) 66 | 67 | def get_center_emb(self, center): 68 | return self.center_embeddings(center) 69 | 70 | def save_embeddings(self, file_name): 71 | embedding = self.center_embeddings.weight.cpu().data.numpy() 72 | np.save(file_name, embedding) 73 | -------------------------------------------------------------------------------- /src/ml/skipgram_with_meta.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from src.utils.logger import logger 7 | 8 | torch.manual_seed(1368) 9 | 10 | 11 | class SkipGram(nn.Module): 12 | 13 | def __init__(self, emb_sizes, emb_dim): 14 | super().__init__() 15 | self.emb_sizes = emb_sizes 16 | self.emb_dim = emb_dim 17 | 18 | # Create embedding layers 19 | self.center_embeddings = nn.ModuleList() 20 | for k, v in self.emb_sizes.items(): 21 | self.center_embeddings.append(nn.Embedding(v, emb_dim, sparse=True)) 22 | 23 | self.context_embeddings = nn.ModuleList() 24 | for k, v in self.emb_sizes.items(): 25 | self.context_embeddings.append(nn.Embedding(v, emb_dim, sparse=True)) 26 | 27 | self.init_emb() 28 | 29 | def init_emb(self): 30 | """ 31 | Init embeddings like word2vec 32 | 33 | Center embeddings have uniform distribution in [-0.5/emb_dim , 0.5/emb_dim]. 34 | Context embeddings are initialized with 0s. 
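The same scheme is applied per feature: every entry in emb_sizes (product ID plus each metadata field) gets its own center and context table, and forward() averages the per-feature embeddings before computing the skip-gram scores.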
35 | 36 | Returns: 37 | 38 | """ 39 | emb_range = 0.5 / self.emb_dim 40 | 41 | # Initializing embeddings: 42 | # https://stackoverflow.com/questions/55276504/different-methods-for-initializing-embedding-layer-weights-in-pytorch 43 | for emb in self.center_embeddings: 44 | emb.weight.data.uniform_(-emb_range, emb_range) 45 | 46 | for emb in self.context_embeddings: 47 | emb.weight.data.uniform_(0, 0) 48 | 49 | def forward(self, centers, contexts, neg_contexts): 50 | """ 51 | 52 | Args: 53 | center: List of center words 54 | context: List of context words 55 | neg_context: List of list of negative context words 56 | 57 | Returns: 58 | 59 | """ 60 | # Calculate positive score 61 | emb_centers = [] 62 | for i in range(centers.shape[1]): 63 | logger.debug('center i: {}'.format(i)) 64 | emb_centers.append(self.center_embeddings[i](centers[:, i])) 65 | emb_center = torch.mean(torch.stack(emb_centers), axis=0) 66 | 67 | emb_contexts = [] 68 | for i in range(contexts.shape[1]): 69 | logger.debug('context i: {}'.format(i)) 70 | emb_contexts.append(self.context_embeddings[i](contexts[:, i])) 71 | emb_context = torch.mean(torch.stack(emb_contexts), axis=0) 72 | 73 | emb_neg_contexts = [] 74 | neg_contexts = neg_contexts.view(-1, len(self.context_embeddings)) 75 | for i in range(neg_contexts.shape[1]): 76 | logger.debug('neg context i: {}, {}'.format(i, neg_contexts[:, i])) 77 | emb_neg_contexts.append(self.context_embeddings[i](neg_contexts[:, i])) 78 | emb_neg_context = torch.mean(torch.stack(emb_neg_contexts), axis=0) 79 | 80 | # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch 81 | score = torch.mul(emb_center, emb_context) # Get dot product (part 1) 82 | score = torch.sum(score, dim=1) # Get dot product (part2) 83 | score = torch.clamp(score, max=10, min=-10) 84 | score = -F.logsigmoid(score) # Get score for the positive pairs 85 | 86 | # Calculate negative score (for negative samples) 87 | neg_score = torch.bmm(emb_neg_context.view(emb_center.shape[0], -1, emb_center.shape[1]), 88 | emb_center.unsqueeze(2)).squeeze() # Get dot product 89 | neg_score = torch.clamp(neg_score, max=10, min=-10) 90 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 91 | 92 | # Return combined score 93 | return torch.mean(score + neg_score) 94 | 95 | def get_center_emb(self, centers): 96 | emb_centers = [] 97 | for row_idx, center in enumerate(centers): 98 | emb_center = [] 99 | for col_idx, center_ in enumerate(center): 100 | emb_center.append(self.center_embeddings[col_idx](center_)) 101 | 102 | emb_centers.append(torch.mean(torch.stack(emb_center), axis=0)) 103 | 104 | return torch.stack(emb_centers) 105 | 106 | def save_embeddings(self, file_name): 107 | embedding = self.center_embeddings.weight.cpu().data.numpy() 108 | np.save(file_name, embedding) 109 | -------------------------------------------------------------------------------- /src/ml/skipgram_with_meta_weighted.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from src.utils.logger import logger 7 | 8 | torch.manual_seed(1368) 9 | 10 | 11 | class SkipGram(nn.Module): 12 | 13 | def __init__(self, emb_sizes, emb_dim): 14 | super().__init__() 15 | self.emb_sizes = emb_sizes 16 | self.emb_dim = emb_dim 17 | 18 | # Create embedding layers 19 | self.center_embeddings = nn.ModuleList() 20 | for k, v in self.emb_sizes.items(): 21 | self.center_embeddings.append(nn.Embedding(v, emb_dim, 
sparse=True)) 22 | 23 | self.context_embeddings = nn.ModuleList() 24 | for k, v in self.emb_sizes.items(): 25 | self.context_embeddings.append(nn.Embedding(v, emb_dim, sparse=True)) 26 | 27 | # Create embedding weighting layer 28 | self.emb_weights = nn.Embedding(emb_sizes['product'], len(emb_sizes), 29 | sparse=True) # emb_sizes['product'] is total number of products 30 | self.emb_weights_softmax = nn.Softmax(dim=1) 31 | 32 | self.init_emb() 33 | 34 | logger.info('Model initialized: {}'.format(self)) 35 | 36 | def init_emb(self): 37 | """ 38 | Init embeddings like word2vec 39 | 40 | Center embeddings have uniform distribution in [-0.5/emb_dim , 0.5/emb_dim]. 41 | Context embeddings are initialized with 0s. 42 | 43 | Returns: 44 | 45 | """ 46 | emb_range = 0.5 / self.emb_dim 47 | 48 | # Initializing embeddings: 49 | # https://stackoverflow.com/questions/55276504/different-methods-for-initializing-embedding-layer-weights-in-pytorch 50 | for emb in self.center_embeddings: 51 | emb.weight.data.uniform_(-emb_range, emb_range) 52 | 53 | for emb in self.context_embeddings: 54 | emb.weight.data.uniform_(0, 0) 55 | 56 | emb_weights_init = 1 / len(self.emb_sizes) 57 | self.emb_weights.weight.data.uniform_(emb_weights_init) 58 | 59 | def get_embedding(self, nodes): 60 | embs = [] 61 | emb_weight = self.emb_weights(nodes[:, 0]) 62 | emb_weight_norm = self.emb_weights_softmax(emb_weight) 63 | 64 | for i in range(nodes.shape[1]): 65 | logger.debug('center i: {}'.format(i)) 66 | embs.append(self.center_embeddings[i](nodes[:, i])) 67 | emb_stack = torch.stack(embs) 68 | embs_weighted = emb_stack * emb_weight_norm.T.unsqueeze(2).expand_as(emb_stack) 69 | emb = torch.sum(embs_weighted, axis=0) 70 | 71 | return emb 72 | 73 | def forward(self, centers, contexts, neg_contexts): 74 | """ 75 | 76 | Args: 77 | center: List of center words 78 | context: List of context words 79 | neg_context: List of list of negative context words 80 | 81 | Returns: 82 | 83 | """ 84 | emb_center = self.get_embedding(centers) 85 | emb_context = self.get_embedding(contexts) 86 | 87 | neg_contexts = neg_contexts.view(-1, len(self.context_embeddings)) # Need to expand this first 88 | emb_neg_context = self.get_embedding(neg_contexts) 89 | 90 | # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch 91 | score = torch.mul(emb_center, emb_context) # Get dot product (part 1) 92 | score = torch.sum(score, dim=1) # Get dot product (part2) 93 | score = torch.clamp(score, max=10, min=-10) 94 | score = -F.logsigmoid(score) # Get score for the positive pairs 95 | 96 | # Calculate negative score (for negative samples) 97 | neg_score = torch.bmm(emb_neg_context.view(emb_center.shape[0], -1, emb_center.shape[1]), 98 | emb_center.unsqueeze(2)).squeeze() # Get dot product 99 | neg_score = torch.clamp(neg_score, max=10, min=-10) 100 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 101 | 102 | # Return combined score 103 | return torch.mean(score + neg_score) 104 | 105 | def get_center_emb(self, centers): 106 | emb_centers = [] 107 | for row_idx, center in enumerate(centers): 108 | emb_center = [] 109 | for col_idx, center_ in enumerate(center): 110 | emb_center.append(self.center_embeddings[col_idx](center_)) 111 | 112 | emb_centers.append(torch.mean(torch.stack(emb_center), axis=0)) 113 | 114 | return torch.stack(emb_centers) 115 | 116 | def save_embeddings(self, file_name): 117 | embedding = self.center_embeddings.weight.cpu().data.numpy() 118 | np.save(file_name, embedding) 119 | 
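The weighted variant above replaces the plain average of feature embeddings with a per-product softmax weighting (see get_embedding): a small emb_weights table, indexed by product ID, produces one weight per feature, and the stacked feature embeddings are summed with those weights. Below is a minimal, self-contained sketch of just that combination step, using toy sizes and made-up inputs (the names tables, feature_weights and nodes are illustrative, not from this repository):

import torch
import torch.nn as nn

# Toy sizes: 4 products, 3 feature tables (product ID, category, brand), 8-dim embeddings
n_products, n_features, emb_dim = 4, 3, 8

tables = nn.ModuleList([nn.Embedding(n_products, emb_dim) for _ in range(n_features)])
feature_weights = nn.Embedding(n_products, n_features)  # one weight per feature, looked up by product ID
softmax = nn.Softmax(dim=1)

nodes = torch.tensor([[0, 1, 2], [3, 0, 1]])  # (batch, n_features); column 0 is the product ID

w = softmax(feature_weights(nodes[:, 0]))                                   # (batch, n_features), rows sum to 1
stacked = torch.stack([tables[i](nodes[:, i]) for i in range(n_features)])  # (n_features, batch, emb_dim)
emb = (stacked * w.T.unsqueeze(2)).sum(dim=0)                               # (batch, emb_dim), weighted sum over features
print(emb.shape)  # torch.Size([2, 8])

Broadcasting the (n_features, batch, 1) weights against the (n_features, batch, emb_dim) stack gives the same result as the expand_as call in the model above.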
-------------------------------------------------------------------------------- /src/ml/train_gensim_embedding.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | from gensim.models import Word2Vec 6 | 7 | from src.config import MODEL_PATH 8 | from src.utils.logger import logger 9 | 10 | 11 | def load_sequences(sequence_path): 12 | """ 13 | Expects a numpy array at sequence_path 14 | 15 | Args: 16 | sequence_path: 17 | 18 | Returns: 19 | 20 | """ 21 | sequences = np.load(sequence_path) 22 | logger.info('Sequences shape: {}'.format(sequences.shape)) 23 | 24 | # Convert sequences to string and list of list 25 | sequences = sequences.astype(str).tolist() 26 | 27 | return sequences 28 | 29 | 30 | def train_embeddings(sequences, workers, dimension=128, window=5, min_count=1, negative=5, epochs=3, seed=42): 31 | # Logging specific to gensim training 32 | import logging 33 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 34 | 35 | # Initialize model 36 | model = Word2Vec(sequences, workers=workers, 37 | size=dimension, window=window, min_count=min_count, negative=negative, seed=seed) 38 | logger.info('Model initialized') 39 | 40 | # Train model (No need to retrain model as initialization includes training) 41 | # model.train(sequences, total_examples=len(sequences), epochs=epochs) 42 | # logger.info('Model trained!') 43 | 44 | return model 45 | 46 | 47 | def save_model(model): 48 | # Save model and keyedvectors 49 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 50 | model.save('{}/gensim-w2v-{}.model'.format(MODEL_PATH, current_datetime)) 51 | model.wv.save('{}/gensim-w2v-{}.kv'.format(MODEL_PATH, current_datetime)) 52 | 53 | 54 | if __name__ == '__main__': 55 | parser = argparse.ArgumentParser(description='Create embeddings using gensim package') 56 | parser.add_argument('read_path', type=str, help='Path to input sequences') 57 | parser.add_argument('n_workers', type=int, help='Number of workers') 58 | args = parser.parse_args() 59 | 60 | sequences = load_sequences(args.read_path) 61 | 62 | start_time = datetime.datetime.now() 63 | model = train_embeddings(sequences, workers=args.n_workers) 64 | end_time = datetime.datetime.now() 65 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 66 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 67 | save_model(model) 68 | -------------------------------------------------------------------------------- /src/ml/train_node2vec_embeddings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import networkx as nx 4 | from node2vec import Node2Vec 5 | 6 | from src.config import DATA_PATH 7 | from src.utils.logger import logger 8 | 9 | 10 | def train_embeddings(edgelist_path, embedding_path): 11 | # Create path 12 | graph = nx.read_weighted_edgelist(edgelist_path) 13 | logger.info('Graph created!') 14 | assert graph.get_edge_data('0000013714', '0005064295')['weight'] == 3.2, 'Expected edge weight of 3.2' 15 | 16 | # Precomput probabilities and generate walks 17 | node2vec = Node2Vec(graph, dimensions=128, walk_length=30, num_walks=10, workers=10, temp_folder=DATA_PATH) 18 | logger.info('Computed probabilities and generated walks') 19 | graph = None # We don't need graph anymore since probabilities have been precomputed 20 | 21 | # Embed nodes 22 | model = node2vec.fit(window=5, min_count=1, 
batch_words=128) 23 | logger.info('Nodes embedded') 24 | 25 | # Save embeddings for later use 26 | model.wv.save_word2vec_format(embedding_path) 27 | logger.info('Embedding saved') 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser(description='Create embeddings using node2vec package') 32 | parser.add_argument('read_path', type=str, help='Path to input (train) graph edgelist') 33 | parser.add_argument('write_path', type=str, help='Path to output embeddings') 34 | args = parser.parse_args() 35 | 36 | train_embeddings(args.read_path, args.write_path) 37 | -------------------------------------------------------------------------------- /src/ml/train_torch_embedding.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | import torch.nn.functional as F 8 | from sklearn.metrics import roc_auc_score 9 | from torch import optim 10 | from torch.utils.data import DataLoader 11 | 12 | from src.config import MODEL_PATH 13 | from src.ml.data_loader import Sequences, SequencesDataset 14 | from src.ml.skipgram import SkipGram 15 | from src.utils.logger import logger 16 | 17 | shuffle = True 18 | emb_dim = 128 19 | epochs = 5 20 | initial_lr = 0.025 21 | 22 | # Torch parameters 23 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 24 | torch.cuda.set_device(1) # Set to use 2nd GPU 25 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 26 | 27 | if __name__ == '__main__': 28 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 29 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 30 | parser.add_argument('val_path', type=str, help='Path to val.csv') 31 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 32 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 33 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 34 | args = parser.parse_args() 35 | 36 | # Initialize dataset 37 | sequences = Sequences(args.read_path, args.val_path) 38 | dataset = SequencesDataset(sequences) 39 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 40 | collate_fn=dataset.collate) 41 | 42 | # Initialize validation set 43 | val_samp = pd.read_csv(args.val_samp_path) 44 | 45 | # Get product ID 46 | word2id_func = np.vectorize(sequences.get_product_id) 47 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 48 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 49 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 50 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 51 | 52 | product1_id = val_samp['product1_id'].values 53 | product2_id = val_samp['product2_id'].values 54 | 55 | # Initialize model 56 | skipgram = SkipGram(sequences.n_unique_tokens, emb_dim).to(device) 57 | 58 | # Train loop 59 | optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr) 60 | 61 | results = [] 62 | start_time = datetime.datetime.now() 63 | for epoch in range(epochs): 64 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 65 | running_loss = 0 66 | 67 | # Training loop 68 | for i, batches in enumerate(dataloader): 69 | 70 | centers = batches[0].to(device) 71 | contexts = batches[1].to(device) 72 | neg_contexts = batches[2].to(device) 73 | 74 | optimizer.zero_grad() 75 | loss = skipgram.forward(centers, contexts, neg_contexts) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | product1_emb = skipgram.get_center_emb(torch.LongTensor(product1_id).to(device)) 86 | product2_emb = skipgram.get_center_emb(torch.LongTensor(product2_id).to(device)) 87 | cos_sim = F.cosine_similarity(product1_emb, product2_emb) 88 | score = roc_auc_score(val_samp['edge'], cos_sim.detach().cpu().numpy()) 89 | 90 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 91 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 92 | score, optimizer.param_groups[0]['lr'])) 93 | results.append([epoch, i, running_loss, score]) 94 | running_loss = 0 95 | 96 | # save model 97 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 98 | state_dict_path = '{}/skipgram_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 99 | torch.save(skipgram.state_dict(), state_dict_path) 100 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 101 | 102 | end_time = datetime.datetime.now() 103 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 104 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 105 | 106 | # Save results 107 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 108 | results_df.to_csv('{}/model_metrics_w2v.csv'.format(MODEL_PATH), index=False) 109 | -------------------------------------------------------------------------------- /src/ml/train_torch_embedding_with_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Does not do well as fill rate for meta data is low. Just considering product IDs, only 40% of the data is present. 3 | - Embedding dimensions: OrderedDict([('product', 418749), ('asin', 162024)]) 4 | 5 | This number is much lower when we consider category level 2 - 3, and brand. 
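(162,024 of the 418,749 products, roughly 40%, have an asin entry; products without metadata fall back to the default index 0 returned by get_meta.)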
6 | """ 7 | import argparse 8 | import datetime 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | import torch.nn.functional as F 14 | from sklearn.metrics import roc_auc_score 15 | from torch import optim 16 | from torch.utils.data import DataLoader 17 | 18 | from src.config import MODEL_PATH 19 | from src.ml.data_loader_with_meta import Sequences, SequencesDataset 20 | from src.ml.skipgram_with_meta_weighted import SkipGram 21 | from src.utils.logger import logger 22 | 23 | shuffle = True 24 | emb_dim = 128 25 | epochs = 5 26 | initial_lr = 0.025 27 | 28 | # Torch parameters 29 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 30 | torch.cuda.set_device(1) # Set to use 2nd GPU 31 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 32 | 33 | if __name__ == '__main__': 34 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 35 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 36 | parser.add_argument('val_path', type=str, help='Path to val.csv') 37 | parser.add_argument('meta_path', type=str, help='Path to meta.csv') 38 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 39 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 40 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 41 | args = parser.parse_args() 42 | 43 | # Initialize dataset 44 | sequences = Sequences(args.read_path, args.val_path, args.meta_path) 45 | dataset = SequencesDataset(sequences) 46 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 47 | collate_fn=dataset.collate) 48 | 49 | # Initialize validation set 50 | val_samp = pd.read_csv(args.val_samp_path) 51 | 52 | # Get product ID 53 | word2id_func = np.vectorize(sequences.get_product_id) 54 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 55 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 56 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 57 | 58 | 59 | def get_id_and_meta(product_id): 60 | return [product_id] + sequences.get_meta(product_id) 61 | 62 | 63 | val_product1 = val_samp['product1_id'].apply(get_id_and_meta) 64 | val_product2 = val_samp['product2_id'].apply(get_id_and_meta) 65 | 66 | # Initialize model 67 | skipgram = SkipGram(sequences.emb_sizes, emb_dim).to(device) 68 | 69 | # Train loop 70 | optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr) 71 | 72 | results = [] 73 | start_time = datetime.datetime.now() 74 | for epoch in range(epochs): 75 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 76 | running_loss = 0 77 | 78 | # Training loop 79 | for i, batches in enumerate(dataloader): 80 | 81 | centers = batches[0].to(device) 82 | contexts = batches[1].to(device) 83 | neg_contexts = batches[2].to(device) 84 | 85 | optimizer.zero_grad() 86 | loss = skipgram.forward(centers, contexts, neg_contexts) 87 | loss.backward() 88 | optimizer.step() 89 | 90 | scheduler.step() 91 | running_loss = running_loss * 0.9 + loss.item() * 0.1 92 | 93 | if i > 0 and i % 1000 == 0: 94 | # Validation Check 95 | with torch.no_grad(): 96 | product1_emb = skipgram.get_center_emb(torch.LongTensor(val_product1).to(device)) 97 | product2_emb = skipgram.get_center_emb(torch.LongTensor(val_product2).to(device)) 98 | cos_sim = F.cosine_similarity(product1_emb, product2_emb) 99 | score = roc_auc_score(val_samp['edge'], cos_sim.detach().cpu().numpy()) 100 | 101 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 102 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 103 | score, optimizer.param_groups[0]['lr'])) 104 | results.append([epoch, i, running_loss, score]) 105 | running_loss = 0 106 | 107 | # save model 108 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 109 | state_dict_path = '{}/skipgram_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 110 | torch.save(skipgram.state_dict(), state_dict_path) 111 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 112 | 113 | end_time = datetime.datetime.now() 114 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 115 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 116 | 117 | # Save results 118 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 119 | results_df.to_csv('{}/model_metrics_w2v_meta.csv'.format(MODEL_PATH), index=False) 120 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader import Sequences, SequencesDataset 13 | from src.ml.mf import MF 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | torch.cuda.set_device(1) # Set to use 2nd GPU 24 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 25 | 26 | if __name__ == '__main__': 27 | parser = 
argparse.ArgumentParser(description='Training embeddings on torch') 28 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 29 | parser.add_argument('val_path', type=str, help='Path to val.csv') 30 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 31 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 32 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 33 | args = parser.parse_args() 34 | 35 | # Initialize dataset 36 | sequences = Sequences(args.read_path, args.val_path) 37 | dataset = SequencesDataset(sequences) 38 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 39 | collate_fn=dataset.collate_for_mf) 40 | 41 | # Initialize validation set 42 | val_samp = pd.read_csv(args.val_samp_path) 43 | 44 | # Get product ID 45 | word2id_func = np.vectorize(sequences.get_product_id) 46 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 47 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 48 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 49 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 50 | 51 | product1_id = val_samp['product1_id'].values 52 | product2_id = val_samp['product2_id'].values 53 | 54 | # Initialize model 55 | mf = MF(sequences.n_unique_tokens, emb_dim).to(device) 56 | 57 | # Train loop 58 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 59 | 60 | results = [] 61 | start_time = datetime.datetime.now() 62 | for epoch in range(epochs): 63 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 64 | running_loss = 0 65 | 66 | # Training loop 67 | for i, batches in enumerate(dataloader): 68 | 69 | product1 = batches[0].to(device) 70 | product2 = batches[1].to(device) 71 | label = batches[2].to(device) 72 | 73 | optimizer.zero_grad() 74 | 75 | pred = mf.forward(product1, product2) 76 | loss = mf.loss(pred, label) 77 | loss.backward() 78 | optimizer.step() 79 | 80 | scheduler.step() 81 | running_loss = running_loss * 0.9 + loss.item() * 0.1 82 | 83 | if i > 0 and i % 1000 == 0: 84 | # Validation Check 85 | with torch.no_grad(): 86 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 87 | torch.LongTensor(val_samp['product2_id']).to(device)) 88 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 89 | 90 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 91 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 92 | score, optimizer.param_groups[0]['lr'])) 93 | results.append([epoch, i, running_loss, score]) 94 | running_loss = 0 95 | 96 | # save model 97 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 98 | state_dict_path = '{}/mf_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 99 | torch.save(mf.state_dict(), state_dict_path) 100 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 101 | 102 | end_time = datetime.datetime.now() 103 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 104 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 105 | 106 | # Save results 107 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 108 | results_df.to_csv('{}/model_metrics_mf.csv'.format(MODEL_PATH), index=False) 109 | 
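The torch training scripts in this repository all share the same loop scaffolding: a CosineAnnealingLR scheduler created with T_max = len(dataloader) and stepped once per batch, a running loss smoothed as 0.9 * previous + 0.1 * current, and an AUC-ROC check on the validation pairs every 1,000 batches (see images/cosine-annealing.png for the resulting learning-rate curve). A minimal sketch of that scaffolding with a toy model and made-up sizes, validation step omitted:

import torch
from torch import optim

model = torch.nn.Linear(8, 1)  # stand-in for the MF / skip-gram models used above
optimizer = optim.Adam(model.parameters(), lr=0.01)
n_batches = 10  # plays the role of len(dataloader)

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, n_batches)  # anneal over one epoch of batches
running_loss = 0
for i in range(n_batches):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 8)).pow(2).mean()  # dummy squared-output loss
    loss.backward()
    optimizer.step()
    scheduler.step()  # stepped per batch, as in the scripts
    running_loss = running_loss * 0.9 + loss.item() * 0.1  # exponentially smoothed loss
    print(i, round(running_loss, 4), optimizer.param_groups[0]['lr'])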
-------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader import Sequences, SequencesDataset 13 | from src.ml.mf_bias import MFBias 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | torch.cuda.set_device(0) # Set to use 2nd GPU 24 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 28 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 29 | parser.add_argument('val_path', type=str, help='Path to val.csv') 30 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 31 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 32 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 33 | args = parser.parse_args() 34 | 35 | # Initialize dataset 36 | sequences = Sequences(args.read_path, args.val_path) 37 | dataset = SequencesDataset(sequences) 38 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 39 | collate_fn=dataset.collate_for_mf) 40 | 41 | # Initialize validation set 42 | val_samp = pd.read_csv(args.val_samp_path) 43 | 44 | # Get product ID 45 | word2id_func = np.vectorize(sequences.get_product_id) 46 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 47 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 48 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 49 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 50 | 51 | product1_id = val_samp['product1_id'].values 52 | product2_id = val_samp['product2_id'].values 53 | 54 | # Initialize model 55 | mf = MFBias(sequences.n_unique_tokens, emb_dim).to(device) 56 | 57 | # Train loop 58 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 59 | 60 | results = [] 61 | start_time = datetime.datetime.now() 62 | for epoch in range(epochs): 63 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 64 | running_loss = 0 65 | 66 | # Training loop 67 | for i, batches in enumerate(dataloader): 68 | 69 | product1 = batches[0].to(device) 70 | product2 = batches[1].to(device) 71 | label = batches[2].to(device) 72 | 73 | optimizer.zero_grad() 74 | 75 | pred = mf.forward(product1, product2) 76 | loss = mf.loss(pred, label) 77 | loss.backward() 78 | optimizer.step() 79 | 80 | scheduler.step() 81 | running_loss = running_loss * 0.9 + loss.item() * 0.1 82 | 83 | if i > 0 and i % 1000 == 0: 84 | # Validation Check 85 | with torch.no_grad(): 86 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 87 | torch.LongTensor(val_samp['product2_id']).to(device)) 88 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 89 | 90 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 91 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 92 | score, optimizer.param_groups[0]['lr'])) 93 | results.append([epoch, i, running_loss, score]) 94 | running_loss = 0 95 | 96 | # save model 97 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 98 | state_dict_path = '{}/mf_bias_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 99 | torch.save(mf.state_dict(), state_dict_path) 100 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 101 | 102 | end_time = datetime.datetime.now() 103 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 104 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 105 | 106 | # Save results 107 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 108 | results_df.to_csv('{}/model_metrics_mf_bias.csv'.format(MODEL_PATH), index=False) 109 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias_continuous_edges.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_bias_continuous import MFBiasContinuous 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | 
parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate_continuous) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFBiasContinuous(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.predict(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_bias_continuous_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_bias_continuous_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias_edges.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import 
roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_bias import MFBias 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFBias(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_bias_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_bias_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_bias_edges_parallel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_bias import MFBias 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | 
parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFBias(edges.n_unique_tokens, emb_dim) 55 | if torch.cuda.device_count() > 1: 56 | logger.info('Detected {} GPUs, using them all'.format(torch.cuda.device_count())) 57 | mf = torch.nn.DataParallel(mf) 58 | mf.to(device) 59 | 60 | 61 | # Train loop 62 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 63 | 64 | results = [] 65 | start_time = datetime.datetime.now() 66 | for epoch in range(epochs): 67 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 68 | running_loss = 0 69 | 70 | # Training loop 71 | for i, batches in enumerate(dataloader): 72 | 73 | product1 = batches[0].to(device) 74 | product2 = batches[1].to(device) 75 | label = batches[2].to(device) 76 | 77 | optimizer.zero_grad() 78 | 79 | pred = mf.forward(product1, product2) 80 | # loss = mf.loss(pred, label) 81 | loss = mf.module.loss(pred, label) 82 | loss.backward() 83 | optimizer.step() 84 | 85 | scheduler.step() 86 | running_loss = running_loss * 0.9 + loss.item() * 0.1 87 | 88 | if i > 0 and i % 1000 == 0: 89 | # Validation Check 90 | with torch.no_grad(): 91 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 92 | torch.LongTensor(val_samp['product2_id']).to(device)) 93 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 94 | 95 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 96 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 97 | score, optimizer.param_groups[0]['lr'])) 98 | results.append([epoch, i, running_loss, score]) 99 | running_loss = 0 100 | 101 | # save model 102 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 103 | state_dict_path = '{}/mf_bias_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 104 | torch.save(mf.state_dict(), state_dict_path) 105 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 106 | 107 | end_time = datetime.datetime.now() 108 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 109 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 110 | 111 | # Save results 112 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 113 | results_df.to_csv('{}/model_metrics_mf_bias_edges.csv'.format(MODEL_PATH), index=False) 114 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_continuous_edges.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf_continuous import MFContinuous 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate_continuous) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. 
of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MFContinuous(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.predict(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_continuous_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_continuous_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/ml/train_torch_mf_edges.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import roc_auc_score 8 | from torch import optim 9 | from torch.utils.data import DataLoader 10 | 11 | from src.config import MODEL_PATH 12 | from src.ml.data_loader_edges import Edges, EdgesDataset 13 | from src.ml.mf import MF 14 | from src.utils.logger import logger 15 | 16 | shuffle = True 17 | emb_dim = 128 18 | epochs = 5 19 | initial_lr = 0.01 20 | 21 | # Torch parameters 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | logger.info('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr)) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(description='Training embeddings on torch') 27 | parser.add_argument('read_path', type=str, help='Path to sequences.npy') 28 | parser.add_argument('val_path', type=str, help='Path to val.csv') 29 | parser.add_argument('val_samp_path', type=str, help='Path to val_samp.csv') 30 | 
parser.add_argument('batch_size', type=int, help='Batchsize for dataloader') 31 | parser.add_argument('n_workers', type=int, help='Number of workers for dataloader') 32 | args = parser.parse_args() 33 | 34 | # Initialize dataset 35 | edges = Edges(args.read_path, args.val_path) 36 | dataset = EdgesDataset(edges) 37 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, num_workers=args.n_workers, 38 | collate_fn=dataset.collate) 39 | 40 | # Initialize validation set 41 | val_samp = pd.read_csv(args.val_samp_path) 42 | 43 | # Get product ID 44 | word2id_func = np.vectorize(edges.get_product_id) 45 | val_samp['product1_id'] = word2id_func(val_samp['product1'].values) 46 | val_samp['product2_id'] = word2id_func(val_samp['product2'].values) 47 | val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)] # Keep those with valid ID 48 | logger.info('No. of validation samples: {}'.format(val_samp.shape[0])) 49 | 50 | product1_id = val_samp['product1_id'].values 51 | product2_id = val_samp['product2_id'].values 52 | 53 | # Initialize model 54 | mf = MF(edges.n_unique_tokens, emb_dim).to(device) 55 | 56 | # Train loop 57 | optimizer = optim.Adam(mf.parameters(), lr=initial_lr) 58 | 59 | results = [] 60 | start_time = datetime.datetime.now() 61 | for epoch in range(epochs): 62 | scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader)) 63 | running_loss = 0 64 | 65 | # Training loop 66 | for i, batches in enumerate(dataloader): 67 | 68 | product1 = batches[0].to(device) 69 | product2 = batches[1].to(device) 70 | label = batches[2].to(device) 71 | 72 | optimizer.zero_grad() 73 | 74 | pred = mf.forward(product1, product2) 75 | loss = mf.loss(pred, label) 76 | loss.backward() 77 | optimizer.step() 78 | 79 | scheduler.step() 80 | running_loss = running_loss * 0.9 + loss.item() * 0.1 81 | 82 | if i > 0 and i % 1000 == 0: 83 | # Validation Check 84 | with torch.no_grad(): 85 | pred = mf.forward(torch.LongTensor(val_samp['product1_id']).to(device), 86 | torch.LongTensor(val_samp['product2_id']).to(device)) 87 | score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy()) 88 | 89 | logger.info("Epoch: {}, Seq: {:,}/{:,}, " \ 90 | "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss, 91 | score, optimizer.param_groups[0]['lr'])) 92 | results.append([epoch, i, running_loss, score]) 93 | running_loss = 0 94 | 95 | # save model 96 | current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M') 97 | state_dict_path = '{}/mf_edges_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime) 98 | torch.save(mf.state_dict(), state_dict_path) 99 | logger.info('Model state dict saved to {}'.format(state_dict_path)) 100 | 101 | end_time = datetime.datetime.now() 102 | time_diff = round((end_time - start_time).total_seconds() / 60, 2) 103 | logger.info('Total time taken: {:,} minutes'.format(time_diff)) 104 | 105 | # Save results 106 | results_df = pd.DataFrame(results, columns=['epoch', 'batches', 'loss', 'auc']) 107 | results_df.to_csv('{}/model_metrics_mf_edges.csv'.format(MODEL_PATH), index=False) 108 | -------------------------------------------------------------------------------- /src/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/parse/__init__.py -------------------------------------------------------------------------------- 
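Note on the model interface assumed above: the MF training scripts (e.g., src/ml/train_torch_mf_edges.py) only rely on the model exposing a constructor taking (n_unique_tokens, emb_dim), a forward(product1, product2) that returns one score per product pair, a loss(pred, label), and, for the continuous variant, a predict() used at validation time. The actual implementations live in src/ml/mf.py and src/ml/mf_continuous.py; the sketch below is purely illustrative of that assumed interface (dot product of two product embeddings as the edge score, binary cross-entropy as the loss) and is not the repo's code.

import torch
from torch import nn


class MFSketch(nn.Module):
    """Illustrative only -- a minimal stand-in for the interface used by the training scripts."""

    def __init__(self, n_tokens: int, emb_dim: int):
        super().__init__()
        self.embedding = nn.Embedding(n_tokens, emb_dim)
        self.bce = nn.BCEWithLogitsLoss()

    def forward(self, product1, product2):
        # Edge score = dot product of the two product embeddings
        return (self.embedding(product1) * self.embedding(product2)).sum(dim=-1)

    def loss(self, pred, label):
        # Binary edge label; a continuous variant would instead use a
        # regression loss (e.g., MSE) against the edge weight
        return self.bce(pred, label.float())

    def predict(self, product1, product2):
        # Probability-like score for validation metrics such as AUC-ROC
        with torch.no_grad():
            return torch.sigmoid(self.forward(product1, product2))

--------------------------------------------------------------------------------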
/src/parse/parse_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses the raw json data into a csv file for faster loading into pd.DataFrame. 3 | """ 4 | import argparse 5 | import csv 6 | import gzip 7 | from typing import List 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from pandas.api.types import is_object_dtype 12 | 13 | from src.utils.logger import logger 14 | 15 | 16 | def parse(path: str): 17 | g = gzip.open(path, 'rb') 18 | for l in g: 19 | yield eval(l) 20 | 21 | 22 | def parse_json_to_df(path: str) -> pd.DataFrame: 23 | i = 0 24 | df_dict = {} 25 | for d in parse(path): 26 | df_dict[i] = d 27 | i += 1 28 | if i % 10000 == 0: 29 | logger.info('Rows processed: {:,}'.format(i)) 30 | 31 | df = pd.DataFrame.from_dict(df_dict, orient='index') 32 | 33 | # Lowercase 34 | df['related'] = df['related'].astype(str) 35 | df['categories'] = df['categories'].astype(str) 36 | df['salesRank'] = df['salesRank'].astype(str) 37 | df = lowercase_df(df) 38 | 39 | return df 40 | 41 | 42 | # Lowercase Functions 43 | def lowercase_df(df: pd.DataFrame) -> pd.DataFrame: 44 | """ 45 | Lowercase characters from all columns in a dataframe. 46 | 47 | Args: 48 | df: Pandas dataframe 49 | 50 | Returns: 51 | Lowercased dataframe 52 | """ 53 | df = df.copy() 54 | for col in df.columns: 55 | if is_object_dtype(df[col]): 56 | df = lowercase_cols(df, [col]) 57 | return df 58 | 59 | 60 | def lowercase_cols(df: pd.DataFrame, colnames: List[str]) -> pd.DataFrame: 61 | """ 62 | Lowercase characters from specified columns in a dataframe 63 | 64 | Args: 65 | df: Pandas dataframe 66 | colnames (List): Names of columns to be lowercased 67 | 68 | Returns: Lowercased dataframe 69 | 70 | """ 71 | df = df.copy() 72 | for col in colnames: 73 | assert df[col].dtype != np.float64 and df[col].dtype != np.int64, \ 74 | 'Trying to lowercase a non-string column: {}'.format(col) 75 | df[col] = df[col].str.lower() 76 | return df 77 | 78 | 79 | def parse_json_to_csv(read_path: str, write_path: str) -> None: 80 | """ 81 | Note: This assumes that the first json in the path has all the keys, which could be WRONG 82 | 83 | Args: 84 | read_path: 85 | write_path: 86 | 87 | Returns: 88 | 89 | """ 90 | csv_writer = csv.writer(open(write_path, 'w')) 91 | i = 0 92 | for d in parse(read_path): 93 | if i == 0: 94 | header = d.keys() 95 | csv_writer.writerow(header) 96 | 97 | csv_writer.writerow([str(v).lower() for v in d.values()])  # Lowercase each value (dict_values has no .lower()) 98 | i += 1 99 | if i % 10000 == 0: 100 | logger.info('Rows processed: {:,}'.format(i)) 101 | 102 | logger.info('Csv saved to {}'.format(write_path)) 103 | 104 | 105 | if __name__ == '__main__': 106 | parser = argparse.ArgumentParser(description='Parsing json (gzipped) to csv') 107 | parser.add_argument('read_path', type=str, help='Path to input gzipped json') 108 | parser.add_argument('write_path', type=str, help='Path to output csv') 109 | args = parser.parse_args() 110 | 111 | df = parse_json_to_df(args.read_path) 112 | df.to_csv(args.write_path, index=False) 113 | logger.info('Csv saved to {}'.format(args.write_path)) 114 | -------------------------------------------------------------------------------- /src/prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/prep/__init__.py -------------------------------------------------------------------------------- /src/prep/prep_edges.py:
-------------------------------------------------------------------------------- 1 | """ 2 | Converts edge relationships (e.g., bought together, also bought) to numeric weights between two nodes. 3 | """ 4 | import argparse 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from src.utils.logger import logger 10 | 11 | relationship_weights = {'bought_together': 1.2, 12 | 'also_bought': 1.0, 13 | 'also_viewed': 0.5} 14 | 15 | 16 | def create_product_pair(df, col_list): 17 | pairs = df[col_list].values 18 | pairs.sort(axis=1) 19 | df['product_pair'] = ['|'.join(arr) for arr in pairs] 20 | 21 | return df 22 | 23 | 24 | def split_product_pair(product_pair): 25 | result = product_pair.split('|') 26 | return result[0], result[1] 27 | 28 | 29 | def get_relationship_weights(df, relationship_weights): 30 | df['weight'] = 0 31 | for relationship, weight in relationship_weights.items(): 32 | df.loc[df['relationship'] == relationship, 'weight'] += weight 33 | 34 | return df 35 | 36 | 37 | def get_edges(df): 38 | """ 39 | Returns a dataframe of products and the weights of the edges between them. 40 | 41 | Args: 42 | df: 43 | 44 | Returns: 45 | 46 | """ 47 | logger.info('Relationship distribution: \n{}'.format(df['relationship'].value_counts())) 48 | 49 | df = create_product_pair(df, col_list=['asin', 'related']) 50 | logger.info('Product pairs created') 51 | 52 | df = get_relationship_weights(df, relationship_weights) 53 | logger.info('Relationship weights updated') 54 | 55 | # Aggregate to remove duplicates 56 | logger.info('Original no. of edges: {:,}'.format(df.shape[0])) 57 | df = df.groupby('product_pair').agg({'weight': 'sum'}).reset_index() 58 | logger.info('Deduplicated no. of edges: {:,}'.format(df.shape[0])) 59 | 60 | # Save edge list 61 | df['product1'], df['product2'] = zip(*df['product_pair'].apply(split_product_pair)) 62 | 63 | df = df[['product1', 'product2', 'weight', 'product_pair']] 64 | return df 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = argparse.ArgumentParser(description='Preparing edges and associated weights') 69 | parser.add_argument('read_path', type=str, help='Path to input csv (of node relationships)') 70 | parser.add_argument('write_path', type=str, help='Path to output edges') 71 | parser.add_argument('--sample_size', type=int, help='Sample size (default: no sampling)', 72 | default=None) 73 | args = parser.parse_args() 74 | 75 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 76 | dtype={'asin': 'str', 'related': 'str'}) 77 | logger.info('DF shape: {}'.format(df.shape)) 78 | 79 | # Sample for development efficiency 80 | if args.sample_size: 81 | sample_idx = np.random.choice(df.shape[0], size=args.sample_size, replace=False) 82 | df = df.iloc[sample_idx] 83 | 84 | df = get_edges(df) 85 | 86 | df.to_csv(args.write_path, index=False) 87 | logger.info('Csv saved to {}'.format(args.write_path)) 88 | -------------------------------------------------------------------------------- /src/prep/prep_graph_samples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Builds a graph from the edges (training set) and performs random walk sampling from the graph 3 | - Currently returns 10 samples of sequence length 10 for each node (this is a parameter in create_random_walk_samples) 4 | """ 5 | import argparse 6 | import random 7 | 8 | import networkx 9 | import numpy as np 10 | import scipy as sp 11 | 12 | from src.config import DATA_PATH 13 | from src.utils.io_utils import save_model 14 | 
from src.utils.logger import logger 15 | 16 | 17 | def load_network(edgelist_path): 18 | graph = networkx.read_weighted_edgelist(edgelist_path) 19 | logger.info('No of nodes ({:,}) and edges ({:,})'.format(graph.number_of_nodes(), graph.number_of_edges())) 20 | 21 | # Get dictionary mapping of integer to nodes 22 | node_dict = {i: key for i, key in enumerate(graph.nodes.keys())} 23 | 24 | return graph, node_dict 25 | 26 | 27 | def create_transition_matrix(graph): 28 | """ 29 | https://stackoverflow.com/questions/37311651/get-node-list-from-random-walk-in-networkx 30 | https://stackoverflow.com/questions/15330380/probability-to-visit-nodes-in-a-random-walk-on-graph 31 | 32 | Args: 33 | graph: 34 | 35 | Returns: 36 | 37 | """ 38 | adjacency_mat = networkx.adj_matrix(graph) 39 | logger.info('Adjacency matrix shape: {}'.format(adjacency_mat.shape)) 40 | graph = None 41 | 42 | degree_vector = sp.sparse.csr_matrix(1 / np.sum(adjacency_mat, axis=0)) 43 | 44 | transition_matrix = adjacency_mat.multiply(degree_vector).T # Need to transpose so each row probability sum to 1 45 | logger.info('Transition matrix shape: {}'.format(transition_matrix.shape)) 46 | 47 | return transition_matrix 48 | 49 | 50 | def create_transition_dict(transition_matrix): 51 | transition_dict = {} 52 | rows, cols = transition_matrix.nonzero() 53 | 54 | # Create dictionary of transition product and probabilities for each product 55 | prev_row = -1 56 | for row, col in zip(rows, cols): 57 | if row != prev_row: 58 | transition_dict.setdefault(row, {}) 59 | transition_dict[row].setdefault('product', []) 60 | transition_dict[row].setdefault('probability', []) 61 | 62 | transition_dict[row]['product'].append(col) 63 | transition_dict[row]['probability'].append(transition_matrix[row, col]) 64 | prev_row = row 65 | 66 | return transition_dict 67 | 68 | 69 | def create_random_walk_samples(node_dict, transition_dict, samples_per_node=10, sequence_len=10): 70 | random.seed(42) 71 | n_nodes = len(node_dict) 72 | 73 | sample_array = np.zeros((n_nodes * samples_per_node, sequence_len), dtype=int) 74 | logger.info('Sample array shape: {}'.format(sample_array.shape)) 75 | 76 | # For each node 77 | for node_idx in range(n_nodes): 78 | 79 | if node_idx % 100000 == 0: 80 | logger.info('Getting samples for node: {:,}/{:,}'.format(node_idx, n_nodes)) 81 | 82 | # For each sample 83 | for sample_idx in range(samples_per_node): 84 | node = node_idx 85 | 86 | # For each event in sequence 87 | for seq_idx in range(sequence_len): 88 | sample_array[node_idx * samples_per_node + sample_idx, seq_idx] = node 89 | node = random.choices(population=transition_dict[node]['product'], 90 | weights=transition_dict[node]['probability'], k=1)[0] 91 | 92 | return sample_array 93 | 94 | 95 | def get_samples(edgelist_path): 96 | graph, node_dict = load_network(edgelist_path) 97 | logger.info('Network loaded') 98 | 99 | transition_matrix = create_transition_matrix(graph) 100 | logger.info('Transition matrix created') 101 | graph = None 102 | 103 | transition_dict = create_transition_dict(transition_matrix) 104 | logger.info('Transition dict created') 105 | transition_matrix = None 106 | 107 | sample_array = create_random_walk_samples(node_dict, transition_dict) 108 | logger.info('Random walk samples created') 109 | 110 | # Convert array of nodeIDs back to product IDs 111 | sample_array = np.vectorize(node_dict.get)(sample_array) 112 | logger.info('Converted back to product IDs') 113 | 114 | return sample_array, node_dict, transition_dict 115 | 116 | 117 | if __name__ 
== '__main__': 118 | parser = argparse.ArgumentParser(description='Preparing graph samples via random walk') 119 | parser.add_argument('read_path', type=str, help='Path to input graph edgelist') 120 | parser.add_argument('write_path', type=str, help='Path to output samples (.npy format)') 121 | parser.add_argument('graph_name', type=str, help='Name for node dict and transition dict') 122 | args = parser.parse_args() 123 | 124 | sample_array, node_dict, transition_dict = get_samples(args.read_path) 125 | 126 | np.save(args.write_path, sample_array) 127 | logger.info('Sample array saved to {}'.format(args.write_path)) 128 | sample_array = None 129 | 130 | save_model(node_dict, '{}/{}_node_dict.tar.gz'.format(DATA_PATH, args.graph_name)) 131 | node_dict = None 132 | 133 | save_model(transition_dict, '{}/{}_transition_dict.tar.gz'.format(DATA_PATH, args.graph_name)) 134 | transition_dict = None 135 | -------------------------------------------------------------------------------- /src/prep/prep_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses out the metadata from the original csv. 3 | """ 4 | import argparse 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from src.utils.logger import logger 10 | 11 | 12 | def get_category_lvl(category_list: list, lvl=0) -> str: 13 | try: 14 | return category_list[lvl] 15 | except IndexError: 16 | return 'NA_VALUE' 17 | 18 | 19 | def get_categories(df: pd.DataFrame) -> pd.DataFrame: 20 | df['category_lvl_1'] = df['categories'].apply(get_category_lvl, args=(0,)) 21 | df['category_lvl_2'] = df['categories'].apply(get_category_lvl, args=(1,)) 22 | df['category_lvl_3'] = df['categories'].apply(get_category_lvl, args=(2,)) 23 | df['category_lvl_4'] = df['categories'].apply(get_category_lvl, args=(3,)) 24 | logger.info('Categories lvl 1 - 4 prepared') 25 | 26 | return df 27 | 28 | 29 | def get_meta(df: pd.DataFrame) -> pd.DataFrame: 30 | # Update to reflect if relationship exist 31 | df['related'] = np.where(df['related'].isnull(), 0, 1) 32 | 33 | # Prep categories 34 | df['categories'] = df['categories'].apply(eval) 35 | df['categories'] = df['categories'].apply(lambda x: x[0]) # Get first category only 36 | df = get_categories(df) 37 | 38 | # Prep title and description 39 | # TODO: Add cleaning of title and description 40 | 41 | return df 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='Preparing item metadata') 46 | parser.add_argument('read_path', type=str, help='Path to input csv') 47 | parser.add_argument('write_path', type=str, help='Path to output csv (of metadata') 48 | args = parser.parse_args() 49 | 50 | META_COLS = ['asin', 'categories', 'title', 'description', 'price', 'brand', 'related'] 51 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 52 | dtype={'asin': 'str', 'title': 'str', 'brand': 'str'}, 53 | usecols=META_COLS) 54 | logger.info('DF shape: {}'.format(df.shape)) 55 | 56 | meta_df = get_meta(df) 57 | 58 | meta_df.to_csv(args.write_path, index=False) 59 | logger.info('Csv saved to {}'.format(args.write_path)) 60 | -------------------------------------------------------------------------------- /src/prep/prep_node_relationship.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses item to item relationships in 'related' field and explodes it such that each relationship is a single row. 
3 | """ 4 | import argparse 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from src.utils.logger import logger 10 | 11 | 12 | def get_also_bought_count(related): 13 | try: 14 | return len(related['also_bought']) 15 | except KeyError: 16 | return -1 17 | 18 | 19 | def explode_on_related(df: pd.DataFrame, relationship: str) -> pd.DataFrame: 20 | # Filter on relationship 21 | df = df[df['related'].apply(lambda x: relationship in x.keys())].copy() 22 | 23 | # Get value (list) from relationship dict 24 | df['related'] = df['related'].apply(lambda x: x[relationship]) 25 | 26 | # Explode efficiently using numpy 27 | vals = df['related'].values.tolist() 28 | lens = [len(val_list) for val_list in vals] 29 | vals_array = np.repeat(df['asin'], lens) 30 | exploded_df = pd.DataFrame(np.column_stack((vals_array, np.concatenate(vals))), columns=df.columns) 31 | 32 | # Add relationship 33 | exploded_df['relationship'] = relationship 34 | logger.info('Exploded for relationship: {}'.format(relationship)) 35 | 36 | return exploded_df 37 | 38 | 39 | def get_node_relationship(df: pd.DataFrame) -> pd.DataFrame: 40 | """ 41 | Returns a dataframe of products and their relationships (e.g., bought together, also bought, also viewed) 42 | 43 | Args: 44 | df: 45 | 46 | Returns: 47 | 48 | """ 49 | # Keep only rows with related data 50 | df = df[~df['related'].isnull()].copy() 51 | logger.info('DF shape after dropping empty related: {}'.format(df.shape)) 52 | 53 | df = df[~df['title'].isnull()].copy() 54 | logger.info('DF shape after dropping empty title: {}'.format(df.shape)) 55 | df = df[['asin', 'related']].copy() 56 | 57 | # Evaluate related str into dict 58 | df['related'] = df['related'].apply(eval) 59 | logger.info('Completed eval on "related" string') 60 | 61 | # Exclude products where also bought relationships less than 2 62 | df['also_bought_count'] = df['related'].apply(get_also_bought_count) 63 | df = df[df['also_bought_count'] >= 2].copy() 64 | logger.info('DF shape after dropping products with <2 edges: {}'.format(df.shape)) 65 | df.drop(columns='also_bought_count', inplace=True) 66 | 67 | # Explode columns 68 | bought_together_df = explode_on_related(df, relationship='bought_together') 69 | also_bought_df = explode_on_related(df, relationship='also_bought') 70 | also_viewed_df = explode_on_related(df, relationship='also_viewed') 71 | 72 | # Concatenate df 73 | combined_df = pd.concat([bought_together_df, also_bought_df, also_viewed_df], axis=0) 74 | logger.info('Distribution of relationships: \n{}'.format(combined_df['relationship'].value_counts())) 75 | 76 | return combined_df 77 | 78 | 79 | if __name__ == '__main__': 80 | parser = argparse.ArgumentParser(description='Preparing node relationships') 81 | parser.add_argument('read_path', type=str, help='Path to input csv') 82 | parser.add_argument('write_path', type=str, help='Path to output csv (of nodes relationships)') 83 | args = parser.parse_args() 84 | 85 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 86 | dtype={'asin': 'str', 'title': 'str', 'brand': 'str'}) 87 | logger.info('DF shape: {}'.format(df.shape)) 88 | 89 | exploded_df = get_node_relationship(df) 90 | 91 | exploded_df.to_csv(args.write_path, index=False) 92 | logger.info('Csv saved to {}'.format(args.write_path)) 93 | -------------------------------------------------------------------------------- /src/prep/train_val_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Splits all ground truth 
edges into train and validation set, with some constraints 3 | - The validation set should only contain edges where both products are in the train set 4 | 5 | For the validation set, negative samples are created by randomly selecting a pair of nodes and creating a negative edge. 6 | - From these samples, we exclude valid edges from either the train or validation set. 7 | """ 8 | import argparse 9 | from pathlib import Path 10 | from typing import Tuple 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.model_selection import train_test_split 15 | 16 | from src.config import DATA_PATH 17 | from src.prep.prep_edges import create_product_pair 18 | from src.utils.logger import logger 19 | 20 | 21 | def train_val_split(df, n_val_samples: int, filter_out_unseen: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]: 22 | if filter_out_unseen: 23 | # First split to get some test samples 24 | train, val = train_test_split(df, test_size=int(1.1 * n_val_samples), random_state=42) # Need slightly more 25 | logger.info('Train shape: {}, val shape: {}'.format(train.shape, val.shape)) 26 | 27 | # Get set of products in train 28 | train_product_set = set(train['product1']).union(set(train['product2'])) 29 | logger.info('No. of unique products in train: {:,}'.format(len(train_product_set))) 30 | 31 | # Only keep val where both products are in train product set 32 | val = val[(val['product1'].isin(train_product_set)) & (val['product2'].isin(train_product_set))] 33 | logger.info('Updated val shape: {}'.format(val.shape)) 34 | 35 | # Split again to only get n_val_samples 36 | val = val.iloc[:n_val_samples].copy() 37 | logger.info('Final val shape: {}'.format(val.shape)) 38 | 39 | # Get train set 40 | train = df[~df.index.isin(set(val.index))].copy() 41 | logger.info('Final train shape: {}'.format(train.shape)) 42 | 43 | else: 44 | # First split to get some test samples 45 | train, val = train_test_split(df, test_size=int(n_val_samples), random_state=42) 46 | logger.info('Train shape: {}, val shape: {}'.format(train.shape, val.shape)) 47 | 48 | return train, val 49 | 50 | 51 | def get_sample(item_array, n_iter=None, sample_size=2): 52 | np.random.seed(42) 53 | n = len(item_array) 54 | 55 | # find the index we last sampled from 56 | start_idx = (n_iter * sample_size) % n 57 | if (start_idx + sample_size >= n) or (start_idx <= sample_size): 58 | # shuffle array if we have reached the end and repeat again 59 | np.random.shuffle(item_array) 60 | 61 | return item_array[start_idx:start_idx + sample_size] 62 | 63 | 64 | def collect_samples(item_array, sample_size, n_samples): 65 | samples = [] 66 | 67 | for i in range(0, n_samples): 68 | if i % 1000000 == 0: 69 | logger.info('Neg sample: {:,}'.format(i)) 70 | 71 | sample = get_sample(item_array, n_iter=i, sample_size=sample_size) 72 | samples.append(sample) 73 | 74 | return samples 75 | 76 | 77 | def create_negative_edges(df, val, n_val_samples): 78 | # Get set of valid product edges (across both train and val) 79 | valid_product_pairs = set(df['product_pair']) 80 | logger.info('No. of valid product pairs: {:,}'.format(len(valid_product_pairs))) 81 | 82 | # Get set of products in val (to generate edges) 83 | val_product_arr = np.array(list(set(val['product1']).union(set(val['product2'])))) 84 | logger.info('No. 
of unique products in val: {:,}'.format(len(val_product_arr))) 85 | 86 | # Create negative samples 87 | neg_samples = collect_samples(val_product_arr, sample_size=2, n_samples=int(1.1 * n_val_samples)) 88 | neg_samples_df = pd.DataFrame(neg_samples, columns=['product1', 'product2']) 89 | neg_samples_df.dropna(inplace=True) 90 | neg_samples_df = create_product_pair(neg_samples_df, col_list=['product1', 'product2']) 91 | logger.info('No. of negative samples: {:,}'.format(neg_samples_df.shape[0])) 92 | 93 | # Exclude neg samples that are valid pairs 94 | neg_samples_df = neg_samples_df[~neg_samples_df['product_pair'].isin(valid_product_pairs)].copy() 95 | logger.info('Updated no. of negative samples: {:,}'.format(neg_samples_df.shape[0])) 96 | 97 | # Only keep no. of val samples required 98 | neg_samples_df = neg_samples_df.iloc[:n_val_samples].copy() 99 | logger.info('Final no. of negative samples: {:,}'.format(neg_samples_df.shape[0])) 100 | 101 | return neg_samples_df 102 | 103 | 104 | def combine_val_and_neg_edges(val, neg_samples): 105 | neg_samples['edge'] = 0 106 | val['edge'] = 1 107 | 108 | VAL_COLS = ['product1', 'product2', 'edge'] 109 | neg = neg_samples[VAL_COLS].copy() 110 | val = val[VAL_COLS].copy() 111 | logger.info('Val shape: {}, Neg edges shape: {}, Ratio: {}'.format(val.shape, neg.shape, 112 | val.shape[0] / (val.shape[0] + neg.shape[0]))) 113 | 114 | val = pd.concat([val, neg]) 115 | logger.info('Final val shape: {}'.format(val.shape)) 116 | 117 | return val 118 | 119 | 120 | def get_train_and_val(df, val_prop: float): 121 | """ 122 | Splits into training and validation set, where validation set has 50% negative edges 123 | 124 | Args: 125 | df: 126 | val_prop: 127 | 128 | Returns: 129 | 130 | """ 131 | n_val_samples = int(val_prop * df.shape[0]) 132 | logger.info('Eventual required val samples (proportion: {}): {:,}'.format(val_prop, n_val_samples)) 133 | 134 | train, val = train_val_split(df, n_val_samples) 135 | logger.info('Ratio of train to val: {:,}:{:,} ({:.2f})'.format(train.shape[0], val.shape[0], 136 | val.shape[0] / (train.shape[0] + val.shape[0]))) 137 | 138 | neg_samples = create_negative_edges(df, val, n_val_samples) 139 | 140 | val = combine_val_and_neg_edges(val, neg_samples) 141 | train = train[['product1', 'product2', 'weight']].copy() 142 | 143 | return train, val 144 | 145 | 146 | if __name__ == '__main__': 147 | parser = argparse.ArgumentParser(description='Splitting into train and val set') 148 | parser.add_argument('read_path', type=str, help='Path to input csv of edges') 149 | parser.add_argument('val_prop', type=float, help='Proportion of validation set (e.g., 0.33)') 150 | args = parser.parse_args() 151 | 152 | df = pd.read_csv(args.read_path, error_bad_lines=False, warn_bad_lines=True, 153 | dtype={'product1': 'str', 'product2': 'str'}) 154 | logger.info('DF shape: {}'.format(df.shape)) 155 | 156 | train, val = get_train_and_val(df, val_prop=args.val_prop) 157 | 158 | # Save to train, val, and train edgelist 159 | input_filename = Path(args.read_path).resolve().stem 160 | train.to_csv('{}/{}_train.csv'.format(DATA_PATH, input_filename), index=False) 161 | logger.info('Train saved as: {}/{}_train.csv'.format(DATA_PATH, input_filename)) 162 | val.to_csv('{}/{}_val.csv'.format(DATA_PATH, input_filename), index=False) 163 | logger.info('Val saved as: {}/{}_val.csv'.format(DATA_PATH, input_filename)) 164 | 165 | train.to_csv('{}/{}_train.edgelist'.format(DATA_PATH, input_filename), sep=' ', index=False, header=False) 166 | logger.info('Train 
edgelist saved as: {}/{}_train.edgelist'.format(DATA_PATH, input_filename)) 167 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import pickle 3 | from typing import Any 4 | 5 | from src.utils.logger import logger 6 | 7 | 8 | def save_model(model: Any, model_path: str) -> None: 9 | """ 10 | Saves model in gzip format 11 | 12 | Args: 13 | model: Model to be saved 14 | model_path: Path to save model to 15 | 16 | Returns: 17 | (None) 18 | """ 19 | with gzip.open(model_path, "wb") as f: 20 | pickle.dump(model, f) 21 | 22 | logger.info('Model saved to {}'.format(model_path)) 23 | 24 | 25 | def load_model(model_path: str) -> Any: 26 | """ 27 | Loads model from gzip format 28 | 29 | Args: 30 | model_path: Path to load model from 31 | 32 | Returns: 33 | 34 | """ 35 | with gzip.open(model_path, 'rb') as f: 36 | model = pickle.load(f) 37 | 38 | logger.info('Model loaded from: {}'.format(model_path)) 39 | return model 40 | -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | logger.setLevel(logging.INFO) 5 | formatter = logging.Formatter('%(asctime)s - %(message)s') 6 | 7 | # create console handler and set level to info 8 | ch = logging.StreamHandler() 9 | ch.setFormatter(formatter) 10 | ch.setLevel(logging.INFO) 11 | 12 | # add ch to logger 13 | logger.addHandler(ch) 14 | -------------------------------------------------------------------------------- /src/viz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eugeneyan/recsys-nlp-graph/43529b68f33016cffad19c7fd8807073285154f8/src/viz/__init__.py -------------------------------------------------------------------------------- /src/viz/plot_results.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import precision_recall_curve, roc_curve 5 | 6 | 7 | def plot_auc(label, score, title): 8 | precision, recall, thresholds = precision_recall_curve(label, score) 9 | plt.figure(figsize=(15, 5)) 10 | plt.grid() 11 | plt.plot(thresholds, precision[1:], color='r', label='Precision') 12 | plt.plot(thresholds, recall[1:], color='b', label='Recall') 13 | plt.gca().invert_xaxis() 14 | plt.legend(loc='lower right') 15 | 16 | plt.xlabel('Threshold (0.00 - 1.00)') 17 | plt.ylabel('Precision / Recall') 18 | _ = plt.title(title) 19 | 20 | 21 | def plot_roc(label, score, title): 22 | fpr, tpr, roc_thresholds = roc_curve(label, score) 23 | plt.figure(figsize=(5, 5)) 24 | plt.grid() 25 | plt.plot(fpr, tpr, color='b') 26 | 27 | plt.xlabel('False Positive Rate') 28 | plt.ylabel('True Positive Rate') 29 | _ = plt.title(title) 30 | 31 | 32 | def plot_tradeoff(label, score, title): 33 | precision, recall, thresholds = precision_recall_curve(label, score) 34 | plt.figure(figsize=(5, 5)) 35 | plt.grid() 36 | 
plt.step(recall, precision, color='b', label='Precision-Recall Trade-off') 37 | plt.fill_between(recall, precision, alpha=0.1, color='b') 38 | 39 | plt.xlabel('Recall') 40 | plt.ylabel('Precision') 41 | _ = plt.title(title) 42 | 43 | 44 | def plot_metrics(df, ylim=None): 45 | plt.figure(figsize=(15, 5)) 46 | plt.grid() 47 | plt.plot(df.index, df['auc'], label='AUC-ROC', color='black') 48 | 49 | # Plot learning rate resets 50 | lr_reset_batch = df[df['batches'] == df['batches'].max()] 51 | for idx in lr_reset_batch.index: 52 | plt.vlines(idx, df['auc'].min(), 1, label='LR reset (per epoch)', 53 | linestyles='--', colors='grey') 54 | 55 | # Plot legend 56 | handles, labels = plt.gca().get_legend_handles_labels() 57 | by_label = OrderedDict(zip(labels, handles)) 58 | _ = plt.legend(by_label.values(), by_label.keys(), loc='lower right') 59 | 60 | # Tidy axis 61 | if ylim: 62 | plt.ylim(ylim) 63 | else: 64 | plt.ylim(df['auc'].min() * 1.2, 0.96) 65 | plt.xlim(0, df.index.max()) 66 | plt.ylabel('AUC-ROC', size=12) 67 | plt.xlabel('Batches (over 5 epochs)', size=12) 68 | _ = plt.title('AUC-ROC on sample val set over 5 epochs', size=15) 69 | -------------------------------------------------------------------------------- /src/viz/prep_results.py: -------------------------------------------------------------------------------- 1 | def get_product_id(mapping): 2 | def func(x): 3 | return mapping.get(x, -1) 4 | return func --------------------------------------------------------------------------------
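Usage sketch for the plotting helpers above, assuming a completed run of src/ml/train_torch_mf_edges.py and the default MODEL_PATH from src/config.py: each training script writes a metrics csv with columns epoch, batches, loss, and auc, which is the layout plot_metrics expects.

import pandas as pd
import matplotlib.pyplot as plt

from src.config import MODEL_PATH
from src.viz.plot_results import plot_metrics

# Written by src/ml/train_torch_mf_edges.py at the end of training
results_df = pd.read_csv('{}/model_metrics_mf_edges.csv'.format(MODEL_PATH))

# AUC-ROC on the sampled validation set over batches, with per-epoch LR resets marked
plot_metrics(results_df)
plt.show()

--------------------------------------------------------------------------------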