├── src ├── __init__.py ├── train.sh ├── config.py ├── train_umap_embeddings.py ├── collect_co_occur_matrix.py ├── train_i2i_model.py ├── collect_miscellaneous_features.py ├── train_valid_split.py ├── collect_gbm_dataset.py ├── purchases_to_jrows.py ├── utils.py ├── train_lgb_model.py └── predictors.py ├── submit ├── solution │ ├── src │ ├── metadata.json │ └── server.py ├── custom_docker │ ├── Dockerfile │ └── README.md └── run_queries.py ├── slides └── retailhero_recommender.pdf ├── .gitignore ├── LICENSE └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /submit/solution/src: -------------------------------------------------------------------------------- 1 | ../../src -------------------------------------------------------------------------------- /src/train.sh: -------------------------------------------------------------------------------- 1 | python collect_gbm_dataset.py --N 100 --max-records 10 2 | python train_lgb_model.py --N 100 3 | -------------------------------------------------------------------------------- /slides/retailhero_recommender.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aprotopopov/retailhero_recommender/HEAD/slides/retailhero_recommender.pdf -------------------------------------------------------------------------------- /submit/solution/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "aprotopopov/ds-base:retailhero", 3 | "entry_point": "gunicorn --bind 0.0.0.0:8000 server:app" 4 | } -------------------------------------------------------------------------------- /submit/custom_docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-stretch 2 | RUN python3 -m pip install -U scikit-learn pandas scipy numpy gunicorn flask 3 | RUN python3 -m pip install -U implicit lightgbm catboost 4 | RUN python3 -m pip install -U feather-format 5 | RUN pip install -U faiss-cpu 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | tmp 3 | 4 | .idea/ 5 | .vscode/ 6 | 7 | **/.DS_Store 8 | **/__pycache__ 9 | **/*.pyc 10 | **/.ipynb_checkpoints 11 | **/*.csv 12 | **/*.log 13 | 14 | *.txt 15 | *.pkl 16 | *.zip 17 | *.cbm 18 | **/*.np 19 | **/*.npy 20 | **/*.npz 21 | **/*.pickle 22 | **/*.feather 23 | **/*.idx 24 | 25 | -------------------------------------------------------------------------------- /submit/custom_docker/README.md: -------------------------------------------------------------------------------- 1 | ## Custom docker image 2 | 3 | ### Build image locally 4 | ```bash 5 | docker build -t aprotopopov/ds-base:retailhero ./ 6 | ``` 7 | `aprotopopov/ds-base:retailhero` - docker image. Available via docker hub. 
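### Test the image locally

A quick smoke test before publishing: the sketch below starts the packaged Flask app with the same `gunicorn` entry point that `metadata.json` uses and checks the `/ready` endpoint. The `/repo` mount point and working directory are illustrative, and the run assumes the assets have already been generated into `submit/solution/assets`.

```bash
# run from the repository root; /repo is an arbitrary mount point inside the container
docker run --rm -p 8000:8000 \
    -v "$(pwd):/repo" -w /repo/submit/solution \
    aprotopopov/ds-base:retailhero \
    gunicorn --bind 0.0.0.0:8000 server:app

# in another shell: should print "OK"
curl http://localhost:8000/ready
```

Once the server answers, `submit/run_queries.py <url> <queryset.tsv> <max_queries>` can be pointed at `http://localhost:8000/recommend` to score it.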
8 | 9 | 10 | ### Publish image 11 | 12 | ```bash 13 | docker push aprotopopov/ds-base:retailhero 14 | ``` 15 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # all paths are relative to the ./src folder 2 | from pathlib import Path 3 | 4 | PURCHASE_CSV_PATH = "../data/raw/purchases.csv" 5 | CLIENT_CSV_PATH = "../data/raw/clients.csv" 6 | PRODUCT_CSV_PATH = "../data/raw/products.csv" 7 | CHECK_QUERY_PATH = "../data/raw/check_queries.tsv" 8 | JSONS_DIR = "../tmp/jsons/" 9 | MAX_CHUNKS = None 10 | NUM_SHARDS = 16 11 | ASSETS_DIR = Path("../submit/solution/assets") 12 | 13 | # determined from the check queries 14 | BASE_SPLIT_POINT = "2019-03-02 10:05:00" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /src/train_umap_embeddings.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import umap 7 | from scipy import sparse as sp 8 | from tqdm import tqdm 9 | 10 | import config as cfg 11 | from utils import ProductEncoder, get_shard_path, make_coo_row 12 | 13 | logging.basicConfig( 14 | level=logging.INFO, 15 | format="%(asctime)s - %(message)s", 16 | handlers=[logging.StreamHandler()], 17 | ) 18 | 19 | 20 | def get_train_data(max_rows=None): 21 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 22 | 23 | rows = [] 24 | num_rows = 0 25 | for shard_idx in tqdm(range(cfg.NUM_SHARDS)): 26 | for js in tqdm(json.loads(s) for s in open(get_shard_path(shard_idx))): 27 | rows.append( 28 | make_coo_row(js["transaction_history"], product_encoder, normalize=True) 29 | ) 30 | num_rows += 1 31 | 32 | if max_rows and num_rows == max_rows: 33 | return sp.vstack(rows) 34 | 35 | trans_mat = sp.vstack(rows) 36 | return trans_mat 37 | 38 | 39 | if __name__ == "__main__": 40 | logger = logging.getLogger(__name__) 41 | 42 | UMAP_MAX_ROWS = 1000 43 | trans_mat = get_train_data(UMAP_MAX_ROWS) 44 | 45 | umap_params = dict( 46 | random_state=14, metric="cosine", n_neighbors=10, low_memory=True 47 | ) 48 | 49 | logger.info("Training UMAP embeddings.") 50 | umap_items = umap.UMAP(**umap_params) 51 | item_embeddings = umap_items.fit_transform(trans_mat.T.tocsr()) 52 | 53 | filename = cfg.ASSETS_DIR / "umap_item_emb.npy" 54 | logger.info(f"Saving UMAP embeddings to {filename}") 55 | np.save(filename, item_embeddings) 56 | -------------------------------------------------------------------------------- /src/collect_co_occur_matrix.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | from scipy import sparse as sp 6 | from tqdm import tqdm 7 | 8 | import config as cfg 9 | from utils import ProductEncoder, get_shard_path 10 | 11 | 12 | def collect_cooccur_matrix(shard_indices, product_encoder): 13 | num_products = product_encoder.num_products 14 | co_occurrence = np.zeros((num_products, num_products)) 15 | occurrence = np.zeros(num_products) 16 | for shard_idx in tqdm(shard_indices): 17 | for js in tqdm((json.loads(s) for s in open(get_shard_path(shard_idx)))): 18 | tids = js.get("transaction_history", []) 19 | for tid in tids: 20 | product_ind = [ 21 | product_encoder.toIdx(item["product_id"]) 22 | for item in tid.get("products", []) 23 | ] 24 | for pid_num, pid in enumerate(product_ind): 25 | occurrence[pid] += 1 26 | for co_pid in product_ind[pid_num + 1 :]: 27 | co_occurrence[co_pid][pid] += 1 28 | co_occurrence[pid][co_pid] += 1 29 | return co_occurrence, occurrence 30 | 31 | 32 | if __name__ == "__main__": 33 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 34 | co_occurrence, occurrence = collect_cooccur_matrix( 35 | range(cfg.NUM_SHARDS), product_encoder 36 | ) 37 | 38 | # cut the low count records to reduce size and improve speed 39 | min_count = 5 40 | co_occurrence_sp = sp.csc_matrix( 41 | np.where(co_occurrence >= min_count, co_occurrence, 0), dtype=np.int32 42 | ) 43 | 44 | # not compressed for fast loading 45 | sp.save_npz( 46 | cfg.ASSETS_DIR / f"item_co_occurrence_min_cnt_{min_count}.npz", 47 | co_occurrence_sp, 48 | compressed=False, 49 | ) 50 | np.save(cfg.ASSETS_DIR / 
"item_occurrence.npy", occurrence) 51 | -------------------------------------------------------------------------------- /submit/solution/server.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "src") 3 | 4 | import datetime as dt 5 | from flask import Flask, jsonify, request 6 | from predictors import GBMPredictor 7 | from pathlib import Path 8 | 9 | app = Flask(__name__) 10 | 11 | ASSETS_DIR = Path("assets") 12 | PREDICTOR = GBMPredictor( 13 | lgbm_model_path=str(ASSETS_DIR / "lgbm_model.txt"), 14 | product_csv_path=ASSETS_DIR / "products.csv", 15 | model_pickled_path=ASSETS_DIR / "model_implicit_cosine_50.pkl", 16 | products_misc_path=ASSETS_DIR / "products_misc.csv", 17 | product_features_encoder_path=ASSETS_DIR / "product_features.pkl", 18 | implicit_tfidf_path=ASSETS_DIR / "model_implicit_tf_idf100.pkl", 19 | implicit_als_path=ASSETS_DIR / "model_implicit_als_16fact_12iter.pkl", 20 | fm_features_feather_path=ASSETS_DIR / "implicit_scores.feather", 21 | implicit_cosine2_path=ASSETS_DIR / "model_implicit_cosine2.pkl", 22 | umap_item_emb_path=ASSETS_DIR / "umap_item_emb.npy", 23 | item_co_occurrence_path=ASSETS_DIR / "item_co_occurrence_min_cnt_5.npz", 24 | item_occurrence_path=ASSETS_DIR / "item_occurrence.npy", 25 | user_prod_log_idf_path=ASSETS_DIR / "user_prod_log_idf.npy", 26 | tran_prod_log_idf_path=ASSETS_DIR / "tran_prod_log_idf.npy", 27 | N=100, 28 | trunk_svd_arr_path=ASSETS_DIR / "svd_128_components_T.npy", 29 | faiss_index_path=str(ASSETS_DIR / "faiss_base.idx"), 30 | train_scores_path=ASSETS_DIR / "X_scores_sparse.npz", 31 | faiss_neighbors=512, 32 | faiss_nprobe=16, 33 | ) 34 | 35 | @app.route("/ready") 36 | def ready(): 37 | return "OK" 38 | 39 | 40 | @app.route("/recommend", methods=["POST"]) 41 | def recommend(): 42 | r = request.json 43 | 44 | result = PREDICTOR.predict(r, PREDICTOR.lgb_model) 45 | return jsonify({"recommended_products": result}) 46 | 47 | 48 | if __name__ == "__main__": 49 | # Only for debugging while developing 50 | app.run(host="0.0.0.0", debug=True, port=8000) -------------------------------------------------------------------------------- /src/train_i2i_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import implicit 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import sparse as sp 10 | from tqdm import tqdm 11 | 12 | import config as cfg 13 | from utils import ( 14 | ProductEncoder, 15 | get_shard_path, 16 | make_coo_row, 17 | normalized_average_precision, 18 | ) 19 | 20 | if __name__ == "__main__": 21 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 22 | 23 | rows = [] 24 | for i in range(cfg.NUM_SHARDS - 1): 25 | for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))): 26 | rows.append( 27 | make_coo_row(js["transaction_history"], product_encoder, normalize=True) 28 | ) 29 | train_mat = sp.vstack(rows) 30 | 31 | model = implicit.nearest_neighbours.CosineRecommender(K=2) 32 | # model = implicit.nearest_neighbours.CosineRecommender(K=50) 33 | # model = implicit.nearest_neighbours.TFIDFRecommender(K=100) 34 | 35 | # ALS should be trained with normalize = False 36 | # model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12) 37 | model.fit(train_mat.T) 38 | 39 | out_dir = cfg.ASSETS_DIR 40 | os.makedirs(out_dir, exist_ok=True) 41 | print(f"Dump model to {out_dir}") 42 | pickle.dump(model, 
open(out_dir / "model.pkl", "wb")) 43 | 44 | print("Estimate quality...") 45 | scores = [] 46 | for js in tqdm((json.loads(s) for s in open(get_shard_path(cfg.NUM_SHARDS - 1)))): 47 | row = make_coo_row(js["transaction_history"], product_encoder).tocsr() 48 | raw_recs = model.recommend( 49 | userid=0, 50 | user_items=row, 51 | N=30, 52 | filter_already_liked_items=False, 53 | recalculate_user=True, 54 | ) 55 | 56 | recommended_items = product_encoder.toPid([idx for (idx, score) in raw_recs]) 57 | gt_items = js["target"][0]["product_ids"] 58 | nap = normalized_average_precision(gt_items, recommended_items) 59 | scores.append(nap) 60 | print("nap: {}".format(np.mean(scores))) 61 | -------------------------------------------------------------------------------- /src/collect_miscellaneous_features.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import sparse as sp 7 | from tqdm import tqdm 8 | 9 | import config as cfg 10 | from utils import ( 11 | ProductEncoder, 12 | get_shard_path, 13 | make_coo_row, 14 | normalized_average_precision, 15 | ) 16 | 17 | 18 | def update_item_cost(transaction_history, product_encoder, storage): 19 | for txn in transaction_history: 20 | for item in txn["products"]: 21 | key = product_encoder.toIdx(item["product_id"]) 22 | item_cost = item["s"] / max(item["quantity"], 1) 23 | if storage[key] == 0: 24 | storage[key] = item_cost 25 | else: 26 | storage[key] = (storage[key] + item_cost) / 2.0 27 | 28 | 29 | if __name__ == "__main__": 30 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 31 | num_products = product_encoder.num_products 32 | 33 | items_cost = defaultdict(int) 34 | rows = [] 35 | num_transactions = 0 36 | for i in tqdm(range(cfg.NUM_SHARDS)): 37 | for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))): 38 | update_item_cost(js["transaction_history"], product_encoder, items_cost) 39 | rows.append( 40 | make_coo_row( 41 | js["transaction_history"], product_encoder, normalize=False 42 | ) 43 | ) 44 | num_transactions += len(js["transaction_history"]) 45 | trans_mat = sp.vstack(rows) 46 | 47 | items_cnt = trans_mat.sum(axis=0).A[0] 48 | df_top_items = ( 49 | pd.Series(items_cnt, name="items_cnt").sort_values(ascending=False).to_frame() 50 | ) 51 | df_items_cost = pd.Series(items_cost, name="cost").to_frame() 52 | df_misc_features = df_top_items.join(df_items_cost) 53 | df_misc_features["popularity_position"] = range(num_products) 54 | 55 | df_misc_features.to_csv(cfg.ASSETS_DIR / "products_misc.csv") 56 | 57 | # iDF, products in user purchases 58 | bought_products = trans_mat.sum(axis=0).A[0] 59 | user_prod_log_idf = np.log(trans_mat.shape[0] / (bought_products + 1)) 60 | np.save(cfg.ASSETS_DIR / "user_prod_log_idf.npy", user_prod_log_idf) 61 | 62 | # iDF, products in transactions 63 | tran_prod_log_idf = np.log(num_transactions / (bought_products + 1)) 64 | np.save(cfg.ASSETS_DIR / "tran_prod_log_idf.npy", tran_prod_log_idf) 65 | -------------------------------------------------------------------------------- /src/train_valid_split.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import random 4 | from typing import Any, Dict 5 | 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | import config as cfg 10 | 11 | 12 | def transaction_to_target(transaction: Dict[str, Any]) -> Dict[str, Any]: 13 | return { 14 | 
"tid": transaction["tid"], 15 | "datetime": transaction["datetime"], 16 | "product_ids": [e["product_id"] for e in transaction["products"]], 17 | "store_id": transaction["store_id"], 18 | } 19 | 20 | 21 | def get_client_info(client_data_path: str) -> Dict[str, Dict]: 22 | client_info = {} 23 | for row in pd.read_csv(client_data_path).itertuples(): 24 | client_info[row.client_id] = { 25 | "age": row.age, 26 | "gender": row.gender, 27 | "client_id": row.client_id, 28 | } 29 | return client_info 30 | 31 | 32 | if __name__ == "__main__": 33 | random.seed(43) # lets be special 34 | 35 | client_csv_path = cfg.CLIENT_CSV_PATH 36 | jsons_root = cfg.JSONS_DIR 37 | 38 | client_info = get_client_info(client_csv_path) 39 | 40 | print("process shards") 41 | for js_path in tqdm(sorted(glob.glob(jsons_root + "/*.jsons"))): 42 | fout = open(js_path + ".splitted", "w") 43 | for js in (json.loads(s) for s in open(js_path)): 44 | sorted_transactions = sorted( 45 | js["transaction_history"], key=lambda x: x["datetime"] 46 | ) 47 | split_candidates = [ 48 | t["datetime"] 49 | for t in sorted_transactions 50 | if t["datetime"] > cfg.BASE_SPLIT_POINT 51 | ] 52 | if len(split_candidates) == 0: 53 | # no transactions after split points - so we cannot validates on this sample, skip it. 54 | continue 55 | split_point = random.choice(split_candidates) 56 | train_transactions = [ 57 | t for t in sorted_transactions if t["datetime"] < split_point 58 | ] 59 | test_transactons = [ 60 | t for t in sorted_transactions if t["datetime"] >= split_point 61 | ] 62 | 63 | # copy info about client% client_id, age, gender 64 | sample = {**client_info[js["client_id"]]} 65 | sample["transaction_history"] = train_transactions 66 | sample["target"] = [transaction_to_target(x) for x in test_transactons] 67 | 68 | fout.write(json.dumps(sample) + "\n") 69 | fout.close() 70 | -------------------------------------------------------------------------------- /submit/run_queries.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import requests 4 | import datetime as dt 5 | 6 | import numpy as np 7 | import tqdm 8 | 9 | 10 | def average_precision(actual, recommended, k=30): 11 | ap_sum = 0 12 | hits = 0 13 | for i in range(k): 14 | product_id = recommended[i] if i < len(recommended) else None 15 | if product_id is not None and product_id in actual: 16 | hits += 1 17 | ap_sum += hits / (i + 1) 18 | return ap_sum / k 19 | 20 | 21 | def normalized_average_precision(actual, recommended, k=30): 22 | actual = set(actual) 23 | if len(actual) == 0: 24 | return 0.0 25 | 26 | ap = average_precision(actual, recommended, k=k) 27 | ap_ideal = average_precision(actual, list(actual)[:k], k=k) 28 | return ap / ap_ideal 29 | 30 | 31 | def run_queries(url, queryset_file, max_queries=1000): 32 | ap_values = [] 33 | durations = [] 34 | 35 | total_records = 0 36 | with open(queryset_file) as fin: 37 | for line in fin: 38 | total_records += 1 39 | 40 | total_records = min(total_records, max_queries) 41 | 42 | with open(queryset_file) as fin: 43 | for i, line in enumerate(tqdm.tqdm(fin, total=total_records)): 44 | splitted = line.strip().split('\t') 45 | if len(splitted) == 1: 46 | query_data = json.loads(splitted[0]) 47 | next_transaction = query_data['target'][0] 48 | else: 49 | query_data, next_transaction = map(json.loads, splitted) 50 | 51 | start_time = dt.datetime.now() 52 | # resp = requests.post(url, json=query_data, timeout=0.3) 53 | resp = requests.post(url, json=query_data) 54 | 
duration = (dt.datetime.now() - start_time).total_seconds() 55 | durations.append(duration) 56 | resp.raise_for_status() 57 | resp_data = resp.json() 58 | 59 | if len(set(resp_data['recommended_products'])) < 30: 60 | print(query_data) 61 | print(resp_data) 62 | 63 | assert len(resp_data['recommended_products']) == 30 64 | assert len(set(resp_data['recommended_products'])) == 30 65 | assert all(isinstance(item, str) for item in resp_data['recommended_products']) 66 | assert "recommended_products" in resp_data 67 | 68 | ap = normalized_average_precision(next_transaction['product_ids'], resp_data['recommended_products']) 69 | ap_values.append(ap) 70 | 71 | if i >= max_queries: 72 | break 73 | 74 | map_score = sum(ap_values) / len(ap_values) 75 | print("Max time:", np.max(durations), "mean_time:", np.mean(durations), "min_time:", np.min(durations)) 76 | return map_score 77 | 78 | 79 | if __name__ == '__main__': 80 | url = sys.argv[1] # 'http://localhost:8000/recommend' 81 | queryset_file = sys.argv[2] # 'data/check_queries.tsv' 82 | max_queries = int(sys.argv[3]) # 1000 83 | score = run_queries(url, queryset_file, max_queries) 84 | print('Score:', score) 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 1st place solution [RetailHero.ai/#2](https://retailhero.ai/c/recommender_system/overview) 2 | 3 | ## Overview 4 | 5 | [solution presentation](https://github.com/aprotopopov/retailhero_recommender/tree/master/slides/retailhero_recommender.pdf) (in Russian) 6 | 7 | - Data preparation and the train/valid split are heavily based on the excellent [baseline](https://github.com/datagym-ru/retailhero-recomender-baseline) from [@geffy](https://github.com/geffy) 8 | - Co-occurrence of purchased items per transaction 9 | - UMAP embeddings of user transactions 10 | - Collecting miscellaneous features such as `item_cost`, `popularity position`, etc. 11 | - Dataset preparation, saving each chunk as a feather DataFrame 12 | - LightGBM training on a pool of items from the MF models, top products and history items 13 | 14 | ## Steps to prepare data 15 | 16 | 1. Copy data to `data/raw` 17 | 18 | ``` 19 | cd {REPO_ROOT} 20 | mkdir -p data/raw 21 | cp /path/to/unpacked/data/*.csv ./data/raw 22 | cd src 23 | ``` 24 | 25 | 2. Divide the source purchase data into 16 shards 26 | 27 | ```bash 28 | python purchases_to_jrows.py 29 | ``` 30 | 31 | 3. Prepare train/valid data with a structure similar to `check_queries.tsv` 32 | 33 | ```bash 34 | python train_valid_split.py 35 | ``` 36 | 37 | ## Train embeddings and collect statistics 38 | 39 | 1. Collect the co-occurrence matrix 40 | 41 | ```bash 42 | python collect_co_occur_matrix.py 43 | ``` 44 | 45 | 2. Build UMAP embeddings 46 | 47 | ```bash 48 | python train_umap_embeddings.py 49 | ``` 50 | 51 | 3. Collect miscellaneous features such as item popularity position, `item_cost` and iDF for items in transactions 52 | 53 | ```bash 54 | python collect_miscellaneous_features.py 55 | ``` 56 | 57 | ## Train models 58 | 59 | 1. Train item2item `implicit` models. The script has to be launched separately for each model 60 | 61 | ```bash 62 | python train_i2i_model.py 63 | ``` 64 | 65 | 2. Collect the dataset for GBM training 66 | 67 | ```bash 68 | python collect_gbm_dataset.py 69 | ``` 70 | 71 | 3. Train the LGBM model 72 | 73 | ```bash 74 | python train_lgb_model.py 75 | ``` 76 | 77 | 4. 
Create submission file 78 | 79 | ```bash 80 | cd submit 81 | zip -r model.zip solution/* 82 | ``` 83 | 84 | Joint collection and training parts can be launched via `bash train.sh` as well as via notebook `sandbox/GBM.ipynb` 85 | 86 | ## The best achieved results 87 | 88 | Scores (NMAP@30): 89 | Check: 0.1350 90 | Public: 0.1339 91 | Private: 0.148325 92 | Local: 0.155055 93 | 94 | [final submit](https://drive.google.com/file/d/17yR-klDIZ8vXvhTCIAEwkaBqXzuXagsg/view?usp=sharing). Code a bit messier and with less number of features but with better overall result. 95 | 96 | *Note: LightGBM wasn't tuned carefully so higher score is achievable with the current set of features* 97 | 98 | ## Other participants solutions 99 | 100 | - [3rd place](https://github.com/geffy/retailhero-recommender-solution) from [@geffy](https://github.com/geffy). [presentation](https://github.com/geffy/retailhero-recommender-solution/blob/master/slides/retailhero-conf.pdf) 101 | - [8th place](https://github.com/greenwolf-nsk/retailhero-rec) from [@greenwolf-nsk](https://github.com/greenwolf-nsk) 102 | - [9th place](https://github.com/mike-chesnokov/x5_retailhero_2020_recs) from [@mike-chesnokov](https://github.com/mike-chesnokov) 103 | -------------------------------------------------------------------------------- /src/collect_gbm_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import json 4 | import logging 5 | from pathlib import Path 6 | 7 | import feather 8 | import numpy as np 9 | import lightgbm as lgb 10 | import pandas as pd 11 | from scipy import sparse as sp 12 | from tqdm import tqdm 13 | 14 | import config as cfg 15 | from predictors import GBMFeatures, GBMPredictor 16 | from utils import ( 17 | ProductEncoder, 18 | make_coo_row, 19 | normalized_average_precision, 20 | get_shard_path, 21 | cache_to_feather, 22 | get_check_users, 23 | ) 24 | 25 | logging.basicConfig( 26 | level=logging.INFO, 27 | format="%(asctime)s - %(message)s", 28 | handlers=[logging.FileHandler("collect_dataset.log"), logging.StreamHandler()], 29 | ) 30 | 31 | 32 | def get_gbm_records(shard_indices, gbm_feat, max_records=None, **kwargs): 33 | check_users = get_check_users() 34 | 35 | gbm_records = [] 36 | num_records = 0 37 | 38 | for shard_idx in tqdm(shard_indices, leave=False): 39 | for js in tqdm( 40 | (json.loads(s) for s in open(get_shard_path(shard_idx))), leave=False 41 | ): 42 | if js["client_id"] in check_users: 43 | continue 44 | 45 | feat_records, _ = gbm_feat.get_gbm_features(js, train=True, **kwargs) 46 | gbm_records.extend(feat_records) 47 | num_records += 1 48 | 49 | if max_records and num_records >= max_records: 50 | return gbm_records 51 | 52 | return gbm_records 53 | 54 | 55 | def parse_args(): 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--N", type=int, default=100) 58 | parser.add_argument("--max-records", type=int, default=None) 59 | return parser.parse_args() 60 | 61 | 62 | if __name__ == "__main__": 63 | args = parse_args() 64 | N_POOL = args.N 65 | MAX_RECORDS = args.max_records 66 | 67 | logger = logging.getLogger(__name__) 68 | ASSETS_DIR = cfg.ASSETS_DIR 69 | NUM_TEST_SHARD = 15 70 | 71 | # to test pipeline 72 | SHARDS = [14] 73 | 74 | # full training 75 | # SHARDS = range(15) 76 | 77 | gbm_feat = GBMFeatures( 78 | product_csv_path=ASSETS_DIR / "products.csv", 79 | model_pickled_path=ASSETS_DIR / "model_implicit_cosine_50.pkl", 80 | products_misc_path=ASSETS_DIR / "products_misc.csv", 81 | 
product_features_encoder_path=ASSETS_DIR / "product_features.pkl", 82 | implicit_tfidf_path=ASSETS_DIR / "model_implicit_tf_idf100.pkl", 83 | implicit_als_path=ASSETS_DIR / "model_implicit_als_16fact_12iter.pkl", 84 | implicit_cosine2_path=ASSETS_DIR / "model_implicit_cosine2.pkl", 85 | umap_item_emb_path=ASSETS_DIR / "umap_item_emb.npy", 86 | item_co_occurrence_path=ASSETS_DIR / "item_co_occurrence_min_cnt_5.npz", 87 | item_occurrence_path=ASSETS_DIR / "item_occurrence.npy", 88 | user_prod_log_idf_path=ASSETS_DIR / "user_prod_log_idf.npy", 89 | tran_prod_log_idf_path=ASSETS_DIR / "tran_prod_log_idf.npy", 90 | N=N_POOL, 91 | # trunk_svd_arr_path=ASSETS_DIR / "svd_128_components_T.npy", 92 | # faiss_index_path=str(ASSETS_DIR / "faiss_base.idx"), 93 | # train_scores_path=ASSETS_DIR / "X_scores_sparse.npz", 94 | # faiss_neighbors=512, 95 | # faiss_nprobe=16, 96 | ) 97 | 98 | train_dir = Path(f"../tmp/train_chunks_{gbm_feat.N}") 99 | train_dir.mkdir(exist_ok=True) 100 | test_dir = Path(f"../tmp/test_chunks_{gbm_feat.N}") 101 | test_dir.mkdir(exist_ok=True) 102 | 103 | logger.info("Collecting train dataset") 104 | for num_shard in tqdm(SHARDS, leave=False): 105 | gbm_rec_train = get_gbm_records([num_shard], gbm_feat, max_records=MAX_RECORDS) 106 | df_gbm_train_chunk = pd.DataFrame(gbm_rec_train) 107 | 108 | train_shard_path = f"{train_dir}/df_train_{num_shard}.feather" 109 | logger.info(f"Saving train {num_shard} shard to {train_shard_path}") 110 | feather.write_dataframe(df_gbm_train_chunk, train_shard_path) 111 | 112 | del gbm_rec_train 113 | del df_gbm_train_chunk 114 | gc.collect() 115 | 116 | logger.info("Collect test dataset") 117 | gbm_rec_test = get_gbm_records([NUM_TEST_SHARD], gbm_feat, max_records=MAX_RECORDS) 118 | df_gbm_test = pd.DataFrame(gbm_rec_test) 119 | 120 | test_shard_path = f"{test_dir}/df_test_{num_shard}.feather" 121 | logger.info(f"Saving test {NUM_TEST_SHARD} shard to {test_shard_path}") 122 | feather.write_dataframe(df_gbm_test, test_dir / "df_test_15.feather") 123 | 124 | logger.info("Transform and save FM cached features to feather for faster loading") 125 | cache_to_feather(gbm_feat.cache_fm_feat) 126 | -------------------------------------------------------------------------------- /src/purchases_to_jrows.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | import config as cfg 8 | from utils import md5_hash 9 | 10 | 11 | class Transaction: 12 | def __init__(self, transaction_id, transaction_datetime, **kwargs): 13 | self.data = { 14 | **{"tid": transaction_id, "datetime": transaction_datetime, "products": []}, 15 | **kwargs, 16 | } 17 | 18 | def add_item( 19 | self, 20 | product_id: str, 21 | product_quantity: float, 22 | trn_sum_from_iss: float, 23 | trn_sum_from_red: float, 24 | ) -> None: 25 | p = { 26 | "product_id": product_id, 27 | "quantity": product_quantity, 28 | "s": trn_sum_from_iss, 29 | "r": "0" 30 | if trn_sum_from_red is None or pd.isna(trn_sum_from_red) 31 | else trn_sum_from_red, 32 | } 33 | self.data["products"].append(p) 34 | 35 | def as_dict(self,): 36 | return self.data 37 | 38 | def transaction_id(self,): 39 | return self.data["tid"] 40 | 41 | 42 | class ClientHistory: 43 | def __init__(self, client_id): 44 | self.data = {"client_id": client_id, "transaction_history": []} 45 | 46 | def add_transaction(self, transaction): 47 | self.data["transaction_history"].append(transaction) 48 | 49 | def as_dict(self,): 50 | return 
self.data 51 | 52 | def client_id(self,): 53 | return self.data["client_id"] 54 | 55 | 56 | class RowSplitter: 57 | def __init__(self, output_path, n_shards=cfg.NUM_SHARDS): 58 | self.n_shards = n_shards 59 | os.makedirs(output_path, exist_ok=True) 60 | self.outs = [] 61 | for i in range(self.n_shards): 62 | self.outs.append(open(output_path + "/{:02d}.jsons".format(i), "w")) 63 | self._client = None 64 | self._transaction = None 65 | 66 | def finish(self,): 67 | self.flush() 68 | for outs in self.outs: 69 | outs.close() 70 | 71 | def flush(self,): 72 | if self._client is not None: 73 | self._client.add_transaction(self._transaction.as_dict()) 74 | # rows are sharded by cliend_id 75 | shard_idx = md5_hash(self._client.client_id()) % self.n_shards 76 | data = self._client.as_dict() 77 | self.outs[shard_idx].write(json.dumps(data) + "\n") 78 | 79 | self._client = None 80 | self._transaction = None 81 | 82 | def consume_row(self, row): 83 | if self._client is not None and self._client.client_id() != row.client_id: 84 | self.flush() 85 | 86 | if self._client is None: 87 | self._client = ClientHistory(client_id=row.client_id) 88 | 89 | if ( 90 | self._transaction is not None 91 | and self._transaction.transaction_id() != row.transaction_id 92 | ): 93 | self._client.add_transaction(self._transaction.as_dict()) 94 | self._transaction = None 95 | 96 | if self._transaction is None: 97 | self._transaction = Transaction( 98 | transaction_id=row.transaction_id, 99 | transaction_datetime=row.transaction_datetime, 100 | regular_points_received=row.regular_points_received, 101 | express_points_received=row.express_points_received, 102 | regular_points_spent=row.regular_points_spent, 103 | express_points_spent=row.express_points_spent, 104 | purchase_sum=row.purchase_sum, 105 | store_id=row.store_id, 106 | ) 107 | 108 | self._transaction.add_item( 109 | product_id=row.product_id, 110 | product_quantity=row.product_quantity, 111 | trn_sum_from_iss=row.trn_sum_from_iss, 112 | trn_sum_from_red=row.trn_sum_from_red, 113 | ) 114 | 115 | 116 | def split_data_to_chunks( 117 | input_path, output_dir, n_shards=cfg.NUM_SHARDS, max_chunks=None 118 | ): 119 | splitter = RowSplitter(output_path=output_dir, n_shards=n_shards) 120 | print("split_data_to_chunks: {} -> {}".format(input_path, output_dir)) 121 | for i, df in enumerate(tqdm(pd.read_csv(input_path, chunksize=500000))): 122 | for row in df.itertuples(): 123 | splitter.consume_row(row) 124 | if max_chunks and i == max_chunks: 125 | splitter.finish() 126 | return 127 | splitter.finish() 128 | 129 | 130 | def calculate_unique_clients_from_input(input_path, max_chunks=None): 131 | client_set = set() 132 | print("calculate_unique_clients_from: {}".format(input_path)) 133 | for i, df in enumerate(tqdm(pd.read_csv(input_path, chunksize=500000))): 134 | client_set.update(set([row.client_id for row in df.itertuples()])) 135 | if max_chunks and i == max_chunks: 136 | break 137 | return len(client_set) 138 | 139 | 140 | def calculate_unique_clients_from_output(output_dir,): 141 | import glob 142 | 143 | client_cnt = 0 144 | print("calculate_unique_clients_from: {}".format(output_dir)) 145 | for js_file in glob.glob(output_dir + "/*.jsons"): 146 | for _ in open(js_file): 147 | client_cnt += 1 148 | return client_cnt 149 | 150 | 151 | if __name__ == "__main__": 152 | purchases_csv_path = cfg.PURCHASE_CSV_PATH 153 | output_jsons_dir = cfg.JSONS_DIR 154 | max_chunks = cfg.MAX_CHUNKS 155 | 156 | split_data_to_chunks( 157 | purchases_csv_path, 158 | output_jsons_dir, 159 | 
n_shards=cfg.NUM_SHARDS, 160 | max_chunks=max_chunks, 161 | ) 162 | 163 | # check splitting for correctness 164 | _from_input = calculate_unique_clients_from_input( 165 | purchases_csv_path, max_chunks=max_chunks 166 | ) 167 | _from_output = calculate_unique_clients_from_output(output_jsons_dir) 168 | assert _from_input == _from_output 169 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import hashlib 3 | from typing import List, Set 4 | from collections import defaultdict 5 | 6 | import feather 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import sparse as sp 10 | 11 | import config as cfg 12 | 13 | 14 | class ProductEncoder: 15 | def __init__(self, product_csv_path): 16 | self.product_idx = {} 17 | self.product_pid = {} 18 | for idx, pid in enumerate(pd.read_csv(product_csv_path).product_id.values): 19 | self.product_idx[pid] = idx 20 | self.product_pid[idx] = pid 21 | 22 | def toIdx(self, x): 23 | if type(x) == str: 24 | pid = x 25 | return self.product_idx[pid] 26 | return [self.product_idx[pid] for pid in x] 27 | 28 | def toPid(self, x): 29 | if type(x) == int: 30 | idx = x 31 | return self.product_pid[idx] 32 | return [self.product_pid[idx] for idx in x] 33 | 34 | @property 35 | def num_products(self): 36 | return len(self.product_idx) 37 | 38 | 39 | class ProductFeatEncoder: 40 | def __init__(self, product_csv_path, init=True, exclude=None): 41 | # self.products = pd.read_csv(product_csv_path).fillna(0) 42 | self.products = pd.read_csv(product_csv_path) 43 | self.products_map = defaultdict(dict) 44 | self.features = defaultdict(dict) 45 | if exclude: 46 | self.products = self.products.drop(exclude, axis=1) 47 | 48 | if init: 49 | self.create_encoder() 50 | self.create_features() 51 | 52 | def create_encoder(self): 53 | cols = self.products.dtypes[self.products.dtypes == "object"].index.tolist() 54 | global_idx = 0 55 | for col in cols: 56 | self.products_map[col] = defaultdict(dict) 57 | for idx, pid in enumerate(self.products[col].unique()): 58 | self.products_map[col]["pid"][pid] = idx 59 | self.products_map[col]["idx"][idx] = pid 60 | self.products_map[col]["gidx"][idx] = pid 61 | 62 | def create_features(self): 63 | for tup in self.products.set_index("product_id").itertuples(): 64 | for name, val in tup._asdict().items(): 65 | if name == "Index": 66 | continue 67 | 68 | val = ( 69 | self.products_map[name]["pid"][val] 70 | if name in self.products_map 71 | else val 72 | ) 73 | self.features[tup.Index][name] = val 74 | 75 | def product_features(self, ids): 76 | if type(ids) == str: 77 | return self.features[ids] 78 | return [self.features[product_id] for product_id in ids] 79 | 80 | def product_features_idx(self, ind): 81 | if type(ind) == str: 82 | return self.features[ind] 83 | return [ 84 | self.features[self.products_map["product_id"]["idx"][idx]] for idx in ind 85 | ] 86 | 87 | 88 | class TrainingSample: 89 | def __init__( 90 | self, row: sp.coo_matrix, target_items: Set[int], client_id: str = None 91 | ): 92 | self.row = row 93 | self.target_items = target_items 94 | self.client_id = client_id 95 | 96 | 97 | def make_coo_row( 98 | transaction_history, 99 | product_encoder: ProductEncoder, 100 | last_transaction=False, 101 | normalize=True, 102 | entity=False, 103 | ): 104 | idx = [] 105 | values = [] 106 | 107 | items = defaultdict(int) 108 | if last_transaction: 109 | transaction_history = 
transaction_history[-1:] 110 | 111 | for trans in transaction_history: 112 | for i in trans["products"]: 113 | pidx = product_encoder.toIdx(i["product_id"]) 114 | items[pidx] += 1.0 115 | n_items = sum(items.values()) 116 | 117 | for pidx, val in items.items(): 118 | idx.append(pidx) 119 | if normalize: 120 | val = val / n_items 121 | if entity: 122 | val = 1 123 | values.append(val) 124 | 125 | return sp.coo_matrix( 126 | (np.array(values).astype(np.float32), ([0] * len(idx), idx)), 127 | shape=(1, product_encoder.num_products), 128 | ) 129 | 130 | 131 | def cache_to_feather(cache_fm, num_products=43038): 132 | dfs = [] 133 | for key, item in cache_fm.items(): 134 | df_model = pd.Series(item).apply(pd.Series) 135 | dfs.append(df_model) 136 | 137 | df_scores = dfs[0] 138 | for df in dfs[1:]: 139 | df_scores = df_scores.join(df) 140 | 141 | df_scores = df_scores.sort_index().reindex(range(num_products)).fillna(0) 142 | feather.write_dataframe(df_scores, cfg.ASSETS_DIR / "implicit_scores.feather") 143 | 144 | 145 | def create_products_in_transaction( 146 | transaction_history, product_encoder: ProductEncoder, outfile 147 | ): 148 | """Collect item2vec file.""" 149 | for trans in transaction_history: 150 | products_str = " ".join( 151 | str(product_encoder.toIdx(i["product_id"])) for i in trans["products"] 152 | ) 153 | outfile.write(products_str + "\n") 154 | outfile.flush() 155 | 156 | 157 | def update_item_cost(transaction_history, product_encoder, storage): 158 | for txn in transaction_history: 159 | for item in txn["products"]: 160 | key = product_encoder.toIdx(item["product_id"]) 161 | item_cost = item["s"] / max(item["quantity"], 1) 162 | 163 | if storage[key] == 0: 164 | storage[key] = item_cost 165 | else: 166 | storage[key] = (storage[key] + item_cost) / 2.0 167 | 168 | 169 | def average_precision(actual, recommended, k=30): 170 | ap_sum = 0 171 | hits = 0 172 | for i in range(k): 173 | product_id = recommended[i] if i < len(recommended) else None 174 | if product_id is not None and product_id in actual: 175 | hits += 1 176 | ap_sum += hits / (i + 1) 177 | return ap_sum / k 178 | 179 | 180 | def normalized_average_precision(actual, recommended, k=30): 181 | actual = set(actual) 182 | if len(actual) == 0: 183 | return 0.0 184 | 185 | ap = average_precision(actual, recommended, k=k) 186 | ap_ideal = average_precision(actual, list(actual)[:k], k=k) 187 | return ap / ap_ideal 188 | 189 | 190 | def recall_k(actual, recommended, k=30): 191 | return len(set(actual).intersection(set(recommended[:k]))) / max( 192 | len(set(actual)), 1 193 | ) 194 | 195 | 196 | def get_shard_path(n_shard, jsons_dir=cfg.JSONS_DIR): 197 | return "{}/{:02d}.jsons.splitted".format(jsons_dir, n_shard) 198 | 199 | 200 | def md5_hash(x): 201 | return int(hashlib.md5(x.encode()).hexdigest(), 16) 202 | 203 | 204 | def get_check_users(): 205 | check_users = [] 206 | with open(cfg.CHECK_QUERY_PATH) as f: 207 | for line in f: 208 | query_data, _ = line.strip().split("\t") 209 | client_id = json.loads(query_data)["client_id"] 210 | check_users.append(client_id) 211 | return check_users 212 | -------------------------------------------------------------------------------- /src/train_lgb_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import json 4 | import logging 5 | import pprint 6 | import sys 7 | from pathlib import Path 8 | 9 | import feather 10 | import numpy as np 11 | import lightgbm as lgb 12 | import pandas as pd 13 | from scipy 
import sparse as sp 14 | from tqdm import tqdm 15 | 16 | import config as cfg 17 | from predictors import GBMFeatures, GBMPredictor 18 | from utils import ( 19 | ProductEncoder, 20 | make_coo_row, 21 | normalized_average_precision, 22 | get_shard_path, 23 | cache_to_feather, 24 | ) 25 | 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format="%(asctime)s - %(message)s", 29 | handlers=[logging.FileHandler("lgb_model.log"), logging.StreamHandler()], 30 | ) 31 | 32 | 33 | def evalute_queries(queryset_file, max_records=1000): 34 | check_scores = [] 35 | with open(queryset_file) as fin: 36 | for i, line in enumerate(tqdm(fin)): 37 | splitted = line.strip().split("\t") 38 | if len(splitted) == 1: 39 | query_data = json.loads(splitted[0]) 40 | next_transaction = query_data["target"][0] 41 | else: 42 | query_data, next_transaction = map(json.loads, splitted) 43 | query_data["target"] = [next_transaction] 44 | 45 | query_data["transaction_history"] = sorted( 46 | query_data["transaction_history"], key=lambda x: x["datetime"] 47 | ) 48 | recommended_items = PREDICTOR.predict(query_data, PREDICTOR.lgb_model) 49 | 50 | gt_items = query_data["target"][0]["product_ids"] 51 | nap = normalized_average_precision(gt_items, recommended_items) 52 | check_scores.append(nap) 53 | 54 | if i == max_records: 55 | break 56 | return np.mean(check_scores) 57 | 58 | 59 | def parse_args(): 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument("--N", type=int, default=100) 62 | return parser.parse_args() 63 | 64 | 65 | if __name__ == "__main__": 66 | args = parse_args() 67 | N_POOL = args.N 68 | 69 | ASSETS_DIR = cfg.ASSETS_DIR 70 | SHARDS = range(cfg.NUM_SHARDS - 1) 71 | NUM_TEST_SHARD = cfg.NUM_SHARDS - 1 72 | 73 | TRAIN_DIR = Path(f"../tmp/train_chunks_{N_POOL}") 74 | TEST_DIR = Path(f"../tmp/test_chunks_{N_POOL}") 75 | logger = logging.getLogger(__name__) 76 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 77 | 78 | logger.info("Loading train dataset") 79 | dfs = [] 80 | for num_shard in tqdm(SHARDS, leave=False): 81 | if Path(f"{TRAIN_DIR}/df_train_{num_shard}.feather").exists(): 82 | dfs.append( 83 | feather.read_dataframe(f"{TRAIN_DIR}/df_train_{num_shard}.feather") 84 | ) 85 | 86 | logger.info("Join chunks to full train dataframe") 87 | df_gbm_train = pd.concat(dfs, sort=False) 88 | logger.info(f"Shape of the train dataframe {df_gbm_train.shape}") 89 | 90 | del dfs 91 | gc.collect() 92 | 93 | logger.info("Loading test dataset") 94 | df_gbm_test = feather.read_dataframe(TEST_DIR / f"df_test_{NUM_TEST_SHARD}.feather") 95 | gt_all_rec_test = [] 96 | for js in tqdm( 97 | (json.loads(s) for s in open(get_shard_path(NUM_TEST_SHARD))), leave=False 98 | ): 99 | target_products = set( 100 | product_encoder.toIdx([pid for pid in js["target"][0]["product_ids"]]) 101 | ) 102 | gt_products = dict(client_id=js["client_id"], products=list(target_products)) 103 | gt_all_rec_test.append(gt_products) 104 | logger.info(f"Shape of the test dataframe {df_gbm_test.shape}") 105 | 106 | logger.info("Add query_id column") 107 | df_gbm_train["query_id"] = df_gbm_train.groupby("client_id").ngroup() 108 | df_gbm_test["query_id"] = df_gbm_test.groupby("client_id").ngroup() 109 | 110 | logger.info("Build LGB datasets") 111 | drop_cols = ["client_id", "target", "query_id"] 112 | train_ds = lgb.Dataset( 113 | df_gbm_train.drop(drop_cols, errors="ignore", axis=1), 114 | df_gbm_train["target"], 115 | group=df_gbm_train["query_id"].value_counts().sort_index().values, 116 | ) 117 | test_ds = lgb.Dataset( 118 | 
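        # `group` below holds the number of candidate rows per client query; LightGBM
        # needs these group sizes to evaluate the ranking metric ("map" at eval_at=30)
        # per user, and the train dataset above is built the same way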
df_gbm_test.drop(drop_cols, errors="ignore", axis=1), 119 | df_gbm_test["target"], 120 | group=df_gbm_test["query_id"].value_counts().sort_index().values, 121 | ) 122 | 123 | lgb_params = dict( 124 | objective="binary", 125 | # objective='lambdarank', 126 | max_depth=12, 127 | random_state=42, 128 | learning_rate=0.05, 129 | lambda_l2=10, 130 | metric=("binary", "map"), 131 | eval_at=30, 132 | max_bin=63, 133 | first_metric_only=True, 134 | ) 135 | num_boost_round = 6000 136 | logger.info("LGB params:\n%s", pprint.pformat(lgb_params)) 137 | 138 | gbm = lgb.train( 139 | lgb_params, 140 | train_ds, 141 | num_boost_round, 142 | valid_sets=(train_ds, test_ds), 143 | verbose_eval=10, 144 | early_stopping_rounds=100, 145 | ) 146 | 147 | drop_cols = ["client_id", "target", "lgb_scores", "query_id"] 148 | lgb_scores = gbm.predict(df_gbm_test.drop(drop_cols, axis=1, errors="ignore")) 149 | df_gbm_test["lgb_scores"] = lgb_scores 150 | 151 | lgb_ranked = ( 152 | df_gbm_test.groupby("client_id")[["idx", "lgb_scores"]] 153 | .apply( 154 | lambda x: x.sort_values("lgb_scores", ascending=False)[:30]["idx"].tolist() 155 | ) 156 | .to_dict() 157 | ) 158 | 159 | gt_test = {item["client_id"]: item["products"] for item in gt_all_rec_test} 160 | scores = [] 161 | for client_id, recommended_idx in lgb_ranked.items(): 162 | ap = normalized_average_precision(gt_test[client_id], recommended_idx) 163 | scores.append(ap) 164 | model_score = np.mean(scores) 165 | logger.info(f"Test score: {model_score}") 166 | 167 | params_str = "__".join( 168 | "_".join(map(str, item)) for item in gbm.params.items() if item[0] != "metric" 169 | ) 170 | model_filename = f"lgbm_model__pool_{N_POOL}__{params_str}__{model_score:.6f}.txt" 171 | model_path = str(ASSETS_DIR / model_filename) 172 | gbm.save_model(model_path) 173 | logger.info(f"Model was saved to {model_path}") 174 | 175 | # Check predictor 176 | PREDICTOR = GBMPredictor( 177 | lgbm_model_path=str(ASSETS_DIR / model_filename), 178 | product_csv_path=ASSETS_DIR / "products.csv", 179 | model_pickled_path=ASSETS_DIR / "model_implicit_cosine_50.pkl", 180 | products_misc_path=ASSETS_DIR / "products_misc.csv", 181 | product_features_encoder_path=ASSETS_DIR / "product_features.pkl", 182 | implicit_tfidf_path=ASSETS_DIR / "model_implicit_tf_idf100.pkl", 183 | implicit_als_path=ASSETS_DIR / "model_implicit_als_16fact_12iter.pkl", 184 | fm_features_feather_path=ASSETS_DIR / "implicit_scores.feather", 185 | implicit_cosine2_path=ASSETS_DIR / "model_implicit_cosine2.pkl", 186 | umap_item_emb_path=ASSETS_DIR / "umap_item_emb.npy", 187 | item_co_occurrence_path=ASSETS_DIR / "item_co_occurrence_min_cnt_5.npz", 188 | item_occurrence_path=ASSETS_DIR / "item_occurrence.npy", 189 | user_prod_log_idf_path=ASSETS_DIR / "user_prod_log_idf.npy", 190 | tran_prod_log_idf_path=ASSETS_DIR / "tran_prod_log_idf.npy", 191 | N=N_POOL, 192 | # trunk_svd_arr_path=ASSETS_DIR / "svd_128_components_T.npy", 193 | # faiss_index_path=str(ASSETS_DIR / "faiss_base.idx"), 194 | # train_scores_path=ASSETS_DIR / "X_scores_sparse.npz", 195 | # faiss_neighbors=512, 196 | # faiss_nprobe=8, 197 | ) 198 | 199 | # check queries 200 | check_queryset_file = cfg.CHECK_QUERY_PATH 201 | logger.info(f"Evaluating check queries {check_queryset_file}") 202 | check_score = evalute_queries(check_queryset_file) 203 | logger.info(f"Check score: {check_score}") 204 | 205 | # test queries 206 | max_records = 1000 207 | queryset_file = f"{cfg.JSONS_DIR}/{NUM_TEST_SHARD}.jsons.splitted" 208 | logger.info( 209 | f"Evaluating test queries 
{queryset_file} with {max_records} max_records" 210 | ) 211 | test_score = evalute_queries(queryset_file, max_records=max_records) 212 | logger.info(f"Test score: {test_score}") 213 | -------------------------------------------------------------------------------- /src/predictors.py: -------------------------------------------------------------------------------- 1 | import random 2 | import datetime as dt 3 | import itertools as it 4 | import pickle 5 | from collections import defaultdict, Counter 6 | 7 | import faiss 8 | import lightgbm as lgb 9 | import math 10 | import numpy as np 11 | import pandas as pd 12 | import feather 13 | from catboost import CatBoost 14 | from scipy import sparse as sp 15 | 16 | from utils import ProductEncoder, ProductFeatEncoder, make_coo_row 17 | 18 | 19 | class GBMFeatures: 20 | def __init__( 21 | self, 22 | product_csv_path, 23 | model_pickled_path, 24 | products_misc_path, 25 | implicit_tfidf_path=None, 26 | product_features_encoder_path=None, 27 | implicit_als_path=None, 28 | N=100, 29 | cache_fm_path=None, 30 | implicit_cosine2_path=None, 31 | fm_features_feather_path=None, 32 | trunk_svd_arr_path=None, 33 | faiss_index_path=None, 34 | train_scores_path=None, 35 | umap_item_emb_path=None, 36 | faiss_neighbors=128, 37 | faiss_nprobe=32, 38 | item_co_occurrence_path=None, 39 | item_occurrence_path=None, 40 | user_prod_log_idf_path=None, 41 | tran_prod_log_idf_path=None, 42 | random_seed=0, 43 | ): 44 | self.product_encoder = ProductEncoder(product_csv_path) 45 | 46 | if not product_features_encoder_path: 47 | self.product_features = ProductFeatEncoder(product_csv_path) 48 | else: 49 | self.product_features = pickle.load( 50 | open(product_features_encoder_path, "rb") 51 | ) 52 | 53 | self.model = pickle.load(open(model_pickled_path, "rb")) 54 | self.init_misc_features(products_misc_path, N) 55 | 56 | if implicit_tfidf_path: 57 | self.implicit_tfidf = pickle.load(open(implicit_tfidf_path, "rb")) 58 | if implicit_als_path: 59 | self.implicit_als = pickle.load(open(implicit_als_path, "rb")) 60 | if implicit_cosine2_path: 61 | self.implicit_cosine2 = pickle.load(open(implicit_cosine2_path, "rb")) 62 | 63 | if cache_fm_path is not None: 64 | self.cache_fm_feat = pickle.load(open(cache_fm_path, "rb")) 65 | else: 66 | self.cache_fm_feat = defaultdict(dict) 67 | 68 | if trunk_svd_arr_path: 69 | self.trunk_svd_arr = np.load(trunk_svd_arr_path) 70 | if train_scores_path: 71 | self.X_scores = sp.load_npz(train_scores_path) 72 | 73 | self.faiss_neighbors = faiss_neighbors 74 | self.faiss_nprobe = faiss_nprobe 75 | if faiss_index_path: 76 | self.faiss_index = faiss.read_index(faiss_index_path) 77 | self.faiss_index.nprobe = faiss_nprobe 78 | 79 | if item_occurrence_path: 80 | # add 1 to be able to divide co_occurrence features on occurrence 81 | self.item_occurrence = np.load(item_occurrence_path) + 1 82 | if item_co_occurrence_path: 83 | self.item_co_occurrence = sp.load_npz(item_co_occurrence_path) 84 | 85 | if user_prod_log_idf_path: 86 | self.user_prod_log_idf = np.load(user_prod_log_idf_path) 87 | if tran_prod_log_idf_path: 88 | self.tran_prod_log_idf = np.load(tran_prod_log_idf_path) 89 | 90 | if umap_item_emb_path: 91 | self.umap_item_emb = np.load(umap_item_emb_path) 92 | 93 | if fm_features_feather_path: 94 | self.feather_fm_features = feather.read_dataframe(fm_features_feather_path) 95 | self.feather_names = self.feather_fm_features.columns 96 | self.feather_values = self.feather_fm_features.values 97 | self.get_implicit_features = self.get_feather_features 
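            # fast path for inference: implicit-model scores were precomputed and dumped
            # to implicit_scores.feather (see cache_to_feather in utils.py), so per-item
            # features become a plain row lookup instead of querying the implicit models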
98 | else: 99 | self.get_implicit_features = self.get_implicit_train_model_features 100 | 101 | self.feature_extractor = {"gender": {"M": 0, "F": 1, "U": 2}} 102 | 103 | self.N = N 104 | self.N_05 = self.N // 2 105 | self.pos_0_score = 1000 106 | self.datetime_stamp_2000 = dt.datetime(2000, 1, 1).toordinal() 107 | 108 | # cache 109 | self.default_history_feat = defaultdict(dict) 110 | self.date_cache = {} 111 | self.cooccur_default_values = {} 112 | 113 | random.seed(random_seed) 114 | 115 | def init_misc_features(self, path, N): 116 | self.df_products_misc = pd.read_csv(path, index_col=0) 117 | self.pos_in_top = self.df_products_misc["popularity_position"].to_dict() 118 | self.top_products = {pos: idx for idx, pos in self.pos_in_top.items()} 119 | self.product_cost = self.df_products_misc["cost"].round(2).to_dict() 120 | self.top_n_items = list(self.top_products.values())[:N] 121 | 122 | # TODO: refactor 123 | def get_items_pool(self, product_ind_csr, product_ind_csr_not_normed): 124 | selected_items = set() 125 | 126 | # add cosine 50 items 127 | recs_cosine50 = self.model.recommend( 128 | userid=0, 129 | user_items=product_ind_csr, 130 | N=self.N, 131 | filter_already_liked_items=False, 132 | recalculate_user=True, 133 | ) 134 | selected_items |= set(item[0] for item in recs_cosine50) 135 | 136 | # add tfidf items 137 | recs_tfidf = self.implicit_tfidf.recommend( 138 | userid=0, 139 | user_items=product_ind_csr, 140 | N=self.N, 141 | filter_already_liked_items=False, 142 | recalculate_user=True, 143 | ) 144 | selected_items |= set(item[0] for item in recs_tfidf) 145 | 146 | # add cosine2 items 147 | recs_cosine2 = self.implicit_cosine2.recommend( 148 | userid=0, 149 | user_items=product_ind_csr, 150 | N=self.N, 151 | filter_already_liked_items=False, 152 | recalculate_user=True, 153 | ) 154 | selected_items |= set(item[0] for item in recs_cosine2) 155 | 156 | # TODO: increase speed or delete 157 | # add ALS items 158 | # recs_als = self.implicit_als.recommend( 159 | # userid=0, user_items=product_ind_csr_not_normed, N=self.N, 160 | # filter_already_liked_items=False, recalculate_user=True, 161 | # ) 162 | # selected_items |= set(item[0] for item in recs_als) 163 | 164 | # add top items 165 | selected_items |= set(self.top_n_items) 166 | 167 | # add top history items 168 | if product_ind_csr.nnz > 0: 169 | selected_items |= set( 170 | self._get_top_recommendations( 171 | product_ind_csr.A[0], k=min(product_ind_csr.nnz + 1, self.N_05) 172 | )[0] 173 | ) 174 | return selected_items 175 | 176 | def get_default_history_feat(self, idx, cache=True): 177 | if cache and idx in self.default_history_feat: 178 | return self.default_history_feat[idx] 179 | 180 | product_feats = {} 181 | product_feats["quantity"] = 0 182 | product_feats["items_in_different_transactions"] = 0 183 | product_feats["mean_items_in_trans"] = 0 184 | product_feats["mean_items_in_trans_log"] = 0 185 | product_feats["mean_items_in_history"] = 0 186 | product_feats["mean_items_in_trans_log_denom"] = 0 187 | 188 | product_feats["user_item_idf"] = -1 189 | product_feats["user_tran_idf_mult_total_idf"] = -1 190 | 191 | product_feats["total_items_in_history_pct"] = 0 192 | product_feats["item_spent"] = 0 193 | product_feats["item_cost"] = self.product_cost[idx] 194 | 195 | product_feats["item_is_in_last_transaction"] = 0 196 | product_feats["last_item_purchase_num_days"] = -1 197 | product_feats["mean_item_days_between_purchase"] = -1 198 | 199 | product_feats["cooc_item_sum_score"] = -1 200 | product_feats["cooc_item_mean_score"] 
= -1 201 | product_feats["cooc_mean_from_items"] = -1 202 | product_feats["cooc_sum_from_items"] = -1 203 | product_feats["cooc_max_from_items"] = -1 204 | 205 | if cache: 206 | self.default_history_feat[idx] = product_feats 207 | return self.default_history_feat[idx] 208 | 209 | def to_date(self, x, cache=True): 210 | """ 211 | Convert datetime to datetime.date 212 | 213 | # format for training and inference is different: 214 | # training: "%Y-%m-%d %H:%M:%S" 215 | # inference: "%Y-%m-%dT%H:%M:%S" 216 | """ 217 | str_date = x[:10] 218 | if str_date in self.date_cache: 219 | return self.date_cache[str_date] 220 | 221 | # self.date_cache[str_date] = dt.datetime.strptime(str_date, "%Y-%m-%d") 222 | # faster approach 223 | self.date_cache[str_date] = dt.datetime(*map(int, str_date.split("-"))) 224 | return self.date_cache[str_date] 225 | 226 | def get_product_feat_from_history(self, transaction_history, cooc_scores, item2pos): 227 | product_feats = defaultdict(Counter) 228 | if not transaction_history: 229 | return product_feats 230 | 231 | items_cooc_scores = np.zeros(len(cooc_scores)) 232 | items_dates_purchase = defaultdict(list) 233 | num_items_in_purchase = 0 234 | 235 | for txn in transaction_history: 236 | purchase_date = self.to_date(txn["datetime"]) 237 | product_ind = self.product_encoder.toIdx( 238 | item["product_id"] for item in txn["products"] 239 | ) 240 | product_cooc_pos = [item2pos[pidx] for pidx in product_ind] 241 | items_cooc_scores[product_cooc_pos] += cooc_scores[product_cooc_pos].sum( 242 | axis=1 243 | ) 244 | 245 | for pidx, item in zip(product_ind, txn["products"]): 246 | product_feats[pidx]["quantity"] += item["quantity"] 247 | product_feats[pidx]["items_in_different_transactions"] += 1 248 | num_items_in_purchase += 1 249 | items_dates_purchase[pidx].append(purchase_date) 250 | 251 | num_items_in_purchase = max(num_items_in_purchase, 1) 252 | len_trans = max(len(transaction_history), 1) 253 | cooc_mean_from_items = items_cooc_scores.mean() 254 | cooc_sum_from_items = items_cooc_scores.sum() 255 | cooc_max_from_items = items_cooc_scores.max() 256 | 257 | for pidx, item in product_feats.items(): 258 | num_trans_with_item = item["items_in_different_transactions"] 259 | item["mean_items_in_trans"] = num_trans_with_item / len_trans 260 | # 1e-2 for better distribution 261 | item["mean_items_in_trans_log"] = np.log( 262 | num_trans_with_item / len_trans + 1e-2 263 | ) 264 | item["mean_items_in_history"] = num_trans_with_item / num_items_in_purchase 265 | item["mean_items_in_trans_log_denom"] = num_trans_with_item / ( 266 | 1 + math.log(len_trans) 267 | ) 268 | 269 | # iDF scores, usual item-item TF-iDF is calculated via the implicit model 270 | item["user_item_idf"] = self.user_prod_log_idf[pidx] 271 | item["user_tran_idf_mult_total_idf"] = ( 272 | math.log(len_trans / num_trans_with_item) * self.tran_prod_log_idf[pidx] 273 | ) 274 | 275 | max_quantity = max(item["quantity"], 1) 276 | item["total_items_in_history_pct"] = max_quantity / num_items_in_purchase 277 | item["item_spent"] = self.product_cost[pidx] * max_quantity 278 | item["item_cost"] = self.product_cost[pidx] 279 | 280 | item["item_is_in_last_transaction"] = int( 281 | items_dates_purchase[pidx][-1] == purchase_date 282 | ) 283 | item["last_item_purchase_num_days"] = ( 284 | purchase_date - items_dates_purchase[pidx][-1] 285 | ).days 286 | 287 | item["cooc_item_sum_score"] = items_cooc_scores[item2pos[pidx]] 288 | item["cooc_item_mean_score"] = ( 289 | items_cooc_scores[item2pos[pidx]] / num_trans_with_item 
290 |             )
291 |             item["cooc_mean_from_items"] = cooc_mean_from_items
292 |             item["cooc_sum_from_items"] = cooc_sum_from_items
293 |             item["cooc_max_from_items"] = cooc_max_from_items
294 | 
295 |             if len(items_dates_purchase[pidx]) > 1:
296 |                 items_purchase = items_dates_purchase[pidx]
297 |                 items_days_between_purchase = [
298 |                     (end_date - start_date).days
299 |                     for start_date, end_date in zip(
300 |                         items_purchase[:-1], items_purchase[1:]
301 |                     )
302 |                 ]
303 |                 item["mean_item_days_between_purchase"] = sum(
304 |                     items_days_between_purchase
305 |                 ) / len(items_days_between_purchase)
306 |             else:
307 |                 item["mean_item_days_between_purchase"] = -1
308 | 
309 |         return product_feats
310 | 
311 |     def get_highlevel_feat_from_history(self, transaction_history):
312 |         feats = {}
313 |         num_items_in_purchase = []
314 |         uniq_stores = set()
315 |         purchase_sum = 0
316 |         prev_date = None
317 |         dates_intervals = []
318 |         min_date = None
319 | 
320 |         for txn in transaction_history:
321 |             purchase_sum += txn["purchase_sum"]
322 |             uniq_stores.add(txn["store_id"])
323 |             num_items_in_purchase.append(len(txn["products"]))
324 | 
325 |             cur_date = self.to_date(txn["datetime"])
326 |             if prev_date:
327 |                 date_diff = (cur_date - prev_date).days
328 |                 dates_intervals.append(date_diff)
329 | 
330 |             if min_date is None:
331 |                 min_date = cur_date
332 | 
333 |             prev_date = cur_date
334 | 
335 |         total_purchases = sum(num_items_in_purchase)
336 |         feats["purchase_sum"] = purchase_sum
337 |         feats["num_uniq_store"] = len(uniq_stores)
338 |         feats["mean_num_items"] = total_purchases / max(len(num_items_in_purchase), 1)
339 |         feats["total_bought_items"] = total_purchases
340 |         feats["max_bought_items"] = (
341 |             max(num_items_in_purchase) if num_items_in_purchase else 0
342 |         )
343 |         feats["mean_days_between_purchases"] = len(dates_intervals) / max(
344 |             sum(dates_intervals), 1
345 |         )  # NB: despite the name, this is the inverse ratio (purchases per elapsed day)
346 |         feats["max_days_between_purchases"] = (
347 |             max(dates_intervals) if dates_intervals else -1
348 |         )
349 | 
350 |         if min_date is not None:
351 |             feats["days_between_first_and_last_purchase"] = (cur_date - min_date).days
352 |         else:
353 |             feats["days_between_first_and_last_purchase"] = -1
354 | 
355 |         return feats
356 | 
357 |     def get_implicit_model_features(
358 |         self,
359 |         model,
360 |         idx,
361 |         model_prefix="fm",
362 |         n=5,
363 |         k_score=0,
364 |         skip_first=False,
365 |         only_score=True,
366 |         cache=True,
367 |         only_second=True,
368 |     ):
369 |         model_cache_name = str(model.__class__) + model_prefix
370 |         if cache and idx in self.cache_fm_feat[model_cache_name]:
371 |             return self.cache_fm_feat[model_cache_name][idx]
372 | 
373 |         similar_items = model.similar_items(idx, max(k_score, n))
374 |         fm_features = {}
375 |         for i, (_, score) in enumerate(similar_items[:n]):
376 |             if i == 0 and skip_first:
377 |                 continue
378 | 
379 |             fm_features[f"top_item_{model_prefix}_{i}_score"] = score
380 | 
381 |         if k_score != 0:
382 |             fm_features[f"sum_top_{model_prefix}_{k_score}_score"] = sum(
383 |                 score for _, score in similar_items
384 |             )
385 |             fm_features[f"mean_top_{model_prefix}_{k_score}_score"] = np.mean(
386 |                 [score for _, score in similar_items]
387 |             )
388 | 
389 |         if cache:
390 |             self.cache_fm_feat[model_cache_name][idx] = fm_features
391 |         return fm_features
392 | 
393 |     def get_feather_features(self, idx):
394 |         """Transform a row of the feather dataframe into a features dict."""
395 |         scores = self.feather_values[idx]
396 |         return {key: val for key, val in zip(self.feather_names, scores)}
397 | 
398 |     def get_implicit_train_model_features(self, idx):
399 |         return {
400 |             **self.get_implicit_model_features(
401 |                 self.model, idx, "cosine50", n=3, k_score=0
402 |             ),
403 |             **self.get_implicit_model_features(
404 |                 self.implicit_cosine2, idx, "cosine2", n=2, skip_first=True, k_score=0
405 |             ),
406 |             **self.get_implicit_model_features(
407 |                 self.implicit_tfidf, idx, "tfidf50", n=3, skip_first=True, k_score=0
408 |             ),
409 |             **self.get_implicit_model_features(
410 |                 self.implicit_als, idx, "als", n=3, k_score=0
411 |             ),
412 |         }
413 | 
414 |     @staticmethod
415 |     def _get_top_recommendations(row, k=100):
416 |         k = min(len(row), k)
417 |         ind = np.argpartition(row, -k)[-k:]
418 |         top_k_ind = ind[np.argsort(row[ind])][::-1]
419 |         return top_k_ind, row[top_k_ind]
420 | 
421 |     def _get_faiss_scores(self, product_ind_csr):
422 |         x_dense = product_ind_csr * self.trunk_svd_arr
423 |         faiss_result = self.faiss_index.search(x_dense, self.faiss_neighbors)
424 |         neighbors = faiss_result[1]
425 |         scores = np.asarray(faiss_result[0] * self.X_scores[neighbors[0]]).flatten()
426 |         return scores, faiss_result
427 | 
428 |     def get_faiss_features(self, product_ind_csr, selected_items):
429 |         scores, faiss_result = self._get_faiss_scores(product_ind_csr)
430 |         top_k_ind, sorted_predictions = self._get_top_recommendations(
431 |             scores[selected_items], k=10000
432 |         )
433 | 
434 |         top_neighbor = faiss_result[0][0][0]
435 |         faiss_neighbor_mean = faiss_result[0].mean()
436 |         non_zero_idx = scores.nonzero()[0]
437 |         non_zero_scores = len(non_zero_idx)
438 |         sum_scores = scores.sum()
439 |         features = defaultdict(dict)
440 | 
441 |         pos = 0
442 |         prev_score = -10
443 |         for idx, score in zip(top_k_ind, sorted_predictions):
444 |             if score == 0:
445 |                 pos = self.pos_0_score
446 | 
447 |             pidx = selected_items[idx]
448 |             features[pidx]["faiss_score"] = score
449 |             features[pidx]["faiss_pos"] = pos
450 |             features[pidx]["faiss_neighbor_top_score"] = top_neighbor
451 |             features[pidx]["faiss_neighbor_mean_score"] = faiss_neighbor_mean
452 |             features[pidx]["faiss_num_non_zero_scores"] = non_zero_scores
453 |             features[pidx]["faiss_scores_sum"] = sum_scores
454 | 
455 |             if prev_score != score:
456 |                 pos += 1
457 |                 prev_score = score
458 |         return features
459 | 
460 |     def get_umap_scores(self, product_ind_csr, selected_items):
461 |         product_emb = (product_ind_csr * self.umap_item_emb)[0]
462 | 
463 |         features = defaultdict(dict)
464 |         for pidx in selected_items:
465 |             features[pidx]["umap_user_emb_0"] = product_emb[0]
466 |             features[pidx]["umap_user_emb_1"] = product_emb[1]
467 | 
468 |             item_emb = self.umap_item_emb[pidx]
469 |             features[pidx]["umap_item_emb_0"] = item_emb[0]
470 |             features[pidx]["umap_item_emb_1"] = item_emb[1]
471 | 
472 |             user_item_emb = np.mean([product_emb, item_emb], axis=0)
473 |             features[pidx]["umap_user_item_emb_0"] = user_item_emb[0]
474 |             features[pidx]["umap_user_item_emb_1"] = user_item_emb[1]
475 | 
476 |         return features
477 | 
478 |     @staticmethod
479 |     def _implicit_rank(model, product_ind_csr, selected_items):
480 |         predictions = model.rank_items(
481 |             0, product_ind_csr, selected_items, recalculate_user=True
482 |         )
483 |         return predictions
484 | 
485 |     @staticmethod
486 |     def _custom_implicit_rank(model, product_ind_csr, selected_items):
487 |         recommendations = product_ind_csr.dot(model.similarity)
488 |         predictions = sorted(
489 |             zip(selected_items, recommendations[0, selected_items].A[0]),
490 |             key=lambda x: -x[1],
491 |         )
492 |         return predictions
493 | 
494 |     def get_implicit_scores(
495 |         self, model, product_ind_csr, selected_items, model_prefix="tfidf"
496 |     ):
model_prefix in ("tfidf", "cosine2", "cosine50"): 498 | sorted_predictions = self._custom_implicit_rank( 499 | model, product_ind_csr, selected_items 500 | ) 501 | else: 502 | sorted_predictions = self._implicit_rank( 503 | model, product_ind_csr, selected_items 504 | ) 505 | 506 | features = defaultdict(dict) 507 | pos = 0 508 | prev_score = -10 509 | for pidx, score in sorted_predictions: 510 | if score == 0: 511 | pos = self.pos_0_score 512 | 513 | features[pidx][f"{model_prefix}_score"] = score 514 | features[pidx][f"{model_prefix}_pos"] = pos 515 | 516 | if prev_score != score: 517 | pos += 1 518 | prev_score = score 519 | return features 520 | 521 | def get_num_days_from_last_transaction(self, js, last_transaction_date): 522 | if last_transaction_date is None: 523 | return -1 524 | 525 | try: 526 | query_date = self.to_date(js.get("query_time")) 527 | num_days_from_last_transaction = (query_date - last_transaction_date).days 528 | 529 | except TypeError: 530 | target_date = self.to_date(js["target"][0]["datetime"]) 531 | num_days_from_last_transaction = (target_date - last_transaction_date).days 532 | num_days_from_last_transaction = random.randint( 533 | 0, num_days_from_last_transaction 534 | ) 535 | 536 | return num_days_from_last_transaction 537 | 538 | def get_co_occurrence_features( 539 | self, 540 | cooc_scores, 541 | weights=None, 542 | prefix="co_occurrence", 543 | default_value=-1, 544 | aggs=("max", "mean", "sum"), 545 | ): 546 | keys = [f"{prefix}_{agg_name}" for agg_name in aggs] 547 | 548 | if weights is not None: 549 | keys += [f"{key}_weighted" for key in keys] 550 | 551 | if cooc_scores.size == 0: 552 | if prefix not in self.cooccur_default_values: 553 | self.cooccur_default_values[prefix] = dict( 554 | zip(keys, it.repeat(default_value)) 555 | ) 556 | return it.repeat( 557 | self.cooccur_default_values[prefix], 1000 558 | ) # to prevent endless iterations 559 | 560 | scores = [] 561 | keys = [] 562 | for agg_name in aggs: 563 | agg_func = getattr(np, agg_name) 564 | scores.append(agg_func(cooc_scores, axis=1)) 565 | keys.append(f"{prefix}_{agg_name}") 566 | 567 | if weights is not None: 568 | cooc_scores_w = cooc_scores * weights 569 | for agg_name in aggs: 570 | agg_func = getattr(np, agg_name) 571 | scores.append(agg_func(cooc_scores_w, axis=1)) 572 | keys.append(f"{prefix}_{agg_name}_weighted") 573 | 574 | features = [] 575 | for values in np.vstack(scores).T: 576 | features.append(dict(zip(keys, values))) 577 | return iter(features) 578 | 579 | def get_cooc_features( 580 | self, cooc_purchased_all_scores, selected_items, purchased_items, cooc_weights 581 | ): 582 | cooc_scores = cooc_purchased_all_scores[selected_items].A 583 | 584 | cooc_scores_norm_item = ( 585 | cooc_scores / self.item_occurrence[selected_items][:, None] 586 | ) 587 | cooc_scores_norm_co_item = cooc_scores / self.item_occurrence[purchased_items] 588 | 589 | cooc_norm_item_features = self.get_co_occurrence_features( 590 | cooc_scores_norm_item, cooc_weights, "co_occurrence_item_norm" 591 | ) 592 | cooc_scores_norm_co_item_features = self.get_co_occurrence_features( 593 | cooc_scores_norm_co_item, cooc_weights, "co_occurrence_co_item_norm" 594 | ) 595 | return cooc_norm_item_features, cooc_scores_norm_co_item_features 596 | 597 | def get_gbm_features( 598 | self, js, train=False, drop_null_target_records=False, add_target_records=False 599 | ): 600 | # sort history as in public and check it was unordered 601 | js["transaction_history"] = sorted( 602 | js["transaction_history"], key=lambda x: 
x["datetime"] 603 | ) 604 | 605 | if train: 606 | target_products = set( 607 | self.product_encoder.toIdx( 608 | [pid for pid in js["target"][0]["product_ids"]] 609 | ) 610 | ) 611 | 612 | transaction_history = js.get("transaction_history", []) 613 | if transaction_history: 614 | last_transaction_date = self.to_date( 615 | transaction_history[-1].get("datetime") 616 | ) 617 | # num days from 2000/1/1 618 | last_transaction_timestamp = ( 619 | last_transaction_date.toordinal() - self.datetime_stamp_2000 620 | ) 621 | else: 622 | last_transaction_date = None 623 | last_transaction_timestamp = None 624 | num_days_from_last_transaction = self.get_num_days_from_last_transaction( 625 | js, last_transaction_date 626 | ) 627 | 628 | product_ind_csr_not_normed = make_coo_row( 629 | transaction_history, self.product_encoder, normalize=False 630 | ).tocsr() 631 | product_ind_csr = make_coo_row( 632 | js.get("transaction_history", []), self.product_encoder 633 | ).tocsr() 634 | 635 | selected_items = list( 636 | self.get_items_pool(product_ind_csr, product_ind_csr_not_normed) 637 | ) 638 | 639 | cosine50_scores = self.get_implicit_scores( 640 | self.model, product_ind_csr, selected_items, model_prefix="cosine50" 641 | ) 642 | tf_idf_scores = self.get_implicit_scores( 643 | self.implicit_tfidf, product_ind_csr, selected_items, model_prefix="tfidf" 644 | ) 645 | cosine2_scores = self.get_implicit_scores( 646 | self.implicit_cosine2, 647 | product_ind_csr, 648 | selected_items, 649 | model_prefix="cosine2", 650 | ) 651 | als_scores = self.get_implicit_scores( 652 | self.implicit_als, 653 | product_ind_csr_not_normed, 654 | selected_items, 655 | model_prefix="als", 656 | ) 657 | 658 | # co occurrence features 659 | # all user items purchases 660 | purchased_items = product_ind_csr.indices 661 | cooc_weights = product_ind_csr.data 662 | cooc_purchased_all_scores = self.item_co_occurrence[:, purchased_items] 663 | 664 | cooc_norm_item_features, cooc_scores_norm_co_item_features = self.get_cooc_features( 665 | cooc_purchased_all_scores, selected_items, purchased_items, cooc_weights 666 | ) 667 | 668 | # scores per each transaction 669 | cooc_purchased_scores = cooc_purchased_all_scores[purchased_items].A 670 | cooc_purchased_scores_norm_co_item = ( 671 | cooc_purchased_scores / self.item_occurrence[purchased_items] 672 | ) 673 | purchased_item2pos = {pid: pos for pos, pid in enumerate(purchased_items)} 674 | 675 | product_history_features = self.get_product_feat_from_history( 676 | js.get("transaction_history", []), 677 | cooc_purchased_scores_norm_co_item, 678 | purchased_item2pos, 679 | ) 680 | 681 | high_level_features = self.get_highlevel_feat_from_history( 682 | js.get("transaction_history", []) 683 | ) 684 | 685 | # faiss_features = self.get_faiss_features(product_ind_csr, selected_items) 686 | umap_scores = self.get_umap_scores(product_ind_csr, selected_items) 687 | 688 | gbm_records = [] 689 | for product_idx in selected_items: 690 | record = dict( 691 | **{ 692 | "idx": product_idx, 693 | "age": js["age"], 694 | "gender": self.feature_extractor["gender"][js["gender"]], 695 | "num_transactions": len(js.get("transaction_history", [])), 696 | "popularity_position": self.pos_in_top[product_idx], 697 | "last_transaction_timestamp": last_transaction_timestamp, 698 | "num_days_from_last_transaction": num_days_from_last_transaction, 699 | }, 700 | **high_level_features, 701 | **product_history_features.get( 702 | product_idx, self.get_default_history_feat(product_idx) 703 | ), 704 | 
704 |                 **self.product_features.product_features(
705 |                     self.product_encoder.toPid(int(product_idx))
706 |                 ),
707 |                 **self.get_implicit_features(product_idx),
708 |                 **als_scores[product_idx],
709 |                 **tf_idf_scores[product_idx],
710 |                 **cosine50_scores[product_idx],
711 |                 **cosine2_scores[product_idx],
712 |                 **next(cooc_norm_item_features),
713 |                 **next(cooc_scores_norm_co_item_features),
714 |                 # **faiss_features[product_idx],
715 |                 **umap_scores[product_idx],
716 |             )
717 | 
718 |             record["item_pct_spent"] = record.get("item_spent", 0) / max(
719 |                 record.get("purchase_sum", 1), 1
720 |             )
721 | 
722 |             if train:
723 |                 record["target"] = int(product_idx in target_products)
724 |                 record["client_id"] = js["client_id"]
725 | 
726 |             gbm_records.append(record)
727 | 
728 |         if train:
729 |             gt_products = dict(
730 |                 client_id=js["client_id"], products=list(target_products)
731 |             )
732 |             return gbm_records, gt_products
733 | 
734 |         return gbm_records
735 | 
736 | 
737 | class GBMPredictor(GBMFeatures):
738 |     def __init__(
739 |         self,
740 |         product_csv_path,
741 |         *args,
742 |         lgbm_model_path=None,
743 |         cat_model_path=None,
744 |         **kwargs,
745 |     ):
746 |         super(GBMPredictor, self).__init__(product_csv_path, *args, **kwargs)
747 |         self.product_encoder = ProductEncoder(product_csv_path)
748 | 
749 |         if lgbm_model_path:
750 |             self.lgb_model = lgb.Booster(model_file=lgbm_model_path)
751 | 
752 |         if cat_model_path:
753 |             self.cat_model = CatBoost().load_model(cat_model_path)
754 | 
755 |     @staticmethod
756 |     def predict_proba(X, model):
757 |         pred = model.predict(X)
758 |         return pred
759 | 
760 |     def sort_predictions(self, product_idx, gbm_pred, n=30):
761 |         product_idx_sorted, _ = zip(
762 |             *sorted(zip(product_idx, gbm_pred), key=lambda x: -x[1])
763 |         )
764 |         product_ids = self.product_encoder.toPid(
765 |             [idx for idx in product_idx_sorted[:n]]
766 |         )
767 |         return product_ids
768 | 
769 |     # TODO: add different blending weights
770 |     def predict(self, js, models):
771 |         X = self.get_gbm_features(js)
772 |         feature_values = [list(item.values()) for item in X]
773 |         product_idx = [item["idx"] for item in X]
774 |         if isinstance(models, (list, tuple)):
775 |             gbm_pred = np.zeros_like(product_idx, dtype=float)
776 |             for model in models:
777 |                 gbm_pred += 0.5 * self.predict_proba(feature_values, model)  # 0.5 weight assumes a blend of exactly two models
778 |         else:
779 |             gbm_pred = self.predict_proba(feature_values, models)
780 | 
781 |         return self.sort_predictions(product_idx, gbm_pred)
782 | 
--------------------------------------------------------------------------------