├── src ├── __init__.py ├── train.sh ├── config.py ├── train_umap_embeddings.py ├── collect_co_occur_matrix.py ├── train_i2i_model.py ├── collect_miscellaneous_features.py ├── train_valid_split.py ├── collect_gbm_dataset.py ├── purchases_to_jrows.py ├── utils.py ├── train_lgb_model.py └── predictors.py ├── submit ├── solution │ ├── src │ ├── metadata.json │ └── server.py ├── custom_docker │ ├── Dockerfile │ └── README.md └── run_queries.py ├── slides └── retailhero_recommender.pdf ├── .gitignore ├── LICENSE └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /submit/solution/src: -------------------------------------------------------------------------------- 1 | ../../src -------------------------------------------------------------------------------- /src/train.sh: -------------------------------------------------------------------------------- 1 | python collect_gbm_dataset.py --N 100 --max-records 10 2 | python train_lgb_model.py --N 100 3 | -------------------------------------------------------------------------------- /slides/retailhero_recommender.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aprotopopov/retailhero_recommender/HEAD/slides/retailhero_recommender.pdf -------------------------------------------------------------------------------- /submit/solution/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "aprotopopov/ds-base:retailhero", 3 | "entry_point": "gunicorn --bind 0.0.0.0:8000 server:app" 4 | } -------------------------------------------------------------------------------- /submit/custom_docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-stretch 2 | RUN python3 -m pip install -U scikit-learn pandas scipy numpy gunicorn flask 3 | RUN python3 -m pip install -U implicit lightgbm catboost 4 | RUN python3 -m pip install -U feather-format 5 | RUN pip install -U faiss-cpu 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | tmp 3 | 4 | .idea/ 5 | .vscode/ 6 | 7 | **/.DS_Store 8 | **/__pycache__ 9 | **/*.pyc 10 | **/.ipynb_checkpoints 11 | **/*.csv 12 | **/*.log 13 | 14 | *.txt 15 | *.pkl 16 | *.zip 17 | *.cbm 18 | **/*.np 19 | **/*.npy 20 | **/*.npz 21 | **/*.pickle 22 | **/*.feather 23 | **/*.idx 24 | 25 | -------------------------------------------------------------------------------- /submit/custom_docker/README.md: -------------------------------------------------------------------------------- 1 | ## Custom docker image 2 | 3 | ### Build image locally 4 | ```bash 5 | docker build -t aprotopopov/ds-base:retailhero ./ 6 | ``` 7 | `aprotopopov/ds-base:retailhero` - docker image. Available via docker hub. 
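### Test the image locally

A quick smoke test before publishing: the sketch below starts the packaged Flask app with the same `gunicorn` entry point that `metadata.json` uses and checks the `/ready` endpoint. The `/repo` mount point and working directory are illustrative, and the run assumes the assets have already been generated into `submit/solution/assets`.

```bash
# run from the repository root; /repo is an arbitrary mount point inside the container
docker run --rm -p 8000:8000 \
    -v "$(pwd):/repo" -w /repo/submit/solution \
    aprotopopov/ds-base:retailhero \
    gunicorn --bind 0.0.0.0:8000 server:app

# in another shell: should print "OK"
curl http://localhost:8000/ready
```

Once the server answers, `submit/run_queries.py <url> <queryset.tsv> <max_queries>` can be pointed at `http://localhost:8000/recommend` to score it.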
8 | 9 | 10 | ### Publish image 11 | 12 | ```bash 13 | docker push aprotopopov/ds-base:retailhero 14 | ``` 15 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # all paths are relative to the ./src folder 2 | from pathlib import Path 3 | 4 | PURCHASE_CSV_PATH = "../data/raw/purchases.csv" 5 | CLIENT_CSV_PATH = "../data/raw/clients.csv" 6 | PRODUCT_CSV_PATH = "../data/raw/products.csv" 7 | CHECK_QUERY_PATH = "../data/raw/check_queries.tsv" 8 | JSONS_DIR = "../tmp/jsons/" 9 | MAX_CHUNKS = None 10 | NUM_SHARDS = 16 11 | ASSETS_DIR = Path("../submit/solution/assets") 12 | 13 | # determined from the check queries 14 | BASE_SPLIT_POINT = "2019-03-02 10:05:00" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /src/train_umap_embeddings.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import umap 7 | from scipy import sparse as sp 8 | from tqdm import tqdm 9 | 10 | import config as cfg 11 | from utils import ProductEncoder, get_shard_path, make_coo_row 12 | 13 | logging.basicConfig( 14 | level=logging.INFO, 15 | format="%(asctime)s - %(message)s", 16 | handlers=[logging.StreamHandler()], 17 | ) 18 | 19 | 20 | def get_train_data(max_rows=None): 21 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 22 | 23 | rows = [] 24 | num_rows = 0 25 | for shard_idx in tqdm(range(cfg.NUM_SHARDS)): 26 | for js in tqdm(json.loads(s) for s in open(get_shard_path(shard_idx))): 27 | rows.append( 28 | make_coo_row(js["transaction_history"], product_encoder, normalize=True) 29 | ) 30 | num_rows += 1 31 | 32 | if max_rows and num_rows == max_rows: 33 | return sp.vstack(rows) 34 | 35 | trans_mat = sp.vstack(rows) 36 | return trans_mat 37 | 38 | 39 | if __name__ == "__main__": 40 | logger = logging.getLogger(__name__) 41 | 42 | UMAP_MAX_ROWS = 1000 43 | trans_mat = get_train_data(UMAP_MAX_ROWS) 44 | 45 | umap_params = dict( 46 | random_state=14, metric="cosine", n_neighbors=10, low_memory=True 47 | ) 48 | 49 | logger.info("Training UMAP embeddings.") 50 | umap_items = umap.UMAP(**umap_params) 51 | item_embeddings = umap_items.fit_transform(trans_mat.T.tocsr()) 52 | 53 | filename = cfg.ASSETS_DIR / "umap_item_emb.npy" 54 | logger.info(f"Saving UMAP embeddings to {filename}") 55 | np.save(filename, item_embeddings) 56 | -------------------------------------------------------------------------------- /src/collect_co_occur_matrix.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | from scipy import sparse as sp 6 | from tqdm import tqdm 7 | 8 | import config as cfg 9 | from utils import ProductEncoder, get_shard_path 10 | 11 | 12 | def collect_cooccur_matrix(shard_indices, product_encoder): 13 | num_products = product_encoder.num_products 14 | co_occurrence = np.zeros((num_products, num_products)) 15 | occurrence = np.zeros(num_products) 16 | for shard_idx in tqdm(shard_indices): 17 | for js in tqdm((json.loads(s) for s in open(get_shard_path(shard_idx)))): 18 | tids = js.get("transaction_history", []) 19 | for tid in tids: 20 | product_ind = [ 21 | product_encoder.toIdx(item["product_id"]) 22 | for item in tid.get("products", []) 23 | ] 24 | for pid_num, pid in enumerate(product_ind): 25 | occurrence[pid] += 1 26 | for co_pid in product_ind[pid_num + 1 :]: 27 | co_occurrence[co_pid][pid] += 1 28 | co_occurrence[pid][co_pid] += 1 29 | return co_occurrence, occurrence 30 | 31 | 32 | if __name__ == "__main__": 33 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 34 | co_occurrence, occurrence = collect_cooccur_matrix( 35 | range(cfg.NUM_SHARDS), product_encoder 36 | ) 37 | 38 | # cut the low count records to reduce size and improve speed 39 | min_count = 5 40 | co_occurrence_sp = sp.csc_matrix( 41 | np.where(co_occurrence >= min_count, co_occurrence, 0), dtype=np.int32 42 | ) 43 | 44 | # not compressed for fast loading 45 | sp.save_npz( 46 | cfg.ASSETS_DIR / f"item_co_occurrence_min_cnt_{min_count}.npz", 47 | co_occurrence_sp, 48 | compressed=False, 49 | ) 50 | np.save(cfg.ASSETS_DIR / 
"item_occurrence.npy", occurrence) 51 | -------------------------------------------------------------------------------- /submit/solution/server.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, "src") 3 | 4 | import datetime as dt 5 | from flask import Flask, jsonify, request 6 | from predictors import GBMPredictor 7 | from pathlib import Path 8 | 9 | app = Flask(__name__) 10 | 11 | ASSETS_DIR = Path("assets") 12 | PREDICTOR = GBMPredictor( 13 | lgbm_model_path=str(ASSETS_DIR / "lgbm_model.txt"), 14 | product_csv_path=ASSETS_DIR / "products.csv", 15 | model_pickled_path=ASSETS_DIR / "model_implicit_cosine_50.pkl", 16 | products_misc_path=ASSETS_DIR / "products_misc.csv", 17 | product_features_encoder_path=ASSETS_DIR / "product_features.pkl", 18 | implicit_tfidf_path=ASSETS_DIR / "model_implicit_tf_idf100.pkl", 19 | implicit_als_path=ASSETS_DIR / "model_implicit_als_16fact_12iter.pkl", 20 | fm_features_feather_path=ASSETS_DIR / "implicit_scores.feather", 21 | implicit_cosine2_path=ASSETS_DIR / "model_implicit_cosine2.pkl", 22 | umap_item_emb_path=ASSETS_DIR / "umap_item_emb.npy", 23 | item_co_occurrence_path=ASSETS_DIR / "item_co_occurrence_min_cnt_5.npz", 24 | item_occurrence_path=ASSETS_DIR / "item_occurrence.npy", 25 | user_prod_log_idf_path=ASSETS_DIR / "user_prod_log_idf.npy", 26 | tran_prod_log_idf_path=ASSETS_DIR / "tran_prod_log_idf.npy", 27 | N=100, 28 | trunk_svd_arr_path=ASSETS_DIR / "svd_128_components_T.npy", 29 | faiss_index_path=str(ASSETS_DIR / "faiss_base.idx"), 30 | train_scores_path=ASSETS_DIR / "X_scores_sparse.npz", 31 | faiss_neighbors=512, 32 | faiss_nprobe=16, 33 | ) 34 | 35 | @app.route("/ready") 36 | def ready(): 37 | return "OK" 38 | 39 | 40 | @app.route("/recommend", methods=["POST"]) 41 | def recommend(): 42 | r = request.json 43 | 44 | result = PREDICTOR.predict(r, PREDICTOR.lgb_model) 45 | return jsonify({"recommended_products": result}) 46 | 47 | 48 | if __name__ == "__main__": 49 | # Only for debugging while developing 50 | app.run(host="0.0.0.0", debug=True, port=8000) -------------------------------------------------------------------------------- /src/train_i2i_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import implicit 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import sparse as sp 10 | from tqdm import tqdm 11 | 12 | import config as cfg 13 | from utils import ( 14 | ProductEncoder, 15 | get_shard_path, 16 | make_coo_row, 17 | normalized_average_precision, 18 | ) 19 | 20 | if __name__ == "__main__": 21 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 22 | 23 | rows = [] 24 | for i in range(cfg.NUM_SHARDS - 1): 25 | for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))): 26 | rows.append( 27 | make_coo_row(js["transaction_history"], product_encoder, normalize=True) 28 | ) 29 | train_mat = sp.vstack(rows) 30 | 31 | model = implicit.nearest_neighbours.CosineRecommender(K=2) 32 | # model = implicit.nearest_neighbours.CosineRecommender(K=50) 33 | # model = implicit.nearest_neighbours.TFIDFRecommender(K=100) 34 | 35 | # ALS should be trained with normalize = False 36 | # model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12) 37 | model.fit(train_mat.T) 38 | 39 | out_dir = cfg.ASSETS_DIR 40 | os.makedirs(out_dir, exist_ok=True) 41 | print(f"Dump model to {out_dir}") 42 | pickle.dump(model, 
open(out_dir / "model.pkl", "wb")) 43 | 44 | print("Estimate quality...") 45 | scores = [] 46 | for js in tqdm((json.loads(s) for s in open(get_shard_path(cfg.NUM_SHARDS - 1)))): 47 | row = make_coo_row(js["transaction_history"], product_encoder).tocsr() 48 | raw_recs = model.recommend( 49 | userid=0, 50 | user_items=row, 51 | N=30, 52 | filter_already_liked_items=False, 53 | recalculate_user=True, 54 | ) 55 | 56 | recommended_items = product_encoder.toPid([idx for (idx, score) in raw_recs]) 57 | gt_items = js["target"][0]["product_ids"] 58 | nap = normalized_average_precision(gt_items, recommended_items) 59 | scores.append(nap) 60 | print("nap: {}".format(np.mean(scores))) 61 | -------------------------------------------------------------------------------- /src/collect_miscellaneous_features.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import sparse as sp 7 | from tqdm import tqdm 8 | 9 | import config as cfg 10 | from utils import ( 11 | ProductEncoder, 12 | get_shard_path, 13 | make_coo_row, 14 | normalized_average_precision, 15 | ) 16 | 17 | 18 | def update_item_cost(transaction_history, product_encoder, storage): 19 | for txn in transaction_history: 20 | for item in txn["products"]: 21 | key = product_encoder.toIdx(item["product_id"]) 22 | item_cost = item["s"] / max(item["quantity"], 1) 23 | if storage[key] == 0: 24 | storage[key] = item_cost 25 | else: 26 | storage[key] = (storage[key] + item_cost) / 2.0 27 | 28 | 29 | if __name__ == "__main__": 30 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 31 | num_products = product_encoder.num_products 32 | 33 | items_cost = defaultdict(int) 34 | rows = [] 35 | num_transactions = 0 36 | for i in tqdm(range(cfg.NUM_SHARDS)): 37 | for js in tqdm((json.loads(s) for s in open(get_shard_path(i)))): 38 | update_item_cost(js["transaction_history"], product_encoder, items_cost) 39 | rows.append( 40 | make_coo_row( 41 | js["transaction_history"], product_encoder, normalize=False 42 | ) 43 | ) 44 | num_transactions += len(js["transaction_history"]) 45 | trans_mat = sp.vstack(rows) 46 | 47 | items_cnt = trans_mat.sum(axis=0).A[0] 48 | df_top_items = ( 49 | pd.Series(items_cnt, name="items_cnt").sort_values(ascending=False).to_frame() 50 | ) 51 | df_items_cost = pd.Series(items_cost, name="cost").to_frame() 52 | df_misc_features = df_top_items.join(df_items_cost) 53 | df_misc_features["popularity_position"] = range(num_products) 54 | 55 | df_misc_features.to_csv(cfg.ASSETS_DIR / "products_misc.csv") 56 | 57 | # iDF, products in user purchases 58 | bought_products = trans_mat.sum(axis=0).A[0] 59 | user_prod_log_idf = np.log(trans_mat.shape[0] / (bought_products + 1)) 60 | np.save(cfg.ASSETS_DIR / "user_prod_log_idf.npy", user_prod_log_idf) 61 | 62 | # iDF, products in transactions 63 | tran_prod_log_idf = np.log(num_transactions / (bought_products + 1)) 64 | np.save(cfg.ASSETS_DIR / "tran_prod_log_idf.npy", tran_prod_log_idf) 65 | -------------------------------------------------------------------------------- /src/train_valid_split.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import random 4 | from typing import Any, Dict 5 | 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | import config as cfg 10 | 11 | 12 | def transaction_to_target(transaction: Dict[str, Any]) -> Dict[str, Any]: 13 | return { 14 | 
"tid": transaction["tid"], 15 | "datetime": transaction["datetime"], 16 | "product_ids": [e["product_id"] for e in transaction["products"]], 17 | "store_id": transaction["store_id"], 18 | } 19 | 20 | 21 | def get_client_info(client_data_path: str) -> Dict[str, Dict]: 22 | client_info = {} 23 | for row in pd.read_csv(client_data_path).itertuples(): 24 | client_info[row.client_id] = { 25 | "age": row.age, 26 | "gender": row.gender, 27 | "client_id": row.client_id, 28 | } 29 | return client_info 30 | 31 | 32 | if __name__ == "__main__": 33 | random.seed(43) # lets be special 34 | 35 | client_csv_path = cfg.CLIENT_CSV_PATH 36 | jsons_root = cfg.JSONS_DIR 37 | 38 | client_info = get_client_info(client_csv_path) 39 | 40 | print("process shards") 41 | for js_path in tqdm(sorted(glob.glob(jsons_root + "/*.jsons"))): 42 | fout = open(js_path + ".splitted", "w") 43 | for js in (json.loads(s) for s in open(js_path)): 44 | sorted_transactions = sorted( 45 | js["transaction_history"], key=lambda x: x["datetime"] 46 | ) 47 | split_candidates = [ 48 | t["datetime"] 49 | for t in sorted_transactions 50 | if t["datetime"] > cfg.BASE_SPLIT_POINT 51 | ] 52 | if len(split_candidates) == 0: 53 | # no transactions after split points - so we cannot validates on this sample, skip it. 54 | continue 55 | split_point = random.choice(split_candidates) 56 | train_transactions = [ 57 | t for t in sorted_transactions if t["datetime"] < split_point 58 | ] 59 | test_transactons = [ 60 | t for t in sorted_transactions if t["datetime"] >= split_point 61 | ] 62 | 63 | # copy info about client% client_id, age, gender 64 | sample = {**client_info[js["client_id"]]} 65 | sample["transaction_history"] = train_transactions 66 | sample["target"] = [transaction_to_target(x) for x in test_transactons] 67 | 68 | fout.write(json.dumps(sample) + "\n") 69 | fout.close() 70 | -------------------------------------------------------------------------------- /submit/run_queries.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import requests 4 | import datetime as dt 5 | 6 | import numpy as np 7 | import tqdm 8 | 9 | 10 | def average_precision(actual, recommended, k=30): 11 | ap_sum = 0 12 | hits = 0 13 | for i in range(k): 14 | product_id = recommended[i] if i < len(recommended) else None 15 | if product_id is not None and product_id in actual: 16 | hits += 1 17 | ap_sum += hits / (i + 1) 18 | return ap_sum / k 19 | 20 | 21 | def normalized_average_precision(actual, recommended, k=30): 22 | actual = set(actual) 23 | if len(actual) == 0: 24 | return 0.0 25 | 26 | ap = average_precision(actual, recommended, k=k) 27 | ap_ideal = average_precision(actual, list(actual)[:k], k=k) 28 | return ap / ap_ideal 29 | 30 | 31 | def run_queries(url, queryset_file, max_queries=1000): 32 | ap_values = [] 33 | durations = [] 34 | 35 | total_records = 0 36 | with open(queryset_file) as fin: 37 | for line in fin: 38 | total_records += 1 39 | 40 | total_records = min(total_records, max_queries) 41 | 42 | with open(queryset_file) as fin: 43 | for i, line in enumerate(tqdm.tqdm(fin, total=total_records)): 44 | splitted = line.strip().split('\t') 45 | if len(splitted) == 1: 46 | query_data = json.loads(splitted[0]) 47 | next_transaction = query_data['target'][0] 48 | else: 49 | query_data, next_transaction = map(json.loads, splitted) 50 | 51 | start_time = dt.datetime.now() 52 | # resp = requests.post(url, json=query_data, timeout=0.3) 53 | resp = requests.post(url, json=query_data) 54 | 
duration = (dt.datetime.now() - start_time).total_seconds() 55 | durations.append(duration) 56 | resp.raise_for_status() 57 | resp_data = resp.json() 58 | 59 | if len(set(resp_data['recommended_products'])) < 30: 60 | print(query_data) 61 | print(resp_data) 62 | 63 | assert len(resp_data['recommended_products']) == 30 64 | assert len(set(resp_data['recommended_products'])) == 30 65 | assert all(isinstance(item, str) for item in resp_data['recommended_products']) 66 | assert "recommended_products" in resp_data 67 | 68 | ap = normalized_average_precision(next_transaction['product_ids'], resp_data['recommended_products']) 69 | ap_values.append(ap) 70 | 71 | if i >= max_queries: 72 | break 73 | 74 | map_score = sum(ap_values) / len(ap_values) 75 | print("Max time:", np.max(durations), "mean_time:", np.mean(durations), "min_time:", np.min(durations)) 76 | return map_score 77 | 78 | 79 | if __name__ == '__main__': 80 | url = sys.argv[1] # 'http://localhost:8000/recommend' 81 | queryset_file = sys.argv[2] # 'data/check_queries.tsv' 82 | max_queries = int(sys.argv[3]) # 1000 83 | score = run_queries(url, queryset_file, max_queries) 84 | print('Score:', score) 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 1st place solution [RetailHero.ai/#2](https://retailhero.ai/c/recommender_system/overview) 2 | 3 | ## Overview 4 | 5 | [solution presentation](https://github.com/aprotopopov/retailhero_recommender/tree/master/slides/retailhero_recommender.pdf) (in Russian) 6 | 7 | - Data preparation and the train/valid split are heavily based on the excellent [baseline](https://github.com/datagym-ru/retailhero-recomender-baseline) from [@geffy](https://github.com/geffy) 8 | - Co-occurrence of purchased items per transaction 9 | - UMAP embeddings of user transactions 10 | - Collecting miscellaneous features such as `item_cost`, `popularity position`, etc. 11 | - Dataset preparation, saving each chunk as a feather DataFrame 12 | - LightGBM training on a pool of items from the MF models, top products and history items 13 | 14 | ## Steps to prepare data 15 | 16 | 1. Copy data to `data/raw` 17 | 18 | ``` 19 | cd {REPO_ROOT} 20 | mkdir -p data/raw 21 | cp /path/to/unpacked/data/*.csv ./data/raw 22 | cd src 23 | ``` 24 | 25 | 2. Divide the source purchase data into 16 shards 26 | 27 | ```bash 28 | python purchases_to_jrows.py 29 | ``` 30 | 31 | 3. Prepare train/valid data with a structure similar to `check_queries.tsv` 32 | 33 | ```bash 34 | python train_valid_split.py 35 | ``` 36 | 37 | ## Train embeddings and collect statistics 38 | 39 | 1. Collect the co-occurrence matrix 40 | 41 | ```bash 42 | python collect_co_occur_matrix.py 43 | ``` 44 | 45 | 2. Build UMAP embeddings 46 | 47 | ```bash 48 | python train_umap_embeddings.py 49 | ``` 50 | 51 | 3. Collect miscellaneous features such as item popularity position, `item_cost` and iDF for items in transactions 52 | 53 | ```bash 54 | python collect_miscellaneous_features.py 55 | ``` 56 | 57 | ## Train models 58 | 59 | 1. Train item2item `implicit` models. The script has to be launched separately for each model 60 | 61 | ```bash 62 | python train_i2i_model.py 63 | ``` 64 | 65 | 2. Collect the dataset for GBM training 66 | 67 | ```bash 68 | python collect_gbm_dataset.py 69 | ``` 70 | 71 | 3. Train the LGBM model 72 | 73 | ```bash 74 | python train_lgb_model.py 75 | ``` 76 | 77 | 4. 
Create submission file 78 | 79 | ```bash 80 | cd submit 81 | zip -r model.zip solution/* 82 | ``` 83 | 84 | Joint collection and training parts can be launched via `bash train.sh` as well as via notebook `sandbox/GBM.ipynb` 85 | 86 | ## The best achieved results 87 | 88 | Scores (NMAP@30): 89 | Check: 0.1350 90 | Public: 0.1339 91 | Private: 0.148325 92 | Local: 0.155055 93 | 94 | [final submit](https://drive.google.com/file/d/17yR-klDIZ8vXvhTCIAEwkaBqXzuXagsg/view?usp=sharing). Code a bit messier and with less number of features but with better overall result. 95 | 96 | *Note: LightGBM wasn't tuned carefully so higher score is achievable with the current set of features* 97 | 98 | ## Other participants solutions 99 | 100 | - [3rd place](https://github.com/geffy/retailhero-recommender-solution) from [@geffy](https://github.com/geffy). [presentation](https://github.com/geffy/retailhero-recommender-solution/blob/master/slides/retailhero-conf.pdf) 101 | - [8th place](https://github.com/greenwolf-nsk/retailhero-rec) from [@greenwolf-nsk](https://github.com/greenwolf-nsk) 102 | - [9th place](https://github.com/mike-chesnokov/x5_retailhero_2020_recs) from [@mike-chesnokov](https://github.com/mike-chesnokov) 103 | -------------------------------------------------------------------------------- /src/collect_gbm_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import json 4 | import logging 5 | from pathlib import Path 6 | 7 | import feather 8 | import numpy as np 9 | import lightgbm as lgb 10 | import pandas as pd 11 | from scipy import sparse as sp 12 | from tqdm import tqdm 13 | 14 | import config as cfg 15 | from predictors import GBMFeatures, GBMPredictor 16 | from utils import ( 17 | ProductEncoder, 18 | make_coo_row, 19 | normalized_average_precision, 20 | get_shard_path, 21 | cache_to_feather, 22 | get_check_users, 23 | ) 24 | 25 | logging.basicConfig( 26 | level=logging.INFO, 27 | format="%(asctime)s - %(message)s", 28 | handlers=[logging.FileHandler("collect_dataset.log"), logging.StreamHandler()], 29 | ) 30 | 31 | 32 | def get_gbm_records(shard_indices, gbm_feat, max_records=None, **kwargs): 33 | check_users = get_check_users() 34 | 35 | gbm_records = [] 36 | num_records = 0 37 | 38 | for shard_idx in tqdm(shard_indices, leave=False): 39 | for js in tqdm( 40 | (json.loads(s) for s in open(get_shard_path(shard_idx))), leave=False 41 | ): 42 | if js["client_id"] in check_users: 43 | continue 44 | 45 | feat_records, _ = gbm_feat.get_gbm_features(js, train=True, **kwargs) 46 | gbm_records.extend(feat_records) 47 | num_records += 1 48 | 49 | if max_records and num_records >= max_records: 50 | return gbm_records 51 | 52 | return gbm_records 53 | 54 | 55 | def parse_args(): 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--N", type=int, default=100) 58 | parser.add_argument("--max-records", type=int, default=None) 59 | return parser.parse_args() 60 | 61 | 62 | if __name__ == "__main__": 63 | args = parse_args() 64 | N_POOL = args.N 65 | MAX_RECORDS = args.max_records 66 | 67 | logger = logging.getLogger(__name__) 68 | ASSETS_DIR = cfg.ASSETS_DIR 69 | NUM_TEST_SHARD = 15 70 | 71 | # to test pipeline 72 | SHARDS = [14] 73 | 74 | # full training 75 | # SHARDS = range(15) 76 | 77 | gbm_feat = GBMFeatures( 78 | product_csv_path=ASSETS_DIR / "products.csv", 79 | model_pickled_path=ASSETS_DIR / "model_implicit_cosine_50.pkl", 80 | products_misc_path=ASSETS_DIR / "products_misc.csv", 81 | 
product_features_encoder_path=ASSETS_DIR / "product_features.pkl", 82 | implicit_tfidf_path=ASSETS_DIR / "model_implicit_tf_idf100.pkl", 83 | implicit_als_path=ASSETS_DIR / "model_implicit_als_16fact_12iter.pkl", 84 | implicit_cosine2_path=ASSETS_DIR / "model_implicit_cosine2.pkl", 85 | umap_item_emb_path=ASSETS_DIR / "umap_item_emb.npy", 86 | item_co_occurrence_path=ASSETS_DIR / "item_co_occurrence_min_cnt_5.npz", 87 | item_occurrence_path=ASSETS_DIR / "item_occurrence.npy", 88 | user_prod_log_idf_path=ASSETS_DIR / "user_prod_log_idf.npy", 89 | tran_prod_log_idf_path=ASSETS_DIR / "tran_prod_log_idf.npy", 90 | N=N_POOL, 91 | # trunk_svd_arr_path=ASSETS_DIR / "svd_128_components_T.npy", 92 | # faiss_index_path=str(ASSETS_DIR / "faiss_base.idx"), 93 | # train_scores_path=ASSETS_DIR / "X_scores_sparse.npz", 94 | # faiss_neighbors=512, 95 | # faiss_nprobe=16, 96 | ) 97 | 98 | train_dir = Path(f"../tmp/train_chunks_{gbm_feat.N}") 99 | train_dir.mkdir(exist_ok=True) 100 | test_dir = Path(f"../tmp/test_chunks_{gbm_feat.N}") 101 | test_dir.mkdir(exist_ok=True) 102 | 103 | logger.info("Collecting train dataset") 104 | for num_shard in tqdm(SHARDS, leave=False): 105 | gbm_rec_train = get_gbm_records([num_shard], gbm_feat, max_records=MAX_RECORDS) 106 | df_gbm_train_chunk = pd.DataFrame(gbm_rec_train) 107 | 108 | train_shard_path = f"{train_dir}/df_train_{num_shard}.feather" 109 | logger.info(f"Saving train {num_shard} shard to {train_shard_path}") 110 | feather.write_dataframe(df_gbm_train_chunk, train_shard_path) 111 | 112 | del gbm_rec_train 113 | del df_gbm_train_chunk 114 | gc.collect() 115 | 116 | logger.info("Collect test dataset") 117 | gbm_rec_test = get_gbm_records([NUM_TEST_SHARD], gbm_feat, max_records=MAX_RECORDS) 118 | df_gbm_test = pd.DataFrame(gbm_rec_test) 119 | 120 | test_shard_path = f"{test_dir}/df_test_{num_shard}.feather" 121 | logger.info(f"Saving test {NUM_TEST_SHARD} shard to {test_shard_path}") 122 | feather.write_dataframe(df_gbm_test, test_dir / "df_test_15.feather") 123 | 124 | logger.info("Transform and save FM cached features to feather for faster loading") 125 | cache_to_feather(gbm_feat.cache_fm_feat) 126 | -------------------------------------------------------------------------------- /src/purchases_to_jrows.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | import config as cfg 8 | from utils import md5_hash 9 | 10 | 11 | class Transaction: 12 | def __init__(self, transaction_id, transaction_datetime, **kwargs): 13 | self.data = { 14 | **{"tid": transaction_id, "datetime": transaction_datetime, "products": []}, 15 | **kwargs, 16 | } 17 | 18 | def add_item( 19 | self, 20 | product_id: str, 21 | product_quantity: float, 22 | trn_sum_from_iss: float, 23 | trn_sum_from_red: float, 24 | ) -> None: 25 | p = { 26 | "product_id": product_id, 27 | "quantity": product_quantity, 28 | "s": trn_sum_from_iss, 29 | "r": "0" 30 | if trn_sum_from_red is None or pd.isna(trn_sum_from_red) 31 | else trn_sum_from_red, 32 | } 33 | self.data["products"].append(p) 34 | 35 | def as_dict(self,): 36 | return self.data 37 | 38 | def transaction_id(self,): 39 | return self.data["tid"] 40 | 41 | 42 | class ClientHistory: 43 | def __init__(self, client_id): 44 | self.data = {"client_id": client_id, "transaction_history": []} 45 | 46 | def add_transaction(self, transaction): 47 | self.data["transaction_history"].append(transaction) 48 | 49 | def as_dict(self,): 50 | return 
self.data 51 | 52 | def client_id(self,): 53 | return self.data["client_id"] 54 | 55 | 56 | class RowSplitter: 57 | def __init__(self, output_path, n_shards=cfg.NUM_SHARDS): 58 | self.n_shards = n_shards 59 | os.makedirs(output_path, exist_ok=True) 60 | self.outs = [] 61 | for i in range(self.n_shards): 62 | self.outs.append(open(output_path + "/{:02d}.jsons".format(i), "w")) 63 | self._client = None 64 | self._transaction = None 65 | 66 | def finish(self,): 67 | self.flush() 68 | for outs in self.outs: 69 | outs.close() 70 | 71 | def flush(self,): 72 | if self._client is not None: 73 | self._client.add_transaction(self._transaction.as_dict()) 74 | # rows are sharded by cliend_id 75 | shard_idx = md5_hash(self._client.client_id()) % self.n_shards 76 | data = self._client.as_dict() 77 | self.outs[shard_idx].write(json.dumps(data) + "\n") 78 | 79 | self._client = None 80 | self._transaction = None 81 | 82 | def consume_row(self, row): 83 | if self._client is not None and self._client.client_id() != row.client_id: 84 | self.flush() 85 | 86 | if self._client is None: 87 | self._client = ClientHistory(client_id=row.client_id) 88 | 89 | if ( 90 | self._transaction is not None 91 | and self._transaction.transaction_id() != row.transaction_id 92 | ): 93 | self._client.add_transaction(self._transaction.as_dict()) 94 | self._transaction = None 95 | 96 | if self._transaction is None: 97 | self._transaction = Transaction( 98 | transaction_id=row.transaction_id, 99 | transaction_datetime=row.transaction_datetime, 100 | regular_points_received=row.regular_points_received, 101 | express_points_received=row.express_points_received, 102 | regular_points_spent=row.regular_points_spent, 103 | express_points_spent=row.express_points_spent, 104 | purchase_sum=row.purchase_sum, 105 | store_id=row.store_id, 106 | ) 107 | 108 | self._transaction.add_item( 109 | product_id=row.product_id, 110 | product_quantity=row.product_quantity, 111 | trn_sum_from_iss=row.trn_sum_from_iss, 112 | trn_sum_from_red=row.trn_sum_from_red, 113 | ) 114 | 115 | 116 | def split_data_to_chunks( 117 | input_path, output_dir, n_shards=cfg.NUM_SHARDS, max_chunks=None 118 | ): 119 | splitter = RowSplitter(output_path=output_dir, n_shards=n_shards) 120 | print("split_data_to_chunks: {} -> {}".format(input_path, output_dir)) 121 | for i, df in enumerate(tqdm(pd.read_csv(input_path, chunksize=500000))): 122 | for row in df.itertuples(): 123 | splitter.consume_row(row) 124 | if max_chunks and i == max_chunks: 125 | splitter.finish() 126 | return 127 | splitter.finish() 128 | 129 | 130 | def calculate_unique_clients_from_input(input_path, max_chunks=None): 131 | client_set = set() 132 | print("calculate_unique_clients_from: {}".format(input_path)) 133 | for i, df in enumerate(tqdm(pd.read_csv(input_path, chunksize=500000))): 134 | client_set.update(set([row.client_id for row in df.itertuples()])) 135 | if max_chunks and i == max_chunks: 136 | break 137 | return len(client_set) 138 | 139 | 140 | def calculate_unique_clients_from_output(output_dir,): 141 | import glob 142 | 143 | client_cnt = 0 144 | print("calculate_unique_clients_from: {}".format(output_dir)) 145 | for js_file in glob.glob(output_dir + "/*.jsons"): 146 | for _ in open(js_file): 147 | client_cnt += 1 148 | return client_cnt 149 | 150 | 151 | if __name__ == "__main__": 152 | purchases_csv_path = cfg.PURCHASE_CSV_PATH 153 | output_jsons_dir = cfg.JSONS_DIR 154 | max_chunks = cfg.MAX_CHUNKS 155 | 156 | split_data_to_chunks( 157 | purchases_csv_path, 158 | output_jsons_dir, 159 | 
n_shards=cfg.NUM_SHARDS, 160 | max_chunks=max_chunks, 161 | ) 162 | 163 | # check splitting for correctness 164 | _from_input = calculate_unique_clients_from_input( 165 | purchases_csv_path, max_chunks=max_chunks 166 | ) 167 | _from_output = calculate_unique_clients_from_output(output_jsons_dir) 168 | assert _from_input == _from_output 169 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import hashlib 3 | from typing import List, Set 4 | from collections import defaultdict 5 | 6 | import feather 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import sparse as sp 10 | 11 | import config as cfg 12 | 13 | 14 | class ProductEncoder: 15 | def __init__(self, product_csv_path): 16 | self.product_idx = {} 17 | self.product_pid = {} 18 | for idx, pid in enumerate(pd.read_csv(product_csv_path).product_id.values): 19 | self.product_idx[pid] = idx 20 | self.product_pid[idx] = pid 21 | 22 | def toIdx(self, x): 23 | if type(x) == str: 24 | pid = x 25 | return self.product_idx[pid] 26 | return [self.product_idx[pid] for pid in x] 27 | 28 | def toPid(self, x): 29 | if type(x) == int: 30 | idx = x 31 | return self.product_pid[idx] 32 | return [self.product_pid[idx] for idx in x] 33 | 34 | @property 35 | def num_products(self): 36 | return len(self.product_idx) 37 | 38 | 39 | class ProductFeatEncoder: 40 | def __init__(self, product_csv_path, init=True, exclude=None): 41 | # self.products = pd.read_csv(product_csv_path).fillna(0) 42 | self.products = pd.read_csv(product_csv_path) 43 | self.products_map = defaultdict(dict) 44 | self.features = defaultdict(dict) 45 | if exclude: 46 | self.products = self.products.drop(exclude, axis=1) 47 | 48 | if init: 49 | self.create_encoder() 50 | self.create_features() 51 | 52 | def create_encoder(self): 53 | cols = self.products.dtypes[self.products.dtypes == "object"].index.tolist() 54 | global_idx = 0 55 | for col in cols: 56 | self.products_map[col] = defaultdict(dict) 57 | for idx, pid in enumerate(self.products[col].unique()): 58 | self.products_map[col]["pid"][pid] = idx 59 | self.products_map[col]["idx"][idx] = pid 60 | self.products_map[col]["gidx"][idx] = pid 61 | 62 | def create_features(self): 63 | for tup in self.products.set_index("product_id").itertuples(): 64 | for name, val in tup._asdict().items(): 65 | if name == "Index": 66 | continue 67 | 68 | val = ( 69 | self.products_map[name]["pid"][val] 70 | if name in self.products_map 71 | else val 72 | ) 73 | self.features[tup.Index][name] = val 74 | 75 | def product_features(self, ids): 76 | if type(ids) == str: 77 | return self.features[ids] 78 | return [self.features[product_id] for product_id in ids] 79 | 80 | def product_features_idx(self, ind): 81 | if type(ind) == str: 82 | return self.features[ind] 83 | return [ 84 | self.features[self.products_map["product_id"]["idx"][idx]] for idx in ind 85 | ] 86 | 87 | 88 | class TrainingSample: 89 | def __init__( 90 | self, row: sp.coo_matrix, target_items: Set[int], client_id: str = None 91 | ): 92 | self.row = row 93 | self.target_items = target_items 94 | self.client_id = client_id 95 | 96 | 97 | def make_coo_row( 98 | transaction_history, 99 | product_encoder: ProductEncoder, 100 | last_transaction=False, 101 | normalize=True, 102 | entity=False, 103 | ): 104 | idx = [] 105 | values = [] 106 | 107 | items = defaultdict(int) 108 | if last_transaction: 109 | transaction_history = 
transaction_history[-1:] 110 | 111 | for trans in transaction_history: 112 | for i in trans["products"]: 113 | pidx = product_encoder.toIdx(i["product_id"]) 114 | items[pidx] += 1.0 115 | n_items = sum(items.values()) 116 | 117 | for pidx, val in items.items(): 118 | idx.append(pidx) 119 | if normalize: 120 | val = val / n_items 121 | if entity: 122 | val = 1 123 | values.append(val) 124 | 125 | return sp.coo_matrix( 126 | (np.array(values).astype(np.float32), ([0] * len(idx), idx)), 127 | shape=(1, product_encoder.num_products), 128 | ) 129 | 130 | 131 | def cache_to_feather(cache_fm, num_products=43038): 132 | dfs = [] 133 | for key, item in cache_fm.items(): 134 | df_model = pd.Series(item).apply(pd.Series) 135 | dfs.append(df_model) 136 | 137 | df_scores = dfs[0] 138 | for df in dfs[1:]: 139 | df_scores = df_scores.join(df) 140 | 141 | df_scores = df_scores.sort_index().reindex(range(num_products)).fillna(0) 142 | feather.write_dataframe(df_scores, cfg.ASSETS_DIR / "implicit_scores.feather") 143 | 144 | 145 | def create_products_in_transaction( 146 | transaction_history, product_encoder: ProductEncoder, outfile 147 | ): 148 | """Collect item2vec file.""" 149 | for trans in transaction_history: 150 | products_str = " ".join( 151 | str(product_encoder.toIdx(i["product_id"])) for i in trans["products"] 152 | ) 153 | outfile.write(products_str + "\n") 154 | outfile.flush() 155 | 156 | 157 | def update_item_cost(transaction_history, product_encoder, storage): 158 | for txn in transaction_history: 159 | for item in txn["products"]: 160 | key = product_encoder.toIdx(item["product_id"]) 161 | item_cost = item["s"] / max(item["quantity"], 1) 162 | 163 | if storage[key] == 0: 164 | storage[key] = item_cost 165 | else: 166 | storage[key] = (storage[key] + item_cost) / 2.0 167 | 168 | 169 | def average_precision(actual, recommended, k=30): 170 | ap_sum = 0 171 | hits = 0 172 | for i in range(k): 173 | product_id = recommended[i] if i < len(recommended) else None 174 | if product_id is not None and product_id in actual: 175 | hits += 1 176 | ap_sum += hits / (i + 1) 177 | return ap_sum / k 178 | 179 | 180 | def normalized_average_precision(actual, recommended, k=30): 181 | actual = set(actual) 182 | if len(actual) == 0: 183 | return 0.0 184 | 185 | ap = average_precision(actual, recommended, k=k) 186 | ap_ideal = average_precision(actual, list(actual)[:k], k=k) 187 | return ap / ap_ideal 188 | 189 | 190 | def recall_k(actual, recommended, k=30): 191 | return len(set(actual).intersection(set(recommended[:k]))) / max( 192 | len(set(actual)), 1 193 | ) 194 | 195 | 196 | def get_shard_path(n_shard, jsons_dir=cfg.JSONS_DIR): 197 | return "{}/{:02d}.jsons.splitted".format(jsons_dir, n_shard) 198 | 199 | 200 | def md5_hash(x): 201 | return int(hashlib.md5(x.encode()).hexdigest(), 16) 202 | 203 | 204 | def get_check_users(): 205 | check_users = [] 206 | with open(cfg.CHECK_QUERY_PATH) as f: 207 | for line in f: 208 | query_data, _ = line.strip().split("\t") 209 | client_id = json.loads(query_data)["client_id"] 210 | check_users.append(client_id) 211 | return check_users 212 | -------------------------------------------------------------------------------- /src/train_lgb_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import json 4 | import logging 5 | import pprint 6 | import sys 7 | from pathlib import Path 8 | 9 | import feather 10 | import numpy as np 11 | import lightgbm as lgb 12 | import pandas as pd 13 | from scipy 
import sparse as sp 14 | from tqdm import tqdm 15 | 16 | import config as cfg 17 | from predictors import GBMFeatures, GBMPredictor 18 | from utils import ( 19 | ProductEncoder, 20 | make_coo_row, 21 | normalized_average_precision, 22 | get_shard_path, 23 | cache_to_feather, 24 | ) 25 | 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format="%(asctime)s - %(message)s", 29 | handlers=[logging.FileHandler("lgb_model.log"), logging.StreamHandler()], 30 | ) 31 | 32 | 33 | def evalute_queries(queryset_file, max_records=1000): 34 | check_scores = [] 35 | with open(queryset_file) as fin: 36 | for i, line in enumerate(tqdm(fin)): 37 | splitted = line.strip().split("\t") 38 | if len(splitted) == 1: 39 | query_data = json.loads(splitted[0]) 40 | next_transaction = query_data["target"][0] 41 | else: 42 | query_data, next_transaction = map(json.loads, splitted) 43 | query_data["target"] = [next_transaction] 44 | 45 | query_data["transaction_history"] = sorted( 46 | query_data["transaction_history"], key=lambda x: x["datetime"] 47 | ) 48 | recommended_items = PREDICTOR.predict(query_data, PREDICTOR.lgb_model) 49 | 50 | gt_items = query_data["target"][0]["product_ids"] 51 | nap = normalized_average_precision(gt_items, recommended_items) 52 | check_scores.append(nap) 53 | 54 | if i == max_records: 55 | break 56 | return np.mean(check_scores) 57 | 58 | 59 | def parse_args(): 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument("--N", type=int, default=100) 62 | return parser.parse_args() 63 | 64 | 65 | if __name__ == "__main__": 66 | args = parse_args() 67 | N_POOL = args.N 68 | 69 | ASSETS_DIR = cfg.ASSETS_DIR 70 | SHARDS = range(cfg.NUM_SHARDS - 1) 71 | NUM_TEST_SHARD = cfg.NUM_SHARDS - 1 72 | 73 | TRAIN_DIR = Path(f"../tmp/train_chunks_{N_POOL}") 74 | TEST_DIR = Path(f"../tmp/test_chunks_{N_POOL}") 75 | logger = logging.getLogger(__name__) 76 | product_encoder = ProductEncoder(cfg.PRODUCT_CSV_PATH) 77 | 78 | logger.info("Loading train dataset") 79 | dfs = [] 80 | for num_shard in tqdm(SHARDS, leave=False): 81 | if Path(f"{TRAIN_DIR}/df_train_{num_shard}.feather").exists(): 82 | dfs.append( 83 | feather.read_dataframe(f"{TRAIN_DIR}/df_train_{num_shard}.feather") 84 | ) 85 | 86 | logger.info("Join chunks to full train dataframe") 87 | df_gbm_train = pd.concat(dfs, sort=False) 88 | logger.info(f"Shape of the train dataframe {df_gbm_train.shape}") 89 | 90 | del dfs 91 | gc.collect() 92 | 93 | logger.info("Loading test dataset") 94 | df_gbm_test = feather.read_dataframe(TEST_DIR / f"df_test_{NUM_TEST_SHARD}.feather") 95 | gt_all_rec_test = [] 96 | for js in tqdm( 97 | (json.loads(s) for s in open(get_shard_path(NUM_TEST_SHARD))), leave=False 98 | ): 99 | target_products = set( 100 | product_encoder.toIdx([pid for pid in js["target"][0]["product_ids"]]) 101 | ) 102 | gt_products = dict(client_id=js["client_id"], products=list(target_products)) 103 | gt_all_rec_test.append(gt_products) 104 | logger.info(f"Shape of the test dataframe {df_gbm_test.shape}") 105 | 106 | logger.info("Add query_id column") 107 | df_gbm_train["query_id"] = df_gbm_train.groupby("client_id").ngroup() 108 | df_gbm_test["query_id"] = df_gbm_test.groupby("client_id").ngroup() 109 | 110 | logger.info("Build LGB datasets") 111 | drop_cols = ["client_id", "target", "query_id"] 112 | train_ds = lgb.Dataset( 113 | df_gbm_train.drop(drop_cols, errors="ignore", axis=1), 114 | df_gbm_train["target"], 115 | group=df_gbm_train["query_id"].value_counts().sort_index().values, 116 | ) 117 | test_ds = lgb.Dataset( 118 | 
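        # `group` below holds the number of candidate rows per client query; LightGBM
        # needs these group sizes to evaluate the ranking metric ("map" at eval_at=30)
        # per user, and the train dataset above is built the same way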
df_gbm_test.drop(drop_cols, errors="ignore", axis=1), 119 | df_gbm_test["target"], 120 | group=df_gbm_test["query_id"].value_counts().sort_index().values, 121 | ) 122 | 123 | lgb_params = dict( 124 | objective="binary", 125 | # objective='lambdarank', 126 | max_depth=12, 127 | random_state=42, 128 | learning_rate=0.05, 129 | lambda_l2=10, 130 | metric=("binary", "map"), 131 | eval_at=30, 132 | max_bin=63, 133 | first_metric_only=True, 134 | ) 135 | num_boost_round = 6000 136 | logger.info("LGB params:\n%s", pprint.pformat(lgb_params)) 137 | 138 | gbm = lgb.train( 139 | lgb_params, 140 | train_ds, 141 | num_boost_round, 142 | valid_sets=(train_ds, test_ds), 143 | verbose_eval=10, 144 | early_stopping_rounds=100, 145 | ) 146 | 147 | drop_cols = ["client_id", "target", "lgb_scores", "query_id"] 148 | lgb_scores = gbm.predict(df_gbm_test.drop(drop_cols, axis=1, errors="ignore")) 149 | df_gbm_test["lgb_scores"] = lgb_scores 150 | 151 | lgb_ranked = ( 152 | df_gbm_test.groupby("client_id")[["idx", "lgb_scores"]] 153 | .apply( 154 | lambda x: x.sort_values("lgb_scores", ascending=False)[:30]["idx"].tolist() 155 | ) 156 | .to_dict() 157 | ) 158 | 159 | gt_test = {item["client_id"]: item["products"] for item in gt_all_rec_test} 160 | scores = [] 161 | for client_id, recommended_idx in lgb_ranked.items(): 162 | ap = normalized_average_precision(gt_test[client_id], recommended_idx) 163 | scores.append(ap) 164 | model_score = np.mean(scores) 165 | logger.info(f"Test score: {model_score}") 166 | 167 | params_str = "__".join( 168 | "_".join(map(str, item)) for item in gbm.params.items() if item[0] != "metric" 169 | ) 170 | model_filename = f"lgbm_model__pool_{N_POOL}__{params_str}__{model_score:.6f}.txt" 171 | model_path = str(ASSETS_DIR / model_filename) 172 | gbm.save_model(model_path) 173 | logger.info(f"Model was saved to {model_path}") 174 | 175 | # Check predictor 176 | PREDICTOR = GBMPredictor( 177 | lgbm_model_path=str(ASSETS_DIR / model_filename), 178 | product_csv_path=ASSETS_DIR / "products.csv", 179 | model_pickled_path=ASSETS_DIR / "model_implicit_cosine_50.pkl", 180 | products_misc_path=ASSETS_DIR / "products_misc.csv", 181 | product_features_encoder_path=ASSETS_DIR / "product_features.pkl", 182 | implicit_tfidf_path=ASSETS_DIR / "model_implicit_tf_idf100.pkl", 183 | implicit_als_path=ASSETS_DIR / "model_implicit_als_16fact_12iter.pkl", 184 | fm_features_feather_path=ASSETS_DIR / "implicit_scores.feather", 185 | implicit_cosine2_path=ASSETS_DIR / "model_implicit_cosine2.pkl", 186 | umap_item_emb_path=ASSETS_DIR / "umap_item_emb.npy", 187 | item_co_occurrence_path=ASSETS_DIR / "item_co_occurrence_min_cnt_5.npz", 188 | item_occurrence_path=ASSETS_DIR / "item_occurrence.npy", 189 | user_prod_log_idf_path=ASSETS_DIR / "user_prod_log_idf.npy", 190 | tran_prod_log_idf_path=ASSETS_DIR / "tran_prod_log_idf.npy", 191 | N=N_POOL, 192 | # trunk_svd_arr_path=ASSETS_DIR / "svd_128_components_T.npy", 193 | # faiss_index_path=str(ASSETS_DIR / "faiss_base.idx"), 194 | # train_scores_path=ASSETS_DIR / "X_scores_sparse.npz", 195 | # faiss_neighbors=512, 196 | # faiss_nprobe=8, 197 | ) 198 | 199 | # check queries 200 | check_queryset_file = cfg.CHECK_QUERY_PATH 201 | logger.info(f"Evaluating check queries {check_queryset_file}") 202 | check_score = evalute_queries(check_queryset_file) 203 | logger.info(f"Check score: {check_score}") 204 | 205 | # test queries 206 | max_records = 1000 207 | queryset_file = f"{cfg.JSONS_DIR}/{NUM_TEST_SHARD}.jsons.splitted" 208 | logger.info( 209 | f"Evaluating test queries 
{queryset_file} with {max_records} max_records" 210 | ) 211 | test_score = evalute_queries(queryset_file, max_records=max_records) 212 | logger.info(f"Test score: {test_score}") 213 | -------------------------------------------------------------------------------- /src/predictors.py: -------------------------------------------------------------------------------- 1 | import random 2 | import datetime as dt 3 | import itertools as it 4 | import pickle 5 | from collections import defaultdict, Counter 6 | 7 | import faiss 8 | import lightgbm as lgb 9 | import math 10 | import numpy as np 11 | import pandas as pd 12 | import feather 13 | from catboost import CatBoost 14 | from scipy import sparse as sp 15 | 16 | from utils import ProductEncoder, ProductFeatEncoder, make_coo_row 17 | 18 | 19 | class GBMFeatures: 20 | def __init__( 21 | self, 22 | product_csv_path, 23 | model_pickled_path, 24 | products_misc_path, 25 | implicit_tfidf_path=None, 26 | product_features_encoder_path=None, 27 | implicit_als_path=None, 28 | N=100, 29 | cache_fm_path=None, 30 | implicit_cosine2_path=None, 31 | fm_features_feather_path=None, 32 | trunk_svd_arr_path=None, 33 | faiss_index_path=None, 34 | train_scores_path=None, 35 | umap_item_emb_path=None, 36 | faiss_neighbors=128, 37 | faiss_nprobe=32, 38 | item_co_occurrence_path=None, 39 | item_occurrence_path=None, 40 | user_prod_log_idf_path=None, 41 | tran_prod_log_idf_path=None, 42 | random_seed=0, 43 | ): 44 | self.product_encoder = ProductEncoder(product_csv_path) 45 | 46 | if not product_features_encoder_path: 47 | self.product_features = ProductFeatEncoder(product_csv_path) 48 | else: 49 | self.product_features = pickle.load( 50 | open(product_features_encoder_path, "rb") 51 | ) 52 | 53 | self.model = pickle.load(open(model_pickled_path, "rb")) 54 | self.init_misc_features(products_misc_path, N) 55 | 56 | if implicit_tfidf_path: 57 | self.implicit_tfidf = pickle.load(open(implicit_tfidf_path, "rb")) 58 | if implicit_als_path: 59 | self.implicit_als = pickle.load(open(implicit_als_path, "rb")) 60 | if implicit_cosine2_path: 61 | self.implicit_cosine2 = pickle.load(open(implicit_cosine2_path, "rb")) 62 | 63 | if cache_fm_path is not None: 64 | self.cache_fm_feat = pickle.load(open(cache_fm_path, "rb")) 65 | else: 66 | self.cache_fm_feat = defaultdict(dict) 67 | 68 | if trunk_svd_arr_path: 69 | self.trunk_svd_arr = np.load(trunk_svd_arr_path) 70 | if train_scores_path: 71 | self.X_scores = sp.load_npz(train_scores_path) 72 | 73 | self.faiss_neighbors = faiss_neighbors 74 | self.faiss_nprobe = faiss_nprobe 75 | if faiss_index_path: 76 | self.faiss_index = faiss.read_index(faiss_index_path) 77 | self.faiss_index.nprobe = faiss_nprobe 78 | 79 | if item_occurrence_path: 80 | # add 1 to be able to divide co_occurrence features on occurrence 81 | self.item_occurrence = np.load(item_occurrence_path) + 1 82 | if item_co_occurrence_path: 83 | self.item_co_occurrence = sp.load_npz(item_co_occurrence_path) 84 | 85 | if user_prod_log_idf_path: 86 | self.user_prod_log_idf = np.load(user_prod_log_idf_path) 87 | if tran_prod_log_idf_path: 88 | self.tran_prod_log_idf = np.load(tran_prod_log_idf_path) 89 | 90 | if umap_item_emb_path: 91 | self.umap_item_emb = np.load(umap_item_emb_path) 92 | 93 | if fm_features_feather_path: 94 | self.feather_fm_features = feather.read_dataframe(fm_features_feather_path) 95 | self.feather_names = self.feather_fm_features.columns 96 | self.feather_values = self.feather_fm_features.values 97 | self.get_implicit_features = self.get_feather_features 
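            # fast path for inference: implicit-model scores were precomputed and dumped
            # to implicit_scores.feather (see cache_to_feather in utils.py), so per-item
            # features become a plain row lookup instead of querying the implicit models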
98 | else: 99 | self.get_implicit_features = self.get_implicit_train_model_features 100 | 101 | self.feature_extractor = {"gender": {"M": 0, "F": 1, "U": 2}} 102 | 103 | self.N = N 104 | self.N_05 = self.N // 2 105 | self.pos_0_score = 1000 106 | self.datetime_stamp_2000 = dt.datetime(2000, 1, 1).toordinal() 107 | 108 | # cache 109 | self.default_history_feat = defaultdict(dict) 110 | self.date_cache = {} 111 | self.cooccur_default_values = {} 112 | 113 | random.seed(random_seed) 114 | 115 | def init_misc_features(self, path, N): 116 | self.df_products_misc = pd.read_csv(path, index_col=0) 117 | self.pos_in_top = self.df_products_misc["popularity_position"].to_dict() 118 | self.top_products = {pos: idx for idx, pos in self.pos_in_top.items()} 119 | self.product_cost = self.df_products_misc["cost"].round(2).to_dict() 120 | self.top_n_items = list(self.top_products.values())[:N] 121 | 122 | # TODO: refactor 123 | def get_items_pool(self, product_ind_csr, product_ind_csr_not_normed): 124 | selected_items = set() 125 | 126 | # add cosine 50 items 127 | recs_cosine50 = self.model.recommend( 128 | userid=0, 129 | user_items=product_ind_csr, 130 | N=self.N, 131 | filter_already_liked_items=False, 132 | recalculate_user=True, 133 | ) 134 | selected_items |= set(item[0] for item in recs_cosine50) 135 | 136 | # add tfidf items 137 | recs_tfidf = self.implicit_tfidf.recommend( 138 | userid=0, 139 | user_items=product_ind_csr, 140 | N=self.N, 141 | filter_already_liked_items=False, 142 | recalculate_user=True, 143 | ) 144 | selected_items |= set(item[0] for item in recs_tfidf) 145 | 146 | # add cosine2 items 147 | recs_cosine2 = self.implicit_cosine2.recommend( 148 | userid=0, 149 | user_items=product_ind_csr, 150 | N=self.N, 151 | filter_already_liked_items=False, 152 | recalculate_user=True, 153 | ) 154 | selected_items |= set(item[0] for item in recs_cosine2) 155 | 156 | # TODO: increase speed or delete 157 | # add ALS items 158 | # recs_als = self.implicit_als.recommend( 159 | # userid=0, user_items=product_ind_csr_not_normed, N=self.N, 160 | # filter_already_liked_items=False, recalculate_user=True, 161 | # ) 162 | # selected_items |= set(item[0] for item in recs_als) 163 | 164 | # add top items 165 | selected_items |= set(self.top_n_items) 166 | 167 | # add top history items 168 | if product_ind_csr.nnz > 0: 169 | selected_items |= set( 170 | self._get_top_recommendations( 171 | product_ind_csr.A[0], k=min(product_ind_csr.nnz + 1, self.N_05) 172 | )[0] 173 | ) 174 | return selected_items 175 | 176 | def get_default_history_feat(self, idx, cache=True): 177 | if cache and idx in self.default_history_feat: 178 | return self.default_history_feat[idx] 179 | 180 | product_feats = {} 181 | product_feats["quantity"] = 0 182 | product_feats["items_in_different_transactions"] = 0 183 | product_feats["mean_items_in_trans"] = 0 184 | product_feats["mean_items_in_trans_log"] = 0 185 | product_feats["mean_items_in_history"] = 0 186 | product_feats["mean_items_in_trans_log_denom"] = 0 187 | 188 | product_feats["user_item_idf"] = -1 189 | product_feats["user_tran_idf_mult_total_idf"] = -1 190 | 191 | product_feats["total_items_in_history_pct"] = 0 192 | product_feats["item_spent"] = 0 193 | product_feats["item_cost"] = self.product_cost[idx] 194 | 195 | product_feats["item_is_in_last_transaction"] = 0 196 | product_feats["last_item_purchase_num_days"] = -1 197 | product_feats["mean_item_days_between_purchase"] = -1 198 | 199 | product_feats["cooc_item_sum_score"] = -1 200 | product_feats["cooc_item_mean_score"] 
= -1 201 | product_feats["cooc_mean_from_items"] = -1 202 | product_feats["cooc_sum_from_items"] = -1 203 | product_feats["cooc_max_from_items"] = -1 204 | 205 | if cache: 206 | self.default_history_feat[idx] = product_feats 207 | return self.default_history_feat[idx] 208 | 209 | def to_date(self, x, cache=True): 210 | """ 211 | Convert datetime to datetime.date 212 | 213 | # format for training and inference is different: 214 | # training: "%Y-%m-%d %H:%M:%S" 215 | # inference: "%Y-%m-%dT%H:%M:%S" 216 | """ 217 | str_date = x[:10] 218 | if str_date in self.date_cache: 219 | return self.date_cache[str_date] 220 | 221 | # self.date_cache[str_date] = dt.datetime.strptime(str_date, "%Y-%m-%d") 222 | # faster approach 223 | self.date_cache[str_date] = dt.datetime(*map(int, str_date.split("-"))) 224 | return self.date_cache[str_date] 225 | 226 | def get_product_feat_from_history(self, transaction_history, cooc_scores, item2pos): 227 | product_feats = defaultdict(Counter) 228 | if not transaction_history: 229 | return product_feats 230 | 231 | items_cooc_scores = np.zeros(len(cooc_scores)) 232 | items_dates_purchase = defaultdict(list) 233 | num_items_in_purchase = 0 234 | 235 | for txn in transaction_history: 236 | purchase_date = self.to_date(txn["datetime"]) 237 | product_ind = self.product_encoder.toIdx( 238 | item["product_id"] for item in txn["products"] 239 | ) 240 | product_cooc_pos = [item2pos[pidx] for pidx in product_ind] 241 | items_cooc_scores[product_cooc_pos] += cooc_scores[product_cooc_pos].sum( 242 | axis=1 243 | ) 244 | 245 | for pidx, item in zip(product_ind, txn["products"]): 246 | product_feats[pidx]["quantity"] += item["quantity"] 247 | product_feats[pidx]["items_in_different_transactions"] += 1 248 | num_items_in_purchase += 1 249 | items_dates_purchase[pidx].append(purchase_date) 250 | 251 | num_items_in_purchase = max(num_items_in_purchase, 1) 252 | len_trans = max(len(transaction_history), 1) 253 | cooc_mean_from_items = items_cooc_scores.mean() 254 | cooc_sum_from_items = items_cooc_scores.sum() 255 | cooc_max_from_items = items_cooc_scores.max() 256 | 257 | for pidx, item in product_feats.items(): 258 | num_trans_with_item = item["items_in_different_transactions"] 259 | item["mean_items_in_trans"] = num_trans_with_item / len_trans 260 | # 1e-2 for better distribution 261 | item["mean_items_in_trans_log"] = np.log( 262 | num_trans_with_item / len_trans + 1e-2 263 | ) 264 | item["mean_items_in_history"] = num_trans_with_item / num_items_in_purchase 265 | item["mean_items_in_trans_log_denom"] = num_trans_with_item / ( 266 | 1 + math.log(len_trans) 267 | ) 268 | 269 | # iDF scores, usual item-item TF-iDF is calculated via the implicit model 270 | item["user_item_idf"] = self.user_prod_log_idf[pidx] 271 | item["user_tran_idf_mult_total_idf"] = ( 272 | math.log(len_trans / num_trans_with_item) * self.tran_prod_log_idf[pidx] 273 | ) 274 | 275 | max_quantity = max(item["quantity"], 1) 276 | item["total_items_in_history_pct"] = max_quantity / num_items_in_purchase 277 | item["item_spent"] = self.product_cost[pidx] * max_quantity 278 | item["item_cost"] = self.product_cost[pidx] 279 | 280 | item["item_is_in_last_transaction"] = int( 281 | items_dates_purchase[pidx][-1] == purchase_date 282 | ) 283 | item["last_item_purchase_num_days"] = ( 284 | purchase_date - items_dates_purchase[pidx][-1] 285 | ).days 286 | 287 | item["cooc_item_sum_score"] = items_cooc_scores[item2pos[pidx]] 288 | item["cooc_item_mean_score"] = ( 289 | items_cooc_scores[item2pos[pidx]] / num_trans_with_item 
290 |             )
291 |             item["cooc_mean_from_items"] = cooc_mean_from_items
292 |             item["cooc_sum_from_items"] = cooc_sum_from_items
293 |             item["cooc_max_from_items"] = cooc_max_from_items
294 | 
295 |             if len(items_dates_purchase[pidx]) > 1:
296 |                 items_purchase = items_dates_purchase[pidx]
297 |                 items_days_between_purchase = [
298 |                     (end_date - start_date).days
299 |                     for start_date, end_date in zip(
300 |                         items_purchase[:-1], items_purchase[1:]
301 |                     )
302 |                 ]
303 |                 item["mean_item_days_between_purchase"] = sum(
304 |                     items_days_between_purchase
305 |                 ) / len(items_days_between_purchase)
306 |             else:
307 |                 item["mean_item_days_between_purchase"] = -1
308 | 
309 |         return product_feats
310 | 
311 |     def get_highlevel_feat_from_history(self, transaction_history):
312 |         feats = {}
313 |         num_items_in_purchase = []
314 |         uniq_stores = set()
315 |         purchase_sum = 0
316 |         prev_date = None
317 |         dates_intervals = []
318 |         min_date = None
319 | 
320 |         for txn in transaction_history:
321 |             purchase_sum += txn["purchase_sum"]
322 |             uniq_stores.add(txn["store_id"])
323 |             num_items_in_purchase.append(len(txn["products"]))
324 | 
325 |             cur_date = self.to_date(txn["datetime"])
326 |             if prev_date:
327 |                 date_diff = (cur_date - prev_date).days
328 |                 dates_intervals.append(date_diff)
329 | 
330 |             if min_date is None:
331 |                 min_date = cur_date
332 | 
333 |             prev_date = cur_date
334 | 
335 |         total_purchases = sum(num_items_in_purchase)
336 |         feats["purchase_sum"] = purchase_sum
337 |         feats["num_uniq_store"] = len(uniq_stores)
338 |         feats["mean_num_items"] = total_purchases / max(len(num_items_in_purchase), 1)
339 |         feats["total_bought_items"] = total_purchases
340 |         feats["max_bought_items"] = (
341 |             max(num_items_in_purchase) if num_items_in_purchase else 0
342 |         )
343 |         feats["mean_days_between_purchases"] = len(dates_intervals) / max(
344 |             sum(dates_intervals), 1
345 |         )  # NB: despite the name, this is the inverse ratio (purchases per elapsed day)
346 |         feats["max_days_between_purchases"] = (
347 |             max(dates_intervals) if dates_intervals else -1
348 |         )
349 | 
350 |         if min_date is not None:
351 |             feats["days_between_first_and_last_purchase"] = (cur_date - min_date).days
352 |         else:
353 |             feats["days_between_first_and_last_purchase"] = -1
354 | 
355 |         return feats
356 | 
357 |     def get_implicit_model_features(
358 |         self,
359 |         model,
360 |         idx,
361 |         model_prefix="fm",
362 |         n=5,
363 |         k_score=0,
364 |         skip_first=False,
365 |         only_score=True,
366 |         cache=True,
367 |         only_second=True,
368 |     ):
369 |         model_cache_name = str(model.__class__) + model_prefix
370 |         if cache and idx in self.cache_fm_feat[model_cache_name]:
371 |             return self.cache_fm_feat[model_cache_name][idx]
372 | 
373 |         similar_items = model.similar_items(idx, max(k_score, n))
374 |         fm_features = {}
375 |         for i, (_, score) in enumerate(similar_items[:n]):
376 |             if i == 0 and skip_first:
377 |                 continue
378 | 
379 |             fm_features[f"top_item_{model_prefix}_{i}_score"] = score
380 | 
381 |         if k_score != 0:
382 |             fm_features[f"sum_top_{model_prefix}_{k_score}_score"] = sum(
383 |                 score for _, score in similar_items
384 |             )
385 |             fm_features[f"mean_top_{model_prefix}_{k_score}_score"] = np.mean(
386 |                 [score for _, score in similar_items]
387 |             )
388 | 
389 |         if cache:
390 |             self.cache_fm_feat[model_cache_name][idx] = fm_features
391 |         return fm_features
392 | 
393 |     def get_feather_features(self, idx):
394 |         """Transform a row of the feather dataframe into a features dict."""
395 |         scores = self.feather_values[idx]
396 |         return {key: val for key, val in zip(self.feather_names, scores)}
397 | 
398 |     def get_implicit_train_model_features(self, idx):
399 |         return {
400 |             **self.get_implicit_model_features(
401 |                 self.model, idx, "cosine50", n=3, k_score=0
402 |             ),
403 |             **self.get_implicit_model_features(
404 |                 self.implicit_cosine2, idx, "cosine2", n=2, skip_first=True, k_score=0
405 |             ),
406 |             **self.get_implicit_model_features(
407 |                 self.implicit_tfidf, idx, "tfidf50", n=3, skip_first=True, k_score=0
408 |             ),
409 |             **self.get_implicit_model_features(
410 |                 self.implicit_als, idx, "als", n=3, k_score=0
411 |             ),
412 |         }
413 | 
414 |     @staticmethod
415 |     def _get_top_recommendations(row, k=100):
416 |         k = min(len(row), k)
417 |         ind = np.argpartition(row, -k)[-k:]
418 |         top_k_ind = ind[np.argsort(row[ind])][::-1]
419 |         return top_k_ind, row[top_k_ind]
420 | 
421 |     def _get_faiss_scores(self, product_ind_csr):
422 |         x_dense = product_ind_csr * self.trunk_svd_arr
423 |         faiss_result = self.faiss_index.search(x_dense, self.faiss_neighbors)
424 |         neighbors = faiss_result[1]
425 |         scores = np.asarray(faiss_result[0] * self.X_scores[neighbors[0]]).flatten()
426 |         return scores, faiss_result
427 | 
428 |     def get_faiss_features(self, product_ind_csr, selected_items):
429 |         scores, faiss_result = self._get_faiss_scores(product_ind_csr)
430 |         top_k_ind, sorted_predictions = self._get_top_recommendations(
431 |             scores[selected_items], k=10000
432 |         )
433 | 
434 |         top_neighbor = faiss_result[0][0][0]
435 |         faiss_neighbor_mean = faiss_result[0].mean()
436 |         non_zero_idx = scores.nonzero()[0]
437 |         non_zero_scores = len(non_zero_idx)
438 |         sum_scores = scores.sum()
439 |         features = defaultdict(dict)
440 | 
441 |         pos = 0
442 |         prev_score = -10
443 |         for idx, score in zip(top_k_ind, sorted_predictions):
444 |             if score == 0:
445 |                 pos = self.pos_0_score
446 | 
447 |             pidx = selected_items[idx]
448 |             features[pidx]["faiss_score"] = score
449 |             features[pidx]["faiss_pos"] = pos
450 |             features[pidx]["faiss_neighbor_top_score"] = top_neighbor
451 |             features[pidx]["faiss_neighbor_mean_score"] = faiss_neighbor_mean
452 |             features[pidx]["faiss_num_non_zero_scores"] = non_zero_scores
453 |             features[pidx]["faiss_scores_sum"] = sum_scores
454 | 
455 |             if prev_score != score:
456 |                 pos += 1
457 |                 prev_score = score
458 |         return features
459 | 
460 |     def get_umap_scores(self, product_ind_csr, selected_items):
461 |         product_emb = (product_ind_csr * self.umap_item_emb)[0]
462 | 
463 |         features = defaultdict(dict)
464 |         for pidx in selected_items:
465 |             features[pidx]["umap_user_emb_0"] = product_emb[0]
466 |             features[pidx]["umap_user_emb_1"] = product_emb[1]
467 | 
468 |             item_emb = self.umap_item_emb[pidx]
469 |             features[pidx]["umap_item_emb_0"] = item_emb[0]
470 |             features[pidx]["umap_item_emb_1"] = item_emb[1]
471 | 
472 |             user_item_emb = np.mean([product_emb, item_emb], axis=0)
473 |             features[pidx]["umap_user_item_emb_0"] = user_item_emb[0]
474 |             features[pidx]["umap_user_item_emb_1"] = user_item_emb[1]
475 | 
476 |         return features
477 | 
478 |     @staticmethod
479 |     def _implicit_rank(model, product_ind_csr, selected_items):
480 |         predictions = model.rank_items(
481 |             0, product_ind_csr, selected_items, recalculate_user=True
482 |         )
483 |         return predictions
484 | 
485 |     @staticmethod
486 |     def _custom_implicit_rank(model, product_ind_csr, selected_items):
487 |         recommendations = product_ind_csr.dot(model.similarity)
488 |         predictions = sorted(
489 |             zip(selected_items, recommendations[0, selected_items].A[0]),
490 |             key=lambda x: -x[1],
491 |         )
492 |         return predictions
493 | 
494 |     def get_implicit_scores(
495 |         self, model, product_ind_csr, selected_items, model_prefix="tfidf"
496 |     ):
model_prefix in ("tfidf", "cosine2", "cosine50"): 498 | sorted_predictions = self._custom_implicit_rank( 499 | model, product_ind_csr, selected_items 500 | ) 501 | else: 502 | sorted_predictions = self._implicit_rank( 503 | model, product_ind_csr, selected_items 504 | ) 505 | 506 | features = defaultdict(dict) 507 | pos = 0 508 | prev_score = -10 509 | for pidx, score in sorted_predictions: 510 | if score == 0: 511 | pos = self.pos_0_score 512 | 513 | features[pidx][f"{model_prefix}_score"] = score 514 | features[pidx][f"{model_prefix}_pos"] = pos 515 | 516 | if prev_score != score: 517 | pos += 1 518 | prev_score = score 519 | return features 520 | 521 | def get_num_days_from_last_transaction(self, js, last_transaction_date): 522 | if last_transaction_date is None: 523 | return -1 524 | 525 | try: 526 | query_date = self.to_date(js.get("query_time")) 527 | num_days_from_last_transaction = (query_date - last_transaction_date).days 528 | 529 | except TypeError: 530 | target_date = self.to_date(js["target"][0]["datetime"]) 531 | num_days_from_last_transaction = (target_date - last_transaction_date).days 532 | num_days_from_last_transaction = random.randint( 533 | 0, num_days_from_last_transaction 534 | ) 535 | 536 | return num_days_from_last_transaction 537 | 538 | def get_co_occurrence_features( 539 | self, 540 | cooc_scores, 541 | weights=None, 542 | prefix="co_occurrence", 543 | default_value=-1, 544 | aggs=("max", "mean", "sum"), 545 | ): 546 | keys = [f"{prefix}_{agg_name}" for agg_name in aggs] 547 | 548 | if weights is not None: 549 | keys += [f"{key}_weighted" for key in keys] 550 | 551 | if cooc_scores.size == 0: 552 | if prefix not in self.cooccur_default_values: 553 | self.cooccur_default_values[prefix] = dict( 554 | zip(keys, it.repeat(default_value)) 555 | ) 556 | return it.repeat( 557 | self.cooccur_default_values[prefix], 1000 558 | ) # to prevent endless iterations 559 | 560 | scores = [] 561 | keys = [] 562 | for agg_name in aggs: 563 | agg_func = getattr(np, agg_name) 564 | scores.append(agg_func(cooc_scores, axis=1)) 565 | keys.append(f"{prefix}_{agg_name}") 566 | 567 | if weights is not None: 568 | cooc_scores_w = cooc_scores * weights 569 | for agg_name in aggs: 570 | agg_func = getattr(np, agg_name) 571 | scores.append(agg_func(cooc_scores_w, axis=1)) 572 | keys.append(f"{prefix}_{agg_name}_weighted") 573 | 574 | features = [] 575 | for values in np.vstack(scores).T: 576 | features.append(dict(zip(keys, values))) 577 | return iter(features) 578 | 579 | def get_cooc_features( 580 | self, cooc_purchased_all_scores, selected_items, purchased_items, cooc_weights 581 | ): 582 | cooc_scores = cooc_purchased_all_scores[selected_items].A 583 | 584 | cooc_scores_norm_item = ( 585 | cooc_scores / self.item_occurrence[selected_items][:, None] 586 | ) 587 | cooc_scores_norm_co_item = cooc_scores / self.item_occurrence[purchased_items] 588 | 589 | cooc_norm_item_features = self.get_co_occurrence_features( 590 | cooc_scores_norm_item, cooc_weights, "co_occurrence_item_norm" 591 | ) 592 | cooc_scores_norm_co_item_features = self.get_co_occurrence_features( 593 | cooc_scores_norm_co_item, cooc_weights, "co_occurrence_co_item_norm" 594 | ) 595 | return cooc_norm_item_features, cooc_scores_norm_co_item_features 596 | 597 | def get_gbm_features( 598 | self, js, train=False, drop_null_target_records=False, add_target_records=False 599 | ): 600 | # sort history as in public and check it was unordered 601 | js["transaction_history"] = sorted( 602 | js["transaction_history"], key=lambda x: 
x["datetime"] 603 | ) 604 | 605 | if train: 606 | target_products = set( 607 | self.product_encoder.toIdx( 608 | [pid for pid in js["target"][0]["product_ids"]] 609 | ) 610 | ) 611 | 612 | transaction_history = js.get("transaction_history", []) 613 | if transaction_history: 614 | last_transaction_date = self.to_date( 615 | transaction_history[-1].get("datetime") 616 | ) 617 | # num days from 2000/1/1 618 | last_transaction_timestamp = ( 619 | last_transaction_date.toordinal() - self.datetime_stamp_2000 620 | ) 621 | else: 622 | last_transaction_date = None 623 | last_transaction_timestamp = None 624 | num_days_from_last_transaction = self.get_num_days_from_last_transaction( 625 | js, last_transaction_date 626 | ) 627 | 628 | product_ind_csr_not_normed = make_coo_row( 629 | transaction_history, self.product_encoder, normalize=False 630 | ).tocsr() 631 | product_ind_csr = make_coo_row( 632 | js.get("transaction_history", []), self.product_encoder 633 | ).tocsr() 634 | 635 | selected_items = list( 636 | self.get_items_pool(product_ind_csr, product_ind_csr_not_normed) 637 | ) 638 | 639 | cosine50_scores = self.get_implicit_scores( 640 | self.model, product_ind_csr, selected_items, model_prefix="cosine50" 641 | ) 642 | tf_idf_scores = self.get_implicit_scores( 643 | self.implicit_tfidf, product_ind_csr, selected_items, model_prefix="tfidf" 644 | ) 645 | cosine2_scores = self.get_implicit_scores( 646 | self.implicit_cosine2, 647 | product_ind_csr, 648 | selected_items, 649 | model_prefix="cosine2", 650 | ) 651 | als_scores = self.get_implicit_scores( 652 | self.implicit_als, 653 | product_ind_csr_not_normed, 654 | selected_items, 655 | model_prefix="als", 656 | ) 657 | 658 | # co occurrence features 659 | # all user items purchases 660 | purchased_items = product_ind_csr.indices 661 | cooc_weights = product_ind_csr.data 662 | cooc_purchased_all_scores = self.item_co_occurrence[:, purchased_items] 663 | 664 | cooc_norm_item_features, cooc_scores_norm_co_item_features = self.get_cooc_features( 665 | cooc_purchased_all_scores, selected_items, purchased_items, cooc_weights 666 | ) 667 | 668 | # scores per each transaction 669 | cooc_purchased_scores = cooc_purchased_all_scores[purchased_items].A 670 | cooc_purchased_scores_norm_co_item = ( 671 | cooc_purchased_scores / self.item_occurrence[purchased_items] 672 | ) 673 | purchased_item2pos = {pid: pos for pos, pid in enumerate(purchased_items)} 674 | 675 | product_history_features = self.get_product_feat_from_history( 676 | js.get("transaction_history", []), 677 | cooc_purchased_scores_norm_co_item, 678 | purchased_item2pos, 679 | ) 680 | 681 | high_level_features = self.get_highlevel_feat_from_history( 682 | js.get("transaction_history", []) 683 | ) 684 | 685 | # faiss_features = self.get_faiss_features(product_ind_csr, selected_items) 686 | umap_scores = self.get_umap_scores(product_ind_csr, selected_items) 687 | 688 | gbm_records = [] 689 | for product_idx in selected_items: 690 | record = dict( 691 | **{ 692 | "idx": product_idx, 693 | "age": js["age"], 694 | "gender": self.feature_extractor["gender"][js["gender"]], 695 | "num_transactions": len(js.get("transaction_history", [])), 696 | "popularity_position": self.pos_in_top[product_idx], 697 | "last_transaction_timestamp": last_transaction_timestamp, 698 | "num_days_from_last_transaction": num_days_from_last_transaction, 699 | }, 700 | **high_level_features, 701 | **product_history_features.get( 702 | product_idx, self.get_default_history_feat(product_idx) 703 | ), 704 | 
704 |                 **self.product_features.product_features(
705 |                     self.product_encoder.toPid(int(product_idx))
706 |                 ),
707 |                 **self.get_implicit_features(product_idx),
708 |                 **als_scores[product_idx],
709 |                 **tf_idf_scores[product_idx],
710 |                 **cosine50_scores[product_idx],
711 |                 **cosine2_scores[product_idx],
712 |                 **next(cooc_norm_item_features),
713 |                 **next(cooc_scores_norm_co_item_features),
714 |                 # **faiss_features[product_idx],
715 |                 **umap_scores[product_idx],
716 |             )
717 | 
718 |             record["item_pct_spent"] = record.get("item_spent", 0) / max(
719 |                 record.get("purchase_sum", 1), 1
720 |             )
721 | 
722 |             if train:
723 |                 record["target"] = int(product_idx in target_products)
724 |                 record["client_id"] = js["client_id"]
725 | 
726 |             gbm_records.append(record)
727 | 
728 |         if train:
729 |             gt_products = dict(
730 |                 client_id=js["client_id"], products=list(target_products)
731 |             )
732 |             return gbm_records, gt_products
733 | 
734 |         return gbm_records
735 | 
736 | 
737 | class GBMPredictor(GBMFeatures):
738 |     def __init__(
739 |         self,
740 |         product_csv_path,
741 |         *args,
742 |         lgbm_model_path=None,
743 |         cat_model_path=None,
744 |         **kwargs,
745 |     ):
746 |         super(GBMPredictor, self).__init__(product_csv_path, *args, **kwargs)
747 |         self.product_encoder = ProductEncoder(product_csv_path)
748 | 
749 |         if lgbm_model_path:
750 |             self.lgb_model = lgb.Booster(model_file=lgbm_model_path)
751 | 
752 |         if cat_model_path:
753 |             self.cat_model = CatBoost().load_model(cat_model_path)
754 | 
755 |     @staticmethod
756 |     def predict_proba(X, model):
757 |         pred = model.predict(X)
758 |         return pred
759 | 
760 |     def sort_predictions(self, product_idx, gbm_pred, n=30):
761 |         product_idx_sorted, _ = zip(
762 |             *sorted(zip(product_idx, gbm_pred), key=lambda x: -x[1])
763 |         )
764 |         product_ids = self.product_encoder.toPid(
765 |             [idx for idx in product_idx_sorted[:n]]
766 |         )
767 |         return product_ids
768 | 
769 |     # TODO: add different blending weights
770 |     def predict(self, js, models):
771 |         X = self.get_gbm_features(js)
772 |         feature_values = [list(item.values()) for item in X]
773 |         product_idx = [item["idx"] for item in X]
774 |         if isinstance(models, (list, tuple)):
775 |             gbm_pred = np.zeros_like(product_idx, dtype=float)
776 |             for model in models:
777 |                 gbm_pred += 0.5 * self.predict_proba(feature_values, model)  # 0.5 weight assumes a blend of exactly two models
778 |         else:
779 |             gbm_pred = self.predict_proba(feature_values, models)
780 | 
781 |         return self.sort_predictions(product_idx, gbm_pred)
782 | 
--------------------------------------------------------------------------------