├── .gitignore ├── LICENSE ├── README.md ├── examples ├── baseline │ └── ebnerd_feat_baselines.py ├── beyond_accuracy │ └── make_beyond_accuracy.ipynb ├── datasets │ ├── ebnerd_descriptive_analysis.ipynb │ ├── ebnerd_overview.ipynb │ └── plot │ │ ├── article_read_time.png │ │ ├── body_len.png │ │ ├── category_distribution.png │ │ ├── category_distribution_ba.png │ │ ├── front_article_page.png │ │ ├── front_read_time.png │ │ ├── inview_len.png │ │ ├── subtitle_len.png │ │ └── title_len.png ├── quick_start │ ├── lstur_dummy.py │ ├── make_embedding_artifacts.ipynb │ ├── naml_dummy.py │ ├── npa_dummy.py │ ├── nrms_docvec_dummy.py │ ├── nrms_dummy.py │ ├── nrms_ebnerd.ipynb │ └── nrms_ebnerd.py └── reproducibility_scripts │ ├── args_nrms.py │ ├── args_nrms_docvec.py │ ├── ebnerd_nrms.py │ ├── ebnerd_nrms_doc_hist.py │ └── ebnerd_nrms_docvec.py ├── pyproject.toml ├── src ├── __init__.py └── ebrec │ ├── evaluation │ ├── __init__.py │ ├── _ba_test.py │ ├── beyond_accuracy.py │ ├── metrics │ │ ├── __init__.py │ │ ├── _beyond_accuracy.py │ │ ├── _classification.py │ │ ├── _ranking.py │ │ └── _sklearn.py │ ├── metrics_protocols.py │ ├── protocols.py │ └── utils.py │ ├── models │ ├── fastformer │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── fastformer.py │ │ └── fastformer_wu.py │ └── newsrec │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── dataloader.py │ │ ├── layers.py │ │ ├── lstur.py │ │ ├── model_config.py │ │ ├── naml.py │ │ ├── npa.py │ │ ├── nrms.py │ │ ├── nrms_docvec.py │ │ └── utils.py │ └── utils │ ├── __init__.py │ ├── _articles.py │ ├── _articles_behaviors.py │ ├── _behaviors.py │ ├── _constants.py │ ├── _decay.py │ ├── _descriptive_analysis.py │ ├── _nlp.py │ ├── _polars.py │ ├── _python.py │ └── _torch.py └── test ├── bombing └── bomb_dataloader.py ├── data └── ebnerd │ ├── articles.parquet │ ├── behaviors.parquet │ ├── document_vector.parquet │ └── history.parquet ├── dataloader ├── test_fastformer.py └── test_newsrec.py └── evaluation └── test_beyond_accuracy.py /.gitignore: -------------------------------------------------------------------------------- 1 | share/python-wheels/ 2 | pip-wheel-metadata/ 3 | .ipynb_checkpoints/ 4 | .installed.cfg 5 | develop-eggs/ 6 | __pycache__/ 7 | *.egg-info/ 8 | downloads/ 9 | .DS_Store 10 | .Python 11 | wheels/ 12 | .vscode 13 | mlruns 14 | build/ 15 | .eggs/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | dist/ 20 | eggs/ 21 | lib/ 22 | var/ 23 | *.egg 24 | build 25 | .venv 26 | venv 27 | 28 | # just for now: 29 | evaluate_predictions.py 30 | ebnerd_predictions/ 31 | downloads.py 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ekstra Bladet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 |

3 | 4 |

5 | 6 | # Introduction 7 | Hello there 👋🏽 8 | 9 | We recommend checking the repository frequently, as we are updating and documenting it along the way! 10 | 11 | ## EBNeRD 12 | Ekstra Bladet Recommender System repository, created for the RecSys'24 Challenge. 13 | 14 | # Getting Started 15 | We recommend [conda](https://docs.conda.io/projects/conda/en/latest/glossary.html#conda-environment) for environment management, and [VS Code](https://code.visualstudio.com/) for development. To install the necessary packages and run the example notebook: 16 | 17 | ``` 18 | # 1. Create and activate a new conda environment 19 | conda create -n <environment_name> python=3.11 20 | conda activate <environment_name> 21 | 22 | # 2. Clone this repo within VS Code or using the command line: 23 | git clone https://github.com/ebanalyse/ebnerd-benchmark.git 24 | 25 | # 3. Install the core ebrec package into the environment: 26 | pip install . 27 | ``` 28 | 29 | We have experienced issues installing *tensorflow* on M1 MacBooks (```sys_platform == 'darwin'```) when using conda. To avoid this, we suggest using venv when running on MacBooks: 30 | ``` 31 | python3 -m venv .venv 32 | source .venv/bin/activate 33 | ``` 34 | 35 | Alternatively, installing the conda environment as ```.venv``` in the project folder: 36 | ``` 37 | conda create -p .venv python==3.11.8 38 | conda activate ./.venv 39 | ``` 40 | 41 | ## Running on GPU 42 | ``` 43 | tensorflow-gpu; sys_platform == 'linux' 44 | tensorflow-macos; sys_platform == 'darwin' 45 | ``` 46 | 47 | # Algorithms 48 | To get started quickly, we have implemented several news recommender systems, specifically: 49 | [Neural News Recommendation with Long- and Short-term User Representations](https://aclanthology.org/P19-1033/) (LSTUR), 50 | [Neural News Recommendation with Personalized Attention](https://arxiv.org/abs/1907.05559) (NPA), 51 | [Neural News Recommendation with Attentive Multi-View Learning](https://arxiv.org/abs/1907.05576) (NAML), and 52 | [Neural News Recommendation with Multi-Head Self-Attention](https://aclanthology.org/D19-1671/) (NRMS). 53 | The source code originates from the brilliant recommender-systems repository [recommenders](https://github.com/recommenders-team/recommenders); we have simply stripped it of all non-model-related code. 54 | 55 | 56 | # Notebooks 57 | To help you get started, we have created a few simple notebooks. We plan to add more at a later stage, such as reproducible model trainings. 58 | The notebooks were made on macOS, and you might need to make small modifications to run them on your system. 59 | 60 | ## Model training 61 | We have created a [notebook](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/examples/00_quick_start/nrms_ebnerd.ipynb) where we train NRMS on EB-NeRD - this is a very simple version using the demo dataset. 62 | 63 | ## Data manipulation and enrichment 64 | In the [dataset_ebnerd](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/examples/00_quick_start/dataset_ebnerd.ipynb) demo, we show how one can join user histories onto the behavior logs and create binary labels; the sketch below shows the same steps in plain Python.
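If you prefer a quick script over the notebook, the snippet below is a minimal sketch of those two steps using the repository's own utilities (`truncate_history`, `slice_join_dataframes`, `create_binary_labels_column`). The data path, the `ebnerd_demo` split name, and the history size of 30 are placeholder assumptions - point them at your local copy of the dataset.

```
from pathlib import Path
import polars as pl

from ebrec.utils._constants import DEFAULT_HISTORY_ARTICLE_ID_COL, DEFAULT_USER_COL
from ebrec.utils._behaviors import create_binary_labels_column, truncate_history
from ebrec.utils._polars import slice_join_dataframes

# Placeholder location/split - adjust to wherever you unpacked EB-NeRD:
PATH = Path("~/ebnerd_data/ebnerd_demo").expanduser()

# Load the users' click histories and truncate them to a fixed length:
df_history = (
    pl.scan_parquet(PATH.joinpath("train", "history.parquet"))
    .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
    .pipe(
        truncate_history,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=30,
        padding_value=0,
        enable_warning=False,
    )
)

# Join the histories onto the impression logs and add the binary labels column
# (1 for clicked in-view articles, 0 for the rest):
df_behaviors = (
    pl.scan_parquet(PATH.joinpath("train", "behaviors.parquet"))
    .collect()
    .pipe(
        slice_join_dataframes,
        df2=df_history.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )
    .pipe(create_binary_labels_column)
)
print(df_behaviors.head())
```

The history-joining part is the same pattern wrapped up in `ebnerd_from_path`, which the quick-start and reproducibility scripts use before adding labels.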
65 | 66 | # Reproduce EB-NeRD Experiments 67 | 68 | Activate your environment: 69 | ``` 70 | conda activate <environment_name> 71 | ``` 72 | 73 | ### [NRMSModel](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/src/ebrec/models/newsrec/nrms.py) 74 | 75 | ``` 76 | python examples/reproducibility_scripts/ebnerd_nrms.py \ 77 | --datasplit ebnerd_small \ 78 | --epochs 5 \ 79 | --bs_train 32 \ 80 | --bs_test 32 \ 81 | --history_size 20 \ 82 | --npratio 4 \ 83 | --transformer_model_name FacebookAI/xlm-roberta-large \ 84 | --max_title_length 30 \ 85 | --head_num 20 \ 86 | --head_dim 20 \ 87 | --attention_hidden_dim 200 \ 88 | --learning_rate 1e-4 \ 89 | --dropout 0.20 90 | ``` 91 | 92 | Launch TensorBoard: 93 | ``` 94 | tensorboard --logdir=ebnerd_predictions/runs 95 | ``` 96 | 97 | ### [NRMSDocVec](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/src/ebrec/models/newsrec/nrms_docvec.py) 98 | 99 | ``` 100 | python examples/reproducibility_scripts/ebnerd_nrms_docvec.py \ 101 | --datasplit ebnerd_small \ 102 | --epochs 5 \ 103 | --bs_train 32 \ 104 | --history_size 20 \ 105 | --npratio 4 \ 106 | --document_embeddings Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet \ 107 | --head_num 16 \ 108 | --head_dim 16 \ 109 | --attention_hidden_dim 200 \ 110 | --newsencoder_units_per_layer 512 512 512 \ 111 | --learning_rate 1e-4 \ 112 | --dropout 0.2 \ 113 | --newsencoder_l2_regularization 1e-4 114 | ``` 115 | 116 | Launch TensorBoard: 117 | ``` 118 | tensorboard --logdir=ebnerd_predictions/runs 119 | ``` 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /examples/baseline/ebnerd_feat_baselines.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from tqdm import tqdm 3 | import polars as pl 4 | 5 | from ebrec.utils._python import ( 6 | rank_predictions_by_score, 7 | write_submission_file, 8 | create_lookup_dict, 9 | ) 10 | from ebrec.utils._constants import * 11 | 12 | PATH = Path("~/ebnerd_data/ebnerd_testset") 13 | 14 | df_behaviors = pl.scan_parquet(PATH.joinpath("test", "behaviors.parquet")) 15 | df_articles = pl.scan_parquet(PATH.joinpath("articles.parquet")) 16 | 17 | # ==== LOOKUP DICTS 18 | clicked_dict = create_lookup_dict( 19 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_PAGEVIEWS_COL).collect(), 20 | DEFAULT_ARTICLE_ID_COL, 21 | DEFAULT_TOTAL_PAGEVIEWS_COL, 22 | ) 23 | inview_dict = create_lookup_dict( 24 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_INVIEWS_COL).collect(), 25 | DEFAULT_ARTICLE_ID_COL, 26 | DEFAULT_TOTAL_INVIEWS_COL, 27 | ) 28 | readtime_dict = create_lookup_dict( 29 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_READ_TIME_COL).collect(), 30 | DEFAULT_ARTICLE_ID_COL, 31 | DEFAULT_TOTAL_READ_TIME_COL, 32 | ) 33 | 34 | # Estimate inview counts directly from the test impressions: 35 | df_inview_estimate = ( 36 | df_behaviors.select(DEFAULT_INVIEW_ARTICLES_COL) 37 | .explode(DEFAULT_INVIEW_ARTICLES_COL) 38 | .select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).value_counts()) 39 | .unnest(DEFAULT_INVIEW_ARTICLES_COL) 40 | .collect() 41 | ) 42 | inview_dict_estimate = create_lookup_dict( 43 | df_inview_estimate.select(DEFAULT_INVIEW_ARTICLES_COL, "count"), 44 | DEFAULT_INVIEW_ARTICLES_COL, 45 | "count", 46 | ) 47 | 48 | # ==== CLICKED PREDICTIONS 49 | CLICKED_SCORE_COL = "clicked_prediction_scores" 50 | INVIEW_SCORE_COL = "inview_prediction_scores" 51 | INVIEW_ESTIMATE_SCORE_COL = "inview_estimate_prediction_scores" 52 | READTIME_SCORE_COL = "readtime_prediction_scores" 53 | 54 | df_predictions
= ( 55 | df_behaviors.select(DEFAULT_IMPRESSION_ID_COL, DEFAULT_INVIEW_ARTICLES_COL) 56 | .with_columns( 57 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 58 | .list.eval(pl.element().replace(clicked_dict).fill_null(0)) 59 | .alias(CLICKED_SCORE_COL) 60 | ) 61 | .with_columns( 62 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 63 | .list.eval(pl.element().replace(inview_dict).fill_null(0)) 64 | .alias(INVIEW_SCORE_COL) 65 | ) 66 | .with_columns( 67 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 68 | .list.eval(pl.element().replace(inview_dict_estimate).fill_null(0)) 69 | .alias(INVIEW_ESTIMATE_SCORE_COL) 70 | ) 71 | .with_columns( 72 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 73 | .list.eval(pl.element().replace(readtime_dict).fill_null(0)) 74 | .alias(READTIME_SCORE_COL) 75 | ) 76 | .collect() 77 | ) 78 | 79 | # CONVERT TO RANKS: 80 | impression_id = [] 81 | clicked_scores = [] 82 | inview_scores = [] 83 | inview_estimate_scores = [] 84 | readtime_scores = [] 85 | for row in tqdm( 86 | df_predictions.iter_rows(named=True), 87 | total=df_predictions.shape[0], 88 | ncols=80, 89 | ): 90 | impression_id.append(row[DEFAULT_IMPRESSION_ID_COL]) 91 | clicked_scores.append(rank_predictions_by_score(row[CLICKED_SCORE_COL])) 92 | inview_scores.append(rank_predictions_by_score(row[INVIEW_SCORE_COL])) 93 | inview_estimate_scores.append( 94 | rank_predictions_by_score(row[INVIEW_ESTIMATE_SCORE_COL]) 95 | ) 96 | readtime_scores.append(rank_predictions_by_score(row[READTIME_SCORE_COL])) 97 | 98 | # 99 | for col, scores in zip( 100 | [ 101 | CLICKED_SCORE_COL, 102 | INVIEW_SCORE_COL, 103 | INVIEW_ESTIMATE_SCORE_COL, 104 | READTIME_SCORE_COL, 105 | ], 106 | [clicked_scores, inview_scores, inview_estimate_scores, readtime_scores], 107 | ): 108 | print("Writing submission file for:", col) 109 | Path("downloads").mkdir(exist_ok=True) 110 | write_submission_file( 111 | impression_ids=impression_id, 112 | prediction_scores=scores, 113 | path="downloads/predictions.txt", 114 | filename_zip=f"{col}.zip", 115 | ) 116 | -------------------------------------------------------------------------------- /examples/datasets/plot/article_read_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/article_read_time.png -------------------------------------------------------------------------------- /examples/datasets/plot/body_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/body_len.png -------------------------------------------------------------------------------- /examples/datasets/plot/category_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/category_distribution.png -------------------------------------------------------------------------------- /examples/datasets/plot/category_distribution_ba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/category_distribution_ba.png -------------------------------------------------------------------------------- /examples/datasets/plot/front_article_page.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/front_article_page.png -------------------------------------------------------------------------------- /examples/datasets/plot/front_read_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/front_read_time.png -------------------------------------------------------------------------------- /examples/datasets/plot/inview_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/inview_len.png -------------------------------------------------------------------------------- /examples/datasets/plot/subtitle_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/subtitle_len.png -------------------------------------------------------------------------------- /examples/datasets/plot/title_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/title_len.png -------------------------------------------------------------------------------- /examples/quick_start/lstur_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_lstur 3 | from ebrec.models.newsrec.lstur import LSTURModel 4 | import numpy as np 5 | 6 | config = hparams_lstur 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 300 10 | HISTORY_SIZE = config.history_size 11 | TITLE_SIZE = config.title_size 12 | NPRATIO = 4 13 | word_embeddings = np.random.rand(1000, 100) 14 | 15 | # Define the shapes of the input data 16 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE) 17 | pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE) 18 | vocab_size = word_embeddings.shape[0] 19 | n_users = config.n_users 20 | label_shape = (NPRATIO + 1,) 21 | user_indexes_shape = (1,) 22 | 23 | model = LSTURModel(hparams=config, word2vec_embedding=word_embeddings) 24 | model.model.summary() 25 | 26 | # Generate some random input data for input_1 with values between 0 and 1 27 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape)) 28 | # Generate some random input data for input_2 with values between 0 and 1 29 | pred_input_title = np.random.randint( 30 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape) 31 | ) 32 | # Input data for user_indexes 33 | user_indexes = np.random.randint(0, n_users, size=(BATCH_SIZE, *user_indexes_shape)) 34 | 35 | # Generate some random label data with values between 0 and 1 36 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 37 | for row in label_data: 38 | row[np.random.choice(label_shape[0])] = 1 39 | 40 | # Print the shapes of the input data to verify they match the model's input layers 41 | print(his_input_title.shape) 42 | print(pred_input_title.shape) 43 | print(user_indexes.shape) 44 | 
print(label_data.shape) 45 | 46 | # Make input for model: 47 | input = (user_indexes, his_input_title, pred_input_title) 48 | 49 | # fit/predict: 50 | model.model.fit(input, label_data) 51 | model.model.predict(input) 52 | -------------------------------------------------------------------------------- /examples/quick_start/naml_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_naml 3 | from ebrec.models.newsrec.naml import NAMLModel 4 | import numpy as np 5 | 6 | config = hparams_naml 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 300 10 | NPRATIO = 4 11 | HISTORY_SIZE = config.history_size 12 | TITLE_SIZE = config.title_size 13 | BODY_SIZE = config.body_size 14 | 15 | label_shape = (NPRATIO + 1,) 16 | word_embeddings = np.random.rand(1000, 100) 17 | 18 | vocab_size = word_embeddings.shape[0] 19 | n_verts = config.vert_num 20 | n_subverts = config.subvert_num 21 | 22 | # Model 23 | model = NAMLModel(hparams=config, word2vec_embedding=word_embeddings) 24 | model.model.summary() 25 | 26 | # Define the shapes of the input data 27 | his_input_title = np.random.randint( 28 | 0, vocab_size, size=(BATCH_SIZE, HISTORY_SIZE, TITLE_SIZE) 29 | ) 30 | his_input_body = np.random.randint( 31 | 0, vocab_size, size=(BATCH_SIZE, HISTORY_SIZE, BODY_SIZE) 32 | ) 33 | his_input_vert = np.random.randint(0, n_verts, size=(BATCH_SIZE, HISTORY_SIZE, 1)) 34 | his_input_subvert = np.random.randint(0, n_subverts, size=(BATCH_SIZE, HISTORY_SIZE, 1)) 35 | pred_input_title = np.random.randint( 36 | 0, vocab_size, size=(BATCH_SIZE, NPRATIO + 1, TITLE_SIZE) 37 | ) 38 | pred_input_body = np.random.randint( 39 | 0, vocab_size, size=(BATCH_SIZE, NPRATIO + 1, BODY_SIZE) 40 | ) 41 | pred_input_vert = np.random.randint(0, n_verts, size=(BATCH_SIZE, NPRATIO + 1, 1)) 42 | pred_input_subvert = np.random.randint(0, n_subverts, size=(BATCH_SIZE, NPRATIO + 1, 1)) 43 | 44 | # Generate some random label data with values between 0 and 1 45 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 46 | for row in label_data: 47 | row[np.random.choice(label_shape[0])] = 1 48 | 49 | # 50 | his_input_title.shape 51 | his_input_body.shape 52 | his_input_vert.shape 53 | his_input_subvert.shape 54 | pred_input_title.shape 55 | pred_input_body.shape 56 | pred_input_vert.shape 57 | pred_input_subvert.shape 58 | label_data.shape 59 | 60 | # Make input for model: 61 | input = ( 62 | his_input_title, 63 | his_input_body, 64 | his_input_vert, 65 | his_input_subvert, 66 | pred_input_title, 67 | pred_input_body, 68 | pred_input_vert, 69 | pred_input_subvert, 70 | ) 71 | 72 | # fit/predict: 73 | model.model.fit(input, label_data) 74 | model.model.predict(input) 75 | -------------------------------------------------------------------------------- /examples/quick_start/npa_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_npa 3 | from ebrec.models.newsrec.npa import NPAModel 4 | import numpy as np 5 | 6 | config = hparams_npa 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 300 10 | HISTORY_SIZE = config.history_size 11 | TITLE_SIZE = config.title_size 12 | NPRATIO = 4 13 | word_embeddings = np.random.rand(1000, 100) 14 | 15 | # Define the shapes of the input data 16 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE) 17 | 
pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE) 18 | vocab_size = word_embeddings.shape[0] 19 | n_users = config.n_users 20 | label_shape = (NPRATIO + 1,) 21 | user_indexes_shape = (1,) 22 | 23 | model = NPAModel(hparams=config) 24 | model.model.summary() 25 | 26 | # Generate some random input data for input_1 with values between 0 and 1 27 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape)) 28 | # Generate some random input data for input_2 with values between 0 and 1 29 | pred_input_title = np.random.randint( 30 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape) 31 | ) 32 | # Input data for user_indexes 33 | user_indexes = np.random.randint(0, n_users, size=(BATCH_SIZE, *user_indexes_shape)) 34 | 35 | # Generate some random label data with values between 0 and 1 36 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 37 | for row in label_data: 38 | row[np.random.choice(label_shape[0])] = 1 39 | 40 | # Print the shapes of the input data to verify they match the model's input layers 41 | print(his_input_title.shape) 42 | print(pred_input_title.shape) 43 | print(user_indexes.shape) 44 | print(label_data.shape) 45 | 46 | # Make input for model: 47 | input = (user_indexes, his_input_title, pred_input_title) 48 | 49 | # fit/predict: 50 | model.model.fit(input, label_data) 51 | model.model.predict(input) 52 | -------------------------------------------------------------------------------- /examples/quick_start/nrms_docvec_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.nrms_docvec import NRMSDocVec 3 | from ebrec.models.newsrec.model_config import hparams_nrms 4 | import numpy as np 5 | 6 | DOCVEC_DIM = 300 7 | BATCH_SIZE = 10 8 | HISTORY_SIZE = 20 9 | NPRATIO = 4 10 | 11 | # 12 | config = hparams_nrms 13 | config.history_size = HISTORY_SIZE 14 | config.title_size = DOCVEC_DIM 15 | 16 | # MODEL: 17 | model = NRMSDocVec(hparams=config, newsencoder_units_per_layer=[512, 512]) 18 | model.model.summary() 19 | 20 | # 21 | his_input_title_shape = (HISTORY_SIZE, DOCVEC_DIM) 22 | pred_input_title_shape = (NPRATIO + 1, DOCVEC_DIM) 23 | label_shape = (NPRATIO + 1,) 24 | 25 | # Generate some random input data for input_1 26 | his_input_title = np.array( 27 | [np.random.rand(*his_input_title_shape) for _ in range(BATCH_SIZE)] 28 | ) 29 | # Generate some random input data for input_2 30 | pred_input_title = np.array( 31 | [np.random.rand(*pred_input_title_shape) for _ in range(BATCH_SIZE)] 32 | ) 33 | # Generate some random label data with values between 0 and 1 34 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 35 | for row in label_data: 36 | row[np.random.choice(label_shape[0])] = 1 37 | 38 | # Print the shapes of the input data to verify they match the model's input layers 39 | print(his_input_title.shape) 40 | print(pred_input_title.shape) 41 | print(label_data.shape) 42 | 43 | # Make input for model: 44 | input = (his_input_title, pred_input_title) 45 | 46 | # fit/predict: 47 | model.model.fit(input, label_data, epochs=10) 48 | model.model.predict(input) 49 | -------------------------------------------------------------------------------- /examples/quick_start/nrms_dummy.py: -------------------------------------------------------------------------------- 1 | # TODO make a notebook with it 2 | from ebrec.models.newsrec.model_config import hparams_nrms 3 | from ebrec.models.newsrec.nrms import NRMSModel 4 | import numpy as np 
5 | 6 | config = hparams_nrms 7 | 8 | # Define the number of samples in your batch 9 | BATCH_SIZE = 10 10 | HISTORY_SIZE = config.history_size 11 | TITLE_SIZE = config.title_size 12 | NPRATIO = 4 13 | word_embeddings = np.random.rand(1000, 100) 14 | 15 | model = NRMSModel(hparams=config, word2vec_embedding=word_embeddings) 16 | model.model.summary() 17 | 18 | # Define the shapes of the input data 19 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE) 20 | pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE) 21 | label_shape = (NPRATIO + 1,) 22 | vocab_size = word_embeddings.shape[0] 23 | 24 | # Generate some random input data for input_1 with values between 0 and 1 25 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape)) 26 | 27 | # Generate some random input data for input_2 with values between 0 and 1 28 | pred_input_title = np.random.randint( 29 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape) 30 | ) 31 | 32 | # Generate some random label data with values between 0 and 1 33 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int) 34 | for row in label_data: 35 | row[np.random.choice(label_shape[0])] = 1 36 | 37 | # Print the shapes of the input data to verify they match the model's input layers 38 | print(his_input_title.shape) 39 | print(pred_input_title.shape) 40 | print(label_data.shape) 41 | 42 | # Make input for model: 43 | input = (his_input_title, pred_input_title) 44 | 45 | # fit/predict: 46 | model.model.fit(input, label_data) 47 | model.model.predict(input) 48 | -------------------------------------------------------------------------------- /examples/quick_start/nrms_ebnerd.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.backend import clear_session 2 | from transformers import AutoTokenizer, AutoModel 3 | from pathlib import Path 4 | import tensorflow as tf 5 | import datetime as dt 6 | import polars as pl 7 | import numpy as np 8 | import gc 9 | import os 10 | 11 | from ebrec.utils._constants import ( 12 | DEFAULT_HISTORY_ARTICLE_ID_COL, 13 | DEFAULT_IS_BEYOND_ACCURACY_COL, 14 | DEFAULT_CLICKED_ARTICLES_COL, 15 | DEFAULT_INVIEW_ARTICLES_COL, 16 | DEFAULT_IMPRESSION_ID_COL, 17 | DEFAULT_SUBTITLE_COL, 18 | DEFAULT_LABELS_COL, 19 | DEFAULT_TITLE_COL, 20 | DEFAULT_USER_COL, 21 | ) 22 | 23 | from ebrec.utils._behaviors import ( 24 | create_binary_labels_column, 25 | sampling_strategy_wu2019, 26 | add_known_user_column, 27 | add_prediction_scores, 28 | truncate_history, 29 | ) 30 | from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore 31 | from ebrec.utils._articles import convert_text2encoding_with_transformers 32 | from ebrec.utils._polars import ( 33 | slice_join_dataframes, 34 | concat_str_columns, 35 | chunk_dataframe, 36 | split_df, 37 | ) 38 | from ebrec.utils._articles import create_article_id_to_value_mapping 39 | from ebrec.utils._nlp import get_transformers_word_embeddings 40 | from ebrec.utils._python import write_submission_file, rank_predictions_by_score 41 | 42 | from ebrec.models.newsrec.dataloader import NRMSDataLoader, NRMSDataLoaderPretransform 43 | from ebrec.models.newsrec.model_config import hparams_nrms 44 | from ebrec.models.newsrec import NRMSModel 45 | 46 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 47 | gpus = tf.config.experimental.list_physical_devices("GPU") 48 | for gpu in gpus: 49 | tf.config.experimental.set_memory_growth(gpu, True) 50 | 51 | # conda activate ./venv/ 52 | # python -i 
examples/00_quick_start/nrms_ebnerd.py 53 | 54 | 55 | def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame: 56 | """ 57 | Load ebnerd - function 58 | """ 59 | df_history = ( 60 | pl.scan_parquet(path.joinpath("history.parquet")) 61 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 62 | .pipe( 63 | truncate_history, 64 | column=DEFAULT_HISTORY_ARTICLE_ID_COL, 65 | history_size=history_size, 66 | padding_value=0, 67 | enable_warning=False, 68 | ) 69 | ) 70 | df_behaviors = ( 71 | pl.scan_parquet(path.joinpath("behaviors.parquet")) 72 | .collect() 73 | .pipe( 74 | slice_join_dataframes, 75 | df2=df_history.collect(), 76 | on=DEFAULT_USER_COL, 77 | how="left", 78 | ) 79 | ) 80 | return df_behaviors 81 | 82 | 83 | PATH = Path("~/ebnerd_data").expanduser() 84 | DUMP_DIR = Path("ebnerd_predictions") 85 | DUMP_DIR.mkdir(exist_ok=True, parents=True) 86 | SEED = np.random.randint(0, 1_000) 87 | 88 | MODEL_NAME = f"NRMS-{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}-{SEED}" 89 | # MODEL_NAME = "NRMS-382861963-2024-11-12 01:34:49.050070" 90 | 91 | MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_NAME}/weights") 92 | LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_NAME}") 93 | TEST_DF_DUMP = DUMP_DIR.joinpath("test_predictions", MODEL_NAME) 94 | TEST_DF_DUMP.mkdir(parents=True, exist_ok=True) 95 | 96 | print(f"Dir: {MODEL_NAME}") 97 | 98 | DATASPLIT = "ebnerd_small" 99 | MAX_TITLE_LENGTH = 30 100 | HISTORY_SIZE = 20 101 | FRACTION = 1.0 102 | EPOCHS = 5 103 | FRACTION_TEST = 1.0 104 | # 105 | hparams_nrms.history_size = HISTORY_SIZE 106 | 107 | BATCH_SIZE_TRAIN = 32 108 | BATCH_SIZE_VAL = 32 109 | BATCH_SIZE_TEST_WO_B = 32 110 | BATCH_SIZE_TEST_W_B = 4 111 | N_CHUNKS_TEST = 10 112 | CHUNKS_DONE = 0 113 | 114 | COLUMNS = [ 115 | DEFAULT_USER_COL, 116 | DEFAULT_HISTORY_ARTICLE_ID_COL, 117 | DEFAULT_INVIEW_ARTICLES_COL, 118 | DEFAULT_CLICKED_ARTICLES_COL, 119 | DEFAULT_IMPRESSION_ID_COL, 120 | ] 121 | 122 | df_train = ( 123 | ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE) 124 | .sample(fraction=FRACTION) 125 | .select(COLUMNS) 126 | .pipe( 127 | sampling_strategy_wu2019, 128 | npratio=4, 129 | shuffle=True, 130 | with_replacement=True, 131 | seed=SEED, 132 | ) 133 | .pipe(create_binary_labels_column) 134 | ) 135 | df_train, df_validation = split_df(df_train, fraction=0.9, seed=SEED, shuffle=False) 136 | 137 | # df_test = df_validation 138 | # df_train = df_train[:100] 139 | # df_validation = df_validation[:100] 140 | # df_test = df_test[:100] 141 | df_articles = pl.read_parquet(PATH.joinpath("articles.parquet")) 142 | 143 | # => 144 | TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base" 145 | TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL] 146 | 147 | # LOAD HUGGINGFACE: 148 | transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME) 149 | transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME) 150 | 151 | word2vec_embedding = get_transformers_word_embeddings(transformer_model) 152 | # 153 | df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE) 154 | df_articles, token_col_title = convert_text2encoding_with_transformers( 155 | df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH 156 | ) 157 | # => 158 | article_mapping = create_article_id_to_value_mapping( 159 | df=df_articles, value_col=token_col_title 160 | ) 161 | 162 | # => 163 | print("Init train- and val-dataloader") 164 | train_dataloader = NRMSDataLoaderPretransform( 165 | 
behaviors=df_train, 166 | article_dict=article_mapping, 167 | unknown_representation="zeros", 168 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 169 | eval_mode=False, 170 | batch_size=BATCH_SIZE_TRAIN, 171 | ) 172 | val_dataloader = NRMSDataLoaderPretransform( 173 | behaviors=df_validation, 174 | article_dict=article_mapping, 175 | unknown_representation="zeros", 176 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 177 | eval_mode=False, 178 | batch_size=BATCH_SIZE_VAL, 179 | ) 180 | 181 | # CALLBACKS 182 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1) 183 | early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2) 184 | modelcheckpoint = tf.keras.callbacks.ModelCheckpoint( 185 | filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1 186 | ) 187 | 188 | model = NRMSModel( 189 | hparams=hparams_nrms, 190 | word2vec_embedding=word2vec_embedding, 191 | seed=42, 192 | ) 193 | hist = model.model.fit( 194 | train_dataloader, 195 | validation_data=val_dataloader, 196 | epochs=EPOCHS, 197 | callbacks=[tensorboard_callback, early_stopping], 198 | ) 199 | del ( 200 | transformer_tokenizer, 201 | transformer_model, 202 | train_dataloader, 203 | val_dataloader, 204 | df_validation, 205 | df_train, 206 | ) 207 | gc.collect() 208 | 209 | print(f"saving model: {MODEL_WEIGHTS}") 210 | model.model.save_weights(MODEL_WEIGHTS) 211 | print(f"loading model: {MODEL_WEIGHTS}") 212 | model.model.load_weights(MODEL_WEIGHTS) 213 | 214 | # => 215 | print("Init df_test") 216 | df_test = ( 217 | ebnerd_from_path(PATH.joinpath("ebnerd_testset", "test"), history_size=HISTORY_SIZE) 218 | .sample(fraction=FRACTION_TEST) 219 | .with_columns( 220 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 221 | .list.first() 222 | .alias(DEFAULT_CLICKED_ARTICLES_COL) 223 | ) 224 | .select(COLUMNS + [DEFAULT_IS_BEYOND_ACCURACY_COL]) 225 | .with_columns( 226 | pl.col(DEFAULT_INVIEW_ARTICLES_COL) 227 | .list.eval(pl.element() * 0) 228 | .alias(DEFAULT_LABELS_COL) 229 | ) 230 | ) 231 | # Split test in beyond-accuracy. BA samples have more 'article_ids_inview'. 
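# Beyond-accuracy (BA) impressions carry far more in-view candidates than the ordinary
# test impressions, which is why they get their own dataloader further below and the much
# smaller BATCH_SIZE_TEST_W_B, keeping the evaluation memory footprint manageable.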
232 | df_test_wo_beyond = df_test.filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL)) 233 | df_test_w_beyond = df_test.filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL)) 234 | 235 | df_test_chunks = chunk_dataframe(df_test_wo_beyond, n_chunks=N_CHUNKS_TEST) 236 | df_pred_test_wo_beyond = [] 237 | 238 | for i, df_test_chunk in enumerate(df_test_chunks[CHUNKS_DONE:], start=1 + CHUNKS_DONE): 239 | print(f"Init test-dataloader: {i}/{len(df_test_chunks)}") 240 | # Initialize DataLoader 241 | test_dataloader_wo_b = NRMSDataLoader( 242 | behaviors=df_test_chunk, 243 | article_dict=article_mapping, 244 | unknown_representation="zeros", 245 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 246 | eval_mode=True, 247 | batch_size=BATCH_SIZE_TEST_WO_B, 248 | ) 249 | # Predict and clear session 250 | scores = model.scorer.predict(test_dataloader_wo_b) 251 | clear_session() 252 | 253 | # Process the predictions 254 | df_test_chunk = add_prediction_scores(df_test_chunk, scores.tolist()).with_columns( 255 | pl.col("scores") 256 | .map_elements(lambda x: list(rank_predictions_by_score(x))) 257 | .alias("ranked_scores") 258 | ) 259 | 260 | # Save the processed chunk 261 | df_test_chunk.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 262 | TEST_DF_DUMP.joinpath(f"pred_wo_ba_{i}.parquet") 263 | ) 264 | 265 | # Append and clean up 266 | df_pred_test_wo_beyond.append(df_test_chunk) 267 | 268 | # Cleanup 269 | del df_test_chunk, test_dataloader_wo_b, scores 270 | gc.collect() 271 | 272 | # => 273 | df_pred_test_wo_beyond = pl.concat(df_pred_test_wo_beyond) 274 | df_pred_test_wo_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 275 | TEST_DF_DUMP.joinpath("pred_wo_ba.parquet") 276 | ) 277 | 278 | print("Init test-dataloader: beyond-accuracy") 279 | test_dataloader_w_b = NRMSDataLoader( 280 | behaviors=df_test_w_beyond, 281 | article_dict=article_mapping, 282 | unknown_representation="zeros", 283 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 284 | eval_mode=True, 285 | batch_size=BATCH_SIZE_TEST_W_B, 286 | ) 287 | scores = model.scorer.predict(test_dataloader_w_b) 288 | df_pred_test_w_beyond = add_prediction_scores( 289 | df_test_w_beyond, scores.tolist() 290 | ).with_columns( 291 | pl.col("scores") 292 | .map_elements(lambda x: list(rank_predictions_by_score(x))) 293 | .alias("ranked_scores") 294 | ) 295 | df_pred_test_w_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 296 | TEST_DF_DUMP.joinpath("pred_w_ba.parquet") 297 | ) 298 | 299 | # => 300 | df_test = pl.concat([df_pred_test_wo_beyond, df_pred_test_w_beyond]) 301 | df_test.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet( 302 | TEST_DF_DUMP.joinpath("pred_concat.parquet") 303 | ) 304 | # metrics = MetricEvaluator( 305 | # labels=df_validation["labels"].to_list(), 306 | # predictions=df_validation["scores"].to_list(), 307 | # metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)], 308 | # ) 309 | # metrics.evaluate() 310 | 311 | write_submission_file( 312 | impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL], 313 | prediction_scores=df_test["ranked_scores"], 314 | path=DUMP_DIR.joinpath("predictions.txt"), 315 | filename_zip=f"{DATASPLIT}_predictions-{MODEL_NAME}.zip", 316 | ) 317 | -------------------------------------------------------------------------------- /examples/reproducibility_scripts/args_nrms.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = 
argparse.ArgumentParser( 6 | description="Argument parser for NRMSModel training" 7 | ) 8 | 9 | parser.add_argument( 10 | "--data_path", 11 | type=str, 12 | default=str("~/ebnerd_data"), 13 | help="Path to the data directory", 14 | ) 15 | 16 | # General settings 17 | parser.add_argument("--seed", type=int, default=123, help="Random seed") 18 | parser.add_argument( 19 | "--datasplit", type=str, default="ebnerd_small", help="Dataset split to use" 20 | ) 21 | parser.add_argument("--debug", action="store_true", help="Enable debug mode") 22 | 23 | # Batch sizes 24 | parser.add_argument( 25 | "--bs_train", type=int, default=32, help="Batch size for training" 26 | ) 27 | parser.add_argument( 28 | "--bs_test", type=int, default=32, help="Batch size for testing" 29 | ) 30 | parser.add_argument( 31 | "--batch_size_test_wo_b", 32 | type=int, 33 | default=32, 34 | help="Batch size for testing without balancing", 35 | ) 36 | parser.add_argument( 37 | "--batch_size_test_w_b", 38 | type=int, 39 | default=4, 40 | help="Batch size for testing with balancing", 41 | ) 42 | 43 | # History and ratios 44 | parser.add_argument( 45 | "--history_size", type=int, default=20, help="History size for the model" 46 | ) 47 | parser.add_argument( 48 | "--npratio", type=int, default=4, help="Negative-positive ratio" 49 | ) 50 | 51 | # Training settings 52 | parser.add_argument("--epochs", type=int, default=5, help="Number of epochs") 53 | parser.add_argument( 54 | "--train_fraction", 55 | type=float, 56 | default=1.0, 57 | help="Fraction of training data to use", 58 | ) 59 | parser.add_argument( 60 | "--fraction_test", 61 | type=float, 62 | default=1.0, 63 | help="Fraction of testing data to use", 64 | ) 65 | 66 | # Model and loader settings 67 | parser.add_argument( 68 | "--nrms_loader", 69 | type=str, 70 | default="NRMSDataLoaderPretransform", 71 | choices=["NRMSDataLoaderPretransform", "NRMSDataLoader"], 72 | help="Data loader type (speed or memory efficient)", 73 | ) 74 | 75 | # Chunk processing 76 | parser.add_argument( 77 | "--n_chunks_test", type=int, default=10, help="Number of test chunks to process" 78 | ) 79 | parser.add_argument( 80 | "--chunks_done", type=int, default=0, help="Number of chunks already processed" 81 | ) 82 | 83 | # ===================================================================================== 84 | # ############################# UNIQUE FOR NRMSDocVec ############################### 85 | # ===================================================================================== 86 | # Transformer settings 87 | parser.add_argument( 88 | "--transformer_model_name", 89 | type=str, 90 | default="FacebookAI/xlm-roberta-large", 91 | help="Transformer model name", 92 | ) 93 | parser.add_argument( 94 | "--max_title_length", 95 | type=int, 96 | default=30, 97 | help="Maximum length of title encoding", 98 | ) 99 | 100 | # Hyperparameters 101 | parser.add_argument( 102 | "--head_num", type=int, default=20, help="Number of attention heads" 103 | ) 104 | parser.add_argument( 105 | "--head_dim", type=int, default=20, help="Dimension of each attention head" 106 | ) 107 | parser.add_argument( 108 | "--attention_hidden_dim", 109 | type=int, 110 | default=200, 111 | help="Dimension of attention hidden layers", 112 | ) 113 | 114 | # Optimizer settings 115 | parser.add_argument( 116 | "--optimizer", type=str, default="adam", help="Optimizer to use" 117 | ) 118 | parser.add_argument( 119 | "--loss", type=str, default="cross_entropy_loss", help="Loss function" 120 | ) 121 | parser.add_argument("--dropout", 
type=float, default=0.20, help="Dropout rate") 122 | parser.add_argument( 123 | "--learning_rate", type=float, default=1e-4, help="Learning rate" 124 | ) 125 | 126 | return parser.parse_args() 127 | -------------------------------------------------------------------------------- /examples/reproducibility_scripts/args_nrms_docvec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser( 6 | description="Argument parser for NRMSModel training" 7 | ) 8 | 9 | parser.add_argument( 10 | "--data_path", 11 | type=str, 12 | default=str("~/ebnerd_data"), 13 | help="Path to the data directory", 14 | ) 15 | 16 | # General settings 17 | parser.add_argument("--seed", type=int, default=123, help="Random seed") 18 | parser.add_argument( 19 | "--datasplit", type=str, default="ebnerd_small", help="Dataset split to use" 20 | ) 21 | parser.add_argument("--debug", action="store_true", help="Enable debug mode") 22 | 23 | # Batch sizes 24 | parser.add_argument( 25 | "--bs_train", type=int, default=32, help="Batch size for training" 26 | ) 27 | parser.add_argument( 28 | "--bs_test", type=int, default=32, help="Batch size for testing" 29 | ) 30 | parser.add_argument( 31 | "--batch_size_test_wo_b", 32 | type=int, 33 | default=32, 34 | help="Batch size for testing without balancing", 35 | ) 36 | parser.add_argument( 37 | "--batch_size_test_w_b", 38 | type=int, 39 | default=4, 40 | help="Batch size for testing with balancing", 41 | ) 42 | 43 | # History and ratios 44 | parser.add_argument( 45 | "--history_size", type=int, default=20, help="History size for the model" 46 | ) 47 | parser.add_argument( 48 | "--npratio", type=int, default=4, help="Negative-positive ratio" 49 | ) 50 | 51 | # Training settings 52 | parser.add_argument("--epochs", type=int, default=5, help="Number of epochs") 53 | parser.add_argument( 54 | "--train_fraction", 55 | type=float, 56 | default=1.0, 57 | help="Fraction of training data to use", 58 | ) 59 | parser.add_argument( 60 | "--fraction_test", 61 | type=float, 62 | default=1.0, 63 | help="Fraction of testing data to use", 64 | ) 65 | 66 | # Model and loader settings 67 | parser.add_argument( 68 | "--nrms_loader", 69 | type=str, 70 | default="NRMSDataLoaderPretransform", 71 | choices=["NRMSDataLoaderPretransform", "NRMSDataLoader"], 72 | help="Data loader type (speed or memory efficient)", 73 | ) 74 | 75 | # Chunk processing 76 | parser.add_argument( 77 | "--n_chunks_test", type=int, default=10, help="Number of test chunks to process" 78 | ) 79 | parser.add_argument( 80 | "--chunks_done", type=int, default=0, help="Number of chunks already processed" 81 | ) 82 | 83 | # ===================================================================================== 84 | # ############################# UNIQUE FOR NRMSDocVec ############################### 85 | # ===================================================================================== 86 | 87 | parser.add_argument( 88 | "--document_embeddings", 89 | type=str, 90 | default="Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet", 91 | help="Path to the document embeddings file", 92 | ) 93 | # Model function and architecture 94 | parser.add_argument( 95 | "--title_size", type=int, default=768, help="Size of title encoding" 96 | ) 97 | parser.add_argument( 98 | "--head_num", type=int, default=16, help="Number of attention heads" 99 | ) 100 | parser.add_argument( 101 | "--head_dim", type=int, default=16, help="Dimension of each 
attention head" 102 | ) 103 | parser.add_argument( 104 | "--attention_hidden_dim", 105 | type=int, 106 | default=200, 107 | help="Dimension of attention hidden layers", 108 | ) 109 | parser.add_argument( 110 | "--newsencoder_units_per_layer", 111 | nargs="+", 112 | type=int, 113 | default=[512, 512, 512], 114 | help="List of units per layer in the news encoder", 115 | ) 116 | 117 | # Optimizer settings 118 | parser.add_argument( 119 | "--optimizer", type=str, default="adam", help="Optimizer to use" 120 | ) 121 | parser.add_argument( 122 | "--loss", type=str, default="cross_entropy_loss", help="Loss function" 123 | ) 124 | parser.add_argument("--dropout", type=float, default=0.2, help="Dropout rate") 125 | parser.add_argument( 126 | "--learning_rate", type=float, default=1e-4, help="Learning rate" 127 | ) 128 | parser.add_argument( 129 | "--newsencoder_l2_regularization", 130 | type=float, 131 | default=1e-4, 132 | help="L2 regularization for the news encoder", 133 | ) 134 | 135 | return parser.parse_args() 136 | -------------------------------------------------------------------------------- /examples/reproducibility_scripts/ebnerd_nrms_doc_hist.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel 2 | from ebrec.utils._nlp import get_transformers_word_embeddings 3 | from ebrec.utils._articles import convert_text2encoding_with_transformers 4 | 5 | from pathlib import Path 6 | import tensorflow as tf 7 | import datetime as dt 8 | import polars as pl 9 | import shutil 10 | import gc 11 | import os 12 | 13 | from ebrec.utils._constants import * 14 | 15 | from ebrec.utils._behaviors import ( 16 | create_binary_labels_column, 17 | sampling_strategy_wu2019, 18 | add_prediction_scores, 19 | truncate_history, 20 | ebnerd_from_path, 21 | ) 22 | from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore 23 | 24 | from ebrec.utils._python import ( 25 | write_submission_file, 26 | rank_predictions_by_score, 27 | write_json_file, 28 | ) 29 | from ebrec.utils._articles import create_article_id_to_value_mapping 30 | from ebrec.utils._polars import split_df_chunks, concat_str_columns 31 | 32 | from ebrec.models.newsrec.dataloader import NRMSDataLoader, NRMSDataLoaderPretransform 33 | from ebrec.models.newsrec.model_config import ( 34 | hparams_nrms, 35 | hparams_nrms_docvec, 36 | hparams_to_dict, 37 | print_hparams, 38 | ) 39 | from ebrec.models.newsrec.nrms_docvec import NRMSDocVec 40 | from ebrec.models.newsrec import NRMSModel 41 | 42 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 43 | 44 | from args_nrms_docvec import get_args 45 | 46 | args = get_args() 47 | 48 | for arg, val in vars(args).items(): 49 | print(f"{arg} : {val}") 50 | 51 | PATH = Path(args.data_path).expanduser() 52 | # Access arguments as variables 53 | SEED = args.seed 54 | DATASPLIT = args.datasplit 55 | DEBUG = args.debug 56 | BS_TRAIN = args.bs_train 57 | BS_TEST = args.bs_test 58 | BATCH_SIZE_TEST_WO_B = args.batch_size_test_wo_b 59 | BATCH_SIZE_TEST_W_B = args.batch_size_test_w_b 60 | HISTORY_SIZE = args.history_size 61 | NPRATIO = args.npratio 62 | EPOCHS = args.epochs 63 | TRAIN_FRACTION = args.train_fraction if not DEBUG else 0.0001 64 | FRACTION_TEST = args.fraction_test if not DEBUG else 0.0001 65 | 66 | NRMSLoader_training = ( 67 | NRMSDataLoaderPretransform 68 | if args.nrms_loader == "NRMSDataLoaderPretransform" 69 | else NRMSDataLoader 70 | ) 71 | 72 | # 
===================================================================================== 73 | # ############################# UNIQUE FOR NRMSModel ################################ 74 | # ===================================================================================== 75 | 76 | # Model in use: 77 | model_func = NRMSDocVec 78 | hparams = hparams_nrms_docvec 79 | # 80 | hparams.title_size = args.title_size 81 | hparams.history_size = args.history_size 82 | hparams.head_num = args.head_num 83 | hparams.head_dim = args.head_dim 84 | hparams.attention_hidden_dim = args.attention_hidden_dim 85 | hparams.newsencoder_units_per_layer = args.newsencoder_units_per_layer 86 | hparams.optimizer = args.optimizer 87 | hparams.loss = args.loss 88 | hparams.dropout = args.dropout 89 | hparams.learning_rate = args.learning_rate 90 | hparams.newsencoder_l2_regularization = args.newsencoder_l2_regularization 91 | print_hparams(hparams) 92 | 93 | # ============= 94 | # Data-path 95 | DOC_VEC_PATH = PATH.joinpath(f"artifacts/{args.document_embeddings}") 96 | print("Initiating articles...") 97 | df_articles = pl.read_parquet(DOC_VEC_PATH) 98 | article_mapping = create_article_id_to_value_mapping( 99 | df=df_articles, value_col=df_articles.columns[-1] 100 | ) 101 | 102 | # ===================================================================================== 103 | # ############################# UNIQUE FOR NRMSDocVec ############################### 104 | # ===================================================================================== 105 | 106 | 107 | # Dump paths: 108 | DUMP_DIR = Path("ebnerd_predictions") 109 | DUMP_DIR.mkdir(exist_ok=True, parents=True) 110 | # 111 | DT_NOW = dt.datetime.now() 112 | # 113 | MODEL_NAME = model_func.__name__ 114 | MODEL_OUTPUT_NAME = f"{MODEL_NAME}-{DT_NOW}" 115 | # 116 | ARTIFACT_DIR = DUMP_DIR.joinpath("test_predictions", MODEL_NAME) 117 | # Model monitoring: 118 | MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_OUTPUT_NAME}/weights") 119 | LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_OUTPUT_NAME}") 120 | # Evaluating the test test can be memory intensive, we'll chunk it up: 121 | TEST_CHUNKS_DIR = ARTIFACT_DIR.joinpath("test_chunks") 122 | TEST_CHUNKS_DIR.mkdir(parents=True, exist_ok=True) 123 | N_CHUNKS_TEST = 10 124 | CHUNKS_DONE = 0 # if it crashes, you can start from here. 
125 | # Just trying keeping the dataframe slime: 126 | COLUMNS = [ 127 | DEFAULT_IMPRESSION_TIMESTAMP_COL, 128 | DEFAULT_HISTORY_ARTICLE_ID_COL, 129 | DEFAULT_INVIEW_ARTICLES_COL, 130 | DEFAULT_CLICKED_ARTICLES_COL, 131 | DEFAULT_IMPRESSION_ID_COL, 132 | DEFAULT_USER_COL, 133 | ] 134 | # Store hparams 135 | write_json_file( 136 | hparams_to_dict(hparams), 137 | ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_hparams.json"), 138 | ) 139 | write_json_file(vars(args), ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_argparser.json")) 140 | 141 | # ===================================================================================== 142 | 143 | df = ( 144 | ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE) 145 | .sample(fraction=TRAIN_FRACTION, shuffle=True, seed=SEED) 146 | .select(COLUMNS) 147 | .pipe( 148 | sampling_strategy_wu2019, 149 | npratio=4, 150 | shuffle=True, 151 | with_replacement=True, 152 | seed=SEED, 153 | ) 154 | .pipe(create_binary_labels_column) 155 | ) 156 | # 157 | last_dt = df[DEFAULT_IMPRESSION_TIMESTAMP_COL].dt.date().max() - dt.timedelta(days=1) 158 | df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() < last_dt) 159 | df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() >= last_dt) 160 | 161 | 162 | # ===================================================================================== 163 | train_dataloader = NRMSDataLoaderPretransform( 164 | behaviors=df_train, 165 | article_dict=article_mapping, 166 | unknown_representation="zeros", 167 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 168 | eval_mode=False, 169 | batch_size=BS_TRAIN, 170 | ) 171 | 172 | val_dataloader = NRMSDataLoaderPretransform( 173 | behaviors=df_validation, 174 | article_dict=article_mapping, 175 | unknown_representation="zeros", 176 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 177 | eval_mode=False, 178 | batch_size=BS_TEST, 179 | ) 180 | 181 | # ===================================================================================== 182 | print(f"Initiating training-dataloader") 183 | train_dataloader = NRMSLoader_training( 184 | behaviors=df_train, 185 | article_dict=article_mapping, 186 | unknown_representation="zeros", 187 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 188 | eval_mode=False, 189 | batch_size=BS_TRAIN, 190 | ) 191 | 192 | val_dataloader = NRMSLoader_training( 193 | behaviors=df_validation, 194 | article_dict=article_mapping, 195 | unknown_representation="zeros", 196 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 197 | eval_mode=False, 198 | batch_size=BS_TRAIN, 199 | ) 200 | 201 | # ===================================================================================== 202 | # CALLBACKS 203 | tensorboard_callback = tf.keras.callbacks.TensorBoard( 204 | log_dir=LOG_DIR, 205 | histogram_freq=1, 206 | ) 207 | early_stopping = tf.keras.callbacks.EarlyStopping( 208 | monitor="val_auc", 209 | mode="max", 210 | patience=4, 211 | restore_best_weights=True, 212 | ) 213 | modelcheckpoint = tf.keras.callbacks.ModelCheckpoint( 214 | filepath=MODEL_WEIGHTS, 215 | monitor="val_auc", 216 | mode="max", 217 | save_best_only=True, 218 | save_weights_only=True, 219 | verbose=1, 220 | ) 221 | lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau( 222 | monitor="val_auc", 223 | mode="max", 224 | factor=0.2, 225 | patience=2, 226 | min_lr=1e-6, 227 | ) 228 | callbacks = [tensorboard_callback, early_stopping, modelcheckpoint, lr_scheduler] 229 | 230 | # ===================================================================================== 231 | 
model = model_func( 232 | hparams=hparams, 233 | seed=42, 234 | ) 235 | model.model.compile( 236 | optimizer=model.model.optimizer, 237 | loss=model.model.loss, 238 | metrics=["AUC"], 239 | ) 240 | f"Initiating {MODEL_NAME}, start training..." 241 | # => 242 | hist = model.model.fit( 243 | train_dataloader, 244 | validation_data=val_dataloader, 245 | epochs=EPOCHS, 246 | callbacks=callbacks, 247 | ) 248 | 249 | print(f"loading model: {MODEL_WEIGHTS}") 250 | model.model.load_weights(MODEL_WEIGHTS) 251 | 252 | # ===================================================================================== 253 | 254 | # First filter: only keep users with >FILTER_MIN_HISTORY in history-size 255 | FILTER_MIN_HISTORY = 100 256 | # Truncate the history 257 | HIST_SIZE = 100 258 | 259 | # => 260 | df = ( 261 | ebnerd_from_path( 262 | PATH.joinpath(DATASPLIT, "validation"), history_size=120, padding=None 263 | ) 264 | .sample(fraction=FRACTION_TEST) 265 | .filter(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.len() >= FILTER_MIN_HISTORY) 266 | .select(COLUMNS) 267 | .pipe(create_binary_labels_column) 268 | ) 269 | 270 | pairs = [ 271 | (1, 256), 272 | (2, 256), 273 | (3, 256), 274 | (4, 256), 275 | (5, 256), 276 | (6, 256), 277 | (7, 256), 278 | (8, 256), 279 | (9, 256), 280 | (10, 256), 281 | (15, 128), 282 | (20, 128), 283 | (30, 64), 284 | (40, 64), 285 | (50, 64), 286 | ] 287 | 288 | aucs = [] 289 | hists = [] 290 | for hist_size, batch_size in pairs: 291 | print(f"History size: {hist_size}, Batch size: {batch_size}") 292 | 293 | df_ = df.pipe( 294 | truncate_history, 295 | column=DEFAULT_HISTORY_ARTICLE_ID_COL, 296 | history_size=hist_size, 297 | padding_value=0, 298 | enable_warning=False, 299 | ) 300 | 301 | test_dataloader = NRMSDataLoader( 302 | behaviors=df_, 303 | article_dict=article_mapping, 304 | unknown_representation="zeros", 305 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 306 | eval_mode=True, 307 | batch_size=batch_size, 308 | ) 309 | 310 | scores = model.scorer.predict(test_dataloader) 311 | 312 | df_pred = add_prediction_scores(df_, scores.tolist()) 313 | 314 | metrics = MetricEvaluator( 315 | labels=df_pred["labels"], 316 | predictions=df_pred["scores"], 317 | metric_functions=[AucScore()], 318 | ) 319 | metrics.evaluate() 320 | auc = metrics.evaluations["auc"] 321 | aucs.append(round(auc, 4)) 322 | hists.append(hist_size) 323 | print(f"{auc} (History size: {hist_size}, Batch size: {batch_size})") 324 | 325 | for h, a in zip(hists, aucs): 326 | print(f"({a}, {h}),") 327 | 328 | results = {h: a for h, a in zip(hists, aucs)} 329 | write_json_file(results, ARTIFACT_DIR.joinpath("auc_history_length.json")) 330 | 331 | # Clean up 332 | if TEST_CHUNKS_DIR.exists() and TEST_CHUNKS_DIR.is_dir(): 333 | shutil.rmtree(TEST_CHUNKS_DIR) 334 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ebrec" 3 | description = "Ekstra Bladet Benchmark" 4 | version = "0.0.1" 5 | authors = [{ name = "Johannes Kruse", email = "johannes.kruse@eb.dk" }] 6 | requires-python = ">=3.10, <3.12" 7 | dependencies = [ 8 | # fastformer: 9 | "transformers>=4.30.0, <4.37.3", 10 | # newsrec: 11 | "tensorflow>=2.12.0, <2.16.0", 12 | # Fastformer; DeepCTR 13 | "torch>=2.0.0, <2.3.0", 14 | # Evaluation: 15 | "scikit-learn==1.4.0", 16 | # GENERAL: 17 | "numpy>=1.24.0, <1.26.1", 18 | "polars==0.20.8", 19 | "pyyaml==6.0.1", 20 | "tqdm", 21 | ] 22 | 23 | 
[project.optional-dependencies] 24 | # pip install "my_project[extras]" 25 | # pip install -e .'[notebooks]' 26 | notebooks = ["transformers", "jupyter"] 27 | tests = [ 28 | "pytest", 29 | "transformers>=4.30.0, <4.37.3", 30 | "tensorflow>=2.12.0, <2.16.0", 31 | "torch>=2.0.0, <2.3.0", 32 | ] 33 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/__init__.py -------------------------------------------------------------------------------- /src/ebrec/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .metrics_protocols import ( 2 | RootMeanSquaredError, 3 | MetricEvaluator, 4 | AccuracyScore, 5 | LogLossScore, 6 | NdcgScore, 7 | AucScore, 8 | F1Score, 9 | MrrScore, 10 | ) 11 | from .beyond_accuracy import ( 12 | IntralistDiversity, 13 | Distribution, 14 | Serendipity, 15 | Coverage, 16 | Novelty, 17 | ) 18 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/_ba_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics.pairwise import cosine_distances 3 | 4 | from ebrec.evaluation.beyond_accuracy import ( 5 | IntralistDiversity, 6 | Distribution, 7 | Serendipity, 8 | Novelty, 9 | Coverage, 10 | ) 11 | 12 | lookup_dict = { 13 | "101": {"doc_vec": np.array([1, 0, 0]), "v": 1, "sv": [1], "pop_sc": 0.50}, 14 | "102": {"doc_vec": np.array([0, 1, 0]), "v": 2, "sv": [1], "pop_sc": 0.25}, 15 | "103": {"doc_vec": np.array([1, 1, 1]), "v": 3, "sv": [1], "pop_sc": 0.75}, 16 | "104": {"doc_vec": np.array([1, 1, 1]), "v": 4, "sv": [1], "pop_sc": 0.50}, 17 | "105": {"doc_vec": np.array([-1, 0, 0]), "v": 5, "sv": [1], "pop_sc": 0.94}, 18 | "106": {"doc_vec": np.array([-1, 0, 0]), "v": 6, "sv": [1, 2], "pop_sc": 0.95}, 19 | "107": {"doc_vec": np.array([-1, 0, 0]), "v": 7, "sv": [1, 2], "pop_sc": 0.96}, 20 | "108": {"doc_vec": np.array([0, 0, 1]), "v": 8, "sv": [1, 2], "pop_sc": 0.50}, 21 | "400": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4], "pop_sc": 0.20}, 22 | "401": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4, 5], "pop_sc": 0.20}, 23 | } 24 | 25 | # 404 is not excepted, however, setup supports it: 26 | R = np.array( 27 | [ 28 | ["101", "102", "400"], 29 | ["101", "103", "400"], 30 | ["101", "102", "103"], 31 | ["101", "104", "400"], 32 | ["101", "106", "404"], 33 | ["404", "404", "404"], 34 | ] 35 | ) 36 | 37 | C = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"] 38 | 39 | click_histories = [ 40 | np.array([["101", "102"]]), 41 | np.array([["105", "106", "400"]]), 42 | np.array([["102", "103", "104"]]), 43 | np.array([["101", "400"]]), 44 | np.array([["400"]]), 45 | np.array([["400"]]), 46 | ] 47 | pairwise_distance_function = cosine_distances 48 | 49 | # => IntralistDiversity 50 | lookup_key = "doc_vec" 51 | div = IntralistDiversity() 52 | div(R, lookup_dict=lookup_dict, lookup_key=lookup_key) 53 | div._candidate_diversity( 54 | R=C, 55 | n_recommendations=2, 56 | lookup_dict=lookup_dict, 57 | lookup_key=lookup_key, 58 | pairwise_distance_function=pairwise_distance_function, 59 | ) 60 | 61 | try: 62 | div._candidate_diversity(C, 7, lookup_dict=lookup_dict, lookup_key=lookup_key) 63 | except ValueError as e: 64 | print(f"Failed - hurra! 
Error message: \n {e}") 65 | 66 | # => Distribution 67 | dist = Distribution() 68 | dist(R[:2], lookup_dict, "v") 69 | dist(R, lookup_dict, "sv") 70 | dist(C, lookup_dict, "v") 71 | try: 72 | dist(C, lookup_dict, "q") 73 | except ValueError as e: 74 | print(f"Failed - hurra! Error message: \n {e}") 75 | 76 | # => Coverage 77 | cov = Coverage() 78 | cov(R) 79 | cov(R, C) 80 | 81 | # => Serendipity 82 | ser = Serendipity() 83 | ser( 84 | R=R, 85 | H=click_histories, 86 | lookup_dict=lookup_dict, 87 | lookup_key=lookup_key, 88 | pairwise_distance_function=pairwise_distance_function, 89 | ) 90 | # np.nan_to_num(ser(R, click_histories, lookup_dict, lookup_key), 0.0) 91 | 92 | # => Novelty 93 | nov = Novelty() 94 | nov(R, lookup_dict=lookup_dict, lookup_key="pop_sc") 95 | nov._candidate_novelty(C, 2, lookup_dict=lookup_dict, lookup_key="pop_sc") 96 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from ._beyond_accuracy import * 2 | from ._classification import * 3 | from ._ranking import * 4 | from ._sklearn import * 5 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_beyond_accuracy.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from sklearn.metrics.pairwise import cosine_distances 4 | from collections import Counter 5 | import numpy as np 6 | 7 | 8 | def intralist_diversity( 9 | R: np.ndarray[np.ndarray], 10 | pairwise_distance_function: Callable = cosine_distances, 11 | ) -> float: 12 | """Calculate the intra-list diversity of a recommendation list. 13 | 14 | This function implements the method described by Smyth and McClave (2001) to 15 | measure the diversity within a recommendation list. It calculates the average 16 | pairwise distance between all items in the list. 17 | 18 | Formula: 19 | Diversity(R) = ( sum_{i∈R} sum_{j∈R_{i}} dist(i, j) ) / ( |R|(|R|-1) ) 20 | 21 | where `R` is the recommendation list, and `dist` represents the pairwise distance function used. 22 | 23 | Args: 24 | R (np.ndarray[np.ndarray]): A 2D numpy array where each row represents a recommendation. 25 | This array should be either array-like or a sparse matrix, with shape (n_samples_X, n_features). 26 | pairwise_distance_function (Callable, optional): A function to compute pairwise distance 27 | between samples. Defaults to `cosine_distances`. 28 | 29 | Returns: 30 | float: The calculated diversity score. If the recommendation list contains less than or 31 | equal to one item, NaN is returned to signify an undefined diversity score. 32 | 33 | References: 34 | Smyth, B., McClave, P. (2001). Similarity vs. Diversity. In: Aha, D.W., Watson, I. (eds) 35 | Case-Based Reasoning Research and Development. ICCBR 2001. Lecture Notes in Computer Science(), 36 | vol 2080. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/3-540-44593-5_25
37 | 
38 |     Examples:
39 |         >>> R1 = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
40 |         >>> print(intralist_diversity(R1))
41 |             0.022588438516842262
42 |         >>> print(intralist_diversity(np.array([[0.1, 0.2], [0.1, 0.2]])))
43 |             1.1102230246251565e-16
44 |     """
45 |     R_n = R.shape[0]  # number of recommendations
46 |     if R_n <= 1:
47 |         # One or fewer recommendations in the recommendation list
48 |         diversity = np.nan
49 |     else:
50 |         pairwise_distances = pairwise_distance_function(R, R)
51 |         diversity = np.sum(pairwise_distances) / (R_n * (R_n - 1))
52 |     return diversity
53 | 
54 | 
55 | def serendipity(
56 |     R: np.ndarray[np.ndarray],
57 |     H: np.ndarray[np.ndarray],
58 |     pairwise_distance_function: Callable = cosine_distances,
59 | ) -> float:
60 |     """Calculate the serendipity score between a set of recommendations and user's reading history.
61 | 
62 |     This function implements the concept of serendipity as defined by Feng Lu, Anca Dumitrache, and David Graus (2020).
63 |     Serendipity in this context is measured as the mean distance between the items in the recommendation list and the
64 |     user's reading history.
65 | 
66 |     Formula:
67 |         Serendipity(R, H) = ( sum_{i∈R} sum_{j∈H} dist(i, j) ) / ( |R||H| )
68 | 
69 |     where `R` is the recommendation list, `H` is the user's reading history, and `dist` is the pairwise distance function.
70 | 
71 |     Args:
72 |         R (np.ndarray[np.ndarray]): A 2D numpy array representing the recommendation list, where each row is a recommendation.
73 |             It should be either array-like or a sparse matrix, with shape (n_samples_X, n_features).
74 |         H (np.ndarray[np.ndarray]): A 2D numpy array representing the user's reading history, with the same format as R.
75 |         pairwise_distance_function (Callable, optional): A function to compute pairwise distance between samples.
76 |             Defaults to `cosine_distances`.
77 | 
78 |     Returns:
79 |         float: The calculated serendipity score.
80 | 
81 |     References:
82 |         Lu, F., Dumitrache, A., & Graus, D. (2020). Beyond Optimizing for Clicks: Incorporating Editorial Values in News Recommendation.
83 |         Retrieved from https://arxiv.org/abs/2004.09980
84 | 
85 |     Examples:
86 |         >>> R1 = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])
87 |         >>> H1 = np.array([[0.7, 0.8, 0.9], [0.1, 0.2, 0.3]])
88 |         >>> print(serendipity(R1, H1))
89 |             0.016941328887631724
90 |     """
91 |     # Compute the pairwise distances between each vector:
92 |     dists = pairwise_distance_function(R, H)
93 |     # Compute serendipity:
94 |     return np.mean(dists)
95 | 
96 | 
97 | def coverage_count(R: np.ndarray) -> int:
98 |     """Calculate the number of distinct items in a recommendation list.
99 | 
100 |     Args:
101 |         R (np.ndarray): An array containing the items in the recommendation list.
102 | 
103 |     Returns:
104 |         int: The count of distinct items in the recommendation list.
105 | 
106 |     Examples:
107 |         >>> R1 = np.array([1, 2, 3, 4, 5, 5, 6])
108 |         >>> print(coverage_count(R1))
109 |             6
110 |     """
111 |     # Distinct items:
112 |     return np.unique(R).size
113 | 
114 | 
115 | def coverage_fraction(R: np.ndarray, C: np.ndarray) -> float:
116 |     """Calculate the fraction of distinct items in the recommendation list compared to a universal set.
117 | 
118 |     Args:
119 |         R (np.ndarray): An array containing the items in the recommendation list.
120 |         C (np.ndarray): An array representing the universal set of items.
121 |             It should contain all possible items that can be recommended.
122 | 123 | Returns: 124 | float: The fraction representing the coverage of the recommendation system. 125 | This is calculated as the size of unique elements in R divided by the size of unique elements in C. 126 | 127 | Examples: 128 | >>> R1 = np.array([1, 2, 3, 4, 5, 5, 6]) 129 | >>> C1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) 130 | >>> print(coverage_fraction(R1, C1)) # Expected output: 0.6 131 | 0.6 132 | """ 133 | # Distinct items: 134 | return np.unique(R).size / np.unique(C).size 135 | 136 | 137 | def novelty(R: np.ndarray[float]) -> float: 138 | """Calculate the novelty score of recommendations based on their popularity. 139 | 140 | This function computes the novelty score for a set of recommendations by applying the self-information popularity metric. 141 | It uses the formula described by Zhou et al. (2010) and Vargas and Castells (2011). The novelty is calculated as the 142 | average negative logarithm (base 2) of the popularity scores of the items in the recommendation list. 143 | 144 | Formula: 145 | Novelty(R) = ( sum_{i∈R} -log2( p_i ) / ( |R| ) 146 | 147 | where p_i represents the popularity score of each item in the recommendation list R, and |R| is the size of R. 148 | 149 | Args: 150 | R (np.ndarray[float]): An array of popularity scores (p_i) for each item in the recommendation list. 151 | 152 | Returns: 153 | float: The calculated novelty score. Higher values indicate less popular (more novel) recommendations. 154 | 155 | References: 156 | Zhou et al. (2010). 157 | Vargas & Castells (2011). 158 | 159 | Examples: 160 | >>> print(novelty([0.1, 0.2, 0.3, 0.4, 0.5])) # Expected: High score (low popularity scores) 161 | 1.9405499757656586 162 | >>> print(novelty([0.9, 0.9, 0.9, 1.0, 0.5])) # Expected: Low score (high popularity scores) 163 | 0.29120185606703 164 | """ 165 | return np.mean(-np.log2(R)) 166 | 167 | 168 | def index_of_dispersion(x: list[int]) -> float: 169 | """ 170 | Computes the Index of Dispersion (variance-to-mean ratio) for a given dataset of nominal variables. 171 | 172 | The Index of Dispersion is a statistical measure used to quantify the dispersion or variability of a distribution 173 | relative to its mean. It's particularly useful in identifying whether a dataset follows a Poisson distribution, 174 | where the Index of Dispersion would be approximately 1. 175 | 176 | Formula: 177 | D = ( k * (N^2 - Σf^2) ) / ( N^2 * (k-1) ) 178 | Where: 179 | k = number of categories in the data set (including categories with zero items), 180 | N = number of items in the set, 181 | f = number of frequencies or ratings, 182 | Σf^2 = sum of squared frequencies/ratings. 183 | 184 | Args: 185 | x (list[int]): A list of integers representing frequencies or counts of occurrences in different categories. 186 | Each integer in the list corresponds to the count of occurrences in a given category. 187 | 188 | Returns: 189 | float: The Index of Dispersion for the dataset. Returns `np.nan` if the input list contains only one item, 190 | indicating an undefined Index of Dispersion. Returns 0 if there's only one category present in the dataset. 191 | 192 | References: 193 | Walker, 1999, Statistics in criminal 194 | Source: https://www.statisticshowto.com/index-of-dispersion/ 195 | 196 | Examples: 197 | Given the following categories: Math(25), Economics(42), Chemistry(13), Physical Education (8), Religious Studies (13). 
198 | >>> N = np.sum(25+42+13+8+13) 199 | >>> k = 5 200 | >>> sq_f2 = np.sum(25**2 + 42**2 + 13**2 + 8**2 + 13**2) 201 | >>> iod = ( k * (N**2 - sq_f2)) / ( N**2 * (k-1) ) 202 | 0.9079992157631604 203 | 204 | Validate method: 205 | >>> cat = [[1]*25, [2]*42, [3]*13, [4]*8, [5]*13] 206 | >>> flat_list = [item for sublist in cat for item in sublist] 207 | >>> index_of_dispersion(flat_list) 208 | 0.9079992157631604 209 | """ 210 | # number of items 211 | N = len(x) 212 | # compute frequencies 213 | count = Counter(x) 214 | # number of categories 215 | k = len(count) 216 | if k == 1: 217 | if N == 1: 218 | return np.nan 219 | else: 220 | return 0 221 | # squared frequencies 222 | f_squared = [count.get(f) ** 2 for f in count] 223 | # compute Index of Dispersion 224 | D = k * (N**2 - sum(f_squared)) / (N**2 * (k - 1)) 225 | return D 226 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def auc_score_custom(y_true: np.ndarray, y_pred: np.ndarray) -> float: 5 | """ 6 | Computes the Area Under the Curve (AUC) score for the Receiver Operating Characteristic (ROC) curve using a 7 | custom method. This implementation is particularly useful for understanding basic ROC curve properties and 8 | for educational purposes to demonstrate how AUC scores can be manually calculated. 9 | 10 | This function may produce slightly different results compared to standard library implementations (e.g., sklearn's roc_auc_score) 11 | in cases where positive and negative predictions have the same score. The function treats the problem as a binary classification task, 12 | comparing the prediction scores for positive instances against those for negative instances directly. 13 | 14 | Args: 15 | y_true (np.ndarray): A binary array indicating the true classification (1 for positive class and 0 for negative class). 16 | y_pred (np.ndarray): An array of scores as predicted by a model, indicating the likelihood of each instance being positive. 17 | 18 | Returns: 19 | float: The calculated AUC score, representing the probability that a randomly chosen positive instance is ranked 20 | higher than a randomly chosen negative instance based on the prediction scores. 21 | 22 | Raises: 23 | ValueError: If `y_true` and `y_pred` do not have the same length or if they contain invalid data types. 
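    Formula (as implemented below, with P the positive and N the negative instances and s the predicted scores):
        AUC = ( sum_{p∈P} sum_{n∈N} 1[ s_p > s_n ] ) / ( |P| * |N| )

    Tied scores (s_p == s_n) contribute 0, which is why the result can differ from sklearn's
    `roc_auc_score` when a positive and a negative instance share the same score.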
24 | 25 | Examples: 26 | >>> y_true = np.array([1, 1, 0, 0, 1, 0, 0, 0]) 27 | >>> y_pred = np.array([0.9999, 0.9838, 0.5747, 0.8485, 0.8624, 0.4502, 0.3357, 0.8985]) 28 | >>> auc_score_custom(y_true, y_pred) 29 | 0.9333333333333333 30 | >>> from sklearn.metrics import roc_auc_score 31 | >>> roc_auc_score(y_true, y_pred) 32 | 0.9333333333333333 33 | 34 | An error will occur when pos/neg prediction have same score: 35 | >>> y_true = np.array([1, 1, 0, 0, 1, 0, 0, 0]) 36 | >>> y_pred = np.array([0.9999, 0.8, 0.8, 0.8485, 0.8624, 0.4502, 0.3357, 0.8985]) 37 | >>> auc_score_custom(y_true, y_pred) 38 | 0.7333 39 | >>> roc_auc_score(y_true, y_pred) 40 | 0.7667 41 | """ 42 | y_true = np.asarray(y_true) 43 | y_pred = np.asarray(y_pred) 44 | 45 | y_true_bool = y_true.astype(np.bool_) 46 | # Index: 47 | pos_scores = y_pred[y_true_bool] 48 | neg_scores = y_pred[np.logical_not(y_true_bool)] 49 | # Arrange: 50 | pos_scores = np.repeat(pos_scores, len(neg_scores)) 51 | neg_scores = np.tile(neg_scores, sum(y_true_bool)) 52 | assert len(neg_scores) == len(pos_scores) 53 | return (pos_scores > neg_scores).sum() / len(neg_scores) 54 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_ranking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def reciprocal_rank_score(y_true: np.ndarray, y_pred: np.ndarray) -> float: 5 | """Computes the Mean Reciprocal Rank (MRR) score. 6 | 7 | Args: 8 | y_true (np.ndarray): A 1D array of ground-truth labels. These should be binary (0 or 1), 9 | where 1 indicates the relevant item. 10 | y_pred (np.ndarray): A 1D array of predicted scores. These scores indicate the likelihood 11 | of items being relevant. 12 | 13 | Returns: 14 | float: The mean reciprocal rank (MRR) score. 15 | 16 | Note: 17 | Both `y_true` and `y_pred` should be 1D arrays of the same length. 18 | The function assumes higher scores in `y_pred` indicate higher relevance. 19 | 20 | Examples: 21 | >>> y_true_1 = np.array([0, 0, 1]) 22 | >>> y_pred_1 = np.array([0.5, 0.2, 0.1]) 23 | >>> reciprocal_rank_score(y_true_1, y_pred_1) 24 | 0.33 25 | 26 | >>> y_true_2 = np.array([0, 1, 1]) 27 | >>> y_pred_2 = np.array([0.5, 0.2, 0.1]) 28 | >>> reciprocal_rank_score(y_true_2, y_pred_2) 29 | 0.5 30 | 31 | >>> y_true_3 = np.array([1, 1, 0]) 32 | >>> y_pred_3 = np.array([0.5, 0.2, 0.1]) 33 | >>> reciprocal_rank_score(y_true_3, y_pred_3) 34 | 1.0 35 | 36 | >>> np.mean( 37 | [ 38 | reciprocal_rank_score(y_true, y_pred) 39 | for y_true, y_pred in zip( 40 | [y_true_1, y_true_2, y_true_3], [y_pred_1, y_pred_2, y_pred_3] 41 | ) 42 | ] 43 | ) 44 | 0.61 45 | mrr_score([y_true_1, y_true_2, y_true_3], [y_pred_1, y_pred_2, y_pred_3]) 46 | """ 47 | order = np.argsort(y_pred)[::-1] 48 | y_true = np.take(y_true, order) 49 | first_positive_rank = np.argmax(y_true) + 1 50 | return 1.0 / first_positive_rank 51 | 52 | 53 | def dcg_score(y_true: np.ndarray, y_pred: np.ndarray, k: int = 10) -> float: 54 | """ 55 | Compute the Discounted Cumulative Gain (DCG) score at a particular rank `k`. 56 | 57 | Args: 58 | y_true (np.ndarray): A 1D or 2D array of ground-truth relevance labels. 59 | Each element should be a non-negative integer. 60 | y_pred (np.ndarray): A 1D or 2D array of predicted scores. Each element is 61 | a score corresponding to the predicted relevance. 62 | k (int, optional): The rank at which the DCG score is calculated. Defaults 63 | to 10. 
If `k` is larger than the number of elements, it 64 | will be truncated to the number of elements. 65 | 66 | Note: 67 | In case of a 2D array, each row represents a different sample. 68 | 69 | Returns: 70 | float: The calculated DCG score for the top `k` elements. 71 | 72 | Raises: 73 | ValueError: If `y_true` and `y_pred` have different shapes. 74 | 75 | Examples: 76 | >>> from sklearn.metrics import dcg_score as dcg_score_sklearn 77 | >>> y_true = np.array([1, 0, 0, 1, 0]) 78 | >>> y_pred = np.array([0.5, 0.2, 0.1, 0.8, 0.4]) 79 | >>> dcg_score(y_true, y_pred) 80 | 1.6309297535714575 81 | >>> dcg_score_sklearn([y_true], [y_pred]) 82 | 1.6309297535714573 83 | """ 84 | k = min(np.shape(y_true)[-1], k) 85 | order = np.argsort(y_pred)[::-1] 86 | y_true = np.take(y_true, order[:k]) 87 | gains = 2**y_true - 1 88 | discounts = np.log2(np.arange(len(y_true)) + 2) 89 | return np.sum(gains / discounts) 90 | 91 | 92 | def ndcg_score(y_true: np.ndarray, y_pred: np.ndarray, k: int = 10) -> float: 93 | """ 94 | Compute the Normalized Discounted Cumulative Gain (NDCG) score at a rank `k`. 95 | 96 | Args: 97 | y_true (np.ndarray): A 1D or 2D array of ground-truth relevance labels. 98 | Each element should be a non-negative integer. In case 99 | of a 2D array, each row represents a different sample. 100 | y_pred (np.ndarray): A 1D or 2D array of predicted scores. Each element is 101 | a score corresponding to the predicted relevance. The 102 | array should have the same shape as `y_true`. 103 | k (int, optional): The rank at which the NDCG score is calculated. Defaults 104 | to 10. If `k` is larger than the number of elements, it 105 | will be truncated to the number of elements. 106 | 107 | Returns: 108 | float: The calculated NDCG score for the top `k` elements. The score ranges 109 | from 0 to 1, with 1 representing the perfect ranking. 110 | 111 | Examples: 112 | >>> from sklearn.metrics import ndcg_score as ndcg_score_sklearn 113 | >>> y_true = np.array([1, 0, 0, 1, 0]) 114 | >>> y_pred = np.array([0.1, 0.2, 0.1, 0.8, 0.4]) 115 | >>> ndcg_score([y_true], [y_pred]) 116 | 0.863780110436402 117 | >>> ndcg_score_sklearn([y_true], [y_pred]) 118 | 0.863780110436402 119 | >>> 120 | """ 121 | best = dcg_score(y_true, y_true, k) 122 | actual = dcg_score(y_true, y_pred, k) 123 | return actual / best 124 | 125 | 126 | def mrr_score(y_true: np.ndarray, y_pred: np.ndarray) -> float: 127 | """Computes the Mean Reciprocal Rank (MRR) score. 128 | 129 | THIS MIGHT NOT ALL PROPER, TO BE DETERMIEND: 130 | - https://github.com/recommenders-team/recommenders/issues/2141 131 | 132 | Args: 133 | y_true (np.ndarray): A 1D array of ground-truth labels. These should be binary (0 or 1), 134 | where 1 indicates the relevant item. 135 | y_pred (np.ndarray): A 1D array of predicted scores. These scores indicate the likelihood 136 | of items being relevant. 137 | 138 | Returns: 139 | float: The mean reciprocal rank (MRR) score. 140 | 141 | Note: 142 | Both `y_true` and `y_pred` should be 1D arrays of the same length. 143 | The function assumes higher scores in `y_pred` indicate higher relevance. 
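    Formula (as implemented below, with rank(i) the 1-based position of item i after sorting by `y_pred`):
        score = ( sum_{i: y_true_i = 1} 1 / rank(i) ) / ( sum_i y_true_i )

    i.e. the reciprocal ranks of all relevant items are averaged, not only that of the first
    relevant item as in the usual MRR definition (hence the note above).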
144 | 145 | Examples: 146 | >>> y_true = np.array([[1, 0, 0, 1, 0]]) 147 | >>> y_pred = np.array([[0.5, 0.2, 0.1, 0.8, 0.4]]) 148 | >>> mrr_score(y_true, y_pred) 149 | 0.75 150 | 151 | """ 152 | order = np.argsort(y_pred)[::-1] 153 | y_true = np.take(y_true, order) 154 | rr_score = y_true / (np.arange(len(y_true)) + 1) 155 | return np.sum(rr_score) / np.sum(y_true) 156 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics/_sklearn.py: -------------------------------------------------------------------------------- 1 | try: 2 | from sklearn.metrics import ( 3 | # _regression: 4 | mean_squared_error, 5 | # _ranking: 6 | roc_auc_score, 7 | # _classification: 8 | accuracy_score, 9 | f1_score, 10 | log_loss, 11 | ) 12 | except ImportError: 13 | print("sklearn not available") 14 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/metrics_protocols.py: -------------------------------------------------------------------------------- 1 | from itertools import compress 2 | from typing import Iterable 3 | from tqdm import tqdm 4 | import numpy as np 5 | import json 6 | 7 | from ebrec.evaluation.utils import convert_to_binary 8 | from ebrec.evaluation.protocols import Metric 9 | 10 | from ebrec.evaluation.metrics import ( 11 | mean_squared_error, 12 | accuracy_score, 13 | roc_auc_score, 14 | ndcg_score, 15 | mrr_score, 16 | log_loss, 17 | f1_score, 18 | ) 19 | 20 | 21 | class AccuracyScore(Metric): 22 | def __init__(self, threshold: float = 0.5): 23 | self.threshold = threshold 24 | self.name = "accuracy" 25 | 26 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 27 | res = np.mean( 28 | [ 29 | accuracy_score( 30 | each_labels, convert_to_binary(each_preds, self.threshold) 31 | ) 32 | for each_labels, each_preds in tqdm( 33 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 34 | ) 35 | ] 36 | ) 37 | return float(res) 38 | 39 | 40 | class F1Score(Metric): 41 | def __init__(self, threshold: float = 0.5): 42 | self.threshold = threshold 43 | self.name = "f1" 44 | 45 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 46 | res = np.mean( 47 | [ 48 | f1_score(each_labels, convert_to_binary(each_preds, self.threshold)) 49 | for each_labels, each_preds in tqdm( 50 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 51 | ) 52 | ] 53 | ) 54 | return float(res) 55 | 56 | 57 | class RootMeanSquaredError(Metric): 58 | def __init__(self): 59 | self.name = "rmse" 60 | 61 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 62 | res = np.mean( 63 | [ 64 | np.sqrt(mean_squared_error(each_labels, each_preds)) 65 | for each_labels, each_preds in tqdm( 66 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 67 | ) 68 | ] 69 | ) 70 | return float(res) 71 | 72 | 73 | class AucScore(Metric): 74 | def __init__(self): 75 | self.name = "auc" 76 | 77 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 78 | res = np.mean( 79 | [ 80 | roc_auc_score(each_labels, each_preds) 81 | for each_labels, each_preds in tqdm( 82 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 83 | ) 84 | ] 85 | ) 86 | return float(res) 87 | 88 | 89 | class LogLossScore(Metric): 90 | def __init__(self): 91 | self.name = "logloss" 92 | 93 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 94 | res = np.mean( 95 | [ 96 | log_loss( 97 | 
each_labels, 98 | [max(min(p, 1.0 - 10e-12), 10e-12) for p in each_preds], 99 | ) 100 | for each_labels, each_preds in tqdm( 101 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 102 | ) 103 | ] 104 | ) 105 | return float(res) 106 | 107 | 108 | class MrrScore(Metric): 109 | def __init__(self) -> Metric: 110 | self.name = "mrr" 111 | 112 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 113 | mean_mrr = np.mean( 114 | [ 115 | mrr_score(each_labels, each_preds) 116 | for each_labels, each_preds in tqdm( 117 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 118 | ) 119 | ] 120 | ) 121 | return float(mean_mrr) 122 | 123 | 124 | class NdcgScore(Metric): 125 | def __init__(self, k: int): 126 | self.k = k 127 | self.name = f"ndcg@{k}" 128 | 129 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float: 130 | res = np.mean( 131 | [ 132 | ndcg_score(each_labels, each_preds, self.k) 133 | for each_labels, each_preds in tqdm( 134 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC" 135 | ) 136 | ] 137 | ) 138 | return float(res) 139 | 140 | 141 | class MetricEvaluator: 142 | """ 143 | >>> y_true = [[1, 0, 0], [1, 1, 0], [1, 0, 0, 0]] 144 | >>> y_pred = [[0.2, 0.3, 0.5], [0.18, 0.7, 0.1], [0.18, 0.2, 0.1, 0.1]] 145 | 146 | >>> met_eval = MetricEvaluator( 147 | labels=y_true, 148 | predictions=y_pred, 149 | metric_functions=[ 150 | AucScore(), 151 | MrrScore(), 152 | NdcgScore(k=5), 153 | NdcgScore(k=10), 154 | LogLossScore(), 155 | RootMeanSquaredError(), 156 | AccuracyScore(threshold=0.5), 157 | F1Score(threshold=0.5), 158 | ], 159 | ) 160 | >>> met_eval.evaluate() 161 | { 162 | "auc": 0.5555555555555556, 163 | "mrr": 0.5277777777777778, 164 | "ndcg@5": 0.7103099178571526, 165 | "ndcg@10": 0.7103099178571526, 166 | "logloss": 0.716399020295845, 167 | "rmse": 0.5022870658128165 168 | "accuracy": 0.5833333333333334, 169 | "f1": 0.2222222222222222 170 | } 171 | """ 172 | 173 | def __init__( 174 | self, 175 | labels: list[np.ndarray], 176 | predictions: list[np.ndarray], 177 | metric_functions: list[Metric], 178 | ): 179 | self.labels = labels 180 | self.predictions = predictions 181 | self.metric_functions = metric_functions 182 | self.evaluations = dict() 183 | 184 | def evaluate(self) -> dict: 185 | self.evaluations = { 186 | metric_function.name: metric_function(self.labels, self.predictions) 187 | for metric_function in self.metric_functions 188 | } 189 | return self 190 | 191 | @property 192 | def metric_functions(self): 193 | return self.__metric_functions 194 | 195 | @metric_functions.setter 196 | def metric_functions(self, values): 197 | invalid_callables = self.__invalid_callables(values) 198 | if not any(invalid_callables) and invalid_callables: 199 | self.__metric_functions = values 200 | else: 201 | invalid_objects = list(compress(values, invalid_callables)) 202 | invalid_types = [type(item) for item in invalid_objects] 203 | raise TypeError(f"Following object(s) are not callable: {invalid_types}") 204 | 205 | @staticmethod 206 | def __invalid_callables(iter: Iterable): 207 | return [not callable(item) for item in iter] 208 | 209 | def __str__(self): 210 | if self.evaluations: 211 | evaluations_json = json.dumps(self.evaluations, indent=4) 212 | return f": \n {evaluations_json}" 213 | else: 214 | return f": {self.evaluations}" 215 | 216 | def __repr__(self): 217 | return str(self) 218 | -------------------------------------------------------------------------------- 
/src/ebrec/evaluation/protocols.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | import numpy as np 3 | 4 | 5 | class Metric(Protocol): 6 | name: str 7 | 8 | def calculate(self, y_true: np.ndarray, y_score: np.ndarray) -> float: ... 9 | 10 | def __str__(self) -> str: 11 | return f": params: {self.__dict__}" 12 | 13 | def __repr__(self) -> str: 14 | return str(self) 15 | 16 | def __call__(self, y_true: np.ndarray, y_score: np.ndarray) -> float: 17 | return self.calculate(y_true, y_score) 18 | -------------------------------------------------------------------------------- /src/ebrec/evaluation/utils.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from typing import Iterable 3 | import numpy as np 4 | 5 | 6 | def convert_to_binary(y_pred: np.ndarray, threshold: float): 7 | y_pred = np.asarray(y_pred) 8 | y_pred[y_pred >= threshold] = 1 9 | y_pred[y_pred < threshold] = 0 10 | return y_pred 11 | 12 | 13 | def is_iterable_nested_dtype(iterable: Iterable[any], dtypes) -> bool: 14 | """ 15 | Check whether iterable is a nested with dtype, 16 | note, we assume all types in iterable are the the same. 17 | Check all cases: any(isinstance(i, dtypes) for i in a) 18 | 19 | Args: 20 | iterable (Iterable[Any]): iterable (list, array, tuple) of any type of data 21 | dtypes (Tuple): tuple of possible dtypes, e.g. dtypes = (list, np.ndarray) 22 | Returns: 23 | bool: boolean whether it is true or false 24 | 25 | Examples: 26 | >>> is_iterable_nested_dtype([1, 2, 3], list) 27 | False 28 | >>> is_iterable_nested_dtype([1, 2, 3], (list, int)) 29 | True 30 | >>> is_iterable_nested_dtype([[1], [2], [3]], list) 31 | True 32 | """ 33 | return isinstance(iterable[0], dtypes) 34 | 35 | 36 | def compute_combinations(n: int, r: int) -> int: 37 | """Compute Combinations where order does not matter (without replacement) 38 | 39 | Source: https://www.statskingdom.com/combinations-calculator.html 40 | Args: 41 | n (int): number of items 42 | r (int): number of items being chosen at a time 43 | Returns: 44 | int: number of possible combinations 45 | 46 | Formula: 47 | * nCr = n! / ( (n - r)! * r! 
) 48 | 49 | Assume the following: 50 | * we sample without replacement of items 51 | * order of the outcomes does NOT matter 52 | """ 53 | return int( 54 | (np.math.factorial(n)) / (np.math.factorial(n - r) * np.math.factorial(r)) 55 | ) 56 | 57 | 58 | def scale_range( 59 | m: np.ndarray, 60 | r_min: float = None, 61 | r_max: float = None, 62 | t_min: float = 0, 63 | t_max: float = 1.0, 64 | ) -> None: 65 | """Scale an array between a range 66 | Source: https://stats.stackexchange.com/questions/281162/scale-a-number-between-a-range 67 | 68 | m -> ((m-r_min)/(r_max-r_min)) * (t_max-t_min) + t_min 69 | 70 | Args: 71 | m ∈ [r_min,r_max] denote your measurements to be scaled 72 | r_min denote the minimum of the range of your measurement 73 | r_max denote the maximum of the range of your measurement 74 | t_min denote the minimum of the range of your desired target scaling 75 | t_max denote the maximum of the range of your desired target scaling 76 | """ 77 | if not r_min: 78 | r_min = np.min(m) 79 | if not r_max: 80 | r_max = np.max(m) 81 | return ((m - r_min) / (r_max - r_min)) * (t_max - t_min) + t_min 82 | 83 | 84 | # utils for 85 | def compute_item_popularity_scores(R: Iterable[np.ndarray]) -> dict[str, float]: 86 | """Compute popularity scores for items based on their occurrence in user interactions. 87 | 88 | This function calculates the popularity score of each item as the fraction of users who have interacted with that item. 89 | The popularity score, p_i, for an item is defined as the number of users who have interacted with the item divided by the 90 | total number of users. 91 | 92 | Formula: 93 | p_i = | {u ∈ U}, r_ui != Ø | / |U| 94 | 95 | where p_i is the popularity score of an item, U is the total number of users, and r_ui is the interaction of user u with item i (non-zero 96 | interaction implies the user has seen the item). 97 | 98 | Note: 99 | Each entry can only have the same item ones. TODO - ADD THE TEXT DONE HERE. 100 | 101 | Args: 102 | R (Iterable[np.ndarray]): An iterable of numpy arrays, where each array represents the items interacted with by a single user. 103 | Each element in the array should be a string identifier for an item. 104 | 105 | Returns: 106 | dict[str, float]: A dictionary where keys are item identifiers and values are their corresponding popularity scores (as floats). 107 | 108 | Examples: 109 | >>> R = [ 110 | np.array(["item1", "item2", "item3"]), 111 | np.array(["item1", "item3"]), 112 | np.array(["item1", "item4"]), 113 | ] 114 | >>> print(popularity_scores(R)) 115 | {'item1': 1.0, 'item2': 0.3333333333333333, 'item3': 0.6666666666666666, 'item4': 0.3333333333333333} 116 | """ 117 | U = len(R) 118 | R_flatten = np.concatenate(R) 119 | item_counts = Counter(R_flatten) 120 | return {item: (r_ui / U) for item, r_ui in item_counts.items()} 121 | 122 | 123 | def compute_normalized_distribution( 124 | R: np.ndarray[str], 125 | weights: np.ndarray[float] = None, 126 | distribution: dict[str, float] = None, 127 | ) -> dict[str, float]: 128 | """ 129 | Compute a normalized weigted distribution for a list of items that each can have a single representation assigned. 130 | 131 | Args: 132 | a (np.ndarray[str]): an array of items representation. 133 | weights (np.ndarray[float], optional): weights to assign each element in a. Defaults to None. 134 | * Following yields: len(weights) == len(a) 135 | distribution (Dict[str, float], optional): dictionary to assign the distribution values, if None it will be generated as {}. Defaults to None. 
136 | * Use case; if you want to add distribution values to existing, one can input it. 137 | 138 | Returns: 139 | Dict[str, float]: dictionary with normalized distribution values 140 | 141 | Examples: 142 | >>> a = np.array(["a", "b", "c", "c"]) 143 | >>> compute_normalized_distribution(a) 144 | {'a': 0.25, 'b': 0.25, 'c': 0.5} 145 | """ 146 | n_elements = len(R) 147 | 148 | distr = distribution if distribution is not None else {} 149 | weights = weights if weights is not None else np.ones(n_elements) / n_elements 150 | for item, weight in zip(R, weights): 151 | distr[item] = weight + distr.get(item, 0.0) 152 | return distr 153 | 154 | 155 | def get_keys_in_dict(id_list: any, dictionary: dict) -> list[any]: 156 | """ 157 | Returns a list of IDs from id_list that are keys in the dictionary. 158 | Args: 159 | id_list (List[Any]): List of IDs to check against the dictionary. 160 | dictionary (Dict[Any, Any]): Dictionary where keys are checked against the IDs. 161 | 162 | Returns: 163 | List[Any]: List of IDs that are also keys in the dictionary. 164 | 165 | Examples: 166 | >>> get_keys_in_dict(['a', 'b', 'c'], {'a': 1, 'c': 3, 'd': 4}) 167 | ['a', 'c'] 168 | """ 169 | return [id_ for id_ in id_list if id_ in dictionary] 170 | 171 | 172 | def check_key_in_all_nested_dicts(dictionary: dict, key: str) -> None: 173 | """ 174 | Checks if the given key is present in all nested dictionaries within the main dictionary. 175 | Raises a ValueError if the key is not found in any of the nested dictionaries. 176 | 177 | Args: 178 | dictionary (dict): The dictionary containing nested dictionaries to check. 179 | key (str): The key to look for in all nested dictionaries. 180 | 181 | Raises: 182 | ValueError: If the key is not present in any of the nested dictionaries. 183 | 184 | Example: 185 | >>> nested_dict = { 186 | "101": {"name": "Alice", "age": 30}, 187 | "102": {"name": "Bob", "age": 25}, 188 | } 189 | >>> check_key_in_all_nested_dicts(nested_dict, "age") 190 | # No error is raised 191 | >>> check_key_in_all_nested_dicts(nested_dict, "salary") 192 | # Raises ValueError: 'salary is not present in all nested dictionaries.' 193 | """ 194 | for dict_key, sub_dict in dictionary.items(): 195 | if not isinstance(sub_dict, dict) or key not in sub_dict: 196 | raise ValueError( 197 | f"'{key}' is not present in '{dict_key}' nested dictionary." 
198 | ) 199 | -------------------------------------------------------------------------------- /src/ebrec/models/fastformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/ebrec/models/fastformer/__init__.py -------------------------------------------------------------------------------- /src/ebrec/models/fastformer/dataloader.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from tqdm import tqdm 3 | import polars as pl 4 | import numpy as np 5 | 6 | from torch.utils.tensorboard import SummaryWriter 7 | from torch.utils.data import DataLoader 8 | from torch.utils.data import Dataset 9 | import torch.optim as optim 10 | import torch.nn as nn 11 | import torch 12 | 13 | from ebrec.utils._constants import DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_LABELS_COL 14 | 15 | from ebrec.utils._python import ( 16 | repeat_by_list_values_from_matrix, 17 | convert_to_nested_list, 18 | create_lookup_objects, 19 | ) 20 | from ebrec.utils._articles_behaviors import map_list_article_id_to_value 21 | from ebrec.utils._polars import shuffle_rows 22 | 23 | from ebrec.evaluation import AucScore 24 | from ebrec.utils._torch import save_checkpoint 25 | 26 | 27 | @dataclass 28 | class FastformerDataset(Dataset): 29 | """_summary_ 30 | The batch-size is aggragating multiple impressions and processing them simultaneous, which 31 | has a major effect on the training time. Hence, you should put the batch_size=1 in the 'DataLoader' 32 | and just use FastformerDataset batch_size. 33 | 34 | Note, the outut is then (1, output_shape), where the 1 is the DataLoader batch_size. 35 | """ 36 | 37 | behaviors: pl.DataFrame 38 | history_column: str 39 | article_dict: dict[int, pl.Series] 40 | batch_size: int = 64 41 | shuffle: bool = True 42 | device: str = "cpu" 43 | seed: int = None 44 | labels_col: str = DEFAULT_LABELS_COL 45 | inview_col: str = DEFAULT_INVIEW_ARTICLES_COL 46 | n_samples_col: str = "n_samples" 47 | 48 | def __post_init__(self): 49 | self.unknown_index = [0] 50 | if self.shuffle: 51 | self.behaviors = shuffle_rows(self.behaviors, seed=self.seed) 52 | self.behaviors = self.behaviors.with_columns( 53 | pl.col(self.labels_col).list.len().alias(self.n_samples_col) 54 | ) 55 | self.lookup_indexes, self.lookup_matrix = create_lookup_objects( 56 | self.article_dict, unknown_representation="zeros" 57 | ) 58 | 59 | def __len__(self): 60 | """ 61 | Number of batch steps in the data 62 | """ 63 | return int(np.ceil(self.behaviors.shape[0] / self.batch_size)) 64 | 65 | def __getitem__(self, index: int): 66 | """ 67 | Get the batch of samples for the given index. 68 | 69 | Note: The dataset class provides a single index for each iteration. The batching is done internally in this method 70 | to utilize and optimize for speed. This can be seen as a mini-batching approach. 71 | 72 | Args: 73 | index (int): An integer index. 74 | 75 | Returns: 76 | Tuple[torch.Tensor, torch.Tensor]: A tuple containing the input features and labels as torch Tensors. 77 | Note, the output of the PyTorch DataLoader is (1, *shape), where 1 is the DataLoader's batch_size. 
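        Example (illustrative sketch; `behaviors_df` and `article_dict` are assumed to be prepared
        with the same columns and article-embedding mapping used elsewhere in this repository):
            >>> from torch.utils.data import DataLoader
            >>> from ebrec.utils._constants import DEFAULT_HISTORY_ARTICLE_ID_COL
            >>> dataset = FastformerDataset(
            ...     behaviors=behaviors_df,
            ...     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
            ...     article_dict=article_dict,
            ...     batch_size=32,
            ... )
            >>> dataloader = DataLoader(dataset, batch_size=1)  # batching happens inside the dataset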
78 | """ 79 | # Clever way to batch the data: 80 | batch_indices = range(index * self.batch_size, (index + 1) * self.batch_size) 81 | batch = self.behaviors[batch_indices] 82 | if self.shuffle: 83 | batch = shuffle_rows(batch, seed=self.seed) 84 | # => 85 | x = ( 86 | batch.drop(self.labels_col) 87 | .pipe( 88 | map_list_article_id_to_value, 89 | behaviors_column=self.history_column, 90 | mapping=self.lookup_indexes, 91 | fill_nulls=self.unknown_index, 92 | ) 93 | .pipe( 94 | map_list_article_id_to_value, 95 | behaviors_column=self.inview_col, 96 | mapping=self.lookup_indexes, 97 | fill_nulls=self.unknown_index, 98 | ) 99 | ) 100 | # => 101 | repeats = np.array(batch[self.n_samples_col]) 102 | # => 103 | history_input = repeat_by_list_values_from_matrix( 104 | input_array=x[self.history_column].to_list(), 105 | matrix=self.lookup_matrix, 106 | repeats=repeats, 107 | ).squeeze(2) 108 | # => 109 | candidate_input = self.lookup_matrix[x[self.inview_col].explode().to_list()] 110 | # => 111 | history_input = torch.Tensor(history_input).type(torch.int).to(self.device) 112 | candidate_input = torch.Tensor(candidate_input).type(torch.int).to(self.device) 113 | y = ( 114 | torch.Tensor(batch[self.labels_col].explode()) 115 | .view(-1, 1) 116 | .type(torch.float) 117 | .to(self.device) 118 | ) 119 | # ======================== 120 | return (history_input, candidate_input), y 121 | 122 | 123 | def batch_input_label_concatenation( 124 | inputs: tuple[torch.Tensor], labels: torch.Tensor 125 | ) -> tuple[torch.Tensor, torch.Tensor]: 126 | """ """ 127 | return (inputs[0].squeeze(0), inputs[1].squeeze(0)), labels.squeeze(0) 128 | 129 | 130 | def compute_auc_from_fixed_pos_neg_samples( 131 | y_true: list[float], y_pred: list[float] 132 | ) -> float: 133 | # 134 | n_samples = int(np.sum(y_true)) 135 | y_true = convert_to_nested_list(y_true, n_samples) 136 | y_pred = convert_to_nested_list(y_pred, n_samples) 137 | val_auc = AucScore().calculate(y_true=y_true, y_pred=y_pred) 138 | return val_auc 139 | 140 | 141 | def train( 142 | model: nn.Module, 143 | train_dataloader: DataLoader, 144 | criterion: nn.Module, 145 | optimizer: optim.Optimizer, 146 | num_epochs: int = 5, 147 | val_dataloader: DataLoader = None, 148 | state_dict_path: str = "model_state_dict.pt", 149 | patience: int = None, 150 | summary_writer: SummaryWriter = None, 151 | gradient_accumulation_steps: int = 1, 152 | tqdm_disable: bool = False, 153 | tqdm_ncol: int = 80, 154 | monitor_metric: str = "loss", 155 | ) -> nn.Module: 156 | """ """ 157 | min_val_loss = np.inf 158 | max_val_auc = -np.inf 159 | early_stop = 0 160 | global_steps = 0 161 | total_batches = len(train_dataloader) 162 | running_loss = 0.0 163 | running_samples = 0 164 | # ==> TRAIN LOOP: 165 | for epoch in range(num_epochs): 166 | # => Set the model to train mode 167 | model.train(True) 168 | progress_bar = tqdm( 169 | train_dataloader, 170 | desc=f"Epoch [{epoch + 1}/{num_epochs}]", 171 | disable=tqdm_disable, 172 | ncols=tqdm_ncol, 173 | ) 174 | # => Zero the parameter gradients 175 | optimizer.zero_grad() 176 | for batch_idx, (inputs, labels) in enumerate(progress_bar, start=1): 177 | # => Move inputs and labels to device 178 | inputs, labels = batch_input_label_concatenation(inputs, labels) 179 | # => Forward pass 180 | outputs = model(*inputs) 181 | loss = criterion(outputs, labels) 182 | # => Backward pass and optimization 183 | loss.backward() 184 | # => Update training loss 185 | global_steps += 1 186 | running_loss += loss.item() * len(outputs) 187 | running_samples += 
len(outputs) 188 | current_loss = running_loss / running_samples 189 | progress_bar.set_postfix({"Loss": round(current_loss, 6)}) 190 | # => 191 | if summary_writer is not None: 192 | summary_writer.add_scalar( 193 | tag="Train/Loss", 194 | scalar_value=current_loss, 195 | global_step=global_steps, 196 | ) 197 | # => Accumulated gradient step: 198 | if ( 199 | batch_idx % gradient_accumulation_steps == 0 200 | or batch_idx == total_batches 201 | ): 202 | # => Take step and zero gradients 203 | optimizer.step() 204 | optimizer.zero_grad() 205 | 206 | # ==> EVAL LOOP: 207 | if val_dataloader: 208 | model.train(False) 209 | all_outputs, all_labels, val_loss = evaluate( 210 | model=model, 211 | dataloader=val_dataloader, 212 | criterion=criterion, 213 | tqdm_disable=tqdm_disable, 214 | ) 215 | 216 | if summary_writer is not None: 217 | summary_writer.add_scalar( 218 | tag="Val/Loss", scalar_value=val_loss, global_step=global_steps 219 | ) 220 | 221 | if monitor_metric == "auc": 222 | val_auc = compute_auc_from_fixed_pos_neg_samples( 223 | y_true=np.ravel(all_labels.tolist()), 224 | y_pred=np.ravel(all_outputs.tolist()), 225 | ) 226 | print(f"Val/AUC : {round(val_auc, 6)}") 227 | if summary_writer is not None: 228 | summary_writer.add_scalar( 229 | tag="Val/AUC", scalar_value=val_auc, global_step=global_steps 230 | ) 231 | 232 | # => MODEL CHECKPOINT 233 | if monitor_metric == "loss" and val_loss < min_val_loss: 234 | save_checkpoint(model, path=state_dict_path) 235 | min_val_loss = val_loss 236 | early_stop = 0 237 | elif monitor_metric == "auc" and val_auc > max_val_auc: 238 | save_checkpoint(model, path=state_dict_path) 239 | max_val_auc = val_auc 240 | early_stop = 0 241 | else: 242 | early_stop += 1 243 | # => EARLYSTOP 244 | if patience is not None and early_stop == patience: 245 | break 246 | 247 | if summary_writer is not None: 248 | summary_writer.close() 249 | 250 | if val_dataloader: 251 | model.load_state_dict(torch.load(state_dict_path), strict=True) 252 | 253 | return model 254 | 255 | 256 | def evaluate( 257 | model: nn.Module, 258 | dataloader: DataLoader, 259 | criterion: nn.Module, 260 | tqdm_disable: bool = False, 261 | tqdm_ncol: int = 80, 262 | device: str = "cpu", 263 | ) -> tuple[list[float], list[float], float]: 264 | model.eval() 265 | all_outputs = [] 266 | all_labels = [] 267 | loss = 0.0 268 | n_samples = 0 269 | with torch.no_grad(): 270 | progress_bar = tqdm( 271 | dataloader, 272 | desc="Evaluating", 273 | total=dataloader.__len__(), 274 | disable=tqdm_disable, 275 | ncols=tqdm_ncol, 276 | ) 277 | for inputs, labels in progress_bar: 278 | inputs, labels = batch_input_label_concatenation(inputs, labels) 279 | # Forward pass 280 | outputs = model(*inputs) 281 | batch_loss = criterion(outputs, labels) 282 | # => 283 | loss += batch_loss.item() * len(outputs) 284 | n_samples += len(outputs) 285 | # => 286 | all_outputs.append(outputs) 287 | all_labels.append(labels) 288 | # 289 | progress_bar.set_postfix({"Eval Loss": round(loss / n_samples, 4)}) 290 | # => 291 | all_outputs = torch.cat(all_outputs, dim=0) 292 | all_labels = torch.cat(all_labels, dim=0) 293 | loss = loss / n_samples 294 | return all_outputs, all_labels, loss 295 | -------------------------------------------------------------------------------- /src/ebrec/models/fastformer/fastformer_wu.py: -------------------------------------------------------------------------------- 1 | from transformers.models.bert.modeling_bert import ( 2 | BertSelfOutput, 3 | BertIntermediate, 4 | BertOutput, 5 | ) 6 | import 
logging 7 | import torch.nn as nn 8 | import torch 9 | 10 | 11 | class AttentionPooling(nn.Module): 12 | def __init__(self, config): 13 | self.config = config 14 | super(AttentionPooling, self).__init__() 15 | self.att_fc1 = nn.Linear(config.hidden_size, config.hidden_size) 16 | self.att_fc2 = nn.Linear(config.hidden_size, 1) 17 | self.apply(self.init_weights) 18 | 19 | def init_weights(self, module): 20 | if isinstance(module, nn.Linear): 21 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 22 | if isinstance(module, nn.Linear) and module.bias is not None: 23 | module.bias.data.zero_() 24 | 25 | def forward(self, x, attn_mask=None): 26 | bz = x.shape[0] 27 | e = self.att_fc1(x) 28 | e = nn.Tanh()(e) 29 | alpha = self.att_fc2(e) 30 | alpha = torch.exp(alpha) 31 | if attn_mask is not None: 32 | alpha = alpha * attn_mask.unsqueeze(2) 33 | alpha = alpha / (torch.sum(alpha, dim=1, keepdim=True) + 1e-8) 34 | x = torch.bmm(x.permute(0, 2, 1), alpha) 35 | x = torch.reshape(x, (bz, -1)) 36 | return x 37 | 38 | 39 | class FastSelfAttention(nn.Module): 40 | def __init__(self, config): 41 | super(FastSelfAttention, self).__init__() 42 | self.config = config 43 | if config.hidden_size % config.num_attention_heads != 0: 44 | raise ValueError( 45 | "The hidden size (%d) is not a multiple of the number of attention " 46 | "heads (%d)" % (config.hidden_size, config.num_attention_heads) 47 | ) 48 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 49 | self.num_attention_heads = config.num_attention_heads 50 | self.all_head_size = self.num_attention_heads * self.attention_head_size 51 | self.input_dim = config.hidden_size 52 | 53 | self.query = nn.Linear(self.input_dim, self.all_head_size) 54 | self.query_att = nn.Linear(self.all_head_size, self.num_attention_heads) 55 | self.key = nn.Linear(self.input_dim, self.all_head_size) 56 | self.key_att = nn.Linear(self.all_head_size, self.num_attention_heads) 57 | self.transform = nn.Linear(self.all_head_size, self.all_head_size) 58 | 59 | self.softmax = nn.Softmax(dim=-1) 60 | 61 | self.apply(self.init_weights) 62 | 63 | def init_weights(self, module): 64 | if isinstance(module, nn.Linear): 65 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 66 | if isinstance(module, nn.Linear) and module.bias is not None: 67 | module.bias.data.zero_() 68 | 69 | def transpose_for_scores(self, x): 70 | new_x_shape = x.size()[:-1] + ( 71 | self.num_attention_heads, 72 | self.attention_head_size, 73 | ) 74 | x = x.view(*new_x_shape) 75 | return x.permute(0, 2, 1, 3) 76 | 77 | def forward(self, hidden_states, attention_mask): 78 | # batch_size, seq_len, num_head * head_dim, batch_size, seq_len 79 | batch_size, seq_len, _ = hidden_states.shape 80 | mixed_query_layer = self.query(hidden_states) 81 | mixed_key_layer = self.key(hidden_states) 82 | # batch_size, num_head, seq_len 83 | query_for_score = ( 84 | self.query_att(mixed_query_layer).transpose(1, 2) 85 | / self.attention_head_size**0.5 86 | ) 87 | # add attention mask 88 | query_for_score += attention_mask 89 | 90 | # batch_size, num_head, 1, seq_len 91 | query_weight = self.softmax(query_for_score).unsqueeze(2) 92 | 93 | # batch_size, num_head, seq_len, head_dim 94 | query_layer = self.transpose_for_scores(mixed_query_layer) 95 | 96 | # batch_size, num_head, head_dim, 1 97 | pooled_query = ( 98 | torch.matmul(query_weight, query_layer) 99 | .transpose(1, 2) 100 | .view(-1, 1, self.num_attention_heads * self.attention_head_size) 101 | ) 102 | 
pooled_query_repeat = pooled_query.repeat(1, seq_len, 1) 103 | # batch_size, num_head, seq_len, head_dim 104 | 105 | # batch_size, num_head, seq_len 106 | mixed_query_key_layer = mixed_key_layer * pooled_query_repeat 107 | 108 | query_key_score = ( 109 | self.key_att(mixed_query_key_layer) / self.attention_head_size**0.5 110 | ).transpose(1, 2) 111 | 112 | # add attention mask 113 | query_key_score += attention_mask 114 | 115 | # batch_size, num_head, 1, seq_len 116 | query_key_weight = self.softmax(query_key_score).unsqueeze(2) 117 | 118 | key_layer = self.transpose_for_scores(mixed_query_key_layer) 119 | pooled_key = torch.matmul(query_key_weight, key_layer) 120 | 121 | # query = value 122 | weighted_value = (pooled_key * query_layer).transpose(1, 2) 123 | weighted_value = weighted_value.reshape( 124 | weighted_value.size()[:-2] 125 | + (self.num_attention_heads * self.attention_head_size,) 126 | ) 127 | weighted_value = self.transform(weighted_value) + mixed_query_layer 128 | 129 | return weighted_value 130 | 131 | 132 | class FastAttention(nn.Module): 133 | def __init__(self, config): 134 | super(FastAttention, self).__init__() 135 | self.self = FastSelfAttention(config) 136 | self.output = BertSelfOutput(config) 137 | 138 | def forward(self, input_tensor, attention_mask): 139 | self_output = self.self(input_tensor, attention_mask) 140 | attention_output = self.output(self_output, input_tensor) 141 | return attention_output 142 | 143 | 144 | class FastformerLayer(nn.Module): 145 | def __init__(self, config): 146 | super(FastformerLayer, self).__init__() 147 | self.attention = FastAttention(config) 148 | self.intermediate = BertIntermediate(config) 149 | self.output = BertOutput(config) 150 | 151 | def forward(self, hidden_states, attention_mask): 152 | attention_output = self.attention(hidden_states, attention_mask) 153 | intermediate_output = self.intermediate(attention_output) 154 | layer_output = self.output(intermediate_output, attention_output) 155 | return layer_output 156 | 157 | 158 | class StandardFastformerEncoder(nn.Module): 159 | def __init__(self, config, pooler_count=1): 160 | super(StandardFastformerEncoder, self).__init__() 161 | self.config = config 162 | self.encoders = nn.ModuleList( 163 | [FastformerLayer(config) for _ in range(config.num_hidden_layers)] 164 | ) 165 | self.position_embeddings = nn.Embedding( 166 | config.max_position_embeddings, config.hidden_size 167 | ) 168 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 169 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 170 | 171 | # support multiple different poolers with shared bert encoder. 
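        # (e.g. one pooler can aggregate token embeddings into a news vector while another
        # aggregates news vectors into a user vector, both reusing the same encoder layers)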
172 | self.poolers = nn.ModuleList() 173 | if config.pooler_type == "weightpooler": 174 | for _ in range(pooler_count): 175 | self.poolers.append(AttentionPooling(config)) 176 | logging.info(f"This model has {len(self.poolers)} poolers.") 177 | self.apply(self.init_weights) 178 | 179 | def init_weights(self, module): 180 | if isinstance(module, (nn.Linear, nn.Embedding)): 181 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 182 | if isinstance(module, (nn.Embedding)) and module.padding_idx is not None: 183 | with torch.no_grad(): 184 | module.weight[module.padding_idx].fill_(0) 185 | elif isinstance(module, nn.LayerNorm): 186 | module.bias.data.zero_() 187 | module.weight.data.fill_(1.0) 188 | if isinstance(module, nn.Linear) and module.bias is not None: 189 | module.bias.data.zero_() 190 | 191 | def forward(self, input_embs, attention_mask, pooler_index=0) -> torch.Tensor: 192 | """ 193 | Forward pass through the encoder. 194 | 195 | Parameters: 196 | input_embs (torch.Tensor): The input embeddings, with shape (batch_size, n_tokens, emb_dim). 197 | attention_mask (torch.Tensor): The attention mask, with shape (batch_size, n_tokens), where 198 | values of 1 indicate positions to attend to and 0s indicate positions to mask. 199 | pooler_index (int, optional): Index of the pooler to use to aggregate the encoder's output. Default is 0. 200 | 201 | Returns: 202 | torch.Tensor: The output of the encoder, processed and pooled according to the specified pooler. 203 | with shape (batch_size, config.hidden_size). 204 | 205 | Usage: 206 | >>> encoder_output = model.forward(input_embs, attention_mask, pooler_index=0) 207 | """ 208 | extended_attention_mask = attention_mask.unsqueeze(1) 209 | extended_attention_mask = extended_attention_mask.to( 210 | dtype=next(self.parameters()).dtype 211 | ) # fp16 compatibility 212 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 213 | 214 | batch_size, n_tokens, emb_dim = input_embs.shape 215 | 216 | position_ids = torch.arange( 217 | n_tokens, dtype=torch.long, device=input_embs.device 218 | ) 219 | position_ids = position_ids.unsqueeze(0).expand(batch_size, -1) 220 | position_embeddings = self.position_embeddings(position_ids) 221 | 222 | embeddings = input_embs + position_embeddings 223 | embeddings = self.LayerNorm(embeddings) 224 | embeddings = self.dropout(embeddings) 225 | 226 | all_hidden_states = [embeddings] 227 | 228 | for layer_module in self.encoders: 229 | layer_outputs = layer_module(all_hidden_states[-1], extended_attention_mask) 230 | all_hidden_states.append(layer_outputs) 231 | 232 | output = self.poolers[pooler_index](all_hidden_states[-1], attention_mask) 233 | 234 | return output 235 | 236 | 237 | class Fastformer_wu(torch.nn.Module): 238 | def __init__( 239 | self, 240 | config, 241 | word_embedding: nn.Embedding, 242 | ): 243 | super(Fastformer_wu, self).__init__() 244 | self.config = config 245 | self.word_embedding = word_embedding 246 | self.embedding_transform = nn.Linear( 247 | word_embedding.weight.shape[1], config.hidden_size 248 | ) 249 | # 4 classes; likely the npratio 250 | self.output_layer = nn.Linear(config.hidden_size, 4) 251 | self.fastformer_model = StandardFastformerEncoder(config) 252 | self.criterion = nn.CrossEntropyLoss() 253 | self.apply(self.init_weights) 254 | 255 | def init_weights(self, module): 256 | if isinstance(module, (nn.Linear, nn.Embedding)): 257 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 258 | if isinstance(module, 
(nn.Embedding)) and module.padding_idx is not None: 259 | with torch.no_grad(): 260 | module.weight[module.padding_idx].fill_(0) 261 | if isinstance(module, nn.Linear) and module.bias is not None: 262 | module.bias.data.zero_() 263 | 264 | def forward(self, input_ids, targets): 265 | mask = input_ids.bool().float() 266 | embds = self.word_embedding(input_ids) 267 | embds = self.embedding_transform(embds) 268 | text_vec = self.fastformer_model(embds, mask) 269 | score = self.output_layer(text_vec) 270 | loss = self.criterion(score, targets) 271 | return loss, score 272 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/__init__.py: -------------------------------------------------------------------------------- 1 | from .npa import NPAModel 2 | from .lstur import LSTURModel 3 | from .nrms import NRMSModel 4 | from .naml import NAMLModel 5 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/base_model.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | from tensorflow import keras 3 | import tensorflow as tf 4 | import numpy as np 5 | import abc 6 | 7 | __all__ = ["BaseModel"] 8 | 9 | 10 | class BaseModel: 11 | """Basic class of models 12 | 13 | Attributes: 14 | hparams (object): A tf.contrib.training.HParams object, hold the entire set of hyperparameters. 15 | graph (object): An optional graph. 16 | seed (int): Random seed. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | hparams: Dict[str, Any], 22 | word2vec_embedding: np.ndarray = None, 23 | # if 'word2vec_embedding' not provided: 24 | word_emb_dim: int = 300, 25 | vocab_size: int = 32000, 26 | seed=None, 27 | ): 28 | """Initializing the model. Create common logics which are needed by all deeprec models, such as loss function, 29 | parameter set. 30 | 31 | Args: 32 | hparams (object): Hold the entire set of hyperparameters. 33 | seed (int): Random seed. 34 | """ 35 | self.seed = seed 36 | tf.random.set_seed(seed) 37 | np.random.seed(seed) 38 | 39 | # ASSIGN 'hparams': 40 | self.hparams = hparams 41 | 42 | # INIT THE WORD-EMBEDDINGS: 43 | if word2vec_embedding is None: 44 | self.word2vec_embedding = np.random.rand(vocab_size, word_emb_dim) 45 | else: 46 | self.word2vec_embedding = word2vec_embedding 47 | 48 | # BUILD AND COMPILE MODEL: 49 | self.model, self.scorer = self._build_graph() 50 | self.loss = self._get_loss(self.hparams.loss) 51 | self.train_optimizer = self._get_opt( 52 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate 53 | ) 54 | self.model.compile(loss=self.loss, optimizer=self.train_optimizer) 55 | 56 | @abc.abstractmethod 57 | def _build_graph(self): 58 | """Subclass will implement this.""" 59 | pass 60 | 61 | def _get_loss(self, loss: str): 62 | """Make loss function, consists of data loss and regularization loss 63 | 64 | Returns: 65 | object: Loss function or loss function name 66 | """ 67 | if loss == "cross_entropy_loss": 68 | data_loss = "categorical_crossentropy" 69 | elif loss == "log_loss": 70 | data_loss = "binary_crossentropy" 71 | else: 72 | raise ValueError(f"this loss not defined {loss}") 73 | return data_loss 74 | 75 | def _get_opt(self, optimizer: str, lr: float): 76 | """Get the optimizer according to configuration. Usually we will use Adam. 77 | Returns: 78 | object: An optimizer. 
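        Example (editor's sketch, not part of the original file; the learning rate is illustrative):
            >>> opt = self._get_opt(optimizer="adam", lr=1e-4)   # returns keras.optimizers.Adam(learning_rate=1e-4)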
79 | """ 80 | 81 | if optimizer == "adam": 82 | train_opt = keras.optimizers.Adam(learning_rate=lr) 83 | else: 84 | raise ValueError(f"this optimizer not defined {optimizer}") 85 | 86 | return train_opt 87 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras import layers 4 | from tensorflow.keras import backend as K 5 | 6 | 7 | class AttLayer2(layers.Layer): 8 | """Soft alignment attention implement. 9 | 10 | Attributes: 11 | dim (int): attention hidden dim 12 | """ 13 | 14 | def __init__(self, dim=200, seed=0, **kwargs): 15 | """Initialization steps for AttLayer2. 16 | 17 | Args: 18 | dim (int): attention hidden dim 19 | """ 20 | 21 | self.dim = dim 22 | self.seed = seed 23 | super(AttLayer2, self).__init__(**kwargs) 24 | 25 | def build(self, input_shape): 26 | """Initialization for variables in AttLayer2 27 | There are there variables in AttLayer2, i.e. W, b and q. 28 | 29 | Args: 30 | input_shape (object): shape of input tensor. 31 | """ 32 | 33 | assert len(input_shape) == 3 34 | dim = self.dim 35 | self.W = self.add_weight( 36 | name="W", 37 | shape=(int(input_shape[-1]), dim), 38 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 39 | trainable=True, 40 | ) 41 | self.b = self.add_weight( 42 | name="b", 43 | shape=(dim,), 44 | initializer=keras.initializers.Zeros(), 45 | trainable=True, 46 | ) 47 | self.q = self.add_weight( 48 | name="q", 49 | shape=(dim, 1), 50 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 51 | trainable=True, 52 | ) 53 | super(AttLayer2, self).build(input_shape) # be sure you call this somewhere! 54 | 55 | def call(self, inputs, mask=None, **kwargs): 56 | """Core implemention of soft attention 57 | 58 | Args: 59 | inputs (object): input tensor. 60 | 61 | Returns: 62 | object: weighted sum of input tensors. 63 | """ 64 | 65 | attention = K.tanh(K.dot(inputs, self.W) + self.b) 66 | attention = K.dot(attention, self.q) 67 | 68 | attention = K.squeeze(attention, axis=2) 69 | 70 | if mask == None: 71 | attention = K.exp(attention) 72 | else: 73 | attention = K.exp(attention) * K.cast(mask, dtype="float32") 74 | 75 | attention_weight = attention / ( 76 | K.sum(attention, axis=-1, keepdims=True) + K.epsilon() 77 | ) 78 | 79 | attention_weight = K.expand_dims(attention_weight) 80 | weighted_input = inputs * attention_weight 81 | return K.sum(weighted_input, axis=1) 82 | 83 | def compute_mask(self, input, input_mask=None): 84 | """Compte output mask value 85 | 86 | Args: 87 | input (object): input tensor. 88 | input_mask: input mask 89 | 90 | Returns: 91 | object: output mask. 92 | """ 93 | return None 94 | 95 | def compute_output_shape(self, input_shape): 96 | """Compute shape of output tensor 97 | 98 | Args: 99 | input_shape (tuple): shape of input tensor. 100 | 101 | Returns: 102 | tuple: shape of output tensor. 103 | """ 104 | return input_shape[0], input_shape[-1] 105 | 106 | 107 | class SelfAttention(layers.Layer): 108 | """Multi-head self attention implement. 109 | 110 | Args: 111 | multiheads (int): The number of heads. 112 | head_dim (object): Dimention of each head. 113 | mask_right (boolean): whether to mask right words. 114 | 115 | Returns: 116 | object: Weighted sum after attention. 
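    Example (editor's sketch, not from the repository; batch size, sequence length and embedding dim are made up):
        >>> import tensorflow as tf
        >>> attn = SelfAttention(multiheads=4, head_dim=8, seed=0)
        >>> seq = tf.random.normal((2, 10, 32))    # (batch_size, seq_len, emb_dim)
        >>> out = attn([seq, seq, seq])            # Q = K = V for self-attention
        >>> out.shape                              # TensorShape([2, 10, 32]), since multiheads * head_dim = 32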
117 | """ 118 | 119 | def __init__(self, multiheads, head_dim, seed=0, mask_right=False, **kwargs): 120 | """Initialization steps for AttLayer2. 121 | 122 | Args: 123 | multiheads (int): The number of heads. 124 | head_dim (object): Dimention of each head. 125 | mask_right (boolean): whether to mask right words. 126 | """ 127 | 128 | self.multiheads = multiheads 129 | self.head_dim = head_dim 130 | self.output_dim = multiheads * head_dim 131 | self.mask_right = mask_right 132 | self.seed = seed 133 | super(SelfAttention, self).__init__(**kwargs) 134 | 135 | def compute_output_shape(self, input_shape): 136 | """Compute shape of output tensor. 137 | 138 | Returns: 139 | tuple: output shape tuple. 140 | """ 141 | 142 | return (input_shape[0][0], input_shape[0][1], self.output_dim) 143 | 144 | def build(self, input_shape): 145 | """Initialization for variables in SelfAttention. 146 | There are three variables in SelfAttention, i.e. WQ, WK ans WV. 147 | WQ is used for linear transformation of query. 148 | WK is used for linear transformation of key. 149 | WV is used for linear transformation of value. 150 | 151 | Args: 152 | input_shape (object): shape of input tensor. 153 | """ 154 | 155 | self.WQ = self.add_weight( 156 | name="WQ", 157 | shape=(int(input_shape[0][-1]), self.output_dim), 158 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 159 | trainable=True, 160 | ) 161 | self.WK = self.add_weight( 162 | name="WK", 163 | shape=(int(input_shape[1][-1]), self.output_dim), 164 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 165 | trainable=True, 166 | ) 167 | self.WV = self.add_weight( 168 | name="WV", 169 | shape=(int(input_shape[2][-1]), self.output_dim), 170 | initializer=keras.initializers.glorot_uniform(seed=self.seed), 171 | trainable=True, 172 | ) 173 | super(SelfAttention, self).build(input_shape) 174 | 175 | def Mask(self, inputs, seq_len, mode="add"): 176 | """Mask operation used in multi-head self attention 177 | 178 | Args: 179 | seq_len (object): sequence length of inputs. 180 | mode (str): mode of mask. 181 | 182 | Returns: 183 | object: tensors after masking. 184 | """ 185 | 186 | if seq_len is None: 187 | return inputs 188 | else: 189 | mask = K.one_hot(indices=seq_len[:, 0], num_classes=K.shape(inputs)[1]) 190 | mask = 1 - K.cumsum(mask, axis=1) 191 | 192 | for _ in range(len(inputs.shape) - 2): 193 | mask = K.expand_dims(mask, 2) 194 | 195 | if mode == "mul": 196 | return inputs * mask 197 | elif mode == "add": 198 | return inputs - (1 - mask) * 1e12 199 | 200 | def call(self, QKVs): 201 | """Core logic of multi-head self attention. 202 | 203 | Args: 204 | QKVs (list): inputs of multi-head self attention i.e. qeury, key and value. 205 | 206 | Returns: 207 | object: ouput tensors. 
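        Input format sketch (editor's addition; variable names are illustrative):
            >>> out = layer([Q_seq, K_seq, V_seq])                  # no length masking
            >>> out = layer([Q_seq, K_seq, V_seq, Q_len, V_len])    # Q_len/V_len are (batch_size, 1) sequence lengths used for masking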
208 | """ 209 | if len(QKVs) == 3: 210 | Q_seq, K_seq, V_seq = QKVs 211 | Q_len, V_len = None, None 212 | elif len(QKVs) == 5: 213 | Q_seq, K_seq, V_seq, Q_len, V_len = QKVs 214 | Q_seq = K.dot(Q_seq, self.WQ) 215 | Q_seq = K.reshape( 216 | Q_seq, shape=(-1, K.shape(Q_seq)[1], self.multiheads, self.head_dim) 217 | ) 218 | Q_seq = K.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3)) 219 | 220 | K_seq = K.dot(K_seq, self.WK) 221 | K_seq = K.reshape( 222 | K_seq, shape=(-1, K.shape(K_seq)[1], self.multiheads, self.head_dim) 223 | ) 224 | K_seq = K.permute_dimensions(K_seq, pattern=(0, 2, 1, 3)) 225 | 226 | V_seq = K.dot(V_seq, self.WV) 227 | V_seq = K.reshape( 228 | V_seq, shape=(-1, K.shape(V_seq)[1], self.multiheads, self.head_dim) 229 | ) 230 | V_seq = K.permute_dimensions(V_seq, pattern=(0, 2, 1, 3)) 231 | A = tf.matmul(Q_seq, K_seq, adjoint_a=False, adjoint_b=True) / K.sqrt( 232 | K.cast(self.head_dim, dtype="float32") 233 | ) 234 | 235 | A = K.permute_dimensions( 236 | A, pattern=(0, 3, 2, 1) 237 | ) # A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads] 238 | 239 | A = self.Mask(A, V_len, "add") 240 | A = K.permute_dimensions(A, pattern=(0, 3, 2, 1)) 241 | 242 | if self.mask_right: 243 | ones = K.ones_like(A[:1, :1]) 244 | lower_triangular = K.tf.matrix_band_part(ones, num_lower=-1, num_upper=0) 245 | mask = (ones - lower_triangular) * 1e12 246 | A = A - mask 247 | A = K.softmax(A) 248 | 249 | O_seq = tf.matmul(A, V_seq, adjoint_a=True, adjoint_b=False) 250 | O_seq = K.permute_dimensions(O_seq, pattern=(0, 2, 1, 3)) 251 | 252 | O_seq = K.reshape(O_seq, shape=(-1, K.shape(O_seq)[1], self.output_dim)) 253 | O_seq = self.Mask(O_seq, Q_len, "mul") 254 | return O_seq 255 | 256 | def get_config(self): 257 | """add multiheads, multiheads and mask_right into layer config. 258 | 259 | Returns: 260 | dict: config of SelfAttention layer. 261 | """ 262 | config = super(SelfAttention, self).get_config() 263 | config.update( 264 | { 265 | "multiheads": self.multiheads, 266 | "head_dim": self.head_dim, 267 | "mask_right": self.mask_right, 268 | } 269 | ) 270 | return config 271 | 272 | 273 | class ComputeMasking(layers.Layer): 274 | """Compute if inputs contains zero value. 275 | 276 | Returns: 277 | bool tensor: True for values not equal to zero. 278 | """ 279 | 280 | def __init__(self, **kwargs): 281 | super(ComputeMasking, self).__init__(**kwargs) 282 | 283 | def call(self, inputs, **kwargs): 284 | mask = K.not_equal(inputs, 0) 285 | return K.cast(mask, K.floatx()) 286 | 287 | def compute_output_shape(self, input_shape): 288 | return input_shape 289 | 290 | 291 | class OverwriteMasking(layers.Layer): 292 | """Set values at spasific positions to zero. 293 | 294 | Args: 295 | inputs (list): value tensor and mask tensor. 296 | 297 | Returns: 298 | object: tensor after setting values to zero. 299 | """ 300 | 301 | def __init__(self, **kwargs): 302 | super(OverwriteMasking, self).__init__(**kwargs) 303 | 304 | def build(self, input_shape): 305 | super(OverwriteMasking, self).build(input_shape) 306 | 307 | def call(self, inputs, **kwargs): 308 | return inputs[0] * K.expand_dims(inputs[1]) 309 | 310 | def compute_output_shape(self, input_shape): 311 | return input_shape[0] 312 | 313 | 314 | def PersonalizedAttentivePooling(dim1, dim2, dim3, seed=0): 315 | """Soft alignment attention implement. 316 | Attributes: 317 | dim1 (int): first dimention of value shape. 318 | dim2 (int): second dimention of value shape. 
319 | dim3 (int): shape of query 320 | 321 | Returns: 322 | object: weighted sum of the input values. 323 | """ 324 | vecs_input = keras.Input(shape=(dim1, dim2), dtype="float32") 325 | query_input = keras.Input(shape=(dim3,), dtype="float32") 326 | 327 | user_vecs = layers.Dropout(0.2)(vecs_input) 328 | user_att = layers.Dense( 329 | dim3, 330 | activation="tanh", 331 | kernel_initializer=keras.initializers.glorot_uniform(seed=seed), 332 | bias_initializer=keras.initializers.Zeros(), 333 | )(user_vecs) 334 | user_att2 = layers.Dot(axes=-1)([query_input, user_att]) 335 | user_att2 = layers.Activation("softmax")(user_att2) 336 | user_vec = layers.Dot((1, 1))([user_vecs, user_att2]) 337 | 338 | model = keras.Model([vecs_input, query_input], user_vec) 339 | return model 340 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/lstur.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | from ebrec.models.newsrec.layers import AttLayer2, ComputeMasking, OverwriteMasking 4 | from ebrec.models.newsrec.base_model import BaseModel 5 | from tensorflow.keras import layers 6 | import tensorflow.keras as keras 7 | 8 | 9 | __all__ = ["LSTURModel"] 10 | 11 | 12 | class LSTURModel(BaseModel): 13 | """LSTUR model (Neural News Recommendation with Long- and Short-term User Representations) 14 | 15 | Mingxiao An, Fangzhao Wu, Chuhan Wu, Kun Zhang, Zheng Liu and Xing Xie: 16 | Neural News Recommendation with Long- and Short-term User Representations, ACL 2019 17 | 18 | Attributes: 19 | word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix. 20 | hparam (object): Global hyper-parameters. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hparams, 26 | word2vec_embedding=None, 27 | seed=None, 28 | **kwargs, 29 | ): 30 | """Initialization steps for LSTUR. 31 | Compared with the BaseModel, LSTUR needs word embeddings. 32 | After creating the word embedding matrix, BaseModel's __init__ method will be called. 33 | 34 | Args: 35 | hparams (object): Global hyper-parameters. Some key settings such as type and gru_unit are there. 36 | """ 37 | 38 | super().__init__( 39 | hparams=hparams, 40 | word2vec_embedding=word2vec_embedding, 41 | seed=seed, 42 | **kwargs, 43 | ) 44 | 45 | def _build_graph(self): 46 | """Build LSTUR model and scorer. 47 | 48 | Returns: 49 | object: a model used to train. 50 | object: a model used to evaluate and inference. 51 | """ 52 | 53 | model, scorer = self._build_lstur() 54 | return model, scorer 55 | 56 | def _build_userencoder(self, titleencoder, type="ini"): 57 | """The main function to create the user encoder of LSTUR. 58 | 59 | Args: 60 | titleencoder (object): the news encoder of LSTUR. 61 | 62 | Return: 63 | object: the user encoder of LSTUR. 64 | """ 65 | 66 | his_input_title = keras.Input( 67 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 68 | ) 69 | user_indexes = keras.Input(shape=(1,), dtype="int32") 70 | 71 | user_embedding_layer = layers.Embedding( 72 | input_dim=self.hparams.n_users + 1, 73 | output_dim=self.hparams.gru_unit, # Dimension of the dense embedding.
74 | trainable=True, 75 | embeddings_initializer="zeros", 76 | ) 77 | 78 | long_u_emb = layers.Reshape((self.hparams.gru_unit,))( 79 | user_embedding_layer(user_indexes) 80 | ) 81 | click_title_presents = layers.TimeDistributed(titleencoder)(his_input_title) 82 | 83 | if type == "ini": 84 | user_present = layers.GRU( 85 | self.hparams.gru_unit, 86 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 87 | recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed), 88 | bias_initializer=keras.initializers.Zeros(), 89 | )( 90 | layers.Masking(mask_value=0.0)(click_title_presents), 91 | initial_state=[long_u_emb], 92 | ) 93 | elif type == "con": 94 | short_uemb = layers.GRU( 95 | self.hparams.gru_unit, 96 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 97 | recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed), 98 | bias_initializer=keras.initializers.Zeros(), 99 | )(layers.Masking(mask_value=0.0)(click_title_presents)) 100 | 101 | user_present = layers.Concatenate()([short_uemb, long_u_emb]) 102 | user_present = layers.Dense( 103 | self.hparams.gru_unit, 104 | bias_initializer=keras.initializers.Zeros(), 105 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 106 | )(user_present) 107 | 108 | model = keras.Model( 109 | [his_input_title, user_indexes], user_present, name="user_encoder" 110 | ) 111 | return model 112 | 113 | def _build_newsencoder(self, embedding_layer): 114 | """The main function to create news encoder of LSTUR. 115 | 116 | Args: 117 | embedding_layer (object): a word embedding layer. 118 | 119 | Return: 120 | object: the news encoder of LSTUR. 121 | """ 122 | 123 | sequences_input_title = keras.Input( 124 | shape=(self.hparams.title_size,), dtype="int32" 125 | ) 126 | embedded_sequences_title = embedding_layer(sequences_input_title) 127 | 128 | y = layers.Dropout(self.hparams.dropout)(embedded_sequences_title) 129 | y = layers.Conv1D( 130 | self.hparams.filter_num, 131 | self.hparams.window_size, 132 | activation=self.hparams.cnn_activation, 133 | padding="same", 134 | bias_initializer=keras.initializers.Zeros(), 135 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 136 | )(y) 137 | y = layers.Dropout(self.hparams.dropout)(y) 138 | y = layers.Masking()( 139 | OverwriteMasking()([y, ComputeMasking()(sequences_input_title)]) 140 | ) 141 | pred_title = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 142 | print(pred_title) 143 | model = keras.Model(sequences_input_title, pred_title, name="news_encoder") 144 | return model 145 | 146 | def _build_lstur(self): 147 | """The main function to create LSTUR's logic. The core of LSTUR 148 | is a user encoder and a news encoder. 149 | 150 | Returns: 151 | object: a model used to train. 152 | object: a model used to evaluate and inference. 
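        Example (minimal sketch using dummy data, not from the repository; the embedding matrix size is made up):
            >>> import numpy as np
            >>> from ebrec.models.newsrec import LSTURModel
            >>> from ebrec.models.newsrec.model_config import hparams_lstur
            >>> word2vec = np.random.rand(1000, 300)   # (vocab_size, emb_dim), random stand-in for pretrained embeddings
            >>> lstur = LSTURModel(hparams=hparams_lstur, word2vec_embedding=word2vec, seed=42)
            >>> lstur.model.summary()    # trainer: [user_indexes, history, candidates] -> softmax over candidates
            >>> lstur.scorer.summary()   # scorer: single candidate -> sigmoid score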
153 | """ 154 | 155 | his_input_title = keras.Input( 156 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 157 | ) 158 | pred_input_title = keras.Input( 159 | # shape=(hparams.npratio + 1, hparams.title_size), dtype="int32" 160 | shape=(None, self.hparams.title_size), 161 | dtype="int32", 162 | ) 163 | pred_input_title_one = keras.Input( 164 | shape=( 165 | 1, 166 | self.hparams.title_size, 167 | ), 168 | dtype="int32", 169 | ) 170 | pred_title_reshape = layers.Reshape((self.hparams.title_size,))( 171 | pred_input_title_one 172 | ) 173 | user_indexes = keras.Input(shape=(1,), dtype="int32") 174 | 175 | embedding_layer = layers.Embedding( 176 | self.word2vec_embedding.shape[0], 177 | self.word2vec_embedding.shape[1], 178 | weights=[self.word2vec_embedding], 179 | trainable=True, 180 | ) 181 | 182 | titleencoder = self._build_newsencoder(embedding_layer) 183 | self.userencoder = self._build_userencoder(titleencoder, type=self.hparams.type) 184 | self.newsencoder = titleencoder 185 | 186 | user_present = self.userencoder([his_input_title, user_indexes]) 187 | news_present = layers.TimeDistributed(self.newsencoder)(pred_input_title) 188 | news_present_one = self.newsencoder(pred_title_reshape) 189 | 190 | preds = layers.Dot(axes=-1)([news_present, user_present]) 191 | preds = layers.Activation(activation="softmax")(preds) 192 | 193 | pred_one = layers.Dot(axes=-1)([news_present_one, user_present]) 194 | pred_one = layers.Activation(activation="sigmoid")(pred_one) 195 | 196 | model = keras.Model([user_indexes, his_input_title, pred_input_title], preds) 197 | scorer = keras.Model( 198 | [user_indexes, his_input_title, pred_input_title_one], pred_one 199 | ) 200 | 201 | return model, scorer 202 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/model_config.py: -------------------------------------------------------------------------------- 1 | # 2 | DEFAULT_TITLE_SIZE = 30 3 | DEFAULT_BODY_SIZE = 40 4 | UNKNOWN_TITLE_VALUE = [0] * DEFAULT_TITLE_SIZE 5 | UNKNOWN_BODY_VALUE = [0] * DEFAULT_BODY_SIZE 6 | 7 | DEFAULT_DOCUMENT_SIZE = 768 8 | 9 | 10 | def print_hparams(hparams_class): 11 | for attr, value in hparams_class.__annotations__.items(): 12 | # Print attribute names and values 13 | print(f"{attr}: {getattr(hparams_class, attr)}") 14 | 15 | 16 | def hparams_to_dict(hparams_class) -> dict: 17 | params = {} 18 | for attr, value in hparams_class.__annotations__.items(): 19 | params[attr] = getattr(hparams_class, attr) 20 | return params 21 | 22 | 23 | class hparams_naml: 24 | # INPUT DIMENTIONS: 25 | title_size: int = DEFAULT_TITLE_SIZE 26 | history_size: int = 20 27 | body_size: int = DEFAULT_BODY_SIZE 28 | vert_num: int = 100 29 | vert_emb_dim: int = 10 30 | subvert_num: int = 100 31 | subvert_emb_dim: int = 10 32 | # MODEL ARCHITECTURE 33 | dense_activation: str = "relu" 34 | cnn_activation: str = "relu" 35 | attention_hidden_dim: int = 200 36 | filter_num: int = 400 37 | window_size: int = 3 38 | # MODEL OPTIMIZER: 39 | optimizer: str = "adam" 40 | loss: str = "cross_entropy_loss" 41 | dropout: float = 0.2 42 | learning_rate: float = 1e-4 43 | 44 | 45 | class hparams_lstur: 46 | # INPUT DIMENTIONS: 47 | title_size: int = DEFAULT_TITLE_SIZE 48 | history_size: int = 20 49 | n_users: int = 50000 50 | # MODEL ARCHITECTURE 51 | cnn_activation: str = "relu" 52 | type: str = "ini" 53 | attention_hidden_dim: int = 200 54 | gru_unit: int = 400 55 | filter_num: int = 400 56 | window_size: int = 3 57 | # MODEL OPTIMIZER: 
58 | optimizer: str = "adam" 59 | loss: str = "cross_entropy_loss" 60 | dropout: float = 0.2 61 | learning_rate: float = 1e-4 62 | 63 | 64 | class hparams_npa: 65 | # INPUT DIMENSIONS: 66 | title_size: int = DEFAULT_TITLE_SIZE 67 | history_size: int = 20 68 | n_users: int = 50000 69 | # MODEL ARCHITECTURE 70 | cnn_activation: str = "relu" 71 | attention_hidden_dim: int = 200 72 | user_emb_dim: int = 400 73 | filter_num: int = 400 74 | window_size: int = 3 75 | # MODEL OPTIMIZER: 76 | optimizer: str = "adam" 77 | loss: str = "cross_entropy_loss" 78 | dropout: float = 0.2 79 | learning_rate: float = 1e-4 80 | 81 | 82 | class hparams_nrms: 83 | # INPUT DIMENSIONS: 84 | title_size: int = DEFAULT_TITLE_SIZE 85 | history_size: int = 20 86 | # MODEL ARCHITECTURE 87 | head_num: int = 20 88 | head_dim: int = 20 89 | attention_hidden_dim: int = 200 90 | # MODEL OPTIMIZER: 91 | optimizer: str = "adam" 92 | loss: str = "cross_entropy_loss" 93 | dropout: float = 0.2 94 | learning_rate: float = 1e-4 95 | # MY OWN LITTLE TWIST: 96 | newsencoder_units_per_layer: list[int] = None 97 | newsencoder_l2_regularization: float = 1e-4 98 | 99 | 100 | class hparams_nrms_docvec: 101 | # INPUT DIMENSIONS: 102 | title_size: int = DEFAULT_DOCUMENT_SIZE 103 | history_size: int = 20 104 | # MODEL ARCHITECTURE 105 | head_num: int = 16 106 | head_dim: int = 16 107 | attention_hidden_dim: int = 200 108 | # MODEL OPTIMIZER: 109 | optimizer: str = "adam" 110 | loss: str = "cross_entropy_loss" 111 | dropout: float = 0.2 112 | learning_rate: float = 1e-4 113 | newsencoder_units_per_layer: list[int] = [512, 512, 512] 114 | newsencoder_l2_regularization: float = 1e-4 115 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/npa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | from tensorflow.keras import layers 4 | import tensorflow.keras as keras 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from ebrec.models.newsrec.layers import PersonalizedAttentivePooling 9 | from ebrec.models.newsrec.base_model import BaseModel 10 | 11 | __all__ = ["NPAModel"] 12 | 13 | 14 | class NPAModel(BaseModel): 15 | """NPA model (Neural News Recommendation with Personalized Attention) 16 | 17 | Attributes: 18 | word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix. 19 | hparam (object): Global hyper-parameters. 20 | """ 21 | 22 | def __init__( 23 | self, 24 | hparams, 25 | word2vec_embedding=None, 26 | seed=None, 27 | **kwargs, 28 | ): 29 | """Initialization steps for NPA. 30 | Compared with the BaseModel, NPA needs word embeddings. 31 | After creating the word embedding matrix, BaseModel's __init__ method will be called. 32 | 33 | Args: 34 | hparams (object): Global hyper-parameters. Some key settings such as filter_num are there. 35 | """ 36 | 37 | super().__init__( 38 | hparams=hparams, 39 | word2vec_embedding=word2vec_embedding, 40 | seed=seed, 41 | **kwargs, 42 | ) 43 | 44 | def _get_input_label_from_iter(self, batch_data): 45 | input_feat = [ 46 | batch_data["user_index_batch"], 47 | batch_data["clicked_title_batch"], 48 | batch_data["candidate_title_batch"], 49 | ] 50 | input_label = batch_data["labels"] 51 | return input_feat, input_label 52 | 53 | def _build_graph(self): 54 | """Build NPA model and scorer. 55 | 56 | Returns: 57 | object: a model used to train. 58 | object: a model used to evaluate and inference.
59 | """ 60 | 61 | model, scorer = self._build_npa() 62 | return model, scorer 63 | 64 | def _build_userencoder(self, titleencoder, user_embedding_layer): 65 | """The main function to create user encoder of NPA. 66 | 67 | Args: 68 | titleencoder (object): the news encoder of NPA. 69 | 70 | Return: 71 | object: the user encoder of NPA. 72 | """ 73 | 74 | his_input_title = keras.Input( 75 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 76 | ) 77 | user_indexes = keras.Input(shape=(1,), dtype="int32") 78 | 79 | nuser_id = layers.Reshape((1, 1))(user_indexes) 80 | repeat_uids = layers.Concatenate(axis=-2)( 81 | [nuser_id] * self.hparams.history_size 82 | ) 83 | his_title_uid = layers.Concatenate(axis=-1)([his_input_title, repeat_uids]) 84 | 85 | click_title_presents = layers.TimeDistributed(titleencoder)(his_title_uid) 86 | 87 | u_emb = layers.Reshape((self.hparams.user_emb_dim,))( 88 | user_embedding_layer(user_indexes) 89 | ) 90 | user_present = PersonalizedAttentivePooling( 91 | self.hparams.history_size, 92 | self.hparams.filter_num, 93 | self.hparams.attention_hidden_dim, 94 | seed=self.seed, 95 | )( 96 | [ 97 | click_title_presents, 98 | layers.Dense(self.hparams.attention_hidden_dim)(u_emb), 99 | ] 100 | ) 101 | 102 | model = keras.Model( 103 | [his_input_title, user_indexes], user_present, name="user_encoder" 104 | ) 105 | return model 106 | 107 | def _build_newsencoder(self, embedding_layer, user_embedding_layer): 108 | """The main function to create news encoder of NPA. 109 | 110 | Args: 111 | embedding_layer (object): a word embedding layer. 112 | 113 | Return: 114 | object: the news encoder of NPA. 115 | """ 116 | 117 | sequence_title_uindex = keras.Input( 118 | shape=(self.hparams.title_size + 1,), dtype="int32" 119 | ) 120 | 121 | sequences_input_title = layers.Lambda( 122 | lambda x: x[:, : self.hparams.title_size] 123 | )(sequence_title_uindex) 124 | user_index = layers.Lambda(lambda x: x[:, self.hparams.title_size :])( 125 | sequence_title_uindex 126 | ) 127 | 128 | u_emb = layers.Reshape((self.hparams.user_emb_dim,))( 129 | user_embedding_layer(user_index) 130 | ) 131 | embedded_sequences_title = embedding_layer(sequences_input_title) 132 | 133 | y = layers.Dropout(self.hparams.dropout)(embedded_sequences_title) 134 | y = layers.Conv1D( 135 | self.hparams.filter_num, 136 | self.hparams.window_size, 137 | activation=self.hparams.cnn_activation, 138 | padding="same", 139 | bias_initializer=keras.initializers.Zeros(), 140 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed), 141 | )(y) 142 | y = layers.Dropout(self.hparams.dropout)(y) 143 | 144 | pred_title = PersonalizedAttentivePooling( 145 | self.hparams.title_size, 146 | self.hparams.filter_num, 147 | self.hparams.attention_hidden_dim, 148 | seed=self.seed, 149 | )([y, layers.Dense(self.hparams.attention_hidden_dim)(u_emb)]) 150 | 151 | # pred_title = Reshape((1, feature_size))(pred_title) 152 | model = keras.Model(sequence_title_uindex, pred_title, name="news_encoder") 153 | return model 154 | 155 | def _build_npa(self): 156 | """The main function to create NPA's logic. The core of NPA 157 | is a user encoder and a news encoder. 158 | 159 | Returns: 160 | object: a model used to train. 161 | object: a model used to evaluate and predict. 
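        Example (editor's sketch with made-up sizes; assumes `model` is the trainer returned below, built with hparams_npa, and that `vocab_size` matches the word-embedding matrix):
            >>> import numpy as np
            >>> user_ids = np.random.randint(1, hparams_npa.n_users, size=(2, 1))
            >>> history = np.random.randint(0, vocab_size, size=(2, hparams_npa.history_size, hparams_npa.title_size))
            >>> candidates = np.random.randint(0, vocab_size, size=(2, 5, hparams_npa.title_size))
            >>> scores = model.predict([user_ids, history, candidates])   # (2, 5): softmax scores over the 5 candidates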
162 | """ 163 | 164 | his_input_title = keras.Input( 165 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 166 | ) 167 | pred_input_title = keras.Input( 168 | # shape=(hparams.npratio + 1, hparams.title_size), dtype="int32" 169 | shape=(None, self.hparams.title_size), 170 | dtype="int32", 171 | ) 172 | pred_input_title_one = keras.Input( 173 | shape=( 174 | 1, 175 | self.hparams.title_size, 176 | ), 177 | dtype="int32", 178 | ) 179 | pred_title_one_reshape = layers.Reshape((self.hparams.title_size,))( 180 | pred_input_title_one 181 | ) 182 | 183 | user_indexes = keras.Input(shape=(1,), dtype="int32") 184 | 185 | nuser_index = layers.Reshape((1, 1))(user_indexes) 186 | 187 | # Calculate npratio + 1 based on the dynamic shape of pred_input_title 188 | npratio_plus_one = tf.shape(pred_input_title)[1] 189 | 190 | repeat_uindex = tf.tile(nuser_index, [1, npratio_plus_one, 1]) 191 | 192 | pred_title_uindex = layers.Concatenate(axis=-1)( 193 | [pred_input_title, repeat_uindex] 194 | ) 195 | pred_title_uindex_one = layers.Concatenate()( 196 | [pred_title_one_reshape, user_indexes] 197 | ) 198 | 199 | embedding_layer = layers.Embedding( 200 | self.word2vec_embedding.shape[0], 201 | self.word2vec_embedding.shape[1], 202 | weights=[self.word2vec_embedding], 203 | trainable=True, 204 | ) 205 | 206 | user_embedding_layer = layers.Embedding( 207 | input_dim=self.hparams.n_users + 1, 208 | output_dim=self.hparams.user_emb_dim, 209 | trainable=True, 210 | embeddings_initializer="zeros", 211 | ) 212 | 213 | titleencoder = self._build_newsencoder(embedding_layer, user_embedding_layer) 214 | userencoder = self._build_userencoder(titleencoder, user_embedding_layer) 215 | newsencoder = titleencoder 216 | 217 | user_present = userencoder([his_input_title, user_indexes]) 218 | 219 | news_present = layers.TimeDistributed(newsencoder)(pred_title_uindex) 220 | news_present_one = newsencoder(pred_title_uindex_one) 221 | 222 | preds = layers.Dot(axes=-1)([news_present, user_present]) 223 | preds = layers.Activation(activation="softmax")(preds) 224 | 225 | pred_one = layers.Dot(axes=-1)([news_present_one, user_present]) 226 | pred_one = layers.Activation(activation="sigmoid")(pred_one) 227 | 228 | model = keras.Model([user_indexes, his_input_title, pred_input_title], preds) 229 | scorer = keras.Model( 230 | [user_indexes, his_input_title, pred_input_title_one], pred_one 231 | ) 232 | 233 | return model, scorer 234 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/nrms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | from ebrec.models.newsrec.layers import AttLayer2, SelfAttention 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from tensorflow.keras.layers import Embedding, Input, Dropout, Dense, BatchNormalization 8 | from tensorflow.keras.initializers import GlorotUniform 9 | from tensorflow.keras.regularizers import l2 10 | 11 | 12 | class NRMSModel: 13 | """NRMS model(Neural News Recommendation with Multi-Head Self-Attention) 14 | 15 | Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News 16 | Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference 17 | on Empirical Methods in Natural Language Processing and the 9th International Joint Conference 18 | on Natural Language Processing (EMNLP-IJCNLP) 19 | 20 | Attributes: 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hparams: dict, 26 | word2vec_embedding: np.ndarray = None, 27 | word_emb_dim: int = 300, 28 | vocab_size: int = 32000, 29 | seed: int = None, 30 | ): 31 | """Initialization steps for NRMS.""" 32 | self.hparams = hparams 33 | self.seed = seed 34 | 35 | # SET SEED: 36 | tf.random.set_seed(seed) 37 | np.random.seed(seed) 38 | 39 | # INIT THE WORD-EMBEDDINGS: 40 | if word2vec_embedding is None: 41 | # Xavier Initialization 42 | initializer = GlorotUniform(seed=self.seed) 43 | self.word2vec_embedding = initializer(shape=(vocab_size, word_emb_dim)) 44 | # self.word2vec_embedding = np.random.rand(vocab_size, word_emb_dim) 45 | else: 46 | self.word2vec_embedding = word2vec_embedding 47 | 48 | # BUILD AND COMPILE MODEL: 49 | self.model, self.scorer = self._build_graph() 50 | data_loss = self._get_loss(self.hparams.loss) 51 | train_optimizer = self._get_opt( 52 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate 53 | ) 54 | self.model.compile(loss=data_loss, optimizer=train_optimizer) 55 | 56 | def _get_loss(self, loss: str): 57 | """Make loss function, consists of data loss and regularization loss 58 | Returns: 59 | object: Loss function or loss function name 60 | """ 61 | if loss == "cross_entropy_loss": 62 | data_loss = "categorical_crossentropy" 63 | elif loss == "log_loss": 64 | data_loss = "binary_crossentropy" 65 | else: 66 | raise ValueError(f"this loss not defined {loss}") 67 | return data_loss 68 | 69 | def _get_opt(self, optimizer: str, lr: float): 70 | """Get the optimizer according to configuration. Usually we will use Adam. 71 | Returns: 72 | object: An optimizer. 73 | """ 74 | # TODO: shouldn't be a string input you should just set the optimizer, to avoid stuff like this: 75 | # => 'WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.' 76 | if optimizer == "adam": 77 | train_opt = tf.keras.optimizers.Adam(learning_rate=lr) 78 | else: 79 | raise ValueError(f"this optimizer not defined {optimizer}") 80 | return train_opt 81 | 82 | def _build_graph(self): 83 | """Build NRMS model and scorer. 84 | 85 | Returns: 86 | object: a model used to train. 87 | object: a model used to evaluate and inference. 88 | """ 89 | model, scorer = self._build_nrms() 90 | return model, scorer 91 | 92 | def _build_userencoder(self, titleencoder): 93 | """The main function to create user encoder of NRMS. 94 | 95 | Args: 96 | titleencoder (object): the news encoder of NRMS. 97 | 98 | Return: 99 | object: the user encoder of NRMS. 
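        Shape sketch (editor's note, not from the original source; dimensions follow the hparams used at build time):
            >>> # input:  his_input_title -> (batch_size, history_size, title_size), int32 token ids
            >>> # output: user_present    -> (batch_size, head_num * head_dim), the user representation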
100 | """ 101 | his_input_title = tf.keras.Input( 102 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32" 103 | ) 104 | 105 | click_title_presents = tf.keras.layers.TimeDistributed(titleencoder)( 106 | his_input_title 107 | ) 108 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)( 109 | [click_title_presents] * 3 110 | ) 111 | user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 112 | 113 | model = tf.keras.Model(his_input_title, user_present, name="user_encoder") 114 | return model 115 | 116 | def _build_newsencoder(self, units_per_layer: list[int] = None): 117 | """The main function to create news encoder of NRMS. 118 | 119 | Args: 120 | embedding_layer (object): a word embedding layer. 121 | 122 | Return: 123 | object: the news encoder of NRMS. 124 | """ 125 | embedding_layer = tf.keras.layers.Embedding( 126 | self.word2vec_embedding.shape[0], 127 | self.word2vec_embedding.shape[1], 128 | weights=[self.word2vec_embedding], 129 | trainable=True, 130 | ) 131 | sequences_input_title = tf.keras.Input( 132 | shape=(self.hparams.title_size,), dtype="int32" 133 | ) 134 | embedded_sequences_title = embedding_layer(sequences_input_title) 135 | 136 | y = tf.keras.layers.Dropout(self.hparams.dropout)(embedded_sequences_title) 137 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)( 138 | [y, y, y] 139 | ) 140 | 141 | # Create configurable Dense layers (the if - else is something I've added): 142 | if units_per_layer: 143 | for layer in units_per_layer: 144 | y = tf.keras.layers.Dense( 145 | units=layer, 146 | activation="relu", 147 | kernel_regularizer=tf.keras.regularizers.l2( 148 | self.hparams.newsencoder_l2_regularization 149 | ), 150 | )(y) 151 | y = tf.keras.layers.BatchNormalization()(y) 152 | y = tf.keras.layers.Dropout(self.hparams.dropout)(y) 153 | else: 154 | y = tf.keras.layers.Dropout(self.hparams.dropout)(y) 155 | 156 | pred_title = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 157 | 158 | model = tf.keras.Model(sequences_input_title, pred_title, name="news_encoder") 159 | return model 160 | 161 | def _build_nrms(self): 162 | """The main function to create NRMS's logic. The core of NRMS 163 | is a user encoder and a news encoder. 164 | 165 | Returns: 166 | object: a model used to train. 167 | object: a model used to evaluate and inference. 
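        Example (minimal dummy-data sketch, not from the repository; vocabulary size, batch size and candidate count are made up):
            >>> import numpy as np
            >>> from ebrec.models.newsrec import NRMSModel
            >>> from ebrec.models.newsrec.model_config import hparams_nrms
            >>> nrms = NRMSModel(hparams=hparams_nrms, word2vec_embedding=np.random.rand(1000, 300), seed=42)
            >>> history = np.random.randint(0, 1000, size=(2, hparams_nrms.history_size, hparams_nrms.title_size))
            >>> candidates = np.random.randint(0, 1000, size=(2, 5, hparams_nrms.title_size))
            >>> nrms.model.predict([history, candidates]).shape        # (2, 5): softmax scores over the candidates
            >>> nrms.scorer.predict([history, candidates[:, :1, :]]).shape   # (2, 1): sigmoid score for a single candidate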
168 | """ 169 | 170 | his_input_title = tf.keras.Input( 171 | shape=(self.hparams.history_size, self.hparams.title_size), 172 | dtype="int32", 173 | ) 174 | pred_input_title = tf.keras.Input( 175 | # shape = (hparams.npratio + 1, hparams.title_size) 176 | shape=(None, self.hparams.title_size), 177 | dtype="int32", 178 | ) 179 | pred_input_title_one = tf.keras.Input( 180 | shape=( 181 | 1, 182 | self.hparams.title_size, 183 | ), 184 | dtype="int32", 185 | ) 186 | pred_title_one_reshape = tf.keras.layers.Reshape((self.hparams.title_size,))( 187 | pred_input_title_one 188 | ) 189 | titleencoder = self._build_newsencoder( 190 | units_per_layer=self.hparams.newsencoder_units_per_layer 191 | ) 192 | self.userencoder = self._build_userencoder(titleencoder) 193 | self.newsencoder = titleencoder 194 | 195 | user_present = self.userencoder(his_input_title) 196 | news_present = tf.keras.layers.TimeDistributed(self.newsencoder)( 197 | pred_input_title 198 | ) 199 | news_present_one = self.newsencoder(pred_title_one_reshape) 200 | 201 | preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present]) 202 | preds = tf.keras.layers.Activation(activation="softmax")(preds) 203 | 204 | pred_one = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present]) 205 | pred_one = tf.keras.layers.Activation(activation="sigmoid")(pred_one) 206 | 207 | model = tf.keras.Model([his_input_title, pred_input_title], preds) 208 | scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one) 209 | 210 | return model, scorer 211 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/nrms_docvec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | from ebrec.models.newsrec.layers import AttLayer2, SelfAttention 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | class NRMSDocVec: 9 | """ 10 | Modified NRMS model (Neural News Recommendation with Multi-Head Self-Attention) 11 | - Initiated with article-embeddings. 
12 | 13 | Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News 14 | Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference 15 | on Empirical Methods in Natural Language Processing and the 9th International Joint Conference 16 | on Natural Language Processing (EMNLP-IJCNLP) 17 | 18 | Attributes: 19 | """ 20 | 21 | def __init__( 22 | self, 23 | hparams: dict, 24 | seed: int = None, 25 | ): 26 | """Initialization steps for NRMS.""" 27 | self.hparams = hparams 28 | self.seed = seed 29 | 30 | # SET SEED: 31 | tf.random.set_seed(seed) 32 | np.random.seed(seed) 33 | # BUILD AND COMPILE MODEL: 34 | self.model, self.scorer = self._build_graph() 35 | data_loss = self._get_loss(self.hparams.loss) 36 | train_optimizer = self._get_opt( 37 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate 38 | ) 39 | self.model.compile(loss=data_loss, optimizer=train_optimizer) 40 | 41 | def _get_loss(self, loss: str): 42 | """Make loss function, consists of data loss and regularization loss 43 | Returns: 44 | object: Loss function or loss function name 45 | """ 46 | if loss == "cross_entropy_loss": 47 | data_loss = "categorical_crossentropy" 48 | elif loss == "log_loss": 49 | data_loss = "binary_crossentropy" 50 | else: 51 | raise ValueError(f"this loss not defined {loss}") 52 | return data_loss 53 | 54 | def _get_opt(self, optimizer: str, lr: float): 55 | """Get the optimizer according to configuration. Usually we will use Adam. 56 | Returns: 57 | object: An optimizer. 58 | """ 59 | if optimizer == "adam": 60 | train_opt = tf.keras.optimizers.Adam(learning_rate=lr) 61 | else: 62 | raise ValueError(f"this optimizer not defined {optimizer}") 63 | return train_opt 64 | 65 | def _build_graph(self): 66 | """Build NRMS model and scorer. 67 | 68 | Returns: 69 | object: a model used to train. 70 | object: a model used to evaluate and inference. 71 | """ 72 | model, scorer = self._build_nrms() 73 | return model, scorer 74 | 75 | def _build_userencoder(self, titleencoder): 76 | """The main function to create user encoder of NRMS. 77 | 78 | Args: 79 | titleencoder (object): the news encoder of NRMS. 80 | 81 | Return: 82 | object: the user encoder of NRMS. 83 | """ 84 | his_input_title = tf.keras.Input( 85 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="float32" 86 | ) 87 | 88 | click_title_presents = tf.keras.layers.TimeDistributed(titleencoder)( 89 | his_input_title 90 | ) 91 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)( 92 | [click_title_presents] * 3 93 | ) 94 | user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y) 95 | 96 | model = tf.keras.Model(his_input_title, user_present, name="user_encoder") 97 | return model 98 | 99 | def _build_newsencoder(self, units_per_layer: list[int] = list[512, 512, 512]): 100 | """THIS IS OUR IMPLEMENTATION. 101 | The main function to create a news encoder. 102 | 103 | Parameters: 104 | units_per_layer (int): The number of neurons in each Dense layer. 105 | 106 | Return: 107 | object: the news encoder. 
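        Example (editor's illustration; with the default hparams_nrms_docvec this maps a 768-d document vector to a head_num * head_dim = 256-d article representation):
            >>> # input:  (batch_size, 768)  float32 document embedding
            >>> # output: (batch_size, 256)  article representation fed to the user encoder
            >>> news_encoder = self._build_newsencoder(units_per_layer=[512, 512, 512])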
108 | """ 109 | DOCUMENT_VECTOR_DIM = self.hparams.title_size 110 | OUTPUT_DIM = self.hparams.head_num * self.hparams.head_dim 111 | 112 | # DENSE LAYERS (FINE-TUNED): 113 | sequences_input_title = tf.keras.Input( 114 | shape=(DOCUMENT_VECTOR_DIM), dtype="float32" 115 | ) 116 | x = sequences_input_title 117 | # Create configurable Dense layers: 118 | for layer in units_per_layer: 119 | x = tf.keras.layers.Dense( 120 | units=layer, 121 | activation="relu", 122 | kernel_regularizer=tf.keras.regularizers.l2( 123 | self.hparams.newsencoder_l2_regularization 124 | ), 125 | )(x) 126 | x = tf.keras.layers.BatchNormalization()(x) 127 | x = tf.keras.layers.Dropout(self.hparams.dropout)(x) 128 | 129 | # OUTPUT: 130 | pred_title = tf.keras.layers.Dense(units=OUTPUT_DIM, activation="relu")(x) 131 | 132 | # Construct the final model 133 | model = tf.keras.Model( 134 | inputs=sequences_input_title, outputs=pred_title, name="news_encoder" 135 | ) 136 | 137 | return model 138 | 139 | def _build_nrms(self): 140 | """The main function to create NRMS's logic. The core of NRMS 141 | is a user encoder and a news encoder. 142 | 143 | Returns: 144 | object: a model used to train. 145 | object: a model used to evaluate and inference. 146 | """ 147 | 148 | his_input_title = tf.keras.Input( 149 | shape=(self.hparams.history_size, self.hparams.title_size), 150 | dtype="float32", 151 | ) 152 | pred_input_title = tf.keras.Input( 153 | # shape = (hparams.npratio + 1, hparams.title_size) 154 | shape=(None, self.hparams.title_size), 155 | dtype="float32", 156 | ) 157 | pred_input_title_one = tf.keras.Input( 158 | shape=( 159 | 1, 160 | self.hparams.title_size, 161 | ), 162 | dtype="float32", 163 | ) 164 | pred_title_one_reshape = tf.keras.layers.Reshape((self.hparams.title_size,))( 165 | pred_input_title_one 166 | ) 167 | titleencoder = self._build_newsencoder( 168 | units_per_layer=self.hparams.newsencoder_units_per_layer 169 | ) 170 | self.userencoder = self._build_userencoder(titleencoder) 171 | self.newsencoder = titleencoder 172 | 173 | user_present = self.userencoder(his_input_title) 174 | news_present = tf.keras.layers.TimeDistributed(self.newsencoder)( 175 | pred_input_title 176 | ) 177 | news_present_one = self.newsencoder(pred_title_one_reshape) 178 | 179 | preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present]) 180 | preds = tf.keras.layers.Activation(activation="softmax")(preds) 181 | 182 | pred_one = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present]) 183 | pred_one = tf.keras.layers.Activation(activation="sigmoid")(pred_one) 184 | 185 | model = tf.keras.Model([his_input_title, pred_input_title], preds) 186 | scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one) 187 | 188 | return model, scorer 189 | -------------------------------------------------------------------------------- /src/ebrec/models/newsrec/utils.py: -------------------------------------------------------------------------------- 1 | class set_args: 2 | def __init__(self, args_dict): 3 | _ = [setattr(set_args, key, val) for key, val in args_dict.items()] 4 | 5 | 6 | def print_n_parameters(model) -> None: 7 | num_params = model.count_params() 8 | print("Number of parameters:", num_params) 9 | 10 | 11 | def print_parameter_device(model) -> None: 12 | for variable in model.variables: 13 | print(f"Variable name: {variable.name}, Device: {variable.device}") 14 | -------------------------------------------------------------------------------- /src/ebrec/utils/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/ebrec/utils/__init__.py -------------------------------------------------------------------------------- /src/ebrec/utils/_articles.py: -------------------------------------------------------------------------------- 1 | from ebrec.utils._python import create_lookup_dict 2 | import polars as pl 3 | from ebrec.utils._constants import DEFAULT_ARTICLE_ID_COL 4 | 5 | try: 6 | from transformers import AutoTokenizer 7 | except ImportError: 8 | print("transformers not available") 9 | 10 | 11 | def load_article_id_embeddings( 12 | df: pl.DataFrame, path: str, item_col: str = DEFAULT_ARTICLE_ID_COL 13 | ) -> pl.DataFrame: 14 | """Load embeddings artifacts and join to articles on 'article_id' 15 | Args: 16 | path (str): Path to document embeddings 17 | """ 18 | return df.join(pl.read_parquet(path), on=item_col, how="left") 19 | 20 | 21 | def create_article_id_to_value_mapping( 22 | df: pl.DataFrame, 23 | value_col: str, 24 | article_col: str = DEFAULT_ARTICLE_ID_COL, 25 | ): 26 | return create_lookup_dict( 27 | df.select(article_col, value_col), key=article_col, value=value_col 28 | ) 29 | 30 | 31 | def convert_text2encoding_with_transformers( 32 | df: pl.DataFrame, 33 | tokenizer: AutoTokenizer, 34 | column: str, 35 | max_length: int = None, 36 | ) -> pl.DataFrame: 37 | """Converts text in a specified DataFrame column to tokens using a provided tokenizer. 38 | Args: 39 | df (pl.DataFrame): The input DataFrame containing the text column. 40 | tokenizer (AutoTokenizer): The tokenizer to use for encoding the text. (from transformers import AutoTokenizer) 41 | column (str): The name of the column containing the text. 42 | max_length (int, optional): The maximum length of the encoded tokens. Defaults to None. 43 | Returns: 44 | pl.DataFrame: A new DataFrame with an additional column containing the encoded tokens. 45 | Example: 46 | >>> from transformers import AutoTokenizer 47 | >>> import polars as pl 48 | >>> df = pl.DataFrame({ 49 | 'text': ['This is a test.', 'Another test string.', 'Yet another one.'] 50 | }) 51 | >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') 52 | >>> encoded_df, new_column = convert_text2encoding_with_transformers(df, tokenizer, 'text', max_length=20) 53 | >>> print(encoded_df) 54 | shape: (3, 2) 55 | ┌──────────────────────┬───────────────────────────────┐ 56 | │ text ┆ text_encode_bert-base-uncased │ 57 | │ --- ┆ --- │ 58 | │ str ┆ list[i64] │ 59 | ╞══════════════════════╪═══════════════════════════════╡ 60 | │ This is a test. ┆ [2023, 2003, … 0] │ 61 | │ Another test string. ┆ [2178, 3231, … 0] │ 62 | │ Yet another one. 
┆ [2664, 2178, … 0] │ 63 | └──────────────────────┴───────────────────────────────┘ 64 | >>> print(new_column) 65 | text_encode_bert-base-uncased 66 | """ 67 | text = df[column].to_list() 68 | # set columns 69 | new_column = f"{column}_encode_{tokenizer.name_or_path}" 70 | # If 'max_length' is provided then set it, else encode each string its original length 71 | padding = "max_length" if max_length else False 72 | encoded_tokens = tokenizer( 73 | text, 74 | add_special_tokens=False, 75 | padding=padding, 76 | max_length=max_length, 77 | truncation=True, 78 | )["input_ids"] 79 | return df.with_columns(pl.Series(new_column, encoded_tokens)), new_column 80 | 81 | 82 | def create_sort_based_prediction_score( 83 | df: pl.DataFrame, 84 | column: str, 85 | desc: bool, 86 | article_id_col: str = DEFAULT_ARTICLE_ID_COL, 87 | prediction_score_col: str = "prediction_score", 88 | ) -> pl.DataFrame: 89 | """ 90 | Generates a prediction score for each row in a Polars DataFrame based on the sorting of a specified column. 91 | 92 | Args: 93 | df (pl.DataFrame): The input DataFrame to process. 94 | column (str): The name of the column to sort by and to base the prediction scores on. 95 | desc (bool): Determines the sorting order. If True, sort in descending order; otherwise, in ascending order. 96 | article_id_col (str, optional): The name article ID column. Defaults to "article_id". 97 | prediction_score_col (str, optional): The name to assign to the prediction score column. Defaults to "prediction_score". 98 | 99 | Returns: 100 | pl.DataFrame: A Polars DataFrame including the original data along with the new prediction score column. 101 | 102 | Examples: 103 | >>> import polars as pl 104 | >>> df = pl.DataFrame({ 105 | "article_id": [1, 2, 3, 4, 5], 106 | "views": [100, 150, 200, 50, 300], 107 | }) 108 | >>> create_sort_based_prediction_score(df, "views", True) 109 | shape: (5, 3) 110 | ┌────────────┬───────┬──────────────────┐ 111 | │ article_id ┆ views ┆ prediction_score │ 112 | │ --- ┆ --- ┆ --- │ 113 | │ i64 ┆ i64 ┆ f64 │ 114 | ╞════════════╪═══════╪══════════════════╡ 115 | │ 5 ┆ 300 ┆ 1.0 │ 116 | │ 3 ┆ 200 ┆ 0.5 │ 117 | │ 2 ┆ 150 ┆ 0.333333 │ 118 | │ 1 ┆ 100 ┆ 0.25 │ 119 | │ 4 ┆ 50 ┆ 0.2 │ 120 | └────────────┴───────┴──────────────────┘ 121 | """ 122 | _TEMP_NAME = "index" 123 | return ( 124 | ( 125 | df.select(article_id_col, column) 126 | .sort(by=column, descending=desc) 127 | .with_row_index(name=_TEMP_NAME, offset=1) 128 | ) 129 | .with_columns((1 / pl.col(_TEMP_NAME)).alias(prediction_score_col)) 130 | .drop(_TEMP_NAME) 131 | ) 132 | -------------------------------------------------------------------------------- /src/ebrec/utils/_articles_behaviors.py: -------------------------------------------------------------------------------- 1 | from ebrec.utils._python import generate_unique_name 2 | 3 | try: 4 | import polars as pl 5 | except ImportError: 6 | print("polars not available") 7 | 8 | 9 | def map_list_article_id_to_value( 10 | behaviors: pl.DataFrame, 11 | behaviors_column: str, 12 | mapping: dict[int, pl.Series], 13 | drop_nulls: bool = False, 14 | fill_nulls: any = None, 15 | ) -> pl.DataFrame: 16 | """ 17 | 18 | Maps the values of a column in a DataFrame `behaviors` containing article IDs to their corresponding values 19 | in a column in another DataFrame `articles`. The mapping is performed using a dictionary constructed from 20 | the two DataFrames. The resulting DataFrame has the same columns as `behaviors`, but with the article IDs 21 | replaced by their corresponding values. 
22 | 23 | Args: 24 | behaviors (pl.DataFrame): The DataFrame containing the column to be mapped. 25 | behaviors_column (str): The name of the column to be mapped in `behaviors`. 26 | mapping (dict[int, pl.Series]): A dictionary with article IDs as keys and corresponding values as values. 27 | Note, 'replace' works a lot faster when values are of type pl.Series! 28 | drop_nulls (bool): If `True`, any rows in the resulting DataFrame with null values will be dropped. 29 | If `False` and `fill_nulls` is specified, null values in `behaviors_column` will be replaced with `fill_null`. 30 | fill_nulls (Optional[any]): If specified, any null values in `behaviors_column` will be replaced with this value. 31 | 32 | Returns: 33 | pl.DataFrame: A new DataFrame with the same columns as `behaviors`, but with the article IDs in 34 | `behaviors_column` replaced by their corresponding values in `mapping`. 35 | 36 | Example: 37 | >>> behaviors = pl.DataFrame( 38 | {"user_id": [1, 2, 3, 4, 5], "article_ids": [["A1", "A2"], ["A2", "A3"], ["A1", "A4"], ["A4", "A4"], None]} 39 | ) 40 | >>> articles = pl.DataFrame( 41 | { 42 | "article_id": ["A1", "A2", "A3"], 43 | "article_type": ["News", "Sports", "Entertainment"], 44 | } 45 | ) 46 | >>> articles_dict = dict(zip(articles["article_id"], articles["article_type"])) 47 | >>> map_list_article_id_to_value( 48 | behaviors=behaviors, 49 | behaviors_column="article_ids", 50 | mapping=articles_dict, 51 | fill_nulls="Unknown", 52 | ) 53 | shape: (4, 2) 54 | ┌─────────┬─────────────────────────────┐ 55 | │ user_id ┆ article_ids │ 56 | │ --- ┆ --- │ 57 | │ i64 ┆ list[str] │ 58 | ╞═════════╪═════════════════════════════╡ 59 | │ 1 ┆ ["News", "Sports"] │ 60 | │ 2 ┆ ["Sports", "Entertainment"] │ 61 | │ 3 ┆ ["News", "Unknown"] │ 62 | │ 4 ┆ ["Unknown", "Unknown"] │ 63 | │ 5 ┆ ["Unknown"] │ 64 | └─────────┴─────────────────────────────┘ 65 | >>> map_list_article_id_to_value( 66 | behaviors=behaviors, 67 | behaviors_column="article_ids", 68 | mapping=articles_dict, 69 | drop_nulls=True, 70 | ) 71 | shape: (4, 2) 72 | ┌─────────┬─────────────────────────────┐ 73 | │ user_id ┆ article_ids │ 74 | │ --- ┆ --- │ 75 | │ i64 ┆ list[str] │ 76 | ╞═════════╪═════════════════════════════╡ 77 | │ 1 ┆ ["News", "Sports"] │ 78 | │ 2 ┆ ["Sports", "Entertainment"] │ 79 | │ 3 ┆ ["News"] │ 80 | │ 4 ┆ null │ 81 | │ 5 ┆ null │ 82 | └─────────┴─────────────────────────────┘ 83 | >>> map_list_article_id_to_value( 84 | behaviors=behaviors, 85 | behaviors_column="article_ids", 86 | mapping=articles_dict, 87 | drop_nulls=False, 88 | ) 89 | shape: (4, 2) 90 | ┌─────────┬─────────────────────────────┐ 91 | │ user_id ┆ article_ids │ 92 | │ --- ┆ --- │ 93 | │ i64 ┆ list[str] │ 94 | ╞═════════╪═════════════════════════════╡ 95 | │ 1 ┆ ["News", "Sports"] │ 96 | │ 2 ┆ ["Sports", "Entertainment"] │ 97 | │ 3 ┆ ["News", null] │ 98 | │ 4 ┆ [null, null] │ 99 | │ 5 ┆ [null] │ 100 | └─────────┴─────────────────────────────┘ 101 | """ 102 | GROUPBY_ID = generate_unique_name(behaviors.columns, "_groupby_id") 103 | behaviors = behaviors.lazy().with_row_index(GROUPBY_ID) 104 | # => 105 | select_column = ( 106 | behaviors.select(pl.col(GROUPBY_ID), pl.col(behaviors_column)) 107 | .explode(behaviors_column) 108 | .with_columns(pl.col(behaviors_column).replace(mapping, default=None)) 109 | .collect() 110 | ) 111 | # => 112 | if drop_nulls: 113 | select_column = select_column.drop_nulls() 114 | elif fill_nulls is not None: 115 | select_column = select_column.with_columns( 116 | pl.col(behaviors_column).fill_null(fill_nulls) 117 
| ) 118 | # => 119 | select_column = ( 120 | select_column.lazy().group_by(GROUPBY_ID).agg(behaviors_column).collect() 121 | ) 122 | return ( 123 | behaviors.drop(behaviors_column) 124 | .collect() 125 | .join(select_column, on=GROUPBY_ID, how="left") 126 | .drop(GROUPBY_ID) 127 | ) 128 | -------------------------------------------------------------------------------- /src/ebrec/utils/_constants.py: -------------------------------------------------------------------------------- 1 | # BEHAVIORS 2 | DEFAULT_IMPRESSION_TIMESTAMP_COL = "impression_time" 3 | DEFAULT_IS_BEYOND_ACCURACY_COL = "is_beyond_accuracy" 4 | DEFAULT_CLICKED_ARTICLES_COL = "article_ids_clicked" 5 | DEFAULT_SCROLL_PERCENTAGE_COL = "scroll_percentage" 6 | DEFAULT_INVIEW_ARTICLES_COL = "article_ids_inview" 7 | DEFAULT_IMPRESSION_ID_COL = "impression_id" 8 | DEFAULT_IS_SUBSCRIBER_COL = "is_subscriber" 9 | DEFAULT_IS_SSO_USER_COL = "is_sso_user" 10 | DEFAULT_ARTICLE_ID_COL = "article_id" 11 | DEFAULT_SESSION_ID_COL = "session_id" 12 | DEFAULT_READ_TIME_COL = "read_time" 13 | DEFAULT_DEVICE_COL = "device_type" 14 | DEFAULT_POSTCODE_COL = "postcode" 15 | DEFAULT_GENDER_COL = "gender" 16 | DEFAULT_USER_COL = "user_id" 17 | DEFAULT_AGE_COL = "age" 18 | 19 | DEFAULT_NEXT_SCROLL_PERCENTAGE_COL = f"next_{DEFAULT_SCROLL_PERCENTAGE_COL}" 20 | DEFAULT_NEXT_READ_TIME_COL = f"next_{DEFAULT_READ_TIME_COL}" 21 | 22 | # ARTICLES 23 | DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL = "last_modified_time" 24 | DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL = "published_time" 25 | DEFAULT_SENTIMENT_LABEL_COL = "sentiment_label" 26 | DEFAULT_SENTIMENT_SCORE_COL = "sentiment_score" 27 | DEFAULT_TOTAL_READ_TIME_COL = "total_read_time" 28 | DEFAULT_TOTAL_PAGEVIEWS_COL = "total_pageviews" 29 | DEFAULT_TOTAL_INVIEWS_COL = "total_inviews" 30 | DEFAULT_ARTICLE_TYPE_COL = "article_type" 31 | DEFAULT_CATEGORY_STR_COL = "category_str" 32 | DEFAULT_SUBCATEGORY_COL = "subcategory" 33 | DEFAULT_ENTITIES_COL = "entity_groups" 34 | DEFAULT_IMAGE_IDS_COL = "image_ids" 35 | DEFAULT_SUBTITLE_COL = "subtitle" 36 | DEFAULT_CATEGORY_COL = "category" 37 | DEFAULT_NER_COL = "ner_clusters" 38 | DEFAULT_PREMIUM_COL = "premium" 39 | DEFAULT_TOPICS_COL = "topics" 40 | DEFAULT_TITLE_COL = "title" 41 | DEFAULT_BODY_COL = "body" 42 | DEFAULT_URL_COL = "url" 43 | 44 | # HISTORY 45 | DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL = f"{DEFAULT_IMPRESSION_TIMESTAMP_COL}_fixed" 46 | DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL = f"{DEFAULT_SCROLL_PERCENTAGE_COL}_fixed" 47 | DEFAULT_HISTORY_ARTICLE_ID_COL = f"{DEFAULT_ARTICLE_ID_COL}_fixed" 48 | DEFAULT_HISTORY_READ_TIME_COL = f"{DEFAULT_READ_TIME_COL}_fixed" 49 | 50 | # CREATE 51 | DEFAULT_KNOWN_USER_COL = "is_known_user" 52 | DEFAULT_LABELS_COL = "labels" 53 | -------------------------------------------------------------------------------- /src/ebrec/utils/_decay.py: -------------------------------------------------------------------------------- 1 | try: 2 | import polars as pl 3 | except ImportError: 4 | print("polars not available") 5 | 6 | 7 | def linear_decay_weights(n: int, ascending: bool = True, **kwargs) -> list[float]: 8 | """ 9 | Generates a list of weights in a linear decaying pattern. 10 | Args: 11 | n (int): The number of weights to generate. Must be a positive integer. 12 | ascending (bool, optional): Flag to determine the order of decay. 13 | If True, the decay is ascending. If False, it's descending. 14 | Defaults to True. 15 | Returns: 16 | List[float]: A list of linearly decaying weights. 
17 | Raises: 18 | ValueError: If 'n' is not a positive integer. 19 | Examples: 20 | >>> linear_decay_weights(5, True) 21 | [0.2, 0.4, 0.6, 0.8, 1.0] 22 | >>> linear_decay_weights(10, False) 23 | [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] 24 | """ 25 | weights = [(n - i) / n for i in range(n)] 26 | return weights if not ascending else weights[::-1] 27 | 28 | 29 | def exponential_decay_weights( 30 | n: int, lambda_factor: float, ascending: bool = True, **kwargs 31 | ) -> list[float]: 32 | """ 33 | Generates a list of weights in an exponential decay pattern. 34 | Args: 35 | n (int): The number of weights to generate. Must be a non-negative integer. 36 | lambda_factor (float): The factor by which the weights decay exponentially. 37 | ascending (bool, optional): Flag to determine the order of decay. 38 | If True, the decay is ascending. If False, it's descending. 39 | Defaults to True. 40 | Returns: 41 | List[float]: A list of exponentially decaying weights. 42 | Raises: 43 | ValueError: If 'n' is negative. 44 | Examples: 45 | >>> exponential_decay_weights(5, 0.5, True) 46 | [0.0625, 0.125, 0.25, 0.5, 1.0] 47 | >>> exponential_decay_weights(10, 0.5, False) 48 | [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, 0.001953125] 49 | """ 50 | weights = [lambda_factor ** (n - i - 1) for i in range(n)] 51 | return weights if ascending else weights[::-1] 52 | 53 | 54 | def add_decay_weights( 55 | df, column: str, decay_func: callable, ascending: bool = True, **kwargs: dict 56 | ): 57 | """ 58 | Wrapper function: Adding decay weights to column using decay function scheme 59 | >>> df = pl.DataFrame( 60 | { 61 | "col1": [ 62 | [[1], [1], [1], [1]], 63 | [[1, 1], [1, 1], [1, 1]], 64 | [[1, 1, 1], [1, 1, 1]], 65 | None, 66 | ], 67 | "col2": [4, 5, 6, 7], 68 | } 69 | ) 70 | >>> add_decay_weights(df, "col1", decay_func=linear_decay_weights, ascending=True) 71 | shape: (4, 3) 72 | ┌──────────────────────────┬───────────────────────────┬──────┐ 73 | │ col1 ┆ col1_weights ┆ col2 │ 74 | │ --- ┆ --- ┆ --- │ 75 | │ list[list[i64]] ┆ list[f64] ┆ i64 │ 76 | ╞══════════════════════════╪═══════════════════════════╪══════╡ 77 | │ [[1], [1], … [1]] ┆ [0.25, 0.5, … 1.0] ┆ 4 │ 78 | │ [[1, 1], [1, 1], [1, 1]] ┆ [0.333333, 0.666667, 1.0] ┆ 5 │ 79 | │ [[1, 1, 1], [1, 1, 1]] ┆ [0.5, 1.0] ┆ 6 │ 80 | │ null ┆ [] ┆ 7 │ 81 | └──────────────────────────┴───────────────────────────┴──────┘ 82 | >>> add_decay_weights(df, "col1", decay_func=exponential_decay_weights, ascending=True, **{"lambda_factor" : 0.5}) 83 | shape: (4, 3) 84 | ┌──────────────────────────┬──────────────────────┬──────┐ 85 | │ col1 ┆ col1_weights ┆ col2 │ 86 | │ --- ┆ --- ┆ --- │ 87 | │ list[list[i64]] ┆ list[f64] ┆ i64 │ 88 | ╞══════════════════════════╪══════════════════════╪══════╡ 89 | │ [[1], [1], … [1]] ┆ [0.125, 0.25, … 1.0] ┆ 4 │ 90 | │ [[1, 1], [1, 1], [1, 1]] ┆ [0.25, 0.5, 1.0] ┆ 5 │ 91 | │ [[1, 1, 1], [1, 1, 1]] ┆ [0.5, 1.0] ┆ 6 │ 92 | │ null ┆ [] ┆ 7 │ 93 | └──────────────────────────┴──────────────────────┴──────┘ 94 | """ 95 | lengths = df[column].list.len().to_list() 96 | weights = [decay_func(n=i, ascending=ascending, **kwargs) for i in lengths] 97 | return df.with_columns(pl.Series(f"{column}_weights", weights)) 98 | 99 | 100 | def decay_weighting_nested_lists( 101 | df, column_history: str, column_history_weights: str, fill_nulls: int = None 102 | ): 103 | """ 104 | >>> df = pl.DataFrame( 105 | { 106 | "col1": [ 107 | [[1], [1], [1], [1]], 108 | [[1, 1], [1, 1], [1, 1]], 109 | [[1, 1, 1], [1, 1, 1]], 110 | [[1], 
None], 111 | None, 112 | ], 113 | "col1_weights": 114 | [[0.25, 0.5, 0.75, 1.0], 115 | [0.33, 0.67, 1.0], 116 | [0.5, 1.0], 117 | [0.5, 1.0], 118 | [] 119 | ], 120 | "col2": [4, 5, 6, 7, 8 ], 121 | } 122 | ) 123 | >>> decay_weighting_nested_lists(df, column_history="col1", column_history_weights="col1_weights")["col1"] 124 | Series: 'col1' [list[list[f64]]] 125 | [ 126 | [[0.25], [0.5], … [1.0]] 127 | [[0.33, 0.33], [0.67, 0.67], [1.0, 1.0]] 128 | [[0.5, 0.5, 0.5], [1.0, 1.0, 1.0]] 129 | [[0.5], [null]] 130 | null 131 | ] 132 | >>> decay_weighting_nested_lists(df.lazy(), "col1", "col1_weights").collect() 133 | """ 134 | GROUP_BY_COLUMN_FIRST = "group_by_1" 135 | GROUP_BY_COLUMN_SECOND = "group_by_2" 136 | COLUMNS = df.columns 137 | 138 | df = df.with_row_count(GROUP_BY_COLUMN_FIRST) 139 | 140 | exploded_weights = df.drop_nulls(column_history).select( 141 | pl.col(column_history_weights).explode() 142 | ) 143 | 144 | if isinstance(exploded_weights, pl.LazyFrame): 145 | exploded_weights = exploded_weights.collect() 146 | 147 | df_ = ( 148 | df.select(pl.col(GROUP_BY_COLUMN_FIRST, column_history)) 149 | .drop_nulls(column_history) 150 | .explode(column_history) 151 | .with_columns(exploded_weights.select(column_history_weights)) 152 | .with_row_count(GROUP_BY_COLUMN_SECOND) 153 | # Not optimal to explode, I want to compute [1,2,2] * 0.5 => (list * float) 154 | .explode(column_history) 155 | .with_columns( 156 | (pl.col(column_history) * pl.col(column_history_weights)).alias( 157 | column_history 158 | ) 159 | ) 160 | .group_by([GROUP_BY_COLUMN_SECOND]) 161 | .agg(pl.col(GROUP_BY_COLUMN_FIRST).first(), column_history) 162 | .group_by(GROUP_BY_COLUMN_FIRST) 163 | .agg(column_history) 164 | .sort(GROUP_BY_COLUMN_FIRST) 165 | ) 166 | 167 | return ( 168 | df.drop(column_history) 169 | .join(df_, on=GROUP_BY_COLUMN_FIRST, how="left") 170 | .select(COLUMNS) 171 | ) 172 | -------------------------------------------------------------------------------- /src/ebrec/utils/_descriptive_analysis.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | from ebrec.utils._constants import ( 4 | DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL, 5 | DEFAULT_IMPRESSION_TIMESTAMP_COL, 6 | ) 7 | 8 | 9 | def min_max_impression_time_history( 10 | df: pl.DataFrame, timestamp_col: str = DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL 11 | ): 12 | """ 13 | Check min/max for user history timestamp column. 14 | """ 15 | return ( 16 | df.select(pl.col(timestamp_col)) 17 | .with_columns( 18 | pl.col(timestamp_col).list.eval(pl.element().min()).explode().alias("min") 19 | ) 20 | .with_columns( 21 | pl.col(timestamp_col).list.eval(pl.element().max()).explode().alias("max") 22 | ) 23 | .select(pl.col("min").min(), pl.col("max").max()) 24 | ) 25 | 26 | 27 | def min_max_impression_time_behaviors( 28 | df: pl.DataFrame, timestamp_col: str = DEFAULT_IMPRESSION_TIMESTAMP_COL 29 | ): 30 | """ 31 | Check min/max for behaviors timestamp column. 
32 | """ 33 | return df.select( 34 | pl.col(timestamp_col).min().alias("min"), 35 | pl.col(timestamp_col).max().alias("max"), 36 | ) 37 | -------------------------------------------------------------------------------- /src/ebrec/utils/_nlp.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | 5 | from ebrec.utils._python import get_torch_device 6 | 7 | try: 8 | from torch.utils.data import DataLoader, TensorDataset 9 | except ImportError: 10 | print("torch not available") 11 | try: 12 | from transformers import AutoTokenizer, AutoModel 13 | except ImportError: 14 | print("transformers not available") 15 | 16 | 17 | def get_transformers_word_embeddings(model: AutoModel): 18 | return model.embeddings.word_embeddings.weight.data.to("cpu").numpy() 19 | 20 | 21 | def generate_embeddings_with_transformers( 22 | model: AutoModel, 23 | tokenizer: AutoTokenizer, 24 | text_list: list[str], 25 | batch_size: int = 8, 26 | device: str = None, 27 | disable_tqdm: bool = False, 28 | ) -> torch.Tensor: 29 | """ 30 | Generates embeddings for a list of texts using a pre-trained transformer model. 31 | 32 | Args: 33 | model (AutoModel): The pre-trained transformer model used to encode the texts. 34 | tokenizer (AutoTokenizer): The tokenizer corresponding to `model`. 35 | text_list (list of str): A list of texts to generate embeddings for. 36 | batch_size (int): The batch size to use for generating embeddings. Defaults to 8. 37 | device (str): The device to use (e.g., "cpu", "cuda"). If None, defaults to the first available GPU or CPU. 38 | 39 | Returns: 40 | embeddings (torch.Tensor): A tensor containing the embeddings for the input texts. 41 | The shape of the tensor is (num_texts, embedding_dim), where num_texts is the number 42 | of input texts and embedding_dim is the dimensionality of the embeddings produced by 43 | the pre-trained model. 44 | 45 | Examples: 46 | >>> model_name = "bert-base-uncased" 47 | >>> text_list = ["hello world", "how are you"] 48 | >>> batch_size = 2 49 | >>> device = "cpu" 50 | >>> model = AutoModel.from_pretrained(model_name) 51 | >>> tokenizer = AutoTokenizer.from_pretrained(model_name) 52 | >>> embeddings_tensor = generate_embeddings_with_transformers(model, tokenizer, text_list, batch_size, device) 53 | >>> print(embeddings_tensor) 54 | tensor([[-0.0243, 0.1144, 0.0830, ..., -0.2666, 0.1662, 0.1519], 55 | [ 0.0827, 0.0877, -0.0688, ..., -0.4381, 0.0462, -0.1446]]) 56 | >>> print(embeddings_tensor.shape) 57 | torch.Size([2, 768]) 58 | """ 59 | device = get_torch_device(use_gpu=True) if device is None else device 60 | model = model.to(device) 61 | 62 | tokenized_text = tokenizer( 63 | text_list, padding=True, truncation=True, return_tensors="pt" 64 | ) 65 | feature_names = list(tokenized_text) 66 | 67 | dataset = TensorDataset( 68 | tokenized_text["input_ids"], tokenized_text["attention_mask"] 69 | ) 70 | dataloader = DataLoader(dataset, batch_size=batch_size) 71 | embeddings = [] 72 | with torch.no_grad(): 73 | for batch in tqdm(dataloader, desc="Encoding", disable=disable_tqdm): 74 | inputs = {feat: t.to(device) for feat, t in zip(feature_names, batch)} 75 | outputs = model( 76 | **inputs, 77 | output_hidden_states=True, 78 | ) 79 | embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze(dim=1)) 80 | return torch.vstack(embeddings) 81 | 82 | 83 | if __name__ == "__main__": 84 | # 85 | model_name = "xlm-roberta-base" 86 | batch_size = 8 87 | text_list = [ 88 | "hej med dig. 
Jeg er en tekst.", 89 | "Jeg er en anden tekst, skal du spille smart?", 90 | "oh nej..", 91 | ] 92 | model = AutoModel.from_pretrained(model_name) 93 | tokenizer = AutoTokenizer.from_pretrained(model_name) 94 | t = generate_embeddings_with_transformers( 95 | model, tokenizer, text_list, batch_size, "cpu" 96 | ) 97 | -------------------------------------------------------------------------------- /src/ebrec/utils/_torch.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | try: 4 | import torch 5 | except ImportError: 6 | print("torch not available") 7 | 8 | 9 | def save_checkpoint(model, path="model_state_dict.pt"): 10 | path = Path(path) 11 | path.parent.mkdir(parents=True, exist_ok=True) 12 | print(f"Saving model weights: {path}") 13 | torch.save(model.state_dict(), path.as_posix()) 14 | -------------------------------------------------------------------------------- /test/bombing/bomb_dataloader.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import polars as pl 3 | import numpy as np 4 | 5 | from ebrec.models.newsrec.dataloader import ( 6 | LSTURDataLoader, 7 | NRMSDataLoader, 8 | ) 9 | from ebrec.utils._behaviors import create_user_id_to_int_mapping 10 | from ebrec.utils._articles import create_article_id_to_value_mapping 11 | 12 | from ebrec.utils._python import time_it 13 | from tqdm import tqdm 14 | 15 | from ebrec.utils._behaviors import create_binary_labels_column 16 | from ebrec.utils._constants import ( 17 | DEFAULT_HISTORY_ARTICLE_ID_COL, 18 | DEFAULT_CLICKED_ARTICLES_COL, 19 | DEFAULT_INVIEW_ARTICLES_COL, 20 | DEFAULT_ARTICLE_ID_COL, 21 | DEFAULT_CATEGORY_COL, 22 | DEFAULT_USER_COL, 23 | ) 24 | 25 | from ebrec.models.fastformer.dataloader import FastformerDataset 26 | from torch.utils.data import DataLoader 27 | 28 | N_ITERATIONS = 300 29 | BATCH_SIZE = 100 30 | TOKEN_COL = "tokens" 31 | N_SAMPLES = "n" 32 | 33 | # LOAD DATA: 34 | PATH_DATA = Path("test/data") 35 | df_articles = ( 36 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet")) 37 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL)) 38 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10)))) 39 | .collect() 40 | ) 41 | df_history = ( 42 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet")) 43 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 44 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3)) 45 | ) 46 | df_behaviors = ( 47 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet")) 48 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL) 49 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES)) 50 | .join(df_history, on=DEFAULT_USER_COL, how="left") 51 | .collect() 52 | .pipe(create_binary_labels_column) 53 | ) 54 | # => MAPPINGS: 55 | article_mapping = create_article_id_to_value_mapping( 56 | df=df_articles, value_col=TOKEN_COL 57 | ) 58 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors) 59 | # => NPRATIO IMPRESSION - SAME LENGTHS: 60 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min()) 61 | # => FOR TEST-DATALOADER 62 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list() 63 | 64 | 65 | def iter_dataloader(dataloader, name: str, iterations: int): 66 | for _ in tqdm(range(iterations), desc=name): 67 | for _ in dataloader: 68 | pass 69 | 70 | 71 | # === 72 | 
@time_it(True) 73 | def bomb_NRMSDataLoader(): 74 | dataloader = NRMSDataLoader( 75 | behaviors=df_behaviors_train, 76 | article_dict=article_mapping, 77 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 78 | unknown_representation="zeros", 79 | eval_mode=False, 80 | batch_size=BATCH_SIZE, 81 | ) 82 | iter_dataloader(dataloader, "NRMS-train", iterations=N_ITERATIONS) 83 | 84 | dataloader = NRMSDataLoader( 85 | behaviors=df_behaviors, 86 | article_dict=article_mapping, 87 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 88 | unknown_representation="zeros", 89 | eval_mode=True, 90 | batch_size=BATCH_SIZE, 91 | ) 92 | iter_dataloader(dataloader, "NRMS-test", iterations=N_ITERATIONS) 93 | 94 | 95 | @time_it(True) 96 | def bomb_LSTURDataLoader(): 97 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors_train) 98 | 99 | dataloader = LSTURDataLoader( 100 | behaviors=df_behaviors_train, 101 | article_dict=article_mapping, 102 | user_id_mapping=user_mapping, 103 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 104 | unknown_representation="zeros", 105 | batch_size=BATCH_SIZE, 106 | ) 107 | iter_dataloader(dataloader, "LSTUR-train", iterations=N_ITERATIONS) 108 | 109 | dataloader = LSTURDataLoader( 110 | behaviors=df_behaviors, 111 | article_dict=article_mapping, 112 | user_id_mapping=user_mapping, 113 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 114 | unknown_representation="zeros", 115 | batch_size=BATCH_SIZE, 116 | eval_mode=True, 117 | ) 118 | iter_dataloader(dataloader, "LSTUR-test", iterations=N_ITERATIONS) 119 | 120 | 121 | # === 122 | @time_it(True) 123 | def bomb_FastformerDataLoader(): 124 | dataloader = DataLoader( 125 | FastformerDataset( 126 | behaviors=df_behaviors_train, 127 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 128 | article_dict=article_mapping, 129 | batch_size=BATCH_SIZE, 130 | shuffle=True, 131 | ) 132 | ) 133 | iter_dataloader(dataloader, "Fastformer-train", iterations=N_ITERATIONS) 134 | 135 | dataloader = DataLoader( 136 | FastformerDataset( 137 | behaviors=df_behaviors, 138 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 139 | article_dict=article_mapping, 140 | batch_size=BATCH_SIZE, 141 | shuffle=False, 142 | ) 143 | ) 144 | iter_dataloader(dataloader, "Fastformer-test", iterations=N_ITERATIONS) 145 | 146 | 147 | if __name__ == "__main__": 148 | bomb_NRMSDataLoader() 149 | bomb_LSTURDataLoader() 150 | bomb_FastformerDataLoader() 151 | -------------------------------------------------------------------------------- /test/data/ebnerd/articles.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/articles.parquet -------------------------------------------------------------------------------- /test/data/ebnerd/behaviors.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/behaviors.parquet -------------------------------------------------------------------------------- /test/data/ebnerd/document_vector.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/document_vector.parquet -------------------------------------------------------------------------------- /test/data/ebnerd/history.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/history.parquet -------------------------------------------------------------------------------- /test/dataloader/test_fastformer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import polars as pl 3 | import numpy as np 4 | import torch 5 | from ebrec.utils._behaviors import create_user_id_to_int_mapping 6 | from ebrec.utils._articles import create_article_id_to_value_mapping 7 | 8 | from ebrec.utils._python import time_it 9 | from ebrec.utils._behaviors import create_binary_labels_column 10 | from ebrec.utils._constants import ( 11 | DEFAULT_HISTORY_ARTICLE_ID_COL, 12 | DEFAULT_CLICKED_ARTICLES_COL, 13 | DEFAULT_INVIEW_ARTICLES_COL, 14 | DEFAULT_ARTICLE_ID_COL, 15 | DEFAULT_CATEGORY_COL, 16 | DEFAULT_USER_COL, 17 | ) 18 | 19 | from ebrec.models.fastformer.dataloader import FastformerDataset 20 | from torch.utils.data import DataLoader 21 | 22 | TOKEN_COL = "tokens" 23 | N_SAMPLES = "n" 24 | BATCH_SIZE = 100 25 | 26 | # LOAD DATA: 27 | PATH_DATA = Path("test/data") 28 | df_articles = ( 29 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet")) 30 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL)) 31 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10)))) 32 | .collect() 33 | ) 34 | df_history = ( 35 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet")) 36 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 37 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3)) 38 | ) 39 | df_behaviors = ( 40 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet")) 41 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL) 42 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES)) 43 | .join(df_history, on=DEFAULT_USER_COL, how="left") 44 | .collect() 45 | .pipe(create_binary_labels_column) 46 | ) 47 | # => MAPPINGS: 48 | article_mapping = create_article_id_to_value_mapping( 49 | df=df_articles, value_col=TOKEN_COL 50 | ) 51 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors) 52 | # => NPRATIO IMPRESSION - SAME LENGTHS: 53 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min()) 54 | # => FOR TEST-DATALOADER 55 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list() 56 | 57 | 58 | @time_it(True) 59 | def test_FastformerDataloader(): 60 | train_dataloader = DataLoader( 61 | FastformerDataset( 62 | behaviors=df_behaviors_train, 63 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 64 | article_dict=article_mapping, 65 | batch_size=BATCH_SIZE, 66 | shuffle=True, 67 | ) 68 | ) 69 | 70 | batch = train_dataloader.__iter__().__next__() 71 | 72 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 73 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 74 | assert ( 75 | len(batch[0]) == 2 76 | ), "Fastformer has two outputs (history_input, candidate_input)" 77 | 78 | for type_in_batch in batch[0]: 79 | assert ( 80 | type_in_batch.dtype == torch.int 81 | ), "Expected output to be integer; used for lookup value" 82 | 83 | assert batch[1].dtype == torch.float, "Expected output to be float; this is label" 84 | 85 | test_dataloader = DataLoader( 86 | FastformerDataset( 87 |
behaviors=df_behaviors, 88 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 89 | article_dict=article_mapping, 90 | batch_size=BATCH_SIZE, 91 | shuffle=False, 92 | ) 93 | ) 94 | 95 | batch = test_dataloader.__iter__().__next__() 96 | assert len(batch[1].squeeze(0)) == sum( 97 | label_lengths[:BATCH_SIZE] 98 | ), "Should have unfolded all the test samples" 99 | -------------------------------------------------------------------------------- /test/dataloader/test_newsrec.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import polars as pl 3 | import numpy as np 4 | import torch 5 | from ebrec.utils._behaviors import create_user_id_to_int_mapping 6 | from ebrec.utils._articles import create_article_id_to_value_mapping 7 | from ebrec.utils._python import create_lookup_dict 8 | 9 | from ebrec.models.newsrec.dataloader import ( 10 | LSTURDataLoader, 11 | NAMLDataLoader, 12 | NRMSDataLoader, 13 | ) 14 | from ebrec.utils._python import time_it 15 | from ebrec.utils._behaviors import create_binary_labels_column 16 | from ebrec.utils._constants import ( 17 | DEFAULT_HISTORY_ARTICLE_ID_COL, 18 | DEFAULT_CLICKED_ARTICLES_COL, 19 | DEFAULT_INVIEW_ARTICLES_COL, 20 | DEFAULT_ARTICLE_ID_COL, 21 | DEFAULT_CATEGORY_COL, 22 | DEFAULT_USER_COL, 23 | ) 24 | 25 | from ebrec.models.fastformer.dataloader import FastformerDataset 26 | from torch.utils.data import DataLoader 27 | 28 | TOKEN_COL = "tokens" 29 | N_SAMPLES = "n" 30 | BATCH_SIZE = 100 31 | 32 | # LOAD DATA: 33 | PATH_DATA = Path("test/data") 34 | df_articles = ( 35 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet")) 36 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL)) 37 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10)))) 38 | .collect() 39 | ) 40 | df_history = ( 41 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet")) 42 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL) 43 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3)) 44 | ) 45 | df_behaviors = ( 46 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet")) 47 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL) 48 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES)) 49 | .join(df_history, on=DEFAULT_USER_COL, how="left") 50 | .collect() 51 | .pipe(create_binary_labels_column) 52 | ) 53 | # => MAPPINGS: 54 | article_mapping = create_article_id_to_value_mapping( 55 | df=df_articles, value_col=TOKEN_COL 56 | ) 57 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors) 58 | # => NPRATIO IMPRESSION - SAME LENGTHS: 59 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min()) 60 | # => FOR TEST-DATALOADER 61 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list() 62 | 63 | 64 | # === 65 | @time_it(True) 66 | def test_NRMSDataLoader(): 67 | train_dataloader = NRMSDataLoader( 68 | behaviors=df_behaviors_train, 69 | article_dict=article_mapping, 70 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 71 | unknown_representation="zeros", 72 | eval_mode=False, 73 | batch_size=BATCH_SIZE, 74 | ) 75 | 76 | batch = train_dataloader.__iter__().__next__() 77 | 78 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 79 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 80 | assert ( 81 | len(batch[0]) == 2 82 | ), "NRMS has two outputs (his_input_title, 
pred_input_title_one)" 83 | 84 | for type_in_batch in batch[0][0]: 85 | assert isinstance( 86 | type_in_batch.ravel()[0], np.integer 87 | ), "Expected output to be integer; used for lookup value" 88 | 89 | assert isinstance( 90 | batch[1].ravel()[0], np.integer 91 | ), "Expected output to be integer; this is label" 92 | 93 | test_dataloader = NRMSDataLoader( 94 | behaviors=df_behaviors, 95 | article_dict=article_mapping, 96 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 97 | unknown_representation="zeros", 98 | eval_mode=True, 99 | batch_size=BATCH_SIZE, 100 | ) 101 | 102 | batch = test_dataloader.__iter__().__next__() 103 | assert len(batch[1]) == sum( 104 | label_lengths[:BATCH_SIZE] 105 | ), "Should have unfolded all the test samples" 106 | 107 | 108 | @time_it(True) 109 | def test_LSTURDataLoader(): 110 | train_dataloader = LSTURDataLoader( 111 | behaviors=df_behaviors_train, 112 | article_dict=article_mapping, 113 | user_id_mapping=user_mapping, 114 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 115 | unknown_representation="zeros", 116 | batch_size=BATCH_SIZE, 117 | ) 118 | 119 | batch = train_dataloader.__iter__().__next__() 120 | 121 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 122 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 123 | assert ( 124 | len(batch[0]) == 3 125 | ), "LSTUR has three outputs (user_indexes, his_input_title, pred_input_title_one)" 126 | 127 | for type_in_batch in batch[0][0]: 128 | assert isinstance( 129 | type_in_batch.ravel()[0], np.integer 130 | ), "Expected output to be integer; used for lookup value" 131 | 132 | assert isinstance( 133 | batch[1].ravel()[0], np.integer 134 | ), "Expected output to be integer; this is label" 135 | 136 | test_dataloader = LSTURDataLoader( 137 | behaviors=df_behaviors, 138 | article_dict=article_mapping, 139 | user_id_mapping=user_mapping, 140 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 141 | unknown_representation="zeros", 142 | batch_size=BATCH_SIZE, 143 | eval_mode=True, 144 | ) 145 | 146 | batch = test_dataloader.__iter__().__next__() 147 | assert len(batch[1]) == sum( 148 | label_lengths[:BATCH_SIZE] 149 | ), "Should have unfolded all the test samples" 150 | 151 | 152 | @time_it(True) 153 | def test_NAMLDataLoader(): 154 | body_mapping = article_mapping 155 | category_mapping = create_lookup_dict( 156 | df_articles.select(pl.col(DEFAULT_CATEGORY_COL).unique()).with_row_index( 157 | "row_nr" 158 | ), 159 | key=DEFAULT_CATEGORY_COL, 160 | value="row_nr", 161 | ) 162 | subcategory_mapping = category_mapping 163 | 164 | train_dataloader = NAMLDataLoader( 165 | behaviors=df_behaviors_train, 166 | article_dict=article_mapping, 167 | body_mapping=body_mapping, 168 | category_mapping=category_mapping, 169 | unknown_representation="zeros", 170 | subcategory_mapping=subcategory_mapping, 171 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL, 172 | batch_size=BATCH_SIZE, 173 | ) 174 | 175 | batch = train_dataloader.__iter__().__next__() 176 | 177 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100)) 178 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)" 179 | assert ( 180 | len(batch[0]) == 8 181 | ), "NAML has eight outputs (his_input_title,his_input_body,his_input_vert,his_input_subvert,pred_input_title,pred_input_body,pred_input_vert,pred_input_subvert)" 182 | 183 | for type_in_batch in batch[0][0]: 184 | assert isinstance( 185 | type_in_batch.ravel()[0], np.integer 186 | ), "Expected output to be 
integer; used for lookup value" 187 | 188 | assert isinstance( 189 | batch[1].ravel()[0], np.integer 190 | ), "Expected output to be integer; this is label" 191 | -------------------------------------------------------------------------------- /test/evaluation/test_beyond_accuracy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics.pairwise import cosine_distances 3 | 4 | from ebrec.evaluation.beyond_accuracy import ( 5 | IntralistDiversity, 6 | Distribution, 7 | Serendipity, 8 | Novelty, 9 | Coverage, 10 | ) 11 | 12 | lookup_dict = { 13 | "101": {"doc_vec": np.array([1, 0, 0]), "v": 1, "sv": [1], "pop_sc": 0.50}, 14 | "102": {"doc_vec": np.array([0, 1, 0]), "v": 2, "sv": [1], "pop_sc": 0.25}, 15 | "103": {"doc_vec": np.array([1, 1, 1]), "v": 3, "sv": [1], "pop_sc": 0.75}, 16 | "104": {"doc_vec": np.array([1, 1, 1]), "v": 4, "sv": [1], "pop_sc": 0.50}, 17 | "105": {"doc_vec": np.array([-1, 0, 0]), "v": 5, "sv": [1], "pop_sc": 0.94}, 18 | "106": {"doc_vec": np.array([-1, 0, 0]), "v": 6, "sv": [1, 2], "pop_sc": 0.95}, 19 | "107": {"doc_vec": np.array([-1, 0, 0]), "v": 7, "sv": [1, 2], "pop_sc": 0.96}, 20 | "108": {"doc_vec": np.array([0, 0, 1]), "v": 8, "sv": [1, 2], "pop_sc": 0.50}, 21 | "400": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4], "pop_sc": 0.20}, 22 | "401": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4, 5], "pop_sc": 0.20}, 23 | } 24 | 25 | # 404 is not expected; however, the setup supports it: 26 | R = np.array( 27 | [ 28 | ["101", "102", "400"], 29 | ["101", "103", "400"], 30 | ["101", "102", "103"], 31 | ["101", "104", "400"], 32 | ["101", "106", "404"], 33 | ["404", "404", "404"], 34 | ] 35 | ) 36 | 37 | C = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"] 38 | 39 | click_histories = [ 40 | np.array([["101", "102"]]), 41 | np.array([["105", "106", "400"]]), 42 | np.array([["102", "103", "104"]]), 43 | np.array([["101", "400"]]), 44 | np.array([["400"]]), 45 | np.array([["400"]]), 46 | ] 47 | pairwise_distance_function = cosine_distances 48 | 49 | # TODO: add the test 50 | --------------------------------------------------------------------------------
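The test module above ends with a "# TODO: add the test" placeholder. Below is a minimal, hedged sketch of reference values such a test could assert. It deliberately uses only numpy and scikit-learn (both already imported in the module) and hand-computed expectations from the fixtures R, C, and lookup_dict, because the exact call signatures of IntralistDiversity, Coverage, and the other metric classes are not shown in this dump; wiring these reference values up to those classes would still need to be done once their interfaces are confirmed. The definitions assumed here (intralist diversity as the mean pairwise cosine distance within one recommendation list, coverage as the fraction of candidate items that appear anywhere in the recommendations) are common conventions, not statements about the repository's implementation, and the function names are illustrative only.

import numpy as np
from sklearn.metrics.pairwise import cosine_distances


def test_intralist_diversity_reference():
    # Reference value for the first recommendation list in R: ["101", "102", "400"].
    # Their document vectors in lookup_dict are mutually orthogonal, so every
    # pairwise cosine distance is 1.0 and the mean over the unique pairs is 1.0.
    vectors = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    distances = cosine_distances(vectors)
    unique_pairs = distances[np.triu_indices(distances.shape[0], k=1)]
    assert np.isclose(unique_pairs.mean(), 1.0)


def test_coverage_reference():
    # Reference value for candidate coverage: the fraction of C that occurs in R.
    # Only "101", "102", "103", "104", "106", "400" from C appear in R -> 6 / 12.
    R_local = np.array(
        [
            ["101", "102", "400"],
            ["101", "103", "400"],
            ["101", "102", "103"],
            ["101", "104", "400"],
            ["101", "106", "404"],
            ["404", "404", "404"],
        ]
    )
    C_local = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"]
    covered = np.intersect1d(R_local, np.array(C_local))
    assert np.isclose(len(covered) / len(C_local), 6 / 12)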