├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── baseline
│   │   └── ebnerd_feat_baselines.py
│   ├── beyond_accuracy
│   │   └── make_beyond_accuracy.ipynb
│   ├── datasets
│   │   ├── ebnerd_descriptive_analysis.ipynb
│   │   ├── ebnerd_overview.ipynb
│   │   └── plot
│   │       ├── article_read_time.png
│   │       ├── body_len.png
│   │       ├── category_distribution.png
│   │       ├── category_distribution_ba.png
│   │       ├── front_article_page.png
│   │       ├── front_read_time.png
│   │       ├── inview_len.png
│   │       ├── subtitle_len.png
│   │       └── title_len.png
│   ├── quick_start
│   │   ├── lstur_dummy.py
│   │   ├── make_embedding_artifacts.ipynb
│   │   ├── naml_dummy.py
│   │   ├── npa_dummy.py
│   │   ├── nrms_docvec_dummy.py
│   │   ├── nrms_dummy.py
│   │   ├── nrms_ebnerd.ipynb
│   │   └── nrms_ebnerd.py
│   └── reproducibility_scripts
│       ├── args_nrms.py
│       ├── args_nrms_docvec.py
│       ├── ebnerd_nrms.py
│       ├── ebnerd_nrms_doc_hist.py
│       └── ebnerd_nrms_docvec.py
├── pyproject.toml
├── src
│   ├── __init__.py
│   └── ebrec
│       ├── evaluation
│       │   ├── __init__.py
│       │   ├── _ba_test.py
│       │   ├── beyond_accuracy.py
│       │   ├── metrics
│       │   │   ├── __init__.py
│       │   │   ├── _beyond_accuracy.py
│       │   │   ├── _classification.py
│       │   │   ├── _ranking.py
│       │   │   └── _sklearn.py
│       │   ├── metrics_protocols.py
│       │   ├── protocols.py
│       │   └── utils.py
│       ├── models
│       │   ├── fastformer
│       │   │   ├── __init__.py
│       │   │   ├── dataloader.py
│       │   │   ├── fastformer.py
│       │   │   └── fastformer_wu.py
│       │   └── newsrec
│       │       ├── __init__.py
│       │       ├── base_model.py
│       │       ├── dataloader.py
│       │       ├── layers.py
│       │       ├── lstur.py
│       │       ├── model_config.py
│       │       ├── naml.py
│       │       ├── npa.py
│       │       ├── nrms.py
│       │       ├── nrms_docvec.py
│       │       └── utils.py
│       └── utils
│           ├── __init__.py
│           ├── _articles.py
│           ├── _articles_behaviors.py
│           ├── _behaviors.py
│           ├── _constants.py
│           ├── _decay.py
│           ├── _descriptive_analysis.py
│           ├── _nlp.py
│           ├── _polars.py
│           ├── _python.py
│           └── _torch.py
└── test
    ├── bombing
    │   └── bomb_dataloader.py
    ├── data
    │   └── ebnerd
    │       ├── articles.parquet
    │       ├── behaviors.parquet
    │       ├── document_vector.parquet
    │       └── history.parquet
    ├── dataloader
    │   ├── test_fastformer.py
    │   └── test_newsrec.py
    └── evaluation
        └── test_beyond_accuracy.py
/.gitignore:
--------------------------------------------------------------------------------
1 | share/python-wheels/
2 | pip-wheel-metadata/
3 | .ipynb_checkpoints/
4 | .installed.cfg
5 | develop-eggs/
6 | __pycache__/
7 | *.egg-info/
8 | downloads/
9 | .DS_Store
10 | .Python
11 | wheels/
12 | .vscode
13 | mlruns
14 | build/
15 | .eggs/
16 | lib64/
17 | parts/
18 | sdist/
19 | dist/
20 | eggs/
21 | lib/
22 | var/
23 | *.egg
24 | build
25 | .venv
26 | venv
27 |
28 | # just for now:
29 | evaluate_predictions.py
30 | ebnerd_predictions/
31 | downloads.py
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Ekstra Bladet
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Contributors
2 |
3 |
4 |
5 |
6 | # Introduction
7 | Hello there 👋🏽
8 |
9 | We recommend checking the repository frequently, as we are updating and documenting it along the way!
10 |
11 | ## EBNeRD
12 | Ekstra Bladet Recommender System repository, created for the RecSys'24 Challenge.
13 |
14 | # Getting Started
15 | We recommend [conda](https://docs.conda.io/projects/conda/en/latest/glossary.html#conda-environment) for environment management, and [VS Code](https://code.visualstudio.com/) for development. To install the necessary packages and run the example notebook:
16 |
17 | ```
18 | # 1. Create and activate a new conda environment
19 | conda create -n <environment_name> python=3.11
20 | conda activate <environment_name>
21 |
22 | # 2. Clone this repo within VSCode or using command line:
23 | git clone https://github.com/ebanalyse/ebnerd-benchmark.git
24 |
25 | # 3. Install the core ebrec package to the environment:
26 | pip install .
27 | ```
28 |
29 | We have experienced issues installing *tensorflow* on M1 MacBooks (```sys_platform == 'darwin'```) when using conda. To avoid this, we suggest using venv when running on a MacBook.
30 | ```
31 | python3 -m venv .venv
32 | source .venv/bin/activate
33 | ```
34 |
35 | Alternatively, to create the conda environment inside the project folder as ```.venv```:
36 | ```
37 | conda create -p .venv python==3.11.8
38 | conda activate ./.venv
39 | ```
40 |
41 | ## Running on GPU
42 | ```
43 | tensorflow-gpu; sys_platform == 'linux'
44 | tensorflow-macos; sys_platform == 'darwin'
45 | ```
46 |
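The two lines above are requirement strings with environment markers, i.e. they select the TensorFlow build that matches your platform; they are not pulled in by `pip install .`. A minimal sketch of how one might install them — the file name `requirements-gpu.txt` is just an example and not part of this repository:

```
# requirements-gpu.txt (hypothetical helper file)
tensorflow-gpu; sys_platform == 'linux'
tensorflow-macos; sys_platform == 'darwin'
```

```
pip install -r requirements-gpu.txt
```
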
47 | # Algorithms
48 | To get started quickly, we have implemented several news recommender systems, specifically,
49 | [Neural Recommendation with Long- and Short-term User Representations](https://aclanthology.org/P19-1033/) (LSTUR),
50 | [Neural Recommendation with Personalized Attention](https://arxiv.org/abs/1907.05559) (NPA),
51 | [Neural Recommendation with Attentive Multi-View Learning](https://arxiv.org/abs/1907.05576) (NAML), and
52 | [Neural Recommendation with Multi-Head Self-Attention](https://aclanthology.org/D19-1671/) (NRMS).
53 | The source code originates from the brilliant RS repository, [recommenders](https://github.com/recommenders-team/recommenders). We have simply stripped it of all non-model-related code.
54 |
55 |
56 | # Notebooks
57 | To help you get started, we have created a few notebooks. They are deliberately simple and designed to get you going; we plan to add more at a later stage, such as reproducible model trainings.
58 | The notebooks were made on macOS, and you might need to make small modifications to run them on your system.
59 |
60 | ## Model training
61 | We have created a [notebook](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/examples/00_quick_start/nrms_ebnerd.ipynb) where we train NRMS on EB-NeRD - this is a very simple version using the demo dataset.
62 |
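If you only want to verify that the model builds and trains on your machine before downloading EB-NeRD, a minimal sketch with random inputs (mirroring `examples/quick_start/nrms_dummy.py`) looks like this:

```python
import numpy as np
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec.nrms import NRMSModel

BATCH_SIZE, NPRATIO = 10, 4
word_embeddings = np.random.rand(1000, 100)  # dummy word-embedding matrix
model = NRMSModel(hparams=hparams_nrms, word2vec_embedding=word_embeddings)

# Random token indices for the clicked-history titles and the candidate titles:
his_input_title = np.random.randint(
    0, 1000, (BATCH_SIZE, hparams_nrms.history_size, hparams_nrms.title_size)
)
pred_input_title = np.random.randint(
    0, 1000, (BATCH_SIZE, NPRATIO + 1, hparams_nrms.title_size)
)
# One-hot labels: one clicked article per impression.
label_data = np.zeros((BATCH_SIZE, NPRATIO + 1), dtype=int)
label_data[:, 0] = 1

model.model.fit((his_input_title, pred_input_title), label_data, epochs=1)
model.model.predict((his_input_title, pred_input_title))
```
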
63 | ## Data manipulation and enrichment
64 | In the [dataset_ebnerd](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/examples/00_quick_start/dataset_ebnerd.ipynb) demo, we show how one can join histories and create binary labels.
65 |
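A minimal sketch of that pipeline, using the same helpers as `examples/quick_start/nrms_ebnerd.py` (the path below is a placeholder for wherever you unpacked the data):

```python
from pathlib import Path
import polars as pl

from ebrec.utils._constants import DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from ebrec.utils._polars import slice_join_dataframes

path = Path("~/ebnerd_data/ebnerd_demo/train").expanduser()  # placeholder path

# Truncate each user's click history to the most recent articles:
df_history = (
    pl.scan_parquet(path.joinpath("history.parquet"))
    .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
    .pipe(
        truncate_history,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=20,
        padding_value=0,
        enable_warning=False,
    )
    .collect()
)

# Join the histories onto the impressions, sample negatives, and add binary labels:
df = (
    pl.scan_parquet(path.joinpath("behaviors.parquet"))
    .collect()
    .pipe(slice_join_dataframes, df2=df_history, on=DEFAULT_USER_COL, how="left")
    .pipe(sampling_strategy_wu2019, npratio=4, shuffle=True, with_replacement=True, seed=123)
    .pipe(create_binary_labels_column)
)
print(df.head())
```
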
66 | # Reproduce EB-NeRD Experiments
67 |
68 | Activate your environment:
69 | ```
70 | conda activate <environment_name>
71 | ```
72 |
73 | ### [NRMSModel](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/src/ebrec/models/newsrec/nrms.py)
74 |
75 | ```
76 | python examples/reproducibility_scripts/ebnerd_nrms.py \
77 | --datasplit ebnerd_small \
78 | --epochs 5 \
79 | --bs_train 32 \
80 | --bs_test 32 \
81 | --history_size 20 \
82 | --npratio 4 \
83 | --transformer_model_name FacebookAI/xlm-roberta-large \
84 | --max_title_length 30 \
85 | --head_num 20 \
86 | --head_dim 20 \
87 | --attention_hidden_dim 200 \
88 | --learning_rate 1e-4 \
89 | --dropout 0.20
90 | ```
91 |
92 | TensorBoard:
93 | ```
94 | tensorboard --logdir=ebnerd_predictions/runs
95 | ```
96 |
97 | ### [NRMSDocVec](https://github.com/ebanalyse/ebnerd-benchmark/blob/main/src/ebrec/models/newsrec/nrms_docvec.py)
98 |
99 | ```
100 | python examples/reproducibility_scripts/ebnerd_nrms_docvec.py \
101 | --datasplit ebnerd_small \
102 | --epochs 5 \
103 | --bs_train 32 \
104 | --history_size 20 \
105 | --npratio 4 \
106 | --document_embeddings Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet \
107 | --head_num 16 \
108 | --head_dim 16 \
109 | --attention_hidden_dim 200 \
110 | --newsencoder_units_per_layer 512 512 512 \
111 | --learning_rate 1e-4 \
112 | --dropout 0.2 \
113 | --newsencoder_l2_regularization 1e-4
114 | ```
115 |
116 | TensorBoard:
117 | ```
118 | tensorboard --logdir=ebnerd_predictions/runs
119 | ```
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/examples/baseline/ebnerd_feat_baselines.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from tqdm import tqdm
3 | import polars as pl
4 |
5 | from ebrec.utils._python import (
6 | rank_predictions_by_score,
7 | write_submission_file,
8 | create_lookup_dict,
9 | )
10 | from ebrec.utils._constants import *
11 |
12 | PATH = Path("~/ebnerd_data/ebnerd_testset")
13 |
14 | df_behaviors = pl.scan_parquet(PATH.joinpath("test", "behaviors.parquet"))
15 | df_articles = pl.scan_parquet(PATH.joinpath("articles.parquet"))
16 |
17 | # ==== LOOKUP DICTS
18 | clicked_dict = create_lookup_dict(
19 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_PAGEVIEWS_COL).collect(),
20 | DEFAULT_ARTICLE_ID_COL,
21 | DEFAULT_TOTAL_PAGEVIEWS_COL,
22 | )
23 | inview_dict = create_lookup_dict(
24 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_INVIEWS_COL).collect(),
25 | DEFAULT_ARTICLE_ID_COL,
26 | DEFAULT_TOTAL_INVIEWS_COL,
27 | )
28 | readtime_dict = create_lookup_dict(
29 | df_articles.select(DEFAULT_ARTICLE_ID_COL, DEFAULT_TOTAL_READ_TIME_COL).collect(),
30 | DEFAULT_ARTICLE_ID_COL,
31 | DEFAULT_TOTAL_READ_TIME_COL,
32 | )
33 |
34 | # Estimate:
35 | df_inview_estimate = (
36 | df_behaviors.select(DEFAULT_INVIEW_ARTICLES_COL)
37 | .explode(DEFAULT_INVIEW_ARTICLES_COL)
38 | .select(pl.col(DEFAULT_INVIEW_ARTICLES_COL).value_counts())
39 | .unnest(DEFAULT_INVIEW_ARTICLES_COL)
40 | .collect()
41 | )
42 | inview_dict_estimate = create_lookup_dict(
43 | df_inview_estimate.select(DEFAULT_INVIEW_ARTICLES_COL, "count"),
44 | DEFAULT_INVIEW_ARTICLES_COL,
45 | "count",
46 | )
47 |
48 | # ==== CLICKED PREDICTIONS
49 | CLICKED_SCORE_COL = "clicked_prediction_scores"
50 | INVIEW_SCORE_COL = "inview_prediction_scores"
51 | INVIEW_ESTIMATE_SCORE_COL = "inview_estimate_prediction_scores"
52 | READTIME_SCORE_COL = "readtime_prediction_scores"
53 |
54 | df_predictions = (
55 | df_behaviors.select(DEFAULT_IMPRESSION_ID_COL, DEFAULT_INVIEW_ARTICLES_COL)
56 | .with_columns(
57 | pl.col(DEFAULT_INVIEW_ARTICLES_COL)
58 | .list.eval(pl.element().replace(clicked_dict).fill_null(0))
59 | .alias(CLICKED_SCORE_COL)
60 | )
61 | .with_columns(
62 | pl.col(DEFAULT_INVIEW_ARTICLES_COL)
63 | .list.eval(pl.element().replace(inview_dict).fill_null(0))
64 | .alias(INVIEW_SCORE_COL)
65 | )
66 | .with_columns(
67 | pl.col(DEFAULT_INVIEW_ARTICLES_COL)
68 | .list.eval(pl.element().replace(inview_dict_estimate).fill_null(0))
69 | .alias(INVIEW_ESTIMATE_SCORE_COL)
70 | )
71 | .with_columns(
72 | pl.col(DEFAULT_INVIEW_ARTICLES_COL)
73 | .list.eval(pl.element().replace(readtime_dict).fill_null(0))
74 | .alias(READTIME_SCORE_COL)
75 | )
76 | .collect()
77 | )
78 |
79 | # CONVERT TO RANKS:
80 | impression_id = []
81 | clicked_scores = []
82 | inview_scores = []
83 | inview_estimate_scores = []
84 | readtime_scores = []
85 | for row in tqdm(
86 | df_predictions.iter_rows(named=True),
87 | total=df_predictions.shape[0],
88 | ncols=80,
89 | ):
90 | impression_id.append(row[DEFAULT_IMPRESSION_ID_COL])
91 | clicked_scores.append(rank_predictions_by_score(row[CLICKED_SCORE_COL]))
92 | inview_scores.append(rank_predictions_by_score(row[INVIEW_SCORE_COL]))
93 | inview_estimate_scores.append(
94 | rank_predictions_by_score(row[INVIEW_ESTIMATE_SCORE_COL])
95 | )
96 | readtime_scores.append(rank_predictions_by_score(row[READTIME_SCORE_COL]))
97 |
98 | #
99 | for col, scores in zip(
100 | [
101 | CLICKED_SCORE_COL,
102 | INVIEW_SCORE_COL,
103 | INVIEW_ESTIMATE_SCORE_COL,
104 | READTIME_SCORE_COL,
105 | ],
106 | [clicked_scores, inview_scores, inview_estimate_scores, readtime_scores],
107 | ):
108 | print("Writing submission file for:", col)
109 | Path("downloads").mkdir(exist_ok=True)
110 | write_submission_file(
111 | impression_ids=impression_id,
112 | prediction_scores=scores,
113 | path="downloads/predictions.txt",
114 | filename_zip=f"{col}.zip",
115 | )
116 |
--------------------------------------------------------------------------------
/examples/datasets/plot/article_read_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/article_read_time.png
--------------------------------------------------------------------------------
/examples/datasets/plot/body_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/body_len.png
--------------------------------------------------------------------------------
/examples/datasets/plot/category_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/category_distribution.png
--------------------------------------------------------------------------------
/examples/datasets/plot/category_distribution_ba.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/category_distribution_ba.png
--------------------------------------------------------------------------------
/examples/datasets/plot/front_article_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/front_article_page.png
--------------------------------------------------------------------------------
/examples/datasets/plot/front_read_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/front_read_time.png
--------------------------------------------------------------------------------
/examples/datasets/plot/inview_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/inview_len.png
--------------------------------------------------------------------------------
/examples/datasets/plot/subtitle_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/subtitle_len.png
--------------------------------------------------------------------------------
/examples/datasets/plot/title_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/examples/datasets/plot/title_len.png
--------------------------------------------------------------------------------
/examples/quick_start/lstur_dummy.py:
--------------------------------------------------------------------------------
1 | # TODO make a notebook with it
2 | from ebrec.models.newsrec.model_config import hparams_lstur
3 | from ebrec.models.newsrec.lstur import LSTURModel
4 | import numpy as np
5 |
6 | config = hparams_lstur
7 |
8 | # Define the number of samples in your batch
9 | BATCH_SIZE = 300
10 | HISTORY_SIZE = config.history_size
11 | TITLE_SIZE = config.title_size
12 | NPRATIO = 4
13 | word_embeddings = np.random.rand(1000, 100)
14 |
15 | # Define the shapes of the input data
16 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE)
17 | pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE)
18 | vocab_size = word_embeddings.shape[0]
19 | n_users = config.n_users
20 | label_shape = (NPRATIO + 1,)
21 | user_indexes_shape = (1,)
22 |
23 | model = LSTURModel(hparams=config, word2vec_embedding=word_embeddings)
24 | model.model.summary()
25 |
26 | # Generate random token indices for the clicked-history titles (values in [0, vocab_size))
27 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape))
28 | # Generate random token indices for the candidate titles (values in [0, vocab_size))
29 | pred_input_title = np.random.randint(
30 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape)
31 | )
32 | # Input data for user_indexes
33 | user_indexes = np.random.randint(0, n_users, size=(BATCH_SIZE, *user_indexes_shape))
34 |
35 | # Generate some random label data with values between 0 and 1
36 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int)
37 | for row in label_data:
38 | row[np.random.choice(label_shape[0])] = 1
39 |
40 | # Print the shapes of the input data to verify they match the model's input layers
41 | print(his_input_title.shape)
42 | print(pred_input_title.shape)
43 | print(user_indexes.shape)
44 | print(label_data.shape)
45 |
46 | # Make input for model:
47 | input = (user_indexes, his_input_title, pred_input_title)
48 |
49 | # fit/predict:
50 | model.model.fit(input, label_data)
51 | model.model.predict(input)
52 |
--------------------------------------------------------------------------------
/examples/quick_start/naml_dummy.py:
--------------------------------------------------------------------------------
1 | # TODO make a notebook with it
2 | from ebrec.models.newsrec.model_config import hparams_naml
3 | from ebrec.models.newsrec.naml import NAMLModel
4 | import numpy as np
5 |
6 | config = hparams_naml
7 |
8 | # Define the number of samples in your batch
9 | BATCH_SIZE = 300
10 | NPRATIO = 4
11 | HISTORY_SIZE = config.history_size
12 | TITLE_SIZE = config.title_size
13 | BODY_SIZE = config.body_size
14 |
15 | label_shape = (NPRATIO + 1,)
16 | word_embeddings = np.random.rand(1000, 100)
17 |
18 | vocab_size = word_embeddings.shape[0]
19 | n_verts = config.vert_num
20 | n_subverts = config.subvert_num
21 |
22 | # Model
23 | model = NAMLModel(hparams=config, word2vec_embedding=word_embeddings)
24 | model.model.summary()
25 |
26 | # Generate random input data with the expected shapes
27 | his_input_title = np.random.randint(
28 | 0, vocab_size, size=(BATCH_SIZE, HISTORY_SIZE, TITLE_SIZE)
29 | )
30 | his_input_body = np.random.randint(
31 | 0, vocab_size, size=(BATCH_SIZE, HISTORY_SIZE, BODY_SIZE)
32 | )
33 | his_input_vert = np.random.randint(0, n_verts, size=(BATCH_SIZE, HISTORY_SIZE, 1))
34 | his_input_subvert = np.random.randint(0, n_subverts, size=(BATCH_SIZE, HISTORY_SIZE, 1))
35 | pred_input_title = np.random.randint(
36 | 0, vocab_size, size=(BATCH_SIZE, NPRATIO + 1, TITLE_SIZE)
37 | )
38 | pred_input_body = np.random.randint(
39 | 0, vocab_size, size=(BATCH_SIZE, NPRATIO + 1, BODY_SIZE)
40 | )
41 | pred_input_vert = np.random.randint(0, n_verts, size=(BATCH_SIZE, NPRATIO + 1, 1))
42 | pred_input_subvert = np.random.randint(0, n_subverts, size=(BATCH_SIZE, NPRATIO + 1, 1))
43 |
44 | # Generate some random label data with values between 0 and 1
45 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int)
46 | for row in label_data:
47 | row[np.random.choice(label_shape[0])] = 1
48 |
49 | #
50 | his_input_title.shape
51 | his_input_body.shape
52 | his_input_vert.shape
53 | his_input_subvert.shape
54 | pred_input_title.shape
55 | pred_input_body.shape
56 | pred_input_vert.shape
57 | pred_input_subvert.shape
58 | label_data.shape
59 |
60 | # Make input for model:
61 | input = (
62 | his_input_title,
63 | his_input_body,
64 | his_input_vert,
65 | his_input_subvert,
66 | pred_input_title,
67 | pred_input_body,
68 | pred_input_vert,
69 | pred_input_subvert,
70 | )
71 |
72 | # fit/predict:
73 | model.model.fit(input, label_data)
74 | model.model.predict(input)
75 |
--------------------------------------------------------------------------------
/examples/quick_start/npa_dummy.py:
--------------------------------------------------------------------------------
1 | # TODO make a notebook with it
2 | from ebrec.models.newsrec.model_config import hparams_npa
3 | from ebrec.models.newsrec.npa import NPAModel
4 | import numpy as np
5 |
6 | config = hparams_npa
7 |
8 | # Define the number of samples in your batch
9 | BATCH_SIZE = 300
10 | HISTORY_SIZE = config.history_size
11 | TITLE_SIZE = config.title_size
12 | NPRATIO = 4
13 | word_embeddings = np.random.rand(1000, 100)
14 |
15 | # Define the shapes of the input data
16 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE)
17 | pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE)
18 | vocab_size = word_embeddings.shape[0]
19 | n_users = config.n_users
20 | label_shape = (NPRATIO + 1,)
21 | user_indexes_shape = (1,)
22 |
23 | model = NPAModel(hparams=config)
24 | model.model.summary()
25 |
26 | # Generate random token indices for the clicked-history titles (values in [0, vocab_size))
27 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape))
28 | # Generate random token indices for the candidate titles (values in [0, vocab_size))
29 | pred_input_title = np.random.randint(
30 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape)
31 | )
32 | # Input data for user_indexes
33 | user_indexes = np.random.randint(0, n_users, size=(BATCH_SIZE, *user_indexes_shape))
34 |
35 | # Generate some random label data with values between 0 and 1
36 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int)
37 | for row in label_data:
38 | row[np.random.choice(label_shape[0])] = 1
39 |
40 | # Print the shapes of the input data to verify they match the model's input layers
41 | print(his_input_title.shape)
42 | print(pred_input_title.shape)
43 | print(user_indexes.shape)
44 | print(label_data.shape)
45 |
46 | # Make input for model:
47 | input = (user_indexes, his_input_title, pred_input_title)
48 |
49 | # fit/predict:
50 | model.model.fit(input, label_data)
51 | model.model.predict(input)
52 |
--------------------------------------------------------------------------------
/examples/quick_start/nrms_docvec_dummy.py:
--------------------------------------------------------------------------------
1 | # TODO make a notebook with it
2 | from ebrec.models.newsrec.nrms_docvec import NRMSDocVec
3 | from ebrec.models.newsrec.model_config import hparams_nrms
4 | import numpy as np
5 |
6 | DOCVEC_DIM = 300
7 | BATCH_SIZE = 10
8 | HISTORY_SIZE = 20
9 | NPRATIO = 4
10 |
11 | #
12 | config = hparams_nrms
13 | config.history_size = HISTORY_SIZE
14 | config.title_size = DOCVEC_DIM
15 |
16 | # MODEL:
17 | model = NRMSDocVec(hparams=config, newsencoder_units_per_layer=[512, 512])
18 | model.model.summary()
19 |
20 | #
21 | his_input_title_shape = (HISTORY_SIZE, DOCVEC_DIM)
22 | pred_input_title_shape = (NPRATIO + 1, DOCVEC_DIM)
23 | label_shape = (NPRATIO + 1,)
24 |
25 | # Generate some random input data for input_1
26 | his_input_title = np.array(
27 | [np.random.rand(*his_input_title_shape) for _ in range(BATCH_SIZE)]
28 | )
29 | # Generate some random input data for input_2
30 | pred_input_title = np.array(
31 | [np.random.rand(*pred_input_title_shape) for _ in range(BATCH_SIZE)]
32 | )
33 | # Generate some random label data with values between 0 and 1
34 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int)
35 | for row in label_data:
36 | row[np.random.choice(label_shape[0])] = 1
37 |
38 | # Print the shapes of the input data to verify they match the model's input layers
39 | print(his_input_title.shape)
40 | print(pred_input_title.shape)
41 | print(label_data.shape)
42 |
43 | # Make input for model:
44 | input = (his_input_title, pred_input_title)
45 |
46 | # fit/predict:
47 | model.model.fit(input, label_data, epochs=10)
48 | model.model.predict(input)
49 |
--------------------------------------------------------------------------------
/examples/quick_start/nrms_dummy.py:
--------------------------------------------------------------------------------
1 | # TODO make a notebook with it
2 | from ebrec.models.newsrec.model_config import hparams_nrms
3 | from ebrec.models.newsrec.nrms import NRMSModel
4 | import numpy as np
5 |
6 | config = hparams_nrms
7 |
8 | # Define the number of samples in your batch
9 | BATCH_SIZE = 10
10 | HISTORY_SIZE = config.history_size
11 | TITLE_SIZE = config.title_size
12 | NPRATIO = 4
13 | word_embeddings = np.random.rand(1000, 100)
14 |
15 | model = NRMSModel(hparams=config, word2vec_embedding=word_embeddings)
16 | model.model.summary()
17 |
18 | # Define the shapes of the input data
19 | his_input_title_shape = (HISTORY_SIZE, TITLE_SIZE)
20 | pred_input_title_shape = (NPRATIO + 1, TITLE_SIZE)
21 | label_shape = (NPRATIO + 1,)
22 | vocab_size = word_embeddings.shape[0]
23 |
24 | # Generate random token indices for the clicked-history titles (values in [0, vocab_size))
25 | his_input_title = np.random.randint(0, vocab_size, (BATCH_SIZE, *his_input_title_shape))
26 |
27 | # Generate random token indices for the candidate titles (values in [0, vocab_size))
28 | pred_input_title = np.random.randint(
29 | 0, vocab_size, (BATCH_SIZE, *pred_input_title_shape)
30 | )
31 |
32 | # Generate some random label data with values between 0 and 1
33 | label_data = np.zeros((BATCH_SIZE, *label_shape), dtype=int)
34 | for row in label_data:
35 | row[np.random.choice(label_shape[0])] = 1
36 |
37 | # Print the shapes of the input data to verify they match the model's input layers
38 | print(his_input_title.shape)
39 | print(pred_input_title.shape)
40 | print(label_data.shape)
41 |
42 | # Make input for model:
43 | input = (his_input_title, pred_input_title)
44 |
45 | # fit/predict:
46 | model.model.fit(input, label_data)
47 | model.model.predict(input)
48 |
--------------------------------------------------------------------------------
/examples/quick_start/nrms_ebnerd.py:
--------------------------------------------------------------------------------
1 | from tensorflow.keras.backend import clear_session
2 | from transformers import AutoTokenizer, AutoModel
3 | from pathlib import Path
4 | import tensorflow as tf
5 | import datetime as dt
6 | import polars as pl
7 | import numpy as np
8 | import gc
9 | import os
10 |
11 | from ebrec.utils._constants import (
12 | DEFAULT_HISTORY_ARTICLE_ID_COL,
13 | DEFAULT_IS_BEYOND_ACCURACY_COL,
14 | DEFAULT_CLICKED_ARTICLES_COL,
15 | DEFAULT_INVIEW_ARTICLES_COL,
16 | DEFAULT_IMPRESSION_ID_COL,
17 | DEFAULT_SUBTITLE_COL,
18 | DEFAULT_LABELS_COL,
19 | DEFAULT_TITLE_COL,
20 | DEFAULT_USER_COL,
21 | )
22 |
23 | from ebrec.utils._behaviors import (
24 | create_binary_labels_column,
25 | sampling_strategy_wu2019,
26 | add_known_user_column,
27 | add_prediction_scores,
28 | truncate_history,
29 | )
30 | from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
31 | from ebrec.utils._articles import convert_text2encoding_with_transformers
32 | from ebrec.utils._polars import (
33 | slice_join_dataframes,
34 | concat_str_columns,
35 | chunk_dataframe,
36 | split_df,
37 | )
38 | from ebrec.utils._articles import create_article_id_to_value_mapping
39 | from ebrec.utils._nlp import get_transformers_word_embeddings
40 | from ebrec.utils._python import write_submission_file, rank_predictions_by_score
41 |
42 | from ebrec.models.newsrec.dataloader import NRMSDataLoader, NRMSDataLoaderPretransform
43 | from ebrec.models.newsrec.model_config import hparams_nrms
44 | from ebrec.models.newsrec import NRMSModel
45 |
46 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
47 | gpus = tf.config.experimental.list_physical_devices("GPU")
48 | for gpu in gpus:
49 | tf.config.experimental.set_memory_growth(gpu, True)
50 |
51 | # conda activate ./venv/
52 | # python -i examples/00_quick_start/nrms_ebnerd.py
53 |
54 |
55 | def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
56 | """
57 | Load the EB-NeRD behaviors and join them with each user's (truncated) click history.
58 | """
59 | df_history = (
60 | pl.scan_parquet(path.joinpath("history.parquet"))
61 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
62 | .pipe(
63 | truncate_history,
64 | column=DEFAULT_HISTORY_ARTICLE_ID_COL,
65 | history_size=history_size,
66 | padding_value=0,
67 | enable_warning=False,
68 | )
69 | )
70 | df_behaviors = (
71 | pl.scan_parquet(path.joinpath("behaviors.parquet"))
72 | .collect()
73 | .pipe(
74 | slice_join_dataframes,
75 | df2=df_history.collect(),
76 | on=DEFAULT_USER_COL,
77 | how="left",
78 | )
79 | )
80 | return df_behaviors
81 |
82 |
83 | PATH = Path("~/ebnerd_data").expanduser()
84 | DUMP_DIR = Path("ebnerd_predictions")
85 | DUMP_DIR.mkdir(exist_ok=True, parents=True)
86 | SEED = np.random.randint(0, 1_000)
87 |
88 | MODEL_NAME = f"NRMS-{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}-{SEED}"
89 | # MODEL_NAME = "NRMS-382861963-2024-11-12 01:34:49.050070"
90 |
91 | MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_NAME}/weights")
92 | LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_NAME}")
93 | TEST_DF_DUMP = DUMP_DIR.joinpath("test_predictions", MODEL_NAME)
94 | TEST_DF_DUMP.mkdir(parents=True, exist_ok=True)
95 |
96 | print(f"Dir: {MODEL_NAME}")
97 |
98 | DATASPLIT = "ebnerd_small"
99 | MAX_TITLE_LENGTH = 30
100 | HISTORY_SIZE = 20
101 | FRACTION = 1.0
102 | EPOCHS = 5
103 | FRACTION_TEST = 1.0
104 | #
105 | hparams_nrms.history_size = HISTORY_SIZE
106 |
107 | BATCH_SIZE_TRAIN = 32
108 | BATCH_SIZE_VAL = 32
109 | BATCH_SIZE_TEST_WO_B = 32
110 | BATCH_SIZE_TEST_W_B = 4
111 | N_CHUNKS_TEST = 10
112 | CHUNKS_DONE = 0
113 |
114 | COLUMNS = [
115 | DEFAULT_USER_COL,
116 | DEFAULT_HISTORY_ARTICLE_ID_COL,
117 | DEFAULT_INVIEW_ARTICLES_COL,
118 | DEFAULT_CLICKED_ARTICLES_COL,
119 | DEFAULT_IMPRESSION_ID_COL,
120 | ]
121 |
122 | df_train = (
123 | ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
124 | .sample(fraction=FRACTION)
125 | .select(COLUMNS)
126 | .pipe(
127 | sampling_strategy_wu2019,
128 | npratio=4,
129 | shuffle=True,
130 | with_replacement=True,
131 | seed=SEED,
132 | )
133 | .pipe(create_binary_labels_column)
134 | )
135 | df_train, df_validation = split_df(df_train, fraction=0.9, seed=SEED, shuffle=False)
136 |
137 | # df_test = df_validation
138 | # df_train = df_train[:100]
139 | # df_validation = df_validation[:100]
140 | # df_test = df_test[:100]
141 | df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
142 |
143 | # =>
144 | TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
145 | TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
146 |
147 | # LOAD HUGGINGFACE:
148 | transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
149 | transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
150 |
151 | word2vec_embedding = get_transformers_word_embeddings(transformer_model)
152 | #
153 | df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
154 | df_articles, token_col_title = convert_text2encoding_with_transformers(
155 | df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
156 | )
157 | # =>
158 | article_mapping = create_article_id_to_value_mapping(
159 | df=df_articles, value_col=token_col_title
160 | )
161 |
162 | # =>
163 | print("Init train- and val-dataloader")
164 | train_dataloader = NRMSDataLoaderPretransform(
165 | behaviors=df_train,
166 | article_dict=article_mapping,
167 | unknown_representation="zeros",
168 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
169 | eval_mode=False,
170 | batch_size=BATCH_SIZE_TRAIN,
171 | )
172 | val_dataloader = NRMSDataLoaderPretransform(
173 | behaviors=df_validation,
174 | article_dict=article_mapping,
175 | unknown_representation="zeros",
176 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
177 | eval_mode=False,
178 | batch_size=BATCH_SIZE_VAL,
179 | )
180 |
181 | # CALLBACKS
182 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
183 | early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
184 | modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
185 | filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1
186 | )
187 |
188 | model = NRMSModel(
189 | hparams=hparams_nrms,
190 | word2vec_embedding=word2vec_embedding,
191 | seed=42,
192 | )
193 | hist = model.model.fit(
194 | train_dataloader,
195 | validation_data=val_dataloader,
196 | epochs=EPOCHS,
197 | callbacks=[tensorboard_callback, early_stopping],
198 | )
199 | del (
200 | transformer_tokenizer,
201 | transformer_model,
202 | train_dataloader,
203 | val_dataloader,
204 | df_validation,
205 | df_train,
206 | )
207 | gc.collect()
208 |
209 | print(f"saving model: {MODEL_WEIGHTS}")
210 | model.model.save_weights(MODEL_WEIGHTS)
211 | print(f"loading model: {MODEL_WEIGHTS}")
212 | model.model.load_weights(MODEL_WEIGHTS)
213 |
214 | # =>
215 | print("Init df_test")
216 | df_test = (
217 | ebnerd_from_path(PATH.joinpath("ebnerd_testset", "test"), history_size=HISTORY_SIZE)
218 | .sample(fraction=FRACTION_TEST)
219 | .with_columns(
220 | pl.col(DEFAULT_INVIEW_ARTICLES_COL)
221 | .list.first()
222 | .alias(DEFAULT_CLICKED_ARTICLES_COL)
223 | )
224 | .select(COLUMNS + [DEFAULT_IS_BEYOND_ACCURACY_COL])
225 | .with_columns(
226 | pl.col(DEFAULT_INVIEW_ARTICLES_COL)
227 | .list.eval(pl.element() * 0)
228 | .alias(DEFAULT_LABELS_COL)
229 | )
230 | )
231 | # Split the test set on beyond-accuracy; BA impressions contain many more 'article_ids_inview'.
232 | df_test_wo_beyond = df_test.filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))
233 | df_test_w_beyond = df_test.filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))
234 |
235 | df_test_chunks = chunk_dataframe(df_test_wo_beyond, n_chunks=N_CHUNKS_TEST)
236 | df_pred_test_wo_beyond = []
237 |
238 | for i, df_test_chunk in enumerate(df_test_chunks[CHUNKS_DONE:], start=1 + CHUNKS_DONE):
239 | print(f"Init test-dataloader: {i}/{len(df_test_chunks)}")
240 | # Initialize DataLoader
241 | test_dataloader_wo_b = NRMSDataLoader(
242 | behaviors=df_test_chunk,
243 | article_dict=article_mapping,
244 | unknown_representation="zeros",
245 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
246 | eval_mode=True,
247 | batch_size=BATCH_SIZE_TEST_WO_B,
248 | )
249 | # Predict and clear session
250 | scores = model.scorer.predict(test_dataloader_wo_b)
251 | clear_session()
252 |
253 | # Process the predictions
254 | df_test_chunk = add_prediction_scores(df_test_chunk, scores.tolist()).with_columns(
255 | pl.col("scores")
256 | .map_elements(lambda x: list(rank_predictions_by_score(x)))
257 | .alias("ranked_scores")
258 | )
259 |
260 | # Save the processed chunk
261 | df_test_chunk.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
262 | TEST_DF_DUMP.joinpath(f"pred_wo_ba_{i}.parquet")
263 | )
264 |
265 | # Append and clean up
266 | df_pred_test_wo_beyond.append(df_test_chunk)
267 |
268 | # Cleanup
269 | del df_test_chunk, test_dataloader_wo_b, scores
270 | gc.collect()
271 |
272 | # =>
273 | df_pred_test_wo_beyond = pl.concat(df_pred_test_wo_beyond)
274 | df_pred_test_wo_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
275 | TEST_DF_DUMP.joinpath("pred_wo_ba.parquet")
276 | )
277 |
278 | print("Init test-dataloader: beyond-accuracy")
279 | test_dataloader_w_b = NRMSDataLoader(
280 | behaviors=df_test_w_beyond,
281 | article_dict=article_mapping,
282 | unknown_representation="zeros",
283 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
284 | eval_mode=True,
285 | batch_size=BATCH_SIZE_TEST_W_B,
286 | )
287 | scores = model.scorer.predict(test_dataloader_w_b)
288 | df_pred_test_w_beyond = add_prediction_scores(
289 | df_test_w_beyond, scores.tolist()
290 | ).with_columns(
291 | pl.col("scores")
292 | .map_elements(lambda x: list(rank_predictions_by_score(x)))
293 | .alias("ranked_scores")
294 | )
295 | df_pred_test_w_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
296 | TEST_DF_DUMP.joinpath("pred_w_ba.parquet")
297 | )
298 |
299 | # =>
300 | df_test = pl.concat([df_pred_test_wo_beyond, df_pred_test_w_beyond])
301 | df_test.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
302 | TEST_DF_DUMP.joinpath("pred_concat.parquet")
303 | )
304 | # metrics = MetricEvaluator(
305 | # labels=df_validation["labels"].to_list(),
306 | # predictions=df_validation["scores"].to_list(),
307 | # metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
308 | # )
309 | # metrics.evaluate()
310 |
311 | write_submission_file(
312 | impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL],
313 | prediction_scores=df_test["ranked_scores"],
314 | path=DUMP_DIR.joinpath("predictions.txt"),
315 | filename_zip=f"{DATASPLIT}_predictions-{MODEL_NAME}.zip",
316 | )
317 |
--------------------------------------------------------------------------------
/examples/reproducibility_scripts/args_nrms.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def get_args():
5 | parser = argparse.ArgumentParser(
6 | description="Argument parser for NRMSModel training"
7 | )
8 |
9 | parser.add_argument(
10 | "--data_path",
11 | type=str,
12 | default=str("~/ebnerd_data"),
13 | help="Path to the data directory",
14 | )
15 |
16 | # General settings
17 | parser.add_argument("--seed", type=int, default=123, help="Random seed")
18 | parser.add_argument(
19 | "--datasplit", type=str, default="ebnerd_small", help="Dataset split to use"
20 | )
21 | parser.add_argument("--debug", action="store_true", help="Enable debug mode")
22 |
23 | # Batch sizes
24 | parser.add_argument(
25 | "--bs_train", type=int, default=32, help="Batch size for training"
26 | )
27 | parser.add_argument(
28 | "--bs_test", type=int, default=32, help="Batch size for testing"
29 | )
30 | parser.add_argument(
31 | "--batch_size_test_wo_b",
32 | type=int,
33 | default=32,
34 | help="Batch size for testing without balancing",
35 | )
36 | parser.add_argument(
37 | "--batch_size_test_w_b",
38 | type=int,
39 | default=4,
40 | help="Batch size for testing with balancing",
41 | )
42 |
43 | # History and ratios
44 | parser.add_argument(
45 | "--history_size", type=int, default=20, help="History size for the model"
46 | )
47 | parser.add_argument(
48 | "--npratio", type=int, default=4, help="Negative-positive ratio"
49 | )
50 |
51 | # Training settings
52 | parser.add_argument("--epochs", type=int, default=5, help="Number of epochs")
53 | parser.add_argument(
54 | "--train_fraction",
55 | type=float,
56 | default=1.0,
57 | help="Fraction of training data to use",
58 | )
59 | parser.add_argument(
60 | "--fraction_test",
61 | type=float,
62 | default=1.0,
63 | help="Fraction of testing data to use",
64 | )
65 |
66 | # Model and loader settings
67 | parser.add_argument(
68 | "--nrms_loader",
69 | type=str,
70 | default="NRMSDataLoaderPretransform",
71 | choices=["NRMSDataLoaderPretransform", "NRMSDataLoader"],
72 | help="Data loader type (speed or memory efficient)",
73 | )
74 |
75 | # Chunk processing
76 | parser.add_argument(
77 | "--n_chunks_test", type=int, default=10, help="Number of test chunks to process"
78 | )
79 | parser.add_argument(
80 | "--chunks_done", type=int, default=0, help="Number of chunks already processed"
81 | )
82 |
83 | # =====================================================================================
84 | # ############################# UNIQUE FOR NRMSModel ################################
85 | # =====================================================================================
86 | # Transformer settings
87 | parser.add_argument(
88 | "--transformer_model_name",
89 | type=str,
90 | default="FacebookAI/xlm-roberta-large",
91 | help="Transformer model name",
92 | )
93 | parser.add_argument(
94 | "--max_title_length",
95 | type=int,
96 | default=30,
97 | help="Maximum length of title encoding",
98 | )
99 |
100 | # Hyperparameters
101 | parser.add_argument(
102 | "--head_num", type=int, default=20, help="Number of attention heads"
103 | )
104 | parser.add_argument(
105 | "--head_dim", type=int, default=20, help="Dimension of each attention head"
106 | )
107 | parser.add_argument(
108 | "--attention_hidden_dim",
109 | type=int,
110 | default=200,
111 | help="Dimension of attention hidden layers",
112 | )
113 |
114 | # Optimizer settings
115 | parser.add_argument(
116 | "--optimizer", type=str, default="adam", help="Optimizer to use"
117 | )
118 | parser.add_argument(
119 | "--loss", type=str, default="cross_entropy_loss", help="Loss function"
120 | )
121 | parser.add_argument("--dropout", type=float, default=0.20, help="Dropout rate")
122 | parser.add_argument(
123 | "--learning_rate", type=float, default=1e-4, help="Learning rate"
124 | )
125 |
126 | return parser.parse_args()
127 |
--------------------------------------------------------------------------------
/examples/reproducibility_scripts/args_nrms_docvec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def get_args():
5 | parser = argparse.ArgumentParser(
6 | description="Argument parser for NRMSDocVec training"
7 | )
8 |
9 | parser.add_argument(
10 | "--data_path",
11 | type=str,
12 | default=str("~/ebnerd_data"),
13 | help="Path to the data directory",
14 | )
15 |
16 | # General settings
17 | parser.add_argument("--seed", type=int, default=123, help="Random seed")
18 | parser.add_argument(
19 | "--datasplit", type=str, default="ebnerd_small", help="Dataset split to use"
20 | )
21 | parser.add_argument("--debug", action="store_true", help="Enable debug mode")
22 |
23 | # Batch sizes
24 | parser.add_argument(
25 | "--bs_train", type=int, default=32, help="Batch size for training"
26 | )
27 | parser.add_argument(
28 | "--bs_test", type=int, default=32, help="Batch size for testing"
29 | )
30 | parser.add_argument(
31 | "--batch_size_test_wo_b",
32 | type=int,
33 | default=32,
34 | help="Batch size for testing without balancing",
35 | )
36 | parser.add_argument(
37 | "--batch_size_test_w_b",
38 | type=int,
39 | default=4,
40 | help="Batch size for testing with balancing",
41 | )
42 |
43 | # History and ratios
44 | parser.add_argument(
45 | "--history_size", type=int, default=20, help="History size for the model"
46 | )
47 | parser.add_argument(
48 | "--npratio", type=int, default=4, help="Negative-positive ratio"
49 | )
50 |
51 | # Training settings
52 | parser.add_argument("--epochs", type=int, default=5, help="Number of epochs")
53 | parser.add_argument(
54 | "--train_fraction",
55 | type=float,
56 | default=1.0,
57 | help="Fraction of training data to use",
58 | )
59 | parser.add_argument(
60 | "--fraction_test",
61 | type=float,
62 | default=1.0,
63 | help="Fraction of testing data to use",
64 | )
65 |
66 | # Model and loader settings
67 | parser.add_argument(
68 | "--nrms_loader",
69 | type=str,
70 | default="NRMSDataLoaderPretransform",
71 | choices=["NRMSDataLoaderPretransform", "NRMSDataLoader"],
72 | help="Data loader type (speed or memory efficient)",
73 | )
74 |
75 | # Chunk processing
76 | parser.add_argument(
77 | "--n_chunks_test", type=int, default=10, help="Number of test chunks to process"
78 | )
79 | parser.add_argument(
80 | "--chunks_done", type=int, default=0, help="Number of chunks already processed"
81 | )
82 |
83 | # =====================================================================================
84 | # ############################# UNIQUE FOR NRMSDocVec ###############################
85 | # =====================================================================================
86 |
87 | parser.add_argument(
88 | "--document_embeddings",
89 | type=str,
90 | default="Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet",
91 | help="Path to the document embeddings file",
92 | )
93 | # Model function and architecture
94 | parser.add_argument(
95 | "--title_size", type=int, default=768, help="Size of title encoding"
96 | )
97 | parser.add_argument(
98 | "--head_num", type=int, default=16, help="Number of attention heads"
99 | )
100 | parser.add_argument(
101 | "--head_dim", type=int, default=16, help="Dimension of each attention head"
102 | )
103 | parser.add_argument(
104 | "--attention_hidden_dim",
105 | type=int,
106 | default=200,
107 | help="Dimension of attention hidden layers",
108 | )
109 | parser.add_argument(
110 | "--newsencoder_units_per_layer",
111 | nargs="+",
112 | type=int,
113 | default=[512, 512, 512],
114 | help="List of units per layer in the news encoder",
115 | )
116 |
117 | # Optimizer settings
118 | parser.add_argument(
119 | "--optimizer", type=str, default="adam", help="Optimizer to use"
120 | )
121 | parser.add_argument(
122 | "--loss", type=str, default="cross_entropy_loss", help="Loss function"
123 | )
124 | parser.add_argument("--dropout", type=float, default=0.2, help="Dropout rate")
125 | parser.add_argument(
126 | "--learning_rate", type=float, default=1e-4, help="Learning rate"
127 | )
128 | parser.add_argument(
129 | "--newsencoder_l2_regularization",
130 | type=float,
131 | default=1e-4,
132 | help="L2 regularization for the news encoder",
133 | )
134 |
135 | return parser.parse_args()
136 |
--------------------------------------------------------------------------------
/examples/reproducibility_scripts/ebnerd_nrms_doc_hist.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer, AutoModel
2 | from ebrec.utils._nlp import get_transformers_word_embeddings
3 | from ebrec.utils._articles import convert_text2encoding_with_transformers
4 |
5 | from pathlib import Path
6 | import tensorflow as tf
7 | import datetime as dt
8 | import polars as pl
9 | import shutil
10 | import gc
11 | import os
12 |
13 | from ebrec.utils._constants import *
14 |
15 | from ebrec.utils._behaviors import (
16 | create_binary_labels_column,
17 | sampling_strategy_wu2019,
18 | add_prediction_scores,
19 | truncate_history,
20 | ebnerd_from_path,
21 | )
22 | from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
23 |
24 | from ebrec.utils._python import (
25 | write_submission_file,
26 | rank_predictions_by_score,
27 | write_json_file,
28 | )
29 | from ebrec.utils._articles import create_article_id_to_value_mapping
30 | from ebrec.utils._polars import split_df_chunks, concat_str_columns
31 |
32 | from ebrec.models.newsrec.dataloader import NRMSDataLoader, NRMSDataLoaderPretransform
33 | from ebrec.models.newsrec.model_config import (
34 | hparams_nrms,
35 | hparams_nrms_docvec,
36 | hparams_to_dict,
37 | print_hparams,
38 | )
39 | from ebrec.models.newsrec.nrms_docvec import NRMSDocVec
40 | from ebrec.models.newsrec import NRMSModel
41 |
42 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
43 |
44 | from args_nrms_docvec import get_args
45 |
46 | args = get_args()
47 |
48 | for arg, val in vars(args).items():
49 | print(f"{arg} : {val}")
50 |
51 | PATH = Path(args.data_path).expanduser()
52 | # Access arguments as variables
53 | SEED = args.seed
54 | DATASPLIT = args.datasplit
55 | DEBUG = args.debug
56 | BS_TRAIN = args.bs_train
57 | BS_TEST = args.bs_test
58 | BATCH_SIZE_TEST_WO_B = args.batch_size_test_wo_b
59 | BATCH_SIZE_TEST_W_B = args.batch_size_test_w_b
60 | HISTORY_SIZE = args.history_size
61 | NPRATIO = args.npratio
62 | EPOCHS = args.epochs
63 | TRAIN_FRACTION = args.train_fraction if not DEBUG else 0.0001
64 | FRACTION_TEST = args.fraction_test if not DEBUG else 0.0001
65 |
66 | NRMSLoader_training = (
67 | NRMSDataLoaderPretransform
68 | if args.nrms_loader == "NRMSDataLoaderPretransform"
69 | else NRMSDataLoader
70 | )
71 |
72 | # =====================================================================================
73 | # ############################# UNIQUE FOR NRMSDocVec ###############################
74 | # =====================================================================================
75 |
76 | # Model in use:
77 | model_func = NRMSDocVec
78 | hparams = hparams_nrms_docvec
79 | #
80 | hparams.title_size = args.title_size
81 | hparams.history_size = args.history_size
82 | hparams.head_num = args.head_num
83 | hparams.head_dim = args.head_dim
84 | hparams.attention_hidden_dim = args.attention_hidden_dim
85 | hparams.newsencoder_units_per_layer = args.newsencoder_units_per_layer
86 | hparams.optimizer = args.optimizer
87 | hparams.loss = args.loss
88 | hparams.dropout = args.dropout
89 | hparams.learning_rate = args.learning_rate
90 | hparams.newsencoder_l2_regularization = args.newsencoder_l2_regularization
91 | print_hparams(hparams)
92 |
93 | # =============
94 | # Data-path
95 | DOC_VEC_PATH = PATH.joinpath(f"artifacts/{args.document_embeddings}")
96 | print("Initiating articles...")
97 | df_articles = pl.read_parquet(DOC_VEC_PATH)
98 | article_mapping = create_article_id_to_value_mapping(
99 | df=df_articles, value_col=df_articles.columns[-1]
100 | )
101 |
102 | # =====================================================================================
103 | # ############################# UNIQUE FOR NRMSDocVec ###############################
104 | # =====================================================================================
105 |
106 |
107 | # Dump paths:
108 | DUMP_DIR = Path("ebnerd_predictions")
109 | DUMP_DIR.mkdir(exist_ok=True, parents=True)
110 | #
111 | DT_NOW = dt.datetime.now()
112 | #
113 | MODEL_NAME = model_func.__name__
114 | MODEL_OUTPUT_NAME = f"{MODEL_NAME}-{DT_NOW}"
115 | #
116 | ARTIFACT_DIR = DUMP_DIR.joinpath("test_predictions", MODEL_NAME)
117 | # Model monitoring:
118 | MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_OUTPUT_NAME}/weights")
119 | LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_OUTPUT_NAME}")
120 | # Evaluating the test set can be memory-intensive, so we'll chunk it up:
121 | TEST_CHUNKS_DIR = ARTIFACT_DIR.joinpath("test_chunks")
122 | TEST_CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
123 | N_CHUNKS_TEST = 10
124 | CHUNKS_DONE = 0 # if it crashes, you can start from here.
125 | # Keep the dataframe slim:
126 | COLUMNS = [
127 | DEFAULT_IMPRESSION_TIMESTAMP_COL,
128 | DEFAULT_HISTORY_ARTICLE_ID_COL,
129 | DEFAULT_INVIEW_ARTICLES_COL,
130 | DEFAULT_CLICKED_ARTICLES_COL,
131 | DEFAULT_IMPRESSION_ID_COL,
132 | DEFAULT_USER_COL,
133 | ]
134 | # Store hparams
135 | write_json_file(
136 | hparams_to_dict(hparams),
137 | ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_hparams.json"),
138 | )
139 | write_json_file(vars(args), ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_argparser.json"))
140 |
141 | # =====================================================================================
142 |
143 | df = (
144 | ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
145 | .sample(fraction=TRAIN_FRACTION, shuffle=True, seed=SEED)
146 | .select(COLUMNS)
147 | .pipe(
148 | sampling_strategy_wu2019,
149 | npratio=4,
150 | shuffle=True,
151 | with_replacement=True,
152 | seed=SEED,
153 | )
154 | .pipe(create_binary_labels_column)
155 | )
156 | #
157 | last_dt = df[DEFAULT_IMPRESSION_TIMESTAMP_COL].dt.date().max() - dt.timedelta(days=1)
158 | df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() < last_dt)
159 | df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() >= last_dt)
160 |
161 |
181 | # =====================================================================================
182 | print("Initiating training-dataloader")
183 | train_dataloader = NRMSLoader_training(
184 | behaviors=df_train,
185 | article_dict=article_mapping,
186 | unknown_representation="zeros",
187 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
188 | eval_mode=False,
189 | batch_size=BS_TRAIN,
190 | )
191 |
192 | val_dataloader = NRMSLoader_training(
193 | behaviors=df_validation,
194 | article_dict=article_mapping,
195 | unknown_representation="zeros",
196 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
197 | eval_mode=False,
198 | batch_size=BS_TRAIN,
199 | )
200 |
201 | # =====================================================================================
202 | # CALLBACKS
203 | tensorboard_callback = tf.keras.callbacks.TensorBoard(
204 | log_dir=LOG_DIR,
205 | histogram_freq=1,
206 | )
207 | early_stopping = tf.keras.callbacks.EarlyStopping(
208 | monitor="val_auc",
209 | mode="max",
210 | patience=4,
211 | restore_best_weights=True,
212 | )
213 | modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
214 | filepath=MODEL_WEIGHTS,
215 | monitor="val_auc",
216 | mode="max",
217 | save_best_only=True,
218 | save_weights_only=True,
219 | verbose=1,
220 | )
221 | lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
222 | monitor="val_auc",
223 | mode="max",
224 | factor=0.2,
225 | patience=2,
226 | min_lr=1e-6,
227 | )
228 | callbacks = [tensorboard_callback, early_stopping, modelcheckpoint, lr_scheduler]
229 |
230 | # =====================================================================================
231 | model = model_func(
232 | hparams=hparams,
233 | seed=42,
234 | )
235 | model.model.compile(
236 | optimizer=model.model.optimizer,
237 | loss=model.model.loss,
238 | metrics=["AUC"],
239 | )
240 | f"Initiating {MODEL_NAME}, start training..."
241 | # =>
242 | hist = model.model.fit(
243 | train_dataloader,
244 | validation_data=val_dataloader,
245 | epochs=EPOCHS,
246 | callbacks=callbacks,
247 | )
248 |
249 | print(f"loading model: {MODEL_WEIGHTS}")
250 | model.model.load_weights(MODEL_WEIGHTS)
251 |
252 | # =====================================================================================
253 |
254 | # First filter: only keep users with >FILTER_MIN_HISTORY in history-size
255 | FILTER_MIN_HISTORY = 100
256 | # Truncate the history
257 | HIST_SIZE = 100
258 |
259 | # =>
260 | df = (
261 | ebnerd_from_path(
262 | PATH.joinpath(DATASPLIT, "validation"), history_size=120, padding=None
263 | )
264 | .sample(fraction=FRACTION_TEST)
265 | .filter(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.len() >= FILTER_MIN_HISTORY)
266 | .select(COLUMNS)
267 | .pipe(create_binary_labels_column)
268 | )
269 |
270 | pairs = [
271 | (1, 256),
272 | (2, 256),
273 | (3, 256),
274 | (4, 256),
275 | (5, 256),
276 | (6, 256),
277 | (7, 256),
278 | (8, 256),
279 | (9, 256),
280 | (10, 256),
281 | (15, 128),
282 | (20, 128),
283 | (30, 64),
284 | (40, 64),
285 | (50, 64),
286 | ]
287 |
288 | aucs = []
289 | hists = []
290 | for hist_size, batch_size in pairs:
291 | print(f"History size: {hist_size}, Batch size: {batch_size}")
292 |
293 | df_ = df.pipe(
294 | truncate_history,
295 | column=DEFAULT_HISTORY_ARTICLE_ID_COL,
296 | history_size=hist_size,
297 | padding_value=0,
298 | enable_warning=False,
299 | )
300 |
301 | test_dataloader = NRMSDataLoader(
302 | behaviors=df_,
303 | article_dict=article_mapping,
304 | unknown_representation="zeros",
305 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
306 | eval_mode=True,
307 | batch_size=batch_size,
308 | )
309 |
310 | scores = model.scorer.predict(test_dataloader)
311 |
312 | df_pred = add_prediction_scores(df_, scores.tolist())
313 |
314 | metrics = MetricEvaluator(
315 | labels=df_pred["labels"],
316 | predictions=df_pred["scores"],
317 | metric_functions=[AucScore()],
318 | )
319 | metrics.evaluate()
320 | auc = metrics.evaluations["auc"]
321 | aucs.append(round(auc, 4))
322 | hists.append(hist_size)
323 | print(f"{auc} (History size: {hist_size}, Batch size: {batch_size})")
324 |
325 | for h, a in zip(hists, aucs):
326 | print(f"({a}, {h}),")
327 |
328 | results = {h: a for h, a in zip(hists, aucs)}
329 | write_json_file(results, ARTIFACT_DIR.joinpath("auc_history_length.json"))
330 |
331 | # Clean up
332 | if TEST_CHUNKS_DIR.exists() and TEST_CHUNKS_DIR.is_dir():
333 | shutil.rmtree(TEST_CHUNKS_DIR)
334 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "ebrec"
3 | description = "Ekstra Bladet Benchmark"
4 | version = "0.0.1"
5 | authors = [{ name = "Johannes Kruse", email = "johannes.kruse@eb.dk" }]
6 | requires-python = ">=3.10, <3.12"
7 | dependencies = [
8 | # fastformer:
9 | "transformers>=4.30.0, <4.37.3",
10 | # newsrec:
11 | "tensorflow>=2.12.0, <2.16.0",
12 | # Fastformer; DeepCTR
13 | "torch>=2.0.0, <2.3.0",
14 | # Evaluation:
15 | "scikit-learn==1.4.0",
16 | # GENERAL:
17 | "numpy>=1.24.0, <1.26.1",
18 | "polars==0.20.8",
19 | "pyyaml==6.0.1",
20 | "tqdm",
21 | ]
22 |
23 | [project.optional-dependencies]
24 | # e.g. pip install "ebrec[notebooks]"
25 | # or, as an editable install: pip install -e .'[notebooks]'
26 | notebooks = ["transformers", "jupyter"]
27 | tests = [
28 | "pytest",
29 | "transformers>=4.30.0, <4.37.3",
30 | "tensorflow>=2.12.0, <2.16.0",
31 | "torch>=2.0.0, <2.3.0",
32 | ]
33 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/__init__.py
--------------------------------------------------------------------------------
/src/ebrec/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .metrics_protocols import (
2 | RootMeanSquaredError,
3 | MetricEvaluator,
4 | AccuracyScore,
5 | LogLossScore,
6 | NdcgScore,
7 | AucScore,
8 | F1Score,
9 | MrrScore,
10 | )
11 | from .beyond_accuracy import (
12 | IntralistDiversity,
13 | Distribution,
14 | Serendipity,
15 | Coverage,
16 | Novelty,
17 | )
18 |
--------------------------------------------------------------------------------
/src/ebrec/evaluation/_ba_test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics.pairwise import cosine_distances
3 |
4 | from ebrec.evaluation.beyond_accuracy import (
5 | IntralistDiversity,
6 | Distribution,
7 | Serendipity,
8 | Novelty,
9 | Coverage,
10 | )
11 |
12 | lookup_dict = {
13 | "101": {"doc_vec": np.array([1, 0, 0]), "v": 1, "sv": [1], "pop_sc": 0.50},
14 | "102": {"doc_vec": np.array([0, 1, 0]), "v": 2, "sv": [1], "pop_sc": 0.25},
15 | "103": {"doc_vec": np.array([1, 1, 1]), "v": 3, "sv": [1], "pop_sc": 0.75},
16 | "104": {"doc_vec": np.array([1, 1, 1]), "v": 4, "sv": [1], "pop_sc": 0.50},
17 | "105": {"doc_vec": np.array([-1, 0, 0]), "v": 5, "sv": [1], "pop_sc": 0.94},
18 | "106": {"doc_vec": np.array([-1, 0, 0]), "v": 6, "sv": [1, 2], "pop_sc": 0.95},
19 | "107": {"doc_vec": np.array([-1, 0, 0]), "v": 7, "sv": [1, 2], "pop_sc": 0.96},
20 | "108": {"doc_vec": np.array([0, 0, 1]), "v": 8, "sv": [1, 2], "pop_sc": 0.50},
21 | "400": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4], "pop_sc": 0.20},
22 | "401": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4, 5], "pop_sc": 0.20},
23 | }
24 |
25 | # 404 is not expected in the lookup_dict; however, the setup supports it:
26 | R = np.array(
27 | [
28 | ["101", "102", "400"],
29 | ["101", "103", "400"],
30 | ["101", "102", "103"],
31 | ["101", "104", "400"],
32 | ["101", "106", "404"],
33 | ["404", "404", "404"],
34 | ]
35 | )
36 |
37 | C = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"]
38 |
39 | click_histories = [
40 | np.array([["101", "102"]]),
41 | np.array([["105", "106", "400"]]),
42 | np.array([["102", "103", "104"]]),
43 | np.array([["101", "400"]]),
44 | np.array([["400"]]),
45 | np.array([["400"]]),
46 | ]
47 | pairwise_distance_function = cosine_distances
48 |
49 | # => IntralistDiversity
50 | lookup_key = "doc_vec"
51 | div = IntralistDiversity()
52 | div(R, lookup_dict=lookup_dict, lookup_key=lookup_key)
53 | div._candidate_diversity(
54 | R=C,
55 | n_recommendations=2,
56 | lookup_dict=lookup_dict,
57 | lookup_key=lookup_key,
58 | pairwise_distance_function=pairwise_distance_function,
59 | )
60 |
61 | try:
62 | div._candidate_diversity(C, 7, lookup_dict=lookup_dict, lookup_key=lookup_key)
63 | except ValueError as e:
64 | print(f"Failed - hurra! Error message: \n {e}")
65 |
66 | # => Distribution
67 | dist = Distribution()
68 | dist(R[:2], lookup_dict, "v")
69 | dist(R, lookup_dict, "sv")
70 | dist(C, lookup_dict, "v")
71 | try:
72 | dist(C, lookup_dict, "q")
73 | except ValueError as e:
74 | print(f"Failed - hurra! Error message: \n {e}")
75 |
76 | # => Coverage
77 | cov = Coverage()
78 | cov(R)
79 | cov(R, C)
80 |
81 | # => Serendipity
82 | ser = Serendipity()
83 | ser(
84 | R=R,
85 | H=click_histories,
86 | lookup_dict=lookup_dict,
87 | lookup_key=lookup_key,
88 | pairwise_distance_function=pairwise_distance_function,
89 | )
90 | # np.nan_to_num(ser(R, click_histories, lookup_dict, lookup_key), 0.0)
91 |
92 | # => Novelty
93 | nov = Novelty()
94 | nov(R, lookup_dict=lookup_dict, lookup_key="pop_sc")
95 | nov._candidate_novelty(C, 2, lookup_dict=lookup_dict, lookup_key="pop_sc")
96 |
--------------------------------------------------------------------------------
/src/ebrec/evaluation/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from ._beyond_accuracy import *
2 | from ._classification import *
3 | from ._ranking import *
4 | from ._sklearn import *
5 |
--------------------------------------------------------------------------------
/src/ebrec/evaluation/metrics/_beyond_accuracy.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 | from sklearn.metrics.pairwise import cosine_distances
4 | from collections import Counter
5 | import numpy as np
6 |
7 |
8 | def intralist_diversity(
9 | R: np.ndarray[np.ndarray],
10 | pairwise_distance_function: Callable = cosine_distances,
11 | ) -> float:
12 | """Calculate the intra-list diversity of a recommendation list.
13 |
14 | This function implements the method described by Smyth and McClave (2001) to
15 | measure the diversity within a recommendation list. It calculates the average
16 | pairwise distance between all items in the list.
17 |
18 | Formula:
19 | Diversity(R) = ( sum_{i∈R} sum_{j∈R, j≠i} dist(i, j) ) / ( |R|(|R|-1) )
20 |
21 | where `R` is the recommendation list, and `dist` represents the pairwise distance function used.
22 |
23 | Args:
24 | R (np.ndarray[np.ndarray]): A 2D numpy array where each row represents a recommendation.
25 | This array should be either array-like or a sparse matrix, with shape (n_samples_X, n_features).
26 | pairwise_distance_function (Callable, optional): A function to compute pairwise distance
27 | between samples. Defaults to `cosine_distances`.
28 |
29 | Returns:
30 | float: The calculated diversity score. If the recommendation list contains less than or
31 | equal to one item, NaN is returned to signify an undefined diversity score.
32 |
33 | References:
34 | Smyth, B., McClave, P. (2001). Similarity vs. Diversity. In: Aha, D.W., Watson, I. (eds)
35 | Case-Based Reasoning Research and Development. ICCBR 2001. Lecture Notes in Computer Science(),
36 | vol 2080. Springer, Berlin, Heidelberg. https://doi.org/10.1007/3-540-44593-5_25
37 |
38 | Examples:
39 | >>> R1 = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
40 | >>> print(intralist_diversity(R1))
41 | 0.022588438516842262
42 | >>> print(intralist_diversity(np.array([[0.1, 0.2], [0.1, 0.2]])))
43 | 1.1102230246251565e-16
44 | """
45 | R_n = R.shape[0] # number of recommendations
46 | if R_n <= 1:
47 | # Less than or equal to 1 recommendations in recommendation list
48 | diversity = np.nan
49 | else:
50 | pairwise_distances = pairwise_distance_function(R, R)
51 | diversity = np.sum(pairwise_distances) / (R_n * (R_n - 1))
52 | return diversity
53 |
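# Worked example (illustrative, not part of the original module): for two orthogonal
# unit vectors the cosine distance is 1 in both directions, so the pairwise sum is 2
# and |R|(|R|-1) = 2, giving a diversity of 1.0:
#
#   >>> intralist_diversity(np.array([[1.0, 0.0], [0.0, 1.0]]))
#   1.0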
54 |
55 | def serendipity(
56 | R: np.ndarray[np.ndarray],
57 | H: np.ndarray[np.ndarray],
58 | pairwise_distance_function: Callable = cosine_distances,
59 | ) -> float:
60 | """Calculate the serendipity score between a set of recommendations and user's reading history.
61 |
62 | This function implements the concept of serendipity as defined by Feng Lu, Anca Dumitrache, and David Graus (2020).
63 | Serendipity in this context is measured as the mean distance between the items in the recommendation list and the
64 | user's reading history.
65 |
66 | Formula:
67 | Serendipity(R, H) = ( sum_{i∈R} sum_{j∈H} dist(i, j) ) / ( |R||H| )
68 |
69 | where `R` is the recommendation list, `H` is the user's reading history, and `dist` is the pairwise distance function.
70 |
71 | Args:
72 | R (np.ndarray[np.ndarray]): A 2D numpy array representing the recommendation list, where each row is a recommendation.
73 | It should be either array-like or a sparse matrix, with shape (n_samples_X, n_features).
74 | H (np.ndarray[np.ndarray]): A 2D numpy array representing the user's reading history, with the same format as R.
75 | pairwise_distance_function (Callable, optional): A function to compute pairwise distance between samples.
76 | Defaults to `cosine_distances`.
77 |
78 | Returns:
79 | float: The calculated serendipity score.
80 |
81 | References:
82 | Lu, F., Dumitrache, A., & Graus, D. (2020). Beyond Optimizing for Clicks: Incorporating Editorial Values in News Recommendation.
83 | Retrieved from https://arxiv.org/abs/2004.09980
84 |
85 | Examples:
86 | >>> R1 = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])
87 | >>> H1 = np.array([[0.7, 0.8, 0.9], [0.1, 0.2, 0.3]])
88 | >>> print(serendipity(R1, H1))
89 | 0.016941328887631724
90 | """
91 | # Compute the pairwise distances between each vector:
92 | dists = pairwise_distance_function(R, H)
93 | # Compute serendipity:
94 | return np.mean(dists)
95 |
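# Worked example (illustrative, not part of the original module): with one recommendation
# and a two-item history, the mean of the |R|*|H| = 2 pairwise cosine distances is returned:
#
#   >>> serendipity(np.array([[1.0, 0.0]]), np.array([[0.0, 1.0], [1.0, 0.0]]))
#   0.5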
96 |
97 | def coverage_count(R: np.ndarray) -> int:
98 | """Calculate the number of distinct items in a recommendation list.
99 |
100 | Args:
101 | R (np.ndarray): An array containing the items in the recommendation list.
102 |
103 | Returns:
104 | int: The count of distinct items in the recommendation list.
105 |
106 | Examples:
107 | >>> R1 = np.array([1, 2, 3, 4, 5, 5, 6])
108 | >>> print(coverage_count(R1))
109 | 6
110 | """
111 | # Distinct items:
112 | return np.unique(R).size
113 |
114 |
115 | def coverage_fraction(R: np.ndarray, C: np.ndarray) -> float:
116 | """Calculate the fraction of distinct items in the recommendation list compared to a universal set.
117 |
118 | Args:
119 | R (np.ndarray): An array containing the items in the recommendation list.
120 | C (np.ndarray): An array representing the universal set of items.
121 | It should contain all possible items that can be recommended.
122 |
123 | Returns:
124 | float: The fraction representing the coverage of the recommendation system.
125 | This is calculated as the size of unique elements in R divided by the size of unique elements in C.
126 |
127 | Examples:
128 | >>> R1 = np.array([1, 2, 3, 4, 5, 5, 6])
129 | >>> C1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
130 | >>> print(coverage_fraction(R1, C1)) # Expected output: 0.6
131 | 0.6
132 | """
133 | # Distinct items:
134 | return np.unique(R).size / np.unique(C).size
135 |
136 |
137 | def novelty(R: np.ndarray[float]) -> float:
138 | """Calculate the novelty score of recommendations based on their popularity.
139 |
140 | This function computes the novelty score for a set of recommendations by applying the self-information popularity metric.
141 | It uses the formula described by Zhou et al. (2010) and Vargas and Castells (2011). The novelty is calculated as the
142 | average negative logarithm (base 2) of the popularity scores of the items in the recommendation list.
143 |
144 | Formula:
145 | Novelty(R) = ( sum_{i∈R} -log2( p_i ) ) / |R|
146 |
147 | where p_i represents the popularity score of each item in the recommendation list R, and |R| is the size of R.
148 |
149 | Args:
150 | R (np.ndarray[float]): An array of popularity scores (p_i) for each item in the recommendation list.
151 |
152 | Returns:
153 | float: The calculated novelty score. Higher values indicate less popular (more novel) recommendations.
154 |
155 | References:
156 | Zhou et al. (2010).
157 | Vargas & Castells (2011).
158 |
159 | Examples:
160 | >>> print(novelty([0.1, 0.2, 0.3, 0.4, 0.5])) # Expected: High score (low popularity scores)
161 | 1.9405499757656586
162 | >>> print(novelty([0.9, 0.9, 0.9, 1.0, 0.5])) # Expected: Low score (high popularity scores)
163 | 0.29120185606703
164 | """
165 | return np.mean(-np.log2(R))
166 |
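# Worked example (illustrative, not part of the original module): popularity scores of
# 0.5 and 0.25 yield -log2 values of 1 and 2, so the average novelty is 1.5:
#
#   >>> novelty([0.5, 0.25])
#   1.5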
167 |
168 | def index_of_dispersion(x: list[int]) -> float:
169 | """
170 | Computes the Index of Dispersion (variance-to-mean ratio) for a given dataset of nominal variables.
171 |
172 | The Index of Dispersion is a statistical measure used to quantify the dispersion or variability of a distribution
173 | relative to its mean. It's particularly useful in identifying whether a dataset follows a Poisson distribution,
174 | where the Index of Dispersion would be approximately 1.
175 |
176 | Formula:
177 | D = ( k * (N^2 - Σf^2) ) / ( N^2 * (k-1) )
178 | Where:
179 | k = number of categories in the data set (including categories with zero items),
180 | N = number of items in the set,
181 | f = number of frequencies or ratings,
182 | Σf^2 = sum of squared frequencies/ratings.
183 |
184 | Args:
185 | x (list[int]): A list of integers representing frequencies or counts of occurrences in different categories.
186 | Each integer in the list corresponds to the count of occurrences in a given category.
187 |
188 | Returns:
189 | float: The Index of Dispersion for the dataset. Returns `np.nan` if the input list contains only one item,
190 | indicating an undefined Index of Dispersion. Returns 0 if there's only one category present in the dataset.
191 |
192 | References:
193 | Walker, 1999, Statistics in criminal
194 | Source: https://www.statisticshowto.com/index-of-dispersion/
195 |
196 | Examples:
197 | Given the following categories: Math(25), Economics(42), Chemistry(13), Physical Education (8), Religious Studies (13).
198 | >>> N = np.sum(25+42+13+8+13)
199 | >>> k = 5
200 | >>> sq_f2 = np.sum(25**2 + 42**2 + 13**2 + 8**2 + 13**2)
201 | >>> iod = ( k * (N**2 - sq_f2)) / ( N**2 * (k-1) )
202 | 0.9079992157631604
203 |
204 | Validate method:
205 | >>> cat = [[1]*25, [2]*42, [3]*13, [4]*8, [5]*13]
206 | >>> flat_list = [item for sublist in cat for item in sublist]
207 | >>> index_of_dispersion(flat_list)
208 | 0.9079992157631604
209 | """
210 | # number of items
211 | N = len(x)
212 | # compute frequencies
213 | count = Counter(x)
214 | # number of categories
215 | k = len(count)
216 | if k == 1:
217 | if N == 1:
218 | return np.nan
219 | else:
220 | return 0
221 | # squared frequencies
222 | f_squared = [count.get(f) ** 2 for f in count]
223 | # compute Index of Dispersion
224 | D = k * (N**2 - sum(f_squared)) / (N**2 * (k - 1))
225 | return D
226 |
--------------------------------------------------------------------------------
/src/ebrec/evaluation/metrics/_classification.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def auc_score_custom(y_true: np.ndarray, y_pred: np.ndarray) -> float:
5 | """
6 | Computes the Area Under the Curve (AUC) score for the Receiver Operating Characteristic (ROC) curve using a
7 | custom method. This implementation is particularly useful for understanding basic ROC curve properties and
8 | for educational purposes to demonstrate how AUC scores can be manually calculated.
9 |
10 | This function may produce slightly different results compared to standard library implementations (e.g., sklearn's roc_auc_score)
11 | in cases where positive and negative predictions have the same score. The function treats the problem as a binary classification task,
12 | comparing the prediction scores for positive instances against those for negative instances directly.
13 |
14 | Args:
15 | y_true (np.ndarray): A binary array indicating the true classification (1 for positive class and 0 for negative class).
16 | y_pred (np.ndarray): An array of scores as predicted by a model, indicating the likelihood of each instance being positive.
17 |
18 | Returns:
19 | float: The calculated AUC score, representing the probability that a randomly chosen positive instance is ranked
20 | higher than a randomly chosen negative instance based on the prediction scores.
21 |
22 | Raises:
23 | ValueError: If `y_true` and `y_pred` do not have the same length or if they contain invalid data types.
24 |
25 | Examples:
26 | >>> y_true = np.array([1, 1, 0, 0, 1, 0, 0, 0])
27 | >>> y_pred = np.array([0.9999, 0.9838, 0.5747, 0.8485, 0.8624, 0.4502, 0.3357, 0.8985])
28 | >>> auc_score_custom(y_true, y_pred)
29 | 0.9333333333333333
30 | >>> from sklearn.metrics import roc_auc_score
31 | >>> roc_auc_score(y_true, y_pred)
32 | 0.9333333333333333
33 |
34 | The result can deviate from sklearn's when a positive and a negative prediction share the same score:
35 | >>> y_true = np.array([1, 1, 0, 0, 1, 0, 0, 0])
36 | >>> y_pred = np.array([0.9999, 0.8, 0.8, 0.8485, 0.8624, 0.4502, 0.3357, 0.8985])
37 | >>> auc_score_custom(y_true, y_pred)
38 | 0.7333
39 | >>> roc_auc_score(y_true, y_pred)
40 | 0.7667
41 | """
42 | y_true = np.asarray(y_true)
43 | y_pred = np.asarray(y_pred)
44 |
45 | y_true_bool = y_true.astype(np.bool_)
46 | # Index:
47 | pos_scores = y_pred[y_true_bool]
48 | neg_scores = y_pred[np.logical_not(y_true_bool)]
49 | # Arrange:
50 | pos_scores = np.repeat(pos_scores, len(neg_scores))
51 | neg_scores = np.tile(neg_scores, sum(y_true_bool))
52 | assert len(neg_scores) == len(pos_scores)
53 | return (pos_scores > neg_scores).sum() / len(neg_scores)
54 |
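# Worked check of the docstring example (illustrative, not part of the original module):
# with 3 positives and 5 negatives there are 3 * 5 = 15 (positive, negative) pairs;
# 14 of them rank the positive higher (0.8624 < 0.8985 is the only violation),
# giving 14 / 15 ≈ 0.9333.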
--------------------------------------------------------------------------------
/src/ebrec/evaluation/metrics/_ranking.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def reciprocal_rank_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
5 | """Computes the Mean Reciprocal Rank (MRR) score.
6 |
7 | Args:
8 | y_true (np.ndarray): A 1D array of ground-truth labels. These should be binary (0 or 1),
9 | where 1 indicates the relevant item.
10 | y_pred (np.ndarray): A 1D array of predicted scores. These scores indicate the likelihood
11 | of items being relevant.
12 |
13 | Returns:
14 | float: The reciprocal rank (RR) score.
15 |
16 | Note:
17 | Both `y_true` and `y_pred` should be 1D arrays of the same length.
18 | The function assumes higher scores in `y_pred` indicate higher relevance.
19 |
20 | Examples:
21 | >>> y_true_1 = np.array([0, 0, 1])
22 | >>> y_pred_1 = np.array([0.5, 0.2, 0.1])
23 | >>> reciprocal_rank_score(y_true_1, y_pred_1)
24 | 0.33
25 |
26 | >>> y_true_2 = np.array([0, 1, 1])
27 | >>> y_pred_2 = np.array([0.5, 0.2, 0.1])
28 | >>> reciprocal_rank_score(y_true_2, y_pred_2)
29 | 0.5
30 |
31 | >>> y_true_3 = np.array([1, 1, 0])
32 | >>> y_pred_3 = np.array([0.5, 0.2, 0.1])
33 | >>> reciprocal_rank_score(y_true_3, y_pred_3)
34 | 1.0
35 |
36 | >>> np.mean(
37 | [
38 | reciprocal_rank_score(y_true, y_pred)
39 | for y_true, y_pred in zip(
40 | [y_true_1, y_true_2, y_true_3], [y_pred_1, y_pred_2, y_pred_3]
41 | )
42 | ]
43 | )
44 | 0.61
45 | mrr_score([y_true_1, y_true_2, y_true_3], [y_pred_1, y_pred_2, y_pred_3])
46 | """
47 | order = np.argsort(y_pred)[::-1]
48 | y_true = np.take(y_true, order)
49 | first_positive_rank = np.argmax(y_true) + 1
50 | return 1.0 / first_positive_rank
51 |
52 |
53 | def dcg_score(y_true: np.ndarray, y_pred: np.ndarray, k: int = 10) -> float:
54 | """
55 | Compute the Discounted Cumulative Gain (DCG) score at a particular rank `k`.
56 |
57 | Args:
58 | y_true (np.ndarray): A 1D or 2D array of ground-truth relevance labels.
59 | Each element should be a non-negative integer.
60 | y_pred (np.ndarray): A 1D or 2D array of predicted scores. Each element is
61 | a score corresponding to the predicted relevance.
62 | k (int, optional): The rank at which the DCG score is calculated. Defaults
63 | to 10. If `k` is larger than the number of elements, it
64 | will be truncated to the number of elements.
65 |
66 | Note:
67 | In case of a 2D array, each row represents a different sample.
68 |
69 | Returns:
70 | float: The calculated DCG score for the top `k` elements.
71 |
72 | Raises:
73 | ValueError: If `y_true` and `y_pred` have different shapes.
74 |
75 | Examples:
76 | >>> from sklearn.metrics import dcg_score as dcg_score_sklearn
77 | >>> y_true = np.array([1, 0, 0, 1, 0])
78 | >>> y_pred = np.array([0.5, 0.2, 0.1, 0.8, 0.4])
79 | >>> dcg_score(y_true, y_pred)
80 | 1.6309297535714575
81 | >>> dcg_score_sklearn([y_true], [y_pred])
82 | 1.6309297535714573
83 | """
84 | k = min(np.shape(y_true)[-1], k)
85 | order = np.argsort(y_pred)[::-1]
86 | y_true = np.take(y_true, order[:k])
87 | gains = 2**y_true - 1
88 | discounts = np.log2(np.arange(len(y_true)) + 2)
89 | return np.sum(gains / discounts)
90 |
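# Worked check of the docstring example (illustrative, not part of the original module):
# sorting by y_pred gives the relevance order [1, 1, 0, 0, 0]; gains 2**rel - 1 = [1, 1, 0, 0, 0]
# and discounts log2([2, 3, 4, 5, 6]) yield DCG = 1/1 + 1/log2(3) ≈ 1.6309.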
91 |
92 | def ndcg_score(y_true: np.ndarray, y_pred: np.ndarray, k: int = 10) -> float:
93 | """
94 | Compute the Normalized Discounted Cumulative Gain (NDCG) score at a rank `k`.
95 |
96 | Args:
97 | y_true (np.ndarray): A 1D or 2D array of ground-truth relevance labels.
98 | Each element should be a non-negative integer. In case
99 | of a 2D array, each row represents a different sample.
100 | y_pred (np.ndarray): A 1D or 2D array of predicted scores. Each element is
101 | a score corresponding to the predicted relevance. The
102 | array should have the same shape as `y_true`.
103 | k (int, optional): The rank at which the NDCG score is calculated. Defaults
104 | to 10. If `k` is larger than the number of elements, it
105 | will be truncated to the number of elements.
106 |
107 | Returns:
108 | float: The calculated NDCG score for the top `k` elements. The score ranges
109 | from 0 to 1, with 1 representing the perfect ranking.
110 |
111 | Examples:
112 | >>> from sklearn.metrics import ndcg_score as ndcg_score_sklearn
113 | >>> y_true = np.array([1, 0, 0, 1, 0])
114 | >>> y_pred = np.array([0.1, 0.2, 0.1, 0.8, 0.4])
115 | >>> ndcg_score([y_true], [y_pred])
116 | 0.863780110436402
117 | >>> ndcg_score_sklearn([y_true], [y_pred])
118 | 0.863780110436402
119 | >>>
120 | """
121 | best = dcg_score(y_true, y_true, k)
122 | actual = dcg_score(y_true, y_pred, k)
123 | return actual / best
124 |
125 |
126 | def mrr_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
127 | """Computes the Mean Reciprocal Rank (MRR) score.
128 |
129 | NOTE: this implementation may not be entirely correct; to be determined, see:
130 | - https://github.com/recommenders-team/recommenders/issues/2141
131 |
132 | Args:
133 | y_true (np.ndarray): A 1D array of ground-truth labels. These should be binary (0 or 1),
134 | where 1 indicates the relevant item.
135 | y_pred (np.ndarray): A 1D array of predicted scores. These scores indicate the likelihood
136 | of items being relevant.
137 |
138 | Returns:
139 | float: The mean reciprocal rank (MRR) score.
140 |
141 | Note:
142 | Both `y_true` and `y_pred` should be 1D arrays of the same length.
143 | The function assumes higher scores in `y_pred` indicate higher relevance.
144 |
145 | Examples:
146 | >>> y_true = np.array([[1, 0, 0, 1, 0]])
147 | >>> y_pred = np.array([[0.5, 0.2, 0.1, 0.8, 0.4]])
148 | >>> mrr_score(y_true, y_pred)
149 | 0.75
150 |
151 | """
152 | order = np.argsort(y_pred)[::-1]
153 | y_true = np.take(y_true, order)
154 | rr_score = y_true / (np.arange(len(y_true)) + 1)
155 | return np.sum(rr_score) / np.sum(y_true)
156 |
--------------------------------------------------------------------------------
/src/ebrec/evaluation/metrics/_sklearn.py:
--------------------------------------------------------------------------------
1 | try:
2 | from sklearn.metrics import (
3 | # _regression:
4 | mean_squared_error,
5 | # _ranking:
6 | roc_auc_score,
7 | # _classification:
8 | accuracy_score,
9 | f1_score,
10 | log_loss,
11 | )
12 | except ImportError:
13 | print("sklearn not available")
14 |
--------------------------------------------------------------------------------
/src/ebrec/evaluation/metrics_protocols.py:
--------------------------------------------------------------------------------
1 | from itertools import compress
2 | from typing import Iterable
3 | from tqdm import tqdm
4 | import numpy as np
5 | import json
6 |
7 | from ebrec.evaluation.utils import convert_to_binary
8 | from ebrec.evaluation.protocols import Metric
9 |
10 | from ebrec.evaluation.metrics import (
11 | mean_squared_error,
12 | accuracy_score,
13 | roc_auc_score,
14 | ndcg_score,
15 | mrr_score,
16 | log_loss,
17 | f1_score,
18 | )
19 |
20 |
21 | class AccuracyScore(Metric):
22 | def __init__(self, threshold: float = 0.5):
23 | self.threshold = threshold
24 | self.name = "accuracy"
25 |
26 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float:
27 | res = np.mean(
28 | [
29 | accuracy_score(
30 | each_labels, convert_to_binary(each_preds, self.threshold)
31 | )
32 | for each_labels, each_preds in tqdm(
33 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="Accuracy"
34 | )
35 | ]
36 | )
37 | return float(res)
38 |
39 |
40 | class F1Score(Metric):
41 | def __init__(self, threshold: float = 0.5):
42 | self.threshold = threshold
43 | self.name = "f1"
44 |
45 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float:
46 | res = np.mean(
47 | [
48 | f1_score(each_labels, convert_to_binary(each_preds, self.threshold))
49 | for each_labels, each_preds in tqdm(
50 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="F1"
51 | )
52 | ]
53 | )
54 | return float(res)
55 |
56 |
57 | class RootMeanSquaredError(Metric):
58 | def __init__(self):
59 | self.name = "rmse"
60 |
61 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float:
62 | res = np.mean(
63 | [
64 | np.sqrt(mean_squared_error(each_labels, each_preds))
65 | for each_labels, each_preds in tqdm(
66 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="RMSE"
67 | )
68 | ]
69 | )
70 | return float(res)
71 |
72 |
73 | class AucScore(Metric):
74 | def __init__(self):
75 | self.name = "auc"
76 |
77 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float:
78 | res = np.mean(
79 | [
80 | roc_auc_score(each_labels, each_preds)
81 | for each_labels, each_preds in tqdm(
82 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="AUC"
83 | )
84 | ]
85 | )
86 | return float(res)
87 |
88 |
89 | class LogLossScore(Metric):
90 | def __init__(self):
91 | self.name = "logloss"
92 |
93 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float:
94 | res = np.mean(
95 | [
96 | log_loss(
97 | each_labels,
98 | [max(min(p, 1.0 - 10e-12), 10e-12) for p in each_preds],
99 | )
100 | for each_labels, each_preds in tqdm(
101 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="LogLoss"
102 | )
103 | ]
104 | )
105 | return float(res)
106 |
107 |
108 | class MrrScore(Metric):
109 | def __init__(self):
110 | self.name = "mrr"
111 |
112 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float:
113 | mean_mrr = np.mean(
114 | [
115 | mrr_score(each_labels, each_preds)
116 | for each_labels, each_preds in tqdm(
117 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="MRR"
118 | )
119 | ]
120 | )
121 | return float(mean_mrr)
122 |
123 |
124 | class NdcgScore(Metric):
125 | def __init__(self, k: int):
126 | self.k = k
127 | self.name = f"ndcg@{k}"
128 |
129 | def calculate(self, y_true: list[np.ndarray], y_pred: list[np.ndarray]) -> float:
130 | res = np.mean(
131 | [
132 | ndcg_score(each_labels, each_preds, self.k)
133 | for each_labels, each_preds in tqdm(
134 | zip(y_true, y_pred), ncols=80, total=len(y_true), desc="NDCG"
135 | )
136 | ]
137 | )
138 | return float(res)
139 |
140 |
141 | class MetricEvaluator:
142 | """
143 | >>> y_true = [[1, 0, 0], [1, 1, 0], [1, 0, 0, 0]]
144 | >>> y_pred = [[0.2, 0.3, 0.5], [0.18, 0.7, 0.1], [0.18, 0.2, 0.1, 0.1]]
145 |
146 | >>> met_eval = MetricEvaluator(
147 | labels=y_true,
148 | predictions=y_pred,
149 | metric_functions=[
150 | AucScore(),
151 | MrrScore(),
152 | NdcgScore(k=5),
153 | NdcgScore(k=10),
154 | LogLossScore(),
155 | RootMeanSquaredError(),
156 | AccuracyScore(threshold=0.5),
157 | F1Score(threshold=0.5),
158 | ],
159 | )
160 | >>> met_eval.evaluate()
161 | {
162 | "auc": 0.5555555555555556,
163 | "mrr": 0.5277777777777778,
164 | "ndcg@5": 0.7103099178571526,
165 | "ndcg@10": 0.7103099178571526,
166 | "logloss": 0.716399020295845,
167 | "rmse": 0.5022870658128165
168 | "accuracy": 0.5833333333333334,
169 | "f1": 0.2222222222222222
170 | }
171 | """
172 |
173 | def __init__(
174 | self,
175 | labels: list[np.ndarray],
176 | predictions: list[np.ndarray],
177 | metric_functions: list[Metric],
178 | ):
179 | self.labels = labels
180 | self.predictions = predictions
181 | self.metric_functions = metric_functions
182 | self.evaluations = dict()
183 |
184 | def evaluate(self) -> "MetricEvaluator":
185 | self.evaluations = {
186 | metric_function.name: metric_function(self.labels, self.predictions)
187 | for metric_function in self.metric_functions
188 | }
189 | return self
190 |
191 | @property
192 | def metric_functions(self):
193 | return self.__metric_functions
194 |
195 | @metric_functions.setter
196 | def metric_functions(self, values):
197 | invalid_callables = self.__invalid_callables(values)
198 | if not any(invalid_callables) and invalid_callables:
199 | self.__metric_functions = values
200 | else:
201 | invalid_objects = list(compress(values, invalid_callables))
202 | invalid_types = [type(item) for item in invalid_objects]
203 | raise TypeError(f"Following object(s) are not callable: {invalid_types}")
204 |
205 | @staticmethod
206 | def __invalid_callables(iter: Iterable):
207 | return [not callable(item) for item in iter]
208 |
209 | def __str__(self):
210 | if self.evaluations:
211 | evaluations_json = json.dumps(self.evaluations, indent=4)
212 | return f": \n {evaluations_json}"
213 | else:
214 | return f": {self.evaluations}"
215 |
216 | def __repr__(self):
217 | return str(self)
218 |
--------------------------------------------------------------------------------
/src/ebrec/evaluation/protocols.py:
--------------------------------------------------------------------------------
1 | from typing import Protocol
2 | import numpy as np
3 |
4 |
5 | class Metric(Protocol):
6 | name: str
7 |
8 | def calculate(self, y_true: np.ndarray, y_score: np.ndarray) -> float: ...
9 |
10 | def __str__(self) -> str:
11 | return f": params: {self.__dict__}"
12 |
13 | def __repr__(self) -> str:
14 | return str(self)
15 |
16 | def __call__(self, y_true: np.ndarray, y_score: np.ndarray) -> float:
17 | return self.calculate(y_true, y_score)
18 |
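# Illustrative sketch (not part of the original module): a hypothetical custom metric that
# subclasses Metric only needs a `name` attribute and a `calculate` method; `__call__` is
# inherited, so it can be passed directly to MetricEvaluator alongside the built-in metrics:
#
#   class HitRateAt1(Metric):
#       def __init__(self):
#           self.name = "hitrate@1"
#
#       def calculate(self, y_true, y_pred) -> float:
#           # Fraction of impressions whose top-scored item is a positive:
#           return float(
#               np.mean(
#                   [labels[np.argmax(preds)] for labels, preds in zip(y_true, y_pred)]
#               )
#           )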
--------------------------------------------------------------------------------
/src/ebrec/evaluation/utils.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from typing import Iterable
3 | import numpy as np
4 |
5 |
6 | def convert_to_binary(y_pred: np.ndarray, threshold: float):
7 | y_pred = np.asarray(y_pred)
8 | y_pred[y_pred >= threshold] = 1
9 | y_pred[y_pred < threshold] = 0
10 | return y_pred
11 |
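# Example (illustrative): convert_to_binary(np.array([0.2, 0.7]), threshold=0.5)
# returns array([0., 1.]): scores at or above the threshold map to 1, the rest to 0.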
12 |
13 | def is_iterable_nested_dtype(iterable: Iterable[any], dtypes) -> bool:
14 | """
15 | Check whether the elements of the iterable are of the given dtype(s);
16 | note, only the first element is inspected, assuming all elements share the same type.
17 | To check all elements: any(isinstance(i, dtypes) for i in iterable)
18 |
19 | Args:
20 | iterable (Iterable[Any]): iterable (list, array, tuple) of any type of data
21 | dtypes (Tuple): tuple of possible dtypes, e.g. dtypes = (list, np.ndarray)
22 | Returns:
23 | bool: boolean whether it is true or false
24 |
25 | Examples:
26 | >>> is_iterable_nested_dtype([1, 2, 3], list)
27 | False
28 | >>> is_iterable_nested_dtype([1, 2, 3], (list, int))
29 | True
30 | >>> is_iterable_nested_dtype([[1], [2], [3]], list)
31 | True
32 | """
33 | return isinstance(iterable[0], dtypes)
34 |
35 |
36 | def compute_combinations(n: int, r: int) -> int:
37 | """Compute Combinations where order does not matter (without replacement)
38 |
39 | Source: https://www.statskingdom.com/combinations-calculator.html
40 | Args:
41 | n (int): number of items
42 | r (int): number of items being chosen at a time
43 | Returns:
44 | int: number of possible combinations
45 |
46 | Formula:
47 | * nCr = n! / ( (n - r)! * r! )
48 |
49 | Assume the following:
50 | * we sample without replacement of items
51 | * order of the outcomes does NOT matter
52 | """
53 | return int(
54 | (np.math.factorial(n)) / (np.math.factorial(n - r) * np.math.factorial(r))
55 | )
56 |
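# Example (illustrative): compute_combinations(5, 2) == 10, since 5! / (3! * 2!) = 120 / (6 * 2) = 10.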
57 |
58 | def scale_range(
59 | m: np.ndarray,
60 | r_min: float = None,
61 | r_max: float = None,
62 | t_min: float = 0,
63 | t_max: float = 1.0,
64 | ) -> np.ndarray:
65 | """Scale an array between a range
66 | Source: https://stats.stackexchange.com/questions/281162/scale-a-number-between-a-range
67 |
68 | m -> ((m-r_min)/(r_max-r_min)) * (t_max-t_min) + t_min
69 |
70 | Args:
71 | m ∈ [r_min,r_max] denote your measurements to be scaled
72 | r_min denote the minimum of the range of your measurement
73 | r_max denote the maximum of the range of your measurement
74 | t_min denote the minimum of the range of your desired target scaling
75 | t_max denote the maximum of the range of your desired target scaling
76 | """
77 | if r_min is None:
78 | r_min = np.min(m)
79 | if r_max is None:
80 | r_max = np.max(m)
81 | return ((m - r_min) / (r_max - r_min)) * (t_max - t_min) + t_min
82 |
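# Example (illustrative): scale_range(np.array([0.0, 5.0, 10.0])) returns
# array([0. , 0.5, 1. ]), i.e. the measurements mapped onto the default target range [0, 1].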
83 |
84 | # Utils for the beyond-accuracy computations:
85 | def compute_item_popularity_scores(R: Iterable[np.ndarray]) -> dict[str, float]:
86 | """Compute popularity scores for items based on their occurrence in user interactions.
87 |
88 | This function calculates the popularity score of each item as the fraction of users who have interacted with that item.
89 | The popularity score, p_i, for an item is defined as the number of users who have interacted with the item divided by the
90 | total number of users.
91 |
92 | Formula:
93 | p_i = | {u ∈ U}, r_ui != Ø | / |U|
94 |
95 | where p_i is the popularity score of an item, U is the total number of users, and r_ui is the interaction of user u with item i (non-zero
96 | interaction implies the user has seen the item).
97 |
98 | Note:
99 | Each user's array should contain a given item at most once; duplicate entries would inflate that item's popularity score.
100 |
101 | Args:
102 | R (Iterable[np.ndarray]): An iterable of numpy arrays, where each array represents the items interacted with by a single user.
103 | Each element in the array should be a string identifier for an item.
104 |
105 | Returns:
106 | dict[str, float]: A dictionary where keys are item identifiers and values are their corresponding popularity scores (as floats).
107 |
108 | Examples:
109 | >>> R = [
110 | np.array(["item1", "item2", "item3"]),
111 | np.array(["item1", "item3"]),
112 | np.array(["item1", "item4"]),
113 | ]
114 | >>> print(compute_item_popularity_scores(R))
115 | {'item1': 1.0, 'item2': 0.3333333333333333, 'item3': 0.6666666666666666, 'item4': 0.3333333333333333}
116 | """
117 | U = len(R)
118 | R_flatten = np.concatenate(R)
119 | item_counts = Counter(R_flatten)
120 | return {item: (r_ui / U) for item, r_ui in item_counts.items()}
121 |
122 |
123 | def compute_normalized_distribution(
124 | R: np.ndarray[str],
125 | weights: np.ndarray[float] = None,
126 | distribution: dict[str, float] = None,
127 | ) -> dict[str, float]:
128 | """
129 | Compute a normalized weighted distribution for a list of items, where each item has a single representation assigned.
130 |
131 | Args:
132 | R (np.ndarray[str]): an array of item representations.
133 | weights (np.ndarray[float], optional): weights to assign each element in a. Defaults to None.
134 | * Requires: len(weights) == len(R)
135 | distribution (Dict[str, float], optional): dictionary to assign the distribution values, if None it will be generated as {}. Defaults to None.
136 | * Use case; if you want to add distribution values to existing, one can input it.
137 |
138 | Returns:
139 | Dict[str, float]: dictionary with normalized distribution values
140 |
141 | Examples:
142 | >>> a = np.array(["a", "b", "c", "c"])
143 | >>> compute_normalized_distribution(a)
144 | {'a': 0.25, 'b': 0.25, 'c': 0.5}
145 | """
146 | n_elements = len(R)
147 |
148 | distr = distribution if distribution is not None else {}
149 | weights = weights if weights is not None else np.ones(n_elements) / n_elements
150 | for item, weight in zip(R, weights):
151 | distr[item] = weight + distr.get(item, 0.0)
152 | return distr
153 |
154 |
155 | def get_keys_in_dict(id_list: any, dictionary: dict) -> list[any]:
156 | """
157 | Returns a list of IDs from id_list that are keys in the dictionary.
158 | Args:
159 | id_list (List[Any]): List of IDs to check against the dictionary.
160 | dictionary (Dict[Any, Any]): Dictionary where keys are checked against the IDs.
161 |
162 | Returns:
163 | List[Any]: List of IDs that are also keys in the dictionary.
164 |
165 | Examples:
166 | >>> get_keys_in_dict(['a', 'b', 'c'], {'a': 1, 'c': 3, 'd': 4})
167 | ['a', 'c']
168 | """
169 | return [id_ for id_ in id_list if id_ in dictionary]
170 |
171 |
172 | def check_key_in_all_nested_dicts(dictionary: dict, key: str) -> None:
173 | """
174 | Checks if the given key is present in all nested dictionaries within the main dictionary.
175 | Raises a ValueError if the key is not found in any of the nested dictionaries.
176 |
177 | Args:
178 | dictionary (dict): The dictionary containing nested dictionaries to check.
179 | key (str): The key to look for in all nested dictionaries.
180 |
181 | Raises:
182 | ValueError: If the key is not present in any of the nested dictionaries.
183 |
184 | Example:
185 | >>> nested_dict = {
186 | "101": {"name": "Alice", "age": 30},
187 | "102": {"name": "Bob", "age": 25},
188 | }
189 | >>> check_key_in_all_nested_dicts(nested_dict, "age")
190 | # No error is raised
191 | >>> check_key_in_all_nested_dicts(nested_dict, "salary")
192 | # Raises ValueError: "'salary' is not present in '101' nested dictionary."
193 | """
194 | for dict_key, sub_dict in dictionary.items():
195 | if not isinstance(sub_dict, dict) or key not in sub_dict:
196 | raise ValueError(
197 | f"'{key}' is not present in '{dict_key}' nested dictionary."
198 | )
199 |
--------------------------------------------------------------------------------
/src/ebrec/models/fastformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/ebrec/models/fastformer/__init__.py
--------------------------------------------------------------------------------
/src/ebrec/models/fastformer/dataloader.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from tqdm import tqdm
3 | import polars as pl
4 | import numpy as np
5 |
6 | from torch.utils.tensorboard import SummaryWriter
7 | from torch.utils.data import DataLoader
8 | from torch.utils.data import Dataset
9 | import torch.optim as optim
10 | import torch.nn as nn
11 | import torch
12 |
13 | from ebrec.utils._constants import DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_LABELS_COL
14 |
15 | from ebrec.utils._python import (
16 | repeat_by_list_values_from_matrix,
17 | convert_to_nested_list,
18 | create_lookup_objects,
19 | )
20 | from ebrec.utils._articles_behaviors import map_list_article_id_to_value
21 | from ebrec.utils._polars import shuffle_rows
22 |
23 | from ebrec.evaluation import AucScore
24 | from ebrec.utils._torch import save_checkpoint
25 |
26 |
27 | @dataclass
28 | class FastformerDataset(Dataset):
29 | """_summary_
30 | The batch-size is aggragating multiple impressions and processing them simultaneous, which
31 | has a major effect on the training time. Hence, you should put the batch_size=1 in the 'DataLoader'
32 | and just use FastformerDataset batch_size.
33 |
34 | Note, the outut is then (1, output_shape), where the 1 is the DataLoader batch_size.
35 | """
36 |
37 | behaviors: pl.DataFrame
38 | history_column: str
39 | article_dict: dict[int, pl.Series]
40 | batch_size: int = 64
41 | shuffle: bool = True
42 | device: str = "cpu"
43 | seed: int = None
44 | labels_col: str = DEFAULT_LABELS_COL
45 | inview_col: str = DEFAULT_INVIEW_ARTICLES_COL
46 | n_samples_col: str = "n_samples"
47 |
48 | def __post_init__(self):
49 | self.unknown_index = [0]
50 | if self.shuffle:
51 | self.behaviors = shuffle_rows(self.behaviors, seed=self.seed)
52 | self.behaviors = self.behaviors.with_columns(
53 | pl.col(self.labels_col).list.len().alias(self.n_samples_col)
54 | )
55 | self.lookup_indexes, self.lookup_matrix = create_lookup_objects(
56 | self.article_dict, unknown_representation="zeros"
57 | )
58 |
59 | def __len__(self):
60 | """
61 | Number of batch steps in the data
62 | """
63 | return int(np.ceil(self.behaviors.shape[0] / self.batch_size))
64 |
65 | def __getitem__(self, index: int):
66 | """
67 | Get the batch of samples for the given index.
68 |
69 | Note: The dataset class provides a single index for each iteration. The batching is done internally in this method
70 | to utilize and optimize for speed. This can be seen as a mini-batching approach.
71 |
72 | Args:
73 | index (int): An integer index.
74 |
75 | Returns:
76 | Tuple[torch.Tensor, torch.Tensor]: A tuple containing the input features and labels as torch Tensors.
77 | Note, the output of the PyTorch DataLoader is (1, *shape), where 1 is the DataLoader's batch_size.
78 | """
79 | # Clever way to batch the data:
80 | batch_indices = range(index * self.batch_size, (index + 1) * self.batch_size)
81 | batch = self.behaviors[batch_indices]
82 | if self.shuffle:
83 | batch = shuffle_rows(batch, seed=self.seed)
84 | # =>
85 | x = (
86 | batch.drop(self.labels_col)
87 | .pipe(
88 | map_list_article_id_to_value,
89 | behaviors_column=self.history_column,
90 | mapping=self.lookup_indexes,
91 | fill_nulls=self.unknown_index,
92 | )
93 | .pipe(
94 | map_list_article_id_to_value,
95 | behaviors_column=self.inview_col,
96 | mapping=self.lookup_indexes,
97 | fill_nulls=self.unknown_index,
98 | )
99 | )
100 | # =>
101 | repeats = np.array(batch[self.n_samples_col])
102 | # =>
103 | history_input = repeat_by_list_values_from_matrix(
104 | input_array=x[self.history_column].to_list(),
105 | matrix=self.lookup_matrix,
106 | repeats=repeats,
107 | ).squeeze(2)
108 | # =>
109 | candidate_input = self.lookup_matrix[x[self.inview_col].explode().to_list()]
110 | # =>
111 | history_input = torch.Tensor(history_input).type(torch.int).to(self.device)
112 | candidate_input = torch.Tensor(candidate_input).type(torch.int).to(self.device)
113 | y = (
114 | torch.Tensor(batch[self.labels_col].explode())
115 | .view(-1, 1)
116 | .type(torch.float)
117 | .to(self.device)
118 | )
119 | # ========================
120 | return (history_input, candidate_input), y
121 |
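# Usage sketch (illustrative; `behaviors_df`, `history_col`, and `article_dict` are assumed
# to be prepared elsewhere): batching is handled inside FastformerDataset, so the torch
# DataLoader should be created with batch_size=1.
#
#   from torch.utils.data import DataLoader
#
#   dataset = FastformerDataset(
#       behaviors=behaviors_df,
#       history_column=history_col,
#       article_dict=article_dict,
#       batch_size=32,
#   )
#   dataloader = DataLoader(dataset, batch_size=1)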
122 |
123 | def batch_input_label_concatenation(
124 | inputs: tuple[torch.Tensor], labels: torch.Tensor
125 | ) -> tuple[torch.Tensor, torch.Tensor]:
126 | """ """
127 | return (inputs[0].squeeze(0), inputs[1].squeeze(0)), labels.squeeze(0)
128 |
129 |
130 | def compute_auc_from_fixed_pos_neg_samples(
131 | y_true: list[float], y_pred: list[float]
132 | ) -> float:
133 | # Convert flat labels/predictions back into nested lists of fixed-size samples, then compute AUC:
134 | n_samples = int(np.sum(y_true))
135 | y_true = convert_to_nested_list(y_true, n_samples)
136 | y_pred = convert_to_nested_list(y_pred, n_samples)
137 | val_auc = AucScore().calculate(y_true=y_true, y_pred=y_pred)
138 | return val_auc
139 |
140 |
141 | def train(
142 | model: nn.Module,
143 | train_dataloader: DataLoader,
144 | criterion: nn.Module,
145 | optimizer: optim.Optimizer,
146 | num_epochs: int = 5,
147 | val_dataloader: DataLoader = None,
148 | state_dict_path: str = "model_state_dict.pt",
149 | patience: int = None,
150 | summary_writer: SummaryWriter = None,
151 | gradient_accumulation_steps: int = 1,
152 | tqdm_disable: bool = False,
153 | tqdm_ncol: int = 80,
154 | monitor_metric: str = "loss",
155 | ) -> nn.Module:
156 | """ """
157 | min_val_loss = np.inf
158 | max_val_auc = -np.inf
159 | early_stop = 0
160 | global_steps = 0
161 | total_batches = len(train_dataloader)
162 | running_loss = 0.0
163 | running_samples = 0
164 | # ==> TRAIN LOOP:
165 | for epoch in range(num_epochs):
166 | # => Set the model to train mode
167 | model.train(True)
168 | progress_bar = tqdm(
169 | train_dataloader,
170 | desc=f"Epoch [{epoch + 1}/{num_epochs}]",
171 | disable=tqdm_disable,
172 | ncols=tqdm_ncol,
173 | )
174 | # => Zero the parameter gradients
175 | optimizer.zero_grad()
176 | for batch_idx, (inputs, labels) in enumerate(progress_bar, start=1):
177 | # => Move inputs and labels to device
178 | inputs, labels = batch_input_label_concatenation(inputs, labels)
179 | # => Forward pass
180 | outputs = model(*inputs)
181 | loss = criterion(outputs, labels)
182 | # => Backward pass and optimization
183 | loss.backward()
184 | # => Update training loss
185 | global_steps += 1
186 | running_loss += loss.item() * len(outputs)
187 | running_samples += len(outputs)
188 | current_loss = running_loss / running_samples
189 | progress_bar.set_postfix({"Loss": round(current_loss, 6)})
190 | # =>
191 | if summary_writer is not None:
192 | summary_writer.add_scalar(
193 | tag="Train/Loss",
194 | scalar_value=current_loss,
195 | global_step=global_steps,
196 | )
197 | # => Accumulated gradient step:
198 | if (
199 | batch_idx % gradient_accumulation_steps == 0
200 | or batch_idx == total_batches
201 | ):
202 | # => Take step and zero gradients
203 | optimizer.step()
204 | optimizer.zero_grad()
205 |
206 | # ==> EVAL LOOP:
207 | if val_dataloader:
208 | model.train(False)
209 | all_outputs, all_labels, val_loss = evaluate(
210 | model=model,
211 | dataloader=val_dataloader,
212 | criterion=criterion,
213 | tqdm_disable=tqdm_disable,
214 | )
215 |
216 | if summary_writer is not None:
217 | summary_writer.add_scalar(
218 | tag="Val/Loss", scalar_value=val_loss, global_step=global_steps
219 | )
220 |
221 | if monitor_metric == "auc":
222 | val_auc = compute_auc_from_fixed_pos_neg_samples(
223 | y_true=np.ravel(all_labels.tolist()),
224 | y_pred=np.ravel(all_outputs.tolist()),
225 | )
226 | print(f"Val/AUC : {round(val_auc, 6)}")
227 | if summary_writer is not None:
228 | summary_writer.add_scalar(
229 | tag="Val/AUC", scalar_value=val_auc, global_step=global_steps
230 | )
231 |
232 | # => MODEL CHECKPOINT
233 | if monitor_metric == "loss" and val_loss < min_val_loss:
234 | save_checkpoint(model, path=state_dict_path)
235 | min_val_loss = val_loss
236 | early_stop = 0
237 | elif monitor_metric == "auc" and val_auc > max_val_auc:
238 | save_checkpoint(model, path=state_dict_path)
239 | max_val_auc = val_auc
240 | early_stop = 0
241 | else:
242 | early_stop += 1
243 | # => EARLYSTOP
244 | if patience is not None and early_stop == patience:
245 | break
246 |
247 | if summary_writer is not None:
248 | summary_writer.close()
249 |
250 | if val_dataloader:
251 | model.load_state_dict(torch.load(state_dict_path), strict=True)
252 |
253 | return model
254 |
255 |
256 | def evaluate(
257 | model: nn.Module,
258 | dataloader: DataLoader,
259 | criterion: nn.Module,
260 | tqdm_disable: bool = False,
261 | tqdm_ncol: int = 80,
262 | device: str = "cpu",
263 | ) -> tuple[list[float], list[float], float]:
264 | model.eval()
265 | all_outputs = []
266 | all_labels = []
267 | loss = 0.0
268 | n_samples = 0
269 | with torch.no_grad():
270 | progress_bar = tqdm(
271 | dataloader,
272 | desc="Evaluating",
273 | total=dataloader.__len__(),
274 | disable=tqdm_disable,
275 | ncols=tqdm_ncol,
276 | )
277 | for inputs, labels in progress_bar:
278 | inputs, labels = batch_input_label_concatenation(inputs, labels)
279 | # Forward pass
280 | outputs = model(*inputs)
281 | batch_loss = criterion(outputs, labels)
282 | # =>
283 | loss += batch_loss.item() * len(outputs)
284 | n_samples += len(outputs)
285 | # =>
286 | all_outputs.append(outputs)
287 | all_labels.append(labels)
288 | #
289 | progress_bar.set_postfix({"Eval Loss": round(loss / n_samples, 4)})
290 | # =>
291 | all_outputs = torch.cat(all_outputs, dim=0)
292 | all_labels = torch.cat(all_labels, dim=0)
293 | loss = loss / n_samples
294 | return all_outputs, all_labels, loss
295 |
--------------------------------------------------------------------------------
/src/ebrec/models/fastformer/fastformer_wu.py:
--------------------------------------------------------------------------------
1 | from transformers.models.bert.modeling_bert import (
2 | BertSelfOutput,
3 | BertIntermediate,
4 | BertOutput,
5 | )
6 | import logging
7 | import torch.nn as nn
8 | import torch
9 |
10 |
11 | class AttentionPooling(nn.Module):
12 | def __init__(self, config):
13 | self.config = config
14 | super(AttentionPooling, self).__init__()
15 | self.att_fc1 = nn.Linear(config.hidden_size, config.hidden_size)
16 | self.att_fc2 = nn.Linear(config.hidden_size, 1)
17 | self.apply(self.init_weights)
18 |
19 | def init_weights(self, module):
20 | if isinstance(module, nn.Linear):
21 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
22 | if isinstance(module, nn.Linear) and module.bias is not None:
23 | module.bias.data.zero_()
24 |
25 | def forward(self, x, attn_mask=None):
26 | bz = x.shape[0]
27 | e = self.att_fc1(x)
28 | e = nn.Tanh()(e)
29 | alpha = self.att_fc2(e)
30 | alpha = torch.exp(alpha)
31 | if attn_mask is not None:
32 | alpha = alpha * attn_mask.unsqueeze(2)
33 | alpha = alpha / (torch.sum(alpha, dim=1, keepdim=True) + 1e-8)
34 | x = torch.bmm(x.permute(0, 2, 1), alpha)
35 | x = torch.reshape(x, (bz, -1))
36 | return x
37 |
38 |
39 | class FastSelfAttention(nn.Module):
40 | def __init__(self, config):
41 | super(FastSelfAttention, self).__init__()
42 | self.config = config
43 | if config.hidden_size % config.num_attention_heads != 0:
44 | raise ValueError(
45 | "The hidden size (%d) is not a multiple of the number of attention "
46 | "heads (%d)" % (config.hidden_size, config.num_attention_heads)
47 | )
48 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
49 | self.num_attention_heads = config.num_attention_heads
50 | self.all_head_size = self.num_attention_heads * self.attention_head_size
51 | self.input_dim = config.hidden_size
52 |
53 | self.query = nn.Linear(self.input_dim, self.all_head_size)
54 | self.query_att = nn.Linear(self.all_head_size, self.num_attention_heads)
55 | self.key = nn.Linear(self.input_dim, self.all_head_size)
56 | self.key_att = nn.Linear(self.all_head_size, self.num_attention_heads)
57 | self.transform = nn.Linear(self.all_head_size, self.all_head_size)
58 |
59 | self.softmax = nn.Softmax(dim=-1)
60 |
61 | self.apply(self.init_weights)
62 |
63 | def init_weights(self, module):
64 | if isinstance(module, nn.Linear):
65 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
66 | if isinstance(module, nn.Linear) and module.bias is not None:
67 | module.bias.data.zero_()
68 |
69 | def transpose_for_scores(self, x):
70 | new_x_shape = x.size()[:-1] + (
71 | self.num_attention_heads,
72 | self.attention_head_size,
73 | )
74 | x = x.view(*new_x_shape)
75 | return x.permute(0, 2, 1, 3)
76 |
77 | def forward(self, hidden_states, attention_mask):
78 | # batch_size, seq_len, num_head * head_dim, batch_size, seq_len
79 | batch_size, seq_len, _ = hidden_states.shape
80 | mixed_query_layer = self.query(hidden_states)
81 | mixed_key_layer = self.key(hidden_states)
82 | # batch_size, num_head, seq_len
83 | query_for_score = (
84 | self.query_att(mixed_query_layer).transpose(1, 2)
85 | / self.attention_head_size**0.5
86 | )
87 | # add attention mask
88 | query_for_score += attention_mask
89 |
90 | # batch_size, num_head, 1, seq_len
91 | query_weight = self.softmax(query_for_score).unsqueeze(2)
92 |
93 | # batch_size, num_head, seq_len, head_dim
94 | query_layer = self.transpose_for_scores(mixed_query_layer)
95 |
96 | # batch_size, num_head, head_dim, 1
97 | pooled_query = (
98 | torch.matmul(query_weight, query_layer)
99 | .transpose(1, 2)
100 | .view(-1, 1, self.num_attention_heads * self.attention_head_size)
101 | )
102 | pooled_query_repeat = pooled_query.repeat(1, seq_len, 1)
103 | # batch_size, num_head, seq_len, head_dim
104 |
105 | # batch_size, num_head, seq_len
106 | mixed_query_key_layer = mixed_key_layer * pooled_query_repeat
107 |
108 | query_key_score = (
109 | self.key_att(mixed_query_key_layer) / self.attention_head_size**0.5
110 | ).transpose(1, 2)
111 |
112 | # add attention mask
113 | query_key_score += attention_mask
114 |
115 | # batch_size, num_head, 1, seq_len
116 | query_key_weight = self.softmax(query_key_score).unsqueeze(2)
117 |
118 | key_layer = self.transpose_for_scores(mixed_query_key_layer)
119 | pooled_key = torch.matmul(query_key_weight, key_layer)
120 |
121 | # query = value
122 | weighted_value = (pooled_key * query_layer).transpose(1, 2)
123 | weighted_value = weighted_value.reshape(
124 | weighted_value.size()[:-2]
125 | + (self.num_attention_heads * self.attention_head_size,)
126 | )
127 | weighted_value = self.transform(weighted_value) + mixed_query_layer
128 |
129 | return weighted_value
130 |
131 |
132 | class FastAttention(nn.Module):
133 | def __init__(self, config):
134 | super(FastAttention, self).__init__()
135 | self.self = FastSelfAttention(config)
136 | self.output = BertSelfOutput(config)
137 |
138 | def forward(self, input_tensor, attention_mask):
139 | self_output = self.self(input_tensor, attention_mask)
140 | attention_output = self.output(self_output, input_tensor)
141 | return attention_output
142 |
143 |
144 | class FastformerLayer(nn.Module):
145 | def __init__(self, config):
146 | super(FastformerLayer, self).__init__()
147 | self.attention = FastAttention(config)
148 | self.intermediate = BertIntermediate(config)
149 | self.output = BertOutput(config)
150 |
151 | def forward(self, hidden_states, attention_mask):
152 | attention_output = self.attention(hidden_states, attention_mask)
153 | intermediate_output = self.intermediate(attention_output)
154 | layer_output = self.output(intermediate_output, attention_output)
155 | return layer_output
156 |
157 |
158 | class StandardFastformerEncoder(nn.Module):
159 | def __init__(self, config, pooler_count=1):
160 | super(StandardFastformerEncoder, self).__init__()
161 | self.config = config
162 | self.encoders = nn.ModuleList(
163 | [FastformerLayer(config) for _ in range(config.num_hidden_layers)]
164 | )
165 | self.position_embeddings = nn.Embedding(
166 | config.max_position_embeddings, config.hidden_size
167 | )
168 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
169 | self.dropout = nn.Dropout(config.hidden_dropout_prob)
170 |
171 | # support multiple different poolers with shared bert encoder.
172 | self.poolers = nn.ModuleList()
173 | if config.pooler_type == "weightpooler":
174 | for _ in range(pooler_count):
175 | self.poolers.append(AttentionPooling(config))
176 | logging.info(f"This model has {len(self.poolers)} poolers.")
177 | self.apply(self.init_weights)
178 |
179 | def init_weights(self, module):
180 | if isinstance(module, (nn.Linear, nn.Embedding)):
181 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
182 | if isinstance(module, (nn.Embedding)) and module.padding_idx is not None:
183 | with torch.no_grad():
184 | module.weight[module.padding_idx].fill_(0)
185 | elif isinstance(module, nn.LayerNorm):
186 | module.bias.data.zero_()
187 | module.weight.data.fill_(1.0)
188 | if isinstance(module, nn.Linear) and module.bias is not None:
189 | module.bias.data.zero_()
190 |
191 | def forward(self, input_embs, attention_mask, pooler_index=0) -> torch.Tensor:
192 | """
193 | Forward pass through the encoder.
194 |
195 | Parameters:
196 | input_embs (torch.Tensor): The input embeddings, with shape (batch_size, n_tokens, emb_dim).
197 | attention_mask (torch.Tensor): The attention mask, with shape (batch_size, n_tokens), where
198 | values of 1 indicate positions to attend to and 0s indicate positions to mask.
199 | pooler_index (int, optional): Index of the pooler to use to aggregate the encoder's output. Default is 0.
200 |
201 | Returns:
202 | torch.Tensor: The output of the encoder, processed and pooled according to the specified pooler.
203 | with shape (batch_size, config.hidden_size).
204 |
205 | Usage:
206 | >>> encoder_output = model.forward(input_embs, attention_mask, pooler_index=0)
207 | """
208 | extended_attention_mask = attention_mask.unsqueeze(1)
209 | extended_attention_mask = extended_attention_mask.to(
210 | dtype=next(self.parameters()).dtype
211 | ) # fp16 compatibility
212 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
213 |
214 | batch_size, n_tokens, emb_dim = input_embs.shape
215 |
216 | position_ids = torch.arange(
217 | n_tokens, dtype=torch.long, device=input_embs.device
218 | )
219 | position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
220 | position_embeddings = self.position_embeddings(position_ids)
221 |
222 | embeddings = input_embs + position_embeddings
223 | embeddings = self.LayerNorm(embeddings)
224 | embeddings = self.dropout(embeddings)
225 |
226 | all_hidden_states = [embeddings]
227 |
228 | for layer_module in self.encoders:
229 | layer_outputs = layer_module(all_hidden_states[-1], extended_attention_mask)
230 | all_hidden_states.append(layer_outputs)
231 |
232 | output = self.poolers[pooler_index](all_hidden_states[-1], attention_mask)
233 |
234 | return output
235 |
236 |
237 | class Fastformer_wu(torch.nn.Module):
238 | def __init__(
239 | self,
240 | config,
241 | word_embedding: nn.Embedding,
242 | ):
243 | super(Fastformer_wu, self).__init__()
244 | self.config = config
245 | self.word_embedding = word_embedding
246 | self.embedding_transform = nn.Linear(
247 | word_embedding.weight.shape[1], config.hidden_size
248 | )
249 | # 4 classes; likely the npratio
250 | self.output_layer = nn.Linear(config.hidden_size, 4)
251 | self.fastformer_model = StandardFastformerEncoder(config)
252 | self.criterion = nn.CrossEntropyLoss()
253 | self.apply(self.init_weights)
254 |
255 | def init_weights(self, module):
256 | if isinstance(module, (nn.Linear, nn.Embedding)):
257 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
258 | if isinstance(module, (nn.Embedding)) and module.padding_idx is not None:
259 | with torch.no_grad():
260 | module.weight[module.padding_idx].fill_(0)
261 | if isinstance(module, nn.Linear) and module.bias is not None:
262 | module.bias.data.zero_()
263 |
264 | def forward(self, input_ids, targets):
265 | mask = input_ids.bool().float()
266 | embds = self.word_embedding(input_ids)
267 | embds = self.embedding_transform(embds)
268 | text_vec = self.fastformer_model(embds, mask)
269 | score = self.output_layer(text_vec)
270 | loss = self.criterion(score, targets)
271 | return loss, score
272 |
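273 | # --- Usage sketch (added for illustration; not part of the original file) ---
274 | # Fastformer_wu wraps the encoder with a 4-class classification head. `config` is
275 | # assumed to be the Fastformer config object defined earlier in this file; the
276 | # attribute names are the ones read above (hidden_size, num_hidden_layers,
277 | # max_position_embeddings, pooler_type, initializer_range, ...).
278 | #
279 | # word_embedding = nn.Embedding(num_embeddings=1_000, embedding_dim=300, padding_idx=0)
280 | # model = Fastformer_wu(config, word_embedding=word_embedding)
281 | # input_ids = torch.randint(1, 1_000, (8, 20))  # (batch_size, n_tokens); 0 acts as padding
282 | # targets = torch.randint(0, 4, (8,))           # class index per sample
283 | # loss, score = model(input_ids, targets)       # loss: scalar, score: (8, 4) logits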
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/__init__.py:
--------------------------------------------------------------------------------
1 | from .npa import NPAModel
2 | from .lstur import LSTURModel
3 | from .nrms import NRMSModel
4 | from .naml import NAMLModel
5 |
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/base_model.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 | from tensorflow import keras
3 | import tensorflow as tf
4 | import numpy as np
5 | import abc
6 |
7 | __all__ = ["BaseModel"]
8 |
9 |
10 | class BaseModel:
11 | """Basic class of models
12 |
13 | Attributes:
14 |         hparams (object): Object holding the entire set of hyperparameters, accessed as attributes.
15 | graph (object): An optional graph.
16 | seed (int): Random seed.
17 | """
18 |
19 | def __init__(
20 | self,
21 | hparams: Dict[str, Any],
22 | word2vec_embedding: np.ndarray = None,
23 | # if 'word2vec_embedding' not provided:
24 | word_emb_dim: int = 300,
25 | vocab_size: int = 32000,
26 | seed=None,
27 | ):
28 | """Initializing the model. Create common logics which are needed by all deeprec models, such as loss function,
29 | parameter set.
30 |
31 | Args:
32 | hparams (object): Hold the entire set of hyperparameters.
33 | seed (int): Random seed.
34 | """
35 | self.seed = seed
36 | tf.random.set_seed(seed)
37 | np.random.seed(seed)
38 |
39 | # ASSIGN 'hparams':
40 | self.hparams = hparams
41 |
42 | # INIT THE WORD-EMBEDDINGS:
43 | if word2vec_embedding is None:
44 | self.word2vec_embedding = np.random.rand(vocab_size, word_emb_dim)
45 | else:
46 | self.word2vec_embedding = word2vec_embedding
47 |
48 | # BUILD AND COMPILE MODEL:
49 | self.model, self.scorer = self._build_graph()
50 | self.loss = self._get_loss(self.hparams.loss)
51 | self.train_optimizer = self._get_opt(
52 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate
53 | )
54 | self.model.compile(loss=self.loss, optimizer=self.train_optimizer)
55 |
56 | @abc.abstractmethod
57 | def _build_graph(self):
58 | """Subclass will implement this."""
59 | pass
60 |
61 | def _get_loss(self, loss: str):
62 | """Make loss function, consists of data loss and regularization loss
63 |
64 | Returns:
65 | object: Loss function or loss function name
66 | """
67 | if loss == "cross_entropy_loss":
68 | data_loss = "categorical_crossentropy"
69 | elif loss == "log_loss":
70 | data_loss = "binary_crossentropy"
71 | else:
72 | raise ValueError(f"this loss not defined {loss}")
73 | return data_loss
74 |
75 | def _get_opt(self, optimizer: str, lr: float):
76 | """Get the optimizer according to configuration. Usually we will use Adam.
77 | Returns:
78 | object: An optimizer.
79 | """
80 |
81 | if optimizer == "adam":
82 | train_opt = keras.optimizers.Adam(learning_rate=lr)
83 | else:
84 | raise ValueError(f"this optimizer not defined {optimizer}")
85 |
86 | return train_opt
87 |
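88 | # --- Usage sketch (added for illustration; not part of the original file) ---
89 | # BaseModel is abstract: a concrete model (e.g. LSTURModel, NPAModel, NAMLModel) implements
90 | # `_build_graph` and returns two keras.Model objects, a training model and a scorer.
91 | # The `hparams` object must expose at least `loss`, `optimizer` and `learning_rate`
92 | # as attributes, since __init__ uses them to compile the training model.
93 | #
94 | # class MyModel(BaseModel):
95 | #     def _build_graph(self):
96 | #         inp = keras.Input(shape=(10,))
97 | #         out = keras.layers.Dense(1, activation="sigmoid")(inp)
98 | #         model = keras.Model(inp, out)
99 | #         return model, model  # (training model, scorer)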
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/layers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow.keras as keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras import backend as K
5 |
6 |
7 | class AttLayer2(layers.Layer):
8 | """Soft alignment attention implement.
9 |
10 | Attributes:
11 | dim (int): attention hidden dim
12 | """
13 |
14 | def __init__(self, dim=200, seed=0, **kwargs):
15 | """Initialization steps for AttLayer2.
16 |
17 | Args:
18 | dim (int): attention hidden dim
19 | """
20 |
21 | self.dim = dim
22 | self.seed = seed
23 | super(AttLayer2, self).__init__(**kwargs)
24 |
25 | def build(self, input_shape):
26 | """Initialization for variables in AttLayer2
27 |         There are three variables in AttLayer2, i.e. W, b and q.
28 |
29 | Args:
30 | input_shape (object): shape of input tensor.
31 | """
32 |
33 | assert len(input_shape) == 3
34 | dim = self.dim
35 | self.W = self.add_weight(
36 | name="W",
37 | shape=(int(input_shape[-1]), dim),
38 | initializer=keras.initializers.glorot_uniform(seed=self.seed),
39 | trainable=True,
40 | )
41 | self.b = self.add_weight(
42 | name="b",
43 | shape=(dim,),
44 | initializer=keras.initializers.Zeros(),
45 | trainable=True,
46 | )
47 | self.q = self.add_weight(
48 | name="q",
49 | shape=(dim, 1),
50 | initializer=keras.initializers.glorot_uniform(seed=self.seed),
51 | trainable=True,
52 | )
53 | super(AttLayer2, self).build(input_shape) # be sure you call this somewhere!
54 |
55 | def call(self, inputs, mask=None, **kwargs):
56 | """Core implemention of soft attention
57 |
58 | Args:
59 | inputs (object): input tensor.
60 |
61 | Returns:
62 | object: weighted sum of input tensors.
63 | """
64 |
65 | attention = K.tanh(K.dot(inputs, self.W) + self.b)
66 | attention = K.dot(attention, self.q)
67 |
68 | attention = K.squeeze(attention, axis=2)
69 |
70 |         if mask is None:
71 | attention = K.exp(attention)
72 | else:
73 | attention = K.exp(attention) * K.cast(mask, dtype="float32")
74 |
75 | attention_weight = attention / (
76 | K.sum(attention, axis=-1, keepdims=True) + K.epsilon()
77 | )
78 |
79 | attention_weight = K.expand_dims(attention_weight)
80 | weighted_input = inputs * attention_weight
81 | return K.sum(weighted_input, axis=1)
82 |
83 | def compute_mask(self, input, input_mask=None):
84 | """Compte output mask value
85 |
86 | Args:
87 | input (object): input tensor.
88 | input_mask: input mask
89 |
90 | Returns:
91 | object: output mask.
92 | """
93 | return None
94 |
95 | def compute_output_shape(self, input_shape):
96 | """Compute shape of output tensor
97 |
98 | Args:
99 | input_shape (tuple): shape of input tensor.
100 |
101 | Returns:
102 | tuple: shape of output tensor.
103 | """
104 | return input_shape[0], input_shape[-1]
105 |
106 |
107 | class SelfAttention(layers.Layer):
108 | """Multi-head self attention implement.
109 |
110 | Args:
111 | multiheads (int): The number of heads.
112 |         head_dim (int): Dimension of each head.
113 | mask_right (boolean): whether to mask right words.
114 |
115 | Returns:
116 | object: Weighted sum after attention.
117 | """
118 |
119 | def __init__(self, multiheads, head_dim, seed=0, mask_right=False, **kwargs):
120 | """Initialization steps for AttLayer2.
121 |
122 | Args:
123 | multiheads (int): The number of heads.
124 |             head_dim (int): Dimension of each head.
125 | mask_right (boolean): whether to mask right words.
126 | """
127 |
128 | self.multiheads = multiheads
129 | self.head_dim = head_dim
130 | self.output_dim = multiheads * head_dim
131 | self.mask_right = mask_right
132 | self.seed = seed
133 | super(SelfAttention, self).__init__(**kwargs)
134 |
135 | def compute_output_shape(self, input_shape):
136 | """Compute shape of output tensor.
137 |
138 | Returns:
139 | tuple: output shape tuple.
140 | """
141 |
142 | return (input_shape[0][0], input_shape[0][1], self.output_dim)
143 |
144 | def build(self, input_shape):
145 | """Initialization for variables in SelfAttention.
146 |         There are three variables in SelfAttention, i.e. WQ, WK and WV.
147 | WQ is used for linear transformation of query.
148 | WK is used for linear transformation of key.
149 | WV is used for linear transformation of value.
150 |
151 | Args:
152 | input_shape (object): shape of input tensor.
153 | """
154 |
155 | self.WQ = self.add_weight(
156 | name="WQ",
157 | shape=(int(input_shape[0][-1]), self.output_dim),
158 | initializer=keras.initializers.glorot_uniform(seed=self.seed),
159 | trainable=True,
160 | )
161 | self.WK = self.add_weight(
162 | name="WK",
163 | shape=(int(input_shape[1][-1]), self.output_dim),
164 | initializer=keras.initializers.glorot_uniform(seed=self.seed),
165 | trainable=True,
166 | )
167 | self.WV = self.add_weight(
168 | name="WV",
169 | shape=(int(input_shape[2][-1]), self.output_dim),
170 | initializer=keras.initializers.glorot_uniform(seed=self.seed),
171 | trainable=True,
172 | )
173 | super(SelfAttention, self).build(input_shape)
174 |
175 | def Mask(self, inputs, seq_len, mode="add"):
176 | """Mask operation used in multi-head self attention
177 |
178 | Args:
179 | seq_len (object): sequence length of inputs.
180 | mode (str): mode of mask.
181 |
182 | Returns:
183 | object: tensors after masking.
184 | """
185 |
186 | if seq_len is None:
187 | return inputs
188 | else:
189 | mask = K.one_hot(indices=seq_len[:, 0], num_classes=K.shape(inputs)[1])
190 | mask = 1 - K.cumsum(mask, axis=1)
191 |
192 | for _ in range(len(inputs.shape) - 2):
193 | mask = K.expand_dims(mask, 2)
194 |
195 | if mode == "mul":
196 | return inputs * mask
197 | elif mode == "add":
198 | return inputs - (1 - mask) * 1e12
199 |
200 | def call(self, QKVs):
201 | """Core logic of multi-head self attention.
202 |
203 | Args:
204 |             QKVs (list): inputs of multi-head self attention, i.e. query, key and value.
205 |
206 | Returns:
207 |             object: output tensors.
208 | """
209 | if len(QKVs) == 3:
210 | Q_seq, K_seq, V_seq = QKVs
211 | Q_len, V_len = None, None
212 | elif len(QKVs) == 5:
213 | Q_seq, K_seq, V_seq, Q_len, V_len = QKVs
214 | Q_seq = K.dot(Q_seq, self.WQ)
215 | Q_seq = K.reshape(
216 | Q_seq, shape=(-1, K.shape(Q_seq)[1], self.multiheads, self.head_dim)
217 | )
218 | Q_seq = K.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3))
219 |
220 | K_seq = K.dot(K_seq, self.WK)
221 | K_seq = K.reshape(
222 | K_seq, shape=(-1, K.shape(K_seq)[1], self.multiheads, self.head_dim)
223 | )
224 | K_seq = K.permute_dimensions(K_seq, pattern=(0, 2, 1, 3))
225 |
226 | V_seq = K.dot(V_seq, self.WV)
227 | V_seq = K.reshape(
228 | V_seq, shape=(-1, K.shape(V_seq)[1], self.multiheads, self.head_dim)
229 | )
230 | V_seq = K.permute_dimensions(V_seq, pattern=(0, 2, 1, 3))
231 | A = tf.matmul(Q_seq, K_seq, adjoint_a=False, adjoint_b=True) / K.sqrt(
232 | K.cast(self.head_dim, dtype="float32")
233 | )
234 |
235 | A = K.permute_dimensions(
236 | A, pattern=(0, 3, 2, 1)
237 | ) # A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads]
238 |
239 | A = self.Mask(A, V_len, "add")
240 | A = K.permute_dimensions(A, pattern=(0, 3, 2, 1))
241 |
242 | if self.mask_right:
243 | ones = K.ones_like(A[:1, :1])
244 |             lower_triangular = tf.linalg.band_part(ones, num_lower=-1, num_upper=0)
245 | mask = (ones - lower_triangular) * 1e12
246 | A = A - mask
247 | A = K.softmax(A)
248 |
249 | O_seq = tf.matmul(A, V_seq, adjoint_a=True, adjoint_b=False)
250 | O_seq = K.permute_dimensions(O_seq, pattern=(0, 2, 1, 3))
251 |
252 | O_seq = K.reshape(O_seq, shape=(-1, K.shape(O_seq)[1], self.output_dim))
253 | O_seq = self.Mask(O_seq, Q_len, "mul")
254 | return O_seq
255 |
256 | def get_config(self):
257 | """add multiheads, multiheads and mask_right into layer config.
258 |
259 | Returns:
260 | dict: config of SelfAttention layer.
261 | """
262 | config = super(SelfAttention, self).get_config()
263 | config.update(
264 | {
265 | "multiheads": self.multiheads,
266 | "head_dim": self.head_dim,
267 | "mask_right": self.mask_right,
268 | }
269 | )
270 | return config
271 |
272 |
273 | class ComputeMasking(layers.Layer):
274 | """Compute if inputs contains zero value.
275 |
276 | Returns:
277 |         float tensor: 1.0 for values not equal to zero, 0.0 otherwise.
278 | """
279 |
280 | def __init__(self, **kwargs):
281 | super(ComputeMasking, self).__init__(**kwargs)
282 |
283 | def call(self, inputs, **kwargs):
284 | mask = K.not_equal(inputs, 0)
285 | return K.cast(mask, K.floatx())
286 |
287 | def compute_output_shape(self, input_shape):
288 | return input_shape
289 |
290 |
291 | class OverwriteMasking(layers.Layer):
292 | """Set values at spasific positions to zero.
293 |
294 | Args:
295 | inputs (list): value tensor and mask tensor.
296 |
297 | Returns:
298 | object: tensor after setting values to zero.
299 | """
300 |
301 | def __init__(self, **kwargs):
302 | super(OverwriteMasking, self).__init__(**kwargs)
303 |
304 | def build(self, input_shape):
305 | super(OverwriteMasking, self).build(input_shape)
306 |
307 | def call(self, inputs, **kwargs):
308 | return inputs[0] * K.expand_dims(inputs[1])
309 |
310 | def compute_output_shape(self, input_shape):
311 | return input_shape[0]
312 |
313 |
314 | def PersonalizedAttentivePooling(dim1, dim2, dim3, seed=0):
315 | """Soft alignment attention implement.
316 | Attributes:
317 | dim1 (int): first dimention of value shape.
318 | dim2 (int): second dimention of value shape.
319 | dim3 (int): shape of query
320 |
321 | Returns:
322 | object: weighted summary of inputs value.
323 | """
324 | vecs_input = keras.Input(shape=(dim1, dim2), dtype="float32")
325 | query_input = keras.Input(shape=(dim3,), dtype="float32")
326 |
327 | user_vecs = layers.Dropout(0.2)(vecs_input)
328 | user_att = layers.Dense(
329 | dim3,
330 | activation="tanh",
331 | kernel_initializer=keras.initializers.glorot_uniform(seed=seed),
332 | bias_initializer=keras.initializers.Zeros(),
333 | )(user_vecs)
334 | user_att2 = layers.Dot(axes=-1)([query_input, user_att])
335 | user_att2 = layers.Activation("softmax")(user_att2)
336 | user_vec = layers.Dot((1, 1))([user_vecs, user_att2])
337 |
338 | model = keras.Model([vecs_input, query_input], user_vec)
339 | return model
340 |
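341 |
342 | if __name__ == "__main__":
343 |     # Shape-check sketch (added for illustration; not part of the original file):
344 |     # AttLayer2 pools a (batch, seq, dim) tensor into (batch, dim), while SelfAttention
345 |     # keeps the sequence axis and projects to multiheads * head_dim.
346 |     import numpy as np
347 |
348 |     x = tf.constant(np.random.rand(2, 10, 40), dtype="float32")
349 |     pooled = AttLayer2(dim=200, seed=0)(x)
350 |     attended = SelfAttention(multiheads=4, head_dim=10, seed=0)([x, x, x])
351 |     print(pooled.shape)    # (2, 40)
352 |     print(attended.shape)  # (2, 10, 40)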
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/lstur.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 | # Licensed under the MIT License.
3 | from ebrec.models.newsrec.layers import AttLayer2, ComputeMasking, OverwriteMasking
4 | from ebrec.models.newsrec.base_model import BaseModel
5 | from tensorflow.keras import layers
6 | import tensorflow.keras as keras
7 |
8 |
9 | __all__ = ["LSTURModel"]
10 |
11 |
12 | class LSTURModel(BaseModel):
13 | """LSTUR model(Neural News Recommendation with Multi-Head Self-Attention)
14 |
15 | Mingxiao An, Fangzhao Wu, Chuhan Wu, Kun Zhang, Zheng Liu and Xing Xie:
16 | Neural News Recommendation with Long- and Short-term User Representations, ACL 2019
17 |
18 |     Attributes:
19 |         word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix.
20 |         hparams (object): Global hyper-parameters.
21 | """
22 |
23 | def __init__(
24 | self,
25 | hparams,
26 | word2vec_embedding=None,
27 | seed=None,
28 | **kwargs,
29 | ):
30 | """Initialization steps for LSTUR.
31 |         Compared with the BaseModel, LSTUR needs a word embedding.
32 |         After creating the word embedding matrix, BaseModel's __init__ method will be called.
33 |
34 | Args:
35 |             hparams (object): Global hyper-parameters. Some key settings, such as type and gru_unit, are there.
36 | """
37 |
38 | super().__init__(
39 | hparams=hparams,
40 | word2vec_embedding=word2vec_embedding,
41 | seed=seed,
42 | **kwargs,
43 | )
44 |
45 | def _build_graph(self):
46 | """Build LSTUR model and scorer.
47 |
48 | Returns:
49 | object: a model used to train.
50 | object: a model used to evaluate and inference.
51 | """
52 |
53 | model, scorer = self._build_lstur()
54 | return model, scorer
55 |
56 | def _build_userencoder(self, titleencoder, type="ini"):
57 | """The main function to create user encoder of LSTUR.
58 |
59 | Args:
60 | titleencoder (object): the news encoder of LSTUR.
61 |
62 | Return:
63 | object: the user encoder of LSTUR.
64 | """
65 |
66 | his_input_title = keras.Input(
67 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32"
68 | )
69 | user_indexes = keras.Input(shape=(1,), dtype="int32")
70 |
71 | user_embedding_layer = layers.Embedding(
72 | input_dim=self.hparams.n_users + 1,
73 | output_dim=self.hparams.gru_unit, # Dimension of the dense embedding.
74 | trainable=True,
75 | embeddings_initializer="zeros",
76 | )
77 |
78 | long_u_emb = layers.Reshape((self.hparams.gru_unit,))(
79 | user_embedding_layer(user_indexes)
80 | )
81 | click_title_presents = layers.TimeDistributed(titleencoder)(his_input_title)
82 |
83 | if type == "ini":
84 | user_present = layers.GRU(
85 | self.hparams.gru_unit,
86 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
87 | recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed),
88 | bias_initializer=keras.initializers.Zeros(),
89 | )(
90 | layers.Masking(mask_value=0.0)(click_title_presents),
91 | initial_state=[long_u_emb],
92 | )
93 | elif type == "con":
94 | short_uemb = layers.GRU(
95 | self.hparams.gru_unit,
96 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
97 | recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed),
98 | bias_initializer=keras.initializers.Zeros(),
99 | )(layers.Masking(mask_value=0.0)(click_title_presents))
100 |
101 | user_present = layers.Concatenate()([short_uemb, long_u_emb])
102 | user_present = layers.Dense(
103 | self.hparams.gru_unit,
104 | bias_initializer=keras.initializers.Zeros(),
105 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
106 | )(user_present)
107 |
108 | model = keras.Model(
109 | [his_input_title, user_indexes], user_present, name="user_encoder"
110 | )
111 | return model
112 |
113 | def _build_newsencoder(self, embedding_layer):
114 | """The main function to create news encoder of LSTUR.
115 |
116 | Args:
117 | embedding_layer (object): a word embedding layer.
118 |
119 | Return:
120 | object: the news encoder of LSTUR.
121 | """
122 |
123 | sequences_input_title = keras.Input(
124 | shape=(self.hparams.title_size,), dtype="int32"
125 | )
126 | embedded_sequences_title = embedding_layer(sequences_input_title)
127 |
128 | y = layers.Dropout(self.hparams.dropout)(embedded_sequences_title)
129 | y = layers.Conv1D(
130 | self.hparams.filter_num,
131 | self.hparams.window_size,
132 | activation=self.hparams.cnn_activation,
133 | padding="same",
134 | bias_initializer=keras.initializers.Zeros(),
135 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
136 | )(y)
137 | y = layers.Dropout(self.hparams.dropout)(y)
138 | y = layers.Masking()(
139 | OverwriteMasking()([y, ComputeMasking()(sequences_input_title)])
140 | )
141 | pred_title = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y)
142 |
143 | model = keras.Model(sequences_input_title, pred_title, name="news_encoder")
144 | return model
145 |
146 | def _build_lstur(self):
147 | """The main function to create LSTUR's logic. The core of LSTUR
148 | is a user encoder and a news encoder.
149 |
150 | Returns:
151 | object: a model used to train.
152 | object: a model used to evaluate and inference.
153 | """
154 |
155 | his_input_title = keras.Input(
156 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32"
157 | )
158 | pred_input_title = keras.Input(
159 | # shape=(hparams.npratio + 1, hparams.title_size), dtype="int32"
160 | shape=(None, self.hparams.title_size),
161 | dtype="int32",
162 | )
163 | pred_input_title_one = keras.Input(
164 | shape=(
165 | 1,
166 | self.hparams.title_size,
167 | ),
168 | dtype="int32",
169 | )
170 | pred_title_reshape = layers.Reshape((self.hparams.title_size,))(
171 | pred_input_title_one
172 | )
173 | user_indexes = keras.Input(shape=(1,), dtype="int32")
174 |
175 | embedding_layer = layers.Embedding(
176 | self.word2vec_embedding.shape[0],
177 | self.word2vec_embedding.shape[1],
178 | weights=[self.word2vec_embedding],
179 | trainable=True,
180 | )
181 |
182 | titleencoder = self._build_newsencoder(embedding_layer)
183 | self.userencoder = self._build_userencoder(titleencoder, type=self.hparams.type)
184 | self.newsencoder = titleencoder
185 |
186 | user_present = self.userencoder([his_input_title, user_indexes])
187 | news_present = layers.TimeDistributed(self.newsencoder)(pred_input_title)
188 | news_present_one = self.newsencoder(pred_title_reshape)
189 |
190 | preds = layers.Dot(axes=-1)([news_present, user_present])
191 | preds = layers.Activation(activation="softmax")(preds)
192 |
193 | pred_one = layers.Dot(axes=-1)([news_present_one, user_present])
194 | pred_one = layers.Activation(activation="sigmoid")(pred_one)
195 |
196 | model = keras.Model([user_indexes, his_input_title, pred_input_title], preds)
197 | scorer = keras.Model(
198 | [user_indexes, his_input_title, pred_input_title_one], pred_one
199 | )
200 |
201 | return model, scorer
202 |
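203 | # --- Usage sketch (added for illustration; not part of the original file) ---
204 | # `hparams_lstur` from ebrec.models.newsrec.model_config carries the attributes this
205 | # class reads (history_size, title_size, n_users, gru_unit, type, ...).
206 | # from ebrec.models.newsrec.model_config import hparams_lstur
207 | # lstur = LSTURModel(hparams=hparams_lstur, seed=42)
208 | # lstur.model.summary()   # trainer: [user_indexes, his_input_title, pred_input_title] -> softmax
209 | # lstur.scorer.summary()  # scorer:  [user_indexes, his_input_title, pred_input_title_one] -> sigmoid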
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/model_config.py:
--------------------------------------------------------------------------------
1 | #
2 | DEFAULT_TITLE_SIZE = 30
3 | DEFAULT_BODY_SIZE = 40
4 | UNKNOWN_TITLE_VALUE = [0] * DEFAULT_TITLE_SIZE
5 | UNKNOWN_BODY_VALUE = [0] * DEFAULT_BODY_SIZE
6 |
7 | DEFAULT_DOCUMENT_SIZE = 768
8 |
9 |
10 | def print_hparams(hparams_class):
11 | for attr, value in hparams_class.__annotations__.items():
12 | # Print attribute names and values
13 | print(f"{attr}: {getattr(hparams_class, attr)}")
14 |
15 |
16 | def hparams_to_dict(hparams_class) -> dict:
17 | params = {}
18 | for attr, value in hparams_class.__annotations__.items():
19 | params[attr] = getattr(hparams_class, attr)
20 | return params
21 |
22 |
23 | class hparams_naml:
24 |     # INPUT DIMENSIONS:
25 | title_size: int = DEFAULT_TITLE_SIZE
26 | history_size: int = 20
27 | body_size: int = DEFAULT_BODY_SIZE
28 | vert_num: int = 100
29 | vert_emb_dim: int = 10
30 | subvert_num: int = 100
31 | subvert_emb_dim: int = 10
32 | # MODEL ARCHITECTURE
33 | dense_activation: str = "relu"
34 | cnn_activation: str = "relu"
35 | attention_hidden_dim: int = 200
36 | filter_num: int = 400
37 | window_size: int = 3
38 | # MODEL OPTIMIZER:
39 | optimizer: str = "adam"
40 | loss: str = "cross_entropy_loss"
41 | dropout: float = 0.2
42 | learning_rate: float = 1e-4
43 |
44 |
45 | class hparams_lstur:
46 |     # INPUT DIMENSIONS:
47 | title_size: int = DEFAULT_TITLE_SIZE
48 | history_size: int = 20
49 | n_users: int = 50000
50 | # MODEL ARCHITECTURE
51 | cnn_activation: str = "relu"
52 | type: str = "ini"
53 | attention_hidden_dim: int = 200
54 | gru_unit: int = 400
55 | filter_num: int = 400
56 | window_size: int = 3
57 | # MODEL OPTIMIZER:
58 | optimizer: str = "adam"
59 | loss: str = "cross_entropy_loss"
60 | dropout: float = 0.2
61 | learning_rate: float = 1e-4
62 |
63 |
64 | class hparams_npa:
65 |     # INPUT DIMENSIONS:
66 | title_size: int = DEFAULT_TITLE_SIZE
67 | history_size: int = 20
68 | n_users: int = 50000
69 | # MODEL ARCHITECTURE
70 | cnn_activation: str = "relu"
71 | attention_hidden_dim: int = 200
72 | user_emb_dim: int = 400
73 | filter_num: int = 400
74 | window_size: int = 3
75 | # MODEL OPTIMIZER:
76 | optimizer: str = "adam"
77 | loss: str = "cross_entropy_loss"
78 | dropout: float = 0.2
79 | learning_rate: float = 1e-4
80 |
81 |
82 | class hparams_nrms:
83 |     # INPUT DIMENSIONS:
84 | title_size: int = DEFAULT_TITLE_SIZE
85 | history_size: int = 20
86 | # MODEL ARCHITECTURE
87 | head_num: int = 20
88 | head_dim: int = 20
89 | attention_hidden_dim: int = 200
90 | # MODEL OPTIMIZER:
91 | optimizer: str = "adam"
92 | loss: str = "cross_entropy_loss"
93 | dropout: float = 0.2
94 | learning_rate: float = 1e-4
95 | # MY OWN LITTLE TWIST:
96 | newsencoder_units_per_layer: list[int] = None
97 | newsencoder_l2_regularization: float = 1e-4
98 |
99 |
100 | class hparams_nrms_docvec:
101 |     # INPUT DIMENSIONS:
102 | title_size: int = DEFAULT_DOCUMENT_SIZE
103 | history_size: int = 20
104 | # MODEL ARCHITECTURE
105 | head_num: int = 16
106 | head_dim: int = 16
107 | attention_hidden_dim: int = 200
108 | # MODEL OPTIMIZER:
109 | optimizer: str = "adam"
110 | loss: str = "cross_entropy_loss"
111 | dropout: float = 0.2
112 | learning_rate: float = 1e-4
113 | newsencoder_units_per_layer: list[int] = [512, 512, 512]
114 | newsencoder_l2_regularization: float = 1e-4
115 |
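116 |
117 | if __name__ == "__main__":
118 |     # Small self-check (added for illustration; not part of the original file): the hparams
119 |     # classes are plain attribute containers, so they can be printed or turned into a dict.
120 |     print_hparams(hparams_nrms)
121 |     nrms_params = hparams_to_dict(hparams_nrms)
122 |     print(nrms_params["head_num"], nrms_params["head_dim"])  # 20 20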
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/npa.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 | # Licensed under the MIT License.
3 | from tensorflow.keras import layers
4 | import tensorflow.keras as keras
5 | import tensorflow as tf
6 | import numpy as np
7 |
8 | from ebrec.models.newsrec.layers import PersonalizedAttentivePooling
9 | from ebrec.models.newsrec.base_model import BaseModel
10 |
11 | __all__ = ["NPAModel"]
12 |
13 |
14 | class NPAModel(BaseModel):
15 | """NPA model(Neural News Recommendation with Attentive Multi-View Learning)
16 |
17 | Attributes:
18 | word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix.
19 |         hparams (object): Global hyper-parameters.
20 | """
21 |
22 | def __init__(
23 | self,
24 | hparams,
25 | word2vec_embedding=None,
26 | seed=None,
27 | **kwargs,
28 | ):
29 | """Initialization steps for MANL.
30 | Compared with the BaseModel, NPA need word embedding.
31 | After creating word embedding matrix, BaseModel's __init__ method will be called.
32 |
33 | Args:
34 |             hparams (object): Global hyper-parameters. Some key settings, such as filter_num, are there.
35 | """
36 |
37 | super().__init__(
38 | hparams=hparams,
39 | word2vec_embedding=word2vec_embedding,
40 | seed=seed,
41 | **kwargs,
42 | )
43 |
44 | def _get_input_label_from_iter(self, batch_data):
45 | input_feat = [
46 | batch_data["user_index_batch"],
47 | batch_data["clicked_title_batch"],
48 | batch_data["candidate_title_batch"],
49 | ]
50 | input_label = batch_data["labels"]
51 | return input_feat, input_label
52 |
53 | def _build_graph(self):
54 | """Build NPA model and scorer.
55 |
56 | Returns:
57 | object: a model used to train.
58 | object: a model used to evaluate and inference.
59 | """
60 |
61 | model, scorer = self._build_npa()
62 | return model, scorer
63 |
64 | def _build_userencoder(self, titleencoder, user_embedding_layer):
65 | """The main function to create user encoder of NPA.
66 |
67 | Args:
68 | titleencoder (object): the news encoder of NPA.
69 |
70 | Return:
71 | object: the user encoder of NPA.
72 | """
73 |
74 | his_input_title = keras.Input(
75 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32"
76 | )
77 | user_indexes = keras.Input(shape=(1,), dtype="int32")
78 |
79 | nuser_id = layers.Reshape((1, 1))(user_indexes)
80 | repeat_uids = layers.Concatenate(axis=-2)(
81 | [nuser_id] * self.hparams.history_size
82 | )
83 | his_title_uid = layers.Concatenate(axis=-1)([his_input_title, repeat_uids])
84 |
85 | click_title_presents = layers.TimeDistributed(titleencoder)(his_title_uid)
86 |
87 | u_emb = layers.Reshape((self.hparams.user_emb_dim,))(
88 | user_embedding_layer(user_indexes)
89 | )
90 | user_present = PersonalizedAttentivePooling(
91 | self.hparams.history_size,
92 | self.hparams.filter_num,
93 | self.hparams.attention_hidden_dim,
94 | seed=self.seed,
95 | )(
96 | [
97 | click_title_presents,
98 | layers.Dense(self.hparams.attention_hidden_dim)(u_emb),
99 | ]
100 | )
101 |
102 | model = keras.Model(
103 | [his_input_title, user_indexes], user_present, name="user_encoder"
104 | )
105 | return model
106 |
107 | def _build_newsencoder(self, embedding_layer, user_embedding_layer):
108 | """The main function to create news encoder of NPA.
109 |
110 | Args:
111 | embedding_layer (object): a word embedding layer.
112 |
113 | Return:
114 | object: the news encoder of NPA.
115 | """
116 |
117 | sequence_title_uindex = keras.Input(
118 | shape=(self.hparams.title_size + 1,), dtype="int32"
119 | )
120 |
121 | sequences_input_title = layers.Lambda(
122 | lambda x: x[:, : self.hparams.title_size]
123 | )(sequence_title_uindex)
124 | user_index = layers.Lambda(lambda x: x[:, self.hparams.title_size :])(
125 | sequence_title_uindex
126 | )
127 |
128 | u_emb = layers.Reshape((self.hparams.user_emb_dim,))(
129 | user_embedding_layer(user_index)
130 | )
131 | embedded_sequences_title = embedding_layer(sequences_input_title)
132 |
133 | y = layers.Dropout(self.hparams.dropout)(embedded_sequences_title)
134 | y = layers.Conv1D(
135 | self.hparams.filter_num,
136 | self.hparams.window_size,
137 | activation=self.hparams.cnn_activation,
138 | padding="same",
139 | bias_initializer=keras.initializers.Zeros(),
140 | kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
141 | )(y)
142 | y = layers.Dropout(self.hparams.dropout)(y)
143 |
144 | pred_title = PersonalizedAttentivePooling(
145 | self.hparams.title_size,
146 | self.hparams.filter_num,
147 | self.hparams.attention_hidden_dim,
148 | seed=self.seed,
149 | )([y, layers.Dense(self.hparams.attention_hidden_dim)(u_emb)])
150 |
151 | # pred_title = Reshape((1, feature_size))(pred_title)
152 | model = keras.Model(sequence_title_uindex, pred_title, name="news_encoder")
153 | return model
154 |
155 | def _build_npa(self):
156 | """The main function to create NPA's logic. The core of NPA
157 | is a user encoder and a news encoder.
158 |
159 | Returns:
160 | object: a model used to train.
161 | object: a model used to evaluate and predict.
162 | """
163 |
164 | his_input_title = keras.Input(
165 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32"
166 | )
167 | pred_input_title = keras.Input(
168 | # shape=(hparams.npratio + 1, hparams.title_size), dtype="int32"
169 | shape=(None, self.hparams.title_size),
170 | dtype="int32",
171 | )
172 | pred_input_title_one = keras.Input(
173 | shape=(
174 | 1,
175 | self.hparams.title_size,
176 | ),
177 | dtype="int32",
178 | )
179 | pred_title_one_reshape = layers.Reshape((self.hparams.title_size,))(
180 | pred_input_title_one
181 | )
182 |
183 | user_indexes = keras.Input(shape=(1,), dtype="int32")
184 |
185 | nuser_index = layers.Reshape((1, 1))(user_indexes)
186 |
187 | # Calculate npratio + 1 based on the dynamic shape of pred_input_title
188 | npratio_plus_one = tf.shape(pred_input_title)[1]
189 |
190 | repeat_uindex = tf.tile(nuser_index, [1, npratio_plus_one, 1])
191 |
192 | pred_title_uindex = layers.Concatenate(axis=-1)(
193 | [pred_input_title, repeat_uindex]
194 | )
195 | pred_title_uindex_one = layers.Concatenate()(
196 | [pred_title_one_reshape, user_indexes]
197 | )
198 |
199 | embedding_layer = layers.Embedding(
200 | self.word2vec_embedding.shape[0],
201 | self.word2vec_embedding.shape[1],
202 | weights=[self.word2vec_embedding],
203 | trainable=True,
204 | )
205 |
206 | user_embedding_layer = layers.Embedding(
207 | input_dim=self.hparams.n_users + 1,
208 | output_dim=self.hparams.user_emb_dim,
209 | trainable=True,
210 | embeddings_initializer="zeros",
211 | )
212 |
213 | titleencoder = self._build_newsencoder(embedding_layer, user_embedding_layer)
214 | userencoder = self._build_userencoder(titleencoder, user_embedding_layer)
215 | newsencoder = titleencoder
216 |
217 | user_present = userencoder([his_input_title, user_indexes])
218 |
219 | news_present = layers.TimeDistributed(newsencoder)(pred_title_uindex)
220 | news_present_one = newsencoder(pred_title_uindex_one)
221 |
222 | preds = layers.Dot(axes=-1)([news_present, user_present])
223 | preds = layers.Activation(activation="softmax")(preds)
224 |
225 | pred_one = layers.Dot(axes=-1)([news_present_one, user_present])
226 | pred_one = layers.Activation(activation="sigmoid")(pred_one)
227 |
228 | model = keras.Model([user_indexes, his_input_title, pred_input_title], preds)
229 | scorer = keras.Model(
230 | [user_indexes, his_input_title, pred_input_title_one], pred_one
231 | )
232 |
233 | return model, scorer
234 |
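235 | # --- Usage sketch (added for illustration; not part of the original file) ---
236 | # Like LSTUR, NPA takes a user index alongside the clicked and candidate titles;
237 | # `hparams_npa` (ebrec.models.newsrec.model_config) provides n_users, user_emb_dim,
238 | # filter_num and the other attributes read above.
239 | # from ebrec.models.newsrec.model_config import hparams_npa
240 | # npa = NPAModel(hparams=hparams_npa, seed=42)
241 | # npa.scorer.summary()  # [user_indexes, his_input_title, pred_input_title_one] -> sigmoid score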
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/nrms.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 | # Licensed under the MIT License.
3 | from ebrec.models.newsrec.layers import AttLayer2, SelfAttention
4 | import tensorflow as tf
5 | import numpy as np
6 |
7 | from tensorflow.keras.layers import Embedding, Input, Dropout, Dense, BatchNormalization
8 | from tensorflow.keras.initializers import GlorotUniform
9 | from tensorflow.keras.regularizers import l2
10 |
11 |
12 | class NRMSModel:
13 | """NRMS model(Neural News Recommendation with Multi-Head Self-Attention)
14 |
15 | Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News
16 | Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference
17 | on Empirical Methods in Natural Language Processing and the 9th International Joint Conference
18 | on Natural Language Processing (EMNLP-IJCNLP)
19 |
20 | Attributes:
21 | """
22 |
23 | def __init__(
24 | self,
25 | hparams: dict,
26 | word2vec_embedding: np.ndarray = None,
27 | word_emb_dim: int = 300,
28 | vocab_size: int = 32000,
29 | seed: int = None,
30 | ):
31 | """Initialization steps for NRMS."""
32 | self.hparams = hparams
33 | self.seed = seed
34 |
35 | # SET SEED:
36 | tf.random.set_seed(seed)
37 | np.random.seed(seed)
38 |
39 | # INIT THE WORD-EMBEDDINGS:
40 | if word2vec_embedding is None:
41 | # Xavier Initialization
42 | initializer = GlorotUniform(seed=self.seed)
43 | self.word2vec_embedding = initializer(shape=(vocab_size, word_emb_dim))
44 | # self.word2vec_embedding = np.random.rand(vocab_size, word_emb_dim)
45 | else:
46 | self.word2vec_embedding = word2vec_embedding
47 |
48 | # BUILD AND COMPILE MODEL:
49 | self.model, self.scorer = self._build_graph()
50 | data_loss = self._get_loss(self.hparams.loss)
51 | train_optimizer = self._get_opt(
52 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate
53 | )
54 | self.model.compile(loss=data_loss, optimizer=train_optimizer)
55 |
56 | def _get_loss(self, loss: str):
57 | """Make loss function, consists of data loss and regularization loss
58 | Returns:
59 | object: Loss function or loss function name
60 | """
61 | if loss == "cross_entropy_loss":
62 | data_loss = "categorical_crossentropy"
63 | elif loss == "log_loss":
64 | data_loss = "binary_crossentropy"
65 | else:
66 | raise ValueError(f"this loss not defined {loss}")
67 | return data_loss
68 |
69 | def _get_opt(self, optimizer: str, lr: float):
70 | """Get the optimizer according to configuration. Usually we will use Adam.
71 | Returns:
72 | object: An optimizer.
73 | """
74 | # TODO: shouldn't be a string input you should just set the optimizer, to avoid stuff like this:
75 | # => 'WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.'
76 | if optimizer == "adam":
77 | train_opt = tf.keras.optimizers.Adam(learning_rate=lr)
78 | else:
79 | raise ValueError(f"this optimizer not defined {optimizer}")
80 | return train_opt
81 |
82 | def _build_graph(self):
83 | """Build NRMS model and scorer.
84 |
85 | Returns:
86 | object: a model used to train.
87 | object: a model used to evaluate and inference.
88 | """
89 | model, scorer = self._build_nrms()
90 | return model, scorer
91 |
92 | def _build_userencoder(self, titleencoder):
93 | """The main function to create user encoder of NRMS.
94 |
95 | Args:
96 | titleencoder (object): the news encoder of NRMS.
97 |
98 | Return:
99 | object: the user encoder of NRMS.
100 | """
101 | his_input_title = tf.keras.Input(
102 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32"
103 | )
104 |
105 | click_title_presents = tf.keras.layers.TimeDistributed(titleencoder)(
106 | his_input_title
107 | )
108 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)(
109 | [click_title_presents] * 3
110 | )
111 | user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y)
112 |
113 | model = tf.keras.Model(his_input_title, user_present, name="user_encoder")
114 | return model
115 |
116 | def _build_newsencoder(self, units_per_layer: list[int] = None):
117 | """The main function to create news encoder of NRMS.
118 |
119 | Args:
120 |             units_per_layer (list[int], optional): hidden units of the extra Dense layers added after self-attention.
121 |
122 | Return:
123 | object: the news encoder of NRMS.
124 | """
125 | embedding_layer = tf.keras.layers.Embedding(
126 | self.word2vec_embedding.shape[0],
127 | self.word2vec_embedding.shape[1],
128 | weights=[self.word2vec_embedding],
129 | trainable=True,
130 | )
131 | sequences_input_title = tf.keras.Input(
132 | shape=(self.hparams.title_size,), dtype="int32"
133 | )
134 | embedded_sequences_title = embedding_layer(sequences_input_title)
135 |
136 | y = tf.keras.layers.Dropout(self.hparams.dropout)(embedded_sequences_title)
137 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)(
138 | [y, y, y]
139 | )
140 |
141 | # Create configurable Dense layers (the if - else is something I've added):
142 | if units_per_layer:
143 | for layer in units_per_layer:
144 | y = tf.keras.layers.Dense(
145 | units=layer,
146 | activation="relu",
147 | kernel_regularizer=tf.keras.regularizers.l2(
148 | self.hparams.newsencoder_l2_regularization
149 | ),
150 | )(y)
151 | y = tf.keras.layers.BatchNormalization()(y)
152 | y = tf.keras.layers.Dropout(self.hparams.dropout)(y)
153 | else:
154 | y = tf.keras.layers.Dropout(self.hparams.dropout)(y)
155 |
156 | pred_title = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y)
157 |
158 | model = tf.keras.Model(sequences_input_title, pred_title, name="news_encoder")
159 | return model
160 |
161 | def _build_nrms(self):
162 | """The main function to create NRMS's logic. The core of NRMS
163 | is a user encoder and a news encoder.
164 |
165 | Returns:
166 | object: a model used to train.
167 | object: a model used to evaluate and inference.
168 | """
169 |
170 | his_input_title = tf.keras.Input(
171 | shape=(self.hparams.history_size, self.hparams.title_size),
172 | dtype="int32",
173 | )
174 | pred_input_title = tf.keras.Input(
175 | # shape = (hparams.npratio + 1, hparams.title_size)
176 | shape=(None, self.hparams.title_size),
177 | dtype="int32",
178 | )
179 | pred_input_title_one = tf.keras.Input(
180 | shape=(
181 | 1,
182 | self.hparams.title_size,
183 | ),
184 | dtype="int32",
185 | )
186 | pred_title_one_reshape = tf.keras.layers.Reshape((self.hparams.title_size,))(
187 | pred_input_title_one
188 | )
189 | titleencoder = self._build_newsencoder(
190 | units_per_layer=self.hparams.newsencoder_units_per_layer
191 | )
192 | self.userencoder = self._build_userencoder(titleencoder)
193 | self.newsencoder = titleencoder
194 |
195 | user_present = self.userencoder(his_input_title)
196 | news_present = tf.keras.layers.TimeDistributed(self.newsencoder)(
197 | pred_input_title
198 | )
199 | news_present_one = self.newsencoder(pred_title_one_reshape)
200 |
201 | preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present])
202 | preds = tf.keras.layers.Activation(activation="softmax")(preds)
203 |
204 | pred_one = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present])
205 | pred_one = tf.keras.layers.Activation(activation="sigmoid")(pred_one)
206 |
207 | model = tf.keras.Model([his_input_title, pred_input_title], preds)
208 | scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one)
209 |
210 | return model, scorer
211 |
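212 | # --- Usage sketch (added for illustration; not part of the original file) ---
213 | # `hparams_nrms` comes from ebrec.models.newsrec.model_config; any (vocab, emb_dim)
214 | # array can serve as word2vec_embedding, and the token ids below are dummies.
215 | # from ebrec.models.newsrec.model_config import hparams_nrms
216 | # nrms = NRMSModel(hparams=hparams_nrms, word2vec_embedding=np.random.rand(1_000, 300), seed=42)
217 | # his = np.random.randint(0, 1_000, (4, hparams_nrms.history_size, hparams_nrms.title_size))
218 | # cand = np.random.randint(0, 1_000, (4, 1, hparams_nrms.title_size))
219 | # scores = nrms.scorer.predict([his, cand])  # (4, 1) click probabilities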
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/nrms_docvec.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 | # Licensed under the MIT License.
3 | from ebrec.models.newsrec.layers import AttLayer2, SelfAttention
4 | import tensorflow as tf
5 | import numpy as np
6 |
7 |
8 | class NRMSDocVec:
9 | """
10 | Modified NRMS model (Neural News Recommendation with Multi-Head Self-Attention)
11 | - Initiated with article-embeddings.
12 |
13 | Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News
14 | Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference
15 | on Empirical Methods in Natural Language Processing and the 9th International Joint Conference
16 | on Natural Language Processing (EMNLP-IJCNLP)
17 |
18 | Attributes:
19 | """
20 |
21 | def __init__(
22 | self,
23 | hparams: dict,
24 | seed: int = None,
25 | ):
26 | """Initialization steps for NRMS."""
27 | self.hparams = hparams
28 | self.seed = seed
29 |
30 | # SET SEED:
31 | tf.random.set_seed(seed)
32 | np.random.seed(seed)
33 | # BUILD AND COMPILE MODEL:
34 | self.model, self.scorer = self._build_graph()
35 | data_loss = self._get_loss(self.hparams.loss)
36 | train_optimizer = self._get_opt(
37 | optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate
38 | )
39 | self.model.compile(loss=data_loss, optimizer=train_optimizer)
40 |
41 | def _get_loss(self, loss: str):
42 | """Make loss function, consists of data loss and regularization loss
43 | Returns:
44 | object: Loss function or loss function name
45 | """
46 | if loss == "cross_entropy_loss":
47 | data_loss = "categorical_crossentropy"
48 | elif loss == "log_loss":
49 | data_loss = "binary_crossentropy"
50 | else:
51 | raise ValueError(f"this loss not defined {loss}")
52 | return data_loss
53 |
54 | def _get_opt(self, optimizer: str, lr: float):
55 | """Get the optimizer according to configuration. Usually we will use Adam.
56 | Returns:
57 | object: An optimizer.
58 | """
59 | if optimizer == "adam":
60 | train_opt = tf.keras.optimizers.Adam(learning_rate=lr)
61 | else:
62 | raise ValueError(f"this optimizer not defined {optimizer}")
63 | return train_opt
64 |
65 | def _build_graph(self):
66 | """Build NRMS model and scorer.
67 |
68 | Returns:
69 | object: a model used to train.
70 | object: a model used to evaluate and inference.
71 | """
72 | model, scorer = self._build_nrms()
73 | return model, scorer
74 |
75 | def _build_userencoder(self, titleencoder):
76 | """The main function to create user encoder of NRMS.
77 |
78 | Args:
79 | titleencoder (object): the news encoder of NRMS.
80 |
81 | Return:
82 | object: the user encoder of NRMS.
83 | """
84 | his_input_title = tf.keras.Input(
85 | shape=(self.hparams.history_size, self.hparams.title_size), dtype="float32"
86 | )
87 |
88 | click_title_presents = tf.keras.layers.TimeDistributed(titleencoder)(
89 | his_input_title
90 | )
91 | y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)(
92 | [click_title_presents] * 3
93 | )
94 | user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y)
95 |
96 | model = tf.keras.Model(his_input_title, user_present, name="user_encoder")
97 | return model
98 |
99 |     def _build_newsencoder(self, units_per_layer: list[int] = [512, 512, 512]):
100 | """THIS IS OUR IMPLEMENTATION.
101 | The main function to create a news encoder.
102 |
103 | Parameters:
104 |             units_per_layer (list[int]): The number of neurons in each Dense layer.
105 |
106 | Return:
107 | object: the news encoder.
108 | """
109 | DOCUMENT_VECTOR_DIM = self.hparams.title_size
110 | OUTPUT_DIM = self.hparams.head_num * self.hparams.head_dim
111 |
112 | # DENSE LAYERS (FINE-TUNED):
113 | sequences_input_title = tf.keras.Input(
114 |             shape=(DOCUMENT_VECTOR_DIM,), dtype="float32"
115 | )
116 | x = sequences_input_title
117 | # Create configurable Dense layers:
118 | for layer in units_per_layer:
119 | x = tf.keras.layers.Dense(
120 | units=layer,
121 | activation="relu",
122 | kernel_regularizer=tf.keras.regularizers.l2(
123 | self.hparams.newsencoder_l2_regularization
124 | ),
125 | )(x)
126 | x = tf.keras.layers.BatchNormalization()(x)
127 | x = tf.keras.layers.Dropout(self.hparams.dropout)(x)
128 |
129 | # OUTPUT:
130 | pred_title = tf.keras.layers.Dense(units=OUTPUT_DIM, activation="relu")(x)
131 |
132 | # Construct the final model
133 | model = tf.keras.Model(
134 | inputs=sequences_input_title, outputs=pred_title, name="news_encoder"
135 | )
136 |
137 | return model
138 |
139 | def _build_nrms(self):
140 | """The main function to create NRMS's logic. The core of NRMS
141 | is a user encoder and a news encoder.
142 |
143 | Returns:
144 | object: a model used to train.
145 | object: a model used to evaluate and inference.
146 | """
147 |
148 | his_input_title = tf.keras.Input(
149 | shape=(self.hparams.history_size, self.hparams.title_size),
150 | dtype="float32",
151 | )
152 | pred_input_title = tf.keras.Input(
153 | # shape = (hparams.npratio + 1, hparams.title_size)
154 | shape=(None, self.hparams.title_size),
155 | dtype="float32",
156 | )
157 | pred_input_title_one = tf.keras.Input(
158 | shape=(
159 | 1,
160 | self.hparams.title_size,
161 | ),
162 | dtype="float32",
163 | )
164 | pred_title_one_reshape = tf.keras.layers.Reshape((self.hparams.title_size,))(
165 | pred_input_title_one
166 | )
167 | titleencoder = self._build_newsencoder(
168 | units_per_layer=self.hparams.newsencoder_units_per_layer
169 | )
170 | self.userencoder = self._build_userencoder(titleencoder)
171 | self.newsencoder = titleencoder
172 |
173 | user_present = self.userencoder(his_input_title)
174 | news_present = tf.keras.layers.TimeDistributed(self.newsencoder)(
175 | pred_input_title
176 | )
177 | news_present_one = self.newsencoder(pred_title_one_reshape)
178 |
179 | preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present])
180 | preds = tf.keras.layers.Activation(activation="softmax")(preds)
181 |
182 | pred_one = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present])
183 | pred_one = tf.keras.layers.Activation(activation="sigmoid")(pred_one)
184 |
185 | model = tf.keras.Model([his_input_title, pred_input_title], preds)
186 | scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one)
187 |
188 | return model, scorer
189 |
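190 | # --- Usage sketch (added for illustration; not part of the original file) ---
191 | # Unlike NRMSModel, the inputs are float document vectors of dimension
192 | # hparams_nrms_docvec.title_size (768), e.g. precomputed article embeddings.
193 | # from ebrec.models.newsrec.model_config import hparams_nrms_docvec
194 | # nrms_dv = NRMSDocVec(hparams=hparams_nrms_docvec, seed=42)
195 | # his = np.random.rand(4, hparams_nrms_docvec.history_size, hparams_nrms_docvec.title_size)
196 | # cand = np.random.rand(4, 1, hparams_nrms_docvec.title_size)
197 | # scores = nrms_dv.scorer.predict([his, cand])  # (4, 1) click probabilities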
--------------------------------------------------------------------------------
/src/ebrec/models/newsrec/utils.py:
--------------------------------------------------------------------------------
1 | class set_args:
2 | def __init__(self, args_dict):
3 | _ = [setattr(set_args, key, val) for key, val in args_dict.items()]
4 |
5 |
6 | def print_n_parameters(model) -> None:
7 | num_params = model.count_params()
8 | print("Number of parameters:", num_params)
9 |
10 |
11 | def print_parameter_device(model) -> None:
12 | for variable in model.variables:
13 | print(f"Variable name: {variable.name}, Device: {variable.device}")
14 |
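15 | # --- Usage sketch (added for illustration; not part of the original file) ---
16 | # set_args turns a plain dict into the attribute-style hparams object the models expect.
17 | # Note it assigns onto the class itself, so all instances share the same attributes.
18 | # hparams = set_args({"title_size": 30, "history_size": 20})
19 | # hparams.title_size  # -> 30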
--------------------------------------------------------------------------------
/src/ebrec/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/src/ebrec/utils/__init__.py
--------------------------------------------------------------------------------
/src/ebrec/utils/_articles.py:
--------------------------------------------------------------------------------
1 | from ebrec.utils._python import create_lookup_dict
2 | import polars as pl
3 | from ebrec.utils._constants import DEFAULT_ARTICLE_ID_COL
4 |
5 | try:
6 | from transformers import AutoTokenizer
7 | except ImportError:
8 | print("transformers not available")
9 |
10 |
11 | def load_article_id_embeddings(
12 | df: pl.DataFrame, path: str, item_col: str = DEFAULT_ARTICLE_ID_COL
13 | ) -> pl.DataFrame:
14 | """Load embeddings artifacts and join to articles on 'article_id'
15 | Args:
16 | path (str): Path to document embeddings
17 | """
18 | return df.join(pl.read_parquet(path), on=item_col, how="left")
19 |
20 |
21 | def create_article_id_to_value_mapping(
22 | df: pl.DataFrame,
23 | value_col: str,
24 | article_col: str = DEFAULT_ARTICLE_ID_COL,
25 | ):
26 | return create_lookup_dict(
27 | df.select(article_col, value_col), key=article_col, value=value_col
28 | )
29 |
30 |
31 | def convert_text2encoding_with_transformers(
32 | df: pl.DataFrame,
33 | tokenizer: AutoTokenizer,
34 | column: str,
35 | max_length: int = None,
36 | ) -> pl.DataFrame:
37 | """Converts text in a specified DataFrame column to tokens using a provided tokenizer.
38 | Args:
39 | df (pl.DataFrame): The input DataFrame containing the text column.
40 | tokenizer (AutoTokenizer): The tokenizer to use for encoding the text. (from transformers import AutoTokenizer)
41 | column (str): The name of the column containing the text.
42 | max_length (int, optional): The maximum length of the encoded tokens. Defaults to None.
43 | Returns:
44 | pl.DataFrame: A new DataFrame with an additional column containing the encoded tokens.
45 | Example:
46 | >>> from transformers import AutoTokenizer
47 | >>> import polars as pl
48 | >>> df = pl.DataFrame({
49 | 'text': ['This is a test.', 'Another test string.', 'Yet another one.']
50 | })
51 | >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
52 | >>> encoded_df, new_column = convert_text2encoding_with_transformers(df, tokenizer, 'text', max_length=20)
53 | >>> print(encoded_df)
54 | shape: (3, 2)
55 | ┌──────────────────────┬───────────────────────────────┐
56 | │ text ┆ text_encode_bert-base-uncased │
57 | │ --- ┆ --- │
58 | │ str ┆ list[i64] │
59 | ╞══════════════════════╪═══════════════════════════════╡
60 | │ This is a test. ┆ [2023, 2003, … 0] │
61 | │ Another test string. ┆ [2178, 3231, … 0] │
62 | │ Yet another one. ┆ [2664, 2178, … 0] │
63 | └──────────────────────┴───────────────────────────────┘
64 | >>> print(new_column)
65 | text_encode_bert-base-uncased
66 | """
67 | text = df[column].to_list()
68 | # set columns
69 | new_column = f"{column}_encode_{tokenizer.name_or_path}"
70 |     # If 'max_length' is provided then set it, else encode each string at its original length
71 | padding = "max_length" if max_length else False
72 | encoded_tokens = tokenizer(
73 | text,
74 | add_special_tokens=False,
75 | padding=padding,
76 | max_length=max_length,
77 | truncation=True,
78 | )["input_ids"]
79 | return df.with_columns(pl.Series(new_column, encoded_tokens)), new_column
80 |
81 |
82 | def create_sort_based_prediction_score(
83 | df: pl.DataFrame,
84 | column: str,
85 | desc: bool,
86 | article_id_col: str = DEFAULT_ARTICLE_ID_COL,
87 | prediction_score_col: str = "prediction_score",
88 | ) -> pl.DataFrame:
89 | """
90 | Generates a prediction score for each row in a Polars DataFrame based on the sorting of a specified column.
91 |
92 | Args:
93 | df (pl.DataFrame): The input DataFrame to process.
94 | column (str): The name of the column to sort by and to base the prediction scores on.
95 | desc (bool): Determines the sorting order. If True, sort in descending order; otherwise, in ascending order.
96 |         article_id_col (str, optional): The name of the article ID column. Defaults to "article_id".
97 | prediction_score_col (str, optional): The name to assign to the prediction score column. Defaults to "prediction_score".
98 |
99 | Returns:
100 | pl.DataFrame: A Polars DataFrame including the original data along with the new prediction score column.
101 |
102 | Examples:
103 | >>> import polars as pl
104 | >>> df = pl.DataFrame({
105 | "article_id": [1, 2, 3, 4, 5],
106 | "views": [100, 150, 200, 50, 300],
107 | })
108 | >>> create_sort_based_prediction_score(df, "views", True)
109 | shape: (5, 3)
110 | ┌────────────┬───────┬──────────────────┐
111 | │ article_id ┆ views ┆ prediction_score │
112 | │ --- ┆ --- ┆ --- │
113 | │ i64 ┆ i64 ┆ f64 │
114 | ╞════════════╪═══════╪══════════════════╡
115 | │ 5 ┆ 300 ┆ 1.0 │
116 | │ 3 ┆ 200 ┆ 0.5 │
117 | │ 2 ┆ 150 ┆ 0.333333 │
118 | │ 1 ┆ 100 ┆ 0.25 │
119 | │ 4 ┆ 50 ┆ 0.2 │
120 | └────────────┴───────┴──────────────────┘
121 | """
122 | _TEMP_NAME = "index"
123 | return (
124 | (
125 | df.select(article_id_col, column)
126 | .sort(by=column, descending=desc)
127 | .with_row_index(name=_TEMP_NAME, offset=1)
128 | )
129 | .with_columns((1 / pl.col(_TEMP_NAME)).alias(prediction_score_col))
130 | .drop(_TEMP_NAME)
131 | )
132 |
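133 | # --- Usage sketch (added for illustration; column names are illustrative) ---
134 | # Typical preprocessing for the token-based models: tokenize the title column and build
135 | # an article_id -> token-ids lookup for the dataloaders.
136 | # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
137 | # df_articles = pl.read_parquet("articles.parquet")
138 | # df_articles, token_col = convert_text2encoding_with_transformers(
139 | #     df_articles, tokenizer, column="title", max_length=30
140 | # )
141 | # article_mapping = create_article_id_to_value_mapping(df_articles, value_col=token_col)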
--------------------------------------------------------------------------------
/src/ebrec/utils/_articles_behaviors.py:
--------------------------------------------------------------------------------
1 | from ebrec.utils._python import generate_unique_name
2 |
3 | try:
4 | import polars as pl
5 | except ImportError:
6 | print("polars not available")
7 |
8 |
9 | def map_list_article_id_to_value(
10 | behaviors: pl.DataFrame,
11 | behaviors_column: str,
12 | mapping: dict[int, pl.Series],
13 | drop_nulls: bool = False,
14 | fill_nulls: any = None,
15 | ) -> pl.DataFrame:
16 | """
17 |
18 | Maps the values of a column in a DataFrame `behaviors` containing article IDs to their corresponding values
19 | in a column in another DataFrame `articles`. The mapping is performed using a dictionary constructed from
20 | the two DataFrames. The resulting DataFrame has the same columns as `behaviors`, but with the article IDs
21 | replaced by their corresponding values.
22 |
23 | Args:
24 | behaviors (pl.DataFrame): The DataFrame containing the column to be mapped.
25 | behaviors_column (str): The name of the column to be mapped in `behaviors`.
26 | mapping (dict[int, pl.Series]): A dictionary with article IDs as keys and corresponding values as values.
27 | Note, 'replace' works a lot faster when values are of type pl.Series!
28 | drop_nulls (bool): If `True`, any rows in the resulting DataFrame with null values will be dropped.
29 |             If `False` and `fill_nulls` is specified, null values in `behaviors_column` will be replaced with `fill_nulls`.
30 | fill_nulls (Optional[any]): If specified, any null values in `behaviors_column` will be replaced with this value.
31 |
32 | Returns:
33 | pl.DataFrame: A new DataFrame with the same columns as `behaviors`, but with the article IDs in
34 | `behaviors_column` replaced by their corresponding values in `mapping`.
35 |
36 | Example:
37 | >>> behaviors = pl.DataFrame(
38 | {"user_id": [1, 2, 3, 4, 5], "article_ids": [["A1", "A2"], ["A2", "A3"], ["A1", "A4"], ["A4", "A4"], None]}
39 | )
40 | >>> articles = pl.DataFrame(
41 | {
42 | "article_id": ["A1", "A2", "A3"],
43 | "article_type": ["News", "Sports", "Entertainment"],
44 | }
45 | )
46 | >>> articles_dict = dict(zip(articles["article_id"], articles["article_type"]))
47 | >>> map_list_article_id_to_value(
48 | behaviors=behaviors,
49 | behaviors_column="article_ids",
50 | mapping=articles_dict,
51 | fill_nulls="Unknown",
52 | )
53 |         shape: (5, 2)
54 | ┌─────────┬─────────────────────────────┐
55 | │ user_id ┆ article_ids │
56 | │ --- ┆ --- │
57 | │ i64 ┆ list[str] │
58 | ╞═════════╪═════════════════════════════╡
59 | │ 1 ┆ ["News", "Sports"] │
60 | │ 2 ┆ ["Sports", "Entertainment"] │
61 | │ 3 ┆ ["News", "Unknown"] │
62 | │ 4 ┆ ["Unknown", "Unknown"] │
63 | │ 5 ┆ ["Unknown"] │
64 | └─────────┴─────────────────────────────┘
65 | >>> map_list_article_id_to_value(
66 | behaviors=behaviors,
67 | behaviors_column="article_ids",
68 | mapping=articles_dict,
69 | drop_nulls=True,
70 | )
71 |         shape: (5, 2)
72 | ┌─────────┬─────────────────────────────┐
73 | │ user_id ┆ article_ids │
74 | │ --- ┆ --- │
75 | │ i64 ┆ list[str] │
76 | ╞═════════╪═════════════════════════════╡
77 | │ 1 ┆ ["News", "Sports"] │
78 | │ 2 ┆ ["Sports", "Entertainment"] │
79 | │ 3 ┆ ["News"] │
80 | │ 4 ┆ null │
81 | │ 5 ┆ null │
82 | └─────────┴─────────────────────────────┘
83 | >>> map_list_article_id_to_value(
84 | behaviors=behaviors,
85 | behaviors_column="article_ids",
86 | mapping=articles_dict,
87 | drop_nulls=False,
88 | )
89 |         shape: (5, 2)
90 | ┌─────────┬─────────────────────────────┐
91 | │ user_id ┆ article_ids │
92 | │ --- ┆ --- │
93 | │ i64 ┆ list[str] │
94 | ╞═════════╪═════════════════════════════╡
95 | │ 1 ┆ ["News", "Sports"] │
96 | │ 2 ┆ ["Sports", "Entertainment"] │
97 | │ 3 ┆ ["News", null] │
98 | │ 4 ┆ [null, null] │
99 | │ 5 ┆ [null] │
100 | └─────────┴─────────────────────────────┘
101 | """
102 | GROUPBY_ID = generate_unique_name(behaviors.columns, "_groupby_id")
103 | behaviors = behaviors.lazy().with_row_index(GROUPBY_ID)
104 | # =>
105 | select_column = (
106 | behaviors.select(pl.col(GROUPBY_ID), pl.col(behaviors_column))
107 | .explode(behaviors_column)
108 | .with_columns(pl.col(behaviors_column).replace(mapping, default=None))
109 | .collect()
110 | )
111 | # =>
112 | if drop_nulls:
113 | select_column = select_column.drop_nulls()
114 | elif fill_nulls is not None:
115 | select_column = select_column.with_columns(
116 | pl.col(behaviors_column).fill_null(fill_nulls)
117 | )
118 | # =>
119 | select_column = (
120 | select_column.lazy().group_by(GROUPBY_ID).agg(behaviors_column).collect()
121 | )
122 | return (
123 | behaviors.drop(behaviors_column)
124 | .collect()
125 | .join(select_column, on=GROUPBY_ID, how="left")
126 | .drop(GROUPBY_ID)
127 | )
128 |
--------------------------------------------------------------------------------
/src/ebrec/utils/_constants.py:
--------------------------------------------------------------------------------
1 | # BEHAVIORS
2 | DEFAULT_IMPRESSION_TIMESTAMP_COL = "impression_time"
3 | DEFAULT_IS_BEYOND_ACCURACY_COL = "is_beyond_accuracy"
4 | DEFAULT_CLICKED_ARTICLES_COL = "article_ids_clicked"
5 | DEFAULT_SCROLL_PERCENTAGE_COL = "scroll_percentage"
6 | DEFAULT_INVIEW_ARTICLES_COL = "article_ids_inview"
7 | DEFAULT_IMPRESSION_ID_COL = "impression_id"
8 | DEFAULT_IS_SUBSCRIBER_COL = "is_subscriber"
9 | DEFAULT_IS_SSO_USER_COL = "is_sso_user"
10 | DEFAULT_ARTICLE_ID_COL = "article_id"
11 | DEFAULT_SESSION_ID_COL = "session_id"
12 | DEFAULT_READ_TIME_COL = "read_time"
13 | DEFAULT_DEVICE_COL = "device_type"
14 | DEFAULT_POSTCODE_COL = "postcode"
15 | DEFAULT_GENDER_COL = "gender"
16 | DEFAULT_USER_COL = "user_id"
17 | DEFAULT_AGE_COL = "age"
18 |
19 | DEFAULT_NEXT_SCROLL_PERCENTAGE_COL = f"next_{DEFAULT_SCROLL_PERCENTAGE_COL}"
20 | DEFAULT_NEXT_READ_TIME_COL = f"next_{DEFAULT_READ_TIME_COL}"
21 |
22 | # ARTICLES
23 | DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL = "last_modified_time"
24 | DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL = "published_time"
25 | DEFAULT_SENTIMENT_LABEL_COL = "sentiment_label"
26 | DEFAULT_SENTIMENT_SCORE_COL = "sentiment_score"
27 | DEFAULT_TOTAL_READ_TIME_COL = "total_read_time"
28 | DEFAULT_TOTAL_PAGEVIEWS_COL = "total_pageviews"
29 | DEFAULT_TOTAL_INVIEWS_COL = "total_inviews"
30 | DEFAULT_ARTICLE_TYPE_COL = "article_type"
31 | DEFAULT_CATEGORY_STR_COL = "category_str"
32 | DEFAULT_SUBCATEGORY_COL = "subcategory"
33 | DEFAULT_ENTITIES_COL = "entity_groups"
34 | DEFAULT_IMAGE_IDS_COL = "image_ids"
35 | DEFAULT_SUBTITLE_COL = "subtitle"
36 | DEFAULT_CATEGORY_COL = "category"
37 | DEFAULT_NER_COL = "ner_clusters"
38 | DEFAULT_PREMIUM_COL = "premium"
39 | DEFAULT_TOPICS_COL = "topics"
40 | DEFAULT_TITLE_COL = "title"
41 | DEFAULT_BODY_COL = "body"
42 | DEFAULT_URL_COL = "url"
43 |
44 | # HISTORY
45 | DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL = f"{DEFAULT_IMPRESSION_TIMESTAMP_COL}_fixed"
46 | DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL = f"{DEFAULT_SCROLL_PERCENTAGE_COL}_fixed"
47 | DEFAULT_HISTORY_ARTICLE_ID_COL = f"{DEFAULT_ARTICLE_ID_COL}_fixed"
48 | DEFAULT_HISTORY_READ_TIME_COL = f"{DEFAULT_READ_TIME_COL}_fixed"
49 |
50 | # CREATE
51 | DEFAULT_KNOWN_USER_COL = "is_known_user"
52 | DEFAULT_LABELS_COL = "labels"
53 |
--------------------------------------------------------------------------------
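The constants above are the canonical column names of the EB-NeRD parquet files. A minimal sketch of using them to select a consistent subset of the behaviors log (the local path is a placeholder, not part of the repository):

import polars as pl

from ebrec.utils._constants import (
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_USER_COL,
)

# Placeholder path; point it at a local EB-NeRD split.
df_behaviors = (
    pl.scan_parquet("ebnerd_small/train/behaviors.parquet")
    .select(
        DEFAULT_USER_COL,
        DEFAULT_IMPRESSION_TIMESTAMP_COL,
        DEFAULT_INVIEW_ARTICLES_COL,
        DEFAULT_CLICKED_ARTICLES_COL,
    )
    .collect()
)
--------------------------------------------------------------------------------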
/src/ebrec/utils/_decay.py:
--------------------------------------------------------------------------------
1 | try:
2 | import polars as pl
3 | except ImportError:
4 | print("polars not available")
5 |
6 |
7 | def linear_decay_weights(n: int, ascending: bool = True, **kwargs) -> list[float]:
8 | """
9 | Generates a list of weights in a linear decaying pattern.
10 | Args:
11 | n (int): The number of weights to generate. Must be a positive integer.
12 | ascending (bool, optional): Flag to determine the order of decay.
13 | If True, the decay is ascending. If False, it's descending.
14 | Defaults to True.
15 | Returns:
16 | List[float]: A list of linearly decaying weights.
17 | Raises:
18 | ValueError: If 'n' is not a positive integer.
19 | Examples:
20 | >>> linear_decay_weights(5, True)
21 | [0.2, 0.4, 0.6, 0.8, 1.0]
22 | >>> linear_decay_weights(10, False)
23 | [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
24 | """
25 | weights = [(n - i) / n for i in range(n)]
26 | return weights if not ascending else weights[::-1]
27 |
28 |
29 | def exponential_decay_weights(
30 | n: int, lambda_factor: float, ascending: bool = True, **kwargs
31 | ) -> list[float]:
32 | """
33 | Generates a list of weights in an exponential decay pattern.
34 | Args:
35 | n (int): The number of weights to generate. Must be a non-negative integer.
36 | lambda_factor (float): The factor by which the weights decay exponentially.
37 | ascending (bool, optional): Flag to determine the order of decay.
38 | If True, the decay is ascending. If False, it's descending.
39 | Defaults to True.
40 | Returns:
41 | List[float]: A list of exponentially decaying weights.
42 | Raises:
43 | ValueError: If 'n' is negative.
44 | Examples:
45 | >>> exponential_decay_weights(5, 0.5, True)
46 | [0.0625, 0.125, 0.25, 0.5, 1.0]
47 | >>> exponential_decay_weights(10, 0.5, False)
48 | [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, 0.001953125]
49 | """
50 | weights = [lambda_factor ** (n - i - 1) for i in range(n)]
51 | return weights if ascending else weights[::-1]
52 |
53 |
54 | def add_decay_weights(
55 | df, column: str, decay_func: callable, ascending: bool = True, **kwargs: dict
56 | ):
57 | """
58 |     Wrapper function: adds decay weights to a list column using the given decay-function scheme.
59 | >>> df = pl.DataFrame(
60 | {
61 | "col1": [
62 | [[1], [1], [1], [1]],
63 | [[1, 1], [1, 1], [1, 1]],
64 | [[1, 1, 1], [1, 1, 1]],
65 | None,
66 | ],
67 | "col2": [4, 5, 6, 7],
68 | }
69 | )
70 | >>> add_decay_weights(df, "col1", decay_func=linear_decay_weights, ascending=True)
71 | shape: (4, 3)
72 | ┌──────────────────────────┬───────────────────────────┬──────┐
73 | │ col1 ┆ col1_weights ┆ col2 │
74 | │ --- ┆ --- ┆ --- │
75 | │ list[list[i64]] ┆ list[f64] ┆ i64 │
76 | ╞══════════════════════════╪═══════════════════════════╪══════╡
77 | │ [[1], [1], … [1]] ┆ [0.25, 0.5, … 1.0] ┆ 4 │
78 | │ [[1, 1], [1, 1], [1, 1]] ┆ [0.333333, 0.666667, 1.0] ┆ 5 │
79 | │ [[1, 1, 1], [1, 1, 1]] ┆ [0.5, 1.0] ┆ 6 │
80 | │ null ┆ [] ┆ 7 │
81 | └──────────────────────────┴───────────────────────────┴──────┘
82 | >>> add_decay_weights(df, "col1", decay_func=exponential_decay_weights, ascending=True, **{"lambda_factor" : 0.5})
83 | shape: (4, 3)
84 | ┌──────────────────────────┬──────────────────────┬──────┐
85 | │ col1 ┆ col1_weights ┆ col2 │
86 | │ --- ┆ --- ┆ --- │
87 | │ list[list[i64]] ┆ list[f64] ┆ i64 │
88 | ╞══════════════════════════╪══════════════════════╪══════╡
89 | │ [[1], [1], … [1]] ┆ [0.125, 0.25, … 1.0] ┆ 4 │
90 | │ [[1, 1], [1, 1], [1, 1]] ┆ [0.25, 0.5, 1.0] ┆ 5 │
91 | │ [[1, 1, 1], [1, 1, 1]] ┆ [0.5, 1.0] ┆ 6 │
92 | │ null ┆ [] ┆ 7 │
93 | └──────────────────────────┴──────────────────────┴──────┘
94 | """
95 | lengths = df[column].list.len().to_list()
96 | weights = [decay_func(n=i, ascending=ascending, **kwargs) for i in lengths]
97 | return df.with_columns(pl.Series(f"{column}_weights", weights))
98 |
99 |
100 | def decay_weighting_nested_lists(
101 | df, column_history: str, column_history_weights: str, fill_nulls: int = None
102 | ):
103 | """
104 | >>> df = pl.DataFrame(
105 | {
106 | "col1": [
107 | [[1], [1], [1], [1]],
108 | [[1, 1], [1, 1], [1, 1]],
109 | [[1, 1, 1], [1, 1, 1]],
110 | [[1], None],
111 | None,
112 | ],
113 | "col1_weights":
114 | [[0.25, 0.5, 0.75, 1.0],
115 | [0.33, 0.67, 1.0],
116 | [0.5, 1.0],
117 | [0.5, 1.0],
118 | []
119 | ],
120 | "col2": [4, 5, 6, 7, 8 ],
121 | }
122 | )
123 | >>> decay_weighting_nested_lists(df, column_history="col1", column_history_weights="col1_weights")["col1"]
124 | Series: 'col1' [list[list[f64]]]
125 | [
126 | [[0.25], [0.5], … [1.0]]
127 | [[0.33, 0.33], [0.67, 0.67], [1.0, 1.0]]
128 | [[0.5, 0.5, 0.5], [1.0, 1.0, 1.0]]
129 | [[0.5], [null]]
130 | null
131 | ]
132 | >>> decay_weighting_nested_lists(df.lazy(), "col1", "col1_weights").collect()
133 | """
134 | GROUP_BY_COLUMN_FIRST = "group_by_1"
135 | GROUP_BY_COLUMN_SECOND = "group_by_2"
136 | COLUMNS = df.columns
137 |
138 | df = df.with_row_count(GROUP_BY_COLUMN_FIRST)
139 |
140 | exploded_weights = df.drop_nulls(column_history).select(
141 | pl.col(column_history_weights).explode()
142 | )
143 |
144 | if isinstance(exploded_weights, pl.LazyFrame):
145 | exploded_weights = exploded_weights.collect()
146 |
147 | df_ = (
148 | df.select(pl.col(GROUP_BY_COLUMN_FIRST, column_history))
149 | .drop_nulls(column_history)
150 | .explode(column_history)
151 | .with_columns(exploded_weights.select(column_history_weights))
152 | .with_row_count(GROUP_BY_COLUMN_SECOND)
153 | # Not optimal to explode, I want to compute [1,2,2] * 0.5 => (list * float)
154 | .explode(column_history)
155 | .with_columns(
156 | (pl.col(column_history) * pl.col(column_history_weights)).alias(
157 | column_history
158 | )
159 | )
160 | .group_by([GROUP_BY_COLUMN_SECOND])
161 | .agg(pl.col(GROUP_BY_COLUMN_FIRST).first(), column_history)
162 | .group_by(GROUP_BY_COLUMN_FIRST)
163 | .agg(column_history)
164 | .sort(GROUP_BY_COLUMN_FIRST)
165 | )
166 |
167 | return (
168 | df.drop(column_history)
169 | .join(df_, on=GROUP_BY_COLUMN_FIRST, how="left")
170 | .select(COLUMNS)
171 | )
172 |
--------------------------------------------------------------------------------
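The two helpers above compose: add_decay_weights attaches one weight per history entry, and decay_weighting_nested_lists then scales each nested list by its weight. A minimal end-to-end sketch on toy data (column name and values are illustrative):

import polars as pl

from ebrec.utils._decay import (
    add_decay_weights,
    decay_weighting_nested_lists,
    exponential_decay_weights,
)

# Each row holds a list of per-article vectors (nested lists).
df = pl.DataFrame(
    {"history_emb": [[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], [[2.0, 2.0]]]}
)
# 1) One weight per history entry; the last entry in each list gets weight 1.0.
df = add_decay_weights(
    df,
    "history_emb",
    decay_func=exponential_decay_weights,
    ascending=True,
    lambda_factor=0.5,
)
# 2) Scale each nested vector by its weight.
df = decay_weighting_nested_lists(
    df, column_history="history_emb", column_history_weights="history_emb_weights"
)
--------------------------------------------------------------------------------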
/src/ebrec/utils/_descriptive_analysis.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 |
3 | from ebrec.utils._constants import (
4 | DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
5 | DEFAULT_IMPRESSION_TIMESTAMP_COL,
6 | )
7 |
8 |
9 | def min_max_impression_time_history(
10 | df: pl.DataFrame, timestamp_col: str = DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL
11 | ):
12 | """
13 | Check min/max for user history timestamp column.
14 | """
15 | return (
16 | df.select(pl.col(timestamp_col))
17 | .with_columns(
18 | pl.col(timestamp_col).list.eval(pl.element().min()).explode().alias("min")
19 | )
20 | .with_columns(
21 | pl.col(timestamp_col).list.eval(pl.element().max()).explode().alias("max")
22 | )
23 | .select(pl.col("min").min(), pl.col("max").max())
24 | )
25 |
26 |
27 | def min_max_impression_time_behaviors(
28 | df: pl.DataFrame, timestamp_col: str = DEFAULT_IMPRESSION_TIMESTAMP_COL
29 | ):
30 | """
31 | Check min/max for behaviors timestamp column.
32 | """
33 | return df.select(
34 | pl.col(timestamp_col).min().alias("min"),
35 | pl.col(timestamp_col).max().alias("max"),
36 | )
37 |
--------------------------------------------------------------------------------
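A minimal sketch of both helpers on toy data: the history variant expects a list-of-datetimes column, the behaviors variant a flat datetime column (timestamps below are illustrative):

from datetime import datetime

import polars as pl

from ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors,
    min_max_impression_time_history,
)

df_history = pl.DataFrame(
    {
        "impression_time_fixed": [
            [datetime(2023, 5, 1), datetime(2023, 5, 3)],
            [datetime(2023, 5, 2)],
        ]
    }
)
df_behaviors = pl.DataFrame(
    {"impression_time": [datetime(2023, 5, 20), datetime(2023, 5, 25)]}
)

print(min_max_impression_time_history(df_history))      # min 2023-05-01, max 2023-05-03
print(min_max_impression_time_behaviors(df_behaviors))  # min 2023-05-20, max 2023-05-25
--------------------------------------------------------------------------------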
/src/ebrec/utils/_nlp.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import numpy as np
3 | import torch
4 |
5 | from ebrec.utils._python import get_torch_device
6 |
7 | try:
8 | from torch.utils.data import DataLoader, TensorDataset
9 | except ImportError:
10 | print("torch not available")
11 | try:
12 | from transformers import AutoTokenizer, AutoModel
13 | except ImportError:
14 | print("transformers not available")
15 |
16 |
17 | def get_transformers_word_embeddings(model: AutoModel):
18 | return model.embeddings.word_embeddings.weight.data.to("cpu").numpy()
19 |
20 |
21 | def generate_embeddings_with_transformers(
22 | model: AutoModel,
23 | tokenizer: AutoTokenizer,
24 | text_list: list[str],
25 | batch_size: int = 8,
26 | device: str = None,
27 | disable_tqdm: bool = False,
28 | ) -> torch.Tensor:
29 | """
30 | Generates embeddings for a list of texts using a pre-trained transformer model.
31 |
32 | Args:
33 |         model (AutoModel): The pre-trained transformer model to use.
34 |         tokenizer (AutoTokenizer): The tokenizer matching the model.
35 |         text_list (list of str): A list of texts to generate embeddings for.
36 |         batch_size (int): The batch size to use for generating embeddings. Defaults to 8.
37 |         device (str): The device to use (e.g., "cpu", "cuda"). If None, defaults to the first available GPU or CPU.
38 |
39 | Returns:
40 | embeddings (torch.Tensor): A tensor containing the embeddings for the input texts.
41 | The shape of the tensor is (num_texts, embedding_dim), where num_texts is the number
42 | of input texts and embedding_dim is the dimensionality of the embeddings produced by
43 | the pre-trained model.
44 |
45 | Examples:
46 | >>> model_name = "bert-base-uncased"
47 | >>> text_list = ["hello world", "how are you"]
48 | >>> batch_size = 2
49 | >>> device = "cpu"
50 | >>> model = AutoModel.from_pretrained(model_name)
51 | >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
52 | >>> embeddings_tensor = generate_embeddings_with_transformers(model, tokenizer, text_list, batch_size, device)
53 | >>> print(embeddings_tensor)
54 | tensor([[-0.0243, 0.1144, 0.0830, ..., -0.2666, 0.1662, 0.1519],
55 | [ 0.0827, 0.0877, -0.0688, ..., -0.4381, 0.0462, -0.1446]])
56 | >>> print(embeddings_tensor.shape)
57 | torch.Size([2, 768])
58 | """
59 | device = get_torch_device(use_gpu=True) if device is None else device
60 | model = model.to(device)
61 |
62 | tokenized_text = tokenizer(
63 | text_list, padding=True, truncation=True, return_tensors="pt"
64 | )
65 | feature_names = list(tokenized_text)
66 |
67 | dataset = TensorDataset(
68 | tokenized_text["input_ids"], tokenized_text["attention_mask"]
69 | )
70 | dataloader = DataLoader(dataset, batch_size=batch_size)
71 | embeddings = []
72 | with torch.no_grad():
73 | for batch in tqdm(dataloader, desc="Encoding", disable=disable_tqdm):
74 | inputs = {feat: t.to(device) for feat, t in zip(feature_names, batch)}
75 | outputs = model(
76 | **inputs,
77 | output_hidden_states=True,
78 | )
79 | embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze(dim=1))
80 | return torch.vstack(embeddings)
81 |
82 |
83 | if __name__ == "__main__":
84 | #
85 | model_name = "xlm-roberta-base"
86 | batch_size = 8
87 | text_list = [
88 | "hej med dig. Jeg er en tekst.",
89 | "Jeg er en anden tekst, skal du spille smart?",
90 | "oh nej..",
91 | ]
92 | model = AutoModel.from_pretrained(model_name)
93 | tokenizer = AutoTokenizer.from_pretrained(model_name)
94 | t = generate_embeddings_with_transformers(
95 | model, tokenizer, text_list, batch_size, "cpu"
96 | )
97 |
--------------------------------------------------------------------------------
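get_transformers_word_embeddings has no example above; it simply pulls the (vocab_size, hidden_dim) input-embedding matrix out of a Hugging Face encoder, which can then serve as a lookup table for tokenized text. A minimal sketch (model name and sample text are illustrative):

import numpy as np
from transformers import AutoModel, AutoTokenizer

from ebrec.utils._nlp import get_transformers_word_embeddings

model_name = "xlm-roberta-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# (vocab_size, hidden_dim) numpy array of the model's token embeddings.
word_embeddings = get_transformers_word_embeddings(model)
token_ids = tokenizer("hej med dig", add_special_tokens=False)["input_ids"]
title_vectors = word_embeddings[np.array(token_ids)]  # one embedding row per token
--------------------------------------------------------------------------------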
/src/ebrec/utils/_torch.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | try:
4 | import torch
5 | except ImportError:
6 | print("torch not available")
7 |
8 |
9 | def save_checkpoint(model, path="model_state_dict.pt"):
10 | path = Path(path)
11 | path.parent.mkdir(parents=True, exist_ok=True)
12 | print(f"Saving model weights: {path}")
13 | torch.save(model.state_dict(), path.as_posix())
14 |
--------------------------------------------------------------------------------
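save_checkpoint only writes the state dict; the module does not ship a matching loader, so the sketch below (load_checkpoint is illustrative, not part of ebrec) shows the usual counterpart:

from pathlib import Path

import torch


def load_checkpoint(model: torch.nn.Module, path: str = "model_state_dict.pt") -> torch.nn.Module:
    """Restore weights written by save_checkpoint into an instance of the same architecture."""
    path = Path(path)
    print(f"Loading model weights: {path}")
    model.load_state_dict(torch.load(path.as_posix(), map_location="cpu"))
    return model
--------------------------------------------------------------------------------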
/test/bombing/bomb_dataloader.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import polars as pl
3 | import numpy as np
4 |
5 | from ebrec.models.newsrec.dataloader import (
6 | LSTURDataLoader,
7 | NRMSDataLoader,
8 | )
9 | from ebrec.utils._behaviors import create_user_id_to_int_mapping
10 | from ebrec.utils._articles import create_article_id_to_value_mapping
11 |
12 | from ebrec.utils._python import time_it
13 | from tqdm import tqdm
14 |
15 | from ebrec.utils._behaviors import create_binary_labels_column
16 | from ebrec.utils._constants import (
17 | DEFAULT_HISTORY_ARTICLE_ID_COL,
18 | DEFAULT_CLICKED_ARTICLES_COL,
19 | DEFAULT_INVIEW_ARTICLES_COL,
20 | DEFAULT_ARTICLE_ID_COL,
21 | DEFAULT_CATEGORY_COL,
22 | DEFAULT_USER_COL,
23 | )
24 |
25 | from ebrec.models.fastformer.dataloader import FastformerDataset
26 | from torch.utils.data import DataLoader
27 |
28 | N_ITERATIONS = 300
29 | BATCH_SIZE = 100
30 | TOKEN_COL = "tokens"
31 | N_SAMPLES = "n"
32 |
33 | # LOAD DATA:
34 | PATH_DATA = Path("test/data")
35 | df_articles = (
36 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet"))
37 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL))
38 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10))))
39 | .collect()
40 | )
41 | df_history = (
42 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet"))
43 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
44 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3))
45 | )
46 | df_behaviors = (
47 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet"))
48 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL)
49 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES))
50 | .join(df_history, on=DEFAULT_USER_COL, how="left")
51 | .collect()
52 | .pipe(create_binary_labels_column)
53 | )
54 | # => MAPPINGS:
55 | article_mapping = create_article_id_to_value_mapping(
56 | df=df_articles, value_col=TOKEN_COL
57 | )
58 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors)
59 | # => NPRATIO IMPRESSION - SAME LENGTHS:
60 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min())
61 | # => FOR TEST-DATALOADER
62 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list()
63 |
64 |
65 | def iter_dataloader(dataloader, name: str, iterations: int):
66 | for _ in tqdm(range(iterations), desc=name):
67 | for _ in dataloader:
68 | pass
69 |
70 |
71 | # ===
72 | @time_it(True)
73 | def bomb_NRMSDataLoader():
74 | dataloader = NRMSDataLoader(
75 | behaviors=df_behaviors_train,
76 | article_dict=article_mapping,
77 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
78 | unknown_representation="zeros",
79 | eval_mode=False,
80 | batch_size=BATCH_SIZE,
81 | )
82 | iter_dataloader(dataloader, "NRMS-train", iterations=N_ITERATIONS)
83 |
84 | dataloader = NRMSDataLoader(
85 | behaviors=df_behaviors,
86 | article_dict=article_mapping,
87 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
88 | unknown_representation="zeros",
89 | eval_mode=True,
90 | batch_size=BATCH_SIZE,
91 | )
92 | iter_dataloader(dataloader, "NRMS-test", iterations=N_ITERATIONS)
93 |
94 |
95 | @time_it(True)
96 | def bomb_LSTURDataLoader():
97 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors_train)
98 |
99 | dataloader = LSTURDataLoader(
100 | behaviors=df_behaviors_train,
101 | article_dict=article_mapping,
102 | user_id_mapping=user_mapping,
103 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
104 | unknown_representation="zeros",
105 | batch_size=BATCH_SIZE,
106 | )
107 | iter_dataloader(dataloader, "LSTUR-train", iterations=N_ITERATIONS)
108 |
109 | dataloader = LSTURDataLoader(
110 | behaviors=df_behaviors,
111 | article_dict=article_mapping,
112 | user_id_mapping=user_mapping,
113 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
114 | unknown_representation="zeros",
115 | batch_size=BATCH_SIZE,
116 | eval_mode=True,
117 | )
118 | iter_dataloader(dataloader, "LSTUR-test", iterations=N_ITERATIONS)
119 |
120 |
121 | # ===
122 | @time_it(True)
123 | def bomb_FastformerDataLoader():
124 | dataloader = DataLoader(
125 | FastformerDataset(
126 | behaviors=df_behaviors_train,
127 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
128 | article_dict=article_mapping,
129 | batch_size=BATCH_SIZE,
130 | shuffle=True,
131 | )
132 | )
133 | iter_dataloader(dataloader, "Fastformer-train", iterations=N_ITERATIONS)
134 |
135 | dataloader = DataLoader(
136 | FastformerDataset(
137 | behaviors=df_behaviors,
138 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
139 | article_dict=article_mapping,
140 | batch_size=BATCH_SIZE,
141 | shuffle=False,
142 | )
143 | )
144 | iter_dataloader(dataloader, "Fastformer-test", iterations=N_ITERATIONS)
145 |
146 |
147 | if __name__ == "__main__":
148 | bomb_NRMSDataLoader()
149 | bomb_LSTURDataLoader()
150 | bomb_FastformerDataLoader()
151 |
--------------------------------------------------------------------------------
/test/data/ebnerd/articles.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/articles.parquet
--------------------------------------------------------------------------------
/test/data/ebnerd/behaviors.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/behaviors.parquet
--------------------------------------------------------------------------------
/test/data/ebnerd/document_vector.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/document_vector.parquet
--------------------------------------------------------------------------------
/test/data/ebnerd/history.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebanalyse/ebnerd-benchmark/c12f9b74c83eea68df69f9640d368adcd5d42a9e/test/data/ebnerd/history.parquet
--------------------------------------------------------------------------------
/test/dataloader/test_fastformer.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import polars as pl
3 | import numpy as np
4 | import torch
5 | from ebrec.utils._behaviors import create_user_id_to_int_mapping
6 | from ebrec.utils._articles import create_article_id_to_value_mapping
7 |
8 | from ebrec.utils._python import time_it
9 | from ebrec.utils._behaviors import create_binary_labels_column
10 | from ebrec.utils._constants import (
11 | DEFAULT_HISTORY_ARTICLE_ID_COL,
12 | DEFAULT_CLICKED_ARTICLES_COL,
13 | DEFAULT_INVIEW_ARTICLES_COL,
14 | DEFAULT_ARTICLE_ID_COL,
15 | DEFAULT_CATEGORY_COL,
16 | DEFAULT_USER_COL,
17 | )
18 |
19 | from ebrec.models.fastformer.dataloader import FastformerDataset
20 | from torch.utils.data import DataLoader
21 |
22 | TOKEN_COL = "tokens"
23 | N_SAMPLES = "n"
24 | BATCH_SIZE = 100
25 |
26 | # LOAD DATA:
27 | PATH_DATA = Path("test/data")
28 | df_articles = (
29 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet"))
30 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL))
31 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10))))
32 | .collect()
33 | )
34 | df_history = (
35 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet"))
36 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
37 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3))
38 | )
39 | df_behaviors = (
40 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet"))
41 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL)
42 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES))
43 | .join(df_history, on=DEFAULT_USER_COL, how="left")
44 | .collect()
45 | .pipe(create_binary_labels_column)
46 | )
47 | # => MAPPINGS:
48 | article_mapping = create_article_id_to_value_mapping(
49 | df=df_articles, value_col=TOKEN_COL
50 | )
51 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors)
52 | # => NPRATIO IMPRESSION - SAME LENGTHS:
53 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min())
54 | # => FOR TEST-DATALOADER
55 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list()
56 |
57 |
58 | @time_it(True)
59 | def test_FastformerDataloader():
60 | train_dataloader = DataLoader(
61 | FastformerDataset(
62 | behaviors=df_behaviors_train,
63 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
64 | article_dict=article_mapping,
65 | batch_size=BATCH_SIZE,
66 | shuffle=True,
67 | )
68 | )
69 |
70 | batch = train_dataloader.__iter__().__next__()
71 |
72 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100))
73 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)"
74 | assert (
75 | len(batch[0]) == 2
76 | ), "Fastformer has two outputs (history_input, candidate_input)"
77 |
78 | for type_in_batch in batch[0]:
79 | assert (
80 | type_in_batch.dtype == torch.int
81 | ), "Expected output to be integer; used for lookup value"
82 |
83 |     assert batch[1].dtype == torch.float, "Expected output to be float; this is the label"
84 |
85 | test_dataloader = DataLoader(
86 | FastformerDataset(
87 | behaviors=df_behaviors,
88 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
89 | article_dict=article_mapping,
90 | batch_size=BATCH_SIZE,
91 | shuffle=False,
92 | )
93 | )
94 |
95 | batch = test_dataloader.__iter__().__next__()
96 | assert len(batch[1].squeeze(0)) == sum(
97 | label_lengths[:BATCH_SIZE]
98 | ), "Should have unfolded all the test samples"
99 |
--------------------------------------------------------------------------------
/test/dataloader/test_newsrec.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import polars as pl
3 | import numpy as np
4 | import torch
5 | from ebrec.utils._behaviors import create_user_id_to_int_mapping
6 | from ebrec.utils._articles import create_article_id_to_value_mapping
7 | from ebrec.utils._python import create_lookup_dict
8 |
9 | from ebrec.models.newsrec.dataloader import (
10 | LSTURDataLoader,
11 | NAMLDataLoader,
12 | NRMSDataLoader,
13 | )
14 | from ebrec.utils._python import time_it
15 | from ebrec.utils._behaviors import create_binary_labels_column
16 | from ebrec.utils._constants import (
17 | DEFAULT_HISTORY_ARTICLE_ID_COL,
18 | DEFAULT_CLICKED_ARTICLES_COL,
19 | DEFAULT_INVIEW_ARTICLES_COL,
20 | DEFAULT_ARTICLE_ID_COL,
21 | DEFAULT_CATEGORY_COL,
22 | DEFAULT_USER_COL,
23 | )
24 |
25 | from ebrec.models.fastformer.dataloader import FastformerDataset
26 | from torch.utils.data import DataLoader
27 |
28 | TOKEN_COL = "tokens"
29 | N_SAMPLES = "n"
30 | BATCH_SIZE = 100
31 |
32 | # LOAD DATA:
33 | PATH_DATA = Path("test/data")
34 | df_articles = (
35 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "articles.parquet"))
36 | .select(pl.col(DEFAULT_ARTICLE_ID_COL, DEFAULT_CATEGORY_COL))
37 | .with_columns(pl.Series(TOKEN_COL, np.random.randint(0, 20, (1, 10))))
38 | .collect()
39 | )
40 | df_history = (
41 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "history.parquet"))
42 | .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
43 | .with_columns(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).list.tail(3))
44 | )
45 | df_behaviors = (
46 | pl.scan_parquet(PATH_DATA.joinpath("ebnerd", "behaviors.parquet"))
47 | .select(DEFAULT_USER_COL, DEFAULT_INVIEW_ARTICLES_COL, DEFAULT_CLICKED_ARTICLES_COL)
48 | .with_columns(pl.col(DEFAULT_INVIEW_ARTICLES_COL).list.len().alias(N_SAMPLES))
49 | .join(df_history, on=DEFAULT_USER_COL, how="left")
50 | .collect()
51 | .pipe(create_binary_labels_column)
52 | )
53 | # => MAPPINGS:
54 | article_mapping = create_article_id_to_value_mapping(
55 | df=df_articles, value_col=TOKEN_COL
56 | )
57 | user_mapping = create_user_id_to_int_mapping(df=df_behaviors)
58 | # => NPRATIO IMPRESSION - SAME LENGTHS:
59 | df_behaviors_train = df_behaviors.filter(pl.col(N_SAMPLES) == pl.col(N_SAMPLES).min())
60 | # => FOR TEST-DATALOADER
61 | label_lengths = df_behaviors[DEFAULT_INVIEW_ARTICLES_COL].list.len().to_list()
62 |
63 |
64 | # ===
65 | @time_it(True)
66 | def test_NRMSDataLoader():
67 | train_dataloader = NRMSDataLoader(
68 | behaviors=df_behaviors_train,
69 | article_dict=article_mapping,
70 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
71 | unknown_representation="zeros",
72 | eval_mode=False,
73 | batch_size=BATCH_SIZE,
74 | )
75 |
76 | batch = train_dataloader.__iter__().__next__()
77 |
78 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100))
79 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)"
80 | assert (
81 | len(batch[0]) == 2
82 | ), "NRMS has two outputs (his_input_title, pred_input_title_one)"
83 |
84 | for type_in_batch in batch[0][0]:
85 | assert isinstance(
86 | type_in_batch.ravel()[0], np.integer
87 | ), "Expected output to be integer; used for lookup value"
88 |
89 | assert isinstance(
90 | batch[1].ravel()[0], np.integer
91 | ), "Expected output to be integer; this is label"
92 |
93 | test_dataloader = NRMSDataLoader(
94 | behaviors=df_behaviors,
95 | article_dict=article_mapping,
96 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
97 | unknown_representation="zeros",
98 | eval_mode=True,
99 | batch_size=BATCH_SIZE,
100 | )
101 |
102 | batch = test_dataloader.__iter__().__next__()
103 | assert len(batch[1]) == sum(
104 | label_lengths[:BATCH_SIZE]
105 | ), "Should have unfolded all the test samples"
106 |
107 |
108 | @time_it(True)
109 | def test_LSTURDataLoader():
110 | train_dataloader = LSTURDataLoader(
111 | behaviors=df_behaviors_train,
112 | article_dict=article_mapping,
113 | user_id_mapping=user_mapping,
114 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
115 | unknown_representation="zeros",
116 | batch_size=BATCH_SIZE,
117 | )
118 |
119 | batch = train_dataloader.__iter__().__next__()
120 |
121 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100))
122 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)"
123 | assert (
124 | len(batch[0]) == 3
125 | ), "LSTUR has two outputs (user_indexes, his_input_title, pred_input_title_one)"
126 |
127 | for type_in_batch in batch[0][0]:
128 | assert isinstance(
129 | type_in_batch.ravel()[0], np.integer
130 | ), "Expected output to be integer; used for lookup value"
131 |
132 | assert isinstance(
133 | batch[1].ravel()[0], np.integer
134 | ), "Expected output to be integer; this is label"
135 |
136 | test_dataloader = LSTURDataLoader(
137 | behaviors=df_behaviors,
138 | article_dict=article_mapping,
139 | user_id_mapping=user_mapping,
140 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
141 | unknown_representation="zeros",
142 | batch_size=BATCH_SIZE,
143 | eval_mode=True,
144 | )
145 |
146 | batch = test_dataloader.__iter__().__next__()
147 | assert len(batch[1]) == sum(
148 | label_lengths[:BATCH_SIZE]
149 | ), "Should have unfolded all the test samples"
150 |
151 |
152 | @time_it(True)
153 | def test_NAMLDataLoader():
154 | body_mapping = article_mapping
155 | category_mapping = create_lookup_dict(
156 | df_articles.select(pl.col(DEFAULT_CATEGORY_COL).unique()).with_row_index(
157 | "row_nr"
158 | ),
159 | key=DEFAULT_CATEGORY_COL,
160 | value="row_nr",
161 | )
162 | subcategory_mapping = category_mapping
163 |
164 | train_dataloader = NAMLDataLoader(
165 | behaviors=df_behaviors_train,
166 | article_dict=article_mapping,
167 | body_mapping=body_mapping,
168 | category_mapping=category_mapping,
169 | unknown_representation="zeros",
170 | subcategory_mapping=subcategory_mapping,
171 | history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
172 | batch_size=BATCH_SIZE,
173 | )
174 |
175 | batch = train_dataloader.__iter__().__next__()
176 |
177 | assert train_dataloader.__len__() == int(np.ceil(df_behaviors_train.shape[0] / 100))
178 | assert len(batch) == 2, "There should be two outputs: (inputs, labels)"
179 | assert (
180 | len(batch[0]) == 8
181 | ), "NAML has two outputs (his_input_title,his_input_body,his_input_vert,his_input_subvert,pred_input_title,pred_input_body,pred_input_vert,pred_input_subvert)"
182 |
183 | for type_in_batch in batch[0][0]:
184 | assert isinstance(
185 | type_in_batch.ravel()[0], np.integer
186 | ), "Expected output to be integer; used for lookup value"
187 |
188 | assert isinstance(
189 | batch[1].ravel()[0], np.integer
190 | ), "Expected output to be integer; this is label"
191 |
--------------------------------------------------------------------------------
/test/evaluation/test_beyond_accuracy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics.pairwise import cosine_distances
3 |
4 | from ebrec.evaluation.beyond_accuracy import (
5 | IntralistDiversity,
6 | Distribution,
7 | Serendipity,
8 | Novelty,
9 | Coverage,
10 | )
11 |
12 | lookup_dict = {
13 | "101": {"doc_vec": np.array([1, 0, 0]), "v": 1, "sv": [1], "pop_sc": 0.50},
14 | "102": {"doc_vec": np.array([0, 1, 0]), "v": 2, "sv": [1], "pop_sc": 0.25},
15 | "103": {"doc_vec": np.array([1, 1, 1]), "v": 3, "sv": [1], "pop_sc": 0.75},
16 | "104": {"doc_vec": np.array([1, 1, 1]), "v": 4, "sv": [1], "pop_sc": 0.50},
17 | "105": {"doc_vec": np.array([-1, 0, 0]), "v": 5, "sv": [1], "pop_sc": 0.94},
18 | "106": {"doc_vec": np.array([-1, 0, 0]), "v": 6, "sv": [1, 2], "pop_sc": 0.95},
19 | "107": {"doc_vec": np.array([-1, 0, 0]), "v": 7, "sv": [1, 2], "pop_sc": 0.96},
20 | "108": {"doc_vec": np.array([0, 0, 1]), "v": 8, "sv": [1, 2], "pop_sc": 0.50},
21 | "400": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4], "pop_sc": 0.20},
22 | "401": {"doc_vec": np.array([0, 0, 1]), "v": 9, "sv": [4, 5], "pop_sc": 0.20},
23 | }
24 |
25 | # "404" is not expected (it has no entry in lookup_dict); however, the setup supports it:
26 | R = np.array(
27 | [
28 | ["101", "102", "400"],
29 | ["101", "103", "400"],
30 | ["101", "102", "103"],
31 | ["101", "104", "400"],
32 | ["101", "106", "404"],
33 | ["404", "404", "404"],
34 | ]
35 | )
36 |
37 | C = ["1", "2", "101", "102", "103", "104", "105", "106", "107", "108", "400", "401"]
38 |
39 | click_histories = [
40 | np.array([["101", "102"]]),
41 | np.array([["105", "106", "400"]]),
42 | np.array([["102", "103", "104"]]),
43 | np.array([["101", "400"]]),
44 | np.array([["400"]]),
45 | np.array([["400"]]),
46 | ]
47 | pairwise_distance_function = cosine_distances
48 |
49 | # TODO: add the test
50 |
--------------------------------------------------------------------------------